bio2zarr 0.0.10__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bio2zarr/_version.py +2 -2
- bio2zarr/cli.py +91 -20
- bio2zarr/core.py +31 -4
- bio2zarr/vcf2zarr/icf.py +4 -3
- bio2zarr/vcf2zarr/vcz.py +47 -11
- bio2zarr/vcf_utils.py +10 -5
- {bio2zarr-0.0.10.dist-info → bio2zarr-0.1.1.dist-info}/METADATA +4 -4
- bio2zarr-0.1.1.dist-info/RECORD +20 -0
- bio2zarr-0.0.10.dist-info/RECORD +0 -20
- {bio2zarr-0.0.10.dist-info → bio2zarr-0.1.1.dist-info}/LICENSE +0 -0
- {bio2zarr-0.0.10.dist-info → bio2zarr-0.1.1.dist-info}/WHEEL +0 -0
- {bio2zarr-0.0.10.dist-info → bio2zarr-0.1.1.dist-info}/entry_points.txt +0 -0
- {bio2zarr-0.0.10.dist-info → bio2zarr-0.1.1.dist-info}/top_level.txt +0 -0
bio2zarr/_version.py
CHANGED
bio2zarr/cli.py
CHANGED

@@ -44,7 +44,13 @@ zarr_path = click.argument(
     "zarr_path", type=click.Path(exists=True, file_okay=False, dir_okay=True)
 )
 
-num_partitions = click.
+num_partitions = click.option(
+    "-n",
+    "--num-partitions",
+    type=click.IntRange(min=1),
+    default=None,
+    help="Target number of partitions to split into",
+)
 
 partition = click.argument("partition", type=click.IntRange(min=0))
 
@@ -58,6 +64,13 @@ force = click.option(
     help="Force overwriting of existing directories",
 )
 
+progress = click.option(
+    "-P/-Q",
+    "--progress/--no-progress",
+    default=True,
+    help="Show progress bars (default: show)",
+)
+
 one_based = click.option(
     "--one-based",
     is_flag=True,
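
Both new options follow cli.py's existing convention of defining each click option once at module level and reusing it as a decorator across commands. A minimal, self-contained sketch of that pattern (the `hello` command is hypothetical, not part of bio2zarr):

    import click

    # Shared option object: defined once, applied to any number of commands.
    progress = click.option(
        "-P/-Q",
        "--progress/--no-progress",
        default=True,
        help="Show progress bars (default: show)",
    )


    @click.command()
    @progress
    def hello(progress):
        # click passes the parsed boolean through as an ordinary parameter.
        click.echo(f"progress enabled: {progress}")


    if __name__ == "__main__":
        hello()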
@@ -165,6 +178,15 @@ def check_overwrite_dir(path, force):
             shutil.rmtree(tmp_delete_path)
 
 
+def check_partitions(num_partitions):
+    if num_partitions is None:
+        raise click.UsageError(
+            "-n/--num-partitions must currently be specified. Future versions "
+            "will provide reasonable defaults or other means of specifying "
+            "partitions."
+        )
+
+
 def get_compressor(cname):
     if cname is None:
         return None
@@ -190,9 +212,17 @@ def show_work_summary(work_summary, json):
 @verbose
 @column_chunk_size
 @compressor
+@progress
 @worker_processes
 def explode(
-    vcfs,
+    vcfs,
+    icf_path,
+    force,
+    verbose,
+    column_chunk_size,
+    compressor,
+    progress,
+    worker_processes,
 ):
     """
     Convert VCF(s) to intermediate columnar format
@@ -205,7 +235,7 @@ def explode(
         worker_processes=worker_processes,
         column_chunk_size=column_chunk_size,
         compressor=get_compressor(compressor),
-        show_progress=
+        show_progress=progress,
     )
 
 
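
Each command now forwards the parsed flag as `show_progress=progress`. The paired on/off flag is standard click behaviour and can be checked in isolation with click's test runner; a sketch against a toy command rather than bio2zarr's real CLI:

    import click
    from click.testing import CliRunner


    @click.command()
    @click.option("-P/-Q", "--progress/--no-progress", default=True)
    def demo(progress):
        click.echo("on" if progress else "off")


    runner = CliRunner()
    assert runner.invoke(demo, []).output.strip() == "on"     # default
    assert runner.invoke(demo, ["-Q"]).output.strip() == "off"
    assert runner.invoke(demo, ["--no-progress"]).output.strip() == "off"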
@@ -218,6 +248,7 @@ def explode(
 @compressor
 @json
 @verbose
+@progress
 @worker_processes
 def dexplode_init(
     vcfs,
@@ -228,14 +259,16 @@ def dexplode_init(
     compressor,
     json,
     verbose,
+    progress,
     worker_processes,
 ):
     """
     Initial step for distributed conversion of VCF(s) to intermediate columnar format
-    over
+    over some number of partitions.
     """
     setup_logging(verbose)
     check_overwrite_dir(icf_path, force)
+    check_partitions(num_partitions)
     work_summary = vcf2zarr.explode_init(
         icf_path,
         vcfs,
@@ -243,7 +276,7 @@ def dexplode_init(
         column_chunk_size=column_chunk_size,
         worker_processes=worker_processes,
         compressor=get_compressor(compressor),
-        show_progress=
+        show_progress=progress,
     )
     show_work_summary(work_summary, json)
 
@@ -310,6 +343,7 @@ def mkschema(icf_path):
 @samples_chunk_size
 @max_variant_chunks
 @max_memory
+@progress
 @worker_processes
 def encode(
     icf_path,
@@ -321,6 +355,7 @@ def encode(
     samples_chunk_size,
     max_variant_chunks,
     max_memory,
+    progress,
     worker_processes,
 ):
     """
@@ -337,7 +372,7 @@ def encode(
         max_variant_chunks=max_variant_chunks,
         worker_processes=worker_processes,
         max_memory=max_memory,
-        show_progress=
+        show_progress=progress,
     )
 
 
@@ -351,6 +386,7 @@ def encode(
 @samples_chunk_size
 @max_variant_chunks
 @json
+@progress
 @verbose
 def dencode_init(
     icf_path,
@@ -362,12 +398,13 @@ def dencode_init(
     samples_chunk_size,
     max_variant_chunks,
     json,
+    progress,
     verbose,
 ):
     """
     Initialise conversion of intermediate format to VCF Zarr. This will
     set up the specified ZARR_PATH to perform this conversion over
-
+    some number of partitions.
 
     The output of this command is the actual number of partitions generated
     (which may be less than the requested number, if there is not sufficient
@@ -379,6 +416,7 @@ def dencode_init(
     """
     setup_logging(verbose)
     check_overwrite_dir(zarr_path, force)
+    check_partitions(num_partitions)
     work_summary = vcf2zarr.encode_init(
         icf_path,
         zarr_path,
@@ -387,7 +425,7 @@ def dencode_init(
         variants_chunk_size=variants_chunk_size,
         samples_chunk_size=samples_chunk_size,
         max_variant_chunks=max_variant_chunks,
-        show_progress=
+        show_progress=progress,
     )
     show_work_summary(work_summary, json)
 
@@ -413,12 +451,13 @@ def dencode_partition(zarr_path, partition, verbose, one_based):
 @click.command
 @zarr_path
 @verbose
-def dencode_finalise(zarr_path, verbose):
+@progress
+def dencode_finalise(zarr_path, verbose, progress):
     """
     Final step for distributed conversion of ICF to VCF Zarr.
     """
     setup_logging(verbose)
-    vcf2zarr.encode_finalise(zarr_path, show_progress=
+    vcf2zarr.encode_finalise(zarr_path, show_progress=progress)
 
 
 @click.command(name="convert")
@@ -428,6 +467,7 @@ def dencode_finalise(zarr_path, verbose):
 @variants_chunk_size
 @samples_chunk_size
 @verbose
+@progress
 @worker_processes
 def convert_vcf(
     vcfs,
@@ -436,6 +476,7 @@ def convert_vcf(
     variants_chunk_size,
     samples_chunk_size,
     verbose,
+    progress,
     worker_processes,
 ):
     """
@@ -448,7 +489,7 @@ def convert_vcf(
         zarr_path,
         variants_chunk_size=variants_chunk_size,
         samples_chunk_size=samples_chunk_size,
-        show_progress=
+        show_progress=progress,
         worker_processes=worker_processes,
     )
 
@@ -481,6 +522,7 @@ vcf2zarr_main.add_command(dencode_finalise)
 @click.argument("in_path", type=click.Path())
 @click.argument("zarr_path", type=click.Path())
 @worker_processes
+@progress
 @verbose
 @variants_chunk_size
 @samples_chunk_size
@@ -489,6 +531,7 @@ def convert_plink(
     zarr_path,
     verbose,
     worker_processes,
+    progress,
     variants_chunk_size,
     samples_chunk_size,
 ):
@@ -499,7 +542,7 @@ def convert_plink(
     plink.convert(
         in_path,
         zarr_path,
-        show_progress=
+        show_progress=progress,
         worker_processes=worker_processes,
         samples_chunk_size=samples_chunk_size,
         variants_chunk_size=variants_chunk_size,
@@ -517,11 +560,39 @@ plink2zarr.add_command(convert_plink)
 
 @click.command
 @version
-@click.argument("vcf_path", type=click.Path())
-@
-@
-
-
-
-
-
+@click.argument("vcf_path", type=click.Path(exists=True, dir_okay=False))
+@verbose
+@num_partitions
+@click.option(
+    "-s",
+    "--partition-size",
+    type=str,
+    default=None,
+    help="Target (compressed) size of VCF partitions, e.g. 100KB, 10MiB, 1G.",
+)
+def vcfpartition(vcf_path, verbose, num_partitions, partition_size):
+    """
+    Output bcftools region strings that partition an indexed VCF/BCF file
+    into either an approximate number of parts (-n), or parts of approximately
+    a given size (-s). One of -n or -s must be supplied.
+
+    Note that both the number of partitions and sizes are a target, and the
+    returned number of partitions may not exactly correspond. In particular,
+    there is a maximum level of granularity determined by the associated index
+    which cannot be exceeded.
+
+    Note also that the partitions returned may vary considerably in the number
+    of records that they contain.
+    """
+    setup_logging(verbose)
+    if num_partitions is None and partition_size is None:
+        raise click.UsageError(
+            "Either --num-partitions or --partition-size must be specified"
+        )
+
+    indexed_vcf = vcf_utils.IndexedVcf(vcf_path)
+    regions = indexed_vcf.partition_into_regions(
+        num_parts=num_partitions, target_part_size=partition_size
+    )
+    for region in regions:
+        click.echo(f"{region}\t{vcf_path}")
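
The new `vcfpartition` command is a thin wrapper over the library API, so the same region strings can be produced from Python with exactly the calls used in the command body. A sketch, assuming a hypothetical indexed input `sample.vcf.gz` (a `.tbi` or `.csi` index must sit alongside it):

    from bio2zarr import vcf_utils

    vcf_path = "sample.vcf.gz"  # hypothetical path; must be indexed

    indexed_vcf = vcf_utils.IndexedVcf(vcf_path)
    # As with -n/-s on the command line, pass either num_parts or
    # target_part_size (e.g. "10MiB").
    for region in indexed_vcf.partition_into_regions(num_parts=4):
        print(f"{region}\t{vcf_path}")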
bio2zarr/core.py
CHANGED

@@ -7,8 +7,10 @@ import math
 import multiprocessing
 import os
 import os.path
+import sys
 import threading
 import time
+import warnings
 
 import humanfriendly
 import numcodecs
@@ -78,6 +80,11 @@ def du(path):
 
 
 class SynchronousExecutor(cf.Executor):
+    # Arguably we should use workers=0 as the default and use this
+    # executor implementation. However, the docs are fairly explicit
+    # about saying we shouldn't instantiate Future objects directly,
+    # so it's best to keep this as a semi-secret debugging interface
+    # for now.
     def submit(self, fn, /, *args, **kwargs):
         future = cf.Future()
         future.set_result(fn(*args, **kwargs))
@@ -191,8 +198,11 @@ _progress_counter = None
 
 
 def update_progress(inc):
-
-
+    # If the _progress_counter has not been set we are working in a
+    # synchronous non-progress tracking context
+    if _progress_counter is not None:
+        with _progress_counter.get_lock():
+            _progress_counter.value += inc
 
 
 def get_progress():
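
`update_progress` now degrades to a no-op when `_progress_counter` has not been installed, so synchronous code paths can call it unconditionally. The counter itself is a `multiprocessing.Value`, whose lock must be held for the read-modify-write increment to be safe across worker processes. A self-contained sketch of the pattern (module names here are illustrative, not bio2zarr's real state):

    import multiprocessing

    _progress_counter = None


    def update_progress(inc):
        # No-op in a synchronous, non-progress-tracking context.
        if _progress_counter is not None:
            with _progress_counter.get_lock():
                _progress_counter.value += inc


    if __name__ == "__main__":
        update_progress(1)  # safe no-op before any counter is installed

        ctx = multiprocessing.get_context("spawn")
        _progress_counter = ctx.Value("Q", 0)  # unsigned 64-bit shared counter
        update_progress(5)
        print(_progress_counter.value)  # 5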
@@ -206,6 +216,22 @@ def setup_progress_counter(counter):
     _progress_counter = counter
 
 
+def warn_py39_mac():
+    if sys.platform == "darwin" and sys.version_info[:2] == (3, 9):
+        warnings.warn(
+            "There is a known issue with bio2zarr on MacOS Python 3.9 "
+            "in which OS-level named semaphores are leaked. "
+            "You will also probably see warnings like 'There appear to be N "
+            "leaked semaphore objects at shutdown'. "
+            "While this is likely harmless for a few runs, it could lead to "
+            "issues if you do a lot of conversion. To prevent this issue "
+            "either: (1) use --worker-processes=0 or (2) upgrade to a newer "
+            "Python version. See https://github.com/sgkit-dev/bio2zarr/issues/209 "
+            "for more details.",
+            stacklevel=2,
+        )
+
+
 class ParallelWorkManager(contextlib.AbstractContextManager):
     def __init__(self, worker_processes=1, progress_config=None):
         # Need to specify this explicitly to support Macs and
@@ -214,9 +240,11 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
         global _progress_counter
         _progress_counter = ctx.Value("Q", 0)
         if worker_processes <= 0:
-            # NOTE: this is only for testing, not for
+            # NOTE: this is only for testing and debugging, not for
+            # production. See note on the SynchronousExecutor class.
             self.executor = SynchronousExecutor()
         else:
+            warn_py39_mac()
             self.executor = cf.ProcessPoolExecutor(
                 max_workers=worker_processes,
                 mp_context=ctx,
@@ -248,7 +276,6 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
     def _update_progress(self):
         current = get_progress()
         inc = current - self.progress_bar.n
-        # print("UPDATE PROGRESS: current = ", current, self.progress_config.total, inc)
         self.progress_bar.update(inc)
 
     def _update_progress_worker(self):
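
With `worker_processes <= 0`, all work stays on the calling thread via `SynchronousExecutor`; the new comment explains why this remains a semi-secret debugging interface rather than the default. A standalone version of that executor, completing the `submit` body shown above with the `return` implied by the `Executor` contract:

    import concurrent.futures as cf


    class SynchronousExecutor(cf.Executor):
        """Executes submitted work immediately on the calling thread."""

        def submit(self, fn, /, *args, **kwargs):
            # Instantiating Future directly is discouraged by the stdlib docs,
            # which is why this stays a debugging-only interface.
            future = cf.Future()
            future.set_result(fn(*args, **kwargs))
            return future


    executor = SynchronousExecutor()
    print(executor.submit(sum, [1, 2, 3]).result())  # 6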
bio2zarr/vcf2zarr/icf.py
CHANGED

@@ -263,9 +263,6 @@ def scan_vcf(path, target_num_partitions):
     )
 
     regions = indexed_vcf.partition_into_regions(num_parts=target_num_partitions)
-    logger.info(
-        f"Split {path} into {len(regions)} regions (target={target_num_partitions})"
-    )
     for region in regions:
         metadata.partitions.append(
             VcfPartition(
@@ -275,6 +272,10 @@ def scan_vcf(path, target_num_partitions):
                 region=region,
             )
         )
+    logger.info(
+        f"Split {path} into {len(metadata.partitions)} "
+        f"partitions (target={target_num_partitions})"
+    )
     core.update_progress(1)
     return metadata, vcf.raw_header
 
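
Moving the `logger.info` call below the loop is required, not just cosmetic: `partition_into_regions` now returns a generator (see the `vcf_utils.py` change below), so `len(regions)` would raise `TypeError`, and the count must come from the accumulated `metadata.partitions` list instead. The constraint in miniature:

    def regions():
        # Stand-in for the generator now returned by partition_into_regions.
        yield from ["chr1:1-1000", "chr1:1001-2000"]


    parts = []
    for region in regions():
        parts.append(region)

    # len(regions()) would raise: TypeError: object of type 'generator' has no len()
    print(f"Split into {len(parts)} partitions")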
bio2zarr/vcf2zarr/vcz.py
CHANGED

@@ -34,7 +34,7 @@ DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7)
 
 
 @dataclasses.dataclass
-class ZarrColumnSpec:
+class ZarrArraySpec:
     name: str
     dtype: str
     shape: tuple
@@ -54,7 +54,7 @@ class ZarrColumnSpec:
 
     @staticmethod
     def new(**kwargs):
-        spec = ZarrColumnSpec(
+        spec = ZarrArraySpec(
             **kwargs, compressor=DEFAULT_ZARR_COMPRESSOR.get_config(), filters=[]
         )
         spec._choose_compressor_settings()
@@ -94,7 +94,7 @@ class ZarrColumnSpec:
             dimensions.append("genotypes")
         else:
             dimensions.append(f"{vcf_field.category}_{vcf_field.name}_dim")
-        return ZarrColumnSpec.new(
+        return ZarrArraySpec.new(
             vcf_field=vcf_field.full_name,
             name=variable_name,
             dtype=vcf_field.smallest_dtype(),
@@ -127,6 +127,23 @@ class ZarrColumnSpec:
 
         self.compressor["shuffle"] = shuffle
 
+    @property
+    def chunk_nbytes(self):
+        """
+        Returns the nbytes for a single chunk in this array.
+        """
+        items = 1
+        dim = 0
+        for chunk_size in self.chunks:
+            size = min(chunk_size, self.shape[dim])
+            items *= size
+            dim += 1
+        # Include sizes for extra dimensions.
+        for size in self.shape[dim:]:
+            items *= size
+        dt = np.dtype(self.dtype)
+        return items * dt.itemsize
+
     @property
     def variant_chunk_nbytes(self):
         """
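
`chunk_nbytes` multiplies the chunk extent in each chunked dimension (clamped to the array shape), then any trailing unchunked dimensions, by the dtype item size. A worked example with made-up numbers in the style of a genotype-dimensioned field:

    import numpy as np

    shape = (100_000, 1_000, 3)  # variants, samples, genotypes (hypothetical)
    chunks = (10_000, 100)       # chunked over the first two dimensions only
    dtype = "i4"

    items = 1
    dim = 0
    for chunk_size in chunks:
        items *= min(chunk_size, shape[dim])  # clamp to the actual extent
        dim += 1
    for size in shape[dim:]:                  # trailing dimensions count whole
        items *= size

    print(items * np.dtype(dtype).itemsize)   # 10_000 * 100 * 3 * 4 = 12_000_000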
@@ -157,6 +174,24 @@ class VcfZarrSchema(core.JsonDataclass):
     filters: list
     fields: list
 
+    def validate(self):
+        """
+        Checks that the schema is well-formed and within required limits.
+        """
+        for field in self.fields:
+            # This is the Blosc max buffer size
+            if field.chunk_nbytes > 2147483647:
+                # TODO add some links to documentation here advising how to
+                # deal with PL values.
+                raise ValueError(
+                    f"Field {field.name} chunks are too large "
+                    f"({field.chunk_nbytes} > 2**31 - 1 bytes). "
+                    "Either generate a schema and drop this field (if you don't "
+                    "need it) or reduce the variant or sample chunk sizes."
+                )
+        # TODO other checks? There must be lots of ways people could mess
+        # up the schema leading to cryptic errors.
+
     def field_map(self):
         return {field.name: field for field in self.fields}
 
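
The 2147483647-byte cutoff is Blosc's maximum buffer size (2**31 - 1), which large FORMAT fields can exceed more easily than it first appears. A rough check with hypothetical numbers: an `int32` PL-style field with 55 values per genotype (a 10-allele site at ploidy 2) already crosses the limit at a 10,000 x 1,000 variant/sample chunk:

    BLOSC_MAX = 2**31 - 1  # 2147483647 bytes

    # Hypothetical field: 10,000 variants x 1,000 samples x 55 PL values, int32.
    chunk_nbytes = 10_000 * 1_000 * 55 * 4
    print(chunk_nbytes, chunk_nbytes > BLOSC_MAX)  # 2200000000 True

As the error message suggests, the remedy is to drop the field from a generated schema or shrink the variant/sample chunk sizes.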
@@ -171,7 +206,7 @@ class VcfZarrSchema(core.JsonDataclass):
         ret.samples = [icf.Sample(**sd) for sd in d["samples"]]
         ret.contigs = [icf.Contig(**sd) for sd in d["contigs"]]
         ret.filters = [icf.Filter(**sd) for sd in d["filters"]]
-        ret.fields = [ZarrColumnSpec(**sd) for sd in d["fields"]]
+        ret.fields = [ZarrArraySpec(**sd) for sd in d["fields"]]
         return ret
 
     @staticmethod
@@ -192,7 +227,7 @@ class VcfZarrSchema(core.JsonDataclass):
         )
 
         def spec_from_field(field, variable_name=None):
-            return ZarrColumnSpec.from_field(
+            return ZarrArraySpec.from_field(
                 field,
                 num_samples=n,
                 num_variants=m,
@@ -204,7 +239,7 @@ class VcfZarrSchema(core.JsonDataclass):
         def fixed_field_spec(
             name, dtype, vcf_field=None, shape=(m,), dimensions=("variants",)
         ):
-            return ZarrColumnSpec.new(
+            return ZarrArraySpec.new(
                 vcf_field=vcf_field,
                 name=name,
                 dtype=dtype,
@@ -230,13 +265,13 @@ class VcfZarrSchema(core.JsonDataclass):
             ),
             fixed_field_spec(
                 name="variant_allele",
-                dtype="
+                dtype="O",
                 shape=(m, max_alleles),
                 dimensions=["variants", "alleles"],
             ),
             fixed_field_spec(
                 name="variant_id",
-                dtype="
+                dtype="O",
             ),
             fixed_field_spec(
                 name="variant_id_mask",
@@ -267,7 +302,7 @@ class VcfZarrSchema(core.JsonDataclass):
             chunks = [variants_chunk_size, samples_chunk_size]
             dimensions = ["variants", "samples"]
             colspecs.append(
-                ZarrColumnSpec.new(
+                ZarrArraySpec.new(
                     vcf_field=None,
                     name="call_genotype_phased",
                     dtype="bool",
@@ -280,7 +315,7 @@ class VcfZarrSchema(core.JsonDataclass):
             shape += [ploidy]
             dimensions += ["ploidy"]
             colspecs.append(
-                ZarrColumnSpec.new(
+                ZarrArraySpec.new(
                     vcf_field=None,
                     name="call_genotype",
                     dtype=gt_field.smallest_dtype(),
@@ -291,7 +326,7 @@ class VcfZarrSchema(core.JsonDataclass):
             )
         )
         colspecs.append(
-            ZarrColumnSpec.new(
+            ZarrArraySpec.new(
                 vcf_field=None,
                 name="call_genotype_mask",
                 dtype="bool",
@@ -447,6 +482,7 @@ class VcfZarrWriter:
         self.icf = icf
         if self.path.exists():
             raise ValueError("Zarr path already exists")  # NEEDS TEST
+        schema.validate()
         partitions = VcfZarrPartition.generate_partitions(
             self.icf.num_records,
             schema.variants_chunk_size,
bio2zarr/vcf_utils.py
CHANGED

@@ -1,5 +1,6 @@
 import contextlib
 import gzip
+import logging
 import os
 import pathlib
 import struct
@@ -13,6 +14,8 @@ import numpy as np
 
 from bio2zarr.typing import PathType
 
+logger = logging.getLogger(__name__)
+
 CSI_EXTENSION = ".csi"
 TABIX_EXTENSION = ".tbi"
 TABIX_LINEAR_INDEX_INTERVAL_SIZE = 1 << 14  # 16kb interval size
@@ -411,6 +414,7 @@ class IndexedVcf(contextlib.AbstractContextManager):
             raise ValueError("Only .tbi or .csi indexes are supported.")
         self.vcf = cyvcf2.VCF(vcf_path)
         self.vcf.set_index(str(self.index_path))
+        logger.debug(f"Loaded {vcf_path} with index {self.index_path}")
         self.sequence_names = None
         if self.index_type == "csi":
             # Determine the file-type based on the "aux" field.
@@ -450,15 +454,16 @@ class IndexedVcf(contextlib.AbstractContextManager):
     def _filter_empty_and_refine(self, regions):
         """
         Return all regions in the specified list that have one or more records,
-        and refine the start coordinate of the region to be the actual first coord
+        and refine the start coordinate of the region to be the actual first coord.
+
+        Because this is a relatively expensive operation requiring seeking around
+        the file, we return the results as an iterator.
         """
-        ret = []
         for region in regions:
             var = next(self.variants(region), None)
             if var is not None:
                 region.start = var.POS
-
-        return ret
+                yield region
 
     def partition_into_regions(
         self,
@@ -490,7 +495,7 @@ class IndexedVcf(contextlib.AbstractContextManager):
             target_part_size_bytes = file_length // num_parts
         elif target_part_size_bytes is not None:
             num_parts = ceildiv(file_length, target_part_size_bytes)
-        part_lengths =
+        part_lengths = target_part_size_bytes * np.arange(num_parts, dtype=int)
         file_offsets, region_contig_indexes, region_positions = self.index.offsets()
 
         # Search the file offsets to find which indexes the part lengths fall at
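
`part_lengths` is now computed with numpy as evenly spaced byte offsets into the compressed file, and the following context line searches the index's file offsets for the nearest achievable split points. A sketch of that arithmetic with made-up numbers; the `np.searchsorted` step is an assumption about the elided search code, not a quote of it:

    import numpy as np

    file_length = 1_000_000  # hypothetical compressed file size in bytes
    num_parts = 4
    target_part_size_bytes = file_length // num_parts

    # Desired starting byte offset of each part: [0, 250000, 500000, 750000].
    part_lengths = target_part_size_bytes * np.arange(num_parts, dtype=int)

    # Hypothetical file offsets recovered from the .tbi/.csi index.
    file_offsets = np.array([0, 100_000, 240_000, 480_000, 760_000, 990_000])

    # Find where each target offset falls among the real index offsets.
    print(np.searchsorted(file_offsets, part_lengths))  # [0 3 4 4]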
{bio2zarr-0.0.10.dist-info → bio2zarr-0.1.1.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: bio2zarr
-Version: 0.0.10
+Version: 0.1.1
 Summary: Convert bioinformatics data to Zarr
 Author-email: sgkit Developers <project@sgkit.dev>
 License: Apache License
@@ -207,7 +207,7 @@ License: Apache License
 
 Project-URL: repository, https://github.com/sgkit-dev/bio2zarr
 Project-URL: documentation, https://sgkit-dev.github.io/bio2zarr/
-Classifier: Development Status ::
+Classifier: Development Status :: 4 - Beta
 Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Operating System :: POSIX
 Classifier: Operating System :: POSIX :: Linux
@@ -223,8 +223,8 @@ Classifier: Topic :: Scientific/Engineering
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: numpy
-Requires-Dist: zarr
+Requires-Dist: numpy <2
+Requires-Dist: zarr <3,>=2.17
 Requires-Dist: click
 Requires-Dist: tabulate
 Requires-Dist: tqdm
bio2zarr-0.1.1.dist-info/RECORD
ADDED

@@ -0,0 +1,20 @@
+bio2zarr/__init__.py,sha256=KiUGyya-9RHNcBldB8Lc1g3rP3CRjaL-5Olben0_6qA,49
+bio2zarr/__main__.py,sha256=wUKNNps8MAAEpMvLgVaI449eKyfr7Jpk2mMtYbNl4Ek,531
+bio2zarr/_version.py,sha256=PKIMyjdUACH4-ONvtunQCnYE2UhlMfp9su83e3HXl5E,411
+bio2zarr/cli.py,sha256=-6cU26n5f8CpBSj6RGC-fpNByjuJ0KxSFz85O9tITPg,14961
+bio2zarr/constants.py,sha256=QjbtFeBUZ-XqG35ZFIFj8EYrta_EwUkC2B5VGRP7oQs,425
+bio2zarr/core.py,sha256=Yd3Z6-mFI_neaxoWT6t6Tip0k1VZEcWbautHcJ0ep8Q,10486
+bio2zarr/plink.py,sha256=huXMlxQ5C3gPmOYCavA-QW7PzaV48I2lo80cQqHT1wY,6768
+bio2zarr/provenance.py,sha256=c_Z__QbWkLS0Rfa8D7LgEhtStng_zRMJX8comaDXIkw,142
+bio2zarr/typing.py,sha256=BYxhL16sKRoNxa6amf6AYxvt5Ke9qzv2np_kOT_zPJo,79
+bio2zarr/vcf_utils.py,sha256=R3bes-xYLZ4ekaxtqDd39YVV20qHmwei3XiIg1UFhRA,17996
+bio2zarr/vcf2zarr/__init__.py,sha256=0_of1iGzIDhvti49Gbcgd47oP63mKvouk9uLgKgiwoQ,791
+bio2zarr/vcf2zarr/icf.py,sha256=rIC35RIfkk5gEE8cOmBg1d9Pj-HkPivmGvYp4PrVN1Q,41589
+bio2zarr/vcf2zarr/vcz.py,sha256=2WE4RX5jZBiKDFEztNGYgXyrLRmVWeLKlFzh0GOzylk,38198
+bio2zarr/vcf2zarr/verification.py,sha256=6xcBy-cJLaQz2Qj2crffXFMjUG-H7z637Csxe5ZCmds,7898
+bio2zarr-0.1.1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+bio2zarr-0.1.1.dist-info/METADATA,sha256=RR9oM_5UYB5slsheIFzkIlRZt5du8eCb1_bMT_e7QjY,14854
+bio2zarr-0.1.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+bio2zarr-0.1.1.dist-info/entry_points.txt,sha256=3adtRrClMpjatEbiYqK5bm9WHA2PaJN5hK-Cs_zkpaI,97
+bio2zarr-0.1.1.dist-info/top_level.txt,sha256=ouAvp3u9N25eKrQbN8BCDLPcWWQLhtlgdHKu8AtEj5Q,9
+bio2zarr-0.1.1.dist-info/RECORD,,
bio2zarr-0.0.10.dist-info/RECORD
DELETED

@@ -1,20 +0,0 @@
-bio2zarr/__init__.py,sha256=KiUGyya-9RHNcBldB8Lc1g3rP3CRjaL-5Olben0_6qA,49
-bio2zarr/__main__.py,sha256=wUKNNps8MAAEpMvLgVaI449eKyfr7Jpk2mMtYbNl4Ek,531
-bio2zarr/_version.py,sha256=IBUgg21Ew0JtWj9Z6eN1r4zXlrNseQQNV4zo-nYzlEY,413
-bio2zarr/cli.py,sha256=Bv4k9V-5HJVVbqBMiYLWz5IQyILQ0bTicqgkQrr9hd0,13209
-bio2zarr/constants.py,sha256=QjbtFeBUZ-XqG35ZFIFj8EYrta_EwUkC2B5VGRP7oQs,425
-bio2zarr/core.py,sha256=3UFh7nKB3CbAIaJV3wgoqlkRy1M235C2vz7Iua73qwM,9234
-bio2zarr/plink.py,sha256=huXMlxQ5C3gPmOYCavA-QW7PzaV48I2lo80cQqHT1wY,6768
-bio2zarr/provenance.py,sha256=c_Z__QbWkLS0Rfa8D7LgEhtStng_zRMJX8comaDXIkw,142
-bio2zarr/typing.py,sha256=BYxhL16sKRoNxa6amf6AYxvt5Ke9qzv2np_kOT_zPJo,79
-bio2zarr/vcf_utils.py,sha256=b3Ti1AFXFlK7S1mu6jotqHPrujCIQXBKIHH8yIzd3zk,17781
-bio2zarr/vcf2zarr/__init__.py,sha256=0_of1iGzIDhvti49Gbcgd47oP63mKvouk9uLgKgiwoQ,791
-bio2zarr/vcf2zarr/icf.py,sha256=BJKPJDeqP8QtVz7ebm6NQQgvsba8H-JLsNEz4whOxsw,41559
-bio2zarr/vcf2zarr/vcz.py,sha256=sy8VVYuOntMuPs5gUwQx6IA39_Gl_YFW2h-CeRyQw2A,36865
-bio2zarr/vcf2zarr/verification.py,sha256=6xcBy-cJLaQz2Qj2crffXFMjUG-H7z637Csxe5ZCmds,7898
-bio2zarr-0.0.10.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-bio2zarr-0.0.10.dist-info/METADATA,sha256=7su1JbkFtR7eDjq2Rp5A8CjP9KnvwWaDS6bPH43Z2qI,14850
-bio2zarr-0.0.10.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-bio2zarr-0.0.10.dist-info/entry_points.txt,sha256=3adtRrClMpjatEbiYqK5bm9WHA2PaJN5hK-Cs_zkpaI,97
-bio2zarr-0.0.10.dist-info/top_level.txt,sha256=ouAvp3u9N25eKrQbN8BCDLPcWWQLhtlgdHKu8AtEj5Q,9
-bio2zarr-0.0.10.dist-info/RECORD,,
{bio2zarr-0.0.10.dist-info → bio2zarr-0.1.1.dist-info}/LICENSE
File without changes

{bio2zarr-0.0.10.dist-info → bio2zarr-0.1.1.dist-info}/WHEEL
File without changes

{bio2zarr-0.0.10.dist-info → bio2zarr-0.1.1.dist-info}/entry_points.txt
File without changes

{bio2zarr-0.0.10.dist-info → bio2zarr-0.1.1.dist-info}/top_level.txt
File without changes