bio2zarr 0.0.9__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of bio2zarr might be problematic. Click here for more details.

bio2zarr/__main__.py CHANGED
@@ -14,9 +14,9 @@ def bio2zarr():
14
14
  # install individual commands as console scripts. However, this
15
15
  # is handy for development and for those whose PATHs aren't set
16
16
  # up in the right way.
17
- bio2zarr.add_command(cli.vcf2zarr)
17
+ bio2zarr.add_command(cli.vcf2zarr_main)
18
18
  bio2zarr.add_command(cli.plink2zarr)
19
- bio2zarr.add_command(cli.vcf_partition)
19
+ bio2zarr.add_command(cli.vcfpartition)
20
20
 
21
21
  if __name__ == "__main__":
22
22
  bio2zarr()
bio2zarr/_version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '0.0.9'
16
- __version_tuple__ = version_tuple = (0, 0, 9)
15
+ __version__ = version = '0.1.0'
16
+ __version_tuple__ = version_tuple = (0, 1, 0)
bio2zarr/cli.py CHANGED
@@ -5,11 +5,11 @@ import shutil
5
5
 
6
6
  import click
7
7
  import coloredlogs
8
- import humanfriendly
9
8
  import numcodecs
10
9
  import tabulate
11
10
 
12
- from . import plink, provenance, vcf, vcf_utils
11
+ from . import plink, provenance, vcf2zarr, vcf_utils
12
+ from .vcf2zarr import icf as icf_mod
13
13
 
14
14
  logger = logging.getLogger(__name__)
15
15
 
@@ -44,7 +44,13 @@ zarr_path = click.argument(
44
44
  "zarr_path", type=click.Path(exists=True, file_okay=False, dir_okay=True)
45
45
  )
46
46
 
47
- num_partitions = click.argument("num_partitions", type=click.IntRange(min=1))
47
+ num_partitions = click.option(
48
+ "-n",
49
+ "--num-partitions",
50
+ type=click.IntRange(min=1),
51
+ default=None,
52
+ help="Target number of partitions to split into",
53
+ )
48
54
 
49
55
  partition = click.argument("partition", type=click.IntRange(min=0))
50
56
 
@@ -58,6 +64,27 @@ force = click.option(
58
64
  help="Force overwriting of existing directories",
59
65
  )
60
66
 
67
+ progress = click.option(
68
+ "-P /-Q",
69
+ "--progress/--no-progress",
70
+ default=True,
71
+ help="Show progress bars (default: show)",
72
+ )
73
+
74
+ one_based = click.option(
75
+ "--one-based",
76
+ is_flag=True,
77
+ flag_value=True,
78
+ help="Partition indexes are interpreted as one-based",
79
+ )
80
+
81
+ json = click.option(
82
+ "--json",
83
+ is_flag=True,
84
+ flag_value=True,
85
+ help="Output summary data in JSON format",
86
+ )
87
+
61
88
  version = click.version_option(version=f"{provenance.__version__}")
62
89
 
63
90
  worker_processes = click.option(
@@ -151,14 +178,33 @@ def check_overwrite_dir(path, force):
151
178
  shutil.rmtree(tmp_delete_path)
152
179
 
153
180
 
181
+ def check_partitions(num_partitions):
182
+ if num_partitions is None:
183
+ raise click.UsageError(
184
+ "-n/--num-partitions must currently be specified. Future versions "
185
+ "will provide reasonable defaults or other means of specifying "
186
+ "partitions."
187
+ )
188
+
189
+
154
190
  def get_compressor(cname):
155
191
  if cname is None:
156
192
  return None
157
- config = vcf.ICF_DEFAULT_COMPRESSOR.get_config()
193
+ config = icf_mod.ICF_DEFAULT_COMPRESSOR.get_config()
158
194
  config["cname"] = cname
159
195
  return numcodecs.get_codec(config)
160
196
 
161
197
 
198
+ def show_work_summary(work_summary, json):
199
+ if json:
200
+ output = work_summary.asjson()
201
+ else:
202
+ data = work_summary.asdict()
203
+ output = tabulate.tabulate(list(data.items()), tablefmt="plain")
204
+ # output = "\n".join(f"{k}\t{v}" for k, v in data.items())
205
+ click.echo(output)
206
+
207
+
162
208
  @click.command
163
209
  @vcfs
164
210
  @new_icf_path
@@ -166,22 +212,30 @@ def get_compressor(cname):
166
212
  @verbose
167
213
  @column_chunk_size
168
214
  @compressor
215
+ @progress
169
216
  @worker_processes
170
217
  def explode(
171
- vcfs, icf_path, force, verbose, column_chunk_size, compressor, worker_processes
218
+ vcfs,
219
+ icf_path,
220
+ force,
221
+ verbose,
222
+ column_chunk_size,
223
+ compressor,
224
+ progress,
225
+ worker_processes,
172
226
  ):
173
227
  """
174
228
  Convert VCF(s) to intermediate columnar format
175
229
  """
176
230
  setup_logging(verbose)
177
231
  check_overwrite_dir(icf_path, force)
178
- vcf.explode(
232
+ vcf2zarr.explode(
179
233
  icf_path,
180
234
  vcfs,
181
235
  worker_processes=worker_processes,
182
236
  column_chunk_size=column_chunk_size,
183
237
  compressor=get_compressor(compressor),
184
- show_progress=True,
238
+ show_progress=progress,
185
239
  )
186
240
 
187
241
 
@@ -192,7 +246,9 @@ def explode(
192
246
  @force
193
247
  @column_chunk_size
194
248
  @compressor
249
+ @json
195
250
  @verbose
251
+ @progress
196
252
  @worker_processes
197
253
  def dexplode_init(
198
254
  vcfs,
@@ -201,39 +257,47 @@ def dexplode_init(
201
257
  force,
202
258
  column_chunk_size,
203
259
  compressor,
260
+ json,
204
261
  verbose,
262
+ progress,
205
263
  worker_processes,
206
264
  ):
207
265
  """
208
266
  Initial step for distributed conversion of VCF(s) to intermediate columnar format
209
- over the requested number of paritions.
267
+ over some number of partitions.
210
268
  """
211
269
  setup_logging(verbose)
212
270
  check_overwrite_dir(icf_path, force)
213
- num_partitions = vcf.explode_init(
271
+ check_partitions(num_partitions)
272
+ work_summary = vcf2zarr.explode_init(
214
273
  icf_path,
215
274
  vcfs,
216
275
  target_num_partitions=num_partitions,
217
276
  column_chunk_size=column_chunk_size,
218
277
  worker_processes=worker_processes,
219
278
  compressor=get_compressor(compressor),
220
- show_progress=True,
279
+ show_progress=progress,
221
280
  )
222
- click.echo(num_partitions)
281
+ show_work_summary(work_summary, json)
223
282
 
224
283
 
225
284
  @click.command
226
285
  @icf_path
227
286
  @partition
228
287
  @verbose
229
- def dexplode_partition(icf_path, partition, verbose):
288
+ @one_based
289
+ def dexplode_partition(icf_path, partition, verbose, one_based):
230
290
  """
231
- Convert a VCF partition to intermediate columnar format. Must be called *after*
232
- the ICF path has been initialised with dexplode_init. Partition indexes must be
233
- from 0 (inclusive) to the number of paritions returned by dexplode_init (exclusive).
291
+ Convert a VCF partition to intermediate columnar format. Must be called
292
+ after the ICF path has been initialised with dexplode_init. By default,
293
+ partition indexes are from 0 to the number of partitions N (returned by
294
+ dexplode_init), exclusive. If the --one-based option is specified,
295
+ partition indexes are in the range 1 to N, inclusive.
234
296
  """
235
297
  setup_logging(verbose)
236
- vcf.explode_partition(icf_path, partition)
298
+ if one_based:
299
+ partition -= 1
300
+ vcf2zarr.explode_partition(icf_path, partition)
237
301
 
238
302
 
239
303
  @click.command
@@ -244,7 +308,7 @@ def dexplode_finalise(icf_path, verbose):
244
308
  Final step for distributed conversion of VCF(s) to intermediate columnar format.
245
309
  """
246
310
  setup_logging(verbose)
247
- vcf.explode_finalise(icf_path)
311
+ vcf2zarr.explode_finalise(icf_path)
248
312
 
249
313
 
250
314
  @click.command
@@ -255,7 +319,7 @@ def inspect(path, verbose):
255
319
  Inspect an intermediate columnar format or Zarr path.
256
320
  """
257
321
  setup_logging(verbose)
258
- data = vcf.inspect(path)
322
+ data = vcf2zarr.inspect(path)
259
323
  click.echo(tabulate.tabulate(data, headers="keys"))
260
324
 
261
325
 
@@ -266,7 +330,7 @@ def mkschema(icf_path):
266
330
  Generate a schema for zarr encoding
267
331
  """
268
332
  stream = click.get_text_stream("stdout")
269
- vcf.mkschema(icf_path, stream)
333
+ vcf2zarr.mkschema(icf_path, stream)
270
334
 
271
335
 
272
336
  @click.command
@@ -279,6 +343,7 @@ def mkschema(icf_path):
279
343
  @samples_chunk_size
280
344
  @max_variant_chunks
281
345
  @max_memory
346
+ @progress
282
347
  @worker_processes
283
348
  def encode(
284
349
  icf_path,
@@ -290,6 +355,7 @@ def encode(
290
355
  samples_chunk_size,
291
356
  max_variant_chunks,
292
357
  max_memory,
358
+ progress,
293
359
  worker_processes,
294
360
  ):
295
361
  """
@@ -297,7 +363,7 @@ def encode(
297
363
  """
298
364
  setup_logging(verbose)
299
365
  check_overwrite_dir(zarr_path, force)
300
- vcf.encode(
366
+ vcf2zarr.encode(
301
367
  icf_path,
302
368
  zarr_path,
303
369
  schema_path=schema,
@@ -306,7 +372,7 @@ def encode(
306
372
  max_variant_chunks=max_variant_chunks,
307
373
  worker_processes=worker_processes,
308
374
  max_memory=max_memory,
309
- show_progress=True,
375
+ show_progress=progress,
310
376
  )
311
377
 
312
378
 
@@ -319,6 +385,8 @@ def encode(
319
385
  @variants_chunk_size
320
386
  @samples_chunk_size
321
387
  @max_variant_chunks
388
+ @json
389
+ @progress
322
390
  @verbose
323
391
  def dencode_init(
324
392
  icf_path,
@@ -329,12 +397,14 @@ def dencode_init(
329
397
  variants_chunk_size,
330
398
  samples_chunk_size,
331
399
  max_variant_chunks,
400
+ json,
401
+ progress,
332
402
  verbose,
333
403
  ):
334
404
  """
335
405
  Initialise conversion of intermediate format to VCF Zarr. This will
336
406
  set up the specified ZARR_PATH to perform this conversion over
337
- NUM_PARTITIONS.
407
+ some number of partitions.
338
408
 
339
409
  The output of this command is the actual number of partitions generated
340
410
  (which may be less than the requested number, if there is not sufficient
@@ -346,7 +416,8 @@ def dencode_init(
346
416
  """
347
417
  setup_logging(verbose)
348
418
  check_overwrite_dir(zarr_path, force)
349
- num_partitions, max_memory = vcf.encode_init(
419
+ check_partitions(num_partitions)
420
+ work_summary = vcf2zarr.encode_init(
350
421
  icf_path,
351
422
  zarr_path,
352
423
  target_num_partitions=num_partitions,
@@ -354,141 +425,104 @@ def dencode_init(
354
425
  variants_chunk_size=variants_chunk_size,
355
426
  samples_chunk_size=samples_chunk_size,
356
427
  max_variant_chunks=max_variant_chunks,
357
- show_progress=True,
428
+ show_progress=progress,
358
429
  )
359
- formatted_size = humanfriendly.format_size(max_memory, binary=True)
360
- # NOTE adding the size to the stdout here so that users can parse it
361
- # and use in their submission scripts. This is a first pass, and
362
- # will most likely change as we see what works and doesn't.
363
- # NOTE we probably want to format this as a table, which lists
364
- # some other properties, line by line
365
- # NOTE This size number is also not quite enough, you need a bit of
366
- # headroom with it (probably 10% or so). We should include this.
367
- click.echo(f"{num_partitions}\t{formatted_size}")
430
+ show_work_summary(work_summary, json)
368
431
 
369
432
 
370
433
  @click.command
371
434
  @zarr_path
372
435
  @partition
373
436
  @verbose
374
- def dencode_partition(zarr_path, partition, verbose):
375
- """
376
- Convert a partition from intermediate columnar format to VCF Zarr.
377
- Must be called *after* the Zarr path has been initialised with dencode_init.
378
- Partition indexes must be from 0 (inclusive) to the number of paritions
379
- returned by dencode_init (exclusive).
437
+ @one_based
438
+ def dencode_partition(zarr_path, partition, verbose, one_based):
380
439
  """
440
+ Convert a partition from intermediate columnar format to VCF Zarr. Must be
441
+ called after the Zarr path has been initialised with dencode_init. By
442
+ default, partition indexes are from 0 to the number of partitions N
443
+ (returned by dencode_init), exclusive. If the --one-based option is
444
+ specifed, partition indexes are in the range 1 to N, inclusive."""
381
445
  setup_logging(verbose)
382
- vcf.encode_partition(zarr_path, partition)
446
+ if one_based:
447
+ partition -= 1
448
+ vcf2zarr.encode_partition(zarr_path, partition)
383
449
 
384
450
 
385
451
  @click.command
386
452
  @zarr_path
387
453
  @verbose
388
- def dencode_finalise(zarr_path, verbose):
454
+ @progress
455
+ def dencode_finalise(zarr_path, verbose, progress):
389
456
  """
390
457
  Final step for distributed conversion of ICF to VCF Zarr.
391
458
  """
392
459
  setup_logging(verbose)
393
- vcf.encode_finalise(zarr_path, show_progress=True)
460
+ vcf2zarr.encode_finalise(zarr_path, show_progress=progress)
394
461
 
395
462
 
396
463
  @click.command(name="convert")
397
464
  @vcfs
398
465
  @new_zarr_path
466
+ @force
399
467
  @variants_chunk_size
400
468
  @samples_chunk_size
401
469
  @verbose
470
+ @progress
402
471
  @worker_processes
403
472
  def convert_vcf(
404
- vcfs, zarr_path, variants_chunk_size, samples_chunk_size, verbose, worker_processes
473
+ vcfs,
474
+ zarr_path,
475
+ force,
476
+ variants_chunk_size,
477
+ samples_chunk_size,
478
+ verbose,
479
+ progress,
480
+ worker_processes,
405
481
  ):
406
482
  """
407
483
  Convert input VCF(s) directly to vcfzarr (not recommended for large files).
408
484
  """
409
485
  setup_logging(verbose)
410
- vcf.convert(
486
+ check_overwrite_dir(zarr_path, force)
487
+ vcf2zarr.convert(
411
488
  vcfs,
412
489
  zarr_path,
413
490
  variants_chunk_size=variants_chunk_size,
414
491
  samples_chunk_size=samples_chunk_size,
415
- show_progress=True,
492
+ show_progress=progress,
416
493
  worker_processes=worker_processes,
417
494
  )
418
495
 
419
496
 
420
497
  @version
421
- @click.group(cls=NaturalOrderGroup)
422
- def vcf2zarr():
498
+ @click.group(cls=NaturalOrderGroup, name="vcf2zarr")
499
+ def vcf2zarr_main():
423
500
  """
424
501
  Convert VCF file(s) to the vcfzarr format.
425
502
 
426
- The simplest usage is:
427
-
428
- $ vcf2zarr convert [VCF_FILE] [ZARR_PATH]
429
-
430
- This will convert the indexed VCF (or BCF) into the vcfzarr format in a single
431
- step. As this writes the intermediate columnar format to a temporary directory,
432
- we only recommend this approach for small files (< 1GB, say).
433
-
434
- The recommended approach is to run the conversion in two passes, and
435
- to keep the intermediate columnar format ("exploded") around to facilitate
436
- experimentation with chunk sizes and compression settings:
437
-
438
- \b
439
- $ vcf2zarr explode [VCF_FILE_1] ... [VCF_FILE_N] [ICF_PATH]
440
- $ vcf2zarr encode [ICF_PATH] [ZARR_PATH]
441
-
442
- The inspect command provides a way to view contents of an exploded ICF
443
- or Zarr:
444
-
445
- $ vcf2zarr inspect [PATH]
446
-
447
- This is useful when tweaking chunk sizes and compression settings to suit
448
- your dataset, using the mkschema command and --schema option to encode:
449
-
450
- \b
451
- $ vcf2zarr mkschema [ICF_PATH] > schema.json
452
- $ vcf2zarr encode [ICF_PATH] [ZARR_PATH] --schema schema.json
453
-
454
- By editing the schema.json file you can drop columns that are not of interest
455
- and edit column specific compression settings. The --max-variant-chunks option
456
- to encode allows you to try out these options on small subsets, hopefully
457
- arriving at settings with the desired balance of compression and query
458
- performance.
459
-
460
- ADVANCED USAGE
461
-
462
- For very large datasets (terabyte scale) it may be necessary to distribute the
463
- explode and encode steps across a cluster:
464
-
465
- \b
466
- $ vcf2zarr dexplode-init [VCF_FILE_1] ... [VCF_FILE_N] [ICF_PATH] [NUM_PARTITIONS]
467
- $ vcf2zarr dexplode-partition [ICF_PATH] [PARTITION_INDEX]
468
- $ vcf2zarr dexplode-finalise [ICF_PATH]
469
-
470
- See the online documentation at [FIXME] for more details on distributed explode.
503
+ See the online documentation at https://sgkit-dev.github.io/bio2zarr/
504
+ for more information.
471
505
  """
472
506
 
473
507
 
474
- # TODO figure out how to get click to list these in the given order.
475
- vcf2zarr.add_command(convert_vcf)
476
- vcf2zarr.add_command(inspect)
477
- vcf2zarr.add_command(explode)
478
- vcf2zarr.add_command(mkschema)
479
- vcf2zarr.add_command(encode)
480
- vcf2zarr.add_command(dexplode_init)
481
- vcf2zarr.add_command(dexplode_partition)
482
- vcf2zarr.add_command(dexplode_finalise)
483
- vcf2zarr.add_command(dencode_init)
484
- vcf2zarr.add_command(dencode_partition)
485
- vcf2zarr.add_command(dencode_finalise)
508
+ vcf2zarr_main.add_command(convert_vcf)
509
+ vcf2zarr_main.add_command(inspect)
510
+ vcf2zarr_main.add_command(explode)
511
+ vcf2zarr_main.add_command(mkschema)
512
+ vcf2zarr_main.add_command(encode)
513
+ vcf2zarr_main.add_command(dexplode_init)
514
+ vcf2zarr_main.add_command(dexplode_partition)
515
+ vcf2zarr_main.add_command(dexplode_finalise)
516
+ vcf2zarr_main.add_command(dencode_init)
517
+ vcf2zarr_main.add_command(dencode_partition)
518
+ vcf2zarr_main.add_command(dencode_finalise)
486
519
 
487
520
 
488
521
  @click.command(name="convert")
489
522
  @click.argument("in_path", type=click.Path())
490
523
  @click.argument("zarr_path", type=click.Path())
491
524
  @worker_processes
525
+ @progress
492
526
  @verbose
493
527
  @variants_chunk_size
494
528
  @samples_chunk_size
@@ -497,6 +531,7 @@ def convert_plink(
497
531
  zarr_path,
498
532
  verbose,
499
533
  worker_processes,
534
+ progress,
500
535
  variants_chunk_size,
501
536
  samples_chunk_size,
502
537
  ):
@@ -507,7 +542,7 @@ def convert_plink(
507
542
  plink.convert(
508
543
  in_path,
509
544
  zarr_path,
510
- show_progress=True,
545
+ show_progress=progress,
511
546
  worker_processes=worker_processes,
512
547
  samples_chunk_size=samples_chunk_size,
513
548
  variants_chunk_size=variants_chunk_size,
@@ -525,11 +560,39 @@ plink2zarr.add_command(convert_plink)
525
560
 
526
561
  @click.command
527
562
  @version
528
- @click.argument("vcf_path", type=click.Path())
529
- @click.option("-i", "--index", type=click.Path(), default=None)
530
- @click.option("-n", "--num-parts", type=int, default=None)
531
- # @click.option("-s", "--part-size", type=int, default=None)
532
- def vcf_partition(vcf_path, index, num_parts):
533
- indexed_vcf = vcf_utils.IndexedVcf(vcf_path, index)
534
- regions = indexed_vcf.partition_into_regions(num_parts=num_parts)
535
- click.echo("\n".join(map(str, regions)))
563
+ @click.argument("vcf_path", type=click.Path(exists=True, dir_okay=False))
564
+ @verbose
565
+ @num_partitions
566
+ @click.option(
567
+ "-s",
568
+ "--partition-size",
569
+ type=str,
570
+ default=None,
571
+ help="Target (compressed) size of VCF partitions, e.g. 100KB, 10MiB, 1G.",
572
+ )
573
+ def vcfpartition(vcf_path, verbose, num_partitions, partition_size):
574
+ """
575
+ Output bcftools region strings that partition an indexed VCF/BCF file
576
+ into either an approximate number of parts (-n), or parts of approximately
577
+ a given size (-s). One of -n or -s must be supplied.
578
+
579
+ Note that both the number of partitions and sizes are a target, and the
580
+ returned number of partitions may not exactly correspond. In particular,
581
+ there is a maximum level of granularity determined by the associated index
582
+ which cannot be exceeded.
583
+
584
+ Note also that the partitions returned may vary considerably in the number
585
+ of records that they contain.
586
+ """
587
+ setup_logging(verbose)
588
+ if num_partitions is None and partition_size is None:
589
+ raise click.UsageError(
590
+ "Either --num-partitions or --partition-size must be specified"
591
+ )
592
+
593
+ indexed_vcf = vcf_utils.IndexedVcf(vcf_path)
594
+ regions = indexed_vcf.partition_into_regions(
595
+ num_parts=num_partitions, target_part_size=partition_size
596
+ )
597
+ for region in regions:
598
+ click.echo(f"{region}\t{vcf_path}")
bio2zarr/constants.py ADDED
@@ -0,0 +1,18 @@
1
+ import numpy as np
2
+
3
+ INT_MISSING = -1
4
+ INT_FILL = -2
5
+ STR_MISSING = "."
6
+ STR_FILL = ""
7
+
8
+ FLOAT32_MISSING, FLOAT32_FILL = np.array([0x7F800001, 0x7F800002], dtype=np.int32).view(
9
+ np.float32
10
+ )
11
+ FLOAT32_MISSING_AS_INT32, FLOAT32_FILL_AS_INT32 = np.array(
12
+ [0x7F800001, 0x7F800002], dtype=np.int32
13
+ )
14
+
15
+
16
+ MIN_INT_VALUE = np.iinfo(np.int32).min + 2
17
+ VCF_INT_MISSING = np.iinfo(np.int32).min
18
+ VCF_INT_FILL = np.iinfo(np.int32).min + 1