bio2zarr-0.0.6-py3-none-any.whl → bio2zarr-0.0.10-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

bio2zarr/__main__.py CHANGED
@@ -14,9 +14,9 @@ def bio2zarr():
 # install individual commands as console scripts. However, this
 # is handy for development and for those whose PATHs aren't set
 # up in the right way.
-bio2zarr.add_command(cli.vcf2zarr)
+bio2zarr.add_command(cli.vcf2zarr_main)
 bio2zarr.add_command(cli.plink2zarr)
-bio2zarr.add_command(cli.vcf_partition)
+bio2zarr.add_command(cli.vcfpartition)

 if __name__ == "__main__":
     bio2zarr()
bio2zarr/_version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE

-__version__ = version = '0.0.6'
-__version_tuple__ = version_tuple = (0, 0, 6)
+__version__ = version = '0.0.10'
+__version_tuple__ = version_tuple = (0, 0, 10)
bio2zarr/cli.py CHANGED
@@ -5,11 +5,11 @@ import shutil

 import click
 import coloredlogs
-import humanfriendly
 import numcodecs
 import tabulate

-from . import plink, provenance, vcf, vcf_utils
+from . import plink, provenance, vcf2zarr, vcf_utils
+from .vcf2zarr import icf as icf_mod

 logger = logging.getLogger(__name__)

@@ -58,6 +58,20 @@ force = click.option(
     help="Force overwriting of existing directories",
 )

+one_based = click.option(
+    "--one-based",
+    is_flag=True,
+    flag_value=True,
+    help="Partition indexes are interpreted as one-based",
+)
+
+json = click.option(
+    "--json",
+    is_flag=True,
+    flag_value=True,
+    help="Output summary data in JSON format",
+)
+
 version = click.version_option(version=f"{provenance.__version__}")

 worker_processes = click.option(
@@ -154,11 +168,21 @@ def check_overwrite_dir(path, force):
 def get_compressor(cname):
     if cname is None:
         return None
-    config = vcf.ICF_DEFAULT_COMPRESSOR.get_config()
+    config = icf_mod.ICF_DEFAULT_COMPRESSOR.get_config()
     config["cname"] = cname
     return numcodecs.get_codec(config)


+def show_work_summary(work_summary, json):
+    if json:
+        output = work_summary.asjson()
+    else:
+        data = work_summary.asdict()
+        output = tabulate.tabulate(list(data.items()), tablefmt="plain")
+        # output = "\n".join(f"{k}\t{v}" for k, v in data.items())
+    click.echo(output)
+
+
 @click.command
 @vcfs
 @new_icf_path
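
For reference, a sketch (not package code; the summary fields are invented) of the two output modes show_work_summary provides:

import dataclasses
import json

import tabulate

@dataclasses.dataclass
class FakeWorkSummary:
    num_partitions: int = 16
    max_memory: int = 2**31

    def asdict(self):
        return dataclasses.asdict(self)

    def asjson(self):
        return json.dumps(self.asdict(), indent=4)

summary = FakeWorkSummary()
# Default: a plain two-column table, one row per field.
print(tabulate.tabulate(list(summary.asdict().items()), tablefmt="plain"))
# With --json: indented JSON, convenient for cluster submission scripts.
print(summary.asjson())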
@@ -175,7 +199,7 @@ def explode(
     """
     setup_logging(verbose)
     check_overwrite_dir(icf_path, force)
-    vcf.explode(
+    vcf2zarr.explode(
         icf_path,
         vcfs,
         worker_processes=worker_processes,
@@ -192,6 +216,7 @@ def explode(
 @force
 @column_chunk_size
 @compressor
+@json
 @verbose
 @worker_processes
 def dexplode_init(
@@ -201,6 +226,7 @@ def dexplode_init(
     force,
     column_chunk_size,
     compressor,
+    json,
     verbose,
     worker_processes,
 ):
@@ -210,7 +236,7 @@ def dexplode_init(
     """
     setup_logging(verbose)
     check_overwrite_dir(icf_path, force)
-    num_partitions = vcf.explode_init(
+    work_summary = vcf2zarr.explode_init(
         icf_path,
         vcfs,
         target_num_partitions=num_partitions,
@@ -219,21 +245,26 @@ def dexplode_init(
         compressor=get_compressor(compressor),
         show_progress=True,
     )
-    click.echo(num_partitions)
+    show_work_summary(work_summary, json)


 @click.command
 @icf_path
 @partition
 @verbose
-def dexplode_partition(icf_path, partition, verbose):
+@one_based
+def dexplode_partition(icf_path, partition, verbose, one_based):
     """
-    Convert a VCF partition to intermediate columnar format. Must be called *after*
-    the ICF path has been initialised with dexplode_init. Partition indexes must be
-    from 0 (inclusive) to the number of paritions returned by dexplode_init (exclusive).
+    Convert a VCF partition to intermediate columnar format. Must be called
+    after the ICF path has been initialised with dexplode_init. By default,
+    partition indexes are from 0 to the number of partitions N (returned by
+    dexplode_init), exclusive. If the --one-based option is specifed,
+    partition indexes are in the range 1 to N, inclusive.
     """
     setup_logging(verbose)
-    vcf.explode_partition(icf_path, partition, show_progress=False)
+    if one_based:
+        partition -= 1
+    vcf2zarr.explode_partition(icf_path, partition)


 @click.command
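
The new --one-based flag (here and in dencode-partition below) is a convenience for schedulers whose array-job indices start at 1; the command simply shifts the index before dispatching. A hypothetical helper showing the same arithmetic plus a range check:

# Hypothetical helper (not part of bio2zarr) showing what --one-based does
# before explode_partition/encode_partition is called.
def to_zero_based(index, num_partitions, one_based=False):
    if one_based:
        index -= 1
    if not 0 <= index < num_partitions:
        raise ValueError(f"partition index out of range: {index}")
    return index

assert to_zero_based(1, 16, one_based=True) == 0    # first partition
assert to_zero_based(16, 16, one_based=True) == 15  # last partition
assert to_zero_based(0, 16) == 0                    # default 0-based indexing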
@@ -244,7 +275,7 @@ def dexplode_finalise(icf_path, verbose):
     Final step for distributed conversion of VCF(s) to intermediate columnar format.
     """
     setup_logging(verbose)
-    vcf.explode_finalise(icf_path)
+    vcf2zarr.explode_finalise(icf_path)


 @click.command
@@ -255,7 +286,7 @@ def inspect(path, verbose):
     Inspect an intermediate columnar format or Zarr path.
     """
     setup_logging(verbose)
-    data = vcf.inspect(path)
+    data = vcf2zarr.inspect(path)
     click.echo(tabulate.tabulate(data, headers="keys"))


@@ -266,7 +297,7 @@ def mkschema(icf_path):
     Generate a schema for zarr encoding
     """
     stream = click.get_text_stream("stdout")
-    vcf.mkschema(icf_path, stream)
+    vcf2zarr.mkschema(icf_path, stream)


 @click.command
@@ -297,7 +328,7 @@ def encode(
     """
     setup_logging(verbose)
     check_overwrite_dir(zarr_path, force)
-    vcf.encode(
+    vcf2zarr.encode(
         icf_path,
         zarr_path,
         schema_path=schema,
@@ -319,6 +350,7 @@ def encode(
 @variants_chunk_size
 @samples_chunk_size
 @max_variant_chunks
+@json
 @verbose
 def dencode_init(
     icf_path,
@@ -329,6 +361,7 @@ def dencode_init(
     variants_chunk_size,
     samples_chunk_size,
     max_variant_chunks,
+    json,
     verbose,
 ):
     """
@@ -346,7 +379,7 @@ def dencode_init(
     """
     setup_logging(verbose)
     check_overwrite_dir(zarr_path, force)
-    num_partitions, max_memory = vcf.encode_init(
+    work_summary = vcf2zarr.encode_init(
         icf_path,
         zarr_path,
         target_num_partitions=num_partitions,
@@ -356,30 +389,25 @@ def dencode_init(
         max_variant_chunks=max_variant_chunks,
         show_progress=True,
     )
-    formatted_size = humanfriendly.format_size(max_memory, binary=True)
-    # NOTE adding the size to the stdout here so that users can parse it
-    # and use in their submission scripts. This is a first pass, and
-    # will most likely change as we see what works and doesn't.
-    # NOTE we probably want to format this as a table, which lists
-    # some other properties, line by line
-    # NOTE This size number is also not quite enough, you need a bit of
-    # headroom with it (probably 10% or so). We should include this.
-    click.echo(f"{num_partitions}\t{formatted_size}")
+    show_work_summary(work_summary, json)


 @click.command
 @zarr_path
 @partition
 @verbose
-def dencode_partition(zarr_path, partition, verbose):
-    """
-    Convert a partition from intermediate columnar format to VCF Zarr.
-    Must be called *after* the Zarr path has been initialised with dencode_init.
-    Partition indexes must be from 0 (inclusive) to the number of paritions
-    returned by dencode_init (exclusive).
+@one_based
+def dencode_partition(zarr_path, partition, verbose, one_based):
     """
+    Convert a partition from intermediate columnar format to VCF Zarr. Must be
+    called after the Zarr path has been initialised with dencode_init. By
+    default, partition indexes are from 0 to the number of partitions N
+    (returned by dencode_init), exclusive. If the --one-based option is
+    specifed, partition indexes are in the range 1 to N, inclusive."""
     setup_logging(verbose)
-    vcf.encode_partition(zarr_path, partition)
+    if one_based:
+        partition -= 1
+    vcf2zarr.encode_partition(zarr_path, partition)


 @click.command
390
418
  Final step for distributed conversion of ICF to VCF Zarr.
391
419
  """
392
420
  setup_logging(verbose)
393
- vcf.encode_finalise(zarr_path, show_progress=True)
421
+ vcf2zarr.encode_finalise(zarr_path, show_progress=True)
394
422
 
395
423
 
396
424
  @click.command(name="convert")
397
425
  @vcfs
398
426
  @new_zarr_path
427
+ @force
399
428
  @variants_chunk_size
400
429
  @samples_chunk_size
401
430
  @verbose
402
431
  @worker_processes
403
432
  def convert_vcf(
404
- vcfs, zarr_path, variants_chunk_size, samples_chunk_size, verbose, worker_processes
433
+ vcfs,
434
+ zarr_path,
435
+ force,
436
+ variants_chunk_size,
437
+ samples_chunk_size,
438
+ verbose,
439
+ worker_processes,
405
440
  ):
406
441
  """
407
442
  Convert input VCF(s) directly to vcfzarr (not recommended for large files).
408
443
  """
409
444
  setup_logging(verbose)
410
- vcf.convert(
445
+ check_overwrite_dir(zarr_path, force)
446
+ vcf2zarr.convert(
411
447
  vcfs,
412
448
  zarr_path,
413
449
  variants_chunk_size=variants_chunk_size,
@@ -418,71 +454,27 @@ def convert_vcf(


 @version
-@click.group(cls=NaturalOrderGroup)
-def vcf2zarr():
+@click.group(cls=NaturalOrderGroup, name="vcf2zarr")
+def vcf2zarr_main():
     """
     Convert VCF file(s) to the vcfzarr format.

-    The simplest usage is:
-
-    $ vcf2zarr convert [VCF_FILE] [ZARR_PATH]
-
-    This will convert the indexed VCF (or BCF) into the vcfzarr format in a single
-    step. As this writes the intermediate columnar format to a temporary directory,
-    we only recommend this approach for small files (< 1GB, say).
-
-    The recommended approach is to run the conversion in two passes, and
-    to keep the intermediate columnar format ("exploded") around to facilitate
-    experimentation with chunk sizes and compression settings:
-
-    \b
-    $ vcf2zarr explode [VCF_FILE_1] ... [VCF_FILE_N] [ICF_PATH]
-    $ vcf2zarr encode [ICF_PATH] [ZARR_PATH]
-
-    The inspect command provides a way to view contents of an exploded ICF
-    or Zarr:
-
-    $ vcf2zarr inspect [PATH]
-
-    This is useful when tweaking chunk sizes and compression settings to suit
-    your dataset, using the mkschema command and --schema option to encode:
-
-    \b
-    $ vcf2zarr mkschema [ICF_PATH] > schema.json
-    $ vcf2zarr encode [ICF_PATH] [ZARR_PATH] --schema schema.json
-
-    By editing the schema.json file you can drop columns that are not of interest
-    and edit column specific compression settings. The --max-variant-chunks option
-    to encode allows you to try out these options on small subsets, hopefully
-    arriving at settings with the desired balance of compression and query
-    performance.
-
-    ADVANCED USAGE
-
-    For very large datasets (terabyte scale) it may be necessary to distribute the
-    explode and encode steps across a cluster:
-
-    \b
-    $ vcf2zarr dexplode-init [VCF_FILE_1] ... [VCF_FILE_N] [ICF_PATH] [NUM_PARTITIONS]
-    $ vcf2zarr dexplode-partition [ICF_PATH] [PARTITION_INDEX]
-    $ vcf2zarr dexplode-finalise [ICF_PATH]
-
-    See the online documentation at [FIXME] for more details on distributed explode.
+    See the online documentation at https://sgkit-dev.github.io/bio2zarr/
+    for more information.
     """


-# TODO figure out how to get click to list these in the given order.
-vcf2zarr.add_command(convert_vcf)
-vcf2zarr.add_command(inspect)
-vcf2zarr.add_command(explode)
-vcf2zarr.add_command(mkschema)
-vcf2zarr.add_command(encode)
-vcf2zarr.add_command(dexplode_init)
-vcf2zarr.add_command(dexplode_partition)
-vcf2zarr.add_command(dexplode_finalise)
-vcf2zarr.add_command(dencode_init)
-vcf2zarr.add_command(dencode_partition)
-vcf2zarr.add_command(dencode_finalise)
+vcf2zarr_main.add_command(convert_vcf)
+vcf2zarr_main.add_command(inspect)
+vcf2zarr_main.add_command(explode)
+vcf2zarr_main.add_command(mkschema)
+vcf2zarr_main.add_command(encode)
+vcf2zarr_main.add_command(dexplode_init)
+vcf2zarr_main.add_command(dexplode_partition)
+vcf2zarr_main.add_command(dexplode_finalise)
+vcf2zarr_main.add_command(dencode_init)
+vcf2zarr_main.add_command(dencode_partition)
+vcf2zarr_main.add_command(dencode_finalise)


 @click.command(name="convert")
@@ -529,7 +521,7 @@ plink2zarr.add_command(convert_plink)
 @click.option("-i", "--index", type=click.Path(), default=None)
 @click.option("-n", "--num-parts", type=int, default=None)
 # @click.option("-s", "--part-size", type=int, default=None)
-def vcf_partition(vcf_path, index, num_parts):
+def vcfpartition(vcf_path, index, num_parts):
     indexed_vcf = vcf_utils.IndexedVcf(vcf_path, index)
     regions = indexed_vcf.partition_into_regions(num_parts=num_parts)
     click.echo("\n".join(map(str, regions)))
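
For reference, a sketch of driving the same partitioning API from Python, using only the IndexedVcf and partition_into_regions calls visible above (file names illustrative):

from bio2zarr import vcf_utils

# Any indexed VCF or BCF works; the index path is optional if it sits
# alongside the data file.
indexed_vcf = vcf_utils.IndexedVcf("sample.vcf.gz", "sample.vcf.gz.tbi")
regions = indexed_vcf.partition_into_regions(num_parts=4)
for region in regions:
    print(region)  # one genomic region per partition, as the CLI prints them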
bio2zarr/constants.py ADDED
@@ -0,0 +1,18 @@
+import numpy as np
+
+INT_MISSING = -1
+INT_FILL = -2
+STR_MISSING = "."
+STR_FILL = ""
+
+FLOAT32_MISSING, FLOAT32_FILL = np.array([0x7F800001, 0x7F800002], dtype=np.int32).view(
+    np.float32
+)
+FLOAT32_MISSING_AS_INT32, FLOAT32_FILL_AS_INT32 = np.array(
+    [0x7F800001, 0x7F800002], dtype=np.int32
+)
+
+
+MIN_INT_VALUE = np.iinfo(np.int32).min + 2
+VCF_INT_MISSING = np.iinfo(np.int32).min
+VCF_INT_FILL = np.iinfo(np.int32).min + 1
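
The two float sentinels above are distinct NaN bit patterns, so "missing" and "fill" are indistinguishable under ordinary float comparison and can only be told apart by reinterpreting their bits, which is why the *_AS_INT32 forms exist. A quick standalone check (not package code):

import numpy as np

# 0x7F800000 is +inf in IEEE 754 binary32; a non-zero mantissa makes a NaN,
# so these two sentinels are NaNs with distinct payloads.
sentinels = np.array([0x7F800001, 0x7F800002], dtype=np.int32).view(np.float32)
assert np.isnan(sentinels).all()  # both read as ordinary NaNs...
assert (sentinels.view(np.int32) == [0x7F800001, 0x7F800002]).all()  # ...yet differ bitwise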
bio2zarr/core.py CHANGED
@@ -1,11 +1,16 @@
 import concurrent.futures as cf
 import contextlib
 import dataclasses
+import json
 import logging
+import math
 import multiprocessing
+import os
+import os.path
 import threading
 import time

+import humanfriendly
 import numcodecs
 import numpy as np
 import tqdm
@@ -16,6 +21,17 @@ logger = logging.getLogger(__name__)
 numcodecs.blosc.use_threads = False


+def display_number(x):
+    ret = "n/a"
+    if math.isfinite(x):
+        ret = f"{x: 0.2g}"
+    return ret
+
+
+def display_size(n):
+    return humanfriendly.format_size(n, binary=True)
+
+
 def min_int_dtype(min_value, max_value):
     if min_value > max_value:
         raise ValueError("min_value must be <= max_value")
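
A quick illustration (not package code) of what these helpers produce:

import math

import humanfriendly

x = float("inf")
print("n/a" if not math.isfinite(x) else f"{x: 0.2g}")  # n/a
print(f"{0.12345: 0.2g}")  #  0.12 (two significant figures, space for sign)
print(humanfriendly.format_size(2**31, binary=True))  # 2 GiB (IEC units)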
@@ -45,6 +61,22 @@ def chunk_aligned_slices(z, n, max_chunks=None):
     return slices


+def du(path):
+    """
+    Return the total bytes stored at this path.
+    """
+    total = os.path.getsize(path)
+    # pathlib walk method doesn't exist until 3.12 :(
+    for root, dirs, files in os.walk(path):
+        for lst in [dirs, files]:
+            for name in lst:
+                fullname = os.path.join(root, name)
+                size = os.path.getsize(fullname)
+                total += size
+    logger.debug(f"du({path}) = {total}")
+    return total
+
+
 class SynchronousExecutor(cf.Executor):
     def submit(self, fn, /, *args, **kwargs):
         future = cf.Future()
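
As the comment in du notes, pathlib only gains a walk method in Python 3.12; on 3.12+ an equivalent could look like this (an alternative sketch, not what the package ships):

import pathlib

def du_pathlib(path):
    # Python >= 3.12 only: pathlib.Path.walk mirrors os.walk.
    root = pathlib.Path(path)
    total = root.stat().st_size
    for dirpath, dirnames, filenames in root.walk():
        for name in dirnames + filenames:
            total += (dirpath / name).stat().st_size
    return total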
@@ -110,7 +142,6 @@ class BufferedArray:
         sync_flush_2d_array(
             self.buff[: self.buffer_row], self.array, self.array_offset
         )
-        # FIXME the array.name doesn't seem to be working here for some reason
         logger.debug(
             f"Flushed <{self.array.name} {self.array.shape} "
             f"{self.array.dtype}> "
@@ -156,7 +187,7 @@ class ProgressConfig:
 # progressable thing happening per source process. This is
 # probably fine in practise, but there could be corner cases
 # where it's not. Something to watch out for.
-_progress_counter = multiprocessing.Value("Q", 0)
+_progress_counter = None


 def update_progress(inc):
@@ -170,23 +201,30 @@ def get_progress():
     return val


-def set_progress(value):
-    with _progress_counter.get_lock():
-        _progress_counter.value = value
+def setup_progress_counter(counter):
+    global _progress_counter
+    _progress_counter = counter


 class ParallelWorkManager(contextlib.AbstractContextManager):
     def __init__(self, worker_processes=1, progress_config=None):
+        # Need to specify this explicitly to suppport Macs and
+        # for future proofing.
+        ctx = multiprocessing.get_context("spawn")
+        global _progress_counter
+        _progress_counter = ctx.Value("Q", 0)
         if worker_processes <= 0:
             # NOTE: this is only for testing, not for production use!
             self.executor = SynchronousExecutor()
         else:
             self.executor = cf.ProcessPoolExecutor(
                 max_workers=worker_processes,
+                mp_context=ctx,
+                initializer=setup_progress_counter,
+                initargs=(_progress_counter,),
             )
         self.futures = set()

-        set_progress(0)
         if progress_config is None:
             progress_config = ProgressConfig()
         self.progress_config = progress_config
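
A note on the change above: with the "spawn" start method (the default on macOS), worker processes re-import the module rather than inheriting the parent's memory, so a module-level multiprocessing.Value is never shared automatically. Passing the counter through the pool's initializer, as this diff does, is the standard pattern. A minimal standalone sketch:

import concurrent.futures as cf
import multiprocessing

_counter = None

def init_worker(counter):
    # Runs once in every spawned worker; stores the shared counter globally.
    global _counter
    _counter = counter

def work(n):
    with _counter.get_lock():
        _counter.value += n

if __name__ == "__main__":
    ctx = multiprocessing.get_context("spawn")
    counter = ctx.Value("Q", 0)
    with cf.ProcessPoolExecutor(
        max_workers=2,
        mp_context=ctx,
        initializer=init_worker,
        initargs=(counter,),
    ) as executor:
        list(executor.map(work, [1, 2, 3]))
    print(counter.value)  # 6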
@@ -227,16 +265,6 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
         self.futures.add(future)
         return future

-    def wait_for_completed(self, timeout=None):
-        done, not_done = cf.wait(self.futures, timeout, cf.FIRST_COMPLETED)
-        for future in done:
-            exception = future.exception()
-            # TODO do the check for BrokenProcessPool here
-            if exception is not None:
-                raise exception
-        self.futures = not_done
-        return done
-
     def results_as_completed(self):
         for future in cf.as_completed(self.futures):
             yield future.result()
@@ -260,3 +288,11 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
             self._update_progress()
         self.progress_bar.close()
         return False
+
+
+class JsonDataclass:
+    def asdict(self):
+        return dataclasses.asdict(self)
+
+    def asjson(self):
+        return json.dumps(self.asdict(), indent=4)
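
Hypothetical usage of the JsonDataclass mixin (the class name below is invented): work-summary objects mixing it in gain the asdict/asjson pair that cli.show_work_summary consumes.

import dataclasses

from bio2zarr.core import JsonDataclass

@dataclasses.dataclass
class ExampleSummary(JsonDataclass):  # invented name, for illustration only
    num_partitions: int
    max_memory: int

print(ExampleSummary(8, 1024).asjson())
# {
#     "num_partitions": 8,
#     "max_memory": 1024
# }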
bio2zarr/vcf2zarr/__init__.py ADDED
@@ -0,0 +1,38 @@
+from .icf import (
+    IntermediateColumnarFormat,
+    explode,
+    explode_finalise,
+    explode_init,
+    explode_partition,
+)
+from .vcz import (
+    VcfZarrSchema,
+    convert,
+    encode,
+    encode_finalise,
+    encode_init,
+    encode_partition,
+    inspect,
+    mkschema,
+)
+from .verification import verify
+
+# NOTE some of these aren't intended to be part of the external
+# interface (like IntermediateColumnarFormat), but putting
+# them into the list to keep the lint nagging under control
+__all__ = [
+    "IntermediateColumnarFormat",
+    "explode",
+    "explode_finalise",
+    "explode_init",
+    "explode_partition",
+    "VcfZarrSchema",
+    "convert",
+    "encode",
+    "encode_finalise",
+    "encode_init",
+    "encode_partition",
+    "inspect",
+    "mkschema",
+    "verify",
+]
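
For orientation, the functions re-exported here are the same ones the CLI dispatches to. A sketch of the two-step conversion through the Python API, with illustrative paths and the argument order used in cli.py above:

from bio2zarr import vcf2zarr

# Two-step conversion, mirroring "vcf2zarr explode" then "vcf2zarr encode".
vcf2zarr.explode("sample.icf", ["sample.vcf.gz"], worker_processes=4)
vcf2zarr.encode("sample.icf", "sample.vcz")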