bio2zarr 0.0.4__py3-none-any.whl → 0.0.6__py3-none-any.whl

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Potentially problematic release.


This version of bio2zarr might be problematic. Further details were linked from the original page; the link target is not preserved in this text.

bio2zarr/__init__.py CHANGED
@@ -1 +1 @@
1
- from . provenance import __version__
1
+ from .provenance import __version__ # noqa F401
bio2zarr/__main__.py CHANGED
@@ -2,11 +2,13 @@ import click
2
2
 
3
3
  from . import cli
4
4
 
5
+
5
6
  @cli.version
6
7
  @click.group()
7
8
  def bio2zarr():
8
9
  pass
9
10
 
11
+
10
12
  # Provide a single top-level interface to all of the functionality.
11
13
  # This probably isn't the recommended way of interacting, as we
12
14
  # install individual commands as console scripts. However, this
bio2zarr/_version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '0.0.4'
16
- __version_tuple__ = version_tuple = (0, 0, 4)
15
+ __version__ = version = '0.0.6'
16
+ __version_tuple__ = version_tuple = (0, 0, 6)
bio2zarr/cli.py CHANGED
@@ -4,15 +4,12 @@ import pathlib
4
4
  import shutil
5
5
 
6
6
  import click
7
- import tabulate
8
7
  import coloredlogs
8
+ import humanfriendly
9
9
  import numcodecs
10
+ import tabulate
10
11
 
11
- from . import vcf
12
- from . import vcf_utils
13
- from . import plink
14
- from . import provenance
15
-
12
+ from . import plink, provenance, vcf, vcf_utils
16
13
 
17
14
  logger = logging.getLogger(__name__)
18
15
 
@@ -43,6 +40,14 @@ new_zarr_path = click.argument(
43
40
  "zarr_path", type=click.Path(file_okay=False, dir_okay=True)
44
41
  )
45
42
 
43
+ zarr_path = click.argument(
44
+ "zarr_path", type=click.Path(exists=True, file_okay=False, dir_okay=True)
45
+ )
46
+
47
+ num_partitions = click.argument("num_partitions", type=click.IntRange(min=1))
48
+
49
+ partition = click.argument("partition", type=click.IntRange(min=0))
50
+
46
51
  verbose = click.option("-v", "--verbose", count=True, help="Increase verbosity")
47
52
 
48
53
  force = click.option(
@@ -75,7 +80,7 @@ compressor = click.option(
75
80
  "--compressor",
76
81
  type=click.Choice(["lz4", "zstd"]),
77
82
  default=None,
78
- help="Codec to use for compressing column chunks (Default=zstd)."
83
+ help="Codec to use for compressing column chunks (Default=zstd).",
79
84
  )
80
85
 
81
86
  # Note: -l and -w were chosen when these were called "width" and "length".
@@ -96,6 +101,27 @@ samples_chunk_size = click.option(
96
101
  help="Chunk size in the samples dimension",
97
102
  )
98
103
 
104
+ schema = click.option("-s", "--schema", default=None, type=click.Path(exists=True))
105
+
106
+ max_variant_chunks = click.option(
107
+ "-V",
108
+ "--max-variant-chunks",
109
+ type=int,
110
+ default=None,
111
+ help=(
112
+ "Truncate the output in the variants dimension to have "
113
+ "this number of chunks. Mainly intended to help with "
114
+ "schema tuning."
115
+ ),
116
+ )
117
+
118
+ max_memory = click.option(
119
+ "-M",
120
+ "--max-memory",
121
+ default=None,
122
+ help="An approximate bound on overall memory usage (e.g. 10G),",
123
+ )
124
+
99
125
 
100
126
  def setup_logging(verbosity):
101
127
  level = "WARNING"
@@ -162,7 +188,7 @@ def explode(
162
188
  @click.command
163
189
  @vcfs
164
190
  @new_icf_path
165
- @click.argument("num_partitions", type=click.IntRange(min=1))
191
+ @num_partitions
166
192
  @force
167
193
  @column_chunk_size
168
194
  @compressor
@@ -198,7 +224,7 @@ def dexplode_init(
198
224
 
199
225
  @click.command
200
226
  @icf_path
201
- @click.argument("partition", type=click.IntRange(min=0))
227
+ @partition
202
228
  @verbose
203
229
  def dexplode_partition(icf_path, partition, verbose):
204
230
  """
@@ -207,18 +233,18 @@ def dexplode_partition(icf_path, partition, verbose):
207
233
  from 0 (inclusive) to the number of paritions returned by dexplode_init (exclusive).
208
234
  """
209
235
  setup_logging(verbose)
210
- vcf.explode_partition(icf_path, partition, show_progress=True)
236
+ vcf.explode_partition(icf_path, partition, show_progress=False)
211
237
 
212
238
 
213
239
  @click.command
214
- @click.argument("path", type=click.Path(), required=True)
240
+ @icf_path
215
241
  @verbose
216
- def dexplode_finalise(path, verbose):
242
+ def dexplode_finalise(icf_path, verbose):
217
243
  """
218
244
  Final step for distributed conversion of VCF(s) to intermediate columnar format.
219
245
  """
220
246
  setup_logging(verbose)
221
- vcf.explode_finalise(path)
247
+ vcf.explode_finalise(icf_path)
222
248
 
223
249
 
224
250
  @click.command
@@ -248,26 +274,11 @@ def mkschema(icf_path):
248
274
  @new_zarr_path
249
275
  @force
250
276
  @verbose
251
- @click.option("-s", "--schema", default=None, type=click.Path(exists=True))
277
+ @schema
252
278
  @variants_chunk_size
253
279
  @samples_chunk_size
254
- @click.option(
255
- "-V",
256
- "--max-variant-chunks",
257
- type=int,
258
- default=None,
259
- help=(
260
- "Truncate the output in the variants dimension to have "
261
- "this number of chunks. Mainly intended to help with "
262
- "schema tuning."
263
- ),
264
- )
265
- @click.option(
266
- "-M",
267
- "--max-memory",
268
- default=None,
269
- help="An approximate bound on overall memory usage (e.g. 10G),",
270
- )
280
+ @max_variant_chunks
281
+ @max_memory
271
282
  @worker_processes
272
283
  def encode(
273
284
  icf_path,
@@ -292,13 +303,96 @@ def encode(
292
303
  schema_path=schema,
293
304
  variants_chunk_size=variants_chunk_size,
294
305
  samples_chunk_size=samples_chunk_size,
295
- max_v_chunks=max_variant_chunks,
306
+ max_variant_chunks=max_variant_chunks,
296
307
  worker_processes=worker_processes,
297
308
  max_memory=max_memory,
298
309
  show_progress=True,
299
310
  )
300
311
 
301
312
 
313
+ @click.command
314
+ @icf_path
315
+ @new_zarr_path
316
+ @num_partitions
317
+ @force
318
+ @schema
319
+ @variants_chunk_size
320
+ @samples_chunk_size
321
+ @max_variant_chunks
322
+ @verbose
323
+ def dencode_init(
324
+ icf_path,
325
+ zarr_path,
326
+ num_partitions,
327
+ force,
328
+ schema,
329
+ variants_chunk_size,
330
+ samples_chunk_size,
331
+ max_variant_chunks,
332
+ verbose,
333
+ ):
334
+ """
335
+ Initialise conversion of intermediate format to VCF Zarr. This will
336
+ set up the specified ZARR_PATH to perform this conversion over
337
+ NUM_PARTITIONS.
338
+
339
+ The output of this commmand is the actual number of partitions generated
340
+ (which may be less then the requested number, if there is not sufficient
341
+ chunks in the variants dimension) and a rough lower-bound on the amount
342
+ of memory required to encode a partition.
343
+
344
+ NOTE: the format of this output will likely change in subsequent releases;
345
+ it should not be considered machine-readable for now.
346
+ """
347
+ setup_logging(verbose)
348
+ check_overwrite_dir(zarr_path, force)
349
+ num_partitions, max_memory = vcf.encode_init(
350
+ icf_path,
351
+ zarr_path,
352
+ target_num_partitions=num_partitions,
353
+ schema_path=schema,
354
+ variants_chunk_size=variants_chunk_size,
355
+ samples_chunk_size=samples_chunk_size,
356
+ max_variant_chunks=max_variant_chunks,
357
+ show_progress=True,
358
+ )
359
+ formatted_size = humanfriendly.format_size(max_memory, binary=True)
360
+ # NOTE adding the size to the stdout here so that users can parse it
361
+ # and use in their submission scripts. This is a first pass, and
362
+ # will most likely change as we see what works and doesn't.
363
+ # NOTE we probably want to format this as a table, which lists
364
+ # some other properties, line by line
365
+ # NOTE This size number is also not quite enough, you need a bit of
366
+ # headroom with it (probably 10% or so). We should include this.
367
+ click.echo(f"{num_partitions}\t{formatted_size}")
368
+
369
+
370
+ @click.command
371
+ @zarr_path
372
+ @partition
373
+ @verbose
374
+ def dencode_partition(zarr_path, partition, verbose):
375
+ """
376
+ Convert a partition from intermediate columnar format to VCF Zarr.
377
+ Must be called *after* the Zarr path has been initialised with dencode_init.
378
+ Partition indexes must be from 0 (inclusive) to the number of paritions
379
+ returned by dencode_init (exclusive).
380
+ """
381
+ setup_logging(verbose)
382
+ vcf.encode_partition(zarr_path, partition)
383
+
384
+
385
+ @click.command
386
+ @zarr_path
387
+ @verbose
388
+ def dencode_finalise(zarr_path, verbose):
389
+ """
390
+ Final step for distributed conversion of ICF to VCF Zarr.
391
+ """
392
+ setup_logging(verbose)
393
+ vcf.encode_finalise(zarr_path, show_progress=True)
394
+
395
+
302
396
  @click.command(name="convert")
303
397
  @vcfs
304
398
  @new_zarr_path
@@ -386,6 +480,9 @@ vcf2zarr.add_command(encode)
386
480
  vcf2zarr.add_command(dexplode_init)
387
481
  vcf2zarr.add_command(dexplode_partition)
388
482
  vcf2zarr.add_command(dexplode_finalise)
483
+ vcf2zarr.add_command(dencode_init)
484
+ vcf2zarr.add_command(dencode_partition)
485
+ vcf2zarr.add_command(dencode_finalise)
389
486
 
390
487
 
391
488
  @click.command(name="convert")
bio2zarr/core.py CHANGED
@@ -1,22 +1,31 @@
1
- import dataclasses
2
- import contextlib
3
1
  import concurrent.futures as cf
2
+ import contextlib
3
+ import dataclasses
4
+ import logging
4
5
  import multiprocessing
5
6
  import threading
6
- import logging
7
7
  import time
8
8
 
9
- import zarr
9
+ import numcodecs
10
10
  import numpy as np
11
11
  import tqdm
12
- import numcodecs
13
-
12
+ import zarr
14
13
 
15
14
  logger = logging.getLogger(__name__)
16
15
 
17
16
  numcodecs.blosc.use_threads = False
18
17
 
19
18
 
19
+ def min_int_dtype(min_value, max_value):
20
+ if min_value > max_value:
21
+ raise ValueError("min_value must be <= max_value")
22
+ for a_dtype in ["i1", "i2", "i4", "i8"]:
23
+ info = np.iinfo(a_dtype)
24
+ if info.min <= min_value and max_value <= info.max:
25
+ return a_dtype
26
+ raise OverflowError("Integer cannot be represented")
27
+
28
+
20
29
  def chunk_aligned_slices(z, n, max_chunks=None):
21
30
  """
22
31
  Returns at n slices in the specified zarr array, aligned
@@ -101,6 +110,7 @@ class BufferedArray:
101
110
  sync_flush_2d_array(
102
111
  self.buff[: self.buffer_row], self.array, self.array_offset
103
112
  )
113
+ # FIXME the array.name doesn't seem to be working here for some reason
104
114
  logger.debug(
105
115
  f"Flushed <{self.array.name} {self.array.shape} "
106
116
  f"{self.array.dtype}> "
@@ -122,8 +132,7 @@ def sync_flush_2d_array(np_buffer, zarr_array, offset):
122
132
  # encoder implementations.
123
133
  s = slice(offset, offset + np_buffer.shape[0])
124
134
  samples_chunk_size = zarr_array.chunks[1]
125
- # TODO use zarr chunks here to support non-uniform chunking later
126
- # and for simplicity
135
+ # TODO use zarr chunks here for simplicity
127
136
  zarr_array_width = zarr_array.shape[1]
128
137
  start = 0
129
138
  while start < zarr_array_width:
@@ -183,7 +192,7 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
183
192
  self.progress_config = progress_config
184
193
  self.progress_bar = tqdm.tqdm(
185
194
  total=progress_config.total,
186
- desc=f"{progress_config.title:>7}",
195
+ desc=f"{progress_config.title:>8}",
187
196
  unit_scale=True,
188
197
  unit=progress_config.units,
189
198
  smoothing=0.1,
bio2zarr/plink.py CHANGED
@@ -1,14 +1,13 @@
1
1
  import logging
2
2
 
3
+ import bed_reader
3
4
  import humanfriendly
5
+ import numcodecs
4
6
  import numpy as np
5
7
  import zarr
6
- import bed_reader
7
- import numcodecs
8
8
 
9
9
  from . import core
10
10
 
11
-
12
11
  logger = logging.getLogger(__name__)
13
12
 
14
13
 
@@ -24,7 +23,6 @@ def encode_genotypes_slice(bed_path, zarr_path, start, stop):
24
23
  gt_mask = core.BufferedArray(root["call_genotype_mask"], start)
25
24
  gt_phased = core.BufferedArray(root["call_genotype_phased"], start)
26
25
  variants_chunk_size = gt.array.chunks[0]
27
- n = gt.array.shape[1]
28
26
  assert start % variants_chunk_size == 0
29
27
 
30
28
  logger.debug(f"Reading slice {start}:{stop}")
@@ -96,7 +94,7 @@ def convert(
96
94
  chunks=(samples_chunk_size,),
97
95
  )
98
96
  a.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
99
- logger.debug(f"Encoded samples")
97
+ logger.debug("Encoded samples")
100
98
 
101
99
  # TODO encode these in slices - but read them in one go to avoid
102
100
  # fetching repeatedly from bim file
@@ -108,7 +106,7 @@ def convert(
108
106
  chunks=(variants_chunk_size,),
109
107
  )
110
108
  a.attrs["_ARRAY_DIMENSIONS"] = ["variants"]
111
- logger.debug(f"encoded variant_position")
109
+ logger.debug("encoded variant_position")
112
110
 
113
111
  alleles = np.stack([bed.allele_1, bed.allele_2], axis=1)
114
112
  a = root.array(
@@ -119,7 +117,7 @@ def convert(
119
117
  chunks=(variants_chunk_size,),
120
118
  )
121
119
  a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"]
122
- logger.debug(f"encoded variant_allele")
120
+ logger.debug("encoded variant_allele")
123
121
 
124
122
  # TODO remove this?
125
123
  a = root.empty(
@@ -201,4 +199,4 @@ def validate(bed_path, zarr_path):
201
199
  elif bed_call == 2:
202
200
  assert list(zarr_call) == [1, 1]
203
201
  else: # pragma no cover
204
- assert False
202
+ raise AssertionError(f"Unexpected bed call {bed_call}")
bio2zarr/typing.py CHANGED
@@ -1,4 +1,4 @@
1
1
  from pathlib import Path
2
2
  from typing import Union
3
3
 
4
- PathType = Union[str, Path]
4
+ PathType = Union[str, Path]