bio2zarr 0.0.5__py3-none-any.whl → 0.0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bio2zarr/_version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '0.0.5'
16
- __version_tuple__ = version_tuple = (0, 0, 5)
15
+ __version__ = version = '0.0.9'
16
+ __version_tuple__ = version_tuple = (0, 0, 9)
bio2zarr/cli.py CHANGED
@@ -5,6 +5,7 @@ import shutil
5
5
 
6
6
  import click
7
7
  import coloredlogs
8
+ import humanfriendly
8
9
  import numcodecs
9
10
  import tabulate
10
11
 
@@ -39,6 +40,14 @@ new_zarr_path = click.argument(
39
40
  "zarr_path", type=click.Path(file_okay=False, dir_okay=True)
40
41
  )
41
42
 
43
+ zarr_path = click.argument(
44
+ "zarr_path", type=click.Path(exists=True, file_okay=False, dir_okay=True)
45
+ )
46
+
47
+ num_partitions = click.argument("num_partitions", type=click.IntRange(min=1))
48
+
49
+ partition = click.argument("partition", type=click.IntRange(min=0))
50
+
42
51
  verbose = click.option("-v", "--verbose", count=True, help="Increase verbosity")
43
52
 
44
53
  force = click.option(
@@ -92,6 +101,27 @@ samples_chunk_size = click.option(
92
101
  help="Chunk size in the samples dimension",
93
102
  )
94
103
 
104
+ schema = click.option("-s", "--schema", default=None, type=click.Path(exists=True))
105
+
106
+ max_variant_chunks = click.option(
107
+ "-V",
108
+ "--max-variant-chunks",
109
+ type=int,
110
+ default=None,
111
+ help=(
112
+ "Truncate the output in the variants dimension to have "
113
+ "this number of chunks. Mainly intended to help with "
114
+ "schema tuning."
115
+ ),
116
+ )
117
+
118
+ max_memory = click.option(
119
+ "-M",
120
+ "--max-memory",
121
+ default=None,
122
+ help="An approximate bound on overall memory usage (e.g. 10G),",
123
+ )
124
+
95
125
 
96
126
  def setup_logging(verbosity):
97
127
  level = "WARNING"
@@ -158,7 +188,7 @@ def explode(
158
188
  @click.command
159
189
  @vcfs
160
190
  @new_icf_path
161
- @click.argument("num_partitions", type=click.IntRange(min=1))
191
+ @num_partitions
162
192
  @force
163
193
  @column_chunk_size
164
194
  @compressor
@@ -194,7 +224,7 @@ def dexplode_init(
194
224
 
195
225
  @click.command
196
226
  @icf_path
197
- @click.argument("partition", type=click.IntRange(min=0))
227
+ @partition
198
228
  @verbose
199
229
  def dexplode_partition(icf_path, partition, verbose):
200
230
  """
@@ -203,18 +233,18 @@ def dexplode_partition(icf_path, partition, verbose):
203
233
  from 0 (inclusive) to the number of partitions returned by dexplode_init (exclusive).
204
234
  """
205
235
  setup_logging(verbose)
206
- vcf.explode_partition(icf_path, partition, show_progress=False)
236
+ vcf.explode_partition(icf_path, partition)
207
237
 
208
238
 
209
239
  @click.command
210
- @click.argument("path", type=click.Path(), required=True)
240
+ @icf_path
211
241
  @verbose
212
- def dexplode_finalise(path, verbose):
242
+ def dexplode_finalise(icf_path, verbose):
213
243
  """
214
244
  Final step for distributed conversion of VCF(s) to intermediate columnar format.
215
245
  """
216
246
  setup_logging(verbose)
217
- vcf.explode_finalise(path)
247
+ vcf.explode_finalise(icf_path)
218
248
 
219
249
 
220
250
  @click.command
@@ -244,26 +274,11 @@ def mkschema(icf_path):
244
274
  @new_zarr_path
245
275
  @force
246
276
  @verbose
247
- @click.option("-s", "--schema", default=None, type=click.Path(exists=True))
277
+ @schema
248
278
  @variants_chunk_size
249
279
  @samples_chunk_size
250
- @click.option(
251
- "-V",
252
- "--max-variant-chunks",
253
- type=int,
254
- default=None,
255
- help=(
256
- "Truncate the output in the variants dimension to have "
257
- "this number of chunks. Mainly intended to help with "
258
- "schema tuning."
259
- ),
260
- )
261
- @click.option(
262
- "-M",
263
- "--max-memory",
264
- default=None,
265
- help="An approximate bound on overall memory usage (e.g. 10G),",
266
- )
280
+ @max_variant_chunks
281
+ @max_memory
267
282
  @worker_processes
268
283
  def encode(
269
284
  icf_path,
@@ -288,13 +303,96 @@ def encode(
288
303
  schema_path=schema,
289
304
  variants_chunk_size=variants_chunk_size,
290
305
  samples_chunk_size=samples_chunk_size,
291
- max_v_chunks=max_variant_chunks,
306
+ max_variant_chunks=max_variant_chunks,
292
307
  worker_processes=worker_processes,
293
308
  max_memory=max_memory,
294
309
  show_progress=True,
295
310
  )
296
311
 
297
312
 
313
+ @click.command
314
+ @icf_path
315
+ @new_zarr_path
316
+ @num_partitions
317
+ @force
318
+ @schema
319
+ @variants_chunk_size
320
+ @samples_chunk_size
321
+ @max_variant_chunks
322
+ @verbose
323
+ def dencode_init(
324
+ icf_path,
325
+ zarr_path,
326
+ num_partitions,
327
+ force,
328
+ schema,
329
+ variants_chunk_size,
330
+ samples_chunk_size,
331
+ max_variant_chunks,
332
+ verbose,
333
+ ):
334
+ """
335
+ Initialise conversion of intermediate format to VCF Zarr. This will
336
+ set up the specified ZARR_PATH to perform this conversion over
337
+ NUM_PARTITIONS.
338
+
339
+ The output of this command is the actual number of partitions generated
340
+ (which may be less than the requested number, if there are not sufficient
341
+ chunks in the variants dimension) and a rough lower-bound on the amount
342
+ of memory required to encode a partition.
343
+
344
+ NOTE: the format of this output will likely change in subsequent releases;
345
+ it should not be considered machine-readable for now.
346
+ """
347
+ setup_logging(verbose)
348
+ check_overwrite_dir(zarr_path, force)
349
+ num_partitions, max_memory = vcf.encode_init(
350
+ icf_path,
351
+ zarr_path,
352
+ target_num_partitions=num_partitions,
353
+ schema_path=schema,
354
+ variants_chunk_size=variants_chunk_size,
355
+ samples_chunk_size=samples_chunk_size,
356
+ max_variant_chunks=max_variant_chunks,
357
+ show_progress=True,
358
+ )
359
+ formatted_size = humanfriendly.format_size(max_memory, binary=True)
360
+ # NOTE adding the size to the stdout here so that users can parse it
361
+ # and use in their submission scripts. This is a first pass, and
362
+ # will most likely change as we see what works and doesn't.
363
+ # NOTE we probably want to format this as a table, which lists
364
+ # some other properties, line by line
365
+ # NOTE This size number is also not quite enough, you need a bit of
366
+ # headroom with it (probably 10% or so). We should include this.
367
+ click.echo(f"{num_partitions}\t{formatted_size}")
368
+
369
+
370
+ @click.command
371
+ @zarr_path
372
+ @partition
373
+ @verbose
374
+ def dencode_partition(zarr_path, partition, verbose):
375
+ """
376
+ Convert a partition from intermediate columnar format to VCF Zarr.
377
+ Must be called *after* the Zarr path has been initialised with dencode_init.
378
+ Partition indexes must be from 0 (inclusive) to the number of partitions
379
+ returned by dencode_init (exclusive).
380
+ """
381
+ setup_logging(verbose)
382
+ vcf.encode_partition(zarr_path, partition)
383
+
384
+
385
+ @click.command
386
+ @zarr_path
387
+ @verbose
388
+ def dencode_finalise(zarr_path, verbose):
389
+ """
390
+ Final step for distributed conversion of ICF to VCF Zarr.
391
+ """
392
+ setup_logging(verbose)
393
+ vcf.encode_finalise(zarr_path, show_progress=True)
394
+
395
+
298
396
  @click.command(name="convert")
299
397
  @vcfs
300
398
  @new_zarr_path
@@ -382,6 +480,9 @@ vcf2zarr.add_command(encode)
382
480
  vcf2zarr.add_command(dexplode_init)
383
481
  vcf2zarr.add_command(dexplode_partition)
384
482
  vcf2zarr.add_command(dexplode_finalise)
483
+ vcf2zarr.add_command(dencode_init)
484
+ vcf2zarr.add_command(dencode_partition)
485
+ vcf2zarr.add_command(dencode_finalise)
385
486
 
386
487
 
387
488
  @click.command(name="convert")
bio2zarr/core.py CHANGED
@@ -3,6 +3,8 @@ import contextlib
3
3
  import dataclasses
4
4
  import logging
5
5
  import multiprocessing
6
+ import os
7
+ import os.path
6
8
  import threading
7
9
  import time
8
10
 
@@ -16,6 +18,16 @@ logger = logging.getLogger(__name__)
16
18
  numcodecs.blosc.use_threads = False
17
19
 
18
20
 
21
+ def min_int_dtype(min_value, max_value):
22
+ if min_value > max_value:
23
+ raise ValueError("min_value must be <= max_value")
24
+ for a_dtype in ["i1", "i2", "i4", "i8"]:
25
+ info = np.iinfo(a_dtype)
26
+ if info.min <= min_value and max_value <= info.max:
27
+ return a_dtype
28
+ raise OverflowError("Integer cannot be represented")
29
+
30
+
19
31
  def chunk_aligned_slices(z, n, max_chunks=None):
20
32
  """
21
33
  Returns at n slices in the specified zarr array, aligned
@@ -35,6 +47,22 @@ def chunk_aligned_slices(z, n, max_chunks=None):
35
47
  return slices
36
48
 
37
49
 
50
+ def du(path):
51
+ """
52
+ Return the total bytes stored at this path.
53
+ """
54
+ total = os.path.getsize(path)
55
+ # pathlib walk method doesn't exist until 3.12 :(
56
+ for root, dirs, files in os.walk(path):
57
+ for lst in [dirs, files]:
58
+ for name in lst:
59
+ fullname = os.path.join(root, name)
60
+ size = os.path.getsize(fullname)
61
+ total += size
62
+ logger.debug(f"du({path}) = {total}")
63
+ return total
64
+
65
+
38
66
  class SynchronousExecutor(cf.Executor):
39
67
  def submit(self, fn, /, *args, **kwargs):
40
68
  future = cf.Future()
@@ -100,6 +128,7 @@ class BufferedArray:
100
128
  sync_flush_2d_array(
101
129
  self.buff[: self.buffer_row], self.array, self.array_offset
102
130
  )
131
+ # FIXME the array.name doesn't seem to be working here for some reason
103
132
  logger.debug(
104
133
  f"Flushed <{self.array.name} {self.array.shape} "
105
134
  f"{self.array.dtype}> "
@@ -121,8 +150,7 @@ def sync_flush_2d_array(np_buffer, zarr_array, offset):
121
150
  # encoder implementations.
122
151
  s = slice(offset, offset + np_buffer.shape[0])
123
152
  samples_chunk_size = zarr_array.chunks[1]
124
- # TODO use zarr chunks here to support non-uniform chunking later
125
- # and for simplicity
153
+ # TODO use zarr chunks here for simplicity
126
154
  zarr_array_width = zarr_array.shape[1]
127
155
  start = 0
128
156
  while start < zarr_array_width:
@@ -182,7 +210,7 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
182
210
  self.progress_config = progress_config
183
211
  self.progress_bar = tqdm.tqdm(
184
212
  total=progress_config.total,
185
- desc=f"{progress_config.title:>7}",
213
+ desc=f"{progress_config.title:>8}",
186
214
  unit_scale=True,
187
215
  unit=progress_config.units,
188
216
  smoothing=0.1,