bio2zarr 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of bio2zarr might be problematic; consult the registry's advisory page for more details.

bio2zarr/_version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '0.0.1'
16
- __version_tuple__ = version_tuple = (0, 0, 1)
15
+ __version__ = version = '0.0.3'
16
+ __version_tuple__ = version_tuple = (0, 0, 3)
bio2zarr/cli.py CHANGED
@@ -1,3 +1,8 @@
1
+ import logging
2
+ import os
3
+ import pathlib
4
+ import shutil
5
+
1
6
  import click
2
7
  import tabulate
3
8
  import coloredlogs
@@ -7,35 +12,79 @@ from . import vcf_utils
7
12
  from . import plink
8
13
  from . import provenance
9
14
 
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
+ class NaturalOrderGroup(click.Group):
20
+ """
21
+ List commands in the order they are provided in the help text.
22
+ """
23
+
24
+ def list_commands(self, ctx):
25
+ return self.commands.keys()
26
+
27
+
10
28
  # Common arguments/options
29
+ vcfs = click.argument(
30
+ "vcfs", nargs=-1, required=True, type=click.Path(exists=True, dir_okay=False)
31
+ )
32
+
33
+ new_icf_path = click.argument(
34
+ "icf_path", type=click.Path(file_okay=False, dir_okay=True)
35
+ )
36
+
37
+ icf_path = click.argument(
38
+ "icf_path", type=click.Path(exists=True, file_okay=False, dir_okay=True)
39
+ )
40
+
41
+ new_zarr_path = click.argument(
42
+ "zarr_path", type=click.Path(file_okay=False, dir_okay=True)
43
+ )
44
+
11
45
  verbose = click.option("-v", "--verbose", count=True, help="Increase verbosity")
12
46
 
47
+ force = click.option(
48
+ "-f",
49
+ "--force",
50
+ is_flag=True,
51
+ flag_value=True,
52
+ help="Force overwriting of existing directories",
53
+ )
54
+
55
+ version = click.version_option(version=f"{provenance.__version__}")
56
+
13
57
  worker_processes = click.option(
14
58
  "-p", "--worker-processes", type=int, default=1, help="Number of worker processes"
15
59
  )
16
60
 
17
- # TODO help text
18
- chunk_length = click.option(
61
+ column_chunk_size = click.option(
62
+ "-c",
63
+ "--column-chunk-size",
64
+ type=int,
65
+ default=64,
66
+ help="Approximate uncompressed size of exploded column chunks in MiB",
67
+ )
68
+
69
+ # Note: -l and -w were chosen when these were called "width" and "length".
70
+ # possibly there are better letters now.
71
+ variants_chunk_size = click.option(
19
72
  "-l",
20
- "--chunk-length",
73
+ "--variants-chunk-size",
21
74
  type=int,
22
75
  default=None,
23
76
  help="Chunk size in the variants dimension",
24
77
  )
25
78
 
26
- chunk_width = click.option(
79
+ samples_chunk_size = click.option(
27
80
  "-w",
28
- "--chunk-width",
81
+ "--samples-chunk-size",
29
82
  type=int,
30
83
  default=None,
31
84
  help="Chunk size in the samples dimension",
32
85
  )
33
86
 
34
- version = click.version_option(version=f"bio2zarr {provenance.__version__}")
35
87
 
36
-
37
- # Note: logging hasn't been implemented in the code at all, this is just
38
- # a first pass to try out some ways of doing things to see what works.
39
88
  def setup_logging(verbosity):
40
89
  level = "WARNING"
41
90
  if verbosity == 1:
@@ -43,26 +92,43 @@ def setup_logging(verbosity):
43
92
  elif verbosity >= 2:
44
93
  level = "DEBUG"
45
94
  # NOTE: I'm not that excited about coloredlogs, just trying it out
46
- # as it is installed by cyvcf2 anyway. We will have some complicated
47
- # stuff doing on with threads and processes, to logs might not work
48
- # so well anyway.
95
+ # as it is installed by cyvcf2 anyway.
49
96
  coloredlogs.install(level=level)
50
97
 
51
98
 
99
+ def check_overwrite_dir(path, force):
100
+ path = pathlib.Path(path)
101
+ if path.exists():
102
+ if not force:
103
+ click.confirm(
104
+ f"Do you want to overwrite {path}? (use --force to skip this check)",
105
+ abort=True,
106
+ )
107
+ # These trees can be mondo-big and on slow file systems, so it's entirely
108
+ # feasible that the delete would fail or be killed. This makes it less likely
109
+ # that partially deleted paths are mistaken for good paths.
110
+ tmp_delete_path = path.with_suffix(f"{path.suffix}.{os.getpid()}.DELETING")
111
+ logger.info(f"Deleting {path} (renamed to {tmp_delete_path} while in progress)")
112
+ os.rename(path, tmp_delete_path)
113
+ shutil.rmtree(tmp_delete_path)
114
+
115
+
52
116
  @click.command
53
- @click.argument("vcfs", nargs=-1, required=True)
54
- @click.argument("out_path", type=click.Path())
117
+ @vcfs
118
+ @new_icf_path
119
+ @force
55
120
  @verbose
56
121
  @worker_processes
57
- @click.option("-c", "--column-chunk-size", type=int, default=64)
58
- def explode(vcfs, out_path, verbose, worker_processes, column_chunk_size):
122
+ @column_chunk_size
123
+ def explode(vcfs, icf_path, force, verbose, worker_processes, column_chunk_size):
59
124
  """
60
- Convert VCF(s) to columnar intermediate format
125
+ Convert VCF(s) to intermediate columnar format
61
126
  """
62
127
  setup_logging(verbose)
128
+ check_overwrite_dir(icf_path, force)
63
129
  vcf.explode(
64
130
  vcfs,
65
- out_path,
131
+ icf_path,
66
132
  worker_processes=worker_processes,
67
133
  column_chunk_size=column_chunk_size,
68
134
  show_progress=True,
@@ -70,34 +136,88 @@ def explode(vcfs, out_path, verbose, worker_processes, column_chunk_size):
70
136
 
71
137
 
72
138
  @click.command
73
- @click.argument("if_path", type=click.Path())
139
+ @vcfs
140
+ @new_icf_path
141
+ @click.argument("num_partitions", type=click.IntRange(min=1))
142
+ @force
143
+ @column_chunk_size
74
144
  @verbose
75
- def inspect(if_path, verbose):
145
+ @worker_processes
146
+ def dexplode_init(
147
+ vcfs, icf_path, num_partitions, force, column_chunk_size, verbose, worker_processes
148
+ ):
76
149
  """
77
- Inspect an intermediate format file
150
+ Initial step for distributed conversion of VCF(s) to intermediate columnar format
151
+ over the requested number of partitions.
78
152
  """
79
153
  setup_logging(verbose)
80
- data = vcf.inspect(if_path)
154
+ check_overwrite_dir(icf_path, force)
155
+ num_partitions = vcf.explode_init(
156
+ icf_path,
157
+ vcfs,
158
+ target_num_partitions=num_partitions,
159
+ column_chunk_size=column_chunk_size,
160
+ worker_processes=worker_processes,
161
+ show_progress=True,
162
+ )
163
+ click.echo(num_partitions)
164
+
165
+
166
+ @click.command
167
+ @icf_path
168
+ @click.argument("partition", type=click.IntRange(min=0))
169
+ @verbose
170
+ def dexplode_partition(icf_path, partition, verbose):
171
+ """
172
+ Convert a VCF partition to intermediate columnar format. Must be called *after*
173
+ the ICF path has been initialised with dexplode_init. Partition indexes must be
174
+ from 0 (inclusive) to the number of partitions returned by dexplode_init (exclusive).
175
+ """
176
+ setup_logging(verbose)
177
+ vcf.explode_partition(icf_path, partition, show_progress=True)
178
+
179
+
180
+ @click.command
181
+ @click.argument("path", type=click.Path(), required=True)
182
+ @verbose
183
+ def dexplode_finalise(path, verbose):
184
+ """
185
+ Final step for distributed conversion of VCF(s) to intermediate columnar format.
186
+ """
187
+ setup_logging(verbose)
188
+ vcf.explode_finalise(path)
189
+
190
+
191
+ @click.command
192
+ @click.argument("path", type=click.Path())
193
+ @verbose
194
+ def inspect(path, verbose):
195
+ """
196
+ Inspect an intermediate columnar format or Zarr path.
197
+ """
198
+ setup_logging(verbose)
199
+ data = vcf.inspect(path)
81
200
  click.echo(tabulate.tabulate(data, headers="keys"))
82
201
 
83
202
 
84
203
  @click.command
85
- @click.argument("if_path", type=click.Path())
86
- def mkschema(if_path):
204
+ @icf_path
205
+ def mkschema(icf_path):
87
206
  """
88
207
  Generate a schema for zarr encoding
89
208
  """
90
209
  stream = click.get_text_stream("stdout")
91
- vcf.mkschema(if_path, stream)
210
+ vcf.mkschema(icf_path, stream)
92
211
 
93
212
 
94
213
  @click.command
95
- @click.argument("if_path", type=click.Path())
96
- @click.argument("zarr_path", type=click.Path())
214
+ @icf_path
215
+ @new_zarr_path
216
+ @force
97
217
  @verbose
98
218
  @click.option("-s", "--schema", default=None, type=click.Path(exists=True))
99
- @chunk_length
100
- @chunk_width
219
+ @variants_chunk_size
220
+ @samples_chunk_size
101
221
  @click.option(
102
222
  "-V",
103
223
  "--max-variant-chunks",
@@ -109,90 +229,147 @@ def mkschema(if_path):
109
229
  "schema tuning."
110
230
  ),
111
231
  )
232
+ @click.option(
233
+ "-M",
234
+ "--max-memory",
235
+ type=int,
236
+ default=None,
237
+ help="An approximate bound on overall memory usage in megabytes",
238
+ )
112
239
  @worker_processes
113
240
  def encode(
114
- if_path,
241
+ icf_path,
115
242
  zarr_path,
243
+ force,
116
244
  verbose,
117
245
  schema,
118
- chunk_length,
119
- chunk_width,
246
+ variants_chunk_size,
247
+ samples_chunk_size,
120
248
  max_variant_chunks,
249
+ max_memory,
121
250
  worker_processes,
122
251
  ):
123
252
  """
124
- Encode intermediate format (see explode) to vcfzarr
253
+ Encode intermediate columnar format (see explode) to vcfzarr.
125
254
  """
126
255
  setup_logging(verbose)
256
+ check_overwrite_dir(zarr_path, force)
127
257
  vcf.encode(
128
- if_path,
258
+ icf_path,
129
259
  zarr_path,
130
- schema,
131
- chunk_length=chunk_length,
132
- chunk_width=chunk_width,
260
+ schema_path=schema,
261
+ variants_chunk_size=variants_chunk_size,
262
+ samples_chunk_size=samples_chunk_size,
133
263
  max_v_chunks=max_variant_chunks,
134
264
  worker_processes=worker_processes,
265
+ max_memory=max_memory,
135
266
  show_progress=True,
136
267
  )
137
268
 
138
269
 
139
270
  @click.command(name="convert")
140
- @click.argument("vcfs", nargs=-1, required=True)
141
- @click.argument("out_path", type=click.Path())
142
- @chunk_length
143
- @chunk_width
271
+ @vcfs
272
+ @new_zarr_path
273
+ @variants_chunk_size
274
+ @samples_chunk_size
144
275
  @verbose
145
276
  @worker_processes
146
- def convert_vcf(vcfs, out_path, chunk_length, chunk_width, verbose, worker_processes):
277
+ def convert_vcf(
278
+ vcfs, zarr_path, variants_chunk_size, samples_chunk_size, verbose, worker_processes
279
+ ):
147
280
  """
148
- Convert input VCF(s) directly to vcfzarr (not recommended for large files)
281
+ Convert input VCF(s) directly to vcfzarr (not recommended for large files).
149
282
  """
150
283
  setup_logging(verbose)
151
284
  vcf.convert(
152
285
  vcfs,
153
- out_path,
154
- chunk_length=chunk_length,
155
- chunk_width=chunk_width,
286
+ zarr_path,
287
+ variants_chunk_size=variants_chunk_size,
288
+ samples_chunk_size=samples_chunk_size,
156
289
  show_progress=True,
157
290
  worker_processes=worker_processes,
158
291
  )
159
292
 
160
293
 
161
- @click.command
162
- @click.argument("vcfs", nargs=-1, required=True)
163
- @click.argument("out_path", type=click.Path())
164
- def validate(vcfs, out_path):
165
- """
166
- Development only, do not use. Will be removed before release.
294
+ @version
295
+ @click.group(cls=NaturalOrderGroup)
296
+ def vcf2zarr():
167
297
  """
168
- # FIXME! Will silently not look at remaining VCFs
169
- vcf.validate(vcfs[0], out_path, show_progress=True)
298
+ Convert VCF file(s) to the vcfzarr format.
170
299
 
300
+ The simplest usage is:
171
301
 
172
- @version
173
- @click.group()
174
- def vcf2zarr():
175
- pass
302
+ $ vcf2zarr convert [VCF_FILE] [ZARR_PATH]
303
+
304
+ This will convert the indexed VCF (or BCF) into the vcfzarr format in a single
305
+ step. As this writes the intermediate columnar format to a temporary directory,
306
+ we only recommend this approach for small files (< 1GB, say).
307
+
308
+ The recommended approach is to run the conversion in two passes, and
309
+ to keep the intermediate columnar format ("exploded") around to facilitate
310
+ experimentation with chunk sizes and compression settings:
311
+
312
+ \b
313
+ $ vcf2zarr explode [VCF_FILE_1] ... [VCF_FILE_N] [ICF_PATH]
314
+ $ vcf2zarr encode [ICF_PATH] [ZARR_PATH]
315
+
316
+ The inspect command provides a way to view contents of an exploded ICF
317
+ or Zarr:
318
+
319
+ $ vcf2zarr inspect [PATH]
320
+
321
+ This is useful when tweaking chunk sizes and compression settings to suit
322
+ your dataset, using the mkschema command and --schema option to encode:
323
+
324
+ \b
325
+ $ vcf2zarr mkschema [ICF_PATH] > schema.json
326
+ $ vcf2zarr encode [ICF_PATH] [ZARR_PATH] --schema schema.json
327
+
328
+ By editing the schema.json file you can drop columns that are not of interest
329
+ and edit column specific compression settings. The --max-variant-chunks option
330
+ to encode allows you to try out these options on small subsets, hopefully
331
+ arriving at settings with the desired balance of compression and query
332
+ performance.
333
+
334
+ ADVANCED USAGE
335
+
336
+ For very large datasets (terabyte scale) it may be necessary to distribute the
337
+ explode and encode steps across a cluster:
338
+
339
+ \b
340
+ $ vcf2zarr dexplode-init [VCF_FILE_1] ... [VCF_FILE_N] [ICF_PATH] [NUM_PARTITIONS]
341
+ $ vcf2zarr dexplode-partition [ICF_PATH] [PARTITION_INDEX]
342
+ $ vcf2zarr dexplode-finalise [ICF_PATH]
343
+
344
+ See the online documentation at [FIXME] for more details on distributed explode.
345
+ """
176
346
 
177
347
 
178
348
  # TODO figure out how to get click to list these in the given order.
179
- vcf2zarr.add_command(explode)
349
+ vcf2zarr.add_command(convert_vcf)
180
350
  vcf2zarr.add_command(inspect)
351
+ vcf2zarr.add_command(explode)
181
352
  vcf2zarr.add_command(mkschema)
182
353
  vcf2zarr.add_command(encode)
183
- vcf2zarr.add_command(convert_vcf)
184
- vcf2zarr.add_command(validate)
354
+ vcf2zarr.add_command(dexplode_init)
355
+ vcf2zarr.add_command(dexplode_partition)
356
+ vcf2zarr.add_command(dexplode_finalise)
185
357
 
186
358
 
187
359
  @click.command(name="convert")
188
360
  @click.argument("in_path", type=click.Path())
189
- @click.argument("out_path", type=click.Path())
361
+ @click.argument("zarr_path", type=click.Path())
190
362
  @worker_processes
191
363
  @verbose
192
- @chunk_length
193
- @chunk_width
364
+ @variants_chunk_size
365
+ @samples_chunk_size
194
366
  def convert_plink(
195
- in_path, out_path, verbose, worker_processes, chunk_length, chunk_width
367
+ in_path,
368
+ zarr_path,
369
+ verbose,
370
+ worker_processes,
371
+ variants_chunk_size,
372
+ samples_chunk_size,
196
373
  ):
197
374
  """
198
375
  In development; DO NOT USE!
@@ -200,11 +377,11 @@ def convert_plink(
200
377
  setup_logging(verbose)
201
378
  plink.convert(
202
379
  in_path,
203
- out_path,
380
+ zarr_path,
204
381
  show_progress=True,
205
382
  worker_processes=worker_processes,
206
- chunk_width=chunk_width,
207
- chunk_length=chunk_length,
383
+ samples_chunk_size=samples_chunk_size,
384
+ variants_chunk_size=variants_chunk_size,
208
385
  )
209
386
 
210
387
 
bio2zarr/core.py CHANGED
@@ -16,12 +16,6 @@ logger = logging.getLogger(__name__)
16
16
 
17
17
  numcodecs.blosc.use_threads = False
18
18
 
19
- # TODO this should probably go in another module where we abstract
20
- # out the zarr defaults
21
- default_compressor = numcodecs.Blosc(
22
- cname="zstd", clevel=7, shuffle=numcodecs.Blosc.AUTOSHUFFLE
23
- )
24
-
25
19
 
26
20
  def chunk_aligned_slices(z, n, max_chunks=None):
27
21
  """
@@ -53,7 +47,12 @@ def wait_on_futures(futures):
53
47
  for future in cf.as_completed(futures):
54
48
  exception = future.exception()
55
49
  if exception is not None:
56
- raise exception
50
+ cancel_futures(futures)
51
+ if isinstance(exception, cf.process.BrokenProcessPool):
52
+ raise RuntimeError(
53
+ "Worker process died: you may have run out of memory") from exception
54
+ else:
55
+ raise exception
57
56
 
58
57
 
59
58
  def cancel_futures(futures):
@@ -74,15 +73,18 @@ class BufferedArray:
74
73
  assert offset % array.chunks[0] == 0
75
74
  dims = list(array.shape)
76
75
  dims[0] = min(array.chunks[0], array.shape[0])
77
- self.buff = np.zeros(dims, dtype=array.dtype)
76
+ self.buff = np.empty(dims, dtype=array.dtype)
77
+ # Explicitly fill with zeros here to make any out-of-memory errors happen
78
+ # quickly.
79
+ self.buff[:] = 0
78
80
  self.buffer_row = 0
79
81
 
80
82
  @property
81
- def chunk_length(self):
83
+ def variants_chunk_size(self):
82
84
  return self.buff.shape[0]
83
85
 
84
86
  def next_buffer_row(self):
85
- if self.buffer_row == self.chunk_length:
87
+ if self.buffer_row == self.variants_chunk_size:
86
88
  self.flush()
87
89
  row = self.buffer_row
88
90
  self.buffer_row += 1
@@ -104,13 +106,13 @@ class BufferedArray:
104
106
  f"{self.array_offset}:{self.array_offset + self.buffer_row}"
105
107
  f"{self.buff.nbytes / 2**20: .2f}Mb"
106
108
  )
107
- self.array_offset += self.chunk_length
109
+ self.array_offset += self.variants_chunk_size
108
110
  self.buffer_row = 0
109
111
 
110
112
 
111
113
  def sync_flush_1d_array(np_buffer, zarr_array, offset):
112
114
  zarr_array[offset : offset + np_buffer.shape[0]] = np_buffer
113
- update_progress(1)
115
+ update_progress(np_buffer.nbytes)
114
116
 
115
117
 
116
118
  def sync_flush_2d_array(np_buffer, zarr_array, offset):
@@ -118,13 +120,16 @@ def sync_flush_2d_array(np_buffer, zarr_array, offset):
118
120
  # incremental, and to avoid large memcopies in the underlying
119
121
  # encoder implementations.
120
122
  s = slice(offset, offset + np_buffer.shape[0])
121
- chunk_width = zarr_array.chunks[1]
123
+ samples_chunk_size = zarr_array.chunks[1]
124
+ # TODO use zarr chunks here to support non-uniform chunking later
125
+ # and for simplicity
122
126
  zarr_array_width = zarr_array.shape[1]
123
127
  start = 0
124
128
  while start < zarr_array_width:
125
- stop = min(start + chunk_width, zarr_array_width)
126
- zarr_array[s, start:stop] = np_buffer[:, start:stop]
127
- update_progress(1)
129
+ stop = min(start + samples_chunk_size, zarr_array_width)
130
+ chunk_buffer = np_buffer[:, start:stop]
131
+ zarr_array[s, start:stop] = chunk_buffer
132
+ update_progress(chunk_buffer.nbytes)
128
133
  start = stop
129
134
 
130
135
 
@@ -169,7 +174,7 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
169
174
  self.executor = cf.ProcessPoolExecutor(
170
175
  max_workers=worker_processes,
171
176
  )
172
- self.futures = []
177
+ self.futures = set()
173
178
 
174
179
  set_progress(0)
175
180
  if progress_config is None:
@@ -177,7 +182,7 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
177
182
  self.progress_config = progress_config
178
183
  self.progress_bar = tqdm.tqdm(
179
184
  total=progress_config.total,
180
- desc=f"{progress_config.title:>9}",
185
+ desc=f"{progress_config.title:>7}",
181
186
  unit_scale=True,
182
187
  unit=progress_config.units,
183
188
  smoothing=0.1,
@@ -208,7 +213,19 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
208
213
  logger.debug("Exit progress thread")
209
214
 
210
215
  def submit(self, *args, **kwargs):
211
- self.futures.append(self.executor.submit(*args, **kwargs))
216
+ future = self.executor.submit(*args, **kwargs)
217
+ self.futures.add(future)
218
+ return future
219
+
220
+ def wait_for_completed(self, timeout=None):
221
+ done, not_done = cf.wait(self.futures, timeout, cf.FIRST_COMPLETED)
222
+ for future in done:
223
+ exception = future.exception()
224
+ # TODO do the check for BrokenProcessPool here
225
+ if exception is not None:
226
+ raise exception
227
+ self.futures = not_done
228
+ return done
212
229
 
213
230
  def results_as_completed(self):
214
231
  for future in cf.as_completed(self.futures):