bio2zarr 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bio2zarr/_version.py +2 -2
- bio2zarr/cli.py +245 -68
- bio2zarr/core.py +36 -19
- bio2zarr/plink.py +25 -19
- bio2zarr/vcf.py +704 -389
- bio2zarr/vcf_utils.py +0 -1
- {bio2zarr-0.0.1.dist-info → bio2zarr-0.0.3.dist-info}/METADATA +1 -1
- bio2zarr-0.0.3.dist-info/RECORD +16 -0
- {bio2zarr-0.0.1.dist-info → bio2zarr-0.0.3.dist-info}/WHEEL +1 -1
- bio2zarr-0.0.1.dist-info/RECORD +0 -16
- {bio2zarr-0.0.1.dist-info → bio2zarr-0.0.3.dist-info}/LICENSE +0 -0
- {bio2zarr-0.0.1.dist-info → bio2zarr-0.0.3.dist-info}/entry_points.txt +0 -0
- {bio2zarr-0.0.1.dist-info → bio2zarr-0.0.3.dist-info}/top_level.txt +0 -0
bio2zarr/_version.py
CHANGED
bio2zarr/cli.py
CHANGED
@@ -1,3 +1,8 @@
+import logging
+import os
+import pathlib
+import shutil
+
 import click
 import tabulate
 import coloredlogs
@@ -7,35 +12,79 @@ from . import vcf_utils
 from . import plink
 from . import provenance
 
+
+logger = logging.getLogger(__name__)
+
+
+class NaturalOrderGroup(click.Group):
+    """
+    List commands in the order they are provided in the help text.
+    """
+
+    def list_commands(self, ctx):
+        return self.commands.keys()
+
+
 # Common arguments/options
+vcfs = click.argument(
+    "vcfs", nargs=-1, required=True, type=click.Path(exists=True, dir_okay=False)
+)
+
+new_icf_path = click.argument(
+    "icf_path", type=click.Path(file_okay=False, dir_okay=True)
+)
+
+icf_path = click.argument(
+    "icf_path", type=click.Path(exists=True, file_okay=False, dir_okay=True)
+)
+
+new_zarr_path = click.argument(
+    "zarr_path", type=click.Path(file_okay=False, dir_okay=True)
+)
+
 verbose = click.option("-v", "--verbose", count=True, help="Increase verbosity")
 
+force = click.option(
+    "-f",
+    "--force",
+    is_flag=True,
+    flag_value=True,
+    help="Force overwriting of existing directories",
+)
+
+version = click.version_option(version=f"{provenance.__version__}")
+
 worker_processes = click.option(
     "-p", "--worker-processes", type=int, default=1, help="Number of worker processes"
 )
 
-
-
+column_chunk_size = click.option(
+    "-c",
+    "--column-chunk-size",
+    type=int,
+    default=64,
+    help="Approximate uncompressed size of exploded column chunks in MiB",
+)
+
+# Note: -l and -w were chosen when these were called "width" and "length".
+# possibly there are better letters now.
+variants_chunk_size = click.option(
     "-l",
-    "--chunk-
+    "--variants-chunk-size",
     type=int,
     default=None,
     help="Chunk size in the variants dimension",
 )
 
-
+samples_chunk_size = click.option(
     "-w",
-    "--chunk-
+    "--samples-chunk-size",
    type=int,
     default=None,
     help="Chunk size in the samples dimension",
 )
 
-version = click.version_option(version=f"bio2zarr {provenance.__version__}")
 
-
-# Note: logging hasn't been implemented in the code at all, this is just
-# a first pass to try out some ways of doing things to see what works.
 def setup_logging(verbosity):
     level = "WARNING"
     if verbosity == 1:
@@ -43,26 +92,43 @@ def setup_logging(verbosity):
     elif verbosity >= 2:
         level = "DEBUG"
     # NOTE: I'm not that excited about coloredlogs, just trying it out
-    # as it is installed by cyvcf2 anyway.
-    # stuff doing on with threads and processes, to logs might not work
-    # so well anyway.
+    # as it is installed by cyvcf2 anyway.
     coloredlogs.install(level=level)
 
 
+def check_overwrite_dir(path, force):
+    path = pathlib.Path(path)
+    if path.exists():
+        if not force:
+            click.confirm(
+                f"Do you want to overwrite {path}? (use --force to skip this check)",
+                abort=True,
+            )
+        # These trees can be mondo-big and on slow file systems, so it's entirely
+        # feasible that the delete would fail or be killed. This makes it less likely
+        # that partially deleted paths are mistaken for good paths.
+        tmp_delete_path = path.with_suffix(f"{path.suffix}.{os.getpid()}.DELETING")
+        logger.info(f"Deleting {path} (renamed to {tmp_delete_path} while in progress)")
+        os.rename(path, tmp_delete_path)
+        shutil.rmtree(tmp_delete_path)
+
+
 @click.command
-@
-@
+@vcfs
+@new_icf_path
+@force
 @verbose
 @worker_processes
-@
-def explode(vcfs,
+@column_chunk_size
+def explode(vcfs, icf_path, force, verbose, worker_processes, column_chunk_size):
     """
-    Convert VCF(s) to columnar
+    Convert VCF(s) to intermediate columnar format
     """
     setup_logging(verbose)
+    check_overwrite_dir(icf_path, force)
     vcf.explode(
         vcfs,
-
+        icf_path,
         worker_processes=worker_processes,
         column_chunk_size=column_chunk_size,
         show_progress=True,
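
Editor's note: the key addition in this hunk is `check_overwrite_dir`, which renames a directory before deleting it. Because `os.rename` is atomic, an interrupted `shutil.rmtree` can never leave a half-deleted tree behind under the original name. A minimal standalone sketch of the same pattern (`safe_rmtree` is our name for illustration, not part of bio2zarr):

import os
import pathlib
import shutil

def safe_rmtree(path):
    # Rename first: if the (possibly very slow) rmtree is interrupted,
    # the leftover tree is clearly marked as garbage rather than being
    # mistaken for a valid output directory.
    path = pathlib.Path(path)
    tmp = path.with_suffix(f"{path.suffix}.{os.getpid()}.DELETING")
    os.rename(path, tmp)
    shutil.rmtree(tmp)
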
@@ -70,34 +136,88 @@ def explode(vcfs, out_path, verbose, worker_processes, column_chunk_size):
 
 
 @click.command
-@
+@vcfs
+@new_icf_path
+@click.argument("num_partitions", type=click.IntRange(min=1))
+@force
+@column_chunk_size
 @verbose
-
+@worker_processes
+def dexplode_init(
+    vcfs, icf_path, num_partitions, force, column_chunk_size, verbose, worker_processes
+):
     """
-
+    Initial step for distributed conversion of VCF(s) to intermediate columnar format
+    over the requested number of paritions.
     """
     setup_logging(verbose)
-
+    check_overwrite_dir(icf_path, force)
+    num_partitions = vcf.explode_init(
+        icf_path,
+        vcfs,
+        target_num_partitions=num_partitions,
+        column_chunk_size=column_chunk_size,
+        worker_processes=worker_processes,
+        show_progress=True,
+    )
+    click.echo(num_partitions)
+
+
+@click.command
+@icf_path
+@click.argument("partition", type=click.IntRange(min=0))
+@verbose
+def dexplode_partition(icf_path, partition, verbose):
+    """
+    Convert a VCF partition to intermediate columnar format. Must be called *after*
+    the ICF path has been initialised with dexplode_init. Partition indexes must be
+    from 0 (inclusive) to the number of paritions returned by dexplode_init (exclusive).
+    """
+    setup_logging(verbose)
+    vcf.explode_partition(icf_path, partition, show_progress=True)
+
+
+@click.command
+@click.argument("path", type=click.Path(), required=True)
+@verbose
+def dexplode_finalise(path, verbose):
+    """
+    Final step for distributed conversion of VCF(s) to intermediate columnar format.
+    """
+    setup_logging(verbose)
+    vcf.explode_finalise(path)
+
+
+@click.command
+@click.argument("path", type=click.Path())
+@verbose
+def inspect(path, verbose):
+    """
+    Inspect an intermediate columnar format or Zarr path.
+    """
+    setup_logging(verbose)
+    data = vcf.inspect(path)
     click.echo(tabulate.tabulate(data, headers="keys"))
 
 
 @click.command
-@
-def mkschema(
+@icf_path
+def mkschema(icf_path):
     """
     Generate a schema for zarr encoding
     """
     stream = click.get_text_stream("stdout")
-    vcf.mkschema(
+    vcf.mkschema(icf_path, stream)
 
 
 @click.command
-@
-@
+@icf_path
+@new_zarr_path
+@force
 @verbose
 @click.option("-s", "--schema", default=None, type=click.Path(exists=True))
-@
-@
+@variants_chunk_size
+@samples_chunk_size
 @click.option(
     "-V",
     "--max-variant-chunks",
@@ -109,90 +229,147 @@ def mkschema(if_path):
         "schema tuning."
     ),
 )
+@click.option(
+    "-M",
+    "--max-memory",
+    type=int,
+    default=None,
+    help="An approximate bound on overall memory usage in megabytes",
+)
 @worker_processes
 def encode(
-
+    icf_path,
     zarr_path,
+    force,
     verbose,
     schema,
-
-
+    variants_chunk_size,
+    samples_chunk_size,
     max_variant_chunks,
+    max_memory,
     worker_processes,
 ):
     """
-    Encode intermediate format (see explode) to vcfzarr
+    Encode intermediate columnar format (see explode) to vcfzarr.
     """
     setup_logging(verbose)
+    check_overwrite_dir(zarr_path, force)
     vcf.encode(
-
+        icf_path,
         zarr_path,
-        schema,
-
-
+        schema_path=schema,
+        variants_chunk_size=variants_chunk_size,
+        samples_chunk_size=samples_chunk_size,
         max_v_chunks=max_variant_chunks,
         worker_processes=worker_processes,
+        max_memory=max_memory,
         show_progress=True,
     )
 
 
 @click.command(name="convert")
-@
-@
-@
-@
+@vcfs
+@new_zarr_path
+@variants_chunk_size
+@samples_chunk_size
 @verbose
 @worker_processes
-def convert_vcf(
+def convert_vcf(
+    vcfs, zarr_path, variants_chunk_size, samples_chunk_size, verbose, worker_processes
+):
     """
-    Convert input VCF(s) directly to vcfzarr (not recommended for large files)
+    Convert input VCF(s) directly to vcfzarr (not recommended for large files).
     """
     setup_logging(verbose)
     vcf.convert(
         vcfs,
-
-
-
+        zarr_path,
+        variants_chunk_size=variants_chunk_size,
+        samples_chunk_size=samples_chunk_size,
         show_progress=True,
         worker_processes=worker_processes,
     )
 
 
-@
-@click.
-
-def validate(vcfs, out_path):
-    """
-    Development only, do not use. Will be removed before release.
+@version
+@click.group(cls=NaturalOrderGroup)
+def vcf2zarr():
     """
-
-    vcf.validate(vcfs[0], out_path, show_progress=True)
+    Convert VCF file(s) to the vcfzarr format.
 
+    The simplest usage is:
 
-
-
-
-
+    $ vcf2zarr convert [VCF_FILE] [ZARR_PATH]
+
+    This will convert the indexed VCF (or BCF) into the vcfzarr format in a single
+    step. As this writes the intermediate columnar format to a temporary directory,
+    we only recommend this approach for small files (< 1GB, say).
+
+    The recommended approach is to run the conversion in two passes, and
+    to keep the intermediate columnar format ("exploded") around to facilitate
+    experimentation with chunk sizes and compression settings:
+
+    \b
+    $ vcf2zarr explode [VCF_FILE_1] ... [VCF_FILE_N] [ICF_PATH]
+    $ vcf2zarr encode [ICF_PATH] [ZARR_PATH]
+
+    The inspect command provides a way to view contents of an exploded ICF
+    or Zarr:
+
+    $ vcf2zarr inspect [PATH]
+
+    This is useful when tweaking chunk sizes and compression settings to suit
+    your dataset, using the mkschema command and --schema option to encode:
+
+    \b
+    $ vcf2zarr mkschema [ICF_PATH] > schema.json
+    $ vcf2zarr encode [ICF_PATH] [ZARR_PATH] --schema schema.json
+
+    By editing the schema.json file you can drop columns that are not of interest
+    and edit column specific compression settings. The --max-variant-chunks option
+    to encode allows you to try out these options on small subsets, hopefully
+    arriving at settings with the desired balance of compression and query
+    performance.
+
+    ADVANCED USAGE
+
+    For very large datasets (terabyte scale) it may be necessary to distribute the
+    explode and encode steps across a cluster:
+
+    \b
+    $ vcf2zarr dexplode-init [VCF_FILE_1] ... [VCF_FILE_N] [ICF_PATH] [NUM_PARTITIONS]
+    $ vcf2zarr dexplode-partition [ICF_PATH] [PARTITION_INDEX]
+    $ vcf2zarr dexplode-finalise [ICF_PATH]
+
+    See the online documentation at [FIXME] for more details on distributed explode.
+    """
 
 
 # TODO figure out how to get click to list these in the given order.
-vcf2zarr.add_command(
+vcf2zarr.add_command(convert_vcf)
 vcf2zarr.add_command(inspect)
+vcf2zarr.add_command(explode)
 vcf2zarr.add_command(mkschema)
 vcf2zarr.add_command(encode)
-vcf2zarr.add_command(
-vcf2zarr.add_command(
+vcf2zarr.add_command(dexplode_init)
+vcf2zarr.add_command(dexplode_partition)
+vcf2zarr.add_command(dexplode_finalise)
 
 
 @click.command(name="convert")
 @click.argument("in_path", type=click.Path())
-@click.argument("
+@click.argument("zarr_path", type=click.Path())
 @worker_processes
 @verbose
-@
-@
+@variants_chunk_size
+@samples_chunk_size
 def convert_plink(
-    in_path,
+    in_path,
+    zarr_path,
+    verbose,
+    worker_processes,
+    variants_chunk_size,
+    samples_chunk_size,
 ):
     """
     In development; DO NOT USE!
@@ -200,11 +377,11 @@ def convert_plink(
     setup_logging(verbose)
     plink.convert(
         in_path,
-
+        zarr_path,
         show_progress=True,
         worker_processes=worker_processes,
-
-
+        samples_chunk_size=samples_chunk_size,
+        variants_chunk_size=variants_chunk_size,
     )
 
 
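
Editor's note: taken together, the cli.py changes split conversion into named ICF paths and explicit steps. For orientation, the two-pass shell workflow documented in the new `vcf2zarr` docstring corresponds to the library calls the commands wire up above. The keyword names below are taken from this diff; the file names and worker counts are illustrative, and this should be read as a sketch rather than a stable public API:

from bio2zarr import vcf

# Pass 1: explode the VCF(s) into the intermediate columnar format (ICF).
vcf.explode(
    ["sample.vcf.gz"],      # illustrative input; any indexed VCF/BCF paths
    "sample.icf",
    worker_processes=4,
    column_chunk_size=64,   # MiB; the CLI default above
    show_progress=True,
)

# Pass 2: encode the ICF to vcfzarr, optionally under a hand-edited schema.
vcf.encode(
    "sample.icf",
    "sample.zarr",
    schema_path=None,       # or a schema.json produced by mkschema
    worker_processes=4,
    show_progress=True,
)
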
bio2zarr/core.py
CHANGED
@@ -16,12 +16,6 @@ logger = logging.getLogger(__name__)
 
 numcodecs.blosc.use_threads = False
 
-# TODO this should probably go in another module where we abstract
-# out the zarr defaults
-default_compressor = numcodecs.Blosc(
-    cname="zstd", clevel=7, shuffle=numcodecs.Blosc.AUTOSHUFFLE
-)
-
 
 def chunk_aligned_slices(z, n, max_chunks=None):
     """
@@ -53,7 +47,12 @@ def wait_on_futures(futures):
     for future in cf.as_completed(futures):
         exception = future.exception()
         if exception is not None:
-
+            cancel_futures(futures)
+            if isinstance(exception, cf.process.BrokenProcessPool):
+                raise RuntimeError(
+                    "Worker process died: you may have run out of memory") from exception
+            else:
+                raise exception
 
 
 def cancel_futures(futures):
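
Editor's note: the `wait_on_futures` change is worth a moment. When a `ProcessPoolExecutor` worker dies (typically because the OS OOM-killer takes it), `concurrent.futures` surfaces this as an opaque `BrokenProcessPool` on every pending future; the diff rewraps it with actionable advice. A self-contained illustration of the same handling, mirroring the new core.py logic:

import concurrent.futures as cf
import concurrent.futures.process  # ensures cf.process.BrokenProcessPool resolves

def wait_all(futures):
    # Fail fast on the first exception, cancelling the remaining work,
    # and translate a dead-worker error into a human-readable hint.
    for future in cf.as_completed(futures):
        exception = future.exception()
        if exception is not None:
            for f in futures:
                f.cancel()
            if isinstance(exception, cf.process.BrokenProcessPool):
                raise RuntimeError(
                    "Worker process died: you may have run out of memory"
                ) from exception
            raise exception
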
|
|
74
73
|
assert offset % array.chunks[0] == 0
|
|
75
74
|
dims = list(array.shape)
|
|
76
75
|
dims[0] = min(array.chunks[0], array.shape[0])
|
|
77
|
-
self.buff = np.
|
|
76
|
+
self.buff = np.empty(dims, dtype=array.dtype)
|
|
77
|
+
# Explicitly Fill with zeros here to make any out-of-memory errors happen
|
|
78
|
+
# quickly.
|
|
79
|
+
self.buff[:] = 0
|
|
78
80
|
self.buffer_row = 0
|
|
79
81
|
|
|
80
82
|
@property
|
|
81
|
-
def
|
|
83
|
+
def variants_chunk_size(self):
|
|
82
84
|
return self.buff.shape[0]
|
|
83
85
|
|
|
84
86
|
def next_buffer_row(self):
|
|
85
|
-
if self.buffer_row == self.
|
|
87
|
+
if self.buffer_row == self.variants_chunk_size:
|
|
86
88
|
self.flush()
|
|
87
89
|
row = self.buffer_row
|
|
88
90
|
self.buffer_row += 1
|
|
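
Editor's note: the rationale for the explicit fill (per the in-diff comment) is that large numpy allocations are typically backed by lazily-zeroed pages, so an oversized buffer can appear to allocate successfully and only OOM much later, mid-encode, when rows are first written. Touching every element up front surfaces the failure at buffer creation instead. A standalone illustration, with sizes chosen purely for the example:

import numpy as np

# Allocation alone may "succeed" even if the pages cannot all be resident.
buff = np.empty((100_000, 10_000), dtype=np.int32)  # ~4 GB, illustrative
buff[:] = 0  # writing touches every page, so any OOM happens here, up front
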
@@ -104,13 +106,13 @@ class BufferedArray:
             f"{self.array_offset}:{self.array_offset + self.buffer_row}"
             f"{self.buff.nbytes / 2**20: .2f}Mb"
         )
-        self.array_offset += self.
+        self.array_offset += self.variants_chunk_size
         self.buffer_row = 0
 
 
 def sync_flush_1d_array(np_buffer, zarr_array, offset):
     zarr_array[offset : offset + np_buffer.shape[0]] = np_buffer
-    update_progress(
+    update_progress(np_buffer.nbytes)
 
 
 def sync_flush_2d_array(np_buffer, zarr_array, offset):
@@ -118,13 +120,16 @@ def sync_flush_2d_array(np_buffer, zarr_array, offset):
     # incremental, and to avoid large memcopies in the underlying
     # encoder implementations.
     s = slice(offset, offset + np_buffer.shape[0])
-
+    samples_chunk_size = zarr_array.chunks[1]
+    # TODO use zarr chunks here to support non-uniform chunking later
+    # and for simplicity
     zarr_array_width = zarr_array.shape[1]
     start = 0
     while start < zarr_array_width:
-        stop = min(start +
-
-
+        stop = min(start + samples_chunk_size, zarr_array_width)
+        chunk_buffer = np_buffer[:, start:stop]
+        zarr_array[s, start:stop] = chunk_buffer
+        update_progress(chunk_buffer.nbytes)
         start = stop
 
 
@@ -169,7 +174,7 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
         self.executor = cf.ProcessPoolExecutor(
             max_workers=worker_processes,
         )
-        self.futures =
+        self.futures = set()
 
         set_progress(0)
         if progress_config is None:
@@ -177,7 +182,7 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
         self.progress_config = progress_config
         self.progress_bar = tqdm.tqdm(
             total=progress_config.total,
-            desc=f"{progress_config.title:>
+            desc=f"{progress_config.title:>7}",
             unit_scale=True,
             unit=progress_config.units,
             smoothing=0.1,
@@ -208,7 +213,19 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
         logger.debug("Exit progress thread")
 
     def submit(self, *args, **kwargs):
-        self.
+        future = self.executor.submit(*args, **kwargs)
+        self.futures.add(future)
+        return future
+
+    def wait_for_completed(self, timeout=None):
+        done, not_done = cf.wait(self.futures, timeout, cf.FIRST_COMPLETED)
+        for future in done:
+            exception = future.exception()
+            # TODO do the check for BrokenProcessPool here
+            if exception is not None:
+                raise exception
+        self.futures = not_done
+        return done
 
     def results_as_completed(self):
         for future in cf.as_completed(self.futures):