bio2zarr 0.0.1__py3-none-any.whl → 0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of bio2zarr might be problematic.
- bio2zarr/_version.py +2 -2
- bio2zarr/cli.py +189 -56
- bio2zarr/core.py +36 -19
- bio2zarr/plink.py +25 -19
- bio2zarr/vcf.py +704 -389
- bio2zarr/vcf_utils.py +0 -1
- {bio2zarr-0.0.1.dist-info → bio2zarr-0.0.2.dist-info}/METADATA +1 -1
- bio2zarr-0.0.2.dist-info/RECORD +16 -0
- {bio2zarr-0.0.1.dist-info → bio2zarr-0.0.2.dist-info}/WHEEL +1 -1
- bio2zarr-0.0.1.dist-info/RECORD +0 -16
- {bio2zarr-0.0.1.dist-info → bio2zarr-0.0.2.dist-info}/LICENSE +0 -0
- {bio2zarr-0.0.1.dist-info → bio2zarr-0.0.2.dist-info}/entry_points.txt +0 -0
- {bio2zarr-0.0.1.dist-info → bio2zarr-0.0.2.dist-info}/top_level.txt +0 -0
bio2zarr/_version.py
CHANGED
bio2zarr/cli.py
CHANGED
@@ -7,35 +7,52 @@ from . import vcf_utils
 from . import plink
 from . import provenance
 
+
+class NaturalOrderGroup(click.Group):
+    """
+    List commands in the order they are provided in the help text.
+    """
+
+    def list_commands(self, ctx):
+        return self.commands.keys()
+
+
 # Common arguments/options
 verbose = click.option("-v", "--verbose", count=True, help="Increase verbosity")
 
+version = click.version_option(version=f"{provenance.__version__}")
+
 worker_processes = click.option(
     "-p", "--worker-processes", type=int, default=1, help="Number of worker processes"
 )
 
-
-chunk_length = click.option(
+column_chunk_size = click.option(
+    "-c",
+    "--column-chunk-size",
+    type=int,
+    default=64,
+    help="Approximate uncompressed size of exploded column chunks in MiB",
+)
+
+# Note: -l and -w were chosen when these were called "width" and "length".
+# possibly there are better letters now.
+variants_chunk_size = click.option(
     "-l",
-    "--chunk-length",
+    "--variants-chunk-size",
     type=int,
     default=None,
     help="Chunk size in the variants dimension",
 )
 
-chunk_width = click.option(
+samples_chunk_size = click.option(
     "-w",
-    "--chunk-width",
+    "--samples-chunk-size",
     type=int,
     default=None,
     help="Chunk size in the samples dimension",
 )
 
-version = click.version_option(version=f"bio2zarr {provenance.__version__}")
 
-
-# Note: logging hasn't been implemented in the code at all, this is just
-# a first pass to try out some ways of doing things to see what works.
 def setup_logging(verbosity):
     level = "WARNING"
     if verbosity == 1:
@@ -43,26 +60,24 @@ def setup_logging(verbosity):
     elif verbosity >= 2:
         level = "DEBUG"
     # NOTE: I'm not that excited about coloredlogs, just trying it out
-    # as it is installed by cyvcf2 anyway.
-    # stuff doing on with threads and processes, to logs might not work
-    # so well anyway.
+    # as it is installed by cyvcf2 anyway.
     coloredlogs.install(level=level)
 
 
 @click.command
 @click.argument("vcfs", nargs=-1, required=True)
-@click.argument("out_path", type=click.Path())
+@click.argument("zarr_path", type=click.Path())
 @verbose
 @worker_processes
-@
-def explode(vcfs, out_path, verbose, worker_processes, column_chunk_size):
+@column_chunk_size
+def explode(vcfs, zarr_path, verbose, worker_processes, column_chunk_size):
     """
-    Convert VCF(s) to columnar
+    Convert VCF(s) to intermediate columnar format
     """
     setup_logging(verbose)
     vcf.explode(
         vcfs,
-        out_path,
+        zarr_path,
         worker_processes=worker_processes,
         column_chunk_size=column_chunk_size,
         show_progress=True,
@@ -70,34 +85,85 @@ def explode(vcfs, out_path, verbose, worker_processes, column_chunk_size):
 
 
 @click.command
-@click.argument("if_path", type=click.Path())
+@click.argument("vcfs", nargs=-1, required=True)
+@click.argument("icf_path", type=click.Path())
+@click.argument("num_partitions", type=int)
+@column_chunk_size
 @verbose
-def inspect(if_path, verbose):
+@worker_processes
+def dexplode_init(
+    vcfs, icf_path, num_partitions, column_chunk_size, verbose, worker_processes
+):
     """
-
+    Initial step for parallel conversion of VCF(s) to intermediate columnar format
+    over the requested number of paritions.
     """
     setup_logging(verbose)
-    data = vcf.inspect(if_path)
+    num_partitions = vcf.explode_init(
+        icf_path,
+        vcfs,
+        target_num_partitions=num_partitions,
+        column_chunk_size=column_chunk_size,
+        worker_processes=worker_processes,
+        show_progress=True,
+    )
+    click.echo(num_partitions)
+
+
+@click.command
+@click.argument("icf_path", type=click.Path())
+@click.argument("partition", type=int)
+@verbose
+def dexplode_partition(icf_path, partition, verbose):
+    """
+    Convert a VCF partition into intermediate columnar format. Must be called *after*
+    the ICF path has been initialised with dexplode_init. Partition indexes must be
+    from 0 (inclusive) to the number of paritions returned by dexplode_init (exclusive).
+    """
+    setup_logging(verbose)
+    vcf.explode_partition(icf_path, partition, show_progress=True)
+
+
+@click.command
+@click.argument("path", type=click.Path(), required=True)
+@verbose
+def dexplode_finalise(path, verbose):
+    """
+    Final step for parallel conversion of VCF(s) to intermediate columnar format
+    """
+    setup_logging(verbose)
+    vcf.explode_finalise(path)
+
+
+@click.command
+@click.argument("icf_path", type=click.Path())
+@verbose
+def inspect(icf_path, verbose):
+    """
+    Inspect an intermediate format or Zarr path.
+    """
+    setup_logging(verbose)
+    data = vcf.inspect(icf_path)
     click.echo(tabulate.tabulate(data, headers="keys"))
 
 
 @click.command
-@click.argument("if_path", type=click.Path())
-def mkschema(if_path):
+@click.argument("icf_path", type=click.Path())
+def mkschema(icf_path):
     """
     Generate a schema for zarr encoding
     """
     stream = click.get_text_stream("stdout")
-    vcf.mkschema(if_path, stream)
+    vcf.mkschema(icf_path, stream)
 
 
 @click.command
-@click.argument("if_path", type=click.Path())
+@click.argument("icf_path", type=click.Path())
 @click.argument("zarr_path", type=click.Path())
 @verbose
 @click.option("-s", "--schema", default=None, type=click.Path(exists=True))
-@chunk_length
-@chunk_width
+@variants_chunk_size
+@samples_chunk_size
 @click.option(
     "-V",
     "--max-variant-chunks",
@@ -109,50 +175,61 @@ def mkschema(if_path):
         "schema tuning."
     ),
 )
+@click.option(
+    "-M",
+    "--max-memory",
+    type=int,
+    default=None,
+    help="An approximate bound on overall memory usage in megabytes",
+)
 @worker_processes
 def encode(
-    if_path,
+    icf_path,
     zarr_path,
     verbose,
     schema,
-    chunk_length,
-    chunk_width,
+    variants_chunk_size,
+    samples_chunk_size,
     max_variant_chunks,
+    max_memory,
     worker_processes,
 ):
     """
-    Encode intermediate format (see explode) to vcfzarr
+    Encode intermediate columnar format (see explode) to vcfzarr.
     """
     setup_logging(verbose)
     vcf.encode(
-        if_path,
+        icf_path,
         zarr_path,
         schema,
-        chunk_length=chunk_length,
-        chunk_width=chunk_width,
+        variants_chunk_size=variants_chunk_size,
+        samples_chunk_size=samples_chunk_size,
        max_v_chunks=max_variant_chunks,
         worker_processes=worker_processes,
+        max_memory=max_memory,
         show_progress=True,
     )
 
 
 @click.command(name="convert")
 @click.argument("vcfs", nargs=-1, required=True)
-@click.argument("out_path", type=click.Path())
-@chunk_length
-@chunk_width
+@click.argument("zarr_path", type=click.Path())
+@variants_chunk_size
+@samples_chunk_size
 @verbose
 @worker_processes
-def convert_vcf(vcfs, out_path, chunk_length, chunk_width, verbose, worker_processes):
+def convert_vcf(
+    vcfs, zarr_path, variants_chunk_size, samples_chunk_size, verbose, worker_processes
+):
     """
-    Convert input VCF(s) directly to vcfzarr (not recommended for large files)
+    Convert input VCF(s) directly to vcfzarr (not recommended for large files).
     """
     setup_logging(verbose)
     vcf.convert(
         vcfs,
-        out_path,
-        chunk_length=chunk_length,
-        chunk_width=chunk_width,
+        zarr_path,
+        variants_chunk_size=variants_chunk_size,
+        samples_chunk_size=samples_chunk_size,
         show_progress=True,
         worker_processes=worker_processes,
     )
@@ -160,39 +237,95 @@ def convert_vcf(vcfs, out_path, chunk_length, chunk_width, verbose, worker_proce
 
 @click.command
 @click.argument("vcfs", nargs=-1, required=True)
-@click.argument("out_path", type=click.Path())
-def validate(vcfs, out_path):
+@click.argument("zarr_path", type=click.Path())
+def validate(vcfs, zarr_path):
     """
     Development only, do not use. Will be removed before release.
     """
     # FIXME! Will silently not look at remaining VCFs
-    vcf.validate(vcfs[0], out_path, show_progress=True)
+    vcf.validate(vcfs[0], zarr_path, show_progress=True)
 
 
 @version
-@click.group()
+@click.group(cls=NaturalOrderGroup)
 def vcf2zarr():
-    pass
+    """
+    Convert VCF file(s) to the vcfzarr format.
+
+    The simplest usage is:
+
+    $ vcf2zarr convert [VCF_FILE] [ZARR_PATH]
+
+    This will convert the indexed VCF (or BCF) into the vcfzarr format in a single
+    step. As this writes the intermediate columnar format to a temporary directory,
+    we only recommend this approach for small files (< 1GB, say).
+
+    The recommended approach is to run the conversion in two passes, and
+    to keep the intermediate columnar format ("exploded") around to facilitate
+    experimentation with chunk sizes and compression settings:
+
+    \b
+    $ vcf2zarr explode [VCF_FILE_1] ... [VCF_FILE_N] [ICF_PATH]
+    $ vcf2zarr encode [ICF_PATH] [ZARR_PATH]
+
+    The inspect command provides a way to view contents of an exploded ICF
+    or Zarr:
+
+    $ vcf2zarr inspect [PATH]
+
+    This is useful when tweaking chunk sizes and compression settings to suit
+    your dataset, using the mkschema command and --schema option to encode:
+
+    \b
+    $ vcf2zarr mkschema [ICF_PATH] > schema.json
+    $ vcf2zarr encode [ICF_PATH] [ZARR_PATH] --schema schema.json
+
+    By editing the schema.json file you can drop columns that are not of interest
+    and edit column specific compression settings. The --max-variant-chunks option
+    to encode allows you to try out these options on small subsets, hopefully
+    arriving at settings with the desired balance of compression and query
+    performance.
+
+    ADVANCED USAGE
+
+    For very large datasets (terabyte scale) it may be necessary to distribute the
+    explode and encode steps across a cluster:
+
+    \b
+    $ vcf2zarr dexplode-init [VCF_FILE_1] ... [VCF_FILE_N] [ICF_PATH] [NUM_PARTITIONS]
+    $ vcf2zarr dexplode-partition [ICF_PATH] [PARTITION_INDEX]
+    $ vcf2zarr dexplode-finalise [ICF_PATH]
+
+    See the online documentation at [FIXME] for more details on distributed explode.
+    """
 
 
 # TODO figure out how to get click to list these in the given order.
-vcf2zarr.add_command(explode)
+vcf2zarr.add_command(convert_vcf)
 vcf2zarr.add_command(inspect)
+vcf2zarr.add_command(explode)
 vcf2zarr.add_command(mkschema)
 vcf2zarr.add_command(encode)
-vcf2zarr.add_command(convert_vcf)
+vcf2zarr.add_command(dexplode_init)
+vcf2zarr.add_command(dexplode_partition)
+vcf2zarr.add_command(dexplode_finalise)
 vcf2zarr.add_command(validate)
 
 
 @click.command(name="convert")
 @click.argument("in_path", type=click.Path())
-@click.argument("out_path", type=click.Path())
+@click.argument("zarr_path", type=click.Path())
 @worker_processes
 @verbose
-@chunk_length
-@chunk_width
+@variants_chunk_size
+@samples_chunk_size
 def convert_plink(
-    in_path, out_path, verbose, worker_processes, chunk_length, chunk_width
+    in_path,
+    zarr_path,
+    verbose,
+    worker_processes,
+    variants_chunk_size,
+    samples_chunk_size,
 ):
     """
     In development; DO NOT USE!
@@ -200,11 +333,11 @@ def convert_plink(
     setup_logging(verbose)
     plink.convert(
         in_path,
-        out_path,
+        zarr_path,
         show_progress=True,
         worker_processes=worker_processes,
-        chunk_width=chunk_width,
-        chunk_length=chunk_length,
+        samples_chunk_size=samples_chunk_size,
+        variants_chunk_size=variants_chunk_size,
     )
 
 
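The NaturalOrderGroup added above uses click's extension point for help-text ordering: click.Group.list_commands returns command names sorted alphabetically by default, so overriding it to return self.commands.keys() (a dict, hence insertion-ordered) makes --help list commands in the order they were registered with add_command. A minimal standalone sketch of the same pattern, with hypothetical command names, not taken from the package:

import click


class NaturalOrderGroup(click.Group):
    """List commands in registration order rather than alphabetically."""

    def list_commands(self, ctx):
        # self.commands is a dict, which preserves insertion order.
        return self.commands.keys()


@click.group(cls=NaturalOrderGroup)
def cli():
    pass


@cli.command()
def explode():
    """Listed first in --help output."""


@cli.command()
def encode():
    """Listed second in --help output."""


if __name__ == "__main__":
    cli()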
bio2zarr/core.py
CHANGED
@@ -16,12 +16,6 @@ logger = logging.getLogger(__name__)
 
 numcodecs.blosc.use_threads = False
 
-# TODO this should probably go in another module where we abstract
-# out the zarr defaults
-default_compressor = numcodecs.Blosc(
-    cname="zstd", clevel=7, shuffle=numcodecs.Blosc.AUTOSHUFFLE
-)
-
 
 def chunk_aligned_slices(z, n, max_chunks=None):
     """
@@ -53,7 +47,12 @@ def wait_on_futures(futures):
     for future in cf.as_completed(futures):
         exception = future.exception()
         if exception is not None:
-            raise exception
+            cancel_futures(futures)
+            if isinstance(exception, cf.process.BrokenProcessPool):
+                raise RuntimeError(
+                    "Worker process died: you may have run out of memory") from exception
+            else:
+                raise exception
 
 
 def cancel_futures(futures):
@@ -74,15 +73,18 @@ class BufferedArray:
         assert offset % array.chunks[0] == 0
         dims = list(array.shape)
         dims[0] = min(array.chunks[0], array.shape[0])
-        self.buff = np.zeros(dims, dtype=array.dtype)
+        self.buff = np.empty(dims, dtype=array.dtype)
+        # Explicitly Fill with zeros here to make any out-of-memory errors happen
+        # quickly.
+        self.buff[:] = 0
         self.buffer_row = 0
 
     @property
-    def chunk_length(self):
+    def variants_chunk_size(self):
         return self.buff.shape[0]
 
     def next_buffer_row(self):
-        if self.buffer_row == self.chunk_length:
+        if self.buffer_row == self.variants_chunk_size:
             self.flush()
         row = self.buffer_row
         self.buffer_row += 1
@@ -104,13 +106,13 @@ class BufferedArray:
             f"{self.array_offset}:{self.array_offset + self.buffer_row}"
             f"{self.buff.nbytes / 2**20: .2f}Mb"
         )
-        self.array_offset += self.chunk_length
+        self.array_offset += self.variants_chunk_size
         self.buffer_row = 0
 
 
 def sync_flush_1d_array(np_buffer, zarr_array, offset):
     zarr_array[offset : offset + np_buffer.shape[0]] = np_buffer
-    update_progress(
+    update_progress(np_buffer.nbytes)
 
 
 def sync_flush_2d_array(np_buffer, zarr_array, offset):
@@ -118,13 +120,16 @@ def sync_flush_2d_array(np_buffer, zarr_array, offset):
     # incremental, and to avoid large memcopies in the underlying
     # encoder implementations.
     s = slice(offset, offset + np_buffer.shape[0])
-    chunk_width = zarr_array.chunks[1]
+    samples_chunk_size = zarr_array.chunks[1]
+    # TODO use zarr chunks here to support non-uniform chunking later
+    # and for simplicity
     zarr_array_width = zarr_array.shape[1]
     start = 0
     while start < zarr_array_width:
-        stop = min(start + chunk_width, zarr_array_width)
-        zarr_array[s, start:stop] = np_buffer[:, start:stop]
-        update_progress(
+        stop = min(start + samples_chunk_size, zarr_array_width)
+        chunk_buffer = np_buffer[:, start:stop]
+        zarr_array[s, start:stop] = chunk_buffer
+        update_progress(chunk_buffer.nbytes)
         start = stop
 
 
@@ -169,7 +174,7 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
         self.executor = cf.ProcessPoolExecutor(
             max_workers=worker_processes,
         )
-        self.futures = []
+        self.futures = set()
 
         set_progress(0)
         if progress_config is None:
@@ -177,7 +182,7 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
         self.progress_config = progress_config
         self.progress_bar = tqdm.tqdm(
             total=progress_config.total,
-            desc=f"{progress_config.title:>
+            desc=f"{progress_config.title:>7}",
             unit_scale=True,
             unit=progress_config.units,
             smoothing=0.1,
@@ -208,7 +213,19 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
         logger.debug("Exit progress thread")
 
     def submit(self, *args, **kwargs):
-        self.futures.append(self.executor.submit(*args, **kwargs))
+        future = self.executor.submit(*args, **kwargs)
+        self.futures.add(future)
+        return future
+
+    def wait_for_completed(self, timeout=None):
+        done, not_done = cf.wait(self.futures, timeout, cf.FIRST_COMPLETED)
+        for future in done:
+            exception = future.exception()
+            # TODO do the check for BrokenProcessPool here
+            if exception is not None:
+                raise exception
+        self.futures = not_done
+        return done
 
     def results_as_completed(self):
         for future in cf.as_completed(self.futures):
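The wait_on_futures change above translates a BrokenProcessPool into a RuntimeError with an actionable message: when a worker is killed (for example by the Linux OOM killer), concurrent.futures only reports that the pool is broken, so the hint about memory is attached at the point of failure and the remaining futures are cancelled. A runnable standalone sketch of the same pattern (work() is a hypothetical task, not from the package):

import concurrent.futures as cf
from concurrent.futures.process import BrokenProcessPool


def wait_on_futures(futures):
    for future in cf.as_completed(futures):
        exception = future.exception()
        if exception is not None:
            # One failure fails the batch: cancel work that hasn't started.
            for f in futures:
                f.cancel()
            if isinstance(exception, BrokenProcessPool):
                # A dead worker usually means the OS killed the process.
                raise RuntimeError(
                    "Worker process died: you may have run out of memory"
                ) from exception
            raise exception


def work(n):
    return n * n


if __name__ == "__main__":
    with cf.ProcessPoolExecutor(max_workers=2) as executor:
        futures = [executor.submit(work, i) for i in range(4)]
        wait_on_futures(futures)
        print(sorted(f.result() for f in futures))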
bio2zarr/plink.py
CHANGED
@@ -4,6 +4,7 @@ import humanfriendly
 import numpy as np
 import zarr
 import bed_reader
+import numcodecs
 
 from . import core
 
@@ -22,14 +23,14 @@ def encode_genotypes_slice(bed_path, zarr_path, start, stop):
     gt = core.BufferedArray(root["call_genotype"], start)
     gt_mask = core.BufferedArray(root["call_genotype_mask"], start)
     gt_phased = core.BufferedArray(root["call_genotype_phased"], start)
-    chunk_length = gt.array.chunks[0]
+    variants_chunk_size = gt.array.chunks[0]
     n = gt.array.shape[1]
-    assert start % chunk_length == 0
+    assert start % variants_chunk_size == 0
 
     logger.debug(f"Reading slice {start}:{stop}")
     chunk_start = start
     while chunk_start < stop:
-        chunk_stop = min(chunk_start + chunk_length, stop)
+        chunk_stop = min(chunk_start + variants_chunk_size, stop)
         logger.debug(f"Reading bed slice {chunk_start}:{chunk_stop}")
         bed_chunk = bed.read(slice(chunk_start, chunk_stop), dtype=np.int8).T
         logger.debug(f"Got bed slice {humanfriendly.format_size(bed_chunk.nbytes)}")
@@ -60,8 +61,8 @@ def convert(
     *,
     show_progress=False,
     worker_processes=1,
-    chunk_length=None,
-    chunk_width=None,
+    variants_chunk_size=None,
+    samples_chunk_size=None,
 ):
     bed = bed_reader.open_bed(bed_path, num_threads=1)
     n = bed.iid_count
@@ -69,25 +70,30 @@ def convert(
     logging.info(f"Scanned plink with {n} samples and {m} variants")
 
     # FIXME
-    if chunk_width is None:
-        chunk_width = 1000
-    if chunk_length is None:
-        chunk_length = 10_000
+    if samples_chunk_size is None:
+        samples_chunk_size = 1000
+    if variants_chunk_size is None:
+        variants_chunk_size = 10_000
 
     store = zarr.DirectoryStore(zarr_path)
     root = zarr.group(store=store, overwrite=True)
 
     ploidy = 2
     shape = [m, n]
-    chunks = [chunk_length, chunk_width]
+    chunks = [variants_chunk_size, samples_chunk_size]
     dimensions = ["variants", "samples"]
 
+    # TODO we should be reusing some logic from vcfzarr here on laying
+    # out the basic dataset, and using the schema generator. Currently
+    # we're not using the best Blosc settings for genotypes here.
+    default_compressor = numcodecs.Blosc(cname="zstd", clevel=7)
+
     a = root.array(
         "sample_id",
         bed.iid,
         dtype="str",
-        compressor=core.default_compressor,
-        chunks=(chunk_width,),
+        compressor=default_compressor,
+        chunks=(samples_chunk_size,),
     )
     a.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
     logger.debug(f"Encoded samples")
@@ -98,8 +104,8 @@ def convert(
         "variant_position",
         bed.bp_position,
         dtype=np.int32,
-        compressor=core.default_compressor,
-        chunks=(chunk_length,),
+        compressor=default_compressor,
+        chunks=(variants_chunk_size,),
     )
     a.attrs["_ARRAY_DIMENSIONS"] = ["variants"]
     logger.debug(f"encoded variant_position")
@@ -109,8 +115,8 @@ def convert(
         "variant_allele",
         alleles,
         dtype="str",
-        compressor=core.default_compressor,
-        chunks=(chunk_length,),
+        compressor=default_compressor,
+        chunks=(variants_chunk_size,),
     )
     a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"]
     logger.debug(f"encoded variant_allele")
@@ -121,7 +127,7 @@ def convert(
         dtype="bool",
         shape=list(shape),
         chunks=list(chunks),
-        compressor=core.default_compressor,
+        compressor=default_compressor,
     )
     a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
 
@@ -132,7 +138,7 @@ def convert(
         dtype="i1",
         shape=list(shape),
         chunks=list(chunks),
-        compressor=core.default_compressor,
+        compressor=default_compressor,
     )
     a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
 
@@ -141,7 +147,7 @@ def convert(
         dtype="bool",
         shape=list(shape),
         chunks=list(chunks),
-        compressor=core.default_compressor,
+        compressor=default_compressor,
    )
     a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
 
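The plink.py changes replace the shared core-level compressor with a locally defined Blosc zstd compressor and thread the renamed variants_chunk_size/samples_chunk_size through to every array. A small sketch of how these settings fit together, assuming zarr 2.x and numcodecs as used in this diff; the array name, shape, and chunk values below are illustrative, not taken from the package:

import numcodecs
import zarr

variants_chunk_size = 10_000
samples_chunk_size = 1_000
m, n = 25_000, 5_000  # hypothetical variant and sample counts

# Same settings as the default_compressor defined in plink.convert above.
default_compressor = numcodecs.Blosc(cname="zstd", clevel=7)

store = zarr.DirectoryStore("example.zarr")
root = zarr.group(store=store, overwrite=True)

# A 2-D array chunked independently in the variants and samples dimensions.
a = root.empty(
    "call_genotype_phased",
    dtype="bool",
    shape=(m, n),
    chunks=(variants_chunk_size, samples_chunk_size),
    compressor=default_compressor,
)
# Named dimensions, as convert() writes them, for xarray compatibility.
a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "samples"]
print(a.info)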