bio2zarr 0.0.2__py3-none-any.whl → 0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of bio2zarr might be problematic. Click here for more details.
- bio2zarr/_version.py +2 -2
- bio2zarr/cli.py +115 -39
- bio2zarr/core.py +2 -1
- bio2zarr/vcf.py +83 -50
- {bio2zarr-0.0.2.dist-info → bio2zarr-0.0.4.dist-info}/METADATA +2 -2
- bio2zarr-0.0.4.dist-info/RECORD +16 -0
- bio2zarr-0.0.2.dist-info/RECORD +0 -16
- {bio2zarr-0.0.2.dist-info → bio2zarr-0.0.4.dist-info}/LICENSE +0 -0
- {bio2zarr-0.0.2.dist-info → bio2zarr-0.0.4.dist-info}/WHEEL +0 -0
- {bio2zarr-0.0.2.dist-info → bio2zarr-0.0.4.dist-info}/entry_points.txt +0 -0
- {bio2zarr-0.0.2.dist-info → bio2zarr-0.0.4.dist-info}/top_level.txt +0 -0
bio2zarr/_version.py
CHANGED
bio2zarr/cli.py
CHANGED
|
@@ -1,6 +1,12 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
import pathlib
|
|
4
|
+
import shutil
|
|
5
|
+
|
|
1
6
|
import click
|
|
2
7
|
import tabulate
|
|
3
8
|
import coloredlogs
|
|
9
|
+
import numcodecs
|
|
4
10
|
|
|
5
11
|
from . import vcf
|
|
6
12
|
from . import vcf_utils
|
|
@@ -8,6 +14,9 @@ from . import plink
|
|
|
8
14
|
from . import provenance
|
|
9
15
|
|
|
10
16
|
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
11
20
|
class NaturalOrderGroup(click.Group):
|
|
12
21
|
"""
|
|
13
22
|
List commands in the order they are provided in the help text.
|
|
@@ -18,8 +27,32 @@ class NaturalOrderGroup(click.Group):
|
|
|
18
27
|
|
|
19
28
|
|
|
20
29
|
# Common arguments/options
|
|
30
|
+
vcfs = click.argument(
|
|
31
|
+
"vcfs", nargs=-1, required=True, type=click.Path(exists=True, dir_okay=False)
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
new_icf_path = click.argument(
|
|
35
|
+
"icf_path", type=click.Path(file_okay=False, dir_okay=True)
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
icf_path = click.argument(
|
|
39
|
+
"icf_path", type=click.Path(exists=True, file_okay=False, dir_okay=True)
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
new_zarr_path = click.argument(
|
|
43
|
+
"zarr_path", type=click.Path(file_okay=False, dir_okay=True)
|
|
44
|
+
)
|
|
45
|
+
|
|
21
46
|
verbose = click.option("-v", "--verbose", count=True, help="Increase verbosity")
|
|
22
47
|
|
|
48
|
+
force = click.option(
|
|
49
|
+
"-f",
|
|
50
|
+
"--force",
|
|
51
|
+
is_flag=True,
|
|
52
|
+
flag_value=True,
|
|
53
|
+
help="Force overwriting of existing directories",
|
|
54
|
+
)
|
|
55
|
+
|
|
23
56
|
version = click.version_option(version=f"{provenance.__version__}")
|
|
24
57
|
|
|
25
58
|
worker_processes = click.option(
|
|
@@ -34,6 +67,17 @@ column_chunk_size = click.option(
|
|
|
34
67
|
help="Approximate uncompressed size of exploded column chunks in MiB",
|
|
35
68
|
)
|
|
36
69
|
|
|
70
|
+
# We could provide the full flexibility of numcodecs/Blosc here, but there
|
|
71
|
+
# doesn't seem much point. Can always add more arguments here to control
|
|
72
|
+
# compression level, etc.
|
|
73
|
+
compressor = click.option(
|
|
74
|
+
"-C",
|
|
75
|
+
"--compressor",
|
|
76
|
+
type=click.Choice(["lz4", "zstd"]),
|
|
77
|
+
default=None,
|
|
78
|
+
help="Codec to use for compressing column chunks (Default=zstd)."
|
|
79
|
+
)
|
|
80
|
+
|
|
37
81
|
# Note: -l and -w were chosen when these were called "width" and "length".
|
|
38
82
|
# possibly there are better letters now.
|
|
39
83
|
variants_chunk_size = click.option(
|
|
@@ -64,59 +108,101 @@ def setup_logging(verbosity):
|
|
|
64
108
|
coloredlogs.install(level=level)
|
|
65
109
|
|
|
66
110
|
|
|
111
|
+
def check_overwrite_dir(path, force):
|
|
112
|
+
path = pathlib.Path(path)
|
|
113
|
+
if path.exists():
|
|
114
|
+
if not force:
|
|
115
|
+
click.confirm(
|
|
116
|
+
f"Do you want to overwrite {path}? (use --force to skip this check)",
|
|
117
|
+
abort=True,
|
|
118
|
+
)
|
|
119
|
+
# These trees can be mondo-big and on slow file systems, so it's entirely
|
|
120
|
+
# feasible that the delete would fail or be killed. This makes it less likely
|
|
121
|
+
# that partially deleted paths are mistaken for good paths.
|
|
122
|
+
tmp_delete_path = path.with_suffix(f"{path.suffix}.{os.getpid()}.DELETING")
|
|
123
|
+
logger.info(f"Deleting {path} (renamed to {tmp_delete_path} while in progress)")
|
|
124
|
+
os.rename(path, tmp_delete_path)
|
|
125
|
+
shutil.rmtree(tmp_delete_path)
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def get_compressor(cname):
|
|
129
|
+
if cname is None:
|
|
130
|
+
return None
|
|
131
|
+
config = vcf.ICF_DEFAULT_COMPRESSOR.get_config()
|
|
132
|
+
config["cname"] = cname
|
|
133
|
+
return numcodecs.get_codec(config)
|
|
134
|
+
|
|
135
|
+
|
|
67
136
|
@click.command
|
|
68
|
-
@
|
|
69
|
-
@
|
|
137
|
+
@vcfs
|
|
138
|
+
@new_icf_path
|
|
139
|
+
@force
|
|
70
140
|
@verbose
|
|
71
|
-
@worker_processes
|
|
72
141
|
@column_chunk_size
|
|
73
|
-
|
|
142
|
+
@compressor
|
|
143
|
+
@worker_processes
|
|
144
|
+
def explode(
|
|
145
|
+
vcfs, icf_path, force, verbose, column_chunk_size, compressor, worker_processes
|
|
146
|
+
):
|
|
74
147
|
"""
|
|
75
148
|
Convert VCF(s) to intermediate columnar format
|
|
76
149
|
"""
|
|
77
150
|
setup_logging(verbose)
|
|
151
|
+
check_overwrite_dir(icf_path, force)
|
|
78
152
|
vcf.explode(
|
|
153
|
+
icf_path,
|
|
79
154
|
vcfs,
|
|
80
|
-
zarr_path,
|
|
81
155
|
worker_processes=worker_processes,
|
|
82
156
|
column_chunk_size=column_chunk_size,
|
|
157
|
+
compressor=get_compressor(compressor),
|
|
83
158
|
show_progress=True,
|
|
84
159
|
)
|
|
85
160
|
|
|
86
161
|
|
|
87
162
|
@click.command
|
|
88
|
-
@
|
|
89
|
-
@
|
|
90
|
-
@click.argument("num_partitions", type=
|
|
163
|
+
@vcfs
|
|
164
|
+
@new_icf_path
|
|
165
|
+
@click.argument("num_partitions", type=click.IntRange(min=1))
|
|
166
|
+
@force
|
|
91
167
|
@column_chunk_size
|
|
168
|
+
@compressor
|
|
92
169
|
@verbose
|
|
93
170
|
@worker_processes
|
|
94
171
|
def dexplode_init(
|
|
95
|
-
vcfs,
|
|
172
|
+
vcfs,
|
|
173
|
+
icf_path,
|
|
174
|
+
num_partitions,
|
|
175
|
+
force,
|
|
176
|
+
column_chunk_size,
|
|
177
|
+
compressor,
|
|
178
|
+
verbose,
|
|
179
|
+
worker_processes,
|
|
96
180
|
):
|
|
97
181
|
"""
|
|
98
|
-
Initial step for
|
|
182
|
+
Initial step for distributed conversion of VCF(s) to intermediate columnar format
|
|
99
183
|
over the requested number of partitions.
|
|
100
184
|
"""
|
|
101
185
|
setup_logging(verbose)
|
|
186
|
+
check_overwrite_dir(icf_path, force)
|
|
102
187
|
num_partitions = vcf.explode_init(
|
|
103
188
|
icf_path,
|
|
104
189
|
vcfs,
|
|
105
190
|
target_num_partitions=num_partitions,
|
|
106
191
|
column_chunk_size=column_chunk_size,
|
|
107
192
|
worker_processes=worker_processes,
|
|
193
|
+
compressor=get_compressor(compressor),
|
|
108
194
|
show_progress=True,
|
|
109
195
|
)
|
|
110
196
|
click.echo(num_partitions)
|
|
111
197
|
|
|
112
198
|
|
|
113
199
|
@click.command
|
|
114
|
-
@
|
|
115
|
-
@click.argument("partition", type=
|
|
200
|
+
@icf_path
|
|
201
|
+
@click.argument("partition", type=click.IntRange(min=0))
|
|
116
202
|
@verbose
|
|
117
203
|
def dexplode_partition(icf_path, partition, verbose):
|
|
118
204
|
"""
|
|
119
|
-
Convert a VCF partition
|
|
205
|
+
Convert a VCF partition to intermediate columnar format. Must be called *after*
|
|
120
206
|
the ICF path has been initialised with dexplode_init. Partition indexes must be
|
|
121
207
|
from 0 (inclusive) to the number of partitions returned by dexplode_init (exclusive).
|
|
122
208
|
"""
|
|
@@ -129,26 +215,26 @@ def dexplode_partition(icf_path, partition, verbose):
|
|
|
129
215
|
@verbose
|
|
130
216
|
def dexplode_finalise(path, verbose):
|
|
131
217
|
"""
|
|
132
|
-
Final step for
|
|
218
|
+
Final step for distributed conversion of VCF(s) to intermediate columnar format.
|
|
133
219
|
"""
|
|
134
220
|
setup_logging(verbose)
|
|
135
221
|
vcf.explode_finalise(path)
|
|
136
222
|
|
|
137
223
|
|
|
138
224
|
@click.command
|
|
139
|
-
@click.argument("
|
|
225
|
+
@click.argument("path", type=click.Path())
|
|
140
226
|
@verbose
|
|
141
|
-
def inspect(
|
|
227
|
+
def inspect(path, verbose):
|
|
142
228
|
"""
|
|
143
|
-
Inspect an intermediate format or Zarr path.
|
|
229
|
+
Inspect an intermediate columnar format or Zarr path.
|
|
144
230
|
"""
|
|
145
231
|
setup_logging(verbose)
|
|
146
|
-
data = vcf.inspect(
|
|
232
|
+
data = vcf.inspect(path)
|
|
147
233
|
click.echo(tabulate.tabulate(data, headers="keys"))
|
|
148
234
|
|
|
149
235
|
|
|
150
236
|
@click.command
|
|
151
|
-
@
|
|
237
|
+
@icf_path
|
|
152
238
|
def mkschema(icf_path):
|
|
153
239
|
"""
|
|
154
240
|
Generate a schema for zarr encoding
|
|
@@ -158,8 +244,9 @@ def mkschema(icf_path):
|
|
|
158
244
|
|
|
159
245
|
|
|
160
246
|
@click.command
|
|
161
|
-
@
|
|
162
|
-
@
|
|
247
|
+
@icf_path
|
|
248
|
+
@new_zarr_path
|
|
249
|
+
@force
|
|
163
250
|
@verbose
|
|
164
251
|
@click.option("-s", "--schema", default=None, type=click.Path(exists=True))
|
|
165
252
|
@variants_chunk_size
|
|
@@ -178,14 +265,14 @@ def mkschema(icf_path):
|
|
|
178
265
|
@click.option(
|
|
179
266
|
"-M",
|
|
180
267
|
"--max-memory",
|
|
181
|
-
type=int,
|
|
182
268
|
default=None,
|
|
183
|
-
help="An approximate bound on overall memory usage
|
|
269
|
+
help="An approximate bound on overall memory usage (e.g. 10G),",
|
|
184
270
|
)
|
|
185
271
|
@worker_processes
|
|
186
272
|
def encode(
|
|
187
273
|
icf_path,
|
|
188
274
|
zarr_path,
|
|
275
|
+
force,
|
|
189
276
|
verbose,
|
|
190
277
|
schema,
|
|
191
278
|
variants_chunk_size,
|
|
@@ -195,13 +282,14 @@ def encode(
|
|
|
195
282
|
worker_processes,
|
|
196
283
|
):
|
|
197
284
|
"""
|
|
198
|
-
|
|
285
|
+
Convert intermediate columnar format to vcfzarr.
|
|
199
286
|
"""
|
|
200
287
|
setup_logging(verbose)
|
|
288
|
+
check_overwrite_dir(zarr_path, force)
|
|
201
289
|
vcf.encode(
|
|
202
290
|
icf_path,
|
|
203
291
|
zarr_path,
|
|
204
|
-
schema,
|
|
292
|
+
schema_path=schema,
|
|
205
293
|
variants_chunk_size=variants_chunk_size,
|
|
206
294
|
samples_chunk_size=samples_chunk_size,
|
|
207
295
|
max_v_chunks=max_variant_chunks,
|
|
@@ -212,8 +300,8 @@ def encode(
|
|
|
212
300
|
|
|
213
301
|
|
|
214
302
|
@click.command(name="convert")
|
|
215
|
-
@
|
|
216
|
-
@
|
|
303
|
+
@vcfs
|
|
304
|
+
@new_zarr_path
|
|
217
305
|
@variants_chunk_size
|
|
218
306
|
@samples_chunk_size
|
|
219
307
|
@verbose
|
|
@@ -235,17 +323,6 @@ def convert_vcf(
|
|
|
235
323
|
)
|
|
236
324
|
|
|
237
325
|
|
|
238
|
-
@click.command
|
|
239
|
-
@click.argument("vcfs", nargs=-1, required=True)
|
|
240
|
-
@click.argument("zarr_path", type=click.Path())
|
|
241
|
-
def validate(vcfs, zarr_path):
|
|
242
|
-
"""
|
|
243
|
-
Development only, do not use. Will be removed before release.
|
|
244
|
-
"""
|
|
245
|
-
# FIXME! Will silently not look at remaining VCFs
|
|
246
|
-
vcf.validate(vcfs[0], zarr_path, show_progress=True)
|
|
247
|
-
|
|
248
|
-
|
|
249
326
|
@version
|
|
250
327
|
@click.group(cls=NaturalOrderGroup)
|
|
251
328
|
def vcf2zarr():
|
|
@@ -309,7 +386,6 @@ vcf2zarr.add_command(encode)
|
|
|
309
386
|
vcf2zarr.add_command(dexplode_init)
|
|
310
387
|
vcf2zarr.add_command(dexplode_partition)
|
|
311
388
|
vcf2zarr.add_command(dexplode_finalise)
|
|
312
|
-
vcf2zarr.add_command(validate)
|
|
313
389
|
|
|
314
390
|
|
|
315
391
|
@click.command(name="convert")
|
bio2zarr/core.py
CHANGED
|
@@ -50,7 +50,8 @@ def wait_on_futures(futures):
|
|
|
50
50
|
cancel_futures(futures)
|
|
51
51
|
if isinstance(exception, cf.process.BrokenProcessPool):
|
|
52
52
|
raise RuntimeError(
|
|
53
|
-
"Worker process died: you may have run out of memory"
|
|
53
|
+
"Worker process died: you may have run out of memory"
|
|
54
|
+
) from exception
|
|
54
55
|
else:
|
|
55
56
|
raise exception
|
|
56
57
|
|
bio2zarr/vcf.py
CHANGED
|
@@ -151,8 +151,8 @@ class VcfPartition:
|
|
|
151
151
|
|
|
152
152
|
ICF_METADATA_FORMAT_VERSION = "0.2"
|
|
153
153
|
ICF_DEFAULT_COMPRESSOR = numcodecs.Blosc(
|
|
154
|
-
cname="
|
|
155
|
-
)
|
|
154
|
+
cname="zstd", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE
|
|
155
|
+
)
|
|
156
156
|
|
|
157
157
|
|
|
158
158
|
@dataclasses.dataclass
|
|
@@ -284,9 +284,7 @@ def scan_vcf(path, target_num_partitions):
|
|
|
284
284
|
return metadata, vcf.raw_header
|
|
285
285
|
|
|
286
286
|
|
|
287
|
-
def scan_vcfs(
|
|
288
|
-
paths, show_progress, target_num_partitions, column_chunk_size, worker_processes=1
|
|
289
|
-
):
|
|
287
|
+
def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
|
|
290
288
|
logger.info(
|
|
291
289
|
f"Scanning {len(paths)} VCFs attempting to split into {target_num_partitions} partitions."
|
|
292
290
|
)
|
|
@@ -334,12 +332,6 @@ def scan_vcfs(
|
|
|
334
332
|
key=lambda x: (contig_index_map[x.region.contig], x.region.start)
|
|
335
333
|
)
|
|
336
334
|
icf_metadata.partitions = all_partitions
|
|
337
|
-
icf_metadata.format_version = ICF_METADATA_FORMAT_VERSION
|
|
338
|
-
icf_metadata.compressor = ICF_DEFAULT_COMPRESSOR
|
|
339
|
-
icf_metadata.column_chunk_size = column_chunk_size
|
|
340
|
-
# Bare minimum here for provenance - would be nice to include versions of key
|
|
341
|
-
# dependencies as well.
|
|
342
|
-
icf_metadata.provenance = {"source": f"bio2zarr-{provenance.__version__}"}
|
|
343
335
|
logger.info(f"Scan complete, resulting in {len(all_partitions)} partitions.")
|
|
344
336
|
return icf_metadata, header
|
|
345
337
|
|
|
@@ -824,13 +816,7 @@ class IcfPartitionWriter(contextlib.AbstractContextManager):
|
|
|
824
816
|
return False
|
|
825
817
|
|
|
826
818
|
|
|
827
|
-
# TODO rename to IntermediateColumnarFormat and move to icf.py
|
|
828
|
-
|
|
829
|
-
|
|
830
819
|
class IntermediateColumnarFormat(collections.abc.Mapping):
|
|
831
|
-
# TODO Check if other compressors would give reasonable compression
|
|
832
|
-
# with significantly faster times
|
|
833
|
-
|
|
834
820
|
def __init__(self, path):
|
|
835
821
|
self.path = pathlib.Path(path)
|
|
836
822
|
# TODO raise a more informative error here telling people this
|
|
@@ -904,6 +890,15 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
|
|
|
904
890
|
return len(self.columns)
|
|
905
891
|
|
|
906
892
|
|
|
893
|
+
|
|
894
|
+
def mkdir_with_progress(path):
|
|
895
|
+
logger.debug(f"mkdir f{path}")
|
|
896
|
+
# NOTE we may have race-conditions here, I'm not sure. Hopefully allowing
|
|
897
|
+
# parents=True will take care of it.
|
|
898
|
+
path.mkdir(parents=True)
|
|
899
|
+
core.update_progress(1)
|
|
900
|
+
|
|
901
|
+
|
|
907
902
|
class IntermediateColumnarFormatWriter:
|
|
908
903
|
def __init__(self, path):
|
|
909
904
|
self.path = pathlib.Path(path)
|
|
@@ -922,9 +917,12 @@ class IntermediateColumnarFormatWriter:
|
|
|
922
917
|
worker_processes=1,
|
|
923
918
|
target_num_partitions=None,
|
|
924
919
|
show_progress=False,
|
|
920
|
+
compressor=None,
|
|
925
921
|
):
|
|
926
922
|
if self.path.exists():
|
|
927
|
-
|
|
923
|
+
raise ValueError("ICF path already exists")
|
|
924
|
+
if compressor is None:
|
|
925
|
+
compressor = ICF_DEFAULT_COMPRESSOR
|
|
928
926
|
vcfs = [pathlib.Path(vcf) for vcf in vcfs]
|
|
929
927
|
target_num_partitions = max(target_num_partitions, len(vcfs))
|
|
930
928
|
|
|
@@ -934,14 +932,19 @@ class IntermediateColumnarFormatWriter:
|
|
|
934
932
|
worker_processes=worker_processes,
|
|
935
933
|
show_progress=show_progress,
|
|
936
934
|
target_num_partitions=target_num_partitions,
|
|
937
|
-
column_chunk_size=column_chunk_size,
|
|
938
935
|
)
|
|
939
936
|
self.metadata = icf_metadata
|
|
937
|
+
self.metadata.format_version = ICF_METADATA_FORMAT_VERSION
|
|
938
|
+
self.metadata.compressor = compressor.get_config()
|
|
939
|
+
self.metadata.column_chunk_size = column_chunk_size
|
|
940
|
+
# Bare minimum here for provenance - would be nice to include versions of key
|
|
941
|
+
# dependencies as well.
|
|
942
|
+
self.metadata.provenance = {"source": f"bio2zarr-{provenance.__version__}"}
|
|
940
943
|
|
|
941
|
-
self.mkdirs()
|
|
944
|
+
self.mkdirs(worker_processes, show_progress=show_progress)
|
|
942
945
|
|
|
943
946
|
# Note: this is needed for the current version of the vcfzarr spec, but it's
|
|
944
|
-
# probably
|
|
947
|
+
# probably going to be dropped.
|
|
945
948
|
# https://github.com/pystatgen/vcf-zarr-spec/issues/15
|
|
946
949
|
# May be useful to keep lying around still though?
|
|
947
950
|
logger.info(f"Writing VCF header")
|
|
@@ -953,20 +956,30 @@ class IntermediateColumnarFormatWriter:
|
|
|
953
956
|
json.dump(self.metadata.asdict(), f, indent=4)
|
|
954
957
|
return self.num_partitions
|
|
955
958
|
|
|
956
|
-
def mkdirs(self):
|
|
957
|
-
|
|
958
|
-
logger.info(
|
|
959
|
-
f"Creating {len(self.metadata.fields) * self.num_partitions} directories"
|
|
960
|
-
)
|
|
959
|
+
def mkdirs(self, worker_processes=1, show_progress=False):
|
|
960
|
+
num_dirs = len(self.metadata.fields) * self.num_partitions
|
|
961
|
+
logger.info(f"Creating {num_dirs} directories")
|
|
961
962
|
self.path.mkdir()
|
|
962
963
|
self.wip_path.mkdir()
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
964
|
+
# Due to high latency batch system filesystems, we create all the directories in
|
|
965
|
+
# parallel
|
|
966
|
+
progress_config = core.ProgressConfig(
|
|
967
|
+
total=num_dirs,
|
|
968
|
+
units="dirs",
|
|
969
|
+
title="Mkdirs",
|
|
970
|
+
show=show_progress,
|
|
971
|
+
)
|
|
972
|
+
with core.ParallelWorkManager(
|
|
973
|
+
worker_processes=worker_processes, progress_config=progress_config
|
|
974
|
+
) as manager:
|
|
975
|
+
for field in self.metadata.fields:
|
|
976
|
+
col_path = get_vcf_field_path(self.path, field)
|
|
977
|
+
# Don't bother trying to count the intermediate directories towards
|
|
978
|
+
# progress
|
|
979
|
+
manager.submit(col_path.mkdir, parents=True)
|
|
980
|
+
for j in range(self.num_partitions):
|
|
981
|
+
part_path = col_path / f"p{j}"
|
|
982
|
+
manager.submit(mkdir_with_progress, part_path)
|
|
970
983
|
|
|
971
984
|
def load_partition_summaries(self):
|
|
972
985
|
summaries = []
|
|
@@ -1133,12 +1146,13 @@ class IntermediateColumnarFormatWriter:
|
|
|
1133
1146
|
|
|
1134
1147
|
|
|
1135
1148
|
def explode(
|
|
1136
|
-
vcfs,
|
|
1137
1149
|
icf_path,
|
|
1150
|
+
vcfs,
|
|
1138
1151
|
*,
|
|
1139
1152
|
column_chunk_size=16,
|
|
1140
1153
|
worker_processes=1,
|
|
1141
1154
|
show_progress=False,
|
|
1155
|
+
compressor=None,
|
|
1142
1156
|
):
|
|
1143
1157
|
writer = IntermediateColumnarFormatWriter(icf_path)
|
|
1144
1158
|
num_partitions = writer.init(
|
|
@@ -1148,6 +1162,7 @@ def explode(
|
|
|
1148
1162
|
worker_processes=worker_processes,
|
|
1149
1163
|
show_progress=show_progress,
|
|
1150
1164
|
column_chunk_size=column_chunk_size,
|
|
1165
|
+
compressor=compressor,
|
|
1151
1166
|
)
|
|
1152
1167
|
writer.explode(worker_processes=worker_processes, show_progress=show_progress)
|
|
1153
1168
|
writer.finalise()
|
|
@@ -1162,6 +1177,7 @@ def explode_init(
|
|
|
1162
1177
|
target_num_partitions=1,
|
|
1163
1178
|
worker_processes=1,
|
|
1164
1179
|
show_progress=False,
|
|
1180
|
+
compressor=None,
|
|
1165
1181
|
):
|
|
1166
1182
|
writer = IntermediateColumnarFormatWriter(icf_path)
|
|
1167
1183
|
return writer.init(
|
|
@@ -1170,6 +1186,7 @@ def explode_init(
|
|
|
1170
1186
|
worker_processes=worker_processes,
|
|
1171
1187
|
show_progress=show_progress,
|
|
1172
1188
|
column_chunk_size=column_chunk_size,
|
|
1189
|
+
compressor=compressor,
|
|
1173
1190
|
)
|
|
1174
1191
|
|
|
1175
1192
|
|
|
@@ -1480,16 +1497,28 @@ class EncodingWork:
|
|
|
1480
1497
|
memory: int = 0
|
|
1481
1498
|
|
|
1482
1499
|
|
|
1500
|
+
def parse_max_memory(max_memory):
|
|
1501
|
+
if max_memory is None:
|
|
1502
|
+
# Effectively unbounded
|
|
1503
|
+
return 2**63
|
|
1504
|
+
if isinstance(max_memory, str):
|
|
1505
|
+
max_memory = humanfriendly.parse_size(max_memory)
|
|
1506
|
+
logger.info(f"Set memory budget to {display_size(max_memory)}")
|
|
1507
|
+
return max_memory
|
|
1508
|
+
|
|
1509
|
+
|
|
1483
1510
|
class VcfZarrWriter:
|
|
1484
|
-
def __init__(self, path, icf, schema):
|
|
1511
|
+
def __init__(self, path, icf, schema, dimension_separator=None):
|
|
1485
1512
|
self.path = pathlib.Path(path)
|
|
1486
1513
|
self.icf = icf
|
|
1487
1514
|
self.schema = schema
|
|
1515
|
+
# Default to using nested directories following the Zarr v3 default.
|
|
1516
|
+
# This seems to require Zarr version 2.17+ to work properly
|
|
1517
|
+
self.dimension_separator = "/" if dimension_separator is None else dimension_separator
|
|
1488
1518
|
store = zarr.DirectoryStore(self.path)
|
|
1489
1519
|
self.root = zarr.group(store=store)
|
|
1490
1520
|
|
|
1491
1521
|
def init_array(self, variable):
|
|
1492
|
-
# print("CREATE", variable)
|
|
1493
1522
|
object_codec = None
|
|
1494
1523
|
if variable.dtype == "O":
|
|
1495
1524
|
object_codec = numcodecs.VLenUTF8()
|
|
@@ -1501,7 +1530,9 @@ class VcfZarrWriter:
|
|
|
1501
1530
|
compressor=numcodecs.get_codec(variable.compressor),
|
|
1502
1531
|
filters=[numcodecs.get_codec(filt) for filt in variable.filters],
|
|
1503
1532
|
object_codec=object_codec,
|
|
1533
|
+
dimension_separator=self.dimension_separator,
|
|
1504
1534
|
)
|
|
1535
|
+
# Dimension names are part of the spec in Zarr v3
|
|
1505
1536
|
a.attrs["_ARRAY_DIMENSIONS"] = variable.dimensions
|
|
1506
1537
|
|
|
1507
1538
|
def get_array(self, name):
|
|
@@ -1639,6 +1670,7 @@ class VcfZarrWriter:
|
|
|
1639
1670
|
"contig_length",
|
|
1640
1671
|
self.schema.contig_length,
|
|
1641
1672
|
dtype=np.int64,
|
|
1673
|
+
compressor=DEFAULT_ZARR_COMPRESSOR,
|
|
1642
1674
|
)
|
|
1643
1675
|
array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
|
|
1644
1676
|
return {v: j for j, v in enumerate(self.schema.contig_id)}
|
|
@@ -1661,8 +1693,6 @@ class VcfZarrWriter:
|
|
|
1661
1693
|
self.init_array(column)
|
|
1662
1694
|
|
|
1663
1695
|
def finalise(self):
|
|
1664
|
-
# for column in self.schema.columns.values():
|
|
1665
|
-
# self.finalise_array(column)
|
|
1666
1696
|
zarr.consolidate_metadata(self.path)
|
|
1667
1697
|
|
|
1668
1698
|
def encode(
|
|
@@ -1672,12 +1702,7 @@ class VcfZarrWriter:
|
|
|
1672
1702
|
show_progress=False,
|
|
1673
1703
|
max_memory=None,
|
|
1674
1704
|
):
|
|
1675
|
-
|
|
1676
|
-
# Unbounded
|
|
1677
|
-
max_memory = 2**63
|
|
1678
|
-
else:
|
|
1679
|
-
# Value is specified in Mibibytes
|
|
1680
|
-
max_memory *= 2**20 # NEEDS TEST
|
|
1705
|
+
max_memory = parse_max_memory(max_memory)
|
|
1681
1706
|
|
|
1682
1707
|
# TODO this will move into the setup logic later when we're making it possible
|
|
1683
1708
|
# to split the work by slice
|
|
@@ -1764,8 +1789,8 @@ class VcfZarrWriter:
|
|
|
1764
1789
|
|
|
1765
1790
|
# Fail early if we can't fit a particular column into memory
|
|
1766
1791
|
for wp in work:
|
|
1767
|
-
if wp.memory
|
|
1768
|
-
raise ValueError(
|
|
1792
|
+
if wp.memory > max_memory:
|
|
1793
|
+
raise ValueError(
|
|
1769
1794
|
f"Insufficient memory for {wp.columns}: "
|
|
1770
1795
|
f"{display_size(wp.memory)} > {display_size(max_memory)}"
|
|
1771
1796
|
)
|
|
@@ -1778,6 +1803,8 @@ class VcfZarrWriter:
|
|
|
1778
1803
|
)
|
|
1779
1804
|
|
|
1780
1805
|
used_memory = 0
|
|
1806
|
+
# We need to keep some bounds on the queue size or the memory bounds algorithm
|
|
1807
|
+
# below doesn't really work.
|
|
1781
1808
|
max_queued = 4 * max(1, worker_processes)
|
|
1782
1809
|
encoded_slices = collections.Counter()
|
|
1783
1810
|
|
|
@@ -1804,10 +1831,14 @@ class VcfZarrWriter:
|
|
|
1804
1831
|
self.finalise_array(column)
|
|
1805
1832
|
|
|
1806
1833
|
for wp in work:
|
|
1807
|
-
|
|
1834
|
+
while (
|
|
1808
1835
|
used_memory + wp.memory > max_memory
|
|
1809
1836
|
or len(future_to_work) > max_queued
|
|
1810
1837
|
):
|
|
1838
|
+
logger.debug(
|
|
1839
|
+
f"Wait: mem_required={used_memory + wp.memory} max_mem={max_memory} "
|
|
1840
|
+
f"queued={len(future_to_work)} max_queued={max_queued}"
|
|
1841
|
+
)
|
|
1811
1842
|
service_completed_futures()
|
|
1812
1843
|
future = pwm.submit(wp.func, wp.start, wp.stop)
|
|
1813
1844
|
used_memory += wp.memory
|
|
@@ -1832,6 +1863,7 @@ def encode(
|
|
|
1832
1863
|
variants_chunk_size=None,
|
|
1833
1864
|
samples_chunk_size=None,
|
|
1834
1865
|
max_v_chunks=None,
|
|
1866
|
+
dimension_separator=None,
|
|
1835
1867
|
max_memory=None,
|
|
1836
1868
|
worker_processes=1,
|
|
1837
1869
|
show_progress=False,
|
|
@@ -1855,7 +1887,7 @@ def encode(
|
|
|
1855
1887
|
if zarr_path.exists():
|
|
1856
1888
|
logger.warning(f"Deleting existing {zarr_path}")
|
|
1857
1889
|
shutil.rmtree(zarr_path)
|
|
1858
|
-
vzw = VcfZarrWriter(zarr_path, icf, schema)
|
|
1890
|
+
vzw = VcfZarrWriter(zarr_path, icf, schema, dimension_separator=dimension_separator)
|
|
1859
1891
|
vzw.init()
|
|
1860
1892
|
vzw.encode(
|
|
1861
1893
|
max_v_chunks=max_v_chunks,
|
|
@@ -1876,10 +1908,11 @@ def convert(
|
|
|
1876
1908
|
show_progress=False,
|
|
1877
1909
|
# TODO add arguments to control location of tmpdir
|
|
1878
1910
|
):
|
|
1879
|
-
with tempfile.TemporaryDirectory(prefix="
|
|
1911
|
+
with tempfile.TemporaryDirectory(prefix="vcf2zarr") as tmp:
|
|
1912
|
+
if_dir = pathlib.Path(tmp) / "if"
|
|
1880
1913
|
explode(
|
|
1881
|
-
vcfs,
|
|
1882
1914
|
if_dir,
|
|
1915
|
+
vcfs,
|
|
1883
1916
|
worker_processes=worker_processes,
|
|
1884
1917
|
show_progress=show_progress,
|
|
1885
1918
|
)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: bio2zarr
|
|
3
|
-
Version: 0.0.
|
|
3
|
+
Version: 0.0.4
|
|
4
4
|
Summary: Convert bioinformatics data to Zarr
|
|
5
5
|
Home-page: https://github.com/pystatgen/bio2zarr
|
|
6
6
|
Author: sgkit Developers
|
|
@@ -20,7 +20,7 @@ Requires-Python: >=3.9
|
|
|
20
20
|
Description-Content-Type: text/x-rst
|
|
21
21
|
License-File: LICENSE
|
|
22
22
|
Requires-Dist: numpy
|
|
23
|
-
Requires-Dist: zarr
|
|
23
|
+
Requires-Dist: zarr >=2.17
|
|
24
24
|
Requires-Dist: click
|
|
25
25
|
Requires-Dist: tabulate
|
|
26
26
|
Requires-Dist: tqdm
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
bio2zarr/__init__.py,sha256=yIJYx4GyKtOLOtODOX0kGCeGPYgQ-TBbsRdT1NwBpQQ,37
|
|
2
|
+
bio2zarr/__main__.py,sha256=3cgaQ4x8YKXt-9xC2GLrHnS6UA38y1GXqttwZiBZJg4,525
|
|
3
|
+
bio2zarr/_version.py,sha256=yBVOKdXLEcTVc7YV7ZPqRXhRDRt-pKrfXxcgHkgPY5g,411
|
|
4
|
+
bio2zarr/cli.py,sha256=QE0DfoZHbBbxq9K_im9y4tJ49_Wss0zzavSjjz-85Xw,11484
|
|
5
|
+
bio2zarr/core.py,sha256=tZb9exfFmuzbA8tUpPY8avSm9YvfH31-vUCTM4fpj78,8128
|
|
6
|
+
bio2zarr/plink.py,sha256=llhfP-v44BVPvgCcwXktk0YrKaJSII63U_PTtpHlGtM,6755
|
|
7
|
+
bio2zarr/provenance.py,sha256=c_Z__QbWkLS0Rfa8D7LgEhtStng_zRMJX8comaDXIkw,142
|
|
8
|
+
bio2zarr/typing.py,sha256=wZ99Zzp5BD9Nqpd-S5bn38fSdPzfj6Z9IHPBfZqt9Gs,78
|
|
9
|
+
bio2zarr/vcf.py,sha256=MEskVTDq4QntzoawPz0sfmInV0aPkIPLXXNv7GmVcmY,73870
|
|
10
|
+
bio2zarr/vcf_utils.py,sha256=_kMZdpye15HGpniv8wwISw0L6NEEi54ZFaTcM83wLGs,16751
|
|
11
|
+
bio2zarr-0.0.4.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
12
|
+
bio2zarr-0.0.4.dist-info/METADATA,sha256=DISckjzZ0b6FpBTfBvpmJmEe00SIdTHyB3UTsTR8rws,1077
|
|
13
|
+
bio2zarr-0.0.4.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
14
|
+
bio2zarr-0.0.4.dist-info/entry_points.txt,sha256=pklStOdATE5hHJm4qiIvmhHkcn21Si_XAu6MC7ieNrk,131
|
|
15
|
+
bio2zarr-0.0.4.dist-info/top_level.txt,sha256=ouAvp3u9N25eKrQbN8BCDLPcWWQLhtlgdHKu8AtEj5Q,9
|
|
16
|
+
bio2zarr-0.0.4.dist-info/RECORD,,
|
bio2zarr-0.0.2.dist-info/RECORD
DELETED
|
@@ -1,16 +0,0 @@
|
|
|
1
|
-
bio2zarr/__init__.py,sha256=yIJYx4GyKtOLOtODOX0kGCeGPYgQ-TBbsRdT1NwBpQQ,37
|
|
2
|
-
bio2zarr/__main__.py,sha256=3cgaQ4x8YKXt-9xC2GLrHnS6UA38y1GXqttwZiBZJg4,525
|
|
3
|
-
bio2zarr/_version.py,sha256=NDHlyIcJZjLz8wKlmD1-pr6me5FHBAYwO_ynLG-37N8,411
|
|
4
|
-
bio2zarr/cli.py,sha256=rNgxpjIwpltEHj1NOpJtwLvGOA0etuxcqMXyNlPbCts,9882
|
|
5
|
-
bio2zarr/core.py,sha256=sBlWmHjcb7tAn_7WQRBdrbGcEd_lT_3HTQ_JbzomVMg,8111
|
|
6
|
-
bio2zarr/plink.py,sha256=llhfP-v44BVPvgCcwXktk0YrKaJSII63U_PTtpHlGtM,6755
|
|
7
|
-
bio2zarr/provenance.py,sha256=c_Z__QbWkLS0Rfa8D7LgEhtStng_zRMJX8comaDXIkw,142
|
|
8
|
-
bio2zarr/typing.py,sha256=wZ99Zzp5BD9Nqpd-S5bn38fSdPzfj6Z9IHPBfZqt9Gs,78
|
|
9
|
-
bio2zarr/vcf.py,sha256=g2TqH9Lbp4Ds8kjOnjvHvoMAgnG6Kx8pKPN1bqBKKIQ,72201
|
|
10
|
-
bio2zarr/vcf_utils.py,sha256=_kMZdpye15HGpniv8wwISw0L6NEEi54ZFaTcM83wLGs,16751
|
|
11
|
-
bio2zarr-0.0.2.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
12
|
-
bio2zarr-0.0.2.dist-info/METADATA,sha256=Uqirw85BARPHIZmkPJJKfWRKQgjhtQDDfH9wLJDoxj8,1106
|
|
13
|
-
bio2zarr-0.0.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
14
|
-
bio2zarr-0.0.2.dist-info/entry_points.txt,sha256=pklStOdATE5hHJm4qiIvmhHkcn21Si_XAu6MC7ieNrk,131
|
|
15
|
-
bio2zarr-0.0.2.dist-info/top_level.txt,sha256=ouAvp3u9N25eKrQbN8BCDLPcWWQLhtlgdHKu8AtEj5Q,9
|
|
16
|
-
bio2zarr-0.0.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|