bio2zarr 0.0.9__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of bio2zarr might be problematic. Click here for more details.
- bio2zarr/__main__.py +2 -2
- bio2zarr/_version.py +2 -2
- bio2zarr/cli.py +176 -113
- bio2zarr/constants.py +18 -0
- bio2zarr/core.py +65 -20
- bio2zarr/vcf2zarr/__init__.py +38 -0
- bio2zarr/vcf2zarr/icf.py +1221 -0
- bio2zarr/vcf2zarr/vcz.py +1053 -0
- bio2zarr/vcf2zarr/verification.py +230 -0
- bio2zarr/vcf_utils.py +11 -6
- {bio2zarr-0.0.9.dist-info → bio2zarr-0.1.0.dist-info}/METADATA +10 -123
- bio2zarr-0.1.0.dist-info/RECORD +20 -0
- bio2zarr-0.1.0.dist-info/entry_points.txt +3 -0
- bio2zarr/vcf.py +0 -2445
- bio2zarr-0.0.9.dist-info/RECORD +0 -16
- bio2zarr-0.0.9.dist-info/entry_points.txt +0 -4
- {bio2zarr-0.0.9.dist-info → bio2zarr-0.1.0.dist-info}/LICENSE +0 -0
- {bio2zarr-0.0.9.dist-info → bio2zarr-0.1.0.dist-info}/WHEEL +0 -0
- {bio2zarr-0.0.9.dist-info → bio2zarr-0.1.0.dist-info}/top_level.txt +0 -0
bio2zarr/core.py
CHANGED
|
@@ -1,13 +1,18 @@
|
|
|
1
1
|
import concurrent.futures as cf
|
|
2
2
|
import contextlib
|
|
3
3
|
import dataclasses
|
|
4
|
+
import json
|
|
4
5
|
import logging
|
|
6
|
+
import math
|
|
5
7
|
import multiprocessing
|
|
6
8
|
import os
|
|
7
9
|
import os.path
|
|
10
|
+
import sys
|
|
8
11
|
import threading
|
|
9
12
|
import time
|
|
13
|
+
import warnings
|
|
10
14
|
|
|
15
|
+
import humanfriendly
|
|
11
16
|
import numcodecs
|
|
12
17
|
import numpy as np
|
|
13
18
|
import tqdm
|
|
@@ -18,6 +23,17 @@ logger = logging.getLogger(__name__)
|
|
|
18
23
|
numcodecs.blosc.use_threads = False
|
|
19
24
|
|
|
20
25
|
|
|
26
|
+
def display_number(x):
    """Format a number compactly for display.

    Finite values are rendered with two significant digits in general
    ("g") notation, with a leading space reserved for the sign of
    non-negative values. Non-finite values (inf, -inf, nan) are shown
    as "n/a".
    """
    if not math.isfinite(x):
        return "n/a"
    return format(x, " 0.2g")
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def display_size(n):
    """Return the size n formatted as a human-readable string.

    Delegates to humanfriendly.format_size with binary=True.
    """
    formatted = humanfriendly.format_size(n, binary=True)
    return formatted
|
|
35
|
+
|
|
36
|
+
|
|
21
37
|
def min_int_dtype(min_value, max_value):
|
|
22
38
|
if min_value > max_value:
|
|
23
39
|
raise ValueError("min_value must be <= max_value")
|
|
@@ -64,6 +80,11 @@ def du(path):
|
|
|
64
80
|
|
|
65
81
|
|
|
66
82
|
class SynchronousExecutor(cf.Executor):
|
|
83
|
+
# Arguably we should use workers=0 as the default and use this
|
|
84
|
+
# executor implementation. However, the docs are fairly explicit
|
|
85
|
+
# about saying we shouldn't instantiate Future objects directly,
|
|
86
|
+
# so it's best to keep this as a semi-secret debugging interface
|
|
87
|
+
# for now.
|
|
67
88
|
def submit(self, fn, /, *args, **kwargs):
|
|
68
89
|
future = cf.Future()
|
|
69
90
|
future.set_result(fn(*args, **kwargs))
|
|
@@ -128,7 +149,6 @@ class BufferedArray:
|
|
|
128
149
|
sync_flush_2d_array(
|
|
129
150
|
self.buff[: self.buffer_row], self.array, self.array_offset
|
|
130
151
|
)
|
|
131
|
-
# FIXME the array.name doesn't seem to be working here for some reason
|
|
132
152
|
logger.debug(
|
|
133
153
|
f"Flushed <{self.array.name} {self.array.shape} "
|
|
134
154
|
f"{self.array.dtype}> "
|
|
@@ -174,12 +194,15 @@ class ProgressConfig:
|
|
|
174
194
|
# progressable thing happening per source process. This is
|
|
175
195
|
# probably fine in practice, but there could be corner cases
|
|
176
196
|
# where it's not. Something to watch out for.
|
|
177
|
-
_progress_counter =
|
|
197
|
+
_progress_counter = None
|
|
178
198
|
|
|
179
199
|
|
|
180
200
|
def update_progress(inc):
    """Add inc to the shared progress counter, if one has been set.

    When the module-level _progress_counter is None we are working in a
    synchronous, non-progress-tracking context and the call does nothing.
    """
    counter = _progress_counter
    if counter is None:
        return
    # Take the counter's lock so concurrent increments are not lost.
    with counter.get_lock():
        counter.value += inc
|
|
183
206
|
|
|
184
207
|
|
|
185
208
|
def get_progress():
|
|
@@ -188,23 +211,48 @@ def get_progress():
|
|
|
188
211
|
return val
|
|
189
212
|
|
|
190
213
|
|
|
191
|
-
def
|
|
192
|
-
|
|
193
|
-
|
|
214
|
+
def setup_progress_counter(counter):
    """Install counter as this process's module-level progress counter.

    Rebinds the global _progress_counter that update_progress consults,
    so later calls to update_progress increment the given counter.
    """
    global _progress_counter
    _progress_counter = counter
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def warn_py39_mac():
    """Warn about leaked named semaphores on macOS under Python 3.9.

    A warning is issued only when sys.platform is "darwin" and the
    interpreter is Python 3.9; on any other platform/version this is a
    no-op. stacklevel=2 attributes the warning to the caller rather
    than to this helper.
    """
    if sys.platform != "darwin" or sys.version_info[:2] != (3, 9):
        return
    warnings.warn(
        "There is a known issue with bio2zarr on MacOS Python 3.9 "
        "in which OS-level named semaphores are leaked. "
        "You will also probably see warnings like 'There appear to be N "
        "leaked semaphore objects at shutdown'. "
        "While this is likely harmless for a few runs, it could lead to "
        "issues if you do a lot of conversion. To prevent this issue "
        "either: (1) use --worker-processes=0 or (2) upgrade to a newer "
        "Python version. See https://github.com/sgkit-dev/bio2zarr/issues/209 "
        "for more details.",
        stacklevel=2,
    )
|
|
194
233
|
|
|
195
234
|
|
|
196
235
|
class ParallelWorkManager(contextlib.AbstractContextManager):
|
|
197
236
|
def __init__(self, worker_processes=1, progress_config=None):
|
|
237
|
+
# Need to specify this explicitly to support Macs and
|
|
238
|
+
# for future proofing.
|
|
239
|
+
ctx = multiprocessing.get_context("spawn")
|
|
240
|
+
global _progress_counter
|
|
241
|
+
_progress_counter = ctx.Value("Q", 0)
|
|
198
242
|
if worker_processes <= 0:
|
|
199
|
-
# NOTE: this is only for testing, not for
|
|
243
|
+
# NOTE: this is only for testing and debugging, not for
|
|
244
|
+
# production. See note on the SynchronousExecutor class.
|
|
200
245
|
self.executor = SynchronousExecutor()
|
|
201
246
|
else:
|
|
247
|
+
warn_py39_mac()
|
|
202
248
|
self.executor = cf.ProcessPoolExecutor(
|
|
203
249
|
max_workers=worker_processes,
|
|
250
|
+
mp_context=ctx,
|
|
251
|
+
initializer=setup_progress_counter,
|
|
252
|
+
initargs=(_progress_counter,),
|
|
204
253
|
)
|
|
205
254
|
self.futures = set()
|
|
206
255
|
|
|
207
|
-
set_progress(0)
|
|
208
256
|
if progress_config is None:
|
|
209
257
|
progress_config = ProgressConfig()
|
|
210
258
|
self.progress_config = progress_config
|
|
@@ -228,7 +276,6 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
|
|
|
228
276
|
def _update_progress(self):
|
|
229
277
|
current = get_progress()
|
|
230
278
|
inc = current - self.progress_bar.n
|
|
231
|
-
# print("UPDATE PROGRESS: current = ", current, self.progress_config.total, inc)
|
|
232
279
|
self.progress_bar.update(inc)
|
|
233
280
|
|
|
234
281
|
def _update_progress_worker(self):
|
|
@@ -245,16 +292,6 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
|
|
|
245
292
|
self.futures.add(future)
|
|
246
293
|
return future
|
|
247
294
|
|
|
248
|
-
def wait_for_completed(self, timeout=None):
|
|
249
|
-
done, not_done = cf.wait(self.futures, timeout, cf.FIRST_COMPLETED)
|
|
250
|
-
for future in done:
|
|
251
|
-
exception = future.exception()
|
|
252
|
-
# TODO do the check for BrokenProcessPool here
|
|
253
|
-
if exception is not None:
|
|
254
|
-
raise exception
|
|
255
|
-
self.futures = not_done
|
|
256
|
-
return done
|
|
257
|
-
|
|
258
295
|
def results_as_completed(self):
|
|
259
296
|
for future in cf.as_completed(self.futures):
|
|
260
297
|
yield future.result()
|
|
@@ -278,3 +315,11 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
|
|
|
278
315
|
self._update_progress()
|
|
279
316
|
self.progress_bar.close()
|
|
280
317
|
return False
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
class JsonDataclass:
    """Mixin adding dict and JSON export helpers to a dataclass.

    Both methods rely on dataclasses.asdict, so they are meant to be
    used on subclasses that are dataclasses.
    """

    def asdict(self):
        """Return this instance converted to a dict via dataclasses.asdict."""
        as_mapping = dataclasses.asdict(self)
        return as_mapping

    def asjson(self):
        """Return this instance serialised as JSON text, indented by 4."""
        payload = self.asdict()
        return json.dumps(payload, indent=4)
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
from .icf import (
|
|
2
|
+
IntermediateColumnarFormat,
|
|
3
|
+
explode,
|
|
4
|
+
explode_finalise,
|
|
5
|
+
explode_init,
|
|
6
|
+
explode_partition,
|
|
7
|
+
)
|
|
8
|
+
from .vcz import (
|
|
9
|
+
VcfZarrSchema,
|
|
10
|
+
convert,
|
|
11
|
+
encode,
|
|
12
|
+
encode_finalise,
|
|
13
|
+
encode_init,
|
|
14
|
+
encode_partition,
|
|
15
|
+
inspect,
|
|
16
|
+
mkschema,
|
|
17
|
+
)
|
|
18
|
+
from .verification import verify
|
|
19
|
+
|
|
20
|
+
# NOTE: not every name below is intended to be part of the external
# interface (IntermediateColumnarFormat, for example); they are all
# listed here to keep the lint nagging under control.
__all__ = [
    # Imported from .icf
    "IntermediateColumnarFormat",
    "explode",
    "explode_finalise",
    "explode_init",
    "explode_partition",
    # Imported from .vcz
    "VcfZarrSchema",
    "convert",
    "encode",
    "encode_finalise",
    "encode_init",
    "encode_partition",
    "inspect",
    "mkschema",
    # Imported from .verification
    "verify",
]
|