bio2zarr 0.0.9__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of bio2zarr might be problematic. See the registry's advisory page for this release for more details.

bio2zarr/core.py CHANGED
@@ -1,13 +1,18 @@
1
1
  import concurrent.futures as cf
2
2
  import contextlib
3
3
  import dataclasses
4
+ import json
4
5
  import logging
6
+ import math
5
7
  import multiprocessing
6
8
  import os
7
9
  import os.path
10
+ import sys
8
11
  import threading
9
12
  import time
13
+ import warnings
10
14
 
15
+ import humanfriendly
11
16
  import numcodecs
12
17
  import numpy as np
13
18
  import tqdm
@@ -18,6 +23,17 @@ logger = logging.getLogger(__name__)
18
23
  numcodecs.blosc.use_threads = False
19
24
 
20
25
 
26
+ def display_number(x):
27
+ ret = "n/a"
28
+ if math.isfinite(x):
29
+ ret = f"{x: 0.2g}"
30
+ return ret
31
+
32
+
33
+ def display_size(n):
34
+ return humanfriendly.format_size(n, binary=True)
35
+
36
+
21
37
  def min_int_dtype(min_value, max_value):
22
38
  if min_value > max_value:
23
39
  raise ValueError("min_value must be <= max_value")
@@ -64,6 +80,11 @@ def du(path):
64
80
 
65
81
 
66
82
  class SynchronousExecutor(cf.Executor):
83
+ # Arguably we should use workers=0 as the default and use this
84
+ # executor implementation. However, the docs are fairly explicit
85
+ # about saying we shouldn't instantiate Future objects directly,
86
+ # so it's best to keep this as a semi-secret debugging interface
87
+ # for now.
67
88
  def submit(self, fn, /, *args, **kwargs):
68
89
  future = cf.Future()
69
90
  future.set_result(fn(*args, **kwargs))
@@ -128,7 +149,6 @@ class BufferedArray:
128
149
  sync_flush_2d_array(
129
150
  self.buff[: self.buffer_row], self.array, self.array_offset
130
151
  )
131
- # FIXME the array.name doesn't seem to be working here for some reason
132
152
  logger.debug(
133
153
  f"Flushed <{self.array.name} {self.array.shape} "
134
154
  f"{self.array.dtype}> "
@@ -174,12 +194,15 @@ class ProgressConfig:
174
194
  # progressable thing happening per source process. This is
175
195
  # probably fine in practise, but there could be corner cases
176
196
  # where it's not. Something to watch out for.
177
- _progress_counter = multiprocessing.Value("Q", 0)
197
+ _progress_counter = None
178
198
 
179
199
 
180
200
  def update_progress(inc):
181
- with _progress_counter.get_lock():
182
- _progress_counter.value += inc
201
+ # If the _progress_counter has not been set we are working in a
202
+ # synchronous non-progress tracking context
203
+ if _progress_counter is not None:
204
+ with _progress_counter.get_lock():
205
+ _progress_counter.value += inc
183
206
 
184
207
 
185
208
  def get_progress():
@@ -188,23 +211,48 @@ def get_progress():
188
211
  return val
189
212
 
190
213
 
191
- def set_progress(value):
192
- with _progress_counter.get_lock():
193
- _progress_counter.value = value
214
+ def setup_progress_counter(counter):
215
+ global _progress_counter
216
+ _progress_counter = counter
217
+
218
+
219
+ def warn_py39_mac():
220
+ if sys.platform == "darwin" and sys.version_info[:2] == (3, 9):
221
+ warnings.warn(
222
+ "There is a known issue with bio2zarr on MacOS Python 3.9 "
223
+ "in which OS-level named semaphores are leaked. "
224
+ "You will also probably see warnings like 'There appear to be N "
225
+ "leaked semaphore objects at shutdown'. "
226
+ "While this is likely harmless for a few runs, it could lead to "
227
+ "issues if you do a lot of conversion. To get prevent this issue "
228
+ "either: (1) use --worker-processes=0 or (2) upgrade to a newer "
229
+ "Python version. See https://github.com/sgkit-dev/bio2zarr/issues/209 "
230
+ "for more details.",
231
+ stacklevel=2,
232
+ )
194
233
 
195
234
 
196
235
  class ParallelWorkManager(contextlib.AbstractContextManager):
197
236
  def __init__(self, worker_processes=1, progress_config=None):
237
+ # Need to specify this explicitly to suppport Macs and
238
+ # for future proofing.
239
+ ctx = multiprocessing.get_context("spawn")
240
+ global _progress_counter
241
+ _progress_counter = ctx.Value("Q", 0)
198
242
  if worker_processes <= 0:
199
- # NOTE: this is only for testing, not for production use!
243
+ # NOTE: this is only for testing and debugging, not for
244
+ # production. See note on the SynchronousExecutor class.
200
245
  self.executor = SynchronousExecutor()
201
246
  else:
247
+ warn_py39_mac()
202
248
  self.executor = cf.ProcessPoolExecutor(
203
249
  max_workers=worker_processes,
250
+ mp_context=ctx,
251
+ initializer=setup_progress_counter,
252
+ initargs=(_progress_counter,),
204
253
  )
205
254
  self.futures = set()
206
255
 
207
- set_progress(0)
208
256
  if progress_config is None:
209
257
  progress_config = ProgressConfig()
210
258
  self.progress_config = progress_config
@@ -228,7 +276,6 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
228
276
  def _update_progress(self):
229
277
  current = get_progress()
230
278
  inc = current - self.progress_bar.n
231
- # print("UPDATE PROGRESS: current = ", current, self.progress_config.total, inc)
232
279
  self.progress_bar.update(inc)
233
280
 
234
281
  def _update_progress_worker(self):
@@ -245,16 +292,6 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
245
292
  self.futures.add(future)
246
293
  return future
247
294
 
248
- def wait_for_completed(self, timeout=None):
249
- done, not_done = cf.wait(self.futures, timeout, cf.FIRST_COMPLETED)
250
- for future in done:
251
- exception = future.exception()
252
- # TODO do the check for BrokenProcessPool here
253
- if exception is not None:
254
- raise exception
255
- self.futures = not_done
256
- return done
257
-
258
295
  def results_as_completed(self):
259
296
  for future in cf.as_completed(self.futures):
260
297
  yield future.result()
@@ -278,3 +315,11 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
278
315
  self._update_progress()
279
316
  self.progress_bar.close()
280
317
  return False
318
+
319
+
320
+ class JsonDataclass:
321
+ def asdict(self):
322
+ return dataclasses.asdict(self)
323
+
324
+ def asjson(self):
325
+ return json.dumps(self.asdict(), indent=4)
@@ -0,0 +1,38 @@
1
+ from .icf import (
2
+ IntermediateColumnarFormat,
3
+ explode,
4
+ explode_finalise,
5
+ explode_init,
6
+ explode_partition,
7
+ )
8
+ from .vcz import (
9
+ VcfZarrSchema,
10
+ convert,
11
+ encode,
12
+ encode_finalise,
13
+ encode_init,
14
+ encode_partition,
15
+ inspect,
16
+ mkschema,
17
+ )
18
+ from .verification import verify
19
+
20
+ # NOTE some of these aren't intended to be part of the external
21
+ # interface (like IntermediateColumnarFormat), but putting
22
+ # them into the list to keep the lint nagging under control
23
+ __all__ = [
24
+ "IntermediateColumnarFormat",
25
+ "explode",
26
+ "explode_finalise",
27
+ "explode_init",
28
+ "explode_partition",
29
+ "VcfZarrSchema",
30
+ "convert",
31
+ "encode",
32
+ "encode_finalise",
33
+ "encode_init",
34
+ "encode_partition",
35
+ "inspect",
36
+ "mkschema",
37
+ "verify",
38
+ ]