bio2zarr 0.0.3__tar.gz → 0.0.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of bio2zarr might be problematic. Click here for more details.

Files changed (45) hide show
  1. bio2zarr-0.0.6/.github/workflows/docs.yml +56 -0
  2. bio2zarr-0.0.6/.github/workflows/lint.yml +17 -0
  3. bio2zarr-0.0.6/.pre-commit-config.yaml +15 -0
  4. bio2zarr-0.0.6/CHANGELOG.md +32 -0
  5. {bio2zarr-0.0.3 → bio2zarr-0.0.6}/PKG-INFO +2 -2
  6. bio2zarr-0.0.6/bio2zarr/__init__.py +1 -0
  7. {bio2zarr-0.0.3 → bio2zarr-0.0.6}/bio2zarr/__main__.py +2 -0
  8. {bio2zarr-0.0.3 → bio2zarr-0.0.6}/bio2zarr/_version.py +2 -2
  9. {bio2zarr-0.0.3 → bio2zarr-0.0.6}/bio2zarr/cli.py +166 -37
  10. {bio2zarr-0.0.3 → bio2zarr-0.0.6}/bio2zarr/core.py +20 -10
  11. {bio2zarr-0.0.3 → bio2zarr-0.0.6}/bio2zarr/plink.py +6 -8
  12. {bio2zarr-0.0.3 → bio2zarr-0.0.6}/bio2zarr/typing.py +1 -1
  13. {bio2zarr-0.0.3 → bio2zarr-0.0.6}/bio2zarr/vcf.py +670 -381
  14. {bio2zarr-0.0.3 → bio2zarr-0.0.6}/bio2zarr/vcf_utils.py +26 -8
  15. {bio2zarr-0.0.3 → bio2zarr-0.0.6}/bio2zarr.egg-info/PKG-INFO +2 -2
  16. {bio2zarr-0.0.3 → bio2zarr-0.0.6}/bio2zarr.egg-info/SOURCES.txt +13 -0
  17. {bio2zarr-0.0.3 → bio2zarr-0.0.6}/bio2zarr.egg-info/requires.txt +1 -1
  18. bio2zarr-0.0.6/docs/Makefile +18 -0
  19. bio2zarr-0.0.6/docs/_config.yml +36 -0
  20. bio2zarr-0.0.6/docs/_toc.yml +4 -0
  21. bio2zarr-0.0.6/docs/build.sh +20 -0
  22. bio2zarr-0.0.6/docs/cli.md +10 -0
  23. bio2zarr-0.0.6/docs/intro.md +76 -0
  24. bio2zarr-0.0.6/docs/logo.png +0 -0
  25. bio2zarr-0.0.6/docs/references.bib +3 -0
  26. bio2zarr-0.0.6/docs/requirements.txt +11 -0
  27. bio2zarr-0.0.6/pyproject.toml +15 -0
  28. {bio2zarr-0.0.3 → bio2zarr-0.0.6}/requirements/development.txt +2 -1
  29. {bio2zarr-0.0.3 → bio2zarr-0.0.6}/setup.cfg +1 -1
  30. {bio2zarr-0.0.3 → bio2zarr-0.0.6}/validation.py +9 -4
  31. {bio2zarr-0.0.3 → bio2zarr-0.0.6}/vcf_generator.py +1 -0
  32. bio2zarr-0.0.3/CHANGELOG.md +0 -11
  33. bio2zarr-0.0.3/bio2zarr/__init__.py +0 -1
  34. {bio2zarr-0.0.3 → bio2zarr-0.0.6}/.gitignore +0 -0
  35. {bio2zarr-0.0.3 → bio2zarr-0.0.6}/LICENSE +0 -0
  36. {bio2zarr-0.0.3 → bio2zarr-0.0.6}/MANIFEST.in +0 -0
  37. {bio2zarr-0.0.3 → bio2zarr-0.0.6}/README.md +0 -0
  38. {bio2zarr-0.0.3 → bio2zarr-0.0.6}/bio2zarr/provenance.py +0 -0
  39. {bio2zarr-0.0.3 → bio2zarr-0.0.6}/bio2zarr.egg-info/dependency_links.txt +0 -0
  40. {bio2zarr-0.0.3 → bio2zarr-0.0.6}/bio2zarr.egg-info/entry_points.txt +0 -0
  41. {bio2zarr-0.0.3 → bio2zarr-0.0.6}/bio2zarr.egg-info/not-zip-safe +0 -0
  42. {bio2zarr-0.0.3 → bio2zarr-0.0.6}/bio2zarr.egg-info/top_level.txt +0 -0
  43. {bio2zarr-0.0.3 → bio2zarr-0.0.6}/setup.py +0 -0
  44. {bio2zarr-0.0.3 → bio2zarr-0.0.6}/validation-data/Makefile +0 -0
  45. {bio2zarr-0.0.3 → bio2zarr-0.0.6}/validation-data/split.sh +0 -0
@@ -0,0 +1,56 @@
1
+ name: Build Docs
2
+
3
+ on:
4
+ pull_request:
5
+ push:
6
+ branches: [main, test]
7
+ tags:
8
+ - '*'
9
+
10
+ jobs:
11
+ build-docs:
12
+ name: Docs
13
+ runs-on: ubuntu-latest
14
+ steps:
15
+ - name: Cancel Previous Runs
16
+ uses: styfle/cancel-workflow-action@0.12.1
17
+ with:
18
+ access_token: ${{ github.token }}
19
+
20
+ - uses: actions/checkout@v3
21
+
22
+ - uses: actions/setup-python@v4
23
+ with:
24
+ python-version: "3.11"
25
+ cache: 'pip'
26
+
27
+ - name: Create venv and install deps
28
+ run: |
29
+ pip install --upgrade pip wheel
30
+ pip install -r docs/requirements.txt
31
+
32
+ - name: Build Docs
33
+ run: |
34
+ make -C docs
35
+
36
+ - name: Upload Pages Artifact
37
+ uses: actions/upload-pages-artifact@v3
38
+ with:
39
+ path: docs/_build/html
40
+
41
+ deploy:
42
+ needs: build-docs
43
+ if: github.event_name != 'pull_request'
44
+ permissions:
45
+ pages: write
46
+ id-token: write
47
+
48
+ environment:
49
+ name: github-pages
50
+ url: ${{ steps.deployment.outputs.page_url }}
51
+
52
+ runs-on: ubuntu-latest
53
+ steps:
54
+ - name: Deploy to GitHub Pages
55
+ id: deployment
56
+ uses: actions/deploy-pages@v4
@@ -0,0 +1,17 @@
1
+ name: Lint
2
+
3
+ on:
4
+ pull_request:
5
+ push:
6
+ branches: [main, test]
7
+
8
+ jobs:
9
+ pre-commit:
10
+ name: Lint
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v3
14
+ - uses: actions/setup-python@v4
15
+ with:
16
+ python-version: '3.11'
17
+ - uses: pre-commit/action@v3.0.1
@@ -0,0 +1,15 @@
1
+ repos:
2
+ - repo: https://github.com/pre-commit/pre-commit-hooks
3
+ rev: v4.5.0
4
+ hooks:
5
+ - id: check-merge-conflict
6
+ - id: debug-statements
7
+ - id: mixed-line-ending
8
+ - id: check-case-conflict
9
+ - id: check-yaml
10
+ - repo: https://github.com/astral-sh/ruff-pre-commit
11
+ rev: v0.3.7
12
+ hooks:
13
+ - id: ruff
14
+ args: [ --fix ]
15
+ - id: ruff-format
@@ -0,0 +1,32 @@
1
+ # 0.0.6 2024-04-24
2
+
3
+ - Only use NOSHUFFLE by default on ``call_genotype`` and bool arrays.
4
+ - Add initial implementation of distributed encode
5
+
6
+ # 0.0.5 2024-04-17
7
+
8
+ - Fix bug in schema handling (compressor settings ignored)
9
+ - Move making ICF field partition directories into per-partition processing.
10
+ Remove progress on the init mkdirs step.
11
+ - Turn off progress monitor on dexplode-partition
12
+ - Fix empty partition bug
13
+
14
+ # 0.0.4 2024-04-08
15
+
16
+ - Fix bug in --max-memory handling, and change the argument to a string like 10G
17
+ - Add compressor choice in explode, switch default to zstd
18
+ - Run mkdirs in parallel and provide progress
19
+ - Change dimension separator to "/" in Zarr
20
+ - Update min Zarr version to 2.17
21
+
22
+ # 0.0.3 2024-03-28
23
+
24
+ - Various refinements to the CLI
25
+
26
+ # 0.0.2 2024-03-27
27
+
28
+ - Merged 1D and 2D encode steps into one, and change rate reporting to bytes
29
+ - Add --max-memory for encode
30
+ - Change `chunk_width` to `samples_chunk_size` and `chunk_length` to `variants_chunk_size`
31
+ - Various updates to the intermediate chunked format, with breaking change to version 0.2
32
+ - Add distributed explode commands
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: bio2zarr
3
- Version: 0.0.3
3
+ Version: 0.0.6
4
4
  Summary: Convert bioinformatics data to Zarr
5
5
  Home-page: https://github.com/pystatgen/bio2zarr
6
6
  Author: sgkit Developers
@@ -20,7 +20,7 @@ Requires-Python: >=3.9
20
20
  Description-Content-Type: text/x-rst
21
21
  License-File: LICENSE
22
22
  Requires-Dist: numpy
23
- Requires-Dist: zarr!=2.11.0,!=2.11.1,!=2.11.2,>=2.10.0
23
+ Requires-Dist: zarr>=2.17
24
24
  Requires-Dist: click
25
25
  Requires-Dist: tabulate
26
26
  Requires-Dist: tqdm
@@ -0,0 +1 @@
1
+ from .provenance import __version__ # noqa F401
@@ -2,11 +2,13 @@ import click
2
2
 
3
3
  from . import cli
4
4
 
5
+
5
6
  @cli.version
6
7
  @click.group()
7
8
  def bio2zarr():
8
9
  pass
9
10
 
11
+
10
12
  # Provide a single top-level interface to all of the functionality.
11
13
  # This probably isn't the recommended way of interacting, as we
12
14
  # install individual commands as console scripts. However, this
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '0.0.3'
16
- __version_tuple__ = version_tuple = (0, 0, 3)
15
+ __version__ = version = '0.0.6'
16
+ __version_tuple__ = version_tuple = (0, 0, 6)
@@ -4,14 +4,12 @@ import pathlib
4
4
  import shutil
5
5
 
6
6
  import click
7
- import tabulate
8
7
  import coloredlogs
8
+ import humanfriendly
9
+ import numcodecs
10
+ import tabulate
9
11
 
10
- from . import vcf
11
- from . import vcf_utils
12
- from . import plink
13
- from . import provenance
14
-
12
+ from . import plink, provenance, vcf, vcf_utils
15
13
 
16
14
  logger = logging.getLogger(__name__)
17
15
 
@@ -42,6 +40,14 @@ new_zarr_path = click.argument(
42
40
  "zarr_path", type=click.Path(file_okay=False, dir_okay=True)
43
41
  )
44
42
 
43
+ zarr_path = click.argument(
44
+ "zarr_path", type=click.Path(exists=True, file_okay=False, dir_okay=True)
45
+ )
46
+
47
+ num_partitions = click.argument("num_partitions", type=click.IntRange(min=1))
48
+
49
+ partition = click.argument("partition", type=click.IntRange(min=0))
50
+
45
51
  verbose = click.option("-v", "--verbose", count=True, help="Increase verbosity")
46
52
 
47
53
  force = click.option(
@@ -66,6 +72,17 @@ column_chunk_size = click.option(
66
72
  help="Approximate uncompressed size of exploded column chunks in MiB",
67
73
  )
68
74
 
75
+ # We could provide the full flexibility of numcodecs/Blosc here, but there
76
+ # doesn't seem much point. Can always add more arguments here to control
77
+ # compression level, etc.
78
+ compressor = click.option(
79
+ "-C",
80
+ "--compressor",
81
+ type=click.Choice(["lz4", "zstd"]),
82
+ default=None,
83
+ help="Codec to use for compressing column chunks (Default=zstd).",
84
+ )
85
+
69
86
  # Note: -l and -w were chosen when these were called "width" and "length".
70
87
  # possibly there are better letters now.
71
88
  variants_chunk_size = click.option(
@@ -84,6 +101,27 @@ samples_chunk_size = click.option(
84
101
  help="Chunk size in the samples dimension",
85
102
  )
86
103
 
104
+ schema = click.option("-s", "--schema", default=None, type=click.Path(exists=True))
105
+
106
+ max_variant_chunks = click.option(
107
+ "-V",
108
+ "--max-variant-chunks",
109
+ type=int,
110
+ default=None,
111
+ help=(
112
+ "Truncate the output in the variants dimension to have "
113
+ "this number of chunks. Mainly intended to help with "
114
+ "schema tuning."
115
+ ),
116
+ )
117
+
118
+ max_memory = click.option(
119
+ "-M",
120
+ "--max-memory",
121
+ default=None,
122
+ help="An approximate bound on overall memory usage (e.g. 10G),",
123
+ )
124
+
87
125
 
88
126
  def setup_logging(verbosity):
89
127
  level = "WARNING"
@@ -113,24 +151,36 @@ def check_overwrite_dir(path, force):
113
151
  shutil.rmtree(tmp_delete_path)
114
152
 
115
153
 
154
+ def get_compressor(cname):
155
+ if cname is None:
156
+ return None
157
+ config = vcf.ICF_DEFAULT_COMPRESSOR.get_config()
158
+ config["cname"] = cname
159
+ return numcodecs.get_codec(config)
160
+
161
+
116
162
  @click.command
117
163
  @vcfs
118
164
  @new_icf_path
119
165
  @force
120
166
  @verbose
121
- @worker_processes
122
167
  @column_chunk_size
123
- def explode(vcfs, icf_path, force, verbose, worker_processes, column_chunk_size):
168
+ @compressor
169
+ @worker_processes
170
+ def explode(
171
+ vcfs, icf_path, force, verbose, column_chunk_size, compressor, worker_processes
172
+ ):
124
173
  """
125
174
  Convert VCF(s) to intermediate columnar format
126
175
  """
127
176
  setup_logging(verbose)
128
177
  check_overwrite_dir(icf_path, force)
129
178
  vcf.explode(
130
- vcfs,
131
179
  icf_path,
180
+ vcfs,
132
181
  worker_processes=worker_processes,
133
182
  column_chunk_size=column_chunk_size,
183
+ compressor=get_compressor(compressor),
134
184
  show_progress=True,
135
185
  )
136
186
 
@@ -138,13 +188,21 @@ def explode(vcfs, icf_path, force, verbose, worker_processes, column_chunk_size)
138
188
  @click.command
139
189
  @vcfs
140
190
  @new_icf_path
141
- @click.argument("num_partitions", type=click.IntRange(min=1))
191
+ @num_partitions
142
192
  @force
143
193
  @column_chunk_size
194
+ @compressor
144
195
  @verbose
145
196
  @worker_processes
146
197
  def dexplode_init(
147
- vcfs, icf_path, num_partitions, force, column_chunk_size, verbose, worker_processes
198
+ vcfs,
199
+ icf_path,
200
+ num_partitions,
201
+ force,
202
+ column_chunk_size,
203
+ compressor,
204
+ verbose,
205
+ worker_processes,
148
206
  ):
149
207
  """
150
208
  Initial step for distributed conversion of VCF(s) to intermediate columnar format
@@ -158,6 +216,7 @@ def dexplode_init(
158
216
  target_num_partitions=num_partitions,
159
217
  column_chunk_size=column_chunk_size,
160
218
  worker_processes=worker_processes,
219
+ compressor=get_compressor(compressor),
161
220
  show_progress=True,
162
221
  )
163
222
  click.echo(num_partitions)
@@ -165,7 +224,7 @@ def dexplode_init(
165
224
 
166
225
  @click.command
167
226
  @icf_path
168
- @click.argument("partition", type=click.IntRange(min=0))
227
+ @partition
169
228
  @verbose
170
229
  def dexplode_partition(icf_path, partition, verbose):
171
230
  """
@@ -174,18 +233,18 @@ def dexplode_partition(icf_path, partition, verbose):
174
233
  from 0 (inclusive) to the number of partitions returned by dexplode_init (exclusive).
175
234
  """
176
235
  setup_logging(verbose)
177
- vcf.explode_partition(icf_path, partition, show_progress=True)
236
+ vcf.explode_partition(icf_path, partition, show_progress=False)
178
237
 
179
238
 
180
239
  @click.command
181
- @click.argument("path", type=click.Path(), required=True)
240
+ @icf_path
182
241
  @verbose
183
- def dexplode_finalise(path, verbose):
242
+ def dexplode_finalise(icf_path, verbose):
184
243
  """
185
244
  Final step for distributed conversion of VCF(s) to intermediate columnar format.
186
245
  """
187
246
  setup_logging(verbose)
188
- vcf.explode_finalise(path)
247
+ vcf.explode_finalise(icf_path)
189
248
 
190
249
 
191
250
  @click.command
@@ -215,27 +274,11 @@ def mkschema(icf_path):
215
274
  @new_zarr_path
216
275
  @force
217
276
  @verbose
218
- @click.option("-s", "--schema", default=None, type=click.Path(exists=True))
277
+ @schema
219
278
  @variants_chunk_size
220
279
  @samples_chunk_size
221
- @click.option(
222
- "-V",
223
- "--max-variant-chunks",
224
- type=int,
225
- default=None,
226
- help=(
227
- "Truncate the output in the variants dimension to have "
228
- "this number of chunks. Mainly intended to help with "
229
- "schema tuning."
230
- ),
231
- )
232
- @click.option(
233
- "-M",
234
- "--max-memory",
235
- type=int,
236
- default=None,
237
- help="An approximate bound on overall memory usage in megabytes",
238
- )
280
+ @max_variant_chunks
281
+ @max_memory
239
282
  @worker_processes
240
283
  def encode(
241
284
  icf_path,
@@ -250,7 +293,7 @@ def encode(
250
293
  worker_processes,
251
294
  ):
252
295
  """
253
- Encode intermediate columnar format (see explode) to vcfzarr.
296
+ Convert intermediate columnar format to vcfzarr.
254
297
  """
255
298
  setup_logging(verbose)
256
299
  check_overwrite_dir(zarr_path, force)
@@ -260,13 +303,96 @@ def encode(
260
303
  schema_path=schema,
261
304
  variants_chunk_size=variants_chunk_size,
262
305
  samples_chunk_size=samples_chunk_size,
263
- max_v_chunks=max_variant_chunks,
306
+ max_variant_chunks=max_variant_chunks,
264
307
  worker_processes=worker_processes,
265
308
  max_memory=max_memory,
266
309
  show_progress=True,
267
310
  )
268
311
 
269
312
 
313
+ @click.command
314
+ @icf_path
315
+ @new_zarr_path
316
+ @num_partitions
317
+ @force
318
+ @schema
319
+ @variants_chunk_size
320
+ @samples_chunk_size
321
+ @max_variant_chunks
322
+ @verbose
323
+ def dencode_init(
324
+ icf_path,
325
+ zarr_path,
326
+ num_partitions,
327
+ force,
328
+ schema,
329
+ variants_chunk_size,
330
+ samples_chunk_size,
331
+ max_variant_chunks,
332
+ verbose,
333
+ ):
334
+ """
335
+ Initialise conversion of intermediate format to VCF Zarr. This will
336
+ set up the specified ZARR_PATH to perform this conversion over
337
+ NUM_PARTITIONS.
338
+
339
+ The output of this command is the actual number of partitions generated
340
+ (which may be less than the requested number, if there are not sufficient
341
+ chunks in the variants dimension) and a rough lower-bound on the amount
342
+ of memory required to encode a partition.
343
+
344
+ NOTE: the format of this output will likely change in subsequent releases;
345
+ it should not be considered machine-readable for now.
346
+ """
347
+ setup_logging(verbose)
348
+ check_overwrite_dir(zarr_path, force)
349
+ num_partitions, max_memory = vcf.encode_init(
350
+ icf_path,
351
+ zarr_path,
352
+ target_num_partitions=num_partitions,
353
+ schema_path=schema,
354
+ variants_chunk_size=variants_chunk_size,
355
+ samples_chunk_size=samples_chunk_size,
356
+ max_variant_chunks=max_variant_chunks,
357
+ show_progress=True,
358
+ )
359
+ formatted_size = humanfriendly.format_size(max_memory, binary=True)
360
+ # NOTE adding the size to the stdout here so that users can parse it
361
+ # and use in their submission scripts. This is a first pass, and
362
+ # will most likely change as we see what works and doesn't.
363
+ # NOTE we probably want to format this as a table, which lists
364
+ # some other properties, line by line
365
+ # NOTE This size number is also not quite enough, you need a bit of
366
+ # headroom with it (probably 10% or so). We should include this.
367
+ click.echo(f"{num_partitions}\t{formatted_size}")
368
+
369
+
370
+ @click.command
371
+ @zarr_path
372
+ @partition
373
+ @verbose
374
+ def dencode_partition(zarr_path, partition, verbose):
375
+ """
376
+ Convert a partition from intermediate columnar format to VCF Zarr.
377
+ Must be called *after* the Zarr path has been initialised with dencode_init.
378
+ Partition indexes must be from 0 (inclusive) to the number of partitions
379
+ returned by dencode_init (exclusive).
380
+ """
381
+ setup_logging(verbose)
382
+ vcf.encode_partition(zarr_path, partition)
383
+
384
+
385
+ @click.command
386
+ @zarr_path
387
+ @verbose
388
+ def dencode_finalise(zarr_path, verbose):
389
+ """
390
+ Final step for distributed conversion of ICF to VCF Zarr.
391
+ """
392
+ setup_logging(verbose)
393
+ vcf.encode_finalise(zarr_path, show_progress=True)
394
+
395
+
270
396
  @click.command(name="convert")
271
397
  @vcfs
272
398
  @new_zarr_path
@@ -354,6 +480,9 @@ vcf2zarr.add_command(encode)
354
480
  vcf2zarr.add_command(dexplode_init)
355
481
  vcf2zarr.add_command(dexplode_partition)
356
482
  vcf2zarr.add_command(dexplode_finalise)
483
+ vcf2zarr.add_command(dencode_init)
484
+ vcf2zarr.add_command(dencode_partition)
485
+ vcf2zarr.add_command(dencode_finalise)
357
486
 
358
487
 
359
488
  @click.command(name="convert")
@@ -1,22 +1,31 @@
1
- import dataclasses
2
- import contextlib
3
1
  import concurrent.futures as cf
2
+ import contextlib
3
+ import dataclasses
4
+ import logging
4
5
  import multiprocessing
5
6
  import threading
6
- import logging
7
7
  import time
8
8
 
9
- import zarr
9
+ import numcodecs
10
10
  import numpy as np
11
11
  import tqdm
12
- import numcodecs
13
-
12
+ import zarr
14
13
 
15
14
  logger = logging.getLogger(__name__)
16
15
 
17
16
  numcodecs.blosc.use_threads = False
18
17
 
19
18
 
19
+ def min_int_dtype(min_value, max_value):
20
+ if min_value > max_value:
21
+ raise ValueError("min_value must be <= max_value")
22
+ for a_dtype in ["i1", "i2", "i4", "i8"]:
23
+ info = np.iinfo(a_dtype)
24
+ if info.min <= min_value and max_value <= info.max:
25
+ return a_dtype
26
+ raise OverflowError("Integer cannot be represented")
27
+
28
+
20
29
  def chunk_aligned_slices(z, n, max_chunks=None):
21
30
  """
22
31
  Returns at n slices in the specified zarr array, aligned
@@ -50,7 +59,8 @@ def wait_on_futures(futures):
50
59
  cancel_futures(futures)
51
60
  if isinstance(exception, cf.process.BrokenProcessPool):
52
61
  raise RuntimeError(
53
- "Worker process died: you may have run out of memory") from exception
62
+ "Worker process died: you may have run out of memory"
63
+ ) from exception
54
64
  else:
55
65
  raise exception
56
66
 
@@ -100,6 +110,7 @@ class BufferedArray:
100
110
  sync_flush_2d_array(
101
111
  self.buff[: self.buffer_row], self.array, self.array_offset
102
112
  )
113
+ # FIXME the array.name doesn't seem to be working here for some reason
103
114
  logger.debug(
104
115
  f"Flushed <{self.array.name} {self.array.shape} "
105
116
  f"{self.array.dtype}> "
@@ -121,8 +132,7 @@ def sync_flush_2d_array(np_buffer, zarr_array, offset):
121
132
  # encoder implementations.
122
133
  s = slice(offset, offset + np_buffer.shape[0])
123
134
  samples_chunk_size = zarr_array.chunks[1]
124
- # TODO use zarr chunks here to support non-uniform chunking later
125
- # and for simplicity
135
+ # TODO use zarr chunks here for simplicity
126
136
  zarr_array_width = zarr_array.shape[1]
127
137
  start = 0
128
138
  while start < zarr_array_width:
@@ -182,7 +192,7 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
182
192
  self.progress_config = progress_config
183
193
  self.progress_bar = tqdm.tqdm(
184
194
  total=progress_config.total,
185
- desc=f"{progress_config.title:>7}",
195
+ desc=f"{progress_config.title:>8}",
186
196
  unit_scale=True,
187
197
  unit=progress_config.units,
188
198
  smoothing=0.1,
@@ -1,14 +1,13 @@
1
1
  import logging
2
2
 
3
+ import bed_reader
3
4
  import humanfriendly
5
+ import numcodecs
4
6
  import numpy as np
5
7
  import zarr
6
- import bed_reader
7
- import numcodecs
8
8
 
9
9
  from . import core
10
10
 
11
-
12
11
  logger = logging.getLogger(__name__)
13
12
 
14
13
 
@@ -24,7 +23,6 @@ def encode_genotypes_slice(bed_path, zarr_path, start, stop):
24
23
  gt_mask = core.BufferedArray(root["call_genotype_mask"], start)
25
24
  gt_phased = core.BufferedArray(root["call_genotype_phased"], start)
26
25
  variants_chunk_size = gt.array.chunks[0]
27
- n = gt.array.shape[1]
28
26
  assert start % variants_chunk_size == 0
29
27
 
30
28
  logger.debug(f"Reading slice {start}:{stop}")
@@ -96,7 +94,7 @@ def convert(
96
94
  chunks=(samples_chunk_size,),
97
95
  )
98
96
  a.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
99
- logger.debug(f"Encoded samples")
97
+ logger.debug("Encoded samples")
100
98
 
101
99
  # TODO encode these in slices - but read them in one go to avoid
102
100
  # fetching repeatedly from bim file
@@ -108,7 +106,7 @@ def convert(
108
106
  chunks=(variants_chunk_size,),
109
107
  )
110
108
  a.attrs["_ARRAY_DIMENSIONS"] = ["variants"]
111
- logger.debug(f"encoded variant_position")
109
+ logger.debug("encoded variant_position")
112
110
 
113
111
  alleles = np.stack([bed.allele_1, bed.allele_2], axis=1)
114
112
  a = root.array(
@@ -119,7 +117,7 @@ def convert(
119
117
  chunks=(variants_chunk_size,),
120
118
  )
121
119
  a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"]
122
- logger.debug(f"encoded variant_allele")
120
+ logger.debug("encoded variant_allele")
123
121
 
124
122
  # TODO remove this?
125
123
  a = root.empty(
@@ -201,4 +199,4 @@ def validate(bed_path, zarr_path):
201
199
  elif bed_call == 2:
202
200
  assert list(zarr_call) == [1, 1]
203
201
  else: # pragma no cover
204
- assert False
202
+ raise AssertionError(f"Unexpected bed call {bed_call}")
@@ -1,4 +1,4 @@
1
1
  from pathlib import Path
2
2
  from typing import Union
3
3
 
4
- PathType = Union[str, Path]
4
+ PathType = Union[str, Path]