bio2zarr 0.0.3.tar.gz → 0.0.6.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bio2zarr-0.0.6/.github/workflows/docs.yml +56 -0
- bio2zarr-0.0.6/.github/workflows/lint.yml +17 -0
- bio2zarr-0.0.6/.pre-commit-config.yaml +15 -0
- bio2zarr-0.0.6/CHANGELOG.md +32 -0
- {bio2zarr-0.0.3 → bio2zarr-0.0.6}/PKG-INFO +2 -2
- bio2zarr-0.0.6/bio2zarr/__init__.py +1 -0
- {bio2zarr-0.0.3 → bio2zarr-0.0.6}/bio2zarr/__main__.py +2 -0
- {bio2zarr-0.0.3 → bio2zarr-0.0.6}/bio2zarr/_version.py +2 -2
- {bio2zarr-0.0.3 → bio2zarr-0.0.6}/bio2zarr/cli.py +166 -37
- {bio2zarr-0.0.3 → bio2zarr-0.0.6}/bio2zarr/core.py +20 -10
- {bio2zarr-0.0.3 → bio2zarr-0.0.6}/bio2zarr/plink.py +6 -8
- {bio2zarr-0.0.3 → bio2zarr-0.0.6}/bio2zarr/typing.py +1 -1
- {bio2zarr-0.0.3 → bio2zarr-0.0.6}/bio2zarr/vcf.py +670 -381
- {bio2zarr-0.0.3 → bio2zarr-0.0.6}/bio2zarr/vcf_utils.py +26 -8
- {bio2zarr-0.0.3 → bio2zarr-0.0.6}/bio2zarr.egg-info/PKG-INFO +2 -2
- {bio2zarr-0.0.3 → bio2zarr-0.0.6}/bio2zarr.egg-info/SOURCES.txt +13 -0
- {bio2zarr-0.0.3 → bio2zarr-0.0.6}/bio2zarr.egg-info/requires.txt +1 -1
- bio2zarr-0.0.6/docs/Makefile +18 -0
- bio2zarr-0.0.6/docs/_config.yml +36 -0
- bio2zarr-0.0.6/docs/_toc.yml +4 -0
- bio2zarr-0.0.6/docs/build.sh +20 -0
- bio2zarr-0.0.6/docs/cli.md +10 -0
- bio2zarr-0.0.6/docs/intro.md +76 -0
- bio2zarr-0.0.6/docs/logo.png +0 -0
- bio2zarr-0.0.6/docs/references.bib +3 -0
- bio2zarr-0.0.6/docs/requirements.txt +11 -0
- bio2zarr-0.0.6/pyproject.toml +15 -0
- {bio2zarr-0.0.3 → bio2zarr-0.0.6}/requirements/development.txt +2 -1
- {bio2zarr-0.0.3 → bio2zarr-0.0.6}/setup.cfg +1 -1
- {bio2zarr-0.0.3 → bio2zarr-0.0.6}/validation.py +9 -4
- {bio2zarr-0.0.3 → bio2zarr-0.0.6}/vcf_generator.py +1 -0
- bio2zarr-0.0.3/CHANGELOG.md +0 -11
- bio2zarr-0.0.3/bio2zarr/__init__.py +0 -1
- {bio2zarr-0.0.3 → bio2zarr-0.0.6}/.gitignore +0 -0
- {bio2zarr-0.0.3 → bio2zarr-0.0.6}/LICENSE +0 -0
- {bio2zarr-0.0.3 → bio2zarr-0.0.6}/MANIFEST.in +0 -0
- {bio2zarr-0.0.3 → bio2zarr-0.0.6}/README.md +0 -0
- {bio2zarr-0.0.3 → bio2zarr-0.0.6}/bio2zarr/provenance.py +0 -0
- {bio2zarr-0.0.3 → bio2zarr-0.0.6}/bio2zarr.egg-info/dependency_links.txt +0 -0
- {bio2zarr-0.0.3 → bio2zarr-0.0.6}/bio2zarr.egg-info/entry_points.txt +0 -0
- {bio2zarr-0.0.3 → bio2zarr-0.0.6}/bio2zarr.egg-info/not-zip-safe +0 -0
- {bio2zarr-0.0.3 → bio2zarr-0.0.6}/bio2zarr.egg-info/top_level.txt +0 -0
- {bio2zarr-0.0.3 → bio2zarr-0.0.6}/setup.py +0 -0
- {bio2zarr-0.0.3 → bio2zarr-0.0.6}/validation-data/Makefile +0 -0
- {bio2zarr-0.0.3 → bio2zarr-0.0.6}/validation-data/split.sh +0 -0
.github/workflows/docs.yml (new file):

```diff
@@ -0,0 +1,56 @@
+name: Build Docs
+
+on:
+  pull_request:
+  push:
+    branches: [main, test]
+    tags:
+      - '*'
+
+jobs:
+  build-docs:
+    name: Docs
+    runs-on: ubuntu-latest
+    steps:
+      - name: Cancel Previous Runs
+        uses: styfle/cancel-workflow-action@0.12.1
+        with:
+          access_token: ${{ github.token }}
+
+      - uses: actions/checkout@v3
+
+      - uses: actions/setup-python@v4
+        with:
+          python-version: "3.11"
+          cache: 'pip'
+
+      - name: Create venv and install deps
+        run: |
+          pip install --upgrade pip wheel
+          pip install -r docs/requirements.txt
+
+      - name: Build Docs
+        run: |
+          make -C docs
+
+      - name: Upload Pages Artifact
+        uses: actions/upload-pages-artifact@v3
+        with:
+          path: docs/_build/html
+
+  deploy:
+    needs: build-docs
+    if: github.event_name != 'pull_request'
+    permissions:
+      pages: write
+      id-token: write
+
+    environment:
+      name: github-pages
+      url: ${{ steps.deployment.outputs.page_url }}
+
+    runs-on: ubuntu-latest
+    steps:
+      - name: Deploy to GitHub Pages
+        id: deployment
+        uses: actions/deploy-pages@v4
```
.github/workflows/lint.yml (new file):

```diff
@@ -0,0 +1,17 @@
+name: Lint
+
+on:
+  pull_request:
+  push:
+    branches: [main, test]
+
+jobs:
+  pre-commit:
+    name: Lint
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.11'
+      - uses: pre-commit/action@v3.0.1
```
.pre-commit-config.yaml (new file):

```diff
@@ -0,0 +1,15 @@
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.5.0
+    hooks:
+      - id: check-merge-conflict
+      - id: debug-statements
+      - id: mixed-line-ending
+      - id: check-case-conflict
+      - id: check-yaml
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.3.7
+    hooks:
+      - id: ruff
+        args: [ --fix ]
+      - id: ruff-format
```
CHANGELOG.md (new file):

```diff
@@ -0,0 +1,32 @@
+# 0.0.6 2024-04-24
+
+- Only use NOSHUFFLE by default on ``call_genotype`` and bool arrays.
+- Add initial implementation of distributed encode
+
+# 0.0.5 2024-04-17
+
+- Fix bug in schema handling (compressor settings ignored)
+- Move making ICF field partition directories into per-partition processing.
+  Remove progress on the init mkdirs step.
+- Turn off progress monitor on dexplode-partition
+- Fix empty partition bug
+
+# 0.0.4 2024-04-08
+
+- Fix bug in --max-memory handling, and argument to a string like 10G
+- Add compressor choice in explode, switch default to zstd
+- Run mkdirs in parallel and provide progress
+- Change dimension separator to "/" in Zarr
+- Update min Zarr version to 2.17
+
+# 0.0.3 2024-03-28
+
+- Various refinements to the CLI
+
+# 0.0.2 2024-03-27
+
+- Merged 1D and 2D encode steps into one, and change rate reporting to bytes
+- Add --max-memory for encode
+- Change `chunk_width` to `samples_chunk_size` and `chunk_length` to `variants_chunk_size`
+- Various updates to the intermediate chunked format, with breaking change to version 0.2
+- Add distributed explode commands
```
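The NOSHUFFLE entry in 0.0.6 refers to Blosc's shuffle filters in numcodecs. As a hedged illustration (the per-array selection logic lives in `vcf.py` and is not shown in this diff; the `cname`/`clevel` values here are illustrative assumptions, not the package's exact defaults):

```python
import numcodecs

# Blosc shuffle modes; per the 0.0.6 entry, NOSHUFFLE is now the default
# only for call_genotype and bool arrays.
no_shuffle = numcodecs.Blosc(cname="zstd", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE)
bit_shuffle = numcodecs.Blosc(cname="zstd", clevel=7, shuffle=numcodecs.Blosc.BITSHUFFLE)
```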
PKG-INFO:

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: bio2zarr
-Version: 0.0.3
+Version: 0.0.6
 Summary: Convert bioinformatics data to Zarr
 Home-page: https://github.com/pystatgen/bio2zarr
 Author: sgkit Developers
@@ -20,7 +20,7 @@ Requires-Python: >=3.9
 Description-Content-Type: text/x-rst
 License-File: LICENSE
 Requires-Dist: numpy
-Requires-Dist: zarr
+Requires-Dist: zarr>=2.17
 Requires-Dist: click
 Requires-Dist: tabulate
 Requires-Dist: tqdm
```
bio2zarr/__init__.py:

```diff
@@ -0,0 +1 @@
+from .provenance import __version__  # noqa F401
```
bio2zarr/__main__.py:

```diff
@@ -2,11 +2,13 @@ import click
 
 from . import cli
 
+
 @cli.version
 @click.group()
 def bio2zarr():
     pass
 
+
 # Provide a single top-level interface to all of the functionality.
 # This probably isn't the recommended way of interacting, as we
 # install individual commands as console scripts. However, this
```
bio2zarr/cli.py:

```diff
@@ -4,14 +4,12 @@ import pathlib
 import shutil
 
 import click
-import tabulate
 import coloredlogs
+import humanfriendly
+import numcodecs
+import tabulate
 
-from . import vcf
-from . import vcf_utils
-from . import plink
-from . import provenance
-
+from . import plink, provenance, vcf, vcf_utils
 
 logger = logging.getLogger(__name__)
 
@@ -42,6 +40,14 @@ new_zarr_path = click.argument(
     "zarr_path", type=click.Path(file_okay=False, dir_okay=True)
 )
 
+zarr_path = click.argument(
+    "zarr_path", type=click.Path(exists=True, file_okay=False, dir_okay=True)
+)
+
+num_partitions = click.argument("num_partitions", type=click.IntRange(min=1))
+
+partition = click.argument("partition", type=click.IntRange(min=0))
+
 verbose = click.option("-v", "--verbose", count=True, help="Increase verbosity")
 
 force = click.option(
@@ -66,6 +72,17 @@ column_chunk_size = click.option(
     help="Approximate uncompressed size of exploded column chunks in MiB",
 )
 
+# We could provide the full flexiblity of numcodecs/Blosc here, but there
+# doesn't seem much point. Can always add more arguments here to control
+# compression level, etc.
+compressor = click.option(
+    "-C",
+    "--compressor",
+    type=click.Choice(["lz4", "zstd"]),
+    default=None,
+    help="Codec to use for compressing column chunks (Default=zstd).",
+)
+
 # Note: -l and -w were chosen when these were called "width" and "length".
 # possibly there are better letters now.
 variants_chunk_size = click.option(
@@ -84,6 +101,27 @@ samples_chunk_size = click.option(
     help="Chunk size in the samples dimension",
 )
 
+schema = click.option("-s", "--schema", default=None, type=click.Path(exists=True))
+
+max_variant_chunks = click.option(
+    "-V",
+    "--max-variant-chunks",
+    type=int,
+    default=None,
+    help=(
+        "Truncate the output in the variants dimension to have "
+        "this number of chunks. Mainly intended to help with "
+        "schema tuning."
+    ),
+)
+
+max_memory = click.option(
+    "-M",
+    "--max-memory",
+    default=None,
+    help="An approximate bound on overall memory usage (e.g. 10G),",
+)
+
 
 def setup_logging(verbosity):
     level = "WARNING"
```
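Note that `--max-memory` now takes a human-readable string rather than a megabyte integer (per the 0.0.4 changelog entry). A quick sketch of the humanfriendly calls involved; the exact parse site is in `vcf.py` and is an assumption here:

```python
import humanfriendly

# "10G" parses to a decimal byte count; format_size(..., binary=True) is the
# call dencode-init uses below to report the per-partition memory bound.
assert humanfriendly.parse_size("10G") == 10_000_000_000
assert humanfriendly.format_size(10 * 1024**3, binary=True) == "10 GiB"
```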
bio2zarr/cli.py (continued):

```diff
@@ -113,24 +151,36 @@ def check_overwrite_dir(path, force):
     shutil.rmtree(tmp_delete_path)
 
 
+def get_compressor(cname):
+    if cname is None:
+        return None
+    config = vcf.ICF_DEFAULT_COMPRESSOR.get_config()
+    config["cname"] = cname
+    return numcodecs.get_codec(config)
+
+
 @click.command
 @vcfs
 @new_icf_path
 @force
 @verbose
-@worker_processes
 @column_chunk_size
-def explode(vcfs, icf_path, force, verbose, worker_processes, column_chunk_size):
+@compressor
+@worker_processes
+def explode(
+    vcfs, icf_path, force, verbose, column_chunk_size, compressor, worker_processes
+):
     """
     Convert VCF(s) to intermediate columnar format
     """
     setup_logging(verbose)
     check_overwrite_dir(icf_path, force)
     vcf.explode(
-        vcfs,
         icf_path,
+        vcfs,
         worker_processes=worker_processes,
         column_chunk_size=column_chunk_size,
+        compressor=get_compressor(compressor),
         show_progress=True,
     )
 
```
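`get_compressor` copies the default ICF Blosc configuration and swaps in the requested codec name. A hedged sketch of the mechanics, using a stand-in for `vcf.ICF_DEFAULT_COMPRESSOR` (the real default is a zstd Blosc codec per the 0.0.4 changelog, but the `clevel` here is an assumption):

```python
import numcodecs

# Stand-in for vcf.ICF_DEFAULT_COMPRESSOR.
default = numcodecs.Blosc(cname="zstd", clevel=7)

config = default.get_config()        # e.g. {'id': 'blosc', 'cname': 'zstd', ...}
config["cname"] = "lz4"
codec = numcodecs.get_codec(config)  # same settings, lz4 instead of zstd
```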
bio2zarr/cli.py (continued):

```diff
@@ -138,13 +188,21 @@ def explode(vcfs, icf_path, force, verbose, worker_processes, column_chunk_size)
 @click.command
 @vcfs
 @new_icf_path
-@
+@num_partitions
 @force
 @column_chunk_size
+@compressor
 @verbose
 @worker_processes
 def dexplode_init(
-    vcfs,
+    vcfs,
+    icf_path,
+    num_partitions,
+    force,
+    column_chunk_size,
+    compressor,
+    verbose,
+    worker_processes,
 ):
     """
     Initial step for distributed conversion of VCF(s) to intermediate columnar format
@@ -158,6 +216,7 @@ def dexplode_init(
         target_num_partitions=num_partitions,
         column_chunk_size=column_chunk_size,
         worker_processes=worker_processes,
+        compressor=get_compressor(compressor),
         show_progress=True,
     )
     click.echo(num_partitions)
@@ -165,7 +224,7 @@ def dexplode_init(
 
 @click.command
 @icf_path
-@
+@partition
 @verbose
 def dexplode_partition(icf_path, partition, verbose):
     """
@@ -174,18 +233,18 @@ def dexplode_partition(icf_path, partition, verbose):
     from 0 (inclusive) to the number of paritions returned by dexplode_init (exclusive).
     """
     setup_logging(verbose)
-    vcf.explode_partition(icf_path, partition, show_progress=True)
+    vcf.explode_partition(icf_path, partition, show_progress=False)
 
 
 @click.command
-@
+@icf_path
 @verbose
-def dexplode_finalise(
+def dexplode_finalise(icf_path, verbose):
     """
     Final step for distributed conversion of VCF(s) to intermediate columnar format.
     """
     setup_logging(verbose)
-    vcf.explode_finalise(
+    vcf.explode_finalise(icf_path)
 
 
 @click.command
```
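Together these commands give a three-phase distributed explode. A hedged driver sketch using the `vcf` functions visible in this diff; `explode_partition` and `explode_finalise` appear verbatim above, while the exact positional layout of the `explode_init` call is inferred from the `dexplode_init` body and should be treated as an assumption:

```python
from bio2zarr import vcf

# Phase 1: set up the ICF directory and get the actual partition count.
num_partitions = vcf.explode_init(
    "out.icf",                     # illustrative paths
    ["chr20.vcf.gz"],
    target_num_partitions=40,
    show_progress=True,
)

# Phase 2: one job per partition, e.g. as a cluster array job.
for partition in range(num_partitions):
    vcf.explode_partition("out.icf", partition, show_progress=False)

# Phase 3: stitch the partition metadata together.
vcf.explode_finalise("out.icf")
```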
bio2zarr/cli.py (continued):

```diff
@@ -215,27 +274,11 @@ def mkschema(icf_path):
 @new_zarr_path
 @force
 @verbose
-@
+@schema
 @variants_chunk_size
 @samples_chunk_size
-@
-
-    "--max-variant-chunks",
-    type=int,
-    default=None,
-    help=(
-        "Truncate the output in the variants dimension to have "
-        "this number of chunks. Mainly intended to help with "
-        "schema tuning."
-    ),
-)
-@click.option(
-    "-M",
-    "--max-memory",
-    type=int,
-    default=None,
-    help="An approximate bound on overall memory usage in megabytes",
-)
+@max_variant_chunks
+@max_memory
 @worker_processes
 def encode(
     icf_path,
@@ -250,7 +293,7 @@ def encode(
     worker_processes,
 ):
     """
-
+    Convert intermediate columnar format to vcfzarr.
     """
     setup_logging(verbose)
     check_overwrite_dir(zarr_path, force)
@@ -260,13 +303,96 @@ def encode(
         schema_path=schema,
         variants_chunk_size=variants_chunk_size,
         samples_chunk_size=samples_chunk_size,
-
+        max_variant_chunks=max_variant_chunks,
         worker_processes=worker_processes,
         max_memory=max_memory,
         show_progress=True,
     )
 
 
+@click.command
+@icf_path
+@new_zarr_path
+@num_partitions
+@force
+@schema
+@variants_chunk_size
+@samples_chunk_size
+@max_variant_chunks
+@verbose
+def dencode_init(
+    icf_path,
+    zarr_path,
+    num_partitions,
+    force,
+    schema,
+    variants_chunk_size,
+    samples_chunk_size,
+    max_variant_chunks,
+    verbose,
+):
+    """
+    Initialise conversion of intermediate format to VCF Zarr. This will
+    set up the specified ZARR_PATH to perform this conversion over
+    NUM_PARTITIONS.
+
+    The output of this commmand is the actual number of partitions generated
+    (which may be less then the requested number, if there is not sufficient
+    chunks in the variants dimension) and a rough lower-bound on the amount
+    of memory required to encode a partition.
+
+    NOTE: the format of this output will likely change in subsequent releases;
+    it should not be considered machine-readable for now.
+    """
+    setup_logging(verbose)
+    check_overwrite_dir(zarr_path, force)
+    num_partitions, max_memory = vcf.encode_init(
+        icf_path,
+        zarr_path,
+        target_num_partitions=num_partitions,
+        schema_path=schema,
+        variants_chunk_size=variants_chunk_size,
+        samples_chunk_size=samples_chunk_size,
+        max_variant_chunks=max_variant_chunks,
+        show_progress=True,
+    )
+    formatted_size = humanfriendly.format_size(max_memory, binary=True)
+    # NOTE adding the size to the stdout here so that users can parse it
+    # and use in their submission scripts. This is a first pass, and
+    # will most likely change as we see what works and doesn't.
+    # NOTE we probably want to format this as a table, which lists
+    # some other properties, line by line
+    # NOTE This size number is also not quite enough, you need a bit of
+    # headroom with it (probably 10% or so). We should include this.
+    click.echo(f"{num_partitions}\t{formatted_size}")
+
+
+@click.command
+@zarr_path
+@partition
+@verbose
+def dencode_partition(zarr_path, partition, verbose):
+    """
+    Convert a partition from intermediate columnar format to VCF Zarr.
+    Must be called *after* the Zarr path has been initialised with dencode_init.
+    Partition indexes must be from 0 (inclusive) to the number of paritions
+    returned by dencode_init (exclusive).
+    """
+    setup_logging(verbose)
+    vcf.encode_partition(zarr_path, partition)
+
+
+@click.command
+@zarr_path
+@verbose
+def dencode_finalise(zarr_path, verbose):
+    """
+    Final step for distributed conversion of ICF to VCF Zarr.
+    """
+    setup_logging(verbose)
+    vcf.encode_finalise(zarr_path, show_progress=True)
+
+
 @click.command(name="convert")
 @vcfs
 @new_zarr_path
```
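The new dencode commands apply the same three-phase pattern to encoding. A hedged driver sketch built from the `vcf` calls in the commands above (paths and the requested partition count are illustrative):

```python
from bio2zarr import vcf

# Phase 1: create the Zarr store; returns the real partition count and a
# rough lower bound on per-partition memory (what dencode-init prints).
num_partitions, max_memory = vcf.encode_init(
    "out.icf",
    "out.vcz",
    target_num_partitions=100,
    show_progress=True,
)

# Phase 2: encode each variant-chunk partition, typically one cluster job each.
for partition in range(num_partitions):
    vcf.encode_partition("out.vcz", partition)

# Phase 3: finalise once every partition has completed.
vcf.encode_finalise("out.vcz", show_progress=True)
```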
bio2zarr/cli.py (continued):

```diff
@@ -354,6 +480,9 @@ vcf2zarr.add_command(encode)
 vcf2zarr.add_command(dexplode_init)
 vcf2zarr.add_command(dexplode_partition)
 vcf2zarr.add_command(dexplode_finalise)
+vcf2zarr.add_command(dencode_init)
+vcf2zarr.add_command(dencode_partition)
+vcf2zarr.add_command(dencode_finalise)
 
 
 @click.command(name="convert")
```
bio2zarr/core.py:

```diff
@@ -1,22 +1,31 @@
-import dataclasses
-import contextlib
 import concurrent.futures as cf
+import contextlib
+import dataclasses
+import logging
 import multiprocessing
 import threading
-import logging
 import time
 
-import
+import numcodecs
 import numpy as np
 import tqdm
-import
-
+import zarr
 
 logger = logging.getLogger(__name__)
 
 numcodecs.blosc.use_threads = False
 
 
+def min_int_dtype(min_value, max_value):
+    if min_value > max_value:
+        raise ValueError("min_value must be <= max_value")
+    for a_dtype in ["i1", "i2", "i4", "i8"]:
+        info = np.iinfo(a_dtype)
+        if info.min <= min_value and max_value <= info.max:
+            return a_dtype
+    raise OverflowError("Integer cannot be represented")
+
+
 def chunk_aligned_slices(z, n, max_chunks=None):
     """
     Returns at n slices in the specified zarr array, aligned
```
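The new `min_int_dtype` helper picks the narrowest signed integer dtype that covers a value range. A few illustrative checks, reproducing the function from the hunk above so the snippet is self-contained:

```python
import numpy as np

def min_int_dtype(min_value, max_value):
    # Copied from the core.py hunk above.
    if min_value > max_value:
        raise ValueError("min_value must be <= max_value")
    for a_dtype in ["i1", "i2", "i4", "i8"]:
        info = np.iinfo(a_dtype)
        if info.min <= min_value and max_value <= info.max:
            return a_dtype
    raise OverflowError("Integer cannot be represented")

assert min_int_dtype(0, 100) == "i1"     # fits int8 (-128..127)
assert min_int_dtype(0, 1000) == "i2"    # 1000 > 127, needs int16
assert min_int_dtype(-1, 2**31) == "i8"  # one past int32 max, needs int64
```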
bio2zarr/core.py (continued):

```diff
@@ -50,7 +59,8 @@ def wait_on_futures(futures):
     cancel_futures(futures)
     if isinstance(exception, cf.process.BrokenProcessPool):
         raise RuntimeError(
-            "Worker process died: you may have run out of memory"
+            "Worker process died: you may have run out of memory"
+        ) from exception
     else:
         raise exception
 
@@ -100,6 +110,7 @@ class BufferedArray:
             sync_flush_2d_array(
                 self.buff[: self.buffer_row], self.array, self.array_offset
             )
+        # FIXME the array.name doesn't seem to be working here for some reason
         logger.debug(
             f"Flushed <{self.array.name} {self.array.shape} "
             f"{self.array.dtype}> "
@@ -121,8 +132,7 @@ def sync_flush_2d_array(np_buffer, zarr_array, offset):
     # encoder implementations.
     s = slice(offset, offset + np_buffer.shape[0])
     samples_chunk_size = zarr_array.chunks[1]
-    # TODO use zarr chunks here
-    # and for simplicity
+    # TODO use zarr chunks here for simplicity
     zarr_array_width = zarr_array.shape[1]
     start = 0
     while start < zarr_array_width:
@@ -182,7 +192,7 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
         self.progress_config = progress_config
         self.progress_bar = tqdm.tqdm(
             total=progress_config.total,
-            desc=f"{progress_config.title:>
+            desc=f"{progress_config.title:>8}",
             unit_scale=True,
             unit=progress_config.units,
             smoothing=0.1,
```
bio2zarr/plink.py:

```diff
@@ -1,14 +1,13 @@
 import logging
 
+import bed_reader
 import humanfriendly
+import numcodecs
 import numpy as np
 import zarr
-import bed_reader
-import numcodecs
 
 from . import core
 
-
 logger = logging.getLogger(__name__)
 
 
@@ -24,7 +23,6 @@ def encode_genotypes_slice(bed_path, zarr_path, start, stop):
     gt_mask = core.BufferedArray(root["call_genotype_mask"], start)
     gt_phased = core.BufferedArray(root["call_genotype_phased"], start)
     variants_chunk_size = gt.array.chunks[0]
-    n = gt.array.shape[1]
     assert start % variants_chunk_size == 0
 
     logger.debug(f"Reading slice {start}:{stop}")
@@ -96,7 +94,7 @@ def convert(
         chunks=(samples_chunk_size,),
     )
     a.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
-    logger.debug(
+    logger.debug("Encoded samples")
 
     # TODO encode these in slices - but read them in one go to avoid
     # fetching repeatedly from bim file
@@ -108,7 +106,7 @@ def convert(
         chunks=(variants_chunk_size,),
    )
     a.attrs["_ARRAY_DIMENSIONS"] = ["variants"]
-    logger.debug(
+    logger.debug("encoded variant_position")
 
     alleles = np.stack([bed.allele_1, bed.allele_2], axis=1)
     a = root.array(
@@ -119,7 +117,7 @@ def convert(
         chunks=(variants_chunk_size,),
     )
     a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"]
-    logger.debug(
+    logger.debug("encoded variant_allele")
 
     # TODO remove this?
     a = root.empty(
@@ -201,4 +199,4 @@ def validate(bed_path, zarr_path):
     elif bed_call == 2:
         assert list(zarr_call) == [1, 1]
     else: # pragma no cover
-
+        raise AssertionError(f"Unexpected bed call {bed_call}")
```