bio2zarr 0.1.4__tar.gz → 0.1.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of bio2zarr might be problematic. Click here for more details.
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/.github/workflows/cd.yml +2 -1
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/.github/workflows/ci.yml +46 -3
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/.github/workflows/docs.yml +3 -2
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/CHANGELOG.md +33 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/PKG-INFO +19 -7
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/bio2zarr/__main__.py +2 -1
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/bio2zarr/_version.py +2 -2
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/bio2zarr/cli.py +91 -24
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/bio2zarr/core.py +43 -22
- bio2zarr-0.1.6/bio2zarr/plink.py +332 -0
- bio2zarr-0.1.6/bio2zarr/tskit.py +301 -0
- bio2zarr-0.1.6/bio2zarr/typing.py +3 -0
- bio2zarr-0.1.4/bio2zarr/vcf2zarr/icf.py → bio2zarr-0.1.6/bio2zarr/vcf.py +614 -118
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/bio2zarr/vcf_utils.py +66 -33
- {bio2zarr-0.1.4/bio2zarr/vcf2zarr → bio2zarr-0.1.6/bio2zarr}/vcz.py +544 -708
- bio2zarr-0.1.4/bio2zarr/vcf2zarr/verification.py → bio2zarr-0.1.6/bio2zarr/vcz_verification.py +5 -2
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/bio2zarr.egg-info/PKG-INFO +19 -7
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/bio2zarr.egg-info/SOURCES.txt +10 -4
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/bio2zarr.egg-info/entry_points.txt +2 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/bio2zarr.egg-info/requires.txt +17 -2
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/docs/_config.yml +5 -1
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/docs/_toc.yml +8 -0
- bio2zarr-0.1.6/docs/installation.md +62 -0
- bio2zarr-0.1.6/docs/intro.md +38 -0
- bio2zarr-0.1.6/docs/plink2zarr/cli_ref.md +17 -0
- bio2zarr-0.1.6/docs/plink2zarr/overview.md +38 -0
- bio2zarr-0.1.6/docs/tskit2zarr/cli_ref.md +18 -0
- bio2zarr-0.1.6/docs/tskit2zarr/overview.md +10 -0
- bio2zarr-0.1.6/docs/tskit2zarr/python_api.md +37 -0
- bio2zarr-0.1.6/docs/vcf2zarr/python_api.md +17 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/pyproject.toml +28 -13
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/validation.py +5 -5
- bio2zarr-0.1.4/bio2zarr/plink.py +0 -207
- bio2zarr-0.1.4/bio2zarr/typing.py +0 -4
- bio2zarr-0.1.4/bio2zarr/vcf2zarr/__init__.py +0 -38
- bio2zarr-0.1.4/docs/installation.md +0 -49
- bio2zarr-0.1.4/docs/intro.md +0 -36
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/.gitignore +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/.pre-commit-config.yaml +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/LICENSE +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/MANIFEST.in +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/README.md +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/bio2zarr/__init__.py +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/bio2zarr/constants.py +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/bio2zarr/provenance.py +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/bio2zarr/zarr_utils.py +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/bio2zarr.egg-info/dependency_links.txt +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/bio2zarr.egg-info/top_level.txt +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/docs/Makefile +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/docs/_static/asciinema-player.css +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/docs/_static/asciinema-player.min.js +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/docs/_static/custom.css +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/docs/build.sh +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/docs/cast_scripts/vcf2zarr_convert.sh +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/docs/cast_scripts/vcf2zarr_explode.sh +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/docs/logo.png +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/docs/requirements.txt +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/docs/vcf2zarr/cli_ref.md +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/docs/vcf2zarr/overview.md +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/docs/vcf2zarr/tutorial.md +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/docs/vcfpartition/cli_ref.md +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/docs/vcfpartition/overview.md +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/setup.cfg +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/validation-data/Makefile +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/validation-data/split.sh +0 -0
- {bio2zarr-0.1.4 → bio2zarr-0.1.6}/vcf_generator.py +0 -0
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
name: CD
|
|
2
2
|
|
|
3
3
|
on:
|
|
4
|
+
merge_group:
|
|
4
5
|
push:
|
|
5
6
|
branches:
|
|
6
7
|
- main
|
|
@@ -18,7 +19,7 @@ jobs:
|
|
|
18
19
|
- uses: actions/checkout@v4
|
|
19
20
|
- uses: actions/setup-python@v5
|
|
20
21
|
with:
|
|
21
|
-
python-version: '3.
|
|
22
|
+
python-version: '3.10'
|
|
22
23
|
- name: Install dependencies
|
|
23
24
|
run: |
|
|
24
25
|
python -m pip install --upgrade pip
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
name: CI
|
|
2
2
|
|
|
3
3
|
on:
|
|
4
|
+
merge_group:
|
|
4
5
|
pull_request:
|
|
5
6
|
push:
|
|
6
7
|
branches:
|
|
@@ -24,7 +25,7 @@ jobs:
|
|
|
24
25
|
# Use macos-13 because pip binary packages for ARM aren't
|
|
25
26
|
# available for many dependencies
|
|
26
27
|
os: [macos-13, macos-14, ubuntu-latest]
|
|
27
|
-
python-version: ["3.
|
|
28
|
+
python-version: ["3.10", "3.11", "3.12"]
|
|
28
29
|
exclude:
|
|
29
30
|
# Just run macos tests on one Python version
|
|
30
31
|
- os: macos-13
|
|
@@ -33,8 +34,6 @@ jobs:
|
|
|
33
34
|
python-version: "3.11"
|
|
34
35
|
- os: macos-13
|
|
35
36
|
python-version: "3.12"
|
|
36
|
-
- os: macos-14
|
|
37
|
-
python-version: "3.9"
|
|
38
37
|
- os: macos-14
|
|
39
38
|
python-version: "3.10"
|
|
40
39
|
- os: macos-14
|
|
@@ -70,6 +69,12 @@ jobs:
|
|
|
70
69
|
python -m bio2zarr vcf2zarr dencode-partition sample.vcz 1
|
|
71
70
|
python -m bio2zarr vcf2zarr dencode-partition sample.vcz 2
|
|
72
71
|
python -m bio2zarr vcf2zarr dencode-finalise sample.vcz
|
|
72
|
+
- name: Run tskit2zarr example
|
|
73
|
+
run: |
|
|
74
|
+
python -m bio2zarr tskit2zarr convert tests/data/tskit/example.trees sample.vcz -f
|
|
75
|
+
- name: Run plink2zarr example
|
|
76
|
+
run: |
|
|
77
|
+
python -m bio2zarr plink2zarr convert tests/data/plink/example sample.vcz -f
|
|
73
78
|
- name: Run tests
|
|
74
79
|
run: |
|
|
75
80
|
pytest --cov=bio2zarr
|
|
@@ -82,6 +87,36 @@ jobs:
|
|
|
82
87
|
# https://github.com/coverallsapp/github-action
|
|
83
88
|
fail-on-error: false
|
|
84
89
|
|
|
90
|
+
optional_dependencies:
|
|
91
|
+
name: Optional dependencies
|
|
92
|
+
runs-on: ubuntu-latest
|
|
93
|
+
steps:
|
|
94
|
+
- uses: actions/checkout@v4
|
|
95
|
+
- uses: actions/setup-python@v5
|
|
96
|
+
with:
|
|
97
|
+
python-version: '3.11'
|
|
98
|
+
- name: Test optional dependencies
|
|
99
|
+
run: |
|
|
100
|
+
python -m venv env-tskit
|
|
101
|
+
source env-tskit/bin/activate
|
|
102
|
+
python -m pip install .
|
|
103
|
+
python -m bio2zarr tskit2zarr convert tests/data/tskit/example.trees ts.vcz > ts.txt 2>&1 || echo $? > ts_exit.txt
|
|
104
|
+
test "$(cat ts_exit.txt)" = "1"
|
|
105
|
+
grep -q "This process requires the optional tskit module. Install it with: pip install bio2zarr\[tskit\]" ts.txt
|
|
106
|
+
python -m pip install '.[tskit]'
|
|
107
|
+
python -m bio2zarr tskit2zarr convert tests/data/tskit/example.trees ts.vcz
|
|
108
|
+
deactivate
|
|
109
|
+
|
|
110
|
+
python -m venv env-vcf
|
|
111
|
+
source env-vcf/bin/activate
|
|
112
|
+
python -m pip install .
|
|
113
|
+
python -m bio2zarr vcf2zarr convert tests/data/vcf/sample.vcf.gz sample.vcz > vcf.txt 2>&1 || echo $? > vcf_exit.txt
|
|
114
|
+
test "$(cat vcf_exit.txt)" = "1"
|
|
115
|
+
grep -q "This process requires the optional cyvcf2 module. Install it with: pip install bio2zarr\[vcf\]" vcf.txt
|
|
116
|
+
python -m pip install '.[vcf]'
|
|
117
|
+
python -m bio2zarr vcf2zarr convert tests/data/vcf/sample.vcf.gz sample.vcz
|
|
118
|
+
deactivate
|
|
119
|
+
|
|
85
120
|
packaging:
|
|
86
121
|
name: Packaging
|
|
87
122
|
runs-on: ubuntu-latest
|
|
@@ -108,6 +143,14 @@ jobs:
|
|
|
108
143
|
run: |
|
|
109
144
|
vcfpartition --help
|
|
110
145
|
python -m bio2zarr vcfpartition --help
|
|
146
|
+
- name: Check tskit2zarr CLI
|
|
147
|
+
run: |
|
|
148
|
+
tskit2zarr --help
|
|
149
|
+
python -m bio2zarr tskit2zarr --help
|
|
150
|
+
- name: Check plink2zarr CLI
|
|
151
|
+
run: |
|
|
152
|
+
plink2zarr --help
|
|
153
|
+
python -m bio2zarr plink2zarr --help
|
|
111
154
|
|
|
112
155
|
test-numpy-version:
|
|
113
156
|
name: Test numpy versions
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
name: Docs
|
|
2
2
|
|
|
3
3
|
on:
|
|
4
|
+
merge_group:
|
|
4
5
|
pull_request:
|
|
5
6
|
push:
|
|
6
7
|
branches:
|
|
@@ -37,7 +38,7 @@ jobs:
|
|
|
37
38
|
|
|
38
39
|
- name: Install package
|
|
39
40
|
run: |
|
|
40
|
-
python3 -m pip install .
|
|
41
|
+
python3 -m pip install '.[all]'
|
|
41
42
|
|
|
42
43
|
- name: Build Docs
|
|
43
44
|
run: |
|
|
@@ -50,7 +51,7 @@ jobs:
|
|
|
50
51
|
|
|
51
52
|
deploy:
|
|
52
53
|
needs: build-docs
|
|
53
|
-
if: github.event_name != 'pull_request'
|
|
54
|
+
if: github.event_name != 'pull_request' && github.event_name != 'merge_group'
|
|
54
55
|
permissions:
|
|
55
56
|
pages: write
|
|
56
57
|
id-token: write
|
|
@@ -1,3 +1,36 @@
|
|
|
1
|
+
# 0.1.6 2025-05-23
|
|
2
|
+
|
|
3
|
+
- Initial Python API support for VCF and tskit one-shot conversion. Format
|
|
4
|
+
conversion is done using the functions ``bio2zarr.vcf.convert``
|
|
5
|
+
and ``bio2zarr.tskit.convert``.
|
|
6
|
+
|
|
7
|
+
- Initial version of supported plink2zarr (#390, #344, #382)
|
|
8
|
+
|
|
9
|
+
- Initial version of tskit2zarr (#232)
|
|
10
|
+
|
|
11
|
+
- Make format-specific dependencies optional (#385)
|
|
12
|
+
|
|
13
|
+
- Remove bed_reader dependency (#397, #400)
|
|
14
|
+
|
|
15
|
+
- Change default number of worker processes to zero (#404) to simplify
|
|
16
|
+
debugging
|
|
17
|
+
|
|
18
|
+
*Breaking changes*
|
|
19
|
+
|
|
20
|
+
- Remove explicit sample, contig and filter lists from the schema.
|
|
21
|
+
Existing ICFs will need to be recreated. (#343)
|
|
22
|
+
|
|
23
|
+
- Add dimensions and default compressor and filter settings to the schema.
|
|
24
|
+
(#361)
|
|
25
|
+
|
|
26
|
+
- Various changes to existing experimental plink encoding (#390)
|
|
27
|
+
|
|
28
|
+
# 0.1.5 2025-03-31
|
|
29
|
+
|
|
30
|
+
- Add support for merging contig IDs across multiple VCFs (#335)
|
|
31
|
+
|
|
32
|
+
- Add support for unindexed (and uncompressed) VCFs (#337)
|
|
33
|
+
|
|
1
34
|
# 0.1.4 2025-03-10
|
|
2
35
|
|
|
3
36
|
- Fix bug in handling all-missing genotypes (#328)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: bio2zarr
|
|
3
|
-
Version: 0.1.
|
|
3
|
+
Version: 0.1.6
|
|
4
4
|
Summary: Convert bioinformatics data to Zarr
|
|
5
5
|
Author-email: sgkit Developers <project@sgkit.dev>
|
|
6
6
|
License: Apache License
|
|
@@ -216,23 +216,24 @@ Classifier: Operating System :: MacOS :: MacOS X
|
|
|
216
216
|
Classifier: Intended Audience :: Science/Research
|
|
217
217
|
Classifier: Programming Language :: Python
|
|
218
218
|
Classifier: Programming Language :: Python :: 3
|
|
219
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
220
219
|
Classifier: Programming Language :: Python :: 3.10
|
|
221
220
|
Classifier: Programming Language :: Python :: 3.11
|
|
222
221
|
Classifier: Programming Language :: Python :: 3.12
|
|
223
222
|
Classifier: Topic :: Scientific/Engineering
|
|
224
|
-
Requires-Python: >=3.
|
|
223
|
+
Requires-Python: >=3.10
|
|
225
224
|
Description-Content-Type: text/markdown
|
|
226
225
|
License-File: LICENSE
|
|
227
226
|
Requires-Dist: numpy>=1.26
|
|
228
227
|
Requires-Dist: zarr<3,>=2.17
|
|
229
|
-
Requires-Dist:
|
|
228
|
+
Requires-Dist: numcodecs[msgpack]!=0.14.0,!=0.14.1,<0.16
|
|
230
229
|
Requires-Dist: tabulate
|
|
231
230
|
Requires-Dist: tqdm
|
|
232
231
|
Requires-Dist: humanfriendly
|
|
233
|
-
Requires-Dist:
|
|
234
|
-
Requires-Dist:
|
|
232
|
+
Requires-Dist: coloredlogs
|
|
233
|
+
Requires-Dist: click
|
|
234
|
+
Requires-Dist: pandas
|
|
235
235
|
Provides-Extra: dev
|
|
236
|
+
Requires-Dist: click>=8.2.0; extra == "dev"
|
|
236
237
|
Requires-Dist: hypothesis-vcf; extra == "dev"
|
|
237
238
|
Requires-Dist: msprime; extra == "dev"
|
|
238
239
|
Requires-Dist: pysam; extra == "dev"
|
|
@@ -241,6 +242,17 @@ Requires-Dist: pytest-coverage; extra == "dev"
|
|
|
241
242
|
Requires-Dist: pytest-xdist; extra == "dev"
|
|
242
243
|
Requires-Dist: sgkit>=0.8.0; extra == "dev"
|
|
243
244
|
Requires-Dist: tqdm; extra == "dev"
|
|
245
|
+
Requires-Dist: tskit>=0.6.4; extra == "dev"
|
|
246
|
+
Requires-Dist: bed_reader; extra == "dev"
|
|
247
|
+
Requires-Dist: cyvcf2; extra == "dev"
|
|
248
|
+
Provides-Extra: tskit
|
|
249
|
+
Requires-Dist: tskit>=0.6.4; extra == "tskit"
|
|
250
|
+
Provides-Extra: vcf
|
|
251
|
+
Requires-Dist: cyvcf2; extra == "vcf"
|
|
252
|
+
Provides-Extra: all
|
|
253
|
+
Requires-Dist: tskit>=0.6.4; extra == "all"
|
|
254
|
+
Requires-Dist: cyvcf2; extra == "all"
|
|
255
|
+
Dynamic: license-file
|
|
244
256
|
|
|
245
257
|
[](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml)
|
|
246
258
|
[](https://coveralls.io/github/sgkit-dev/bio2zarr)
|
|
@@ -15,7 +15,8 @@ def bio2zarr():
|
|
|
15
15
|
# is handy for development and for those whose PATHs aren't set
|
|
16
16
|
# up in the right way.
|
|
17
17
|
bio2zarr.add_command(cli.vcf2zarr_main)
|
|
18
|
-
bio2zarr.add_command(cli.
|
|
18
|
+
bio2zarr.add_command(cli.plink2zarr_main)
|
|
19
|
+
bio2zarr.add_command(cli.tskit2zarr_main)
|
|
19
20
|
bio2zarr.add_command(cli.vcfpartition)
|
|
20
21
|
|
|
21
22
|
if __name__ == "__main__":
|
|
@@ -8,8 +8,9 @@ import coloredlogs
|
|
|
8
8
|
import numcodecs
|
|
9
9
|
import tabulate
|
|
10
10
|
|
|
11
|
-
from . import plink, provenance,
|
|
12
|
-
from .
|
|
11
|
+
from . import core, plink, provenance, vcf_utils
|
|
12
|
+
from . import tskit as tskit_mod
|
|
13
|
+
from . import vcf as vcf_mod
|
|
13
14
|
|
|
14
15
|
logger = logging.getLogger(__name__)
|
|
15
16
|
|
|
@@ -88,7 +89,12 @@ json = click.option(
|
|
|
88
89
|
version = click.version_option(version=f"{provenance.__version__}")
|
|
89
90
|
|
|
90
91
|
worker_processes = click.option(
|
|
91
|
-
"-p",
|
|
92
|
+
"-p",
|
|
93
|
+
"--worker-processes",
|
|
94
|
+
type=int,
|
|
95
|
+
default=core.DEFAULT_WORKER_PROCESSES,
|
|
96
|
+
help="Number of worker processes",
|
|
97
|
+
show_default=True,
|
|
92
98
|
)
|
|
93
99
|
|
|
94
100
|
column_chunk_size = click.option(
|
|
@@ -197,7 +203,7 @@ def check_partitions(num_partitions):
|
|
|
197
203
|
def get_compressor(cname):
|
|
198
204
|
if cname is None:
|
|
199
205
|
return None
|
|
200
|
-
config =
|
|
206
|
+
config = vcf_mod.ICF_DEFAULT_COMPRESSOR.get_config()
|
|
201
207
|
config["cname"] = cname
|
|
202
208
|
return numcodecs.get_codec(config)
|
|
203
209
|
|
|
@@ -236,7 +242,7 @@ def explode(
|
|
|
236
242
|
"""
|
|
237
243
|
setup_logging(verbose)
|
|
238
244
|
check_overwrite_dir(icf_path, force)
|
|
239
|
-
|
|
245
|
+
vcf_mod.explode(
|
|
240
246
|
icf_path,
|
|
241
247
|
vcfs,
|
|
242
248
|
worker_processes=worker_processes,
|
|
@@ -276,7 +282,7 @@ def dexplode_init(
|
|
|
276
282
|
setup_logging(verbose)
|
|
277
283
|
check_overwrite_dir(icf_path, force)
|
|
278
284
|
check_partitions(num_partitions)
|
|
279
|
-
work_summary =
|
|
285
|
+
work_summary = vcf_mod.explode_init(
|
|
280
286
|
icf_path,
|
|
281
287
|
vcfs,
|
|
282
288
|
target_num_partitions=num_partitions,
|
|
@@ -304,7 +310,7 @@ def dexplode_partition(icf_path, partition, verbose, one_based):
|
|
|
304
310
|
setup_logging(verbose)
|
|
305
311
|
if one_based:
|
|
306
312
|
partition -= 1
|
|
307
|
-
|
|
313
|
+
vcf_mod.explode_partition(icf_path, partition)
|
|
308
314
|
|
|
309
315
|
|
|
310
316
|
@click.command
|
|
@@ -315,7 +321,7 @@ def dexplode_finalise(icf_path, verbose):
|
|
|
315
321
|
Final step for distributed conversion of VCF(s) to intermediate columnar format.
|
|
316
322
|
"""
|
|
317
323
|
setup_logging(verbose)
|
|
318
|
-
|
|
324
|
+
vcf_mod.explode_finalise(icf_path)
|
|
319
325
|
|
|
320
326
|
|
|
321
327
|
@click.command
|
|
@@ -326,7 +332,7 @@ def inspect(path, verbose):
|
|
|
326
332
|
Inspect an intermediate columnar format or Zarr path.
|
|
327
333
|
"""
|
|
328
334
|
setup_logging(verbose)
|
|
329
|
-
data =
|
|
335
|
+
data = vcf_mod.inspect(path)
|
|
330
336
|
click.echo(tabulate.tabulate(data, headers="keys"))
|
|
331
337
|
|
|
332
338
|
|
|
@@ -345,7 +351,7 @@ def mkschema(icf_path, variants_chunk_size, samples_chunk_size, local_alleles):
|
|
|
345
351
|
err=True,
|
|
346
352
|
)
|
|
347
353
|
stream = click.get_text_stream("stdout")
|
|
348
|
-
|
|
354
|
+
vcf_mod.mkschema(
|
|
349
355
|
icf_path,
|
|
350
356
|
stream,
|
|
351
357
|
variants_chunk_size=variants_chunk_size,
|
|
@@ -380,11 +386,11 @@ def encode(
|
|
|
380
386
|
worker_processes,
|
|
381
387
|
):
|
|
382
388
|
"""
|
|
383
|
-
Convert intermediate columnar format to
|
|
389
|
+
Convert intermediate columnar format to VCF Zarr.
|
|
384
390
|
"""
|
|
385
391
|
setup_logging(verbose)
|
|
386
392
|
check_overwrite_dir(zarr_path, force)
|
|
387
|
-
|
|
393
|
+
vcf_mod.encode(
|
|
388
394
|
icf_path,
|
|
389
395
|
zarr_path,
|
|
390
396
|
schema_path=schema,
|
|
@@ -438,7 +444,7 @@ def dencode_init(
|
|
|
438
444
|
setup_logging(verbose)
|
|
439
445
|
check_overwrite_dir(zarr_path, force)
|
|
440
446
|
check_partitions(num_partitions)
|
|
441
|
-
work_summary =
|
|
447
|
+
work_summary = vcf_mod.encode_init(
|
|
442
448
|
icf_path,
|
|
443
449
|
zarr_path,
|
|
444
450
|
target_num_partitions=num_partitions,
|
|
@@ -466,7 +472,7 @@ def dencode_partition(zarr_path, partition, verbose, one_based):
|
|
|
466
472
|
setup_logging(verbose)
|
|
467
473
|
if one_based:
|
|
468
474
|
partition -= 1
|
|
469
|
-
|
|
475
|
+
vcf_mod.encode_partition(zarr_path, partition)
|
|
470
476
|
|
|
471
477
|
|
|
472
478
|
@click.command
|
|
@@ -478,7 +484,7 @@ def dencode_finalise(zarr_path, verbose, progress):
|
|
|
478
484
|
Final step for distributed conversion of ICF to VCF Zarr.
|
|
479
485
|
"""
|
|
480
486
|
setup_logging(verbose)
|
|
481
|
-
|
|
487
|
+
vcf_mod.encode_finalise(zarr_path, show_progress=progress)
|
|
482
488
|
|
|
483
489
|
|
|
484
490
|
@click.command(name="convert")
|
|
@@ -503,11 +509,11 @@ def convert_vcf(
|
|
|
503
509
|
local_alleles,
|
|
504
510
|
):
|
|
505
511
|
"""
|
|
506
|
-
Convert input VCF(s) directly to
|
|
512
|
+
Convert input VCF(s) directly to VCF Zarr (not recommended for large files).
|
|
507
513
|
"""
|
|
508
514
|
setup_logging(verbose)
|
|
509
515
|
check_overwrite_dir(zarr_path, force)
|
|
510
|
-
|
|
516
|
+
vcf_mod.convert(
|
|
511
517
|
vcfs,
|
|
512
518
|
zarr_path,
|
|
513
519
|
variants_chunk_size=variants_chunk_size,
|
|
@@ -522,9 +528,10 @@ def convert_vcf(
|
|
|
522
528
|
@click.group(cls=NaturalOrderGroup, name="vcf2zarr")
|
|
523
529
|
def vcf2zarr_main():
|
|
524
530
|
"""
|
|
525
|
-
Convert VCF file(s) to
|
|
531
|
+
Convert VCF file(s) to VCF Zarr format.
|
|
526
532
|
|
|
527
533
|
See the online documentation at https://sgkit-dev.github.io/bio2zarr/
|
|
534
|
+
|
|
528
535
|
for more information.
|
|
529
536
|
"""
|
|
530
537
|
|
|
@@ -545,6 +552,7 @@ vcf2zarr_main.add_command(dencode_finalise)
|
|
|
545
552
|
@click.command(name="convert")
|
|
546
553
|
@click.argument("in_path", type=click.Path())
|
|
547
554
|
@click.argument("zarr_path", type=click.Path())
|
|
555
|
+
@force
|
|
548
556
|
@worker_processes
|
|
549
557
|
@progress
|
|
550
558
|
@verbose
|
|
@@ -553,6 +561,7 @@ vcf2zarr_main.add_command(dencode_finalise)
|
|
|
553
561
|
def convert_plink(
|
|
554
562
|
in_path,
|
|
555
563
|
zarr_path,
|
|
564
|
+
force,
|
|
556
565
|
verbose,
|
|
557
566
|
worker_processes,
|
|
558
567
|
progress,
|
|
@@ -560,9 +569,12 @@ def convert_plink(
|
|
|
560
569
|
samples_chunk_size,
|
|
561
570
|
):
|
|
562
571
|
"""
|
|
563
|
-
|
|
572
|
+
Convert plink fileset to VCF Zarr. Results are equivalent to
|
|
573
|
+
`plink1.9 --bfile prefix --keep-allele-order --recode vcf-iid --out tmp`
|
|
574
|
+
then running `vcf2zarr convert tmp.vcf zarr_path`
|
|
564
575
|
"""
|
|
565
576
|
setup_logging(verbose)
|
|
577
|
+
check_overwrite_dir(zarr_path, force)
|
|
566
578
|
plink.convert(
|
|
567
579
|
in_path,
|
|
568
580
|
zarr_path,
|
|
@@ -574,12 +586,15 @@ def convert_plink(
|
|
|
574
586
|
|
|
575
587
|
|
|
576
588
|
@version
|
|
577
|
-
@click.group()
|
|
578
|
-
def
|
|
589
|
+
@click.group(name="plink2zarr")
|
|
590
|
+
def plink2zarr_main():
|
|
591
|
+
"""
|
|
592
|
+
Convert plink fileset(s) to VCF Zarr format
|
|
593
|
+
"""
|
|
579
594
|
pass
|
|
580
595
|
|
|
581
596
|
|
|
582
|
-
|
|
597
|
+
plink2zarr_main.add_command(convert_plink)
|
|
583
598
|
|
|
584
599
|
|
|
585
600
|
@click.command
|
|
@@ -624,9 +639,61 @@ def vcfpartition(vcfs, verbose, num_partitions, partition_size):
|
|
|
624
639
|
num_parts_per_path = max(1, num_partitions // len(vcfs))
|
|
625
640
|
|
|
626
641
|
for vcf_path in vcfs:
|
|
627
|
-
|
|
628
|
-
regions =
|
|
642
|
+
vcf_file = vcf_utils.VcfFile(vcf_path)
|
|
643
|
+
regions = vcf_file.partition_into_regions(
|
|
629
644
|
num_parts=num_parts_per_path, target_part_size=partition_size
|
|
630
645
|
)
|
|
631
646
|
for region in regions:
|
|
632
647
|
click.echo(f"{region}\t{vcf_path}")
|
|
648
|
+
|
|
649
|
+
|
|
650
|
+
@click.command(name="convert")
|
|
651
|
+
@click.argument("ts_path", type=click.Path(exists=True))
|
|
652
|
+
@click.argument("zarr_path", type=click.Path())
|
|
653
|
+
@click.option("--contig-id", type=str, help="Contig/chromosome ID (default: '1')")
|
|
654
|
+
@click.option(
|
|
655
|
+
"--isolated-as-missing", is_flag=True, help="Treat isolated nodes as missing"
|
|
656
|
+
)
|
|
657
|
+
@variants_chunk_size
|
|
658
|
+
@samples_chunk_size
|
|
659
|
+
@verbose
|
|
660
|
+
@progress
|
|
661
|
+
@worker_processes
|
|
662
|
+
@force
|
|
663
|
+
def convert_tskit(
|
|
664
|
+
ts_path,
|
|
665
|
+
zarr_path,
|
|
666
|
+
contig_id,
|
|
667
|
+
isolated_as_missing,
|
|
668
|
+
variants_chunk_size,
|
|
669
|
+
samples_chunk_size,
|
|
670
|
+
verbose,
|
|
671
|
+
progress,
|
|
672
|
+
worker_processes,
|
|
673
|
+
force,
|
|
674
|
+
):
|
|
675
|
+
setup_logging(verbose)
|
|
676
|
+
check_overwrite_dir(zarr_path, force)
|
|
677
|
+
|
|
678
|
+
tskit_mod.convert(
|
|
679
|
+
ts_path,
|
|
680
|
+
zarr_path,
|
|
681
|
+
contig_id=contig_id,
|
|
682
|
+
isolated_as_missing=isolated_as_missing,
|
|
683
|
+
variants_chunk_size=variants_chunk_size,
|
|
684
|
+
samples_chunk_size=samples_chunk_size,
|
|
685
|
+
worker_processes=worker_processes,
|
|
686
|
+
show_progress=progress,
|
|
687
|
+
)
|
|
688
|
+
|
|
689
|
+
|
|
690
|
+
@version
|
|
691
|
+
@click.group(name="tskit2zarr")
|
|
692
|
+
def tskit2zarr_main():
|
|
693
|
+
"""
|
|
694
|
+
Convert tskit tree sequence(s) to VCF Zarr format
|
|
695
|
+
"""
|
|
696
|
+
pass
|
|
697
|
+
|
|
698
|
+
|
|
699
|
+
tskit2zarr_main.add_command(convert_tskit)
|
|
@@ -1,16 +1,16 @@
|
|
|
1
1
|
import concurrent.futures as cf
|
|
2
2
|
import contextlib
|
|
3
3
|
import dataclasses
|
|
4
|
+
import functools
|
|
5
|
+
import importlib
|
|
4
6
|
import json
|
|
5
7
|
import logging
|
|
6
8
|
import math
|
|
7
9
|
import multiprocessing
|
|
8
10
|
import os
|
|
9
11
|
import os.path
|
|
10
|
-
import sys
|
|
11
12
|
import threading
|
|
12
13
|
import time
|
|
13
|
-
import warnings
|
|
14
14
|
|
|
15
15
|
import humanfriendly
|
|
16
16
|
import numcodecs
|
|
@@ -23,6 +23,26 @@ logger = logging.getLogger(__name__)
|
|
|
23
23
|
numcodecs.blosc.use_threads = False
|
|
24
24
|
|
|
25
25
|
|
|
26
|
+
def requires_optional_dependency(module_name, extras_name):
|
|
27
|
+
"""Decorator to check for optional dependencies"""
|
|
28
|
+
|
|
29
|
+
def decorator(func):
|
|
30
|
+
@functools.wraps(func)
|
|
31
|
+
def wrapper(*args, **kwargs):
|
|
32
|
+
try:
|
|
33
|
+
importlib.import_module(module_name)
|
|
34
|
+
except ImportError:
|
|
35
|
+
raise ImportError(
|
|
36
|
+
f"This process requires the optional {module_name} module. "
|
|
37
|
+
f"Install it with: pip install bio2zarr[{extras_name}]"
|
|
38
|
+
) from None
|
|
39
|
+
return func(*args, **kwargs)
|
|
40
|
+
|
|
41
|
+
return wrapper
|
|
42
|
+
|
|
43
|
+
return decorator
|
|
44
|
+
|
|
45
|
+
|
|
26
46
|
def display_number(x):
|
|
27
47
|
ret = "n/a"
|
|
28
48
|
if math.isfinite(x):
|
|
@@ -34,6 +54,16 @@ def display_size(n):
|
|
|
34
54
|
return humanfriendly.format_size(n, binary=True)
|
|
35
55
|
|
|
36
56
|
|
|
57
|
+
def parse_max_memory(max_memory):
|
|
58
|
+
if max_memory is None:
|
|
59
|
+
# Effectively unbounded
|
|
60
|
+
return 2**63
|
|
61
|
+
if isinstance(max_memory, str):
|
|
62
|
+
max_memory = humanfriendly.parse_size(max_memory)
|
|
63
|
+
logger.info(f"Set memory budget to {display_size(max_memory)}")
|
|
64
|
+
return max_memory
|
|
65
|
+
|
|
66
|
+
|
|
37
67
|
def min_int_dtype(min_value, max_value):
|
|
38
68
|
if min_value > max_value:
|
|
39
69
|
raise ValueError("min_value must be <= max_value")
|
|
@@ -100,12 +130,20 @@ def du(path):
|
|
|
100
130
|
return total
|
|
101
131
|
|
|
102
132
|
|
|
133
|
+
# We set the default number of worker processes to 0 because it avoids
|
|
134
|
+
# complexity in the call chain and makes things easier to debug by
|
|
135
|
+
# default. However, it does use the SynchronousExecutor here, which
|
|
136
|
+
# is technically not recommended by the Python docs.
|
|
137
|
+
DEFAULT_WORKER_PROCESSES = 0
|
|
138
|
+
|
|
139
|
+
|
|
103
140
|
class SynchronousExecutor(cf.Executor):
|
|
104
|
-
#
|
|
141
|
+
# Since https://github.com/sgkit-dev/bio2zarr/issues/404 we
|
|
142
|
+
# set worker_processes=0 as the default and use this
|
|
105
143
|
# executor implementation. However, the docs are fairly explicit
|
|
106
144
|
# about saying we shouldn't instantiate Future objects directly,
|
|
107
|
-
# so
|
|
108
|
-
#
|
|
145
|
+
# so we may need to revisit this if obscure problems start to
|
|
146
|
+
# arise.
|
|
109
147
|
def submit(self, fn, /, *args, **kwargs):
|
|
110
148
|
future = cf.Future()
|
|
111
149
|
future.set_result(fn(*args, **kwargs))
|
|
@@ -246,22 +284,6 @@ def setup_progress_counter(counter):
|
|
|
246
284
|
_progress_counter = counter
|
|
247
285
|
|
|
248
286
|
|
|
249
|
-
def warn_py39_mac():
|
|
250
|
-
if sys.platform == "darwin" and sys.version_info[:2] == (3, 9):
|
|
251
|
-
warnings.warn(
|
|
252
|
-
"There is a known issue with bio2zarr on MacOS Python 3.9 "
|
|
253
|
-
"in which OS-level named semaphores are leaked. "
|
|
254
|
-
"You will also probably see warnings like 'There appear to be N "
|
|
255
|
-
"leaked semaphore objects at shutdown'. "
|
|
256
|
-
"While this is likely harmless for a few runs, it could lead to "
|
|
257
|
-
"issues if you do a lot of conversion. To get prevent this issue "
|
|
258
|
-
"either: (1) use --worker-processes=0 or (2) upgrade to a newer "
|
|
259
|
-
"Python version. See https://github.com/sgkit-dev/bio2zarr/issues/209 "
|
|
260
|
-
"for more details.",
|
|
261
|
-
stacklevel=2,
|
|
262
|
-
)
|
|
263
|
-
|
|
264
|
-
|
|
265
287
|
class ParallelWorkManager(contextlib.AbstractContextManager):
|
|
266
288
|
def __init__(self, worker_processes=1, progress_config=None):
|
|
267
289
|
# Need to specify this explicitly to support Macs and
|
|
@@ -274,7 +296,6 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
|
|
|
274
296
|
# production. See note on the SynchronousExecutor class.
|
|
275
297
|
self.executor = SynchronousExecutor()
|
|
276
298
|
else:
|
|
277
|
-
warn_py39_mac()
|
|
278
299
|
self.executor = cf.ProcessPoolExecutor(
|
|
279
300
|
max_workers=worker_processes,
|
|
280
301
|
mp_context=ctx,
|