bio2zarr 0.1.5__tar.gz → 0.1.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of bio2zarr might be problematic. Click here for more details.
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/.github/workflows/cd.yml +2 -1
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/.github/workflows/ci.yml +56 -34
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/.github/workflows/docs.yml +3 -2
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/CHANGELOG.md +55 -0
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/PKG-INFO +23 -8
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/README.md +2 -0
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/bio2zarr/__main__.py +2 -1
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/bio2zarr/_version.py +16 -3
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/bio2zarr/cli.py +102 -22
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/bio2zarr/core.py +43 -22
- bio2zarr-0.1.7/bio2zarr/plink.py +334 -0
- bio2zarr-0.1.7/bio2zarr/tskit.py +296 -0
- bio2zarr-0.1.7/bio2zarr/typing.py +3 -0
- bio2zarr-0.1.5/bio2zarr/vcf2zarr/icf.py → bio2zarr-0.1.7/bio2zarr/vcf.py +606 -114
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/bio2zarr/vcf_utils.py +12 -11
- {bio2zarr-0.1.5/bio2zarr/vcf2zarr → bio2zarr-0.1.7/bio2zarr}/vcz.py +568 -739
- bio2zarr-0.1.5/bio2zarr/vcf2zarr/verification.py → bio2zarr-0.1.7/bio2zarr/vcz_verification.py +5 -2
- bio2zarr-0.1.7/bio2zarr/zarr_utils.py +185 -0
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/bio2zarr.egg-info/PKG-INFO +23 -8
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/bio2zarr.egg-info/SOURCES.txt +10 -4
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/bio2zarr.egg-info/entry_points.txt +2 -0
- bio2zarr-0.1.7/bio2zarr.egg-info/requires.txt +34 -0
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/docs/Makefile +3 -2
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/docs/_config.yml +5 -1
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/docs/_toc.yml +8 -0
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/docs/build.sh +2 -2
- bio2zarr-0.1.7/docs/installation.md +62 -0
- bio2zarr-0.1.7/docs/intro.md +38 -0
- bio2zarr-0.1.7/docs/plink2zarr/cli_ref.md +17 -0
- bio2zarr-0.1.7/docs/plink2zarr/overview.md +38 -0
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/docs/requirements.txt +1 -1
- bio2zarr-0.1.7/docs/tskit2zarr/cli_ref.md +18 -0
- bio2zarr-0.1.7/docs/tskit2zarr/overview.md +10 -0
- bio2zarr-0.1.7/docs/tskit2zarr/python_api.md +40 -0
- bio2zarr-0.1.7/docs/vcf2zarr/python_api.md +17 -0
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/docs/vcf2zarr/tutorial.md +1 -1
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/pyproject.toml +33 -18
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/validation.py +5 -5
- bio2zarr-0.1.5/bio2zarr/plink.py +0 -207
- bio2zarr-0.1.5/bio2zarr/typing.py +0 -4
- bio2zarr-0.1.5/bio2zarr/vcf2zarr/__init__.py +0 -38
- bio2zarr-0.1.5/bio2zarr/zarr_utils.py +0 -18
- bio2zarr-0.1.5/bio2zarr.egg-info/requires.txt +0 -18
- bio2zarr-0.1.5/docs/installation.md +0 -49
- bio2zarr-0.1.5/docs/intro.md +0 -36
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/.gitignore +0 -0
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/.pre-commit-config.yaml +0 -0
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/LICENSE +0 -0
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/MANIFEST.in +0 -0
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/bio2zarr/__init__.py +0 -0
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/bio2zarr/constants.py +0 -0
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/bio2zarr/provenance.py +0 -0
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/bio2zarr.egg-info/dependency_links.txt +0 -0
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/bio2zarr.egg-info/top_level.txt +0 -0
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/docs/_static/asciinema-player.css +0 -0
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/docs/_static/asciinema-player.min.js +0 -0
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/docs/_static/custom.css +0 -0
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/docs/cast_scripts/vcf2zarr_convert.sh +0 -0
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/docs/cast_scripts/vcf2zarr_explode.sh +0 -0
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/docs/logo.png +0 -0
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/docs/vcf2zarr/cli_ref.md +0 -0
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/docs/vcf2zarr/overview.md +0 -0
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/docs/vcfpartition/cli_ref.md +0 -0
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/docs/vcfpartition/overview.md +0 -0
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/setup.cfg +0 -0
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/validation-data/Makefile +0 -0
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/validation-data/split.sh +0 -0
- {bio2zarr-0.1.5 → bio2zarr-0.1.7}/vcf_generator.py +0 -0
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
name: CD
|
|
2
2
|
|
|
3
3
|
on:
|
|
4
|
+
merge_group:
|
|
4
5
|
push:
|
|
5
6
|
branches:
|
|
6
7
|
- main
|
|
@@ -18,7 +19,7 @@ jobs:
|
|
|
18
19
|
- uses: actions/checkout@v4
|
|
19
20
|
- uses: actions/setup-python@v5
|
|
20
21
|
with:
|
|
21
|
-
python-version: '3.9'
|
|
22
|
+
python-version: '3.10'
|
|
22
23
|
- name: Install dependencies
|
|
23
24
|
run: |
|
|
24
25
|
python -m pip install --upgrade pip
|
|
@@ -1,10 +1,14 @@
|
|
|
1
1
|
name: CI
|
|
2
2
|
|
|
3
3
|
on:
|
|
4
|
+
merge_group:
|
|
4
5
|
pull_request:
|
|
5
6
|
push:
|
|
6
7
|
branches:
|
|
7
8
|
- main
|
|
9
|
+
schedule:
|
|
10
|
+
# At 04:44 on Monday, see https://crontab.guru/
|
|
11
|
+
- cron: "44 4 * * 1"
|
|
8
12
|
|
|
9
13
|
jobs:
|
|
10
14
|
pre-commit:
|
|
@@ -21,24 +25,16 @@ jobs:
|
|
|
21
25
|
runs-on: ${{ matrix.os }}
|
|
22
26
|
strategy:
|
|
23
27
|
matrix:
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
os: [macos-13, macos-14, ubuntu-latest]
|
|
27
|
-
python-version: ["3.9", "3.10", "3.11", "3.12"]
|
|
28
|
+
os: [macos-14, ubuntu-latest]
|
|
29
|
+
python-version: ["3.10", "3.11", "3.12", "3.13"]
|
|
28
30
|
exclude:
|
|
29
31
|
# Just run macos tests on one Python version
|
|
30
|
-
- os: macos-13
|
|
31
|
-
python-version: "3.10"
|
|
32
|
-
- os: macos-13
|
|
33
|
-
python-version: "3.11"
|
|
34
|
-
- os: macos-13
|
|
35
|
-
python-version: "3.12"
|
|
36
|
-
- os: macos-14
|
|
37
|
-
python-version: "3.9"
|
|
38
32
|
- os: macos-14
|
|
39
33
|
python-version: "3.10"
|
|
40
34
|
- os: macos-14
|
|
41
35
|
python-version: "3.12"
|
|
36
|
+
- os: macos-14
|
|
37
|
+
python-version: "3.13"
|
|
42
38
|
steps:
|
|
43
39
|
- uses: actions/checkout@v4
|
|
44
40
|
- name: Set up Python ${{ matrix.python-version }}
|
|
@@ -70,6 +66,12 @@ jobs:
|
|
|
70
66
|
python -m bio2zarr vcf2zarr dencode-partition sample.vcz 1
|
|
71
67
|
python -m bio2zarr vcf2zarr dencode-partition sample.vcz 2
|
|
72
68
|
python -m bio2zarr vcf2zarr dencode-finalise sample.vcz
|
|
69
|
+
- name: Run tskit2zarr example
|
|
70
|
+
run: |
|
|
71
|
+
python -m bio2zarr tskit2zarr convert tests/data/tskit/example.trees sample.vcz -f
|
|
72
|
+
- name: Run plink2zarr example
|
|
73
|
+
run: |
|
|
74
|
+
python -m bio2zarr plink2zarr convert tests/data/plink/example sample.vcz -f
|
|
73
75
|
- name: Run tests
|
|
74
76
|
run: |
|
|
75
77
|
pytest --cov=bio2zarr
|
|
@@ -82,6 +84,36 @@ jobs:
|
|
|
82
84
|
# https://github.com/coverallsapp/github-action
|
|
83
85
|
fail-on-error: false
|
|
84
86
|
|
|
87
|
+
optional_dependencies:
|
|
88
|
+
name: Optional dependencies
|
|
89
|
+
runs-on: ubuntu-latest
|
|
90
|
+
steps:
|
|
91
|
+
- uses: actions/checkout@v4
|
|
92
|
+
- uses: actions/setup-python@v5
|
|
93
|
+
with:
|
|
94
|
+
python-version: '3.11'
|
|
95
|
+
- name: Test optional dependencies
|
|
96
|
+
run: |
|
|
97
|
+
python -m venv env-tskit
|
|
98
|
+
source env-tskit/bin/activate
|
|
99
|
+
python -m pip install .
|
|
100
|
+
python -m bio2zarr tskit2zarr convert tests/data/tskit/example.trees ts.vcz > ts.txt 2>&1 || echo $? > ts_exit.txt
|
|
101
|
+
test "$(cat ts_exit.txt)" = "1"
|
|
102
|
+
grep -q "This process requires the optional tskit module. Install it with: pip install bio2zarr\[tskit\]" ts.txt
|
|
103
|
+
python -m pip install '.[tskit]'
|
|
104
|
+
python -m bio2zarr tskit2zarr convert tests/data/tskit/example.trees ts.vcz
|
|
105
|
+
deactivate
|
|
106
|
+
|
|
107
|
+
python -m venv env-vcf
|
|
108
|
+
source env-vcf/bin/activate
|
|
109
|
+
python -m pip install .
|
|
110
|
+
python -m bio2zarr vcf2zarr convert tests/data/vcf/sample.vcf.gz sample.vcz > vcf.txt 2>&1 || echo $? > vcf_exit.txt
|
|
111
|
+
test "$(cat vcf_exit.txt)" = "1"
|
|
112
|
+
grep -q "This process requires the optional cyvcf2 module. Install it with: pip install bio2zarr\[vcf\]" vcf.txt
|
|
113
|
+
python -m pip install '.[vcf]'
|
|
114
|
+
python -m bio2zarr vcf2zarr convert tests/data/vcf/sample.vcf.gz sample.vcz
|
|
115
|
+
deactivate
|
|
116
|
+
|
|
85
117
|
packaging:
|
|
86
118
|
name: Packaging
|
|
87
119
|
runs-on: ubuntu-latest
|
|
@@ -108,30 +140,14 @@ jobs:
|
|
|
108
140
|
run: |
|
|
109
141
|
vcfpartition --help
|
|
110
142
|
python -m bio2zarr vcfpartition --help
|
|
111
|
-
|
|
112
|
-
test-numpy-version:
|
|
113
|
-
name: Test numpy versions
|
|
114
|
-
runs-on: ubuntu-latest
|
|
115
|
-
strategy:
|
|
116
|
-
matrix:
|
|
117
|
-
numpy: ["==1.26", ">=2"]
|
|
118
|
-
steps:
|
|
119
|
-
- uses: actions/checkout@v4
|
|
120
|
-
- uses: actions/setup-python@v5
|
|
121
|
-
with:
|
|
122
|
-
python-version: '3.11'
|
|
123
|
-
- name: Install dependencies
|
|
143
|
+
- name: Check tskit2zarr CLI
|
|
124
144
|
run: |
|
|
125
|
-
|
|
126
|
-
python -m
|
|
127
|
-
- name:
|
|
145
|
+
tskit2zarr --help
|
|
146
|
+
python -m bio2zarr tskit2zarr --help
|
|
147
|
+
- name: Check plink2zarr CLI
|
|
128
148
|
run: |
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
run: |
|
|
132
|
-
# We just run the CLI tests here because it doesn't require other upstream
|
|
133
|
-
# packages like sgkit (which are tangled up with the numpy 2 dependency)
|
|
134
|
-
python -m pytest tests/test_cli.py
|
|
149
|
+
plink2zarr --help
|
|
150
|
+
python -m bio2zarr plink2zarr --help
|
|
135
151
|
|
|
136
152
|
test-zarr-version:
|
|
137
153
|
name: Test Zarr versions
|
|
@@ -139,6 +155,10 @@ jobs:
|
|
|
139
155
|
strategy:
|
|
140
156
|
matrix:
|
|
141
157
|
zarr: ["==2.18.3", ">=3.0.3"]
|
|
158
|
+
zarr-format: [2, 3]
|
|
159
|
+
exclude:
|
|
160
|
+
- zarr: "==2.18.3"
|
|
161
|
+
zarr-format: 3
|
|
142
162
|
steps:
|
|
143
163
|
- uses: actions/checkout@v4
|
|
144
164
|
- uses: actions/setup-python@v5
|
|
@@ -154,3 +174,5 @@ jobs:
|
|
|
154
174
|
- name: Run tests
|
|
155
175
|
run: |
|
|
156
176
|
python -m pytest
|
|
177
|
+
env:
|
|
178
|
+
BIO2ZARR_ZARR_FORMAT: ${{ matrix.zarr-format }}
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
name: Docs
|
|
2
2
|
|
|
3
3
|
on:
|
|
4
|
+
merge_group:
|
|
4
5
|
pull_request:
|
|
5
6
|
push:
|
|
6
7
|
branches:
|
|
@@ -37,7 +38,7 @@ jobs:
|
|
|
37
38
|
|
|
38
39
|
- name: Install package
|
|
39
40
|
run: |
|
|
40
|
-
python3 -m pip install .
|
|
41
|
+
python3 -m pip install '.[all]'
|
|
41
42
|
|
|
42
43
|
- name: Build Docs
|
|
43
44
|
run: |
|
|
@@ -50,7 +51,7 @@ jobs:
|
|
|
50
51
|
|
|
51
52
|
deploy:
|
|
52
53
|
needs: build-docs
|
|
53
|
-
if: github.event_name != 'pull_request'
|
|
54
|
+
if: github.event_name != 'pull_request' && github.event_name != 'merge_group'
|
|
54
55
|
permissions:
|
|
55
56
|
pages: write
|
|
56
57
|
id-token: write
|
|
@@ -1,3 +1,58 @@
|
|
|
1
|
+
# 0.1.7 2026-02-03
|
|
2
|
+
|
|
3
|
+
*Bug fixes*
|
|
4
|
+
|
|
5
|
+
- Fix issue with 0-dimensional arrays (#437)
|
|
6
|
+
|
|
7
|
+
- Fix issue with pandas 3.x (required in plink code; #439)
|
|
8
|
+
|
|
9
|
+
*Breaking changes*
|
|
10
|
+
|
|
11
|
+
- Require NumPy 2 (#426)
|
|
12
|
+
|
|
13
|
+
- Require tskit >= 1.0.
|
|
14
|
+
|
|
15
|
+
- The default `isolated_as_missing` behaviour for tskit conversion now follows
|
|
16
|
+
tskit's default (currently `True`). To get the previous behaviour, create a
|
|
17
|
+
model mapping using `ts.map_to_vcf_model(isolated_as_missing=False)` and pass
|
|
18
|
+
it via the `model_mapping` parameter (or use `tskit2zarr convert --isolated-as-ancestral`).
|
|
19
|
+
|
|
20
|
+
- The `contig_id` and `isolated_as_missing` parameters to
|
|
21
|
+
`bio2zarr.tskit.convert` have been removed; set these via
|
|
22
|
+
`tskit.TreeSequence.map_to_vcf_model` and pass the returned mapping via the
|
|
23
|
+
`model_mapping` parameter.
|
|
24
|
+
|
|
25
|
+
*Maintenance*
|
|
26
|
+
|
|
27
|
+
- Add support for Python 3.13
|
|
28
|
+
|
|
29
|
+
# 0.1.6 2025-05-23
|
|
30
|
+
|
|
31
|
+
- Initial Python API support for VCF and tskit one-shot conversion. Format
|
|
32
|
+
conversion is done using the functions ``bio2zarr.vcf.convert``
|
|
33
|
+
and ``bio2zarr.tskit.convert``.
|
|
34
|
+
|
|
35
|
+
- Initial version of supported plink2zarr (#390, #344, #382)
|
|
36
|
+
|
|
37
|
+
- Initial version of tskit2zarr (#232)
|
|
38
|
+
|
|
39
|
+
- Make format-specific dependencies optional (#385)
|
|
40
|
+
|
|
41
|
+
- Remove bed_reader dependency (#397, #400)
|
|
42
|
+
|
|
43
|
+
- Change default number of worker processes to zero (#404) to simplify
|
|
44
|
+
debugging
|
|
45
|
+
|
|
46
|
+
*Breaking changes*
|
|
47
|
+
|
|
48
|
+
- Remove explicit sample, contig and filter lists from the schema.
|
|
49
|
+
Existing ICFs will need to be recreated. (#343)
|
|
50
|
+
|
|
51
|
+
- Add dimensions and default compressor and filter settings to the schema.
|
|
52
|
+
(#361)
|
|
53
|
+
|
|
54
|
+
- Various changes to existing experimental plink encoding (#390)
|
|
55
|
+
|
|
1
56
|
# 0.1.5 2025-03-31
|
|
2
57
|
|
|
3
58
|
- Add support for merging contig IDs across multiple VCFs (#335)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: bio2zarr
|
|
3
|
-
Version: 0.1.5
|
|
3
|
+
Version: 0.1.7
|
|
4
4
|
Summary: Convert bioinformatics data to Zarr
|
|
5
5
|
Author-email: sgkit Developers <project@sgkit.dev>
|
|
6
6
|
License: Apache License
|
|
@@ -216,35 +216,50 @@ Classifier: Operating System :: MacOS :: MacOS X
|
|
|
216
216
|
Classifier: Intended Audience :: Science/Research
|
|
217
217
|
Classifier: Programming Language :: Python
|
|
218
218
|
Classifier: Programming Language :: Python :: 3
|
|
219
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
220
219
|
Classifier: Programming Language :: Python :: 3.10
|
|
221
220
|
Classifier: Programming Language :: Python :: 3.11
|
|
222
221
|
Classifier: Programming Language :: Python :: 3.12
|
|
222
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
223
223
|
Classifier: Topic :: Scientific/Engineering
|
|
224
|
-
Requires-Python: >=3.9
|
|
224
|
+
Requires-Python: >=3.10
|
|
225
225
|
Description-Content-Type: text/markdown
|
|
226
226
|
License-File: LICENSE
|
|
227
|
-
Requires-Dist: numpy>=1.26
|
|
227
|
+
Requires-Dist: numpy>=2
|
|
228
228
|
Requires-Dist: zarr<3,>=2.17
|
|
229
|
-
Requires-Dist:
|
|
229
|
+
Requires-Dist: numcodecs[msgpack]!=0.14.0,!=0.14.1,<0.16
|
|
230
230
|
Requires-Dist: tabulate
|
|
231
231
|
Requires-Dist: tqdm
|
|
232
232
|
Requires-Dist: humanfriendly
|
|
233
|
-
Requires-Dist:
|
|
234
|
-
Requires-Dist:
|
|
233
|
+
Requires-Dist: coloredlogs
|
|
234
|
+
Requires-Dist: click
|
|
235
|
+
Requires-Dist: pandas
|
|
235
236
|
Provides-Extra: dev
|
|
237
|
+
Requires-Dist: click>=8.2.0; extra == "dev"
|
|
236
238
|
Requires-Dist: hypothesis-vcf; extra == "dev"
|
|
237
239
|
Requires-Dist: msprime; extra == "dev"
|
|
238
240
|
Requires-Dist: pysam; extra == "dev"
|
|
239
241
|
Requires-Dist: pytest; extra == "dev"
|
|
240
242
|
Requires-Dist: pytest-coverage; extra == "dev"
|
|
241
243
|
Requires-Dist: pytest-xdist; extra == "dev"
|
|
242
|
-
Requires-Dist: sgkit>=0.8.0; extra == "dev"
|
|
243
244
|
Requires-Dist: tqdm; extra == "dev"
|
|
245
|
+
Requires-Dist: tskit>=1; extra == "dev"
|
|
246
|
+
Requires-Dist: bed_reader; extra == "dev"
|
|
247
|
+
Requires-Dist: cyvcf2; extra == "dev"
|
|
248
|
+
Requires-Dist: xarray<2025.03.1; extra == "dev"
|
|
249
|
+
Requires-Dist: dask[array]<=2024.8.0,>=2022.01.0; extra == "dev"
|
|
250
|
+
Provides-Extra: tskit
|
|
251
|
+
Requires-Dist: tskit>=1; extra == "tskit"
|
|
252
|
+
Provides-Extra: vcf
|
|
253
|
+
Requires-Dist: cyvcf2; extra == "vcf"
|
|
254
|
+
Provides-Extra: all
|
|
255
|
+
Requires-Dist: tskit>=1; extra == "all"
|
|
256
|
+
Requires-Dist: cyvcf2; extra == "all"
|
|
244
257
|
Dynamic: license-file
|
|
245
258
|
|
|
246
259
|
[](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml)
|
|
247
260
|
[](https://coveralls.io/github/sgkit-dev/bio2zarr)
|
|
261
|
+
[](https://pepy.tech/projects/bio2zarr)
|
|
262
|
+
[](https://anaconda.org/bioconda/bio2zarr)
|
|
248
263
|
|
|
249
264
|
|
|
250
265
|
# bio2zarr
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
[](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml)
|
|
2
2
|
[](https://coveralls.io/github/sgkit-dev/bio2zarr)
|
|
3
|
+
[](https://pepy.tech/projects/bio2zarr)
|
|
4
|
+
[](https://anaconda.org/bioconda/bio2zarr)
|
|
3
5
|
|
|
4
6
|
|
|
5
7
|
# bio2zarr
|
|
@@ -15,7 +15,8 @@ def bio2zarr():
|
|
|
15
15
|
# is handy for development and for those whose PATHs aren't set
|
|
16
16
|
# up in the right way.
|
|
17
17
|
bio2zarr.add_command(cli.vcf2zarr_main)
|
|
18
|
-
bio2zarr.add_command(cli.
|
|
18
|
+
bio2zarr.add_command(cli.plink2zarr_main)
|
|
19
|
+
bio2zarr.add_command(cli.tskit2zarr_main)
|
|
19
20
|
bio2zarr.add_command(cli.vcfpartition)
|
|
20
21
|
|
|
21
22
|
if __name__ == "__main__":
|
|
@@ -1,7 +1,14 @@
|
|
|
1
1
|
# file generated by setuptools-scm
|
|
2
2
|
# don't change, don't track in version control
|
|
3
3
|
|
|
4
|
-
__all__ = [
|
|
4
|
+
__all__ = [
|
|
5
|
+
"__version__",
|
|
6
|
+
"__version_tuple__",
|
|
7
|
+
"version",
|
|
8
|
+
"version_tuple",
|
|
9
|
+
"__commit_id__",
|
|
10
|
+
"commit_id",
|
|
11
|
+
]
|
|
5
12
|
|
|
6
13
|
TYPE_CHECKING = False
|
|
7
14
|
if TYPE_CHECKING:
|
|
@@ -9,13 +16,19 @@ if TYPE_CHECKING:
|
|
|
9
16
|
from typing import Union
|
|
10
17
|
|
|
11
18
|
VERSION_TUPLE = Tuple[Union[int, str], ...]
|
|
19
|
+
COMMIT_ID = Union[str, None]
|
|
12
20
|
else:
|
|
13
21
|
VERSION_TUPLE = object
|
|
22
|
+
COMMIT_ID = object
|
|
14
23
|
|
|
15
24
|
version: str
|
|
16
25
|
__version__: str
|
|
17
26
|
__version_tuple__: VERSION_TUPLE
|
|
18
27
|
version_tuple: VERSION_TUPLE
|
|
28
|
+
commit_id: COMMIT_ID
|
|
29
|
+
__commit_id__: COMMIT_ID
|
|
19
30
|
|
|
20
|
-
__version__ = version = '0.1.5'
|
|
21
|
-
__version_tuple__ = version_tuple = (0, 1, 5)
|
|
31
|
+
__version__ = version = '0.1.7'
|
|
32
|
+
__version_tuple__ = version_tuple = (0, 1, 7)
|
|
33
|
+
|
|
34
|
+
__commit_id__ = commit_id = 'g4359d72e2'
|
|
@@ -8,8 +8,9 @@ import coloredlogs
|
|
|
8
8
|
import numcodecs
|
|
9
9
|
import tabulate
|
|
10
10
|
|
|
11
|
-
from . import plink, provenance,
|
|
12
|
-
from .
|
|
11
|
+
from . import core, plink, provenance, vcf_utils
|
|
12
|
+
from . import tskit as tskit_mod
|
|
13
|
+
from . import vcf as vcf_mod
|
|
13
14
|
|
|
14
15
|
logger = logging.getLogger(__name__)
|
|
15
16
|
|
|
@@ -88,7 +89,12 @@ json = click.option(
|
|
|
88
89
|
version = click.version_option(version=f"{provenance.__version__}")
|
|
89
90
|
|
|
90
91
|
worker_processes = click.option(
|
|
91
|
-
"-p",
|
|
92
|
+
"-p",
|
|
93
|
+
"--worker-processes",
|
|
94
|
+
type=int,
|
|
95
|
+
default=core.DEFAULT_WORKER_PROCESSES,
|
|
96
|
+
help="Number of worker processes",
|
|
97
|
+
show_default=True,
|
|
92
98
|
)
|
|
93
99
|
|
|
94
100
|
column_chunk_size = click.option(
|
|
@@ -197,7 +203,7 @@ def check_partitions(num_partitions):
|
|
|
197
203
|
def get_compressor(cname):
|
|
198
204
|
if cname is None:
|
|
199
205
|
return None
|
|
200
|
-
config =
|
|
206
|
+
config = vcf_mod.ICF_DEFAULT_COMPRESSOR.get_config()
|
|
201
207
|
config["cname"] = cname
|
|
202
208
|
return numcodecs.get_codec(config)
|
|
203
209
|
|
|
@@ -236,7 +242,7 @@ def explode(
|
|
|
236
242
|
"""
|
|
237
243
|
setup_logging(verbose)
|
|
238
244
|
check_overwrite_dir(icf_path, force)
|
|
239
|
-
|
|
245
|
+
vcf_mod.explode(
|
|
240
246
|
icf_path,
|
|
241
247
|
vcfs,
|
|
242
248
|
worker_processes=worker_processes,
|
|
@@ -276,7 +282,7 @@ def dexplode_init(
|
|
|
276
282
|
setup_logging(verbose)
|
|
277
283
|
check_overwrite_dir(icf_path, force)
|
|
278
284
|
check_partitions(num_partitions)
|
|
279
|
-
work_summary =
|
|
285
|
+
work_summary = vcf_mod.explode_init(
|
|
280
286
|
icf_path,
|
|
281
287
|
vcfs,
|
|
282
288
|
target_num_partitions=num_partitions,
|
|
@@ -304,7 +310,7 @@ def dexplode_partition(icf_path, partition, verbose, one_based):
|
|
|
304
310
|
setup_logging(verbose)
|
|
305
311
|
if one_based:
|
|
306
312
|
partition -= 1
|
|
307
|
-
|
|
313
|
+
vcf_mod.explode_partition(icf_path, partition)
|
|
308
314
|
|
|
309
315
|
|
|
310
316
|
@click.command
|
|
@@ -315,7 +321,7 @@ def dexplode_finalise(icf_path, verbose):
|
|
|
315
321
|
Final step for distributed conversion of VCF(s) to intermediate columnar format.
|
|
316
322
|
"""
|
|
317
323
|
setup_logging(verbose)
|
|
318
|
-
|
|
324
|
+
vcf_mod.explode_finalise(icf_path)
|
|
319
325
|
|
|
320
326
|
|
|
321
327
|
@click.command
|
|
@@ -326,7 +332,7 @@ def inspect(path, verbose):
|
|
|
326
332
|
Inspect an intermediate columnar format or Zarr path.
|
|
327
333
|
"""
|
|
328
334
|
setup_logging(verbose)
|
|
329
|
-
data =
|
|
335
|
+
data = vcf_mod.inspect(path)
|
|
330
336
|
click.echo(tabulate.tabulate(data, headers="keys"))
|
|
331
337
|
|
|
332
338
|
|
|
@@ -345,7 +351,7 @@ def mkschema(icf_path, variants_chunk_size, samples_chunk_size, local_alleles):
|
|
|
345
351
|
err=True,
|
|
346
352
|
)
|
|
347
353
|
stream = click.get_text_stream("stdout")
|
|
348
|
-
|
|
354
|
+
vcf_mod.mkschema(
|
|
349
355
|
icf_path,
|
|
350
356
|
stream,
|
|
351
357
|
variants_chunk_size=variants_chunk_size,
|
|
@@ -380,11 +386,11 @@ def encode(
|
|
|
380
386
|
worker_processes,
|
|
381
387
|
):
|
|
382
388
|
"""
|
|
383
|
-
Convert intermediate columnar format to
|
|
389
|
+
Convert intermediate columnar format to VCF Zarr.
|
|
384
390
|
"""
|
|
385
391
|
setup_logging(verbose)
|
|
386
392
|
check_overwrite_dir(zarr_path, force)
|
|
387
|
-
|
|
393
|
+
vcf_mod.encode(
|
|
388
394
|
icf_path,
|
|
389
395
|
zarr_path,
|
|
390
396
|
schema_path=schema,
|
|
@@ -438,7 +444,7 @@ def dencode_init(
|
|
|
438
444
|
setup_logging(verbose)
|
|
439
445
|
check_overwrite_dir(zarr_path, force)
|
|
440
446
|
check_partitions(num_partitions)
|
|
441
|
-
work_summary =
|
|
447
|
+
work_summary = vcf_mod.encode_init(
|
|
442
448
|
icf_path,
|
|
443
449
|
zarr_path,
|
|
444
450
|
target_num_partitions=num_partitions,
|
|
@@ -466,7 +472,7 @@ def dencode_partition(zarr_path, partition, verbose, one_based):
|
|
|
466
472
|
setup_logging(verbose)
|
|
467
473
|
if one_based:
|
|
468
474
|
partition -= 1
|
|
469
|
-
|
|
475
|
+
vcf_mod.encode_partition(zarr_path, partition)
|
|
470
476
|
|
|
471
477
|
|
|
472
478
|
@click.command
|
|
@@ -478,7 +484,7 @@ def dencode_finalise(zarr_path, verbose, progress):
|
|
|
478
484
|
Final step for distributed conversion of ICF to VCF Zarr.
|
|
479
485
|
"""
|
|
480
486
|
setup_logging(verbose)
|
|
481
|
-
|
|
487
|
+
vcf_mod.encode_finalise(zarr_path, show_progress=progress)
|
|
482
488
|
|
|
483
489
|
|
|
484
490
|
@click.command(name="convert")
|
|
@@ -503,11 +509,11 @@ def convert_vcf(
|
|
|
503
509
|
local_alleles,
|
|
504
510
|
):
|
|
505
511
|
"""
|
|
506
|
-
Convert input VCF(s) directly to
|
|
512
|
+
Convert input VCF(s) directly to VCF Zarr (not recommended for large files).
|
|
507
513
|
"""
|
|
508
514
|
setup_logging(verbose)
|
|
509
515
|
check_overwrite_dir(zarr_path, force)
|
|
510
|
-
|
|
516
|
+
vcf_mod.convert(
|
|
511
517
|
vcfs,
|
|
512
518
|
zarr_path,
|
|
513
519
|
variants_chunk_size=variants_chunk_size,
|
|
@@ -522,9 +528,10 @@ def convert_vcf(
|
|
|
522
528
|
@click.group(cls=NaturalOrderGroup, name="vcf2zarr")
|
|
523
529
|
def vcf2zarr_main():
|
|
524
530
|
"""
|
|
525
|
-
Convert VCF file(s) to
|
|
531
|
+
Convert VCF file(s) to VCF Zarr format.
|
|
526
532
|
|
|
527
533
|
See the online documentation at https://sgkit-dev.github.io/bio2zarr/
|
|
534
|
+
|
|
528
535
|
for more information.
|
|
529
536
|
"""
|
|
530
537
|
|
|
@@ -545,6 +552,7 @@ vcf2zarr_main.add_command(dencode_finalise)
|
|
|
545
552
|
@click.command(name="convert")
|
|
546
553
|
@click.argument("in_path", type=click.Path())
|
|
547
554
|
@click.argument("zarr_path", type=click.Path())
|
|
555
|
+
@force
|
|
548
556
|
@worker_processes
|
|
549
557
|
@progress
|
|
550
558
|
@verbose
|
|
@@ -553,6 +561,7 @@ vcf2zarr_main.add_command(dencode_finalise)
|
|
|
553
561
|
def convert_plink(
|
|
554
562
|
in_path,
|
|
555
563
|
zarr_path,
|
|
564
|
+
force,
|
|
556
565
|
verbose,
|
|
557
566
|
worker_processes,
|
|
558
567
|
progress,
|
|
@@ -560,9 +569,12 @@ def convert_plink(
|
|
|
560
569
|
samples_chunk_size,
|
|
561
570
|
):
|
|
562
571
|
"""
|
|
563
|
-
|
|
572
|
+
Convert plink fileset to VCF Zarr. Results are equivalent to
|
|
573
|
+
`plink1.9 --bfile prefix --keep-allele-order --recode vcf-iid --out tmp`
|
|
574
|
+
then running `vcf2zarr convert tmp.vcf zarr_path`
|
|
564
575
|
"""
|
|
565
576
|
setup_logging(verbose)
|
|
577
|
+
check_overwrite_dir(zarr_path, force)
|
|
566
578
|
plink.convert(
|
|
567
579
|
in_path,
|
|
568
580
|
zarr_path,
|
|
@@ -574,12 +586,15 @@ def convert_plink(
|
|
|
574
586
|
|
|
575
587
|
|
|
576
588
|
@version
|
|
577
|
-
@click.group()
|
|
578
|
-
def
|
|
589
|
+
@click.group(name="plink2zarr")
|
|
590
|
+
def plink2zarr_main():
|
|
591
|
+
"""
|
|
592
|
+
Convert plink fileset(s) to VCF Zarr format
|
|
593
|
+
"""
|
|
579
594
|
pass
|
|
580
595
|
|
|
581
596
|
|
|
582
|
-
|
|
597
|
+
plink2zarr_main.add_command(convert_plink)
|
|
583
598
|
|
|
584
599
|
|
|
585
600
|
@click.command
|
|
@@ -630,3 +645,68 @@ def vcfpartition(vcfs, verbose, num_partitions, partition_size):
|
|
|
630
645
|
)
|
|
631
646
|
for region in regions:
|
|
632
647
|
click.echo(f"{region}\t{vcf_path}")
|
|
648
|
+
|
|
649
|
+
|
|
650
|
+
@click.command(name="convert")
|
|
651
|
+
@click.argument("ts_path", type=click.Path(exists=True))
|
|
652
|
+
@click.argument("zarr_path", type=click.Path())
|
|
653
|
+
@click.option("--contig-id", type=str, help="Contig/chromosome ID (default: '1')")
|
|
654
|
+
@click.option(
|
|
655
|
+
"--isolated-as-missing/--isolated-as-ancestral",
|
|
656
|
+
default=None,
|
|
657
|
+
help=(
|
|
658
|
+
"Treat isolated samples without mutations as missing or ancestral "
|
|
659
|
+
"(default: tskit default)"
|
|
660
|
+
),
|
|
661
|
+
)
|
|
662
|
+
@variants_chunk_size
|
|
663
|
+
@samples_chunk_size
|
|
664
|
+
@verbose
|
|
665
|
+
@progress
|
|
666
|
+
@worker_processes
|
|
667
|
+
@force
|
|
668
|
+
@core.requires_optional_dependency("tskit", "tskit")
|
|
669
|
+
def convert_tskit(
|
|
670
|
+
ts_path,
|
|
671
|
+
zarr_path,
|
|
672
|
+
contig_id,
|
|
673
|
+
isolated_as_missing,
|
|
674
|
+
variants_chunk_size,
|
|
675
|
+
samples_chunk_size,
|
|
676
|
+
verbose,
|
|
677
|
+
progress,
|
|
678
|
+
worker_processes,
|
|
679
|
+
force,
|
|
680
|
+
):
|
|
681
|
+
setup_logging(verbose)
|
|
682
|
+
check_overwrite_dir(zarr_path, force)
|
|
683
|
+
|
|
684
|
+
import tskit
|
|
685
|
+
|
|
686
|
+
ts = tskit.load(ts_path)
|
|
687
|
+
model_mapping = ts.map_to_vcf_model(
|
|
688
|
+
contig_id=contig_id,
|
|
689
|
+
isolated_as_missing=isolated_as_missing,
|
|
690
|
+
)
|
|
691
|
+
|
|
692
|
+
tskit_mod.convert(
|
|
693
|
+
ts_path,
|
|
694
|
+
zarr_path,
|
|
695
|
+
model_mapping=model_mapping,
|
|
696
|
+
variants_chunk_size=variants_chunk_size,
|
|
697
|
+
samples_chunk_size=samples_chunk_size,
|
|
698
|
+
worker_processes=worker_processes,
|
|
699
|
+
show_progress=progress,
|
|
700
|
+
)
|
|
701
|
+
|
|
702
|
+
|
|
703
|
+
@version
|
|
704
|
+
@click.group(name="tskit2zarr")
|
|
705
|
+
def tskit2zarr_main():
|
|
706
|
+
"""
|
|
707
|
+
Convert tskit tree sequence(s) to VCF Zarr format
|
|
708
|
+
"""
|
|
709
|
+
pass
|
|
710
|
+
|
|
711
|
+
|
|
712
|
+
tskit2zarr_main.add_command(convert_tskit)
|