bio2zarr 0.1.5__tar.gz → 0.1.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of bio2zarr might be problematic. Click here for more details.

Files changed (68)
  1. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/.github/workflows/cd.yml +2 -1
  2. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/.github/workflows/ci.yml +56 -34
  3. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/.github/workflows/docs.yml +3 -2
  4. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/CHANGELOG.md +55 -0
  5. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/PKG-INFO +23 -8
  6. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/README.md +2 -0
  7. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/bio2zarr/__main__.py +2 -1
  8. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/bio2zarr/_version.py +16 -3
  9. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/bio2zarr/cli.py +102 -22
  10. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/bio2zarr/core.py +43 -22
  11. bio2zarr-0.1.7/bio2zarr/plink.py +334 -0
  12. bio2zarr-0.1.7/bio2zarr/tskit.py +296 -0
  13. bio2zarr-0.1.7/bio2zarr/typing.py +3 -0
  14. bio2zarr-0.1.5/bio2zarr/vcf2zarr/icf.py → bio2zarr-0.1.7/bio2zarr/vcf.py +606 -114
  15. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/bio2zarr/vcf_utils.py +12 -11
  16. {bio2zarr-0.1.5/bio2zarr/vcf2zarr → bio2zarr-0.1.7/bio2zarr}/vcz.py +568 -739
  17. bio2zarr-0.1.5/bio2zarr/vcf2zarr/verification.py → bio2zarr-0.1.7/bio2zarr/vcz_verification.py +5 -2
  18. bio2zarr-0.1.7/bio2zarr/zarr_utils.py +185 -0
  19. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/bio2zarr.egg-info/PKG-INFO +23 -8
  20. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/bio2zarr.egg-info/SOURCES.txt +10 -4
  21. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/bio2zarr.egg-info/entry_points.txt +2 -0
  22. bio2zarr-0.1.7/bio2zarr.egg-info/requires.txt +34 -0
  23. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/docs/Makefile +3 -2
  24. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/docs/_config.yml +5 -1
  25. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/docs/_toc.yml +8 -0
  26. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/docs/build.sh +2 -2
  27. bio2zarr-0.1.7/docs/installation.md +62 -0
  28. bio2zarr-0.1.7/docs/intro.md +38 -0
  29. bio2zarr-0.1.7/docs/plink2zarr/cli_ref.md +17 -0
  30. bio2zarr-0.1.7/docs/plink2zarr/overview.md +38 -0
  31. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/docs/requirements.txt +1 -1
  32. bio2zarr-0.1.7/docs/tskit2zarr/cli_ref.md +18 -0
  33. bio2zarr-0.1.7/docs/tskit2zarr/overview.md +10 -0
  34. bio2zarr-0.1.7/docs/tskit2zarr/python_api.md +40 -0
  35. bio2zarr-0.1.7/docs/vcf2zarr/python_api.md +17 -0
  36. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/docs/vcf2zarr/tutorial.md +1 -1
  37. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/pyproject.toml +33 -18
  38. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/validation.py +5 -5
  39. bio2zarr-0.1.5/bio2zarr/plink.py +0 -207
  40. bio2zarr-0.1.5/bio2zarr/typing.py +0 -4
  41. bio2zarr-0.1.5/bio2zarr/vcf2zarr/__init__.py +0 -38
  42. bio2zarr-0.1.5/bio2zarr/zarr_utils.py +0 -18
  43. bio2zarr-0.1.5/bio2zarr.egg-info/requires.txt +0 -18
  44. bio2zarr-0.1.5/docs/installation.md +0 -49
  45. bio2zarr-0.1.5/docs/intro.md +0 -36
  46. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/.gitignore +0 -0
  47. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/.pre-commit-config.yaml +0 -0
  48. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/LICENSE +0 -0
  49. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/MANIFEST.in +0 -0
  50. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/bio2zarr/__init__.py +0 -0
  51. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/bio2zarr/constants.py +0 -0
  52. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/bio2zarr/provenance.py +0 -0
  53. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/bio2zarr.egg-info/dependency_links.txt +0 -0
  54. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/bio2zarr.egg-info/top_level.txt +0 -0
  55. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/docs/_static/asciinema-player.css +0 -0
  56. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/docs/_static/asciinema-player.min.js +0 -0
  57. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/docs/_static/custom.css +0 -0
  58. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/docs/cast_scripts/vcf2zarr_convert.sh +0 -0
  59. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/docs/cast_scripts/vcf2zarr_explode.sh +0 -0
  60. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/docs/logo.png +0 -0
  61. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/docs/vcf2zarr/cli_ref.md +0 -0
  62. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/docs/vcf2zarr/overview.md +0 -0
  63. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/docs/vcfpartition/cli_ref.md +0 -0
  64. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/docs/vcfpartition/overview.md +0 -0
  65. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/setup.cfg +0 -0
  66. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/validation-data/Makefile +0 -0
  67. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/validation-data/split.sh +0 -0
  68. {bio2zarr-0.1.5 → bio2zarr-0.1.7}/vcf_generator.py +0 -0
@@ -1,6 +1,7 @@
1
1
  name: CD
2
2
 
3
3
  on:
4
+ merge_group:
4
5
  push:
5
6
  branches:
6
7
  - main
@@ -18,7 +19,7 @@ jobs:
18
19
  - uses: actions/checkout@v4
19
20
  - uses: actions/setup-python@v5
20
21
  with:
21
- python-version: '3.9'
22
+ python-version: '3.10'
22
23
  - name: Install dependencies
23
24
  run: |
24
25
  python -m pip install --upgrade pip
@@ -1,10 +1,14 @@
1
1
  name: CI
2
2
 
3
3
  on:
4
+ merge_group:
4
5
  pull_request:
5
6
  push:
6
7
  branches:
7
8
  - main
9
+ schedule:
10
+ # At 04:44 on Monday, see https://crontab.guru/
11
+ - cron: "44 4 * * 1"
8
12
 
9
13
  jobs:
10
14
  pre-commit:
@@ -21,24 +25,16 @@ jobs:
21
25
  runs-on: ${{ matrix.os }}
22
26
  strategy:
23
27
  matrix:
24
- # Use macos-13 because pip binary packages for ARM aren't
25
- # available for many dependencies
26
- os: [macos-13, macos-14, ubuntu-latest]
27
- python-version: ["3.9", "3.10", "3.11", "3.12"]
28
+ os: [macos-14, ubuntu-latest]
29
+ python-version: ["3.10", "3.11", "3.12", "3.13"]
28
30
  exclude:
29
31
  # Just run macos tests on one Python version
30
- - os: macos-13
31
- python-version: "3.10"
32
- - os: macos-13
33
- python-version: "3.11"
34
- - os: macos-13
35
- python-version: "3.12"
36
- - os: macos-14
37
- python-version: "3.9"
38
32
  - os: macos-14
39
33
  python-version: "3.10"
40
34
  - os: macos-14
41
35
  python-version: "3.12"
36
+ - os: macos-14
37
+ python-version: "3.13"
42
38
  steps:
43
39
  - uses: actions/checkout@v4
44
40
  - name: Set up Python ${{ matrix.python-version }}
@@ -70,6 +66,12 @@ jobs:
70
66
  python -m bio2zarr vcf2zarr dencode-partition sample.vcz 1
71
67
  python -m bio2zarr vcf2zarr dencode-partition sample.vcz 2
72
68
  python -m bio2zarr vcf2zarr dencode-finalise sample.vcz
69
+ - name: Run tskit2zarr example
70
+ run: |
71
+ python -m bio2zarr tskit2zarr convert tests/data/tskit/example.trees sample.vcz -f
72
+ - name: Run plink2zarr example
73
+ run: |
74
+ python -m bio2zarr plink2zarr convert tests/data/plink/example sample.vcz -f
73
75
  - name: Run tests
74
76
  run: |
75
77
  pytest --cov=bio2zarr
@@ -82,6 +84,36 @@ jobs:
82
84
  # https://github.com/coverallsapp/github-action
83
85
  fail-on-error: false
84
86
 
87
+ optional_dependencies:
88
+ name: Optional dependencies
89
+ runs-on: ubuntu-latest
90
+ steps:
91
+ - uses: actions/checkout@v4
92
+ - uses: actions/setup-python@v5
93
+ with:
94
+ python-version: '3.11'
95
+ - name: Test optional dependencies
96
+ run: |
97
+ python -m venv env-tskit
98
+ source env-tskit/bin/activate
99
+ python -m pip install .
100
+ python -m bio2zarr tskit2zarr convert tests/data/tskit/example.trees ts.vcz > ts.txt 2>&1 || echo $? > ts_exit.txt
101
+ test "$(cat ts_exit.txt)" = "1"
102
+ grep -q "This process requires the optional tskit module. Install it with: pip install bio2zarr\[tskit\]" ts.txt
103
+ python -m pip install '.[tskit]'
104
+ python -m bio2zarr tskit2zarr convert tests/data/tskit/example.trees ts.vcz
105
+ deactivate
106
+
107
+ python -m venv env-vcf
108
+ source env-vcf/bin/activate
109
+ python -m pip install .
110
+ python -m bio2zarr vcf2zarr convert tests/data/vcf/sample.vcf.gz sample.vcz > vcf.txt 2>&1 || echo $? > vcf_exit.txt
111
+ test "$(cat vcf_exit.txt)" = "1"
112
+ grep -q "This process requires the optional cyvcf2 module. Install it with: pip install bio2zarr\[vcf\]" vcf.txt
113
+ python -m pip install '.[vcf]'
114
+ python -m bio2zarr vcf2zarr convert tests/data/vcf/sample.vcf.gz sample.vcz
115
+ deactivate
116
+
85
117
  packaging:
86
118
  name: Packaging
87
119
  runs-on: ubuntu-latest
@@ -108,30 +140,14 @@ jobs:
108
140
  run: |
109
141
  vcfpartition --help
110
142
  python -m bio2zarr vcfpartition --help
111
-
112
- test-numpy-version:
113
- name: Test numpy versions
114
- runs-on: ubuntu-latest
115
- strategy:
116
- matrix:
117
- numpy: ["==1.26", ">=2"]
118
- steps:
119
- - uses: actions/checkout@v4
120
- - uses: actions/setup-python@v5
121
- with:
122
- python-version: '3.11'
123
- - name: Install dependencies
143
+ - name: Check tskit2zarr CLI
124
144
  run: |
125
- python -m pip install --upgrade pip
126
- python -m pip install '.[dev]'
127
- - name: Install numpy${{ matrix.numpy }}
145
+ tskit2zarr --help
146
+ python -m bio2zarr tskit2zarr --help
147
+ - name: Check plink2zarr CLI
128
148
  run: |
129
- python -m pip install 'numpy${{ matrix.numpy }}'
130
- - name: Run tests
131
- run: |
132
- # We just run the CLI tests here because it doesn't require other upstream
133
- # packages like sgkit (which are tangled up with the numpy 2 dependency)
134
- python -m pytest tests/test_cli.py
149
+ plink2zarr --help
150
+ python -m bio2zarr plink2zarr --help
135
151
 
136
152
  test-zarr-version:
137
153
  name: Test Zarr versions
@@ -139,6 +155,10 @@ jobs:
139
155
  strategy:
140
156
  matrix:
141
157
  zarr: ["==2.18.3", ">=3.0.3"]
158
+ zarr-format: [2, 3]
159
+ exclude:
160
+ - zarr: "==2.18.3"
161
+ zarr-format: 3
142
162
  steps:
143
163
  - uses: actions/checkout@v4
144
164
  - uses: actions/setup-python@v5
@@ -154,3 +174,5 @@ jobs:
154
174
  - name: Run tests
155
175
  run: |
156
176
  python -m pytest
177
+ env:
178
+ BIO2ZARR_ZARR_FORMAT: ${{ matrix.zarr-format }}
@@ -1,6 +1,7 @@
1
1
  name: Docs
2
2
 
3
3
  on:
4
+ merge_group:
4
5
  pull_request:
5
6
  push:
6
7
  branches:
@@ -37,7 +38,7 @@ jobs:
37
38
 
38
39
  - name: Install package
39
40
  run: |
40
- python3 -m pip install .
41
+ python3 -m pip install '.[all]'
41
42
 
42
43
  - name: Build Docs
43
44
  run: |
@@ -50,7 +51,7 @@ jobs:
50
51
 
51
52
  deploy:
52
53
  needs: build-docs
53
- if: github.event_name != 'pull_request'
54
+ if: github.event_name != 'pull_request' && github.event_name != 'merge_group'
54
55
  permissions:
55
56
  pages: write
56
57
  id-token: write
@@ -1,3 +1,58 @@
1
+ # 0.1.7 2026-02-03
2
+
3
+ *Bug fixes*
4
+
5
+ - Fix issue with 0-dimensional arrays (#437)
6
+
7
+ - Fix issue with pandas 3.x (required in plink code; #439)
8
+
9
+ *Breaking changes*
10
+
11
+ - Require NumPy 2 (#426)
12
+
13
+ - Require tskit >= 1.0.
14
+
15
+ - The default `isolated_as_missing` behaviour for tskit conversion now follows
16
+ tskit's default (currently `True`). To get the previous behaviour, create a
17
+ model mapping using `ts.map_to_vcf_model(isolated_as_missing=False)` and pass
18
+ it via the `model_mapping` parameter (or use `tskit2zarr convert --isolated-as-ancestral`).
19
+
20
+ - The `contig_id` and `isolated_as_missing` parameters to
21
+ `bio2zarr.tskit.convert` have been removed; set these via
22
+ `tskit.TreeSequence.map_to_vcf_model` and pass the returned mapping via the
23
+ `model_mapping` parameter.
24
+
25
+ *Maintenance*
26
+
27
+ - Add support for Python 3.13
28
+
29
+ # 0.1.6 2025-05-23
30
+
31
+ - Initial Python API support for VCF and tskit one-shot conversion. Format
32
+ conversion is done using the functions ``bio2zarr.vcf.convert``
33
+ and ``bio2zarr.tskit.convert``.
34
+
35
+ - Initial version of supported plink2zarr (#390, #344, #382)
36
+
37
+ - Initial version of tskit2zarr (#232)
38
+
39
+ - Make format-specific dependencies optional (#385)
40
+
41
+ - Remove bed_reader dependency (#397, #400)
42
+
43
+ - Change default number of worker processes to zero (#404) to simplify
44
+ debugging
45
+
46
+ *Breaking changes*
47
+
48
+ - Remove explicit sample, contig and filter lists from the schema.
49
+ Existing ICFs will need to be recreated. (#343)
50
+
51
+ - Add dimensions and default compressor and filter settings to the schema.
52
+ (#361)
53
+
54
+ - Various changes to existing experimental plink encoding (#390)
55
+
1
56
  # 0.1.5 2025-03-31
2
57
 
3
58
  - Add support for merging contig IDs across multiple VCFs (#335)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bio2zarr
3
- Version: 0.1.5
3
+ Version: 0.1.7
4
4
  Summary: Convert bioinformatics data to Zarr
5
5
  Author-email: sgkit Developers <project@sgkit.dev>
6
6
  License: Apache License
@@ -216,35 +216,50 @@ Classifier: Operating System :: MacOS :: MacOS X
216
216
  Classifier: Intended Audience :: Science/Research
217
217
  Classifier: Programming Language :: Python
218
218
  Classifier: Programming Language :: Python :: 3
219
- Classifier: Programming Language :: Python :: 3.9
220
219
  Classifier: Programming Language :: Python :: 3.10
221
220
  Classifier: Programming Language :: Python :: 3.11
222
221
  Classifier: Programming Language :: Python :: 3.12
222
+ Classifier: Programming Language :: Python :: 3.13
223
223
  Classifier: Topic :: Scientific/Engineering
224
- Requires-Python: >=3.9
224
+ Requires-Python: >=3.10
225
225
  Description-Content-Type: text/markdown
226
226
  License-File: LICENSE
227
- Requires-Dist: numpy>=1.26
227
+ Requires-Dist: numpy>=2
228
228
  Requires-Dist: zarr<3,>=2.17
229
- Requires-Dist: click
229
+ Requires-Dist: numcodecs[msgpack]!=0.14.0,!=0.14.1,<0.16
230
230
  Requires-Dist: tabulate
231
231
  Requires-Dist: tqdm
232
232
  Requires-Dist: humanfriendly
233
- Requires-Dist: cyvcf2
234
- Requires-Dist: bed_reader
233
+ Requires-Dist: coloredlogs
234
+ Requires-Dist: click
235
+ Requires-Dist: pandas
235
236
  Provides-Extra: dev
237
+ Requires-Dist: click>=8.2.0; extra == "dev"
236
238
  Requires-Dist: hypothesis-vcf; extra == "dev"
237
239
  Requires-Dist: msprime; extra == "dev"
238
240
  Requires-Dist: pysam; extra == "dev"
239
241
  Requires-Dist: pytest; extra == "dev"
240
242
  Requires-Dist: pytest-coverage; extra == "dev"
241
243
  Requires-Dist: pytest-xdist; extra == "dev"
242
- Requires-Dist: sgkit>=0.8.0; extra == "dev"
243
244
  Requires-Dist: tqdm; extra == "dev"
245
+ Requires-Dist: tskit>=1; extra == "dev"
246
+ Requires-Dist: bed_reader; extra == "dev"
247
+ Requires-Dist: cyvcf2; extra == "dev"
248
+ Requires-Dist: xarray<2025.03.1; extra == "dev"
249
+ Requires-Dist: dask[array]<=2024.8.0,>=2022.01.0; extra == "dev"
250
+ Provides-Extra: tskit
251
+ Requires-Dist: tskit>=1; extra == "tskit"
252
+ Provides-Extra: vcf
253
+ Requires-Dist: cyvcf2; extra == "vcf"
254
+ Provides-Extra: all
255
+ Requires-Dist: tskit>=1; extra == "all"
256
+ Requires-Dist: cyvcf2; extra == "all"
244
257
  Dynamic: license-file
245
258
 
246
259
  [![CI](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml)
247
260
  [![Coverage Status](https://coveralls.io/repos/github/sgkit-dev/bio2zarr/badge.svg)](https://coveralls.io/github/sgkit-dev/bio2zarr)
261
+ [![PyPI Downloads](https://static.pepy.tech/badge/bio2zarr)](https://pepy.tech/projects/bio2zarr)
262
+ [![Anaconda-Server Badge](https://anaconda.org/bioconda/bio2zarr/badges/downloads.svg)](https://anaconda.org/bioconda/bio2zarr)
248
263
 
249
264
 
250
265
  # bio2zarr
@@ -1,5 +1,7 @@
1
1
  [![CI](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml)
2
2
  [![Coverage Status](https://coveralls.io/repos/github/sgkit-dev/bio2zarr/badge.svg)](https://coveralls.io/github/sgkit-dev/bio2zarr)
3
+ [![PyPI Downloads](https://static.pepy.tech/badge/bio2zarr)](https://pepy.tech/projects/bio2zarr)
4
+ [![Anaconda-Server Badge](https://anaconda.org/bioconda/bio2zarr/badges/downloads.svg)](https://anaconda.org/bioconda/bio2zarr)
3
5
 
4
6
 
5
7
  # bio2zarr
@@ -15,7 +15,8 @@ def bio2zarr():
15
15
  # is handy for development and for those whose PATHs aren't set
16
16
  # up in the right way.
17
17
  bio2zarr.add_command(cli.vcf2zarr_main)
18
- bio2zarr.add_command(cli.plink2zarr)
18
+ bio2zarr.add_command(cli.plink2zarr_main)
19
+ bio2zarr.add_command(cli.tskit2zarr_main)
19
20
  bio2zarr.add_command(cli.vcfpartition)
20
21
 
21
22
  if __name__ == "__main__":
@@ -1,7 +1,14 @@
1
1
  # file generated by setuptools-scm
2
2
  # don't change, don't track in version control
3
3
 
4
- __all__ = ["__version__", "__version_tuple__", "version", "version_tuple"]
4
+ __all__ = [
5
+ "__version__",
6
+ "__version_tuple__",
7
+ "version",
8
+ "version_tuple",
9
+ "__commit_id__",
10
+ "commit_id",
11
+ ]
5
12
 
6
13
  TYPE_CHECKING = False
7
14
  if TYPE_CHECKING:
@@ -9,13 +16,19 @@ if TYPE_CHECKING:
9
16
  from typing import Union
10
17
 
11
18
  VERSION_TUPLE = Tuple[Union[int, str], ...]
19
+ COMMIT_ID = Union[str, None]
12
20
  else:
13
21
  VERSION_TUPLE = object
22
+ COMMIT_ID = object
14
23
 
15
24
  version: str
16
25
  __version__: str
17
26
  __version_tuple__: VERSION_TUPLE
18
27
  version_tuple: VERSION_TUPLE
28
+ commit_id: COMMIT_ID
29
+ __commit_id__: COMMIT_ID
19
30
 
20
- __version__ = version = '0.1.5'
21
- __version_tuple__ = version_tuple = (0, 1, 5)
31
+ __version__ = version = '0.1.7'
32
+ __version_tuple__ = version_tuple = (0, 1, 7)
33
+
34
+ __commit_id__ = commit_id = 'g4359d72e2'
@@ -8,8 +8,9 @@ import coloredlogs
8
8
  import numcodecs
9
9
  import tabulate
10
10
 
11
- from . import plink, provenance, vcf2zarr, vcf_utils
12
- from .vcf2zarr import icf as icf_mod
11
+ from . import core, plink, provenance, vcf_utils
12
+ from . import tskit as tskit_mod
13
+ from . import vcf as vcf_mod
13
14
 
14
15
  logger = logging.getLogger(__name__)
15
16
 
@@ -88,7 +89,12 @@ json = click.option(
88
89
  version = click.version_option(version=f"{provenance.__version__}")
89
90
 
90
91
  worker_processes = click.option(
91
- "-p", "--worker-processes", type=int, default=1, help="Number of worker processes"
92
+ "-p",
93
+ "--worker-processes",
94
+ type=int,
95
+ default=core.DEFAULT_WORKER_PROCESSES,
96
+ help="Number of worker processes",
97
+ show_default=True,
92
98
  )
93
99
 
94
100
  column_chunk_size = click.option(
@@ -197,7 +203,7 @@ def check_partitions(num_partitions):
197
203
  def get_compressor(cname):
198
204
  if cname is None:
199
205
  return None
200
- config = icf_mod.ICF_DEFAULT_COMPRESSOR.get_config()
206
+ config = vcf_mod.ICF_DEFAULT_COMPRESSOR.get_config()
201
207
  config["cname"] = cname
202
208
  return numcodecs.get_codec(config)
203
209
 
@@ -236,7 +242,7 @@ def explode(
236
242
  """
237
243
  setup_logging(verbose)
238
244
  check_overwrite_dir(icf_path, force)
239
- vcf2zarr.explode(
245
+ vcf_mod.explode(
240
246
  icf_path,
241
247
  vcfs,
242
248
  worker_processes=worker_processes,
@@ -276,7 +282,7 @@ def dexplode_init(
276
282
  setup_logging(verbose)
277
283
  check_overwrite_dir(icf_path, force)
278
284
  check_partitions(num_partitions)
279
- work_summary = vcf2zarr.explode_init(
285
+ work_summary = vcf_mod.explode_init(
280
286
  icf_path,
281
287
  vcfs,
282
288
  target_num_partitions=num_partitions,
@@ -304,7 +310,7 @@ def dexplode_partition(icf_path, partition, verbose, one_based):
304
310
  setup_logging(verbose)
305
311
  if one_based:
306
312
  partition -= 1
307
- vcf2zarr.explode_partition(icf_path, partition)
313
+ vcf_mod.explode_partition(icf_path, partition)
308
314
 
309
315
 
310
316
  @click.command
@@ -315,7 +321,7 @@ def dexplode_finalise(icf_path, verbose):
315
321
  Final step for distributed conversion of VCF(s) to intermediate columnar format.
316
322
  """
317
323
  setup_logging(verbose)
318
- vcf2zarr.explode_finalise(icf_path)
324
+ vcf_mod.explode_finalise(icf_path)
319
325
 
320
326
 
321
327
  @click.command
@@ -326,7 +332,7 @@ def inspect(path, verbose):
326
332
  Inspect an intermediate columnar format or Zarr path.
327
333
  """
328
334
  setup_logging(verbose)
329
- data = vcf2zarr.inspect(path)
335
+ data = vcf_mod.inspect(path)
330
336
  click.echo(tabulate.tabulate(data, headers="keys"))
331
337
 
332
338
 
@@ -345,7 +351,7 @@ def mkschema(icf_path, variants_chunk_size, samples_chunk_size, local_alleles):
345
351
  err=True,
346
352
  )
347
353
  stream = click.get_text_stream("stdout")
348
- vcf2zarr.mkschema(
354
+ vcf_mod.mkschema(
349
355
  icf_path,
350
356
  stream,
351
357
  variants_chunk_size=variants_chunk_size,
@@ -380,11 +386,11 @@ def encode(
380
386
  worker_processes,
381
387
  ):
382
388
  """
383
- Convert intermediate columnar format to vcfzarr.
389
+ Convert intermediate columnar format to VCF Zarr.
384
390
  """
385
391
  setup_logging(verbose)
386
392
  check_overwrite_dir(zarr_path, force)
387
- vcf2zarr.encode(
393
+ vcf_mod.encode(
388
394
  icf_path,
389
395
  zarr_path,
390
396
  schema_path=schema,
@@ -438,7 +444,7 @@ def dencode_init(
438
444
  setup_logging(verbose)
439
445
  check_overwrite_dir(zarr_path, force)
440
446
  check_partitions(num_partitions)
441
- work_summary = vcf2zarr.encode_init(
447
+ work_summary = vcf_mod.encode_init(
442
448
  icf_path,
443
449
  zarr_path,
444
450
  target_num_partitions=num_partitions,
@@ -466,7 +472,7 @@ def dencode_partition(zarr_path, partition, verbose, one_based):
466
472
  setup_logging(verbose)
467
473
  if one_based:
468
474
  partition -= 1
469
- vcf2zarr.encode_partition(zarr_path, partition)
475
+ vcf_mod.encode_partition(zarr_path, partition)
470
476
 
471
477
 
472
478
  @click.command
@@ -478,7 +484,7 @@ def dencode_finalise(zarr_path, verbose, progress):
478
484
  Final step for distributed conversion of ICF to VCF Zarr.
479
485
  """
480
486
  setup_logging(verbose)
481
- vcf2zarr.encode_finalise(zarr_path, show_progress=progress)
487
+ vcf_mod.encode_finalise(zarr_path, show_progress=progress)
482
488
 
483
489
 
484
490
  @click.command(name="convert")
@@ -503,11 +509,11 @@ def convert_vcf(
503
509
  local_alleles,
504
510
  ):
505
511
  """
506
- Convert input VCF(s) directly to vcfzarr (not recommended for large files).
512
+ Convert input VCF(s) directly to VCF Zarr (not recommended for large files).
507
513
  """
508
514
  setup_logging(verbose)
509
515
  check_overwrite_dir(zarr_path, force)
510
- vcf2zarr.convert(
516
+ vcf_mod.convert(
511
517
  vcfs,
512
518
  zarr_path,
513
519
  variants_chunk_size=variants_chunk_size,
@@ -522,9 +528,10 @@ def convert_vcf(
522
528
  @click.group(cls=NaturalOrderGroup, name="vcf2zarr")
523
529
  def vcf2zarr_main():
524
530
  """
525
- Convert VCF file(s) to the vcfzarr format.
531
+ Convert VCF file(s) to VCF Zarr format.
526
532
 
527
533
  See the online documentation at https://sgkit-dev.github.io/bio2zarr/
534
+
528
535
  for more information.
529
536
  """
530
537
 
@@ -545,6 +552,7 @@ vcf2zarr_main.add_command(dencode_finalise)
545
552
  @click.command(name="convert")
546
553
  @click.argument("in_path", type=click.Path())
547
554
  @click.argument("zarr_path", type=click.Path())
555
+ @force
548
556
  @worker_processes
549
557
  @progress
550
558
  @verbose
@@ -553,6 +561,7 @@ vcf2zarr_main.add_command(dencode_finalise)
553
561
  def convert_plink(
554
562
  in_path,
555
563
  zarr_path,
564
+ force,
556
565
  verbose,
557
566
  worker_processes,
558
567
  progress,
@@ -560,9 +569,12 @@ def convert_plink(
560
569
  samples_chunk_size,
561
570
  ):
562
571
  """
563
- In development; DO NOT USE!
572
+ Convert plink fileset to VCF Zarr. Results are equivalent to
573
+ `plink1.9 --bfile prefix --keep-allele-order --recode vcf-iid --out tmp`
574
+ then running `vcf2zarr convert tmp.vcf zarr_path`
564
575
  """
565
576
  setup_logging(verbose)
577
+ check_overwrite_dir(zarr_path, force)
566
578
  plink.convert(
567
579
  in_path,
568
580
  zarr_path,
@@ -574,12 +586,15 @@ def convert_plink(
574
586
 
575
587
 
576
588
  @version
577
- @click.group()
578
- def plink2zarr():
589
+ @click.group(name="plink2zarr")
590
+ def plink2zarr_main():
591
+ """
592
+ Convert plink fileset(s) to VCF Zarr format
593
+ """
579
594
  pass
580
595
 
581
596
 
582
- plink2zarr.add_command(convert_plink)
597
+ plink2zarr_main.add_command(convert_plink)
583
598
 
584
599
 
585
600
  @click.command
@@ -630,3 +645,68 @@ def vcfpartition(vcfs, verbose, num_partitions, partition_size):
630
645
  )
631
646
  for region in regions:
632
647
  click.echo(f"{region}\t{vcf_path}")
648
+
649
+
650
+ @click.command(name="convert")
651
+ @click.argument("ts_path", type=click.Path(exists=True))
652
+ @click.argument("zarr_path", type=click.Path())
653
+ @click.option("--contig-id", type=str, help="Contig/chromosome ID (default: '1')")
654
+ @click.option(
655
+ "--isolated-as-missing/--isolated-as-ancestral",
656
+ default=None,
657
+ help=(
658
+ "Treat isolated samples without mutations as missing or ancestral "
659
+ "(default: tskit default)"
660
+ ),
661
+ )
662
+ @variants_chunk_size
663
+ @samples_chunk_size
664
+ @verbose
665
+ @progress
666
+ @worker_processes
667
+ @force
668
+ @core.requires_optional_dependency("tskit", "tskit")
669
+ def convert_tskit(
670
+ ts_path,
671
+ zarr_path,
672
+ contig_id,
673
+ isolated_as_missing,
674
+ variants_chunk_size,
675
+ samples_chunk_size,
676
+ verbose,
677
+ progress,
678
+ worker_processes,
679
+ force,
680
+ ):
681
+ setup_logging(verbose)
682
+ check_overwrite_dir(zarr_path, force)
683
+
684
+ import tskit
685
+
686
+ ts = tskit.load(ts_path)
687
+ model_mapping = ts.map_to_vcf_model(
688
+ contig_id=contig_id,
689
+ isolated_as_missing=isolated_as_missing,
690
+ )
691
+
692
+ tskit_mod.convert(
693
+ ts_path,
694
+ zarr_path,
695
+ model_mapping=model_mapping,
696
+ variants_chunk_size=variants_chunk_size,
697
+ samples_chunk_size=samples_chunk_size,
698
+ worker_processes=worker_processes,
699
+ show_progress=progress,
700
+ )
701
+
702
+
703
+ @version
704
+ @click.group(name="tskit2zarr")
705
+ def tskit2zarr_main():
706
+ """
707
+ Convert tskit tree sequence(s) to VCF Zarr format
708
+ """
709
+ pass
710
+
711
+
712
+ tskit2zarr_main.add_command(convert_tskit)