bio2zarr 0.0.9__tar.gz → 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of bio2zarr might be problematic. Click here for more details.

Files changed (64) hide show
  1. bio2zarr-0.1.0/.github/workflows/cd.yml +86 -0
  2. bio2zarr-0.1.0/.github/workflows/ci.yml +107 -0
  3. {bio2zarr-0.0.9 → bio2zarr-0.1.0}/.github/workflows/docs.yml +13 -3
  4. {bio2zarr-0.0.9 → bio2zarr-0.1.0}/CHANGELOG.md +11 -0
  5. {bio2zarr-0.0.9 → bio2zarr-0.1.0}/PKG-INFO +10 -123
  6. bio2zarr-0.1.0/README.md +8 -0
  7. {bio2zarr-0.0.9 → bio2zarr-0.1.0}/bio2zarr/__main__.py +2 -2
  8. {bio2zarr-0.0.9 → bio2zarr-0.1.0}/bio2zarr/_version.py +2 -2
  9. {bio2zarr-0.0.9 → bio2zarr-0.1.0}/bio2zarr/cli.py +176 -113
  10. bio2zarr-0.1.0/bio2zarr/constants.py +18 -0
  11. {bio2zarr-0.0.9 → bio2zarr-0.1.0}/bio2zarr/core.py +65 -20
  12. bio2zarr-0.1.0/bio2zarr/vcf2zarr/__init__.py +38 -0
  13. bio2zarr-0.1.0/bio2zarr/vcf2zarr/icf.py +1221 -0
  14. bio2zarr-0.1.0/bio2zarr/vcf2zarr/vcz.py +1053 -0
  15. bio2zarr-0.1.0/bio2zarr/vcf2zarr/verification.py +230 -0
  16. {bio2zarr-0.0.9 → bio2zarr-0.1.0}/bio2zarr/vcf_utils.py +11 -6
  17. {bio2zarr-0.0.9 → bio2zarr-0.1.0}/bio2zarr.egg-info/PKG-INFO +10 -123
  18. {bio2zarr-0.0.9 → bio2zarr-0.1.0}/bio2zarr.egg-info/SOURCES.txt +17 -3
  19. bio2zarr-0.1.0/bio2zarr.egg-info/entry_points.txt +3 -0
  20. bio2zarr-0.1.0/docs/Makefile +46 -0
  21. {bio2zarr-0.0.9 → bio2zarr-0.1.0}/docs/_config.yml +10 -10
  22. bio2zarr-0.1.0/docs/_static/asciinema-player.css +2295 -0
  23. bio2zarr-0.1.0/docs/_static/asciinema-player.min.js +1 -0
  24. bio2zarr-0.1.0/docs/_static/custom.css +5 -0
  25. bio2zarr-0.1.0/docs/_toc.yml +11 -0
  26. {bio2zarr-0.0.9 → bio2zarr-0.1.0}/docs/build.sh +2 -2
  27. bio2zarr-0.1.0/docs/cast_scripts/vcf2zarr_convert.sh +3 -0
  28. bio2zarr-0.1.0/docs/cast_scripts/vcf2zarr_explode.sh +5 -0
  29. bio2zarr-0.1.0/docs/installation.md +49 -0
  30. bio2zarr-0.1.0/docs/intro.md +36 -0
  31. bio2zarr-0.1.0/docs/requirements.txt +4 -0
  32. bio2zarr-0.1.0/docs/vcf2zarr/cli_ref.md +76 -0
  33. bio2zarr-0.1.0/docs/vcf2zarr/overview.md +92 -0
  34. bio2zarr-0.1.0/docs/vcf2zarr/tutorial.md +272 -0
  35. bio2zarr-0.1.0/docs/vcfpartition/cli_ref.md +9 -0
  36. bio2zarr-0.1.0/docs/vcfpartition/overview.md +113 -0
  37. {bio2zarr-0.0.9 → bio2zarr-0.1.0}/pyproject.toml +8 -6
  38. {bio2zarr-0.0.9 → bio2zarr-0.1.0}/validation.py +7 -7
  39. bio2zarr-0.0.9/.github/workflows/ci.yml +0 -34
  40. bio2zarr-0.0.9/README.md +0 -124
  41. bio2zarr-0.0.9/bio2zarr/vcf.py +0 -2445
  42. bio2zarr-0.0.9/bio2zarr.egg-info/entry_points.txt +0 -4
  43. bio2zarr-0.0.9/docs/Makefile +0 -18
  44. bio2zarr-0.0.9/docs/_toc.yml +0 -4
  45. bio2zarr-0.0.9/docs/cli.md +0 -10
  46. bio2zarr-0.0.9/docs/intro.md +0 -76
  47. bio2zarr-0.0.9/docs/references.bib +0 -3
  48. bio2zarr-0.0.9/docs/requirements.txt +0 -11
  49. {bio2zarr-0.0.9 → bio2zarr-0.1.0}/.gitignore +0 -0
  50. {bio2zarr-0.0.9 → bio2zarr-0.1.0}/.pre-commit-config.yaml +0 -0
  51. {bio2zarr-0.0.9 → bio2zarr-0.1.0}/LICENSE +0 -0
  52. {bio2zarr-0.0.9 → bio2zarr-0.1.0}/MANIFEST.in +0 -0
  53. {bio2zarr-0.0.9 → bio2zarr-0.1.0}/bio2zarr/__init__.py +0 -0
  54. {bio2zarr-0.0.9 → bio2zarr-0.1.0}/bio2zarr/plink.py +0 -0
  55. {bio2zarr-0.0.9 → bio2zarr-0.1.0}/bio2zarr/provenance.py +0 -0
  56. {bio2zarr-0.0.9 → bio2zarr-0.1.0}/bio2zarr/typing.py +0 -0
  57. {bio2zarr-0.0.9 → bio2zarr-0.1.0}/bio2zarr.egg-info/dependency_links.txt +0 -0
  58. {bio2zarr-0.0.9 → bio2zarr-0.1.0}/bio2zarr.egg-info/requires.txt +0 -0
  59. {bio2zarr-0.0.9 → bio2zarr-0.1.0}/bio2zarr.egg-info/top_level.txt +0 -0
  60. {bio2zarr-0.0.9 → bio2zarr-0.1.0}/docs/logo.png +0 -0
  61. {bio2zarr-0.0.9 → bio2zarr-0.1.0}/setup.cfg +0 -0
  62. {bio2zarr-0.0.9 → bio2zarr-0.1.0}/validation-data/Makefile +0 -0
  63. {bio2zarr-0.0.9 → bio2zarr-0.1.0}/validation-data/split.sh +0 -0
  64. {bio2zarr-0.0.9 → bio2zarr-0.1.0}/vcf_generator.py +0 -0
@@ -0,0 +1,86 @@
1
+ name: CD
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+ tags:
8
+ - '*'
9
+ release:
10
+ types: [published]
11
+
12
+ jobs:
13
+ packaging:
14
+ if: github.repository_owner == 'sgkit-dev'
15
+ name: Packaging
16
+ runs-on: ubuntu-latest
17
+ steps:
18
+ - uses: actions/checkout@v4
19
+ - uses: actions/setup-python@v5
20
+ with:
21
+ python-version: '3.9'
22
+ - name: Install dependencies
23
+ run: |
24
+ python -m pip install --upgrade pip
25
+ python -m pip install build twine validate-pyproject[all]
26
+ - name: Check and install package
27
+ run: |
28
+ validate-pyproject pyproject.toml
29
+ python -m build
30
+ python -m twine check --strict dist/*
31
+ python -m pip install dist/*.whl
32
+ - name: Check vcf2zarr CLI
33
+ run: |
34
+ vcf2zarr --help
35
+ python -m bio2zarr vcf2zarr --help
36
+ - name: Check vcfpartition CLI
37
+ run: |
38
+ vcfpartition --help
39
+ python -m bio2zarr vcfpartition --help
40
+ - name: Store the distribution packages
41
+ uses: actions/upload-artifact@v4
42
+ with:
43
+ name: python-package-distributions
44
+ path: dist/
45
+
46
+ publish-to-pypi:
47
+ if: github.repository_owner == 'sgkit-dev' && github.event_name == 'release'
48
+ needs:
49
+ - packaging
50
+ runs-on: ubuntu-latest
51
+
52
+ environment:
53
+ name: pypi
54
+ url: https://pypi.org/p/bio2zarr
55
+ permissions:
56
+ id-token: write # IMPORTANT: mandatory for trusted publishing
57
+
58
+ steps:
59
+ - uses: actions/download-artifact@v4
60
+ with:
61
+ name: python-package-distributions
62
+ path: dist/
63
+ - uses: pypa/gh-action-pypi-publish@release/v1
64
+
65
+
66
+ publish-to-testpypi:
67
+ if: github.repository_owner == 'sgkit-dev' && github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags')
68
+ needs:
69
+ - packaging
70
+ runs-on: ubuntu-latest
71
+
72
+ environment:
73
+ name: testpypi
74
+ url: https://test.pypi.org/p/bio2zarr
75
+
76
+ permissions:
77
+ id-token: write # IMPORTANT: mandatory for trusted publishing
78
+
79
+ steps:
80
+ - uses: actions/download-artifact@v4
81
+ with:
82
+ name: python-package-distributions
83
+ path: dist/
84
+ - uses: pypa/gh-action-pypi-publish@release/v1
85
+ with:
86
+ repository-url: https://test.pypi.org/legacy/
@@ -0,0 +1,107 @@
1
+ name: CI
2
+
3
+ on:
4
+ pull_request:
5
+ push:
6
+ branches:
7
+ - main
8
+
9
+ jobs:
10
+ pre-commit:
11
+ name: Lint
12
+ runs-on: ubuntu-latest
13
+ steps:
14
+ - uses: actions/checkout@v4
15
+ - uses: actions/setup-python@v5
16
+ with:
17
+ python-version: '3.11'
18
+ - uses: pre-commit/action@v3.0.1
19
+ test:
20
+ name: Test
21
+ runs-on: ${{ matrix.os }}
22
+ strategy:
23
+ matrix:
24
+ # Use macos-13 because pip binary packages for ARM aren't
25
+ # available for many dependencies
26
+ os: [macos-13, macos-14, ubuntu-latest]
27
+ python-version: ["3.9", "3.10", "3.11"]
28
+ exclude:
29
+ # Just run macos tests on one Python version
30
+ - os: macos-13
31
+ python-version: "3.10"
32
+ - os: macos-13
33
+ python-version: "3.11"
34
+ - os: macos-14
35
+ python-version: "3.9"
36
+ - os: macos-14
37
+ python-version: "3.10"
38
+ steps:
39
+ - uses: actions/checkout@v4
40
+ - name: Set up Python ${{ matrix.python-version }}
41
+ uses: actions/setup-python@v5
42
+ with:
43
+ python-version: ${{ matrix.python-version }}
44
+ - name: Install dependencies
45
+ run: |
46
+ python -m pip install --upgrade pip
47
+ python -m pip install '.[dev]'
48
+ - name: Run basic vcf2zarr example
49
+ run: |
50
+ python -m bio2zarr vcf2zarr convert tests/data/vcf/sample.vcf.gz sample.vcz -f
51
+ - name: Run two-pass vcf2zarr example
52
+ run: |
53
+ python -m bio2zarr vcf2zarr explode tests/data/vcf/sample.vcf.gz sample.icf -f
54
+ python -m bio2zarr vcf2zarr encode sample.icf sample.vcz -f
55
+ - name: Run distributed explode example
56
+ run: |
57
+ python -m bio2zarr vcf2zarr dexplode-init tests/data/vcf/sample.vcf.gz sample.icf -fn 3
58
+ python -m bio2zarr vcf2zarr dexplode-partition sample.icf 0
59
+ python -m bio2zarr vcf2zarr dexplode-partition sample.icf 1
60
+ python -m bio2zarr vcf2zarr dexplode-partition sample.icf 2
61
+ python -m bio2zarr vcf2zarr dexplode-finalise sample.icf
62
+ - name: Run distributed encode example
63
+ run: |
64
+ python -m bio2zarr vcf2zarr dencode-init sample.icf sample.vcz -fn 3 --variants-chunk-size=3
65
+ python -m bio2zarr vcf2zarr dencode-partition sample.vcz 0
66
+ python -m bio2zarr vcf2zarr dencode-partition sample.vcz 1
67
+ python -m bio2zarr vcf2zarr dencode-partition sample.vcz 2
68
+ python -m bio2zarr vcf2zarr dencode-finalise sample.vcz
69
+ - name: Run tests
70
+ run: |
71
+ pytest --cov=bio2zarr
72
+ - name: Upload coverage to Coveralls
73
+ uses: coverallsapp/github-action@v2.3.0
74
+ with:
75
+ github-token: ${{ secrets.GITHUB_TOKEN }}
76
+ # The first coveralls upload will succeed and others seem to fail now.
77
+ # This is a quick workaround for doing a proper "parallel" setup:
78
+ # https://github.com/coverallsapp/github-action
79
+ fail-on-error: false
80
+
81
+ packaging:
82
+ name: Packaging
83
+ runs-on: ubuntu-latest
84
+ steps:
85
+ - uses: actions/checkout@v4
86
+ - uses: actions/setup-python@v5
87
+ with:
88
+ python-version: '3.11'
89
+ - name: Install dependencies
90
+ run: |
91
+ python -m pip install --upgrade pip
92
+ python -m pip install build twine validate-pyproject[all]
93
+ - name: Check and install package
94
+ run: |
95
+ validate-pyproject pyproject.toml
96
+ python -m build
97
+ python -m twine check --strict dist/*
98
+ python -m pip install dist/*.whl
99
+ - name: Check vcf2zarr CLI
100
+ run: |
101
+ vcf2zarr --help
102
+ python -m bio2zarr vcf2zarr --help
103
+ - name: Check vcfpartition CLI
104
+ run: |
105
+ vcfpartition --help
106
+ python -m bio2zarr vcfpartition --help
107
+
@@ -1,15 +1,16 @@
1
- name: Build Docs
1
+ name: Docs
2
2
 
3
3
  on:
4
4
  pull_request:
5
5
  push:
6
- branches: [main, test]
6
+ branches:
7
+ - main
7
8
  tags:
8
9
  - '*'
9
10
 
10
11
  jobs:
11
12
  build-docs:
12
- name: Docs
13
+ name: Build
13
14
  runs-on: ubuntu-latest
14
15
  steps:
15
16
  - name: Cancel Previous Runs
@@ -28,6 +29,15 @@ jobs:
28
29
  run: |
29
30
  pip install --upgrade pip wheel
30
31
  pip install -r docs/requirements.txt
32
+ python3 -m bash_kernel.install
33
+
34
+ - name: Install bcftools
35
+ run: |
36
+ sudo apt-get install bcftools jq
37
+
38
+ - name: Install package
39
+ run: |
40
+ python3 -m pip install .
31
41
 
32
42
  - name: Build Docs
33
43
  run: |
@@ -1,3 +1,14 @@
1
+ # 0.1.0 2024-06-10
2
+
3
+ - Initial production-ready version.
4
+ - Add -Q/--no-progress flag to CLI
5
+ - Change num-partitions argument in dexplode-init and dencode-init
6
+ to a named option.
7
+
8
+ # 0.0.10 2024-05-15
9
+ - Change output format of dexplode-init and dencode-init
10
+ - Bugfix for mac progress, and change of multiprocessing startup strategy.
11
+
1
12
  # 0.0.9 2024-05-02
2
13
 
3
14
  - Change on-disk format for explode and schema
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: bio2zarr
3
- Version: 0.0.9
3
+ Version: 0.1.0
4
4
  Summary: Convert bioinformatics data to Zarr
5
5
  Author-email: sgkit Developers <project@sgkit.dev>
6
6
  License: Apache License
@@ -206,10 +206,13 @@ License: Apache License
206
206
  limitations under the License.
207
207
 
208
208
  Project-URL: repository, https://github.com/sgkit-dev/bio2zarr
209
- Project-URL: documentation, https://sgkit-dev.github.io/bio2zarr/intro.html
210
- Classifier: Development Status :: 3 - Alpha
209
+ Project-URL: documentation, https://sgkit-dev.github.io/bio2zarr/
210
+ Classifier: Development Status :: 4 - Beta
211
211
  Classifier: License :: OSI Approved :: Apache Software License
212
- Classifier: Operating System :: OS Independent
212
+ Classifier: Operating System :: POSIX
213
+ Classifier: Operating System :: POSIX :: Linux
214
+ Classifier: Operating System :: MacOS
215
+ Classifier: Operating System :: MacOS :: MacOS X
213
216
  Classifier: Intended Audience :: Science/Research
214
217
  Classifier: Programming Language :: Python
215
218
  Classifier: Programming Language :: Python :: 3
@@ -238,126 +241,10 @@ Requires-Dist: sgkit>=0.8.0; extra == "dev"
238
241
  Requires-Dist: tqdm; extra == "dev"
239
242
 
240
243
  [![CI](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml)
244
+ [![Coverage Status](https://coveralls.io/repos/github/sgkit-dev/bio2zarr/badge.svg)](https://coveralls.io/github/sgkit-dev/bio2zarr)
245
+
241
246
 
242
247
  # bio2zarr
243
248
  Convert bioinformatics file formats to Zarr
244
249
 
245
- Initially supports converting VCF to the
246
- [sgkit vcf-zarr specification](https://github.com/pystatgen/vcf-zarr-spec/)
247
-
248
- **This is early alpha-status code: everything is subject to change,
249
- and it has not been thoroughly tested**
250
-
251
- ## Install
252
-
253
- ```
254
- $ python3 -m pip install bio2zarr
255
- ```
256
-
257
- This will install the programs ``vcf2zarr``, ``plink2zarr`` and ``vcf_partition``
258
- into your local Python path. You may need to update your $PATH to call the
259
- executables directly.
260
-
261
- Alternatively, calling
262
- ```
263
- $ python3 -m bio2zarr vcf2zarr <args>
264
- ```
265
- is equivalent to
266
-
267
- ```
268
- $ vcf2zarr <args>
269
- ```
270
- and will always work.
271
-
272
-
273
- ## vcf2zarr
274
-
275
-
276
- Convert a VCF to zarr format:
277
-
278
- ```
279
- $ vcf2zarr convert <VCF1> <VCF2> <zarr>
280
- ```
281
-
282
- Converts the VCF to zarr format.
283
-
284
- **Do not use this for anything but the smallest files**
285
-
286
- The recommended approach is to use a multi-stage conversion
287
-
288
- First, convert the VCF into the intermediate format:
289
-
290
- ```
291
- vcf2zarr explode tests/data/vcf/sample.vcf.gz tmp/sample.exploded
292
- ```
293
-
294
- Then, (optionally) inspect this representation to get a feel for your dataset
295
- ```
296
- vcf2zarr inspect tmp/sample.exploded
297
- ```
298
-
299
- Then, (optionally) generate a conversion schema to describe the corresponding
300
- Zarr arrays:
301
-
302
- ```
303
- vcf2zarr mkschema tmp/sample.exploded > sample.schema.json
304
- ```
305
-
306
- View and edit the schema, deleting any columns you don't want, or tweaking
307
- dtypes and compression settings to your taste.
308
-
309
- Finally, encode to Zarr:
310
- ```
311
- vcf2zarr encode tmp/sample.exploded tmp/sample.zarr -s sample.schema.json
312
- ```
313
-
314
- Use the ``-p, --worker-processes`` argument to control the number of workers used
315
- in the ``explode`` and ``encode`` phases.
316
-
317
- ### Shell completion
318
-
319
- To enable shell completion for a particular session in Bash do:
320
-
321
- ```
322
- eval "$(_VCF2ZARR_COMPLETE=bash_source vcf2zarr)"
323
- ```
324
-
325
- If you add this to your ``.bashrc`` vcf2zarr shell completion should be available
326
- in all new shell sessions.
327
-
328
- See the [Click documentation](https://click.palletsprojects.com/en/8.1.x/shell-completion/#enabling-completion)
329
- for instructions on how to enable completion in other shells.
330
- a
331
-
332
- ## plink2zarr
333
-
334
- Convert a plink ``.bed`` file to zarr format. **This is incomplete**
335
-
336
- ## vcf_partition
337
-
338
- Partition a given VCF file into (approximately) a given number of regions:
339
-
340
- ```
341
- vcf_partition 20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr20.recalibrated_variants.vcf.gz -n 10
342
- ```
343
- gives
344
- ```
345
- chr20:1-6799360
346
- chr20:6799361-14319616
347
- chr20:14319617-21790720
348
- chr20:21790721-28770304
349
- chr20:28770305-31096832
350
- chr20:31096833-38043648
351
- chr20:38043649-45580288
352
- chr20:45580289-52117504
353
- chr20:52117505-58834944
354
- chr20:58834945-
355
- ```
356
-
357
- These region strings can then be used to split computation of the VCF
358
- into chunks for parallelisation.
359
-
360
- **TODO give a nice example here using xargs**
361
-
362
- **WARNING that this does not take into account that indels may overlap
363
- partitions and you may count variants twice or more if they do**
250
+ See the [documentation](https://sgkit-dev.github.io/bio2zarr/) for details.
@@ -0,0 +1,8 @@
1
+ [![CI](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml)
2
+ [![Coverage Status](https://coveralls.io/repos/github/sgkit-dev/bio2zarr/badge.svg)](https://coveralls.io/github/sgkit-dev/bio2zarr)
3
+
4
+
5
+ # bio2zarr
6
+ Convert bioinformatics file formats to Zarr
7
+
8
+ See the [documentation](https://sgkit-dev.github.io/bio2zarr/) for details.
@@ -14,9 +14,9 @@ def bio2zarr():
14
14
  # install individual commands as console scripts. However, this
15
15
  # is handy for development and for those whose PATHs aren't set
16
16
  # up in the right way.
17
- bio2zarr.add_command(cli.vcf2zarr)
17
+ bio2zarr.add_command(cli.vcf2zarr_main)
18
18
  bio2zarr.add_command(cli.plink2zarr)
19
- bio2zarr.add_command(cli.vcf_partition)
19
+ bio2zarr.add_command(cli.vcfpartition)
20
20
 
21
21
  if __name__ == "__main__":
22
22
  bio2zarr()
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '0.0.9'
16
- __version_tuple__ = version_tuple = (0, 0, 9)
15
+ __version__ = version = '0.1.0'
16
+ __version_tuple__ = version_tuple = (0, 1, 0)