bio2zarr 0.1.4__tar.gz → 0.1.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of bio2zarr might be problematic. Click here for more details.

Files changed (66) hide show
  1. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/.github/workflows/cd.yml +2 -1
  2. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/.github/workflows/ci.yml +46 -3
  3. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/.github/workflows/docs.yml +3 -2
  4. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/CHANGELOG.md +33 -0
  5. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/PKG-INFO +19 -7
  6. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/bio2zarr/__main__.py +2 -1
  7. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/bio2zarr/_version.py +2 -2
  8. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/bio2zarr/cli.py +91 -24
  9. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/bio2zarr/core.py +43 -22
  10. bio2zarr-0.1.6/bio2zarr/plink.py +332 -0
  11. bio2zarr-0.1.6/bio2zarr/tskit.py +301 -0
  12. bio2zarr-0.1.6/bio2zarr/typing.py +3 -0
  13. bio2zarr-0.1.4/bio2zarr/vcf2zarr/icf.py → bio2zarr-0.1.6/bio2zarr/vcf.py +614 -118
  14. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/bio2zarr/vcf_utils.py +66 -33
  15. {bio2zarr-0.1.4/bio2zarr/vcf2zarr → bio2zarr-0.1.6/bio2zarr}/vcz.py +544 -708
  16. bio2zarr-0.1.4/bio2zarr/vcf2zarr/verification.py → bio2zarr-0.1.6/bio2zarr/vcz_verification.py +5 -2
  17. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/bio2zarr.egg-info/PKG-INFO +19 -7
  18. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/bio2zarr.egg-info/SOURCES.txt +10 -4
  19. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/bio2zarr.egg-info/entry_points.txt +2 -0
  20. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/bio2zarr.egg-info/requires.txt +17 -2
  21. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/docs/_config.yml +5 -1
  22. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/docs/_toc.yml +8 -0
  23. bio2zarr-0.1.6/docs/installation.md +62 -0
  24. bio2zarr-0.1.6/docs/intro.md +38 -0
  25. bio2zarr-0.1.6/docs/plink2zarr/cli_ref.md +17 -0
  26. bio2zarr-0.1.6/docs/plink2zarr/overview.md +38 -0
  27. bio2zarr-0.1.6/docs/tskit2zarr/cli_ref.md +18 -0
  28. bio2zarr-0.1.6/docs/tskit2zarr/overview.md +10 -0
  29. bio2zarr-0.1.6/docs/tskit2zarr/python_api.md +37 -0
  30. bio2zarr-0.1.6/docs/vcf2zarr/python_api.md +17 -0
  31. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/pyproject.toml +28 -13
  32. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/validation.py +5 -5
  33. bio2zarr-0.1.4/bio2zarr/plink.py +0 -207
  34. bio2zarr-0.1.4/bio2zarr/typing.py +0 -4
  35. bio2zarr-0.1.4/bio2zarr/vcf2zarr/__init__.py +0 -38
  36. bio2zarr-0.1.4/docs/installation.md +0 -49
  37. bio2zarr-0.1.4/docs/intro.md +0 -36
  38. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/.gitignore +0 -0
  39. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/.pre-commit-config.yaml +0 -0
  40. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/LICENSE +0 -0
  41. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/MANIFEST.in +0 -0
  42. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/README.md +0 -0
  43. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/bio2zarr/__init__.py +0 -0
  44. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/bio2zarr/constants.py +0 -0
  45. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/bio2zarr/provenance.py +0 -0
  46. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/bio2zarr/zarr_utils.py +0 -0
  47. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/bio2zarr.egg-info/dependency_links.txt +0 -0
  48. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/bio2zarr.egg-info/top_level.txt +0 -0
  49. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/docs/Makefile +0 -0
  50. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/docs/_static/asciinema-player.css +0 -0
  51. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/docs/_static/asciinema-player.min.js +0 -0
  52. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/docs/_static/custom.css +0 -0
  53. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/docs/build.sh +0 -0
  54. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/docs/cast_scripts/vcf2zarr_convert.sh +0 -0
  55. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/docs/cast_scripts/vcf2zarr_explode.sh +0 -0
  56. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/docs/logo.png +0 -0
  57. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/docs/requirements.txt +0 -0
  58. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/docs/vcf2zarr/cli_ref.md +0 -0
  59. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/docs/vcf2zarr/overview.md +0 -0
  60. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/docs/vcf2zarr/tutorial.md +0 -0
  61. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/docs/vcfpartition/cli_ref.md +0 -0
  62. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/docs/vcfpartition/overview.md +0 -0
  63. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/setup.cfg +0 -0
  64. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/validation-data/Makefile +0 -0
  65. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/validation-data/split.sh +0 -0
  66. {bio2zarr-0.1.4 → bio2zarr-0.1.6}/vcf_generator.py +0 -0
@@ -1,6 +1,7 @@
1
1
  name: CD
2
2
 
3
3
  on:
4
+ merge_group:
4
5
  push:
5
6
  branches:
6
7
  - main
@@ -18,7 +19,7 @@ jobs:
18
19
  - uses: actions/checkout@v4
19
20
  - uses: actions/setup-python@v5
20
21
  with:
21
- python-version: '3.9'
22
+ python-version: '3.10'
22
23
  - name: Install dependencies
23
24
  run: |
24
25
  python -m pip install --upgrade pip
@@ -1,6 +1,7 @@
1
1
  name: CI
2
2
 
3
3
  on:
4
+ merge_group:
4
5
  pull_request:
5
6
  push:
6
7
  branches:
@@ -24,7 +25,7 @@ jobs:
24
25
  # Use macos-13 because pip binary packages for ARM aren't
25
26
  # available for many dependencies
26
27
  os: [macos-13, macos-14, ubuntu-latest]
27
- python-version: ["3.9", "3.10", "3.11", "3.12"]
28
+ python-version: ["3.10", "3.11", "3.12"]
28
29
  exclude:
29
30
  # Just run macos tests on one Python version
30
31
  - os: macos-13
@@ -33,8 +34,6 @@ jobs:
33
34
  python-version: "3.11"
34
35
  - os: macos-13
35
36
  python-version: "3.12"
36
- - os: macos-14
37
- python-version: "3.9"
38
37
  - os: macos-14
39
38
  python-version: "3.10"
40
39
  - os: macos-14
@@ -70,6 +69,12 @@ jobs:
70
69
  python -m bio2zarr vcf2zarr dencode-partition sample.vcz 1
71
70
  python -m bio2zarr vcf2zarr dencode-partition sample.vcz 2
72
71
  python -m bio2zarr vcf2zarr dencode-finalise sample.vcz
72
+ - name: Run tskit2zarr example
73
+ run: |
74
+ python -m bio2zarr tskit2zarr convert tests/data/tskit/example.trees sample.vcz -f
75
+ - name: Run plink2zarr example
76
+ run: |
77
+ python -m bio2zarr plink2zarr convert tests/data/plink/example sample.vcz -f
73
78
  - name: Run tests
74
79
  run: |
75
80
  pytest --cov=bio2zarr
@@ -82,6 +87,36 @@ jobs:
82
87
  # https://github.com/coverallsapp/github-action
83
88
  fail-on-error: false
84
89
 
90
+ optional_dependencies:
91
+ name: Optional dependencies
92
+ runs-on: ubuntu-latest
93
+ steps:
94
+ - uses: actions/checkout@v4
95
+ - uses: actions/setup-python@v5
96
+ with:
97
+ python-version: '3.11'
98
+ - name: Test optional dependencies
99
+ run: |
100
+ python -m venv env-tskit
101
+ source env-tskit/bin/activate
102
+ python -m pip install .
103
+ python -m bio2zarr tskit2zarr convert tests/data/tskit/example.trees ts.vcz > ts.txt 2>&1 || echo $? > ts_exit.txt
104
+ test "$(cat ts_exit.txt)" = "1"
105
+ grep -q "This process requires the optional tskit module. Install it with: pip install bio2zarr\[tskit\]" ts.txt
106
+ python -m pip install '.[tskit]'
107
+ python -m bio2zarr tskit2zarr convert tests/data/tskit/example.trees ts.vcz
108
+ deactivate
109
+
110
+ python -m venv env-vcf
111
+ source env-vcf/bin/activate
112
+ python -m pip install .
113
+ python -m bio2zarr vcf2zarr convert tests/data/vcf/sample.vcf.gz sample.vcz > vcf.txt 2>&1 || echo $? > vcf_exit.txt
114
+ test "$(cat vcf_exit.txt)" = "1"
115
+ grep -q "This process requires the optional cyvcf2 module. Install it with: pip install bio2zarr\[vcf\]" vcf.txt
116
+ python -m pip install '.[vcf]'
117
+ python -m bio2zarr vcf2zarr convert tests/data/vcf/sample.vcf.gz sample.vcz
118
+ deactivate
119
+
85
120
  packaging:
86
121
  name: Packaging
87
122
  runs-on: ubuntu-latest
@@ -108,6 +143,14 @@ jobs:
108
143
  run: |
109
144
  vcfpartition --help
110
145
  python -m bio2zarr vcfpartition --help
146
+ - name: Check tskit2zarr CLI
147
+ run: |
148
+ tskit2zarr --help
149
+ python -m bio2zarr tskit2zarr --help
150
+ - name: Check plink2zarr CLI
151
+ run: |
152
+ plink2zarr --help
153
+ python -m bio2zarr plink2zarr --help
111
154
 
112
155
  test-numpy-version:
113
156
  name: Test numpy versions
@@ -1,6 +1,7 @@
1
1
  name: Docs
2
2
 
3
3
  on:
4
+ merge_group:
4
5
  pull_request:
5
6
  push:
6
7
  branches:
@@ -37,7 +38,7 @@ jobs:
37
38
 
38
39
  - name: Install package
39
40
  run: |
40
- python3 -m pip install .
41
+ python3 -m pip install '.[all]'
41
42
 
42
43
  - name: Build Docs
43
44
  run: |
@@ -50,7 +51,7 @@ jobs:
50
51
 
51
52
  deploy:
52
53
  needs: build-docs
53
- if: github.event_name != 'pull_request'
54
+ if: github.event_name != 'pull_request' && github.event_name != 'merge_group'
54
55
  permissions:
55
56
  pages: write
56
57
  id-token: write
@@ -1,3 +1,36 @@
1
+ # 0.1.6 2025-05-23
2
+
3
+ - Initial Python API support for VCF and tskit one-shot conversion. Format
4
+ conversion is done using the functions ``bio2zarr.vcf.convert``
5
+ and ``bio2zarr.tskit.convert``.
6
+
7
+ - Initial version of supported plink2zarr (#390, #344, #382)
8
+
9
+ - Initial version of tskit2zarr (#232)
10
+
11
+ - Make format-specific dependencies optional (#385)
12
+
13
+ - Remove bed_reader dependency (#397, #400)
14
+
15
+ - Change default number of worker processes to zero (#404) to simplify
16
+ debugging
17
+
18
+ *Breaking changes*
19
+
20
+ - Remove explicit sample, contig and filter lists from the schema.
21
+ Existing ICFs will need to be recreated. (#343)
22
+
23
+ - Add dimensions and default compressor and filter settings to the schema.
24
+ (#361)
25
+
26
+ - Various changes to existing experimental plink encoding (#390)
27
+
28
+ # 0.1.5 2025-03-31
29
+
30
+ - Add support for merging contig IDs across multiple VCFs (#335)
31
+
32
+ - Add support for unindexed (and uncompressed) VCFs (#337)
33
+
1
34
  # 0.1.4 2025-03-10
2
35
 
3
36
  - Fix bug in handling all-missing genotypes (#328)
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: bio2zarr
3
- Version: 0.1.4
3
+ Version: 0.1.6
4
4
  Summary: Convert bioinformatics data to Zarr
5
5
  Author-email: sgkit Developers <project@sgkit.dev>
6
6
  License: Apache License
@@ -216,23 +216,24 @@ Classifier: Operating System :: MacOS :: MacOS X
216
216
  Classifier: Intended Audience :: Science/Research
217
217
  Classifier: Programming Language :: Python
218
218
  Classifier: Programming Language :: Python :: 3
219
- Classifier: Programming Language :: Python :: 3.9
220
219
  Classifier: Programming Language :: Python :: 3.10
221
220
  Classifier: Programming Language :: Python :: 3.11
222
221
  Classifier: Programming Language :: Python :: 3.12
223
222
  Classifier: Topic :: Scientific/Engineering
224
- Requires-Python: >=3.9
223
+ Requires-Python: >=3.10
225
224
  Description-Content-Type: text/markdown
226
225
  License-File: LICENSE
227
226
  Requires-Dist: numpy>=1.26
228
227
  Requires-Dist: zarr<3,>=2.17
229
- Requires-Dist: click
228
+ Requires-Dist: numcodecs[msgpack]!=0.14.0,!=0.14.1,<0.16
230
229
  Requires-Dist: tabulate
231
230
  Requires-Dist: tqdm
232
231
  Requires-Dist: humanfriendly
233
- Requires-Dist: cyvcf2
234
- Requires-Dist: bed_reader
232
+ Requires-Dist: coloredlogs
233
+ Requires-Dist: click
234
+ Requires-Dist: pandas
235
235
  Provides-Extra: dev
236
+ Requires-Dist: click>=8.2.0; extra == "dev"
236
237
  Requires-Dist: hypothesis-vcf; extra == "dev"
237
238
  Requires-Dist: msprime; extra == "dev"
238
239
  Requires-Dist: pysam; extra == "dev"
@@ -241,6 +242,17 @@ Requires-Dist: pytest-coverage; extra == "dev"
241
242
  Requires-Dist: pytest-xdist; extra == "dev"
242
243
  Requires-Dist: sgkit>=0.8.0; extra == "dev"
243
244
  Requires-Dist: tqdm; extra == "dev"
245
+ Requires-Dist: tskit>=0.6.4; extra == "dev"
246
+ Requires-Dist: bed_reader; extra == "dev"
247
+ Requires-Dist: cyvcf2; extra == "dev"
248
+ Provides-Extra: tskit
249
+ Requires-Dist: tskit>=0.6.4; extra == "tskit"
250
+ Provides-Extra: vcf
251
+ Requires-Dist: cyvcf2; extra == "vcf"
252
+ Provides-Extra: all
253
+ Requires-Dist: tskit>=0.6.4; extra == "all"
254
+ Requires-Dist: cyvcf2; extra == "all"
255
+ Dynamic: license-file
244
256
 
245
257
  [![CI](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml)
246
258
  [![Coverage Status](https://coveralls.io/repos/github/sgkit-dev/bio2zarr/badge.svg)](https://coveralls.io/github/sgkit-dev/bio2zarr)
@@ -15,7 +15,8 @@ def bio2zarr():
15
15
  # is handy for development and for those whose PATHs aren't set
16
16
  # up in the right way.
17
17
  bio2zarr.add_command(cli.vcf2zarr_main)
18
- bio2zarr.add_command(cli.plink2zarr)
18
+ bio2zarr.add_command(cli.plink2zarr_main)
19
+ bio2zarr.add_command(cli.tskit2zarr_main)
19
20
  bio2zarr.add_command(cli.vcfpartition)
20
21
 
21
22
  if __name__ == "__main__":
@@ -17,5 +17,5 @@ __version__: str
17
17
  __version_tuple__: VERSION_TUPLE
18
18
  version_tuple: VERSION_TUPLE
19
19
 
20
- __version__ = version = '0.1.4'
21
- __version_tuple__ = version_tuple = (0, 1, 4)
20
+ __version__ = version = '0.1.6'
21
+ __version_tuple__ = version_tuple = (0, 1, 6)
@@ -8,8 +8,9 @@ import coloredlogs
8
8
  import numcodecs
9
9
  import tabulate
10
10
 
11
- from . import plink, provenance, vcf2zarr, vcf_utils
12
- from .vcf2zarr import icf as icf_mod
11
+ from . import core, plink, provenance, vcf_utils
12
+ from . import tskit as tskit_mod
13
+ from . import vcf as vcf_mod
13
14
 
14
15
  logger = logging.getLogger(__name__)
15
16
 
@@ -88,7 +89,12 @@ json = click.option(
88
89
  version = click.version_option(version=f"{provenance.__version__}")
89
90
 
90
91
  worker_processes = click.option(
91
- "-p", "--worker-processes", type=int, default=1, help="Number of worker processes"
92
+ "-p",
93
+ "--worker-processes",
94
+ type=int,
95
+ default=core.DEFAULT_WORKER_PROCESSES,
96
+ help="Number of worker processes",
97
+ show_default=True,
92
98
  )
93
99
 
94
100
  column_chunk_size = click.option(
@@ -197,7 +203,7 @@ def check_partitions(num_partitions):
197
203
  def get_compressor(cname):
198
204
  if cname is None:
199
205
  return None
200
- config = icf_mod.ICF_DEFAULT_COMPRESSOR.get_config()
206
+ config = vcf_mod.ICF_DEFAULT_COMPRESSOR.get_config()
201
207
  config["cname"] = cname
202
208
  return numcodecs.get_codec(config)
203
209
 
@@ -236,7 +242,7 @@ def explode(
236
242
  """
237
243
  setup_logging(verbose)
238
244
  check_overwrite_dir(icf_path, force)
239
- vcf2zarr.explode(
245
+ vcf_mod.explode(
240
246
  icf_path,
241
247
  vcfs,
242
248
  worker_processes=worker_processes,
@@ -276,7 +282,7 @@ def dexplode_init(
276
282
  setup_logging(verbose)
277
283
  check_overwrite_dir(icf_path, force)
278
284
  check_partitions(num_partitions)
279
- work_summary = vcf2zarr.explode_init(
285
+ work_summary = vcf_mod.explode_init(
280
286
  icf_path,
281
287
  vcfs,
282
288
  target_num_partitions=num_partitions,
@@ -304,7 +310,7 @@ def dexplode_partition(icf_path, partition, verbose, one_based):
304
310
  setup_logging(verbose)
305
311
  if one_based:
306
312
  partition -= 1
307
- vcf2zarr.explode_partition(icf_path, partition)
313
+ vcf_mod.explode_partition(icf_path, partition)
308
314
 
309
315
 
310
316
  @click.command
@@ -315,7 +321,7 @@ def dexplode_finalise(icf_path, verbose):
315
321
  Final step for distributed conversion of VCF(s) to intermediate columnar format.
316
322
  """
317
323
  setup_logging(verbose)
318
- vcf2zarr.explode_finalise(icf_path)
324
+ vcf_mod.explode_finalise(icf_path)
319
325
 
320
326
 
321
327
  @click.command
@@ -326,7 +332,7 @@ def inspect(path, verbose):
326
332
  Inspect an intermediate columnar format or Zarr path.
327
333
  """
328
334
  setup_logging(verbose)
329
- data = vcf2zarr.inspect(path)
335
+ data = vcf_mod.inspect(path)
330
336
  click.echo(tabulate.tabulate(data, headers="keys"))
331
337
 
332
338
 
@@ -345,7 +351,7 @@ def mkschema(icf_path, variants_chunk_size, samples_chunk_size, local_alleles):
345
351
  err=True,
346
352
  )
347
353
  stream = click.get_text_stream("stdout")
348
- vcf2zarr.mkschema(
354
+ vcf_mod.mkschema(
349
355
  icf_path,
350
356
  stream,
351
357
  variants_chunk_size=variants_chunk_size,
@@ -380,11 +386,11 @@ def encode(
380
386
  worker_processes,
381
387
  ):
382
388
  """
383
- Convert intermediate columnar format to vcfzarr.
389
+ Convert intermediate columnar format to VCF Zarr.
384
390
  """
385
391
  setup_logging(verbose)
386
392
  check_overwrite_dir(zarr_path, force)
387
- vcf2zarr.encode(
393
+ vcf_mod.encode(
388
394
  icf_path,
389
395
  zarr_path,
390
396
  schema_path=schema,
@@ -438,7 +444,7 @@ def dencode_init(
438
444
  setup_logging(verbose)
439
445
  check_overwrite_dir(zarr_path, force)
440
446
  check_partitions(num_partitions)
441
- work_summary = vcf2zarr.encode_init(
447
+ work_summary = vcf_mod.encode_init(
442
448
  icf_path,
443
449
  zarr_path,
444
450
  target_num_partitions=num_partitions,
@@ -466,7 +472,7 @@ def dencode_partition(zarr_path, partition, verbose, one_based):
466
472
  setup_logging(verbose)
467
473
  if one_based:
468
474
  partition -= 1
469
- vcf2zarr.encode_partition(zarr_path, partition)
475
+ vcf_mod.encode_partition(zarr_path, partition)
470
476
 
471
477
 
472
478
  @click.command
@@ -478,7 +484,7 @@ def dencode_finalise(zarr_path, verbose, progress):
478
484
  Final step for distributed conversion of ICF to VCF Zarr.
479
485
  """
480
486
  setup_logging(verbose)
481
- vcf2zarr.encode_finalise(zarr_path, show_progress=progress)
487
+ vcf_mod.encode_finalise(zarr_path, show_progress=progress)
482
488
 
483
489
 
484
490
  @click.command(name="convert")
@@ -503,11 +509,11 @@ def convert_vcf(
503
509
  local_alleles,
504
510
  ):
505
511
  """
506
- Convert input VCF(s) directly to vcfzarr (not recommended for large files).
512
+ Convert input VCF(s) directly to VCF Zarr (not recommended for large files).
507
513
  """
508
514
  setup_logging(verbose)
509
515
  check_overwrite_dir(zarr_path, force)
510
- vcf2zarr.convert(
516
+ vcf_mod.convert(
511
517
  vcfs,
512
518
  zarr_path,
513
519
  variants_chunk_size=variants_chunk_size,
@@ -522,9 +528,10 @@ def convert_vcf(
522
528
  @click.group(cls=NaturalOrderGroup, name="vcf2zarr")
523
529
  def vcf2zarr_main():
524
530
  """
525
- Convert VCF file(s) to the vcfzarr format.
531
+ Convert VCF file(s) to VCF Zarr format.
526
532
 
527
533
  See the online documentation at https://sgkit-dev.github.io/bio2zarr/
534
+
528
535
  for more information.
529
536
  """
530
537
 
@@ -545,6 +552,7 @@ vcf2zarr_main.add_command(dencode_finalise)
545
552
  @click.command(name="convert")
546
553
  @click.argument("in_path", type=click.Path())
547
554
  @click.argument("zarr_path", type=click.Path())
555
+ @force
548
556
  @worker_processes
549
557
  @progress
550
558
  @verbose
@@ -553,6 +561,7 @@ vcf2zarr_main.add_command(dencode_finalise)
553
561
  def convert_plink(
554
562
  in_path,
555
563
  zarr_path,
564
+ force,
556
565
  verbose,
557
566
  worker_processes,
558
567
  progress,
@@ -560,9 +569,12 @@ def convert_plink(
560
569
  samples_chunk_size,
561
570
  ):
562
571
  """
563
- In development; DO NOT USE!
572
+ Convert plink fileset to VCF Zarr. Results are equivalent to
573
+ `plink1.9 --bfile prefix --keep-allele-order --recode vcf-iid --out tmp`
574
+ then running `vcf2zarr convert tmp.vcf zarr_path`
564
575
  """
565
576
  setup_logging(verbose)
577
+ check_overwrite_dir(zarr_path, force)
566
578
  plink.convert(
567
579
  in_path,
568
580
  zarr_path,
@@ -574,12 +586,15 @@ def convert_plink(
574
586
 
575
587
 
576
588
  @version
577
- @click.group()
578
- def plink2zarr():
589
+ @click.group(name="plink2zarr")
590
+ def plink2zarr_main():
591
+ """
592
+ Convert plink fileset(s) to VCF Zarr format
593
+ """
579
594
  pass
580
595
 
581
596
 
582
- plink2zarr.add_command(convert_plink)
597
+ plink2zarr_main.add_command(convert_plink)
583
598
 
584
599
 
585
600
  @click.command
@@ -624,9 +639,61 @@ def vcfpartition(vcfs, verbose, num_partitions, partition_size):
624
639
  num_parts_per_path = max(1, num_partitions // len(vcfs))
625
640
 
626
641
  for vcf_path in vcfs:
627
- indexed_vcf = vcf_utils.IndexedVcf(vcf_path)
628
- regions = indexed_vcf.partition_into_regions(
642
+ vcf_file = vcf_utils.VcfFile(vcf_path)
643
+ regions = vcf_file.partition_into_regions(
629
644
  num_parts=num_parts_per_path, target_part_size=partition_size
630
645
  )
631
646
  for region in regions:
632
647
  click.echo(f"{region}\t{vcf_path}")
648
+
649
+
650
+ @click.command(name="convert")
651
+ @click.argument("ts_path", type=click.Path(exists=True))
652
+ @click.argument("zarr_path", type=click.Path())
653
+ @click.option("--contig-id", type=str, help="Contig/chromosome ID (default: '1')")
654
+ @click.option(
655
+ "--isolated-as-missing", is_flag=True, help="Treat isolated nodes as missing"
656
+ )
657
+ @variants_chunk_size
658
+ @samples_chunk_size
659
+ @verbose
660
+ @progress
661
+ @worker_processes
662
+ @force
663
+ def convert_tskit(
664
+ ts_path,
665
+ zarr_path,
666
+ contig_id,
667
+ isolated_as_missing,
668
+ variants_chunk_size,
669
+ samples_chunk_size,
670
+ verbose,
671
+ progress,
672
+ worker_processes,
673
+ force,
674
+ ):
675
+ setup_logging(verbose)
676
+ check_overwrite_dir(zarr_path, force)
677
+
678
+ tskit_mod.convert(
679
+ ts_path,
680
+ zarr_path,
681
+ contig_id=contig_id,
682
+ isolated_as_missing=isolated_as_missing,
683
+ variants_chunk_size=variants_chunk_size,
684
+ samples_chunk_size=samples_chunk_size,
685
+ worker_processes=worker_processes,
686
+ show_progress=progress,
687
+ )
688
+
689
+
690
+ @version
691
+ @click.group(name="tskit2zarr")
692
+ def tskit2zarr_main():
693
+ """
694
+ Convert tskit tree sequence(s) to VCF Zarr format
695
+ """
696
+ pass
697
+
698
+
699
+ tskit2zarr_main.add_command(convert_tskit)
@@ -1,16 +1,16 @@
1
1
  import concurrent.futures as cf
2
2
  import contextlib
3
3
  import dataclasses
4
+ import functools
5
+ import importlib
4
6
  import json
5
7
  import logging
6
8
  import math
7
9
  import multiprocessing
8
10
  import os
9
11
  import os.path
10
- import sys
11
12
  import threading
12
13
  import time
13
- import warnings
14
14
 
15
15
  import humanfriendly
16
16
  import numcodecs
@@ -23,6 +23,26 @@ logger = logging.getLogger(__name__)
23
23
  numcodecs.blosc.use_threads = False
24
24
 
25
25
 
26
+ def requires_optional_dependency(module_name, extras_name):
27
+ """Decorator to check for optional dependencies"""
28
+
29
+ def decorator(func):
30
+ @functools.wraps(func)
31
+ def wrapper(*args, **kwargs):
32
+ try:
33
+ importlib.import_module(module_name)
34
+ except ImportError:
35
+ raise ImportError(
36
+ f"This process requires the optional {module_name} module. "
37
+ f"Install it with: pip install bio2zarr[{extras_name}]"
38
+ ) from None
39
+ return func(*args, **kwargs)
40
+
41
+ return wrapper
42
+
43
+ return decorator
44
+
45
+
26
46
  def display_number(x):
27
47
  ret = "n/a"
28
48
  if math.isfinite(x):
@@ -34,6 +54,16 @@ def display_size(n):
34
54
  return humanfriendly.format_size(n, binary=True)
35
55
 
36
56
 
57
+ def parse_max_memory(max_memory):
58
+ if max_memory is None:
59
+ # Effectively unbounded
60
+ return 2**63
61
+ if isinstance(max_memory, str):
62
+ max_memory = humanfriendly.parse_size(max_memory)
63
+ logger.info(f"Set memory budget to {display_size(max_memory)}")
64
+ return max_memory
65
+
66
+
37
67
  def min_int_dtype(min_value, max_value):
38
68
  if min_value > max_value:
39
69
  raise ValueError("min_value must be <= max_value")
@@ -100,12 +130,20 @@ def du(path):
100
130
  return total
101
131
 
102
132
 
133
+ # We set the default number of worker processes to 0 because it avoids
134
+ # complexity in the call chain and makes things easier to debug by
135
+ # default. However, it does use the SynchronousExecutor here, which
136
+ # is technically not recommended by the Python docs.
137
+ DEFAULT_WORKER_PROCESSES = 0
138
+
139
+
103
140
  class SynchronousExecutor(cf.Executor):
104
- # Arguably we should use workers=0 as the default and use this
141
+ # Since https://github.com/sgkit-dev/bio2zarr/issues/404 we
142
+ # set worker_processses=0 as the default and use this
105
143
  # executor implementation. However, the docs are fairly explicit
106
144
  # about saying we shouldn't instantiate Future objects directly,
107
- # so it's best to keep this as a semi-secret debugging interface
108
- # for now.
145
+ # so we may need to revisit this if obscure problems start to
146
+ # arise.
109
147
  def submit(self, fn, /, *args, **kwargs):
110
148
  future = cf.Future()
111
149
  future.set_result(fn(*args, **kwargs))
@@ -246,22 +284,6 @@ def setup_progress_counter(counter):
246
284
  _progress_counter = counter
247
285
 
248
286
 
249
- def warn_py39_mac():
250
- if sys.platform == "darwin" and sys.version_info[:2] == (3, 9):
251
- warnings.warn(
252
- "There is a known issue with bio2zarr on MacOS Python 3.9 "
253
- "in which OS-level named semaphores are leaked. "
254
- "You will also probably see warnings like 'There appear to be N "
255
- "leaked semaphore objects at shutdown'. "
256
- "While this is likely harmless for a few runs, it could lead to "
257
- "issues if you do a lot of conversion. To get prevent this issue "
258
- "either: (1) use --worker-processes=0 or (2) upgrade to a newer "
259
- "Python version. See https://github.com/sgkit-dev/bio2zarr/issues/209 "
260
- "for more details.",
261
- stacklevel=2,
262
- )
263
-
264
-
265
287
  class ParallelWorkManager(contextlib.AbstractContextManager):
266
288
  def __init__(self, worker_processes=1, progress_config=None):
267
289
  # Need to specify this explicitly to support Macs and
@@ -274,7 +296,6 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
274
296
  # production. See note on the SynchronousExecutor class.
275
297
  self.executor = SynchronousExecutor()
276
298
  else:
277
- warn_py39_mac()
278
299
  self.executor = cf.ProcessPoolExecutor(
279
300
  max_workers=worker_processes,
280
301
  mp_context=ctx,