bio2zarr 0.1.6__tar.gz → 0.1.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of bio2zarr might be problematic (the original page links to further details).

Files changed (62)
  1. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/.github/workflows/ci.yml +13 -34
  2. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/CHANGELOG.md +28 -0
  3. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/PKG-INFO +10 -6
  4. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/README.md +2 -0
  5. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/bio2zarr/_version.py +16 -3
  6. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/bio2zarr/cli.py +16 -3
  7. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/bio2zarr/plink.py +7 -5
  8. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/bio2zarr/tskit.py +14 -19
  9. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/bio2zarr/vcf.py +23 -13
  10. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/bio2zarr/vcz.py +34 -41
  11. bio2zarr-0.1.7/bio2zarr/zarr_utils.py +185 -0
  12. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/bio2zarr.egg-info/PKG-INFO +10 -6
  13. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/bio2zarr.egg-info/requires.txt +6 -5
  14. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/Makefile +3 -2
  15. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/build.sh +2 -2
  16. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/requirements.txt +1 -1
  17. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/tskit2zarr/python_api.md +5 -2
  18. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/vcf2zarr/tutorial.md +1 -1
  19. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/pyproject.toml +9 -9
  20. bio2zarr-0.1.6/bio2zarr/zarr_utils.py +0 -18
  21. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/.github/workflows/cd.yml +0 -0
  22. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/.github/workflows/docs.yml +0 -0
  23. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/.gitignore +0 -0
  24. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/.pre-commit-config.yaml +0 -0
  25. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/LICENSE +0 -0
  26. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/MANIFEST.in +0 -0
  27. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/bio2zarr/__init__.py +0 -0
  28. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/bio2zarr/__main__.py +0 -0
  29. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/bio2zarr/constants.py +0 -0
  30. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/bio2zarr/core.py +0 -0
  31. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/bio2zarr/provenance.py +0 -0
  32. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/bio2zarr/typing.py +0 -0
  33. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/bio2zarr/vcf_utils.py +0 -0
  34. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/bio2zarr/vcz_verification.py +0 -0
  35. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/bio2zarr.egg-info/SOURCES.txt +0 -0
  36. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/bio2zarr.egg-info/dependency_links.txt +0 -0
  37. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/bio2zarr.egg-info/entry_points.txt +0 -0
  38. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/bio2zarr.egg-info/top_level.txt +0 -0
  39. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/_config.yml +0 -0
  40. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/_static/asciinema-player.css +0 -0
  41. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/_static/asciinema-player.min.js +0 -0
  42. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/_static/custom.css +0 -0
  43. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/_toc.yml +0 -0
  44. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/cast_scripts/vcf2zarr_convert.sh +0 -0
  45. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/cast_scripts/vcf2zarr_explode.sh +0 -0
  46. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/installation.md +0 -0
  47. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/intro.md +0 -0
  48. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/logo.png +0 -0
  49. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/plink2zarr/cli_ref.md +0 -0
  50. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/plink2zarr/overview.md +0 -0
  51. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/tskit2zarr/cli_ref.md +0 -0
  52. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/tskit2zarr/overview.md +0 -0
  53. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/vcf2zarr/cli_ref.md +0 -0
  54. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/vcf2zarr/overview.md +0 -0
  55. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/vcf2zarr/python_api.md +0 -0
  56. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/vcfpartition/cli_ref.md +0 -0
  57. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/vcfpartition/overview.md +0 -0
  58. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/setup.cfg +0 -0
  59. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/validation-data/Makefile +0 -0
  60. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/validation-data/split.sh +0 -0
  61. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/validation.py +0 -0
  62. {bio2zarr-0.1.6 → bio2zarr-0.1.7}/vcf_generator.py +0 -0
@@ -6,6 +6,9 @@ on:
6
6
  push:
7
7
  branches:
8
8
  - main
9
+ schedule:
10
+ # At 04:44 on Monday, see https://crontab.guru/
11
+ - cron: "44 4 * * 1"
9
12
 
10
13
  jobs:
11
14
  pre-commit:
@@ -22,22 +25,16 @@ jobs:
22
25
  runs-on: ${{ matrix.os }}
23
26
  strategy:
24
27
  matrix:
25
- # Use macos-13 because pip binary packages for ARM aren't
26
- # available for many dependencies
27
- os: [macos-13, macos-14, ubuntu-latest]
28
- python-version: ["3.10", "3.11", "3.12"]
28
+ os: [macos-14, ubuntu-latest]
29
+ python-version: ["3.10", "3.11", "3.12", "3.13"]
29
30
  exclude:
30
31
  # Just run macos tests on one Python version
31
- - os: macos-13
32
- python-version: "3.10"
33
- - os: macos-13
34
- python-version: "3.11"
35
- - os: macos-13
36
- python-version: "3.12"
37
32
  - os: macos-14
38
33
  python-version: "3.10"
39
34
  - os: macos-14
40
35
  python-version: "3.12"
36
+ - os: macos-14
37
+ python-version: "3.13"
41
38
  steps:
42
39
  - uses: actions/checkout@v4
43
40
  - name: Set up Python ${{ matrix.python-version }}
@@ -152,36 +149,16 @@ jobs:
152
149
  plink2zarr --help
153
150
  python -m bio2zarr plink2zarr --help
154
151
 
155
- test-numpy-version:
156
- name: Test numpy versions
157
- runs-on: ubuntu-latest
158
- strategy:
159
- matrix:
160
- numpy: ["==1.26", ">=2"]
161
- steps:
162
- - uses: actions/checkout@v4
163
- - uses: actions/setup-python@v5
164
- with:
165
- python-version: '3.11'
166
- - name: Install dependencies
167
- run: |
168
- python -m pip install --upgrade pip
169
- python -m pip install '.[dev]'
170
- - name: Install numpy${{ matrix.numpy }}
171
- run: |
172
- python -m pip install 'numpy${{ matrix.numpy }}'
173
- - name: Run tests
174
- run: |
175
- # We just run the CLI tests here because it doesn't require other upstream
176
- # packages like sgkit (which are tangled up with the numpy 2 dependency)
177
- python -m pytest tests/test_cli.py
178
-
179
152
  test-zarr-version:
180
153
  name: Test Zarr versions
181
154
  runs-on: ubuntu-latest
182
155
  strategy:
183
156
  matrix:
184
157
  zarr: ["==2.18.3", ">=3.0.3"]
158
+ zarr-format: [2, 3]
159
+ exclude:
160
+ - zarr: "==2.18.3"
161
+ zarr-format: 3
185
162
  steps:
186
163
  - uses: actions/checkout@v4
187
164
  - uses: actions/setup-python@v5
@@ -197,3 +174,5 @@ jobs:
197
174
  - name: Run tests
198
175
  run: |
199
176
  python -m pytest
177
+ env:
178
+ BIO2ZARR_ZARR_FORMAT: ${{ matrix.zarr-format }}
@@ -1,3 +1,31 @@
1
+ # 0.1.7 2026-02-03
2
+
3
+ *Bug fixes*
4
+
5
+ - Fix issue with 0-dimensional arrays (#437)
6
+
7
+ - Fix issue with pandas 3.x (required in plink code; #439)
8
+
9
+ *Breaking changes*
10
+
11
+ - Require NumPy 2 (#426)
12
+
13
+ - Require tskit >= 1.0.
14
+
15
+ - The default `isolated_as_missing` behaviour for tskit conversion now follows
16
+ tskit's default (currently `True`). To get the previous behaviour, create a
17
+ model mapping using `ts.map_to_vcf_model(isolated_as_missing=False)` and pass
18
+ it via the `model_mapping` parameter (or use `tskit2zarr convert --isolated-as-ancestral`).
19
+
20
+ - The `contig_id` and `isolated_as_missing` parameters to
21
+ `bio2zarr.tskit.convert` have been removed; set these via
22
+ `tskit.TreeSequence.map_to_vcf_model` and pass the returned mapping via the
23
+ `model_mapping` parameter.
24
+
25
+ *Maintenance*
26
+
27
+ - Add support for Python 3.13
28
+
1
29
  # 0.1.6 2025-05-23
2
30
 
3
31
  - Initial Python API support for VCF and tskit one-shot conversion. Format
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bio2zarr
3
- Version: 0.1.6
3
+ Version: 0.1.7
4
4
  Summary: Convert bioinformatics data to Zarr
5
5
  Author-email: sgkit Developers <project@sgkit.dev>
6
6
  License: Apache License
@@ -219,11 +219,12 @@ Classifier: Programming Language :: Python :: 3
219
219
  Classifier: Programming Language :: Python :: 3.10
220
220
  Classifier: Programming Language :: Python :: 3.11
221
221
  Classifier: Programming Language :: Python :: 3.12
222
+ Classifier: Programming Language :: Python :: 3.13
222
223
  Classifier: Topic :: Scientific/Engineering
223
224
  Requires-Python: >=3.10
224
225
  Description-Content-Type: text/markdown
225
226
  License-File: LICENSE
226
- Requires-Dist: numpy>=1.26
227
+ Requires-Dist: numpy>=2
227
228
  Requires-Dist: zarr<3,>=2.17
228
229
  Requires-Dist: numcodecs[msgpack]!=0.14.0,!=0.14.1,<0.16
229
230
  Requires-Dist: tabulate
@@ -240,22 +241,25 @@ Requires-Dist: pysam; extra == "dev"
240
241
  Requires-Dist: pytest; extra == "dev"
241
242
  Requires-Dist: pytest-coverage; extra == "dev"
242
243
  Requires-Dist: pytest-xdist; extra == "dev"
243
- Requires-Dist: sgkit>=0.8.0; extra == "dev"
244
244
  Requires-Dist: tqdm; extra == "dev"
245
- Requires-Dist: tskit>=0.6.4; extra == "dev"
245
+ Requires-Dist: tskit>=1; extra == "dev"
246
246
  Requires-Dist: bed_reader; extra == "dev"
247
247
  Requires-Dist: cyvcf2; extra == "dev"
248
+ Requires-Dist: xarray<2025.03.1; extra == "dev"
249
+ Requires-Dist: dask[array]<=2024.8.0,>=2022.01.0; extra == "dev"
248
250
  Provides-Extra: tskit
249
- Requires-Dist: tskit>=0.6.4; extra == "tskit"
251
+ Requires-Dist: tskit>=1; extra == "tskit"
250
252
  Provides-Extra: vcf
251
253
  Requires-Dist: cyvcf2; extra == "vcf"
252
254
  Provides-Extra: all
253
- Requires-Dist: tskit>=0.6.4; extra == "all"
255
+ Requires-Dist: tskit>=1; extra == "all"
254
256
  Requires-Dist: cyvcf2; extra == "all"
255
257
  Dynamic: license-file
256
258
 
257
259
  [![CI](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml)
258
260
  [![Coverage Status](https://coveralls.io/repos/github/sgkit-dev/bio2zarr/badge.svg)](https://coveralls.io/github/sgkit-dev/bio2zarr)
261
+ [![PyPI Downloads](https://static.pepy.tech/badge/bio2zarr)](https://pepy.tech/projects/bio2zarr)
262
+ [![Anaconda-Server Badge](https://anaconda.org/bioconda/bio2zarr/badges/downloads.svg)](https://anaconda.org/bioconda/bio2zarr)
259
263
 
260
264
 
261
265
  # bio2zarr
@@ -1,5 +1,7 @@
1
1
  [![CI](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml)
2
2
  [![Coverage Status](https://coveralls.io/repos/github/sgkit-dev/bio2zarr/badge.svg)](https://coveralls.io/github/sgkit-dev/bio2zarr)
3
+ [![PyPI Downloads](https://static.pepy.tech/badge/bio2zarr)](https://pepy.tech/projects/bio2zarr)
4
+ [![Anaconda-Server Badge](https://anaconda.org/bioconda/bio2zarr/badges/downloads.svg)](https://anaconda.org/bioconda/bio2zarr)
3
5
 
4
6
 
5
7
  # bio2zarr
@@ -1,7 +1,14 @@
1
1
  # file generated by setuptools-scm
2
2
  # don't change, don't track in version control
3
3
 
4
- __all__ = ["__version__", "__version_tuple__", "version", "version_tuple"]
4
+ __all__ = [
5
+ "__version__",
6
+ "__version_tuple__",
7
+ "version",
8
+ "version_tuple",
9
+ "__commit_id__",
10
+ "commit_id",
11
+ ]
5
12
 
6
13
  TYPE_CHECKING = False
7
14
  if TYPE_CHECKING:
@@ -9,13 +16,19 @@ if TYPE_CHECKING:
9
16
  from typing import Union
10
17
 
11
18
  VERSION_TUPLE = Tuple[Union[int, str], ...]
19
+ COMMIT_ID = Union[str, None]
12
20
  else:
13
21
  VERSION_TUPLE = object
22
+ COMMIT_ID = object
14
23
 
15
24
  version: str
16
25
  __version__: str
17
26
  __version_tuple__: VERSION_TUPLE
18
27
  version_tuple: VERSION_TUPLE
28
+ commit_id: COMMIT_ID
29
+ __commit_id__: COMMIT_ID
19
30
 
20
- __version__ = version = '0.1.6'
21
- __version_tuple__ = version_tuple = (0, 1, 6)
31
+ __version__ = version = '0.1.7'
32
+ __version_tuple__ = version_tuple = (0, 1, 7)
33
+
34
+ __commit_id__ = commit_id = 'g4359d72e2'
@@ -652,7 +652,12 @@ def vcfpartition(vcfs, verbose, num_partitions, partition_size):
652
652
  @click.argument("zarr_path", type=click.Path())
653
653
  @click.option("--contig-id", type=str, help="Contig/chromosome ID (default: '1')")
654
654
  @click.option(
655
- "--isolated-as-missing", is_flag=True, help="Treat isolated nodes as missing"
655
+ "--isolated-as-missing/--isolated-as-ancestral",
656
+ default=None,
657
+ help=(
658
+ "Treat isolated samples without mutations as missing or ancestral "
659
+ "(default: tskit default)"
660
+ ),
656
661
  )
657
662
  @variants_chunk_size
658
663
  @samples_chunk_size
@@ -660,6 +665,7 @@ def vcfpartition(vcfs, verbose, num_partitions, partition_size):
660
665
  @progress
661
666
  @worker_processes
662
667
  @force
668
+ @core.requires_optional_dependency("tskit", "tskit")
663
669
  def convert_tskit(
664
670
  ts_path,
665
671
  zarr_path,
@@ -675,11 +681,18 @@ def convert_tskit(
675
681
  setup_logging(verbose)
676
682
  check_overwrite_dir(zarr_path, force)
677
683
 
684
+ import tskit
685
+
686
+ ts = tskit.load(ts_path)
687
+ model_mapping = ts.map_to_vcf_model(
688
+ contig_id=contig_id,
689
+ isolated_as_missing=isolated_as_missing,
690
+ )
691
+
678
692
  tskit_mod.convert(
679
693
  ts_path,
680
694
  zarr_path,
681
- contig_id=contig_id,
682
- isolated_as_missing=isolated_as_missing,
695
+ model_mapping=model_mapping,
683
696
  variants_chunk_size=variants_chunk_size,
684
697
  samples_chunk_size=samples_chunk_size,
685
698
  worker_processes=worker_processes,
@@ -6,6 +6,7 @@ import numpy as np
6
6
  import pandas as pd
7
7
 
8
8
  from bio2zarr import constants, core, vcz
9
+ from bio2zarr.zarr_utils import STRING_DTYPE_NAME
9
10
 
10
11
  logger = logging.getLogger(__name__)
11
12
 
@@ -198,7 +199,7 @@ class PlinkFormat(vcz.Source):
198
199
  ref_iter = self.bim.allele_2.values[start:stop]
199
200
  gt_iter = self.bed_reader.iter_decode(start, stop)
200
201
  for alt, ref, gt in zip(alt_iter, ref_iter, gt_iter):
201
- alleles = np.full(num_alleles, constants.STR_FILL, dtype="O")
202
+ alleles = np.full(num_alleles, constants.STR_FILL, dtype=STRING_DTYPE_NAME)
202
203
  alleles[0] = ref
203
204
  alleles[1 : 1 + len(alt)] = alt
204
205
  phased = np.zeros(gt.shape[0], dtype=bool)
@@ -234,8 +235,9 @@ class PlinkFormat(vcz.Source):
234
235
  )
235
236
  # If we don't have SVLEN or END annotations, the rlen field is defined
236
237
  # as the length of the REF
237
- max_len = self.bim.allele_2.values.itemsize
238
-
238
+ # Explicitly cast to fixed size array to support pandas 2.x and 3.x
239
+ allele_2_array = self.bim.allele_2.values.astype("S")
240
+ max_len = allele_2_array.itemsize
239
241
  array_specs = [
240
242
  vcz.ZarrArraySpec(
241
243
  source="position",
@@ -246,13 +248,13 @@ class PlinkFormat(vcz.Source):
246
248
  ),
247
249
  vcz.ZarrArraySpec(
248
250
  name="variant_allele",
249
- dtype="O",
251
+ dtype=STRING_DTYPE_NAME,
250
252
  dimensions=["variants", "alleles"],
251
253
  description=None,
252
254
  ),
253
255
  vcz.ZarrArraySpec(
254
256
  name="variant_id",
255
- dtype="O",
257
+ dtype=STRING_DTYPE_NAME,
256
258
  dimensions=["variants"],
257
259
  description=None,
258
260
  ),
@@ -4,6 +4,7 @@ import pathlib
4
4
  import numpy as np
5
5
 
6
6
  from bio2zarr import constants, core, vcz
7
+ from bio2zarr.zarr_utils import STRING_DTYPE_NAME
7
8
 
8
9
  logger = logging.getLogger(__name__)
9
10
 
@@ -15,8 +16,6 @@ class TskitFormat(vcz.Source):
15
16
  ts,
16
17
  *,
17
18
  model_mapping=None,
18
- contig_id=None,
19
- isolated_as_missing=False,
20
19
  ):
21
20
  import tskit
22
21
 
@@ -35,14 +34,14 @@ class TskitFormat(vcz.Source):
35
34
  f"{self.ts.num_sites} sites"
36
35
  )
37
36
 
38
- self.contig_id = contig_id if contig_id is not None else "1"
39
- self.isolated_as_missing = isolated_as_missing
40
-
41
- self.positions = self.ts.sites_position
42
-
43
37
  if model_mapping is None:
44
38
  model_mapping = self.ts.map_to_vcf_model()
45
39
 
40
+ self.contig_id = model_mapping.contig_id
41
+ self.contig_length = model_mapping.contig_length
42
+ self.isolated_as_missing = model_mapping.isolated_as_missing
43
+ self.raw_positions = self.ts.sites_position
44
+ self.vcf_positions = model_mapping.transformed_positions
46
45
  individuals_nodes = model_mapping.individuals_nodes
47
46
  sample_ids = model_mapping.individuals_name
48
47
 
@@ -91,14 +90,14 @@ class TskitFormat(vcz.Source):
91
90
 
92
91
  @property
93
92
  def contigs(self):
94
- return [vcz.Contig(id=self.contig_id)]
93
+ return [vcz.Contig(id=self.contig_id, length=self.contig_length)]
95
94
 
96
95
  def iter_contig(self, start, stop):
97
96
  yield from (0 for _ in range(start, stop))
98
97
 
99
98
  def iter_field(self, field_name, shape, start, stop):
100
99
  if field_name == "position":
101
- for pos in self.ts.sites_position[start:stop]:
100
+ for pos in self.vcf_positions[start:stop]:
102
101
  yield int(pos)
103
102
  else:
104
103
  raise ValueError(f"Unknown field {field_name}")
@@ -110,13 +109,13 @@ class TskitFormat(vcz.Source):
110
109
 
111
110
  for variant in self.ts.variants(
112
111
  isolated_as_missing=self.isolated_as_missing,
113
- left=self.positions[start],
114
- right=self.positions[stop] if stop < self.num_records else None,
112
+ left=self.raw_positions[start],
113
+ right=self.raw_positions[stop] if stop < self.num_records else None,
115
114
  samples=self.tskit_samples,
116
115
  copy=False,
117
116
  ):
118
117
  gt = np.full(shape, constants.INT_FILL, dtype=np.int8)
119
- alleles = np.full(num_alleles, constants.STR_FILL, dtype="O")
118
+ alleles = np.full(num_alleles, constants.STR_FILL, dtype=STRING_DTYPE_NAME)
120
119
  # length is the length of the REF allele unless other fields
121
120
  # are included.
122
121
  variant_length = len(variant.alleles[0])
@@ -176,8 +175,8 @@ class TskitFormat(vcz.Source):
176
175
  min_position = 0
177
176
  max_position = 0
178
177
  if self.ts.num_sites > 0:
179
- min_position = np.min(self.ts.sites_position)
180
- max_position = np.max(self.ts.sites_position)
178
+ min_position = np.min(self.vcf_positions)
179
+ max_position = np.max(self.vcf_positions)
181
180
 
182
181
  tables = self.ts.tables
183
182
  ancestral_state_offsets = tables.sites.ancestral_state_offset
@@ -200,7 +199,7 @@ class TskitFormat(vcz.Source):
200
199
  vcz.ZarrArraySpec(
201
200
  source=None,
202
201
  name="variant_allele",
203
- dtype="O",
202
+ dtype=STRING_DTYPE_NAME,
204
203
  dimensions=["variants", "alleles"],
205
204
  description="Alleles for each variant",
206
205
  ),
@@ -252,8 +251,6 @@ def convert(
252
251
  vcz_path,
253
252
  *,
254
253
  model_mapping=None,
255
- contig_id=None,
256
- isolated_as_missing=False,
257
254
  variants_chunk_size=None,
258
255
  samples_chunk_size=None,
259
256
  worker_processes=core.DEFAULT_WORKER_PROCESSES,
@@ -277,8 +274,6 @@ def convert(
277
274
  tskit_format = TskitFormat(
278
275
  ts_or_path,
279
276
  model_mapping=model_mapping,
280
- contig_id=contig_id,
281
- isolated_as_missing=isolated_as_missing,
282
277
  )
283
278
  schema_instance = tskit_format.generate_schema(
284
279
  variants_chunk_size=variants_chunk_size,
@@ -16,6 +16,8 @@ from typing import Any
16
16
  import numcodecs
17
17
  import numpy as np
18
18
 
19
+ from bio2zarr.zarr_utils import STRING_DTYPE_NAME, zarr_exists
20
+
19
21
  from . import constants, core, provenance, vcf_utils, vcz
20
22
 
21
23
  logger = logging.getLogger(__name__)
@@ -110,7 +112,7 @@ class VcfField:
110
112
  ret = "U1"
111
113
  else:
112
114
  assert self.vcf_type == "String"
113
- ret = "O"
115
+ ret = STRING_DTYPE_NAME
114
116
  return ret
115
117
 
116
118
 
@@ -397,7 +399,7 @@ def sanitise_value_string_scalar(shape, value):
397
399
 
398
400
  def sanitise_value_string_1d(shape, value):
399
401
  if value is None:
400
- return np.full(shape, ".", dtype="O")
402
+ return np.full(shape, ".", dtype=STRING_DTYPE_NAME)
401
403
  else:
402
404
  value = drop_empty_second_dim(value)
403
405
  result = np.full(shape, "", dtype=value.dtype)
@@ -407,9 +409,9 @@ def sanitise_value_string_1d(shape, value):
407
409
 
408
410
  def sanitise_value_string_2d(shape, value):
409
411
  if value is None:
410
- return np.full(shape, ".", dtype="O")
412
+ return np.full(shape, ".", dtype=STRING_DTYPE_NAME)
411
413
  else:
412
- result = np.full(shape, "", dtype="O")
414
+ result = np.full(shape, "", dtype=STRING_DTYPE_NAME)
413
415
  if value.ndim == 2:
414
416
  result[: value.shape[0], : value.shape[1]] = value
415
417
  else:
@@ -569,7 +571,12 @@ class StringValueTransformer(VcfValueTransformer):
569
571
  value = np.array(list(vcf_value.split(",")))
570
572
  else:
571
573
  # TODO can we make this faster??
572
- value = np.array([v.split(",") for v in vcf_value], dtype="O")
574
+ var_len_values = [v.split(",") for v in vcf_value]
575
+ number = max(len(v) for v in var_len_values)
576
+ value = np.array(
577
+ [v + [""] * (number - len(v)) for v in var_len_values],
578
+ dtype=STRING_DTYPE_NAME,
579
+ )
573
580
  # print("HERE", vcf_value, value)
574
581
  # for v in vcf_value:
575
582
  # print("\t", type(v), len(v), v.split(","))
@@ -1044,7 +1051,7 @@ class IntermediateColumnarFormat(vcz.Source):
1044
1051
  ref_field.iter_values(start, stop),
1045
1052
  alt_field.iter_values(start, stop),
1046
1053
  ):
1047
- alleles = np.full(num_alleles, constants.STR_FILL, dtype="O")
1054
+ alleles = np.full(num_alleles, constants.STR_FILL, dtype=STRING_DTYPE_NAME)
1048
1055
  alleles[0] = ref[0]
1049
1056
  alleles[1 : 1 + len(alt)] = alt
1050
1057
  yield alleles
@@ -1068,14 +1075,16 @@ class IntermediateColumnarFormat(vcz.Source):
1068
1075
  for variant_length, alleles in zip(
1069
1076
  variant_lengths, self.iter_alleles(start, stop, num_alleles)
1070
1077
  ):
1071
- yield vcz.VariantData(variant_length, alleles, None, None)
1078
+ # Stored ICF values are always at least 1D arrays; "rlen" is Number=1
1079
+ # so we must extract the scalar to avoid NumPy scalar-conversion issues.
1080
+ yield vcz.VariantData(variant_length[0], alleles, None, None)
1072
1081
  else:
1073
1082
  for variant_length, alleles, (gt, phased) in zip(
1074
1083
  variant_lengths,
1075
1084
  self.iter_alleles(start, stop, num_alleles),
1076
1085
  self.iter_genotypes(shape, start, stop),
1077
1086
  ):
1078
- yield vcz.VariantData(variant_length, alleles, gt, phased)
1087
+ yield vcz.VariantData(variant_length[0], alleles, gt, phased)
1079
1088
 
1080
1089
  def generate_schema(
1081
1090
  self, variants_chunk_size=None, samples_chunk_size=None, local_alleles=None
@@ -1087,8 +1096,10 @@ class IntermediateColumnarFormat(vcz.Source):
1087
1096
 
1088
1097
  # Add ploidy and genotypes dimensions only when needed
1089
1098
  max_genotypes = 0
1099
+ has_g_field = False
1090
1100
  for field in self.metadata.format_fields:
1091
1101
  if field.vcf_number == "G":
1102
+ has_g_field = True
1092
1103
  max_genotypes = max(max_genotypes, field.summary.max_number)
1093
1104
 
1094
1105
  ploidy = None
@@ -1100,7 +1111,7 @@ class IntermediateColumnarFormat(vcz.Source):
1100
1111
  genotypes_size = math.comb(max_alleles + ploidy - 1, ploidy)
1101
1112
  # assert max_genotypes == genotypes_size
1102
1113
  else:
1103
- if max_genotypes > 0:
1114
+ if max_genotypes > 0 or has_g_field:
1104
1115
  # there is no GT field, but there is at least one Number=G field,
1105
1116
  # so need to define genotypes dimension
1106
1117
  genotypes_size = max_genotypes
@@ -1163,7 +1174,7 @@ class IntermediateColumnarFormat(vcz.Source):
1163
1174
  ),
1164
1175
  fixed_field_spec(
1165
1176
  name="variant_allele",
1166
- dtype="O",
1177
+ dtype=STRING_DTYPE_NAME,
1167
1178
  dimensions=["variants", "alleles"],
1168
1179
  ),
1169
1180
  fixed_field_spec(
@@ -1173,7 +1184,7 @@ class IntermediateColumnarFormat(vcz.Source):
1173
1184
  ),
1174
1185
  fixed_field_spec(
1175
1186
  name="variant_id",
1176
- dtype="O",
1187
+ dtype=STRING_DTYPE_NAME,
1177
1188
  ),
1178
1189
  fixed_field_spec(
1179
1190
  name="variant_id_mask",
@@ -1581,8 +1592,7 @@ def inspect(path):
1581
1592
  raise ValueError(f"Path not found: {path}")
1582
1593
  if (path / "metadata.json").exists():
1583
1594
  obj = IntermediateColumnarFormat(path)
1584
- # NOTE: this is too strict, we should support more general Zarrs, see #276
1585
- elif (path / ".zmetadata").exists():
1595
+ elif zarr_exists(path):
1586
1596
  obj = vcz.VcfZarr(path)
1587
1597
  else:
1588
1598
  raise ValueError(f"{path} not in ICF or VCF Zarr format")