bio2zarr 0.1.6__tar.gz → 0.1.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of bio2zarr might be problematic. Click here for more details.
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/.github/workflows/ci.yml +13 -34
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/CHANGELOG.md +28 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/PKG-INFO +10 -6
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/README.md +2 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/bio2zarr/_version.py +16 -3
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/bio2zarr/cli.py +16 -3
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/bio2zarr/plink.py +7 -5
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/bio2zarr/tskit.py +14 -19
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/bio2zarr/vcf.py +23 -13
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/bio2zarr/vcz.py +34 -41
- bio2zarr-0.1.7/bio2zarr/zarr_utils.py +185 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/bio2zarr.egg-info/PKG-INFO +10 -6
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/bio2zarr.egg-info/requires.txt +6 -5
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/Makefile +3 -2
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/build.sh +2 -2
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/requirements.txt +1 -1
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/tskit2zarr/python_api.md +5 -2
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/vcf2zarr/tutorial.md +1 -1
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/pyproject.toml +9 -9
- bio2zarr-0.1.6/bio2zarr/zarr_utils.py +0 -18
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/.github/workflows/cd.yml +0 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/.github/workflows/docs.yml +0 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/.gitignore +0 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/.pre-commit-config.yaml +0 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/LICENSE +0 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/MANIFEST.in +0 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/bio2zarr/__init__.py +0 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/bio2zarr/__main__.py +0 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/bio2zarr/constants.py +0 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/bio2zarr/core.py +0 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/bio2zarr/provenance.py +0 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/bio2zarr/typing.py +0 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/bio2zarr/vcf_utils.py +0 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/bio2zarr/vcz_verification.py +0 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/bio2zarr.egg-info/SOURCES.txt +0 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/bio2zarr.egg-info/dependency_links.txt +0 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/bio2zarr.egg-info/entry_points.txt +0 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/bio2zarr.egg-info/top_level.txt +0 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/_config.yml +0 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/_static/asciinema-player.css +0 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/_static/asciinema-player.min.js +0 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/_static/custom.css +0 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/_toc.yml +0 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/cast_scripts/vcf2zarr_convert.sh +0 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/cast_scripts/vcf2zarr_explode.sh +0 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/installation.md +0 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/intro.md +0 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/logo.png +0 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/plink2zarr/cli_ref.md +0 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/plink2zarr/overview.md +0 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/tskit2zarr/cli_ref.md +0 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/tskit2zarr/overview.md +0 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/vcf2zarr/cli_ref.md +0 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/vcf2zarr/overview.md +0 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/vcf2zarr/python_api.md +0 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/vcfpartition/cli_ref.md +0 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/docs/vcfpartition/overview.md +0 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/setup.cfg +0 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/validation-data/Makefile +0 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/validation-data/split.sh +0 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/validation.py +0 -0
- {bio2zarr-0.1.6 → bio2zarr-0.1.7}/vcf_generator.py +0 -0
|
@@ -6,6 +6,9 @@ on:
|
|
|
6
6
|
push:
|
|
7
7
|
branches:
|
|
8
8
|
- main
|
|
9
|
+
schedule:
|
|
10
|
+
# At 04:44 on Monday, see https://crontab.guru/
|
|
11
|
+
- cron: "44 4 * * 1"
|
|
9
12
|
|
|
10
13
|
jobs:
|
|
11
14
|
pre-commit:
|
|
@@ -22,22 +25,16 @@ jobs:
|
|
|
22
25
|
runs-on: ${{ matrix.os }}
|
|
23
26
|
strategy:
|
|
24
27
|
matrix:
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
os: [macos-13, macos-14, ubuntu-latest]
|
|
28
|
-
python-version: ["3.10", "3.11", "3.12"]
|
|
28
|
+
os: [macos-14, ubuntu-latest]
|
|
29
|
+
python-version: ["3.10", "3.11", "3.12", "3.13"]
|
|
29
30
|
exclude:
|
|
30
31
|
# Just run macos tests on one Python version
|
|
31
|
-
- os: macos-13
|
|
32
|
-
python-version: "3.10"
|
|
33
|
-
- os: macos-13
|
|
34
|
-
python-version: "3.11"
|
|
35
|
-
- os: macos-13
|
|
36
|
-
python-version: "3.12"
|
|
37
32
|
- os: macos-14
|
|
38
33
|
python-version: "3.10"
|
|
39
34
|
- os: macos-14
|
|
40
35
|
python-version: "3.12"
|
|
36
|
+
- os: macos-14
|
|
37
|
+
python-version: "3.13"
|
|
41
38
|
steps:
|
|
42
39
|
- uses: actions/checkout@v4
|
|
43
40
|
- name: Set up Python ${{ matrix.python-version }}
|
|
@@ -152,36 +149,16 @@ jobs:
|
|
|
152
149
|
plink2zarr --help
|
|
153
150
|
python -m bio2zarr plink2zarr --help
|
|
154
151
|
|
|
155
|
-
test-numpy-version:
|
|
156
|
-
name: Test numpy versions
|
|
157
|
-
runs-on: ubuntu-latest
|
|
158
|
-
strategy:
|
|
159
|
-
matrix:
|
|
160
|
-
numpy: ["==1.26", ">=2"]
|
|
161
|
-
steps:
|
|
162
|
-
- uses: actions/checkout@v4
|
|
163
|
-
- uses: actions/setup-python@v5
|
|
164
|
-
with:
|
|
165
|
-
python-version: '3.11'
|
|
166
|
-
- name: Install dependencies
|
|
167
|
-
run: |
|
|
168
|
-
python -m pip install --upgrade pip
|
|
169
|
-
python -m pip install '.[dev]'
|
|
170
|
-
- name: Install numpy${{ matrix.numpy }}
|
|
171
|
-
run: |
|
|
172
|
-
python -m pip install 'numpy${{ matrix.numpy }}'
|
|
173
|
-
- name: Run tests
|
|
174
|
-
run: |
|
|
175
|
-
# We just run the CLI tests here because it doesn't require other upstream
|
|
176
|
-
# packages like sgkit (which are tangled up with the numpy 2 dependency)
|
|
177
|
-
python -m pytest tests/test_cli.py
|
|
178
|
-
|
|
179
152
|
test-zarr-version:
|
|
180
153
|
name: Test Zarr versions
|
|
181
154
|
runs-on: ubuntu-latest
|
|
182
155
|
strategy:
|
|
183
156
|
matrix:
|
|
184
157
|
zarr: ["==2.18.3", ">=3.0.3"]
|
|
158
|
+
zarr-format: [2, 3]
|
|
159
|
+
exclude:
|
|
160
|
+
- zarr: "==2.18.3"
|
|
161
|
+
zarr-format: 3
|
|
185
162
|
steps:
|
|
186
163
|
- uses: actions/checkout@v4
|
|
187
164
|
- uses: actions/setup-python@v5
|
|
@@ -197,3 +174,5 @@ jobs:
|
|
|
197
174
|
- name: Run tests
|
|
198
175
|
run: |
|
|
199
176
|
python -m pytest
|
|
177
|
+
env:
|
|
178
|
+
BIO2ZARR_ZARR_FORMAT: ${{ matrix.zarr-format }}
|
|
@@ -1,3 +1,31 @@
|
|
|
1
|
+
# 0.1.7 2026-02-03
|
|
2
|
+
|
|
3
|
+
*Bug fixes*
|
|
4
|
+
|
|
5
|
+
- Fix issue with 0-dimensional arrays (#437)
|
|
6
|
+
|
|
7
|
+
- Fix issue with pandas 3.x (required in plink code; #439)
|
|
8
|
+
|
|
9
|
+
*Breaking changes*
|
|
10
|
+
|
|
11
|
+
- Require NumPy 2 (#426)
|
|
12
|
+
|
|
13
|
+
- Require tskit >= 1.0.
|
|
14
|
+
|
|
15
|
+
- The default `isolated_as_missing` behaviour for tskit conversion now follows
|
|
16
|
+
tskit's default (currently `True`). To get the previous behaviour, create a
|
|
17
|
+
model mapping using `ts.map_to_vcf_model(isolated_as_missing=False)` and pass
|
|
18
|
+
it via the `model_mapping` parameter (or use `tskit2zarr convert --isolated-as-ancestral`).
|
|
19
|
+
|
|
20
|
+
- The `contig_id` and `isolated_as_missing` parameters to
|
|
21
|
+
`bio2zarr.tskit.convert` have been removed; set these via
|
|
22
|
+
`tskit.TreeSequence.map_to_vcf_model` and pass the returned mapping via the
|
|
23
|
+
`model_mapping` parameter.
|
|
24
|
+
|
|
25
|
+
*Maintenance*
|
|
26
|
+
|
|
27
|
+
- Add support for Python 3.13
|
|
28
|
+
|
|
1
29
|
# 0.1.6 2025-05-23
|
|
2
30
|
|
|
3
31
|
- Initial Python API support for VCF and tskit one-shot conversion. Format
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: bio2zarr
|
|
3
|
-
Version: 0.1.6
|
|
3
|
+
Version: 0.1.7
|
|
4
4
|
Summary: Convert bioinformatics data to Zarr
|
|
5
5
|
Author-email: sgkit Developers <project@sgkit.dev>
|
|
6
6
|
License: Apache License
|
|
@@ -219,11 +219,12 @@ Classifier: Programming Language :: Python :: 3
|
|
|
219
219
|
Classifier: Programming Language :: Python :: 3.10
|
|
220
220
|
Classifier: Programming Language :: Python :: 3.11
|
|
221
221
|
Classifier: Programming Language :: Python :: 3.12
|
|
222
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
222
223
|
Classifier: Topic :: Scientific/Engineering
|
|
223
224
|
Requires-Python: >=3.10
|
|
224
225
|
Description-Content-Type: text/markdown
|
|
225
226
|
License-File: LICENSE
|
|
226
|
-
Requires-Dist: numpy>=
|
|
227
|
+
Requires-Dist: numpy>=2
|
|
227
228
|
Requires-Dist: zarr<3,>=2.17
|
|
228
229
|
Requires-Dist: numcodecs[msgpack]!=0.14.0,!=0.14.1,<0.16
|
|
229
230
|
Requires-Dist: tabulate
|
|
@@ -240,22 +241,25 @@ Requires-Dist: pysam; extra == "dev"
|
|
|
240
241
|
Requires-Dist: pytest; extra == "dev"
|
|
241
242
|
Requires-Dist: pytest-coverage; extra == "dev"
|
|
242
243
|
Requires-Dist: pytest-xdist; extra == "dev"
|
|
243
|
-
Requires-Dist: sgkit>=0.8.0; extra == "dev"
|
|
244
244
|
Requires-Dist: tqdm; extra == "dev"
|
|
245
|
-
Requires-Dist: tskit>=
|
|
245
|
+
Requires-Dist: tskit>=1; extra == "dev"
|
|
246
246
|
Requires-Dist: bed_reader; extra == "dev"
|
|
247
247
|
Requires-Dist: cyvcf2; extra == "dev"
|
|
248
|
+
Requires-Dist: xarray<2025.03.1; extra == "dev"
|
|
249
|
+
Requires-Dist: dask[array]<=2024.8.0,>=2022.01.0; extra == "dev"
|
|
248
250
|
Provides-Extra: tskit
|
|
249
|
-
Requires-Dist: tskit>=
|
|
251
|
+
Requires-Dist: tskit>=1; extra == "tskit"
|
|
250
252
|
Provides-Extra: vcf
|
|
251
253
|
Requires-Dist: cyvcf2; extra == "vcf"
|
|
252
254
|
Provides-Extra: all
|
|
253
|
-
Requires-Dist: tskit>=
|
|
255
|
+
Requires-Dist: tskit>=1; extra == "all"
|
|
254
256
|
Requires-Dist: cyvcf2; extra == "all"
|
|
255
257
|
Dynamic: license-file
|
|
256
258
|
|
|
257
259
|
[](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml)
|
|
258
260
|
[](https://coveralls.io/github/sgkit-dev/bio2zarr)
|
|
261
|
+
[](https://pepy.tech/projects/bio2zarr)
|
|
262
|
+
[](https://anaconda.org/bioconda/bio2zarr)
|
|
259
263
|
|
|
260
264
|
|
|
261
265
|
# bio2zarr
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
[](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml)
|
|
2
2
|
[](https://coveralls.io/github/sgkit-dev/bio2zarr)
|
|
3
|
+
[](https://pepy.tech/projects/bio2zarr)
|
|
4
|
+
[](https://anaconda.org/bioconda/bio2zarr)
|
|
3
5
|
|
|
4
6
|
|
|
5
7
|
# bio2zarr
|
|
@@ -1,7 +1,14 @@
|
|
|
1
1
|
# file generated by setuptools-scm
|
|
2
2
|
# don't change, don't track in version control
|
|
3
3
|
|
|
4
|
-
__all__ = [
|
|
4
|
+
__all__ = [
|
|
5
|
+
"__version__",
|
|
6
|
+
"__version_tuple__",
|
|
7
|
+
"version",
|
|
8
|
+
"version_tuple",
|
|
9
|
+
"__commit_id__",
|
|
10
|
+
"commit_id",
|
|
11
|
+
]
|
|
5
12
|
|
|
6
13
|
TYPE_CHECKING = False
|
|
7
14
|
if TYPE_CHECKING:
|
|
@@ -9,13 +16,19 @@ if TYPE_CHECKING:
|
|
|
9
16
|
from typing import Union
|
|
10
17
|
|
|
11
18
|
VERSION_TUPLE = Tuple[Union[int, str], ...]
|
|
19
|
+
COMMIT_ID = Union[str, None]
|
|
12
20
|
else:
|
|
13
21
|
VERSION_TUPLE = object
|
|
22
|
+
COMMIT_ID = object
|
|
14
23
|
|
|
15
24
|
version: str
|
|
16
25
|
__version__: str
|
|
17
26
|
__version_tuple__: VERSION_TUPLE
|
|
18
27
|
version_tuple: VERSION_TUPLE
|
|
28
|
+
commit_id: COMMIT_ID
|
|
29
|
+
__commit_id__: COMMIT_ID
|
|
19
30
|
|
|
20
|
-
__version__ = version = '0.1.6'
|
|
21
|
-
__version_tuple__ = version_tuple = (0, 1, 6)
|
|
31
|
+
__version__ = version = '0.1.7'
|
|
32
|
+
__version_tuple__ = version_tuple = (0, 1, 7)
|
|
33
|
+
|
|
34
|
+
__commit_id__ = commit_id = 'g4359d72e2'
|
|
@@ -652,7 +652,12 @@ def vcfpartition(vcfs, verbose, num_partitions, partition_size):
|
|
|
652
652
|
@click.argument("zarr_path", type=click.Path())
|
|
653
653
|
@click.option("--contig-id", type=str, help="Contig/chromosome ID (default: '1')")
|
|
654
654
|
@click.option(
|
|
655
|
-
"--isolated-as-missing
|
|
655
|
+
"--isolated-as-missing/--isolated-as-ancestral",
|
|
656
|
+
default=None,
|
|
657
|
+
help=(
|
|
658
|
+
"Treat isolated samples without mutations as missing or ancestral "
|
|
659
|
+
"(default: tskit default)"
|
|
660
|
+
),
|
|
656
661
|
)
|
|
657
662
|
@variants_chunk_size
|
|
658
663
|
@samples_chunk_size
|
|
@@ -660,6 +665,7 @@ def vcfpartition(vcfs, verbose, num_partitions, partition_size):
|
|
|
660
665
|
@progress
|
|
661
666
|
@worker_processes
|
|
662
667
|
@force
|
|
668
|
+
@core.requires_optional_dependency("tskit", "tskit")
|
|
663
669
|
def convert_tskit(
|
|
664
670
|
ts_path,
|
|
665
671
|
zarr_path,
|
|
@@ -675,11 +681,18 @@ def convert_tskit(
|
|
|
675
681
|
setup_logging(verbose)
|
|
676
682
|
check_overwrite_dir(zarr_path, force)
|
|
677
683
|
|
|
684
|
+
import tskit
|
|
685
|
+
|
|
686
|
+
ts = tskit.load(ts_path)
|
|
687
|
+
model_mapping = ts.map_to_vcf_model(
|
|
688
|
+
contig_id=contig_id,
|
|
689
|
+
isolated_as_missing=isolated_as_missing,
|
|
690
|
+
)
|
|
691
|
+
|
|
678
692
|
tskit_mod.convert(
|
|
679
693
|
ts_path,
|
|
680
694
|
zarr_path,
|
|
681
|
-
|
|
682
|
-
isolated_as_missing=isolated_as_missing,
|
|
695
|
+
model_mapping=model_mapping,
|
|
683
696
|
variants_chunk_size=variants_chunk_size,
|
|
684
697
|
samples_chunk_size=samples_chunk_size,
|
|
685
698
|
worker_processes=worker_processes,
|
|
@@ -6,6 +6,7 @@ import numpy as np
|
|
|
6
6
|
import pandas as pd
|
|
7
7
|
|
|
8
8
|
from bio2zarr import constants, core, vcz
|
|
9
|
+
from bio2zarr.zarr_utils import STRING_DTYPE_NAME
|
|
9
10
|
|
|
10
11
|
logger = logging.getLogger(__name__)
|
|
11
12
|
|
|
@@ -198,7 +199,7 @@ class PlinkFormat(vcz.Source):
|
|
|
198
199
|
ref_iter = self.bim.allele_2.values[start:stop]
|
|
199
200
|
gt_iter = self.bed_reader.iter_decode(start, stop)
|
|
200
201
|
for alt, ref, gt in zip(alt_iter, ref_iter, gt_iter):
|
|
201
|
-
alleles = np.full(num_alleles, constants.STR_FILL, dtype=
|
|
202
|
+
alleles = np.full(num_alleles, constants.STR_FILL, dtype=STRING_DTYPE_NAME)
|
|
202
203
|
alleles[0] = ref
|
|
203
204
|
alleles[1 : 1 + len(alt)] = alt
|
|
204
205
|
phased = np.zeros(gt.shape[0], dtype=bool)
|
|
@@ -234,8 +235,9 @@ class PlinkFormat(vcz.Source):
|
|
|
234
235
|
)
|
|
235
236
|
# If we don't have SVLEN or END annotations, the rlen field is defined
|
|
236
237
|
# as the length of the REF
|
|
237
|
-
|
|
238
|
-
|
|
238
|
+
# Explicitly cast to fixed size array to support pandas 2.x and 3.x
|
|
239
|
+
allele_2_array = self.bim.allele_2.values.astype("S")
|
|
240
|
+
max_len = allele_2_array.itemsize
|
|
239
241
|
array_specs = [
|
|
240
242
|
vcz.ZarrArraySpec(
|
|
241
243
|
source="position",
|
|
@@ -246,13 +248,13 @@ class PlinkFormat(vcz.Source):
|
|
|
246
248
|
),
|
|
247
249
|
vcz.ZarrArraySpec(
|
|
248
250
|
name="variant_allele",
|
|
249
|
-
dtype=
|
|
251
|
+
dtype=STRING_DTYPE_NAME,
|
|
250
252
|
dimensions=["variants", "alleles"],
|
|
251
253
|
description=None,
|
|
252
254
|
),
|
|
253
255
|
vcz.ZarrArraySpec(
|
|
254
256
|
name="variant_id",
|
|
255
|
-
dtype=
|
|
257
|
+
dtype=STRING_DTYPE_NAME,
|
|
256
258
|
dimensions=["variants"],
|
|
257
259
|
description=None,
|
|
258
260
|
),
|
|
@@ -4,6 +4,7 @@ import pathlib
|
|
|
4
4
|
import numpy as np
|
|
5
5
|
|
|
6
6
|
from bio2zarr import constants, core, vcz
|
|
7
|
+
from bio2zarr.zarr_utils import STRING_DTYPE_NAME
|
|
7
8
|
|
|
8
9
|
logger = logging.getLogger(__name__)
|
|
9
10
|
|
|
@@ -15,8 +16,6 @@ class TskitFormat(vcz.Source):
|
|
|
15
16
|
ts,
|
|
16
17
|
*,
|
|
17
18
|
model_mapping=None,
|
|
18
|
-
contig_id=None,
|
|
19
|
-
isolated_as_missing=False,
|
|
20
19
|
):
|
|
21
20
|
import tskit
|
|
22
21
|
|
|
@@ -35,14 +34,14 @@ class TskitFormat(vcz.Source):
|
|
|
35
34
|
f"{self.ts.num_sites} sites"
|
|
36
35
|
)
|
|
37
36
|
|
|
38
|
-
self.contig_id = contig_id if contig_id is not None else "1"
|
|
39
|
-
self.isolated_as_missing = isolated_as_missing
|
|
40
|
-
|
|
41
|
-
self.positions = self.ts.sites_position
|
|
42
|
-
|
|
43
37
|
if model_mapping is None:
|
|
44
38
|
model_mapping = self.ts.map_to_vcf_model()
|
|
45
39
|
|
|
40
|
+
self.contig_id = model_mapping.contig_id
|
|
41
|
+
self.contig_length = model_mapping.contig_length
|
|
42
|
+
self.isolated_as_missing = model_mapping.isolated_as_missing
|
|
43
|
+
self.raw_positions = self.ts.sites_position
|
|
44
|
+
self.vcf_positions = model_mapping.transformed_positions
|
|
46
45
|
individuals_nodes = model_mapping.individuals_nodes
|
|
47
46
|
sample_ids = model_mapping.individuals_name
|
|
48
47
|
|
|
@@ -91,14 +90,14 @@ class TskitFormat(vcz.Source):
|
|
|
91
90
|
|
|
92
91
|
@property
|
|
93
92
|
def contigs(self):
|
|
94
|
-
return [vcz.Contig(id=self.contig_id)]
|
|
93
|
+
return [vcz.Contig(id=self.contig_id, length=self.contig_length)]
|
|
95
94
|
|
|
96
95
|
def iter_contig(self, start, stop):
|
|
97
96
|
yield from (0 for _ in range(start, stop))
|
|
98
97
|
|
|
99
98
|
def iter_field(self, field_name, shape, start, stop):
|
|
100
99
|
if field_name == "position":
|
|
101
|
-
for pos in self.positions[start:stop]:
|
|
100
|
+
for pos in self.vcf_positions[start:stop]:
|
|
102
101
|
yield int(pos)
|
|
103
102
|
else:
|
|
104
103
|
raise ValueError(f"Unknown field {field_name}")
|
|
@@ -110,13 +109,13 @@ class TskitFormat(vcz.Source):
|
|
|
110
109
|
|
|
111
110
|
for variant in self.ts.variants(
|
|
112
111
|
isolated_as_missing=self.isolated_as_missing,
|
|
113
|
-
left=self.
|
|
114
|
-
right=self.
|
|
112
|
+
left=self.raw_positions[start],
|
|
113
|
+
right=self.raw_positions[stop] if stop < self.num_records else None,
|
|
115
114
|
samples=self.tskit_samples,
|
|
116
115
|
copy=False,
|
|
117
116
|
):
|
|
118
117
|
gt = np.full(shape, constants.INT_FILL, dtype=np.int8)
|
|
119
|
-
alleles = np.full(num_alleles, constants.STR_FILL, dtype=
|
|
118
|
+
alleles = np.full(num_alleles, constants.STR_FILL, dtype=STRING_DTYPE_NAME)
|
|
120
119
|
# length is the length of the REF allele unless other fields
|
|
121
120
|
# are included.
|
|
122
121
|
variant_length = len(variant.alleles[0])
|
|
@@ -176,8 +175,8 @@ class TskitFormat(vcz.Source):
|
|
|
176
175
|
min_position = 0
|
|
177
176
|
max_position = 0
|
|
178
177
|
if self.ts.num_sites > 0:
|
|
179
|
-
min_position = np.min(self.positions)
|
|
180
|
-
max_position = np.max(self.positions)
|
|
178
|
+
min_position = np.min(self.vcf_positions)
|
|
179
|
+
max_position = np.max(self.vcf_positions)
|
|
181
180
|
|
|
182
181
|
tables = self.ts.tables
|
|
183
182
|
ancestral_state_offsets = tables.sites.ancestral_state_offset
|
|
@@ -200,7 +199,7 @@ class TskitFormat(vcz.Source):
|
|
|
200
199
|
vcz.ZarrArraySpec(
|
|
201
200
|
source=None,
|
|
202
201
|
name="variant_allele",
|
|
203
|
-
dtype=
|
|
202
|
+
dtype=STRING_DTYPE_NAME,
|
|
204
203
|
dimensions=["variants", "alleles"],
|
|
205
204
|
description="Alleles for each variant",
|
|
206
205
|
),
|
|
@@ -252,8 +251,6 @@ def convert(
|
|
|
252
251
|
vcz_path,
|
|
253
252
|
*,
|
|
254
253
|
model_mapping=None,
|
|
255
|
-
contig_id=None,
|
|
256
|
-
isolated_as_missing=False,
|
|
257
254
|
variants_chunk_size=None,
|
|
258
255
|
samples_chunk_size=None,
|
|
259
256
|
worker_processes=core.DEFAULT_WORKER_PROCESSES,
|
|
@@ -277,8 +274,6 @@ def convert(
|
|
|
277
274
|
tskit_format = TskitFormat(
|
|
278
275
|
ts_or_path,
|
|
279
276
|
model_mapping=model_mapping,
|
|
280
|
-
contig_id=contig_id,
|
|
281
|
-
isolated_as_missing=isolated_as_missing,
|
|
282
277
|
)
|
|
283
278
|
schema_instance = tskit_format.generate_schema(
|
|
284
279
|
variants_chunk_size=variants_chunk_size,
|
|
@@ -16,6 +16,8 @@ from typing import Any
|
|
|
16
16
|
import numcodecs
|
|
17
17
|
import numpy as np
|
|
18
18
|
|
|
19
|
+
from bio2zarr.zarr_utils import STRING_DTYPE_NAME, zarr_exists
|
|
20
|
+
|
|
19
21
|
from . import constants, core, provenance, vcf_utils, vcz
|
|
20
22
|
|
|
21
23
|
logger = logging.getLogger(__name__)
|
|
@@ -110,7 +112,7 @@ class VcfField:
|
|
|
110
112
|
ret = "U1"
|
|
111
113
|
else:
|
|
112
114
|
assert self.vcf_type == "String"
|
|
113
|
-
ret =
|
|
115
|
+
ret = STRING_DTYPE_NAME
|
|
114
116
|
return ret
|
|
115
117
|
|
|
116
118
|
|
|
@@ -397,7 +399,7 @@ def sanitise_value_string_scalar(shape, value):
|
|
|
397
399
|
|
|
398
400
|
def sanitise_value_string_1d(shape, value):
|
|
399
401
|
if value is None:
|
|
400
|
-
return np.full(shape, ".", dtype=
|
|
402
|
+
return np.full(shape, ".", dtype=STRING_DTYPE_NAME)
|
|
401
403
|
else:
|
|
402
404
|
value = drop_empty_second_dim(value)
|
|
403
405
|
result = np.full(shape, "", dtype=value.dtype)
|
|
@@ -407,9 +409,9 @@ def sanitise_value_string_1d(shape, value):
|
|
|
407
409
|
|
|
408
410
|
def sanitise_value_string_2d(shape, value):
|
|
409
411
|
if value is None:
|
|
410
|
-
return np.full(shape, ".", dtype=
|
|
412
|
+
return np.full(shape, ".", dtype=STRING_DTYPE_NAME)
|
|
411
413
|
else:
|
|
412
|
-
result = np.full(shape, "", dtype=
|
|
414
|
+
result = np.full(shape, "", dtype=STRING_DTYPE_NAME)
|
|
413
415
|
if value.ndim == 2:
|
|
414
416
|
result[: value.shape[0], : value.shape[1]] = value
|
|
415
417
|
else:
|
|
@@ -569,7 +571,12 @@ class StringValueTransformer(VcfValueTransformer):
|
|
|
569
571
|
value = np.array(list(vcf_value.split(",")))
|
|
570
572
|
else:
|
|
571
573
|
# TODO can we make this faster??
|
|
572
|
-
|
|
574
|
+
var_len_values = [v.split(",") for v in vcf_value]
|
|
575
|
+
number = max(len(v) for v in var_len_values)
|
|
576
|
+
value = np.array(
|
|
577
|
+
[v + [""] * (number - len(v)) for v in var_len_values],
|
|
578
|
+
dtype=STRING_DTYPE_NAME,
|
|
579
|
+
)
|
|
573
580
|
# print("HERE", vcf_value, value)
|
|
574
581
|
# for v in vcf_value:
|
|
575
582
|
# print("\t", type(v), len(v), v.split(","))
|
|
@@ -1044,7 +1051,7 @@ class IntermediateColumnarFormat(vcz.Source):
|
|
|
1044
1051
|
ref_field.iter_values(start, stop),
|
|
1045
1052
|
alt_field.iter_values(start, stop),
|
|
1046
1053
|
):
|
|
1047
|
-
alleles = np.full(num_alleles, constants.STR_FILL, dtype=
|
|
1054
|
+
alleles = np.full(num_alleles, constants.STR_FILL, dtype=STRING_DTYPE_NAME)
|
|
1048
1055
|
alleles[0] = ref[0]
|
|
1049
1056
|
alleles[1 : 1 + len(alt)] = alt
|
|
1050
1057
|
yield alleles
|
|
@@ -1068,14 +1075,16 @@ class IntermediateColumnarFormat(vcz.Source):
|
|
|
1068
1075
|
for variant_length, alleles in zip(
|
|
1069
1076
|
variant_lengths, self.iter_alleles(start, stop, num_alleles)
|
|
1070
1077
|
):
|
|
1071
|
-
|
|
1078
|
+
# Stored ICF values are always at least 1D arrays; "rlen" is Number=1
|
|
1079
|
+
# so we must extract the scalar to avoid NumPy scalar-conversion issues.
|
|
1080
|
+
yield vcz.VariantData(variant_length[0], alleles, None, None)
|
|
1072
1081
|
else:
|
|
1073
1082
|
for variant_length, alleles, (gt, phased) in zip(
|
|
1074
1083
|
variant_lengths,
|
|
1075
1084
|
self.iter_alleles(start, stop, num_alleles),
|
|
1076
1085
|
self.iter_genotypes(shape, start, stop),
|
|
1077
1086
|
):
|
|
1078
|
-
yield vcz.VariantData(variant_length, alleles, gt, phased)
|
|
1087
|
+
yield vcz.VariantData(variant_length[0], alleles, gt, phased)
|
|
1079
1088
|
|
|
1080
1089
|
def generate_schema(
|
|
1081
1090
|
self, variants_chunk_size=None, samples_chunk_size=None, local_alleles=None
|
|
@@ -1087,8 +1096,10 @@ class IntermediateColumnarFormat(vcz.Source):
|
|
|
1087
1096
|
|
|
1088
1097
|
# Add ploidy and genotypes dimensions only when needed
|
|
1089
1098
|
max_genotypes = 0
|
|
1099
|
+
has_g_field = False
|
|
1090
1100
|
for field in self.metadata.format_fields:
|
|
1091
1101
|
if field.vcf_number == "G":
|
|
1102
|
+
has_g_field = True
|
|
1092
1103
|
max_genotypes = max(max_genotypes, field.summary.max_number)
|
|
1093
1104
|
|
|
1094
1105
|
ploidy = None
|
|
@@ -1100,7 +1111,7 @@ class IntermediateColumnarFormat(vcz.Source):
|
|
|
1100
1111
|
genotypes_size = math.comb(max_alleles + ploidy - 1, ploidy)
|
|
1101
1112
|
# assert max_genotypes == genotypes_size
|
|
1102
1113
|
else:
|
|
1103
|
-
if max_genotypes > 0:
|
|
1114
|
+
if max_genotypes > 0 or has_g_field:
|
|
1104
1115
|
# there is no GT field, but there is at least one Number=G field,
|
|
1105
1116
|
# so need to define genotypes dimension
|
|
1106
1117
|
genotypes_size = max_genotypes
|
|
@@ -1163,7 +1174,7 @@ class IntermediateColumnarFormat(vcz.Source):
|
|
|
1163
1174
|
),
|
|
1164
1175
|
fixed_field_spec(
|
|
1165
1176
|
name="variant_allele",
|
|
1166
|
-
dtype=
|
|
1177
|
+
dtype=STRING_DTYPE_NAME,
|
|
1167
1178
|
dimensions=["variants", "alleles"],
|
|
1168
1179
|
),
|
|
1169
1180
|
fixed_field_spec(
|
|
@@ -1173,7 +1184,7 @@ class IntermediateColumnarFormat(vcz.Source):
|
|
|
1173
1184
|
),
|
|
1174
1185
|
fixed_field_spec(
|
|
1175
1186
|
name="variant_id",
|
|
1176
|
-
dtype=
|
|
1187
|
+
dtype=STRING_DTYPE_NAME,
|
|
1177
1188
|
),
|
|
1178
1189
|
fixed_field_spec(
|
|
1179
1190
|
name="variant_id_mask",
|
|
@@ -1581,8 +1592,7 @@ def inspect(path):
|
|
|
1581
1592
|
raise ValueError(f"Path not found: {path}")
|
|
1582
1593
|
if (path / "metadata.json").exists():
|
|
1583
1594
|
obj = IntermediateColumnarFormat(path)
|
|
1584
|
-
|
|
1585
|
-
elif (path / ".zmetadata").exists():
|
|
1595
|
+
elif zarr_exists(path):
|
|
1586
1596
|
obj = vcz.VcfZarr(path)
|
|
1587
1597
|
else:
|
|
1588
1598
|
raise ValueError(f"{path} not in ICF or VCF Zarr format")
|