bio2zarr 0.1.1__tar.gz → 0.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of bio2zarr might be problematic. Click here for more details.
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/.github/workflows/ci.yml +50 -1
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/CHANGELOG.md +25 -1
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/PKG-INFO +5 -3
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/bio2zarr/_version.py +9 -4
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/bio2zarr/cli.py +46 -12
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/bio2zarr/core.py +32 -2
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/bio2zarr/plink.py +19 -14
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/bio2zarr/vcf2zarr/icf.py +41 -18
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/bio2zarr/vcf2zarr/vcz.py +460 -138
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/bio2zarr/vcf2zarr/verification.py +19 -16
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/bio2zarr/vcf_utils.py +30 -14
- bio2zarr-0.1.3/bio2zarr/zarr_utils.py +18 -0
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/bio2zarr.egg-info/PKG-INFO +5 -3
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/bio2zarr.egg-info/SOURCES.txt +1 -0
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/bio2zarr.egg-info/requires.txt +2 -1
- bio2zarr-0.1.3/docs/vcf2zarr/overview.md +152 -0
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/docs/vcf2zarr/tutorial.md +2 -0
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/docs/vcfpartition/overview.md +1 -1
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/pyproject.toml +5 -1
- bio2zarr-0.1.1/docs/vcf2zarr/overview.md +0 -92
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/.github/workflows/cd.yml +0 -0
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/.github/workflows/docs.yml +0 -0
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/.gitignore +0 -0
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/.pre-commit-config.yaml +0 -0
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/LICENSE +0 -0
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/MANIFEST.in +0 -0
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/README.md +0 -0
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/bio2zarr/__init__.py +0 -0
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/bio2zarr/__main__.py +0 -0
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/bio2zarr/constants.py +0 -0
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/bio2zarr/provenance.py +0 -0
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/bio2zarr/typing.py +0 -0
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/bio2zarr/vcf2zarr/__init__.py +0 -0
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/bio2zarr.egg-info/dependency_links.txt +0 -0
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/bio2zarr.egg-info/entry_points.txt +0 -0
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/bio2zarr.egg-info/top_level.txt +0 -0
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/docs/Makefile +0 -0
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/docs/_config.yml +0 -0
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/docs/_static/asciinema-player.css +0 -0
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/docs/_static/asciinema-player.min.js +0 -0
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/docs/_static/custom.css +0 -0
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/docs/_toc.yml +0 -0
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/docs/build.sh +0 -0
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/docs/cast_scripts/vcf2zarr_convert.sh +0 -0
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/docs/cast_scripts/vcf2zarr_explode.sh +0 -0
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/docs/installation.md +0 -0
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/docs/intro.md +0 -0
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/docs/logo.png +0 -0
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/docs/requirements.txt +0 -0
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/docs/vcf2zarr/cli_ref.md +0 -0
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/docs/vcfpartition/cli_ref.md +0 -0
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/setup.cfg +0 -0
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/validation-data/Makefile +0 -0
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/validation-data/split.sh +0 -0
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/validation.py +0 -0
- {bio2zarr-0.1.1 → bio2zarr-0.1.3}/vcf_generator.py +0 -0
|
@@ -24,17 +24,21 @@ jobs:
|
|
|
24
24
|
# Use macos-13 because pip binary packages for ARM aren't
|
|
25
25
|
# available for many dependencies
|
|
26
26
|
os: [macos-13, macos-14, ubuntu-latest]
|
|
27
|
-
python-version: ["3.9", "3.10", "3.11"]
|
|
27
|
+
python-version: ["3.9", "3.10", "3.11", "3.12"]
|
|
28
28
|
exclude:
|
|
29
29
|
# Just run macos tests on one Python version
|
|
30
30
|
- os: macos-13
|
|
31
31
|
python-version: "3.10"
|
|
32
32
|
- os: macos-13
|
|
33
33
|
python-version: "3.11"
|
|
34
|
+
- os: macos-13
|
|
35
|
+
python-version: "3.12"
|
|
34
36
|
- os: macos-14
|
|
35
37
|
python-version: "3.9"
|
|
36
38
|
- os: macos-14
|
|
37
39
|
python-version: "3.10"
|
|
40
|
+
- os: macos-14
|
|
41
|
+
python-version: "3.12"
|
|
38
42
|
steps:
|
|
39
43
|
- uses: actions/checkout@v4
|
|
40
44
|
- name: Set up Python ${{ matrix.python-version }}
|
|
@@ -105,3 +109,48 @@ jobs:
|
|
|
105
109
|
vcfpartition --help
|
|
106
110
|
python -m bio2zarr vcfpartition --help
|
|
107
111
|
|
|
112
|
+
test-numpy-version:
|
|
113
|
+
name: Test numpy versions
|
|
114
|
+
runs-on: ubuntu-latest
|
|
115
|
+
strategy:
|
|
116
|
+
matrix:
|
|
117
|
+
numpy: ["==1.26", ">=2"]
|
|
118
|
+
steps:
|
|
119
|
+
- uses: actions/checkout@v4
|
|
120
|
+
- uses: actions/setup-python@v5
|
|
121
|
+
with:
|
|
122
|
+
python-version: '3.11'
|
|
123
|
+
- name: Install dependencies
|
|
124
|
+
run: |
|
|
125
|
+
python -m pip install --upgrade pip
|
|
126
|
+
python -m pip install '.[dev]'
|
|
127
|
+
- name: Install numpy${{ matrix.numpy }}
|
|
128
|
+
run: |
|
|
129
|
+
python -m pip install 'numpy${{ matrix.numpy }}'
|
|
130
|
+
- name: Run tests
|
|
131
|
+
run: |
|
|
132
|
+
# We just run the CLI tests here because they don't require other upstream
|
|
133
|
+
# packages like sgkit (which are tangled up with the numpy 2 dependency)
|
|
134
|
+
python -m pytest tests/test_cli.py
|
|
135
|
+
|
|
136
|
+
test-zarr-version:
|
|
137
|
+
name: Test Zarr versions
|
|
138
|
+
runs-on: ubuntu-latest
|
|
139
|
+
strategy:
|
|
140
|
+
matrix:
|
|
141
|
+
zarr: ["==2.18.3", ">=3.0.3"]
|
|
142
|
+
steps:
|
|
143
|
+
- uses: actions/checkout@v4
|
|
144
|
+
- uses: actions/setup-python@v5
|
|
145
|
+
with:
|
|
146
|
+
python-version: '3.11'
|
|
147
|
+
- name: Install dependencies
|
|
148
|
+
run: |
|
|
149
|
+
python -m pip install --upgrade pip
|
|
150
|
+
python -m pip install '.[dev]'
|
|
151
|
+
- name: Install zarr${{ matrix.zarr }}
|
|
152
|
+
run: |
|
|
153
|
+
python -m pip install 'zarr${{ matrix.zarr }}'
|
|
154
|
+
- name: Run tests
|
|
155
|
+
run: |
|
|
156
|
+
python -m pytest
|
|
@@ -1,8 +1,32 @@
|
|
|
1
|
+
# 0.1.3 2025-03-04
|
|
2
|
+
|
|
3
|
+
- Fix missing dependency issue for packaging
|
|
4
|
+
|
|
5
|
+
- Support out-of-order field definitions in the VCF header (#322, @ACEnglish)
|
|
6
|
+
|
|
7
|
+
# 0.1.2 2025-02-04
|
|
8
|
+
|
|
9
|
+
- Reduce memory requirement for encoding genotypes with large sample sizes
|
|
10
|
+
|
|
11
|
+
- Transpose default chunk sizes to 1000 variants and 10,000 samples (issue:300)
|
|
12
|
+
|
|
13
|
+
- Add chunksize options to mkschema (issue:294)
|
|
14
|
+
|
|
15
|
+
- Add experimental support for local alleles.
|
|
16
|
+
|
|
17
|
+
- Add experimental support for ``region_index``
|
|
18
|
+
|
|
19
|
+
Breaking changes
|
|
20
|
+
|
|
21
|
+
- ICF metadata format version bumped to ensure long-term compatibility between numpy 1.26.x
|
|
22
|
+
and numpy >= 2. Existing ICFs will need to be recreated.
|
|
23
|
+
|
|
24
|
+
|
|
1
25
|
# 0.1.1 2024-06-19
|
|
2
26
|
|
|
3
27
|
Maintenance release:
|
|
4
28
|
|
|
5
|
-
- Pin numpy to < 2
|
|
29
|
+
- Pin numpy to < 2
|
|
6
30
|
- Pin Zarr to < 3
|
|
7
31
|
|
|
8
32
|
# 0.1.0 2024-06-10
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
2
|
Name: bio2zarr
|
|
3
|
-
Version: 0.1.1
|
|
3
|
+
Version: 0.1.3
|
|
4
4
|
Summary: Convert bioinformatics data to Zarr
|
|
5
5
|
Author-email: sgkit Developers <project@sgkit.dev>
|
|
6
6
|
License: Apache License
|
|
@@ -219,11 +219,12 @@ Classifier: Programming Language :: Python :: 3
|
|
|
219
219
|
Classifier: Programming Language :: Python :: 3.9
|
|
220
220
|
Classifier: Programming Language :: Python :: 3.10
|
|
221
221
|
Classifier: Programming Language :: Python :: 3.11
|
|
222
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
222
223
|
Classifier: Topic :: Scientific/Engineering
|
|
223
224
|
Requires-Python: >=3.9
|
|
224
225
|
Description-Content-Type: text/markdown
|
|
225
226
|
License-File: LICENSE
|
|
226
|
-
Requires-Dist: numpy
|
|
227
|
+
Requires-Dist: numpy>=1.26
|
|
227
228
|
Requires-Dist: zarr<3,>=2.17
|
|
228
229
|
Requires-Dist: click
|
|
229
230
|
Requires-Dist: tabulate
|
|
@@ -232,6 +233,7 @@ Requires-Dist: humanfriendly
|
|
|
232
233
|
Requires-Dist: cyvcf2
|
|
233
234
|
Requires-Dist: bed_reader
|
|
234
235
|
Provides-Extra: dev
|
|
236
|
+
Requires-Dist: hypothesis-vcf; extra == "dev"
|
|
235
237
|
Requires-Dist: msprime; extra == "dev"
|
|
236
238
|
Requires-Dist: pysam; extra == "dev"
|
|
237
239
|
Requires-Dist: pytest; extra == "dev"
|
|
@@ -1,8 +1,13 @@
|
|
|
1
|
-
# file generated by
|
|
1
|
+
# file generated by setuptools-scm
|
|
2
2
|
# don't change, don't track in version control
|
|
3
|
+
|
|
4
|
+
__all__ = ["__version__", "__version_tuple__", "version", "version_tuple"]
|
|
5
|
+
|
|
3
6
|
TYPE_CHECKING = False
|
|
4
7
|
if TYPE_CHECKING:
|
|
5
|
-
from typing import Tuple
|
|
8
|
+
from typing import Tuple
|
|
9
|
+
from typing import Union
|
|
10
|
+
|
|
6
11
|
VERSION_TUPLE = Tuple[Union[int, str], ...]
|
|
7
12
|
else:
|
|
8
13
|
VERSION_TUPLE = object
|
|
@@ -12,5 +17,5 @@ __version__: str
|
|
|
12
17
|
__version_tuple__: VERSION_TUPLE
|
|
13
18
|
version_tuple: VERSION_TUPLE
|
|
14
19
|
|
|
15
|
-
__version__ = version = '0.1.
|
|
16
|
-
__version_tuple__ = version_tuple = (0, 1,
|
|
20
|
+
__version__ = version = '0.1.3'
|
|
21
|
+
__version_tuple__ = version_tuple = (0, 1, 3)
|
|
@@ -149,6 +149,13 @@ max_memory = click.option(
|
|
|
149
149
|
help="An approximate bound on overall memory usage (e.g. 10G),",
|
|
150
150
|
)
|
|
151
151
|
|
|
152
|
+
local_alleles = click.option(
|
|
153
|
+
"--local-alleles/--no-local-alleles",
|
|
154
|
+
show_default=True,
|
|
155
|
+
default=False,
|
|
156
|
+
help="Use local allele fields to reduce the storage requirements of the output.",
|
|
157
|
+
)
|
|
158
|
+
|
|
152
159
|
|
|
153
160
|
def setup_logging(verbosity):
|
|
154
161
|
level = "WARNING"
|
|
@@ -312,7 +319,7 @@ def dexplode_finalise(icf_path, verbose):
|
|
|
312
319
|
|
|
313
320
|
|
|
314
321
|
@click.command
|
|
315
|
-
@click.argument("path", type=click.Path())
|
|
322
|
+
@click.argument("path", type=click.Path(exists=True))
|
|
316
323
|
@verbose
|
|
317
324
|
def inspect(path, verbose):
|
|
318
325
|
"""
|
|
@@ -325,12 +332,26 @@ def inspect(path, verbose):
|
|
|
325
332
|
|
|
326
333
|
@click.command
|
|
327
334
|
@icf_path
|
|
328
|
-
|
|
335
|
+
@variants_chunk_size
|
|
336
|
+
@samples_chunk_size
|
|
337
|
+
@local_alleles
|
|
338
|
+
def mkschema(icf_path, variants_chunk_size, samples_chunk_size, local_alleles):
|
|
329
339
|
"""
|
|
330
340
|
Generate a schema for zarr encoding
|
|
331
341
|
"""
|
|
342
|
+
if local_alleles:
|
|
343
|
+
click.echo(
|
|
344
|
+
"WARNING: Local alleles support is preliminary; please use with caution.",
|
|
345
|
+
err=True,
|
|
346
|
+
)
|
|
332
347
|
stream = click.get_text_stream("stdout")
|
|
333
|
-
vcf2zarr.mkschema(
|
|
348
|
+
vcf2zarr.mkschema(
|
|
349
|
+
icf_path,
|
|
350
|
+
stream,
|
|
351
|
+
variants_chunk_size=variants_chunk_size,
|
|
352
|
+
samples_chunk_size=samples_chunk_size,
|
|
353
|
+
local_alleles=local_alleles,
|
|
354
|
+
)
|
|
334
355
|
|
|
335
356
|
|
|
336
357
|
@click.command
|
|
@@ -469,6 +490,7 @@ def dencode_finalise(zarr_path, verbose, progress):
|
|
|
469
490
|
@verbose
|
|
470
491
|
@progress
|
|
471
492
|
@worker_processes
|
|
493
|
+
@local_alleles
|
|
472
494
|
def convert_vcf(
|
|
473
495
|
vcfs,
|
|
474
496
|
zarr_path,
|
|
@@ -478,6 +500,7 @@ def convert_vcf(
|
|
|
478
500
|
verbose,
|
|
479
501
|
progress,
|
|
480
502
|
worker_processes,
|
|
503
|
+
local_alleles,
|
|
481
504
|
):
|
|
482
505
|
"""
|
|
483
506
|
Convert input VCF(s) directly to vcfzarr (not recommended for large files).
|
|
@@ -491,6 +514,7 @@ def convert_vcf(
|
|
|
491
514
|
samples_chunk_size=samples_chunk_size,
|
|
492
515
|
show_progress=progress,
|
|
493
516
|
worker_processes=worker_processes,
|
|
517
|
+
local_alleles=local_alleles,
|
|
494
518
|
)
|
|
495
519
|
|
|
496
520
|
|
|
@@ -560,7 +584,7 @@ plink2zarr.add_command(convert_plink)
|
|
|
560
584
|
|
|
561
585
|
@click.command
|
|
562
586
|
@version
|
|
563
|
-
@
|
|
587
|
+
@vcfs
|
|
564
588
|
@verbose
|
|
565
589
|
@num_partitions
|
|
566
590
|
@click.option(
|
|
@@ -570,12 +594,16 @@ plink2zarr.add_command(convert_plink)
|
|
|
570
594
|
default=None,
|
|
571
595
|
help="Target (compressed) size of VCF partitions, e.g. 100KB, 10MiB, 1G.",
|
|
572
596
|
)
|
|
573
|
-
def vcfpartition(
|
|
597
|
+
def vcfpartition(vcfs, verbose, num_partitions, partition_size):
|
|
574
598
|
"""
|
|
575
|
-
Output bcftools region strings that partition
|
|
599
|
+
Output bcftools region strings that partition the indexed VCF/BCF files
|
|
576
600
|
into either an approximate number of parts (-n), or parts of approximately
|
|
577
601
|
a given size (-s). One of -n or -s must be supplied.
|
|
578
602
|
|
|
603
|
+
If multiple VCF/BCF files are provided, the number of parts (-n) is
|
|
604
|
+
interpreted as the total number of partitions across all the files,
|
|
605
|
+
and the partitions are distributed evenly among the files.
|
|
606
|
+
|
|
579
607
|
Note that both the number of partitions and sizes are a target, and the
|
|
580
608
|
returned number of partitions may not exactly correspond. In particular,
|
|
581
609
|
there is a maximum level of granularity determined by the associated index
|
|
@@ -590,9 +618,15 @@ def vcfpartition(vcf_path, verbose, num_partitions, partition_size):
|
|
|
590
618
|
"Either --num-partitions or --partition-size must be specified"
|
|
591
619
|
)
|
|
592
620
|
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
621
|
+
if num_partitions is None:
|
|
622
|
+
num_parts_per_path = None
|
|
623
|
+
else:
|
|
624
|
+
num_parts_per_path = max(1, num_partitions // len(vcfs))
|
|
625
|
+
|
|
626
|
+
for vcf_path in vcfs:
|
|
627
|
+
indexed_vcf = vcf_utils.IndexedVcf(vcf_path)
|
|
628
|
+
regions = indexed_vcf.partition_into_regions(
|
|
629
|
+
num_parts=num_parts_per_path, target_part_size=partition_size
|
|
630
|
+
)
|
|
631
|
+
for region in regions:
|
|
632
|
+
click.echo(f"{region}\t{vcf_path}")
|
|
@@ -63,6 +63,27 @@ def chunk_aligned_slices(z, n, max_chunks=None):
|
|
|
63
63
|
return slices
|
|
64
64
|
|
|
65
65
|
|
|
66
|
+
def first_dim_slice_iter(z, start, stop):
|
|
67
|
+
"""
|
|
68
|
+
Efficiently iterate over the specified slice of the first dimension of the zarr
|
|
69
|
+
array z.
|
|
70
|
+
"""
|
|
71
|
+
chunk_size = z.chunks[0]
|
|
72
|
+
first_chunk = start // chunk_size
|
|
73
|
+
last_chunk = (stop // chunk_size) + (stop % chunk_size != 0)
|
|
74
|
+
for chunk in range(first_chunk, last_chunk):
|
|
75
|
+
Z = z.blocks[chunk]
|
|
76
|
+
chunk_start = chunk * chunk_size
|
|
77
|
+
chunk_stop = chunk_start + chunk_size
|
|
78
|
+
slice_start = None
|
|
79
|
+
if start > chunk_start:
|
|
80
|
+
slice_start = start - chunk_start
|
|
81
|
+
slice_stop = None
|
|
82
|
+
if stop < chunk_stop:
|
|
83
|
+
slice_stop = stop - chunk_start
|
|
84
|
+
yield from Z[slice_start:slice_stop]
|
|
85
|
+
|
|
86
|
+
|
|
66
87
|
def du(path):
|
|
67
88
|
"""
|
|
68
89
|
Return the total bytes stored at this path.
|
|
@@ -113,13 +134,16 @@ def cancel_futures(futures):
|
|
|
113
134
|
class BufferedArray:
|
|
114
135
|
array: zarr.Array
|
|
115
136
|
array_offset: int
|
|
137
|
+
name: str
|
|
116
138
|
buff: np.ndarray
|
|
117
139
|
buffer_row: int
|
|
140
|
+
max_buff_size: int = 0
|
|
118
141
|
|
|
119
|
-
def __init__(self, array, offset):
|
|
142
|
+
def __init__(self, array, offset, name="Unknown"):
|
|
120
143
|
self.array = array
|
|
121
144
|
self.array_offset = offset
|
|
122
145
|
assert offset % array.chunks[0] == 0
|
|
146
|
+
self.name = name
|
|
123
147
|
dims = list(array.shape)
|
|
124
148
|
dims[0] = min(array.chunks[0], array.shape[0])
|
|
125
149
|
self.buff = np.empty(dims, dtype=array.dtype)
|
|
@@ -150,11 +174,17 @@ class BufferedArray:
|
|
|
150
174
|
self.buff[: self.buffer_row], self.array, self.array_offset
|
|
151
175
|
)
|
|
152
176
|
logger.debug(
|
|
153
|
-
f"Flushed <{self.
|
|
177
|
+
f"Flushed <{self.name} {self.array.shape} "
|
|
154
178
|
f"{self.array.dtype}> "
|
|
155
179
|
f"{self.array_offset}:{self.array_offset + self.buffer_row}"
|
|
156
180
|
f"{self.buff.nbytes / 2**20: .2f}Mb"
|
|
157
181
|
)
|
|
182
|
+
# Note this is inaccurate for string data as we're just reporting the
|
|
183
|
+
# size of the container. When we switch to the numpy 2 StringDtype this
|
|
184
|
+
# should improve and we can get more visibility on how memory
|
|
185
|
+
# is being used.
|
|
186
|
+
# https://github.com/sgkit-dev/bio2zarr/issues/30
|
|
187
|
+
self.max_buff_size = max(self.max_buff_size, self.buff.nbytes)
|
|
158
188
|
self.array_offset += self.variants_chunk_size
|
|
159
189
|
self.buffer_row = 0
|
|
160
190
|
|
|
@@ -6,6 +6,8 @@ import numcodecs
|
|
|
6
6
|
import numpy as np
|
|
7
7
|
import zarr
|
|
8
8
|
|
|
9
|
+
from bio2zarr.zarr_utils import ZARR_FORMAT_KWARGS
|
|
10
|
+
|
|
9
11
|
from . import core
|
|
10
12
|
|
|
11
13
|
logger = logging.getLogger(__name__)
|
|
@@ -17,8 +19,7 @@ def encode_genotypes_slice(bed_path, zarr_path, start, stop):
|
|
|
17
19
|
# the correct approach is, but it is important to note that the
|
|
18
20
|
# 0th allele is *not* necessarily the REF for these datasets.
|
|
19
21
|
bed = bed_reader.open_bed(bed_path, num_threads=1, count_A1=False)
|
|
20
|
-
|
|
21
|
-
root = zarr.group(store=store)
|
|
22
|
+
root = zarr.open(store=zarr_path, mode="a", **ZARR_FORMAT_KWARGS)
|
|
22
23
|
gt = core.BufferedArray(root["call_genotype"], start)
|
|
23
24
|
gt_mask = core.BufferedArray(root["call_genotype_mask"], start)
|
|
24
25
|
gt_phased = core.BufferedArray(root["call_genotype_phased"], start)
|
|
@@ -73,8 +74,7 @@ def convert(
|
|
|
73
74
|
if variants_chunk_size is None:
|
|
74
75
|
variants_chunk_size = 10_000
|
|
75
76
|
|
|
76
|
-
|
|
77
|
-
root = zarr.group(store=store, overwrite=True)
|
|
77
|
+
root = zarr.open_group(store=zarr_path, mode="w", **ZARR_FORMAT_KWARGS)
|
|
78
78
|
|
|
79
79
|
ploidy = 2
|
|
80
80
|
shape = [m, n]
|
|
@@ -88,7 +88,8 @@ def convert(
|
|
|
88
88
|
|
|
89
89
|
a = root.array(
|
|
90
90
|
"sample_id",
|
|
91
|
-
bed.iid,
|
|
91
|
+
data=bed.iid,
|
|
92
|
+
shape=bed.iid.shape,
|
|
92
93
|
dtype="str",
|
|
93
94
|
compressor=default_compressor,
|
|
94
95
|
chunks=(samples_chunk_size,),
|
|
@@ -100,7 +101,8 @@ def convert(
|
|
|
100
101
|
# fetching repeatedly from bim file
|
|
101
102
|
a = root.array(
|
|
102
103
|
"variant_position",
|
|
103
|
-
bed.bp_position,
|
|
104
|
+
data=bed.bp_position,
|
|
105
|
+
shape=bed.bp_position.shape,
|
|
104
106
|
dtype=np.int32,
|
|
105
107
|
compressor=default_compressor,
|
|
106
108
|
chunks=(variants_chunk_size,),
|
|
@@ -111,41 +113,45 @@ def convert(
|
|
|
111
113
|
alleles = np.stack([bed.allele_1, bed.allele_2], axis=1)
|
|
112
114
|
a = root.array(
|
|
113
115
|
"variant_allele",
|
|
114
|
-
alleles,
|
|
116
|
+
data=alleles,
|
|
117
|
+
shape=alleles.shape,
|
|
115
118
|
dtype="str",
|
|
116
119
|
compressor=default_compressor,
|
|
117
|
-
chunks=(variants_chunk_size,),
|
|
120
|
+
chunks=(variants_chunk_size, alleles.shape[1]),
|
|
118
121
|
)
|
|
119
122
|
a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"]
|
|
120
123
|
logger.debug("encoded variant_allele")
|
|
121
124
|
|
|
122
125
|
# TODO remove this?
|
|
123
126
|
a = root.empty(
|
|
124
|
-
"call_genotype_phased",
|
|
127
|
+
name="call_genotype_phased",
|
|
125
128
|
dtype="bool",
|
|
126
129
|
shape=list(shape),
|
|
127
130
|
chunks=list(chunks),
|
|
128
131
|
compressor=default_compressor,
|
|
132
|
+
**ZARR_FORMAT_KWARGS,
|
|
129
133
|
)
|
|
130
134
|
a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
|
|
131
135
|
|
|
132
136
|
shape += [ploidy]
|
|
133
137
|
dimensions += ["ploidy"]
|
|
134
138
|
a = root.empty(
|
|
135
|
-
"call_genotype",
|
|
139
|
+
name="call_genotype",
|
|
136
140
|
dtype="i1",
|
|
137
141
|
shape=list(shape),
|
|
138
142
|
chunks=list(chunks),
|
|
139
143
|
compressor=default_compressor,
|
|
144
|
+
**ZARR_FORMAT_KWARGS,
|
|
140
145
|
)
|
|
141
146
|
a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
|
|
142
147
|
|
|
143
148
|
a = root.empty(
|
|
144
|
-
"call_genotype_mask",
|
|
149
|
+
name="call_genotype_mask",
|
|
145
150
|
dtype="bool",
|
|
146
151
|
shape=list(shape),
|
|
147
152
|
chunks=list(chunks),
|
|
148
153
|
compressor=default_compressor,
|
|
154
|
+
**ZARR_FORMAT_KWARGS,
|
|
149
155
|
)
|
|
150
156
|
a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
|
|
151
157
|
|
|
@@ -154,7 +160,7 @@ def convert(
|
|
|
154
160
|
num_slices = max(1, worker_processes * 4)
|
|
155
161
|
slices = core.chunk_aligned_slices(a, num_slices)
|
|
156
162
|
|
|
157
|
-
total_chunks = sum(a.nchunks for a in root.
|
|
163
|
+
total_chunks = sum(a.nchunks for _, a in root.arrays())
|
|
158
164
|
|
|
159
165
|
progress_config = core.ProgressConfig(
|
|
160
166
|
total=total_chunks, title="Convert", units="chunks", show=show_progress
|
|
@@ -171,8 +177,7 @@ def convert(
|
|
|
171
177
|
# FIXME do this more efficiently - currently reading the whole thing
|
|
172
178
|
# in for convenience, and also comparing call-by-call
|
|
173
179
|
def validate(bed_path, zarr_path):
|
|
174
|
-
|
|
175
|
-
root = zarr.group(store=store)
|
|
180
|
+
root = zarr.open(store=zarr_path, mode="r")
|
|
176
181
|
call_genotype = root["call_genotype"][:]
|
|
177
182
|
|
|
178
183
|
bed = bed_reader.open_bed(bed_path, count_A1=False, num_threads=1)
|
|
@@ -41,7 +41,7 @@ class VcfFieldSummary(core.JsonDataclass):
|
|
|
41
41
|
return VcfFieldSummary(**d)
|
|
42
42
|
|
|
43
43
|
|
|
44
|
-
@dataclasses.dataclass
|
|
44
|
+
@dataclasses.dataclass(order=True)
|
|
45
45
|
class VcfField:
|
|
46
46
|
category: str
|
|
47
47
|
name: str
|
|
@@ -110,7 +110,7 @@ class VcfPartition:
|
|
|
110
110
|
num_records: int = -1
|
|
111
111
|
|
|
112
112
|
|
|
113
|
-
ICF_METADATA_FORMAT_VERSION = "0.
|
|
113
|
+
ICF_METADATA_FORMAT_VERSION = "0.4"
|
|
114
114
|
ICF_DEFAULT_COMPRESSOR = numcodecs.Blosc(
|
|
115
115
|
cname="zstd", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE
|
|
116
116
|
)
|
|
@@ -192,6 +192,16 @@ class IcfMetadata(core.JsonDataclass):
|
|
|
192
192
|
d["contigs"] = [Contig(**cd) for cd in d["contigs"]]
|
|
193
193
|
return IcfMetadata(**d)
|
|
194
194
|
|
|
195
|
+
def __eq__(self, other):
|
|
196
|
+
if not isinstance(other, IcfMetadata):
|
|
197
|
+
return NotImplemented
|
|
198
|
+
return (
|
|
199
|
+
self.samples == other.samples
|
|
200
|
+
and self.contigs == other.contigs
|
|
201
|
+
and self.filters == other.filters
|
|
202
|
+
and sorted(self.fields) == sorted(other.fields)
|
|
203
|
+
)
|
|
204
|
+
|
|
195
205
|
|
|
196
206
|
def fixed_vcf_field_definitions():
|
|
197
207
|
def make_field_def(name, vcf_type, vcf_number):
|
|
@@ -212,6 +222,7 @@ def fixed_vcf_field_definitions():
|
|
|
212
222
|
make_field_def("FILTERS", "String", "."),
|
|
213
223
|
make_field_def("REF", "String", "1"),
|
|
214
224
|
make_field_def("ALT", "String", "."),
|
|
225
|
+
make_field_def("rlen", "Integer", "1"), # computed field
|
|
215
226
|
]
|
|
216
227
|
return fields
|
|
217
228
|
|
|
@@ -240,7 +251,7 @@ def scan_vcf(path, target_num_partitions):
|
|
|
240
251
|
for h in vcf.header_iter():
|
|
241
252
|
if h["HeaderType"] in ["INFO", "FORMAT"]:
|
|
242
253
|
field = VcfField.from_header(h)
|
|
243
|
-
if field.name == "GT":
|
|
254
|
+
if h["HeaderType"] == "FORMAT" and field.name == "GT":
|
|
244
255
|
field.vcf_type = "Integer"
|
|
245
256
|
field.vcf_number = "."
|
|
246
257
|
fields.append(field)
|
|
@@ -300,7 +311,11 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
|
|
|
300
311
|
)
|
|
301
312
|
with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
|
|
302
313
|
for path in paths:
|
|
303
|
-
pwm.submit(
|
|
314
|
+
pwm.submit(
|
|
315
|
+
scan_vcf,
|
|
316
|
+
path,
|
|
317
|
+
max(1, target_num_partitions // len(paths)),
|
|
318
|
+
)
|
|
304
319
|
results = list(pwm.results_as_completed())
|
|
305
320
|
|
|
306
321
|
# Sort to make the ordering deterministic
|
|
@@ -408,7 +423,7 @@ def sanitise_value_float_1d(buff, j, value):
|
|
|
408
423
|
if value is None:
|
|
409
424
|
buff[j] = constants.FLOAT32_MISSING
|
|
410
425
|
else:
|
|
411
|
-
value = np.array(value, ndmin=1, dtype=buff.dtype, copy=
|
|
426
|
+
value = np.array(value, ndmin=1, dtype=buff.dtype, copy=True)
|
|
412
427
|
# numpy will map None values to Nan, but we need a
|
|
413
428
|
# specific NaN
|
|
414
429
|
value[np.isnan(value)] = constants.FLOAT32_MISSING
|
|
@@ -422,7 +437,7 @@ def sanitise_value_float_2d(buff, j, value):
|
|
|
422
437
|
buff[j] = constants.FLOAT32_MISSING
|
|
423
438
|
else:
|
|
424
439
|
# print("value = ", value)
|
|
425
|
-
value = np.array(value, ndmin=2, dtype=buff.dtype, copy=
|
|
440
|
+
value = np.array(value, ndmin=2, dtype=buff.dtype, copy=True)
|
|
426
441
|
buff[j] = constants.FLOAT32_FILL
|
|
427
442
|
buff[j, :, : value.shape[1]] = value
|
|
428
443
|
|
|
@@ -432,7 +447,7 @@ def sanitise_int_array(value, ndmin, dtype):
|
|
|
432
447
|
value = [
|
|
433
448
|
constants.VCF_INT_MISSING if x is None else x for x in value
|
|
434
449
|
] # NEEDS TEST
|
|
435
|
-
value = np.array(value, ndmin=ndmin, copy=
|
|
450
|
+
value = np.array(value, ndmin=ndmin, copy=True)
|
|
436
451
|
value[value == constants.VCF_INT_MISSING] = -1
|
|
437
452
|
value[value == constants.VCF_INT_FILL] = -2
|
|
438
453
|
# TODO watch out for clipping here!
|
|
@@ -494,15 +509,15 @@ class VcfValueTransformer:
|
|
|
494
509
|
def transform(self, vcf_value):
|
|
495
510
|
if isinstance(vcf_value, tuple):
|
|
496
511
|
vcf_value = [self.missing if v is None else v for v in vcf_value]
|
|
497
|
-
value = np.array(vcf_value, ndmin=self.dimension, copy=
|
|
512
|
+
value = np.array(vcf_value, ndmin=self.dimension, copy=True)
|
|
498
513
|
return value
|
|
499
514
|
|
|
500
515
|
def transform_and_update_bounds(self, vcf_value):
|
|
501
516
|
if vcf_value is None:
|
|
502
517
|
return None
|
|
518
|
+
# print(self, self.field.full_name, "T", vcf_value)
|
|
503
519
|
value = self.transform(vcf_value)
|
|
504
520
|
self.update_bounds(value)
|
|
505
|
-
# print(self.field.full_name, "T", vcf_value, "->", value)
|
|
506
521
|
return value
|
|
507
522
|
|
|
508
523
|
|
|
@@ -531,13 +546,15 @@ class FloatValueTransformer(VcfValueTransformer):
|
|
|
531
546
|
class StringValueTransformer(VcfValueTransformer):
|
|
532
547
|
def update_bounds(self, value):
|
|
533
548
|
summary = self.field.summary
|
|
534
|
-
|
|
549
|
+
if self.field.category == "FORMAT":
|
|
550
|
+
number = max(len(v) for v in value)
|
|
551
|
+
else:
|
|
552
|
+
number = value.shape[-1]
|
|
535
553
|
# TODO would be nice to report string lengths, but not
|
|
536
554
|
# really necessary.
|
|
537
555
|
summary.max_number = max(summary.max_number, number)
|
|
538
556
|
|
|
539
557
|
def transform(self, vcf_value):
|
|
540
|
-
# print("transform", vcf_value)
|
|
541
558
|
if self.dimension == 1:
|
|
542
559
|
value = np.array(list(vcf_value.split(",")))
|
|
543
560
|
else:
|
|
@@ -853,11 +870,11 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
|
|
|
853
870
|
|
|
854
871
|
def summary_table(self):
|
|
855
872
|
data = []
|
|
856
|
-
for name,
|
|
857
|
-
summary =
|
|
873
|
+
for name, icf_field in self.fields.items():
|
|
874
|
+
summary = icf_field.vcf_field.summary
|
|
858
875
|
d = {
|
|
859
876
|
"name": name,
|
|
860
|
-
"type":
|
|
877
|
+
"type": icf_field.vcf_field.vcf_type,
|
|
861
878
|
"chunks": summary.num_chunks,
|
|
862
879
|
"size": core.display_size(summary.uncompressed_size),
|
|
863
880
|
"compressed": core.display_size(summary.compressed_size),
|
|
@@ -962,7 +979,7 @@ class IntermediateColumnarFormatWriter:
|
|
|
962
979
|
compressor=None,
|
|
963
980
|
):
|
|
964
981
|
if self.path.exists():
|
|
965
|
-
raise ValueError("ICF path already exists")
|
|
982
|
+
raise ValueError(f"ICF path already exists: {self.path}")
|
|
966
983
|
if compressor is None:
|
|
967
984
|
compressor = ICF_DEFAULT_COMPRESSOR
|
|
968
985
|
vcfs = [pathlib.Path(vcf) for vcf in vcfs]
|
|
@@ -1009,8 +1026,8 @@ class IntermediateColumnarFormatWriter:
|
|
|
1009
1026
|
self.path.mkdir()
|
|
1010
1027
|
self.wip_path.mkdir()
|
|
1011
1028
|
for field in self.metadata.fields:
|
|
1012
|
-
|
|
1013
|
-
|
|
1029
|
+
field_path = get_vcf_field_path(self.path, field)
|
|
1030
|
+
field_path.mkdir(parents=True)
|
|
1014
1031
|
|
|
1015
1032
|
def load_partition_summaries(self):
|
|
1016
1033
|
summaries = []
|
|
@@ -1074,13 +1091,19 @@ class IntermediateColumnarFormatWriter:
|
|
|
1074
1091
|
tcw.append("FILTERS", variant.FILTERS)
|
|
1075
1092
|
tcw.append("REF", variant.REF)
|
|
1076
1093
|
tcw.append("ALT", variant.ALT)
|
|
1094
|
+
tcw.append("rlen", variant.end - variant.start)
|
|
1077
1095
|
for field in info_fields:
|
|
1078
1096
|
tcw.append(field.full_name, variant.INFO.get(field.name, None))
|
|
1079
1097
|
if has_gt:
|
|
1080
|
-
|
|
1098
|
+
if variant.genotype is None:
|
|
1099
|
+
val = None
|
|
1100
|
+
else:
|
|
1101
|
+
val = variant.genotype.array()
|
|
1102
|
+
tcw.append("FORMAT/GT", val)
|
|
1081
1103
|
for field in format_fields:
|
|
1082
1104
|
val = variant.format(field.name)
|
|
1083
1105
|
tcw.append(field.full_name, val)
|
|
1106
|
+
|
|
1084
1107
|
# Note: an issue with updating the progress per variant here like
|
|
1085
1108
|
# this is that we get a significant pause at the end of the counter
|
|
1086
1109
|
# while all the "small" fields get flushed. Possibly not much to be
|