bio2zarr 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of bio2zarr might be problematic.
- bio2zarr/_version.py +2 -2
- bio2zarr/cli.py +245 -68
- bio2zarr/core.py +36 -19
- bio2zarr/plink.py +25 -19
- bio2zarr/vcf.py +704 -389
- bio2zarr/vcf_utils.py +0 -1
- {bio2zarr-0.0.1.dist-info → bio2zarr-0.0.3.dist-info}/METADATA +1 -1
- bio2zarr-0.0.3.dist-info/RECORD +16 -0
- {bio2zarr-0.0.1.dist-info → bio2zarr-0.0.3.dist-info}/WHEEL +1 -1
- bio2zarr-0.0.1.dist-info/RECORD +0 -16
- {bio2zarr-0.0.1.dist-info → bio2zarr-0.0.3.dist-info}/LICENSE +0 -0
- {bio2zarr-0.0.1.dist-info → bio2zarr-0.0.3.dist-info}/entry_points.txt +0 -0
- {bio2zarr-0.0.1.dist-info → bio2zarr-0.0.3.dist-info}/top_level.txt +0 -0
bio2zarr/plink.py
CHANGED
@@ -4,6 +4,7 @@ import humanfriendly
 import numpy as np
 import zarr
 import bed_reader
+import numcodecs
 
 from . import core
 
@@ -22,14 +23,14 @@ def encode_genotypes_slice(bed_path, zarr_path, start, stop):
     gt = core.BufferedArray(root["call_genotype"], start)
     gt_mask = core.BufferedArray(root["call_genotype_mask"], start)
     gt_phased = core.BufferedArray(root["call_genotype_phased"], start)
-
+    variants_chunk_size = gt.array.chunks[0]
     n = gt.array.shape[1]
-    assert start %
+    assert start % variants_chunk_size == 0
 
     logger.debug(f"Reading slice {start}:{stop}")
     chunk_start = start
     while chunk_start < stop:
-        chunk_stop = min(chunk_start +
+        chunk_stop = min(chunk_start + variants_chunk_size, stop)
         logger.debug(f"Reading bed slice {chunk_start}:{chunk_stop}")
         bed_chunk = bed.read(slice(chunk_start, chunk_stop), dtype=np.int8).T
         logger.debug(f"Got bed slice {humanfriendly.format_size(bed_chunk.nbytes)}")
@@ -60,8 +61,8 @@ def convert(
     *,
     show_progress=False,
     worker_processes=1,
-
-
+    variants_chunk_size=None,
+    samples_chunk_size=None,
 ):
     bed = bed_reader.open_bed(bed_path, num_threads=1)
     n = bed.iid_count
@@ -69,25 +70,30 @@ def convert(
     logging.info(f"Scanned plink with {n} samples and {m} variants")
 
     # FIXME
-    if
-
-    if
-
+    if samples_chunk_size is None:
+        samples_chunk_size = 1000
+    if variants_chunk_size is None:
+        variants_chunk_size = 10_000
 
     store = zarr.DirectoryStore(zarr_path)
     root = zarr.group(store=store, overwrite=True)
 
     ploidy = 2
     shape = [m, n]
-    chunks = [
+    chunks = [variants_chunk_size, samples_chunk_size]
     dimensions = ["variants", "samples"]
 
+    # TODO we should be reusing some logic from vcfzarr here on laying
+    # out the basic dataset, and using the schema generator. Currently
+    # we're not using the best Blosc settings for genotypes here.
+    default_compressor = numcodecs.Blosc(cname="zstd", clevel=7)
+
     a = root.array(
         "sample_id",
         bed.iid,
         dtype="str",
-        compressor=
-        chunks=(
+        compressor=default_compressor,
+        chunks=(samples_chunk_size,),
     )
     a.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
     logger.debug(f"Encoded samples")
@@ -98,8 +104,8 @@ def convert(
         "variant_position",
         bed.bp_position,
         dtype=np.int32,
-        compressor=
-        chunks=(
+        compressor=default_compressor,
+        chunks=(variants_chunk_size,),
     )
     a.attrs["_ARRAY_DIMENSIONS"] = ["variants"]
     logger.debug(f"encoded variant_position")
@@ -109,8 +115,8 @@ def convert(
         "variant_allele",
         alleles,
         dtype="str",
-        compressor=
-        chunks=(
+        compressor=default_compressor,
+        chunks=(variants_chunk_size,),
    )
     a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"]
     logger.debug(f"encoded variant_allele")
@@ -121,7 +127,7 @@ def convert(
         dtype="bool",
         shape=list(shape),
         chunks=list(chunks),
-        compressor=
+        compressor=default_compressor,
     )
     a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
 
@@ -132,7 +138,7 @@ def convert(
         dtype="i1",
         shape=list(shape),
         chunks=list(chunks),
-        compressor=
+        compressor=default_compressor,
     )
     a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
 
@@ -141,7 +147,7 @@ def convert(
         dtype="bool",
         shape=list(shape),
         chunks=list(chunks),
-        compressor=
+        compressor=default_compressor,
     )
     a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
 
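For reference, a minimal usage sketch of the updated convert() entry point in bio2zarr/plink.py, based only on the signature visible in this diff. The input and output paths are hypothetical, and the chunk sizes shown simply restate the defaults that 0.0.3 applies when the new keyword arguments are left as None.

from bio2zarr import plink

# Hypothetical paths: convert() reads a PLINK .bed fileset and writes a Zarr store.
plink.convert(
    "example.bed",
    "example.zarr",
    show_progress=True,
    worker_processes=4,
    # New in 0.0.3: chunk sizes are named keyword arguments. Leaving them as
    # None falls back to 10_000 variants x 1000 samples, as set in convert().
    variants_chunk_size=10_000,
    samples_chunk_size=1000,
)

Every array created by convert() in this release is written with the shared default_compressor, numcodecs.Blosc(cname="zstd", clevel=7), introduced in the hunks above.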