bio2zarr 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of bio2zarr might be problematic. Click here for more details.
- bio2zarr/__main__.py +2 -1
- bio2zarr/_version.py +2 -2
- bio2zarr/cli.py +89 -22
- bio2zarr/core.py +43 -22
- bio2zarr/plink.py +314 -189
- bio2zarr/tskit.py +301 -0
- bio2zarr/typing.py +1 -2
- bio2zarr/{vcf2zarr/icf.py → vcf.py} +594 -112
- bio2zarr/vcf_utils.py +12 -11
- bio2zarr/{vcf2zarr/vcz.py → vcz.py} +544 -708
- bio2zarr/{vcf2zarr/verification.py → vcz_verification.py} +5 -2
- {bio2zarr-0.1.5.dist-info → bio2zarr-0.1.6.dist-info}/METADATA +17 -6
- bio2zarr-0.1.6.dist-info/RECORD +21 -0
- {bio2zarr-0.1.5.dist-info → bio2zarr-0.1.6.dist-info}/WHEEL +1 -1
- {bio2zarr-0.1.5.dist-info → bio2zarr-0.1.6.dist-info}/entry_points.txt +2 -0
- bio2zarr/vcf2zarr/__init__.py +0 -38
- bio2zarr-0.1.5.dist-info/RECORD +0 -21
- {bio2zarr-0.1.5.dist-info → bio2zarr-0.1.6.dist-info}/licenses/LICENSE +0 -0
- {bio2zarr-0.1.5.dist-info → bio2zarr-0.1.6.dist-info}/top_level.txt +0 -0
bio2zarr/plink.py
CHANGED
|
@@ -1,207 +1,332 @@
|
|
|
1
|
+
import dataclasses
|
|
1
2
|
import logging
|
|
3
|
+
import pathlib
|
|
2
4
|
|
|
3
|
-
import bed_reader
|
|
4
|
-
import humanfriendly
|
|
5
|
-
import numcodecs
|
|
6
5
|
import numpy as np
|
|
7
|
-
import
|
|
6
|
+
import pandas as pd
|
|
8
7
|
|
|
9
|
-
from bio2zarr
|
|
10
|
-
|
|
11
|
-
from . import core
|
|
8
|
+
from bio2zarr import constants, core, vcz
|
|
12
9
|
|
|
13
10
|
logger = logging.getLogger(__name__)
|
|
14
11
|
|
|
15
12
|
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
13
|
+
# Column specifications for PLINK .fam files, as
# (column name, pandas dtype, target array dtype) triples.
# See: https://www.cog-genomics.org/plink/1.9/formats#fam
FAM_FIELDS = [
    ("family_id", str, "U"),
    ("individual_id", str, "U"),
    ("paternal_id", str, "U"),
    ("maternal_id", str, "U"),
    ("sex", str, "int8"),
    ("phenotype", str, "int8"),
]
# Column name -> dtype used when parsing the .fam file with pandas.
FAM_DF_DTYPE = {name: df_dtype for name, df_dtype, _ in FAM_FIELDS}
# Column name -> dtype used when materialising the data as arrays.
FAM_ARRAY_DTYPE = {name: array_dtype for name, _, array_dtype in FAM_FIELDS}

# Column specifications for PLINK .bim files, same triple layout as above.
# See: https://www.cog-genomics.org/plink/1.9/formats#bim
BIM_FIELDS = [
    ("contig", str, "U"),
    ("variant_id", str, "U"),
    ("cm_position", "float32", "float32"),
    ("position", "int32", "int32"),
    ("allele_1", str, "S"),
    ("allele_2", str, "S"),
]
BIM_DF_DTYPE = {name: df_dtype for name, df_dtype, _ in BIM_FIELDS}
BIM_ARRAY_DTYPE = {name: array_dtype for name, _, array_dtype in BIM_FIELDS}
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# See https://github.com/sgkit-dev/bio2zarr/issues/409 for discussion
# on the parameters to Pandas here.
def read_fam(path):
    """Parse a PLINK .fam file into a pandas DataFrame.

    Columns and dtypes follow FAM_FIELDS; the delimiter is sniffed by
    pandas (``sep=None`` requires the python engine).
    """
    # See: https://www.cog-genomics.org/plink/1.9/formats#fam
    column_names = [field[0] for field in FAM_FIELDS]
    return pd.read_csv(
        path, sep=None, names=column_names, dtype=FAM_DF_DTYPE, engine="python"
    )
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def read_bim(path):
    """Parse a PLINK .bim file into a pandas DataFrame.

    Columns and dtypes follow BIM_FIELDS; the delimiter is sniffed by
    pandas (``sep=None`` requires the python engine).
    """
    # See: https://www.cog-genomics.org/plink/1.9/formats#bim
    column_names = [field[0] for field in BIM_FIELDS]
    return pd.read_csv(
        path, sep=None, names=column_names, dtype=BIM_DF_DTYPE, engine="python"
    )
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@dataclasses.dataclass
class PlinkPaths:
    """Locations of the three files making up a PLINK 1 binary fileset."""

    # Genotype matrix (.bed), variant metadata (.bim), sample metadata (.fam).
    bed_path: str
    bim_path: str
    fam_path: str
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class BedReader:
    """Decode genotype calls from a PLINK 1 binary .bed file.

    Each variant occupies ceil(num_samples / 4) bytes, with four samples
    packed per byte at 2 bits each. A precomputed (256, 4, 2) table maps
    every possible byte value directly to its four diploid calls.
    """

    def __init__(self, path, num_variants, num_samples):
        self.num_variants = num_variants
        self.num_samples = num_samples
        self.path = path
        # bytes per variant: 1 byte per 4 samples, rounded up
        self.bytes_per_variant = (self.num_samples + 3) // 4

        # TODO open this as a persistent file and support reading from a
        # stream
        with open(self.path, "rb") as bed_file:
            if bed_file.read(3) != b"\x6c\x1b\x01":
                raise ValueError("Invalid BED file magic bytes")

        # We could check the size of the bed file here, but that would
        # mean we can't work with streams.

        # PLINK's 2-bit codes for one sample (low-order bits hold the
        # first sample in the byte):
        #   00 -> homozygous A1    [1, 1]
        #   01 -> missing          [-1, -1]
        #   10 -> heterozygous     [0, 1]
        #   11 -> homozygous A2    [0, 0]
        genotype_by_code = {
            0b00: (1, 1),
            0b01: (-1, -1),
            0b10: (0, 1),
            0b11: (0, 0),
        }

        # Lookup table of shape (256, 4, 2):
        # byte value -> 4 samples -> 2 alleles.
        table = np.zeros((256, 4, 2), dtype=np.int8)
        for byte_value in range(256):
            for slot in range(4):
                code = (byte_value >> (slot * 2)) & 0b11
                table[byte_value, slot] = genotype_by_code[code]
        self.byte_lookup = table

    def iter_decode(self, start, stop, buffer_size=None):
        """
        Iterate over the variants in the specified window
        with the specified approximate buffer size in bytes (default=10MiB).
        """
        if buffer_size is None:
            buffer_size = 10 * 1024 * 1024
        # How many whole variants fit in the buffer (at least one).
        step = max(1, int(buffer_size / self.bytes_per_variant))
        for chunk_start in range(start, stop, step):
            chunk_stop = min(chunk_start + step, stop)
            yield from self.decode(chunk_start, chunk_stop)

    def decode(self, start, stop):
        """Return genotypes for variants [start, stop) as an int8 array
        of shape (stop - start, num_samples, 2); -1 marks missing calls."""
        num_variants = stop - start

        # Skip the 3 magic bytes, then seek straight to the first
        # requested variant.
        file_offset = 3 + (start * self.bytes_per_variant)
        num_bytes = num_variants * self.bytes_per_variant

        logger.debug(
            f"Reading {num_variants} variants ({num_bytes} bytes) "
            f"from {self.path}"
        )

        # TODO make it possible to read sequentially from the same file handle,
        # seeking only when necessary.
        with open(self.path, "rb") as bed_file:
            bed_file.seek(file_offset)
            raw = bed_file.read(num_bytes)

        packed = np.frombuffer(raw, dtype=np.uint8).reshape(
            num_variants, self.bytes_per_variant
        )

        # Expand every byte into its four genotype pairs
        # -> (num_variants, bytes_per_variant, 4, 2), then flatten the
        # per-byte axis and drop the padding samples in the final byte.
        expanded = self.byte_lookup[packed]
        genotypes = expanded.reshape(
            num_variants, self.bytes_per_variant * 4, 2
        )
        return genotypes[:, : self.num_samples]
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
class PlinkFormat(vcz.Source):
    """Source adaptor exposing a PLINK 1 binary fileset (.bed/.bim/.fam)
    to the vcf-zarr writer.

    ``prefix`` is the shared path prefix of the three files; allele_2 from
    the .bim is treated as REF and allele_1 as the single ALT.
    """

    def __init__(self, prefix):
        # TODO we will need support multiple chromosomes here to join
        # plinks into one big zarr. So, these will require multiple
        # bed and bim files, but should share a .fam
        self.prefix = str(prefix)
        self.paths = PlinkPaths(
            self.prefix + ".bed",
            self.prefix + ".bim",
            self.prefix + ".fam",
        )
        self.bim = read_bim(self.paths.bim_path)
        self.fam = read_fam(self.paths.fam_path)
        self._num_records = self.bim.shape[0]
        self._num_samples = self.fam.shape[0]
        self.bed_reader = BedReader(
            self.paths.bed_path, self.num_records, self.num_samples
        )

    @property
    def path(self):
        return self.prefix

    @property
    def num_records(self):
        # Number of variants: one row per line in the .bim file.
        return self._num_records

    @property
    def num_samples(self):
        # Number of samples: one row per line in the .fam file.
        return self._num_samples

    @property
    def samples(self):
        return [vcz.Sample(id=iid) for iid in self.fam.individual_id]

    @property
    def contigs(self):
        # Contigs in order of first appearance in the .bim file.
        return [vcz.Contig(id=str(chrom)) for chrom in self.bim.contig.unique()]

    def iter_contig(self, start, stop):
        """Yield the contig index for each variant in [start, stop)."""
        chrom_to_contig_index = {contig.id: i for i, contig in enumerate(self.contigs)}
        for chrom in self.bim.contig[start:stop]:
            yield chrom_to_contig_index[str(chrom)]

    def iter_field(self, field_name, shape, start, stop):
        """Yield per-variant values for the named field in [start, stop)."""
        assert field_name == "position"  # Only position field is supported from plink
        yield from self.bim.position[start:stop]

    def iter_id(self, start, stop):
        """Yield the variant IDs in [start, stop)."""
        yield from self.bim.variant_id[start:stop]

    def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
        """Yield a vcz.VariantData (rlen, alleles, genotypes, phased) for
        each variant in [start, stop)."""
        alt_iter = self.bim.allele_1.values[start:stop]
        ref_iter = self.bim.allele_2.values[start:stop]
        gt_iter = self.bed_reader.iter_decode(start, stop)
        for alt, ref, gt in zip(alt_iter, ref_iter, gt_iter):
            alleles = np.full(num_alleles, constants.STR_FILL, dtype="O")
            alleles[0] = ref
            # PLINK stores exactly one ALT allele per variant. The previous
            # slice assignment alleles[1 : 1 + len(alt)] = alt used the
            # *character* length of the allele string, which would duplicate
            # a multi-base allele into several slots when num_alleles > 2.
            alleles[1] = alt
            # PLINK genotypes are unphased.
            phased = np.zeros(gt.shape[0], dtype=bool)
            # rlen is the length of the REF in PLINK as there's no END annotations
            yield vcz.VariantData(len(alleles[0]), alleles, gt, phased)

    def generate_schema(
        self,
        variants_chunk_size=None,
        samples_chunk_size=None,
    ):
        """Build the VcfZarrSchema describing the arrays to be encoded
        from this plink fileset."""
        n = self.num_samples
        m = self.num_records
        # Use the module-level logger for consistency (was logging.info,
        # which bypasses this module's logger).
        logger.info(f"Scanned plink with {n} samples and {m} variants")
        dimensions = vcz.standard_dimensions(
            variants_size=m,
            variants_chunk_size=variants_chunk_size,
            samples_size=n,
            samples_chunk_size=samples_chunk_size,
            ploidy_size=2,
            alleles_size=2,
        )
        schema_instance = vcz.VcfZarrSchema(
            format_version=vcz.ZARR_SCHEMA_FORMAT_VERSION,
            dimensions=dimensions,
            fields=[],
        )

        logger.info(
            "Generating schema with chunks="
            f"variants={dimensions['variants'].chunk_size}, "
            f"samples={dimensions['samples'].chunk_size}"
        )
        # If we don't have SVLEN or END annotations, the rlen field is defined
        # as the length of the REF
        # NOTE(review): allele_2 is parsed as an object (str) column, where
        # numpy's itemsize is the pointer size, not the longest string —
        # confirm this bound is intended for min_int_dtype below.
        max_len = self.bim.allele_2.values.itemsize

        array_specs = [
            vcz.ZarrArraySpec(
                source="position",
                name="variant_position",
                dtype="i4",
                dimensions=["variants"],
                description=None,
            ),
            vcz.ZarrArraySpec(
                name="variant_allele",
                dtype="O",
                dimensions=["variants", "alleles"],
                description=None,
            ),
            vcz.ZarrArraySpec(
                name="variant_id",
                dtype="O",
                dimensions=["variants"],
                description=None,
            ),
            vcz.ZarrArraySpec(
                name="variant_id_mask",
                dtype="bool",
                dimensions=["variants"],
                description=None,
            ),
            vcz.ZarrArraySpec(
                source=None,
                name="variant_length",
                dtype=core.min_int_dtype(0, max_len),
                dimensions=["variants"],
                description="Length of each variant",
            ),
            vcz.ZarrArraySpec(
                name="variant_contig",
                dtype=core.min_int_dtype(0, len(np.unique(self.bim.contig))),
                dimensions=["variants"],
                description="Contig/chromosome index for each variant",
            ),
            vcz.ZarrArraySpec(
                name="call_genotype_phased",
                dtype="bool",
                dimensions=["variants", "samples"],
                description=None,
                compressor=vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config(),
            ),
            vcz.ZarrArraySpec(
                name="call_genotype",
                dtype="i1",
                dimensions=["variants", "samples", "ploidy"],
                description=None,
                compressor=vcz.DEFAULT_ZARR_COMPRESSOR_GENOTYPES.get_config(),
            ),
            vcz.ZarrArraySpec(
                name="call_genotype_mask",
                dtype="bool",
                dimensions=["variants", "samples", "ploidy"],
                description=None,
                compressor=vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config(),
            ),
        ]
        schema_instance.fields = array_specs
        return schema_instance
|
|
55
302
|
|
|
56
303
|
|
|
57
304
|
def convert(
    prefix,
    out,
    *,
    variants_chunk_size=None,
    samples_chunk_size=None,
    worker_processes=core.DEFAULT_WORKER_PROCESSES,
    show_progress=False,
):
    """Convert the PLINK fileset at *prefix* into a vcf-zarr store at *out*.

    Chunk sizes default to the standard dimensions chosen by the schema
    generator; encoding is parallelised over *worker_processes* workers.
    """
    source = PlinkFormat(prefix)
    schema = source.generate_schema(
        variants_chunk_size=variants_chunk_size,
        samples_chunk_size=samples_chunk_size,
    )
    writer = vcz.VcfZarrWriter(PlinkFormat, pathlib.Path(out))
    # Rough heuristic to split work up enough to keep utilisation high
    writer.init(
        source,
        target_num_partitions=max(1, worker_processes * 4),
        schema=schema,
    )
    writer.encode_all_partitions(
        worker_processes=worker_processes,
        show_progress=show_progress,
    )
    writer.finalise(show_progress)
    writer.create_index()
|