bio2zarr 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bio2zarr/__main__.py +2 -1
- bio2zarr/_version.py +16 -3
- bio2zarr/cli.py +102 -22
- bio2zarr/core.py +43 -22
- bio2zarr/plink.py +316 -189
- bio2zarr/tskit.py +296 -0
- bio2zarr/typing.py +1 -2
- bio2zarr/{vcf2zarr/icf.py → vcf.py} +606 -114
- bio2zarr/vcf_utils.py +12 -11
- bio2zarr/{vcf2zarr/vcz.py → vcz.py} +568 -739
- bio2zarr/{vcf2zarr/verification.py → vcz_verification.py} +5 -2
- bio2zarr/zarr_utils.py +169 -2
- {bio2zarr-0.1.5.dist-info → bio2zarr-0.1.7.dist-info}/METADATA +23 -8
- bio2zarr-0.1.7.dist-info/RECORD +21 -0
- {bio2zarr-0.1.5.dist-info → bio2zarr-0.1.7.dist-info}/WHEEL +1 -1
- {bio2zarr-0.1.5.dist-info → bio2zarr-0.1.7.dist-info}/entry_points.txt +2 -0
- bio2zarr/vcf2zarr/__init__.py +0 -38
- bio2zarr-0.1.5.dist-info/RECORD +0 -21
- {bio2zarr-0.1.5.dist-info → bio2zarr-0.1.7.dist-info}/licenses/LICENSE +0 -0
- {bio2zarr-0.1.5.dist-info → bio2zarr-0.1.7.dist-info}/top_level.txt +0 -0
bio2zarr/plink.py
CHANGED
|
@@ -1,207 +1,334 @@
|
|
|
1
|
+
import dataclasses
|
|
1
2
|
import logging
|
|
3
|
+
import pathlib
|
|
2
4
|
|
|
3
|
-
import bed_reader
|
|
4
|
-
import humanfriendly
|
|
5
|
-
import numcodecs
|
|
6
5
|
import numpy as np
|
|
7
|
-
import zarr
|
|
6
|
+
import pandas as pd
|
|
8
7
|
|
|
9
|
-
from bio2zarr
|
|
10
|
-
|
|
11
|
-
from . import core
|
|
8
|
+
from bio2zarr import constants, core, vcz
|
|
9
|
+
from bio2zarr.zarr_utils import STRING_DTYPE_NAME
|
|
12
10
|
|
|
13
11
|
logger = logging.getLogger(__name__)
|
|
14
12
|
|
|
15
13
|
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
14
|
+
FAM_FIELDS = [
|
|
15
|
+
("family_id", str, "U"),
|
|
16
|
+
("individual_id", str, "U"),
|
|
17
|
+
("paternal_id", str, "U"),
|
|
18
|
+
("maternal_id", str, "U"),
|
|
19
|
+
("sex", str, "int8"),
|
|
20
|
+
("phenotype", str, "int8"),
|
|
21
|
+
]
|
|
22
|
+
FAM_DF_DTYPE = dict([(f[0], f[1]) for f in FAM_FIELDS])
|
|
23
|
+
FAM_ARRAY_DTYPE = dict([(f[0], f[2]) for f in FAM_FIELDS])
|
|
24
|
+
|
|
25
|
+
BIM_FIELDS = [
|
|
26
|
+
("contig", str, "U"),
|
|
27
|
+
("variant_id", str, "U"),
|
|
28
|
+
("cm_position", "float32", "float32"),
|
|
29
|
+
("position", "int32", "int32"),
|
|
30
|
+
("allele_1", str, "S"),
|
|
31
|
+
("allele_2", str, "S"),
|
|
32
|
+
]
|
|
33
|
+
BIM_DF_DTYPE = dict([(f[0], f[1]) for f in BIM_FIELDS])
|
|
34
|
+
BIM_ARRAY_DTYPE = dict([(f[0], f[2]) for f in BIM_FIELDS])
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# See https://github.com/sgkit-dev/bio2zarr/issues/409 for discussion
|
|
38
|
+
# on the parameters to Pandas here.
|
|
39
|
+
def read_fam(path):
|
|
40
|
+
# See: https://www.cog-genomics.org/plink/1.9/formats#fam
|
|
41
|
+
names = [f[0] for f in FAM_FIELDS]
|
|
42
|
+
df = pd.read_csv(path, sep=None, names=names, dtype=FAM_DF_DTYPE, engine="python")
|
|
43
|
+
return df
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def read_bim(path):
|
|
47
|
+
# See: https://www.cog-genomics.org/plink/1.9/formats#bim
|
|
48
|
+
names = [f[0] for f in BIM_FIELDS]
|
|
49
|
+
df = pd.read_csv(path, sep=None, names=names, dtype=BIM_DF_DTYPE, engine="python")
|
|
50
|
+
return df
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@dataclasses.dataclass
|
|
54
|
+
class PlinkPaths:
|
|
55
|
+
bed_path: str
|
|
56
|
+
bim_path: str
|
|
57
|
+
fam_path: str
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class BedReader:
|
|
61
|
+
def __init__(self, path, num_variants, num_samples):
|
|
62
|
+
self.num_variants = num_variants
|
|
63
|
+
self.num_samples = num_samples
|
|
64
|
+
self.path = path
|
|
65
|
+
# bytes per variant: 1 byte per 4 samples, rounded up
|
|
66
|
+
self.bytes_per_variant = (self.num_samples + 3) // 4
|
|
67
|
+
|
|
68
|
+
# TODO open this as a persistent file and support reading from a
|
|
69
|
+
# stream
|
|
70
|
+
with open(self.path, "rb") as f:
|
|
71
|
+
magic = f.read(3)
|
|
72
|
+
if magic != b"\x6c\x1b\x01":
|
|
73
|
+
raise ValueError("Invalid BED file magic bytes")
|
|
74
|
+
|
|
75
|
+
# We could check the size of the bed file here, but that would
|
|
76
|
+
# mean we can't work with streams.
|
|
77
|
+
|
|
78
|
+
# Initialize the lookup table with shape (256, 4, 2)
|
|
79
|
+
# 256 possible byte values, 4 samples per byte, 2 alleles per sample
|
|
80
|
+
lookup = np.zeros((256, 4, 2), dtype=np.int8)
|
|
81
|
+
|
|
82
|
+
# For each possible byte value (0-255)
|
|
83
|
+
for byte in range(256):
|
|
84
|
+
# For each of the 4 samples encoded in this byte
|
|
85
|
+
for sample in range(4):
|
|
86
|
+
# Extract the 2 bits for this sample
|
|
87
|
+
bits = (byte >> (sample * 2)) & 0b11
|
|
88
|
+
# Convert PLINK's bit encoding to genotype values
|
|
89
|
+
if bits == 0b00:
|
|
90
|
+
lookup[byte, sample] = [1, 1]
|
|
91
|
+
elif bits == 0b01:
|
|
92
|
+
lookup[byte, sample] = [-1, -1]
|
|
93
|
+
elif bits == 0b10:
|
|
94
|
+
lookup[byte, sample] = [0, 1]
|
|
95
|
+
elif bits == 0b11:
|
|
96
|
+
lookup[byte, sample] = [0, 0]
|
|
97
|
+
|
|
98
|
+
self.byte_lookup = lookup
|
|
99
|
+
|
|
100
|
+
def iter_decode(self, start, stop, buffer_size=None):
|
|
101
|
+
"""
|
|
102
|
+
Iterate over the variants in the specified window
|
|
103
|
+
with the specified approximate buffer size in bytes (default=10MiB).
|
|
104
|
+
"""
|
|
105
|
+
if buffer_size is None:
|
|
106
|
+
buffer_size = 10 * 1024 * 1024
|
|
107
|
+
variants_per_read = max(1, int(buffer_size / self.bytes_per_variant))
|
|
108
|
+
for off in range(start, stop, variants_per_read):
|
|
109
|
+
genotypes = self.decode(off, min(off + variants_per_read, stop))
|
|
110
|
+
yield from genotypes
|
|
111
|
+
|
|
112
|
+
def decode(self, start, stop):
|
|
113
|
+
chunk_size = stop - start
|
|
114
|
+
|
|
115
|
+
# Calculate file offsets for the required data
|
|
116
|
+
# 3 bytes for the magic number at the beginning of the file
|
|
117
|
+
start_offset = 3 + (start * self.bytes_per_variant)
|
|
118
|
+
bytes_to_read = chunk_size * self.bytes_per_variant
|
|
119
|
+
|
|
120
|
+
logger.debug(
|
|
121
|
+
f"Reading {chunk_size} variants ({bytes_to_read} bytes) "
|
|
122
|
+
f"from {self.path}"
|
|
123
|
+
)
|
|
124
|
+
|
|
125
|
+
# TODO make it possible to read sequentially from the same file handle,
|
|
126
|
+
# seeking only when necessary.
|
|
127
|
+
with open(self.path, "rb") as f:
|
|
128
|
+
f.seek(start_offset)
|
|
129
|
+
chunk_data = f.read(bytes_to_read)
|
|
130
|
+
|
|
131
|
+
data_bytes = np.frombuffer(chunk_data, dtype=np.uint8)
|
|
132
|
+
data_matrix = data_bytes.reshape(chunk_size, self.bytes_per_variant)
|
|
133
|
+
|
|
134
|
+
# Apply lookup table to get genotypes
|
|
135
|
+
# Shape becomes: (chunk_size, bytes_per_variant, 4, 2)
|
|
136
|
+
all_genotypes = self.byte_lookup[data_matrix]
|
|
137
|
+
|
|
138
|
+
# Reshape to get all samples in one dimension
|
|
139
|
+
# (chunk_size, bytes_per_variant*4, 2)
|
|
140
|
+
samples_padded = self.bytes_per_variant * 4
|
|
141
|
+
genotypes_reshaped = all_genotypes.reshape(chunk_size, samples_padded, 2)
|
|
142
|
+
|
|
143
|
+
return genotypes_reshaped[:, : self.num_samples]
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
class PlinkFormat(vcz.Source):
|
|
147
|
+
def __init__(self, prefix):
|
|
148
|
+
# TODO we will need to support multiple chromosomes here to join
|
|
149
|
+
# plinks into one big zarr. So, these will require multiple
|
|
150
|
+
# bed and bim files, but should share a .fam
|
|
151
|
+
self.prefix = str(prefix)
|
|
152
|
+
self.paths = PlinkPaths(
|
|
153
|
+
self.prefix + ".bed",
|
|
154
|
+
self.prefix + ".bim",
|
|
155
|
+
self.prefix + ".fam",
|
|
156
|
+
)
|
|
157
|
+
self.bim = read_bim(self.paths.bim_path)
|
|
158
|
+
self.fam = read_fam(self.paths.fam_path)
|
|
159
|
+
self._num_records = self.bim.shape[0]
|
|
160
|
+
self._num_samples = self.fam.shape[0]
|
|
161
|
+
self.bed_reader = BedReader(
|
|
162
|
+
self.paths.bed_path, self.num_records, self.num_samples
|
|
163
|
+
)
|
|
164
|
+
|
|
165
|
+
@property
|
|
166
|
+
def path(self):
|
|
167
|
+
return self.prefix
|
|
168
|
+
|
|
169
|
+
@property
|
|
170
|
+
def num_records(self):
|
|
171
|
+
return self._num_records
|
|
172
|
+
|
|
173
|
+
@property
|
|
174
|
+
def num_samples(self):
|
|
175
|
+
return self._num_samples
|
|
176
|
+
|
|
177
|
+
@property
|
|
178
|
+
def samples(self):
|
|
179
|
+
return [vcz.Sample(id=iid) for iid in self.fam.individual_id]
|
|
180
|
+
|
|
181
|
+
@property
|
|
182
|
+
def contigs(self):
|
|
183
|
+
return [vcz.Contig(id=str(chrom)) for chrom in self.bim.contig.unique()]
|
|
184
|
+
|
|
185
|
+
def iter_contig(self, start, stop):
|
|
186
|
+
chrom_to_contig_index = {contig.id: i for i, contig in enumerate(self.contigs)}
|
|
187
|
+
for chrom in self.bim.contig[start:stop]:
|
|
188
|
+
yield chrom_to_contig_index[str(chrom)]
|
|
189
|
+
|
|
190
|
+
def iter_field(self, field_name, shape, start, stop):
|
|
191
|
+
assert field_name == "position" # Only position field is supported from plink
|
|
192
|
+
yield from self.bim.position[start:stop]
|
|
193
|
+
|
|
194
|
+
def iter_id(self, start, stop):
|
|
195
|
+
yield from self.bim.variant_id[start:stop]
|
|
196
|
+
|
|
197
|
+
def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
|
|
198
|
+
alt_iter = self.bim.allele_1.values[start:stop]
|
|
199
|
+
ref_iter = self.bim.allele_2.values[start:stop]
|
|
200
|
+
gt_iter = self.bed_reader.iter_decode(start, stop)
|
|
201
|
+
for alt, ref, gt in zip(alt_iter, ref_iter, gt_iter):
|
|
202
|
+
alleles = np.full(num_alleles, constants.STR_FILL, dtype=STRING_DTYPE_NAME)
|
|
203
|
+
alleles[0] = ref
|
|
204
|
+
alleles[1 : 1 + len(alt)] = alt
|
|
205
|
+
phased = np.zeros(gt.shape[0], dtype=bool)
|
|
206
|
+
# rlen is the length of the REF in PLINK as there are no END annotations
|
|
207
|
+
yield vcz.VariantData(len(alleles[0]), alleles, gt, phased)
|
|
208
|
+
|
|
209
|
+
def generate_schema(
|
|
210
|
+
self,
|
|
211
|
+
variants_chunk_size=None,
|
|
212
|
+
samples_chunk_size=None,
|
|
213
|
+
):
|
|
214
|
+
n = self.num_samples
|
|
215
|
+
m = self.num_records
|
|
216
|
+
logging.info(f"Scanned plink with {n} samples and {m} variants")
|
|
217
|
+
dimensions = vcz.standard_dimensions(
|
|
218
|
+
variants_size=m,
|
|
219
|
+
variants_chunk_size=variants_chunk_size,
|
|
220
|
+
samples_size=n,
|
|
221
|
+
samples_chunk_size=samples_chunk_size,
|
|
222
|
+
ploidy_size=2,
|
|
223
|
+
alleles_size=2,
|
|
224
|
+
)
|
|
225
|
+
schema_instance = vcz.VcfZarrSchema(
|
|
226
|
+
format_version=vcz.ZARR_SCHEMA_FORMAT_VERSION,
|
|
227
|
+
dimensions=dimensions,
|
|
228
|
+
fields=[],
|
|
229
|
+
)
|
|
230
|
+
|
|
231
|
+
logger.info(
|
|
232
|
+
"Generating schema with chunks="
|
|
233
|
+
f"variants={dimensions['variants'].chunk_size}, "
|
|
234
|
+
f"samples={dimensions['samples'].chunk_size}"
|
|
235
|
+
)
|
|
236
|
+
# If we don't have SVLEN or END annotations, the rlen field is defined
|
|
237
|
+
# as the length of the REF
|
|
238
|
+
# Explicitly cast to fixed size array to support pandas 2.x and 3.x
|
|
239
|
+
allele_2_array = self.bim.allele_2.values.astype("S")
|
|
240
|
+
max_len = allele_2_array.itemsize
|
|
241
|
+
array_specs = [
|
|
242
|
+
vcz.ZarrArraySpec(
|
|
243
|
+
source="position",
|
|
244
|
+
name="variant_position",
|
|
245
|
+
dtype="i4",
|
|
246
|
+
dimensions=["variants"],
|
|
247
|
+
description=None,
|
|
248
|
+
),
|
|
249
|
+
vcz.ZarrArraySpec(
|
|
250
|
+
name="variant_allele",
|
|
251
|
+
dtype=STRING_DTYPE_NAME,
|
|
252
|
+
dimensions=["variants", "alleles"],
|
|
253
|
+
description=None,
|
|
254
|
+
),
|
|
255
|
+
vcz.ZarrArraySpec(
|
|
256
|
+
name="variant_id",
|
|
257
|
+
dtype=STRING_DTYPE_NAME,
|
|
258
|
+
dimensions=["variants"],
|
|
259
|
+
description=None,
|
|
260
|
+
),
|
|
261
|
+
vcz.ZarrArraySpec(
|
|
262
|
+
name="variant_id_mask",
|
|
263
|
+
dtype="bool",
|
|
264
|
+
dimensions=["variants"],
|
|
265
|
+
description=None,
|
|
266
|
+
),
|
|
267
|
+
vcz.ZarrArraySpec(
|
|
268
|
+
source=None,
|
|
269
|
+
name="variant_length",
|
|
270
|
+
dtype=core.min_int_dtype(0, max_len),
|
|
271
|
+
dimensions=["variants"],
|
|
272
|
+
description="Length of each variant",
|
|
273
|
+
),
|
|
274
|
+
vcz.ZarrArraySpec(
|
|
275
|
+
name="variant_contig",
|
|
276
|
+
dtype=core.min_int_dtype(0, len(np.unique(self.bim.contig))),
|
|
277
|
+
dimensions=["variants"],
|
|
278
|
+
description="Contig/chromosome index for each variant",
|
|
279
|
+
),
|
|
280
|
+
vcz.ZarrArraySpec(
|
|
281
|
+
name="call_genotype_phased",
|
|
282
|
+
dtype="bool",
|
|
283
|
+
dimensions=["variants", "samples"],
|
|
284
|
+
description=None,
|
|
285
|
+
compressor=vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config(),
|
|
286
|
+
),
|
|
287
|
+
vcz.ZarrArraySpec(
|
|
288
|
+
name="call_genotype",
|
|
289
|
+
dtype="i1",
|
|
290
|
+
dimensions=["variants", "samples", "ploidy"],
|
|
291
|
+
description=None,
|
|
292
|
+
compressor=vcz.DEFAULT_ZARR_COMPRESSOR_GENOTYPES.get_config(),
|
|
293
|
+
),
|
|
294
|
+
vcz.ZarrArraySpec(
|
|
295
|
+
name="call_genotype_mask",
|
|
296
|
+
dtype="bool",
|
|
297
|
+
dimensions=["variants", "samples", "ploidy"],
|
|
298
|
+
description=None,
|
|
299
|
+
compressor=vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config(),
|
|
300
|
+
),
|
|
301
|
+
]
|
|
302
|
+
schema_instance.fields = array_specs
|
|
303
|
+
return schema_instance
|
|
55
304
|
|
|
56
305
|
|
|
57
306
|
def convert(
|
|
58
|
-
|
|
59
|
-
|
|
307
|
+
prefix,
|
|
308
|
+
out,
|
|
60
309
|
*,
|
|
61
|
-
show_progress=False,
|
|
62
|
-
worker_processes=1,
|
|
63
310
|
variants_chunk_size=None,
|
|
64
311
|
samples_chunk_size=None,
|
|
312
|
+
worker_processes=core.DEFAULT_WORKER_PROCESSES,
|
|
313
|
+
show_progress=False,
|
|
65
314
|
):
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
# FIXME
|
|
72
|
-
if samples_chunk_size is None:
|
|
73
|
-
samples_chunk_size = 1000
|
|
74
|
-
if variants_chunk_size is None:
|
|
75
|
-
variants_chunk_size = 10_000
|
|
76
|
-
|
|
77
|
-
root = zarr.open_group(store=zarr_path, mode="w", **ZARR_FORMAT_KWARGS)
|
|
78
|
-
|
|
79
|
-
ploidy = 2
|
|
80
|
-
shape = [m, n]
|
|
81
|
-
chunks = [variants_chunk_size, samples_chunk_size]
|
|
82
|
-
dimensions = ["variants", "samples"]
|
|
83
|
-
|
|
84
|
-
# TODO we should be reusing some logic from vcfzarr here on laying
|
|
85
|
-
# out the basic dataset, and using the schema generator. Currently
|
|
86
|
-
# we're not using the best Blosc settings for genotypes here.
|
|
87
|
-
default_compressor = numcodecs.Blosc(cname="zstd", clevel=7)
|
|
88
|
-
|
|
89
|
-
a = root.array(
|
|
90
|
-
"sample_id",
|
|
91
|
-
data=bed.iid,
|
|
92
|
-
shape=bed.iid.shape,
|
|
93
|
-
dtype="str",
|
|
94
|
-
compressor=default_compressor,
|
|
95
|
-
chunks=(samples_chunk_size,),
|
|
96
|
-
)
|
|
97
|
-
a.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
|
|
98
|
-
logger.debug("Encoded samples")
|
|
99
|
-
|
|
100
|
-
# TODO encode these in slices - but read them in one go to avoid
|
|
101
|
-
# fetching repeatedly from bim file
|
|
102
|
-
a = root.array(
|
|
103
|
-
"variant_position",
|
|
104
|
-
data=bed.bp_position,
|
|
105
|
-
shape=bed.bp_position.shape,
|
|
106
|
-
dtype=np.int32,
|
|
107
|
-
compressor=default_compressor,
|
|
108
|
-
chunks=(variants_chunk_size,),
|
|
109
|
-
)
|
|
110
|
-
a.attrs["_ARRAY_DIMENSIONS"] = ["variants"]
|
|
111
|
-
logger.debug("encoded variant_position")
|
|
112
|
-
|
|
113
|
-
alleles = np.stack([bed.allele_1, bed.allele_2], axis=1)
|
|
114
|
-
a = root.array(
|
|
115
|
-
"variant_allele",
|
|
116
|
-
data=alleles,
|
|
117
|
-
shape=alleles.shape,
|
|
118
|
-
dtype="str",
|
|
119
|
-
compressor=default_compressor,
|
|
120
|
-
chunks=(variants_chunk_size, alleles.shape[1]),
|
|
121
|
-
)
|
|
122
|
-
a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"]
|
|
123
|
-
logger.debug("encoded variant_allele")
|
|
124
|
-
|
|
125
|
-
# TODO remove this?
|
|
126
|
-
a = root.empty(
|
|
127
|
-
name="call_genotype_phased",
|
|
128
|
-
dtype="bool",
|
|
129
|
-
shape=list(shape),
|
|
130
|
-
chunks=list(chunks),
|
|
131
|
-
compressor=default_compressor,
|
|
132
|
-
**ZARR_FORMAT_KWARGS,
|
|
315
|
+
plink_format = PlinkFormat(prefix)
|
|
316
|
+
schema_instance = plink_format.generate_schema(
|
|
317
|
+
variants_chunk_size=variants_chunk_size,
|
|
318
|
+
samples_chunk_size=samples_chunk_size,
|
|
133
319
|
)
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
chunks=list(chunks),
|
|
143
|
-
compressor=default_compressor,
|
|
144
|
-
**ZARR_FORMAT_KWARGS,
|
|
320
|
+
zarr_path = pathlib.Path(out)
|
|
321
|
+
vzw = vcz.VcfZarrWriter(PlinkFormat, zarr_path)
|
|
322
|
+
# Rough heuristic to split work up enough to keep utilisation high
|
|
323
|
+
target_num_partitions = max(1, worker_processes * 4)
|
|
324
|
+
vzw.init(
|
|
325
|
+
plink_format,
|
|
326
|
+
target_num_partitions=target_num_partitions,
|
|
327
|
+
schema=schema_instance,
|
|
145
328
|
)
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
name="call_genotype_mask",
|
|
150
|
-
dtype="bool",
|
|
151
|
-
shape=list(shape),
|
|
152
|
-
chunks=list(chunks),
|
|
153
|
-
compressor=default_compressor,
|
|
154
|
-
**ZARR_FORMAT_KWARGS,
|
|
155
|
-
)
|
|
156
|
-
a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
|
|
157
|
-
|
|
158
|
-
del bed
|
|
159
|
-
|
|
160
|
-
num_slices = max(1, worker_processes * 4)
|
|
161
|
-
slices = core.chunk_aligned_slices(a, num_slices)
|
|
162
|
-
|
|
163
|
-
total_chunks = sum(a.nchunks for _, a in root.arrays())
|
|
164
|
-
|
|
165
|
-
progress_config = core.ProgressConfig(
|
|
166
|
-
total=total_chunks, title="Convert", units="chunks", show=show_progress
|
|
329
|
+
vzw.encode_all_partitions(
|
|
330
|
+
worker_processes=worker_processes,
|
|
331
|
+
show_progress=show_progress,
|
|
167
332
|
)
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
pwm.submit(encode_genotypes_slice, bed_path, zarr_path, start, stop)
|
|
171
|
-
|
|
172
|
-
# TODO also add atomic swap like VCF. Should be abstracted to
|
|
173
|
-
# share basic code for setting up the variation dataset zarr
|
|
174
|
-
zarr.consolidate_metadata(zarr_path)
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
# FIXME do this more efficiently - currently reading the whole thing
|
|
178
|
-
# in for convenience, and also comparing call-by-call
|
|
179
|
-
def validate(bed_path, zarr_path):
|
|
180
|
-
root = zarr.open(store=zarr_path, mode="r")
|
|
181
|
-
call_genotype = root["call_genotype"][:]
|
|
182
|
-
|
|
183
|
-
bed = bed_reader.open_bed(bed_path, count_A1=False, num_threads=1)
|
|
184
|
-
|
|
185
|
-
assert call_genotype.shape[0] == bed.sid_count
|
|
186
|
-
assert call_genotype.shape[1] == bed.iid_count
|
|
187
|
-
bed_genotypes = bed.read(dtype="int8").T
|
|
188
|
-
assert call_genotype.shape[0] == bed_genotypes.shape[0]
|
|
189
|
-
assert call_genotype.shape[1] == bed_genotypes.shape[1]
|
|
190
|
-
assert call_genotype.shape[2] == 2
|
|
191
|
-
|
|
192
|
-
row_id = 0
|
|
193
|
-
for bed_row, zarr_row in zip(bed_genotypes, call_genotype):
|
|
194
|
-
# print("ROW", row_id)
|
|
195
|
-
# print(bed_row, zarr_row)
|
|
196
|
-
row_id += 1
|
|
197
|
-
for bed_call, zarr_call in zip(bed_row, zarr_row):
|
|
198
|
-
if bed_call == -127:
|
|
199
|
-
assert list(zarr_call) == [-1, -1]
|
|
200
|
-
elif bed_call == 0:
|
|
201
|
-
assert list(zarr_call) == [0, 0]
|
|
202
|
-
elif bed_call == 1:
|
|
203
|
-
assert list(zarr_call) == [1, 0]
|
|
204
|
-
elif bed_call == 2:
|
|
205
|
-
assert list(zarr_call) == [1, 1]
|
|
206
|
-
else: # pragma no cover
|
|
207
|
-
raise AssertionError(f"Unexpected bed call {bed_call}")
|
|
333
|
+
vzw.finalise(show_progress)
|
|
334
|
+
vzw.create_index()
|