bio2zarr 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl

Potentially problematic release.

bio2zarr/plink.py CHANGED
@@ -1,207 +1,332 @@
+import dataclasses
 import logging
+import pathlib
 
-import bed_reader
-import humanfriendly
-import numcodecs
 import numpy as np
-import zarr
+import pandas as pd
 
-from bio2zarr.zarr_utils import ZARR_FORMAT_KWARGS
-
-from . import core
+from bio2zarr import constants, core, vcz
 
 logger = logging.getLogger(__name__)
 
 
-def encode_genotypes_slice(bed_path, zarr_path, start, stop):
-    # We need to count the A2 alleles here if we want to keep the
-    # alleles reported as allele_1, allele_2. It's obvious here what
-    # the correct approach is, but it is important to note that the
-    # 0th allele is *not* necessarily the REF for these datasets.
-    bed = bed_reader.open_bed(bed_path, num_threads=1, count_A1=False)
-    root = zarr.open(store=zarr_path, mode="a", **ZARR_FORMAT_KWARGS)
-    gt = core.BufferedArray(root["call_genotype"], start)
-    gt_mask = core.BufferedArray(root["call_genotype_mask"], start)
-    gt_phased = core.BufferedArray(root["call_genotype_phased"], start)
-    variants_chunk_size = gt.array.chunks[0]
-    assert start % variants_chunk_size == 0
-
-    logger.debug(f"Reading slice {start}:{stop}")
-    chunk_start = start
-    while chunk_start < stop:
-        chunk_stop = min(chunk_start + variants_chunk_size, stop)
-        logger.debug(f"Reading bed slice {chunk_start}:{chunk_stop}")
-        bed_chunk = bed.read(slice(chunk_start, chunk_stop), dtype=np.int8).T
-        logger.debug(f"Got bed slice {humanfriendly.format_size(bed_chunk.nbytes)}")
-        # Probably should do this without iterating over rows, but it's a bit
-        # simpler and lines up better with the array buffering API. The bottleneck
-        # is in the encoding anyway.
-        for values in bed_chunk:
-            j = gt.next_buffer_row()
-            g = np.zeros_like(gt.buff[j])
-            g[values == -127] = -1
-            g[values == 2] = 1
-            g[values == 1, 0] = 1
-            gt.buff[j] = g
-            j = gt_phased.next_buffer_row()
-            gt_phased.buff[j] = False
-            j = gt_mask.next_buffer_row()
-            gt_mask.buff[j] = gt.buff[j] == -1
-        chunk_start = chunk_stop
-    gt.flush()
-    gt_phased.flush()
-    gt_mask.flush()
-    logger.debug(f"GT slice {start}:{stop} done")
+FAM_FIELDS = [
+    ("family_id", str, "U"),
+    ("individual_id", str, "U"),
+    ("paternal_id", str, "U"),
+    ("maternal_id", str, "U"),
+    ("sex", str, "int8"),
+    ("phenotype", str, "int8"),
+]
+FAM_DF_DTYPE = dict([(f[0], f[1]) for f in FAM_FIELDS])
+FAM_ARRAY_DTYPE = dict([(f[0], f[2]) for f in FAM_FIELDS])
+
+BIM_FIELDS = [
+    ("contig", str, "U"),
+    ("variant_id", str, "U"),
+    ("cm_position", "float32", "float32"),
+    ("position", "int32", "int32"),
+    ("allele_1", str, "S"),
+    ("allele_2", str, "S"),
+]
+BIM_DF_DTYPE = dict([(f[0], f[1]) for f in BIM_FIELDS])
+BIM_ARRAY_DTYPE = dict([(f[0], f[2]) for f in BIM_FIELDS])
+
+
+# See https://github.com/sgkit-dev/bio2zarr/issues/409 for discussion
+# on the parameters to Pandas here.
+def read_fam(path):
+    # See: https://www.cog-genomics.org/plink/1.9/formats#fam
+    names = [f[0] for f in FAM_FIELDS]
+    df = pd.read_csv(path, sep=None, names=names, dtype=FAM_DF_DTYPE, engine="python")
+    return df
+
+
+def read_bim(path):
+    # See: https://www.cog-genomics.org/plink/1.9/formats#bim
+    names = [f[0] for f in BIM_FIELDS]
+    df = pd.read_csv(path, sep=None, names=names, dtype=BIM_DF_DTYPE, engine="python")
+    return df
+
+
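Note: `sep=None` with the python engine makes pandas sniff the delimiter, since .fam and .bim files in the wild may be space- or tab-separated. A minimal sketch of what read_fam consumes (the file contents below are invented for illustration):

import io

fam_text = "FAM001 NA12878 0 0 2 -9\nFAM001 NA12891 0 0 1 -9\n"
df = read_fam(io.StringIO(fam_text))
print(df.individual_id.tolist())  # ['NA12878', 'NA12891']

Per FAM_DF_DTYPE every column is read as str; the int8 entries in FAM_ARRAY_DTYPE describe the eventual array encoding, not the DataFrame.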
+@dataclasses.dataclass
+class PlinkPaths:
+    bed_path: str
+    bim_path: str
+    fam_path: str
+
+
+class BedReader:
+    def __init__(self, path, num_variants, num_samples):
+        self.num_variants = num_variants
+        self.num_samples = num_samples
+        self.path = path
+        # bytes per variant: 1 byte per 4 samples, rounded up
+        self.bytes_per_variant = (self.num_samples + 3) // 4
+
+        # TODO open this as a persistent file and support reading from a
+        # stream
+        with open(self.path, "rb") as f:
+            magic = f.read(3)
+            if magic != b"\x6c\x1b\x01":
+                raise ValueError("Invalid BED file magic bytes")
+
+        # We could check the size of the bed file here, but that would
+        # mean we can't work with streams.
+
+        # Initialize the lookup table with shape (256, 4, 2)
+        # 256 possible byte values, 4 samples per byte, 2 alleles per sample
+        lookup = np.zeros((256, 4, 2), dtype=np.int8)
+
+        # For each possible byte value (0-255)
+        for byte in range(256):
+            # For each of the 4 samples encoded in this byte
+            for sample in range(4):
+                # Extract the 2 bits for this sample
+                bits = (byte >> (sample * 2)) & 0b11
+                # Convert PLINK's bit encoding to genotype values
+                if bits == 0b00:
+                    lookup[byte, sample] = [1, 1]
+                elif bits == 0b01:
+                    lookup[byte, sample] = [-1, -1]
+                elif bits == 0b10:
+                    lookup[byte, sample] = [0, 1]
+                elif bits == 0b11:
+                    lookup[byte, sample] = [0, 0]
+
+        self.byte_lookup = lookup
+
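Note: each .bed byte packs four two-bit genotype codes, least-significant pair first, and the loop above enumerates all 256 byte values once so that decoding becomes a single table lookup. A self-contained spot-check of the bit layout (editor's sketch; the expected values follow directly from the mapping above):

byte = 0b00011011
pairs = [(byte >> (sample * 2)) & 0b11 for sample in range(4)]
assert pairs == [0b11, 0b10, 0b01, 0b00]
# Via the table these decode to [0, 0], [0, 1], [-1, -1], [1, 1]:
# hom-A2, het, missing, hom-A1.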
+    def iter_decode(self, start, stop, buffer_size=None):
+        """
+        Iterate over the variants in the specified window
+        with the specified approximate buffer size in bytes (default=10MiB).
+        """
+        if buffer_size is None:
+            buffer_size = 10 * 1024 * 1024
+        variants_per_read = max(1, int(buffer_size / self.bytes_per_variant))
+        for off in range(start, stop, variants_per_read):
+            genotypes = self.decode(off, min(off + variants_per_read, stop))
+            yield from genotypes
+
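Note: iter_decode trades memory for I/O by decoding many variants per read while still yielding one (num_samples, 2) array per variant. The buffering arithmetic, worked for an invented cohort size:

num_samples = 1_000
bytes_per_variant = (num_samples + 3) // 4  # 250 bytes per variant
buffer_size = 10 * 1024 * 1024              # the 10 MiB default
print(max(1, buffer_size // bytes_per_variant))  # 41943 variants per decode() call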
+    def decode(self, start, stop):
+        chunk_size = stop - start
+
+        # Calculate file offsets for the required data
+        # 3 bytes for the magic number at the beginning of the file
+        start_offset = 3 + (start * self.bytes_per_variant)
+        bytes_to_read = chunk_size * self.bytes_per_variant
+
+        logger.debug(
+            f"Reading {chunk_size} variants ({bytes_to_read} bytes) "
+            f"from {self.path}"
+        )
+
+        # TODO make it possible to read sequentially from the same file handle,
+        # seeking only when necessary.
+        with open(self.path, "rb") as f:
+            f.seek(start_offset)
+            chunk_data = f.read(bytes_to_read)
+
+        data_bytes = np.frombuffer(chunk_data, dtype=np.uint8)
+        data_matrix = data_bytes.reshape(chunk_size, self.bytes_per_variant)
+
+        # Apply lookup table to get genotypes
+        # Shape becomes: (chunk_size, bytes_per_variant, 4, 2)
+        all_genotypes = self.byte_lookup[data_matrix]
+
+        # Reshape to get all samples in one dimension
+        # (chunk_size, bytes_per_variant*4, 2)
+        samples_padded = self.bytes_per_variant * 4
+        genotypes_reshaped = all_genotypes.reshape(chunk_size, samples_padded, 2)
+
+        return genotypes_reshaped[:, : self.num_samples]
+
+
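Note: byte_lookup[data_matrix] is NumPy integer ("fancy") indexing: indexing the (256, 4, 2) table with a (chunk_size, bytes_per_variant) array of byte values yields a (chunk_size, bytes_per_variant, 4, 2) result, decoding every packed genotype in one vectorised operation. The same trick on a toy 4-entry table:

import numpy as np

table = np.array([[0, 0], [0, 1], [1, 1], [-1, -1]], dtype=np.int8)
codes = np.array([[0, 3], [2, 1]], dtype=np.uint8)
decoded = table[codes]  # shape (2, 2, 2)
assert decoded[0, 1].tolist() == [-1, -1]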
+class PlinkFormat(vcz.Source):
+    def __init__(self, prefix):
+        # TODO we will need to support multiple chromosomes here to join
+        # plinks into one big zarr. So, these will require multiple
+        # bed and bim files, but should share a .fam
+        self.prefix = str(prefix)
+        self.paths = PlinkPaths(
+            self.prefix + ".bed",
+            self.prefix + ".bim",
+            self.prefix + ".fam",
+        )
+        self.bim = read_bim(self.paths.bim_path)
+        self.fam = read_fam(self.paths.fam_path)
+        self._num_records = self.bim.shape[0]
+        self._num_samples = self.fam.shape[0]
+        self.bed_reader = BedReader(
+            self.paths.bed_path, self.num_records, self.num_samples
+        )
+
+    @property
+    def path(self):
+        return self.prefix
+
+    @property
+    def num_records(self):
+        return self._num_records
+
+    @property
+    def num_samples(self):
+        return self._num_samples
+
+    @property
+    def samples(self):
+        return [vcz.Sample(id=iid) for iid in self.fam.individual_id]
+
+    @property
+    def contigs(self):
+        return [vcz.Contig(id=str(chrom)) for chrom in self.bim.contig.unique()]
+
+    def iter_contig(self, start, stop):
+        chrom_to_contig_index = {contig.id: i for i, contig in enumerate(self.contigs)}
+        for chrom in self.bim.contig[start:stop]:
+            yield chrom_to_contig_index[str(chrom)]
+
+    def iter_field(self, field_name, shape, start, stop):
+        assert field_name == "position"  # Only the position field is supported for plink
+        yield from self.bim.position[start:stop]
+
+    def iter_id(self, start, stop):
+        yield from self.bim.variant_id[start:stop]
+
+    def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
+        alt_iter = self.bim.allele_1.values[start:stop]
+        ref_iter = self.bim.allele_2.values[start:stop]
+        gt_iter = self.bed_reader.iter_decode(start, stop)
+        for alt, ref, gt in zip(alt_iter, ref_iter, gt_iter):
+            alleles = np.full(num_alleles, constants.STR_FILL, dtype="O")
+            alleles[0] = ref
+            alleles[1 : 1 + len(alt)] = alt
+            phased = np.zeros(gt.shape[0], dtype=bool)
+            # rlen is the length of the REF in PLINK, as there are no END annotations
+            yield vcz.VariantData(len(alleles[0]), alleles, gt, phased)
+
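Note: PLINK's allele_2 (A2) is written into slot 0 of the alleles array, the REF position, and allele_1 (A1) into slot 1, consistent with the byte_lookup encoding above where a hom-A2 call decodes to [0, 0]. As the comment removed from encode_genotypes_slice warned, allele 0 is therefore not necessarily the biological reference allele. For an invented BIM record with allele_1 = b"A" and allele_2 = b"G", the yielded alleles array is [b"G", b"A"].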
+    def generate_schema(
+        self,
+        variants_chunk_size=None,
+        samples_chunk_size=None,
+    ):
+        n = self.num_samples
+        m = self.num_records
+        logging.info(f"Scanned plink with {n} samples and {m} variants")
+        dimensions = vcz.standard_dimensions(
+            variants_size=m,
+            variants_chunk_size=variants_chunk_size,
+            samples_size=n,
+            samples_chunk_size=samples_chunk_size,
+            ploidy_size=2,
+            alleles_size=2,
+        )
+        schema_instance = vcz.VcfZarrSchema(
+            format_version=vcz.ZARR_SCHEMA_FORMAT_VERSION,
+            dimensions=dimensions,
+            fields=[],
+        )
+
+        logger.info(
+            "Generating schema with chunks="
+            f"variants={dimensions['variants'].chunk_size}, "
+            f"samples={dimensions['samples'].chunk_size}"
+        )
+        # If we don't have SVLEN or END annotations, the rlen field is defined
+        # as the length of the REF
+        max_len = self.bim.allele_2.values.itemsize
+
+        array_specs = [
+            vcz.ZarrArraySpec(
+                source="position",
+                name="variant_position",
+                dtype="i4",
+                dimensions=["variants"],
+                description=None,
+            ),
+            vcz.ZarrArraySpec(
+                name="variant_allele",
+                dtype="O",
+                dimensions=["variants", "alleles"],
+                description=None,
+            ),
+            vcz.ZarrArraySpec(
+                name="variant_id",
+                dtype="O",
+                dimensions=["variants"],
+                description=None,
+            ),
+            vcz.ZarrArraySpec(
+                name="variant_id_mask",
+                dtype="bool",
+                dimensions=["variants"],
+                description=None,
+            ),
+            vcz.ZarrArraySpec(
+                source=None,
+                name="variant_length",
+                dtype=core.min_int_dtype(0, max_len),
+                dimensions=["variants"],
+                description="Length of each variant",
+            ),
+            vcz.ZarrArraySpec(
+                name="variant_contig",
+                dtype=core.min_int_dtype(0, len(np.unique(self.bim.contig))),
+                dimensions=["variants"],
+                description="Contig/chromosome index for each variant",
+            ),
+            vcz.ZarrArraySpec(
+                name="call_genotype_phased",
+                dtype="bool",
+                dimensions=["variants", "samples"],
+                description=None,
+                compressor=vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config(),
+            ),
+            vcz.ZarrArraySpec(
+                name="call_genotype",
+                dtype="i1",
+                dimensions=["variants", "samples", "ploidy"],
+                description=None,
+                compressor=vcz.DEFAULT_ZARR_COMPRESSOR_GENOTYPES.get_config(),
+            ),
+            vcz.ZarrArraySpec(
+                name="call_genotype_mask",
+                dtype="bool",
+                dimensions=["variants", "samples", "ploidy"],
+                description=None,
+                compressor=vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config(),
+            ),
+        ]
+        schema_instance.fields = array_specs
+        return schema_instance
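Note: a sketch of driving the schema generator directly, assuming PLINK files at an invented prefix; the field names follow the array_specs list above:

fmt = PlinkFormat("tests/data/example")  # expects example.bed/.bim/.fam
schema = fmt.generate_schema(variants_chunk_size=10_000, samples_chunk_size=1_000)
print([spec.name for spec in schema.fields])
# ['variant_position', 'variant_allele', 'variant_id', 'variant_id_mask',
#  'variant_length', 'variant_contig', 'call_genotype_phased',
#  'call_genotype', 'call_genotype_mask']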
 
 
 def convert(
-    bed_path,
-    zarr_path,
+    prefix,
+    out,
     *,
-    show_progress=False,
-    worker_processes=1,
     variants_chunk_size=None,
     samples_chunk_size=None,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
+    show_progress=False,
 ):
-    bed = bed_reader.open_bed(bed_path, num_threads=1)
-    n = bed.iid_count
-    m = bed.sid_count
-    logging.info(f"Scanned plink with {n} samples and {m} variants")
-
-    # FIXME
-    if samples_chunk_size is None:
-        samples_chunk_size = 1000
-    if variants_chunk_size is None:
-        variants_chunk_size = 10_000
-
-    root = zarr.open_group(store=zarr_path, mode="w", **ZARR_FORMAT_KWARGS)
-
-    ploidy = 2
-    shape = [m, n]
-    chunks = [variants_chunk_size, samples_chunk_size]
-    dimensions = ["variants", "samples"]
-
-    # TODO we should be reusing some logic from vcfzarr here on laying
-    # out the basic dataset, and using the schema generator. Currently
-    # we're not using the best Blosc settings for genotypes here.
-    default_compressor = numcodecs.Blosc(cname="zstd", clevel=7)
-
-    a = root.array(
-        "sample_id",
-        data=bed.iid,
-        shape=bed.iid.shape,
-        dtype="str",
-        compressor=default_compressor,
-        chunks=(samples_chunk_size,),
-    )
-    a.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
-    logger.debug("Encoded samples")
-
-    # TODO encode these in slices - but read them in one go to avoid
-    # fetching repeatedly from bim file
-    a = root.array(
-        "variant_position",
-        data=bed.bp_position,
-        shape=bed.bp_position.shape,
-        dtype=np.int32,
-        compressor=default_compressor,
-        chunks=(variants_chunk_size,),
-    )
-    a.attrs["_ARRAY_DIMENSIONS"] = ["variants"]
-    logger.debug("encoded variant_position")
-
-    alleles = np.stack([bed.allele_1, bed.allele_2], axis=1)
-    a = root.array(
-        "variant_allele",
-        data=alleles,
-        shape=alleles.shape,
-        dtype="str",
-        compressor=default_compressor,
-        chunks=(variants_chunk_size, alleles.shape[1]),
+    plink_format = PlinkFormat(prefix)
+    schema_instance = plink_format.generate_schema(
+        variants_chunk_size=variants_chunk_size,
+        samples_chunk_size=samples_chunk_size,
     )
-    a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"]
-    logger.debug("encoded variant_allele")
-
-    # TODO remove this?
-    a = root.empty(
-        name="call_genotype_phased",
-        dtype="bool",
-        shape=list(shape),
-        chunks=list(chunks),
-        compressor=default_compressor,
-        **ZARR_FORMAT_KWARGS,
+    zarr_path = pathlib.Path(out)
+    vzw = vcz.VcfZarrWriter(PlinkFormat, zarr_path)
+    # Rough heuristic to split work up enough to keep utilisation high
+    target_num_partitions = max(1, worker_processes * 4)
+    vzw.init(
+        plink_format,
+        target_num_partitions=target_num_partitions,
+        schema=schema_instance,
     )
-    a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
-
-    shape += [ploidy]
-    dimensions += ["ploidy"]
-    a = root.empty(
-        name="call_genotype",
-        dtype="i1",
-        shape=list(shape),
-        chunks=list(chunks),
-        compressor=default_compressor,
-        **ZARR_FORMAT_KWARGS,
-    )
-    a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
-
-    a = root.empty(
-        name="call_genotype_mask",
-        dtype="bool",
-        shape=list(shape),
-        chunks=list(chunks),
-        compressor=default_compressor,
-        **ZARR_FORMAT_KWARGS,
-    )
-    a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
-
-    del bed
-
-    num_slices = max(1, worker_processes * 4)
-    slices = core.chunk_aligned_slices(a, num_slices)
-
-    total_chunks = sum(a.nchunks for _, a in root.arrays())
-
-    progress_config = core.ProgressConfig(
-        total=total_chunks, title="Convert", units="chunks", show=show_progress
+    vzw.encode_all_partitions(
+        worker_processes=worker_processes,
+        show_progress=show_progress,
     )
-    with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
-        for start, stop in slices:
-            pwm.submit(encode_genotypes_slice, bed_path, zarr_path, start, stop)
-
-    # TODO also add atomic swap like VCF. Should be abstracted to
-    # share basic code for setting up the variation dataset zarr
-    zarr.consolidate_metadata(zarr_path)
-
-
-# FIXME do this more efficiently - currently reading the whole thing
-# in for convenience, and also comparing call-by-call
-def validate(bed_path, zarr_path):
-    root = zarr.open(store=zarr_path, mode="r")
-    call_genotype = root["call_genotype"][:]
-
-    bed = bed_reader.open_bed(bed_path, count_A1=False, num_threads=1)
-
-    assert call_genotype.shape[0] == bed.sid_count
-    assert call_genotype.shape[1] == bed.iid_count
-    bed_genotypes = bed.read(dtype="int8").T
-    assert call_genotype.shape[0] == bed_genotypes.shape[0]
-    assert call_genotype.shape[1] == bed_genotypes.shape[1]
-    assert call_genotype.shape[2] == 2
-
-    row_id = 0
-    for bed_row, zarr_row in zip(bed_genotypes, call_genotype):
-        # print("ROW", row_id)
-        # print(bed_row, zarr_row)
-        row_id += 1
-        for bed_call, zarr_call in zip(bed_row, zarr_row):
-            if bed_call == -127:
-                assert list(zarr_call) == [-1, -1]
-            elif bed_call == 0:
-                assert list(zarr_call) == [0, 0]
-            elif bed_call == 1:
-                assert list(zarr_call) == [1, 0]
-            elif bed_call == 2:
-                assert list(zarr_call) == [1, 1]
-            else:  # pragma no cover
-                raise AssertionError(f"Unexpected bed call {bed_call}")
+    vzw.finalise(show_progress)
+    vzw.create_index()
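Note: the public entry point now takes a PLINK file-set prefix and an output path rather than an explicit .bed path. A hedged end-to-end sketch (paths invented for illustration):

from bio2zarr import plink

plink.convert(
    "tests/data/example",  # resolves example.bed, example.bim and example.fam
    "example.vcz",
    worker_processes=4,
    show_progress=True,
)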