bio2zarr 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
bio2zarr/plink.py CHANGED
@@ -1,207 +1,334 @@
+import dataclasses
 import logging
+import pathlib
 
-import bed_reader
-import humanfriendly
-import numcodecs
 import numpy as np
-import zarr
+import pandas as pd
 
-from bio2zarr.zarr_utils import ZARR_FORMAT_KWARGS
-
-from . import core
+from bio2zarr import constants, core, vcz
+from bio2zarr.zarr_utils import STRING_DTYPE_NAME
 
 logger = logging.getLogger(__name__)
 
 
-def encode_genotypes_slice(bed_path, zarr_path, start, stop):
-    # We need to count the A2 alleles here if we want to keep the
-    # alleles reported as allele_1, allele_2. It's obvious here what
-    # the correct approach is, but it is important to note that the
-    # 0th allele is *not* necessarily the REF for these datasets.
-    bed = bed_reader.open_bed(bed_path, num_threads=1, count_A1=False)
-    root = zarr.open(store=zarr_path, mode="a", **ZARR_FORMAT_KWARGS)
-    gt = core.BufferedArray(root["call_genotype"], start)
-    gt_mask = core.BufferedArray(root["call_genotype_mask"], start)
-    gt_phased = core.BufferedArray(root["call_genotype_phased"], start)
-    variants_chunk_size = gt.array.chunks[0]
-    assert start % variants_chunk_size == 0
-
-    logger.debug(f"Reading slice {start}:{stop}")
-    chunk_start = start
-    while chunk_start < stop:
-        chunk_stop = min(chunk_start + variants_chunk_size, stop)
-        logger.debug(f"Reading bed slice {chunk_start}:{chunk_stop}")
-        bed_chunk = bed.read(slice(chunk_start, chunk_stop), dtype=np.int8).T
-        logger.debug(f"Got bed slice {humanfriendly.format_size(bed_chunk.nbytes)}")
-        # Probably should do this without iterating over rows, but it's a bit
-        # simpler and lines up better with the array buffering API. The bottleneck
-        # is in the encoding anyway.
-        for values in bed_chunk:
-            j = gt.next_buffer_row()
-            g = np.zeros_like(gt.buff[j])
-            g[values == -127] = -1
-            g[values == 2] = 1
-            g[values == 1, 0] = 1
-            gt.buff[j] = g
-            j = gt_phased.next_buffer_row()
-            gt_phased.buff[j] = False
-            j = gt_mask.next_buffer_row()
-            gt_mask.buff[j] = gt.buff[j] == -1
-        chunk_start = chunk_stop
-    gt.flush()
-    gt_phased.flush()
-    gt_mask.flush()
-    logger.debug(f"GT slice {start}:{stop} done")
+FAM_FIELDS = [
+    ("family_id", str, "U"),
+    ("individual_id", str, "U"),
+    ("paternal_id", str, "U"),
+    ("maternal_id", str, "U"),
+    ("sex", str, "int8"),
+    ("phenotype", str, "int8"),
+]
+FAM_DF_DTYPE = dict([(f[0], f[1]) for f in FAM_FIELDS])
+FAM_ARRAY_DTYPE = dict([(f[0], f[2]) for f in FAM_FIELDS])
+
+BIM_FIELDS = [
+    ("contig", str, "U"),
+    ("variant_id", str, "U"),
+    ("cm_position", "float32", "float32"),
+    ("position", "int32", "int32"),
+    ("allele_1", str, "S"),
+    ("allele_2", str, "S"),
+]
+BIM_DF_DTYPE = dict([(f[0], f[1]) for f in BIM_FIELDS])
+BIM_ARRAY_DTYPE = dict([(f[0], f[2]) for f in BIM_FIELDS])
+
+
+# See https://github.com/sgkit-dev/bio2zarr/issues/409 for discussion
+# on the parameters to Pandas here.
+def read_fam(path):
+    # See: https://www.cog-genomics.org/plink/1.9/formats#fam
+    names = [f[0] for f in FAM_FIELDS]
+    df = pd.read_csv(path, sep=None, names=names, dtype=FAM_DF_DTYPE, engine="python")
+    return df
+
+
+def read_bim(path):
+    # See: https://www.cog-genomics.org/plink/1.9/formats#bim
+    names = [f[0] for f in BIM_FIELDS]
+    df = pd.read_csv(path, sep=None, names=names, dtype=BIM_DF_DTYPE, engine="python")
+    return df
+
+
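As a side note on the parsing above: `sep=None` with `engine="python"` lets pandas sniff the delimiter, which is useful because .fam/.bim files may be space- or tab-delimited. A minimal sketch of what `read_fam` accepts (not part of the diff; the two-line fileset is invented, and the column names mirror `FAM_FIELDS` above):

```python
import io

import pandas as pd

# A whitespace-delimited, headerless .fam fragment (invented data).
fam_text = """\
FAM1 IND1 0 0 1 -9
FAM1 IND2 0 0 2 -9
"""
names = ["family_id", "individual_id", "paternal_id", "maternal_id", "sex", "phenotype"]
df = pd.read_csv(io.StringIO(fam_text), sep=None, names=names, dtype=str, engine="python")
print(df.individual_id.tolist())  # ['IND1', 'IND2']
```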
+@dataclasses.dataclass
+class PlinkPaths:
+    bed_path: str
+    bim_path: str
+    fam_path: str
+
+
+class BedReader:
+    def __init__(self, path, num_variants, num_samples):
+        self.num_variants = num_variants
+        self.num_samples = num_samples
+        self.path = path
+        # bytes per variant: 1 byte per 4 samples, rounded up
+        self.bytes_per_variant = (self.num_samples + 3) // 4
+
+        # TODO open this as a persistent file and support reading from a
+        # stream
+        with open(self.path, "rb") as f:
+            magic = f.read(3)
+            if magic != b"\x6c\x1b\x01":
+                raise ValueError("Invalid BED file magic bytes")
+
+        # We could check the size of the bed file here, but that would
+        # mean we can't work with streams.
+
+        # Initialize the lookup table with shape (256, 4, 2):
+        # 256 possible byte values, 4 samples per byte, 2 alleles per sample
+        lookup = np.zeros((256, 4, 2), dtype=np.int8)
+
+        # For each possible byte value (0-255)
+        for byte in range(256):
+            # For each of the 4 samples encoded in this byte
+            for sample in range(4):
+                # Extract the 2 bits for this sample
+                bits = (byte >> (sample * 2)) & 0b11
+                # Convert PLINK's bit encoding to genotype values
+                if bits == 0b00:
+                    lookup[byte, sample] = [1, 1]
+                elif bits == 0b01:
+                    lookup[byte, sample] = [-1, -1]
+                elif bits == 0b10:
+                    lookup[byte, sample] = [0, 1]
+                elif bits == 0b11:
+                    lookup[byte, sample] = [0, 0]
+
+        self.byte_lookup = lookup
+
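To make the bit layout concrete, here is a worked example (not part of the diff) that rebuilds the same table and decodes one byte. PLINK stores sample 0 in the lowest-order bit pair, and since `allele_2` is treated as REF in this module, `0b00` (homozygous for allele 1) maps to `[1, 1]`:

```python
import numpy as np

# Rebuild the same lookup table as BedReader.__init__ above.
lookup = np.zeros((256, 4, 2), dtype=np.int8)
for byte in range(256):
    for sample in range(4):
        bits = (byte >> (sample * 2)) & 0b11
        if bits == 0b00:
            lookup[byte, sample] = [1, 1]    # homozygous allele_1 (ALT)
        elif bits == 0b01:
            lookup[byte, sample] = [-1, -1]  # missing genotype
        elif bits == 0b10:
            lookup[byte, sample] = [0, 1]    # heterozygous
        else:
            lookup[byte, sample] = [0, 0]    # homozygous allele_2 (REF)

# Byte 0b11011000 packs four samples, sample 0 in the two lowest bits:
# sample 0 -> 0b00, sample 1 -> 0b10, sample 2 -> 0b01, sample 3 -> 0b11
assert lookup[0b11011000].tolist() == [[1, 1], [0, 1], [-1, -1], [0, 0]]
```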
+    def iter_decode(self, start, stop, buffer_size=None):
+        """
+        Iterate over the variants in the specified window
+        with the specified approximate buffer size in bytes (default=10MiB).
+        """
+        if buffer_size is None:
+            buffer_size = 10 * 1024 * 1024
+        variants_per_read = max(1, int(buffer_size / self.bytes_per_variant))
+        for off in range(start, stop, variants_per_read):
+            genotypes = self.decode(off, min(off + variants_per_read, stop))
+            yield from genotypes
+
+    def decode(self, start, stop):
+        chunk_size = stop - start
+
+        # Calculate file offsets for the required data:
+        # 3 bytes for the magic number at the beginning of the file
+        start_offset = 3 + (start * self.bytes_per_variant)
+        bytes_to_read = chunk_size * self.bytes_per_variant
+
+        logger.debug(
+            f"Reading {chunk_size} variants ({bytes_to_read} bytes) "
+            f"from {self.path}"
+        )
+
+        # TODO make it possible to read sequentially from the same file handle,
+        # seeking only when necessary.
+        with open(self.path, "rb") as f:
+            f.seek(start_offset)
+            chunk_data = f.read(bytes_to_read)
+
+        data_bytes = np.frombuffer(chunk_data, dtype=np.uint8)
+        data_matrix = data_bytes.reshape(chunk_size, self.bytes_per_variant)
+
+        # Apply the lookup table to get genotypes.
+        # Shape becomes: (chunk_size, bytes_per_variant, 4, 2)
+        all_genotypes = self.byte_lookup[data_matrix]
+
+        # Reshape to get all samples in one dimension:
+        # (chunk_size, bytes_per_variant * 4, 2)
+        samples_padded = self.bytes_per_variant * 4
+        genotypes_reshaped = all_genotypes.reshape(chunk_size, samples_padded, 2)
+
+        return genotypes_reshaped[:, : self.num_samples]
+
+
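A small usage sketch of the two read paths (not part of the diff; the fileset and its dimensions are invented):

```python
# Assumes example.bed was written by PLINK for 100 variants and 10 samples.
reader = BedReader("example.bed", num_variants=100, num_samples=10)

# One bulk read: an int8 array of shape (20, 10, 2) for variants 0..20.
block = reader.decode(0, 20)

# Or stream one variant at a time, reading at most ~1 KiB per file read.
for genotypes in reader.iter_decode(0, 100, buffer_size=1024):
    assert genotypes.shape == (10, 2)
```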
+class PlinkFormat(vcz.Source):
+    def __init__(self, prefix):
+        # TODO we will need to support multiple chromosomes here to join
+        # plinks into one big zarr. So, these will require multiple
+        # bed and bim files, but should share a .fam
+        self.prefix = str(prefix)
+        self.paths = PlinkPaths(
+            self.prefix + ".bed",
+            self.prefix + ".bim",
+            self.prefix + ".fam",
+        )
+        self.bim = read_bim(self.paths.bim_path)
+        self.fam = read_fam(self.paths.fam_path)
+        self._num_records = self.bim.shape[0]
+        self._num_samples = self.fam.shape[0]
+        self.bed_reader = BedReader(
+            self.paths.bed_path, self.num_records, self.num_samples
+        )
+
+    @property
+    def path(self):
+        return self.prefix
+
+    @property
+    def num_records(self):
+        return self._num_records
+
+    @property
+    def num_samples(self):
+        return self._num_samples
+
+    @property
+    def samples(self):
+        return [vcz.Sample(id=iid) for iid in self.fam.individual_id]
+
+    @property
+    def contigs(self):
+        return [vcz.Contig(id=str(chrom)) for chrom in self.bim.contig.unique()]
+
+    def iter_contig(self, start, stop):
+        chrom_to_contig_index = {contig.id: i for i, contig in enumerate(self.contigs)}
+        for chrom in self.bim.contig[start:stop]:
+            yield chrom_to_contig_index[str(chrom)]
+
+    def iter_field(self, field_name, shape, start, stop):
+        assert field_name == "position"  # Only the position field is supported from plink
+        yield from self.bim.position[start:stop]
+
+    def iter_id(self, start, stop):
+        yield from self.bim.variant_id[start:stop]
+
+    def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
+        alt_iter = self.bim.allele_1.values[start:stop]
+        ref_iter = self.bim.allele_2.values[start:stop]
+        gt_iter = self.bed_reader.iter_decode(start, stop)
+        for alt, ref, gt in zip(alt_iter, ref_iter, gt_iter):
+            alleles = np.full(num_alleles, constants.STR_FILL, dtype=STRING_DTYPE_NAME)
+            alleles[0] = ref
+            alleles[1 : 1 + len(alt)] = alt
+            phased = np.zeros(gt.shape[0], dtype=bool)
+            # rlen is the length of the REF in PLINK as there are no END annotations
+            yield vcz.VariantData(len(alleles[0]), alleles, gt, phased)
+
+    def generate_schema(
+        self,
+        variants_chunk_size=None,
+        samples_chunk_size=None,
+    ):
+        n = self.num_samples
+        m = self.num_records
+        logging.info(f"Scanned plink with {n} samples and {m} variants")
+        dimensions = vcz.standard_dimensions(
+            variants_size=m,
+            variants_chunk_size=variants_chunk_size,
+            samples_size=n,
+            samples_chunk_size=samples_chunk_size,
+            ploidy_size=2,
+            alleles_size=2,
+        )
+        schema_instance = vcz.VcfZarrSchema(
+            format_version=vcz.ZARR_SCHEMA_FORMAT_VERSION,
+            dimensions=dimensions,
+            fields=[],
+        )
+
+        logger.info(
+            "Generating schema with chunks="
+            f"variants={dimensions['variants'].chunk_size}, "
+            f"samples={dimensions['samples'].chunk_size}"
+        )
+        # If we don't have SVLEN or END annotations, the rlen field is defined
+        # as the length of the REF.
+        # Explicitly cast to a fixed size array to support pandas 2.x and 3.x
+        allele_2_array = self.bim.allele_2.values.astype("S")
+        max_len = allele_2_array.itemsize
+        array_specs = [
+            vcz.ZarrArraySpec(
+                source="position",
+                name="variant_position",
+                dtype="i4",
+                dimensions=["variants"],
+                description=None,
+            ),
+            vcz.ZarrArraySpec(
+                name="variant_allele",
+                dtype=STRING_DTYPE_NAME,
+                dimensions=["variants", "alleles"],
+                description=None,
+            ),
+            vcz.ZarrArraySpec(
+                name="variant_id",
+                dtype=STRING_DTYPE_NAME,
+                dimensions=["variants"],
+                description=None,
+            ),
+            vcz.ZarrArraySpec(
+                name="variant_id_mask",
+                dtype="bool",
+                dimensions=["variants"],
+                description=None,
+            ),
+            vcz.ZarrArraySpec(
+                source=None,
+                name="variant_length",
+                dtype=core.min_int_dtype(0, max_len),
+                dimensions=["variants"],
+                description="Length of each variant",
+            ),
+            vcz.ZarrArraySpec(
+                name="variant_contig",
+                dtype=core.min_int_dtype(0, len(np.unique(self.bim.contig))),
+                dimensions=["variants"],
+                description="Contig/chromosome index for each variant",
+            ),
+            vcz.ZarrArraySpec(
+                name="call_genotype_phased",
+                dtype="bool",
+                dimensions=["variants", "samples"],
+                description=None,
+                compressor=vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config(),
+            ),
+            vcz.ZarrArraySpec(
+                name="call_genotype",
+                dtype="i1",
+                dimensions=["variants", "samples", "ploidy"],
+                description=None,
+                compressor=vcz.DEFAULT_ZARR_COMPRESSOR_GENOTYPES.get_config(),
+            ),
+            vcz.ZarrArraySpec(
+                name="call_genotype_mask",
+                dtype="bool",
+                dimensions=["variants", "samples", "ploidy"],
+                description=None,
+                compressor=vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config(),
+            ),
+        ]
+        schema_instance.fields = array_specs
+        return schema_instance
 
 
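One subtlety in `generate_schema` above: casting the `allele_2` column with `astype("S")` yields a fixed-width bytes array, so its `itemsize` is the byte length of the longest REF allele, which then bounds the `variant_length` dtype via `core.min_int_dtype`. A quick sketch (not part of the diff; the alleles are invented):

```python
import numpy as np

alleles = np.array(["A", "TTG", "CA"], dtype=object)
fixed = alleles.astype("S")  # fixed-width bytes array
print(fixed.dtype)     # |S3 -- width of the longest entry
print(fixed.itemsize)  # 3  -- the max REF length used for variant_length
# core.min_int_dtype(0, 3) can then pick the smallest integer dtype
# that holds values in [0, 3] (presumably int8).
```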
 def convert(
-    bed_path,
-    zarr_path,
+    prefix,
+    out,
     *,
-    show_progress=False,
-    worker_processes=1,
     variants_chunk_size=None,
     samples_chunk_size=None,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
+    show_progress=False,
 ):
-    bed = bed_reader.open_bed(bed_path, num_threads=1)
-    n = bed.iid_count
-    m = bed.sid_count
-    logging.info(f"Scanned plink with {n} samples and {m} variants")
-
-    # FIXME
-    if samples_chunk_size is None:
-        samples_chunk_size = 1000
-    if variants_chunk_size is None:
-        variants_chunk_size = 10_000
-
-    root = zarr.open_group(store=zarr_path, mode="w", **ZARR_FORMAT_KWARGS)
-
-    ploidy = 2
-    shape = [m, n]
-    chunks = [variants_chunk_size, samples_chunk_size]
-    dimensions = ["variants", "samples"]
-
-    # TODO we should be reusing some logic from vcfzarr here on laying
-    # out the basic dataset, and using the schema generator. Currently
-    # we're not using the best Blosc settings for genotypes here.
-    default_compressor = numcodecs.Blosc(cname="zstd", clevel=7)
-
-    a = root.array(
-        "sample_id",
-        data=bed.iid,
-        shape=bed.iid.shape,
-        dtype="str",
-        compressor=default_compressor,
-        chunks=(samples_chunk_size,),
-    )
-    a.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
-    logger.debug("Encoded samples")
-
-    # TODO encode these in slices - but read them in one go to avoid
-    # fetching repeatedly from bim file
-    a = root.array(
-        "variant_position",
-        data=bed.bp_position,
-        shape=bed.bp_position.shape,
-        dtype=np.int32,
-        compressor=default_compressor,
-        chunks=(variants_chunk_size,),
-    )
-    a.attrs["_ARRAY_DIMENSIONS"] = ["variants"]
-    logger.debug("encoded variant_position")
-
-    alleles = np.stack([bed.allele_1, bed.allele_2], axis=1)
-    a = root.array(
-        "variant_allele",
-        data=alleles,
-        shape=alleles.shape,
-        dtype="str",
-        compressor=default_compressor,
-        chunks=(variants_chunk_size, alleles.shape[1]),
-    )
-    a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"]
-    logger.debug("encoded variant_allele")
-
-    # TODO remove this?
-    a = root.empty(
-        name="call_genotype_phased",
-        dtype="bool",
-        shape=list(shape),
-        chunks=list(chunks),
-        compressor=default_compressor,
-        **ZARR_FORMAT_KWARGS,
+    plink_format = PlinkFormat(prefix)
+    schema_instance = plink_format.generate_schema(
+        variants_chunk_size=variants_chunk_size,
+        samples_chunk_size=samples_chunk_size,
     )
-    a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
-
-    shape += [ploidy]
-    dimensions += ["ploidy"]
-    a = root.empty(
-        name="call_genotype",
-        dtype="i1",
-        shape=list(shape),
-        chunks=list(chunks),
-        compressor=default_compressor,
-        **ZARR_FORMAT_KWARGS,
+    zarr_path = pathlib.Path(out)
+    vzw = vcz.VcfZarrWriter(PlinkFormat, zarr_path)
+    # Rough heuristic to split work up enough to keep utilisation high
+    target_num_partitions = max(1, worker_processes * 4)
+    vzw.init(
+        plink_format,
+        target_num_partitions=target_num_partitions,
+        schema=schema_instance,
     )
-    a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
-
-    a = root.empty(
-        name="call_genotype_mask",
-        dtype="bool",
-        shape=list(shape),
-        chunks=list(chunks),
-        compressor=default_compressor,
-        **ZARR_FORMAT_KWARGS,
-    )
-    a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
-
-    del bed
-
-    num_slices = max(1, worker_processes * 4)
-    slices = core.chunk_aligned_slices(a, num_slices)
-
-    total_chunks = sum(a.nchunks for _, a in root.arrays())
-
-    progress_config = core.ProgressConfig(
-        total=total_chunks, title="Convert", units="chunks", show=show_progress
+    vzw.encode_all_partitions(
+        worker_processes=worker_processes,
+        show_progress=show_progress,
     )
-    with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
-        for start, stop in slices:
-            pwm.submit(encode_genotypes_slice, bed_path, zarr_path, start, stop)
-
-    # TODO also add atomic swap like VCF. Should be abstracted to
-    # share basic code for setting up the variation dataset zarr
-    zarr.consolidate_metadata(zarr_path)
-
-
-# FIXME do this more efficiently - currently reading the whole thing
-# in for convenience, and also comparing call-by-call
-def validate(bed_path, zarr_path):
-    root = zarr.open(store=zarr_path, mode="r")
-    call_genotype = root["call_genotype"][:]
-
-    bed = bed_reader.open_bed(bed_path, count_A1=False, num_threads=1)
-
-    assert call_genotype.shape[0] == bed.sid_count
-    assert call_genotype.shape[1] == bed.iid_count
-    bed_genotypes = bed.read(dtype="int8").T
-    assert call_genotype.shape[0] == bed_genotypes.shape[0]
-    assert call_genotype.shape[1] == bed_genotypes.shape[1]
-    assert call_genotype.shape[2] == 2
-
-    row_id = 0
-    for bed_row, zarr_row in zip(bed_genotypes, call_genotype):
-        # print("ROW", row_id)
-        # print(bed_row, zarr_row)
-        row_id += 1
-        for bed_call, zarr_call in zip(bed_row, zarr_row):
-            if bed_call == -127:
-                assert list(zarr_call) == [-1, -1]
-            elif bed_call == 0:
-                assert list(zarr_call) == [0, 0]
-            elif bed_call == 1:
-                assert list(zarr_call) == [1, 0]
-            elif bed_call == 2:
-                assert list(zarr_call) == [1, 1]
-            else:  # pragma no cover
-                raise AssertionError(f"Unexpected bed call {bed_call}")
+    vzw.finalise(show_progress)
+    vzw.create_index()
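Finally, a usage sketch of the reworked entry point (not part of the diff; the paths are invented, and a data/example.{bed,bim,fam} fileset is assumed to exist):

```python
from bio2zarr import plink

# Converts data/example.{bed,bim,fam} into a VCF Zarr store at data/example.vcz.
plink.convert(
    "data/example",
    "data/example.vcz",
    variants_chunk_size=10_000,
    samples_chunk_size=1_000,
    worker_processes=4,
    show_progress=True,
)
```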