bio2zarr 0.0.9__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of bio2zarr might be problematic. Click here for more details.

@@ -0,0 +1,230 @@
1
+ import cyvcf2
2
+ import numpy as np
3
+ import numpy.testing as nt
4
+ import tqdm
5
+ import zarr
6
+
7
+ from .. import constants
8
+
9
+
10
def assert_all_missing_float(a):
    """
    Assert that every value in ``a`` equals
    ``constants.FLOAT32_MISSING_AS_INT32`` at the bit level.

    The input is converted to float32 and then reinterpreted as int32, so
    the comparison is performed on raw bit patterns rather than on
    floating point values.
    """
    float_values = np.array(a, dtype=np.float32)
    bit_patterns = float_values.view(np.int32)
    nt.assert_equal(bit_patterns, constants.FLOAT32_MISSING_AS_INT32)
13
+
14
+
15
def assert_all_fill_float(a):
    """
    Assert that every value in ``a`` equals
    ``constants.FLOAT32_FILL_AS_INT32`` at the bit level.

    The input is converted to float32 and then reinterpreted as int32, so
    the comparison is performed on raw bit patterns rather than on
    floating point values.
    """
    float_values = np.array(a, dtype=np.float32)
    bit_patterns = float_values.view(np.int32)
    nt.assert_equal(bit_patterns, constants.FLOAT32_FILL_AS_INT32)
18
+
19
+
20
def assert_all_missing_int(a):
    """
    Assert that every value in ``a``, converted to an integer array,
    equals ``constants.INT_MISSING``.
    """
    int_values = np.array(a, dtype=int)
    expected = constants.INT_MISSING
    nt.assert_equal(int_values, expected)
23
+
24
+
25
def assert_all_fill_int(a):
    """
    Assert that every value in ``a``, converted to an integer array,
    equals ``constants.INT_FILL``.
    """
    int_values = np.array(a, dtype=int)
    expected = constants.INT_FILL
    nt.assert_equal(int_values, expected)
28
+
29
+
30
def assert_all_missing_string(a):
    """
    Assert that ``a`` compares equal to ``constants.STR_MISSING`` under
    ``numpy.testing.assert_equal``.
    """
    expected = constants.STR_MISSING
    nt.assert_equal(a, expected)
32
+
33
+
34
def assert_all_fill_string(a):
    """
    Assert that ``a`` compares equal to ``constants.STR_FILL`` under
    ``numpy.testing.assert_equal``.
    """
    expected = constants.STR_FILL
    nt.assert_equal(a, expected)
36
+
37
+
38
def assert_all_fill(zarr_val, vcf_type):
    """
    Assert that ``zarr_val`` consists entirely of the fill value for the
    given VCF type.

    ``vcf_type`` selects the type-specific checker: "Integer", "Float",
    or "String"/"Character". Any other type is rejected with an
    AssertionError.
    """
    if vcf_type == "Integer":
        assert_all_fill_int(zarr_val)
        return
    if vcf_type in ("String", "Character"):
        assert_all_fill_string(zarr_val)
        return
    if vcf_type == "Float":
        assert_all_fill_float(zarr_val)
        return
    # No fill checker exists for any other VCF type.
    assert False  # pragma: no cover  # noqa PT015
47
+
48
+
49
def assert_all_missing(zarr_val, vcf_type):
    """
    Assert that ``zarr_val`` consists entirely of the missing value for
    the given VCF type.

    ``vcf_type`` selects the check: "Integer", "Float" and
    "String"/"Character" delegate to the type-specific checkers, while a
    "Flag" must compare equal to False. Any other type is rejected with
    an AssertionError.
    """
    if vcf_type == "Flag":
        assert zarr_val == False  # noqa 712
        return
    if vcf_type == "Integer":
        assert_all_missing_int(zarr_val)
        return
    if vcf_type in ("String", "Character"):
        assert_all_missing_string(zarr_val)
        return
    if vcf_type == "Float":
        assert_all_missing_float(zarr_val)
        return
    # No missing-value checker exists for any other VCF type.
    assert False  # pragma: no cover  # noqa PT015
60
+
61
+
62
def assert_info_val_missing(zarr_val, vcf_type):
    """
    Assert that the Zarr value stored for an INFO field is entirely
    missing for the given VCF type.
    """
    assert_all_missing(zarr_val=zarr_val, vcf_type=vcf_type)
64
+
65
+
66
def assert_format_val_missing(zarr_val, vcf_type):
    """
    Assert that the Zarr value stored for a FORMAT field is entirely
    missing for the given VCF type.

    FORMAT fields use the same check as INFO fields.
    """
    assert_info_val_missing(zarr_val=zarr_val, vcf_type=vcf_type)
68
+
69
+
70
+ # Note: checking exact equality may prove problematic here
71
+ # but we should be deterministically storing what cyvcf2
72
+ # provides, which should compare equal.
73
+
74
+
75
def assert_info_val_equal(vcf_val, zarr_val, vcf_type):
    """
    Assert that an INFO value read from the VCF matches the value stored
    in Zarr.

    Three shapes of ``vcf_val`` are handled:

    - String/Character types: ``vcf_val`` is a comma separated string. A
      plain ``str`` Zarr value must match it exactly; otherwise the split
      parts must match the leading Zarr entries and the rest must be fill.
    - Tuples: ``None`` entries are replaced by the per-type missing value
      before comparison, and any trailing Zarr entries must be fill.
    - Anything else is treated as a scalar that must equal the first Zarr
      entry, with any further entries being fill.

    Raises AssertionError on any mismatch.
    """
    assert vcf_val is not None

    if vcf_type in ("String", "Character"):
        parts = vcf_val.split(",")
        num_parts = len(parts)
        if not isinstance(zarr_val, str):
            nt.assert_equal(parts, zarr_val[:num_parts])
            assert_all_fill(zarr_val[num_parts:], vcf_type)
        else:
            # Scalar
            assert num_parts == 1
            assert vcf_val == zarr_val
        return

    if isinstance(vcf_val, tuple):
        missing_value_for = {
            "Integer": constants.INT_MISSING,
            "Float": constants.FLOAT32_MISSING,
        }
        substituted = []
        missing_positions = []
        for index, item in enumerate(vcf_val):
            if item is None:
                missing_positions.append(index)
                # Looked up only when needed, so a type without a mapped
                # missing value is fine as long as nothing is missing.
                substituted.append(missing_value_for[vcf_type])
            else:
                substituted.append(item)
        missing = np.array(missing_positions, dtype=int)
        expected = np.array(substituted)
        num_values = len(expected)
        # We are checking for int missing twice here, but it's necessary to have
        # a separate check for floats because different NaNs compare equal
        nt.assert_equal(expected, zarr_val[:num_values])
        assert_all_missing(zarr_val[missing], vcf_type)
        if num_values < len(zarr_val):
            assert_all_fill(zarr_val[num_values:], vcf_type)
        return

    # Scalar
    zarr_val = np.array(zarr_val, ndmin=1)
    assert len(zarr_val.shape) == 1
    assert vcf_val == zarr_val[0]
    if len(zarr_val) > 1:
        assert_all_fill(zarr_val[1:], vcf_type)
110
+
111
+
112
def assert_format_val_equal(vcf_val, zarr_val, vcf_type):
    """
    Assert that a FORMAT value read from the VCF matches the value stored
    in Zarr.

    ``vcf_val`` must be a numpy array whose first dimension has the same
    length as that of ``zarr_val``.

    - String/Character types are compared element by element; entries
      containing commas are split and compared against the leading Zarr
      entries, with the remainder required to be fill.
    - Other types are compared as arrays. A trailing length-1 dimension
      on ``vcf_val`` is dropped, extra Zarr columns must be fill, Integer
      sentinels in ``vcf_val`` are rewritten *in place* to the Zarr
      sentinels, and Float values are additionally compared bit for bit.

    Raises AssertionError on any mismatch.
    """
    assert vcf_val is not None
    assert isinstance(vcf_val, np.ndarray)

    if vcf_type in ("String", "Character"):
        assert len(vcf_val) == len(zarr_val)
        # Note: deliberately duplicating logic here between this and the
        # INFO col above to make sure all combinations are covered by tests
        for vcf_item, zarr_item in zip(vcf_val, zarr_val):
            parts = vcf_item.split(",")
            num_parts = len(parts)
            if num_parts != 1:
                nt.assert_equal(parts, zarr_item[:num_parts])
                assert_all_fill(zarr_item[num_parts:], vcf_type)
            else:
                assert vcf_item == zarr_item
        return

    assert vcf_val.shape[0] == zarr_val.shape[0]
    zarr_rank = len(zarr_val.shape)
    if len(vcf_val.shape) == zarr_rank + 1:
        # The VCF side carries one extra axis, which must be of length 1.
        assert vcf_val.shape[-1] == 1
        vcf_val = vcf_val[..., 0]
    vcf_rank = len(vcf_val.shape)
    assert vcf_rank <= 2
    assert vcf_rank == zarr_rank
    if vcf_rank == 2:
        width = vcf_val.shape[1]
        if zarr_val.shape[1] != width:
            # Columns beyond the VCF width must be fill; drop them afterwards.
            assert_all_fill(zarr_val[:, width:], vcf_type)
            zarr_val = zarr_val[:, :width]
    assert vcf_val.shape == zarr_val.shape
    if vcf_type == "Integer":
        # Translate the VCF sentinels to the Zarr ones (mutates vcf_val).
        vcf_val[vcf_val == constants.VCF_INT_MISSING] = constants.INT_MISSING
        vcf_val[vcf_val == constants.VCF_INT_FILL] = constants.INT_FILL
    elif vcf_type == "Float":
        nt.assert_equal(vcf_val.view(np.int32), zarr_val.view(np.int32))

    nt.assert_equal(vcf_val, zarr_val)
147
+
148
+
149
def verify(vcf_path, zarr_path, show_progress=False):
    """
    Check that the Zarr store at ``zarr_path`` agrees, record by record,
    with the VCF at ``vcf_path``.

    For each VCF record the contig, position, ID, alleles, genotypes (when
    a ``call_genotype`` array is present) and every INFO and FORMAT field
    found in the Zarr store are compared. If ``show_progress`` is true the
    record loop is wrapped in a tqdm progress bar.

    Mismatches are reported by raising AssertionError, either from plain
    ``assert`` statements or from ``numpy.testing``.
    """
    store = zarr.DirectoryStore(zarr_path)

    root = zarr.group(store=store)
    # Load the fixed per-variant arrays fully into memory.
    pos = root["variant_position"][:]
    allele = root["variant_allele"][:]
    # Index the contig_id array by variant_contig to get one contig name
    # per variant.
    chrom = root["contig_id"][:][root["variant_contig"][:]]
    vid = root["variant_id"][:]
    call_genotype = None
    if "call_genotype" in root:
        call_genotype = iter(root["call_genotype"])

    # First pass over the VCF: collect the FORMAT and INFO header records,
    # keyed by field ID, so that field types can be looked up below.
    vcf = cyvcf2.VCF(vcf_path)
    format_headers = {}
    info_headers = {}
    for h in vcf.header_iter():
        if h["HeaderType"] == "FORMAT":
            format_headers[h["ID"]] = h
        if h["HeaderType"] == "INFO":
            info_headers[h["ID"]] = h

    # Map each field name to (VCF type, iterator over the Zarr array). The
    # iterators are advanced once per VCF record in the main loop.
    format_fields = {}
    info_fields = {}
    for colname in root.keys():
        # "call_*" arrays other than those starting "call_genotype" are
        # treated as FORMAT fields.
        if colname.startswith("call") and not colname.startswith("call_genotype"):
            vcf_name = colname.split("_", 1)[1]
            vcf_type = format_headers[vcf_name]["Type"]
            format_fields[vcf_name] = vcf_type, iter(root[colname])
        # "variant_*" arrays whose suffix is all upper case are treated as
        # INFO fields; other "variant_*" arrays are skipped here.
        if colname.startswith("variant"):
            name = colname.split("_", 1)[1]
            if name.isupper():
                vcf_type = info_headers[name]["Type"]
                info_fields[name] = vcf_type, iter(root[colname])

    # Locate the first VCF record within the Zarr position array.
    # NOTE(review): searchsorted presumably relies on ``pos`` being sorted,
    # which may not hold across multiple contigs - confirm.
    first_pos = next(vcf).POS
    start_index = np.searchsorted(pos, first_pos)
    assert pos[start_index] == first_pos
    # next() above consumed a record, so open the VCF again to iterate
    # from the first record.
    vcf = cyvcf2.VCF(vcf_path)
    if show_progress:
        iterator = tqdm.tqdm(vcf, desc=" Verify", total=vcf.num_records)  # NEEDS TEST
    else:
        iterator = vcf
    for j, row in enumerate(iterator, start_index):
        assert chrom[j] == row.CHROM
        assert pos[j] == row.POS
        assert vid[j] == ("." if row.ID is None else row.ID)
        assert allele[j, 0] == row.REF
        k = len(row.ALT)
        nt.assert_array_equal(allele[j, 1 : k + 1], row.ALT)
        # Allele slots beyond the ALTs must be empty strings.
        assert np.all(allele[j, k + 1 :] == "")
        # TODO FILTERS

        if call_genotype is None:
            # No genotypes in Zarr, so the VCF row must not provide GT
            # either; a KeyError from the lookup is tolerated.
            val = None
            try:
                val = row.format("GT")
            except KeyError:
                pass
            assert val is None
        else:
            gt = row.genotype.array()
            gt_zarr = next(call_genotype)
            # Drop the last column before comparing - presumably the
            # phasing flag cyvcf2 appends; TODO confirm.
            gt_vcf = gt[:, :-1]
            # NOTE cyvcf2 remaps genotypes automatically
            # into the same missing/pad encoding that sgkit uses.
            nt.assert_array_equal(gt_zarr, gt_vcf)

        for name, (vcf_type, zarr_iter) in info_fields.items():
            vcf_val = row.INFO.get(name, None)
            zarr_val = next(zarr_iter)
            if vcf_val is None:
                assert_info_val_missing(zarr_val, vcf_type)
            else:
                assert_info_val_equal(vcf_val, zarr_val, vcf_type)

        for name, (vcf_type, zarr_iter) in format_fields.items():
            vcf_val = row.format(name)
            zarr_val = next(zarr_iter)
            if vcf_val is None:
                assert_format_val_missing(zarr_val, vcf_type)
            else:
                assert_format_val_equal(vcf_val, zarr_val, vcf_type)
bio2zarr/vcf_utils.py CHANGED
@@ -1,5 +1,6 @@
1
1
  import contextlib
2
2
  import gzip
3
+ import logging
3
4
  import os
4
5
  import pathlib
5
6
  import struct
@@ -13,6 +14,8 @@ import numpy as np
13
14
 
14
15
  from bio2zarr.typing import PathType
15
16
 
17
+ logger = logging.getLogger(__name__)
18
+
16
19
  CSI_EXTENSION = ".csi"
17
20
  TABIX_EXTENSION = ".tbi"
18
21
  TABIX_LINEAR_INDEX_INTERVAL_SIZE = 1 << 14 # 16kb interval size
@@ -411,6 +414,7 @@ class IndexedVcf(contextlib.AbstractContextManager):
411
414
  raise ValueError("Only .tbi or .csi indexes are supported.")
412
415
  self.vcf = cyvcf2.VCF(vcf_path)
413
416
  self.vcf.set_index(str(self.index_path))
417
+ logger.debug(f"Loaded {vcf_path} with index {self.index_path}")
414
418
  self.sequence_names = None
415
419
  if self.index_type == "csi":
416
420
  # Determine the file-type based on the "aux" field.
@@ -441,24 +445,25 @@ class IndexedVcf(contextlib.AbstractContextManager):
441
445
  return sum(1 for _ in self.variants(region))
442
446
 
443
447
  def variants(self, region):
444
- # Need to filter because of indels overlapping the region
445
448
  start = 1 if region.start is None else region.start
446
449
  for var in self.vcf(str(region)):
450
+ # Need to filter because of indels overlapping the region
447
451
  if var.POS >= start:
448
452
  yield var
449
453
 
450
454
  def _filter_empty_and_refine(self, regions):
451
455
  """
452
456
  Return all regions in the specified list that have one or more records,
453
- and refine the start coordinate of the region to be the actual first coord
457
+ and refine the start coordinate of the region to be the actual first coord.
458
+
459
+ Because this is a relatively expensive operation requiring seeking around
460
+ the file, we return the results as an iterator.
454
461
  """
455
- ret = []
456
462
  for region in regions:
457
463
  var = next(self.variants(region), None)
458
464
  if var is not None:
459
465
  region.start = var.POS
460
- ret.append(region)
461
- return ret
466
+ yield region
462
467
 
463
468
  def partition_into_regions(
464
469
  self,
@@ -490,7 +495,7 @@ class IndexedVcf(contextlib.AbstractContextManager):
490
495
  target_part_size_bytes = file_length // num_parts
491
496
  elif target_part_size_bytes is not None:
492
497
  num_parts = ceildiv(file_length, target_part_size_bytes)
493
- part_lengths = np.array([i * target_part_size_bytes for i in range(num_parts)])
498
+ part_lengths = target_part_size_bytes * np.arange(num_parts, dtype=int)
494
499
  file_offsets, region_contig_indexes, region_positions = self.index.offsets()
495
500
 
496
501
  # Search the file offsets to find which indexes the part lengths fall at
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: bio2zarr
3
- Version: 0.0.9
3
+ Version: 0.1.0
4
4
  Summary: Convert bioinformatics data to Zarr
5
5
  Author-email: sgkit Developers <project@sgkit.dev>
6
6
  License: Apache License
@@ -206,10 +206,13 @@ License: Apache License
206
206
  limitations under the License.
207
207
 
208
208
  Project-URL: repository, https://github.com/sgkit-dev/bio2zarr
209
- Project-URL: documentation, https://sgkit-dev.github.io/bio2zarr/intro.html
210
- Classifier: Development Status :: 3 - Alpha
209
+ Project-URL: documentation, https://sgkit-dev.github.io/bio2zarr/
210
+ Classifier: Development Status :: 4 - Beta
211
211
  Classifier: License :: OSI Approved :: Apache Software License
212
- Classifier: Operating System :: OS Independent
212
+ Classifier: Operating System :: POSIX
213
+ Classifier: Operating System :: POSIX :: Linux
214
+ Classifier: Operating System :: MacOS
215
+ Classifier: Operating System :: MacOS :: MacOS X
213
216
  Classifier: Intended Audience :: Science/Research
214
217
  Classifier: Programming Language :: Python
215
218
  Classifier: Programming Language :: Python :: 3
@@ -238,126 +241,10 @@ Requires-Dist: sgkit >=0.8.0 ; extra == 'dev'
238
241
  Requires-Dist: tqdm ; extra == 'dev'
239
242
 
240
243
  [![CI](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml)
244
+ [![Coverage Status](https://coveralls.io/repos/github/sgkit-dev/bio2zarr/badge.svg)](https://coveralls.io/github/sgkit-dev/bio2zarr)
245
+
241
246
 
242
247
  # bio2zarr
243
248
  Convert bioinformatics file formats to Zarr
244
249
 
245
- Initially supports converting VCF to the
246
- [sgkit vcf-zarr specification](https://github.com/pystatgen/vcf-zarr-spec/)
247
-
248
- **This is early alpha-status code: everything is subject to change,
249
- and it has not been thoroughly tested**
250
-
251
- ## Install
252
-
253
- ```
254
- $ python3 -m pip install bio2zarr
255
- ```
256
-
257
- This will install the programs ``vcf2zarr``, ``plink2zarr`` and ``vcf_partition``
258
- into your local Python path. You may need to update your $PATH to call the
259
- executables directly.
260
-
261
- Alternatively, calling
262
- ```
263
- $ python3 -m bio2zarr vcf2zarr <args>
264
- ```
265
- is equivalent to
266
-
267
- ```
268
- $ vcf2zarr <args>
269
- ```
270
- and will always work.
271
-
272
-
273
- ## vcf2zarr
274
-
275
-
276
- Convert a VCF to zarr format:
277
-
278
- ```
279
- $ vcf2zarr convert <VCF1> <VCF2> <zarr>
280
- ```
281
-
282
- Converts the VCF to zarr format.
283
-
284
- **Do not use this for anything but the smallest files**
285
-
286
- The recommended approach is to use a multi-stage conversion
287
-
288
- First, convert the VCF into the intermediate format:
289
-
290
- ```
291
- vcf2zarr explode tests/data/vcf/sample.vcf.gz tmp/sample.exploded
292
- ```
293
-
294
- Then, (optionally) inspect this representation to get a feel for your dataset
295
- ```
296
- vcf2zarr inspect tmp/sample.exploded
297
- ```
298
-
299
- Then, (optionally) generate a conversion schema to describe the corresponding
300
- Zarr arrays:
301
-
302
- ```
303
- vcf2zarr mkschema tmp/sample.exploded > sample.schema.json
304
- ```
305
-
306
- View and edit the schema, deleting any columns you don't want, or tweaking
307
- dtypes and compression settings to your taste.
308
-
309
- Finally, encode to Zarr:
310
- ```
311
- vcf2zarr encode tmp/sample.exploded tmp/sample.zarr -s sample.schema.json
312
- ```
313
-
314
- Use the ``-p, --worker-processes`` argument to control the number of workers used
315
- in the ``explode`` and ``encode`` phases.
316
-
317
- ### Shell completion
318
-
319
- To enable shell completion for a particular session in Bash do:
320
-
321
- ```
322
- eval "$(_VCF2ZARR_COMPLETE=bash_source vcf2zarr)"
323
- ```
324
-
325
- If you add this to your ``.bashrc`` vcf2zarr shell completion should available
326
- in all new shell sessions.
327
-
328
- See the [Click documentation](https://click.palletsprojects.com/en/8.1.x/shell-completion/#enabling-completion)
329
- for instructions on how to enable completion in other shells.
330
- a
331
-
332
- ## plink2zarr
333
-
334
- Convert a plink ``.bed`` file to zarr format. **This is incomplete**
335
-
336
- ## vcf_partition
337
-
338
- Partition a given VCF file into (approximately) a give number of regions:
339
-
340
- ```
341
- vcf_partition 20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr20.recalibrated_variants.vcf.gz -n 10
342
- ```
343
- gives
344
- ```
345
- chr20:1-6799360
346
- chr20:6799361-14319616
347
- chr20:14319617-21790720
348
- chr20:21790721-28770304
349
- chr20:28770305-31096832
350
- chr20:31096833-38043648
351
- chr20:38043649-45580288
352
- chr20:45580289-52117504
353
- chr20:52117505-58834944
354
- chr20:58834945-
355
- ```
356
-
357
- These reqion strings can then be used to split computation of the VCF
358
- into chunks for parallelisation.
359
-
360
- **TODO give a nice example here using xargs**
361
-
362
- **WARNING that this does not take into account that indels may overlap
363
- partitions and you may count variants twice or more if they do**
250
+ See the [documentation](https://sgkit-dev.github.io/bio2zarr/) for details.
@@ -0,0 +1,20 @@
1
+ bio2zarr/__init__.py,sha256=KiUGyya-9RHNcBldB8Lc1g3rP3CRjaL-5Olben0_6qA,49
2
+ bio2zarr/__main__.py,sha256=wUKNNps8MAAEpMvLgVaI449eKyfr7Jpk2mMtYbNl4Ek,531
3
+ bio2zarr/_version.py,sha256=IMl2Pr_Sy4LVRKy_Sm4CdwUl1Gryous6ncL96EMYsnM,411
4
+ bio2zarr/cli.py,sha256=-6cU26n5f8CpBSj6RGC-fpNByjuJ0KxSFz85O9tITPg,14961
5
+ bio2zarr/constants.py,sha256=QjbtFeBUZ-XqG35ZFIFj8EYrta_EwUkC2B5VGRP7oQs,425
6
+ bio2zarr/core.py,sha256=Yd3Z6-mFI_neaxoWT6t6Tip0k1VZEcWbautHcJ0ep8Q,10486
7
+ bio2zarr/plink.py,sha256=huXMlxQ5C3gPmOYCavA-QW7PzaV48I2lo80cQqHT1wY,6768
8
+ bio2zarr/provenance.py,sha256=c_Z__QbWkLS0Rfa8D7LgEhtStng_zRMJX8comaDXIkw,142
9
+ bio2zarr/typing.py,sha256=BYxhL16sKRoNxa6amf6AYxvt5Ke9qzv2np_kOT_zPJo,79
10
+ bio2zarr/vcf_utils.py,sha256=R3bes-xYLZ4ekaxtqDd39YVV20qHmwei3XiIg1UFhRA,17996
11
+ bio2zarr/vcf2zarr/__init__.py,sha256=0_of1iGzIDhvti49Gbcgd47oP63mKvouk9uLgKgiwoQ,791
12
+ bio2zarr/vcf2zarr/icf.py,sha256=rIC35RIfkk5gEE8cOmBg1d9Pj-HkPivmGvYp4PrVN1Q,41589
13
+ bio2zarr/vcf2zarr/vcz.py,sha256=2WE4RX5jZBiKDFEztNGYgXyrLRmVWeLKlFzh0GOzylk,38198
14
+ bio2zarr/vcf2zarr/verification.py,sha256=6xcBy-cJLaQz2Qj2crffXFMjUG-H7z637Csxe5ZCmds,7898
15
+ bio2zarr-0.1.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
16
+ bio2zarr-0.1.0.dist-info/METADATA,sha256=zezBzqrJPB4ED7IqFvVj8Lura2untJA8optBdVTBNzc,14848
17
+ bio2zarr-0.1.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
18
+ bio2zarr-0.1.0.dist-info/entry_points.txt,sha256=3adtRrClMpjatEbiYqK5bm9WHA2PaJN5hK-Cs_zkpaI,97
19
+ bio2zarr-0.1.0.dist-info/top_level.txt,sha256=ouAvp3u9N25eKrQbN8BCDLPcWWQLhtlgdHKu8AtEj5Q,9
20
+ bio2zarr-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ vcf2zarr = bio2zarr.cli:vcf2zarr_main
3
+ vcfpartition = bio2zarr.cli:vcfpartition