bio2zarr 0.0.9__py3-none-any.whl → 0.0.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of bio2zarr might be problematic. Click here for more details.

@@ -0,0 +1,230 @@
1
+ import cyvcf2
2
+ import numpy as np
3
+ import numpy.testing as nt
4
+ import tqdm
5
+ import zarr
6
+
7
+ from .. import constants
8
+
9
+
10
def assert_all_missing_float(a):
    """Assert that every element of ``a`` has the float32 "missing" bit pattern.

    ``a`` is converted to float32 and its bytes are reinterpreted as int32, so
    the check is on the raw bits (``constants.FLOAT32_MISSING_AS_INT32``)
    rather than on the floating-point value.
    """
    as_bits = np.array(a, dtype=np.float32).view(np.int32)
    nt.assert_equal(as_bits, constants.FLOAT32_MISSING_AS_INT32)
13
+
14
+
15
def assert_all_fill_float(a):
    """Assert that every element of ``a`` has the float32 "fill" bit pattern.

    ``a`` is converted to float32 and its bytes are reinterpreted as int32, so
    the check is on the raw bits (``constants.FLOAT32_FILL_AS_INT32``) rather
    than on the floating-point value.
    """
    as_bits = np.array(a, dtype=np.float32).view(np.int32)
    nt.assert_equal(as_bits, constants.FLOAT32_FILL_AS_INT32)
18
+
19
+
20
def assert_all_missing_int(a):
    """Assert that every element of ``a`` equals ``constants.INT_MISSING``."""
    as_ints = np.array(a, dtype=int)
    nt.assert_equal(as_ints, constants.INT_MISSING)
23
+
24
+
25
def assert_all_fill_int(a):
    """Assert that every element of ``a`` equals ``constants.INT_FILL``."""
    as_ints = np.array(a, dtype=int)
    nt.assert_equal(as_ints, constants.INT_FILL)
28
+
29
+
30
def assert_all_missing_string(a):
    """Assert that ``a`` compares equal to ``constants.STR_MISSING``."""
    nt.assert_equal(a, constants.STR_MISSING)
32
+
33
+
34
def assert_all_fill_string(a):
    """Assert that ``a`` compares equal to ``constants.STR_FILL``."""
    nt.assert_equal(a, constants.STR_FILL)
36
+
37
+
38
def assert_all_fill(zarr_val, vcf_type):
    """Assert that ``zarr_val`` holds only the fill value for ``vcf_type``.

    Dispatches on the VCF header type string to the type-specific helper.

    Args:
        zarr_val: Value(s) read from the Zarr array.
        vcf_type: VCF header type: "Integer", "String", "Character" or "Float".

    Raises:
        AssertionError: If a value is not the fill value, or if ``vcf_type``
            is not one of the handled types.
    """
    if vcf_type == "Integer":
        assert_all_fill_int(zarr_val)
    elif vcf_type in ("String", "Character"):
        assert_all_fill_string(zarr_val)
    elif vcf_type == "Float":
        assert_all_fill_float(zarr_val)
    else:  # pragma: no cover
        # Raise explicitly: a bare ``assert False`` is removed under
        # ``python -O``, which would let an unknown type pass silently.
        raise AssertionError(f"Unsupported VCF type: {vcf_type!r}")
47
+
48
+
49
def assert_all_missing(zarr_val, vcf_type):
    """Assert that ``zarr_val`` holds only the missing value for ``vcf_type``.

    Dispatches on the VCF header type string to the type-specific helper. For
    "Flag" the missing value is ``False``.

    Args:
        zarr_val: Value(s) read from the Zarr array.
        vcf_type: VCF header type: "Integer", "String", "Character", "Flag"
            or "Float".

    Raises:
        AssertionError: If a value is not the missing value, or if
            ``vcf_type`` is not one of the handled types.
    """
    if vcf_type == "Integer":
        assert_all_missing_int(zarr_val)
    elif vcf_type in ("String", "Character"):
        assert_all_missing_string(zarr_val)
    elif vcf_type == "Flag":
        # numpy.testing handles scalars and arrays alike, and is not removed
        # under ``python -O`` the way a bare ``assert`` is.
        nt.assert_equal(zarr_val, False)
    elif vcf_type == "Float":
        assert_all_missing_float(zarr_val)
    else:  # pragma: no cover
        # Raise explicitly: a bare ``assert False`` is removed under
        # ``python -O``, which would let an unknown type pass silently.
        raise AssertionError(f"Unsupported VCF type: {vcf_type!r}")
60
+
61
+
62
def assert_info_val_missing(zarr_val, vcf_type):
    """Assert that an INFO value absent from the VCF is stored as missing.

    Delegates to ``assert_all_missing`` with the same arguments.
    """
    assert_all_missing(zarr_val, vcf_type)
64
+
65
+
66
def assert_format_val_missing(zarr_val, vcf_type):
    """Assert that a FORMAT value absent from the VCF is stored as missing.

    Delegates to ``assert_info_val_missing`` with the same arguments.
    """
    assert_info_val_missing(zarr_val, vcf_type)
68
+
69
+
70
+ # Note: checking exact equality may prove problematic here
71
+ # but we should be deterministically storing what cyvcf2
72
+ # provides, which should compare equal.
73
+
74
+
75
def assert_info_val_equal(vcf_val, zarr_val, vcf_type):
    """Assert that one INFO value from the VCF matches the value in Zarr.

    Three shapes of ``vcf_val`` are handled:

    * "String"/"Character": ``vcf_val`` is split on commas. A ``str``
      ``zarr_val`` must equal the single item; otherwise the leading items
      must match and the remainder must be fill.
    * ``tuple``: ``None`` entries are replaced by the missing sentinel for
      ``vcf_type``, the leading entries must match, and any remainder of
      ``zarr_val`` must be fill.
    * anything else: treated as a scalar that must equal the first element
      of ``zarr_val``, with any further elements being fill.

    Raises:
        AssertionError: On any mismatch, or if ``vcf_val`` is ``None``.
    """
    assert vcf_val is not None

    if vcf_type in ("String", "Character"):
        parts = list(vcf_val.split(","))
        n_parts = len(parts)
        if not isinstance(zarr_val, str):
            nt.assert_equal(parts, zarr_val[:n_parts])
            assert_all_fill(zarr_val[n_parts:], vcf_type)
        else:
            # Scalar
            assert n_parts == 1
            assert vcf_val == zarr_val
        return

    if not isinstance(vcf_val, tuple):
        # Scalar
        zarr_val = np.array(zarr_val, ndmin=1)
        assert len(zarr_val.shape) == 1
        assert vcf_val == zarr_val[0]
        if len(zarr_val) > 1:
            assert_all_fill(zarr_val[1:], vcf_type)
        return

    sentinel_by_type = {
        "Integer": constants.INT_MISSING,
        "Float": constants.FLOAT32_MISSING,
    }
    # The sentinel lookup happens only for None entries, so a type outside
    # the map is tolerated as long as no entry is missing.
    filled = [
        sentinel_by_type[vcf_type] if item is None else item for item in vcf_val
    ]
    missing_idx = np.array(
        [idx for idx, item in enumerate(vcf_val) if item is None], dtype=int
    )
    expected = np.array(filled)
    n_expected = len(expected)
    # Int missing values are checked twice here, but floats need the separate
    # check below because different NaNs compare equal in assert_equal.
    nt.assert_equal(expected, zarr_val[:n_expected])
    assert_all_missing(zarr_val[missing_idx], vcf_type)
    if n_expected < len(zarr_val):
        assert_all_fill(zarr_val[n_expected:], vcf_type)
110
+
111
+
112
def assert_format_val_equal(vcf_val, zarr_val, vcf_type):
    """Assert that one FORMAT field of a record matches the value in Zarr.

    Args:
        vcf_val: ``numpy.ndarray`` for the field, indexed by sample along the
            first axis. It is not modified.
        zarr_val: The corresponding row read from the Zarr array.
        vcf_type: VCF header type of the field.

    Raises:
        AssertionError: On any mismatch, or if ``vcf_val`` is ``None`` or is
            not a ``numpy.ndarray``.
    """
    assert vcf_val is not None
    assert isinstance(vcf_val, np.ndarray)
    if vcf_type in ("String", "Character"):
        assert len(vcf_val) == len(zarr_val)
        for v, z in zip(vcf_val, zarr_val):
            split = list(v.split(","))
            # Note: deliberately duplicating logic here between this and the
            # INFO col above to make sure all combinations are covered by tests
            k = len(split)
            if k == 1:
                assert v == z
            else:
                nt.assert_equal(split, z[:k])
                assert_all_fill(z[k:], vcf_type)
    else:
        assert vcf_val.shape[0] == zarr_val.shape[0]
        if len(vcf_val.shape) == len(zarr_val.shape) + 1:
            # The VCF side has a trailing length-1 axis the Zarr side lacks.
            assert vcf_val.shape[-1] == 1
            vcf_val = vcf_val[..., 0]
        assert len(vcf_val.shape) <= 2
        assert len(vcf_val.shape) == len(zarr_val.shape)
        if len(vcf_val.shape) == 2:
            k = vcf_val.shape[1]
            if zarr_val.shape[1] != k:
                # Extra Zarr columns must be fill; compare only the first k.
                assert_all_fill(zarr_val[:, k:], vcf_type)
                zarr_val = zarr_val[:, :k]
        assert vcf_val.shape == zarr_val.shape
        if vcf_type == "Integer":
            # Work on a copy so the caller's array (possibly reached through
            # the ``[..., 0]`` view above) is not rewritten in place.
            vcf_val = vcf_val.copy()
            vcf_val[vcf_val == constants.VCF_INT_MISSING] = constants.INT_MISSING
            vcf_val[vcf_val == constants.VCF_INT_FILL] = constants.INT_FILL
        elif vcf_type == "Float":
            # Compare raw bit patterns as well as values.
            nt.assert_equal(vcf_val.view(np.int32), zarr_val.view(np.int32))

        nt.assert_equal(vcf_val, zarr_val)
147
+
148
+
149
def verify(vcf_path, zarr_path, show_progress=False):
    """Check that the Zarr store at ``zarr_path`` matches the VCF at ``vcf_path``.

    Each VCF record is compared, in order, with the corresponding row of the
    Zarr arrays: contig, position, ID, alleles, genotypes (when the store has
    a ``call_genotype`` array) and every INFO and FORMAT field found in the
    store.

    Args:
        vcf_path: Path of the VCF file, opened with ``cyvcf2.VCF``.
        zarr_path: Path of the Zarr directory store to check.
        show_progress: If true, iterate the records through a tqdm progress
            bar.

    Raises:
        AssertionError: On any mismatch between the VCF and the Zarr store.
        StopIteration: If the VCF contains no records.
    """
    store = zarr.DirectoryStore(zarr_path)

    root = zarr.group(store=store)
    pos = root["variant_position"][:]
    allele = root["variant_allele"][:]
    chrom = root["contig_id"][:][root["variant_contig"][:]]
    vid = root["variant_id"][:]
    call_genotype = None
    if "call_genotype" in root:
        call_genotype = iter(root["call_genotype"])

    # First pass: read the header and the position of the first record. The
    # handle is closed explicitly so it is not leaked when the file is
    # reopened for the full scan below.
    vcf = cyvcf2.VCF(vcf_path)
    try:
        format_headers = {}
        info_headers = {}
        for h in vcf.header_iter():
            if h["HeaderType"] == "FORMAT":
                format_headers[h["ID"]] = h
            elif h["HeaderType"] == "INFO":
                info_headers[h["ID"]] = h

        # Map each field name to its VCF type and a row iterator over the
        # matching Zarr array.
        format_fields = {}
        info_fields = {}
        for colname in root.keys():
            if colname.startswith("call") and not colname.startswith("call_genotype"):
                vcf_name = colname.split("_", 1)[1]
                vcf_type = format_headers[vcf_name]["Type"]
                format_fields[vcf_name] = vcf_type, iter(root[colname])
            if colname.startswith("variant"):
                name = colname.split("_", 1)[1]
                # Only upper-case names are looked up as INFO fields.
                if name.isupper():
                    vcf_type = info_headers[name]["Type"]
                    info_fields[name] = vcf_type, iter(root[colname])

        first_pos = next(vcf).POS
    finally:
        vcf.close()

    # Locate the Zarr row of the first VCF record.
    start_index = np.searchsorted(pos, first_pos)
    assert pos[start_index] == first_pos

    # Second pass: reopen the file to iterate from the first record.
    vcf = cyvcf2.VCF(vcf_path)
    try:
        if show_progress:
            iterator = tqdm.tqdm(vcf, desc=" Verify", total=vcf.num_records)  # NEEDS TEST
        else:
            iterator = vcf
        for j, row in enumerate(iterator, start_index):
            assert chrom[j] == row.CHROM
            assert pos[j] == row.POS
            assert vid[j] == ("." if row.ID is None else row.ID)
            assert allele[j, 0] == row.REF
            k = len(row.ALT)
            nt.assert_array_equal(allele[j, 1 : k + 1], row.ALT)
            assert np.all(allele[j, k + 1 :] == "")
            # TODO FILTERS

            if call_genotype is None:
                # No genotypes in the store, so the record must have no GT.
                val = None
                try:
                    val = row.format("GT")
                except KeyError:
                    pass
                assert val is None
            else:
                gt = row.genotype.array()
                gt_zarr = next(call_genotype)
                # The final column is dropped before comparing.
                gt_vcf = gt[:, :-1]
                # NOTE cyvcf2 remaps genotypes automatically
                # into the same missing/pad encoding that sgkit uses.
                nt.assert_array_equal(gt_zarr, gt_vcf)

            for name, (vcf_type, zarr_iter) in info_fields.items():
                vcf_val = row.INFO.get(name, None)
                zarr_val = next(zarr_iter)
                if vcf_val is None:
                    assert_info_val_missing(zarr_val, vcf_type)
                else:
                    assert_info_val_equal(vcf_val, zarr_val, vcf_type)

            for name, (vcf_type, zarr_iter) in format_fields.items():
                vcf_val = row.format(name)
                zarr_val = next(zarr_iter)
                if vcf_val is None:
                    assert_format_val_missing(zarr_val, vcf_type)
                else:
                    assert_format_val_equal(vcf_val, zarr_val, vcf_type)
    finally:
        vcf.close()
bio2zarr/vcf_utils.py CHANGED
@@ -441,9 +441,9 @@ class IndexedVcf(contextlib.AbstractContextManager):
441
441
  return sum(1 for _ in self.variants(region))
442
442
 
443
443
  def variants(self, region):
444
- # Need to filter because of indels overlapping the region
445
444
  start = 1 if region.start is None else region.start
446
445
  for var in self.vcf(str(region)):
446
+ # Need to filter because of indels overlapping the region
447
447
  if var.POS >= start:
448
448
  yield var
449
449
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: bio2zarr
3
- Version: 0.0.9
3
+ Version: 0.0.10
4
4
  Summary: Convert bioinformatics data to Zarr
5
5
  Author-email: sgkit Developers <project@sgkit.dev>
6
6
  License: Apache License
@@ -206,10 +206,13 @@ License: Apache License
206
206
  limitations under the License.
207
207
 
208
208
  Project-URL: repository, https://github.com/sgkit-dev/bio2zarr
209
- Project-URL: documentation, https://sgkit-dev.github.io/bio2zarr/intro.html
209
+ Project-URL: documentation, https://sgkit-dev.github.io/bio2zarr/
210
210
  Classifier: Development Status :: 3 - Alpha
211
211
  Classifier: License :: OSI Approved :: Apache Software License
212
- Classifier: Operating System :: OS Independent
212
+ Classifier: Operating System :: POSIX
213
+ Classifier: Operating System :: POSIX :: Linux
214
+ Classifier: Operating System :: MacOS
215
+ Classifier: Operating System :: MacOS :: MacOS X
213
216
  Classifier: Intended Audience :: Science/Research
214
217
  Classifier: Programming Language :: Python
215
218
  Classifier: Programming Language :: Python :: 3
@@ -238,126 +241,10 @@ Requires-Dist: sgkit >=0.8.0 ; extra == 'dev'
238
241
  Requires-Dist: tqdm ; extra == 'dev'
239
242
 
240
243
  [![CI](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml)
244
+ [![Coverage Status](https://coveralls.io/repos/github/sgkit-dev/bio2zarr/badge.svg)](https://coveralls.io/github/sgkit-dev/bio2zarr)
245
+
241
246
 
242
247
  # bio2zarr
243
248
  Convert bioinformatics file formats to Zarr
244
249
 
245
- Initially supports converting VCF to the
246
- [sgkit vcf-zarr specification](https://github.com/pystatgen/vcf-zarr-spec/)
247
-
248
- **This is early alpha-status code: everything is subject to change,
249
- and it has not been thoroughly tested**
250
-
251
- ## Install
252
-
253
- ```
254
- $ python3 -m pip install bio2zarr
255
- ```
256
-
257
- This will install the programs ``vcf2zarr``, ``plink2zarr`` and ``vcf_partition``
258
- into your local Python path. You may need to update your $PATH to call the
259
- executables directly.
260
-
261
- Alternatively, calling
262
- ```
263
- $ python3 -m bio2zarr vcf2zarr <args>
264
- ```
265
- is equivalent to
266
-
267
- ```
268
- $ vcf2zarr <args>
269
- ```
270
- and will always work.
271
-
272
-
273
- ## vcf2zarr
274
-
275
-
276
- Convert a VCF to zarr format:
277
-
278
- ```
279
- $ vcf2zarr convert <VCF1> <VCF2> <zarr>
280
- ```
281
-
282
- Converts the VCF to zarr format.
283
-
284
- **Do not use this for anything but the smallest files**
285
-
286
- The recommended approach is to use a multi-stage conversion
287
-
288
- First, convert the VCF into the intermediate format:
289
-
290
- ```
291
- vcf2zarr explode tests/data/vcf/sample.vcf.gz tmp/sample.exploded
292
- ```
293
-
294
- Then, (optionally) inspect this representation to get a feel for your dataset
295
- ```
296
- vcf2zarr inspect tmp/sample.exploded
297
- ```
298
-
299
- Then, (optionally) generate a conversion schema to describe the corresponding
300
- Zarr arrays:
301
-
302
- ```
303
- vcf2zarr mkschema tmp/sample.exploded > sample.schema.json
304
- ```
305
-
306
- View and edit the schema, deleting any columns you don't want, or tweaking
307
- dtypes and compression settings to your taste.
308
-
309
- Finally, encode to Zarr:
310
- ```
311
- vcf2zarr encode tmp/sample.exploded tmp/sample.zarr -s sample.schema.json
312
- ```
313
-
314
- Use the ``-p, --worker-processes`` argument to control the number of workers used
315
- in the ``explode`` and ``encode`` phases.
316
-
317
- ### Shell completion
318
-
319
- To enable shell completion for a particular session in Bash do:
320
-
321
- ```
322
- eval "$(_VCF2ZARR_COMPLETE=bash_source vcf2zarr)"
323
- ```
324
-
325
- If you add this to your ``.bashrc``, vcf2zarr shell completion should be available
326
- in all new shell sessions.
327
-
328
- See the [Click documentation](https://click.palletsprojects.com/en/8.1.x/shell-completion/#enabling-completion)
329
- for instructions on how to enable completion in other shells.
330
- a
331
-
332
- ## plink2zarr
333
-
334
- Convert a plink ``.bed`` file to zarr format. **This is incomplete**
335
-
336
- ## vcf_partition
337
-
338
- Partition a given VCF file into (approximately) a given number of regions:
339
-
340
- ```
341
- vcf_partition 20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr20.recalibrated_variants.vcf.gz -n 10
342
- ```
343
- gives
344
- ```
345
- chr20:1-6799360
346
- chr20:6799361-14319616
347
- chr20:14319617-21790720
348
- chr20:21790721-28770304
349
- chr20:28770305-31096832
350
- chr20:31096833-38043648
351
- chr20:38043649-45580288
352
- chr20:45580289-52117504
353
- chr20:52117505-58834944
354
- chr20:58834945-
355
- ```
356
-
357
- These region strings can then be used to split computation of the VCF
358
- into chunks for parallelisation.
359
-
360
- **TODO give a nice example here using xargs**
361
-
362
- **WARNING that this does not take into account that indels may overlap
363
- partitions and you may count variants twice or more if they do**
250
+ See the [documentation](https://sgkit-dev.github.io/bio2zarr/) for details.
@@ -0,0 +1,20 @@
1
+ bio2zarr/__init__.py,sha256=KiUGyya-9RHNcBldB8Lc1g3rP3CRjaL-5Olben0_6qA,49
2
+ bio2zarr/__main__.py,sha256=wUKNNps8MAAEpMvLgVaI449eKyfr7Jpk2mMtYbNl4Ek,531
3
+ bio2zarr/_version.py,sha256=IBUgg21Ew0JtWj9Z6eN1r4zXlrNseQQNV4zo-nYzlEY,413
4
+ bio2zarr/cli.py,sha256=Bv4k9V-5HJVVbqBMiYLWz5IQyILQ0bTicqgkQrr9hd0,13209
5
+ bio2zarr/constants.py,sha256=QjbtFeBUZ-XqG35ZFIFj8EYrta_EwUkC2B5VGRP7oQs,425
6
+ bio2zarr/core.py,sha256=3UFh7nKB3CbAIaJV3wgoqlkRy1M235C2vz7Iua73qwM,9234
7
+ bio2zarr/plink.py,sha256=huXMlxQ5C3gPmOYCavA-QW7PzaV48I2lo80cQqHT1wY,6768
8
+ bio2zarr/provenance.py,sha256=c_Z__QbWkLS0Rfa8D7LgEhtStng_zRMJX8comaDXIkw,142
9
+ bio2zarr/typing.py,sha256=BYxhL16sKRoNxa6amf6AYxvt5Ke9qzv2np_kOT_zPJo,79
10
+ bio2zarr/vcf_utils.py,sha256=b3Ti1AFXFlK7S1mu6jotqHPrujCIQXBKIHH8yIzd3zk,17781
11
+ bio2zarr/vcf2zarr/__init__.py,sha256=0_of1iGzIDhvti49Gbcgd47oP63mKvouk9uLgKgiwoQ,791
12
+ bio2zarr/vcf2zarr/icf.py,sha256=BJKPJDeqP8QtVz7ebm6NQQgvsba8H-JLsNEz4whOxsw,41559
13
+ bio2zarr/vcf2zarr/vcz.py,sha256=sy8VVYuOntMuPs5gUwQx6IA39_Gl_YFW2h-CeRyQw2A,36865
14
+ bio2zarr/vcf2zarr/verification.py,sha256=6xcBy-cJLaQz2Qj2crffXFMjUG-H7z637Csxe5ZCmds,7898
15
+ bio2zarr-0.0.10.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
16
+ bio2zarr-0.0.10.dist-info/METADATA,sha256=7su1JbkFtR7eDjq2Rp5A8CjP9KnvwWaDS6bPH43Z2qI,14850
17
+ bio2zarr-0.0.10.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
18
+ bio2zarr-0.0.10.dist-info/entry_points.txt,sha256=3adtRrClMpjatEbiYqK5bm9WHA2PaJN5hK-Cs_zkpaI,97
19
+ bio2zarr-0.0.10.dist-info/top_level.txt,sha256=ouAvp3u9N25eKrQbN8BCDLPcWWQLhtlgdHKu8AtEj5Q,9
20
+ bio2zarr-0.0.10.dist-info/RECORD,,
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ vcf2zarr = bio2zarr.cli:vcf2zarr_main
3
+ vcfpartition = bio2zarr.cli:vcfpartition