bio2zarr 0.0.9__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of bio2zarr might be problematic. Click here for more details.
- bio2zarr/__main__.py +2 -2
- bio2zarr/_version.py +2 -2
- bio2zarr/cli.py +176 -113
- bio2zarr/constants.py +18 -0
- bio2zarr/core.py +65 -20
- bio2zarr/vcf2zarr/__init__.py +38 -0
- bio2zarr/vcf2zarr/icf.py +1221 -0
- bio2zarr/vcf2zarr/vcz.py +1053 -0
- bio2zarr/vcf2zarr/verification.py +230 -0
- bio2zarr/vcf_utils.py +11 -6
- {bio2zarr-0.0.9.dist-info → bio2zarr-0.1.0.dist-info}/METADATA +10 -123
- bio2zarr-0.1.0.dist-info/RECORD +20 -0
- bio2zarr-0.1.0.dist-info/entry_points.txt +3 -0
- bio2zarr/vcf.py +0 -2445
- bio2zarr-0.0.9.dist-info/RECORD +0 -16
- bio2zarr-0.0.9.dist-info/entry_points.txt +0 -4
- {bio2zarr-0.0.9.dist-info → bio2zarr-0.1.0.dist-info}/LICENSE +0 -0
- {bio2zarr-0.0.9.dist-info → bio2zarr-0.1.0.dist-info}/WHEEL +0 -0
- {bio2zarr-0.0.9.dist-info → bio2zarr-0.1.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
import cyvcf2
|
|
2
|
+
import numpy as np
|
|
3
|
+
import numpy.testing as nt
|
|
4
|
+
import tqdm
|
|
5
|
+
import zarr
|
|
6
|
+
|
|
7
|
+
from .. import constants
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def assert_all_missing_float(a):
    """Assert that every element of ``a`` is the float32 "missing" value.

    ``a`` is converted to a float32 array and reinterpreted as int32, so the
    comparison against ``constants.FLOAT32_MISSING_AS_INT32`` is made on the
    raw 32-bit patterns rather than on floating point values.
    """
    as_float32 = np.array(a, dtype=np.float32)
    raw_bits = as_float32.view(np.int32)
    nt.assert_equal(raw_bits, constants.FLOAT32_MISSING_AS_INT32)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def assert_all_fill_float(a):
    """Assert that every element of ``a`` is the float32 "fill" value.

    ``a`` is converted to a float32 array and reinterpreted as int32, so the
    comparison against ``constants.FLOAT32_FILL_AS_INT32`` is made on the
    raw 32-bit patterns rather than on floating point values.
    """
    as_float32 = np.array(a, dtype=np.float32)
    raw_bits = as_float32.view(np.int32)
    nt.assert_equal(raw_bits, constants.FLOAT32_FILL_AS_INT32)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def assert_all_missing_int(a):
    """Assert that every element of ``a`` equals ``constants.INT_MISSING``.

    ``a`` is converted to an integer NumPy array before the comparison.
    """
    nt.assert_equal(np.array(a, dtype=int), constants.INT_MISSING)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def assert_all_fill_int(a):
    """Assert that every element of ``a`` equals ``constants.INT_FILL``.

    ``a`` is converted to an integer NumPy array before the comparison.
    """
    nt.assert_equal(np.array(a, dtype=int), constants.INT_FILL)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def assert_all_missing_string(a):
    """Assert that ``a`` compares equal to ``constants.STR_MISSING``."""
    expected = constants.STR_MISSING
    nt.assert_equal(a, expected)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def assert_all_fill_string(a):
    """Assert that ``a`` compares equal to ``constants.STR_FILL``."""
    expected = constants.STR_FILL
    nt.assert_equal(a, expected)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def assert_all_fill(zarr_val, vcf_type):
    """Assert that ``zarr_val`` holds only the fill value for ``vcf_type``.

    Dispatches on the VCF type name to the matching type-specific check.
    The handled names are "Integer", "String", "Character" and "Float";
    "Flag" is not handled by this function.

    Args:
        zarr_val: The value(s) read from the Zarr array.
        vcf_type: The VCF header ``Type`` string of the field.

    Raises:
        AssertionError: If a value is not the fill value, or if ``vcf_type``
            is not one of the handled type names.
    """
    if vcf_type == "Integer":
        assert_all_fill_int(zarr_val)
    elif vcf_type in ("String", "Character"):
        assert_all_fill_string(zarr_val)
    elif vcf_type == "Float":
        assert_all_fill_float(zarr_val)
    else:  # pragma: no cover
        # Raise explicitly instead of ``assert False``: assert statements are
        # removed when Python runs with -O, which would let an unhandled
        # type pass verification silently.
        raise AssertionError(f"Unhandled VCF type: {vcf_type!r}")
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def assert_all_missing(zarr_val, vcf_type):
    """Assert that ``zarr_val`` holds only the missing value for ``vcf_type``.

    Dispatches on the VCF type name to the matching type-specific check.
    For "Flag" fields the stored value must compare equal to ``False``.

    Args:
        zarr_val: The value(s) read from the Zarr array.
        vcf_type: The VCF header ``Type`` string of the field.

    Raises:
        AssertionError: If a value is not the missing value, or if
            ``vcf_type`` is not one of the handled type names.
    """
    if vcf_type == "Integer":
        assert_all_missing_int(zarr_val)
    elif vcf_type in ("String", "Character"):
        assert_all_missing_string(zarr_val)
    elif vcf_type == "Flag":
        assert zarr_val == False  # noqa 712
    elif vcf_type == "Float":
        assert_all_missing_float(zarr_val)
    else:  # pragma: no cover
        # Raise explicitly instead of ``assert False``: assert statements are
        # removed when Python runs with -O, which would let an unhandled
        # type pass verification silently.
        raise AssertionError(f"Unhandled VCF type: {vcf_type!r}")
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def assert_info_val_missing(zarr_val, vcf_type):
    """Assert that a stored INFO value is entirely "missing" for ``vcf_type``.

    Delegates to ``assert_all_missing``.
    """
    assert_all_missing(zarr_val=zarr_val, vcf_type=vcf_type)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def assert_format_val_missing(zarr_val, vcf_type):
    """Assert that a stored FORMAT value is entirely "missing" for ``vcf_type``.

    Uses the same check as INFO fields by delegating to
    ``assert_info_val_missing``.
    """
    assert_info_val_missing(zarr_val=zarr_val, vcf_type=vcf_type)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
# Note: checking exact equality may prove problematic here
|
|
71
|
+
# but we should be deterministically storing what cyvcf2
|
|
72
|
+
# provides, which should compare equal.
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def assert_info_val_equal(vcf_val, zarr_val, vcf_type):
    """Assert that an INFO value from the VCF matches the value stored in Zarr.

    Three shapes of ``vcf_val`` are handled:

    * String/Character fields: ``vcf_val`` is split on "," and compared with
      the leading entries of ``zarr_val``; any trailing entries must be fill.
      A plain ``str`` ``zarr_val`` is compared as a single scalar.
    * Tuples: ``None`` entries are replaced by the missing value for the
      type before comparison; trailing Zarr entries must be fill.
    * Anything else is treated as a scalar and compared with the first Zarr
      entry; any further entries must be fill.

    Args:
        vcf_val: The value from the VCF record; must not be ``None``.
        zarr_val: The corresponding value read from the Zarr array.
        vcf_type: The VCF header ``Type`` string of the field.

    Raises:
        AssertionError: If the values do not match.
    """
    assert vcf_val is not None
    if vcf_type in ("String", "Character"):
        parts = list(vcf_val.split(","))
        num_parts = len(parts)
        if not isinstance(zarr_val, str):
            nt.assert_equal(parts, zarr_val[:num_parts])
            assert_all_fill(zarr_val[num_parts:], vcf_type)
        else:
            # Scalar
            assert num_parts == 1
            assert vcf_val == zarr_val

    elif isinstance(vcf_val, tuple):
        missing_by_type = {
            "Integer": constants.INT_MISSING,
            "Float": constants.FLOAT32_MISSING,
        }
        substituted = []
        missing_positions = []
        for position, item in enumerate(vcf_val):
            if item is None:
                missing_positions.append(position)
                substituted.append(missing_by_type[vcf_type])
            else:
                substituted.append(item)
        missing = np.array(missing_positions, dtype=int)
        expected = np.array(substituted)
        num_values = len(expected)
        # Integer missing values end up being checked twice, but floats need
        # the dedicated check below because different NaNs compare equal.
        nt.assert_equal(expected, zarr_val[:num_values])
        assert_all_missing(zarr_val[missing], vcf_type)
        if num_values < len(zarr_val):
            assert_all_fill(zarr_val[num_values:], vcf_type)
    else:
        # Scalar
        zarr_val = np.array(zarr_val, ndmin=1)
        assert len(zarr_val.shape) == 1
        assert vcf_val == zarr_val[0]
        if len(zarr_val) > 1:
            assert_all_fill(zarr_val[1:], vcf_type)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def assert_format_val_equal(vcf_val, zarr_val, vcf_type):
    """Assert that a FORMAT value from the VCF matches the value stored in Zarr.

    For String/Character fields each per-sample string is split on "," and
    compared with the matching Zarr entry; trailing Zarr entries must be fill.
    For other types the arrays are aligned (a trailing length-1 axis of
    ``vcf_val`` is dropped, extra Zarr columns must be fill) and compared.
    Integer values have the VCF missing/fill codes translated to the Zarr
    ones first; Float values are additionally compared as raw int32 bits.

    Args:
        vcf_val: NumPy array of values from the VCF record. It is not
            modified by this function.
        zarr_val: The corresponding value read from the Zarr array.
        vcf_type: The VCF header ``Type`` string of the field.

    Raises:
        AssertionError: If the values do not match.
    """
    assert vcf_val is not None
    assert isinstance(vcf_val, np.ndarray)
    if vcf_type in ("String", "Character"):
        assert len(vcf_val) == len(zarr_val)
        for v, z in zip(vcf_val, zarr_val):
            split = list(v.split(","))
            # Note: deliberately duplicating logic here between this and the
            # INFO col above to make sure all combinations are covered by tests
            k = len(split)
            if k == 1:
                assert v == z
            else:
                nt.assert_equal(split, z[:k])
                assert_all_fill(z[k:], vcf_type)
    else:
        assert vcf_val.shape[0] == zarr_val.shape[0]
        if len(vcf_val.shape) == len(zarr_val.shape) + 1:
            # The VCF side carries an extra trailing axis of length one.
            assert vcf_val.shape[-1] == 1
            vcf_val = vcf_val[..., 0]
        assert len(vcf_val.shape) <= 2
        assert len(vcf_val.shape) == len(zarr_val.shape)
        if len(vcf_val.shape) == 2:
            k = vcf_val.shape[1]
            if zarr_val.shape[1] != k:
                # Zarr columns beyond what the VCF provides must be fill.
                assert_all_fill(zarr_val[:, k:], vcf_type)
                zarr_val = zarr_val[:, :k]
        assert vcf_val.shape == zarr_val.shape
        if vcf_type == "Integer":
            # Translate on a copy so the caller's array is left untouched.
            vcf_val = vcf_val.copy()
            vcf_val[vcf_val == constants.VCF_INT_MISSING] = constants.INT_MISSING
            vcf_val[vcf_val == constants.VCF_INT_FILL] = constants.INT_FILL
        elif vcf_type == "Float":
            # Compare raw bits as well as values.
            nt.assert_equal(vcf_val.view(np.int32), zarr_val.view(np.int32))

        nt.assert_equal(vcf_val, zarr_val)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def verify(vcf_path, zarr_path, show_progress=False):
    """Check record by record that a Zarr store matches the VCF it came from.

    Opens the Zarr directory store at ``zarr_path`` and the VCF at
    ``vcf_path`` (via cyvcf2), then walks the VCF records comparing CHROM,
    POS, ID, REF/ALT alleles, genotypes, INFO fields and FORMAT fields with
    the corresponding Zarr arrays.

    Args:
        vcf_path: Path of the VCF file, passed to ``cyvcf2.VCF``.
        zarr_path: Path of the Zarr directory store.
        show_progress: If true, wrap the record iteration in a tqdm
            progress bar.

    Raises:
        AssertionError: On the first mismatch that is found.
    """
    store = zarr.DirectoryStore(zarr_path)

    root = zarr.group(store=store)
    # Per-variant arrays are read fully into memory up front.
    pos = root["variant_position"][:]
    allele = root["variant_allele"][:]
    # Map each variant's contig index to the contig identifier.
    chrom = root["contig_id"][:][root["variant_contig"][:]]
    vid = root["variant_id"][:]
    # Genotypes are optional in the store and iterated one variant at a time.
    call_genotype = None
    if "call_genotype" in root:
        call_genotype = iter(root["call_genotype"])

    vcf = cyvcf2.VCF(vcf_path)
    # Index the FORMAT and INFO header declarations by field ID.
    format_headers = {}
    info_headers = {}
    for h in vcf.header_iter():
        if h["HeaderType"] == "FORMAT":
            format_headers[h["ID"]] = h
        if h["HeaderType"] == "INFO":
            info_headers[h["ID"]] = h

    # Map field name -> (VCF type, iterator over the Zarr array), so each
    # array can be advanced in lock-step with the VCF records below.
    format_fields = {}
    info_fields = {}
    for colname in root.keys():
        # "call_*" arrays other than the genotype ones are FORMAT fields.
        if colname.startswith("call") and not colname.startswith("call_genotype"):
            vcf_name = colname.split("_", 1)[1]
            vcf_type = format_headers[vcf_name]["Type"]
            format_fields[vcf_name] = vcf_type, iter(root[colname])
        if colname.startswith("variant"):
            name = colname.split("_", 1)[1]
            # Only all-uppercase suffixes are looked up as INFO fields.
            if name.isupper():
                vcf_type = info_headers[name]["Type"]
                info_fields[name] = vcf_type, iter(root[colname])

    # Locate the first VCF record in the Zarr position array.
    # NOTE(review): np.searchsorted presumes ``pos`` is sorted - confirm this
    # holds for stores that span several contigs.
    first_pos = next(vcf).POS
    start_index = np.searchsorted(pos, first_pos)
    assert pos[start_index] == first_pos
    # Re-open the VCF so iteration starts again from the first record.
    # NOTE(review): the previous handle is replaced without an explicit close.
    vcf = cyvcf2.VCF(vcf_path)
    if show_progress:
        iterator = tqdm.tqdm(vcf, desc=" Verify", total=vcf.num_records)  # NEEDS TEST
    else:
        iterator = vcf
    # ``j`` is the index into the Zarr arrays, starting at ``start_index``.
    for j, row in enumerate(iterator, start_index):
        assert chrom[j] == row.CHROM
        assert pos[j] == row.POS
        assert vid[j] == ("." if row.ID is None else row.ID)
        assert allele[j, 0] == row.REF
        k = len(row.ALT)
        nt.assert_array_equal(allele[j, 1 : k + 1], row.ALT)
        # Allele slots beyond the ALT alleles must be empty strings.
        assert np.all(allele[j, k + 1 :] == "")
        # TODO FILTERS

        if call_genotype is None:
            # No genotypes in the store, so the VCF must not have GT either.
            val = None
            try:
                val = row.format("GT")
            except KeyError:
                pass
            assert val is None
        else:
            gt = row.genotype.array()
            gt_zarr = next(call_genotype)
            # Drop the last column of cyvcf2's genotype array (presumably the
            # phasing flag - confirm against the cyvcf2 documentation).
            gt_vcf = gt[:, :-1]
            # NOTE cyvcf2 remaps genotypes automatically
            # into the same missing/pad encoding that sgkit uses.
            nt.assert_array_equal(gt_zarr, gt_vcf)

        for name, (vcf_type, zarr_iter) in info_fields.items():
            vcf_val = row.INFO.get(name, None)
            zarr_val = next(zarr_iter)
            if vcf_val is None:
                assert_info_val_missing(zarr_val, vcf_type)
            else:
                assert_info_val_equal(vcf_val, zarr_val, vcf_type)

        for name, (vcf_type, zarr_iter) in format_fields.items():
            vcf_val = row.format(name)
            zarr_val = next(zarr_iter)
            if vcf_val is None:
                assert_format_val_missing(zarr_val, vcf_type)
            else:
                assert_format_val_equal(vcf_val, zarr_val, vcf_type)
|
bio2zarr/vcf_utils.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import contextlib
|
|
2
2
|
import gzip
|
|
3
|
+
import logging
|
|
3
4
|
import os
|
|
4
5
|
import pathlib
|
|
5
6
|
import struct
|
|
@@ -13,6 +14,8 @@ import numpy as np
|
|
|
13
14
|
|
|
14
15
|
from bio2zarr.typing import PathType
|
|
15
16
|
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
16
19
|
CSI_EXTENSION = ".csi"
|
|
17
20
|
TABIX_EXTENSION = ".tbi"
|
|
18
21
|
TABIX_LINEAR_INDEX_INTERVAL_SIZE = 1 << 14 # 16kb interval size
|
|
@@ -411,6 +414,7 @@ class IndexedVcf(contextlib.AbstractContextManager):
|
|
|
411
414
|
raise ValueError("Only .tbi or .csi indexes are supported.")
|
|
412
415
|
self.vcf = cyvcf2.VCF(vcf_path)
|
|
413
416
|
self.vcf.set_index(str(self.index_path))
|
|
417
|
+
logger.debug(f"Loaded {vcf_path} with index {self.index_path}")
|
|
414
418
|
self.sequence_names = None
|
|
415
419
|
if self.index_type == "csi":
|
|
416
420
|
# Determine the file-type based on the "aux" field.
|
|
@@ -441,24 +445,25 @@ class IndexedVcf(contextlib.AbstractContextManager):
|
|
|
441
445
|
return sum(1 for _ in self.variants(region))
|
|
442
446
|
|
|
443
447
|
def variants(self, region):
|
|
444
|
-
# Need to filter because of indels overlapping the region
|
|
445
448
|
start = 1 if region.start is None else region.start
|
|
446
449
|
for var in self.vcf(str(region)):
|
|
450
|
+
# Need to filter because of indels overlapping the region
|
|
447
451
|
if var.POS >= start:
|
|
448
452
|
yield var
|
|
449
453
|
|
|
450
454
|
def _filter_empty_and_refine(self, regions):
|
|
451
455
|
"""
|
|
452
456
|
Return all regions in the specified list that have one or more records,
|
|
453
|
-
and refine the start coordinate of the region to be the actual first coord
|
|
457
|
+
and refine the start coordinate of the region to be the actual first coord.
|
|
458
|
+
|
|
459
|
+
Because this is a relatively expensive operation requiring seeking around
|
|
460
|
+
the file, we return the results as an iterator.
|
|
454
461
|
"""
|
|
455
|
-
ret = []
|
|
456
462
|
for region in regions:
|
|
457
463
|
var = next(self.variants(region), None)
|
|
458
464
|
if var is not None:
|
|
459
465
|
region.start = var.POS
|
|
460
|
-
|
|
461
|
-
return ret
|
|
466
|
+
yield region
|
|
462
467
|
|
|
463
468
|
def partition_into_regions(
|
|
464
469
|
self,
|
|
@@ -490,7 +495,7 @@ class IndexedVcf(contextlib.AbstractContextManager):
|
|
|
490
495
|
target_part_size_bytes = file_length // num_parts
|
|
491
496
|
elif target_part_size_bytes is not None:
|
|
492
497
|
num_parts = ceildiv(file_length, target_part_size_bytes)
|
|
493
|
-
part_lengths =
|
|
498
|
+
part_lengths = target_part_size_bytes * np.arange(num_parts, dtype=int)
|
|
494
499
|
file_offsets, region_contig_indexes, region_positions = self.index.offsets()
|
|
495
500
|
|
|
496
501
|
# Search the file offsets to find which indexes the part lengths fall at
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: bio2zarr
|
|
3
|
-
Version: 0.0.9
|
|
3
|
+
Version: 0.1.0
|
|
4
4
|
Summary: Convert bioinformatics data to Zarr
|
|
5
5
|
Author-email: sgkit Developers <project@sgkit.dev>
|
|
6
6
|
License: Apache License
|
|
@@ -206,10 +206,13 @@ License: Apache License
|
|
|
206
206
|
limitations under the License.
|
|
207
207
|
|
|
208
208
|
Project-URL: repository, https://github.com/sgkit-dev/bio2zarr
|
|
209
|
-
Project-URL: documentation, https://sgkit-dev.github.io/bio2zarr/
|
|
210
|
-
Classifier: Development Status ::
|
|
209
|
+
Project-URL: documentation, https://sgkit-dev.github.io/bio2zarr/
|
|
210
|
+
Classifier: Development Status :: 4 - Beta
|
|
211
211
|
Classifier: License :: OSI Approved :: Apache Software License
|
|
212
|
-
Classifier: Operating System ::
|
|
212
|
+
Classifier: Operating System :: POSIX
|
|
213
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
214
|
+
Classifier: Operating System :: MacOS
|
|
215
|
+
Classifier: Operating System :: MacOS :: MacOS X
|
|
213
216
|
Classifier: Intended Audience :: Science/Research
|
|
214
217
|
Classifier: Programming Language :: Python
|
|
215
218
|
Classifier: Programming Language :: Python :: 3
|
|
@@ -238,126 +241,10 @@ Requires-Dist: sgkit >=0.8.0 ; extra == 'dev'
|
|
|
238
241
|
Requires-Dist: tqdm ; extra == 'dev'
|
|
239
242
|
|
|
240
243
|
[](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml)
|
|
244
|
+
[](https://coveralls.io/github/sgkit-dev/bio2zarr)
|
|
245
|
+
|
|
241
246
|
|
|
242
247
|
# bio2zarr
|
|
243
248
|
Convert bioinformatics file formats to Zarr
|
|
244
249
|
|
|
245
|
-
|
|
246
|
-
[sgkit vcf-zarr specification](https://github.com/pystatgen/vcf-zarr-spec/)
|
|
247
|
-
|
|
248
|
-
**This is early alpha-status code: everything is subject to change,
|
|
249
|
-
and it has not been thoroughly tested**
|
|
250
|
-
|
|
251
|
-
## Install
|
|
252
|
-
|
|
253
|
-
```
|
|
254
|
-
$ python3 -m pip install bio2zarr
|
|
255
|
-
```
|
|
256
|
-
|
|
257
|
-
This will install the programs ``vcf2zarr``, ``plink2zarr`` and ``vcf_partition``
|
|
258
|
-
into your local Python path. You may need to update your $PATH to call the
|
|
259
|
-
executables directly.
|
|
260
|
-
|
|
261
|
-
Alternatively, calling
|
|
262
|
-
```
|
|
263
|
-
$ python3 -m bio2zarr vcf2zarr <args>
|
|
264
|
-
```
|
|
265
|
-
is equivalent to
|
|
266
|
-
|
|
267
|
-
```
|
|
268
|
-
$ vcf2zarr <args>
|
|
269
|
-
```
|
|
270
|
-
and will always work.
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
## vcf2zarr
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
Convert a VCF to zarr format:
|
|
277
|
-
|
|
278
|
-
```
|
|
279
|
-
$ vcf2zarr convert <VCF1> <VCF2> <zarr>
|
|
280
|
-
```
|
|
281
|
-
|
|
282
|
-
Converts the VCF to zarr format.
|
|
283
|
-
|
|
284
|
-
**Do not use this for anything but the smallest files**
|
|
285
|
-
|
|
286
|
-
The recommended approach is to use a multi-stage conversion
|
|
287
|
-
|
|
288
|
-
First, convert the VCF into the intermediate format:
|
|
289
|
-
|
|
290
|
-
```
|
|
291
|
-
vcf2zarr explode tests/data/vcf/sample.vcf.gz tmp/sample.exploded
|
|
292
|
-
```
|
|
293
|
-
|
|
294
|
-
Then, (optionally) inspect this representation to get a feel for your dataset
|
|
295
|
-
```
|
|
296
|
-
vcf2zarr inspect tmp/sample.exploded
|
|
297
|
-
```
|
|
298
|
-
|
|
299
|
-
Then, (optionally) generate a conversion schema to describe the corresponding
|
|
300
|
-
Zarr arrays:
|
|
301
|
-
|
|
302
|
-
```
|
|
303
|
-
vcf2zarr mkschema tmp/sample.exploded > sample.schema.json
|
|
304
|
-
```
|
|
305
|
-
|
|
306
|
-
View and edit the schema, deleting any columns you don't want, or tweaking
|
|
307
|
-
dtypes and compression settings to your taste.
|
|
308
|
-
|
|
309
|
-
Finally, encode to Zarr:
|
|
310
|
-
```
|
|
311
|
-
vcf2zarr encode tmp/sample.exploded tmp/sample.zarr -s sample.schema.json
|
|
312
|
-
```
|
|
313
|
-
|
|
314
|
-
Use the ``-p, --worker-processes`` argument to control the number of workers used
|
|
315
|
-
in the ``explode`` and ``encode`` phases.
|
|
316
|
-
|
|
317
|
-
### Shell completion
|
|
318
|
-
|
|
319
|
-
To enable shell completion for a particular session in Bash do:
|
|
320
|
-
|
|
321
|
-
```
|
|
322
|
-
eval "$(_VCF2ZARR_COMPLETE=bash_source vcf2zarr)"
|
|
323
|
-
```
|
|
324
|
-
|
|
325
|
-
If you add this to your ``.bashrc`` vcf2zarr shell completion should available
|
|
326
|
-
in all new shell sessions.
|
|
327
|
-
|
|
328
|
-
See the [Click documentation](https://click.palletsprojects.com/en/8.1.x/shell-completion/#enabling-completion)
|
|
329
|
-
for instructions on how to enable completion in other shells.
|
|
330
|
-
a
|
|
331
|
-
|
|
332
|
-
## plink2zarr
|
|
333
|
-
|
|
334
|
-
Convert a plink ``.bed`` file to zarr format. **This is incomplete**
|
|
335
|
-
|
|
336
|
-
## vcf_partition
|
|
337
|
-
|
|
338
|
-
Partition a given VCF file into (approximately) a give number of regions:
|
|
339
|
-
|
|
340
|
-
```
|
|
341
|
-
vcf_partition 20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr20.recalibrated_variants.vcf.gz -n 10
|
|
342
|
-
```
|
|
343
|
-
gives
|
|
344
|
-
```
|
|
345
|
-
chr20:1-6799360
|
|
346
|
-
chr20:6799361-14319616
|
|
347
|
-
chr20:14319617-21790720
|
|
348
|
-
chr20:21790721-28770304
|
|
349
|
-
chr20:28770305-31096832
|
|
350
|
-
chr20:31096833-38043648
|
|
351
|
-
chr20:38043649-45580288
|
|
352
|
-
chr20:45580289-52117504
|
|
353
|
-
chr20:52117505-58834944
|
|
354
|
-
chr20:58834945-
|
|
355
|
-
```
|
|
356
|
-
|
|
357
|
-
These reqion strings can then be used to split computation of the VCF
|
|
358
|
-
into chunks for parallelisation.
|
|
359
|
-
|
|
360
|
-
**TODO give a nice example here using xargs**
|
|
361
|
-
|
|
362
|
-
**WARNING that this does not take into account that indels may overlap
|
|
363
|
-
partitions and you may count variants twice or more if they do**
|
|
250
|
+
See the [documentation](https://sgkit-dev.github.io/bio2zarr/) for details.
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
bio2zarr/__init__.py,sha256=KiUGyya-9RHNcBldB8Lc1g3rP3CRjaL-5Olben0_6qA,49
|
|
2
|
+
bio2zarr/__main__.py,sha256=wUKNNps8MAAEpMvLgVaI449eKyfr7Jpk2mMtYbNl4Ek,531
|
|
3
|
+
bio2zarr/_version.py,sha256=IMl2Pr_Sy4LVRKy_Sm4CdwUl1Gryous6ncL96EMYsnM,411
|
|
4
|
+
bio2zarr/cli.py,sha256=-6cU26n5f8CpBSj6RGC-fpNByjuJ0KxSFz85O9tITPg,14961
|
|
5
|
+
bio2zarr/constants.py,sha256=QjbtFeBUZ-XqG35ZFIFj8EYrta_EwUkC2B5VGRP7oQs,425
|
|
6
|
+
bio2zarr/core.py,sha256=Yd3Z6-mFI_neaxoWT6t6Tip0k1VZEcWbautHcJ0ep8Q,10486
|
|
7
|
+
bio2zarr/plink.py,sha256=huXMlxQ5C3gPmOYCavA-QW7PzaV48I2lo80cQqHT1wY,6768
|
|
8
|
+
bio2zarr/provenance.py,sha256=c_Z__QbWkLS0Rfa8D7LgEhtStng_zRMJX8comaDXIkw,142
|
|
9
|
+
bio2zarr/typing.py,sha256=BYxhL16sKRoNxa6amf6AYxvt5Ke9qzv2np_kOT_zPJo,79
|
|
10
|
+
bio2zarr/vcf_utils.py,sha256=R3bes-xYLZ4ekaxtqDd39YVV20qHmwei3XiIg1UFhRA,17996
|
|
11
|
+
bio2zarr/vcf2zarr/__init__.py,sha256=0_of1iGzIDhvti49Gbcgd47oP63mKvouk9uLgKgiwoQ,791
|
|
12
|
+
bio2zarr/vcf2zarr/icf.py,sha256=rIC35RIfkk5gEE8cOmBg1d9Pj-HkPivmGvYp4PrVN1Q,41589
|
|
13
|
+
bio2zarr/vcf2zarr/vcz.py,sha256=2WE4RX5jZBiKDFEztNGYgXyrLRmVWeLKlFzh0GOzylk,38198
|
|
14
|
+
bio2zarr/vcf2zarr/verification.py,sha256=6xcBy-cJLaQz2Qj2crffXFMjUG-H7z637Csxe5ZCmds,7898
|
|
15
|
+
bio2zarr-0.1.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
16
|
+
bio2zarr-0.1.0.dist-info/METADATA,sha256=zezBzqrJPB4ED7IqFvVj8Lura2untJA8optBdVTBNzc,14848
|
|
17
|
+
bio2zarr-0.1.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
18
|
+
bio2zarr-0.1.0.dist-info/entry_points.txt,sha256=3adtRrClMpjatEbiYqK5bm9WHA2PaJN5hK-Cs_zkpaI,97
|
|
19
|
+
bio2zarr-0.1.0.dist-info/top_level.txt,sha256=ouAvp3u9N25eKrQbN8BCDLPcWWQLhtlgdHKu8AtEj5Q,9
|
|
20
|
+
bio2zarr-0.1.0.dist-info/RECORD,,
|