bio-vcf 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +1 -0
- data/Gemfile.lock +8 -0
- data/README.md +376 -11
- data/VERSION +1 -1
- data/bin/bio-vcf +172 -39
- data/bio-vcf.gemspec +18 -3
- data/features/cli.feature +32 -0
- data/features/multisample.feature +28 -10
- data/features/step_definitions/cli-feature.rb +12 -0
- data/features/step_definitions/multisample.rb +64 -18
- data/features/support/env.rb +5 -0
- data/lib/bio-vcf.rb +2 -0
- data/lib/bio-vcf/utils.rb +23 -0
- data/lib/bio-vcf/vcfgenotypefield.rb +73 -28
- data/lib/bio-vcf/vcfheader.rb +8 -0
- data/lib/bio-vcf/vcfline.rb +1 -0
- data/lib/bio-vcf/vcfrecord.rb +142 -14
- data/lib/bio-vcf/vcfsample.rb +88 -0
- data/test/data/input/dbsnp.vcf +200 -0
- data/test/data/input/multisample.vcf +2 -2
- data/test/data/regression/eval_r.info.dp.ref +150 -0
- data/test/data/regression/r.info.dp.ref +147 -0
- data/test/data/regression/rewrite.info.sample.ref +150 -0
- data/test/data/regression/s.dp.ref +145 -0
- data/test/data/regression/seval_s.dp.ref +36 -0
- data/test/data/regression/sfilter001.ref +145 -0
- data/test/performance/metrics.md +98 -0
- metadata +28 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1f08be0a8d7ad751ad4758156e5ce6ccbc518cc0
|
4
|
+
data.tar.gz: 741386a278d7c38340abf35cc08d1f1923636131
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 04d20d248629cccebbd3d639c2a25bb5d33efd2163999f87912343f44442c1c3f19429d6008900973116354a522679e75d853e3e6f9428d54a6647a38ef5e7fe
|
7
|
+
data.tar.gz: 625b39c9172569d3e893721a6f943721b30032b9946cea03923524af75345793edb3654f1489ebb21e084d81970a288e0552e442c27857647d0982999c497487
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -15,6 +15,8 @@ GEM
|
|
15
15
|
multipart-post (>= 1.2, < 3)
|
16
16
|
gherkin (2.12.2)
|
17
17
|
multi_json (~> 1.3)
|
18
|
+
gherkin (2.12.2-java)
|
19
|
+
multi_json (~> 1.3)
|
18
20
|
git (1.2.6)
|
19
21
|
github_api (0.11.3)
|
20
22
|
addressable (~> 2.3)
|
@@ -36,6 +38,7 @@ GEM
|
|
36
38
|
rake
|
37
39
|
rdoc
|
38
40
|
json (1.8.1)
|
41
|
+
json (1.8.1-java)
|
39
42
|
jwt (0.1.11)
|
40
43
|
multi_json (>= 1.5)
|
41
44
|
mini_portile (0.5.2)
|
@@ -45,6 +48,8 @@ GEM
|
|
45
48
|
multipart-post (2.0.0)
|
46
49
|
nokogiri (1.6.1)
|
47
50
|
mini_portile (~> 0.5.0)
|
51
|
+
nokogiri (1.6.1-java)
|
52
|
+
mini_portile (~> 0.5.0)
|
48
53
|
oauth2 (0.9.3)
|
49
54
|
faraday (>= 0.8, < 0.10)
|
50
55
|
jwt (~> 0.1.8)
|
@@ -55,6 +60,7 @@ GEM
|
|
55
60
|
rake (10.1.1)
|
56
61
|
rdoc (4.1.1)
|
57
62
|
json (~> 1.4)
|
63
|
+
regressiontest (0.0.2)
|
58
64
|
rspec (2.14.1)
|
59
65
|
rspec-core (~> 2.14.0)
|
60
66
|
rspec-expectations (~> 2.14.0)
|
@@ -65,9 +71,11 @@ GEM
|
|
65
71
|
rspec-mocks (2.14.6)
|
66
72
|
|
67
73
|
PLATFORMS
|
74
|
+
java
|
68
75
|
ruby
|
69
76
|
|
70
77
|
DEPENDENCIES
|
71
78
|
cucumber
|
72
79
|
jeweler
|
80
|
+
regressiontest
|
73
81
|
rspec
|
data/README.md
CHANGED
@@ -4,12 +4,102 @@
|
|
4
4
|
|
5
5
|
Yet another VCF parser. This one may give better performance because
|
6
6
|
of lazy parsing and useful combinations of (fancy) command line
|
7
|
-
filtering.
|
7
|
+
filtering. bio-vcf comes with a sensible parser definition language,
|
8
|
+
as well as primitives for set analysis. Also few assumptions are made
|
9
|
+
about the actual contents of the VCF file (field names are resolved on
|
10
|
+
the fly).
|
11
|
+
|
12
|
+
To fetch all entries where all samples have depth larger than 20 use an sfilter
|
13
|
+
|
14
|
+
```ruby
|
15
|
+
bio-vcf --sfilter 'sample.dp>20' < file.vcf
|
16
|
+
```
|
17
|
+
|
18
|
+
To filter only on specific samples, for example samples number 0 and 3:
|
19
|
+
|
20
|
+
```ruby
|
21
|
+
bio-vcf --sfilter-samples 0,3 --sfilter 's.dp>20' < file.vcf
|
22
|
+
```
|
23
|
+
|
24
|
+
Where 's.dp' is the shorter name for 'sample.dp'.
|
25
|
+
|
26
|
+
It is also possible to specify sample names, or info fields:
|
27
|
+
|
28
|
+
For example, to filter somatic data
|
29
|
+
|
30
|
+
```ruby
|
31
|
+
bio-vcf --filter 'rec.info.dp>5 and rec.alt.size==1 and rec.tumor.bq[rec.alt]>30 and rec.tumor.mq>20' < file.vcf
|
32
|
+
```
|
33
|
+
|
34
|
+
To output specific fields in tabular (and HTML, XML or LaTeX) format
|
35
|
+
use the --eval switch, e.g.,
|
36
|
+
|
37
|
+
```ruby
|
38
|
+
bio-vcf --eval 'rec.alt+"\t"+rec.info.dp+"\t"+rec.tumor.gq.to_s' < file.vcf
|
39
|
+
```
|
40
|
+
|
41
|
+
In fact, if the result is an Array the output gets tab delimited so
|
42
|
+
the nicer version is
|
43
|
+
|
44
|
+
```ruby
|
45
|
+
bio-vcf --eval '[r.alt,r.info.dp,r.tumor.gq.to_s]' < file.vcf
|
46
|
+
```
|
47
|
+
|
48
|
+
To output the DP values of every sample that has a depth larger than
|
49
|
+
100:
|
50
|
+
|
51
|
+
```ruby
|
52
|
+
bio-vcf -i --sfilter 's.dp>100' --seval 's.dp' < file.vcf
|
53
|
+
|
54
|
+
1 10257 159 242 249 249 186 212 218
|
55
|
+
1 10291 165 249 249 247 161 163 189
|
56
|
+
1 10297 182 246 250 246 165 158 183
|
57
|
+
1 10303 198 247 248 248 172 157 182
|
58
|
+
(etc.)
|
59
|
+
```
|
60
|
+
|
61
|
+
Where -i ignores missing samples. Pick up sample allele depth
|
62
|
+
|
63
|
+
```ruby
|
64
|
+
bio-vcf -i --seval 's.ad'
|
65
|
+
1 10257 151,8 219,22 227,22 226,22 166,18 185,27 201,15
|
66
|
+
1 10291 145,16 218,26 214,30 213,32 122,36 131,27 156,31
|
67
|
+
1 10297 155,18 218,23 219,26 207,30 137,20 124,27 151,27
|
68
|
+
```
|
69
|
+
|
70
|
+
And to output DP and GQ values for tumor normal:
|
71
|
+
|
72
|
+
```ruby
|
73
|
+
bio-vcf --filter 'r.normal.dp>=7 and r.tumor.dp>=5' --seval '[s.dp,s.gq]' < freebayes.vcf
|
74
|
+
|
75
|
+
17 45235620 22 139.35 20 0
|
76
|
+
17 45235635 20 137.224 14 41.5688
|
77
|
+
17 45235653 18 146.509 12 146.509
|
78
|
+
17 45247354 32 0 9 6.59312
|
79
|
+
17 45247362 27 0 6 110.097
|
80
|
+
|
81
|
+
```
|
82
|
+
|
83
|
+
To parse and output genotype
|
8
84
|
|
9
85
|
```ruby
|
10
|
-
|
86
|
+
bio-vcf -iq --sfilter 's.dp>=20 and s.gq>=20' --ifilter-sampler 's.gt!="0/0"' --seval s.gt < test/data/input/multisample.vcf
|
87
|
+
1 10257 0/0 0/0 0/0 0/0 0/0 0/1 0/0
|
88
|
+
1 10291 0/1 0/1 0/1 0/1 0/1 0/1 0/1
|
89
|
+
1 10297 0/1 0/1 0/1 0/0 0/0 0/1 0/1
|
90
|
+
1 12783 0/1 0/1 0/1 0/1 0/1 0/1 0/1
|
11
91
|
```
|
12
92
|
|
93
|
+
Most filter and eval commands can be used at the same time. Special set
|
94
|
+
commands exist for filtering and eval. When a set is defined, based on
|
95
|
+
the sample name, you can apply filters on the samples inside the set,
|
96
|
+
outside the set and over all samples. E.g.
|
97
|
+
|
98
|
+
Also note you can use
|
99
|
+
[bio-table](https://github.com/pjotrp/bioruby-table) to
|
100
|
+
filter/transform data further and convert to other formats, such as
|
101
|
+
RDF.
|
102
|
+
|
13
103
|
The VCF format is commonly used for variant calling between NGS
|
14
104
|
samples. The fast parser needs to carry some state, recorded for each
|
15
105
|
file in VcfHeader, which contains the VCF file header. Individual
|
@@ -18,17 +108,18 @@ of fields. Further (lazy) parsing is handled through VcfRecord.
|
|
18
108
|
|
19
109
|
At this point the filter is pretty generic with multi-sample support.
|
20
110
|
If something is not working, check out the feature descriptions and
|
21
|
-
the source code. It is not hard to add features. Otherwise, send
|
111
|
+
the source code. It is not hard to add features. Otherwise, send a short
|
22
112
|
example of a VCF statement you need to work on.
|
23
113
|
|
114
|
+
bio-vcf is fast. Parsing a 55K line DbSNP file (22Mb) takes 1.5 seconds on a
|
115
|
+
MacBook Pro running 64-bit Linux (Ruby 2.1.0).
|
116
|
+
|
24
117
|
## Installation
|
25
118
|
|
26
119
|
```sh
|
27
120
|
gem install bio-vcf
|
28
121
|
```
|
29
122
|
|
30
|
-
## Quick start
|
31
|
-
|
32
123
|
## Command line interface (CLI)
|
33
124
|
|
34
125
|
Get the version of the VCF file
|
@@ -56,13 +147,13 @@ The 'fields' array contains unprocessed data (strings). Print first
|
|
56
147
|
five raw fields
|
57
148
|
|
58
149
|
```ruby
|
59
|
-
bio-vcf --eval 'fields[0..4]
|
150
|
+
bio-vcf --eval 'fields[0..4]' < file.vcf
|
60
151
|
```
|
61
152
|
|
62
153
|
Add a filter to display the fields on chromosome 12
|
63
154
|
|
64
155
|
```ruby
|
65
|
-
bio-vcf --filter 'fields[0]=="12"' --eval 'fields[0..4]
|
156
|
+
bio-vcf --filter 'fields[0]=="12"' --eval 'fields[0..4]' < file.vcf
|
66
157
|
```
|
67
158
|
|
68
159
|
It gets better when we start using processed data, represented by an
|
@@ -72,6 +163,19 @@ object named 'rec'. Position is a value, so we can filter a range
|
|
72
163
|
bio-vcf --filter 'rec.chrom=="12" and rec.pos>96_641_270 and rec.pos<96_641_276' < file.vcf
|
73
164
|
```
|
74
165
|
|
166
|
+
The shorter name for 'rec.chrom' is 'r.chrom', so you may write
|
167
|
+
|
168
|
+
```ruby
|
169
|
+
bio-vcf --filter 'r.chrom=="12" and r.pos>96_641_270 and r.pos<96_641_276' < file.vcf
|
170
|
+
```
|
171
|
+
|
172
|
+
To ignore and continue parsing on missing data use the
|
173
|
+
--ignore-missing (-i) and/or --quiet (-q) switches
|
174
|
+
|
175
|
+
```ruby
|
176
|
+
bio-vcf -i --filter 'r.chrom=="12" and r.pos>96_641_270 and r.pos<96_641_276' < file.vcf
|
177
|
+
```
|
178
|
+
|
75
179
|
Info fields are referenced by
|
76
180
|
|
77
181
|
```ruby
|
@@ -118,26 +222,287 @@ Similar for base quality scores
|
|
118
222
|
bio-vcf --filter 'rec.alt.size==1 and rec.tumor.amq[rec.alt]>30' < test.vcf
|
119
223
|
```
|
120
224
|
|
225
|
+
Filter on sample values
|
226
|
+
|
227
|
+
```ruby
|
228
|
+
bio-vcf --sfilter 's.dp>20' < test.vcf
|
229
|
+
```
|
230
|
+
|
231
|
+
To filter on missing samples:
|
232
|
+
|
233
|
+
```sh
|
234
|
+
bio-vcf --filter "rec.s3t2?" < file.vcf
|
235
|
+
```
|
236
|
+
|
237
|
+
or for all
|
238
|
+
|
239
|
+
```sh
|
240
|
+
bio-vcf --filter "rec.missing_samples?" < file.vcf
|
241
|
+
```
|
242
|
+
|
243
|
+
Likewise you can check for record validity
|
244
|
+
|
245
|
+
```sh
|
246
|
+
bio-vcf --filter "not rec.valid?" < file.vcf
|
247
|
+
```
|
248
|
+
|
249
|
+
which, at this point, simply counts the number of fields.
|
250
|
+
|
121
251
|
If your samples have other names you can fetch genotypes for that
|
122
252
|
sample with
|
123
253
|
|
124
254
|
```sh
|
125
|
-
bio-vcf --eval "rec.sample['
|
255
|
+
bio-vcf --eval "rec.sample['Original'].gt" < file.vcf
|
126
256
|
```
|
127
257
|
|
128
258
|
Or read depth for another
|
129
259
|
|
130
260
|
```sh
|
131
|
-
bio-vcf --eval "rec.sample['
|
261
|
+
bio-vcf --eval "rec.sample['s3t2'].dp" < file.vcf
|
132
262
|
```
|
133
263
|
|
134
264
|
Better even, you can access samples directly with
|
135
265
|
|
136
266
|
```sh
|
137
|
-
bio-vcf --eval "rec.sample.
|
138
|
-
bio-vcf --eval "rec.sample.
|
267
|
+
bio-vcf --eval "rec.sample.original.gt" < file.vcf
|
268
|
+
bio-vcf --eval "rec.sample.s3t2.dp" < file.vcf
|
269
|
+
```
|
270
|
+
|
271
|
+
And even better because of Ruby magic
|
272
|
+
|
273
|
+
```sh
|
274
|
+
bio-vcf --eval "rec.original.gt" < file.vcf
|
275
|
+
bio-vcf --eval "rec.s3t2.dp" < file.vcf
|
276
|
+
```
|
277
|
+
|
278
|
+
Note that only valid method names in lower case get picked up this
|
279
|
+
way. Also by convention normal is sample 1 and tumor is sample 2.
|
280
|
+
|
281
|
+
Even shorter r is an alias for rec (nyi)
|
282
|
+
|
283
|
+
```sh
|
284
|
+
bio-vcf --eval "r.original.gt" < file.vcf
|
285
|
+
bio-vcf --eval "r.s3t2.dp" < file.vcf
|
286
|
+
```
|
287
|
+
|
288
|
+
## Special functions
|
289
|
+
|
290
|
+
Note: special functions are not yet implemented!
|
291
|
+
|
292
|
+
Sometimes you want to use a special function in a filter. For
|
293
|
+
example percentage variant reads can be defined as [a,c,g,t]
|
294
|
+
with frequencies against sample read depth (dp) as
|
295
|
+
[0,0.03,0.47,0.50]. Filtering would work with a special function,
|
296
|
+
which we named freq
|
297
|
+
|
298
|
+
```sh
|
299
|
+
bio-vcf --sfilter "s.freq(2)>0.30" < file.vcf
|
300
|
+
```
|
301
|
+
|
302
|
+
which is equal to
|
303
|
+
|
304
|
+
```sh
|
305
|
+
bio-vcf --sfilter "s.freq.g>0.30" < file.vcf
|
306
|
+
```
|
307
|
+
|
308
|
+
To check for ref or variant frequencies use more sugar
|
309
|
+
|
310
|
+
```sh
|
311
|
+
bio-vcf --sfilter "s.freq.var>0.30 and s.freq.ref<0.10" < file.vcf
|
312
|
+
```
|
313
|
+
|
314
|
+
For all includes var should be identical for set analysis except for
|
315
|
+
cartesian. So when --include is defined test for identical var and in
|
316
|
+
the case of cartesian one unique var, when tested.
|
317
|
+
|
318
|
+
ref should always be identical across samples.
|
319
|
+
|
320
|
+
## DbSNP
|
321
|
+
|
322
|
+
One clinical variant DbSNP example
|
323
|
+
|
324
|
+
```sh
|
325
|
+
bio-vcf --eval '[rec.id,rec.chr,rec.pos,rec.alt,rec.info.sao,rec.info.CLNDBN].join("\t")' < clinvar_20140303.vcf
|
326
|
+
```
|
327
|
+
|
328
|
+
renders
|
329
|
+
|
330
|
+
```
|
331
|
+
1 1916905 rs267598254 A 3 Malignant_melanoma
|
332
|
+
1 1916906 rs267598255 A 3 Malignant_melanoma
|
333
|
+
1 1959075 rs121434580 C 1 Generalized_epilepsy_with_febrile_seizures_plus_type_5
|
334
|
+
1 1959699 rs41307846 A 1 Generalized_epilepsy_with_febrile_seizures_plus_type_5|Epilepsy\x2c_juvenile_myoclonic_7|Epilepsy\x2c_idiopathic_generalized_10
|
335
|
+
1 1961453 rs142619552 T 3 Malignant_melanoma
|
336
|
+
1 2160299 rs387907304 G 0 Shprintzen-Goldberg_syndrome
|
337
|
+
1 2160305 rs387907306 A T 0 Shprintzen-Goldberg_syndrome,Shprintzen-Goldberg_syndrome
|
338
|
+
1 2160306 rs387907305 A T 0 Shprintzen-Goldberg_syndrome,Shprintzen-Goldberg_syndrome
|
339
|
+
1 2160308 rs397514590 T 0 Shprintzen-Goldberg_syndrome
|
340
|
+
1 2160309 rs397514589 A 0 Shprintzen-Goldberg_syndrome
|
341
|
+
```
|
342
|
+
|
343
|
+
## Set analysis
|
344
|
+
|
345
|
+
bio-vcf allows for set analysis. With the complement filter, for
|
346
|
+
example, samples are selected that evaluate to true, all others should
|
347
|
+
evaluate to false. For this we create three filters, one for all
|
348
|
+
samples that are included (the --ifilter or -if), for all samples that
|
349
|
+
are excluded (the --efilter or -ef) and for any sample (the --sfilter
|
350
|
+
or -sf). So i=include, e=exclude and s=any sample.
|
351
|
+
|
352
|
+
The equivalent of the union filter is by using the --sfilter, so
|
353
|
+
|
354
|
+
```sh
|
355
|
+
bio-vcf --sfilter 's.dp>20'
|
356
|
+
```
|
357
|
+
|
358
|
+
Filters DP on all samples. To filter on a subset you can add a
|
359
|
+
selector
|
360
|
+
|
361
|
+
```sh
|
362
|
+
bio-vcf --sfilter-samples 0,1,4 --sfilter 's.dp>20'
|
363
|
+
```
|
364
|
+
|
365
|
+
For set analysis there are the additional ifilter (include) and efilter (exclude). To filter
|
366
|
+
on samples 0,1,4 and output the gq values
|
367
|
+
|
368
|
+
```sh
|
369
|
+
bio-vcf -i --ifilter-samples 0,1,4 --ifilter 's.gq<10 or s.gq==99' --seval s.gq
|
370
|
+
1 14907 99 99 99 99 99 99 99
|
371
|
+
1 14930 99 99 99 99 99 99 99
|
372
|
+
1 14933 1 99 99 39 99 99 99
|
373
|
+
1 15190 99 99 91 99 99 99 99
|
374
|
+
1 15211 99 99 99 99 99 99 99
|
375
|
+
```
|
376
|
+
|
377
|
+
The equivalent of the complement filter is by specifying what samples
|
378
|
+
to include, here with a regex and define filters on the included
|
379
|
+
and excluded samples (the ones not in ifilter-samples):
|
380
|
+
|
381
|
+
```sh
|
382
|
+
./bin/bio-vcf -i --sfilter 's.dp>20' --ifilter-samples 2,4 --ifilter 's.gt==r.s1t1.gt'
|
383
|
+
```
|
384
|
+
|
385
|
+
To print out the GT's add --seval
|
386
|
+
|
387
|
+
```sh
|
388
|
+
bio-vcf -i --sfilter 's.dp>20' --ifilter-samples 2,4 --ifilter 's.gt==r.s1t1.gt' --seval 's.gt'
|
389
|
+
1 14673 0/1 0/1 0/1 0/1 0/1 0/1 0/1
|
390
|
+
1 14907 0/1 0/1 0/1 0/1 0/1 0/1 0/1
|
391
|
+
1 14930 0/1 0/1 0/1 0/1 0/1 0/1 0/1
|
392
|
+
1 15211 0/1 0/1 0/1 0/1 0/1 0/1 0/1
|
393
|
+
1 15274 1/2 1/2 1/2 1/2 1/2 1/2 1/2
|
394
|
+
1 16103 0/1 0/1 0/1 0/1 0/1 0/1 0/1
|
139
395
|
```
|
140
396
|
|
397
|
+
To set an additional filter on the excluded samples:
|
398
|
+
|
399
|
+
```sh
|
400
|
+
bio-vcf -i --ifilter-samples 0,1,4 --ifilter 's.gt==rec.s1t1.gt and s.gq>10' --seval s.gq --efilter 's.gq==99'
|
401
|
+
```
|
402
|
+
|
403
|
+
Etc. etc. Any combination of sfilter, ifilter and efilter is possible.
|
404
|
+
|
405
|
+
The following are not yet implemented:
|
406
|
+
|
407
|
+
In the near future it is also possible to select samples on a regex (here
|
408
|
+
select all samples where the name starts with s3)
|
409
|
+
|
410
|
+
```sh
|
411
|
+
bio-vcf --isample-regex '/^s3/' --ifilter 's.dp>20'
|
412
|
+
```
|
413
|
+
|
414
|
+
```sh
|
415
|
+
bio-vcf --include /s3.+/ --sfilter 'dp>20' --ifilter 'gt==s3t1.gt' --efilter 'gt!=s3t1.gt'
|
416
|
+
--set-intersect include=true
|
417
|
+
bio-vcf --include /s3.+/ --sample-regex /^t2/ --sfilter 'dp>20' --ifilter 'gt==s3t1.gt'
|
418
|
+
--set-cartesian one in include=true, rest=false
|
419
|
+
bio-vcf --unique-sample (any) --include /s3.+/ --sfilter 'dp>20' --ifilter 'gt!="0/0"'
|
420
|
+
```
|
421
|
+
|
422
|
+
With the filter commands you can use --ignore-missing to skip errors.
|
423
|
+
|
424
|
+
## Genotype processing
|
425
|
+
|
426
|
+
The sample GT field counts 0 as the reference and numbers >=1 as
|
427
|
+
indexed ALT values. The field is simply built up using a slash or | as
|
428
|
+
a separator (e.g., 0/1, 0|2, ./. are valid values). The standard field
|
429
|
+
results in a string value
|
430
|
+
|
431
|
+
```ruby
|
432
|
+
bio-vcf --seval s.gt
|
433
|
+
1 10665 ./. ./. 0/1 0/1 ./. 0/0 0/0
|
434
|
+
1 10694 ./. ./. 1/1 1/1 ./. ./. ./.
|
435
|
+
1 12783 0/1 0/1 0/1 0/1 0/1 0/1 0/1
|
436
|
+
1 15274 1/2 1/2 1/2 1/2 1/2 1/2 1/2
|
437
|
+
```
|
438
|
+
|
439
|
+
to access components of the genotype field we can use standard Ruby
|
440
|
+
|
441
|
+
```ruby
|
442
|
+
bio-vcf --seval 's.gt.split(/\//)[0]'
|
443
|
+
1 10665 . . 0 0 . 0 0
|
444
|
+
1 10694 . . 1 1 . . .
|
445
|
+
1 12783 0 0 0 0 0 0 0
|
446
|
+
1 15274 1 1 1 1 1 1 1
|
447
|
+
```
|
448
|
+
|
449
|
+
or special functions, such as 'gti' which gives the genotype as an
|
450
|
+
indexed value array
|
451
|
+
|
452
|
+
```ruby
|
453
|
+
bio-vcf --seval 's.gti[0]'
|
454
|
+
1 10665 0 0 0 0
|
455
|
+
1 10694 1 1
|
456
|
+
1 12783 0 0 0 0 0 0 0
|
457
|
+
1 15274 1 1 1 1 1 1 1
|
458
|
+
```
|
459
|
+
|
460
|
+
and 'gts' as a nucleotide string array
|
461
|
+
|
462
|
+
```ruby
|
463
|
+
bio-vcf --seval 's.gts[0]'
|
464
|
+
1 10665 C C C C
|
465
|
+
1 10694 G G
|
466
|
+
1 12783 G G G G G G G
|
467
|
+
1 15274 G G G G G G G
|
468
|
+
```
|
469
|
+
|
470
|
+
These values can also be used in filters and output allele depth, for
|
471
|
+
example
|
472
|
+
|
473
|
+
```ruby
|
474
|
+
bio-vcf -vi --ifilter 'rec.original.gt!="0/1"' --efilter 'rec.original.gt=="0/0"' --seval 'rec.original.ad[s.gti[1]]'
|
475
|
+
1 10257 151 151 151 151 151 8 151
|
476
|
+
1 13302 26 10 10 10 10 10 10
|
477
|
+
1 13757 47 47 4 47 47 4 47
|
478
|
+
```
|
479
|
+
|
480
|
+
The following does not yet work (using the gti in a sample directly)
|
481
|
+
|
482
|
+
```ruby
|
483
|
+
bio-vcf -vi --ifilter 'rec.original.gt!="0/1"' --efilter 'rec.original.gti[0]==0' --seval 'rec.original.ad[s.gti[1]]'
|
484
|
+
```
|
485
|
+
|
486
|
+
## Modify VCF files
|
487
|
+
|
488
|
+
Add or modify the sample file name in the INFO fields:
|
489
|
+
|
490
|
+
```sh
|
491
|
+
bio-vcf --rewrite 'rec.info["sample"]="mytest"' < mytest.vcf
|
492
|
+
```
|
493
|
+
|
494
|
+
To remove/select 3 samples and create a new file:
|
495
|
+
|
496
|
+
```sh
|
497
|
+
bio-vcf --samples 0,1,3 < mytest.vcf
|
498
|
+
```
|
499
|
+
|
500
|
+
## RDF output
|
501
|
+
|
502
|
+
Use [bio-table](https://github.com/pjotrp/bioruby-table) to convert tabular data to RDF.
|
503
|
+
|
504
|
+
## Other examples
|
505
|
+
|
141
506
|
For more examples see the feature [section](https://github.com/pjotrp/bioruby-vcf/tree/master/features).
|
142
507
|
|
143
508
|
## API
|