bio-locus 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/.rspec +1 -0
- data/.travis.yml +13 -0
- data/Gemfile +14 -0
- data/LICENSE.txt +20 -0
- data/README.md +162 -0
- data/Rakefile +52 -0
- data/VERSION +1 -0
- data/bin/bio-locus +117 -0
- data/features/bio-locus.feature +9 -0
- data/features/step_definitions/bio-locus_steps.rb +0 -0
- data/features/support/env.rb +13 -0
- data/lib/bio-locus.rb +12 -0
- data/lib/bio-locus/locus.rb +49 -0
- data/lib/bio-locus/match.rb +43 -0
- data/lib/bio-locus/store.rb +29 -0
- data/spec/bio-locus_spec.rb +7 -0
- data/spec/spec_helper.rb +12 -0
- data/test/data/input/dbsnp.vcf +200 -0
- data/test/data/input/multisample.vcf +150 -0
- data/test/data/input/somaticsniper.vcf +87 -0
- data/test/data/input/test.snv1 +106 -0
- metadata +139 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 33548bcc8a3474a7e1d3ebbec6c4cfe2472d4af9
|
4
|
+
data.tar.gz: e7c7f93a5638a79f2052142d472dbad3dc57b334
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 04b8576748a2f324c7e4de0224c1b7409e28da8e2e2697e5bf0da60a55db0a8994a6185649d4c3b68930e7d617732e92edd8aed54cba4646cea14ded86f09be6
|
7
|
+
data.tar.gz: 2b17b04ef00b04a37a1d272ee4a10137fc50392a108850671f13d01a777755674413c6bf21e26843a700168d57484cb0fd150c0a054adf012d4c2577f973e6a8
|
data/.document
ADDED
data/.rspec
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--color
|
data/.travis.yml
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
language: ruby
|
2
|
+
rvm:
|
3
|
+
- 1.9.2
|
4
|
+
- 1.9.3
|
5
|
+
- jruby-19mode # JRuby in 1.9 mode
|
6
|
+
|
7
|
+
# - rbx-19mode
|
8
|
+
# - 1.8.7
|
9
|
+
# - jruby-18mode # JRuby in 1.8 mode
|
10
|
+
# - rbx-18mode
|
11
|
+
|
12
|
+
# uncomment this line if your project needs to run something other than `rake`:
|
13
|
+
# script: bundle exec rspec spec
|
data/Gemfile
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
source "https://rubygems.org" # fetch gems over TLS rather than plain HTTP
|
2
|
+
# Add dependencies required to use your gem here.
|
3
|
+
# Example:
|
4
|
+
# gem "activesupport", ">= 2.3.5"
|
5
|
+
|
6
|
+
# Add dependencies to develop your gem here.
|
7
|
+
# Include everything needed to run rake, tests, features, etc.
|
8
|
+
group :development do
|
9
|
+
gem "cucumber"
|
10
|
+
gem "jeweler"
|
11
|
+
gem "bundler"
|
12
|
+
end
|
13
|
+
gem "localmemcache"
|
14
|
+
gem "moneta"
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2014 Pjotr Prins
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,162 @@
|
|
1
|
+
# bio-locus
|
2
|
+
|
3
|
+
[![Build Status](https://secure.travis-ci.org/pjotrp/bioruby-locus.png)](http://travis-ci.org/pjotrp/bioruby-locus)
|
4
|
+
|
5
|
+
Bio-locus is a tool for fast querying of genome locations. Many file
|
6
|
+
formats in bioinformatics contain records that start with a chromosome
|
7
|
+
name and a position for a SNP, or a start-end position for indels.
|
8
|
+
|
9
|
+
This tool essentially allows you to store this information in a Hash
|
10
|
+
or database:
|
11
|
+
|
12
|
+
```sh
|
13
|
+
bio-locus --store < one.vcf
|
14
|
+
```
|
15
|
+
|
16
|
+
which creates or adds to a cache file or database with unique entries
|
17
|
+
for all listed positions (chr+pos) AND for all listed positions with
|
18
|
+
listed alt alleles. To find positions in another dataset which match
|
19
|
+
those in the database:
|
20
|
+
|
21
|
+
```sh
|
22
|
+
bio-locus --match < two.vcf
|
23
|
+
```
|
24
|
+
|
25
|
+
The point is that this is a two-step process, first create the
|
26
|
+
indexed database, next query it. It is also possible to remove entries
|
27
|
+
with the --delete switch.
|
28
|
+
|
29
|
+
To match with alt use
|
30
|
+
|
31
|
+
```sh
|
32
|
+
bio-locus --match --include-alt < two.vcf
|
33
|
+
```
|
34
|
+
|
35
|
+
Why would you use bio-locus?
|
36
|
+
|
37
|
+
* To reduce the size of large SNP databases before storage/querying
|
38
|
+
* To gain performance
|
39
|
+
* To filter on chr+pos (default)
|
40
|
+
* To filter on chr+pos+field (where field can be a VCF ALT)
|
41
|
+
|
42
|
+
Use cases are
|
43
|
+
|
44
|
+
* To filter for annotated variants
|
45
|
+
* To remove common variants from a set
|
46
|
+
|
47
|
+
In short a more targeted approach allowing you to work with less data. This
|
48
|
+
tool is decently fast. For example, looking for 130 positions in 20 million SNPs
|
49
|
+
in GoNL takes 0.11s to store and 1.5 minutes to match on my laptop:
|
50
|
+
|
51
|
+
```sh
|
52
|
+
cat my_130_variants.vcf | ./bin/bio-locus --store
|
53
|
+
Stored 130 positions out of 130 in locus.db
|
54
|
+
real 0m0.119s
|
55
|
+
user 0m0.108s
|
56
|
+
sys 0m0.012s
|
57
|
+
|
58
|
+
cat gonl.*.vcf |./bin/bio-locus --match
|
59
|
+
Matched 3 out of 20736323 lines in locus.db!
|
60
|
+
real 1m34.577s
|
61
|
+
user 1m33.602s
|
62
|
+
sys 0m1.868s
|
63
|
+
```
|
64
|
+
|
65
|
+
Note: for the storage the [moneta](https://github.com/minad/moneta) gem is used, currently with localmemcache.
|
66
|
+
|
67
|
+
Note: the ALT field is split into components for matching, so A,C
|
68
|
+
becomes two chr+pos records, one for A and one for C.
|
69
|
+
|
70
|
+
## Installation
|
71
|
+
|
72
|
+
```sh
|
73
|
+
gem install bio-locus
|
74
|
+
```
|
75
|
+
|
76
|
+
## Command line
|
77
|
+
|
78
|
+
In addition to --store and --match mentioned above there are a number
|
79
|
+
of options available through
|
80
|
+
|
81
|
+
```sh
|
82
|
+
bio-locus --help
|
83
|
+
```
|
84
|
+
|
85
|
+
### Deleting keys
|
86
|
+
|
87
|
+
To delete entries use
|
88
|
+
|
89
|
+
```sh
|
90
|
+
bio-locus --delete < two.vcf
|
91
|
+
```
|
92
|
+
|
93
|
+
To delete with alt use
|
94
|
+
|
95
|
+
```sh
|
96
|
+
bio-locus --delete --include-alt < two.vcf
|
97
|
+
```
|
98
|
+
|
99
|
+
You may need to run both with and without alt, depending on your needs!
|
100
|
+
|
101
|
+
### Parsing
|
102
|
+
|
103
|
+
It is possible to use any line based format. For example parsing the
|
104
|
+
alt from
|
105
|
+
|
106
|
+
```
|
107
|
+
X 107976940 G/C -1 5 5 0.75 H879D 0 IRS4 CCDS14544 Cat/Gat rs1801164 missense_variant ENST00000372129.2:c.2635C>G
|
108
|
+
```
|
109
|
+
|
110
|
+
can be done with
|
111
|
+
|
112
|
+
```sh
|
113
|
+
bio-locus --store --eval-alt 'field[2].split(/\//)[1]'
|
114
|
+
```
|
115
|
+
|
116
|
+
### COSMIC
|
117
|
+
|
118
|
+
COSMIC is pretty large, so it can be useful to cut the database down to the
|
119
|
+
variants that you have. The locus information is combined
|
120
|
+
in the second-to-last column as chr:start-end, e.g.,
|
121
|
+
19:58861911-58861911. This will work:
|
122
|
+
|
123
|
+
```sh
|
124
|
+
bio-locus -i --match --eval-chr='field[13] =~ /^([^:]+)/ ; $1' --eval-pos='field[13] =~ /:(\d+)-/ ; $1 ' < CosmicMutantExportIncFus_v68.tsv
|
125
|
+
```
|
126
|
+
|
127
|
+
Note the -i switch is needed to skip records that lack position
|
128
|
+
information.
|
129
|
+
|
130
|
+
## Usage
|
131
|
+
|
132
|
+
```ruby
|
133
|
+
require 'bio-locus'
|
134
|
+
```
|
135
|
+
|
136
|
+
The API doc is online. For more code examples see the test files in
|
137
|
+
the source tree.
|
138
|
+
|
139
|
+
## Project home page
|
140
|
+
|
141
|
+
For information on the source tree, documentation, examples, issues and
|
142
|
+
how to contribute, see
|
143
|
+
|
144
|
+
http://github.com/pjotrp/bioruby-locus
|
145
|
+
|
146
|
+
The BioRuby community is on IRC server: irc.freenode.org, channel: #bioruby.
|
147
|
+
|
148
|
+
## Cite
|
149
|
+
|
150
|
+
If you use this software, please cite one of
|
151
|
+
|
152
|
+
* [BioRuby: bioinformatics software for the Ruby programming language](http://dx.doi.org/10.1093/bioinformatics/btq475)
|
153
|
+
* [Biogem: an effective tool-based approach for scaling up open source software development in bioinformatics](http://dx.doi.org/10.1093/bioinformatics/bts080)
|
154
|
+
|
155
|
+
## Biogems.info
|
156
|
+
|
157
|
+
This Biogem is published at [biogems.info](http://biogems.info/index.html#bio-locus)
|
158
|
+
|
159
|
+
## Copyright
|
160
|
+
|
161
|
+
Copyright (c) 2014 Pjotr Prins. See LICENSE.txt for further details.
|
162
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'bundler'
|
5
|
+
begin
|
6
|
+
Bundler.setup(:default, :development)
|
7
|
+
rescue Bundler::BundlerError => e
|
8
|
+
$stderr.puts e.message
|
9
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
10
|
+
exit e.status_code
|
11
|
+
end
|
12
|
+
require 'rake'
|
13
|
+
|
14
|
+
require 'jeweler'
|
15
|
+
Jeweler::Tasks.new do |gem|
|
16
|
+
# gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
|
17
|
+
gem.name = "bio-locus"
|
18
|
+
gem.homepage = "http://github.com/pjotrp/bioruby-locus"
|
19
|
+
gem.license = "MIT"
|
20
|
+
gem.summary = %Q{Fast storage and comparison of chr+pos(+alt) locations}
|
21
|
+
gem.description = %Q{A tool for fast querying and filtering of genome locations in VCF and other formats}
|
22
|
+
gem.email = "pjotr.public01@thebird.nl"
|
23
|
+
gem.authors = ["Pjotr Prins"]
|
24
|
+
# dependencies defined in Gemfile
|
25
|
+
end
|
26
|
+
Jeweler::RubygemsDotOrgTasks.new
|
27
|
+
|
28
|
+
# require 'rspec/core'
|
29
|
+
# require 'rspec/core/rake_task'
|
30
|
+
# RSpec::Core::RakeTask.new(:spec) do |spec|
|
31
|
+
# spec.pattern = FileList['spec/**/*_spec.rb']
|
32
|
+
# end
|
33
|
+
|
34
|
+
# RSpec::Core::RakeTask.new(:rcov) do |spec|
|
35
|
+
# spec.pattern = 'spec/**/*_spec.rb'
|
36
|
+
# spec.rcov = true
|
37
|
+
# end
|
38
|
+
|
39
|
+
require 'cucumber/rake/task'
|
40
|
+
Cucumber::Rake::Task.new(:features)
|
41
|
+
|
42
|
+
task :default => :features # :spec is not defined (its RSpec tasks are commented out), so default to the Cucumber features
|
43
|
+
|
44
|
+
require 'rdoc/task'
|
45
|
+
Rake::RDocTask.new do |rdoc|
|
46
|
+
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
47
|
+
|
48
|
+
rdoc.rdoc_dir = 'rdoc'
|
49
|
+
rdoc.title = "bio-locus #{version}"
|
50
|
+
rdoc.rdoc_files.include('README*')
|
51
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
52
|
+
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.0.2
|
data/bin/bio-locus
ADDED
@@ -0,0 +1,117 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
|
4
|
+
USAGE = "Use --help for info\n"
|
5
|
+
|
6
|
+
gempath = File.dirname(File.dirname(__FILE__))
|
7
|
+
$: << File.join(gempath,'lib')
|
8
|
+
|
9
|
+
VERSION_FILENAME=File.join(gempath,'VERSION')
|
10
|
+
version = File.read(VERSION_FILENAME).chomp # File.read closes the handle; File.new(...).read left it open
|
11
|
+
|
12
|
+
if ARGV.size == 0
|
13
|
+
print USAGE
|
14
|
+
end
|
15
|
+
|
16
|
+
require 'bio-locus'
|
17
|
+
require 'optparse'
|
18
|
+
|
19
|
+
options = {task: nil, db: 'locus.db', show_help: false, header: 1}
|
20
|
+
opts = OptionParser.new do |o|
|
21
|
+
o.banner = "Usage: #{File.basename($0)} [options] filename\ne.g. #{File.basename($0)} test.txt"
|
22
|
+
|
23
|
+
o.on("--store", 'Create or add to a cache file') do
|
24
|
+
options[:task] = :store
|
25
|
+
options[:include_alt] = true # always include alt
|
26
|
+
end
|
27
|
+
|
28
|
+
o.on("--delete", 'Remove matches from a cache file') do
|
29
|
+
options[:task] = :delete
|
30
|
+
end
|
31
|
+
|
32
|
+
o.on("--match", 'Match a cache file') do
|
33
|
+
options[:task] = :match
|
34
|
+
end
|
35
|
+
|
36
|
+
o.on("--include-alt", 'Include chr+pos+ALT VCF field to filter') do
|
37
|
+
options[:include_alt] = true
|
38
|
+
end
|
39
|
+
|
40
|
+
o.on("--exclude-alt", 'Override adding chr+pos+ALT field to store') do
|
41
|
+
options[:exclude_alt] = true
|
42
|
+
end
|
43
|
+
|
44
|
+
|
45
|
+
o.on("--db filename",String,"Use db file") do | fn |
|
46
|
+
options[:db] = fn
|
47
|
+
end
|
48
|
+
|
49
|
+
o.on("--eval-chr expr",String,"Evaluate record to retrieve chr name") do | expr |
|
50
|
+
options[:eval_chr] = expr
|
51
|
+
end
|
52
|
+
|
53
|
+
o.on("--eval-pos expr",String,"Evaluate record to retrieve position") do | expr |
|
54
|
+
options[:eval_pos] = expr
|
55
|
+
end
|
56
|
+
|
57
|
+
o.on("--eval-alt expr",String,"Evaluate record to retrieve alt list") do | expr |
|
58
|
+
options[:eval_alt] = expr
|
59
|
+
end
|
60
|
+
|
61
|
+
o.on("--header num", "Header lines (default 1)") do |l|
|
62
|
+
options[:header] = l.to_i
|
63
|
+
end
|
64
|
+
|
65
|
+
o.on("-q", "--quiet", "Run quietly") do |q|
|
66
|
+
options[:quiet] = true
|
67
|
+
end
|
68
|
+
|
69
|
+
o.on("-v", "--verbose", "Run verbosely") do |v|
|
70
|
+
options[:verbose] = true
|
71
|
+
end
|
72
|
+
|
73
|
+
o.on("-d", "--debug", "Debug mode") do |v|
|
74
|
+
options[:debug] = true
|
75
|
+
end
|
76
|
+
|
77
|
+
o.on("-i", "--ignore-errors", "Continue on error") do
|
78
|
+
options[:ignore_errors] = true
|
79
|
+
end
|
80
|
+
|
81
|
+
|
82
|
+
o.separator ""
|
83
|
+
o.on_tail('-h', '--help', 'display this help and exit') do
|
84
|
+
options[:show_help] = true
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
begin
|
89
|
+
opts.parse!(ARGV)
|
90
|
+
|
91
|
+
$stderr.print "bio-locus #{version} (biogem Ruby #{RUBY_VERSION}) by Pjotr Prins 2014\n" if !options[:quiet]
|
92
|
+
|
93
|
+
if options[:show_help]
|
94
|
+
print opts
|
95
|
+
print USAGE
|
96
|
+
exit 1
|
97
|
+
end
|
98
|
+
|
99
|
+
$stderr.print "Options: ",options,"\n" if !options[:quiet]
|
100
|
+
|
101
|
+
rescue OptionParser::InvalidOption => e
|
102
|
+
options[:invalid_argument] = e.message
|
103
|
+
end
|
104
|
+
|
105
|
+
options[:header].times { STDIN.gets } # skip exactly --header lines; the inclusive 0..n range consumed one line too many
|
106
|
+
|
107
|
+
case options[:task]
|
108
|
+
when :store then
|
109
|
+
require 'bio-locus/store'
|
110
|
+
options[:include_alt]=false if options[:exclude_alt]
|
111
|
+
BioLocus::Store.run(options)
|
112
|
+
when :match ,:delete then
|
113
|
+
require 'bio-locus/match'
|
114
|
+
BioLocus::Match.run(options)
|
115
|
+
else
|
116
|
+
raise "I do not know what to do!"
|
117
|
+
end
|
File without changes
|
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'bundler'
|
2
|
+
begin
|
3
|
+
Bundler.setup(:default, :development)
|
4
|
+
rescue Bundler::BundlerError => e
|
5
|
+
$stderr.puts e.message
|
6
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
7
|
+
exit e.status_code
|
8
|
+
end
|
9
|
+
|
10
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__) + '/../../lib')
|
11
|
+
require 'bio-locus'
|
12
|
+
|
13
|
+
require 'rspec/expectations'
|
data/lib/bio-locus.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
# Please require your code below, respecting the naming conventions in the
|
2
|
+
# bioruby directory tree.
|
3
|
+
#
|
4
|
+
# For example, say you have a plugin named bio-plugin, the only uncommented
|
5
|
+
# line in this file would be
|
6
|
+
#
|
7
|
+
# require 'bio/bio-plugin/plugin'
|
8
|
+
#
|
9
|
+
# In this file only require other files. Avoid other source code.
|
10
|
+
|
11
|
+
require 'bio-locus/locus.rb'
|
12
|
+
|