bio-locus 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/.rspec +1 -0
- data/.travis.yml +13 -0
- data/Gemfile +14 -0
- data/LICENSE.txt +20 -0
- data/README.md +162 -0
- data/Rakefile +52 -0
- data/VERSION +1 -0
- data/bin/bio-locus +117 -0
- data/features/bio-locus.feature +9 -0
- data/features/step_definitions/bio-locus_steps.rb +0 -0
- data/features/support/env.rb +13 -0
- data/lib/bio-locus.rb +12 -0
- data/lib/bio-locus/locus.rb +49 -0
- data/lib/bio-locus/match.rb +43 -0
- data/lib/bio-locus/store.rb +29 -0
- data/spec/bio-locus_spec.rb +7 -0
- data/spec/spec_helper.rb +12 -0
- data/test/data/input/dbsnp.vcf +200 -0
- data/test/data/input/multisample.vcf +150 -0
- data/test/data/input/somaticsniper.vcf +87 -0
- data/test/data/input/test.snv1 +106 -0
- metadata +139 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 33548bcc8a3474a7e1d3ebbec6c4cfe2472d4af9
|
4
|
+
data.tar.gz: e7c7f93a5638a79f2052142d472dbad3dc57b334
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 04b8576748a2f324c7e4de0224c1b7409e28da8e2e2697e5bf0da60a55db0a8994a6185649d4c3b68930e7d617732e92edd8aed54cba4646cea14ded86f09be6
|
7
|
+
data.tar.gz: 2b17b04ef00b04a37a1d272ee4a10137fc50392a108850671f13d01a777755674413c6bf21e26843a700168d57484cb0fd150c0a054adf012d4c2577f973e6a8
|
data/.document
ADDED
data/.rspec
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--color
|
data/.travis.yml
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
language: ruby
|
2
|
+
rvm:
|
3
|
+
- 1.9.2
|
4
|
+
- 1.9.3
|
5
|
+
- jruby-19mode # JRuby in 1.9 mode
|
6
|
+
|
7
|
+
# - rbx-19mode
|
8
|
+
# - 1.8.7
|
9
|
+
# - jruby-18mode # JRuby in 1.8 mode
|
10
|
+
# - rbx-18mode
|
11
|
+
|
12
|
+
# uncomment this line if your project needs to run something other than `rake`:
|
13
|
+
# script: bundle exec rspec spec
|
data/Gemfile
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
source "http://rubygems.org"
|
2
|
+
# Add dependencies required to use your gem here.
|
3
|
+
# Example:
|
4
|
+
# gem "activesupport", ">= 2.3.5"
|
5
|
+
|
6
|
+
# Add dependencies to develop your gem here.
|
7
|
+
# Include everything needed to run rake, tests, features, etc.
|
8
|
+
group :development do
|
9
|
+
gem "cucumber"
|
10
|
+
gem "jeweler"
|
11
|
+
gem "bundler"
|
12
|
+
end
|
13
|
+
gem "localmemcache"
|
14
|
+
gem "moneta"
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2014 Pjotr Prins
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,162 @@
|
|
1
|
+
# bio-locus
|
2
|
+
|
3
|
+
[![Build Status](https://secure.travis-ci.org/pjotrp/bioruby-locus.png)](http://travis-ci.org/pjotrp/bioruby-locus)
|
4
|
+
|
5
|
+
Bio-locus is a tool for fast querying of genome locations. Many file
|
6
|
+
formats in bioinformatics contain records that start with a chromosome
|
7
|
+
name and a position for a SNP, or a start-end position for indels.
|
8
|
+
|
9
|
+
This tool essentially allows you to store this information in a Hash
|
10
|
+
or database:
|
11
|
+
|
12
|
+
```sh
|
13
|
+
bio-locus --store < one.vcf
|
14
|
+
```
|
15
|
+
|
16
|
+
which creates or adds to a cache file or database with unique entries
|
17
|
+
for all listed positions (chr+pos) AND for all listed positions with
|
18
|
+
listed alt alleles. To find positions in another dataset which match
|
19
|
+
those in the database:
|
20
|
+
|
21
|
+
```sh
|
22
|
+
bio-locus --match < two.vcf
|
23
|
+
```
|
24
|
+
|
25
|
+
The point is that this is a two-step process, first create the
|
26
|
+
indexed database, next query it. It is also possible to remove entries
|
27
|
+
with the --delete switch.
|
28
|
+
|
29
|
+
To match with alt use
|
30
|
+
|
31
|
+
```sh
|
32
|
+
bio-locus --match --include-alt < two.vcf
|
33
|
+
```
|
34
|
+
|
35
|
+
Why would you use bio-locus?
|
36
|
+
|
37
|
+
* To reduce the size of large SNP databases before storage/querying
|
38
|
+
* To gain performance
|
39
|
+
* To filter on chr+pos (default)
|
40
|
+
* To filter on chr+pos+field (where field can be a VCF ALT)
|
41
|
+
|
42
|
+
Use cases are
|
43
|
+
|
44
|
+
* To filter for annotated variants
|
45
|
+
* To remove common variants from a set
|
46
|
+
|
47
|
+
In short, a more targeted approach that allows you to work with less data. This
|
48
|
+
tool is decently fast. For example, looking for 130 positions in 20 million SNPs
|
49
|
+
in GoNL takes 0.11s to store and 1.5 minutes to match on my laptop:
|
50
|
+
|
51
|
+
```sh
|
52
|
+
cat my_130_variants.vcf | ./bin/bio-locus --store
|
53
|
+
Stored 130 positions out of 130 in locus.db
|
54
|
+
real 0m0.119s
|
55
|
+
user 0m0.108s
|
56
|
+
sys 0m0.012s
|
57
|
+
|
58
|
+
cat gonl.*.vcf |./bin/bio-locus --match
|
59
|
+
Matched 3 out of 20736323 lines in locus.db!
|
60
|
+
real 1m34.577s
|
61
|
+
user 1m33.602s
|
62
|
+
sys 0m1.868s
|
63
|
+
```
|
64
|
+
|
65
|
+
Note: for the storage the [moneta](https://github.com/minad/moneta) gem is used, currently with localmemcache.
|
66
|
+
|
67
|
+
Note: the ALT field is split into components for matching, so A,C
|
68
|
+
becomes two chr+pos records, one for A and one for C.
|
69
|
+
|
70
|
+
## Installation
|
71
|
+
|
72
|
+
```sh
|
73
|
+
gem install bio-locus
|
74
|
+
```
|
75
|
+
|
76
|
+
## Command line
|
77
|
+
|
78
|
+
In addition to --store and --match mentioned above there are a number
|
79
|
+
of options available through
|
80
|
+
|
81
|
+
```sh
|
82
|
+
bio-locus --help
|
83
|
+
```
|
84
|
+
|
85
|
+
### Deleting keys
|
86
|
+
|
87
|
+
To delete entries use
|
88
|
+
|
89
|
+
```sh
|
90
|
+
bio-locus --delete < two.vcf
|
91
|
+
```
|
92
|
+
|
93
|
+
To delete entries that also match on the alt allele use
|
94
|
+
|
95
|
+
```sh
|
96
|
+
bio-locus --delete --include-alt < two.vcf
|
97
|
+
```
|
98
|
+
|
99
|
+
You may need to run both with and without alt, depending on your needs!
|
100
|
+
|
101
|
+
### Parsing
|
102
|
+
|
103
|
+
It is possible to use any line based format. For example parsing the
|
104
|
+
alt from
|
105
|
+
|
106
|
+
```
|
107
|
+
X 107976940 G/C -1 5 5 0.75 H879D 0 IRS4 CCDS14544 Cat/Gat rs1801164 missense_variant ENST00000372129.2:c.2635C>G
|
108
|
+
```
|
109
|
+
|
110
|
+
can be done with
|
111
|
+
|
112
|
+
```sh
|
113
|
+
bio-locus --store --eval-alt 'field[2].split(/\//)[1]'
|
114
|
+
```
|
115
|
+
|
116
|
+
### COSMIC
|
117
|
+
|
118
|
+
COSMIC is pretty large, so it can be useful to cut the database down to the
|
119
|
+
variants that you have. The locus information is combined
|
120
|
+
in the second-to-last column as chr:start-end, e.g.,
|
121
|
+
19:58861911-58861911. This will work:
|
122
|
+
|
123
|
+
```sh
|
124
|
+
bio-locus -i --match --eval-chr='field[13] =~ /^([^:]+)/ ; $1' --eval-pos='field[13] =~ /:(\d+)-/ ; $1 ' < CosmicMutantExportIncFus_v68.tsv
|
125
|
+
```
|
126
|
+
|
127
|
+
Note the -i switch is needed to skip records that lack position
|
128
|
+
information.
|
129
|
+
|
130
|
+
## Usage
|
131
|
+
|
132
|
+
```ruby
|
133
|
+
require 'bio-locus'
|
134
|
+
```
|
135
|
+
|
136
|
+
The API doc is online. For more code examples see the test files in
|
137
|
+
the source tree.
|
138
|
+
|
139
|
+
## Project home page
|
140
|
+
|
141
|
+
For information on the source tree, documentation, examples, issues and
|
142
|
+
how to contribute, see
|
143
|
+
|
144
|
+
http://github.com/pjotrp/bioruby-locus
|
145
|
+
|
146
|
+
The BioRuby community is on IRC server: irc.freenode.org, channel: #bioruby.
|
147
|
+
|
148
|
+
## Cite
|
149
|
+
|
150
|
+
If you use this software, please cite one of
|
151
|
+
|
152
|
+
* [BioRuby: bioinformatics software for the Ruby programming language](http://dx.doi.org/10.1093/bioinformatics/btq475)
|
153
|
+
* [Biogem: an effective tool-based approach for scaling up open source software development in bioinformatics](http://dx.doi.org/10.1093/bioinformatics/bts080)
|
154
|
+
|
155
|
+
## Biogems.info
|
156
|
+
|
157
|
+
This Biogem is published at (http://biogems.info/index.html#bio-locus)
|
158
|
+
|
159
|
+
## Copyright
|
160
|
+
|
161
|
+
Copyright (c) 2014 Pjotr Prins. See LICENSE.txt for further details.
|
162
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'bundler'
|
5
|
+
begin
|
6
|
+
Bundler.setup(:default, :development)
|
7
|
+
rescue Bundler::BundlerError => e
|
8
|
+
$stderr.puts e.message
|
9
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
10
|
+
exit e.status_code
|
11
|
+
end
|
12
|
+
require 'rake'
|
13
|
+
|
14
|
+
require 'jeweler'
|
15
|
+
Jeweler::Tasks.new do |gem|
|
16
|
+
# gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
|
17
|
+
gem.name = "bio-locus"
|
18
|
+
gem.homepage = "http://github.com/pjotrp/bioruby-locus"
|
19
|
+
gem.license = "MIT"
|
20
|
+
gem.summary = %Q{Fast storage and comparison of chr+pos(+alt) locations}
|
21
|
+
gem.description = %Q{A tool for fast querying and filtering of genome locations in VCF and other formats}
|
22
|
+
gem.email = "pjotr.public01@thebird.nl"
|
23
|
+
gem.authors = ["Pjotr Prins"]
|
24
|
+
# dependencies defined in Gemfile
|
25
|
+
end
|
26
|
+
Jeweler::RubygemsDotOrgTasks.new
|
27
|
+
|
28
|
+
# require 'rspec/core'
|
29
|
+
# require 'rspec/core/rake_task'
|
30
|
+
# RSpec::Core::RakeTask.new(:spec) do |spec|
|
31
|
+
# spec.pattern = FileList['spec/**/*_spec.rb']
|
32
|
+
# end
|
33
|
+
|
34
|
+
# RSpec::Core::RakeTask.new(:rcov) do |spec|
|
35
|
+
# spec.pattern = 'spec/**/*_spec.rb'
|
36
|
+
# spec.rcov = true
|
37
|
+
# end
|
38
|
+
|
39
|
+
require 'cucumber/rake/task'
|
40
|
+
Cucumber::Rake::Task.new(:features)
|
41
|
+
|
42
|
+
task :default => :spec
|
43
|
+
|
44
|
+
require 'rdoc/task'
|
45
|
+
Rake::RDocTask.new do |rdoc|
|
46
|
+
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
47
|
+
|
48
|
+
rdoc.rdoc_dir = 'rdoc'
|
49
|
+
rdoc.title = "bio-locus #{version}"
|
50
|
+
rdoc.rdoc_files.include('README*')
|
51
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
52
|
+
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.0.2
|
data/bin/bio-locus
ADDED
@@ -0,0 +1,117 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
|
4
|
+
USAGE = "Use --help for info\n"
|
5
|
+
|
6
|
+
gempath = File.dirname(File.dirname(__FILE__))
|
7
|
+
$: << File.join(gempath,'lib')
|
8
|
+
|
9
|
+
VERSION_FILENAME=File.join(gempath,'VERSION')
|
10
|
+
version = File.new(VERSION_FILENAME).read.chomp
|
11
|
+
|
12
|
+
if ARGV.size == 0
|
13
|
+
print USAGE
|
14
|
+
end
|
15
|
+
|
16
|
+
require 'bio-locus'
|
17
|
+
require 'optparse'
|
18
|
+
|
19
|
+
options = {task: nil, db: 'locus.db', show_help: false, header: 1}
|
20
|
+
opts = OptionParser.new do |o|
|
21
|
+
o.banner = "Usage: #{File.basename($0)} [options] filename\ne.g. #{File.basename($0)} test.txt"
|
22
|
+
|
23
|
+
o.on("--store", 'Create or add to a cache file') do
|
24
|
+
options[:task] = :store
|
25
|
+
options[:include_alt] = true # always include alt
|
26
|
+
end
|
27
|
+
|
28
|
+
o.on("--delete", 'Remove matches from a cache file') do
|
29
|
+
options[:task] = :delete
|
30
|
+
end
|
31
|
+
|
32
|
+
o.on("--match", 'Match a cache file') do
|
33
|
+
options[:task] = :match
|
34
|
+
end
|
35
|
+
|
36
|
+
o.on("--include-alt", 'Include chr+pos+ALT VCF field to filter') do
|
37
|
+
options[:include_alt] = true
|
38
|
+
end
|
39
|
+
|
40
|
+
o.on("--exclude-alt", 'Override adding chr+pos+ALT field to store') do
|
41
|
+
options[:exclude_alt] = true
|
42
|
+
end
|
43
|
+
|
44
|
+
|
45
|
+
o.on("--db filename",String,"Use db file") do | fn |
|
46
|
+
options[:db] = fn
|
47
|
+
end
|
48
|
+
|
49
|
+
o.on("--eval-chr expr",String,"Evaluate record to retrieve chr name") do | expr |
|
50
|
+
options[:eval_chr] = expr
|
51
|
+
end
|
52
|
+
|
53
|
+
o.on("--eval-pos expr",String,"Evaluate record to retrieve position") do | expr |
|
54
|
+
options[:eval_pos] = expr
|
55
|
+
end
|
56
|
+
|
57
|
+
o.on("--eval-alt expr",String,"Evaluate record to retrieve alt list") do | expr |
|
58
|
+
options[:eval_alt] = expr
|
59
|
+
end
|
60
|
+
|
61
|
+
o.on("--header num", "Header lines (default 1)") do |l|
|
62
|
+
options[:header] = l.to_i
|
63
|
+
end
|
64
|
+
|
65
|
+
o.on("-q", "--quiet", "Run quietly") do |q|
|
66
|
+
options[:quiet] = true
|
67
|
+
end
|
68
|
+
|
69
|
+
o.on("-v", "--verbose", "Run verbosely") do |v|
|
70
|
+
options[:verbose] = true
|
71
|
+
end
|
72
|
+
|
73
|
+
o.on("-d", "--debug", "Debug mode") do |v|
|
74
|
+
options[:debug] = true
|
75
|
+
end
|
76
|
+
|
77
|
+
o.on("-i", "--ignore-errors", "Continue on error") do
|
78
|
+
options[:ignore_errors] = true
|
79
|
+
end
|
80
|
+
|
81
|
+
|
82
|
+
o.separator ""
|
83
|
+
o.on_tail('-h', '--help', 'display this help and exit') do
|
84
|
+
options[:show_help] = true
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
begin
|
89
|
+
opts.parse!(ARGV)
|
90
|
+
|
91
|
+
$stderr.print "bio-locus #{version} (biogem Ruby #{RUBY_VERSION}) by Pjotr Prins 2014\n" if !options[:quiet]
|
92
|
+
|
93
|
+
if options[:show_help]
|
94
|
+
print opts
|
95
|
+
print USAGE
|
96
|
+
exit 1
|
97
|
+
end
|
98
|
+
|
99
|
+
$stderr.print "Options: ",options,"\n" if !options[:quiet]
|
100
|
+
|
101
|
+
rescue OptionParser::InvalidOption => e
|
102
|
+
options[:invalid_argument] = e.message
|
103
|
+
end
|
104
|
+
|
105
|
+
(0..options[:header]).each { STDIN.gets }
|
106
|
+
|
107
|
+
case options[:task]
|
108
|
+
when :store then
|
109
|
+
require 'bio-locus/store'
|
110
|
+
options[:include_alt]=false if options[:exclude_alt]
|
111
|
+
BioLocus::Store.run(options)
|
112
|
+
when :match ,:delete then
|
113
|
+
require 'bio-locus/match'
|
114
|
+
BioLocus::Match.run(options)
|
115
|
+
else
|
116
|
+
raise "I do not know what to do!"
|
117
|
+
end
|
File without changes
|
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'bundler'
|
2
|
+
begin
|
3
|
+
Bundler.setup(:default, :development)
|
4
|
+
rescue Bundler::BundlerError => e
|
5
|
+
$stderr.puts e.message
|
6
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
7
|
+
exit e.status_code
|
8
|
+
end
|
9
|
+
|
10
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__) + '/../../lib')
|
11
|
+
require 'bio-locus'
|
12
|
+
|
13
|
+
require 'rspec/expectations'
|
data/lib/bio-locus.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
# Please require your code below, respecting the naming conventions in the
|
2
|
+
# bioruby directory tree.
|
3
|
+
#
|
4
|
+
# For example, say you have a plugin named bio-plugin, the only uncommented
|
5
|
+
# line in this file would be
|
6
|
+
#
|
7
|
+
# require 'bio/bio-plugin/plugin'
|
8
|
+
#
|
9
|
+
# In this file only require other files. Avoid other source code.
|
10
|
+
|
11
|
+
require 'bio-locus/locus.rb'
|
12
|
+
|