bio-kmer_counter 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.travis.yml +12 -0
- data/Gemfile +18 -0
- data/LICENSE.txt +20 -0
- data/README.md +55 -0
- data/README.rdoc +48 -0
- data/Rakefile +45 -0
- data/VERSION +1 -0
- data/bin/kmer_counter.rb +166 -0
- data/lib/bio-kmer_counter.rb +16 -0
- data/lib/bio-kmer_counter/kmer_counter.rb +45 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-kmer_counter.rb +98 -0
- metadata +218 -0
data/.document
ADDED
data/.travis.yml
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
language: ruby
|
2
|
+
rvm:
|
3
|
+
- 1.9.2
|
4
|
+
- 1.9.3
|
5
|
+
- jruby-19mode # JRuby in 1.9 mode
|
6
|
+
- rbx-19mode
|
7
|
+
# - 1.8.7
|
8
|
+
# - jruby-18mode # JRuby in 1.8 mode
|
9
|
+
# - rbx-18mode
|
10
|
+
|
11
|
+
# uncomment this line if your project needs to run something other than `rake`:
|
12
|
+
# script: bundle exec rspec spec
|
data/Gemfile
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# Gemfile for bio-kmer_counter.
# Gems are fetched over HTTPS so that downloads cannot be tampered with in transit.
source "https://rubygems.org"

# Runtime dependencies required to use the gem.
gem 'bio', ">= 1.4.2"
gem 'progressbar', '>=0.11.0'
gem 'parallel', '>=0.5.17'
gem 'bio-logger', '>=1.0.1'

# Development dependencies: everything needed to run rake, tests, features, etc.
# Each gem is declared exactly once.
group :development do
  gem "shoulda", ">= 0"
  gem "rdoc", "~> 3.12"
  gem "jeweler", "~> 1.8.3"
  gem "bundler", ">= 1.0.21"
end
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2012 Ben J Woodcroft
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
# bio-kmer_counter
|
2
|
+
|
3
|
+
[![Build Status](https://secure.travis-ci.org/wwood/bioruby-kmer_counter.png)](http://travis-ci.org/wwood/bioruby-kmer_counter)
|
4
|
+
|
5
|
+
bio-kmer_counter is a simple [biogem](http://biogem.info) for fingerprinting
|
6
|
+
nucleotide sequences by counting the occurrences of particular kmers in the
|
7
|
+
sequence. The methodology is not new; for references see [Teeling et al. 2004](http://www.biomedcentral.com/1471-2105/5/163). The default parameters are derived from the methods section of [Dick et al. 2009](http://genomebiology.com/content/10/8/R85).
|
8
|
+
|
9
|
+
This methodology is quite different to that of other software that counts
|
10
|
+
kmer content with longer kmers, e.g. [khmer](https://github.com/ged-lab/khmer).
|
11
|
+
Here only small kmers are intended (e.g. 1mer or 4mer).
|
12
|
+
|
13
|
+
Note: this software is under active development!
|
14
|
+
|
15
|
+
## Installation
|
16
|
+
|
17
|
+
```sh
|
18
|
+
gem install bio-kmer_counter
|
19
|
+
```
|
20
|
+
|
21
|
+
## Usage
|
22
|
+
|
23
|
+
To analyse a fasta file (that contains one or more sequences in it) for 4-mer (tetranucleotide)
|
24
|
+
content, reporting the fingerprint of 5kb windows in each sequence separately,
|
25
|
+
plus the leftover part if it is longer than 2kb:
|
26
|
+
|
27
|
+
```sh
|
28
|
+
kmer_counter.rb <fasta_file> >tetranucleotide_content.csv
|
29
|
+
```
|
30
|
+
|
31
|
+
The fingerprints are reported as fractions between 0 and 1 (not as percentages out of 100).
|
32
|
+
From there it is up to you how to use the fingerprints, sorry.
|
33
|
+
|
34
|
+
## Project home page
|
35
|
+
|
36
|
+
For information on the source tree, documentation, examples, issues and
|
37
|
+
how to contribute, see
|
38
|
+
|
39
|
+
http://github.com/wwood/bioruby-kmer_counter
|
40
|
+
|
41
|
+
The BioRuby community is on IRC server: irc.freenode.org, channel: #bioruby.
|
42
|
+
|
43
|
+
## Cite
|
44
|
+
|
45
|
+
This software is currently unpublished, so please just cite the homepage (thanks!).
|
46
|
+
|
47
|
+
Please also cite the tools upon which it is based, one of:
|
48
|
+
|
49
|
+
* [BioRuby: bioinformatics software for the Ruby programming language](http://dx.doi.org/10.1093/bioinformatics/btq475)
|
50
|
+
* [Biogem: an effective tool-based approach for scaling up open source software development in bioinformatics](http://dx.doi.org/10.1093/bioinformatics/bts080)
|
51
|
+
|
52
|
+
## Copyright
|
53
|
+
|
54
|
+
Copyright (c) 2012 Ben J Woodcroft. See LICENSE.txt for further details.
|
55
|
+
|
data/README.rdoc
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
= bio-kmer_counter
|
2
|
+
|
3
|
+
{<img
|
4
|
+
src="https://secure.travis-ci.org/wwood/bioruby-kmer_counter.png"
|
5
|
+
/>}[http://travis-ci.org/#!/wwood/bioruby-kmer_counter]
|
6
|
+
|
7
|
+
Full description goes here
|
8
|
+
|
9
|
+
Note: this software is under active development!
|
10
|
+
|
11
|
+
== Installation
|
12
|
+
|
13
|
+
gem install bio-kmer_counter
|
14
|
+
|
15
|
+
== Usage
|
16
|
+
|
17
|
+
== Developers
|
18
|
+
|
19
|
+
To use the library
|
20
|
+
|
21
|
+
require 'bio-kmer_counter'
|
22
|
+
|
23
|
+
The API doc is online. For more code examples see also the test files in
|
24
|
+
the source tree.
|
25
|
+
|
26
|
+
== Project home page
|
27
|
+
|
28
|
+
For information on the source tree, documentation, issues and how to contribute, see
|
29
|
+
|
30
|
+
http://github.com/wwood/bioruby-kmer_counter
|
31
|
+
|
32
|
+
The BioRuby community is on IRC server: irc.freenode.org, channel: #bioruby.
|
33
|
+
|
34
|
+
== Cite
|
35
|
+
|
36
|
+
If you use this software, please cite one of
|
37
|
+
|
38
|
+
* [BioRuby: bioinformatics software for the Ruby programming language](http://dx.doi.org/10.1093/bioinformatics/btq475)
|
39
|
+
* [Biogem: an effective tool-based approach for scaling up open source software development in bioinformatics](http://dx.doi.org/10.1093/bioinformatics/bts080)
|
40
|
+
|
41
|
+
== Biogems.info
|
42
|
+
|
43
|
+
This Biogem is published at http://biogems.info/index.html#bio-kmer_counter
|
44
|
+
|
45
|
+
== Copyright
|
46
|
+
|
47
|
+
Copyright (c) 2012 Ben J Woodcroft. See LICENSE.txt for further details.
|
48
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
# encoding: utf-8

# Rakefile for bio-kmer_counter: gem packaging (jeweler), tests and RDoc.

require 'rubygems'
require 'bundler'
begin
  # Make the runtime and development gems from the Gemfile available.
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "bio-kmer_counter"
  gem.homepage = "http://github.com/wwood/bioruby-kmer_counter"
  gem.license = "MIT"
  gem.summary = %Q{A biogem for counting small kmers for fingerprinting nucleotide sequences}
  gem.description = %Q{A biogem for counting small kmers for fingerprinting nucleotide sequences. See README for details.}
  gem.email = "gmail.com after donttrustben"
  gem.authors = ["Ben J Woodcroft"]
  # dependencies defined in Gemfile
end
Jeweler::RubygemsDotOrgTasks.new

require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# Running plain `rake` runs the test suite.
task :default => :test

require 'rdoc/task'
# RDoc::Task is the class provided by rdoc/task; Rake::RDocTask is only a
# backward-compatibility alias for it.
RDoc::Task.new do |rdoc|
  # Strip the trailing newline of the VERSION file so it does not end up in the title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "bio-kmer_counter #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.0.2
|
data/bin/kmer_counter.rb
ADDED
@@ -0,0 +1,166 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'optparse'
|
4
|
+
require 'progressbar'
|
5
|
+
require 'parallel'
|
6
|
+
|
7
|
+
gempath = File.dirname(File.dirname(__FILE__))
|
8
|
+
$: << File.join(gempath,'lib')
|
9
|
+
require 'bio-kmer_counter'
|
10
|
+
|
11
|
+
# Parse cmd line options
USAGE = "Usage: kmer_counter.rb [-w window_size] [-W window_offset] [-m minimum_window_size] [--window-length] [-k kmer_length] [--contig-name] <fasta_filename>"

# Default settings; each may be overridden by the matching command line option.
options = {
  :window_size => 5000,
  :minimum_window_size => 2000,
  :window_offset => 5000,
  :kmer => 4,
  :contig_name => false,
  :sequence_length => false,
  :logger => 'stderr',
  :threads => 1,
  :processes => 1,
  :progressbar => true,
}

# Convert a command line value to an Integer and insist that it is positive.
# +description+ names the offending option in the error message.
# Raises ArgumentError when the converted value is not greater than 0.
positive_integer = lambda do |value, description|
  number = value.to_i
  unless number > 0
    raise ArgumentError, "Unexpected #{description} specified: #{value} - it must be greater than 0 residues long!"
  end
  number
end

option_parser = OptionParser.new do |opts|
  opts.banner = USAGE

  opts.on("-w", "--window-size SIZE", "Length of the window to be used [default #{options[:window_size]}]") do |v|
    window = positive_integer.call(v, 'window size')
    options[:window_size] = window
    # Setting the window size also resets the offset, so windows do not overlap
    # unless -W is given after -w.
    options[:window_offset] = window
  end

  opts.on("-W", "--window-offset SIZE", "Length of the offset between windows [default #{options[:window_offset]}]") do |v|
    offset = v.to_i
    # A non-positive offset falls back to the current window size.
    offset = options[:window_size] unless offset > 0
    options[:window_offset] = offset
  end

  opts.on("-m", "--minimum-window-size SIZE", "Length of the minimum window to be used [default #{options[:minimum_window_size]}]") do |v|
    options[:minimum_window_size] = positive_integer.call(v, 'minimum window size')
  end

  opts.on("-k", "--kmer-length SIZE", "Length of the kmer to be used [default #{options[:kmer]}]") do |v|
    options[:kmer] = positive_integer.call(v, 'kmer length')
  end

  opts.on("-n", "--contig-name", "Output the contig name, on top of the default contig chunk name [default: #{options[:contig_name]}]") do |v|
    options[:contig_name] = true
  end

  opts.on("-l", "--window-length", "print the length of the window in the output [default #{options[:sequence_length]}]") do |v|
    options[:sequence_length] = true
  end

  opts.on("-p", "--processes NUM_PROCESSES", "Use this many processes. Currently setting multiple processes means there is no progress bar [default #{options[:processes]}]") do |v|
    options[:processes] = v.to_i
    if options[:processes] < 1
      raise "Unexpected number of processes specified (after converting to integer) - '#{options[:processes]}'"
    end
  end

  opts.on("-t", "--threads NUM_THREADS", "Use this many threads. This currently only makes sense if you are running on JRuby, since the standard MRI ruby 1.9 can't use multiple cores. Maybe use --processes instead? [default #{options[:threads]}]") do |v|
    options[:threads] = v.to_i
    if options[:threads] < 1
      raise "Unexpected number of threads specified (after converting to integer) - '#{options[:threads]}'"
    end
  end


  # logger options
  opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") do |q|
    Bio::Log::CLI.trace('error')
  end
  opts.on("--logger filename",String,"Log to file [default #{options[:logger]}]") do | name |
    options[:logger] = name
  end
  opts.on("--trace options",String,"Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG") do | s |
    Bio::Log::CLI.trace(s)
  end
end
option_parser.parse!

# Exactly one positional argument (the fasta file) must remain after option
# parsing; otherwise print the usage banner and option summary, then fail.
if ARGV.length != 1
  $stderr.puts option_parser
  exit 1
end
|
100
|
+
# multiple processes doesn't work well with ProgressBar
options[:progressbar] = false if options[:processes] != 1

LOG_NAME = 'bio-kmer_counter'
Bio::Log::CLI.logger(options[:logger]) #bio-logger defaults to STDERR not STDOUT, I disagree
log = Bio::Log::LoggerPlus.new(LOG_NAME)
Bio::Log::CLI.configure(LOG_NAME)


# Print headers
# The kmer columns are sorted because each data row is written in sorted key
# order; using the hash's insertion order here would mislabel the columns for
# any kmer length greater than 1.
header_kmers = Bio::Sequence::Kmer.merge_down_to_lowest_lexigraphical_form(
  Bio::Sequence::Kmer.empty_full_kmer_hash(options[:kmer])
).keys.sort
print "ID\t"
print header_kmers.join("\t")
print "\tWindowLength" if options[:sequence_length]
print "\tcontig" if options[:contig_name]
puts
|
115
|
+
|
116
|
+
# Template hash holding every possible kmer with a zero count; it is copied for each window.
orig = Bio::Sequence::Kmer.empty_full_kmer_hash(options[:kmer])

# Count the kmers in one window and print its fingerprint as one
# tab-separated row on stdout.
#
# window        - the sequence window to fingerprint
# kmer          - length of the kmers to count
# sequence_name - identifier printed in the first column
# contig_name   - printed in the last column when options[:contig_name] is set
#
# Kmers containing anything other than upper case A, T, G or C are ignored.
# A window with no countable kmers is skipped with a logged warning.
process_window = lambda do |window,kmer,sequence_name,contig_name|
  counts = orig.dup
  num_kmers_counted = 0

  window.window_search(kmer,1) do |candidate|
    str = candidate.to_s
    next if str =~ /[^ATGC]/
    num_kmers_counted += 1
    counts[str] += 1
  end

  # Merge everything into lowest lexigraphical form
  new_counts = Bio::Sequence::Kmer.merge_down_to_lowest_lexigraphical_form counts

  if num_kmers_counted == 0
    log.warn "Skipping window #{sequence_name} because few/none ATGC's were detected (was it all N's?)"
  else
    # Assemble the whole row first and write it with a single call, so that
    # rows written by concurrent workers are less likely to be interleaved.
    fields = ["#{sequence_name}"]
    new_counts.keys.sort.each do |canonical_kmer|
      fields << "#{new_counts[canonical_kmer].to_f/num_kmers_counted}"
    end
    fields << "#{window.length}" if options[:sequence_length]
    fields << "#{contig_name}" if options[:contig_name]
    print fields.join("\t") + "\n"
  end
end
|
144
|
+
|
145
|
+
fasta_filename = ARGV[0]

# The progress bar needs the number of records up front. Count the fasta
# header lines in Ruby rather than shelling out to grep: this avoids
# interpolating the filename into a shell command, and only counts lines that
# start with '>'.
progress = nil
if options[:progressbar]
  num_sequences = File.foreach(fasta_filename).count { |line| line.start_with?('>') }
  progress = ProgressBar.new('kmer_counter', num_sequences)
end

ff = Bio::FlatFile.open(fasta_filename)
# NOTE(review): the Parallel gem documents :in_processes and :in_threads;
# confirm whether the :threads key passed here has any effect.
Parallel.each(ff, :in_processes => options[:processes], :threads => options[:threads]) do |sequence|
  # Fetch the sequence and its name once per record.
  seq = sequence.seq
  name = sequence.definition

  # Fingerprint each full-sized window, numbering the windows from 0.
  window_counter = 0
  seq.window_search(options[:window_size],options[:window_offset]) do |window|
    process_window.call(window, options[:kmer], "#{name}_#{window_counter}",name)
    window_counter += 1
  end

  # Fingerprint the tail of the sequence as well, if it is long enough.
  # NOTE(review): the leftover length is derived from the window size only, so
  # it presumably assumes the window offset equals the window size - confirm
  # the intended behaviour when -W differs from -w.
  leftover_length = seq.length % options[:window_size]
  if leftover_length >= options[:minimum_window_size]
    process_window.call(
      seq[seq.length-leftover_length..seq.length],
      options[:kmer], "#{name}_leftover_#{window_counter}",name)
  end
  progress.inc if options[:progressbar]
end
progress.finish if options[:progressbar]
|
165
|
+
|
166
|
+
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# Please require your code below, respecting the naming conventions in the
|
2
|
+
# bioruby directory tree.
|
3
|
+
#
|
4
|
+
# For example, say you have a plugin named bio-plugin, the only uncommented
|
5
|
+
# line in this file would be
|
6
|
+
#
|
7
|
+
# require 'bio/bio-plugin/plugin'
|
8
|
+
#
|
9
|
+
# In this file only require other files. Avoid other source code.
|
10
|
+
|
11
|
+
require 'bio-logger'
|
12
|
+
Bio::Log::LoggerPlus.new('bio-kmer_counter')
|
13
|
+
|
14
|
+
require 'bio'
|
15
|
+
require 'bio-kmer_counter/kmer_counter.rb'
|
16
|
+
|
@@ -0,0 +1,45 @@
|
|
1
|
+
# Additions to the Bio::Sequence namespace used for kmer fingerprinting:
# a canonical-strand helper on Bio::Sequence::NA, and Bio::Sequence::Kmer for
# building and merging kmer count hashes.
module Bio
  class Sequence
    class NA
      # Return the current object or its reverse complement, whichever
      # has the sequence that comes first in lexicographical (alphabetical)
      # order
      def lowest_lexigraphical_form
        rev = reverse_complement
        to_s < rev.to_s ? self : rev
      end
    end

    class Kmer
      # Build a hash with one entry per possible kmer of length +k+ over the
      # alphabet A, T, C, G, each mapped to a count of 0.
      #
      # Keys are upper case strings, generated in A, T, C, G order at each
      # position. A +k+ below 1 yields the same hash as k=1.
      #
      # A fresh hash is built on every call, so callers may modify the result.
      def self.empty_full_kmer_hash(k=4)
        bases = %w(A T C G)

        # Start from the 1-mers and append one base per extra position.
        kmers = bases
        (k-1).times do
          kmers = kmers.collect{|prefix| bases.collect{|base| "#{prefix}#{base}"}}.flatten
        end

        counts = {}
        kmers.each do |kmer|
          counts[kmer] = 0
        end
        counts
      end

      # Collapse a kmer => count hash so that each kmer and its reverse
      # complement share a single entry, summing their counts.
      #
      # The key of each merged entry is the upper-cased result of
      # Bio::Sequence::NA#lowest_lexigraphical_form for that kmer.
      #
      # Returns a new hash; +hash+ itself is not modified.
      def self.merge_down_to_lowest_lexigraphical_form(hash)
        merged = {}
        hash.each do |kmer, count|
          canonical = Bio::Sequence::NA.new(kmer).lowest_lexigraphical_form.to_s.upcase
          merged[canonical] = merged.fetch(canonical, 0) + count
        end
        merged
      end
    end
  end
end
|
data/test/helper.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'bundler'
|
3
|
+
begin
|
4
|
+
Bundler.setup(:default, :development)
|
5
|
+
rescue Bundler::BundlerError => e
|
6
|
+
$stderr.puts e.message
|
7
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
8
|
+
exit e.status_code
|
9
|
+
end
|
10
|
+
require 'test/unit'
|
11
|
+
require 'shoulda'
|
12
|
+
|
13
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
14
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
15
|
+
require 'bio-kmer_counter'
|
16
|
+
|
17
|
+
class Test::Unit::TestCase
|
18
|
+
end
|
@@ -0,0 +1,98 @@
|
|
1
|
+
require 'helper'
|
2
|
+
require 'tempfile'
|
3
|
+
require 'open3'
|
4
|
+
|
5
|
+
# Tests for the kmer helpers added to Bio::Sequence and for the
# bin/kmer_counter.rb command line script.
class TestBioKmerCounter < Test::Unit::TestCase
  # Path of the command line script under test, relative to this file.
  def script_path
    File.join(File.dirname(__FILE__),'..','bin','kmer_counter.rb')
  end

  # Write a fasta file holding one record named 'one' with the given
  # residues, close it, and yield its path to the block.
  def with_fasta(residues)
    Tempfile.open('one') do |fasta|
      fasta.puts '>one'
      fasta.puts residues
      fasta.close

      yield fasta.path
    end
  end

  # Run +command+ and return [stdout lines, stderr lines].
  def capture_lines(command)
    Open3.popen3(command) do |stdin, stdout, stderr|
      out_lines = stdout.readlines
      err_lines = stderr.readlines
      [out_lines, err_lines]
    end
  end

  should 'test_lowest_lexigraphical_form' do
    na = Bio::Sequence::NA
    assert_equal na.new('AA'), na.new('AA').lowest_lexigraphical_form
    assert_equal na.new('AA'), na.new('TT').lowest_lexigraphical_form
    assert_equal na.new('AG'), na.new('CT').lowest_lexigraphical_form
  end

  should 'test_empty_full_kmer_hash' do
    answer = {}
    %w(A C G T).each{|base| answer[base] = 0}
    assert_equal answer, Bio::Sequence::Kmer.empty_full_kmer_hash(1)
  end

  should 'test merge down' do
    answer = {}
    %w(A C).each{|base| answer[base] = 0}
    one_mers = Bio::Sequence::Kmer.empty_full_kmer_hash(1)
    assert_equal answer, Bio::Sequence::Kmer.merge_down_to_lowest_lexigraphical_form(one_mers)

    four_mers = Bio::Sequence::Kmer.empty_full_kmer_hash #defaults to kmer hash length 4
    assert_equal 136, Bio::Sequence::Kmer.merge_down_to_lowest_lexigraphical_form(four_mers).length
  end

  should 'test_running1' do
    with_fasta('ACAGT') do |path|
      assert_equal "ID\tA\tC\none_0\t0.6\t0.4\n", `#{script_path} -w 5 -k 1 #{path}`
    end
  end

  should 'not whack out when there isnt any sequence to count' do
    with_fasta('NNNNN') do |path|
      assert_equal "ID\tA\tC\n", `#{script_path} -w 5 -k 1 #{path}`
    end
  end

  should 'give correct increments in window numbering' do
    with_fasta('ATGCATGCAT') do |path| #10 letters long
      expected = [
        "ID\tA\tC\n",
        "one_0\t0.5\t0.5\n",
        "one_1\t0.5\t0.5\n",
        "one_leftover_2\t1.0\t0.0\n",
      ].join

      assert_equal expected, `#{script_path} -w 4 -k 1 -m 2 #{path}`
    end
  end

  should "not give a progressbar with multiple processes" do
    with_fasta('ATGCATGCAT') do |path| #10 letters long
      expected = ["ID\tA\tC\n",
        "one_0\t0.5\t0.5\n",
        "one_1\t0.5\t0.5\n",
        "one_leftover_2\t1.0\t0.0\n"]

      # no extra processes: something is expected on stderr
      result, error = capture_lines("#{script_path} -w 4 -k 1 -m 2 #{path}")
      assert_not_equal [], error
      assert_equal expected, result

      # extra processes: stderr must stay empty
      result, error = capture_lines("#{script_path} -p 2 -w 4 -k 1 -m 2 #{path}")
      assert_equal [], error
      assert_equal expected, result
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,218 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: bio-kmer_counter
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 27
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 0
|
9
|
+
- 2
|
10
|
+
version: 0.0.2
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Ben J Woodcroft
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2012-06-08 00:00:00 Z
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
22
|
+
none: false
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
hash: 3
|
27
|
+
segments:
|
28
|
+
- 1
|
29
|
+
- 4
|
30
|
+
- 2
|
31
|
+
version: 1.4.2
|
32
|
+
version_requirements: *id001
|
33
|
+
name: bio
|
34
|
+
prerelease: false
|
35
|
+
type: :runtime
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
38
|
+
none: false
|
39
|
+
requirements:
|
40
|
+
- - ">="
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
hash: 51
|
43
|
+
segments:
|
44
|
+
- 0
|
45
|
+
- 11
|
46
|
+
- 0
|
47
|
+
version: 0.11.0
|
48
|
+
version_requirements: *id002
|
49
|
+
name: progressbar
|
50
|
+
prerelease: false
|
51
|
+
type: :runtime
|
52
|
+
- !ruby/object:Gem::Dependency
|
53
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
54
|
+
none: false
|
55
|
+
requirements:
|
56
|
+
- - ">="
|
57
|
+
- !ruby/object:Gem::Version
|
58
|
+
hash: 41
|
59
|
+
segments:
|
60
|
+
- 0
|
61
|
+
- 5
|
62
|
+
- 17
|
63
|
+
version: 0.5.17
|
64
|
+
version_requirements: *id003
|
65
|
+
name: parallel
|
66
|
+
prerelease: false
|
67
|
+
type: :runtime
|
68
|
+
- !ruby/object:Gem::Dependency
|
69
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
70
|
+
none: false
|
71
|
+
requirements:
|
72
|
+
- - ">="
|
73
|
+
- !ruby/object:Gem::Version
|
74
|
+
hash: 21
|
75
|
+
segments:
|
76
|
+
- 1
|
77
|
+
- 0
|
78
|
+
- 1
|
79
|
+
version: 1.0.1
|
80
|
+
version_requirements: *id004
|
81
|
+
name: bio-logger
|
82
|
+
prerelease: false
|
83
|
+
type: :runtime
|
84
|
+
- !ruby/object:Gem::Dependency
|
85
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
86
|
+
none: false
|
87
|
+
requirements:
|
88
|
+
- - ">="
|
89
|
+
- !ruby/object:Gem::Version
|
90
|
+
hash: 3
|
91
|
+
segments:
|
92
|
+
- 0
|
93
|
+
version: "0"
|
94
|
+
version_requirements: *id005
|
95
|
+
name: shoulda
|
96
|
+
prerelease: false
|
97
|
+
type: :development
|
98
|
+
- !ruby/object:Gem::Dependency
|
99
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
100
|
+
none: false
|
101
|
+
requirements:
|
102
|
+
- - ~>
|
103
|
+
- !ruby/object:Gem::Version
|
104
|
+
hash: 31
|
105
|
+
segments:
|
106
|
+
- 3
|
107
|
+
- 12
|
108
|
+
version: "3.12"
|
109
|
+
version_requirements: *id006
|
110
|
+
name: rdoc
|
111
|
+
prerelease: false
|
112
|
+
type: :development
|
113
|
+
- !ruby/object:Gem::Dependency
|
114
|
+
requirement: &id007 !ruby/object:Gem::Requirement
|
115
|
+
none: false
|
116
|
+
requirements:
|
117
|
+
- - ~>
|
118
|
+
- !ruby/object:Gem::Version
|
119
|
+
hash: 49
|
120
|
+
segments:
|
121
|
+
- 1
|
122
|
+
- 8
|
123
|
+
- 3
|
124
|
+
version: 1.8.3
|
125
|
+
version_requirements: *id007
|
126
|
+
name: jeweler
|
127
|
+
prerelease: false
|
128
|
+
type: :development
|
129
|
+
- !ruby/object:Gem::Dependency
|
130
|
+
requirement: &id008 !ruby/object:Gem::Requirement
|
131
|
+
none: false
|
132
|
+
requirements:
|
133
|
+
- - ">="
|
134
|
+
- !ruby/object:Gem::Version
|
135
|
+
hash: 61
|
136
|
+
segments:
|
137
|
+
- 1
|
138
|
+
- 0
|
139
|
+
- 21
|
140
|
+
version: 1.0.21
|
141
|
+
version_requirements: *id008
|
142
|
+
name: bundler
|
143
|
+
prerelease: false
|
144
|
+
type: :development
|
145
|
+
- !ruby/object:Gem::Dependency
|
146
|
+
requirement: &id009 !ruby/object:Gem::Requirement
|
147
|
+
none: false
|
148
|
+
requirements:
|
149
|
+
- - ~>
|
150
|
+
- !ruby/object:Gem::Version
|
151
|
+
hash: 31
|
152
|
+
segments:
|
153
|
+
- 3
|
154
|
+
- 12
|
155
|
+
version: "3.12"
|
156
|
+
version_requirements: *id009
|
157
|
+
name: rdoc
|
158
|
+
prerelease: false
|
159
|
+
type: :development
|
160
|
+
description: A biogem for counting small kmers for fingerprinting nucleotide sequences. See README for details.
|
161
|
+
email: gmail.com after donttrustben
|
162
|
+
executables:
|
163
|
+
- kmer_counter.rb
|
164
|
+
extensions: []
|
165
|
+
|
166
|
+
extra_rdoc_files:
|
167
|
+
- LICENSE.txt
|
168
|
+
- README.md
|
169
|
+
- README.rdoc
|
170
|
+
files:
|
171
|
+
- .document
|
172
|
+
- .travis.yml
|
173
|
+
- Gemfile
|
174
|
+
- LICENSE.txt
|
175
|
+
- README.md
|
176
|
+
- README.rdoc
|
177
|
+
- Rakefile
|
178
|
+
- VERSION
|
179
|
+
- bin/kmer_counter.rb
|
180
|
+
- lib/bio-kmer_counter.rb
|
181
|
+
- lib/bio-kmer_counter/kmer_counter.rb
|
182
|
+
- test/helper.rb
|
183
|
+
- test/test_bio-kmer_counter.rb
|
184
|
+
homepage: http://github.com/wwood/bioruby-kmer_counter
|
185
|
+
licenses:
|
186
|
+
- MIT
|
187
|
+
post_install_message:
|
188
|
+
rdoc_options: []
|
189
|
+
|
190
|
+
require_paths:
|
191
|
+
- lib
|
192
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
193
|
+
none: false
|
194
|
+
requirements:
|
195
|
+
- - ">="
|
196
|
+
- !ruby/object:Gem::Version
|
197
|
+
hash: 3
|
198
|
+
segments:
|
199
|
+
- 0
|
200
|
+
version: "0"
|
201
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
202
|
+
none: false
|
203
|
+
requirements:
|
204
|
+
- - ">="
|
205
|
+
- !ruby/object:Gem::Version
|
206
|
+
hash: 3
|
207
|
+
segments:
|
208
|
+
- 0
|
209
|
+
version: "0"
|
210
|
+
requirements: []
|
211
|
+
|
212
|
+
rubyforge_project:
|
213
|
+
rubygems_version: 1.8.24
|
214
|
+
signing_key:
|
215
|
+
specification_version: 3
|
216
|
+
summary: A biogem for counting small kmers for fingerprinting nucleotide sequences
|
217
|
+
test_files: []
|
218
|
+
|