bio-kmer_counter 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.travis.yml +12 -0
- data/Gemfile +18 -0
- data/LICENSE.txt +20 -0
- data/README.md +55 -0
- data/README.rdoc +48 -0
- data/Rakefile +45 -0
- data/VERSION +1 -0
- data/bin/kmer_counter.rb +166 -0
- data/lib/bio-kmer_counter.rb +16 -0
- data/lib/bio-kmer_counter/kmer_counter.rb +45 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-kmer_counter.rb +98 -0
- metadata +218 -0
data/.document
ADDED
data/.travis.yml
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
language: ruby
|
2
|
+
rvm:
|
3
|
+
- 1.9.2
|
4
|
+
- 1.9.3
|
5
|
+
- jruby-19mode # JRuby in 1.9 mode
|
6
|
+
- rbx-19mode
|
7
|
+
# - 1.8.7
|
8
|
+
# - jruby-18mode # JRuby in 1.8 mode
|
9
|
+
# - rbx-18mode
|
10
|
+
|
11
|
+
# uncomment this line if your project needs to run something other than `rake`:
|
12
|
+
# script: bundle exec rspec spec
|
data/Gemfile
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# Fetch gems over HTTPS so that downloads cannot be tampered with in transit.
source "https://rubygems.org"

# Runtime dependencies: everything needed to use the gem and its executable.
gem 'bio', ">= 1.4.2"
gem 'progressbar', '>=0.11.0'
gem 'parallel', '>=0.5.17'
gem 'bio-logger', '>=1.0.1'

# Development dependencies: everything needed to run rake, the tests and to
# build/release the gem. Each gem is listed exactly once (rdoc was previously
# declared twice).
group :development do
  gem "shoulda", ">= 0"
  gem "rdoc", "~> 3.12"
  gem "jeweler", "~> 1.8.3"
  gem "bundler", ">= 1.0.21"
end
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2012 Ben J Woodcroft
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
# bio-kmer_counter
|
2
|
+
|
3
|
+
[![Build Status](https://secure.travis-ci.org/wwood/bioruby-kmer_counter.png)](http://travis-ci.org/wwood/bioruby-kmer_counter)
|
4
|
+
|
5
|
+
bio-kmer_counter is a simple [biogem](http://biogem.info) for fingerprinting
|
6
|
+
nucleotide sequences by counting the occurrences of particular kmers in the
|
7
|
+
sequence. The methodology is not new; for references see [Teeling et al. 2004](http://www.biomedcentral.com/1471-2105/5/163). The default parameters are derived from the methods section of [Dick et al. 2009](http://genomebiology.com/content/10/8/R85).
|
8
|
+
|
9
|
+
This methodology is quite different to that of other software that counts
|
10
|
+
kmer content with longer kmers, e.g. [khmer](https://github.com/ged-lab/khmer).
|
11
|
+
Here only small kmers are intended (e.g. 1mer or 4mer).
|
12
|
+
|
13
|
+
Note: this software is under active development!
|
14
|
+
|
15
|
+
## Installation
|
16
|
+
|
17
|
+
```sh
|
18
|
+
gem install bio-kmer_counter
|
19
|
+
```
|
20
|
+
|
21
|
+
## Usage
|
22
|
+
|
23
|
+
To analyse a fasta file (that contains one or more sequences in it) for 4-mer (tetranucleotide)
|
24
|
+
content, reporting the fingerprint of 5kb windows in each sequence separately,
|
25
|
+
plus the leftover part if it is longer than 2kb:
|
26
|
+
|
27
|
+
```sh
|
28
|
+
kmer_counter.rb <fasta_file> >tetranucleotide_content.csv
|
29
|
+
```
|
30
|
+
|
31
|
+
The fingerprints are reported as fractions between 0 and 1 (not as percentages out of 100).
|
32
|
+
From there it is up to you how to use the fingerprints, sorry.
|
33
|
+
|
34
|
+
## Project home page
|
35
|
+
|
36
|
+
For information on the source tree, documentation, examples, issues and
|
37
|
+
how to contribute, see
|
38
|
+
|
39
|
+
http://github.com/wwood/bioruby-kmer_counter
|
40
|
+
|
41
|
+
The BioRuby community is on IRC server: irc.freenode.org, channel: #bioruby.
|
42
|
+
|
43
|
+
## Cite
|
44
|
+
|
45
|
+
This software is currently unpublished, so please just cite the homepage (thanks!).
|
46
|
+
|
47
|
+
Please also cite the tools upon which it is based, one of:
|
48
|
+
|
49
|
+
* [BioRuby: bioinformatics software for the Ruby programming language](http://dx.doi.org/10.1093/bioinformatics/btq475)
|
50
|
+
* [Biogem: an effective tool-based approach for scaling up open source software development in bioinformatics](http://dx.doi.org/10.1093/bioinformatics/bts080)
|
51
|
+
|
52
|
+
## Copyright
|
53
|
+
|
54
|
+
Copyright (c) 2012 Ben J Woodcroft. See LICENSE.txt for further details.
|
55
|
+
|
data/README.rdoc
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
= bio-kmer_counter
|
2
|
+
|
3
|
+
{<img
|
4
|
+
src="https://secure.travis-ci.org/wwood/bioruby-kmer_counter.png"
|
5
|
+
/>}[http://travis-ci.org/#!/wwood/bioruby-kmer_counter]
|
6
|
+
|
7
|
+
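A biogem for counting small kmers for fingerprinting nucleotide sequences. See README.md for a fuller description and usage examples.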
|
8
|
+
|
9
|
+
Note: this software is under active development!
|
10
|
+
|
11
|
+
== Installation
|
12
|
+
|
13
|
+
gem install bio-kmer_counter
|
14
|
+
|
15
|
+
== Usage
|
16
|
+
|
17
|
+
== Developers
|
18
|
+
|
19
|
+
To use the library
|
20
|
+
|
21
|
+
require 'bio-kmer_counter'
|
22
|
+
|
23
|
+
The API doc is online. For more code examples see also the test files in
|
24
|
+
the source tree.
|
25
|
+
|
26
|
+
== Project home page
|
27
|
+
|
28
|
+
For information on the source tree, documentation, issues and how to contribute, see
|
29
|
+
|
30
|
+
http://github.com/wwood/bioruby-kmer_counter
|
31
|
+
|
32
|
+
The BioRuby community is on IRC server: irc.freenode.org, channel: #bioruby.
|
33
|
+
|
34
|
+
== Cite
|
35
|
+
|
36
|
+
If you use this software, please cite one of
|
37
|
+
|
38
|
+
* {BioRuby: bioinformatics software for the Ruby programming language}[http://dx.doi.org/10.1093/bioinformatics/btq475]
|
39
|
+
* {Biogem: an effective tool-based approach for scaling up open source software development in bioinformatics}[http://dx.doi.org/10.1093/bioinformatics/bts080]
|
40
|
+
|
41
|
+
== Biogems.info
|
42
|
+
|
43
|
+
This Biogem is published at http://biogems.info/index.html#bio-kmer_counter
|
44
|
+
|
45
|
+
== Copyright
|
46
|
+
|
47
|
+
Copyright (c) 2012 Ben J Woodcroft. See LICENSE.txt for further details.
|
48
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'bundler'
|
5
|
+
begin
|
6
|
+
Bundler.setup(:default, :development)
|
7
|
+
rescue Bundler::BundlerError => e
|
8
|
+
$stderr.puts e.message
|
9
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
10
|
+
exit e.status_code
|
11
|
+
end
|
12
|
+
require 'rake'
|
13
|
+
|
14
|
+
require 'jeweler'
|
15
|
+
Jeweler::Tasks.new do |gem|
|
16
|
+
# gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
|
17
|
+
gem.name = "bio-kmer_counter"
|
18
|
+
gem.homepage = "http://github.com/wwood/bioruby-kmer_counter"
|
19
|
+
gem.license = "MIT"
|
20
|
+
gem.summary = %Q{A biogem for counting small kmers for fingerprinting nucleotide sequences}
|
21
|
+
gem.description = %Q{A biogem for counting small kmers for fingerprinting nucleotide sequences. See README for details.}
|
22
|
+
gem.email = "gmail.com after donttrustben"
|
23
|
+
gem.authors = ["Ben J Woodcroft"]
|
24
|
+
# dependencies defined in Gemfile
|
25
|
+
end
|
26
|
+
Jeweler::RubygemsDotOrgTasks.new
|
27
|
+
|
28
|
+
require 'rake/testtask'
|
29
|
+
Rake::TestTask.new(:test) do |test|
|
30
|
+
test.libs << 'lib' << 'test'
|
31
|
+
test.pattern = 'test/**/test_*.rb'
|
32
|
+
test.verbose = true
|
33
|
+
end
|
34
|
+
|
35
|
+
task :default => :test
|
36
|
+
|
37
|
+
require 'rdoc/task'
|
38
|
+
Rake::RDocTask.new do |rdoc|
|
39
|
+
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
40
|
+
|
41
|
+
rdoc.rdoc_dir = 'rdoc'
|
42
|
+
rdoc.title = "bio-kmer_counter #{version}"
|
43
|
+
rdoc.rdoc_files.include('README*')
|
44
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
45
|
+
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.0.2
|
data/bin/kmer_counter.rb
ADDED
@@ -0,0 +1,166 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'optparse'
|
4
|
+
require 'progressbar'
|
5
|
+
require 'parallel'
|
6
|
+
|
7
|
+
gempath = File.dirname(File.dirname(__FILE__))
|
8
|
+
$: << File.join(gempath,'lib')
|
9
|
+
require 'bio-kmer_counter'
|
10
|
+
|
11
|
+
# Parse cmd line options
|
12
|
+
USAGE = "Usage: kmer_counter.rb [-w window_size] [-W window_offset] [-m minimum_window_size] [--window-length] [-k kmer_length] [--contig-name] <fasta_filename>"
|
13
|
+
options = {
|
14
|
+
:window_size => 5000,
|
15
|
+
:minimum_window_size => 2000,
|
16
|
+
:window_offset => 5000,
|
17
|
+
:kmer => 4,
|
18
|
+
:contig_name => false,
|
19
|
+
:sequence_length => false,
|
20
|
+
:logger => 'stderr',
|
21
|
+
:threads => 1,
|
22
|
+
:processes => 1,
|
23
|
+
:progressbar => true,
|
24
|
+
}
|
25
|
+
|
26
|
+
# Build the command line parser. Each handler validates its argument and
# records the result in +options+. Invalid sizes raise ArgumentError (a
# StandardError) rather than the root Exception class, so the failure can be
# handled by an ordinary `rescue`.
parser = OptionParser.new do |opts|
  opts.banner = USAGE

  # Setting the window size also resets the offset, giving non-overlapping
  # windows unless -W is specified afterwards.
  opts.on("-w", "--window-size SIZE", "Length of the window to be used [default #{options[:window_size]}]") do |v|
    window = v.to_i
    unless window > 0
      raise ArgumentError, "Unexpected window size specified: #{v} - it must be greater than 0 residues long!"
    end
    options[:window_size] = window
    options[:window_offset] = window
  end

  # A non-positive offset falls back to the current window size. (The key was
  # previously misspelt, which silently set the offset to nil.)
  opts.on("-W", "--window-offset SIZE", "Length of the offset between windows [default #{options[:window_offset]}]") do |v|
    offset = v.to_i
    offset = options[:window_size] unless offset > 0
    options[:window_offset] = offset
  end

  opts.on("-m", "--minimum-window-size SIZE", "Length of the minimum window to be used [default #{options[:minimum_window_size]}]") do |v|
    window = v.to_i
    unless window > 0
      raise ArgumentError, "Unexpected minimum window size specified: #{v} - it must be greater than 0 residues long!"
    end
    options[:minimum_window_size] = window
  end

  opts.on("-k", "--kmer-length SIZE", "Length of the kmer to be used [default #{options[:kmer]}]") do |v|
    kmer_length = v.to_i
    unless kmer_length > 0
      raise ArgumentError, "Unexpected kmer length specified: #{v} - it must be greater than 0 residues long!"
    end
    options[:kmer] = kmer_length
  end

  opts.on("-n", "--contig-name", "Output the contig name, on top of the default contig chunk name [default: #{options[:contig_name]}]") do |v|
    options[:contig_name] = true
  end

  opts.on("-l", "--window-length", "print the length of the window in the output [default #{options[:sequence_length]}]") do |v|
    options[:sequence_length] = true
  end

  opts.on("-p", "--processes NUM_PROCESSES", "Use this many processes. Currently setting multiple processes means there is no progress bar [default #{options[:processes]}]") do |v|
    options[:processes] = v.to_i
    if options[:processes] < 1
      raise "Unexpected number of processes specified (after converting to integer) - '#{options[:processes]}'"
    end
  end

  opts.on("-t", "--threads NUM_THREADS", "Use this many threads. This currently only makes sense if you are running on JRuby, since the standard MRI ruby 1.9 can't use multiple cores. Maybe use --processes instead? [default #{options[:threads]}]") do |v|
    options[:threads] = v.to_i
    if options[:threads] < 1
      raise "Unexpected number of threads specified (after converting to integer) - '#{options[:threads]}'"
    end
  end

  # logger options
  opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") do |q|
    Bio::Log::CLI.trace('error')
  end
  opts.on("--logger filename",String,"Log to file [default #{options[:logger]}]") do | name |
    options[:logger] = name
  end
  opts.on("--trace options",String,"Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG") do | s |
    Bio::Log::CLI.trace(s)
  end
end
parser.parse!

# Exactly one positional argument (the FASTA file) must remain after option
# parsing; otherwise print the full help text and exit with a failure status.
if ARGV.length != 1
  $stderr.puts parser.help
  exit 1
end
|
100
|
+
# multiple processes doesn't work well with ProgressBar
|
101
|
+
options[:progressbar] = false if options[:processes] != 1
|
102
|
+
|
103
|
+
LOG_NAME = 'bio-kmer_counter'
|
104
|
+
Bio::Log::CLI.logger(options[:logger]) #bio-logger defaults to STDERR not STDOUT, I disagree
|
105
|
+
log = Bio::Log::LoggerPlus.new(LOG_NAME)
|
106
|
+
Bio::Log::CLI.configure(LOG_NAME)
|
107
|
+
|
108
|
+
|
109
|
+
# Print headers
# The kmer columns are the lowest-lexigraphical-form kmers, sorted so that they
# line up with the data rows (which are written in sorted kmer order). Without
# the sort the header follows hash insertion order, which only coincides with
# the sorted order for 1-mers.
header_kmers = Bio::Sequence::Kmer.merge_down_to_lowest_lexigraphical_form(
  Bio::Sequence::Kmer.empty_full_kmer_hash(options[:kmer])
).keys.sort
print "ID\t"
print header_kmers.join("\t")
print "\tWindowLength" if options[:sequence_length]
print "\tcontig" if options[:contig_name]
puts
|
115
|
+
|
116
|
+
# Template hash with every possible kmer mapped to 0; duplicated per window.
orig = Bio::Sequence::Kmer.empty_full_kmer_hash(options[:kmer])

# Count the kmers of a single window and write one tab-separated row to stdout.
#
# window        - the sequence window to fingerprint (must respond to
#                 window_search and length)
# kmer          - kmer length; accepted for interface compatibility, the value
#                 actually used is options[:kmer]
# sequence_name - identifier written in the first column
# contig_name   - name of the parent contig, written only with --contig-name
#
# Kmers containing anything other than upper case A, T, G or C are ignored.
# Windows in which no kmer could be counted are skipped with a warning.
process_window = lambda do |window,kmer,sequence_name,contig_name|
  counts = orig.dup
  num_kmers_counted = 0

  window.window_search(options[:kmer],1) do |tetranucleotide|
    str = tetranucleotide.to_s
    # String#count avoids allocating a new string for every kmer, unlike
    # stripping the valid characters with gsub and comparing against ''.
    next unless str.count('ATGC') == str.length
    num_kmers_counted += 1
    counts[str] += 1
  end

  # Merge everything into lowest lexigraphical form
  new_counts = Bio::Sequence::Kmer.merge_down_to_lowest_lexigraphical_form counts

  if num_kmers_counted == 0
    log.warn "Skipping window #{sequence_name} because few/none ATGC's were detected (was it all N's?)"
  else
    # Assemble the whole row first and write it in one call.
    fields = ["#{sequence_name}"]
    new_counts.keys.sort.each do |tetramer|
      fields << new_counts[tetramer].to_f/num_kmers_counted
    end
    fields << window.length if options[:sequence_length]
    fields << contig_name if options[:contig_name]
    print fields.join("\t"), "\n"
  end
end
|
144
|
+
|
145
|
+
fasta_filename = ARGV[0]

# The progress bar needs the number of sequences up front. Count the FASTA
# header lines in Ruby instead of shelling out to grep: interpolating the
# filename into a shell command breaks on names containing quotes (and allows
# command injection), and requires an external grep to be installed.
progress = nil
if options[:progressbar]
  num_sequences = File.foreach(fasta_filename).count { |line| line.start_with?('>') }
  progress = ProgressBar.new('kmer_counter', num_sequences)
end

ff = Bio::FlatFile.open(fasta_filename)
# NOTE(review): confirm that the parallel gem honours the :threads key; its
# option for thread workers may be :in_threads instead.
Parallel.each(ff, :in_processes => options[:processes], :threads => options[:threads]) do |sequence|
  seq = sequence.seq
  definition = sequence.definition

  # Full-size windows, numbered from 0 in the order they occur
  window_counter = 0
  seq.window_search(options[:window_size],options[:window_offset]) do |window|
    process_window.call(window, options[:kmer], "#{definition}_#{window_counter}", definition)
    window_counter += 1
  end

  # The trailing part of the sequence that does not fill a whole window is
  # reported as well, provided it is at least the minimum window size.
  leftover_length = seq.length % options[:window_size]
  if leftover_length >= options[:minimum_window_size]
    process_window.call(
      seq[seq.length-leftover_length..seq.length],
      options[:kmer], "#{definition}_leftover_#{window_counter}", definition)
  end
  progress.inc if options[:progressbar]
end
progress.finish if options[:progressbar]
|
165
|
+
|
166
|
+
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# Please require your code below, respecting the naming conventions in the
|
2
|
+
# bioruby directory tree.
|
3
|
+
#
|
4
|
+
# For example, say you have a plugin named bio-plugin, the only uncommented
|
5
|
+
# line in this file would be
|
6
|
+
#
|
7
|
+
# require 'bio/bio-plugin/plugin'
|
8
|
+
#
|
9
|
+
# In this file only require other files. Avoid other source code.
|
10
|
+
|
11
|
+
require 'bio-logger'
|
12
|
+
Bio::Log::LoggerPlus.new('bio-kmer_counter')
|
13
|
+
|
14
|
+
require 'bio'
|
15
|
+
require 'bio-kmer_counter/kmer_counter.rb'
|
16
|
+
|
@@ -0,0 +1,45 @@
|
|
1
|
+
# Extensions to Bio::Sequence for building kmer count hashes and merging kmers with their reverse complements
|
2
|
+
module Bio
  class Sequence
    class NA
      # Return the current object or its reverse complement, whichever
      # has the sequence that comes first in lexigraphical (alphabetical)
      # order. When both are equal the reverse complement is returned.
      def lowest_lexigraphical_form
        revcomp = reverse_complement
        to_s < revcomp.to_s ? self : revcomp
      end
    end

    # Helpers for building and collapsing kmer count hashes.
    class Kmer
      # Build a hash containing every possible kmer of length +k+ (upper case,
      # over the alphabet A, T, C, G) mapped to a count of 0.
      #
      # k - length of the kmers (default 4). Values below 2 yield the 1-mers.
      #
      # Returns a new Hash on every call, so callers may modify it freely.
      # Keys are inserted in A, T, C, G enumeration order.
      def self.empty_full_kmer_hash(k=4)
        bases = %w(A T C G)
        kmers = bases
        # Extend every kmer built so far by each possible base, k-1 times
        (k-1).times do
          kmers = kmers.flat_map { |prefix| bases.map { |base| "#{prefix}#{base}" } }
        end

        counts = {}
        kmers.each { |kmer| counts[kmer] = 0 }
        counts
      end

      # Collapse a kmer => count hash so that each kmer and its reverse
      # complement share a single entry, keyed by the upper-cased lowest
      # lexigraphical form of the pair. Counts of merged kmers are summed.
      #
      # hash - Hash of kmer String => Integer count
      #
      # Returns a new Hash; the input hash is not modified.
      def self.merge_down_to_lowest_lexigraphical_form(hash)
        merged = {}
        hash.each do |kmer, count|
          key = Bio::Sequence::NA.new(kmer).lowest_lexigraphical_form.to_s.upcase
          merged[key] = merged.fetch(key, 0) + count
        end
        merged
      end
    end
  end
end
|
data/test/helper.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'bundler'
|
3
|
+
begin
|
4
|
+
Bundler.setup(:default, :development)
|
5
|
+
rescue Bundler::BundlerError => e
|
6
|
+
$stderr.puts e.message
|
7
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
8
|
+
exit e.status_code
|
9
|
+
end
|
10
|
+
require 'test/unit'
|
11
|
+
require 'shoulda'
|
12
|
+
|
13
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
14
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
15
|
+
require 'bio-kmer_counter'
|
16
|
+
|
17
|
+
class Test::Unit::TestCase
|
18
|
+
end
|
@@ -0,0 +1,98 @@
|
|
1
|
+
require 'helper'
|
2
|
+
require 'tempfile'
|
3
|
+
require 'open3'
|
4
|
+
|
5
|
+
class TestBioKmerCounter < Test::Unit::TestCase
|
6
|
+
should 'test_lowest_lexigraphical_form' do
|
7
|
+
assert_equal Bio::Sequence::NA.new('AA'), Bio::Sequence::NA.new('AA').lowest_lexigraphical_form
|
8
|
+
assert_equal Bio::Sequence::NA.new('AA'), Bio::Sequence::NA.new('TT').lowest_lexigraphical_form
|
9
|
+
assert_equal Bio::Sequence::NA.new('AG'), Bio::Sequence::NA.new('CT').lowest_lexigraphical_form
|
10
|
+
end
|
11
|
+
|
12
|
+
should 'test_empty_full_kmer_hash' do
|
13
|
+
answer = {}; %w(A C G T).each{|k| answer[k] = 0}
|
14
|
+
assert_equal answer, Bio::Sequence::Kmer.empty_full_kmer_hash(1)
|
15
|
+
end
|
16
|
+
|
17
|
+
should 'test merge down' do
|
18
|
+
answer = {}; %w(A C).each{|k| answer[k] = 0}
|
19
|
+
full = Bio::Sequence::Kmer.empty_full_kmer_hash(1)
|
20
|
+
assert_equal answer, Bio::Sequence::Kmer.merge_down_to_lowest_lexigraphical_form(full)
|
21
|
+
full = Bio::Sequence::Kmer.empty_full_kmer_hash #defaults to kmer hash length 4
|
22
|
+
assert_equal 136, Bio::Sequence::Kmer.merge_down_to_lowest_lexigraphical_form(full).length
|
23
|
+
end
|
24
|
+
|
25
|
+
def script_path
|
26
|
+
File.join(File.dirname(__FILE__),'..','bin','kmer_counter.rb')
|
27
|
+
end
|
28
|
+
|
29
|
+
should 'test_running1' do
|
30
|
+
Tempfile.open('one') do |tempfile|
|
31
|
+
tempfile.puts '>one'
|
32
|
+
tempfile.puts 'ACAGT'
|
33
|
+
tempfile.close
|
34
|
+
|
35
|
+
assert_equal "ID\tA\tC\none_0\t0.6\t0.4\n", `#{script_path} -w 5 -k 1 #{tempfile.path}`
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
should 'not whack out when there isnt any sequence to count' do
|
40
|
+
Tempfile.open('one') do |tempfile|
|
41
|
+
tempfile.puts '>one'
|
42
|
+
tempfile.puts 'NNNNN'
|
43
|
+
tempfile.close
|
44
|
+
|
45
|
+
assert_equal "ID\tA\tC\n", `#{script_path} -w 5 -k 1 #{tempfile.path}`
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
should 'give correct increments in window numbering' do
|
50
|
+
Tempfile.open('one') do |tempfile|
|
51
|
+
tempfile.puts '>one'
|
52
|
+
tempfile.puts 'ATGCATGCAT' #10 letters long
|
53
|
+
tempfile.close
|
54
|
+
|
55
|
+
expected = "ID\tA\tC\n"+
|
56
|
+
"one_0\t0.5\t0.5\n"+
|
57
|
+
"one_1\t0.5\t0.5\n"+
|
58
|
+
"one_leftover_2\t1.0\t0.0\n"
|
59
|
+
|
60
|
+
assert_equal expected, `#{script_path} -w 4 -k 1 -m 2 #{tempfile.path}`
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
should "not give a progressbar with multiple processes" do
|
65
|
+
Tempfile.open('one') do |tempfile|
|
66
|
+
tempfile.puts '>one'
|
67
|
+
tempfile.puts 'ATGCATGCAT' #10 letters long
|
68
|
+
tempfile.close
|
69
|
+
|
70
|
+
expected = ["ID\tA\tC\n",
|
71
|
+
"one_0\t0.5\t0.5\n",
|
72
|
+
"one_1\t0.5\t0.5\n",
|
73
|
+
"one_leftover_2\t1.0\t0.0\n"]
|
74
|
+
|
75
|
+
# execute command and capture both stdout, and stderr
|
76
|
+
# no extra processes
|
77
|
+
command = "#{script_path} -w 4 -k 1 -m 2 #{tempfile.path}"
|
78
|
+
Open3.popen3(command) do |stdin, stdout, stderr|
|
79
|
+
result = stdout.readlines # convert to string?
|
80
|
+
error = stderr.readlines
|
81
|
+
|
82
|
+
assert_not_equal [], error
|
83
|
+
assert_equal expected, result
|
84
|
+
end
|
85
|
+
|
86
|
+
# execute command and capture both stdout, and stderr
|
87
|
+
# extra processes
|
88
|
+
command = "#{script_path} -p 2 -w 4 -k 1 -m 2 #{tempfile.path}"
|
89
|
+
Open3.popen3(command) do |stdin, stdout, stderr|
|
90
|
+
result = stdout.readlines # convert to string?
|
91
|
+
error = stderr.readlines
|
92
|
+
|
93
|
+
assert_equal [], error
|
94
|
+
assert_equal expected, result
|
95
|
+
end
|
96
|
+
end
|
97
|
+
end
|
98
|
+
end
|
metadata
ADDED
@@ -0,0 +1,218 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: bio-kmer_counter
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 27
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 0
|
9
|
+
- 2
|
10
|
+
version: 0.0.2
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Ben J Woodcroft
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2012-06-08 00:00:00 Z
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
22
|
+
none: false
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
hash: 3
|
27
|
+
segments:
|
28
|
+
- 1
|
29
|
+
- 4
|
30
|
+
- 2
|
31
|
+
version: 1.4.2
|
32
|
+
version_requirements: *id001
|
33
|
+
name: bio
|
34
|
+
prerelease: false
|
35
|
+
type: :runtime
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
38
|
+
none: false
|
39
|
+
requirements:
|
40
|
+
- - ">="
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
hash: 51
|
43
|
+
segments:
|
44
|
+
- 0
|
45
|
+
- 11
|
46
|
+
- 0
|
47
|
+
version: 0.11.0
|
48
|
+
version_requirements: *id002
|
49
|
+
name: progressbar
|
50
|
+
prerelease: false
|
51
|
+
type: :runtime
|
52
|
+
- !ruby/object:Gem::Dependency
|
53
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
54
|
+
none: false
|
55
|
+
requirements:
|
56
|
+
- - ">="
|
57
|
+
- !ruby/object:Gem::Version
|
58
|
+
hash: 41
|
59
|
+
segments:
|
60
|
+
- 0
|
61
|
+
- 5
|
62
|
+
- 17
|
63
|
+
version: 0.5.17
|
64
|
+
version_requirements: *id003
|
65
|
+
name: parallel
|
66
|
+
prerelease: false
|
67
|
+
type: :runtime
|
68
|
+
- !ruby/object:Gem::Dependency
|
69
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
70
|
+
none: false
|
71
|
+
requirements:
|
72
|
+
- - ">="
|
73
|
+
- !ruby/object:Gem::Version
|
74
|
+
hash: 21
|
75
|
+
segments:
|
76
|
+
- 1
|
77
|
+
- 0
|
78
|
+
- 1
|
79
|
+
version: 1.0.1
|
80
|
+
version_requirements: *id004
|
81
|
+
name: bio-logger
|
82
|
+
prerelease: false
|
83
|
+
type: :runtime
|
84
|
+
- !ruby/object:Gem::Dependency
|
85
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
86
|
+
none: false
|
87
|
+
requirements:
|
88
|
+
- - ">="
|
89
|
+
- !ruby/object:Gem::Version
|
90
|
+
hash: 3
|
91
|
+
segments:
|
92
|
+
- 0
|
93
|
+
version: "0"
|
94
|
+
version_requirements: *id005
|
95
|
+
name: shoulda
|
96
|
+
prerelease: false
|
97
|
+
type: :development
|
98
|
+
- !ruby/object:Gem::Dependency
|
99
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
100
|
+
none: false
|
101
|
+
requirements:
|
102
|
+
- - ~>
|
103
|
+
- !ruby/object:Gem::Version
|
104
|
+
hash: 31
|
105
|
+
segments:
|
106
|
+
- 3
|
107
|
+
- 12
|
108
|
+
version: "3.12"
|
109
|
+
version_requirements: *id006
|
110
|
+
name: rdoc
|
111
|
+
prerelease: false
|
112
|
+
type: :development
|
113
|
+
- !ruby/object:Gem::Dependency
|
114
|
+
requirement: &id007 !ruby/object:Gem::Requirement
|
115
|
+
none: false
|
116
|
+
requirements:
|
117
|
+
- - ~>
|
118
|
+
- !ruby/object:Gem::Version
|
119
|
+
hash: 49
|
120
|
+
segments:
|
121
|
+
- 1
|
122
|
+
- 8
|
123
|
+
- 3
|
124
|
+
version: 1.8.3
|
125
|
+
version_requirements: *id007
|
126
|
+
name: jeweler
|
127
|
+
prerelease: false
|
128
|
+
type: :development
|
129
|
+
- !ruby/object:Gem::Dependency
|
130
|
+
requirement: &id008 !ruby/object:Gem::Requirement
|
131
|
+
none: false
|
132
|
+
requirements:
|
133
|
+
- - ">="
|
134
|
+
- !ruby/object:Gem::Version
|
135
|
+
hash: 61
|
136
|
+
segments:
|
137
|
+
- 1
|
138
|
+
- 0
|
139
|
+
- 21
|
140
|
+
version: 1.0.21
|
141
|
+
version_requirements: *id008
|
142
|
+
name: bundler
|
143
|
+
prerelease: false
|
144
|
+
type: :development
|
145
|
+
- !ruby/object:Gem::Dependency
|
146
|
+
requirement: &id009 !ruby/object:Gem::Requirement
|
147
|
+
none: false
|
148
|
+
requirements:
|
149
|
+
- - ~>
|
150
|
+
- !ruby/object:Gem::Version
|
151
|
+
hash: 31
|
152
|
+
segments:
|
153
|
+
- 3
|
154
|
+
- 12
|
155
|
+
version: "3.12"
|
156
|
+
version_requirements: *id009
|
157
|
+
name: rdoc
|
158
|
+
prerelease: false
|
159
|
+
type: :development
|
160
|
+
description: A biogem for counting small kmers for fingerprinting nucleotide sequences. See README for details.
|
161
|
+
email: gmail.com after donttrustben
|
162
|
+
executables:
|
163
|
+
- kmer_counter.rb
|
164
|
+
extensions: []
|
165
|
+
|
166
|
+
extra_rdoc_files:
|
167
|
+
- LICENSE.txt
|
168
|
+
- README.md
|
169
|
+
- README.rdoc
|
170
|
+
files:
|
171
|
+
- .document
|
172
|
+
- .travis.yml
|
173
|
+
- Gemfile
|
174
|
+
- LICENSE.txt
|
175
|
+
- README.md
|
176
|
+
- README.rdoc
|
177
|
+
- Rakefile
|
178
|
+
- VERSION
|
179
|
+
- bin/kmer_counter.rb
|
180
|
+
- lib/bio-kmer_counter.rb
|
181
|
+
- lib/bio-kmer_counter/kmer_counter.rb
|
182
|
+
- test/helper.rb
|
183
|
+
- test/test_bio-kmer_counter.rb
|
184
|
+
homepage: http://github.com/wwood/bioruby-kmer_counter
|
185
|
+
licenses:
|
186
|
+
- MIT
|
187
|
+
post_install_message:
|
188
|
+
rdoc_options: []
|
189
|
+
|
190
|
+
require_paths:
|
191
|
+
- lib
|
192
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
193
|
+
none: false
|
194
|
+
requirements:
|
195
|
+
- - ">="
|
196
|
+
- !ruby/object:Gem::Version
|
197
|
+
hash: 3
|
198
|
+
segments:
|
199
|
+
- 0
|
200
|
+
version: "0"
|
201
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
202
|
+
none: false
|
203
|
+
requirements:
|
204
|
+
- - ">="
|
205
|
+
- !ruby/object:Gem::Version
|
206
|
+
hash: 3
|
207
|
+
segments:
|
208
|
+
- 0
|
209
|
+
version: "0"
|
210
|
+
requirements: []
|
211
|
+
|
212
|
+
rubyforge_project:
|
213
|
+
rubygems_version: 1.8.24
|
214
|
+
signing_key:
|
215
|
+
specification_version: 3
|
216
|
+
summary: A biogem for counting small kmers for fingerprinting nucleotide sequences
|
217
|
+
test_files: []
|
218
|
+
|