bio-kmer_counter 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +3 -3
- data/README.md +28 -9
- data/VERSION +1 -1
- data/bin/kmer_counter.rb +7 -7
- data/test/data/100random.fa +2 -0
- data/test/helper.rb +2 -0
- data/test/test_bio-kmer_counter.rb +36 -9
- metadata +25 -24
data/Gemfile
CHANGED
@@ -11,8 +11,8 @@ gem 'bio-logger', '>=1.0.1'
|
|
11
11
|
# Include everything needed to run rake, tests, features, etc.
|
12
12
|
group :development do
|
13
13
|
gem "shoulda", ">= 0"
|
14
|
-
gem "rdoc", "
|
15
|
-
gem "jeweler",
|
14
|
+
gem "rdoc", ">= 3.12"
|
15
|
+
gem "jeweler",">= 1.8.3"
|
16
16
|
gem "bundler", ">= 1.0.21"
|
17
|
-
gem "rdoc", "
|
17
|
+
gem "rdoc", ">= 3.12"
|
18
18
|
end
|
data/README.md
CHANGED
@@ -4,28 +4,47 @@
|
|
4
4
|
|
5
5
|
bio-kmer_counter is a simple [biogem](http://biogem.info) for fingerprinting
|
6
6
|
nucleotide sequences by counting the occurrences of particular kmers in the
|
7
|
-
sequence. The methodology is not new, for
|
7
|
+
sequence. The methodology is not new, for a reference see
|
8
|
+
[Teeling et al. 2004](http://www.biomedcentral.com/1471-2105/5/163).
|
9
|
+
The default parameters are derived from the well explained methods section of
|
10
|
+
[Dick et al. 2009](http://genomebiology.com/content/10/8/R85).
|
8
11
|
|
9
12
|
This methodology is quite different to that of other software that counts
|
10
13
|
kmer content with longer kmers, e.g. [khmer](https://github.com/ged-lab/khmer).
|
11
|
-
Here only small kmers are intended (e.g.
|
12
|
-
|
13
|
-
Note: this software is under active development!
|
14
|
+
Here only small kmers are intended (e.g. 1-mer or 4-mer).
|
14
15
|
|
15
16
|
## Installation
|
16
17
|
|
18
|
+
After installing [Ruby](http://www.ruby-lang.org) itself, install the bio-kmer_counter rubygem:
|
19
|
+
|
17
20
|
```sh
|
18
21
|
gem install bio-kmer_counter
|
19
22
|
```
|
20
23
|
|
21
|
-
|
24
|
+
bio-kmer_counter is only tested on Linux, but probably works on OSX too. It might even work on Windows if
|
25
|
+
the progress bar is turned off. Maybe.
|
22
26
|
|
23
|
-
|
24
|
-
content, reporting the fingerprint of 5kb windows in each sequence separately,
|
25
|
-
plus the leftover part if it is longer than 2kb:
|
27
|
+
## Usage
|
26
28
|
|
29
|
+
The default parameters analyse a fasta file that contains one or more sequences in it for 4-mer (tetranucleotide)
|
30
|
+
content. By default, any sequence
|
31
|
+
in the fasta file 2kb or longer is included at least once. Sequences are split up
|
32
|
+
into 5kb windows if they are that long, and each window is reported separately.
|
33
|
+
If the leftover bit at the end after any 5kb windows is 2kb or longer then this is also included.
|
34
|
+
|
35
|
+
By default, each 4 base window in the input sequence is included exactly once in the output file.
|
36
|
+
To account for the fact
|
37
|
+
that the directions of sequences with respect to each other are presumed to be unknown (as is the
|
38
|
+
case for de-novo genome assembly), either the forward or reverse complement is included. Which one
|
39
|
+
(forward or reverse) depends on which one comes first alphabetically. So for instance if the window is ```CTTT```, then ```AAAG```
|
40
|
+
is used. Accounting for palindromic sequences like ```ATAT```, there are 136 of these lowest lexicographical 4-mers.
|
41
|
+
So there are 136 columns in the output, plus one for the name of the window. Counting only one of the two orientations is
|
42
|
+
actually slightly different from the method outlined in Dick et al. 2009, but we
|
43
|
+
don't expect the results to differ.
|
44
|
+
|
45
|
+
Example usage, if you wish to fingerprint a fasta file ```my_nucleotide_sequences.fasta```:
|
27
46
|
```sh
|
28
|
-
kmer_counter.rb
|
47
|
+
kmer_counter.rb my_nucleotide_sequences.fasta >tetranucleotide_content.csv
|
29
48
|
```
|
30
49
|
|
31
50
|
The fingerprints are reported as fractions between 0 and 1, not as percentages.
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.
|
1
|
+
0.1.1
|
data/bin/kmer_counter.rb
CHANGED
@@ -64,8 +64,8 @@ o = OptionParser.new do |opts|
|
|
64
64
|
opts.on("-l", "--window-length", "print the length of the window in the output [default #{options[:sequence_length]}]") do |v|
|
65
65
|
options[:sequence_length] = true
|
66
66
|
end
|
67
|
-
|
68
|
-
|
67
|
+
|
68
|
+
|
69
69
|
# logger options
|
70
70
|
opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") do |q|
|
71
71
|
Bio::Log::CLI.trace('error')
|
@@ -90,7 +90,7 @@ Bio::Log::CLI.configure(LOG_NAME)
|
|
90
90
|
|
91
91
|
# Print headers
|
92
92
|
print "ID\t"
|
93
|
-
print Bio::Sequence::Kmer.merge_down_to_lowest_lexigraphical_form(Bio::Sequence::Kmer.empty_full_kmer_hash(options[:kmer])).keys.join("\t")
|
93
|
+
print Bio::Sequence::Kmer.merge_down_to_lowest_lexigraphical_form(Bio::Sequence::Kmer.empty_full_kmer_hash(options[:kmer])).keys.sort.join("\t")
|
94
94
|
print "\tWindowLength" if options[:sequence_length]
|
95
95
|
print "\tcontig" if options[:contig_name]
|
96
96
|
puts
|
@@ -99,7 +99,7 @@ orig = Bio::Sequence::Kmer.empty_full_kmer_hash(options[:kmer])
|
|
99
99
|
process_window = lambda do |window,kmer,sequence_name,contig_name|
|
100
100
|
counts = orig.dup
|
101
101
|
num_kmers_counted = 0
|
102
|
-
|
102
|
+
|
103
103
|
window.window_search(options[:kmer],1) do |tetranucleotide|
|
104
104
|
str = tetranucleotide.to_s
|
105
105
|
next unless str.gsub(/[ATGC]+/,'') == ''
|
@@ -107,10 +107,10 @@ process_window = lambda do |window,kmer,sequence_name,contig_name|
|
|
107
107
|
counts[str]+=1
|
108
108
|
#counts[Bio::Sequence::NA.new(tetranucleotide).lowest_lexigraphical_form.to_s.upcase] += 1
|
109
109
|
end
|
110
|
-
|
110
|
+
|
111
111
|
# Merge everything into lowest lexigraphical form
|
112
112
|
new_counts = Bio::Sequence::Kmer.merge_down_to_lowest_lexigraphical_form counts
|
113
|
-
|
113
|
+
|
114
114
|
if num_kmers_counted == 0
|
115
115
|
log.warn "Skipping window #{sequence_name} because few/none ATGC's were detected (was it all N's?)"
|
116
116
|
else
|
@@ -127,7 +127,7 @@ end
|
|
127
127
|
fasta_filename = ARGV[0]
|
128
128
|
progress = nil
|
129
129
|
progress = ProgressBar.new('kmer_counter', `grep -c '>' '#{fasta_filename}'`.to_i) if options[:progressbar]
|
130
|
-
ff = Bio::FlatFile.open(fasta_filename)
|
130
|
+
ff = Bio::FlatFile.open(fasta_filename)
|
131
131
|
|
132
132
|
ff.each do |sequence|
|
133
133
|
window_counter = 0
|
data/test/test_bio-kmer_counter.rb
CHANGED
@@ -8,12 +8,12 @@ class TestBioKmerCounter < Test::Unit::TestCase
|
|
8
8
|
assert_equal Bio::Sequence::NA.new('AA'), Bio::Sequence::NA.new('TT').lowest_lexigraphical_form
|
9
9
|
assert_equal Bio::Sequence::NA.new('AG'), Bio::Sequence::NA.new('CT').lowest_lexigraphical_form
|
10
10
|
end
|
11
|
-
|
11
|
+
|
12
12
|
should 'test_empty_full_kmer_hash' do
|
13
13
|
answer = {}; %w(A C G T).each{|k| answer[k] = 0}
|
14
14
|
assert_equal answer, Bio::Sequence::Kmer.empty_full_kmer_hash(1)
|
15
15
|
end
|
16
|
-
|
16
|
+
|
17
17
|
should 'test merge down' do
|
18
18
|
answer = {}; %w(A C).each{|k| answer[k] = 0}
|
19
19
|
full = Bio::Sequence::Kmer.empty_full_kmer_hash(1)
|
@@ -21,11 +21,11 @@ class TestBioKmerCounter < Test::Unit::TestCase
|
|
21
21
|
full = Bio::Sequence::Kmer.empty_full_kmer_hash #defaults to kmer hash length 4
|
22
22
|
assert_equal 136, Bio::Sequence::Kmer.merge_down_to_lowest_lexigraphical_form(full).length
|
23
23
|
end
|
24
|
-
|
24
|
+
|
25
25
|
def script_path
|
26
26
|
File.join(File.dirname(__FILE__),'..','bin','kmer_counter.rb')
|
27
27
|
end
|
28
|
-
|
28
|
+
|
29
29
|
should 'test_running1' do
|
30
30
|
Tempfile.open('one') do |tempfile|
|
31
31
|
tempfile.puts '>one'
|
@@ -35,7 +35,7 @@ class TestBioKmerCounter < Test::Unit::TestCase
|
|
35
35
|
assert_equal "ID\tA\tC\none_0\t0.6\t0.4\n", `#{script_path} -w 5 -k 1 #{tempfile.path}`
|
36
36
|
end
|
37
37
|
end
|
38
|
-
|
38
|
+
|
39
39
|
should 'not whack out when there isnt any sequence to count' do
|
40
40
|
Tempfile.open('one') do |tempfile|
|
41
41
|
tempfile.puts '>one'
|
@@ -45,13 +45,13 @@ class TestBioKmerCounter < Test::Unit::TestCase
|
|
45
45
|
assert_equal "ID\tA\tC\n", `#{script_path} -w 5 -k 1 #{tempfile.path}`
|
46
46
|
end
|
47
47
|
end
|
48
|
-
|
48
|
+
|
49
49
|
should 'give correct increments in window numbering' do
|
50
50
|
Tempfile.open('one') do |tempfile|
|
51
51
|
tempfile.puts '>one'
|
52
52
|
tempfile.puts 'ATGCATGCAT' #10 letters long
|
53
53
|
tempfile.close
|
54
|
-
|
54
|
+
|
55
55
|
expected = "ID\tA\tC\n"+
|
56
56
|
"one_0\t0.5\t0.5\n"+
|
57
57
|
"one_1\t0.5\t0.5\n"+
|
@@ -60,14 +60,14 @@ class TestBioKmerCounter < Test::Unit::TestCase
|
|
60
60
|
assert_equal expected, `#{script_path} -w 4 -k 1 -m 2 #{tempfile.path}`
|
61
61
|
end
|
62
62
|
end
|
63
|
-
|
63
|
+
|
64
64
|
should "print help when no arguments are given" do
|
65
65
|
command = "#{script_path}"
|
66
66
|
Open3.popen3(command) do |stdin, stdout, stderr|
|
67
67
|
assert stderr.readlines[0].match(/^Usage: kmer_counter/)
|
68
68
|
end
|
69
69
|
end
|
70
|
-
|
70
|
+
|
71
71
|
should 'work with lowercase' do
|
72
72
|
Tempfile.open('one') do |tempfile|
|
73
73
|
tempfile.puts '>one'
|
@@ -77,4 +77,31 @@ class TestBioKmerCounter < Test::Unit::TestCase
|
|
77
77
|
assert_equal "ID\tA\tC\none_0\t0.6\t0.4\n", `#{script_path} -w 5 -k 1 #{tempfile.path}`
|
78
78
|
end
|
79
79
|
end
|
80
|
+
|
81
|
+
should 'by default count contigs greater than 2kb but less than 5kb' do
|
82
|
+
Tempfile.open('one') do |tempfile|
|
83
|
+
tempfile.puts '>one'
|
84
|
+
tempfile.puts 'A'*2500
|
85
|
+
tempfile.close
|
86
|
+
|
87
|
+
assert_equal "ID\tA\tC\none_leftover_0\t1.0\t0.0\n", `#{script_path} -k 1 #{tempfile.path}`
|
88
|
+
end
|
89
|
+
end
|
90
|
+
|
91
|
+
should 'by default count contigs greater than 2kb but less than 5kb' do
|
92
|
+
Tempfile.open('one') do |tempfile|
|
93
|
+
tempfile.puts '>one'
|
94
|
+
tempfile.puts 'A'*7500
|
95
|
+
tempfile.close
|
96
|
+
|
97
|
+
assert_equal "ID\tA\tC\none_0\t1.0\t0.0\none_leftover_1\t1.0\t0.0\n", `#{script_path} -k 1 #{tempfile.path}`
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
should 'work simulated example with kmer length = 2' do
|
102
|
+
expected = %w(ID AA AC AG AT CA CC CG GA GC TA).join("\t")+"\n"+
|
103
|
+
%w(random_leftover_0 0.1111111111111111 0.13131313131313133 0.1414141414141414 0.0707070707070707 0.1717171717171717 0.1111111111111111 0.020202020202020204 0.1414141414141414 0.050505050505050504 0.050505050505050504).join("\t")+"\n"
|
104
|
+
|
105
|
+
assert_equal expected, `#{script_path} -k 2 -m 1 #{File.join(TEST_DATA_DIR,'100random.fa')}`
|
106
|
+
end
|
80
107
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bio-kmer_counter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2013-03-27 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bio
|
16
|
-
requirement: &
|
16
|
+
requirement: &73018740 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 1.4.2
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *73018740
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: progressbar
|
27
|
-
requirement: &
|
27
|
+
requirement: &73018270 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 0.11.0
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *73018270
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: parallel
|
38
|
-
requirement: &
|
38
|
+
requirement: &73017680 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: 0.5.17
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *73017680
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: bio-logger
|
49
|
-
requirement: &
|
49
|
+
requirement: &73016840 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: 1.0.1
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *73016840
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: shoulda
|
60
|
-
requirement: &
|
60
|
+
requirement: &73016590 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,32 +65,32 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :development
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *73016590
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: rdoc
|
71
|
-
requirement: &
|
71
|
+
requirement: &73016160 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
|
-
- -
|
74
|
+
- - ! '>='
|
75
75
|
- !ruby/object:Gem::Version
|
76
76
|
version: '3.12'
|
77
77
|
type: :development
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *73016160
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: jeweler
|
82
|
-
requirement: &
|
82
|
+
requirement: &73015700 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
|
-
- -
|
85
|
+
- - ! '>='
|
86
86
|
- !ruby/object:Gem::Version
|
87
87
|
version: 1.8.3
|
88
88
|
type: :development
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *73015700
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
92
|
name: bundler
|
93
|
-
requirement: &
|
93
|
+
requirement: &73015320 !ruby/object:Gem::Requirement
|
94
94
|
none: false
|
95
95
|
requirements:
|
96
96
|
- - ! '>='
|
@@ -98,18 +98,18 @@ dependencies:
|
|
98
98
|
version: 1.0.21
|
99
99
|
type: :development
|
100
100
|
prerelease: false
|
101
|
-
version_requirements: *
|
101
|
+
version_requirements: *73015320
|
102
102
|
- !ruby/object:Gem::Dependency
|
103
103
|
name: rdoc
|
104
|
-
requirement: &
|
104
|
+
requirement: &73015000 !ruby/object:Gem::Requirement
|
105
105
|
none: false
|
106
106
|
requirements:
|
107
|
-
- -
|
107
|
+
- - ! '>='
|
108
108
|
- !ruby/object:Gem::Version
|
109
109
|
version: '3.12'
|
110
110
|
type: :development
|
111
111
|
prerelease: false
|
112
|
-
version_requirements: *
|
112
|
+
version_requirements: *73015000
|
113
113
|
description: A biogem for counting small kmers for fingerprinting nucleotide sequences.
|
114
114
|
See README for details.
|
115
115
|
email: gmail.com after donttrustben
|
@@ -130,6 +130,7 @@ files:
|
|
130
130
|
- bin/kmer_counter.rb
|
131
131
|
- lib/bio-kmer_counter.rb
|
132
132
|
- lib/bio-kmer_counter/kmer_counter.rb
|
133
|
+
- test/data/100random.fa
|
133
134
|
- test/helper.rb
|
134
135
|
- test/test_bio-kmer_counter.rb
|
135
136
|
homepage: http://github.com/wwood/bioruby-kmer_counter
|
@@ -147,7 +148,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
147
148
|
version: '0'
|
148
149
|
segments:
|
149
150
|
- 0
|
150
|
-
hash:
|
151
|
+
hash: -117512543
|
151
152
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
152
153
|
none: false
|
153
154
|
requirements:
|