bio-kmer_counter 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +3 -3
- data/README.md +28 -9
- data/VERSION +1 -1
- data/bin/kmer_counter.rb +7 -7
- data/test/data/100random.fa +2 -0
- data/test/helper.rb +2 -0
- data/test/test_bio-kmer_counter.rb +36 -9
- metadata +25 -24
data/Gemfile
CHANGED
@@ -11,8 +11,8 @@ gem 'bio-logger', '>=1.0.1'
|
|
11
11
|
# Include everything needed to run rake, tests, features, etc.
|
12
12
|
group :development do
|
13
13
|
gem "shoulda", ">= 0"
|
14
|
-
gem "rdoc", "
|
15
|
-
gem "jeweler",
|
14
|
+
gem "rdoc", ">= 3.12"
|
15
|
+
gem "jeweler",">= 1.8.3"
|
16
16
|
gem "bundler", ">= 1.0.21"
|
17
|
-
gem "rdoc", "
|
17
|
+
gem "rdoc", ">= 3.12"
|
18
18
|
end
|
data/README.md
CHANGED
@@ -4,28 +4,47 @@
|
|
4
4
|
|
5
5
|
bio-kmer_counter is a simple [biogem](http://biogem.info) for fingerprinting
|
6
6
|
nucleotide sequences by counting the occurrences of particular kmers in the
|
7
|
-
sequence. The methodology is not new, for
|
7
|
+
sequence. The methodology is not new, for a reference see
|
8
|
+
[Teeling et al. 2004](http://www.biomedcentral.com/1471-2105/5/163).
|
9
|
+
The default parameters are derived from the well explained methods section of
|
10
|
+
[Dick et al. 2009](http://genomebiology.com/content/10/8/R85).
|
8
11
|
|
9
12
|
This methodology is quite different to that of other software that counts
|
10
13
|
kmer content with longer kmers, e.g. [khmer](https://github.com/ged-lab/khmer).
|
11
|
-
Here only small kmers are intended (e.g.
|
12
|
-
|
13
|
-
Note: this software is under active development!
|
14
|
+
Here only small kmers are intended (e.g. 1-mer or 4-mer).
|
14
15
|
|
15
16
|
## Installation
|
16
17
|
|
18
|
+
After installing [Ruby](http://www.ruby-lang.org) itself, install the bio-kmer_counter rubygem:
|
19
|
+
|
17
20
|
```sh
|
18
21
|
gem install bio-kmer_counter
|
19
22
|
```
|
20
23
|
|
21
|
-
|
24
|
+
bio-kmer_counter is only tested on Linux, but probably works on OSX too. It might even work on Windows if
|
25
|
+
the progress bar is turned off. Maybe.
|
22
26
|
|
23
|
-
|
24
|
-
content, reporting the fingerprint of 5kb windows in each sequence separately,
|
25
|
-
plus the leftover part if it is longer than 2kb:
|
27
|
+
## Usage
|
26
28
|
|
29
|
+
The default parameters analyse a fasta file that contains one or more sequences in it for 4-mer (tetranucleotide)
|
30
|
+
content. By default, any sequence
|
31
|
+
in the fasta file 2kb or longer is included at least once. Sequences are split up
|
32
|
+
into 5kb windows if they are that long, and each window is reported separately.
|
33
|
+
If the leftover bit at the end after any 5kb windows is 2kb or longer then this is also included.
|
34
|
+
|
35
|
+
By default, each 4 base window in the input sequence is included exactly once in the output file.
|
36
|
+
To account for the fact
|
37
|
+
that the directions of sequences with respect to each other are presumed to be unknown (as is the
|
38
|
+
case for de-novo genome assembly), either the forward or reverse complement is included. Which one
|
39
|
+
(forward or reverse) depends on which one comes first alphabetically. So for instance if the window is ```CTTT```, then ```AAAG```
|
40
|
+
is used. Accounting for palindromic sequences like ```ATAT```, there are 136 of these lowest lexicographical 4-mers.
|
41
|
+
So there are 136 columns in the output, plus one for the name of the window. Using only one of the two orientations (forward or reverse complement) is
|
42
|
+
actually slightly different than the method outlined in Dick et al. 2009, but we
|
43
|
+
don't expect the results to differ.
|
44
|
+
|
45
|
+
Example usage, if you wish to fingerprint a fasta file ```my_nucleotide_sequences.fasta```:
|
27
46
|
```sh
|
28
|
-
kmer_counter.rb
|
47
|
+
kmer_counter.rb my_nucleotide_sequences.fasta >tetranucleotide_content.csv
|
29
48
|
```
|
30
49
|
|
31
50
|
The fingerprints are reported in percentages. Well, between 0 and 1, that is.
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.
|
1
|
+
0.1.1
|
data/bin/kmer_counter.rb
CHANGED
@@ -64,8 +64,8 @@ o = OptionParser.new do |opts|
|
|
64
64
|
opts.on("-l", "--window-length", "print the length of the window in the output [default #{options[:sequence_length]}]") do |v|
|
65
65
|
options[:sequence_length] = true
|
66
66
|
end
|
67
|
-
|
68
|
-
|
67
|
+
|
68
|
+
|
69
69
|
# logger options
|
70
70
|
opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") do |q|
|
71
71
|
Bio::Log::CLI.trace('error')
|
@@ -90,7 +90,7 @@ Bio::Log::CLI.configure(LOG_NAME)
|
|
90
90
|
|
91
91
|
# Print headers
|
92
92
|
print "ID\t"
|
93
|
-
print Bio::Sequence::Kmer.merge_down_to_lowest_lexigraphical_form(Bio::Sequence::Kmer.empty_full_kmer_hash(options[:kmer])).keys.join("\t")
|
93
|
+
print Bio::Sequence::Kmer.merge_down_to_lowest_lexigraphical_form(Bio::Sequence::Kmer.empty_full_kmer_hash(options[:kmer])).keys.sort.join("\t")
|
94
94
|
print "\tWindowLength" if options[:sequence_length]
|
95
95
|
print "\tcontig" if options[:contig_name]
|
96
96
|
puts
|
@@ -99,7 +99,7 @@ orig = Bio::Sequence::Kmer.empty_full_kmer_hash(options[:kmer])
|
|
99
99
|
process_window = lambda do |window,kmer,sequence_name,contig_name|
|
100
100
|
counts = orig.dup
|
101
101
|
num_kmers_counted = 0
|
102
|
-
|
102
|
+
|
103
103
|
window.window_search(options[:kmer],1) do |tetranucleotide|
|
104
104
|
str = tetranucleotide.to_s
|
105
105
|
next unless str.gsub(/[ATGC]+/,'') == ''
|
@@ -107,10 +107,10 @@ process_window = lambda do |window,kmer,sequence_name,contig_name|
|
|
107
107
|
counts[str]+=1
|
108
108
|
#counts[Bio::Sequence::NA.new(tetranucleotide).lowest_lexigraphical_form.to_s.upcase] += 1
|
109
109
|
end
|
110
|
-
|
110
|
+
|
111
111
|
# Merge everything into lowest lexigraphical form
|
112
112
|
new_counts = Bio::Sequence::Kmer.merge_down_to_lowest_lexigraphical_form counts
|
113
|
-
|
113
|
+
|
114
114
|
if num_kmers_counted == 0
|
115
115
|
log.warn "Skipping window #{sequence_name} because few/none ATGC's were detected (was it all N's?)"
|
116
116
|
else
|
@@ -127,7 +127,7 @@ end
|
|
127
127
|
fasta_filename = ARGV[0]
|
128
128
|
progress = nil
|
129
129
|
progress = ProgressBar.new('kmer_counter', `grep -c '>' '#{fasta_filename}'`.to_i) if options[:progressbar]
|
130
|
-
ff = Bio::FlatFile.open(fasta_filename)
|
130
|
+
ff = Bio::FlatFile.open(fasta_filename)
|
131
131
|
|
132
132
|
ff.each do |sequence|
|
133
133
|
window_counter = 0
|
data/test/helper.rb
CHANGED
@@ -8,12 +8,12 @@ class TestBioKmerCounter < Test::Unit::TestCase
|
|
8
8
|
assert_equal Bio::Sequence::NA.new('AA'), Bio::Sequence::NA.new('TT').lowest_lexigraphical_form
|
9
9
|
assert_equal Bio::Sequence::NA.new('AG'), Bio::Sequence::NA.new('CT').lowest_lexigraphical_form
|
10
10
|
end
|
11
|
-
|
11
|
+
|
12
12
|
should 'test_empty_full_kmer_hash' do
|
13
13
|
answer = {}; %w(A C G T).each{|k| answer[k] = 0}
|
14
14
|
assert_equal answer, Bio::Sequence::Kmer.empty_full_kmer_hash(1)
|
15
15
|
end
|
16
|
-
|
16
|
+
|
17
17
|
should 'test merge down' do
|
18
18
|
answer = {}; %w(A C).each{|k| answer[k] = 0}
|
19
19
|
full = Bio::Sequence::Kmer.empty_full_kmer_hash(1)
|
@@ -21,11 +21,11 @@ class TestBioKmerCounter < Test::Unit::TestCase
|
|
21
21
|
full = Bio::Sequence::Kmer.empty_full_kmer_hash #defaults to kmer hash length 4
|
22
22
|
assert_equal 136, Bio::Sequence::Kmer.merge_down_to_lowest_lexigraphical_form(full).length
|
23
23
|
end
|
24
|
-
|
24
|
+
|
25
25
|
def script_path
|
26
26
|
File.join(File.dirname(__FILE__),'..','bin','kmer_counter.rb')
|
27
27
|
end
|
28
|
-
|
28
|
+
|
29
29
|
should 'test_running1' do
|
30
30
|
Tempfile.open('one') do |tempfile|
|
31
31
|
tempfile.puts '>one'
|
@@ -35,7 +35,7 @@ class TestBioKmerCounter < Test::Unit::TestCase
|
|
35
35
|
assert_equal "ID\tA\tC\none_0\t0.6\t0.4\n", `#{script_path} -w 5 -k 1 #{tempfile.path}`
|
36
36
|
end
|
37
37
|
end
|
38
|
-
|
38
|
+
|
39
39
|
should 'not whack out when there isnt any sequence to count' do
|
40
40
|
Tempfile.open('one') do |tempfile|
|
41
41
|
tempfile.puts '>one'
|
@@ -45,13 +45,13 @@ class TestBioKmerCounter < Test::Unit::TestCase
|
|
45
45
|
assert_equal "ID\tA\tC\n", `#{script_path} -w 5 -k 1 #{tempfile.path}`
|
46
46
|
end
|
47
47
|
end
|
48
|
-
|
48
|
+
|
49
49
|
should 'give correct increments in window numbering' do
|
50
50
|
Tempfile.open('one') do |tempfile|
|
51
51
|
tempfile.puts '>one'
|
52
52
|
tempfile.puts 'ATGCATGCAT' #10 letters long
|
53
53
|
tempfile.close
|
54
|
-
|
54
|
+
|
55
55
|
expected = "ID\tA\tC\n"+
|
56
56
|
"one_0\t0.5\t0.5\n"+
|
57
57
|
"one_1\t0.5\t0.5\n"+
|
@@ -60,14 +60,14 @@ class TestBioKmerCounter < Test::Unit::TestCase
|
|
60
60
|
assert_equal expected, `#{script_path} -w 4 -k 1 -m 2 #{tempfile.path}`
|
61
61
|
end
|
62
62
|
end
|
63
|
-
|
63
|
+
|
64
64
|
should "print help when no arguments are given" do
|
65
65
|
command = "#{script_path}"
|
66
66
|
Open3.popen3(command) do |stdin, stdout, stderr|
|
67
67
|
assert stderr.readlines[0].match(/^Usage: kmer_counter/)
|
68
68
|
end
|
69
69
|
end
|
70
|
-
|
70
|
+
|
71
71
|
should 'work with lowercase' do
|
72
72
|
Tempfile.open('one') do |tempfile|
|
73
73
|
tempfile.puts '>one'
|
@@ -77,4 +77,31 @@ class TestBioKmerCounter < Test::Unit::TestCase
|
|
77
77
|
assert_equal "ID\tA\tC\none_0\t0.6\t0.4\n", `#{script_path} -w 5 -k 1 #{tempfile.path}`
|
78
78
|
end
|
79
79
|
end
|
80
|
+
|
81
|
+
should 'by default count contigs greater than 2kb but less than 5kb' do
|
82
|
+
Tempfile.open('one') do |tempfile|
|
83
|
+
tempfile.puts '>one'
|
84
|
+
tempfile.puts 'A'*2500
|
85
|
+
tempfile.close
|
86
|
+
|
87
|
+
assert_equal "ID\tA\tC\none_leftover_0\t1.0\t0.0\n", `#{script_path} -k 1 #{tempfile.path}`
|
88
|
+
end
|
89
|
+
end
|
90
|
+
|
91
|
+
should 'by default count contigs greater than 2kb but less than 5kb' do
|
92
|
+
Tempfile.open('one') do |tempfile|
|
93
|
+
tempfile.puts '>one'
|
94
|
+
tempfile.puts 'A'*7500
|
95
|
+
tempfile.close
|
96
|
+
|
97
|
+
assert_equal "ID\tA\tC\none_0\t1.0\t0.0\none_leftover_1\t1.0\t0.0\n", `#{script_path} -k 1 #{tempfile.path}`
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
should 'work simulated example with kmer length = 2' do
|
102
|
+
expected = %w(ID AA AC AG AT CA CC CG GA GC TA).join("\t")+"\n"+
|
103
|
+
%w(random_leftover_0 0.1111111111111111 0.13131313131313133 0.1414141414141414 0.0707070707070707 0.1717171717171717 0.1111111111111111 0.020202020202020204 0.1414141414141414 0.050505050505050504 0.050505050505050504).join("\t")+"\n"
|
104
|
+
|
105
|
+
assert_equal expected, `#{script_path} -k 2 -m 1 #{File.join(TEST_DATA_DIR,'100random.fa')}`
|
106
|
+
end
|
80
107
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bio-kmer_counter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2013-03-27 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bio
|
16
|
-
requirement: &
|
16
|
+
requirement: &73018740 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 1.4.2
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *73018740
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: progressbar
|
27
|
-
requirement: &
|
27
|
+
requirement: &73018270 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 0.11.0
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *73018270
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: parallel
|
38
|
-
requirement: &
|
38
|
+
requirement: &73017680 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: 0.5.17
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *73017680
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: bio-logger
|
49
|
-
requirement: &
|
49
|
+
requirement: &73016840 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: 1.0.1
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *73016840
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: shoulda
|
60
|
-
requirement: &
|
60
|
+
requirement: &73016590 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,32 +65,32 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :development
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *73016590
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: rdoc
|
71
|
-
requirement: &
|
71
|
+
requirement: &73016160 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
|
-
- -
|
74
|
+
- - ! '>='
|
75
75
|
- !ruby/object:Gem::Version
|
76
76
|
version: '3.12'
|
77
77
|
type: :development
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *73016160
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: jeweler
|
82
|
-
requirement: &
|
82
|
+
requirement: &73015700 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
|
-
- -
|
85
|
+
- - ! '>='
|
86
86
|
- !ruby/object:Gem::Version
|
87
87
|
version: 1.8.3
|
88
88
|
type: :development
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *73015700
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
92
|
name: bundler
|
93
|
-
requirement: &
|
93
|
+
requirement: &73015320 !ruby/object:Gem::Requirement
|
94
94
|
none: false
|
95
95
|
requirements:
|
96
96
|
- - ! '>='
|
@@ -98,18 +98,18 @@ dependencies:
|
|
98
98
|
version: 1.0.21
|
99
99
|
type: :development
|
100
100
|
prerelease: false
|
101
|
-
version_requirements: *
|
101
|
+
version_requirements: *73015320
|
102
102
|
- !ruby/object:Gem::Dependency
|
103
103
|
name: rdoc
|
104
|
-
requirement: &
|
104
|
+
requirement: &73015000 !ruby/object:Gem::Requirement
|
105
105
|
none: false
|
106
106
|
requirements:
|
107
|
-
- -
|
107
|
+
- - ! '>='
|
108
108
|
- !ruby/object:Gem::Version
|
109
109
|
version: '3.12'
|
110
110
|
type: :development
|
111
111
|
prerelease: false
|
112
|
-
version_requirements: *
|
112
|
+
version_requirements: *73015000
|
113
113
|
description: A biogem for counting small kmers for fingerprinting nucleotide sequences.
|
114
114
|
See README for details.
|
115
115
|
email: gmail.com after donttrustben
|
@@ -130,6 +130,7 @@ files:
|
|
130
130
|
- bin/kmer_counter.rb
|
131
131
|
- lib/bio-kmer_counter.rb
|
132
132
|
- lib/bio-kmer_counter/kmer_counter.rb
|
133
|
+
- test/data/100random.fa
|
133
134
|
- test/helper.rb
|
134
135
|
- test/test_bio-kmer_counter.rb
|
135
136
|
homepage: http://github.com/wwood/bioruby-kmer_counter
|
@@ -147,7 +148,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
147
148
|
version: '0'
|
148
149
|
segments:
|
149
150
|
- 0
|
150
|
-
hash:
|
151
|
+
hash: -117512543
|
151
152
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
152
153
|
none: false
|
153
154
|
requirements:
|