qu-seqcluster 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +19 -0
- data/Gemfile +5 -0
- data/LICENSE.txt +22 -0
- data/README.md +45 -0
- data/Rakefile +1 -0
- data/bin/seqcluster +6 -0
- data/lib/qu/seqcluster.rb +13 -0
- data/lib/qu/seqcluster/cluster.rb +53 -0
- data/lib/qu/seqcluster/options.rb +106 -0
- data/lib/qu/seqcluster/program.rb +5 -0
- data/lib/qu/seqcluster/runner.rb +29 -0
- data/lib/qu/seqcluster/version.rb +5 -0
- data/qu-seqcluster.gemspec +25 -0
- metadata +100 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: e998132776509d1b787ab5ba02ebf81c43179ce9
|
4
|
+
data.tar.gz: ec82c798b27f9eaeb12f8554be750adb46b1a8d7
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: f8027e2bfa6f14d3d9c7664f2efac98de8dc19e8786552c47ce759c2cf4a7912b8cf5b77b86112ea3b44e7ae98e01fd66be86363ee768ed22adf2f6c88ad8ccf
|
7
|
+
data.tar.gz: 51ad5ab6631c161db2e08025c6228fed584dfb640cde2cc1a956dc23e3373689708f0476b4620e8664fb3f3062a551b4ec13a84728f0ae5692d3749054ea0693
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Wubin Qu
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
# Qu::Seqcluster
|
2
|
+
|
3
|
+
Cluster DNA/RNA sequences using a k-mer based algorithm.
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'qu-seqcluster'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install qu-seqcluster
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
```
|
22
|
+
Usage: seqcluster -i seq.fasta -k 9 -c 0.9
|
23
|
+
|
24
|
+
Required options:
|
25
|
+
|
26
|
+
-i, --in File Input sequence file in fasta format.
|
27
|
+
|
28
|
+
Optional options:
|
29
|
+
|
30
|
+
-k, --kvalue Integer K value, default is 9.
|
31
|
+
-o, --out File Output file name for storing the results, default is screen.
|
32
|
+
-c, --cutoff Float Cutoff value for the similarity for cluster, default is 0.9.
|
33
|
+
|
34
|
+
|
35
|
+
-h, --help Show this message and quit
|
36
|
+
-v, --version Show version
|
37
|
+
```
|
38
|
+
|
39
|
+
## Contributing
|
40
|
+
|
41
|
+
1. Fork it
|
42
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
43
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
44
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
45
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
data/bin/seqcluster
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
require "qu/utils"
|
2
|
+
|
3
|
+
require_relative "seqcluster/program"
|
4
|
+
require_relative "seqcluster/version"
|
5
|
+
require_relative "seqcluster/options"
|
6
|
+
require_relative "seqcluster/cluster"
|
7
|
+
require_relative "seqcluster/runner"
|
8
|
+
|
9
|
+
module Qu
|
10
|
+
module Seqcluster
|
11
|
+
# Namespace only: nothing is defined here; the program name, version, option parsing, clustering and runner are loaded by the require_relative lines above.
|
12
|
+
end
|
13
|
+
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
module Qu
  module Seqcluster
    # Greedy, seed-based clustering of FASTA records by shared k-mers.
    #
    # Records are handled longest first. In every round the longest remaining
    # record becomes the seed of a new cluster, and each other remaining record
    # whose similarity to that seed (on either strand) reaches the cutoff is
    # moved into the cluster.
    class Cluster
      attr_reader :opts

      # @param opts [#in, #kvalue, #cutoff] run settings: path of the FASTA
      #   input, k-mer length and minimum similarity for cluster membership
      def initialize(opts)
        @opts = opts
      end

      # Reads the input file and groups its records.
      #
      # @return [Array<Array>] one array per cluster; the first element is the
      #   seed record, every following element is a
      #   [record, similarity, strand] triple
      def find_cluster
        records = load_records.sort_by { |record| -record.seq.size }

        groups = []
        until records.empty?
          seed_record = records.shift
          seed_kmer_set = seed_kmer(seed_record, @opts.kvalue, 1)

          # Score every remaining record once, then split them in a single
          # pass into members of this cluster and candidates for later rounds.
          scored = records.map { |record| [record, *best_strand(record, seed_kmer_set)] }
          members, rest = scored.partition { |_record, similarity, _strand| similarity >= @opts.cutoff }

          groups << [seed_record, *members]
          records = rest.map(&:first)
        end

        groups
      end

      private

      # Loads every FASTA entry; the block form closes the file handle as soon
      # as the entries have been read.
      def load_records
        File.open(@opts.in) { |io| Bio::FlatFile.new(Bio::FastaFormat, io).to_a }
      end

      # Scores a record against the seed on both strands.
      #
      # @return [Array(Float, String)] the higher similarity and its strand;
      #   '+' only when the forward strand is strictly better, otherwise '-'
      def best_strand(record, seed_kmer_set)
        plus = cal_similarity(record.naseq, @opts.kvalue, seed_kmer_set)
        minus = cal_similarity(record.naseq.reverse_complement, @opts.kvalue, seed_kmer_set)
        plus > minus ? [plus, '+'] : [minus, '-']
      end

      # Collects the distinct k-mers of the seed record.
      #
      # @param record [#naseq] seed record
      # @param window [Integer] k-mer length
      # @param step [Integer] distance between consecutive window starts
      # @return [Set<Symbol>] k-mers yielded by window_search, as symbols
      def seed_kmer(record, window, step)
        mer_set = Set.new
        record.naseq.window_search(window, step) { |s| mer_set.add(s.to_sym) }
        mer_set
      end

      # Share of the sequence's non-overlapping k-mer windows that occur in
      # the seed's k-mer set.
      #
      # window_search is expected to return the trailing part of +seq+ that is
      # not covered by a full window (the arithmetic below relies on that);
      # for a sequence shorter than +k+ it may return nil or leave nothing
      # covered, in which case the similarity is reported as 0.0 instead of
      # raising or producing NaN.
      #
      # @param seq [#window_search, #size] sequence to score
      # @param k [Integer] k-mer length
      # @param seed_kmer_set [Set<Symbol>] k-mers of the seed
      # @return [Float] value between 0.0 and 1.0
      def cal_similarity(seq, k, seed_kmer_set)
        hits = 0
        remainder = seq.window_search(k, k) { |s| hits += 1 if seed_kmer_set.include?(s.to_sym) }

        covered = seq.size - remainder.to_s.size
        return 0.0 unless hits > 0 && covered > 0

        (hits.to_f * k) / covered
      end
    end # Cluster
  end
end
|
@@ -0,0 +1,106 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'optparse'
|
4
|
+
require 'ostruct'
|
5
|
+
|
6
|
+
module Qu
  module Seqcluster

    module_function

    # Builds the settings used for every option not given on the command line.
    #
    # @return [OpenStruct] +in+ is nil, +out+ is $stdout, +kvalue+ is 9 and
    #   +cutoff+ is 0.9
    def default_options
      options = OpenStruct.new
      options.in = nil
      options.out = $stdout

      options.kvalue = 9
      options.cutoff = 0.9

      options
    end

    # Command-line parser for the seqcluster program.
    #
    # Parsing happens in the constructor; the result is available via #opts.
    # Invalid input terminates the process with exit status 1 after printing
    # a message to $stderr; -h/--help and -v/--version print to $stdout and
    # exit with status 0.
    class Options

      attr_reader :opts

      # @param argv [Array<String>] raw command-line arguments; recognised
      #   options are removed from the array while parsing
      def initialize(argv)
        @opts = parse_opts(argv)
      end

      # Parses +argv+ on top of the defaults.
      #
      # @param argv [Array<String>] raw command-line arguments; an empty
      #   array is treated like ["-h"]
      # @return [OpenStruct] the effective settings (see default_options)
      def parse_opts(argv)
        options = Seqcluster.default_options

        parser = OptionParser.new do |opts|
          opts.banner = "#{PROGRAM} [#{VERSION}]: Cluster DNA/RNA based on k-mer algorithm\n" \
                        "Usage: #{PROGRAM} -i seq.fasta -k 9 -c 0.9"

          opts.separator ""
          opts.separator "Required options:"
          opts.separator ""

          opts.on('-i', '--in File', String, 'Input sequence file in fasta format.') do |value|
            quit_with_error("Error: #{value} is a directory.") if File.directory?(value)
            # File.exist? is the supported spelling; File.exists? was removed in Ruby 3.2.
            quit_with_error("Error: #{value} does not exist.") unless File.exist?(value)
            options.in = value
          end

          opts.separator ""
          opts.separator "Optional options:"
          opts.separator ""

          opts.on('-k', '--kvalue Integer', Integer, "K value, default is #{options.kvalue}.") do |value|
            options.kvalue = value
          end

          opts.on("-o", "--out File", String, "Output file name for storing the results, default is screen.") do |value|
            # The handle is opened here and handed to the caller through
            # options.out; this class never closes it.
            options.out = File.open(value, 'w')
          end

          opts.on('-c', '--cutoff Float', Float, "Cutoff value for the similarity for cluster, default is #{options.cutoff}.") do |value|
            options.cutoff = value
          end

          opts.separator ""
          opts.separator ""

          opts.on("-h", "--help", "Show this message and quit") do
            puts opts
            exit
          end

          opts.on("-v", "--version", "Show version") do
            puts "#{PROGRAM} #{VERSION}"
            exit
          end

          opts.separator ""
          opts.separator "Author: Wubin Qu <quwubin@gmail.com>"
          opts.separator ""
        end

        begin
          # Running the tool without arguments shows the help text.
          argv = ["-h"] if argv.empty?
          parser.parse!(argv)
        rescue OptionParser::ParseError => e
          $stderr.puts e.message, "\n", parser
          exit 1
        end

        quit_with_error("Error: option -i required.") if options.in.nil?

        options
      end

      private

      # Reports a usage error on $stderr and stops the process with a
      # non-zero status so that shells and scripts can detect the failure.
      def quit_with_error(message)
        $stderr.puts message
        exit 1
      end
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require_relative 'options'
|
2
|
+
require_relative 'cluster'
|
3
|
+
|
4
|
+
module Qu
  module Seqcluster
    # Runs one clustering job and prints its report.
    class Runner
      attr_reader :opts

      # @param opts [#out] settings object; it is also handed to Cluster, and
      #   its +out+ must respond to +puts+
      def initialize(opts)
        @opts = opts
      end

      # Clusters the input described by the settings and writes the report.
      def run
        output(Cluster.new(@opts).find_cluster)
      end

      # Writes the cluster count, then for every cluster its seed followed by
      # one tab-indented line per member.
      #
      # @param groups [Array<Array>] clusters; each starts with the seed record
      #   and continues with [record, similarity, strand] triples
      def output(groups)
        sink = @opts.out
        sink.puts "Cluster count: #{groups.size}"

        groups.each.with_index(1) do |group, number|
          seed, *members = group
          sink.puts "Cluster #{number}: #{seed.entry_id}"

          members.each do |record, similarity, strand|
            sink.puts "\tMember: #{record.entry_id} [#{strand}/#{similarity.round(2)}]"
          end
        end
      end

    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'qu/seqcluster/version'
|
5
|
+
|
6
|
+
# Gem specification for qu-seqcluster; the version comes from
# Qu::Seqcluster::VERSION, which is required before this block.
Gem::Specification.new do |gem|
  # Identity and authorship.
  gem.name          = "qu-seqcluster"
  gem.version       = Qu::Seqcluster::VERSION
  gem.authors       = ["Wubin Qu"]
  gem.email         = ["quwubin@gmail.com"]
  gem.summary       = "k-mer based cluster program"
  gem.description   = "Cluster DNA/RNA based on k-mer algorithm"
  gem.homepage      = "https://github.com/quwubin/qu-seqcluster"
  gem.license       = "MIT"

  # Packaged files are whatever git tracks; executables and test files are
  # picked out of that list by their directory prefix.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}) { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # Needed at run time.
  gem.add_runtime_dependency 'qu-utils', '~> 1.0'

  # Needed only for development.
  gem.add_development_dependency "bundler", "~> 1.3"
  gem.add_development_dependency "rake"
end
|
metadata
ADDED
@@ -0,0 +1,100 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: qu-seqcluster
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Wubin Qu
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-04-28 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: qu-utils
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: bundler
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '1.3'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '1.3'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rake
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
description: Cluster DNA/RNA based on k-mer algorithm
|
56
|
+
email:
|
57
|
+
- quwubin@gmail.com
|
58
|
+
executables:
|
59
|
+
- seqcluster
|
60
|
+
extensions: []
|
61
|
+
extra_rdoc_files: []
|
62
|
+
files:
|
63
|
+
- ".gitignore"
|
64
|
+
- Gemfile
|
65
|
+
- LICENSE.txt
|
66
|
+
- README.md
|
67
|
+
- Rakefile
|
68
|
+
- bin/seqcluster
|
69
|
+
- lib/qu/seqcluster.rb
|
70
|
+
- lib/qu/seqcluster/cluster.rb
|
71
|
+
- lib/qu/seqcluster/options.rb
|
72
|
+
- lib/qu/seqcluster/program.rb
|
73
|
+
- lib/qu/seqcluster/runner.rb
|
74
|
+
- lib/qu/seqcluster/version.rb
|
75
|
+
- qu-seqcluster.gemspec
|
76
|
+
homepage: https://github.com/quwubin/qu-seqcluster
|
77
|
+
licenses:
|
78
|
+
- MIT
|
79
|
+
metadata: {}
|
80
|
+
post_install_message:
|
81
|
+
rdoc_options: []
|
82
|
+
require_paths:
|
83
|
+
- lib
|
84
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
85
|
+
requirements:
|
86
|
+
- - ">="
|
87
|
+
- !ruby/object:Gem::Version
|
88
|
+
version: '0'
|
89
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
90
|
+
requirements:
|
91
|
+
- - ">="
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '0'
|
94
|
+
requirements: []
|
95
|
+
rubyforge_project:
|
96
|
+
rubygems_version: 2.2.0
|
97
|
+
signing_key:
|
98
|
+
specification_version: 4
|
99
|
+
summary: k-mer based cluster program
|
100
|
+
test_files: []
|