fasta_util 0.2.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +0 -0
- data/Gemfile +0 -0
- data/Gemfile.lock +0 -0
- data/LICENSE.txt +0 -0
- data/README.rdoc +0 -0
- data/Rakefile +0 -0
- data/VERSION +1 -1
- data/bin/fasta_util +85 -1
- data/fasta_util.gemspec +2 -3
- data/test/helper.rb +0 -0
- data/test/test_fasta_util.rb +0 -0
- metadata +3 -4
- data/lib/fasta_util.rb +0 -86
data/.document
CHANGED
File without changes
|
data/Gemfile
CHANGED
File without changes
|
data/Gemfile.lock
CHANGED
File without changes
|
data/LICENSE.txt
CHANGED
File without changes
|
data/README.rdoc
CHANGED
File without changes
|
data/Rakefile
CHANGED
File without changes
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.1
|
1
|
+
0.2.2
|
data/bin/fasta_util
CHANGED
@@ -1,4 +1,88 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
require 'fasta_util'
|
2
|
+
require 'thor'
|
3
|
+
require 'bio'
|
3
4
|
|
5
|
+
class FastaUtility < Thor
|
6
|
+
include Thor::Actions
|
7
|
+
Struct.new("Stats", :sum, :l50, :n50, :count, :mean, :median)
|
8
|
+
|
9
|
+
no_tasks do
|
10
|
+
def stats(lengths)
|
11
|
+
lengths = lengths.sort{|a, b| b <=> a}
|
12
|
+
stats = Struct::Stats.new
|
13
|
+
|
14
|
+
temp_sum = 0
|
15
|
+
stats[:sum] = lengths.inject(:+)
|
16
|
+
stats[:l50] = lengths.find{|length| (temp_sum += length) > stats[:sum]/2.0}
|
17
|
+
stats[:n50] = lengths.count{|length| length >= stats[:l50]}
|
18
|
+
stats[:mean] = stats[:sum].to_f/lengths.length
|
19
|
+
stats[:median] = (lengths.length % 2 == 0) ? (lengths[lengths.length/2-1] + lengths[lengths.length/2])/2.0 : lengths[lengths.length/2]
|
20
|
+
stats[:count] = lengths.count
|
21
|
+
return stats
|
22
|
+
end
|
23
|
+
|
24
|
+
def format(stats)
|
25
|
+
output = []
|
26
|
+
buffer_length = stats.members.map{|key| key.length}.max
|
27
|
+
stats.each_pair do |key, value|
|
28
|
+
numtype = value.is_a?(Float) ? "f" : "d"
|
29
|
+
output << " %-#{buffer_length}s: %#{numtype}" % [key.to_s.capitalize, value]
|
30
|
+
end
|
31
|
+
output.join("\n")
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
desc "filecheck", "Checks to see if a given file exists. Used internally, don't worry about it too much", :hide => true
|
36
|
+
def filecheck(filename)
|
37
|
+
say "The file '#{filename}' doesn't seem to exist!", :red unless File.exists?(filename)
|
38
|
+
end
|
39
|
+
|
40
|
+
desc "lengths", "Print a set of summary statistics for the given fasta file, including L50, N50, sum and count."
|
41
|
+
method_options [:cutoff, '-c'] => 0
|
42
|
+
def lengths(filename)
|
43
|
+
invoke :filecheck
|
44
|
+
lengths = Bio::FlatFile.open(filename).map{|entry| (entry.seq[-1,1] == "*") ? entry.length - 1 : entry.length}
|
45
|
+
|
46
|
+
say "All entries", :green
|
47
|
+
puts format(stats(lengths))
|
48
|
+
if options.cutoff > 0
|
49
|
+
say "Entries with length >= #{options.cutoff}", :green
|
50
|
+
puts format(stats(lengths.find_all{|l| l >= options.cutoff}))
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
desc "filter FILENAME [options]", "Impose a filter or set of filters on entries in a fasta file."
|
55
|
+
long_desc "Impose a filter or set of filters on entries in a fasta file where each sequence in the file has to pass all of the filters to be printed."
|
56
|
+
method_option :length_cutoff, :aliases => '-l', :type => :numeric, :default => 0, :desc => 'Only entries with length >= cutoff will be returned.'
|
57
|
+
method_option :inverse_match, :aliases => '-v', :type => :boolean, :desc => "Return the inverse of the match after all the other filters have been applied."
|
58
|
+
method_option :defline_grep, :aliases => '-d', :type => :string, :default => '', :desc => "A regular expression, used to search the entry's definition line."
|
59
|
+
def filter(filename)
|
60
|
+
invoke :filecheck
|
61
|
+
Bio::FlatFile.open(filename).each do |entry|
|
62
|
+
passed = true
|
63
|
+
passed &&= (entry.length >= options.length_cutoff)
|
64
|
+
passed &&= (entry.definition.match(Regexp.new(options.defline_grep)))
|
65
|
+
passed = !passed if options.inverse_match
|
66
|
+
puts entry if passed
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
desc "clean FILENAME [options]", "Clean up a fasta file"
|
71
|
+
method_option :wrap_width, :aliases => '-w', :type => :numeric, :desc => 'Wrap the fasta to N columns'
|
72
|
+
def clean(filename)
|
73
|
+
invoke :filecheck
|
74
|
+
Bio::FlatFile.open(filename).each do |entry|
|
75
|
+
puts entry.to_biosequence.output(:fasta, :header => entry.definition, :width => options.wrap_width)
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
|
80
|
+
desc "sort FILENAME [options]", "Sorts a fasta file according to criteria"
|
81
|
+
def sort(filename)
|
82
|
+
invoke :filecheck
|
83
|
+
Bio::FlatFile.open(filename).to_a.sort{|a,b| b.length <=> a.length}.each do |entry|
|
84
|
+
puts entry
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
4
88
|
FastaUtility.start
|
data/fasta_util.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{fasta_util}
|
8
|
-
s.version = "0.2.1"
|
8
|
+
s.version = "0.2.2"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["robsyme"]
|
12
|
-
s.date = %q{2011-02-
|
12
|
+
s.date = %q{2011-02-18}
|
13
13
|
s.default_executable = %q{fasta_util}
|
14
14
|
s.description = %q{Easy fasta filtering, wrapping, calculating common statistics, sorting etc. Based on the fasta_tool script that I think was written by Jason Stajich.}
|
15
15
|
s.email = %q{rob.syme@gmail.com}
|
@@ -28,7 +28,6 @@ Gem::Specification.new do |s|
|
|
28
28
|
"VERSION",
|
29
29
|
"bin/fasta_util",
|
30
30
|
"fasta_util.gemspec",
|
31
|
-
"lib/fasta_util.rb",
|
32
31
|
"test/helper.rb",
|
33
32
|
"test/test_fasta_util.rb"
|
34
33
|
]
|
data/test/helper.rb
CHANGED
File without changes
|
data/test/test_fasta_util.rb
CHANGED
File without changes
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: fasta_util
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.2.1
|
5
|
+
version: 0.2.2
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- robsyme
|
@@ -10,7 +10,7 @@ autorequire:
|
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
|
13
|
-
date: 2011-02-
|
13
|
+
date: 2011-02-18 00:00:00 +08:00
|
14
14
|
default_executable: fasta_util
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
@@ -109,7 +109,6 @@ files:
|
|
109
109
|
- VERSION
|
110
110
|
- bin/fasta_util
|
111
111
|
- fasta_util.gemspec
|
112
|
-
- lib/fasta_util.rb
|
113
112
|
- test/helper.rb
|
114
113
|
- test/test_fasta_util.rb
|
115
114
|
has_rdoc: true
|
@@ -126,7 +125,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
126
125
|
requirements:
|
127
126
|
- - ">="
|
128
127
|
- !ruby/object:Gem::Version
|
129
|
-
hash: -
|
128
|
+
hash: -443172315906729972
|
130
129
|
segments:
|
131
130
|
- 0
|
132
131
|
version: "0"
|
data/lib/fasta_util.rb
DELETED
@@ -1,86 +0,0 @@
|
|
1
|
-
require 'thor'
|
2
|
-
require 'bio'
|
3
|
-
|
4
|
-
class FastaUtility < Thor
|
5
|
-
include Thor::Actions
|
6
|
-
Struct.new("Stats", :sum, :l50, :n50, :count, :mean, :median)
|
7
|
-
|
8
|
-
no_tasks do
|
9
|
-
def stats(lengths)
|
10
|
-
lengths = lengths.sort{|a, b| b <=> a}
|
11
|
-
stats = Struct::Stats.new
|
12
|
-
|
13
|
-
temp_sum = 0
|
14
|
-
stats[:sum] = lengths.inject(:+)
|
15
|
-
stats[:l50] = lengths.find{|length| (temp_sum += length) > stats[:sum]/2.0}
|
16
|
-
stats[:n50] = lengths.count{|length| length >= stats[:l50]}
|
17
|
-
stats[:mean] = stats[:sum].to_f/lengths.length
|
18
|
-
stats[:median] = (lengths.length % 2 == 0) ? (lengths[lengths.length/2-1] + lengths[lengths.length/2])/2.0 : lengths[lengths.length/2]
|
19
|
-
stats[:count] = lengths.count
|
20
|
-
return stats
|
21
|
-
end
|
22
|
-
|
23
|
-
def format(stats)
|
24
|
-
output = []
|
25
|
-
buffer_length = stats.members.map{|key| key.length}.max
|
26
|
-
stats.each_pair do |key, value|
|
27
|
-
numtype = value.is_a?(Float) ? "f" : "d"
|
28
|
-
output << " %-#{buffer_length}s: %#{numtype}" % [key.to_s.capitalize, value]
|
29
|
-
end
|
30
|
-
output.join("\n")
|
31
|
-
end
|
32
|
-
end
|
33
|
-
|
34
|
-
desc "filecheck", "Checks to see if a given file exists. Used internally, don't worry about it too much", :hide => true
|
35
|
-
def filecheck(filename)
|
36
|
-
say "The file '#{filename}' doesn't seem to exist!", :red unless File.exists?(filename)
|
37
|
-
end
|
38
|
-
|
39
|
-
desc "lengths", "Print a set of summary statistics for the given fasta file, including L50, N50, sum and count."
|
40
|
-
method_options [:cutoff, '-c'] => 0
|
41
|
-
def lengths(filename)
|
42
|
-
invoke :filecheck
|
43
|
-
lengths = Bio::FlatFile.open(filename).map{|entry| (entry.seq[-1,1] == "*") ? entry.length - 1 : entry.length}
|
44
|
-
|
45
|
-
say "All entries", :green
|
46
|
-
puts format(stats(lengths))
|
47
|
-
if options.cutoff > 0
|
48
|
-
say "Entries with length >= #{options.cutoff}", :green
|
49
|
-
puts format(stats(lengths.find_all{|l| l >= options.cutoff}))
|
50
|
-
end
|
51
|
-
end
|
52
|
-
|
53
|
-
desc "filter FILENAME [options]", "Impose a filter or set of filters on entries in a fasta file."
|
54
|
-
long_desc "Impose a filter or set of filters on entries in a fasta file where each sequence in the file has to pass all of the filters to be printed."
|
55
|
-
method_option :length_cutoff, :aliases => '-l', :type => :numeric, :default => 0, :desc => 'Only entries with length >= cutoff will be returned.'
|
56
|
-
method_option :inverse_match, :aliases => '-v', :type => :boolean, :desc => "Return the inverse of the match after all the other filters have been applied."
|
57
|
-
method_option :defline_grep, :aliases => '-d', :type => :string, :default => '', :desc => "A regular expression, used to search the entry's definition line."
|
58
|
-
def filter(filename)
|
59
|
-
invoke :filecheck
|
60
|
-
Bio::FlatFile.open(filename).each do |entry|
|
61
|
-
passed = true
|
62
|
-
passed &&= (entry.length >= options.length_cutoff)
|
63
|
-
passed &&= (entry.definition.match(Regexp.new(options.defline_grep)))
|
64
|
-
passed = !passed if options.inverse_match
|
65
|
-
puts entry if passed
|
66
|
-
end
|
67
|
-
end
|
68
|
-
|
69
|
-
desc "clean FILENAME [options]", "Clean up a fasta file"
|
70
|
-
method_option :wrap_width, :aliases => '-w', :type => :numeric, :desc => 'Wrap the fasta to N columns'
|
71
|
-
def clean(filename)
|
72
|
-
invoke :filecheck
|
73
|
-
Bio::FlatFile.open(filename).each do |entry|
|
74
|
-
puts entry.to_biosequence.output(:fasta, :header => entry.definition, :width => options.wrap_width)
|
75
|
-
end
|
76
|
-
end
|
77
|
-
|
78
|
-
|
79
|
-
desc "sort FILENAME [options]", "Sorts a fasta file according to criteria"
|
80
|
-
def sort(filename)
|
81
|
-
invoke :filecheck
|
82
|
-
Bio::FlatFile.open(filename).to_a.sort{|a,b| b.length <=> a.length}.each do |entry|
|
83
|
-
puts entry
|
84
|
-
end
|
85
|
-
end
|
86
|
-
end
|