fasta_util 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +0 -0
- data/Gemfile +0 -0
- data/Gemfile.lock +0 -0
- data/LICENSE.txt +0 -0
- data/README.rdoc +0 -0
- data/Rakefile +0 -0
- data/VERSION +1 -1
- data/bin/fasta_util +85 -1
- data/fasta_util.gemspec +2 -3
- data/test/helper.rb +0 -0
- data/test/test_fasta_util.rb +0 -0
- metadata +3 -4
- data/lib/fasta_util.rb +0 -86
data/.document
CHANGED
File without changes
|
data/Gemfile
CHANGED
File without changes
|
data/Gemfile.lock
CHANGED
File without changes
|
data/LICENSE.txt
CHANGED
File without changes
|
data/README.rdoc
CHANGED
File without changes
|
data/Rakefile
CHANGED
File without changes
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.1
|
1
|
+
0.2.2
|
data/bin/fasta_util
CHANGED
@@ -1,4 +1,88 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
require '
|
2
|
+
require 'thor'
|
3
|
+
require 'bio'
|
3
4
|
|
5
|
+
class FastaUtility < Thor
|
6
|
+
include Thor::Actions
|
7
|
+
Struct.new("Stats", :sum, :l50, :n50, :count, :mean, :median)
|
8
|
+
|
9
|
+
no_tasks do
|
10
|
+
def stats(lengths)
|
11
|
+
lengths = lengths.sort{|a, b| b <=> a}
|
12
|
+
stats = Struct::Stats.new
|
13
|
+
|
14
|
+
temp_sum = 0
|
15
|
+
stats[:sum] = lengths.inject(:+)
|
16
|
+
stats[:l50] = lengths.find{|length| (temp_sum += length) > stats[:sum]/2.0}
|
17
|
+
stats[:n50] = lengths.count{|length| length >= stats[:l50]}
|
18
|
+
stats[:mean] = stats[:sum].to_f/lengths.length
|
19
|
+
stats[:median] = (lengths.length % 2 == 0) ? (lengths[lengths.length/2-1] + lengths[lengths.length/2])/2.0 : lengths[lengths.length/2]
|
20
|
+
stats[:count] = lengths.count
|
21
|
+
return stats
|
22
|
+
end
|
23
|
+
|
24
|
+
def format(stats)
|
25
|
+
output = []
|
26
|
+
buffer_length = stats.members.map{|key| key.length}.max
|
27
|
+
stats.each_pair do |key, value|
|
28
|
+
numtype = value.is_a?(Float) ? "f" : "d"
|
29
|
+
output << " %-#{buffer_length}s: %#{numtype}" % [key.to_s.capitalize, value]
|
30
|
+
end
|
31
|
+
output.join("\n")
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
desc "filecheck", "Checks to see if a given file exists. Used internally, don't worry about it too much", :hide => true
|
36
|
+
def filecheck(filename)
|
37
|
+
say "The file '#{filename}' doesn't seem to exist!", :red unless File.exists?(filename)
|
38
|
+
end
|
39
|
+
|
40
|
+
desc "lengths", "Print a set of summary statistics for the given fasta file, including L50, N50, sum and count."
|
41
|
+
method_options [:cutoff, '-c'] => 0
|
42
|
+
def lengths(filename)
|
43
|
+
invoke :filecheck
|
44
|
+
lengths = Bio::FlatFile.open(filename).map{|entry| (entry.seq[-1,1] == "*") ? entry.length - 1 : entry.length}
|
45
|
+
|
46
|
+
say "All entries", :green
|
47
|
+
puts format(stats(lengths))
|
48
|
+
if options.cutoff > 0
|
49
|
+
say "Entries with length >= #{options.cutoff}", :green
|
50
|
+
puts format(stats(lengths.find_all{|l| l >= options.cutoff}))
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
desc "filter FILENAME [options]", "Impose a filter or set of filters on entries in a fasta file."
|
55
|
+
long_desc "Impose a filter or set of filters on entries in a fasta file where each sequence in the file has to pass all of the filters to be printed."
|
56
|
+
method_option :length_cutoff, :aliases => '-l', :type => :numeric, :default => 0, :desc => 'Only entries with length >= cutoff will be returned.'
|
57
|
+
method_option :inverse_match, :aliases => '-v', :type => :boolean, :desc => "Return the inverse of the match after all the other filters have been applied."
|
58
|
+
method_option :defline_grep, :aliases => '-d', :type => :string, :default => '', :desc => "A regular expression, used to search the entry's definition line."
|
59
|
+
def filter(filename)
|
60
|
+
invoke :filecheck
|
61
|
+
Bio::FlatFile.open(filename).each do |entry|
|
62
|
+
passed = true
|
63
|
+
passed &&= (entry.length >= options.length_cutoff)
|
64
|
+
passed &&= (entry.definition.match(Regexp.new(options.defline_grep)))
|
65
|
+
passed = !passed if options.inverse_match
|
66
|
+
puts entry if passed
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
desc "clean FILENAME [options]", "Clean up a fasta file"
|
71
|
+
method_option :wrap_width, :aliases => '-w', :type => :numeric, :desc => 'Wrap the fasta to N columns'
|
72
|
+
def clean(filename)
|
73
|
+
invoke :filecheck
|
74
|
+
Bio::FlatFile.open(filename).each do |entry|
|
75
|
+
puts entry.to_biosequence.output(:fasta, :header => entry.definition, :width => options.wrap_width)
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
|
80
|
+
desc "sort FILENAME [options]", "Sorts a fasta file according to criteria"
|
81
|
+
def sort(filename)
|
82
|
+
invoke :filecheck
|
83
|
+
Bio::FlatFile.open(filename).to_a.sort{|a,b| b.length <=> a.length}.each do |entry|
|
84
|
+
puts entry
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
4
88
|
FastaUtility.start
|
data/fasta_util.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{fasta_util}
|
8
|
-
s.version = "0.2.1"
|
8
|
+
s.version = "0.2.2"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["robsyme"]
|
12
|
-
s.date = %q{2011-02-
|
12
|
+
s.date = %q{2011-02-18}
|
13
13
|
s.default_executable = %q{fasta_util}
|
14
14
|
s.description = %q{Easy fasta filtering, wrapping, calculating common statistics, sorting etc. Based on the fasta_tool script that I think was written by Jason Stajich.}
|
15
15
|
s.email = %q{rob.syme@gmail.com}
|
@@ -28,7 +28,6 @@ Gem::Specification.new do |s|
|
|
28
28
|
"VERSION",
|
29
29
|
"bin/fasta_util",
|
30
30
|
"fasta_util.gemspec",
|
31
|
-
"lib/fasta_util.rb",
|
32
31
|
"test/helper.rb",
|
33
32
|
"test/test_fasta_util.rb"
|
34
33
|
]
|
data/test/helper.rb
CHANGED
File without changes
|
data/test/test_fasta_util.rb
CHANGED
File without changes
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: fasta_util
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.2.1
|
5
|
+
version: 0.2.2
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- robsyme
|
@@ -10,7 +10,7 @@ autorequire:
|
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
|
13
|
-
date: 2011-02-
|
13
|
+
date: 2011-02-18 00:00:00 +08:00
|
14
14
|
default_executable: fasta_util
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
@@ -109,7 +109,6 @@ files:
|
|
109
109
|
- VERSION
|
110
110
|
- bin/fasta_util
|
111
111
|
- fasta_util.gemspec
|
112
|
-
- lib/fasta_util.rb
|
113
112
|
- test/helper.rb
|
114
113
|
- test/test_fasta_util.rb
|
115
114
|
has_rdoc: true
|
@@ -126,7 +125,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
126
125
|
requirements:
|
127
126
|
- - ">="
|
128
127
|
- !ruby/object:Gem::Version
|
129
|
-
hash: -
|
128
|
+
hash: -443172315906729972
|
130
129
|
segments:
|
131
130
|
- 0
|
132
131
|
version: "0"
|
data/lib/fasta_util.rb
DELETED
@@ -1,86 +0,0 @@
|
|
1
|
-
require 'thor'
|
2
|
-
require 'bio'
|
3
|
-
|
4
|
-
class FastaUtility < Thor
|
5
|
-
include Thor::Actions
|
6
|
-
Struct.new("Stats", :sum, :l50, :n50, :count, :mean, :median)
|
7
|
-
|
8
|
-
no_tasks do
|
9
|
-
def stats(lengths)
|
10
|
-
lengths = lengths.sort{|a, b| b <=> a}
|
11
|
-
stats = Struct::Stats.new
|
12
|
-
|
13
|
-
temp_sum = 0
|
14
|
-
stats[:sum] = lengths.inject(:+)
|
15
|
-
stats[:l50] = lengths.find{|length| (temp_sum += length) > stats[:sum]/2.0}
|
16
|
-
stats[:n50] = lengths.count{|length| length >= stats[:l50]}
|
17
|
-
stats[:mean] = stats[:sum].to_f/lengths.length
|
18
|
-
stats[:median] = (lengths.length % 2 == 0) ? (lengths[lengths.length/2-1] + lengths[lengths.length/2])/2.0 : lengths[lengths.length/2]
|
19
|
-
stats[:count] = lengths.count
|
20
|
-
return stats
|
21
|
-
end
|
22
|
-
|
23
|
-
def format(stats)
|
24
|
-
output = []
|
25
|
-
buffer_length = stats.members.map{|key| key.length}.max
|
26
|
-
stats.each_pair do |key, value|
|
27
|
-
numtype = value.is_a?(Float) ? "f" : "d"
|
28
|
-
output << " %-#{buffer_length}s: %#{numtype}" % [key.to_s.capitalize, value]
|
29
|
-
end
|
30
|
-
output.join("\n")
|
31
|
-
end
|
32
|
-
end
|
33
|
-
|
34
|
-
desc "filecheck", "Checks to see if a given file exists. Used internally, don't worry about it too much", :hide => true
|
35
|
-
def filecheck(filename)
|
36
|
-
say "The file '#{filename}' doesn't seem to exist!", :red unless File.exists?(filename)
|
37
|
-
end
|
38
|
-
|
39
|
-
desc "lengths", "Print a set of summary statistics for the given fasta file, including L50, N50, sum and count."
|
40
|
-
method_options [:cutoff, '-c'] => 0
|
41
|
-
def lengths(filename)
|
42
|
-
invoke :filecheck
|
43
|
-
lengths = Bio::FlatFile.open(filename).map{|entry| (entry.seq[-1,1] == "*") ? entry.length - 1 : entry.length}
|
44
|
-
|
45
|
-
say "All entries", :green
|
46
|
-
puts format(stats(lengths))
|
47
|
-
if options.cutoff > 0
|
48
|
-
say "Entries with length >= #{options.cutoff}", :green
|
49
|
-
puts format(stats(lengths.find_all{|l| l >= options.cutoff}))
|
50
|
-
end
|
51
|
-
end
|
52
|
-
|
53
|
-
desc "filter FILENAME [options]", "Impose a filter or set of filters on entries in a fasta file."
|
54
|
-
long_desc "Impose a filter or set of filters on entries in a fasta file where each sequence in the file has to pass all of the filters to be printed."
|
55
|
-
method_option :length_cutoff, :aliases => '-l', :type => :numeric, :default => 0, :desc => 'Only entries with length >= cutoff will be returned.'
|
56
|
-
method_option :inverse_match, :aliases => '-v', :type => :boolean, :desc => "Return the inverse of the match after all the other filters have been applied."
|
57
|
-
method_option :defline_grep, :aliases => '-d', :type => :string, :default => '', :desc => "A regular expression, used to search the entry's definition line."
|
58
|
-
def filter(filename)
|
59
|
-
invoke :filecheck
|
60
|
-
Bio::FlatFile.open(filename).each do |entry|
|
61
|
-
passed = true
|
62
|
-
passed &&= (entry.length >= options.length_cutoff)
|
63
|
-
passed &&= (entry.definition.match(Regexp.new(options.defline_grep)))
|
64
|
-
passed = !passed if options.inverse_match
|
65
|
-
puts entry if passed
|
66
|
-
end
|
67
|
-
end
|
68
|
-
|
69
|
-
desc "clean FILENAME [options]", "Clean up a fasta file"
|
70
|
-
method_option :wrap_width, :aliases => '-w', :type => :numeric, :desc => 'Wrap the fasta to N columns'
|
71
|
-
def clean(filename)
|
72
|
-
invoke :filecheck
|
73
|
-
Bio::FlatFile.open(filename).each do |entry|
|
74
|
-
puts entry.to_biosequence.output(:fasta, :header => entry.definition, :width => options.wrap_width)
|
75
|
-
end
|
76
|
-
end
|
77
|
-
|
78
|
-
|
79
|
-
desc "sort FILENAME [options]", "Sorts a fasta file according to criteria"
|
80
|
-
def sort(filename)
|
81
|
-
invoke :filecheck
|
82
|
-
Bio::FlatFile.open(filename).to_a.sort{|a,b| b.length <=> a.length}.each do |entry|
|
83
|
-
puts entry
|
84
|
-
end
|
85
|
-
end
|
86
|
-
end
|