ms-error_rate 0.0.7
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +21 -0
- data/README.rdoc +53 -0
- data/VERSION +1 -0
- data/bin/error_rate +92 -0
- data/bin/fasta_to_nuclear.rb +14 -0
- data/bin/fasta_to_peptide_centric_db.rb +7 -0
- data/bin/fasta_to_phobius.rb +34 -0
- data/bin/generate_sbv_input_hashes.rb +62 -0
- data/bin/mascot_pepxml_to_peptide_hit_qvalues.rb +61 -0
- data/bin/phobius_to_nontransmembrane.rb +35 -0
- data/bin/qvalues.rb +105 -0
- data/spec/ms/error_rate/qvalue_spec.rb +25 -0
- data/spec/ms/error_rate_spec.rb +25 -0
- data/spec/spec_helper.rb +6 -0
- metadata +129 -0
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
Copyright shared among contributing institutions:
|
2
|
+
Copyright (c) 2006-2008 University of Texas at Austin (the initial project)
|
3
|
+
Copyright (c) 2009 Regents of the University of Colorado and Howard Hughes Medical Institute. (modularization of the project)
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
= {ms-error_rate}[http://mspire.rubyforge.org/projects/ms-error_rate]
|
2
|
+
|
3
|
+
An {Mspire}[http://mspire.rubyforge.org] library for calculating or dealing
|
4
|
+
with error rates. These may be from target-decoy searches, sample bias
|
5
|
+
validation, or other sources.
|
6
|
+
|
7
|
+
== Examples
|
8
|
+
|
9
|
+
=== Target-Decoy with Mascot
|
10
|
+
|
11
|
+
Generate q-values (right now only with Mascot and MascotPercolator):
|
12
|
+
|
13
|
+
require 'ms/error_rate/qvalue'
|
14
|
+
target_hits = Ms::ErrorRate::Qvalue::Mascot.qvalues(target_files, decoy_files)
|
15
|
+
# target_hit is a PeptideHit Struct (:filename, :query_title, :charge, :sequence, :mowse, :qvalue)
|
16
|
+
|
17
|
+
# or on the commandline:
|
18
|
+
% qvalues.rb <target>.dat <decoy>.dat
|
19
|
+
|
20
|
+
The same output can be produced from Mascot-Percolator output:
|
21
|
+
|
22
|
+
require 'ms/error_rate/qvalue'
|
23
|
+
target_hits = Ms::ErrorRate::Qvalue::Mascot::Percolator.qvalues(datp_files, tab_dot_text_files)
|
24
|
+
# or commandline:
|
25
|
+
% qvalues.rb <target>.datp <target>.tab.txt
|
26
|
+
|
27
|
+
=== Sample Bias Validation
|
28
|
+
|
29
|
+
Sample Bias Validation allows error rate determination based on expected biases in sample composition. Here is an example using transmembrane sequence content. We will assume a fasta file called `proteins.fasta`:
|
30
|
+
|
31
|
+
# create a peptide-centric database
|
32
|
+
fasta_to_peptide_centric_db.rb proteins.fasta # defaults 2 missed cleavages, min aaseq 4
|
33
|
+
# generates a file: proteins.msd_clvg2.min_aaseq4.yml
|
34
|
+
|
35
|
+
# create a transmembrane sequence prediction file
|
36
|
+
fasta_to_phobius.rb proteins.fasta # => generates proteins.phobius
|
37
|
+
|
38
|
+
generate_sbv_input_hashes.rb proteins.msd_clvg2.min_aaseq4.yml --tm proteins.phobius,1
|
39
|
+
# creates two files:
|
40
|
+
# proteins.msd_clvg2.min_aaseq4.tm_min1.by_aaseq.yml
|
41
|
+
# proteins.msd_clvg2.min_aaseq4.tm_min1.freq_by_length.yml
|
42
|
+
|
43
|
+
# cytosolic fraction (transmembrane sequences not expected):
|
44
|
+
error_rate qvalues.yml --fp-sbv proteins.msd_clvg2.min_aaseq4.tm_min1.by_aaseq.yml,\
|
45
|
+
proteins.msd_clvg2.min_aaseq4.tm_min1.freq_by_length.yml,0.05
|
46
|
+
|
47
|
+
== Installation
|
48
|
+
|
49
|
+
gem install ms-error_rate
|
50
|
+
|
51
|
+
== Copyright
|
52
|
+
|
53
|
+
See LICENSE
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.0.7
|
data/bin/error_rate
ADDED
@@ -0,0 +1,92 @@
|
|
1
|
+
#!/usr/bin/ruby
#
# error_rate: reads a q-values yaml file (keys 'headers' and 'data'), sorts
# the peptide hits best to worst, and prints the running precision reported
# by each sample bias validator given with --tp-sbv / --fp-sbv.

require 'support/sort_by_attributes'
require 'ms/error_rate'
require 'optparse'
require 'yaml' # YAML.load_file is used below

# how many auto-generated validator names have been handed out, per prefix
auto_name_counts = { 'tp' => 0, 'fp' => 0 }

raw_validator_args = []
validator_names = []

opt = {
  :order_by => [:qvalue],
}

opts = OptionParser.new do |op|
  op.banner = "usage: #{File.basename(__FILE__)} qvalues.yml ..."

  op.on("--order-by <Array>", Array, "the keys to order on (default: [qvalue])") {|keys| opt[:order_by] = keys.map {|key| key.to_sym } }
  op.on("--best-is-low <Array>", Array, "the keys where better score is lower") {|keys| opt[:best_is_low] = keys.map {|key| key.to_sym } }

  # registers --tp-sbv (true positive indicators) and --fp-sbv (false
  # positive indicators); both collect their arguments the same way
  [true, false].each do |boolean|
    prefix = boolean ? 'tp' : 'fp'
    descriptions = [
      "a #{boolean} positive indicator",
      "A = aaseq to indicator (0-1) yml file",
      "B = aaseq_length to frequency yml file",
      "C = rate indicators are #{boolean} pos.",
      "D = name of the validator",
    ]
    op.on("--#{prefix}-sbv <A,B,C[,D]>,", Array, *descriptions) do |v|
      (a, b, c, d) = v
      # an explicit name (D) wins; otherwise number the validators tp1, tp2, ... / fp1, fp2, ...
      name = d || "#{prefix}#{auto_name_counts[prefix] += 1}"
      validator_names << name
      # NOTE(review): whether the validator came from --tp-sbv or --fp-sbv is
      # not recorded here or passed on to Sbv.new -- confirm that is intended.
      raw_validator_args << [a, b, c]
    end
  end
end

opts.parse!

if ARGV.size != 1
  puts opts.to_s
  exit
elsif !opt[:order_by]
  puts "you must specify the order-by array!"
  exit
end

# the trailing hash tells sort_by_attributes which keys are "lower is better"
# (original note: "because we will sort normal and reverse the array")
sort_args = opt[:order_by] + [{:down => opt[:best_is_low] }]

# load one validator at a time
raw_validator_args.zip(validator_names) do |(a, b, c), _name|
  # NOTE(review): c arrives from the commandline as a String (e.g. "0.05");
  # confirm Sbv.new converts it.
  val = Ms::ErrorRate::Sbv.new(YAML.load_file(a), YAML.load_file(b), c)

  ARGV.each do |file|
    yaml = YAML.load_file(file)
    # Struct.new takes the member names as separate arguments, hence the splat
    pepclass = Struct.new(*yaml['headers'].map {|header| header.to_sym })
    # map (not each) so that peps holds the structs rather than the raw rows
    peps = yaml['data'].map {|ar| pepclass.new(*ar) }
    sorted_best_to_worst = peps.sort_by_attributes(sort_args)

    precision_vals = sorted_best_to_worst.map {|pep| val.update_precision(pep.aaseq) }

    # printed inside the loop: precision_vals is local to this block
    p precision_vals
  end
end
|
91
|
+
|
92
|
+
|
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/ruby
#
# Prints a usage message and exits when called without arguments.
# Nothing else is implemented in the visible script.

if ARGV.empty?
  puts "usage: #{File.basename(__FILE__)} <file>.fasta",
       "output: <file>"
  # (disabled) warning that phobius_to_nontransmembrane.rb should be run
  # first to weed out transmembrane proteins
  exit
end
|
11
|
+
|
12
|
+
|
13
|
+
|
14
|
+
|
@@ -0,0 +1,34 @@
|
|
1
|
+
#!/usr/bin/ruby
#
# Uploads each fasta file given on the commandline to the Phobius web server
# and writes the "short" format prediction found inside the reply's <pre>
# element to <file>.phobius.

require 'mechanize'

# Kept in a constant so that no block parameter can reuse (and, on ruby 1.8,
# overwrite) the url between files.
PHOBIUS_URL = 'http://phobius.sbc.su.se/'

if ARGV.size == 0
  puts "usage: #{File.basename(__FILE__)} <file>.fasta"
  puts "outputs <file>.phobius "
  puts "in short format"
  exit
end

# NOTE(review): WWW::Mechanize is the namespace of old mechanize releases;
# newer releases expose ::Mechanize -- confirm against the installed version.
a = WWW::Mechanize.new { |agent|
  agent.user_agent_alias = 'Mac Safari'
}

ARGV.each do |file|
  outfile = file.chomp(File.extname(file)) + '.phobius'
  a.get(PHOBIUS_URL) do |form_page|
    form = form_page.forms.first
    # ask for the short output format
    form.radiobuttons.find {|button| button.value == 'short' }.click
    fu = form.file_uploads.first
    fu.file_name = File.expand_path(file)
    reply = form.submit
    html = reply.body

    # the prediction is everything between the first <pre> and the last </pre>
    pre_open = html.index("<pre>")
    pre_close = html.rindex("</pre>")
    unless pre_open && pre_close
      raise "no <pre>...</pre> section in the phobius reply for #{file}"
    end
    start = pre_open + 5
    File.open(outfile, 'w') {|out| out.print html[start...pre_close] }
  end
end
|
34
|
+
|
@@ -0,0 +1,62 @@
|
|
1
|
+
#!/usr/bin/ruby
#
# Generates the sample bias validation input hashes for a peptide centric
# database, once for each bias type requested on the commandline
# (--tm, --aa, --protein-bias), and reports the files written.

require 'optparse'

require 'ms/error_rate/sbv/peptide_based'
require 'ms/error_rate/sbv/protein_based'

opt = {}
opt[:protein_bias] = []

opts = OptionParser.new do |op|
  op.banner = "usage: #{File.basename(__FILE__)} peptide_centric_db [OPTION]"
  op.on("--tm <phobius,min>", Array, "transmembrane, <phobius> is path to phobius ", "output file (see fasta_to_phobius.rb)", "<min> is the min number of tm sequences required") {|v| opt[:tm] = [v.first, v.last.to_i]}
  op.on("--aa <aa,min>", Array, "amino acid, <aa> is a string found in the peptides", "<min> is the min number of required for counting") {|v| opt[:aa] = [v.first, v.last.to_i]}
  op.on("--protein-bias <name,file>", Array, "<name> bias, <file> is path to a yaml hash", " keyed prot -> <0-1>") {|v| opt[:protein_bias] << [v.first.to_sym, v.last]}
  op.separator "outputs for each bias type:"
  op.separator " <peptide_centric_db>.<info>.#{Ms::ErrorRate::Sbv::LENGTH_EXT}"
  op.separator " <peptide_centric_db>.<info>.#{Ms::ErrorRate::Sbv::AASEQ_EXT}"
end

opts.parse!

if ARGV.size == 0
  puts opts.to_s
  exit
end

peptide_centric_db = ARGV.first

# prints one "WROTE: <file>" line per file
def note_files(files)
  files.each {|file| puts "WROTE: #{file}" }
end

prot_klass = Ms::ErrorRate::Sbv::ProteinBased
pep_klass = Ms::ErrorRate::Sbv::PeptideBased

if opt[:tm]
  phobius_file, min_tm_segments = opt[:tm]
  # NOTE(review): TransmembraneIndex and Ms::Fasta are not required by this
  # script; presumably the requires above load them -- verify.
  index = TransmembraneIndex.new(phobius_file)

  # protein id => 1 when the protein has at least the minimum number of
  # certain transmembrane segments, 0 otherwise
  protid_to_transmembrane = {}
  regexp = nil
  index.each do |k, v|
    # the id regexp is derived once, from the first key seen
    regexp ||= Ms::Fasta.id_regexp(k)
    new_key = regexp.match(k)[1]
    protid_to_transmembrane[new_key] = ((v[:num_certain_transmembrane_segments] >= min_tm_segments) ? 1 : 0)
  end

  fnames = prot_klass.generate_hashes( peptide_centric_db, protid_to_transmembrane, {:type_code => "tm_min#{min_tm_segments}"})
  note_files fnames
end

if opt[:aa]
  fnames = pep_klass.generate_hashes( peptide_centric_db, *opt[:aa] )
  note_files fnames
end

# NOTE(review): the bias <name> is parsed but never used, and the yaml file
# path (not a loaded hash) is what gets passed on -- confirm against
# ProteinBased.generate_hashes.
opt[:protein_bias].each do |_name, hash_file|
  fnames = prot_klass.generate_hashes( peptide_centric_db, hash_file)
  # report the written files, as the --tm and --aa branches do
  note_files fnames
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
#!/usr/bin/env ruby
#
# Reads pairs of Mascot pepxml files (target decoy target decoy ...),
# computes target-decoy q-values from the ion scores, and writes one
# tab-delimited <target>.phq.tsv file per pair.

require 'trollop'
require 'nokogiri'
require 'set'

require 'ms/error_rate/qvalue'

opts = Trollop::Parser.new do
  banner %Q{usage: #{File.basename(__FILE__)} <fwd>.xml <decoy>.xml ...
outputs: <fwd>.phq.tsv
phq.tsv?: see schema/peptide_hit_qvalues.phq.tsv
}
  opt :z_together, "do not group by charge state", :default => false
end

DELIMITER = "\t"

opt = opts.parse(ARGV)
if ARGV.size == 0 || (ARGV.size%2 != 0)
  puts "\n\n!! only even numbers of files accepted (target decoy target decoy ...) !!\n\n" if (ARGV.size%2 != 0)
  opts.educate
  exit
end

files = ARGV.to_a

PeptideHit = Struct.new(:aaseq, :charge, :ionscore, :qvalue)

# one array of PeptideHit objects (qvalue still unset) per input file
peptide_hits_per_file = files.map do |file|
  File.open(file) do |io|
    doc = Nokogiri::XML.parse(io, nil, nil, Nokogiri::XML::ParseOptions::DEFAULT_XML | Nokogiri::XML::ParseOptions::NOBLANKS)
    # we can work with namespaces, or just remove them ...
    doc.remove_namespaces!
    doc.root.xpath('//search_hit').map do |search_hit|
      aaseq = search_hit['peptide']
      score_node = search_hit.children.find {|node| node.name == 'search_score' && node['name'] == 'ionscore' }
      raise "no ionscore for peptide #{aaseq} in #{file}" unless score_node
      ionscore = score_node['value'].to_f
      # the charge is read from the search hit's grandparent element
      charge = search_hit.parent.parent['assumed_charge'].to_i
      PeptideHit.new(aaseq, charge, ionscore)
    end
  end
end

# files alternate target, decoy: each slice of two yields the target hits
# with their q-values filled in
hits_per_target = peptide_hits_per_file.each_slice(2).map do |target_hits, decoy_hits|
  pairs = Ms::ErrorRate::Qvalue.target_decoy_qvalues(target_hits, decoy_hits, :z_together => opt[:z_together], &:ionscore)
  pairs.map do |peptide_hit, qvalue|
    peptide_hit.qvalue = qvalue
    peptide_hit
  end
end

target_files = files.each_slice(2).map(&:first)
target_files.zip(hits_per_target) do |file, hits|
  newfile = file.chomp(File.extname(file)) + ".phq.tsv"
  File.open(newfile, 'w') do |out|
    out.puts %w(aaseq charge qvalue).join(DELIMITER)
    hits.each do |hit|
      out.puts [hit.aaseq, hit.charge, hit.qvalue].join(DELIMITER)
    end
  end
end
|
61
|
+
|
@@ -0,0 +1,35 @@
|
|
1
|
+
#!/usr/bin/ruby
#
# Copies to <file>_NONTM.fasta every fasta entry whose number of certain
# transmembrane segments (looked up in a phobius short-format file) does not
# exceed the given maximum.

require 'ms/fasta'
require 'transmembrane/phobius.rb'

unless ARGV.size == 3
  puts "usage: #{File.basename(__FILE__)} <max#tm> phobius_file_short <file>.fasta",
       "max#tm = max # of transmembrane sequences allowed to be a non-transmembrane.",
       "",
       "outputs: <file>_NONTM.fasta"
  exit
end

max_tm_arg, phobius_short_file, fasta_db_file = ARGV
max_num_tm = max_tm_arg.to_i

# <file>.fasta -> <file>_NONTM.fasta
outfile = fasta_db_file.chomp(File.extname(fasta_db_file)) + "_NONTM.fasta"

index = Phobius::Index.new(phobius_short_file)

File.open(outfile, 'w') do |out|
  Ms::Fasta.open(fasta_db_file) do |fasta|
    fasta.each do |entry|
      key = index.reference_to_key(entry.header)
      # every fasta entry must have a phobius prediction
      abort "can't find key: #{key} for #{entry.header}" unless index.key?(key)
      num_tms = index[key][:num_certain_transmembrane_segments]
      out.print entry.to_s if num_tms <= max_num_tm
    end
  end
end
|
34
|
+
|
35
|
+
|
data/bin/qvalues.rb
ADDED
@@ -0,0 +1,105 @@
|
|
1
|
+
#!/usr/bin/ruby
#
# Computes q-values for Mascot (target/decoy .dat pairs) or Mascot-Percolator
# (.datp/.tab.txt pairs) results and writes them to yaml.

require 'optparse'
require 'yaml' # Hash#to_yaml is used by print_out
require 'ms/error_rate/qvalue'

DEF_EXT = "_flip"
NORMAL_EXT = 'qval.yml'

# Writes a yaml hash with the keys 'headers', 'filenames' and 'data'
# (the target hits) to outfile.
def print_out(outfile, filenames, headers, target_hits)
  File.open(outfile, 'w') do |out|
    out.print( {'headers' => headers, 'filenames' => filenames, 'data' => target_hits }.to_yaml )
  end
end

opt = {
  :outfile => NORMAL_EXT,
  :min_peptide_length => 9,
}

opts = OptionParser.new do |op|
  op.banner = "usage: #{File.basename(__FILE__)} <target> <decoy> [... (as pairs)]"
  op.separator "or: #{File.basename(__FILE__)} <target>.datp <target>.tab.txt [... (as pairs)]"
  op.separator "for each pair of files"
  op.separator "sorts the peptide hits by score and determines the precision at each hit"
  op.separator ""
  op.separator "writes a yaml file <target>.'#{NORMAL_EXT}' which"
  op.separator "has three keys: 'headers', 'filenames', and 'data'"
  op.separator " headers contains an array showing what is in the data"
  op.separator " filenames: (a hash with two keys holding an array of full path names)"
  op.separator " target:"
  op.separator " decoy:"
  op.separator " data: (an array with the data values)"
  op.separator "headers: <the headers of the hits>"
  op.separator ""
  op.separator "headers guaranteed to have at least: filename, query_title, charge, sequence, qvalue"
  op.separator ""
  op.on("-l", "--min-peptide-length <Int>", Integer, "min num aa's to accept (default: #{opt[:min_peptide_length]})") {|v| opt[:min_peptide_length] = v }
  op.on("--z-together", "combines all charge states for precision calc") {|v| opt[:z_together] = v }
  op.on("-o", "--outfile <name>", "write to specified file") {|v| opt[:outfile] = v }
  op.on("-g", "--group-together", "process all forwards together and all decoys together", "will output to '#{NORMAL_EXT}' unless -o given") {|v| opt[:group_together] = v }
  op.on("-f", "--find-decoy [ext]", "finds the decoy file, default <file>#{DEF_EXT}.<ext>", "obviating the need to specify it on the commandline") do |v|
    # the argument is optional: fall back to the default suffix without it
    opt[:find_decoy] = v.is_a?(String) ? v : DEF_EXT
  end
end

opts.parse!

if ARGV.size == 0
  puts opts.to_s
  exit
end

target_files = []
decoy_files = []
if opt[:find_decoy]
  # every argument is a target; its decoy is <basename><suffix><ext>
  target_files = ARGV.to_a.dup
  decoy_files = target_files.map do |tf|
    ext = File.extname(tf)
    decoy_file = tf.chomp(ext) + opt[:find_decoy] + ext
    raise ArgumentError, "cannot find #{decoy_file}" unless File.exist?(decoy_file)
    decoy_file
  end
else
  # an unpaired last file would otherwise get a nil partner
  if ARGV.size.odd?
    abort "files must be given as pairs (got #{ARGV.size} files)\n\n#{opts}"
  end
  ARGV.each_slice(2) do |target, decoy|
    target_files << target
    decoy_files << decoy
  end
end

require 'ms/error_rate/qvalue/mascot'
require 'ms/error_rate/qvalue/mascot/percolator'

# in the case of mascot_percolator, the "target" files are .datp files and
# the "decoy" files the .tab.txt files
mascot_percolator = (File.extname(target_files.first) == '.datp')
qvalue_source = mascot_percolator ? Ms::ErrorRate::Qvalue::Mascot::Percolator : Ms::ErrorRate::Qvalue::Mascot
headers = Ms::ErrorRate::Qvalue::Mascot::MEMBERS.map(&:to_s)

if opt[:group_together]
  # all files are processed in one go and written to a single file
  filenames = { 'target' => target_files, 'decoy' => decoy_files }
  target_hits = qvalue_source.qvalues(target_files, decoy_files, opt).sort_by(&:qvalue)
  print_out(opt[:outfile], filenames, headers, target_hits)
else
  # one output file per pair: <target>.qval.yml
  target_files.zip(decoy_files) do |target_file, decoy_file|
    filenames =
      if mascot_percolator
        { 'datp' => [target_file], 'tab_txt' => [decoy_file] }
      else
        { 'target' => [target_file], 'decoy' => [decoy_file] }
      end
    target_hits = qvalue_source.qvalues([target_file], [decoy_file], opt).sort_by(&:qvalue)
    outfile = target_file.chomp(File.extname(target_file)) + '.' + NORMAL_EXT
    print_out(outfile, filenames, headers, target_hits)
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'spec_helper'

require 'ms/error_rate/qvalue'

Hit = Struct.new(:score, :charge)

describe 'calculating q-values' do

  before do
    scores = [14,15,13,12,11]
    # expected q-value of each target hit, in the same order as scores
    qvals_expected = [0.5 ,0.0, 2.0/3.0, 3.0/4, 4.0/5]
    charge = 2
    @target_hits = scores.map {|score| Hit.new(score, charge) }
    # every decoy scores half a point below its target counterpart
    @decoy_hits = scores.map {|score| Hit.new(score - 0.5, charge) }
    @qval_by_hit = {}
    @target_hits.zip(qvals_expected) {|hit, qval| @qval_by_hit[hit] = qval }
  end

  it 'can calculate qvalues on target decoy sets' do
    pairs = Ms::ErrorRate::Qvalue.target_decoy_qvalues(@target_hits, @decoy_hits)
    pairs.each do |hit, qval|
      @qval_by_hit[hit].should.be.close(qval, 0.00000001)
    end
  end

end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'spec_helper'

require 'ms/error_rate'
require 'ostruct'

# Disabled (xdescribe) sketch of a bayesian error rate calculation; peps and
# prob_being_correct are not defined anywhere in this file.
xdescribe 'not quite sure what this is' do

  it 'calculates bayesian probabilities' do
    # Notation:
    #   C = is a correct ID
    #   T = transmembrane content
    #   Y = cysteine content
    #   A = abundance
    #   p(C|T,Y,A) = p(T|C)p(Y|C)p(A|C)p(C) / p(T)p(Y)p(A)
    peps.map do |pep|
      # Questions to answer for each peptide -- what is the probability of
      # being correct given:
      #   * that un-transmembraneyness?
      #   * that un-cysteineness?
      #   * that high abundanceness?
      pep.bayes_probs.reduce(prob_being_correct) {|prob| }
      p_correct = pep.prior_prob_correct
      pep.not_transmembrane? * pep.not_cysteine? * pep.not_low_abundance?
    end
  end

end
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,129 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: ms-error_rate
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 0
|
8
|
+
- 7
|
9
|
+
version: 0.0.7
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- John T. Prince
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2011-03-28 00:00:00 -06:00
|
18
|
+
default_executable:
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: ms-core
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
none: false
|
25
|
+
requirements:
|
26
|
+
- - ">="
|
27
|
+
- !ruby/object:Gem::Version
|
28
|
+
segments:
|
29
|
+
- 0
|
30
|
+
- 0
|
31
|
+
- 2
|
32
|
+
version: 0.0.2
|
33
|
+
type: :runtime
|
34
|
+
version_requirements: *id001
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: ms-fasta
|
37
|
+
prerelease: false
|
38
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
segments:
|
44
|
+
- 0
|
45
|
+
- 2
|
46
|
+
- 3
|
47
|
+
version: 0.2.3
|
48
|
+
type: :runtime
|
49
|
+
version_requirements: *id002
|
50
|
+
- !ruby/object:Gem::Dependency
|
51
|
+
name: spec-more
|
52
|
+
prerelease: false
|
53
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
54
|
+
none: false
|
55
|
+
requirements:
|
56
|
+
- - ">="
|
57
|
+
- !ruby/object:Gem::Version
|
58
|
+
segments:
|
59
|
+
- 0
|
60
|
+
version: "0"
|
61
|
+
type: :development
|
62
|
+
version_requirements: *id003
|
63
|
+
description: aids for creating and calculating error rates using target-decoy searches and sample validation.
|
64
|
+
email: jtprince@gmail.com
|
65
|
+
executables:
|
66
|
+
- error_rate
|
67
|
+
- fasta_to_nuclear.rb
|
68
|
+
- fasta_to_peptide_centric_db.rb
|
69
|
+
- fasta_to_phobius.rb
|
70
|
+
- generate_sbv_input_hashes.rb
|
71
|
+
- mascot_pepxml_to_peptide_hit_qvalues.rb
|
72
|
+
- phobius_to_nontransmembrane.rb
|
73
|
+
- qvalues.rb
|
74
|
+
extensions: []
|
75
|
+
|
76
|
+
extra_rdoc_files:
|
77
|
+
- LICENSE
|
78
|
+
- README.rdoc
|
79
|
+
files:
|
80
|
+
- VERSION
|
81
|
+
- LICENSE
|
82
|
+
- README.rdoc
|
83
|
+
- spec/ms/error_rate/qvalue_spec.rb
|
84
|
+
- spec/ms/error_rate_spec.rb
|
85
|
+
- spec/spec_helper.rb
|
86
|
+
- bin/error_rate
|
87
|
+
- bin/fasta_to_nuclear.rb
|
88
|
+
- bin/fasta_to_peptide_centric_db.rb
|
89
|
+
- bin/fasta_to_phobius.rb
|
90
|
+
- bin/generate_sbv_input_hashes.rb
|
91
|
+
- bin/mascot_pepxml_to_peptide_hit_qvalues.rb
|
92
|
+
- bin/phobius_to_nontransmembrane.rb
|
93
|
+
- bin/qvalues.rb
|
94
|
+
has_rdoc: true
|
95
|
+
homepage: http://jtprince.github.com/ms-error_rate
|
96
|
+
licenses: []
|
97
|
+
|
98
|
+
post_install_message:
|
99
|
+
rdoc_options: []
|
100
|
+
|
101
|
+
require_paths:
|
102
|
+
- lib
|
103
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
104
|
+
none: false
|
105
|
+
requirements:
|
106
|
+
- - ">="
|
107
|
+
- !ruby/object:Gem::Version
|
108
|
+
segments:
|
109
|
+
- 0
|
110
|
+
version: "0"
|
111
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
112
|
+
none: false
|
113
|
+
requirements:
|
114
|
+
- - ">="
|
115
|
+
- !ruby/object:Gem::Version
|
116
|
+
segments:
|
117
|
+
- 0
|
118
|
+
version: "0"
|
119
|
+
requirements: []
|
120
|
+
|
121
|
+
rubyforge_project: mspire
|
122
|
+
rubygems_version: 1.3.7
|
123
|
+
signing_key:
|
124
|
+
specification_version: 3
|
125
|
+
summary: An mspire library for calculating error rates in MS/MS identifications (FDRs).
|
126
|
+
test_files:
|
127
|
+
- spec/ms/error_rate/qvalue_spec.rb
|
128
|
+
- spec/ms/error_rate_spec.rb
|
129
|
+
- spec/spec_helper.rb
|