ms-error_rate 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +21 -0
- data/README.rdoc +53 -0
- data/VERSION +1 -0
- data/bin/error_rate +92 -0
- data/bin/fasta_to_nuclear.rb +14 -0
- data/bin/fasta_to_peptide_centric_db.rb +7 -0
- data/bin/fasta_to_phobius.rb +34 -0
- data/bin/generate_sbv_input_hashes.rb +62 -0
- data/bin/mascot_pepxml_to_peptide_hit_qvalues.rb +61 -0
- data/bin/phobius_to_nontransmembrane.rb +35 -0
- data/bin/qvalues.rb +105 -0
- data/spec/ms/error_rate/qvalue_spec.rb +25 -0
- data/spec/ms/error_rate_spec.rb +25 -0
- data/spec/spec_helper.rb +6 -0
- metadata +129 -0
data/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
Copyright shared among contributing institutions:
|
|
2
|
+
Copyright (c) 2006-2008 University of Texas at Austin (the initial project)
|
|
3
|
+
Copyright (c) 2009 Regents of the University of Colorado and Howard Hughes Medical Institute. (modularization of the project)
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
data/README.rdoc
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
= {ms-error_rate}[http://mspire.rubyforge.org/projects/ms-error_rate]
|
|
2
|
+
|
|
3
|
+
An {Mspire}[http://mspire.rubyforge.org] library for calculating or dealing
|
|
4
|
+
with error rates. These may be from target-decoy searches, sample bias
|
|
5
|
+
validation, or other sources.
|
|
6
|
+
|
|
7
|
+
== Examples
|
|
8
|
+
|
|
9
|
+
=== Target-Decoy with Mascot
|
|
10
|
+
|
|
11
|
+
Generate q-values (right now only with Mascot and MascotPercolator):
|
|
12
|
+
|
|
13
|
+
require 'ms/error_rate/qvalue'
|
|
14
|
+
target_hits = Ms::ErrorRate::Qvalue::Mascot.qvalues(target_files, decoy_files)
|
|
15
|
+
# each element of target_hits is a PeptideHit Struct (:filename, :query_title, :charge, :sequence, :mowse, :qvalue)
|
|
16
|
+
|
|
17
|
+
# or on the commandline:
|
|
18
|
+
% qvalues.rb <target>.dat <decoy>.dat
|
|
19
|
+
|
|
20
|
+
The same output can be produced from Mascot-Percolator output:
|
|
21
|
+
|
|
22
|
+
require 'ms/error_rate/qvalue'
|
|
23
|
+
target_hits = Ms::ErrorRate::Qvalue::Mascot::Percolator.qvalues(datp_files, tab_dot_text_files)
|
|
24
|
+
# or commandline:
|
|
25
|
+
% qvalues.rb <target>.datp <target>.tab.txt
|
|
26
|
+
|
|
27
|
+
=== Sample Bias Validation
|
|
28
|
+
|
|
29
|
+
Sample Bias Validation allows error rate determination based on expected biases in sample composition. Here is an example using transmembrane sequence content. We will assume a fasta file called `proteins.fasta`:
|
|
30
|
+
|
|
31
|
+
# create a peptide-centric database
|
|
32
|
+
fasta_to_peptide_centric_db.rb proteins.fasta # defaults 2 missed cleavages, min aaseq 4
|
|
33
|
+
# generates a file: proteins.msd_clvg2.min_aaseq4.yml
|
|
34
|
+
|
|
35
|
+
# create a transmembrane sequence prediction file
|
|
36
|
+
fasta_to_phobius.rb proteins.fasta # => generates proteins.phobius
|
|
37
|
+
|
|
38
|
+
generate_sbv_input_hashes.rb proteins.msd_clvg2.min_aaseq4.yml --tm proteins.phobius,1
|
|
39
|
+
# creates two files:
|
|
40
|
+
# proteins.msd_clvg2.min_aaseq4.tm_min1.by_aaseq.yml
|
|
41
|
+
# proteins.msd_clvg2.min_aaseq4.tm_min1.freq_by_length.yml
|
|
42
|
+
|
|
43
|
+
# cytosolic fraction (transmembrane sequences not expected):
|
|
44
|
+
error_rate qvalues.yml --fp-sbv proteins.msd_clvg2.min_aaseq4.tm_min1.by_aaseq.yml,\
|
|
45
|
+
proteins.msd_clvg2.min_aaseq4.tm_min1.freq_by_length.yml,0.05
|
|
46
|
+
|
|
47
|
+
== Installation
|
|
48
|
+
|
|
49
|
+
gem install ms-error_rate
|
|
50
|
+
|
|
51
|
+
== Copyright
|
|
52
|
+
|
|
53
|
+
See LICENSE
|
data/VERSION
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.0.7
|
data/bin/error_rate
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
#!/usr/bin/ruby
|
|
2
|
+
|
|
3
|
+
require 'support/sort_by_attributes'
|
|
4
|
+
require 'ms/error_rate'
|
|
5
|
+
require 'optparse'
|
|
6
|
+
|
|
7
|
+
# Command-line driver: reads one qvalues.yml file and, for every sample bias
# validator given with --tp-sbv / --fp-sbv, prints the precision values
# obtained by walking the peptide hits in sorted order.

# Counters used to auto-name validators given without a name (the optional D
# argument): tp1, tp2, ... and fp1, fp2, ...
@num_tp_validators = 0
@num_fp_validators = 0

# Help text for --tp-sbv (built with boolean == true) and --fp-sbv (false).
arg_strings_ar = [true, false].map do |boolean|
  [
    "a #{boolean} positive indicator",
    "A = aaseq to indicator (0-1) yml file",
    "B = aaseq_length to frequency yml file",
    "C = rate indicators are #{boolean} pos.",
    "D = name of the validator"
  ]
end

# Full OptionParser#on argument lists: index 0 is --tp-sbv, index 1 is --fp-sbv.
op_sbv_ars = [true, false].zip(arg_strings_ar).map do |boolean, arg_strings|
  letter = boolean ? 't' : 'f'
  ["--#{letter}p-sbv <A,B,C[,D]>", Array, *arg_strings]
end

# Parallel arrays: the [A, B, C] arguments and the name of each validator, in
# the order they were given on the command line.
raw_validator_args = []
validator_names = []

opt = {
  :order_by => [:qvalue],
}

opts = OptionParser.new do |op|
  op.banner = "usage: #{File.basename(__FILE__)} qvalues.yml ..."

  op.on("--order-by <Array>", Array, "the keys to order on (default: [qvalue])") {|v| opt[:order_by] = v.map {|key| key.to_sym } }
  op.on("--best-is-low <Array>", Array, "the keys where better score is lower") {|v| opt[:best_is_low] = v.map {|key| key.to_sym } }

  [true, false].each do |boolean|
    index = boolean ? 0 : 1
    op.on(*op_sbv_ars[index]) do |v|
      (a, b, c, d) = v
      # use the given name (D) or generate the next tpN / fpN
      name =
        if d
          d
        elsif boolean
          @num_tp_validators += 1
          "tp#{@num_tp_validators}"
        else
          @num_fp_validators += 1
          "fp#{@num_fp_validators}"
        end
      validator_names << name
      raw_validator_args << [a, b, c]
    end
  end
end

opts.parse!

if ARGV.size != 1
  puts opts.to_s
  exit
elsif !opt[:order_by]
  puts "you must specify the order-by array!"
  exit
end

# the sort keys followed by one options hash naming the keys where lower is better
# NOTE(review): the original comment here said "because we will sort normal and
# reverse the array", but nothing is reversed below -- confirm the ordering
# that sort_by_attributes is expected to produce.
sort_args = opt[:order_by] + [{:down => opt[:best_is_low] }]

# load one validator at a time
# (name is collected for each validator but not used in the output yet)
raw_validator_args.zip(validator_names) do |args, name|
  (a, b, c) = args
  # NOTE(review): c is passed through as the raw command-line String -- confirm
  # that Ms::ErrorRate::Sbv.new does not expect a number here.
  val = Ms::ErrorRate::Sbv.new(YAML.load_file(a), YAML.load_file(b), c)

  ARGV.each do |file|
    yaml = YAML.load_file(file)
    # one Struct member per header; Struct.new needs the symbols splatted
    pepclass = Struct.new(*yaml['headers'].map {|header| header.to_sym })
    peps = yaml['data'].map {|ar| pepclass.new(*ar) }
    sorted_best_to_worst = peps.sort_by_attributes(sort_args)

    precision_vals = sorted_best_to_worst.map do |pep|
      val.update_precision(pep.aaseq)
    end
    # printed inside the loop: precision_vals is local to this block
    p precision_vals
  end
end
|
|
91
|
+
|
|
92
|
+
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
#!/usr/bin/ruby
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
# Without any command-line argument, print the usage text and quit.
if ARGV.empty?
  puts "usage: #{File.basename(__FILE__)} <file>.fasta",
       "output: <file>"
  # (disabled) warning that phobius_to_nontransmembrane.rb should be run
  # first to weed out transmembrane proteins
  exit
end
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
#!/usr/bin/ruby
|
|
2
|
+
|
|
3
|
+
require 'mechanize'
|
|
4
|
+
|
|
5
|
+
# Submits each fasta file given on the command line to the Phobius web form
# (short output format) and writes the contents of the <pre> block of the
# reply to <file>.phobius.

# address of the Phobius submission form
page = 'http://phobius.sbc.su.se/'

if ARGV.size == 0
  puts "usage: #{File.basename(__FILE__)} <file>.fasta"
  puts "outputs <file>.phobius "
  puts "in short format"
  exit
end

a = WWW::Mechanize.new { |agent|
  agent.user_agent_alias = 'Mac Safari'
}

ARGV.each do |file|
  outfile = file.chomp(File.extname(file)) + '.phobius'
  # The block parameter must not be called `page`: on Ruby 1.8 a block
  # parameter that shares its name with an outer local overwrites that local,
  # so the URL would be replaced by the fetched page after the first file.
  a.get(page) do |form_page|
    form = form_page.forms.first
    form.radiobuttons.select {|v| v.value == 'short' }.first.click
    fu = form.file_uploads.first
    fu.file_name = File.expand_path(file)
    #fu.file_data = IO.read(file)
    reply = form.submit
    html = reply.body
    # the prediction table is whatever sits between the first <pre> and the
    # last </pre>; fail with a clear message when the reply has neither
    open_tag = html.index("<pre>")
    stop = html.rindex("</pre>")
    unless open_tag && stop
      abort "no <pre>...</pre> block in the Phobius reply for #{file}"
    end
    start = open_tag + 5
    File.open(outfile, 'w') {|out| out.print html[start...stop] }
  end
end
|
|
34
|
+
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
#!/usr/bin/ruby
|
|
2
|
+
|
|
3
|
+
require 'optparse'
|
|
4
|
+
|
|
5
|
+
require 'ms/error_rate/sbv/peptide_based'
|
|
6
|
+
require 'ms/error_rate/sbv/protein_based'
|
|
7
|
+
|
|
8
|
+
# Generates the sample bias validation input hashes for a peptide centric db.
# One set of files is produced per bias type requested on the command line
# (--tm, --aa, --protein-bias) and every file written is reported on stdout.

opt = {}
opt[:protein_bias] = []

opts = OptionParser.new do |op|
  op.banner = "usage: #{File.basename(__FILE__)} peptide_centric_db [OPTION]"
  op.on("--tm <phobius,min>", Array, "transmembrane, <phobius> is path to phobius ", "output file (see fasta_to_phobius.rb)", "<min> is the min number of tm sequences required") {|v| opt[:tm] = [v.first, v.last.to_i]}
  op.on("--aa <aa,min>", Array, "amino acid, <aa> is a string found in the peptides", "<min> is the min number required for counting") {|v| opt[:aa] = [v.first, v.last.to_i]}
  op.on("--protein-bias <name,file>", Array, "<name> bias, <file> is path to a yaml hash", " keyed prot -> <0-1>") {|v| opt[:protein_bias] << [v.first.to_sym, v.last]}
  op.separator "outputs for each bias type:"
  op.separator " <peptide_centric_db>.<info>.#{Ms::ErrorRate::Sbv::LENGTH_EXT}"
  op.separator " <peptide_centric_db>.<info>.#{Ms::ErrorRate::Sbv::AASEQ_EXT}"
end

opts.parse!

if ARGV.size == 0
  puts opts.to_s
  exit
end

peptide_centric_db = ARGV.first

# Prints one "WROTE: <file>" line for every file name in +files+.
def note_files(files)
  files.each do |file| puts "WROTE: #{file}" end
end

prot_klass = Ms::ErrorRate::Sbv::ProteinBased
pep_klass = Ms::ErrorRate::Sbv::PeptideBased

if opt[:tm]
  # NOTE(review): TransmembraneIndex and Ms::Fasta are not required by the
  # lines visible here -- presumably loaded through the sbv requires; confirm.
  index = TransmembraneIndex.new(opt[:tm].first)

  # protein id => 1 when the protein has at least <min> certain transmembrane
  # segments, 0 otherwise
  protid_to_transmembrane = {}
  regexp = nil
  index.each do |k,v|
    # the id regexp is derived once, from the first key seen
    regexp ||= Ms::Fasta.id_regexp(k)
    new_key = regexp.match(k)[1]
    protid_to_transmembrane[new_key] = ((v[:num_certain_transmembrane_segments] >= opt[:tm].last) ? 1 : 0)
  end

  fnames = prot_klass.generate_hashes( peptide_centric_db, protid_to_transmembrane, {:type_code => "tm_min#{opt[:tm].last}"})
  note_files fnames
end

if opt[:aa]
  fnames = pep_klass.generate_hashes( peptide_centric_db, *opt[:aa] )
  note_files fnames
end

opt[:protein_bias].each do |name, hash_file|
  # NOTE(review): name is parsed but never used, and hash_file is handed over
  # as a path while the --tm branch passes a Hash plus a :type_code --
  # confirm what ProteinBased.generate_hashes expects here.
  fnames = prot_klass.generate_hashes( peptide_centric_db, hash_file)
  # report the written files, as the --tm and --aa branches do
  note_files fnames
end
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
|
|
3
|
+
require 'trollop'
|
|
4
|
+
require 'nokogiri'
|
|
5
|
+
require 'set'
|
|
6
|
+
|
|
7
|
+
require 'ms/error_rate/qvalue'
|
|
8
|
+
|
|
9
|
+
# Reads pairs of Mascot pepxml files (target decoy target decoy ...),
# calculates a q-value for every target peptide hit from its ionscore and
# writes <target>.phq.tsv (columns: aaseq, charge, qvalue) for each pair.

opts = Trollop::Parser.new do
  banner %Q{usage: #{File.basename(__FILE__)} <fwd>.xml <decoy>.xml ...
outputs: <fwd>.phq.tsv
phq.tsv?: see schema/peptide_hit_qvalues.phq.tsv
}
  opt :z_together, "do not group by charge state", :default => false
end

DELIMITER = "\t"

opt = opts.parse(ARGV)
if ARGV.size == 0 || (ARGV.size%2 != 0)
  puts "\n\n!! only even numbers of files accepted (target decoy target decoy ...) !!\n\n" if (ARGV.size%2 != 0)
  opts.educate
  exit
end

files = ARGV.to_a

PeptideHit = Struct.new(:aaseq, :charge, :ionscore, :qvalue)

# one array of PeptideHit objects (qvalue not yet set) per input file
peptide_hits_per_file = files.map do |file|
  File.open(file) do |io|
    doc = Nokogiri::XML.parse(io, nil, nil, Nokogiri::XML::ParseOptions::DEFAULT_XML | Nokogiri::XML::ParseOptions::NOBLANKS)
    # we can work with namespaces, or just remove them ...
    doc.remove_namespaces!
    root = doc.root
    search_hits = root.xpath('//search_hit')
    search_hits.map do |search_hit|
      aaseq = search_hit['peptide']
      score_node = search_hit.children.find {|node| node.name == 'search_score' && node['name'] == 'ionscore' }
      # fail with a readable message instead of a NoMethodError on nil
      abort "no ionscore search_score for peptide #{aaseq} in #{file}" unless score_node
      ionscore = score_node['value'].to_f
      # the charge is read from the element two levels above the search_hit
      charge = search_hit.parent.parent['assumed_charge'].to_i
      PeptideHit.new(aaseq, charge, ionscore)
    end
  end
end

# for every (target, decoy) pair: the target hits with their qvalue filled in
hits_per_target = peptide_hits_per_file.each_slice(2).map do |target_hits, decoy_hits|
  pairs = Ms::ErrorRate::Qvalue.target_decoy_qvalues(target_hits, decoy_hits, :z_together => opt[:z_together], &:ionscore)
  pairs.map {|peptide_hit, qvalue| peptide_hit.qvalue = qvalue ; peptide_hit }
end

# the output file is named after the target (first) file of each pair
files.each_slice(2).map(&:first).zip(hits_per_target) do |file, hits|
  newfile = file.chomp(File.extname(file)) + ".phq.tsv"
  File.open(newfile,'w') do |out|
    out.puts %w(aaseq charge qvalue).join(DELIMITER)
    hits.each do |hit|
      # members 0, 1 and 3 of PeptideHit: aaseq, charge, qvalue
      out.puts hit.values_at(0,1,3).join(DELIMITER)
    end
  end
end
|
|
61
|
+
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
#!/usr/bin/ruby
|
|
2
|
+
|
|
3
|
+
require 'ms/fasta'
|
|
4
|
+
require 'transmembrane/phobius.rb'
|
|
5
|
+
|
|
6
|
+
# Copies to <file>_NONTM.fasta every fasta entry whose number of certain
# transmembrane segments (looked up in the phobius short file) is not above
# the given maximum.

# exactly three arguments are required; otherwise show the usage and quit
unless ARGV.size == 3
  puts "usage: #{File.basename(__FILE__)} <max#tm> phobius_file_short <file>.fasta",
       "max#tm = max # of transmembrane sequences allowed to be a non-transmembrane.",
       "",
       "outputs: <file>_NONTM.fasta"
  exit
end

tm_limit_arg, phobius_short_file, fasta_db_file = ARGV
max_num_tm = tm_limit_arg.to_i

# output goes next to the input fasta, extension replaced by _NONTM.fasta
outfile = fasta_db_file.chomp(File.extname(fasta_db_file)) + "_NONTM.fasta"

index = Phobius::Index.new(phobius_short_file)

File.open(outfile, 'w') do |out|
  Ms::Fasta.open(fasta_db_file) do |fasta|
    fasta.each do |entry|
      key = index.reference_to_key(entry.header)
      # every fasta entry must be present in the phobius index
      abort "can't find key: #{key} for #{entry.header}" unless index.key?(key)
      segment_count = index[key][:num_certain_transmembrane_segments]
      out.print(entry.to_s) if segment_count <= max_num_tm
    end
  end
end
|
|
34
|
+
|
|
35
|
+
|
data/bin/qvalues.rb
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
#!/usr/bin/ruby
|
|
2
|
+
|
|
3
|
+
require 'optparse'
|
|
4
|
+
require 'ms/error_rate/qvalue'
|
|
5
|
+
|
|
6
|
+
DEF_EXT = "_flip"
|
|
7
|
+
NORMAL_EXT = 'qval.yml'
|
|
8
|
+
|
|
9
|
+
# Writes the results to +outfile+ as one YAML document holding the keys
# 'headers', 'filenames' and 'data' (the target hits).
def print_out(outfile, filenames, headers, target_hits)
  document = {'headers' => headers, 'filenames' => filenames, 'data' => target_hits }
  File.open(outfile, 'w') {|io| io.print(document.to_yaml) }
end
|
|
14
|
+
|
|
15
|
+
# Command-line driver: calculates q-values for pairs of Mascot target/decoy
# files (or Mascot-Percolator .datp/.tab.txt pairs) and writes them as yaml.

opt = {
  :outfile => NORMAL_EXT,
  :min_peptide_length => 9,
}

opts = OptionParser.new do |op|
  op.banner = "usage: #{File.basename(__FILE__)} <target> <decoy> [... (as pairs)]"
  op.separator "or: #{File.basename(__FILE__)} <target>.datp <target>.tab.txt [... (as pairs)]"
  op.separator "for each pair of files"
  op.separator "sorts the peptide hits by score and determines the precision at each hit"
  op.separator ""
  op.separator "writes a yaml file <target>.'#{NORMAL_EXT}' which"
  op.separator "has three keys: 'headers', 'filenames', and 'data'"
  op.separator " headers contains an array showing what is in the data"
  op.separator " filenames: (a hash with two keys holding an array of full path names)"
  op.separator " target:"
  op.separator " decoy:"
  op.separator " data: (an array with the data values)"
  op.separator "headers: <the headers of the hits>"
  op.separator ""
  op.separator "headers guaranteed to have at least: filename, query_title, charge, sequence, qvalue"
  op.separator ""
  op.on("-l", "--min-peptide-length <Int>", Integer, "min num aa's to accept (default: #{opt[:min_peptide_length]})") {|v| opt[:min_peptide_length] = v }
  op.on("--z-together", "combines all charge states for precision calc") {|v| opt[:z_together] = v }
  op.on("-o", "--outfile <name>", "write to specified file") {|v| opt[:outfile] = v }
  op.on("-g", "--group-together", "process all forwards together and all decoys together", "will output to '#{opt[:outfile]}' unless -o given") {|v| opt[:group_together] = v }
  op.on("-f", "--find-decoy [ext]", "finds the decoy file, default <file>#{DEF_EXT}.<ext>", "obviating the need to specify it on the commandline") do |v|
    # the extension is optional; without one the default suffix is used
    opt[:find_decoy] = v.is_a?(String) ? v : DEF_EXT
  end
end

opts.parse!

if ARGV.size == 0
  puts opts.to_s
  exit
end

# without -f the files must come as <target> <decoy> pairs; an odd count
# would silently pair the last target with a nil decoy
if !opt[:find_decoy] && ARGV.size % 2 != 0
  abort "files must be given as <target> <decoy> pairs (got #{ARGV.size}); use -f to find the decoy files automatically"
end

target_files = []
decoy_files = []
if opt[:find_decoy]
  # every argument is a target; its decoy is <basename><suffix><ext>
  target_files = ARGV.to_a.dup
  decoy_files = target_files.map do |tf|
    ext = File.extname(tf)
    basename = tf.chomp(ext)
    decoy_file = basename + opt[:find_decoy] + ext
    raise ArgumentError, "cannot find #{decoy_file}" unless File.exist?(decoy_file)
    decoy_file
  end
else
  ARGV.each_slice(2) do |target, decoy|
    target_files << target
    decoy_files << decoy
  end
end

require 'ms/error_rate/qvalue/mascot'
require 'ms/error_rate/qvalue/mascot/percolator'

# Mascot-Percolator input is recognized by the extension of the first file
mascot_percolator = (File.extname(target_files.first) == '.datp')
headers = Ms::ErrorRate::Qvalue::Mascot::MEMBERS.map(&:to_s)
# in the case of mascot_percolator, the "target" files are .datp files and
# "decoy" files the .tab.txt files
qvalue_klass = mascot_percolator ? Ms::ErrorRate::Qvalue::Mascot::Percolator : Ms::ErrorRate::Qvalue::Mascot

if opt[:group_together]
  filenames = { 'target' => target_files, 'decoy' => decoy_files }
  target_hits = qvalue_klass.qvalues(target_files, decoy_files, opt).sort_by(&:qvalue)
  print_out(opt[:outfile], filenames, headers, target_hits)
else
  # one output file per pair, named after the target file
  # (opt[:outfile] is only used with --group-together)
  target_files.zip(decoy_files) do |target_file, decoy_file|
    filenames =
      if mascot_percolator
        { 'datp' => [target_file], 'tab_txt' => [decoy_file] }
      else
        { 'target' => [target_file], 'decoy' => [decoy_file] }
      end
    target_hits = qvalue_klass.qvalues([target_file], [decoy_file], opt).sort_by(&:qvalue)
    base = target_file.chomp(File.extname(target_file))
    outfile = base + '.' + NORMAL_EXT
    print_out(outfile, filenames, headers, target_hits)
  end
end
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
require 'ms/error_rate/qvalue'
|
|
4
|
+
|
|
5
|
+
# minimal stand-in for a search hit: a score and a charge state
Hit = Struct.new(:score, :charge)

describe 'calculating q-values' do

  before do
    scores = [14,15,13,12,11]
    # Every decoy scores 0.5 below the target built from the same score, so in
    # the merged ranking targets and decoys alternate. The expected q-value of
    # a target is (decoys ranked above it) / (targets at or above it):
    # 15 => 0/1, 14 => 1/2, 13 => 2/3, 12 => 3/4, 11 => 4/5.
    qvals_expected = [0.5, 0.0, 2.0/3.0, 3.0/4, 4.0/5]
    # all hits share charge 2
    @target_hits = scores.zip(Array.new(scores.size, 2)).map {|pair| Hit.new(*pair) }
    @decoy_hits = scores.zip(Array.new(scores.size, 2)).map {|pair| Hit.new(pair.first-0.5, pair.last) }
    # target hit => expected q-value
    @qval_by_hit = {}
    @target_hits.zip(qvals_expected) {|hit, qval| @qval_by_hit[hit] = qval }
  end

  it 'can calculate qvalues on target decoy sets' do
    pairs = Ms::ErrorRate::Qvalue.target_decoy_qvalues(@target_hits, @decoy_hits)
    pairs.each do |hit, qval|
      @qval_by_hit[hit].should.be.close(qval, 0.00000001)
    end
  end

end
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
require 'ms/error_rate'
|
|
4
|
+
require 'ostruct'
|
|
5
|
+
|
|
6
|
+
# NOTE(review): this group is declared with xdescribe -- presumably that keeps
# it from running; confirm. The example body reads as an unfinished sketch:
# `peps` and `prob_being_correct` are not defined anywhere in this file, the
# reduce block is empty, and `p_correct` is assigned but never used.
xdescribe 'not quite sure what this is' do
|
|
7
|
+
|
|
8
|
+
it 'calculates bayesian probabilities' do
|
|
9
|
+
# C = is a correct ID
|
|
10
|
+
# T = transmembrane content
|
|
11
|
+
# Y = cysteine content
|
|
12
|
+
# A = abundance
|
|
13
|
+
# p(C|T,Y,A) = p(T|C)p(Y|C)p(A|C)p(C) / p(T)p(Y)p(A)
|
|
14
|
+
peps.map do |pep|
|
|
15
|
+
# what is the probability of that un-transmembraneyness being correct?
|
|
16
|
+
# what is the probability of that un-cysteineness being correct?
|
|
17
|
+
# what is the probability of that high abundanceness being correct?
|
|
18
|
+
pep.bayes_probs.reduce(prob_being_correct) do |prob|
|
|
19
|
+
end
|
|
20
|
+
p_correct = pep.prior_prob_correct
|
|
21
|
+
pep.not_transmembrane? * pep.not_cysteine? * pep.not_low_abundance?
|
|
22
|
+
end
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
end
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: ms-error_rate
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
prerelease: false
|
|
5
|
+
segments:
|
|
6
|
+
- 0
|
|
7
|
+
- 0
|
|
8
|
+
- 7
|
|
9
|
+
version: 0.0.7
|
|
10
|
+
platform: ruby
|
|
11
|
+
authors:
|
|
12
|
+
- John T. Prince
|
|
13
|
+
autorequire:
|
|
14
|
+
bindir: bin
|
|
15
|
+
cert_chain: []
|
|
16
|
+
|
|
17
|
+
date: 2011-03-28 00:00:00 -06:00
|
|
18
|
+
default_executable:
|
|
19
|
+
dependencies:
|
|
20
|
+
- !ruby/object:Gem::Dependency
|
|
21
|
+
name: ms-core
|
|
22
|
+
prerelease: false
|
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
|
24
|
+
none: false
|
|
25
|
+
requirements:
|
|
26
|
+
- - ">="
|
|
27
|
+
- !ruby/object:Gem::Version
|
|
28
|
+
segments:
|
|
29
|
+
- 0
|
|
30
|
+
- 0
|
|
31
|
+
- 2
|
|
32
|
+
version: 0.0.2
|
|
33
|
+
type: :runtime
|
|
34
|
+
version_requirements: *id001
|
|
35
|
+
- !ruby/object:Gem::Dependency
|
|
36
|
+
name: ms-fasta
|
|
37
|
+
prerelease: false
|
|
38
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
|
39
|
+
none: false
|
|
40
|
+
requirements:
|
|
41
|
+
- - ">="
|
|
42
|
+
- !ruby/object:Gem::Version
|
|
43
|
+
segments:
|
|
44
|
+
- 0
|
|
45
|
+
- 2
|
|
46
|
+
- 3
|
|
47
|
+
version: 0.2.3
|
|
48
|
+
type: :runtime
|
|
49
|
+
version_requirements: *id002
|
|
50
|
+
- !ruby/object:Gem::Dependency
|
|
51
|
+
name: spec-more
|
|
52
|
+
prerelease: false
|
|
53
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
|
54
|
+
none: false
|
|
55
|
+
requirements:
|
|
56
|
+
- - ">="
|
|
57
|
+
- !ruby/object:Gem::Version
|
|
58
|
+
segments:
|
|
59
|
+
- 0
|
|
60
|
+
version: "0"
|
|
61
|
+
type: :development
|
|
62
|
+
version_requirements: *id003
|
|
63
|
+
description: aids for creating and calculating error rates using target-decoy searches and sample validation.
|
|
64
|
+
email: jtprince@gmail.com
|
|
65
|
+
executables:
|
|
66
|
+
- error_rate
|
|
67
|
+
- fasta_to_nuclear.rb
|
|
68
|
+
- fasta_to_peptide_centric_db.rb
|
|
69
|
+
- fasta_to_phobius.rb
|
|
70
|
+
- generate_sbv_input_hashes.rb
|
|
71
|
+
- mascot_pepxml_to_peptide_hit_qvalues.rb
|
|
72
|
+
- phobius_to_nontransmembrane.rb
|
|
73
|
+
- qvalues.rb
|
|
74
|
+
extensions: []
|
|
75
|
+
|
|
76
|
+
extra_rdoc_files:
|
|
77
|
+
- LICENSE
|
|
78
|
+
- README.rdoc
|
|
79
|
+
files:
|
|
80
|
+
- VERSION
|
|
81
|
+
- LICENSE
|
|
82
|
+
- README.rdoc
|
|
83
|
+
- spec/ms/error_rate/qvalue_spec.rb
|
|
84
|
+
- spec/ms/error_rate_spec.rb
|
|
85
|
+
- spec/spec_helper.rb
|
|
86
|
+
- bin/error_rate
|
|
87
|
+
- bin/fasta_to_nuclear.rb
|
|
88
|
+
- bin/fasta_to_peptide_centric_db.rb
|
|
89
|
+
- bin/fasta_to_phobius.rb
|
|
90
|
+
- bin/generate_sbv_input_hashes.rb
|
|
91
|
+
- bin/mascot_pepxml_to_peptide_hit_qvalues.rb
|
|
92
|
+
- bin/phobius_to_nontransmembrane.rb
|
|
93
|
+
- bin/qvalues.rb
|
|
94
|
+
has_rdoc: true
|
|
95
|
+
homepage: http://jtprince.github.com/ms-error_rate
|
|
96
|
+
licenses: []
|
|
97
|
+
|
|
98
|
+
post_install_message:
|
|
99
|
+
rdoc_options: []
|
|
100
|
+
|
|
101
|
+
require_paths:
|
|
102
|
+
- lib
|
|
103
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
104
|
+
none: false
|
|
105
|
+
requirements:
|
|
106
|
+
- - ">="
|
|
107
|
+
- !ruby/object:Gem::Version
|
|
108
|
+
segments:
|
|
109
|
+
- 0
|
|
110
|
+
version: "0"
|
|
111
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
112
|
+
none: false
|
|
113
|
+
requirements:
|
|
114
|
+
- - ">="
|
|
115
|
+
- !ruby/object:Gem::Version
|
|
116
|
+
segments:
|
|
117
|
+
- 0
|
|
118
|
+
version: "0"
|
|
119
|
+
requirements: []
|
|
120
|
+
|
|
121
|
+
rubyforge_project: mspire
|
|
122
|
+
rubygems_version: 1.3.7
|
|
123
|
+
signing_key:
|
|
124
|
+
specification_version: 3
|
|
125
|
+
summary: An mspire library for calculating error rates in MS/MS identifications (FDRs).
|
|
126
|
+
test_files:
|
|
127
|
+
- spec/ms/error_rate/qvalue_spec.rb
|
|
128
|
+
- spec/ms/error_rate_spec.rb
|
|
129
|
+
- spec/spec_helper.rb
|