ms-error_rate 0.0.9 → 0.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.autotest +14 -0
- data/.gitmodules +9 -0
- data/History +16 -0
- data/LICENSE +2 -0
- data/Rakefile +52 -0
- data/VERSION +1 -1
- data/lib/ms/error_rate/decoy.rb +27 -0
- data/lib/ms/error_rate/qvalue/mascot/percolator.rb +93 -0
- data/lib/ms/error_rate/qvalue/mascot.rb +68 -0
- data/lib/ms/error_rate/qvalue/pepxml.rb +52 -0
- data/lib/ms/error_rate/qvalue.rb +93 -0
- data/lib/ms/error_rate/sbv/peptide_based.rb +30 -0
- data/lib/ms/error_rate/sbv/protein_based.rb +39 -0
- data/lib/ms/error_rate/sbv.rb +111 -0
- data/lib/ms/error_rate.rb +9 -0
- data/lib/ms/ident.rb +125 -0
- data/lib/support/sort_by_attributes.rb +51 -0
- data/lib/transmembrane/phobius.rb +136 -0
- data/lib/transmembrane/toppred.rb +368 -0
- data/lib/transmembrane.rb +157 -0
- data/schema/peptide_hit_qvalues.pqh.tsv +5 -0
- data/script/expert_addition.rb +26 -0
- data/script/expert_list.rb +53 -0
- data/script/fasta_ipi_to_ipi_decoy.rb +23 -0
- data/script/minimal_protein_set.rb +366 -0
- data/script/unique_seq_stats.rb +72 -0
- metadata +66 -14
@@ -0,0 +1,157 @@
|
|
1
|
+
|
2
|
+
# A transmemIndex is a hash that takes a fasta reference as key and returns
|
3
|
+
# a structured hash containing the transmembrane information.
|
4
|
+
module TransmembraneIndex

  # Sniffs a prediction output file and reports which program produced it.
  #
  # The file is read line by line and scanning stops at the first line that
  # matches one of the known markers.
  #
  # file:: path to the prediction output file
  # returns:: :toppred or :phobius, or nil when no marker was found
  def self.filetype(file)
    tp = nil
    File.open(file) do |fh|
      while (line = fh.gets)
        case line
        when /SEQENCE/
          # spelling is matched exactly as written here; presumably it mirrors
          # the header the program emits -- confirm before "correcting" it
          tp = :phobius
          break
        when / 0 0 i/
          tp = :phobius # if they don't have the headers,
          # this will pick it up if they have a
          # single prot without tm or signal peptide.
          break
        when /Algorithm specific parameters/
          tp = :toppred # New text
          break
        when /<parameters>/
          tp = :toppred # XML
          break
        end
      end
    end
    tp
  end

  # Maps a fasta reference to the key used by the index.
  # Intentionally empty here (returns nil): needs to be subclassed or written.
  def reference_to_key(reference)
    # needs to be subclassed or written
  end

  # Builds the index object matching the detected file type.
  #
  # right now accepts toppred.out files
  # Phobius objects can use the fasta object to update their hash for methods
  # like avg_overlap
  #
  # file:: path to the prediction output file
  # returns:: a TopPred::Index or Phobius::Index
  # raises:: ArgumentError when the file type is not recognized
  def self.new(file)
    case x = filetype(file)
    when :toppred
      require 'transmembrane/toppred'
      TopPred::Index.new(file)
    when :phobius
      require 'transmembrane/phobius'
      # warn "WARNING: You have NO fasta object with Phobius based TransmembraneIndex! (which needs one to do proper indexing!)" unless fasta
      Phobius::Index.new(file)
    else
      raise ArgumentError, "#{x} filetype for #{file} not recognized!"
    end
  end

  # returns a hash of key -> num certain transmembrane segments
  # (0 when an entry has no :num_certain_transmembrane_segments value)
  def num_certain_index
    hash = {}
    self.each do |k,v|
      hash[k] = v[:num_certain_transmembrane_segments] || 0
    end
    hash
  end

  # Average overlap between the occurrences of +sequence+ in a protein and
  # that protein's transmembrane segments.
  #
  # key:: index key of the protein
  # sequence:: the (peptide) sequence to look for
  # tp:: :number for an amino acid count, :fraction for that count divided
  #      by the size of +sequence+
  # returns:: a Float (0.0 when there is nothing to average) or nil if there
  #           is no protein by that key
  def avg_overlap(key, sequence, tp=:number)
    # what to do if the protein isn't there?? which happens on occasion
    return nil unless self.key?(key)

    numbers = num_transmem_aa(self[key], sequence)
    return 0.0 if numbers.empty?

    avg_num = numbers.inject(0) {|memo,num| memo + num }.to_f / numbers.size
    if tp == :fraction
      avg_num / sequence.size
    else
      avg_num
    end
  end

  # returns an array (usually length of 1) of the number of amino acids
  # contained inside transmembrane spanning segments.
  # Reads tmhash[:transmembrane_segments] (each with 1-based :start/:stop,
  # converted to 0-based ranges here) and tmhash[:aaseq].
  # if there are no transmembrane segments, returns empty array.
  def num_transmem_aa(tmhash, sequence)
    if tmhash.key? :transmembrane_segments
      ranges = tmhash[:transmembrane_segments].map do |tmseg|
        Range.new( tmseg[:start]-1, tmseg[:stop]-1 )
      end
      num_overlapping_chars(tmhash[:aaseq], ranges, sequence)
    else
      []
    end
  end

  # For every (non-overlapping, left-to-right) occurrence of +substring+ in
  # +full_sequence+, counts how many of its positions fall inside any of
  # +ranges+.
  #
  # full_sequence:: the string that is searched
  # ranges:: 0 indexed, inclusive ranges, i.e. full_sequence[start..stop]
  # substring:: the string whose occurrences are measured
  # returns:: one overlap count per occurrence; [] when +ranges+ or
  #           +substring+ is empty
  def num_overlapping_chars(full_sequence, ranges, substring)
    # an empty substring matches at every position without advancing, so it
    # has to be rejected up front or the scan below would never terminate
    return [] if ranges.empty? || substring.empty?

    slen = substring.size
    substring_ranges = []
    pos = 0
    while (i = full_sequence.index(substring, pos))
      substring_ranges << Range.new(i, i + slen - 1)
      pos = i + slen
    end

    substring_ranges.map do |sb|
      ranges.inject(0) do |overlap, tm|
        # size of the intersection of two inclusive intervals; this also
        # covers an occurrence that completely contains the segment
        shared = [tm.last, sb.last].min - [tm.first, sb.first].max + 1
        shared > 0 ? overlap + shared : overlap
      end
    end
  end


end
|
151
|
+
|
152
|
+
|
153
|
+
#substring_ranges = full_sequence.enum_for(:scan, substring).map do
|
154
|
+
# (ofirst, olast) = $~.offset(0)
|
155
|
+
# Range.new(ofirst, olast - 1)
|
156
|
+
# end
|
157
|
+
|
@@ -0,0 +1,26 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
require 'yaml'
|
4
|
+
require 'set'
|
5
|
+
|
6
|
+
# Prints a tab-separated table with one column per FDR cutoff (ascending).
# Each column holds: the cutoff, the total number of hits at that cutoff, and
# then the hits that were not already present at any lower cutoff.
if ARGV.size == 0
  puts "usage: prog summary__<setname>__name_to_gene_id.yml"
  exit
end

file = ARGV.shift

hash = YAML.load_file(file)

previous_hits = Set.new
columns = hash.sort.map do |fdr, hits|
  new_hits = hits - previous_hits.to_a
  previous_hits.merge(new_hits)
  [fdr, hits.size, *new_hits]
end

# Columns have different lengths. Walk up to the longest one so that no hit is
# dropped (zipping onto the first column would cut every other column down to
# the first column's length); missing cells print as empty strings.
num_rows = columns.map {|column| column.size }.max || 0
num_rows.times do |i|
  puts columns.map {|column| column[i] }.join("\t")
end
|
26
|
+
|
@@ -0,0 +1,53 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
require 'orderedhash'
|
4
|
+
require 'yaml'
|
5
|
+
require 'set'
|
6
|
+
|
7
|
+
# Needs exactly two arguments: the glob list and the summary file.
unless ARGV.size == 2
  puts "usage: #{File.basename(__FILE__)} <gene_ids>.txt summary.yml"
  puts "writes a yml file with unique proteins per qvalue cutoff"
  puts "for each set"
  puts "summary__<setname>__<gene_ids>.yml"
  exit
end

gene_ids, summary = ARGV

# one glob pattern per line; lines starting with '#' and lines without any
# word character are ignored
globs = IO.readlines(gene_ids).
  reject {|line| line[0,1] == '#' }.
  map {|line| line.chomp }.
  select {|line| line =~ /\w/ }

summary_data = YAML.load_file(summary)
protein_info = summary_data['protein_info']

# setname -> (qvalue_cutoff -> sorted list of matching gene symbols)
output_hashes = OrderedHash.new
summary_data['results'].each do |result|
  qvalue_cutoff = result['qvalue_cutoff']
  result['sets'].each do |setname, sethash|
    output_hashes[setname] ||= OrderedHash.new
    matches = Set.new
    sethash['proteins'].each do |ipi, info|
      # only proteins that kept at least one hit in the minimal set count
      next unless info['num_hits_minimal'].first > 0
      [ipi, *info['indistinguishable']].each do |id|
        matched = globs.any? {|glob| File.fnmatch?(glob, protein_info[id]['Gene_Symbol']) }
        matches << protein_info[id]['Gene_Symbol'] if matched
      end
    end
    output_hashes[setname][qvalue_cutoff] = matches.to_a.sort
  end
end

# one output file per set: <summary minus ext>__<setname>__<gene_ids base>.yml
gene_ids_base = File.basename(gene_ids, '.*')
summary_base = summary.chomp(File.extname(summary))
output_hashes.each do |setname, per_cutoff|
  output_file = [summary_base, setname, gene_ids_base].join("__") + ".yml"
  File.open(output_file, 'w') {|io| io.print per_cutoff.to_yaml }
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
# FileUtils is needed for the final move of the rewritten file; without this
# require the script dies with a NameError after writing the temp file.
require 'fileutils'

if ARGV.size == 0
  puts "usage: #{File.basename(__FILE__)} <IPI_based>.fasta ..."
  puts "moves any leading \"><.*_>\" to the IPI value"
  puts "for example:"
  puts ">DCY_IPI:IPI0032311.1|STUFF -> >IPI:DCY_IPI0032311.1|STUFF"
  exit
end

# Rewrites each file in place by way of "<file>.tmp"; a file whose temp file
# already exists is skipped so nothing gets clobbered.
ARGV.each do |file|
  tmp = file + '.tmp'
  if File.exist?(tmp)
    warn "Skipping #{file} since #{tmp} exists"
    next
  end
  File.open(tmp, 'w') do |out|
    IO.foreach(file) do |line|
      # $1 is the prefix (ending in '_') that directly follows '>'
      if line =~ />([^\:\|]+_)/
        line.sub!("#{$1}IPI:IPI", "IPI:#{$1}IPI")
      end
      out.print line
    end
  end
  FileUtils.mv tmp, file
end
|
@@ -0,0 +1,366 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
require 'yaml'
|
4
|
+
require 'set'
|
5
|
+
require 'optparse'
|
6
|
+
require 'ms/fasta'
|
7
|
+
require 'ms/fasta/ipi'
|
8
|
+
|
9
|
+
SET_RE = /Set\s+(.*)/i
|
10
|
+
QVALUE_EXT = ".qval.yml"
|
11
|
+
|
12
|
+
# Reads a sets-comparison file and resolves every listed name to a full path.
#
# Lines matching SET_RE open a new set; every other line containing a word
# character is a file name (without +ext+) belonging to the most recently
# opened set. Paths are resolved against the directory that holds +file+.
#
# file:: path to the sets-comparison text file
# ext:: extension appended to every listed name
# returns:: [sets_to_paths_hash, sets_order]
# raises:: RuntimeError if a file name is listed before any Set line, or if
#          a resolved path does not exist
def sets_compare_to_paths(file, ext=QVALUE_EXT)
  dirname = File.dirname(File.expand_path(file))
  lines = IO.readlines(file).map {|v| v.chomp }.select {|v| v =~ /\w/}
  sets = {}
  current_set = nil
  sets_order = []
  lines.each do |line|
    if line =~ SET_RE
      current_set = $1.dup
      sets[current_set] = []
      sets_order << current_set
    else
      # without an open set there is no list to append to; say so explicitly
      # instead of failing on nil further down
      if current_set.nil?
        raise RuntimeError, "file '#{line}' is listed before any Set line in #{file}"
      end
      full_path = File.join(dirname, line + ext)
      raise RuntimeError, "file #{full_path} does not exist!!" unless File.exist?(full_path)
      sets[current_set] << full_path
    end
  end
  [sets, sets_order]
end
|
32
|
+
|
33
|
+
# Greedy selection of a minimal protein set.
#
# proteins_to_aaseqs maps each protein to its array of aaseqs. Proteins are
# ranked by the value the block returns for (protein, aaseqs) or, without a
# block, by [number of aaseqs, summed aaseq length]. They are then visited
# from the highest rank to the lowest. A protein joins the minimal set when it
# contributes at least one aaseq not claimed by an earlier protein. A protein
# that contributes nothing new is recorded as indistinguishable from the most
# recently accepted protein if both have exactly the same aaseqs.
#
# returns [minimal_protein_to_uniq_peps_hash, indistinguishable_protein_hash];
# the second hash is keyed on the proteins of the first and gives an array of
# other proteins.
def minimal_protein_set(proteins_to_aaseqs)
  custom_rank = block_given?

  ranked = proteins_to_aaseqs.sort_by do |protein, aaseqs|
    if custom_rank
      yield(protein, aaseqs)
    else
      [ aaseqs.size, aaseqs.inject(0) {|total, seq| total + seq.size } ]
    end
  end
  ranked.reverse!

  claimed = Set.new
  minimal = {}
  indistinguishable = {}
  anchor_protein = nil
  anchor_peps = nil

  ranked.each do |protein, peps|
    ordered = peps.sort
    # Set#add? answers nil for an aaseq that was already claimed
    fresh = peps.select {|pep| claimed.add?(pep) }

    if fresh.empty?
      indistinguishable[anchor_protein] << protein if ordered == anchor_peps
    else
      minimal[protein] = fresh
      indistinguishable[protein] = []
      anchor_peps = ordered
      anchor_protein = protein
    end
  end

  [minimal, indistinguishable]
end
|
88
|
+
|
89
|
+
# Converts cutoff strings to numbers: 'nil' and '-' become nil (no cutoff),
# every other value is converted with to_f.
def cutoffs_to_floats(ar)
  ar.map {|text| %w[nil -].include?(text) ? nil : text.to_f }
end
|
98
|
+
|
99
|
+
# Tallies hit statistics for each protein.
#
# prot_to_peps:: protein id -> array of peptide sequences
# seq_to_hits:: peptide sequence -> array of hits (each responding to
#               sequence and charge)
# returns:: a hash keyed on protein id that yields an array
#           [#aaseq, #aaseq_and_charge, #total_hits]; a protein with an empty
#           peptide array gets no entry
def stats_per_prot(prot_to_peps, seq_to_hits)
  stats = {}
  prot_to_peps.each do |protein, pep_seqs|
    distinct_hits = Set.new
    distinct_seqs = Set.new
    distinct_seq_charges = Set.new

    pep_seqs.each do |pep_seq|
      hits = seq_to_hits[pep_seq]
      distinct_hits.merge(hits)
      hits.each do |hit|
        seq = hit.sequence
        distinct_seqs << seq
        distinct_seq_charges << (seq + '_' + hit.charge.to_s)
      end
      # refreshed after every peptide; the value left behind is the total
      stats[protein] = [distinct_seqs.size, distinct_seq_charges.size, distinct_hits.size]
    end
  end
  stats
end
|
122
|
+
|
123
|
+
# Defaults; the command-line options below overwrite or extend them.
opt = {
  :cutoffs => [nil],
  :outfile => "summary.yml",
}

# Command-line definition. Nothing is parsed here; this only builds the parser.
opts = OptionParser.new do |op|
  op.banner = "USAGE: #{File.basename(__FILE__)} sets_compare.txt"
  op.separator "OUTPUT: #{opt[:outfile]}"
  op.separator ""
  op.separator "INPUT: "
  op.separator " each <file> referenced in sets_compare.txt should have a"
  op.separator " <file>.qval.yml file"
  op.separator ""
  op.separator "OPTIONS:"
  # hits are later kept when qvalue <= cutoff, so the help text says "<="
  op.on("-q", "--qvalue <0-1[,...]>", Array, "only take qvalues <= given ['-' for no threshold]") {|v| opt[:cutoffs] = cutoffs_to_floats(v)}
  op.separator ""
  op.on("--proteins <fasta>,<pep-db>", Array, "path to fasta and peptide centric DB", "peptide_centric_db is in the format: ", "<PEPTIDE>: <ID>-<ID>-<ID>") {|v| opt[:proteins] = v }
  op.separator "FORMATS:"
  op.on("--output-format", "prints the output yaml scheme and exits") {|v| opt[:output_format] = v }
  op.on("--input-format", "prints sets_compare.txt format and exits") {|v| opt[:input_format] = v }
end
|
144
|
+
|
145
|
+
# later on we could implement full isoform resolution like IsoformResolver
|
146
|
+
# for now we will generate a report, realizing that some isoforms may not be
|
147
|
+
# reported
|
148
|
+
# it is implemented by using a pre-made map from sequence to protein groups
|
149
|
+
# then, a set of sequences allows one to deduce all the relationships from the
|
150
|
+
# protein groups.
|
151
|
+
|
152
|
+
opts.parse!

# --output-format: print the layout of the YAML file this script writes.
# The nesting mirrors the hashes built further down in this script, including
# the num_peptide_hits and num_proteins entries written for every set.
if opt[:output_format]
  yaml = <<SKEL
results:
- qvalue_cutoff: <Float>
  sets:
    <set_name>:
      num_peptide_hits: <Integer>
      num_uniq_aaseqs: <Integer>
      num_uniq_aaseqs_charge: <Integer>
      num_proteins: <Integer>
      num_aaseqs_not_in_pep_db: <Integer>
      proteins:
        <IPI_ID>:
          num_hits_all:
          - <Integer> # total num aaseqs
          - <Integer> # total num aaseq+charge
          - <Integer> # total num hits
          num_hits_minimal:
          - <Integer> # total num aaseqs
          - <Integer> # total num aaseq+charge
          - <Integer> # total num hits
          indistinguishable:
          - <IPI_ID>
          - <IPI_ID>
          aaseqs:
          - <String>
          - <String>
sets_order:
- <String>
- <String>
protein_info:
  <IPI_ID>:
    Gene_Symbol: <String>
    IPI: <IPI_ID>
    Tax_Id: <String>
    SWISS-PROT: <String>
    description: <String>
    ENSEMBL: <String>
SKEL
  print yaml
  exit
end

# --input-format: describe the sets_compare.txt file given on the command line.
if opt[:input_format]
  string =<<EXPLANATION
# the sets_compare.txt format is very simple:

Set <some_name_for_set1>
filename1_no_ext
filename2_no_ext
Set <some_name_for_set2>
filename3_no_ext
filename4_no_ext
...
EXPLANATION
  puts string
  exit
end

# exactly one positional argument (the sets_compare.txt file) is expected
if ARGV.size != 1
  puts opts.to_s
  exit
end
|
215
|
+
|
216
|
+
|
217
|
+
# Top-level structure of the output file; it is filled in below and dumped
# as YAML at the end.
results = {}

protein_info = {}
results['protein_info'] = protein_info
results['results'] = []

(sets_hash, sets_order) = sets_compare_to_paths(ARGV.shift)
results['sets_order'] = sets_order

if opt[:proteins]
  # fail early and clearly instead of passing nil to the YAML loader below
  unless opt[:proteins].size == 2
    raise ArgumentError, "--proteins needs exactly two values: <fasta>,<pep-db>"
  end
  (fasta, pep_db_file) = opt[:proteins]

  # a hash indexed on ipi containing all info
  prot_header_hash = {}

  STDERR.print "Loading information from fasta file..."
  start = Time.now
  prot_sizes_hash = {}
  Ms::Fasta.open(fasta, 'rb', :io_index => []) do |obj|
    obj.each do |entry|
      hash = Ms::Fasta::Ipi.parse(entry.header)
      ipi = hash['IPI']
      prot_header_hash[ipi] = hash
      prot_sizes_hash[ipi] = entry.sequence.size
    end
  end
  STDERR.puts "#{Time.now - start} seconds."

  STDERR.print "Loading peptide centric DB (this takes about a minute)..."
  start = Time.now
  pep_db = YAML.load_file(pep_db_file)
  STDERR.puts "#{Time.now - start} seconds."

end

opt[:cutoffs].each do |cutoff|

  cutoff_results = {'qvalue_cutoff' => cutoff}
  results_sets_hash = {}
  cutoff_results['sets'] = results_sets_hash
  results['results'] << cutoff_results

  #########################
  # FOR EACH SET:
  #########################
  # the hit Struct is built from the first file's headers and reused, so
  # this assumes the indices are the same into each data file
  pep_klass = nil
  sets_hash.each do |set, files|
    set_results = {}
    results_sets_hash[set] = set_results

    # get the complete set of passing hits
    all_passing_hits = files.inject([]) do |collected, file|
      hash = YAML.load_file(file)

      header_hash = hash['headers']
      pep_klass ||= Struct.new(*(header_hash.map {|v| v.to_sym }))
      hits = hash['data'].map {|v| pep_klass.new(*v) }

      passing_hits =
        if cutoff
          hits.select {|hit| hit.qvalue <= cutoff }
        else
          hits
        end
      # concat rather than push(*array): no argument splat for large files
      collected.concat(passing_hits)
    end

    # create an index from aaseq to hits
    seq_to_hits = Hash.new {|h,k| h[k] = []}
    uniq_seqcharge = Set.new
    all_passing_hits.each do |hit|
      seq_to_hits[hit.sequence] << hit
      uniq_seqcharge.add( hit.sequence + '_' + hit.charge.to_s )
    end

    # determine the number of uniq aaseqs
    uniq_seqs = seq_to_hits.size

    num_uniq_seqcharges = uniq_seqcharge.size

    set_results.merge!( { 'num_peptide_hits' => all_passing_hits.size,
                          'num_uniq_aaseqs' => uniq_seqs,
                          'num_uniq_aaseqs_charge' => num_uniq_seqcharges,
    })

    if opt[:proteins]

      # create an index from proteins to peptides
      prots_to_peps = Hash.new {|h,k| h[k] = [] }
      peptides_not_found = []
      seq_to_hits.keys.each do |seq|
        if pep_db.key?(seq)
          pep_db[seq].split('-').each do |prot|
            prots_to_peps[prot] << seq
          end
        else
          peptides_not_found << seq
        end
      end

      # Determine the number of 1) hits, 2) aaseqs, 3) aaseqcharges per protein BEFORE minimization
      stats_per_protein_before = stats_per_prot(prots_to_peps, seq_to_hits)

      # get the minimal protein set
      (prot_to_uniq_peps_hash, indistinguishable_protein_hash) = minimal_protein_set(prots_to_peps) do |prot,peps|
        # sort key: number of aaseqs, summed aaseq length, then negated
        # protein size. A protein id that is in the peptide DB but not in the
        # fasta has no known size; treat it as 0 instead of crashing on nil
        [ peps.size, peps.inject(0){|m,s| m+s.size}, -(prot_sizes_hash.fetch(prot, 0))]
      end

      # remember the header info of every reported protein, including the
      # ones that are indistinguishable from a reported protein
      prot_to_uniq_peps_hash.each do |prot, peps|
        [prot, *indistinguishable_protein_hash[prot]].each do |member|
          protein_info[member] = prot_header_hash[member]
        end
      end

      stats_per_protein_minimal = stats_per_prot(prot_to_uniq_peps_hash, seq_to_hits)

      # create a hash of data for each protein
      protein_data_hashes_hash = {}
      prot_to_uniq_peps_hash.each do |prot, peps|
        protein_data_hashes_hash[prot] = {
          'aaseqs' => peps,
          # this will be a triplet
          'num_hits_minimal' => stats_per_protein_minimal[prot],
          'indistinguishable' => indistinguishable_protein_hash[prot],
          'num_hits_all' => stats_per_protein_before[prot],
        }
      end

      set_results['proteins'] = protein_data_hashes_hash
      set_results['num_proteins'] = prot_to_uniq_peps_hash.size
      set_results['num_aaseqs_not_in_pep_db'] = peptides_not_found.size
      if peptides_not_found.size > 0
        warn "Did not find in peptide centric db: #{peptides_not_found.join(', ')}"
      end
    end
  end
end

File.open(opt[:outfile], 'w') do |out|
  out.print results.to_yaml
end
|
365
|
+
|
366
|
+
|
@@ -0,0 +1,72 @@
|
|
1
|
+
#!/usr/bin/ruby -w
|
2
|
+
|
3
|
+
require 'set'
|
4
|
+
require 'yaml'
|
5
|
+
require 'optparse'
|
6
|
+
|
7
|
+
opt = {}
opts = OptionParser.new do |op|
  op.banner = "usage: #{File.basename(__FILE__)} <precision_file>.yml ..."
  op.separator "outputs information collected by combining hits from files:"
  op.separator "---"
  op.separator "filenames: "
  op.separator "- <pathgiven>"
  op.separator "num_unique_aaseqs: <Int>"
  op.separator "num_unique_aaseqs_charge: <Int>"
  op.separator "num_peptide_hits: <Int>"
  op.separator ""
  op.separator "NOTE: if a precision cutoff is given, all hits that have a better"
  op.separator "score than the worst score at the cutoff are included, even if "
  op.separator "the precision for that hit was below the cutoff"
  op.separator "this prevents early, local aberrations in precision from messing"
  op.separator "up the analysis"
  op.separator ""
  op.on("-p", "--precision <0-1>", Float, "precision cutoff") {|v| opt[:cutoff] = v }
  op.on("-f", "--fdr <0-1>", Float, "false discovery rate cutoff (1-precision)") {|v| opt[:cutoff] = 1.0 - v }
end

opts.parse!

if ARGV.size == 0
  puts opts.to_s
  exit
end

unique_sequences = Set.new
unique_ions = Set.new
all_hits = []

ARGV.each do |file|
  hash = YAML.load_file(file)

  prec_index = hash['headers'].index('precision')
  mowse_index = hash['headers'].index('mowse')
  aaseq_index = hash['headers'].index('aaseq')
  charge_index = hash['headers'].index('charge')

  # NOTE(review): this selection was missing entirely (the loop below read an
  # undefined `above_cutoff`). It is rebuilt from the NOTE in the usage text.
  # Assumes the hit rows live under the 'data' key and that a higher mowse
  # score is the better one -- confirm against a real precision file.
  rows = hash['data']
  above_cutoff =
    if opt[:cutoff]
      at_cutoff = rows.select {|ar| ar[prec_index] >= opt[:cutoff] }
      if at_cutoff.empty?
        []
      else
        # keep every hit scoring at least as well as the worst hit that
        # meets the cutoff, whatever that hit's own precision is
        worst_score = at_cutoff.map {|ar| ar[mowse_index] }.min
        rows.select {|ar| ar[mowse_index] >= worst_score }
      end
    else
      rows
    end
  all_hits.concat(above_cutoff)

  above_cutoff.each do |ar|
    sequence = ar[aaseq_index]
    # to_s: the charge may have been loaded from YAML as an Integer
    seq_plus_charge = sequence + ar[charge_index].to_s
    unique_sequences.add sequence
    unique_ions.add seq_plus_charge
  end
end

prec_k = 'precision cutoff'
fn_k = 'filenames'
uniq_aaseq_k = 'num unique aaseqs'
uniq_ions_k = 'num unique aaseqs+charge'
num_hits_k = 'num peptide hits'

# order in which the summary lines are printed
order = [fn_k, prec_k, num_hits_k, uniq_ions_k, uniq_aaseq_k]

results = {}
results[fn_k] = '[' + ARGV.join(", ") + ']'
results[prec_k] = opt[:cutoff]
results[uniq_aaseq_k] = unique_sequences.size
results[uniq_ions_k] = unique_ions.size
results[num_hits_k] = all_hits.size

order.each do |key|
  puts "#{key}: #{results[key]}"
end
|