ms-error_rate 0.0.9 → 0.0.10
Sign up to get free protection for your applications and to get access to all the features.
- data/.autotest +14 -0
- data/.gitmodules +9 -0
- data/History +16 -0
- data/LICENSE +2 -0
- data/Rakefile +52 -0
- data/VERSION +1 -1
- data/lib/ms/error_rate/decoy.rb +27 -0
- data/lib/ms/error_rate/qvalue/mascot/percolator.rb +93 -0
- data/lib/ms/error_rate/qvalue/mascot.rb +68 -0
- data/lib/ms/error_rate/qvalue/pepxml.rb +52 -0
- data/lib/ms/error_rate/qvalue.rb +93 -0
- data/lib/ms/error_rate/sbv/peptide_based.rb +30 -0
- data/lib/ms/error_rate/sbv/protein_based.rb +39 -0
- data/lib/ms/error_rate/sbv.rb +111 -0
- data/lib/ms/error_rate.rb +9 -0
- data/lib/ms/ident.rb +125 -0
- data/lib/support/sort_by_attributes.rb +51 -0
- data/lib/transmembrane/phobius.rb +136 -0
- data/lib/transmembrane/toppred.rb +368 -0
- data/lib/transmembrane.rb +157 -0
- data/schema/peptide_hit_qvalues.pqh.tsv +5 -0
- data/script/expert_addition.rb +26 -0
- data/script/expert_list.rb +53 -0
- data/script/fasta_ipi_to_ipi_decoy.rb +23 -0
- data/script/minimal_protein_set.rb +366 -0
- data/script/unique_seq_stats.rb +72 -0
- metadata +66 -14
@@ -0,0 +1,157 @@
|
|
1
|
+
|
2
|
+
# A TransmembraneIndex is a hash that takes a fasta reference as key and
# returns a structured hash containing the transmembrane information.
#
# The instance methods below call each, key? and [] on self, so the module
# is meant to be mixed into a Hash-like object.
module TransmembraneIndex

  # Sniffs which prediction program wrote +file+ by scanning it line by line
  # until a format marker is seen.
  #
  # returns :toppred or :phobius (nil when no marker line is found)
  def self.filetype(file)
    tp = nil
    File.open(file) do |fh|
      while (line = fh.gets)
        case line
        when /SEQENCE/
          # NOTE(review): the spelling is matched literally; presumably it
          # mirrors the header line of Phobius short output -- confirm
          # against a real file before "correcting" it.
          tp = :phobius
          break
        when / 0\s+0\s+i/
          tp = :phobius # if they don't have the headers,
                        # this will pick it up if they have a
                        # single prot without tm or signal peptide.
                        # Any run of whitespace is accepted between the
                        # columns so that padded output is recognized too.
          break
        when /Algorithm specific parameters/
          tp = :toppred # New text
          break
        when /<parameters>/
          tp = :toppred # XML
          break
        end
      end
    end
    tp
  end

  # Maps a fasta reference to the key used by the index.
  # Needs to be subclassed or written (this default returns nil).
  def reference_to_key(reference)
    # needs to be subclassed or written
  end

  # Builds the index object matching the type of +file+ (see filetype).
  # The parser is required lazily so only the needed one is loaded.
  #
  # right now accepts toppred.out files
  # Phobius objects can use the fasta object to update their hash for methods
  # like avg_overlap
  #
  # raises ArgumentError if the file type is not recognized
  def self.new(file)
    case x = filetype(file)
    when :toppred
      require 'transmembrane/toppred'
      TopPred::Index.new(file)
    when :phobius
      require 'transmembrane/phobius'
      # warn "WARNING: You have NO fasta object with Phobius based TransmembraneIndex! (which needs one to do proper indexing!)" unless fasta
      Phobius::Index.new(file)
    else
      raise ArgumentError, "#{x} filetype for #{file} not recognized!"
    end
  end

  # returns a hash of key -> num certain transmembrane segments
  # (0 when the entry has no :num_certain_transmembrane_segments value)
  def num_certain_index
    hash = {}
    self.each do |k,v|
      hash[k] = v[:num_certain_transmembrane_segments] || 0
    end
    hash
  end

  # tp = :number or :fraction which is the fraction of the sequence size
  # returns the average number of overlapping amino acids with transmembrane
  # segments (averaged over every occurrence of sequence in the protein)
  # returns 0.0 if no occurrence/segment information is available
  # returns nil if there is no protein by that key
  def avg_overlap(key, sequence, tp=:number)
    # what to do if the protein isn't there?? which happens on occasion
    return nil unless self.key?(key)

    numbers = num_transmem_aa(self[key], sequence)
    return 0.0 if numbers.size == 0

    avg_num = numbers.inject(0) {|memo,num| memo + num }.to_f / numbers.size
    if tp == :fraction
      # same as averaging the per-occurrence fractions
      avg_num / sequence.size
    else
      avg_num
    end
  end

  # returns an array (usually length of 1) of the number of amino acids
  # contained inside transmembrane spanning segments.
  # reads tmhash[:transmembrane_segments] (each with 1-based :start and
  # :stop) and tmhash[:aaseq]
  # if there are no transmembrane segments, returns empty array.
  def num_transmem_aa(tmhash, sequence)
    if tmhash.key? :transmembrane_segments
      # convert the 1-based inclusive coordinates to 0-based ranges
      ranges = tmhash[:transmembrane_segments].map do |tmseg|
        Range.new( tmseg[:start]-1, tmseg[:stop]-1 )
      end
      num_overlapping_chars(tmhash[:aaseq], ranges, sequence)
    else
      []
    end
  end

  # returns an array with one entry per (non-overlapping, left to right)
  # occurrence of substring in full_sequence: the number of characters of
  # that occurrence that fall inside any of the given ranges.
  # ranges should be 0 indexed!!!
  # the span includes the 'stop' position i.e., full_sequence[start..stop]
  # returns an empty array if there are no ranges or substring is empty.
  def num_overlapping_chars(full_sequence, ranges, substring)
    return [] if ranges.size == 0
    slen = substring.size
    # an empty substring is found at every position without ever advancing
    # the scan, so there is nothing meaningful to count
    return [] if slen == 0

    substring_ranges = []
    pos = 0
    while (i = full_sequence.index(substring, pos))
      substring_ranges << Range.new(i, i+slen-1)
      pos = i + slen
    end

    substring_ranges.map do |sb|
      overlap = 0
      ranges.each do |tm|
        # intersection of two inclusive spans; this also covers a segment
        # lying completely inside the substring
        first = [sb.first, tm.first].max
        last = [sb.last, tm.last].min
        overlap += (last - first + 1) if last >= first
      end
      overlap
    end
  end

end
|
151
|
+
|
152
|
+
|
153
|
+
#substring_ranges = full_sequence.enum_for(:scan, substring).map do
|
154
|
+
# (ofirst, olast) = $~.offset(0)
|
155
|
+
# Range.new(ofirst, olast - 1)
|
156
|
+
# end
|
157
|
+
|
@@ -0,0 +1,26 @@
|
|
1
|
+
#!/usr/bin/ruby

# Reads a YAML hash of <cutoff> -> <list of hits> and prints a tab delimited
# table with one column per cutoff (in sorted key order).  Each column holds
# the cutoff, the total number of hits at that cutoff and then only those
# hits that did not already appear at a smaller cutoff.

require 'yaml'
require 'set'

if ARGV.size == 0
  puts "usage: prog summary__<setname>__name_to_gene_id.yml"
  exit
end

file = ARGV.shift

hash = YAML.load_file(file)

previous_hits = Set.new
columns = []
hash.sort.each do |fdr, hits|
  new_hits = hits.reject {|hit| previous_hits.include?(hit) }
  previous_hits.merge(new_hits)
  columns << [fdr, hits.size, *new_hits]
end

# Transpose the columns into rows.  The table is as tall as the LONGEST
# column (Array#zip would cut every column to the length of the first one);
# shorter columns are padded with empty cells.
height = columns.map {|column| column.size }.max || 0
(0...height).each do |i|
  puts columns.map {|column| column[i] }.join("\t")
end
|
26
|
+
|
@@ -0,0 +1,53 @@
|
|
1
|
+
#!/usr/bin/ruby

# For every set and qvalue cutoff in a summary.yml, collects the gene
# symbols (of proteins with at least one minimal hit, plus their
# indistinguishable proteins) that match any of the glob patterns listed in
# the gene id file, and writes one yml file per set.

require 'orderedhash'
require 'yaml'
require 'set'

if ARGV.size != 2
  puts "usage: #{File.basename(__FILE__)} <gene_ids>.txt summary.yml"
  puts "writes a yml file with unique proteins per qvalue cutoff"
  puts "for each set"
  puts "summary__<setname>__<gene_ids>.yml"
  exit
end

(gene_ids, summary) = ARGV

# one glob per line; lines starting with '#' and blank lines are ignored
globs = IO.readlines(gene_ids).reject{|v| v[0,1] == '#'}.map{|v| v.chomp }.select {|v| v =~ /\w/ }

hash = YAML.load_file(summary)
protein_info = hash['protein_info']
results = hash['results']
output_hashes = OrderedHash.new
results.each do |result|

  qvalue_cutoff = result['qvalue_cutoff']
  result['sets'].each do |setname, sethash|
    matches = Set.new
    output_hashes[setname] ||= OrderedHash.new
    sethash['proteins'].each do |ipi, info|
      # only proteins that kept at least one aaseq in the minimal set
      next unless info['num_hits_minimal'].first > 0
      [ipi, *info['indistinguishable']].each do |id|
        # a protein may lack a header record or a gene symbol; there is
        # nothing to match against in that case, so skip it
        header = protein_info[id]
        gene_symbol = header && header['Gene_Symbol']
        next unless gene_symbol
        if globs.any? {|glob| File.fnmatch?(glob, gene_symbol) }
          matches << gene_symbol
        end
      end
    end
    output_hashes[setname][qvalue_cutoff] = matches.to_a.sort
  end
end

gene_ids_base = File.basename(gene_ids, '.*')
summary_base = summary.chomp(File.extname(summary))
output_hashes.each do |setname, output|
  output_file = [summary_base, setname, gene_ids_base].join("__") + ".yml"

  File.open(output_file, 'w') {|out| out.print output.to_yaml }
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
#!/usr/bin/ruby

# Rewrites fasta files in place so that a leading "<prefix>_" in front of
# "IPI:IPI" is moved behind the "IPI:" tag.

# FileUtils.mv is used below to swap the rewritten file into place
require 'fileutils'

if ARGV.size == 0
  puts "usage: #{File.basename(__FILE__)} <IPI_based>.fasta ..."
  puts "moves any leading \"><.*_>\" to the IPI value"
  puts "for example:"
  puts ">DCY_IPI:IPI0032311.1|STUFF -> >IPI:DCY_IPI0032311.1|STUFF"
  exit
end

ARGV.each do |file|
  # the output is written to a side file first and only replaces the
  # original once the whole file has been processed
  tmp = file + '.tmp'
  if File.exist?(tmp) ; warn "Skipping #{file} since #{tmp} exists" ; next end
  File.open(tmp, 'w') do |out|
    IO.foreach(file) do |line|
      # $1 is the prefix (ending in '_') that directly follows '>'
      if line =~ />([^\:\|]+_)/
        line.sub!("#{$1}IPI:IPI", "IPI:#{$1}IPI")
      end
      out.print line
    end
  end
  FileUtils.mv tmp, file
end
|
@@ -0,0 +1,366 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
require 'yaml'
|
4
|
+
require 'set'
|
5
|
+
require 'optparse'
|
6
|
+
require 'ms/fasta'
|
7
|
+
require 'ms/fasta/ipi'
|
8
|
+
|
9
|
+
# A set header line: "Set <name>".  Anchored at the start of the line so
# that a file name that merely contains "set " is not taken for a header.
SET_RE = /\A\s*Set\s+(.*)/i
QVALUE_EXT = ".qval.yml"

# Parses a sets_compare file: "Set <name>" lines open a set and every other
# non-blank line names a file (without extension) belonging to the current
# set.  File names are resolved relative to the directory of +file+ and get
# +ext+ appended.
#
# returns [sets_to_paths_hash, sets_order]
# raises ArgumentError if a file is listed before the first Set line
# raises RuntimeError if a referenced file does not exist
def sets_compare_to_paths(file, ext=QVALUE_EXT)
  dirname = File.dirname(File.expand_path(file))
  sets = {}
  sets_order = []
  current_set = nil
  IO.readlines(file).each do |raw|
    line = raw.chomp
    next unless line =~ /\w/   # skip blank lines
    if line =~ SET_RE
      current_set = $1.dup
      sets[current_set] = []
      sets_order << current_set
    else
      if current_set.nil?
        raise ArgumentError, "file '#{line}' is listed before any 'Set <name>' line in #{file}"
      end
      full_path = File.join(dirname, line + ext)
      raise RuntimeError, "file #{full_path} does not exist!!" unless File.exist?(full_path)
      sets[current_set] << full_path
    end
  end
  [sets, sets_order]
end
|
32
|
+
|
33
|
+
# Greedy minimal protein set.
#
# takes a hash of proteins to aaseqs.  Proteins are ranked by the number of
# aaseqs and then by the total aa length of those aaseqs; if a block is
# given it is called with (prot, peptide_array) and its return value is the
# sort key instead.  The proteins are then visited in the REVERSE of that
# sorted order, and each protein claims the aaseqs no earlier protein has
# claimed.  Proteins that claim nothing are dropped, unless their (sorted)
# aaseqs equal those of the most recently kept protein, in which case they
# are recorded as indistinguishable from it.
#
# returns [minimal_protein_to_uniq_peps_hash, indistinguishable_protein_hash]
# where indistinguishable_protein_hash is keyed on the kept proteins and
# gives an array of other proteins.
def minimal_protein_set(proteins_to_aaseqs)
  use_block = block_given?

  ranked = proteins_to_aaseqs.sort_by do |prot, peps|
    use_block ? yield(prot, peps) : [ peps.size, peps.inject(0) {|total, seq| total + seq.size } ]
  end
  ranked.reverse!

  claimed = Set.new
  minimal = {}
  indistinguishable = {}

  anchor_prot = nil   # most recent protein that claimed something
  anchor_peps = nil   # its complete, sorted aaseq list

  ranked.each do |prot, peps|
    ordered = peps.sort
    # Set#add? is nil when the aaseq was already claimed
    fresh = peps.select {|pep| claimed.add?(pep) }

    if fresh.empty?
      indistinguishable[anchor_prot] << prot if ordered == anchor_peps
    else
      minimal[prot] = fresh
      indistinguishable[prot] = []
      anchor_prot = prot
      anchor_peps = ordered
    end
  end

  [minimal, indistinguishable]
end
|
88
|
+
|
89
|
+
# Converts command line cutoff values to Floats (via to_f).  The
# placeholders 'nil' and '-' become nil.
def cutoffs_to_floats(ar)
  ar.map {|v| (v == 'nil' || v == '-') ? nil : v.to_f }
end
|
98
|
+
|
99
|
+
# Counts, per protein, what is found when the hits of all its peptide
# sequences (looked up in seq_to_hits) are pooled.
#
# returns a hash keyed on protein id that yields an array:
#   [#aaseq, #aaseq_and_charge, #total_hits]
# A protein with an empty peptide list gets no entry.
def stats_per_prot(prot_to_peps, seq_to_hits)
  stats = {}
  prot_to_peps.each do |prot, pep_seqs|
    hits = Set.new
    seqs = Set.new
    ions = Set.new   # "<aaseq>_<charge>" strings

    pep_seqs.each do |pep_seq|
      found = seq_to_hits[pep_seq]
      hits.merge( found )
      found.each do |hit|
        seq = hit.sequence
        seqs << seq
        ions << (seq + '_' + hit.charge.to_s)
      end
    end

    stats[prot] = [seqs.size, ions.size, hits.size] unless pep_seqs.empty?
  end
  stats
end
|
122
|
+
|
123
|
+
# Defaults: no qvalue cutoff (nil) and a fixed output file name.
opt = {
  :cutoffs => [nil],
  :outfile => "summary.yml",
}

opts = OptionParser.new do |op|
  op.banner = "USAGE: #{File.basename(__FILE__)} sets_compare.txt"
  op.separator "OUTPUT: #{opt[:outfile]}"
  op.separator ""
  op.separator "INPUT: "
  op.separator " each <file> referenced in sets_compare.txt should have a"
  op.separator " <file>.qval.yml file"
  op.separator ""
  op.separator "OPTIONS:"
  # hits are kept when qvalue <= cutoff (see the partition below)
  op.on("-q", "--qvalue <0-1[,...]>", Array, "only take qvalues <= given ['-' for no threshold]") {|v| opt[:cutoffs] = cutoffs_to_floats(v)}
  op.separator ""
  op.on("--proteins <fasta>,<pep-db>", Array, "path to fasta and peptide centric DB", "peptide_centric_db is in the format: ", "<PEPTIDE>: <ID>-<ID>-<ID>") {|v| opt[:proteins] = v }
  op.separator "FORMATS:"
  op.on("--output-format", "prints the output yaml scheme and exits") {|v| opt[:output_format] = v }
  op.on("--input-format", "prints sets_compare.txt format and exits") {|v| opt[:input_format] = v }
end

# later on we could implement full isoform resolution like IsoformResolver
# for now we will generate a report, realizing that some isoforms may not be
# reported
# it is implemented by using a pre-made map from sequence to protein groups
# then, a set of sequences allows one to deduce all the relationships from the
# protein groups.

opts.parse!

if opt[:output_format]
  # mirrors the structure assembled below; the protein related keys are
  # only written when --proteins is given
  yaml = <<SKEL
results:
- qvalue_cutoff: <Float>
  sets:
    <set_name>:
      num_peptide_hits: <Integer>
      num_uniq_aaseqs: <Integer>
      num_aaseqs_not_in_pep_db: <Integer>
      num_uniq_aaseqs_charge: <Integer>
      num_proteins: <Integer>
      proteins:
        <IPI_ID>:
          num_hits_all:
          - <Integer> # total num aaseqs
          - <Integer> # total num aaseq+charge
          - <Integer> # total num hits
          num_hits_minimal:
          - <Integer> # total num aaseqs
          - <Integer> # total num aaseq+charge
          - <Integer> # total num hits
          indistinguishable:
          - <IPI_ID>
          - <IPI_ID>
          aaseqs:
          - <String>
          - <String>
sets_order:
- <String>
- <String>
protein_info:
  <IPI_ID>:
    Gene_Symbol: <String>
    IPI: <IPI_ID>
    Tax_Id: <String>
    SWISS-PROT: <String>
    description: <String>
    ENSEMBL: <String>
SKEL
  print yaml
  exit
end

if opt[:input_format]
  string =<<EXPLANATION
# the sets_compare.txt format is very simple:

Set <some_name_for_set1>
filename1_no_ext
filename2_no_ext
Set <some_name_for_set2>
filename3_no_ext
filename4_no_ext
...
EXPLANATION
  puts string
  exit
end

if ARGV.size != 1
  puts opts.to_s
  exit
end


results = {}

protein_info = {}
results['protein_info'] = protein_info
results['results'] = []

(sets_hash, sets_order) = sets_compare_to_paths(ARGV.shift)
results['sets_order'] = sets_order

if opt[:proteins]
  (fasta, pep_db_file) = opt[:proteins]

  # a hash indexed on ipi containing all info
  prot_header_hash = {}

  STDERR.print "Loading information from fasta file..."
  start = Time.now
  prot_sizes_hash = {}
  Ms::Fasta.open(fasta, 'rb', :io_index => []) do |obj|
    obj.each do |entry|
      hash = Ms::Fasta::Ipi.parse(entry.header)
      ipi = hash['IPI']
      prot_header_hash[ipi] = hash
      prot_sizes_hash[ipi] = entry.sequence.size
    end
  end
  STDERR.puts "#{Time.now - start} seconds."

  STDERR.print "Loading peptide centric DB (this takes about a minute)..."
  start = Time.now
  pep_db = YAML.load_file(pep_db_file)
  STDERR.puts "#{Time.now - start} seconds."

end

opt[:cutoffs].each do |cutoff|

  cutoff_results = {'qvalue_cutoff' => cutoff}
  results_sets_hash = {}
  cutoff_results['sets'] = results_sets_hash
  results['results'] << cutoff_results

  #########################
  # FOR EACH SET:
  #########################
  # the Struct is built from the headers of the first file read for this
  # cutoff and reused for every other file
  pep_klass = nil
  sets_hash.each do |set, files|
    set_results = {}
    results_sets_hash[set] = set_results

    # assumes the indices are the same into each data file

    # get the complete set of passing hits
    all_passing_hits = files.inject([]) do |collected, file|
      hash = YAML.load_file(file)

      header_hash = hash['headers']
      pep_klass ||= Struct.new(*(header_hash.map {|v| v.to_sym }))
      hits = hash['data'].map {|v| pep_klass.new(*v) }

      passing_hits =
        if cutoff
          hits.select {|hit| hit.qvalue <= cutoff }
        else
          hits
        end
      collected.push(*passing_hits)
    end


    # create an index from aaseq to hits
    seq_to_hits = Hash.new {|h,k| h[k] = []}
    uniq_seqcharge = Set.new
    all_passing_hits.each do |hit|
      seq_to_hits[hit.sequence] << hit
      uniq_seqcharge.add( hit.sequence + '_' + hit.charge.to_s )
    end


    # determine the number of uniq aaseqs
    uniq_seqs = seq_to_hits.size

    num_uniq_seqcharges = uniq_seqcharge.size

    set_results.merge!( { 'num_peptide_hits' => all_passing_hits.size,
                          'num_uniq_aaseqs' => uniq_seqs,
                          'num_uniq_aaseqs_charge' => num_uniq_seqcharges,
    })

    if opt[:proteins]

      # create an index from proteins to peptides
      prots_to_peps = Hash.new {|h,k| h[k] = [] }
      peptides_not_found = []
      seq_to_hits.keys.each do |seq|
        if pep_db.key?(seq)
          pep_db[seq].split('-').each do |prot|
            prots_to_peps[prot] << seq
          end
        else
          peptides_not_found << seq
        end
      end

      # Determine the number of 1) hits, 2) aaseqs, 3) aaseqcharges per protein BEFORE minimization
      stats_per_protein_before = stats_per_prot(prots_to_peps, seq_to_hits)

      # get the minimal protein set
      (prot_to_uniq_peps_hash, indistinguishable_protein_hash) = minimal_protein_set(prots_to_peps) do |prot,peps|
        # sort key: number of aaseqs, their total length, then the negated
        # protein length from the fasta file
        [ peps.size, peps.inject(0){|m,s| m+s.size}, -(prot_sizes_hash[prot])]
      end

      # record the header info of every reported protein
      prot_to_uniq_peps_hash.each do |prot, peps|
        [prot, *indistinguishable_protein_hash[prot]].each do |id|
          protein_info[id] = prot_header_hash[id]
        end
      end

      stats_per_protein_minimal = stats_per_prot(prot_to_uniq_peps_hash, seq_to_hits)

      # create a hash of data for each protein
      protein_data_hashes_hash = {}
      prot_to_uniq_peps_hash.each do |prot, peps|
        protein_data_hashes_hash[prot] = {
          'aaseqs' => peps,
          # this will be a triplet
          'num_hits_minimal' => stats_per_protein_minimal[prot],
          'indistinguishable' => indistinguishable_protein_hash[prot],
          'num_hits_all' => stats_per_protein_before[prot],
        }
      end

      set_results['proteins'] = protein_data_hashes_hash
      set_results['num_proteins'] = prot_to_uniq_peps_hash.size
      set_results['num_aaseqs_not_in_pep_db'] = peptides_not_found.size
      if peptides_not_found.size > 0
        warn "Did not find in peptide centric db: #{peptides_not_found.join(', ')}"
      end
    end
  end
end

File.open(opt[:outfile], 'w') do |out|
  out.print results.to_yaml
end
|
365
|
+
|
366
|
+
|
@@ -0,0 +1,72 @@
|
|
1
|
+
#!/usr/bin/ruby -w

# Pools the hits of one or more precision yml files and reports the number
# of peptide hits, unique aaseqs and unique aaseq+charge combinations.

require 'set'
require 'yaml'
require 'optparse'

opt = {}
opts = OptionParser.new do |op|
  op.banner = "usage: #{File.basename(__FILE__)} <precision_file>.yml ..."
  op.separator "outputs information collected by combining hits from files:"
  op.separator "---"
  op.separator "filenames: "
  op.separator "- <pathgiven>"
  op.separator "num_unique_aaseqs: <Int>"
  op.separator "num_unique_aaseqs_charge: <Int>"
  op.separator "num_peptide_hits: <Int>"
  op.separator ""
  op.separator "NOTE: if a precision cutoff is given, all hits that have a better"
  op.separator "score than the worst score at the cutoff are included, even if "
  op.separator "the precision for that hit was below the cutoff"
  op.separator "this prevents early, local aberrations in precision from messing"
  op.separator "up the analysis"
  op.separator ""
  op.on("-p", "--precision <0-1>", Float, "precision cutoff") {|v| opt[:cutoff] = v }
  op.on("-f", "--fdr <0-1>", Float, "false discovery rate cutoff (1-precision)") {|v| opt[:cutoff] = 1.0 - v }
end

opts.parse!

if ARGV.size == 0
  puts opts.to_s
  exit
end

unique_sequences = Set.new
unique_ions = Set.new
all_hits = []

ARGV.each do |file|
  hash = YAML.load_file(file)

  headers = hash['headers']
  prec_index = headers.index('precision')
  mowse_index = headers.index('mowse')
  aaseq_index = headers.index('aaseq')
  charge_index = headers.index('charge')

  # NOTE(review): assumes the rows are stored under 'data', one array per
  # hit in 'headers' order -- confirm against a real precision file.
  hits = hash['data']

  # Selection as described in the NOTE of the usage text: find the worst
  # score among the hits that meet the precision cutoff, then keep every
  # hit scoring at least that well.
  # NOTE(review): assumes a higher mowse score is better -- confirm.
  above_cutoff =
    if opt[:cutoff]
      passing_scores = hits.select {|ar| ar[prec_index] >= opt[:cutoff] }.map {|ar| ar[mowse_index] }
      if passing_scores.empty?
        []
      else
        worst_score = passing_scores.min
        hits.select {|ar| ar[mowse_index] >= worst_score }
      end
    else
      hits
    end

  all_hits.concat(above_cutoff)

  above_cutoff.each do |ar|
    sequence = ar[aaseq_index]
    # to_s: the charge may be stored as a number
    seq_plus_charge = sequence + ar[charge_index].to_s
    unique_sequences.add sequence
    unique_ions.add seq_plus_charge
  end
end

prec_k = 'precision cutoff'
fn_k = 'filenames'
uniq_aaseq_k = 'num unique aaseqs'
uniq_ions_k = 'num unique aaseqs+charge'
num_hits_k = 'num peptide hits'

# the order in which the result lines are printed
order = [fn_k, prec_k, num_hits_k, uniq_ions_k, uniq_aaseq_k]

results = {}
results[fn_k] = '[' + ARGV.join(", ") + ']'
results[prec_k] = opt[:cutoff]
results[uniq_aaseq_k] = unique_sequences.size
results[uniq_ions_k] = unique_ions.size
results[num_hits_k] = all_hits.size

order.each do |key|
  puts "#{key}: #{results[key]}"
end
|