mspire 0.1.7 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +41 -14
- data/bin/bioworks2excel.rb +1 -1
- data/bin/bioworks_to_pepxml.rb +46 -59
- data/bin/fasta_shaker.rb +1 -1
- data/bin/filter.rb +6 -0
- data/bin/find_aa_freq.rb +23 -0
- data/bin/id_precision.rb +3 -2
- data/bin/mzxml_to_lmat.rb +2 -1
- data/bin/pepproph_filter.rb +1 -1
- data/bin/precision.rb +1 -1
- data/bin/protein_summary.rb +2 -451
- data/bin/raw_to_mzXML.rb +55 -0
- data/bin/srf_group.rb +26 -0
- data/changelog.txt +7 -0
- data/lib/align.rb +3 -3
- data/lib/fasta.rb +6 -1
- data/lib/gi.rb +9 -4
- data/lib/roc.rb +2 -0
- data/lib/sample_enzyme.rb +2 -1
- data/lib/spec/mzxml/parser.rb +2 -43
- data/lib/spec/mzxml.rb +65 -2
- data/lib/spec_id/aa_freqs.rb +10 -7
- data/lib/spec_id/bioworks.rb +67 -87
- data/lib/spec_id/filter.rb +794 -0
- data/lib/spec_id/precision.rb +29 -36
- data/lib/spec_id/proph.rb +5 -3
- data/lib/spec_id/protein_summary.rb +459 -0
- data/lib/spec_id/sequest.rb +323 -271
- data/lib/spec_id/srf.rb +189 -135
- data/lib/spec_id.rb +276 -227
- data/lib/spec_id_xml.rb +101 -0
- data/lib/toppred.rb +18 -0
- data/script/degenerate_peptides.rb +47 -0
- data/script/filter-peps.rb +5 -1
- data/test/tc_align.rb +1 -1
- data/test/tc_bioworks.rb +25 -22
- data/test/tc_bioworks_to_pepxml.rb +37 -4
- data/test/tc_fasta.rb +3 -1
- data/test/tc_fasta_shaker.rb +8 -6
- data/test/tc_filter.rb +203 -0
- data/test/tc_gi.rb +6 -9
- data/test/tc_id_precision.rb +31 -0
- data/test/tc_mzxml.rb +8 -6
- data/test/tc_peptide_parent_times.rb +2 -1
- data/test/tc_precision.rb +1 -1
- data/test/tc_proph.rb +5 -5
- data/test/tc_protein_summary.rb +36 -13
- data/test/tc_sequest.rb +78 -33
- data/test/tc_spec_id.rb +128 -6
- data/test/tc_srf.rb +84 -38
- metadata +67 -62
- data/bin/fasta_cat.rb +0 -39
- data/bin/fasta_cat_mod.rb +0 -59
- data/bin/fasta_mod.rb +0 -57
- data/bin/filter_spec_id.rb +0 -365
- data/bin/raw2mzXML.rb +0 -21
- data/script/gen_database_searching.rb +0 -258
data/bin/protein_summary.rb
CHANGED
@@ -1,455 +1,6 @@
|
|
1
1
|
#!/usr/bin/ruby -w
|
2
2
|
|
3
|
-
require '
|
4
|
-
require 'hash_by'
|
5
|
-
require 'optparse'
|
6
|
-
require 'ostruct'
|
7
|
-
require 'spec_id'
|
8
|
-
|
9
|
-
#############################################################
|
10
|
-
# GLOBALS:
|
11
|
-
PRECISION_PROGRAM_BASE = 'precision'
|
12
|
-
DEF_PREFIX = "INV_"
|
13
|
-
DEF_PERCENT_FP = "5.0"
|
14
|
-
#############################################################
|
15
|
-
|
16
|
-
|
17
|
-
# @TODO: add group probability title (showin all group probabilities) for protein prob
|
18
|
-
|
19
|
-
#class String
|
20
|
-
# def margin
|
21
|
-
# self.gsub(/^\s*\|/,'')
|
22
|
-
# end
|
23
|
-
#end
|
24
|
-
|
25
|
-
|
26
|
-
class Runner
|
27
|
-
module HTML
|
28
|
-
def header
|
29
|
-
%Q{<html>
|
30
|
-
<head>
|
31
|
-
#{style}
|
32
|
-
</head>
|
33
|
-
<body>
|
34
|
-
<script type="text/javascript">
|
35
|
-
<!--
|
36
|
-
function toggle_vis(id) {
|
37
|
-
var e = document.getElementById(id);
|
38
|
-
if(e.style.display == 'none')
|
39
|
-
e.style.display = 'block';
|
40
|
-
else
|
41
|
-
e.style.display = 'none';
|
42
|
-
}
|
43
|
-
//-->
|
44
|
-
</script>
|
45
|
-
}
|
46
|
-
end
|
47
|
-
|
48
|
-
def style
|
49
|
-
'
|
50
|
-
<style type="text/css">
|
51
|
-
table {
|
52
|
-
border-width:1px;
|
53
|
-
border-color:#DDDDDD;
|
54
|
-
border-collapse: collapse;
|
55
|
-
}
|
56
|
-
td,th {
|
57
|
-
padding-top: 2px;
|
58
|
-
padding-bottom: 2px;
|
59
|
-
padding-left: 5;
|
60
|
-
padding-right: 5;
|
61
|
-
}
|
62
|
-
td.redline {
|
63
|
-
background-color: #FF0000;
|
64
|
-
color: #FFFFFF
|
65
|
-
}
|
66
|
-
div.file_info, div.software, div.fppr, div.num_proteins{
|
67
|
-
margin-left: 20px;
|
68
|
-
margin-top: 20px;
|
69
|
-
}
|
70
|
-
div.main {
|
71
|
-
margin-left: 10px;
|
72
|
-
margin-right: 10px;
|
73
|
-
margin-top: 50px;
|
74
|
-
margin-bottom: 50px;
|
75
|
-
}
|
76
|
-
div#error {
|
77
|
-
margin: 30px;
|
78
|
-
text-align:center
|
79
|
-
}
|
80
|
-
hr {color: sienna}
|
81
|
-
body { font-size: 8pt; font-family: Arial,Helvetica,Times}
|
82
|
-
</style>
|
83
|
-
'
|
84
|
-
end
|
85
|
-
|
86
|
-
# an anchor and a title
|
87
|
-
def at(display, title)
|
88
|
-
"<a title=\"#{title}\">#{display}</a>"
|
89
|
-
end
|
90
|
-
|
91
|
-
def trailer
|
92
|
-
%q{
|
93
|
-
</body>
|
94
|
-
</html>
|
95
|
-
}
|
96
|
-
end
|
97
|
-
|
98
|
-
def tr
|
99
|
-
"|<tr>
|
100
|
-
| #{yield}
|
101
|
-
|</tr>\n".margin
|
102
|
-
end
|
103
|
-
|
104
|
-
def table
|
105
|
-
"|<div class=\"main\"><table align=\"center\" border=\"1\" style=\"font-size:100%\" width=\"800px\">
|
106
|
-
| #{yield}
|
107
|
-
|</table></div>\n".margin
|
108
|
-
end
|
109
|
-
|
110
|
-
def tds(arr)
|
111
|
-
arr.map {|v| "<td>#{v}</td>"}.join
|
112
|
-
end
|
113
|
-
|
114
|
-
def ths(arr)
|
115
|
-
str = arr.map {|v| "<th>#{v}</th>"}.join
|
116
|
-
str << "\n"
|
117
|
-
end
|
118
|
-
end
|
119
|
-
|
120
|
-
end
|
121
|
-
|
122
|
-
|
123
|
-
class Runner
|
124
|
-
|
125
|
-
include Runner::HTML
|
126
|
-
|
127
|
-
def ref_html(gi, name)
|
128
|
-
"<a href=\"http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi?db=protein&val=#{gi}\" title=\"#{name}\">#{gi}</a>"
|
129
|
-
end
|
130
|
-
|
131
|
-
# Takes the -prot.xml filename and grabs the png file (if available)
|
132
|
-
def error_info(prot_file_name)
|
133
|
-
img = prot_file_name.gsub('.xml', '.png')
|
134
|
-
img_bn = File.basename(img)
|
135
|
-
"<div id=\"error\"><img src=\"#{img_bn}\" alt=\"[ Optional: To view error/sensitivity image, put #{img_bn} in the same directory as #{File.basename(prot_file_name)} ]\"/>\n</div>"
|
136
|
-
end
|
137
|
-
|
138
|
-
# attempts to get the NCBI gi code
|
139
|
-
def accession(name)
|
140
|
-
if (name.include? '|') && (name[0,3] == 'gi|')
|
141
|
-
name.split('|')[1]
|
142
|
-
else
|
143
|
-
name
|
144
|
-
end
|
145
|
-
end
|
146
|
-
|
147
|
-
def prefix_to_regex(prefix)
|
148
|
-
if prefix
|
149
|
-
/^#{Regexp.escape(prefix)}/
|
150
|
-
else
|
151
|
-
nil
|
152
|
-
end
|
153
|
-
end
|
154
|
-
|
155
|
-
# given a list of proteins, output a tab delimited textfile with protein
|
156
|
-
# name and the total number of peptides found
|
157
|
-
def output_peptide_counts_file(prots, filename)
|
158
|
-
File.open(filename, "w") do |fh_out|
|
159
|
-
prots.each do |prot|
|
160
|
-
fh_out.puts [prot._protein_name, prot._total_number_peptides].join("\t")
|
161
|
-
end
|
162
|
-
end
|
163
|
-
end
|
164
|
-
|
165
|
-
# filters on the false positive regex and sorts by prot probability
|
166
|
-
def filter_and_sort(uniq_prots, prefix=nil)
|
167
|
-
prefix_re = prefix_to_regex(prefix)
|
168
|
-
sorted = uniq_prots.sort_by {|prt| [prt._probability, prt.parent._probability]}.reverse
|
169
|
-
## filter on prefix
|
170
|
-
if prefix
|
171
|
-
sorted = sorted.reject {|prot| prot.reference =~ prefix_re }
|
172
|
-
end
|
173
|
-
sorted
|
174
|
-
end
|
175
|
-
|
176
|
-
# assumes that these are sorted on probability
|
177
|
-
# desired_fppr is a float
|
178
|
-
# returns [number_of_prots, actual_fppr]
|
179
|
-
def num_prots_above_fppr(prots, desired_fppr)
|
180
|
-
current_fppr_rate_percent = 0.0
|
181
|
-
previous_fppr_rate_percent = 0.0
|
182
|
-
current_sum_one_minus_prob = 0.0
|
183
|
-
proteins_within_fppr = 0
|
184
|
-
actual_fppr = nil
|
185
|
-
already_found = false
|
186
|
-
prot_cnt = 0
|
187
|
-
prots.each do |prot|
|
188
|
-
prot_cnt += 1
|
189
|
-
# SUM(1-probX)/#prots
|
190
|
-
current_sum_one_minus_prob += 1.0 - prot._probability.to_f
|
191
|
-
current_fppr_rate_percent = (current_sum_one_minus_prob / prot_cnt) * 100
|
192
|
-
|
193
|
-
if current_fppr_rate_percent > desired_fppr && !already_found
|
194
|
-
actual_fppr = previous_fppr_rate_percent
|
195
|
-
proteins_within_fppr = prot_cnt
|
196
|
-
already_found = true
|
197
|
-
end
|
198
|
-
previous_fppr_rate_percent = current_fppr_rate_percent
|
199
|
-
end
|
200
|
-
[proteins_within_fppr, actual_fppr]
|
201
|
-
end
|
202
|
-
|
203
|
-
#### #readable_previous_fppr_rate_percent = sprintf("%.2f", previous_fppr_rate_percent)
|
204
|
-
|
205
|
-
# returns a string of the table rows
|
206
|
-
# false_positive_rate (give as a %) is the cutoff mark
|
207
|
-
# returns the number of proteins at the desired_fppr (if given)
|
208
|
-
def table_rows(uniq_prots, prefix, false_positive_rate_percent, num_cols, desired_fppr, actual_percent_fp, peptide_count_filename=nil)
|
209
|
-
prot_cnt = 0
|
210
|
-
uniq_prots.map do |prot|
|
211
|
-
tr do
|
212
|
-
prot_cnt += 1
|
213
|
-
gi = accession(prot._protein_name)
|
214
|
-
tds([prot_cnt, prot._probability, ref_html(gi, prot._protein_name), prot.annotation.first._protein_description, prot._percent_coverage, peptide_cell(prot_cnt, prot._unique_stripped_peptides.split('+')), prot._total_number_peptides, prot._pct_spectrum_ids])
|
215
|
-
end
|
216
|
-
end.join
|
217
|
-
end
|
218
|
-
|
219
|
-
def print_html_pieces(file, *pieces)
|
220
|
-
File.open(file, "w") do |out|
|
221
|
-
pieces.each do |piece|
|
222
|
-
out.print piece
|
223
|
-
end
|
224
|
-
end
|
225
|
-
end
|
226
|
-
|
227
|
-
def file_info(file)
|
228
|
-
"<div class=\"file_info\"><h3>Source File Information</h3>File: #{File.expand_path(file)}
|
229
|
-
<br/>Last Modified: #{File.mtime(file)}
|
230
|
-
<br/>Size: #{File.size(file)/1000} KB
|
231
|
-
</div>"
|
232
|
-
end
|
233
|
-
|
234
|
-
def bioworks_script_info(obj)
|
235
|
-
version = "3.2??"
|
236
|
-
if obj.version
|
237
|
-
version = obj.version
|
238
|
-
end
|
239
|
-
script_info{"Bioworks version #{version}"}
|
240
|
-
end
|
241
|
-
|
242
|
-
def protproph_script_info
|
243
|
-
begin
|
244
|
-
where = `which xinteract`
|
245
|
-
reply = `#{where}`
|
246
|
-
rescue Exception
|
247
|
-
reply = ""
|
248
|
-
end
|
249
|
-
prophet = "TPP (version unknown)" # put your version here if you can't get it dynamically
|
250
|
-
if reply =~ /xinteract.*?\((TPP .*)\)/
|
251
|
-
prophet = $1.dup
|
252
|
-
end
|
253
|
-
script_info { "ProteinProphet from: #{prophet}" }
|
254
|
-
end
|
255
|
-
|
256
|
-
def mspire_version
|
257
|
-
string = "mspire"
|
258
|
-
begin
|
259
|
-
if `gem list --local mspire` =~ /mspire \((.*?)\)/
|
260
|
-
string << (" v" + $1)
|
261
|
-
end
|
262
|
-
rescue Exception
|
263
|
-
end
|
264
|
-
string
|
265
|
-
end
|
266
|
-
|
267
|
-
def script_info
|
268
|
-
"<div class=\"software\"><h3>Software Information</h3>#{yield}<br/>Ruby package: #{mspire_version}<br/>Command: #{[File.basename(__FILE__), *@orig_argv].join(" ")}</div>"
|
269
|
-
end
|
270
|
-
|
271
|
-
def proph_output(file, outfn, opt, fppr_output_as_html)
|
272
|
-
header_anchors = [at('#', 'number'), at('prob','protein probability (for Prophet, higher is better)'), at('ref', 'gi number if available (or complete reference)'), at('annotation', 'annotation from the fasta file'), at('%cov', 'percent of protein sequence covered by corresponding peptides'), at('peps', 'unique peptides identified (includes non-contributing peptides). Click number to show/hide'), at('#peps', 'total number of corresponding peptides that contributed to protein probability'), at('%ids', 'fraction of correct dataset peptide identifications corresponding to protein')]
|
273
|
-
num_cols = header_anchors.size
|
274
|
-
theaders = ths(header_anchors)
|
275
|
-
|
276
|
-
root = AXML.parse_file(file)
|
277
|
-
prots = []
|
278
|
-
## find the min_prob at a fppr of XX
|
279
|
-
min_prob_redline = 1.01 # if no fppr is less than what they give, then all are redlined!
|
280
|
-
|
281
|
-
if opt.c
|
282
|
-
actual_percent_fp = opt.c.to_f
|
283
|
-
elsif opt.cut_at
|
284
|
-
actual_percent_fp = opt.cut_at.to_f
|
285
|
-
else
|
286
|
-
actual_percent_fp = nil
|
287
|
-
end
|
288
|
-
root.protein_group.each do |group|
|
289
|
-
group.protein.each do |prt|
|
290
|
-
prots << prt
|
291
|
-
end
|
292
|
-
end
|
293
|
-
uniq_prots = prots.hash_by(:_protein_name).map{|name,prot_arr| prot_arr.first }
|
294
|
-
filtered_sorted_prots = filter_and_sort(uniq_prots, opt.f)
|
295
|
-
|
296
|
-
## num proteins above cutoff (if opt.c)
|
297
|
-
num_prots_html = ''
|
298
|
-
if opt.c || opt.cut_at
|
299
|
-
(num_prots, actual_fppr) = num_prots_above_fppr(filtered_sorted_prots, actual_percent_fp)
|
300
|
-
num_prots_html = num_prots_to_html(actual_percent_fp, actual_fppr, num_prots)
|
301
|
-
end
|
302
|
-
if opt.cut_at
|
303
|
-
filtered_sorted_prots = filtered_sorted_prots[0,num_prots]
|
304
|
-
end
|
305
|
-
|
306
|
-
output_peptide_counts_file(filtered_sorted_prots, opt.peptide_count) if opt.peptide_count
|
307
|
-
|
308
|
-
table_string = table do
|
309
|
-
tr{theaders} + table_rows(filtered_sorted_prots, opt.f, actual_percent_fp, num_cols, opt.c.to_f, actual_percent_fp, opt.peptide_count)
|
310
|
-
end
|
311
|
-
er_info = opt.precision ? error_info(file) : ""
|
312
|
-
html_pieces = [outfn, header, fppr_output_as_html, er_info, file_info(file), protproph_script_info, num_prots_html, table_string, trailer]
|
313
|
-
print_html_pieces(*html_pieces)
|
314
|
-
end # proph_output
|
315
|
-
|
316
|
-
# given a list of peptide sequences creates javascript to hide/show them
|
317
|
-
def peptide_cell(prot_num, peptide_sequences)
|
318
|
-
"<a href=\"#prot#{prot_num}\" onclick=\"toggle_vis('#{prot_num}');\">#{peptide_sequences.size}</a><div id=\"#{prot_num}\" style=\"display:none;\">#{peptide_sequences.join(', ')}</div>"
|
319
|
-
end
|
320
|
-
|
321
|
-
def bioworks_output(file, outfn, opt, fppr_output_as_html)
|
322
|
-
header_anchors = [at('#', 'number'), at('prob','protein probability (for Bioworks, lower is better)'), at('ref', 'gi number if available (or complete reference)'), at('annotation', 'annotation from the fasta file'), at('%cov', 'percent of protein sequence covered by corresponding peptides'), at('peps', 'unique peptides identified (at any confidence) Click number to show/hide.'), at('#peps', 'total number of peptides seen (not unique)')]
|
323
|
-
num_cols = header_anchors.size
|
324
|
-
theaders = ths(header_anchors)
|
325
|
-
bio_obj = SpecID.new(file)
|
326
|
-
proteins = bio_obj.prots
|
327
|
-
protein_num = 0
|
328
|
-
rows = ""
|
329
|
-
prefix_re = prefix_to_regex(opt.f)
|
330
|
-
proteins.each do |prot|
|
331
|
-
if opt.f && prot.reference =~ prefix_re
|
332
|
-
next
|
333
|
-
end
|
334
|
-
uniq_peps = Hash.new {|h,k| h[k] = true; }
|
335
|
-
protein_num += 1
|
336
|
-
prot.peps.each do |pep|
|
337
|
-
uniq_peps[pep.sequence.split('.')[1]] = true
|
338
|
-
end
|
339
|
-
pieces = prot.reference.split(' ')
|
340
|
-
long_prot_name = pieces.shift
|
341
|
-
annotation = pieces.join(' ')
|
342
|
-
accession = prot.accession
|
343
|
-
if accession == '0' ; accession = long_prot_name end
|
344
|
-
rows << tr{ tds([protein_num, prot.protein_probability, ref_html(accession, long_prot_name), annotation, prot.coverage, peptide_cell(protein_num, uniq_peps.keys), prot.peps.size]) }
|
345
|
-
end
|
346
|
-
table_string = table do
|
347
|
-
tr{theaders} + rows
|
348
|
-
end
|
349
|
-
print_html_pieces(outfn, header, fppr_output_as_html, file_info(file), bioworks_script_info(bio_obj), table_string, trailer)
|
350
|
-
end # bioworks_output
|
351
|
-
|
352
|
-
def num_prots_to_html(desired_cutoff, actual_cutoff, num_proteins)
|
353
|
-
actual_cutoff = sprintf("%.3f", actual_cutoff)
|
354
|
-
desired_cutoff = sprintf("%.3f", desired_cutoff)
|
355
|
-
"<div class=\"num_proteins\"><h3>False Positive Rate Information</h3>
|
356
|
-
Desired FPR: #{desired_cutoff} %<br/>
|
357
|
-
Actual FPR: #{actual_cutoff} %<br/>
|
358
|
-
Number of Proteins at Actual FPR: #{num_proteins}
|
359
|
-
</div>"
|
360
|
-
end
|
361
|
-
|
362
|
-
# transforms the output string of file_as_decoy into html
|
363
|
-
def file_as_decoy_to_html(string)
|
364
|
-
lines = string.split("\n")
|
365
|
-
#puts lines ?? is this supposed to be commented out?
|
366
|
-
lines = lines.reject do |obj| obj =~ /\*{10}/ end
|
367
|
-
lines.map! do |line| "#{line}<br/>" end
|
368
|
-
"<div class=\"fppr\">
|
369
|
-
<h3>Classification Analysis</h3>
|
370
|
-
#{lines.join("\n")}
|
371
|
-
</div>"
|
372
|
-
end
|
373
|
-
|
374
|
-
# transforms the output string of file_as_decoy into html
|
375
|
-
def prefix_as_decoy_to_html(string)
|
376
|
-
"<div class=\"fppr\">
|
377
|
-
<h3>Classification Analysis</h3>
|
378
|
-
</div>" +
|
379
|
-
string
|
380
|
-
end
|
381
|
-
|
382
|
-
def go(argv)
|
383
|
-
@orig_argv = argv.dup
|
384
|
-
dup_argv = argv.dup
|
385
|
-
|
386
|
-
opt = OpenStruct.new
|
387
|
-
opt.f = DEF_PREFIX
|
388
|
-
opts = OptionParser.new do |op|
|
389
|
-
op.banner = "usage: #{File.basename(__FILE__)} [options] <file>.xml ..."
|
390
|
-
op.separator " where file = bioworks -or- <run>-prot (prophet output)"
|
391
|
-
op.separator " outputs: <file>.summary.html"
|
392
|
-
op.separator ""
|
393
|
-
op.on("-f", "--false <prefix>", "ignore proteins with prefix (def: #{DEF_PREFIX})") {|v| opt.f = v }
|
394
|
-
op.on("-p", "--precision", "include the output from precision.rb") {|v| opt.p = v }
|
395
|
-
op.separator(" if --precision then -f is used to specify a file or prefix")
|
396
|
-
op.separator(" that indicates the false positives.")
|
397
|
-
op.on("--peptide_count <filename>", "outputs text file with # peptides per protein") {|v| opt.peptide_count = v}
|
398
|
-
op.separator ""
|
399
|
-
op.separator "Options for #{PRECISION_PROGRAM_BASE}.rb :"
|
400
|
-
op.on("--#{PRECISION_PROGRAM_BASE}", "include output of #{PRECISION_PROGRAM_BASE}.rb,") {|v| opt.precision = v}
|
401
|
-
op.separator(" type '#{PRECISION_PROGRAM_BASE}.rb' for details")
|
402
|
-
op.separator ""
|
403
|
-
op.separator "Specific to ProteinProphet (with no concatenated DB):"
|
404
|
-
op.on("-c", "--cutoff percent", "includes FPR summary at given cutoff") {|v| opt.c = v }
|
405
|
-
op.on("--cut_at percent", "only reports proteins within FPR percent") {|v| opt.cut_at = v }
|
406
|
-
end
|
407
|
-
|
408
|
-
opts.parse!
|
409
|
-
|
410
|
-
if argv.size < 1
|
411
|
-
puts opts
|
412
|
-
exit
|
413
|
-
end
|
414
|
-
|
415
|
-
fppr_output_as_html = ''
|
416
|
-
files = argv.to_a
|
417
|
-
files.each do |file|
|
418
|
-
outfn = file.gsub(/\.xml$/, '.summary.html')
|
419
|
-
## False Positive Rate Calculation:
|
420
|
-
if opt.precision
|
421
|
-
opt.o = outfn # won't actually be written over, but used
|
422
|
-
to_use_argv = create_precision_argv(file, opt)
|
423
|
-
(out_string, opt) = SpecID::Precision.new.precision(to_use_argv)
|
424
|
-
fppr_output_as_html = prefix_as_decoy_to_html(out_string)
|
425
|
-
end
|
426
|
-
|
427
|
-
case SpecID.file_type(file)
|
428
|
-
when "protproph"
|
429
|
-
proph_output(file, outfn, opt, fppr_output_as_html)
|
430
|
-
when "bioworks"
|
431
|
-
bioworks_output(file, outfn, opt, fppr_output_as_html)
|
432
|
-
else
|
433
|
-
abort "filetype for #{file} not recognized!"
|
434
|
-
end
|
435
|
-
end
|
436
|
-
|
437
|
-
end # method go
|
438
|
-
|
439
|
-
def create_precision_argv(file, opt)
|
440
|
-
# include only those options specific
|
441
|
-
new_argv = [file]
|
442
|
-
if opt.f ; new_argv << '-f' << opt.f end
|
443
|
-
if opt.o ; new_argv << '-o' << opt.o end
|
444
|
-
new_argv
|
445
|
-
end
|
446
|
-
|
447
|
-
end # Runner
|
448
|
-
|
449
|
-
##################################################################
|
450
|
-
# MAIN
|
451
|
-
##################################################################
|
452
|
-
|
453
|
-
Runner.new.go(ARGV)
|
3
|
+
require 'spec_id/protein_summary'
|
454
4
|
|
5
|
+
ProteinSummary.new.create_from_command_line_args(ARGV)
|
455
6
|
|
data/bin/raw_to_mzXML.rb
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
#!/usr/bin/ruby -w
|
2
|
+
|
3
|
+
require 'optparse'
|
4
|
+
require 'spec/mzxml'
|
5
|
+
require 'fileutils'
|
6
|
+
|
7
|
+
# Command-line driver: converts each .RAW file named on the command line to
# mzXML, using whichever converter Spec::MzXML.find_mzxml_converter reports.
progname = File.basename(__FILE__)

opt = {}
opts = OptionParser.new do |op|
  op.banner = "usage: #{progname} [OPTIONS] <file>.RAW ..."
  op.separator ""
  op.on("-p", "--profile", "uses profile output instead of centroid (default)") {|v| opt[:profile] = v}
end

# removes recognized options from ARGV, leaving only the input filenames
opts.parse!

if ARGV.size == 0
  puts opts
  exit
end

converter = Spec::MzXML.find_mzxml_converter
if converter
  $stderr.puts "using #{converter} to convert files"
else
  puts "cannot find [#{Spec::MzXML::Potential_mzxml_converters.join(', ')}] in the paths:"
  puts ENV['PATH'].split(/[:;]/).join(", ")
  abort
end

files = ARGV.to_a
files.each do |file|
  puts "******************************************"
  puts "Converting: #{file}"
  if converter =~ /readw/
    # command line is built as: <converter> <input> <c|p> <output>
    centroid_or_profile = opt[:profile] ? 'p' : 'c'
    outfile = file.sub(/\.RAW$/i, '.mzXML')
    cmd = "#{converter} #{file} #{centroid_or_profile} #{outfile}"
    puts "Performing: '#{cmd}'"
    puts `#{cmd}`
  else
    ## t2x only outputs in cwd!
    # Build the command here: previously `cmd` was only assigned in the readw
    # branch, so it was nil in this branch and the empty command raised before
    # the converter ever ran.  (--profile is not passed to this converter.)
    cmd = "#{converter} #{File.basename(file)}"
    Dir.chdir(File.dirname(file)) do |dir|
      puts "Performing: '#{cmd}' in #{dir}"
      system cmd
    end
  end
  puts "******************************************"
end
|
data/bin/srf_group.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
|
4
|
+
require 'optparse'
|
5
|
+
require 'spec_id/srf'
|
6
|
+
|
7
|
+
# Default output filename; replaced by the argument to -o/--output.
$OUTFILE = 'bioworks.srg'

opts = OptionParser.new do |op|
  op.banner = "usage: #{File.basename(__FILE__)} <file1>.srf <file2>.srf ..."
  op.separator "outputs: 'bioworks.srg'"
  op.separator ""
  op.separator "   A '.srg' file is an ascii text file with a list"
  op.separator "   of the srf files (full path names) in that group."
  op.separator ""
  # store the requested name (the handler used to discard its argument)
  op.on('-o', '--output <filename>', 'a different output name') {|v| $OUTFILE = v }
end

# Strip recognized options out of ARGV so that only the .srf filenames remain;
# without this call '-o <name>' would be treated as two input files.
opts.parse!

if ARGV.size == 0
  puts opts
  # nothing to group: stop here rather than writing an empty .srg file
  exit
end

obj = SRFGroup.new
obj.filenames = ARGV.to_a
obj.to_srg($OUTFILE)
|
26
|
+
|
data/changelog.txt
CHANGED
@@ -1,4 +1,6 @@
|
|
1
1
|
|
2
|
+
## version 0.1.7
|
3
|
+
|
2
4
|
1. A couple of scripts and subroutines were hashing peptides but not on the file
|
3
5
|
basename. This would result in slightly incorrect results (any time there
|
4
6
|
were overlapping scan numbers in multiple datasets, only the top one would be
|
@@ -31,4 +33,9 @@ Rate' and 'FPR' from the package. It's been suggested that FP/(TP+FP) be
|
|
31
33
|
called the False Positive Predictive Rate (FPPR). I will probably implement
|
32
34
|
this in a future release.
|
33
35
|
|
36
|
+
## version 0.2.0
|
34
37
|
|
38
|
+
** This is a definite code breaker **
|
39
|
+
Revamped the way SpecID works (it is now subclassed). Since I want to return
|
40
|
+
the specific object that the file specifies, I use 'create' now instead of
|
41
|
+
'new' (which forces one to return *that* class).
|
data/lib/align.rb
CHANGED
@@ -24,12 +24,12 @@ class Align
|
|
24
24
|
scanindex_by_basename_noext[runindex.basename_noext] = runindex.scans_by_num
|
25
25
|
end
|
26
26
|
|
27
|
-
dta_filenames =
|
27
|
+
dta_filenames = Proph::Pep::Parser.new.dta_filenames_by_seq_charge(pep_proph_xml, "regex")
|
28
28
|
|
29
|
-
parser =
|
29
|
+
parser = Proph::Prot::Parser.new
|
30
30
|
parser.get_prots_and_peps(prot_xml, prot_prob, pep_init_prob, pep_nsp_prob, "regex")
|
31
31
|
peptides = parser.peps
|
32
|
-
peptides =
|
32
|
+
peptides = Proph::Pep.uniq_by_seqcharge(peptides)
|
33
33
|
## we update each peptide with a list of dtafilenames
|
34
34
|
## then we update with a parallel list of scans (one for each dtafn...
|
35
35
|
## unless there are multiple scans associated with each filename
|
data/lib/fasta.rb
CHANGED
@@ -1,5 +1,8 @@
|
|
1
1
|
require 'sample_enzyme'
|
2
|
+
require 'each_index'
|
2
3
|
|
4
|
+
|
5
|
+
tmp = $VERBOSE ; $VERBOSE = nil
|
3
6
|
class String
|
4
7
|
|
5
8
|
def each_index
|
@@ -21,6 +24,8 @@ class String
|
|
21
24
|
end
|
22
25
|
|
23
26
|
end
|
27
|
+
$VERBOSE = tmp
|
28
|
+
|
24
29
|
|
25
30
|
|
26
31
|
class Fasta
|
@@ -259,9 +264,9 @@ class Fasta
|
|
259
264
|
end
|
260
265
|
|
261
266
|
class Fasta::Prot
|
262
|
-
attr_accessor :header, :aaseq
|
263
267
|
# header given as full line with starting '>' (but no newline chars!).
|
264
268
|
# aaseq also given without any newline chars
|
269
|
+
attr_accessor :header, :aaseq
|
265
270
|
def initialize(header=nil, aaseq=nil)
|
266
271
|
@header = header || ''
|
267
272
|
if aaseq
|
data/lib/gi.rb
CHANGED
@@ -40,19 +40,24 @@ class GI
|
|
40
40
|
BATCH_SIZE = 500
|
41
41
|
# Takes an array of gi numbers and returns an array of annotations.
# The ids are sent to NCBI's esummary service in batches of at most
# BATCH_SIZE ids per request (batch search mode).
# The caller's array is not modified.
# Returns nil if a SocketError is raised while fetching (e.g. no internet
# connection).
def self.gi2annot(list_of_gi_numbers)
annots = []
# consume a private copy so the caller's list is not emptied as a side effect
remaining = list_of_gi_numbers.dup
loop do
# (start, length) form takes exactly BATCH_SIZE ids; the inclusive range
# 0..BATCH_SIZE took one more than the batch size
batch = remaining.slice!(0, BATCH_SIZE)
if batch.size == 0 then break end
string = batch.join(",")
url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=protein&retmode=xml&id=#{string}"
#puts url
begin
open(url) do |handle|
annots.push( *(parse_etool_output(handle)) )
end
rescue SocketError
return nil
end
end
annots
end
|
57
62
|
|
58
63
|
protected
|
data/lib/roc.rb
CHANGED
@@ -90,6 +90,8 @@ end
|
|
90
90
|
# For calculating precision given lists of hits and decoy hits. The hits are
|
91
91
|
# assumed to have false positives within them that can be estimated from the
|
92
92
|
# number of decoy hits at the same rate
|
93
|
+
# NOTE: this class assumes that lower scores are better. Negate your scores
|
94
|
+
# if this is not the case.
|
93
95
|
class DecoyROC < ROC
|
94
96
|
|
95
97
|
# returns the [num_hits, num_tps, precision] as a function of true
|