mspire 0.1.7 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +41 -14
- data/bin/bioworks2excel.rb +1 -1
- data/bin/bioworks_to_pepxml.rb +46 -59
- data/bin/fasta_shaker.rb +1 -1
- data/bin/filter.rb +6 -0
- data/bin/find_aa_freq.rb +23 -0
- data/bin/id_precision.rb +3 -2
- data/bin/mzxml_to_lmat.rb +2 -1
- data/bin/pepproph_filter.rb +1 -1
- data/bin/precision.rb +1 -1
- data/bin/protein_summary.rb +2 -451
- data/bin/raw_to_mzXML.rb +55 -0
- data/bin/srf_group.rb +26 -0
- data/changelog.txt +7 -0
- data/lib/align.rb +3 -3
- data/lib/fasta.rb +6 -1
- data/lib/gi.rb +9 -4
- data/lib/roc.rb +2 -0
- data/lib/sample_enzyme.rb +2 -1
- data/lib/spec/mzxml/parser.rb +2 -43
- data/lib/spec/mzxml.rb +65 -2
- data/lib/spec_id/aa_freqs.rb +10 -7
- data/lib/spec_id/bioworks.rb +67 -87
- data/lib/spec_id/filter.rb +794 -0
- data/lib/spec_id/precision.rb +29 -36
- data/lib/spec_id/proph.rb +5 -3
- data/lib/spec_id/protein_summary.rb +459 -0
- data/lib/spec_id/sequest.rb +323 -271
- data/lib/spec_id/srf.rb +189 -135
- data/lib/spec_id.rb +276 -227
- data/lib/spec_id_xml.rb +101 -0
- data/lib/toppred.rb +18 -0
- data/script/degenerate_peptides.rb +47 -0
- data/script/filter-peps.rb +5 -1
- data/test/tc_align.rb +1 -1
- data/test/tc_bioworks.rb +25 -22
- data/test/tc_bioworks_to_pepxml.rb +37 -4
- data/test/tc_fasta.rb +3 -1
- data/test/tc_fasta_shaker.rb +8 -6
- data/test/tc_filter.rb +203 -0
- data/test/tc_gi.rb +6 -9
- data/test/tc_id_precision.rb +31 -0
- data/test/tc_mzxml.rb +8 -6
- data/test/tc_peptide_parent_times.rb +2 -1
- data/test/tc_precision.rb +1 -1
- data/test/tc_proph.rb +5 -5
- data/test/tc_protein_summary.rb +36 -13
- data/test/tc_sequest.rb +78 -33
- data/test/tc_spec_id.rb +128 -6
- data/test/tc_srf.rb +84 -38
- metadata +67 -62
- data/bin/fasta_cat.rb +0 -39
- data/bin/fasta_cat_mod.rb +0 -59
- data/bin/fasta_mod.rb +0 -57
- data/bin/filter_spec_id.rb +0 -365
- data/bin/raw2mzXML.rb +0 -21
- data/script/gen_database_searching.rb +0 -258
data/bin/protein_summary.rb
CHANGED
@@ -1,455 +1,6 @@
|
|
1
1
|
#!/usr/bin/ruby -w
|
2
2
|
|
3
|
-
require '
|
4
|
-
require 'hash_by'
|
5
|
-
require 'optparse'
|
6
|
-
require 'ostruct'
|
7
|
-
require 'spec_id'
|
8
|
-
|
9
|
-
#############################################################
|
10
|
-
# GLOBALS:
|
11
|
-
PRECISION_PROGRAM_BASE = 'precision'
|
12
|
-
DEF_PREFIX = "INV_"
|
13
|
-
DEF_PERCENT_FP = "5.0"
|
14
|
-
#############################################################
|
15
|
-
|
16
|
-
|
17
|
-
# @TODO: add group probability title (showin all group probabilities) for protein prob
|
18
|
-
|
19
|
-
#class String
|
20
|
-
# def margin
|
21
|
-
# self.gsub(/^\s*\|/,'')
|
22
|
-
# end
|
23
|
-
#end
|
24
|
-
|
25
|
-
|
26
|
-
class Runner
|
27
|
-
module HTML
|
28
|
-
def header
|
29
|
-
%Q{<html>
|
30
|
-
<head>
|
31
|
-
#{style}
|
32
|
-
</head>
|
33
|
-
<body>
|
34
|
-
<script type="text/javascript">
|
35
|
-
<!--
|
36
|
-
function toggle_vis(id) {
|
37
|
-
var e = document.getElementById(id);
|
38
|
-
if(e.style.display == 'none')
|
39
|
-
e.style.display = 'block';
|
40
|
-
else
|
41
|
-
e.style.display = 'none';
|
42
|
-
}
|
43
|
-
//-->
|
44
|
-
</script>
|
45
|
-
}
|
46
|
-
end
|
47
|
-
|
48
|
-
def style
|
49
|
-
'
|
50
|
-
<style type="text/css">
|
51
|
-
table {
|
52
|
-
border-width:1px;
|
53
|
-
border-color:#DDDDDD;
|
54
|
-
border-collapse: collapse;
|
55
|
-
}
|
56
|
-
td,th {
|
57
|
-
padding-top: 2px;
|
58
|
-
padding-bottom: 2px;
|
59
|
-
padding-left: 5;
|
60
|
-
padding-right: 5;
|
61
|
-
}
|
62
|
-
td.redline {
|
63
|
-
background-color: #FF0000;
|
64
|
-
color: #FFFFFF
|
65
|
-
}
|
66
|
-
div.file_info, div.software, div.fppr, div.num_proteins{
|
67
|
-
margin-left: 20px;
|
68
|
-
margin-top: 20px;
|
69
|
-
}
|
70
|
-
div.main {
|
71
|
-
margin-left: 10px;
|
72
|
-
margin-right: 10px;
|
73
|
-
margin-top: 50px;
|
74
|
-
margin-bottom: 50px;
|
75
|
-
}
|
76
|
-
div#error {
|
77
|
-
margin: 30px;
|
78
|
-
text-align:center
|
79
|
-
}
|
80
|
-
hr {color: sienna}
|
81
|
-
body { font-size: 8pt; font-family: Arial,Helvetica,Times}
|
82
|
-
</style>
|
83
|
-
'
|
84
|
-
end
|
85
|
-
|
86
|
-
# an anchor and a title
|
87
|
-
def at(display, title)
|
88
|
-
"<a title=\"#{title}\">#{display}</a>"
|
89
|
-
end
|
90
|
-
|
91
|
-
def trailer
|
92
|
-
%q{
|
93
|
-
</body>
|
94
|
-
</html>
|
95
|
-
}
|
96
|
-
end
|
97
|
-
|
98
|
-
def tr
|
99
|
-
"|<tr>
|
100
|
-
| #{yield}
|
101
|
-
|</tr>\n".margin
|
102
|
-
end
|
103
|
-
|
104
|
-
def table
|
105
|
-
"|<div class=\"main\"><table align=\"center\" border=\"1\" style=\"font-size:100%\" width=\"800px\">
|
106
|
-
| #{yield}
|
107
|
-
|</table></div>\n".margin
|
108
|
-
end
|
109
|
-
|
110
|
-
def tds(arr)
|
111
|
-
arr.map {|v| "<td>#{v}</td>"}.join
|
112
|
-
end
|
113
|
-
|
114
|
-
def ths(arr)
|
115
|
-
str = arr.map {|v| "<th>#{v}</th>"}.join
|
116
|
-
str << "\n"
|
117
|
-
end
|
118
|
-
end
|
119
|
-
|
120
|
-
end
|
121
|
-
|
122
|
-
|
123
|
-
class Runner
|
124
|
-
|
125
|
-
include Runner::HTML
|
126
|
-
|
127
|
-
def ref_html(gi, name)
|
128
|
-
"<a href=\"http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi?db=protein&val=#{gi}\" title=\"#{name}\">#{gi}</a>"
|
129
|
-
end
|
130
|
-
|
131
|
-
# Takes the -prot.xml filename and grabs the png file (if available)
|
132
|
-
def error_info(prot_file_name)
|
133
|
-
img = prot_file_name.gsub('.xml', '.png')
|
134
|
-
img_bn = File.basename(img)
|
135
|
-
"<div id=\"error\"><img src=\"#{img_bn}\" alt=\"[ Optional: To view error/sensitivity image, put #{img_bn} in the same directory as #{File.basename(prot_file_name)} ]\"/>\n</div>"
|
136
|
-
end
|
137
|
-
|
138
|
-
# attempts to get the NCBI gi code
|
139
|
-
def accession(name)
|
140
|
-
if (name.include? '|') && (name[0,3] == 'gi|')
|
141
|
-
name.split('|')[1]
|
142
|
-
else
|
143
|
-
name
|
144
|
-
end
|
145
|
-
end
|
146
|
-
|
147
|
-
def prefix_to_regex(prefix)
|
148
|
-
if prefix
|
149
|
-
/^#{Regexp.escape(prefix)}/
|
150
|
-
else
|
151
|
-
nil
|
152
|
-
end
|
153
|
-
end
|
154
|
-
|
155
|
-
# given a list of proteins, output a tab delimited textfile with protein
|
156
|
-
# name and the total number of peptides found
|
157
|
-
def output_peptide_counts_file(prots, filename)
|
158
|
-
File.open(filename, "w") do |fh_out|
|
159
|
-
prots.each do |prot|
|
160
|
-
fh_out.puts [prot._protein_name, prot._total_number_peptides].join("\t")
|
161
|
-
end
|
162
|
-
end
|
163
|
-
end
|
164
|
-
|
165
|
-
# filters on the false positive regex and sorts by prot probability
|
166
|
-
def filter_and_sort(uniq_prots, prefix=nil)
|
167
|
-
prefix_re = prefix_to_regex(prefix)
|
168
|
-
sorted = uniq_prots.sort_by {|prt| [prt._probability, prt.parent._probability]}.reverse
|
169
|
-
## filter on prefix
|
170
|
-
if prefix
|
171
|
-
sorted = sorted.reject {|prot| prot.reference =~ prefix_re }
|
172
|
-
end
|
173
|
-
sorted
|
174
|
-
end
|
175
|
-
|
176
|
-
# assumes that these are sorted on probability
|
177
|
-
# desired_fppr is a float
|
178
|
-
# returns [number_of_prots, actual_fppr]
|
179
|
-
def num_prots_above_fppr(prots, desired_fppr)
|
180
|
-
current_fppr_rate_percent = 0.0
|
181
|
-
previous_fppr_rate_percent = 0.0
|
182
|
-
current_sum_one_minus_prob = 0.0
|
183
|
-
proteins_within_fppr = 0
|
184
|
-
actual_fppr = nil
|
185
|
-
already_found = false
|
186
|
-
prot_cnt = 0
|
187
|
-
prots.each do |prot|
|
188
|
-
prot_cnt += 1
|
189
|
-
# SUM(1-probX)/#prots
|
190
|
-
current_sum_one_minus_prob += 1.0 - prot._probability.to_f
|
191
|
-
current_fppr_rate_percent = (current_sum_one_minus_prob / prot_cnt) * 100
|
192
|
-
|
193
|
-
if current_fppr_rate_percent > desired_fppr && !already_found
|
194
|
-
actual_fppr = previous_fppr_rate_percent
|
195
|
-
proteins_within_fppr = prot_cnt
|
196
|
-
already_found = true
|
197
|
-
end
|
198
|
-
previous_fppr_rate_percent = current_fppr_rate_percent
|
199
|
-
end
|
200
|
-
[proteins_within_fppr, actual_fppr]
|
201
|
-
end
|
202
|
-
|
203
|
-
#### #readable_previous_fppr_rate_percent = sprintf("%.2f", previous_fppr_rate_percent)
|
204
|
-
|
205
|
-
# returns a string of the table rows
|
206
|
-
# false_positive_rate (give as a %) is the cutoff mark
|
207
|
-
# returns the number of proteins at the desired_fppr (if given)
|
208
|
-
def table_rows(uniq_prots, prefix, false_positive_rate_percent, num_cols, desired_fppr, actual_percent_fp, peptide_count_filename=nil)
|
209
|
-
prot_cnt = 0
|
210
|
-
uniq_prots.map do |prot|
|
211
|
-
tr do
|
212
|
-
prot_cnt += 1
|
213
|
-
gi = accession(prot._protein_name)
|
214
|
-
tds([prot_cnt, prot._probability, ref_html(gi, prot._protein_name), prot.annotation.first._protein_description, prot._percent_coverage, peptide_cell(prot_cnt, prot._unique_stripped_peptides.split('+')), prot._total_number_peptides, prot._pct_spectrum_ids])
|
215
|
-
end
|
216
|
-
end.join
|
217
|
-
end
|
218
|
-
|
219
|
-
def print_html_pieces(file, *pieces)
|
220
|
-
File.open(file, "w") do |out|
|
221
|
-
pieces.each do |piece|
|
222
|
-
out.print piece
|
223
|
-
end
|
224
|
-
end
|
225
|
-
end
|
226
|
-
|
227
|
-
def file_info(file)
|
228
|
-
"<div class=\"file_info\"><h3>Source File Information</h3>File: #{File.expand_path(file)}
|
229
|
-
<br/>Last Modified: #{File.mtime(file)}
|
230
|
-
<br/>Size: #{File.size(file)/1000} KB
|
231
|
-
</div>"
|
232
|
-
end
|
233
|
-
|
234
|
-
def bioworks_script_info(obj)
|
235
|
-
version = "3.2??"
|
236
|
-
if obj.version
|
237
|
-
version = obj.version
|
238
|
-
end
|
239
|
-
script_info{"Bioworks version #{version}"}
|
240
|
-
end
|
241
|
-
|
242
|
-
def protproph_script_info
|
243
|
-
begin
|
244
|
-
where = `which xinteract`
|
245
|
-
reply = `#{where}`
|
246
|
-
rescue Exception
|
247
|
-
reply = ""
|
248
|
-
end
|
249
|
-
prophet = "TPP (version unknown)" # put your version here if you can't get it dynamically
|
250
|
-
if reply =~ /xinteract.*?\((TPP .*)\)/
|
251
|
-
prophet = $1.dup
|
252
|
-
end
|
253
|
-
script_info { "ProteinProphet from: #{prophet}" }
|
254
|
-
end
|
255
|
-
|
256
|
-
def mspire_version
|
257
|
-
string = "mspire"
|
258
|
-
begin
|
259
|
-
if `gem list --local mspire` =~ /mspire \((.*?)\)/
|
260
|
-
string << (" v" + $1)
|
261
|
-
end
|
262
|
-
rescue Exception
|
263
|
-
end
|
264
|
-
string
|
265
|
-
end
|
266
|
-
|
267
|
-
def script_info
|
268
|
-
"<div class=\"software\"><h3>Software Information</h3>#{yield}<br/>Ruby package: #{mspire_version}<br/>Command: #{[File.basename(__FILE__), *@orig_argv].join(" ")}</div>"
|
269
|
-
end
|
270
|
-
|
271
|
-
def proph_output(file, outfn, opt, fppr_output_as_html)
|
272
|
-
header_anchors = [at('#', 'number'), at('prob','protein probability (for Prophet, higher is better)'), at('ref', 'gi number if available (or complete reference)'), at('annotation', 'annotation from the fasta file'), at('%cov', 'percent of protein sequence covered by corresponding peptides'), at('peps', 'unique peptides identified (includes non-contributing peptides). Click number to show/hide'), at('#peps', 'total number of corresponding peptides that contributed to protein probability'), at('%ids', 'fraction of correct dataset peptide identifications corresponding to protein')]
|
273
|
-
num_cols = header_anchors.size
|
274
|
-
theaders = ths(header_anchors)
|
275
|
-
|
276
|
-
root = AXML.parse_file(file)
|
277
|
-
prots = []
|
278
|
-
## find the min_prob at a fppr of XX
|
279
|
-
min_prob_redline = 1.01 # if no fppr is less than what they give, then all are redlined!
|
280
|
-
|
281
|
-
if opt.c
|
282
|
-
actual_percent_fp = opt.c.to_f
|
283
|
-
elsif opt.cut_at
|
284
|
-
actual_percent_fp = opt.cut_at.to_f
|
285
|
-
else
|
286
|
-
actual_percent_fp = nil
|
287
|
-
end
|
288
|
-
root.protein_group.each do |group|
|
289
|
-
group.protein.each do |prt|
|
290
|
-
prots << prt
|
291
|
-
end
|
292
|
-
end
|
293
|
-
uniq_prots = prots.hash_by(:_protein_name).map{|name,prot_arr| prot_arr.first }
|
294
|
-
filtered_sorted_prots = filter_and_sort(uniq_prots, opt.f)
|
295
|
-
|
296
|
-
## num proteins above cutoff (if opt.c)
|
297
|
-
num_prots_html = ''
|
298
|
-
if opt.c || opt.cut_at
|
299
|
-
(num_prots, actual_fppr) = num_prots_above_fppr(filtered_sorted_prots, actual_percent_fp)
|
300
|
-
num_prots_html = num_prots_to_html(actual_percent_fp, actual_fppr, num_prots)
|
301
|
-
end
|
302
|
-
if opt.cut_at
|
303
|
-
filtered_sorted_prots = filtered_sorted_prots[0,num_prots]
|
304
|
-
end
|
305
|
-
|
306
|
-
output_peptide_counts_file(filtered_sorted_prots, opt.peptide_count) if opt.peptide_count
|
307
|
-
|
308
|
-
table_string = table do
|
309
|
-
tr{theaders} + table_rows(filtered_sorted_prots, opt.f, actual_percent_fp, num_cols, opt.c.to_f, actual_percent_fp, opt.peptide_count)
|
310
|
-
end
|
311
|
-
er_info = opt.precision ? error_info(file) : ""
|
312
|
-
html_pieces = [outfn, header, fppr_output_as_html, er_info, file_info(file), protproph_script_info, num_prots_html, table_string, trailer]
|
313
|
-
print_html_pieces(*html_pieces)
|
314
|
-
end # proph_output
|
315
|
-
|
316
|
-
# given a list of peptide sequences creates javascript to hide/show them
|
317
|
-
def peptide_cell(prot_num, peptide_sequences)
|
318
|
-
"<a href=\"#prot#{prot_num}\" onclick=\"toggle_vis('#{prot_num}');\">#{peptide_sequences.size}</a><div id=\"#{prot_num}\" style=\"display:none;\">#{peptide_sequences.join(', ')}</div>"
|
319
|
-
end
|
320
|
-
|
321
|
-
def bioworks_output(file, outfn, opt, fppr_output_as_html)
|
322
|
-
header_anchors = [at('#', 'number'), at('prob','protein probability (for Bioworks, lower is better)'), at('ref', 'gi number if available (or complete reference)'), at('annotation', 'annotation from the fasta file'), at('%cov', 'percent of protein sequence covered by corresponding peptides'), at('peps', 'unique peptides identified (at any confidence) Click number to show/hide.'), at('#peps', 'total number of peptides seen (not unique)')]
|
323
|
-
num_cols = header_anchors.size
|
324
|
-
theaders = ths(header_anchors)
|
325
|
-
bio_obj = SpecID.new(file)
|
326
|
-
proteins = bio_obj.prots
|
327
|
-
protein_num = 0
|
328
|
-
rows = ""
|
329
|
-
prefix_re = prefix_to_regex(opt.f)
|
330
|
-
proteins.each do |prot|
|
331
|
-
if opt.f && prot.reference =~ prefix_re
|
332
|
-
next
|
333
|
-
end
|
334
|
-
uniq_peps = Hash.new {|h,k| h[k] = true; }
|
335
|
-
protein_num += 1
|
336
|
-
prot.peps.each do |pep|
|
337
|
-
uniq_peps[pep.sequence.split('.')[1]] = true
|
338
|
-
end
|
339
|
-
pieces = prot.reference.split(' ')
|
340
|
-
long_prot_name = pieces.shift
|
341
|
-
annotation = pieces.join(' ')
|
342
|
-
accession = prot.accession
|
343
|
-
if accession == '0' ; accession = long_prot_name end
|
344
|
-
rows << tr{ tds([protein_num, prot.protein_probability, ref_html(accession, long_prot_name), annotation, prot.coverage, peptide_cell(protein_num, uniq_peps.keys), prot.peps.size]) }
|
345
|
-
end
|
346
|
-
table_string = table do
|
347
|
-
tr{theaders} + rows
|
348
|
-
end
|
349
|
-
print_html_pieces(outfn, header, fppr_output_as_html, file_info(file), bioworks_script_info(bio_obj), table_string, trailer)
|
350
|
-
end # bioworks_output
|
351
|
-
|
352
|
-
def num_prots_to_html(desired_cutoff, actual_cutoff, num_proteins)
|
353
|
-
actual_cutoff = sprintf("%.3f", actual_cutoff)
|
354
|
-
desired_cutoff = sprintf("%.3f", desired_cutoff)
|
355
|
-
"<div class=\"num_proteins\"><h3>False Positive Rate Information</h3>
|
356
|
-
Desired FPR: #{desired_cutoff} %<br/>
|
357
|
-
Actual FPR: #{actual_cutoff} %<br/>
|
358
|
-
Number of Proteins at Actual FPR: #{num_proteins}
|
359
|
-
</div>"
|
360
|
-
end
|
361
|
-
|
362
|
-
# transforms the output string of file_as_decoy into html
|
363
|
-
def file_as_decoy_to_html(string)
|
364
|
-
lines = string.split("\n")
|
365
|
-
#puts lines ?? is this supposed to be commented out?
|
366
|
-
lines = lines.reject do |obj| obj =~ /\*{10}/ end
|
367
|
-
lines.map! do |line| "#{line}<br/>" end
|
368
|
-
"<div class=\"fppr\">
|
369
|
-
<h3>Classification Analysis</h3>
|
370
|
-
#{lines.join("\n")}
|
371
|
-
</div>"
|
372
|
-
end
|
373
|
-
|
374
|
-
# transforms the output string of file_as_decoy into html
|
375
|
-
def prefix_as_decoy_to_html(string)
|
376
|
-
"<div class=\"fppr\">
|
377
|
-
<h3>Classification Analysis</h3>
|
378
|
-
</div>" +
|
379
|
-
string
|
380
|
-
end
|
381
|
-
|
382
|
-
def go(argv)
|
383
|
-
@orig_argv = argv.dup
|
384
|
-
dup_argv = argv.dup
|
385
|
-
|
386
|
-
opt = OpenStruct.new
|
387
|
-
opt.f = DEF_PREFIX
|
388
|
-
opts = OptionParser.new do |op|
|
389
|
-
op.banner = "usage: #{File.basename(__FILE__)} [options] <file>.xml ..."
|
390
|
-
op.separator " where file = bioworks -or- <run>-prot (prophet output)"
|
391
|
-
op.separator " outputs: <file>.summary.html"
|
392
|
-
op.separator ""
|
393
|
-
op.on("-f", "--false <prefix>", "ignore proteins with prefix (def: #{DEF_PREFIX})") {|v| opt.f = v }
|
394
|
-
op.on("-p", "--precision", "include the output from precision.rb") {|v| opt.p = v }
|
395
|
-
op.separator(" if --precision then -f is used to specify a file or prefix")
|
396
|
-
op.separator(" that indicates the false positives.")
|
397
|
-
op.on("--peptide_count <filename>", "outputs text file with # peptides per protein") {|v| opt.peptide_count = v}
|
398
|
-
op.separator ""
|
399
|
-
op.separator "Options for #{PRECISION_PROGRAM_BASE}.rb :"
|
400
|
-
op.on("--#{PRECISION_PROGRAM_BASE}", "include output of #{PRECISION_PROGRAM_BASE}.rb,") {|v| opt.precision = v}
|
401
|
-
op.separator(" type '#{PRECISION_PROGRAM_BASE}.rb' for details")
|
402
|
-
op.separator ""
|
403
|
-
op.separator "Specific to ProteinProphet (with no concatenated DB):"
|
404
|
-
op.on("-c", "--cutoff percent", "includes FPR summary at given cutoff") {|v| opt.c = v }
|
405
|
-
op.on("--cut_at percent", "only reports proteins within FPR percent") {|v| opt.cut_at = v }
|
406
|
-
end
|
407
|
-
|
408
|
-
opts.parse!
|
409
|
-
|
410
|
-
if argv.size < 1
|
411
|
-
puts opts
|
412
|
-
exit
|
413
|
-
end
|
414
|
-
|
415
|
-
fppr_output_as_html = ''
|
416
|
-
files = argv.to_a
|
417
|
-
files.each do |file|
|
418
|
-
outfn = file.gsub(/\.xml$/, '.summary.html')
|
419
|
-
## False Positive Rate Calculation:
|
420
|
-
if opt.precision
|
421
|
-
opt.o = outfn # won't actually be written over, but used
|
422
|
-
to_use_argv = create_precision_argv(file, opt)
|
423
|
-
(out_string, opt) = SpecID::Precision.new.precision(to_use_argv)
|
424
|
-
fppr_output_as_html = prefix_as_decoy_to_html(out_string)
|
425
|
-
end
|
426
|
-
|
427
|
-
case SpecID.file_type(file)
|
428
|
-
when "protproph"
|
429
|
-
proph_output(file, outfn, opt, fppr_output_as_html)
|
430
|
-
when "bioworks"
|
431
|
-
bioworks_output(file, outfn, opt, fppr_output_as_html)
|
432
|
-
else
|
433
|
-
abort "filetype for #{file} not recognized!"
|
434
|
-
end
|
435
|
-
end
|
436
|
-
|
437
|
-
end # method go
|
438
|
-
|
439
|
-
def create_precision_argv(file, opt)
|
440
|
-
# include only those options specific
|
441
|
-
new_argv = [file]
|
442
|
-
if opt.f ; new_argv << '-f' << opt.f end
|
443
|
-
if opt.o ; new_argv << '-o' << opt.o end
|
444
|
-
new_argv
|
445
|
-
end
|
446
|
-
|
447
|
-
end # Runner
|
448
|
-
|
449
|
-
##################################################################
|
450
|
-
# MAIN
|
451
|
-
##################################################################
|
452
|
-
|
453
|
-
Runner.new.go(ARGV)
|
3
|
+
require 'spec_id/protein_summary'
|
454
4
|
|
5
|
+
ProteinSummary.new.create_from_command_line_args(ARGV)
|
455
6
|
|
data/bin/raw_to_mzXML.rb
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
#!/usr/bin/ruby -w
|
2
|
+
|
3
|
+
require 'optparse'
|
4
|
+
require 'spec/mzxml'
|
5
|
+
require 'fileutils'
|
6
|
+
|
7
|
+
progname = File.basename(__FILE__)
|
8
|
+
|
9
|
+
|
10
|
+
opt = {}
|
11
|
+
opts = OptionParser.new do |op|
|
12
|
+
op.banner = "usage: #{progname} [OPTIONS] <file>.RAW ..."
|
13
|
+
op.separator ""
|
14
|
+
op.on("-p", "--profile", "uses profile output instead of centroid (default)") {|v| opt[:profile] = v}
|
15
|
+
end
|
16
|
+
|
17
|
+
opts.parse!
|
18
|
+
|
19
|
+
if ARGV.size == 0
|
20
|
+
puts opts
|
21
|
+
exit
|
22
|
+
end
|
23
|
+
|
24
|
+
converter = Spec::MzXML.find_mzxml_converter
|
25
|
+
if converter
|
26
|
+
$stderr.puts "using #{converter} to convert files"
|
27
|
+
else
|
28
|
+
puts "cannot find [#{Spec::MzXML::Potential_mzxml_converters.join(', ')}] in the paths:"
|
29
|
+
puts ENV['PATH'].split(/[:;]/).join(", ")
|
30
|
+
abort
|
31
|
+
end
|
32
|
+
|
33
|
+
files = ARGV.to_a
|
34
|
+
files.each do |file|
|
35
|
+
puts "******************************************"
|
36
|
+
puts "Converting: #{file}"
|
37
|
+
if converter =~ /readw/
|
38
|
+
centroid_or_profile = 'c'
|
39
|
+
if opt[:profile]
|
40
|
+
centroid_or_profile = 'p'
|
41
|
+
end
|
42
|
+
outfile = file.sub(/\.RAW$/i, '.mzXML')
|
43
|
+
cmd = "#{converter} #{file} #{centroid_or_profile} #{outfile}"
|
44
|
+
puts "Performing: '#{cmd}'"
|
45
|
+
puts `#{cmd}`
|
46
|
+
else
|
47
|
+
## t2x only outputs in cwd!
|
48
|
+
Dir.chdir(File.dirname(file)) do |dir|
|
49
|
+
puts "Performing: '#{cmd}' in #{dir}"
|
50
|
+
puts `#{cmd}`
|
51
|
+
system "#{converter} #{File.basename(file)}"
|
52
|
+
end
|
53
|
+
end
|
54
|
+
puts "******************************************"
|
55
|
+
end
|
data/bin/srf_group.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
|
4
|
+
require 'optparse'
|
5
|
+
require 'spec_id/srf'
|
6
|
+
|
7
|
+
$OUTFILE = 'bioworks.srg'
|
8
|
+
|
9
|
+
opts = OptionParser.new do |op|
|
10
|
+
op.banner = "usage: #{File.basename(__FILE__)} <file1>.srf <file2>.srf ..."
|
11
|
+
op.separator "outputs: 'bioworks.srg'"
|
12
|
+
op.separator ""
|
13
|
+
op.separator " A '.srg' file is an ascii text file with a list"
|
14
|
+
op.separator " of the srf files (full path names) in that group."
|
15
|
+
op.separator ""
|
16
|
+
op.on('-o', '--output <filename>', 'a different output name') {|v| $OUTFILE = v } # remember the user-supplied output filename in the global read by to_srg below
|
17
|
+
end
|
18
|
+
|
19
|
+
if ARGV.size == 0
|
20
|
+
puts opts
|
21
|
+
end
|
22
|
+
|
23
|
+
obj = SRFGroup.new
|
24
|
+
obj.filenames = ARGV.to_a
|
25
|
+
obj.to_srg($OUTFILE)
|
26
|
+
|
data/changelog.txt
CHANGED
@@ -1,4 +1,6 @@
|
|
1
1
|
|
2
|
+
## version 0.1.7
|
3
|
+
|
2
4
|
1. A couple of scripts and subroutines were hashing peptides but not on the file
|
3
5
|
basename. This would result in slightly incorrect results (any time there
|
4
6
|
were overlapping scan numbers in multiple datasets, only the top one would be
|
@@ -31,4 +33,9 @@ Rate' and 'FPR' from the package. It's been suggested that FP/(TP+FP) be
|
|
31
33
|
called the False Positive Predictive Rate (FPPR). I will probably implement
|
32
34
|
this in a future release.
|
33
35
|
|
36
|
+
## version 0.2.0
|
34
37
|
|
38
|
+
** This is a definite code breaker **
|
39
|
+
Revamped the way SpecID works (it is now subclassed). Since I want to return
|
40
|
+
the specific object that the file specifies, I use 'create' now instead of
|
41
|
+
'new' (which forces one to return *that* class).
|
data/lib/align.rb
CHANGED
@@ -24,12 +24,12 @@ class Align
|
|
24
24
|
scanindex_by_basename_noext[runindex.basename_noext] = runindex.scans_by_num
|
25
25
|
end
|
26
26
|
|
27
|
-
dta_filenames =
|
27
|
+
dta_filenames = Proph::Pep::Parser.new.dta_filenames_by_seq_charge(pep_proph_xml, "regex")
|
28
28
|
|
29
|
-
parser =
|
29
|
+
parser = Proph::Prot::Parser.new
|
30
30
|
parser.get_prots_and_peps(prot_xml, prot_prob, pep_init_prob, pep_nsp_prob, "regex")
|
31
31
|
peptides = parser.peps
|
32
|
-
peptides =
|
32
|
+
peptides = Proph::Pep.uniq_by_seqcharge(peptides)
|
33
33
|
## we update each peptide with a list of dtafilenames
|
34
34
|
## then we update with a parallel list of scans (one for each dtafn...
|
35
35
|
## unless there are multiple scans associated with each filename
|
data/lib/fasta.rb
CHANGED
@@ -1,5 +1,8 @@
|
|
1
1
|
require 'sample_enzyme'
|
2
|
+
require 'each_index'
|
2
3
|
|
4
|
+
|
5
|
+
tmp = $VERBOSE ; $VERBOSE = nil
|
3
6
|
class String
|
4
7
|
|
5
8
|
def each_index
|
@@ -21,6 +24,8 @@ class String
|
|
21
24
|
end
|
22
25
|
|
23
26
|
end
|
27
|
+
$VERBOSE = tmp
|
28
|
+
|
24
29
|
|
25
30
|
|
26
31
|
class Fasta
|
@@ -259,9 +264,9 @@ class Fasta
|
|
259
264
|
end
|
260
265
|
|
261
266
|
class Fasta::Prot
|
262
|
-
attr_accessor :header, :aaseq
|
263
267
|
# header given as full line with starting '>' (but no newline chars!).
|
264
268
|
# aaseq also given without any newline chars
|
269
|
+
attr_accessor :header, :aaseq
|
265
270
|
def initialize(header=nil, aaseq=nil)
|
266
271
|
@header = header || ''
|
267
272
|
if aaseq
|
data/lib/gi.rb
CHANGED
@@ -40,19 +40,24 @@ class GI
|
|
40
40
|
BATCH_SIZE = 500
|
41
41
|
# takes an array of gi numbers and returns an array of annotation
|
42
42
|
# This allows use of the batch search mode on NCBI
|
43
|
+
# returns nil if no internet connection
|
43
44
|
def self.gi2annot(list_of_gi_numbers)
|
45
|
+
annots = []
|
44
46
|
loop do
|
45
47
|
batch = list_of_gi_numbers.slice!(0...BATCH_SIZE) # exclusive range: remove at most BATCH_SIZE ids per request (0..BATCH_SIZE took BATCH_SIZE + 1)
|
46
48
|
if batch.size == 0 then break end
|
47
49
|
string = batch.join(",")
|
48
50
|
url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=protein&retmode=xml&id=#{string}"
|
49
51
|
#puts url
|
50
|
-
|
51
|
-
|
52
|
-
|
52
|
+
begin
|
53
|
+
open(url) do |handle|
|
54
|
+
annots.push( *(parse_etool_output(handle)) )
|
55
|
+
end
|
56
|
+
rescue SocketError
|
57
|
+
return nil
|
53
58
|
end
|
54
|
-
annots
|
55
59
|
end
|
60
|
+
annots
|
56
61
|
end
|
57
62
|
|
58
63
|
protected
|
data/lib/roc.rb
CHANGED
@@ -90,6 +90,8 @@ end
|
|
90
90
|
# For calculating precision given lists of hits and decoy hits. The hits are
|
91
91
|
# assumed to have false positives within them that can be estimated from the
|
92
92
|
# number of decoy hits at the same rate
|
93
|
+
# NOTE: this class assumes that lower scores are better. Negate your scores
|
94
|
+
# if this is not the case.
|
93
95
|
class DecoyROC < ROC
|
94
96
|
|
95
97
|
# returns the [num_hits, num_tps, precision] as a function of true
|