mspire 0.1.7 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +41 -14
- data/bin/bioworks2excel.rb +1 -1
- data/bin/bioworks_to_pepxml.rb +46 -59
- data/bin/fasta_shaker.rb +1 -1
- data/bin/filter.rb +6 -0
- data/bin/find_aa_freq.rb +23 -0
- data/bin/id_precision.rb +3 -2
- data/bin/mzxml_to_lmat.rb +2 -1
- data/bin/pepproph_filter.rb +1 -1
- data/bin/precision.rb +1 -1
- data/bin/protein_summary.rb +2 -451
- data/bin/raw_to_mzXML.rb +55 -0
- data/bin/srf_group.rb +26 -0
- data/changelog.txt +7 -0
- data/lib/align.rb +3 -3
- data/lib/fasta.rb +6 -1
- data/lib/gi.rb +9 -4
- data/lib/roc.rb +2 -0
- data/lib/sample_enzyme.rb +2 -1
- data/lib/spec/mzxml/parser.rb +2 -43
- data/lib/spec/mzxml.rb +65 -2
- data/lib/spec_id/aa_freqs.rb +10 -7
- data/lib/spec_id/bioworks.rb +67 -87
- data/lib/spec_id/filter.rb +794 -0
- data/lib/spec_id/precision.rb +29 -36
- data/lib/spec_id/proph.rb +5 -3
- data/lib/spec_id/protein_summary.rb +459 -0
- data/lib/spec_id/sequest.rb +323 -271
- data/lib/spec_id/srf.rb +189 -135
- data/lib/spec_id.rb +276 -227
- data/lib/spec_id_xml.rb +101 -0
- data/lib/toppred.rb +18 -0
- data/script/degenerate_peptides.rb +47 -0
- data/script/filter-peps.rb +5 -1
- data/test/tc_align.rb +1 -1
- data/test/tc_bioworks.rb +25 -22
- data/test/tc_bioworks_to_pepxml.rb +37 -4
- data/test/tc_fasta.rb +3 -1
- data/test/tc_fasta_shaker.rb +8 -6
- data/test/tc_filter.rb +203 -0
- data/test/tc_gi.rb +6 -9
- data/test/tc_id_precision.rb +31 -0
- data/test/tc_mzxml.rb +8 -6
- data/test/tc_peptide_parent_times.rb +2 -1
- data/test/tc_precision.rb +1 -1
- data/test/tc_proph.rb +5 -5
- data/test/tc_protein_summary.rb +36 -13
- data/test/tc_sequest.rb +78 -33
- data/test/tc_spec_id.rb +128 -6
- data/test/tc_srf.rb +84 -38
- metadata +67 -62
- data/bin/fasta_cat.rb +0 -39
- data/bin/fasta_cat_mod.rb +0 -59
- data/bin/fasta_mod.rb +0 -57
- data/bin/filter_spec_id.rb +0 -365
- data/bin/raw2mzXML.rb +0 -21
- data/script/gen_database_searching.rb +0 -258
data/lib/spec_id.rb
CHANGED
@@ -1,14 +1,15 @@
|
|
1
1
|
require 'ostruct'
|
2
|
-
|
3
|
-
|
4
|
-
|
2
|
+
require 'set'
|
3
|
+
require 'hash_by'
|
4
|
+
require 'spec_id/precision'
|
5
5
|
require 'roc'
|
6
6
|
require 'sample_enzyme' # for others
|
7
7
|
require 'spec_id/bioworks'
|
8
8
|
require 'spec_id/sequest'
|
9
9
|
require 'spec_id/proph'
|
10
|
-
require '
|
10
|
+
require 'spec_id_xml'
|
11
11
|
|
12
|
+
class SampleEnzyme ; end
|
12
13
|
|
13
14
|
class Mass
|
14
15
|
# http://expasy.org/tools/findmod/findmod_masses.html
|
@@ -70,123 +71,132 @@ class Mass
|
|
70
71
|
}
|
71
72
|
end
|
72
73
|
|
73
|
-
|
74
|
+
module SpecID ; end
|
75
|
+
|
76
|
+
class GenericSpecID ; include SpecID ; end
|
77
|
+
|
78
|
+
module SpecID
|
74
79
|
MONO = Mass::MONO
|
75
80
|
AVG = Mass::AVG
|
76
81
|
|
77
|
-
attr_accessor :
|
78
|
-
attr_writer :peps, :prots
|
82
|
+
attr_accessor :peps, :prots
|
79
83
|
# True if a high protein/peptide score is better than low, false otherwise
|
80
84
|
# This is set automatically for known file types
|
81
85
|
attr_accessor :hi_prob_best
|
82
86
|
|
87
|
+
# A relative pathname of the file the specid object is derived from
|
88
|
+
attr_accessor :filename
|
89
|
+
|
83
90
|
# tp = file_type
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
if file
|
91
|
+
# Will return a SpecID object (really, the object corresponding to the
|
92
|
+
# file type which mixes in SpecID [is_a?(SpecID) == true])
|
93
|
+
# If no file is given, will return a GenericSpecID object.
|
94
|
+
def self.new(file=nil, tp=nil)
  # Without a file there is nothing to parse: hand back the generic container.
  return GenericSpecID.new unless file
  # Otherwise delegate to the file-based factory (tp may be nil => guessed).
  from_file(file, tp)
end
|
93
101
|
|
94
102
|
# tp = file_type
|
95
|
-
|
103
|
+
# only takes an array if they are srf files!
|
104
|
+
def self.from_file(file, tp=nil)
  # Builds the object matching the file type and returns it.
  #   file - path handed to the type-specific constructor
  #   tp   - 'srg', 'bioworks' or 'protproph'; guessed with file_type(file)
  #          when nil
  # Aborts the program on any other type.
  # NOTE(review): file_type can also answer 'srf', which has no branch here
  # and therefore ends in the abort below -- confirm whether that is intended.
  tp ||= file_type(file)

  # Whether a high score is better for this file type.  This used to be
  # assigned to @hi_prob_best, which inside a module-level method lands on
  # the module itself and never reaches the returned object.
  hi_best = nil
  obj = case tp
        when 'srg'
          hi_best = false
          SRFGroup.new(file)
        when 'bioworks'
          hi_best = false
          Bioworks.new(file)
        when 'protproph'
          hi_best = true
          Proph::ProtSummary.new(file)
        else
          abort "UNRECOGNIZED file type for #{file}"
        end

  # Only touch objects that actually expose the accessor.
  obj.hi_prob_best = hi_best if obj.respond_to?(:hi_prob_best=)
  obj
end
|
110
124
|
|
111
125
|
def inspect
  # Short summary: the object's class plus how many peptide hits it holds.
  klass = self.class
  count = peps.size
  "<#{klass} #peps=\"#{count}\">"
end
|
114
128
|
|
115
|
-
#
|
116
|
-
#
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
def top_peps_prefilter!
|
122
|
-
peps.each do |pep|
|
123
|
-
pep.xcorr = pep.xcorr.to_f
|
124
|
-
pep.deltacn = pep.deltacn.to_f
|
125
|
-
pep.deltamass = pep.deltamass.to_f
|
126
|
-
pep.mass = pep.mass.to_f
|
127
|
-
pep.charge = pep.charge.to_f
|
129
|
+
# takes a comma separated list or array and extends the last to create an
|
130
|
+
# array of desired size
|
131
|
+
def self.extend_args(arg, desired_size)
  # A String is treated as a comma separated list; anything else is indexed
  # as given.
  values = arg.is_a?(String) ? arg.split(',') : arg

  # Walk the requested positions, repeating the most recent truthy value
  # wherever the input has no (or a nil/false) entry.
  filler = values[0]
  (0...desired_size).map do |idx|
    current = values[idx]
    filler = current if current
    filler
  end
end
|
148
|
+
|
149
|
+
# takes an array of proteins, each having peps
|
150
|
+
# peptide grouping is done
|
151
|
+
# by-
|
152
|
+
# the protein with the most unique peptides ends up taking any
|
153
|
+
# degenerate peptides, tie goes to one with most hits total, then the one
|
154
|
+
# that had the top xcorr(s) (before removing any peptides). All other
|
155
|
+
# proteins with identical peptides will lose those peptides. So, the rich
|
156
|
+
# stay rich, and the poor get poorer.
|
157
|
+
# returns an array of triplets where each is [prot, pep_hits,
|
158
|
+
# uniq_aaseqs] (uniq_aaseqs is an array) where the protein contains >= 1
|
159
|
+
# peptide. The internal links (prot.peps and pep.prots) are NOT modified!!
|
160
|
+
# update_prots == true will set each protein with the peptides found
|
161
|
+
def self.occams_razor(array_of_prots, update_prots=false)
  # Greedy assignment of peptide sequences to proteins.
  #   array_of_prots - objects answering #peps (each pep answers #aaseq and
  #                    #xcorr); Array#hash_by must be available
  #   update_prots   - when true, prot.peps is replaced by the hits kept
  # Returns an array of [prot, pep_hits, uniq_aaseqs] for every protein that
  # keeps at least one sequence.
  peps_found = Set.new   # sequences already claimed by a better ranked protein

  to_sort = array_of_prots.map do |prot|
    pps = prot.peps
    peps_by_uniq_aaseq = pps.hash_by(:aaseq)
    uniq_aaseqs = Set.new( pps.map {|pep| pep.aaseq } )
    # Best xcorr first, so that the array comparison really ranks by the
    # *top* xcorr(s); an ascending list would compare the worst hits first.
    xcorrs = pps.map {|pep| pep.xcorr }.sort.reverse
    #   0                 1         2       3     4            5
    [uniq_aaseqs.size, pps.size, xcorrs, prot, uniq_aaseqs, peps_by_uniq_aaseq]
  end

  prot_triplets = []
  # Rank on fields 0..3 only: the Set and Hash in 4 and 5 are payload, and
  # comparing them on a full tie could make the sort fail.
  to_sort.sort_by {|ar| ar[0, 4] }.reverse.each do |ar|
    prot = ar[3]
    # sequences this protein shares with better ranked proteins
    common = peps_found & ar[4]
    # sequences still unclaimed
    uniq = ar[4] - common
    next if uniq.size == 0

    peps_found.merge(uniq)
    pep_hits = []
    uniq.each do |seq|
      pep_hits.push( *(ar[5][seq]) )
    end
    prot_triplets << [prot, pep_hits, uniq.to_a]
    prot.peps = pep_hits if update_prots
  end
  prot_triplets
end
|
186
195
|
|
196
|
+
|
187
197
|
## basically, this is the command line wrapper
|
188
198
|
def self.precision(argv)
  # Thin command line entry point: a fresh Prec does all of the work.
  runner = Prec.new
  runner.run_cmd_line(argv)
end
|
191
201
|
|
192
202
|
|
@@ -197,27 +207,64 @@ class SpecID
|
|
197
207
|
def by_tps(classification_method, tp, fp)
  # Pure delegation to a fresh ROC instance; the arguments pass through as is.
  roc = ROC.new
  roc.by_tps(classification_method, tp, fp)
end
|
210
|
+
|
211
|
+
# from the unique set of peptide hits, create a separate peptide hit for
|
212
|
+
# each protein reference where that peptide only references that protein
|
213
|
+
# e.g. pep.prots = [(a single protein)]
|
214
|
+
def pep_prots
  # Returns one peptide copy per (peptide, protein) pair; each copy's #prots
  # holds exactly that one protein.  The peptides in #peps are left untouched.
  pps = []
  peps.each do |pep|
    pep.prots.each do |prt|
      # The copy is what gets re-pointed and collected.  Previously the
      # result of pep.dup was thrown away, so the original peptide was
      # overwritten and pushed once per protein.
      single = pep.dup
      single.prots = [prt]
      pps << single
    end
  end
  pps
end
|
200
225
|
|
201
226
|
# returns [tp, fp] based on the protein prefix for items where items =
|
202
227
|
# (:prot|:peps)
|
228
|
+
# this may result in a duplication of some peptides if they match both
|
229
|
+
# normal and decoy proteins. In this case, the protein arrays are split,
|
230
|
+
# too, so that each points only to its breed of protein.
|
203
231
|
def classify_by_prefix(items, prefix, fp_on_match=true)
  # Splits proteins or peptides by whether a protein reference starts with
  # +prefix+.
  #   items       - :prots or :peps (anything else aborts)
  #   prefix      - literal prefix (escaped before it goes into the regex)
  #   fp_on_match - true: matching items are the false positives
  # :prots hands a classification proc to #classify and returns its result.
  # :peps returns [tp, fp] built from #peps.
  regex = /^#{Regexp.escape(prefix)}/
  case items
  when :prots
    myproc = proc { |prt|
      if prt.reference =~ regex ; !fp_on_match
      else ; fp_on_match end
    }
    return classify(items, myproc)
  when :peps
    match = [] ; nomatch = []
    peps.each do |pep|
      (hit, nohit) = pep.prots.partition do |prot|
        prot.reference =~ regex
      end
      if hit.size == 0
        nomatch << pep
      elsif nohit.size == 0
        match << pep
      else ## both have hits
        # Split into two copies, each pointing only at its own breed of
        # protein.  The partition results are what must be assigned (the
        # old code assigned two arrays that were never filled), and the
        # no-match list must receive its own copy.  Copying on both sides
        # keeps the peptide stored in #peps intact.
        match_pep = pep.dup
        match_pep.prots = hit
        nomatch_pep = pep.dup
        nomatch_pep.prots = nohit
        match << match_pep
        nomatch << nomatch_pep
      end
    end
    if fp_on_match
      return [nomatch , match]
    else
      return [match, nomatch]
    end
  else
    abort "don't recognize #{items.inspect}"
  end
end
|
221
268
|
|
222
269
|
###### THIS GUY IS BAD (and unnecessary) AND SHOULD PROBABLY BE DELETED...
|
223
270
|
# # Returns tp, fp where each is an array of proteins where fp is determined
|
@@ -244,18 +291,6 @@ class SpecID
|
|
244
291
|
[t,f]
|
245
292
|
end
|
246
293
|
|
247
|
-
def peps
|
248
|
-
if @peps ; @peps
|
249
|
-
else @obj.peps
|
250
|
-
end
|
251
|
-
end
|
252
|
-
|
253
|
-
def prots
|
254
|
-
if @prots ; @prots
|
255
|
-
else @obj.prots
|
256
|
-
end
|
257
|
-
end
|
258
|
-
|
259
294
|
# returns two arrays, true positives and false positives (determined by proc
|
260
295
|
# classify_item_by) sorted by proc rank_item_by. Items will be ranked from
|
261
296
|
# lowest to highest based on the return value of rank_item_by. items is a
|
@@ -276,7 +311,7 @@ class SpecID
|
|
276
311
|
# returns a proc for getting all probabilities so that an ascending sort
|
277
312
|
# will put the best scores first
|
278
313
|
def probability_proc
|
279
|
-
if
|
314
|
+
if hi_prob_best
|
280
315
|
get_prob_proc = proc {|prt| prt.probability * -1 }
|
281
316
|
else
|
282
317
|
get_prob_proc = proc {|prt| prt.probability }
|
@@ -328,17 +363,13 @@ class SpecID
|
|
328
363
|
if prt.reference =~ regex ; false
|
329
364
|
else ; true end
|
330
365
|
}
|
366
|
+
|
331
367
|
real_hits, decoy_hits = rank_and_classify(:prots, prob_proc, myproc)
|
332
368
|
|
333
369
|
(num_hits, num_tps, precision) = DecoyROC.new.pred_and_tps_and_ppv(real_hits, decoy_hits)
|
334
370
|
[num_hits, precision]
|
335
371
|
end
|
336
372
|
|
337
|
-
def method_missing(symbol, *args)
|
338
|
-
@obj.send(symbol, *args)
|
339
|
-
end
|
340
|
-
|
341
|
-
|
342
373
|
# # takes the existing spec_id object and marshals it into "file.msh"
|
343
374
|
# # a new file will always look for a file.msh to load
|
344
375
|
# def marshal(force=false)
|
@@ -348,7 +379,14 @@ class SpecID
|
|
348
379
|
# end
|
349
380
|
|
350
381
|
# Returns 'bioworks' if bioworks xml, 'protproph' if Protein prophet
|
382
|
+
# 'srf' if SRF file, 'srg' if search results group file.
|
351
383
|
def self.file_type(file)
|
384
|
+
if file =~ /\.srg$/
|
385
|
+
return 'srg'
|
386
|
+
end
|
387
|
+
if IO.read(file, 7,438) == 'Enzyme:'
|
388
|
+
return 'srf'
|
389
|
+
end
|
352
390
|
File.open(file) do |fh|
|
353
391
|
lines = ""
|
354
392
|
4.times { lines << fh.readline }
|
@@ -397,7 +435,7 @@ class SpecID
|
|
397
435
|
#peptides.each do |pep| print pep.class.to_s + " " end
|
398
436
|
#puts peptides.first.is_a? Array
|
399
437
|
#abort "DFHDFD"
|
400
|
-
peptides.collect{|pep| pep.
|
438
|
+
peptides.collect{|pep| pep.probability }.sort
|
401
439
|
end
|
402
440
|
|
403
441
|
# returns a sorted lists of probabilities based on all pepprots (a peptide
|
@@ -477,138 +515,149 @@ class SpecID
|
|
477
515
|
end
|
478
516
|
sorted_probabilities(min_peptides)
|
479
517
|
end
|
480
|
-
|
481
|
-
|
482
|
-
# A Generic spectraID protein
|
483
|
-
class Prot
|
484
|
-
# probability is always a float!
|
485
|
-
attr_accessor :probability, :reference
|
486
|
-
end
|
487
|
-
|
488
|
-
class Pep
|
489
|
-
attr_accessor :probability
|
490
|
-
# full sequence: (<firstAA>.<sequence>.<last>) with '-' for no first
|
491
|
-
# or last.
|
492
|
-
attr_accessor :sequence
|
493
|
-
attr_accessor :charge
|
494
|
-
|
495
|
-
# units can be :mmu, :amu, :ppm
|
496
|
-
def mass_accuracy(pep, unit=:ppm, mono=true)
|
497
|
-
# 10^6 * deltam accuracy/ m[measured]
|
498
|
-
# i.e., theoretical mass 1000, measured 999.9: 100ppm
|
499
|
-
# http://www.waters.com/WatersDivision/ContentD.asp?watersit=EGOO-66LRQD
|
500
|
-
# pep.mass is the theoretical M+H of the peptide
|
501
|
-
# this assumes that the deltacn value we're being told is correct, but I
|
502
|
-
# have my suspicions (since the <mass> value is not accurate...)
|
503
|
-
|
504
|
-
######## TO COMPLETE (and add to spec_id..?)
|
505
|
-
case unit
|
506
|
-
when :ppm
|
507
|
-
when :amu
|
508
|
-
when :mmu
|
509
|
-
end
|
510
|
-
end
|
511
|
-
end
|
512
|
-
|
513
518
|
end
|
514
519
|
|
515
|
-
# I would prefer to call this SpecID::XML, but I keep getting an error:
|
516
|
-
# /home/john/Proteomics/msprot/lib/spec_id/bioworks.rb:412: warning: toplevel
|
517
|
-
# constant XML referenced by SpecID::XML' This works around that for now.
|
518
|
-
# Any major xml elements should return a newline at the end for simple
|
519
|
-
# concatenation into a file
|
520
|
-
module SpecIDXML
|
521
|
-
|
522
|
-
Special_chrs_hash = {
|
523
|
-
'"' => '"',
|
524
|
-
'&' => '&',
|
525
|
-
"'" => ''',
|
526
|
-
'<' => '<',
|
527
|
-
'>' => '>',
|
528
|
-
}
|
529
|
-
|
530
|
-
# substitutes special xml chars
|
531
|
-
def escape_special_chars(string)
|
532
|
-
string.split('').map do |char|
|
533
|
-
if Special_chrs_hash.key? char ; Special_chrs_hash[char]
|
534
|
-
# if x = Special_chrs_hash[char] ; x # <-- that's slightly slower
|
535
|
-
else ; char end
|
536
|
-
end.join
|
537
|
-
end
|
538
|
-
|
539
|
-
$DEPTH = 0
|
540
|
-
|
541
|
-
def tabs
|
542
|
-
# this is ugly
|
543
|
-
string = ""
|
544
|
-
$DEPTH.times { string << "\t" }
|
545
|
-
string
|
546
|
-
end
|
547
520
|
|
521
|
+
# A Generic spectraID protein
|
522
|
+
module SpecID::Prot
|
523
|
+
# probability is always a float!
|
524
|
+
attr_accessor :probability, :reference, :peps
|
548
525
|
|
549
|
-
def
|
550
|
-
|
526
|
+
def <=>(other)
  # Proteins are ordered by their reference.
  mine = reference
  theirs = other.reference
  mine <=> theirs
end
|
552
529
|
|
553
|
-
|
554
|
-
symbol_list.collect { |sy|
|
555
|
-
param_xml(sy)
|
556
|
-
}.join("\n") + "\n"
|
557
|
-
end
|
530
|
+
end
|
558
531
|
|
559
|
-
|
560
|
-
|
532
|
+
module SpecID::Pep
|
533
|
+
|
534
|
+
Non_standard_amino_acid_char_re = /[^A-Z\.\-]/
|
535
|
+
|
536
|
+
attr_accessor :prots
|
537
|
+
attr_accessor :probability
|
538
|
+
# full sequence: (<firstAA>.<sequence>.<last>) with '-' for no first
|
539
|
+
# or last.
|
540
|
+
attr_accessor :sequence
|
541
|
+
|
542
|
+
# the basic amino acid sequence (no leading or trailing '.' or amino acids)
|
543
|
+
# should not contain any special symbols, etc.
|
544
|
+
attr_accessor :aaseq
|
545
|
+
attr_accessor :charge
|
546
|
+
|
547
|
+
# removes nonstandard chars with Non_standard_amino_acid_char_re
|
548
|
+
# preserves A-Z and '.' and '-'
|
549
|
+
def self.remove_non_amino_acids(sequence)
  # Returns a new string without the characters matched by
  # Non_standard_amino_acid_char_re.
  sequence.gsub(Non_standard_amino_acid_char_re) { '' }
end
|
552
|
+
|
553
|
+
# remove_non_amino_acids && split_sequence
|
554
|
+
def self.prepare_sequence(val)
  # Strip the non amino acid characters first, then split what is left.
  split_sequence(remove_non_amino_acids(val))
end
|
558
|
+
|
559
|
+
def <=>(other)
  # Peptides are ordered by their bare amino acid sequence.
  mine = aaseq
  theirs = other.aaseq
  mine <=> theirs
end
|
562
|
+
|
563
|
+
# Returns prev, peptide, next from sequence. Parse errors return
|
564
|
+
# nil,nil,nil
|
565
|
+
# R.PEPTIDE.A # -> R, PEPTIDE, A
|
566
|
+
# R.PEPTIDE.- # -> R, PEPTIDE, -
|
567
|
+
# PEPTIDE.A # -> -, PEPTIDE, A
|
568
|
+
# A.PEPTIDE # -> A, PEPTIDE, -
|
569
|
+
# PEPTIDE # -> nil,nil,nil
|
570
|
+
def self.split_sequence(val)
  # Splits on '.' and returns [prev_aa, peptide, next_aa].
  # Anything that does not split into two or three pieces is a parse error
  # and gives [nil, nil, nil].  That now includes four or more pieces, which
  # used to come back as three empty strings.
  pieces = val.split('.')
  case pieces.size
  when 3
    pieces
  when 2
    if pieces[0].size > 1 ## N termini: no previous residue was given
      ['-', pieces[0], pieces[1]]
    else ## C termini: no following residue was given
      [pieces[0], pieces[1], '-']
    end
  else
    [nil, nil, nil]
  end
end
|
589
|
+
|
590
|
+
##
|
591
|
+
def self.sequence_to_aaseq(sequence)
  # Extracts the bare peptide from a (possibly flanked) sequence string
  # after the non amino acid characters have been removed.
  parts = remove_non_amino_acids(sequence).split('.')
  case parts.size
  when 1 then parts[0]                                   # no flanks: the peptide itself
  when 2 then (parts[0].size > 1) ? parts[0] : parts[1]  # one flank only
  when 3 then parts[1]                                   # prev.PEPTIDE.next
  else abort "bad peptide sequence: #{sequence}"
  end
end
|
562
609
|
|
563
|
-
|
564
|
-
|
565
|
-
|
610
|
+
# This will rapidly determine the list of proteins to which the given
|
611
|
+
# peptides belong. It is meant to be low level and fast (eventually),
|
612
|
+
# so it asks for the data in a format amenable to this.
|
613
|
+
# returns a mirror array where each entry is an array of Fasta::Prot
|
614
|
+
# objects where each protein contains the sequence
|
615
|
+
def self.protein_groups_by_sequence(peptide_strings_list, fasta_obj)
  # For every peptide string, collects (in fasta order) the proteins whose
  # aaseq contains it.  The result lines up index for index with
  # peptide_strings_list.
  prots = fasta_obj.prots
  # Pull the sequences out once instead of once per peptide.
  prot_seqs = prots.map {|prot| prot.aaseq }

  peptide_strings_list.map do |pep_seq|
    group = []
    prot_seqs.each_with_index do |prot_seq, idx|
      group << prots[idx] if prot_seq.include?(pep_seq)
    end
    group
  end
end
|
582
635
|
|
583
|
-
#
|
584
|
-
def
|
636
|
+
# units can be :mmu, :amu, :ppm
|
637
|
+
def mass_accuracy(pep, unit=:ppm, mono=true)
  # Intended formula: 10^6 * deltam accuracy / m[measured]
  # e.g. theoretical mass 1000, measured 999.9: 100ppm
  # http://www.waters.com/WatersDivision/ContentD.asp?watersit=EGOO-66LRQD
  # pep.mass is the theoretical M+H of the peptide.
  # This assumes that the deltacn value we're being told is correct, but I
  # have my suspicions (since the <mass> value is not accurate...)

  ######## TO COMPLETE (and add to spec_id..?)
  # No unit is implemented yet: every call returns nil.
  case unit
  when :ppm, :amu, :mmu
    nil
  end
end
|
652
|
+
end
|
592
653
|
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
start = "#{tabs}<#{element} #{att_string}>\n"
|
597
|
-
$DEPTH += 1
|
598
|
-
if block_given? ; middle = yield else ; middle = '' end
|
599
|
-
$DEPTH -= 1
|
600
|
-
start + middle + "#{tabs}</#{element}>\n"
|
601
|
-
end
|
654
|
+
class SpecID::GenericProt
|
655
|
+
include SpecID::Prot
|
656
|
+
end
|
602
657
|
|
603
|
-
|
604
|
-
|
605
|
-
|
658
|
+
class SpecID::GenericPep
|
659
|
+
include SpecID::Pep
|
660
|
+
end
|
606
661
|
|
607
|
-
def attrs_xml(list_of_symbols)
|
608
|
-
list_of_symbols.collect {|sy|
|
609
|
-
attr_xml(sy)
|
610
|
-
}.join(" ")
|
611
|
-
end
|
612
662
|
|
613
|
-
end
|
614
663
|
|