mspire 0.1.7 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +41 -14
- data/bin/bioworks2excel.rb +1 -1
- data/bin/bioworks_to_pepxml.rb +46 -59
- data/bin/fasta_shaker.rb +1 -1
- data/bin/filter.rb +6 -0
- data/bin/find_aa_freq.rb +23 -0
- data/bin/id_precision.rb +3 -2
- data/bin/mzxml_to_lmat.rb +2 -1
- data/bin/pepproph_filter.rb +1 -1
- data/bin/precision.rb +1 -1
- data/bin/protein_summary.rb +2 -451
- data/bin/raw_to_mzXML.rb +55 -0
- data/bin/srf_group.rb +26 -0
- data/changelog.txt +7 -0
- data/lib/align.rb +3 -3
- data/lib/fasta.rb +6 -1
- data/lib/gi.rb +9 -4
- data/lib/roc.rb +2 -0
- data/lib/sample_enzyme.rb +2 -1
- data/lib/spec/mzxml/parser.rb +2 -43
- data/lib/spec/mzxml.rb +65 -2
- data/lib/spec_id/aa_freqs.rb +10 -7
- data/lib/spec_id/bioworks.rb +67 -87
- data/lib/spec_id/filter.rb +794 -0
- data/lib/spec_id/precision.rb +29 -36
- data/lib/spec_id/proph.rb +5 -3
- data/lib/spec_id/protein_summary.rb +459 -0
- data/lib/spec_id/sequest.rb +323 -271
- data/lib/spec_id/srf.rb +189 -135
- data/lib/spec_id.rb +276 -227
- data/lib/spec_id_xml.rb +101 -0
- data/lib/toppred.rb +18 -0
- data/script/degenerate_peptides.rb +47 -0
- data/script/filter-peps.rb +5 -1
- data/test/tc_align.rb +1 -1
- data/test/tc_bioworks.rb +25 -22
- data/test/tc_bioworks_to_pepxml.rb +37 -4
- data/test/tc_fasta.rb +3 -1
- data/test/tc_fasta_shaker.rb +8 -6
- data/test/tc_filter.rb +203 -0
- data/test/tc_gi.rb +6 -9
- data/test/tc_id_precision.rb +31 -0
- data/test/tc_mzxml.rb +8 -6
- data/test/tc_peptide_parent_times.rb +2 -1
- data/test/tc_precision.rb +1 -1
- data/test/tc_proph.rb +5 -5
- data/test/tc_protein_summary.rb +36 -13
- data/test/tc_sequest.rb +78 -33
- data/test/tc_spec_id.rb +128 -6
- data/test/tc_srf.rb +84 -38
- metadata +67 -62
- data/bin/fasta_cat.rb +0 -39
- data/bin/fasta_cat_mod.rb +0 -59
- data/bin/fasta_mod.rb +0 -57
- data/bin/filter_spec_id.rb +0 -365
- data/bin/raw2mzXML.rb +0 -21
- data/script/gen_database_searching.rb +0 -258
data/lib/spec_id.rb
CHANGED
@@ -1,14 +1,15 @@
|
|
1
1
|
require 'ostruct'
|
2
|
-
|
3
|
-
|
4
|
-
|
2
|
+
require 'set'
|
3
|
+
require 'hash_by'
|
4
|
+
require 'spec_id/precision'
|
5
5
|
require 'roc'
|
6
6
|
require 'sample_enzyme' # for others
|
7
7
|
require 'spec_id/bioworks'
|
8
8
|
require 'spec_id/sequest'
|
9
9
|
require 'spec_id/proph'
|
10
|
-
require '
|
10
|
+
require 'spec_id_xml'
|
11
11
|
|
12
|
+
class SampleEnzyme ; end
|
12
13
|
|
13
14
|
class Mass
|
14
15
|
# http://expasy.org/tools/findmod/findmod_masses.html
|
@@ -70,123 +71,132 @@ class Mass
|
|
70
71
|
}
|
71
72
|
end
|
72
73
|
|
73
|
-
|
74
|
+
module SpecID ; end
|
75
|
+
|
76
|
+
class GenericSpecID ; include SpecID ; end
|
77
|
+
|
78
|
+
module SpecID
|
74
79
|
MONO = Mass::MONO
|
75
80
|
AVG = Mass::AVG
|
76
81
|
|
77
|
-
attr_accessor :
|
78
|
-
attr_writer :peps, :prots
|
82
|
+
attr_accessor :peps, :prots
|
79
83
|
# True if a high protein/peptide score is better than low, false otherwise
|
80
84
|
# This is set automatically for known file types
|
81
85
|
attr_accessor :hi_prob_best
|
82
86
|
|
87
|
+
# A relative pathname of the file the specid object is derived from
|
88
|
+
attr_accessor :filename
|
89
|
+
|
83
90
|
# tp = file_type
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
if file
|
91
|
+
# Will return a SpecID object (really, the object corresponding to the
|
92
|
+
# file type which mixes in SpecID [is_a?(SpecID) == true])
|
93
|
+
# If no file is given, will return a GenericSpecID object.
|
94
|
+
# Factory entry point.
#
# file:: path handed on to from_file; when nil/false no file is read
# tp::   optional file type string, passed through to from_file untouched
#
# Returns whatever from_file returns when a file is given, otherwise a
# fresh GenericSpecID.
def self.new(file=nil, tp=nil)
  return from_file(file, tp) if file

  GenericSpecID.new
end
|
93
101
|
|
94
102
|
# tp = file_type
|
95
|
-
|
103
|
+
# only takes an array if they are srf files!
|
104
|
+
# Builds the object that represents +file+.
#
# file:: the file to load (per the note above, an array is only expected for
#        srf input -- NOTE(review): confirm against SRFGroup)
# tp::   file type string ('srg', 'bioworks' or 'protproph'); when omitted it
#        is looked up with file_type(file)
#
# Returns the newly constructed object. Aborts the process with an
# "UNRECOGNIZED file type" message for any other type.
#
# The hi_prob_best flag is written through the returned object's accessor.
# Previously it was assigned to an instance variable inside this singleton
# method, i.e. on the receiver of from_file, so the returned object never
# received it.
def self.from_file(file, tp=nil)
  tp ||= file_type(file)

  hi_prob_best, obj =
    case tp
    when 'srg'
      [false, SRFGroup.new(file)]
    when 'bioworks'
      [false, Bioworks.new(file)]
    when 'protproph'
      [true, Proph::ProtSummary.new(file)]
    else
      abort "UNRECOGNIZED file type for #{file}"
    end

  # Guarded so that an object without the accessor is still returned as-is.
  obj.hi_prob_best = hi_prob_best if obj.respond_to?(:hi_prob_best=)
  obj
end
|
110
124
|
|
111
125
|
# Short summary string: the receiver's class name plus the number of
# peptides it holds.
def inspect
  format('<%s #peps="%s">', self.class, peps.size)
end
|
114
128
|
|
115
|
-
#
|
116
|
-
#
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
def top_peps_prefilter!
|
122
|
-
peps.each do |pep|
|
123
|
-
pep.xcorr = pep.xcorr.to_f
|
124
|
-
pep.deltacn = pep.deltacn.to_f
|
125
|
-
pep.deltamass = pep.deltamass.to_f
|
126
|
-
pep.mass = pep.mass.to_f
|
127
|
-
pep.charge = pep.charge.to_f
|
129
|
+
# takes a comma separated list or array and extends the last to create an
|
130
|
+
# array of desired size
|
131
|
+
# Pads (or truncates) a list of arguments to exactly +desired_size+ entries.
#
# arg::          an array, or a comma separated String (split on ',')
# desired_size:: number of entries in the returned array
#
# Positions that are missing (or nil/false) repeat the most recent truthy
# value seen; before any truthy value is seen the first element is used.
def self.extend_args(arg, desired_size)
  values = arg.is_a?(String) ? arg.split(',') : arg
  carry = values[0]
  desired_size.times.map do |idx|
    carry = values[idx] if values[idx]
    carry
  end
end
|
148
|
+
|
149
|
+
# takes an array of proteins, each having peps
|
150
|
+
# peptide grouping is done
|
151
|
+
# by-
|
152
|
+
# the protein with the most unique peptides ends up taking any
|
153
|
+
# degenerate peptides, tie goes to one with most hits total, then the one
|
154
|
+
# that had the top xcorr(s) (before removing any peptides). All other
|
155
|
+
# proteins with identical peptides will lose those peptides. So, the rich
|
156
|
+
# stay rich, and the poor get poorer.
|
157
|
+
# returns an array of triplets where each is [prot, pep_hits,
|
158
|
+
# uniq_aaseqs] (uniq_aaseqs is an array) where the protein contains >= 1
|
159
|
+
# peptide. The internal links (prot.peps and pep.prots) are NOT modified!!
|
160
|
+
# update_prots == true will set each protein with the peptides found
|
161
|
+
# Greedy assignment of peptide sequences to proteins.
#
# array_of_prots:: proteins responding to #peps (each peptide responding to
#                  #aaseq and #xcorr); proteins must be comparable with <=>
# update_prots::   when true, each protein that keeps at least one sequence
#                  gets prot.peps replaced by the hits it kept
#
# Proteins are ranked by [number of unique aaseqs, number of peptide hits,
# ascending-sorted xcorrs, protein] and visited from highest to lowest rank.
# Each protein claims the sequences nobody ranked above it has claimed.
#
# Returns an array of [prot, pep_hits, uniq_aaseqs] triplets, one for every
# protein that claimed at least one sequence.
#
# NOTE(review): the xcorr list is sorted ascending, so on a tie the *lowest*
# xcorrs are compared first -- confirm this matches the "top xcorr(s)" intent.
# NOTE(review): with update_prots, proteins that end up with no sequences
# keep their original peps -- confirm that is intended.
def self.occams_razor(array_of_prots, update_prots=false)
  peps_found = Set.new

  ranked = array_of_prots.map do |prot|
    pps = prot.peps
    peps_by_aaseq = pps.hash_by(:aaseq)
    uniq_aaseqs = Set.new(pps.map { |pep| pep.aaseq })
    xcorrs = pps.map { |pep| pep.xcorr }.sort
    sort_key = [uniq_aaseqs.size, pps.size, xcorrs, prot]
    [sort_key, uniq_aaseqs, peps_by_aaseq]
  end

  # Compare the ranking key only. Comparing the whole entry would fall
  # through to the Set and Hash payloads on a full tie, which are not
  # generally comparable and make the sort raise.
  ranked = ranked.sort { |a, b| a.first <=> b.first }.reverse

  prot_triplets = []
  ranked.each do |(sort_key, uniq_aaseqs, peps_by_aaseq)|
    prot = sort_key.last
    # sequences not yet claimed by a higher ranked protein
    uniq = uniq_aaseqs - peps_found
    next if uniq.empty?

    peps_found.merge(uniq)
    pep_hits = []
    uniq.each { |seq| pep_hits.push(*peps_by_aaseq[seq]) }
    prot_triplets << [prot, pep_hits, uniq.to_a]
    prot.peps = pep_hits if update_prots
  end
  prot_triplets
end
|
186
195
|
|
196
|
+
|
187
197
|
## basically, this is the command line wrapper
|
188
198
|
# Command line wrapper: hands +argv+ to a new Prec runner and returns
# whatever run_cmd_line returns.
def self.precision(argv)
  runner = Prec.new
  runner.run_cmd_line(argv)
end
|
191
201
|
|
192
202
|
|
@@ -197,27 +207,64 @@ class SpecID
|
|
197
207
|
# Delegates to ROC#by_tps on a freshly created ROC object, passing the
# three arguments through unchanged.
def by_tps(classification_method, tp, fp)
  roc = ROC.new
  roc.by_tps(classification_method, tp, fp)
end
|
210
|
+
|
211
|
+
# from the unique set of peptide hits, create a separate peptide hit for
|
212
|
+
# each protein reference where that peptide only references that protein
|
213
|
+
# e.g. pep.prots = [(a single protein)]
|
214
|
+
# Expands every peptide hit into one hit per referenced protein.
#
# Returns a flat array holding, for each peptide in +peps+ and each protein
# in its +prots+, a shallow copy of the peptide whose +prots+ is the
# one-element array [that protein].
#
# The copies are what get returned; the peptides in +peps+ keep their
# original protein lists. (Previously the result of pep.dup was discarded,
# so the original peptide was overwritten and pushed once per protein.)
def pep_prots
  pps = []
  peps.each do |pep|
    pep.prots.each do |prt|
      single = pep.dup
      single.prots = [prt]
      pps << single
    end
  end
  pps
end
|
200
225
|
|
201
226
|
# returns [tp, fp] based on the protein prefix for items where items =
|
202
227
|
# (:prot|:peps)
|
228
|
+
# this may result in a duplication of some peptides if they match both
|
229
|
+
# normal and decoy proteins. In this case, the protein arrays are split,
|
230
|
+
# too, so that each points only to its breed of protein.
|
203
231
|
# Splits proteins or peptides into [tp, fp] by whether the protein
# reference starts with +prefix+.
#
# items::       :prots or :peps
# prefix::      literal prefix (regexp-escaped) looked for at the start of
#               prot.reference
# fp_on_match:: when true a prefix match counts as a false positive,
#               otherwise as a true positive
#
# :prots -- builds a classifying proc and returns classify(items, proc).
# :peps  -- a peptide whose proteins all match (or all fail to match) goes
#           into one list unchanged. A peptide with both kinds is split: the
#           peptide itself keeps only the matching proteins, and a shallow
#           copy carrying only the non-matching proteins goes into the other
#           list.
#
# Any other value of +items+ aborts the process.
#
# Fixes: the split used two arrays that were never filled (the partition
# results were ignored), and the non-matching list received the original
# peptide instead of the copy.
def classify_by_prefix(items, prefix, fp_on_match=true)
  regex = /^#{Regexp.escape(prefix)}/
  case items
  when :prots
    myproc = proc { |prt|
      if prt.reference =~ regex ; !fp_on_match
      else ; fp_on_match end
    }
    return classify(items, myproc)
  when :peps
    match = []
    nomatch = []
    peps.each do |pep|
      hit, nohit = pep.prots.partition { |prot| prot.reference =~ regex }
      if hit.empty?
        nomatch << pep
      elsif nohit.empty?
        match << pep
      else
        # both kinds of protein: split the hit in two
        nomatch_pep = pep.dup
        nomatch_pep.prots = nohit
        pep.prots = hit
        match << pep
        nomatch << nomatch_pep
      end
    end
    return fp_on_match ? [nomatch, match] : [match, nomatch]
  else
    abort "don't recognize #{items.inspect}"
  end
end
|
221
268
|
|
222
269
|
###### THIS GUY IS BAD (and unnecessary) AND SHOULD PROBABLY BE DELETED...
|
223
270
|
# # Returns tp, fp where each is an array of proteins where fp is determined
|
@@ -244,18 +291,6 @@ class SpecID
|
|
244
291
|
[t,f]
|
245
292
|
end
|
246
293
|
|
247
|
-
def peps
|
248
|
-
if @peps ; @peps
|
249
|
-
else @obj.peps
|
250
|
-
end
|
251
|
-
end
|
252
|
-
|
253
|
-
def prots
|
254
|
-
if @prots ; @prots
|
255
|
-
else @obj.prots
|
256
|
-
end
|
257
|
-
end
|
258
|
-
|
259
294
|
# returns two arrays, true positives and false positives (determined by proc
|
260
295
|
# classify_item_by) sorted by proc rank_item_by. Items will be ranked from
|
261
296
|
# lowest to highest based on the return value of rank_item_by. items is a
|
@@ -276,7 +311,7 @@ class SpecID
|
|
276
311
|
# returns a proc for getting all probabilities so that an ascending sort
|
277
312
|
# will put the best scores first
|
278
313
|
def probability_proc
|
279
|
-
if
|
314
|
+
if hi_prob_best
|
280
315
|
get_prob_proc = proc {|prt| prt.probability * -1 }
|
281
316
|
else
|
282
317
|
get_prob_proc = proc {|prt| prt.probability }
|
@@ -328,17 +363,13 @@ class SpecID
|
|
328
363
|
if prt.reference =~ regex ; false
|
329
364
|
else ; true end
|
330
365
|
}
|
366
|
+
|
331
367
|
real_hits, decoy_hits = rank_and_classify(:prots, prob_proc, myproc)
|
332
368
|
|
333
369
|
(num_hits, num_tps, precision) = DecoyROC.new.pred_and_tps_and_ppv(real_hits, decoy_hits)
|
334
370
|
[num_hits, precision]
|
335
371
|
end
|
336
372
|
|
337
|
-
def method_missing(symbol, *args)
|
338
|
-
@obj.send(symbol, *args)
|
339
|
-
end
|
340
|
-
|
341
|
-
|
342
373
|
# # takes the existing spec_id object and marshals it into "file.msh"
|
343
374
|
# # a new file will always look for a file.msh to load
|
344
375
|
# def marshal(force=false)
|
@@ -348,7 +379,14 @@ class SpecID
|
|
348
379
|
# end
|
349
380
|
|
350
381
|
# Returns 'bioworks' if bioworks xml, 'protproph' if Protein prophet
|
382
|
+
# 'srf' if SRF file, 'srg' if search results group file.
|
351
383
|
def self.file_type(file)
|
384
|
+
if file =~ /\.srg$/
|
385
|
+
return 'srg'
|
386
|
+
end
|
387
|
+
if IO.read(file, 7,438) == 'Enzyme:'
|
388
|
+
return 'srf'
|
389
|
+
end
|
352
390
|
File.open(file) do |fh|
|
353
391
|
lines = ""
|
354
392
|
4.times { lines << fh.readline }
|
@@ -397,7 +435,7 @@ class SpecID
|
|
397
435
|
#peptides.each do |pep| print pep.class.to_s + " " end
|
398
436
|
#puts peptides.first.is_a? Array
|
399
437
|
#abort "DFHDFD"
|
400
|
-
peptides.collect{|pep| pep.
|
438
|
+
peptides.collect{|pep| pep.probability }.sort
|
401
439
|
end
|
402
440
|
|
403
441
|
# returns a sorted list of probabilities based on all pepprots (a peptide
|
@@ -477,138 +515,149 @@ class SpecID
|
|
477
515
|
end
|
478
516
|
sorted_probabilities(min_peptides)
|
479
517
|
end
|
480
|
-
|
481
|
-
|
482
|
-
# A Generic spectraID protein
|
483
|
-
class Prot
|
484
|
-
# probability is always a float!
|
485
|
-
attr_accessor :probability, :reference
|
486
|
-
end
|
487
|
-
|
488
|
-
class Pep
|
489
|
-
attr_accessor :probability
|
490
|
-
# full sequence: (<firstAA>.<sequence>.<last>) with '-' for no first
|
491
|
-
# or last.
|
492
|
-
attr_accessor :sequence
|
493
|
-
attr_accessor :charge
|
494
|
-
|
495
|
-
# units can be :mmu, :amu, :ppm
|
496
|
-
def mass_accuracy(pep, unit=:ppm, mono=true)
|
497
|
-
# 10^6 * deltam accuracy/ m[measured]
|
498
|
-
# i.e., theoretical mass 1000, measured 999.9: 100ppm
|
499
|
-
# http://www.waters.com/WatersDivision/ContentD.asp?watersit=EGOO-66LRQD
|
500
|
-
# pep.mass is the theoretical M+H of the peptide
|
501
|
-
# this assumes that the deltacn value we're being told is correct, but I
|
502
|
-
# have my suspicions (since the <mass> value is not accurate...)
|
503
|
-
|
504
|
-
######## TO COMPLETE (and add to spec_id..?)
|
505
|
-
case unit
|
506
|
-
when :ppm
|
507
|
-
when :amu
|
508
|
-
when :mmu
|
509
|
-
end
|
510
|
-
end
|
511
|
-
end
|
512
|
-
|
513
518
|
end
|
514
519
|
|
515
|
-
# I would prefer to call this SpecID::XML, but I keep getting an error:
|
516
|
-
# /home/john/Proteomics/msprot/lib/spec_id/bioworks.rb:412: warning: toplevel
|
517
|
-
# constant XML referenced by SpecID::XML' This works around that for now.
|
518
|
-
# Any major xml elements should return a newline at the end for simple
|
519
|
-
# concatenation into a file
|
520
|
-
module SpecIDXML
|
521
|
-
|
522
|
-
Special_chrs_hash = {
|
523
|
-
'"' => '"',
|
524
|
-
'&' => '&',
|
525
|
-
"'" => ''',
|
526
|
-
'<' => '<',
|
527
|
-
'>' => '>',
|
528
|
-
}
|
529
|
-
|
530
|
-
# substitutes special xml chars
|
531
|
-
def escape_special_chars(string)
|
532
|
-
string.split('').map do |char|
|
533
|
-
if Special_chrs_hash.key? char ; Special_chrs_hash[char]
|
534
|
-
# if x = Special_chrs_hash[char] ; x # <-- that's slightly slower
|
535
|
-
else ; char end
|
536
|
-
end.join
|
537
|
-
end
|
538
|
-
|
539
|
-
$DEPTH = 0
|
540
|
-
|
541
|
-
def tabs
|
542
|
-
# this is ugly
|
543
|
-
string = ""
|
544
|
-
$DEPTH.times { string << "\t" }
|
545
|
-
string
|
546
|
-
end
|
547
520
|
|
521
|
+
# A Generic spectraID protein
|
522
|
+
# Mixin for protein objects: a probability, a reference string and the
# list of peptide hits. Ordering is by reference.
module SpecID::Prot
  # probability is always a float!
  attr_accessor :probability
  attr_accessor :reference
  attr_accessor :peps

  # Orders proteins by their reference.
  def <=>(other)
    reference <=> other.reference
  end
end
|
558
531
|
|
559
|
-
|
560
|
-
|
532
|
+
module SpecID::Pep
|
533
|
+
|
534
|
+
Non_standard_amino_acid_char_re = /[^A-Z\.\-]/
|
535
|
+
|
536
|
+
attr_accessor :prots
|
537
|
+
attr_accessor :probability
|
538
|
+
# full sequence: (<firstAA>.<sequence>.<last>) with '-' for no first
|
539
|
+
# or last.
|
540
|
+
attr_accessor :sequence
|
541
|
+
|
542
|
+
# the basic amino acid sequence (no leading or trailing '.' or amino acids)
|
543
|
+
# should not contain any special symbols, etc.
|
544
|
+
attr_accessor :aaseq
|
545
|
+
attr_accessor :charge
|
546
|
+
|
547
|
+
# removes nonstandard chars with Non_standard_amino_acid_char_re
|
548
|
+
# preserves A-Z and '.' and '-'
|
549
|
+
# Returns a copy of +sequence+ with every character matched by
# Non_standard_amino_acid_char_re deleted.
def self.remove_non_amino_acids(sequence)
  cleaned = sequence.gsub(Non_standard_amino_acid_char_re, '')
  cleaned
end
|
552
|
+
|
553
|
+
# remove_non_amino_acids && split_sequence
|
554
|
+
# Cleans +val+ with remove_non_amino_acids and returns the result of
# split_sequence on the cleaned string.
def self.prepare_sequence(val)
  split_sequence(remove_non_amino_acids(val))
end
|
558
|
+
|
559
|
+
# Orders peptides by their bare amino acid sequence (aaseq).
def <=>(other)
  mine = aaseq
  theirs = other.aaseq
  mine <=> theirs
end
|
562
|
+
|
563
|
+
# Returns prev, peptide, next from sequence. Parse errors return
|
564
|
+
# nil,nil,nil
|
565
|
+
# R.PEPTIDE.A # -> R, PEPTIDE, A
|
566
|
+
# R.PEPTIDE.- # -> R, PEPTIDE, -
|
567
|
+
# PEPTIDE.A # -> -, PEPTIDE, A
|
568
|
+
# A.PEPTIDE # -> A, PEPTIDE, -
|
569
|
+
# PEPTIDE # -> nil,nil,nil
|
570
|
+
# Splits a flanked peptide string on '.' into [prev_aa, peptide, next_aa].
#
#   R.PEPTIDE.A   # -> R, PEPTIDE, A
#   PEPTIDE.A     # -> -, PEPTIDE, A   (first piece longer than one char)
#   A.PEPTIDE     # -> A, PEPTIDE, -
#
# Anything that does not split into two or three pieces is a parse error
# and yields [nil, nil, nil]. This now includes inputs with more than three
# pieces, which used to return three empty strings.
def self.split_sequence(val)
  pieces = val.split('.')
  case pieces.size
  when 3
    pieces
  when 2
    if pieces[0].size > 1 ## N termini
      ['-', pieces[0], pieces[1]]
    else ## C termini
      [pieces[0], pieces[1], '-']
    end
  else ## this must be a parse error!
    [nil, nil, nil]
  end
end
|
589
|
+
|
590
|
+
##
|
591
|
+
# Extracts the bare peptide from a (possibly flanked) sequence string.
#
# The string is first cleaned with remove_non_amino_acids and split on '.'.
# Three pieces: the middle one. Two pieces: the first when it is longer
# than one character, otherwise the second. One piece: that piece itself.
# Any other number of pieces aborts the process.
def self.sequence_to_aaseq(sequence)
  parts = remove_non_amino_acids(sequence).split('.')
  count = parts.size
  return parts[1] if count == 3
  return (parts[0].size > 1 ? parts[0] : parts[1]) if count == 2
  return parts[0] if count == 1 ## which is the peptide itself

  abort "bad peptide sequence: #{sequence}"
end
|
562
609
|
|
563
|
-
|
564
|
-
|
565
|
-
|
610
|
+
# This will rapidly determine the list of proteins for which given
|
611
|
+
# peptides belong. It is meant to be low level and fast (eventually),
|
612
|
+
# so it asks for the data in a format amenable to this.
|
613
|
+
# returns a mirror array where each entry is an array of Fasta::Prot
|
614
|
+
# objects where each protein contains the sequence
|
615
|
+
# For every peptide string, collects the proteins whose sequence contains it.
#
# peptide_strings_list:: array of peptide sequence strings
# fasta_obj::            object whose #prots returns proteins responding
#                        to #aaseq
#
# Returns an array parallel to peptide_strings_list; each entry is the array
# of proteins (in fasta order) whose aaseq includes that peptide string.
def self.protein_groups_by_sequence(peptide_strings_list, fasta_obj)
  # read each protein sequence once, up front
  seq_and_prot = fasta_obj.prots.map { |prot| [prot.aaseq, prot] }

  peptide_strings_list.map do |pep_seq|
    containing = seq_and_prot.select { |prot_seq, _prot| prot_seq.include? pep_seq }
    containing.map { |_prot_seq, prot| prot }
  end
end
|
582
635
|
|
583
|
-
#
|
584
|
-
def
|
636
|
+
# units can be :mmu, :amu, :ppm
|
637
|
+
# Unfinished placeholder for a mass accuracy calculation; currently
# returns nil for every unit.
#
# unit:: intended to be one of :mmu, :amu, :ppm
#
# Original design notes:
#   10^6 * deltam accuracy / m[measured]
#   i.e., theoretical mass 1000, measured 999.9: 100ppm
#   http://www.waters.com/WatersDivision/ContentD.asp?watersit=EGOO-66LRQD
#   pep.mass is the theoretical M+H of the peptide
#   this assumes that the deltacn value we're being told is correct, but I
#   have my suspicions (since the <mass> value is not accurate...)
def mass_accuracy(pep, unit=:ppm, mono=true)
  ######## TO COMPLETE (and add to spec_id..?)
  case unit
  when :ppm, :amu, :mmu
    nil
  end
end
|
652
|
+
end
|
592
653
|
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
start = "#{tabs}<#{element} #{att_string}>\n"
|
597
|
-
$DEPTH += 1
|
598
|
-
if block_given? ; middle = yield else ; middle = '' end
|
599
|
-
$DEPTH -= 1
|
600
|
-
start + middle + "#{tabs}</#{element}>\n"
|
601
|
-
end
|
654
|
+
# Plain protein class that mixes in SpecID::Prot and adds nothing else.
class SpecID::GenericProt ; include SpecID::Prot ; end
|
602
657
|
|
603
|
-
|
604
|
-
|
605
|
-
|
658
|
+
# Plain peptide class that mixes in SpecID::Pep and adds nothing else.
class SpecID::GenericPep ; include SpecID::Pep ; end
|
606
661
|
|
607
|
-
def attrs_xml(list_of_symbols)
|
608
|
-
list_of_symbols.collect {|sy|
|
609
|
-
attr_xml(sy)
|
610
|
-
}.join(" ")
|
611
|
-
end
|
612
662
|
|
613
|
-
end
|
614
663
|
|