mspire 0.1.7 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (57) hide show
  1. data/Rakefile +41 -14
  2. data/bin/bioworks2excel.rb +1 -1
  3. data/bin/bioworks_to_pepxml.rb +46 -59
  4. data/bin/fasta_shaker.rb +1 -1
  5. data/bin/filter.rb +6 -0
  6. data/bin/find_aa_freq.rb +23 -0
  7. data/bin/id_precision.rb +3 -2
  8. data/bin/mzxml_to_lmat.rb +2 -1
  9. data/bin/pepproph_filter.rb +1 -1
  10. data/bin/precision.rb +1 -1
  11. data/bin/protein_summary.rb +2 -451
  12. data/bin/raw_to_mzXML.rb +55 -0
  13. data/bin/srf_group.rb +26 -0
  14. data/changelog.txt +7 -0
  15. data/lib/align.rb +3 -3
  16. data/lib/fasta.rb +6 -1
  17. data/lib/gi.rb +9 -4
  18. data/lib/roc.rb +2 -0
  19. data/lib/sample_enzyme.rb +2 -1
  20. data/lib/spec/mzxml/parser.rb +2 -43
  21. data/lib/spec/mzxml.rb +65 -2
  22. data/lib/spec_id/aa_freqs.rb +10 -7
  23. data/lib/spec_id/bioworks.rb +67 -87
  24. data/lib/spec_id/filter.rb +794 -0
  25. data/lib/spec_id/precision.rb +29 -36
  26. data/lib/spec_id/proph.rb +5 -3
  27. data/lib/spec_id/protein_summary.rb +459 -0
  28. data/lib/spec_id/sequest.rb +323 -271
  29. data/lib/spec_id/srf.rb +189 -135
  30. data/lib/spec_id.rb +276 -227
  31. data/lib/spec_id_xml.rb +101 -0
  32. data/lib/toppred.rb +18 -0
  33. data/script/degenerate_peptides.rb +47 -0
  34. data/script/filter-peps.rb +5 -1
  35. data/test/tc_align.rb +1 -1
  36. data/test/tc_bioworks.rb +25 -22
  37. data/test/tc_bioworks_to_pepxml.rb +37 -4
  38. data/test/tc_fasta.rb +3 -1
  39. data/test/tc_fasta_shaker.rb +8 -6
  40. data/test/tc_filter.rb +203 -0
  41. data/test/tc_gi.rb +6 -9
  42. data/test/tc_id_precision.rb +31 -0
  43. data/test/tc_mzxml.rb +8 -6
  44. data/test/tc_peptide_parent_times.rb +2 -1
  45. data/test/tc_precision.rb +1 -1
  46. data/test/tc_proph.rb +5 -5
  47. data/test/tc_protein_summary.rb +36 -13
  48. data/test/tc_sequest.rb +78 -33
  49. data/test/tc_spec_id.rb +128 -6
  50. data/test/tc_srf.rb +84 -38
  51. metadata +67 -62
  52. data/bin/fasta_cat.rb +0 -39
  53. data/bin/fasta_cat_mod.rb +0 -59
  54. data/bin/fasta_mod.rb +0 -57
  55. data/bin/filter_spec_id.rb +0 -365
  56. data/bin/raw2mzXML.rb +0 -21
  57. data/script/gen_database_searching.rb +0 -258
data/lib/spec_id.rb CHANGED
@@ -1,14 +1,15 @@
1
1
  require 'ostruct'
2
-
3
- class SampleEnzyme ; end
4
-
2
+ require 'set'
3
+ require 'hash_by'
4
+ require 'spec_id/precision'
5
5
  require 'roc'
6
6
  require 'sample_enzyme' # for others
7
7
  require 'spec_id/bioworks'
8
8
  require 'spec_id/sequest'
9
9
  require 'spec_id/proph'
10
- require 'spec_id/precision'
10
+ require 'spec_id_xml'
11
11
 
12
+ class SampleEnzyme ; end
12
13
 
13
14
  class Mass
14
15
  # http://expasy.org/tools/findmod/findmod_masses.html
@@ -70,123 +71,132 @@ class Mass
70
71
  }
71
72
  end
72
73
 
73
- class SpecID
74
+ module SpecID ; end
75
+
76
+ class GenericSpecID ; include SpecID ; end
77
+
78
+ module SpecID
74
79
  MONO = Mass::MONO
75
80
  AVG = Mass::AVG
76
81
 
77
- attr_accessor :obj
78
- attr_writer :peps, :prots
82
+ attr_accessor :peps, :prots
79
83
  # True if a high protein/peptide score is better than low, false otherwise
80
84
  # This is set automatically for known file types
81
85
  attr_accessor :hi_prob_best
82
86
 
87
+ # A relative pathname of the file the specid object is derived from
88
+ attr_accessor :filename
89
+
83
90
  # tp = file_type
84
- def initialize(file=nil, tp=nil)
85
- @obj = nil
86
- @peps = nil
87
- @prots = nil
88
- @hi_prob_best = nil
89
- if file
91
+ # Will return a SpecID object (really, the object corresponding to the
92
+ # file type which mixes in SpecID [is_a?(SpecID) == true])
93
+ # If no file is given, will return a GenericSpecID object.
94
+ def self.new(file=nil, tp=nil)
95
+ if file
90
96
  from_file(file, tp)
97
+ else
98
+ GenericSpecID.new
91
99
  end
92
100
  end
93
101
 
94
102
  # tp = file_type
95
- def from_file(file, tp=nil)
103
+ # only takes an array if they are srf files!
104
+ def self.from_file(file, tp=nil)
105
+ obj = nil
96
106
  unless tp
97
- tp = self.class.file_type(file)
107
+ tp = file_type(file)
98
108
  end
99
- case tp
109
+ obj = case tp
110
+ when 'srg'
111
+ @hi_prob_best = false
112
+ SRFGroup.new(file)
100
113
  when 'bioworks'
101
- @obj = SpecID::Bioworks.new(file)
102
114
  @hi_prob_best = false
115
+ Bioworks.new(file)
103
116
  when 'protproph'
104
- @obj = SpecID::Proph::ProtSummary.new(file)
105
117
  @hi_prob_best = true
118
+ Proph::ProtSummary.new(file)
106
119
  else
107
120
  abort "UNRECOGNIZED file type for #{file}"
108
121
  end
122
+ obj
109
123
  end
110
124
 
111
125
  def inspect
112
126
  "<#{self.class} #peps=\"#{peps.size}\">"
113
127
  end
114
128
 
115
- # returns the top peptide hits per file dta (first_scan + charge)
116
- # all hits with same score as top score are returned
117
- # assumes that all fields are strings...
118
- # converts xcorr, deltacn, deltamass, mass, and charge into numerical types
119
- # deletes the protein array (but not relevant proteins)
120
- # hashes on [pep.basename, pep.first_scan.to_i, pep.charge.to_i]
121
- def top_peps_prefilter!
122
- peps.each do |pep|
123
- pep.xcorr = pep.xcorr.to_f
124
- pep.deltacn = pep.deltacn.to_f
125
- pep.deltamass = pep.deltamass.to_f
126
- pep.mass = pep.mass.to_f
127
- pep.charge = pep.charge.to_f
129
+ # takes a comma separated list or array and extends the last to create an
130
+ # array of desired size
131
+ def self.extend_args(arg, desired_size)
132
+ arg_arr = arg
133
+ if arg.is_a? String
134
+ arg_arr = arg.split(',')
128
135
  end
129
- # get the top peptide by firstscan/charge (equivalent to .out files)
130
- top_peps = []
131
- #self.peps.hash_by {|pep| [pep.base_name, pep.first_scan.to_i, pep.charge.to_i]}.values.map do |v|
132
- self.peps.hash_by {|pep| [SpecID::Sequest::PepXML::SearchHit.split_sequence(pep.sequence)[1], pep.charge.to_i]}.values.map do |v|
133
- best_to_worst = v.sort_by {|pep| pep.xcorr}.reverse
134
- top_score = best_to_worst.first.xcorr
135
- best_to_worst.each do |pep|
136
- if pep.xcorr == top_score
137
- top_peps << pep
138
- else ; break
139
- end
136
+ new_arr = []
137
+ last_arg = arg_arr[0]
138
+ desired_size.times do |i|
139
+ if arg_arr[i]
140
+ new_arr[i] = arg_arr[i]
141
+ last_arg = new_arr[i]
142
+ else
143
+ new_arr[i] = last_arg
140
144
  end
141
145
  end
142
- @peps = top_peps
143
- end
144
-
145
-
146
- # when kind == :common ; xcorr1, xcorr2, xcorr3, deltacn, rough_ppm
147
- # interface very unstable. For now, keeping it very loose...
148
- # assumed that peptide xcorr, deltacn, deltamass, mass are Floats
149
- # assumed that peptide charge is Integer
150
- # returns prots
151
- # must respond to 'peps'
152
- def filter(kind, *args)
153
- case kind
154
- when :common
155
- (x1, x2, x3, deltacn, rough_ppm) = args
156
- # returns num proteins
157
- peps_passed = self.peps.select do |pep|
158
- # have to add the upper limit to deltacn because the lowest score is often
159
- # assigned a 1.10 in bioworks!
160
- pep_deltacn = pep.deltacn
161
- pep_charge = pep.charge
162
- (pep_deltacn >= deltacn && pep_deltacn <= 1.0) and
163
- #truth = (pep_deltacn >= deltacn) and
164
- (
165
- (pep_charge == 1 && pep.xcorr >= x1) or
166
- (pep_charge == 2 && pep.xcorr >= x2) or
167
- (pep_charge == 3 && pep.xcorr >= x3)
168
- ) and
169
- ((1.0e6 * (pep.deltamass.abs/pep.mass)) <= rough_ppm)
170
- end
171
-
172
- #deltacnstar_cnt = peps_passed.select{|v| v.deltacn > 1.0}.size
173
-
174
- hash = peps_passed.hash_by(:prot)
175
-
176
- prots_passed = hash.map do |prot,pep_arr|
177
- prot.peps = pep_arr
178
- prot
146
+ new_arr
147
+ end
148
+
149
+ # takes an array of proteins, each having peps
150
+ # peptide grouping is done
151
+ # by-
152
+ # the protein with the most unique peptides ends up taking any
153
+ # degenerate peptides, tie goes to one with most hits total, then the one
154
+ # that had the top xcorr(s) (before removing any peptides). All other
155
+ # proteins with identical peptides will lose those peptides. So, the rich
156
+ # stay rich, and the poor get poorer.
157
+ # returns an array of triplets where each is [prot, pep_hits,
158
+ # uniq_aaseqs] (uniq_aaseqs is an array) where the protein contains >= 1
159
+ # peptide. The internal links (prot.peps and pep.prots) are NOT modified!!
160
+ # update_prots == true will set each protein with the peptides found
161
# Assigns each unique peptide sequence (aaseq) to exactly one protein.
#
# Proteins are ranked best-first by:
#   1. number of unique peptide sequences,
#   2. total number of peptide hits,
#   3. their xcorr values compared from the highest downwards.
# Walking that ranking, each protein claims every aaseq that no
# better-ranked protein has claimed yet. Proteins left with no
# unclaimed aaseq are dropped from the result.
#
# array_of_prots:: objects responding to +peps+ (and +peps=+ when
#                  update_prots is true); each pep responds to +aaseq+
#                  and +xcorr+.
# update_prots::   when true, each returned protein's +peps+ is replaced
#                  by the hits it kept; otherwise nothing is modified.
#
# Returns an array of [prot, pep_hits, uniq_aaseqs] triplets, best-ranked
# protein first.
def self.occams_razor(array_of_prots, update_prots=false)
  peps_found = Set.new

  ranked = array_of_prots.map do |prot|
    pps = prot.peps
    peps_by_aaseq = pps.hash_by(:aaseq)
    uniq_aaseqs = Set.new(pps.map {|pep| pep.aaseq })
    # descending, so that array comparison looks at the top xcorr first
    top_xcorrs = pps.map {|pep| pep.xcorr }.sort.reverse
    #  0                 1         2           3     4            5
    [uniq_aaseqs.size, pps.size, top_xcorrs, prot, uniq_aaseqs, peps_by_aaseq]
  end

  prot_triplets = []
  ranked.sort.reverse.each do |entry|
    prot = entry[3]
    # sequences not yet claimed by a better-ranked protein
    unclaimed = entry[4] - (peps_found & entry[4])
    next if unclaimed.size == 0

    peps_found.merge(unclaimed)
    pep_hits = []
    unclaimed.each do |seq|
      pep_hits.push(*(entry[5][seq]))
    end
    prot_triplets << [prot, pep_hits, unclaimed.to_a]
    prot.peps = pep_hits if update_prots
  end
  prot_triplets
end
186
195
 
196
+
187
197
  ## basically, this is the command line wrapper
188
198
  def self.precision(argv)
189
- SpecID::Precision.new.run_cmd_line(argv)
199
+ Prec.new.run_cmd_line(argv)
190
200
  end
191
201
 
192
202
 
@@ -197,27 +207,64 @@ class SpecID
197
207
  def by_tps(classification_method, tp, fp)
198
208
  ROC.new.by_tps(classification_method, tp, fp)
199
209
  end
210
+
211
+ # from the unique set of peptide hits, create a separate peptide hit for
212
+ # each protein reference where that peptide only references that protein
213
+ # e.g. pep.prots = [(a single protein)]
214
# Expands every peptide hit into one hit per protein it references.
#
# For each peptide in +peps+ and each protein in that peptide's +prots+,
# a shallow copy (dup) of the peptide is made whose +prots+ is a
# one-element array holding just that protein. The peptides in +peps+
# themselves are left unchanged.
#
# Returns a flat array of the copies, in peptide order and then protein
# order.
def pep_prots
  pps = []
  peps.each do |pep|
    pep.prots.each do |prt|
      # each entry must be its own object; reassigning prots on the shared
      # peptide would leave every entry pointing at the last protein
      single = pep.dup
      single.prots = [prt]
      pps << single
    end
  end
  pps
end
200
225
 
201
226
  # returns [tp, fp] based on the protein prefix for items where items =
202
227
  # (:prot|:peps)
228
+ # this may result in a duplication of some peptides if they match both
229
+ # normal and decoy proteins. In this case, the protein arrays are split,
230
+ # too, so that each points only to its breed of protein.
203
231
  def classify_by_prefix(items, prefix, fp_on_match=true)
204
232
  regex = /^#{Regexp.escape(prefix)}/
205
- myproc = case items
233
+ case items
206
234
  when :prots
207
- proc { |prt|
235
+ myproc = proc { |prt|
208
236
  if prt.reference =~ regex ; !fp_on_match
209
237
  else ; fp_on_match end
210
238
  }
239
+ return classify(items, myproc)
211
240
  when :peps
212
- proc { |pep|
213
- if pep.prot.reference =~ regex ; !fp_on_match
214
- else ; fp_on_match end
215
- }
241
+ match = [] ; nomatch = []
242
+ peps.each do |pep|
243
+ match_prots = [] ; nomatch_prots = []
244
+ (hit, nohit) = pep.prots.partition do |prot|
245
+ prot.reference =~ regex
246
+ end
247
+ if hit.size == 0
248
+ nomatch << pep
249
+ elsif nohit.size == 0
250
+ match << pep
251
+ else ## both have hits
252
+ pep.prots = match_prots
253
+ nomatch_pep = pep.dup
254
+ nomatch_pep.prots = nomatch_prots
255
+ match << pep
256
+ nomatch << pep
257
+ end
258
+ end
259
+ if fp_on_match
260
+ return [nomatch , match]
261
+ else
262
+ return [match, nomatch]
263
+ end
216
264
  else
217
- abort "no go"
265
+ abort "don't recognize "
218
266
  end
219
- classify(items, myproc)
220
- end
267
+ end
221
268
 
222
269
  ###### THIS GUY IS BAD (and unnecessary) AND SHOULD PROBABLY BE DELETED...
223
270
  # # Returns tp, fp where each is an array of proteins where fp is determined
@@ -244,18 +291,6 @@ class SpecID
244
291
  [t,f]
245
292
  end
246
293
 
247
- def peps
248
- if @peps ; @peps
249
- else @obj.peps
250
- end
251
- end
252
-
253
- def prots
254
- if @prots ; @prots
255
- else @obj.prots
256
- end
257
- end
258
-
259
294
  # returns two arrays, true positives and false positives (determined by proc
260
295
  # classify_item_by) sorted by proc rank_item_by. Items will be ranked from
261
296
  # lowest to highest based on the return value of rank_item_by. items is a
@@ -276,7 +311,7 @@ class SpecID
276
311
  # returns a proc for getting all probabilities so that an ascending sort
277
312
  # will put the best scores first
278
313
  def probability_proc
279
- if @hi_prob_best
314
+ if hi_prob_best
280
315
  get_prob_proc = proc {|prt| prt.probability * -1 }
281
316
  else
282
317
  get_prob_proc = proc {|prt| prt.probability }
@@ -328,17 +363,13 @@ class SpecID
328
363
  if prt.reference =~ regex ; false
329
364
  else ; true end
330
365
  }
366
+
331
367
  real_hits, decoy_hits = rank_and_classify(:prots, prob_proc, myproc)
332
368
 
333
369
  (num_hits, num_tps, precision) = DecoyROC.new.pred_and_tps_and_ppv(real_hits, decoy_hits)
334
370
  [num_hits, precision]
335
371
  end
336
372
 
337
- def method_missing(symbol, *args)
338
- @obj.send(symbol, *args)
339
- end
340
-
341
-
342
373
  # # takes the existing spec_id object and marshals it into "file.msh"
343
374
  # # a new file will always look for a file.msh to load
344
375
  # def marshal(force=false)
@@ -348,7 +379,14 @@ class SpecID
348
379
  # end
349
380
 
350
381
  # Returns 'bioworks' if bioworks xml, 'protproph' if Protein prophet
382
+ # 'srf' if SRF file, 'srg' if search results group file.
351
383
  def self.file_type(file)
384
+ if file =~ /\.srg$/
385
+ return 'srg'
386
+ end
387
+ if IO.read(file, 7,438) == 'Enzyme:'
388
+ return 'srf'
389
+ end
352
390
  File.open(file) do |fh|
353
391
  lines = ""
354
392
  4.times { lines << fh.readline }
@@ -397,7 +435,7 @@ class SpecID
397
435
  #peptides.each do |pep| print pep.class.to_s + " " end
398
436
  #puts peptides.first.is_a? Array
399
437
  #abort "DFHDFD"
400
- peptides.collect{|pep| pep.peptide_probability }.sort
438
+ peptides.collect{|pep| pep.probability }.sort
401
439
  end
402
440
 
403
441
  # returns a sorted lists of probabilities based on all pepprots (a peptide
@@ -477,138 +515,149 @@ class SpecID
477
515
  end
478
516
  sorted_probabilities(min_peptides)
479
517
  end
480
-
481
-
482
- # A Generic spectraID protein
483
- class Prot
484
- # probability is always a float!
485
- attr_accessor :probability, :reference
486
- end
487
-
488
- class Pep
489
- attr_accessor :probability
490
- # full sequence: (<firstAA>.<sequence>.<last>) with '-' for no first
491
- # or last.
492
- attr_accessor :sequence
493
- attr_accessor :charge
494
-
495
- # units can be :mmu, :amu, :ppm
496
- def mass_accuracy(pep, unit=:ppm, mono=true)
497
- # 10^6 * deltam accuracy/ m[measured]
498
- # i.e., theoretical mass 1000, measured 999.9: 100ppm
499
- # http://www.waters.com/WatersDivision/ContentD.asp?watersit=EGOO-66LRQD
500
- # pep.mass is the theoretical M+H of the peptide
501
- # this assumes that the deltacn value we're being told is correct, but I
502
- # have my suspicions (since the <mass> value is not accurate...)
503
-
504
- ######## TO COMPLETE (and add to spec_id..?)
505
- case unit
506
- when :ppm
507
- when :amu
508
- when :mmu
509
- end
510
- end
511
- end
512
-
513
518
  end
514
519
 
515
- # I would prefer to call this SpecID::XML, but I keep getting an error:
516
- # /home/john/Proteomics/msprot/lib/spec_id/bioworks.rb:412: warning: toplevel
517
- # constant XML referenced by SpecID::XML' This works around that for now.
518
- # Any major xml elements should return a newline at the end for simple
519
- # concatenation into a file
520
- module SpecIDXML
521
-
522
- Special_chrs_hash = {
523
- '"' => '&quot;',
524
- '&' => '&amp;',
525
- "'" => '&apos;',
526
- '<' => '&lt;',
527
- '>' => '&gt;',
528
- }
529
-
530
- # substitutes special xml chars
531
- def escape_special_chars(string)
532
- string.split('').map do |char|
533
- if Special_chrs_hash.key? char ; Special_chrs_hash[char]
534
- # if x = Special_chrs_hash[char] ; x # <-- that's slightly slower
535
- else ; char end
536
- end.join
537
- end
538
-
539
- $DEPTH = 0
540
-
541
- def tabs
542
- # this is ugly
543
- string = ""
544
- $DEPTH.times { string << "\t" }
545
- string
546
- end
547
520
 
521
+ # A Generic spectraID protein
522
+ module SpecID::Prot
523
+ # probability is always a float!
524
+ attr_accessor :probability, :reference, :peps
548
525
 
549
- def param_xml(symbol)
550
- tabs + '<parameter name="' + "#{symbol}" + '" value="' + "#{send(symbol)}" + '"/>'
526
+ def <=> (other)
527
+ self.reference <=> other.reference
551
528
  end
552
529
 
553
- def params_xml(*symbol_list)
554
- symbol_list.collect { |sy|
555
- param_xml(sy)
556
- }.join("\n") + "\n"
557
- end
530
+ end
558
531
 
559
- def short_element_xml(element, att_list)
560
- "#{tabs}<#{element} #{attrs_xml(att_list)}/>\n"
532
+ module SpecID::Pep
533
+
534
+ Non_standard_amino_acid_char_re = /[^A-Z\.\-]/
535
+
536
+ attr_accessor :prots
537
+ attr_accessor :probability
538
+ # full sequence: (<firstAA>.<sequence>.<last>) with '-' for no first
539
+ # or last.
540
+ attr_accessor :sequence
541
+
542
+ # the basic amino acid sequence (no leading or trailing '.' or amino acids)
543
+ # should not contain any special symbols, etc.
544
+ attr_accessor :aaseq
545
+ attr_accessor :charge
546
+
547
+ # removes nonstandard chars with Non_standard_amino_acid_char_re
548
+ # preserves A-Z and '.' and '-'
549
+ def self.remove_non_amino_acids(sequence)
550
+ sequence.gsub(Non_standard_amino_acid_char_re, '')
551
+ end
552
+
553
+ # remove_non_amino_acids && split_sequence
554
+ def self.prepare_sequence(val)
555
+ nv = remove_non_amino_acids(val)
556
+ split_sequence(nv)
557
+ end
558
+
559
+ def <=>(other)
560
+ aaseq <=> other.aaseq
561
+ end
562
+
563
+ # Returns prev, peptide, next from sequence. Parse errors return
564
+ # nil,nil,nil
565
+ # R.PEPTIDE.A # -> R, PEPTIDE, A
566
+ # R.PEPTIDE.- # -> R, PEPTIDE, -
567
+ # PEPTIDE.A # -> -, PEPTIDE, A
568
+ # A.PEPTIDE # -> A, PEPTIDE, -
569
+ # PEPTIDE # -> nil,nil,nil
570
# Splits a '.'-delimited peptide string into [prev_aa, peptide, next_aa].
#
#   R.PEPTIDE.A   # -> R, PEPTIDE, A
#   R.PEPTIDE.-   # -> R, PEPTIDE, -
#   PEPTIDE.A     # -> -, PEPTIDE, A
#   A.PEPTIDE     # -> A, PEPTIDE, -
#
# With two pieces, a first piece longer than one character is taken to be
# the peptide itself (no previous residue, so '-'); otherwise the first
# piece is the previous residue and the next residue is '-'.
#
# Anything that does not split into two or three pieces is a parse error
# and returns [nil, nil, nil].
def self.split_sequence(val)
  pieces = val.split('.')
  case pieces.size
  when 3
    pieces
  when 2
    if pieces[0].size > 1 ## N termini
      ['-', pieces[0], pieces[1]]
    else ## C termini
      [pieces[0], pieces[1], '-']
    end
  else ## zero, one, or more than three pieces: parse error
    [nil, nil, nil]
  end
end
589
+
590
+ ##
591
+ def self.sequence_to_aaseq(sequence)
592
+ after_removed = remove_non_amino_acids(sequence)
593
+ pieces = after_removed.split('.')
594
+ case pieces.size
595
+ when 3
596
+ pieces[1]
597
+ when 2
598
+ if pieces[0].size > 1 ## N termini
599
+ pieces[0]
600
+ else ## C termini
601
+ pieces[1]
602
+ end
603
+ when 1 ## this must be a parse error!
604
+ pieces[0] ## which is the peptide itself
605
+ else
606
+ abort "bad peptide sequence: #{sequence}"
607
+ end
561
608
  end
562
609
 
563
- def short_element_xml_and_att_string(element, att_string)
564
- "#{tabs}<#{element} #{att_string}/>\n"
565
- end
610
+ # This will rapidly determine the list of proteins for which given
611
+ # peptides belong. It is meant to be low level and fast (eventually),
612
+ # so it asks for the data in a format amenable to this.
613
+ # returns a mirror array where each entry is an array of Fasta::Prot
614
+ # objects where each protein contains the sequence
615
+ def self.protein_groups_by_sequence(peptide_strings_list, fasta_obj)
616
+ prots = fasta_obj.prots
617
+ prot_seqs = prots.map do |prot|
618
+ prot.aaseq
619
+ end
566
620
 
567
- # requires that obj have attribute '@xml_element_name'
568
- # displays all *instance_variables* (does not call methods!)
569
- def short_element_xml_from_instance_vars(element_name)
570
- string = instance_variables.map{|v| "#{v[1..-1]}=\"#{instance_variable_get(v)}\"" }.join(' ')
571
- "#{tabs}<#{element_name} #{string}/>\n"
572
- end
621
+ groups = peptide_strings_list.map do |pep_seq|
622
+ prot_index = 0
623
+ protein_group = []
624
+ prot_seqs.each do |prot_seq|
625
+ if prot_seq.include? pep_seq
626
+ protein_group << prots[prot_index]
627
+ end
628
+ prot_index += 1
629
+ end
630
+ protein_group
631
+ end
573
632
 
574
- # takes an element as a symbol and returns the
575
- def element_xml_no_atts(element)
576
- start = "#{tabs}<#{element}>\n"
577
- $DEPTH += 1
578
- if block_given? ; middle = yield else ; middle = '' end
579
- $DEPTH -= 1
580
- start + middle + "#{tabs}</#{element}>\n"
633
+ groups
581
634
  end
582
635
 
583
- # takes an element as a symbol and returns the
584
- def element_xml(element, att_list)
636
+ # units can be :mmu, :amu, :ppm
637
+ def mass_accuracy(pep, unit=:ppm, mono=true)
638
+ # 10^6 * deltam accuracy/ m[measured]
639
+ # i.e., theoretical mass 1000, measured 999.9: 100ppm
640
+ # http://www.waters.com/WatersDivision/ContentD.asp?watersit=EGOO-66LRQD
641
+ # pep.mass is the theoretical M+H of the peptide
642
+ # this assumes that the deltacn value we're being told is correct, but I
643
+ # have my suspicions (since the <mass> value is not accurate...)
585
644
 
586
- start = "#{tabs}<#{element} #{attrs_xml(att_list)}>\n"
587
- $DEPTH += 1
588
- if block_given? ; middle = yield else ; middle = '' end
589
- $DEPTH -= 1
590
- start + middle + "#{tabs}</#{element}>\n"
645
+ ######## TO COMPLETE (and add to spec_id..?)
646
+ case unit
647
+ when :ppm
648
+ when :amu
649
+ when :mmu
650
+ end
591
651
  end
652
+ end
592
653
 
593
- # element as symbol and att_string as attributes
594
- # takes a block of whatever
595
- def element_xml_and_att_string(element, att_string)
596
- start = "#{tabs}<#{element} #{att_string}>\n"
597
- $DEPTH += 1
598
- if block_given? ; middle = yield else ; middle = '' end
599
- $DEPTH -= 1
600
- start + middle + "#{tabs}</#{element}>\n"
601
- end
654
+ class SpecID::GenericProt
655
+ include SpecID::Prot
656
+ end
602
657
 
603
- def attr_xml(symbol)
604
- "#{symbol}=\"#{send(symbol)}\""
605
- end
658
+ class SpecID::GenericPep
659
+ include SpecID::Pep
660
+ end
606
661
 
607
- def attrs_xml(list_of_symbols)
608
- list_of_symbols.collect {|sy|
609
- attr_xml(sy)
610
- }.join(" ")
611
- end
612
662
 
613
- end
614
663