mspire 0.1.7 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. data/Rakefile +41 -14
  2. data/bin/bioworks2excel.rb +1 -1
  3. data/bin/bioworks_to_pepxml.rb +46 -59
  4. data/bin/fasta_shaker.rb +1 -1
  5. data/bin/filter.rb +6 -0
  6. data/bin/find_aa_freq.rb +23 -0
  7. data/bin/id_precision.rb +3 -2
  8. data/bin/mzxml_to_lmat.rb +2 -1
  9. data/bin/pepproph_filter.rb +1 -1
  10. data/bin/precision.rb +1 -1
  11. data/bin/protein_summary.rb +2 -451
  12. data/bin/raw_to_mzXML.rb +55 -0
  13. data/bin/srf_group.rb +26 -0
  14. data/changelog.txt +7 -0
  15. data/lib/align.rb +3 -3
  16. data/lib/fasta.rb +6 -1
  17. data/lib/gi.rb +9 -4
  18. data/lib/roc.rb +2 -0
  19. data/lib/sample_enzyme.rb +2 -1
  20. data/lib/spec/mzxml/parser.rb +2 -43
  21. data/lib/spec/mzxml.rb +65 -2
  22. data/lib/spec_id/aa_freqs.rb +10 -7
  23. data/lib/spec_id/bioworks.rb +67 -87
  24. data/lib/spec_id/filter.rb +794 -0
  25. data/lib/spec_id/precision.rb +29 -36
  26. data/lib/spec_id/proph.rb +5 -3
  27. data/lib/spec_id/protein_summary.rb +459 -0
  28. data/lib/spec_id/sequest.rb +323 -271
  29. data/lib/spec_id/srf.rb +189 -135
  30. data/lib/spec_id.rb +276 -227
  31. data/lib/spec_id_xml.rb +101 -0
  32. data/lib/toppred.rb +18 -0
  33. data/script/degenerate_peptides.rb +47 -0
  34. data/script/filter-peps.rb +5 -1
  35. data/test/tc_align.rb +1 -1
  36. data/test/tc_bioworks.rb +25 -22
  37. data/test/tc_bioworks_to_pepxml.rb +37 -4
  38. data/test/tc_fasta.rb +3 -1
  39. data/test/tc_fasta_shaker.rb +8 -6
  40. data/test/tc_filter.rb +203 -0
  41. data/test/tc_gi.rb +6 -9
  42. data/test/tc_id_precision.rb +31 -0
  43. data/test/tc_mzxml.rb +8 -6
  44. data/test/tc_peptide_parent_times.rb +2 -1
  45. data/test/tc_precision.rb +1 -1
  46. data/test/tc_proph.rb +5 -5
  47. data/test/tc_protein_summary.rb +36 -13
  48. data/test/tc_sequest.rb +78 -33
  49. data/test/tc_spec_id.rb +128 -6
  50. data/test/tc_srf.rb +84 -38
  51. metadata +67 -62
  52. data/bin/fasta_cat.rb +0 -39
  53. data/bin/fasta_cat_mod.rb +0 -59
  54. data/bin/fasta_mod.rb +0 -57
  55. data/bin/filter_spec_id.rb +0 -365
  56. data/bin/raw2mzXML.rb +0 -21
  57. data/script/gen_database_searching.rb +0 -258
data/lib/spec_id.rb CHANGED
@@ -1,14 +1,15 @@
1
1
  require 'ostruct'
2
-
3
- class SampleEnzyme ; end
4
-
2
+ require 'set'
3
+ require 'hash_by'
4
+ require 'spec_id/precision'
5
5
  require 'roc'
6
6
  require 'sample_enzyme' # for others
7
7
  require 'spec_id/bioworks'
8
8
  require 'spec_id/sequest'
9
9
  require 'spec_id/proph'
10
- require 'spec_id/precision'
10
+ require 'spec_id_xml'
11
11
 
12
+ class SampleEnzyme ; end
12
13
 
13
14
  class Mass
14
15
  # http://expasy.org/tools/findmod/findmod_masses.html
@@ -70,123 +71,132 @@ class Mass
70
71
  }
71
72
  end
72
73
 
73
- class SpecID
74
+ module SpecID ; end
75
+
76
+ class GenericSpecID ; include SpecID ; end
77
+
78
+ module SpecID
74
79
  MONO = Mass::MONO
75
80
  AVG = Mass::AVG
76
81
 
77
- attr_accessor :obj
78
- attr_writer :peps, :prots
82
+ attr_accessor :peps, :prots
79
83
  # True if a high protein/peptide score is better than low, false otherwise
80
84
  # This is set automatically for known file types
81
85
  attr_accessor :hi_prob_best
82
86
 
87
+ # A relative pathname of the file the specid object is derived from
88
+ attr_accessor :filename
89
+
83
90
  # tp = file_type
84
- def initialize(file=nil, tp=nil)
85
- @obj = nil
86
- @peps = nil
87
- @prots = nil
88
- @hi_prob_best = nil
89
- if file
91
+ # Will return a SpecID object (really, the object corresponding to the
92
+ # file type which mixes in SpecID [is_a?(SpecID) == true])
93
+ # If no file is given, will return a GenericSpecID object.
94
+ def self.new(file=nil, tp=nil)
95
+ if file
90
96
  from_file(file, tp)
97
+ else
98
+ GenericSpecID.new
91
99
  end
92
100
  end
93
101
 
94
102
  # tp = file_type
95
- def from_file(file, tp=nil)
103
+ # only takes an array if they are srf files!
104
+ def self.from_file(file, tp=nil)
105
+ obj = nil
96
106
  unless tp
97
- tp = self.class.file_type(file)
107
+ tp = file_type(file)
98
108
  end
99
- case tp
109
+ obj = case tp
110
+ when 'srg'
111
+ @hi_prob_best = false
112
+ SRFGroup.new(file)
100
113
  when 'bioworks'
101
- @obj = SpecID::Bioworks.new(file)
102
114
  @hi_prob_best = false
115
+ Bioworks.new(file)
103
116
  when 'protproph'
104
- @obj = SpecID::Proph::ProtSummary.new(file)
105
117
  @hi_prob_best = true
118
+ Proph::ProtSummary.new(file)
106
119
  else
107
120
  abort "UNRECOGNIZED file type for #{file}"
108
121
  end
122
+ obj
109
123
  end
110
124
 
111
125
  def inspect
112
126
  "<#{self.class} #peps=\"#{peps.size}\">"
113
127
  end
114
128
 
115
- # returns the top peptide hits per file dta (first_scan + charge)
116
- # all hits with same score as top score are returned
117
- # assumes that all fields are strings...
118
- # converts xcorr, deltacn, deltamass, mass, and charge into numerical types
119
- # deletes the protein array (but not relevant proteins)
120
- # hashes on [pep.basename, pep.first_scan.to_i, pep.charge.to_i]
121
- def top_peps_prefilter!
122
- peps.each do |pep|
123
- pep.xcorr = pep.xcorr.to_f
124
- pep.deltacn = pep.deltacn.to_f
125
- pep.deltamass = pep.deltamass.to_f
126
- pep.mass = pep.mass.to_f
127
- pep.charge = pep.charge.to_f
129
+ # takes a comma separated list or array and extends the last to create an
130
+ # array of desired size
131
+ def self.extend_args(arg, desired_size)
132
+ arg_arr = arg
133
+ if arg.is_a? String
134
+ arg_arr = arg.split(',')
128
135
  end
129
- # get the top peptide by firstscan/charge (equivalent to .out files)
130
- top_peps = []
131
- #self.peps.hash_by {|pep| [pep.base_name, pep.first_scan.to_i, pep.charge.to_i]}.values.map do |v|
132
- self.peps.hash_by {|pep| [SpecID::Sequest::PepXML::SearchHit.split_sequence(pep.sequence)[1], pep.charge.to_i]}.values.map do |v|
133
- best_to_worst = v.sort_by {|pep| pep.xcorr}.reverse
134
- top_score = best_to_worst.first.xcorr
135
- best_to_worst.each do |pep|
136
- if pep.xcorr == top_score
137
- top_peps << pep
138
- else ; break
139
- end
136
+ new_arr = []
137
+ last_arg = arg_arr[0]
138
+ desired_size.times do |i|
139
+ if arg_arr[i]
140
+ new_arr[i] = arg_arr[i]
141
+ last_arg = new_arr[i]
142
+ else
143
+ new_arr[i] = last_arg
140
144
  end
141
145
  end
142
- @peps = top_peps
143
- end
144
-
145
-
146
- # when kind == :common ; xcorr1, xcorr2, xcorr3, deltacn, rough_ppm
147
- # interface very unstable. For now, keeping it very loose...
148
- # assumed that peptide xcorr, deltacn, deltamass, mass are Floats
149
- # assumed that peptide charge is Integer
150
- # returns prots
151
- # must respond to 'peps'
152
- def filter(kind, *args)
153
- case kind
154
- when :common
155
- (x1, x2, x3, deltacn, rough_ppm) = args
156
- # returns num proteins
157
- peps_passed = self.peps.select do |pep|
158
- # have to add the upper limit to deltacn because the lowest score is often
159
- # assigned a 1.10 in bioworks!
160
- pep_deltacn = pep.deltacn
161
- pep_charge = pep.charge
162
- (pep_deltacn >= deltacn && pep_deltacn <= 1.0) and
163
- #truth = (pep_deltacn >= deltacn) and
164
- (
165
- (pep_charge == 1 && pep.xcorr >= x1) or
166
- (pep_charge == 2 && pep.xcorr >= x2) or
167
- (pep_charge == 3 && pep.xcorr >= x3)
168
- ) and
169
- ((1.0e6 * (pep.deltamass.abs/pep.mass)) <= rough_ppm)
170
- end
171
-
172
- #deltacnstar_cnt = peps_passed.select{|v| v.deltacn > 1.0}.size
173
-
174
- hash = peps_passed.hash_by(:prot)
175
-
176
- prots_passed = hash.map do |prot,pep_arr|
177
- prot.peps = pep_arr
178
- prot
146
+ new_arr
147
+ end
148
+
149
+ # takes an array of proteins, each having peps
150
+ # peptide grouping is done
151
+ # by-
152
+ # the protein with the most unique peptides ends up taking any
153
+ # degenerate peptides, tie goes to one with most hits total, then the one
154
+ # that had the top xcorr(s) (before removing any peptides). All other
155
+ # proteins with identical peptides will lose those peptides. So, the rich
156
+ # stay rich, and the poor get poorer.
157
+ # returns an array of triplets where each is [prot, pep_hits,
158
+ # uniq_aaseqs] (uniq_aaseqs is an array) where the protein contains >= 1
159
+ # peptide. The internal links (prot.peps and pep.prots) are NOT modified!!
160
+ # update_prots == true will set each protein with the peptides found
161
+ def self.occams_razor(array_of_prots, update_prots=false)
162
+ peps_found = Set.new
163
+
164
+ to_sort = array_of_prots.map do |prot|
165
+ pps = prot.peps
166
+
167
+ peps_by_uniq_aaseq = pps.hash_by(:aaseq)
168
+ uniq_aaseqs = Set.new( pps.map {|pep| pep.aaseq } )
169
+ xcorrs = pps.map {|pep| pep.xcorr }
170
+
171
+ silly = OpenStruct.new
172
+ # 0 1 2 3 4 5
173
+ [uniq_aaseqs.size, pps.size, xcorrs.sort, prot, uniq_aaseqs, peps_by_uniq_aaseq]
174
+ end
175
+ prot_triplets = []
176
+ to_sort.sort.reverse.each do |ar|
177
+ prot = ar[3]
178
+ ## overlapping set:
179
+ common = peps_found & ar[4]
180
+ ## find the uniq ones in our little set of peptides:
181
+ uniq = ar[4] - common
182
+ pep_hits = []
183
+ if uniq.size != 0
184
+ ## add to the found list:
185
+ peps_found.merge(uniq)
186
+ uniq.each do |seq|
187
+ pep_hits.push( *(ar[5][seq]) )
188
+ end
189
+ prot_triplets << [prot, pep_hits, uniq.to_a]
190
+ prot.peps = pep_hits if update_prots
179
191
  end
180
- [prots_passed, peps_passed]
181
- #[prots_passed, peps_passed, deltacnstar_cnt]
182
- else
183
- abort "#{kind} not implemented"
184
192
  end
193
+ prot_triplets
185
194
  end
186
195
 
196
+
187
197
  ## basically, this is the command line wrapper
188
198
  def self.precision(argv)
189
- SpecID::Precision.new.run_cmd_line(argv)
199
+ Prec.new.run_cmd_line(argv)
190
200
  end
191
201
 
192
202
 
@@ -197,27 +207,64 @@ class SpecID
197
207
  def by_tps(classification_method, tp, fp)
198
208
  ROC.new.by_tps(classification_method, tp, fp)
199
209
  end
210
+
211
# From the unique set of peptide hits, creates a separate peptide hit for
# each protein reference, where that peptide only references that protein
# (e.g. pep.prots = [(a single protein)]).
#
# Returns a new array holding one shallow copy of a hit per (hit, protein)
# pair. The hits in peps and their prots arrays are left unmodified.
def pep_prots
  pps = []
  peps.each do |pep|
    pep.prots.each do |prt|
      # work on a copy so that every entry keeps its own single protein
      # and the original hit still lists all of its proteins
      single = pep.dup
      single.prots = [prt]
      pps << single
    end
  end
  pps
end
200
225
 
201
226
  # returns [tp, fp] based on the protein prefix for items where items =
202
227
  # (:prot|:peps)
228
+ # this may result in a duplication of some peptides if they match both
229
+ # normal and decoy proteins. In this case, the protein arrays are split,
230
+ # too, so that each points only to its breed of protein.
203
231
  def classify_by_prefix(items, prefix, fp_on_match=true)
204
232
  regex = /^#{Regexp.escape(prefix)}/
205
- myproc = case items
233
+ case items
206
234
  when :prots
207
- proc { |prt|
235
+ myproc = proc { |prt|
208
236
  if prt.reference =~ regex ; !fp_on_match
209
237
  else ; fp_on_match end
210
238
  }
239
+ return classify(items, myproc)
211
240
  when :peps
212
- proc { |pep|
213
- if pep.prot.reference =~ regex ; !fp_on_match
214
- else ; fp_on_match end
215
- }
241
+ match = [] ; nomatch = []
242
+ peps.each do |pep|
243
+ match_prots = [] ; nomatch_prots = []
244
+ (hit, nohit) = pep.prots.partition do |prot|
245
+ prot.reference =~ regex
246
+ end
247
+ if hit.size == 0
248
+ nomatch << pep
249
+ elsif nohit.size == 0
250
+ match << pep
251
+ else ## both have hits
252
+ pep.prots = match_prots
253
+ nomatch_pep = pep.dup
254
+ nomatch_pep.prots = nomatch_prots
255
+ match << pep
256
+ nomatch << pep
257
+ end
258
+ end
259
+ if fp_on_match
260
+ return [nomatch , match]
261
+ else
262
+ return [match, nomatch]
263
+ end
216
264
  else
217
- abort "no go"
265
+ abort "don't recognize "
218
266
  end
219
- classify(items, myproc)
220
- end
267
+ end
221
268
 
222
269
  ###### THIS GUY IS BAD (and unnecessary) AND SHOULD PROBABLY BE DELETED...
223
270
  # # Returns tp, fp where each is an array of proteins where fp is determined
@@ -244,18 +291,6 @@ class SpecID
244
291
  [t,f]
245
292
  end
246
293
 
247
- def peps
248
- if @peps ; @peps
249
- else @obj.peps
250
- end
251
- end
252
-
253
- def prots
254
- if @prots ; @prots
255
- else @obj.prots
256
- end
257
- end
258
-
259
294
  # returns two arrays, true positives and false positives (determined by proc
260
295
  # classify_item_by) sorted by proc rank_item_by. Items will be ranked from
261
296
  # lowest to highest based on the return value of rank_item_by. items is a
@@ -276,7 +311,7 @@ class SpecID
276
311
  # returns a proc for getting all probabilities so that an ascending sort
277
312
  # will put the best scores first
278
313
  def probability_proc
279
- if @hi_prob_best
314
+ if hi_prob_best
280
315
  get_prob_proc = proc {|prt| prt.probability * -1 }
281
316
  else
282
317
  get_prob_proc = proc {|prt| prt.probability }
@@ -328,17 +363,13 @@ class SpecID
328
363
  if prt.reference =~ regex ; false
329
364
  else ; true end
330
365
  }
366
+
331
367
  real_hits, decoy_hits = rank_and_classify(:prots, prob_proc, myproc)
332
368
 
333
369
  (num_hits, num_tps, precision) = DecoyROC.new.pred_and_tps_and_ppv(real_hits, decoy_hits)
334
370
  [num_hits, precision]
335
371
  end
336
372
 
337
- def method_missing(symbol, *args)
338
- @obj.send(symbol, *args)
339
- end
340
-
341
-
342
373
  # # takes the existing spec_id object and marshals it into "file.msh"
343
374
  # # a new file will always look for a file.msh to load
344
375
  # def marshal(force=false)
@@ -348,7 +379,14 @@ class SpecID
348
379
  # end
349
380
 
350
381
  # Returns 'bioworks' if bioworks xml, 'protproph' if Protein prophet
382
+ # 'srf' if SRF file, 'srg' if search results group file.
351
383
  def self.file_type(file)
384
+ if file =~ /\.srg$/
385
+ return 'srg'
386
+ end
387
+ if IO.read(file, 7,438) == 'Enzyme:'
388
+ return 'srf'
389
+ end
352
390
  File.open(file) do |fh|
353
391
  lines = ""
354
392
  4.times { lines << fh.readline }
@@ -397,7 +435,7 @@ class SpecID
397
435
  #peptides.each do |pep| print pep.class.to_s + " " end
398
436
  #puts peptides.first.is_a? Array
399
437
  #abort "DFHDFD"
400
- peptides.collect{|pep| pep.peptide_probability }.sort
438
+ peptides.collect{|pep| pep.probability }.sort
401
439
  end
402
440
 
403
441
  # returns a sorted lists of probabilities based on all pepprots (a peptide
@@ -477,138 +515,149 @@ class SpecID
477
515
  end
478
516
  sorted_probabilities(min_peptides)
479
517
  end
480
-
481
-
482
- # A Generic spectraID protein
483
- class Prot
484
- # probability is always a float!
485
- attr_accessor :probability, :reference
486
- end
487
-
488
- class Pep
489
- attr_accessor :probability
490
- # full sequence: (<firstAA>.<sequence>.<last>) with '-' for no first
491
- # or last.
492
- attr_accessor :sequence
493
- attr_accessor :charge
494
-
495
- # units can be :mmu, :amu, :ppm
496
- def mass_accuracy(pep, unit=:ppm, mono=true)
497
- # 10^6 * deltam accuracy/ m[measured]
498
- # i.e., theoretical mass 1000, measured 999.9: 100ppm
499
- # http://www.waters.com/WatersDivision/ContentD.asp?watersit=EGOO-66LRQD
500
- # pep.mass is the theoretical M+H of the peptide
501
- # this assumes that the deltacn value we're being told is correct, but I
502
- # have my suspicions (since the <mass> value is not accurate...)
503
-
504
- ######## TO COMPLETE (and add to spec_id..?)
505
- case unit
506
- when :ppm
507
- when :amu
508
- when :mmu
509
- end
510
- end
511
- end
512
-
513
518
  end
514
519
 
515
- # I would prefer to call this SpecID::XML, but I keep getting an error:
516
- # /home/john/Proteomics/msprot/lib/spec_id/bioworks.rb:412: warning: toplevel
517
- # constant XML referenced by SpecID::XML' This works around that for now.
518
- # Any major xml elements should return a newline at the end for simple
519
- # concatenation into a file
520
- module SpecIDXML
521
-
522
- Special_chrs_hash = {
523
- '"' => '&quot;',
524
- '&' => '&amp;',
525
- "'" => '&apos;',
526
- '<' => '&lt;',
527
- '>' => '&gt;',
528
- }
529
-
530
- # substitutes special xml chars
531
- def escape_special_chars(string)
532
- string.split('').map do |char|
533
- if Special_chrs_hash.key? char ; Special_chrs_hash[char]
534
- # if x = Special_chrs_hash[char] ; x # <-- that's slightly slower
535
- else ; char end
536
- end.join
537
- end
538
-
539
- $DEPTH = 0
540
-
541
- def tabs
542
- # this is ugly
543
- string = ""
544
- $DEPTH.times { string << "\t" }
545
- string
546
- end
547
520
 
521
+ # A Generic spectraID protein
522
+ module SpecID::Prot
523
+ # probability is always a float!
524
+ attr_accessor :probability, :reference, :peps
548
525
 
549
- def param_xml(symbol)
550
- tabs + '<parameter name="' + "#{symbol}" + '" value="' + "#{send(symbol)}" + '"/>'
526
+ def <=> (other)
527
+ self.reference <=> other.reference
551
528
  end
552
529
 
553
- def params_xml(*symbol_list)
554
- symbol_list.collect { |sy|
555
- param_xml(sy)
556
- }.join("\n") + "\n"
557
- end
530
+ end
558
531
 
559
- def short_element_xml(element, att_list)
560
- "#{tabs}<#{element} #{attrs_xml(att_list)}/>\n"
532
+ module SpecID::Pep
533
+
534
+ Non_standard_amino_acid_char_re = /[^A-Z\.\-]/
535
+
536
+ attr_accessor :prots
537
+ attr_accessor :probability
538
+ # full sequence: (<firstAA>.<sequence>.<last>) with '-' for no first
539
+ # or last.
540
+ attr_accessor :sequence
541
+
542
+ # the basic amino acid sequence (no leading or trailing '.' or amino acids)
543
+ # should not contain any special symbols, etc.
544
+ attr_accessor :aaseq
545
+ attr_accessor :charge
546
+
547
+ # removes nonstandard chars with Non_standard_amino_acid_char_re
548
+ # preserves A-Z and '.' and '-'
549
+ def self.remove_non_amino_acids(sequence)
550
+ sequence.gsub(Non_standard_amino_acid_char_re, '')
551
+ end
552
+
553
+ # remove_non_amino_acids && split_sequence
554
+ def self.prepare_sequence(val)
555
+ nv = remove_non_amino_acids(val)
556
+ split_sequence(nv)
557
+ end
558
+
559
+ def <=>(other)
560
+ aaseq <=> other.aaseq
561
+ end
562
+
563
+ # Returns prev, peptide, next from sequence. Parse errors return
564
+ # nil,nil,nil
565
+ # R.PEPTIDE.A # -> R, PEPTIDE, A
566
+ # R.PEPTIDE.- # -> R, PEPTIDE, -
567
+ # PEPTIDE.A # -> -, PEPTIDE, A
568
+ # A.PEPTIDE # -> A, PEPTIDE, -
569
+ # PEPTIDE # -> nil,nil,nil
570
+ def self.split_sequence(val)
571
+ peptide_prev_aa = ""; peptide = ""; peptide_next_aa = ""
572
+ pieces = val.split('.')
573
+ case pieces.size
574
+ when 3
575
+ peptide_prev_aa, peptide, peptide_next_aa = *pieces
576
+ when 2
577
+ if pieces[0].size > 1 ## N termini
578
+ peptide_prev_aa, peptide, peptide_next_aa = '-', pieces[0], pieces[1]
579
+ else ## C termini
580
+ peptide_prev_aa, peptide, peptide_next_aa = pieces[0], pieces[1], '-'
581
+ end
582
+ when 1 ## this must be a parse error!
583
+ peptide_prev_aa, peptide, peptide_next_aa = nil,nil,nil
584
+ when 0
585
+ peptide_prev_aa, peptide, peptide_next_aa = nil,nil,nil
586
+ end
587
+ return peptide_prev_aa, peptide, peptide_next_aa
588
+ end
589
+
590
+ ##
591
+ def self.sequence_to_aaseq(sequence)
592
+ after_removed = remove_non_amino_acids(sequence)
593
+ pieces = after_removed.split('.')
594
+ case pieces.size
595
+ when 3
596
+ pieces[1]
597
+ when 2
598
+ if pieces[0].size > 1 ## N termini
599
+ pieces[0]
600
+ else ## C termini
601
+ pieces[1]
602
+ end
603
+ when 1 ## this must be a parse error!
604
+ pieces[0] ## which is the peptide itself
605
+ else
606
+ abort "bad peptide sequence: #{sequence}"
607
+ end
561
608
  end
562
609
 
563
- def short_element_xml_and_att_string(element, att_string)
564
- "#{tabs}<#{element} #{att_string}/>\n"
565
- end
610
+ # This will rapidly determine the list of proteins for which given
611
+ # peptides belong. It is meant to be low level and fast (eventually),
612
+ # so it asks for the data in a format amenable to this.
613
+ # returns a mirror array where each entry is an array of Fasta::Prot
614
+ # objects where each protein contains the sequence
615
+ def self.protein_groups_by_sequence(peptide_strings_list, fasta_obj)
616
+ prots = fasta_obj.prots
617
+ prot_seqs = prots.map do |prot|
618
+ prot.aaseq
619
+ end
566
620
 
567
- # requires that obj have attribute '@xml_element_name'
568
- # displays all *instance_variables* (does not call methods!)
569
- def short_element_xml_from_instance_vars(element_name)
570
- string = instance_variables.map{|v| "#{v[1..-1]}=\"#{instance_variable_get(v)}\"" }.join(' ')
571
- "#{tabs}<#{element_name} #{string}/>\n"
572
- end
621
+ groups = peptide_strings_list.map do |pep_seq|
622
+ prot_index = 0
623
+ protein_group = []
624
+ prot_seqs.each do |prot_seq|
625
+ if prot_seq.include? pep_seq
626
+ protein_group << prots[prot_index]
627
+ end
628
+ prot_index += 1
629
+ end
630
+ protein_group
631
+ end
573
632
 
574
- # takes an element as a symbol and returns the
575
- def element_xml_no_atts(element)
576
- start = "#{tabs}<#{element}>\n"
577
- $DEPTH += 1
578
- if block_given? ; middle = yield else ; middle = '' end
579
- $DEPTH -= 1
580
- start + middle + "#{tabs}</#{element}>\n"
633
+ groups
581
634
  end
582
635
 
583
- # takes an element as a symbol and returns the
584
- def element_xml(element, att_list)
636
+ # units can be :mmu, :amu, :ppm
637
+ def mass_accuracy(pep, unit=:ppm, mono=true)
638
+ # 10^6 * deltam accuracy/ m[measured]
639
+ # i.e., theoretical mass 1000, measured 999.9: 100ppm
640
+ # http://www.waters.com/WatersDivision/ContentD.asp?watersit=EGOO-66LRQD
641
+ # pep.mass is the theoretical M+H of the peptide
642
+ # this assumes that the deltacn value we're being told is correct, but I
643
+ # have my suspicions (since the <mass> value is not accurate...)
585
644
 
586
- start = "#{tabs}<#{element} #{attrs_xml(att_list)}>\n"
587
- $DEPTH += 1
588
- if block_given? ; middle = yield else ; middle = '' end
589
- $DEPTH -= 1
590
- start + middle + "#{tabs}</#{element}>\n"
645
+ ######## TO COMPLETE (and add to spec_id..?)
646
+ case unit
647
+ when :ppm
648
+ when :amu
649
+ when :mmu
650
+ end
591
651
  end
652
+ end
592
653
 
593
- # element as symbol and att_string as attributes
594
- # takes a block of whatever
595
- def element_xml_and_att_string(element, att_string)
596
- start = "#{tabs}<#{element} #{att_string}>\n"
597
- $DEPTH += 1
598
- if block_given? ; middle = yield else ; middle = '' end
599
- $DEPTH -= 1
600
- start + middle + "#{tabs}</#{element}>\n"
601
- end
654
+ class SpecID::GenericProt
655
+ include SpecID::Prot
656
+ end
602
657
 
603
- def attr_xml(symbol)
604
- "#{symbol}=\"#{send(symbol)}\""
605
- end
658
+ class SpecID::GenericPep
659
+ include SpecID::Pep
660
+ end
606
661
 
607
- def attrs_xml(list_of_symbols)
608
- list_of_symbols.collect {|sy|
609
- attr_xml(sy)
610
- }.join(" ")
611
- end
612
662
 
613
- end
614
663