mspire 0.2.4 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (233) hide show
  1. data/INSTALL +1 -0
  2. data/README +25 -0
  3. data/Rakefile +129 -40
  4. data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
  5. data/bin/bioworks_to_pepxml.rb +1 -0
  6. data/bin/fasta_shaker.rb +1 -96
  7. data/bin/filter_and_validate.rb +5 -0
  8. data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
  9. data/bin/prob_validate.rb +6 -0
  10. data/bin/raw_to_mzXML.rb +2 -2
  11. data/bin/srf_group.rb +1 -0
  12. data/bin/srf_to_sqt.rb +40 -0
  13. data/changelog.txt +68 -0
  14. data/lib/align/chams.rb +6 -6
  15. data/lib/align.rb +4 -3
  16. data/lib/bsearch.rb +120 -0
  17. data/lib/fasta.rb +318 -86
  18. data/lib/group_by.rb +10 -0
  19. data/lib/index_by.rb +11 -0
  20. data/lib/merge_deep.rb +21 -0
  21. data/lib/{spec → ms/converter}/mzxml.rb +77 -109
  22. data/lib/ms/gradient_program.rb +171 -0
  23. data/lib/ms/msrun.rb +209 -0
  24. data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
  25. data/lib/ms/parser/mzdata/axml.rb +12 -0
  26. data/lib/ms/parser/mzdata/dom.rb +160 -0
  27. data/lib/ms/parser/mzdata/libxml.rb +7 -0
  28. data/lib/ms/parser/mzdata.rb +25 -0
  29. data/lib/ms/parser/mzxml/axml.rb +11 -0
  30. data/lib/ms/parser/mzxml/dom.rb +159 -0
  31. data/lib/ms/parser/mzxml/hpricot.rb +253 -0
  32. data/lib/ms/parser/mzxml/libxml.rb +15 -0
  33. data/lib/ms/parser/mzxml/regexp.rb +122 -0
  34. data/lib/ms/parser/mzxml/rexml.rb +72 -0
  35. data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
  36. data/lib/ms/parser/mzxml.rb +175 -0
  37. data/lib/ms/parser.rb +108 -0
  38. data/lib/ms/precursor.rb +10 -0
  39. data/lib/ms/scan.rb +81 -0
  40. data/lib/ms/spectrum.rb +193 -0
  41. data/lib/ms.rb +10 -0
  42. data/lib/mspire.rb +4 -0
  43. data/lib/roc.rb +61 -1
  44. data/lib/sample_enzyme.rb +31 -8
  45. data/lib/scan_i.rb +21 -0
  46. data/lib/spec_id/aa_freqs.rb +7 -3
  47. data/lib/spec_id/bioworks.rb +20 -14
  48. data/lib/spec_id/digestor.rb +139 -0
  49. data/lib/spec_id/mass.rb +116 -0
  50. data/lib/spec_id/parser/proph.rb +236 -0
  51. data/lib/spec_id/precision/filter/cmdline.rb +209 -0
  52. data/lib/spec_id/precision/filter/interactive.rb +134 -0
  53. data/lib/spec_id/precision/filter/output.rb +147 -0
  54. data/lib/spec_id/precision/filter.rb +623 -0
  55. data/lib/spec_id/precision/output.rb +60 -0
  56. data/lib/spec_id/precision/prob/cmdline.rb +139 -0
  57. data/lib/spec_id/precision/prob/output.rb +88 -0
  58. data/lib/spec_id/precision/prob.rb +171 -0
  59. data/lib/spec_id/proph/pep_summary.rb +92 -0
  60. data/lib/spec_id/proph/prot_summary.rb +484 -0
  61. data/lib/spec_id/proph.rb +2 -466
  62. data/lib/spec_id/protein_summary.rb +2 -2
  63. data/lib/spec_id/sequest/params.rb +316 -0
  64. data/lib/spec_id/sequest/pepxml.rb +1513 -0
  65. data/lib/spec_id/sequest.rb +2 -1672
  66. data/lib/spec_id/srf.rb +445 -177
  67. data/lib/spec_id.rb +183 -95
  68. data/lib/spec_id_xml.rb +8 -10
  69. data/lib/transmem/phobius.rb +147 -0
  70. data/lib/transmem/toppred.rb +368 -0
  71. data/lib/transmem.rb +157 -0
  72. data/lib/validator/aa.rb +135 -0
  73. data/lib/validator/background.rb +73 -0
  74. data/lib/validator/bias.rb +95 -0
  75. data/lib/validator/cmdline.rb +260 -0
  76. data/lib/validator/decoy.rb +94 -0
  77. data/lib/validator/digestion_based.rb +69 -0
  78. data/lib/validator/probability.rb +48 -0
  79. data/lib/validator/prot_from_pep.rb +234 -0
  80. data/lib/validator/transmem.rb +272 -0
  81. data/lib/validator/true_pos.rb +46 -0
  82. data/lib/validator.rb +214 -0
  83. data/lib/xml.rb +38 -0
  84. data/lib/xml_style_parser.rb +105 -0
  85. data/lib/xmlparser_wrapper.rb +19 -0
  86. data/script/compile_and_plot_smriti_final.rb +97 -0
  87. data/script/extract_gradient_programs.rb +56 -0
  88. data/script/get_apex_values_rexml.rb +44 -0
  89. data/script/mzXML2timeIndex.rb +1 -1
  90. data/script/smriti_final_analysis.rb +103 -0
  91. data/script/toppred_to_yaml.rb +47 -0
  92. data/script/tpp_installer.rb +1 -1
  93. data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
  94. data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
  95. data/specs/bin/fasta_shaker_spec.rb +259 -0
  96. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
  97. data/specs/bin/filter_and_validate_spec.rb +124 -0
  98. data/specs/bin/ms_to_lmat_spec.rb +34 -0
  99. data/specs/bin/prob_validate_spec.rb +62 -0
  100. data/specs/bin/protein_summary_spec.rb +10 -0
  101. data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
  102. data/specs/gi_spec.rb +22 -0
  103. data/specs/load_bin_path.rb +7 -0
  104. data/specs/merge_deep_spec.rb +13 -0
  105. data/specs/ms/gradient_program_spec.rb +77 -0
  106. data/specs/ms/msrun_spec.rb +455 -0
  107. data/specs/ms/parser_spec.rb +92 -0
  108. data/specs/ms/spectrum_spec.rb +89 -0
  109. data/specs/roc_spec.rb +251 -0
  110. data/specs/rspec_autotest.rb +149 -0
  111. data/specs/sample_enzyme_spec.rb +41 -0
  112. data/specs/spec_helper.rb +133 -0
  113. data/specs/spec_id/aa_freqs_spec.rb +52 -0
  114. data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
  115. data/specs/spec_id/digestor_spec.rb +75 -0
  116. data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
  117. data/specs/spec_id/precision/filter/output_spec.rb +31 -0
  118. data/specs/spec_id/precision/filter_spec.rb +243 -0
  119. data/specs/spec_id/precision/prob_spec.rb +111 -0
  120. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  121. data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
  122. data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
  123. data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
  124. data/specs/spec_id/sequest/params_spec.rb +68 -0
  125. data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
  126. data/specs/spec_id/sqt_spec.rb +138 -0
  127. data/specs/spec_id/srf_spec.rb +209 -0
  128. data/specs/spec_id/srf_spec_helper.rb +302 -0
  129. data/specs/spec_id_helper.rb +33 -0
  130. data/specs/spec_id_spec.rb +361 -0
  131. data/specs/spec_id_xml_spec.rb +33 -0
  132. data/specs/transmem/phobius_spec.rb +423 -0
  133. data/specs/transmem/toppred_spec.rb +297 -0
  134. data/specs/transmem_spec.rb +60 -0
  135. data/specs/transmem_spec_shared.rb +64 -0
  136. data/specs/validator/aa_spec.rb +107 -0
  137. data/specs/validator/background_spec.rb +51 -0
  138. data/specs/validator/bias_spec.rb +146 -0
  139. data/specs/validator/decoy_spec.rb +51 -0
  140. data/specs/validator/fasta_helper.rb +26 -0
  141. data/specs/validator/prot_from_pep_spec.rb +141 -0
  142. data/specs/validator/transmem_spec.rb +145 -0
  143. data/specs/validator/true_pos_spec.rb +58 -0
  144. data/specs/validator_helper.rb +33 -0
  145. data/specs/xml_spec.rb +12 -0
  146. data/test_files/000_pepxml18_small.xml +206 -0
  147. data/test_files/020a.mzXML.timeIndex +4710 -0
  148. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
  149. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
  150. data/test_files/4-03-03_small-prot.xml +321 -0
  151. data/test_files/4-03-03_small.xml +3876 -0
  152. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  153. data/test_files/bioworks-3.3_10prots.xml +5999 -0
  154. data/test_files/bioworks31.params +77 -0
  155. data/test_files/bioworks32.params +62 -0
  156. data/test_files/bioworks33.params +63 -0
  157. data/test_files/bioworks_single_run_small.xml +7237 -0
  158. data/test_files/bioworks_small.fasta +212 -0
  159. data/test_files/bioworks_small.params +63 -0
  160. data/test_files/bioworks_small.phobius +109 -0
  161. data/test_files/bioworks_small.toppred.out +2847 -0
  162. data/test_files/bioworks_small.xml +5610 -0
  163. data/test_files/bioworks_with_INV_small.xml +3753 -0
  164. data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
  165. data/test_files/corrupted_900.srf +0 -0
  166. data/test_files/head_of_7MIX.srf +0 -0
  167. data/test_files/interact-opd1_mods_small-prot.xml +304 -0
  168. data/test_files/messups.fasta +297 -0
  169. data/test_files/opd1/000.my_answer.100lines.xml +101 -0
  170. data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
  171. data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
  172. data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
  173. data/test_files/opd1/000_020-prot.png +0 -0
  174. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
  175. data/test_files/opd1/000_020_3prots-prot.xml +62 -0
  176. data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
  177. data/test_files/opd1/sequest.3.1.params +77 -0
  178. data/test_files/opd1/sequest.3.2.params +62 -0
  179. data/test_files/opd1/twenty_scans.mzXML +418 -0
  180. data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
  181. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  182. data/test_files/opd1/twenty_scans_answ.lmata +9 -0
  183. data/test_files/opd1_020_beginning.RAW +0 -0
  184. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
  185. data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
  186. data/test_files/pepproph_small.xml +4691 -0
  187. data/test_files/phobius.small.noheader.txt +50 -0
  188. data/test_files/phobius.small.small.txt +53 -0
  189. data/test_files/s01_anC1_ld020mM.key.txt +25 -0
  190. data/test_files/s01_anC1_ld020mM.meth +0 -0
  191. data/test_files/small.fasta +297 -0
  192. data/test_files/smallraw.RAW +0 -0
  193. data/test_files/tf_bioworks2excel.bioXML +14340 -0
  194. data/test_files/tf_bioworks2excel.txt.actual +1035 -0
  195. data/test_files/toppred.small.out +416 -0
  196. data/test_files/toppred.xml.out +318 -0
  197. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
  198. data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
  199. data/test_files/yeast_gly_small-prot.xml +265 -0
  200. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
  201. data/test_files/yeast_gly_small.xml +3807 -0
  202. data/test_files/yeast_gly_small2.parentTimes +6 -0
  203. metadata +273 -57
  204. data/bin/filter.rb +0 -6
  205. data/bin/precision.rb +0 -5
  206. data/lib/spec/mzdata/parser.rb +0 -108
  207. data/lib/spec/mzdata.rb +0 -48
  208. data/lib/spec/mzxml/parser.rb +0 -449
  209. data/lib/spec/scan.rb +0 -55
  210. data/lib/spec_id/filter.rb +0 -797
  211. data/lib/spec_id/precision.rb +0 -421
  212. data/lib/toppred.rb +0 -18
  213. data/script/filter-peps.rb +0 -164
  214. data/test/tc_aa_freqs.rb +0 -59
  215. data/test/tc_fasta_shaker.rb +0 -149
  216. data/test/tc_filter.rb +0 -203
  217. data/test/tc_filter_peps.rb +0 -46
  218. data/test/tc_gi.rb +0 -17
  219. data/test/tc_id_class_anal.rb +0 -70
  220. data/test/tc_id_precision.rb +0 -89
  221. data/test/tc_msrun.rb +0 -88
  222. data/test/tc_mzxml.rb +0 -88
  223. data/test/tc_mzxml_to_lmat.rb +0 -36
  224. data/test/tc_peptide_parent_times.rb +0 -27
  225. data/test/tc_precision.rb +0 -60
  226. data/test/tc_roc.rb +0 -166
  227. data/test/tc_sample_enzyme.rb +0 -32
  228. data/test/tc_scan.rb +0 -26
  229. data/test/tc_sequest.rb +0 -336
  230. data/test/tc_spec.rb +0 -78
  231. data/test/tc_spec_id.rb +0 -201
  232. data/test/tc_spec_id_xml.rb +0 -36
  233. data/test/tc_srf.rb +0 -262
@@ -5,10 +5,11 @@ require 'xmlparser'
5
5
  require 'spec_id'
6
6
  require 'zlib'
7
7
  require 'hash_by'
8
- require 'set_from_hash'
9
8
  require 'array_class'
9
+ require 'fasta'
10
10
 
11
11
  ## have to pre-declare some guys
12
+ module ProteinReferenceable; end
12
13
  module SpecID; end
13
14
  module SpecID::Prot; end
14
15
  module SpecID::Pep; end
@@ -274,7 +275,7 @@ class Bioworks::XMLParser < XMLParser
274
275
  def endElement(name)
275
276
  case name
276
277
  when "peptide"
277
- @current_obj.set_from_hash(@current_hash)
278
+ @current_obj.set_from_hash_given_text(@current_hash)
278
279
  when "protein"
279
280
  else
280
281
  @current_hash[name] = @current_data
@@ -293,6 +294,7 @@ module Bioworks::XML
293
294
  end
294
295
 
295
296
  class Bioworks::Prot
297
+ include ProteinReferenceable
296
298
  include SpecID::Prot
297
299
  include Bioworks::XML
298
300
 
@@ -357,20 +359,20 @@ class Bioworks::Prot
357
359
  hash.delete("bioworksinfo")
358
360
  hash["sf"] = hash.delete("Sf")
359
361
  hash["pi"] = hash.delete("pI")
360
- set_from_hash(hash)
362
+ set_from_xml_hash(hash)
361
363
  end
362
364
 
363
365
  # changes the sf to Sf and pI to pi
364
366
  def set_from_xml_hash(hash)
365
367
  @reference = hash["reference"]
366
- @protein_probability = hash["protein_probability"]
367
- @probability = @protein_probability.to_f
368
- @consensus_score = hash["consensus_score"]
369
- @sf = hash["Sf"]
370
- @unified_score = hash["unified_score"]
371
- @coverage = hash["coverage"]
372
- @pi = hash["pI"]
373
- @weight = hash["weight"]
368
+ @protein_probability = hash["protein_probability"].to_f
369
+ #@probability = @protein_probability.to_f
370
+ @consensus_score = hash["consensus_score"].to_f
371
+ @sf = hash["Sf"].to_f
372
+ @unified_score = hash["unified_score"].to_f
373
+ @coverage = hash["coverage"].to_f
374
+ @pi = hash["pI"].to_f
375
+ @weight = hash["weight"].to_f
374
376
  @accession = hash["accession"]
375
377
  end
376
378
  end
@@ -392,6 +394,8 @@ class Bioworks::Pep
392
394
  ## NOTE! the mass is really the theoretical MH+!!!!
393
395
  ## NOTE! ALL values stored as strings, except peptide_probability!
394
396
 
397
+ #ions is a string 'x/y'
398
+
395
399
  ## other accessors:
396
400
  def probability ; self[15] end
397
401
  def mh ; self[1] end
@@ -449,14 +453,16 @@ class Bioworks::Pep
449
453
  end
450
454
  $VERBOSE = tmp_verb
451
455
 
456
+ undef_method :inspect
452
457
  def inspect
453
458
  "<Bioworks::Pep sequence: #{sequence}, mass: #{mass}, deltamass: #{deltamass}, charge: #{charge}, xcorr: #{xcorr}, deltacn: #{deltacn}, prots(count):#{prots.size}, base_name: #{base_name}, first_scan: #{first_scan}, last_scan: #{last_scan}, file: #{file}, peptide_probability: #{peptide_probability}, aaseq:#{aaseq}>"
454
459
 
455
460
 
456
461
  end
457
462
 
458
- def set_from_hash(hash)
459
- self[0,11] = [hash["sequence"], hash["mass"], hash["deltamass"], hash["charge"], hash["xcorr"], hash["deltacn"], hash["sp"], hash["rsp"], hash["ions"], hash["count"], hash["tic"]]
463
+ # if cast == true, then all the data will be cast
464
+ def set_from_hash_given_text(hash)
465
+ self[0,11] = [hash["sequence"], hash["mass"].to_f, hash["deltamass"].to_f, hash["charge"].to_i, hash["xcorr"].to_f, hash["deltacn"].to_f, hash["sp"].to_f, hash["rsp"].to_i, hash["ions"], hash["count"].to_i, hash["tic"].to_i]
460
466
  self.file = hash["file"]
461
467
  self[15] = hash["peptide_probability"].to_f
462
468
  self[19] = SpecID::Pep.sequence_to_aaseq(self[0]) ## aaseq
@@ -470,7 +476,7 @@ class Bioworks::Pep
470
476
  hash[$1] = $2
471
477
  #puts "IN PEP: " + $1 + ": " + $2
472
478
  elsif line =~ @@end_pep_re
473
- set_from_hash(hash)
479
+ set_from_hash_given_text(hash)
474
480
  #puts "SELF[12]: #{self[12]}"
475
481
  #puts "SELF[12]: #{self[12]}"
476
482
  break
@@ -0,0 +1,139 @@
1
+
2
+ require 'spec_id/sequest/pepxml'
3
+ require 'spec_id/mass'
4
+
5
# Digests the proteins of a fasta object with a sample enzyme and keeps only
# the peptides whose (M+H)+ mass lies inside a configured window.
#
# A digestor must be able to respond to these methods:
class Digestor

  # min_mh_mass = min molecular mass of peptide (M+H)+
  attr_accessor :min_mh_mass
  # max_mh_mass = max molecular mass of peptide (M+H)+
  attr_accessor :max_mh_mass
  # the number of allowable missed cleavages
  attr_accessor :missed_cleavages
  # sample_enzyme = SampleEnzyme object
  attr_accessor :sample_enzyme
  # hash of masses to use (matching keys of Mass::AVG or Mass::MONO)
  # In addition, the following keys (as symbols) are recognized.
  #     add_C_term_protein
  #     add_C_term_peptide
  #     add_N_term_protein
  #     add_N_term_peptide
  attr_accessor :mass_hash

  # returns a list of peptide objects created from a digestion of the fasta
  # proteins using the sequest params (variable mods not supported yet)
  def self.digest(fasta_obj, params_obj)
    dig = new
    dig.set_from_params(params_obj)
    dig.create_peptide_hash(fasta_obj).values
  end

  def initialize
  end

  # takes a parameters object and fills in the necessary values
  # (sample_enzyme, missed_cleavages, min/max_mh_mass and mass_hash).
  #
  # Raises NotImplementedError if include_variable_mods is true and
  # ArgumentError if params_obj is not a Sequest::Params.
  def set_from_params(params_obj, include_variable_mods=false)
    raise NotImplementedError, "no variable mods yet" if include_variable_mods
    unless params_obj.is_a? Sequest::Params
      raise ArgumentError, "Don't recognize params object of type: #{params_obj.class}"
    end

    @sample_enzyme = params_obj.sample_enzyme
    @missed_cleavages = params_obj.max_num_internal_cleavage_sites.to_i
    # digest_mass_range is a space separated "min max" string
    (@min_mh_mass, @max_mh_mass) = params_obj.digest_mass_range.split(' ').map {|v| v.to_f }
    (static_mods, static_terminal_mods) = Sequest::PepXML::Modifications.new.create_static_mods(params_obj)
    # any value other than '0' or '1' leaves this nil (treated as false below)
    monoisotopic_parents =
      case params_obj.mass_type_parent
      when '0' ; false
      when '1' ; true
      end

    @mass_hash = Mass.add_static_masses(monoisotopic_parents, static_mods, static_terminal_mods)
  end

  # aka 'digestion'
  # will return a hash of SpecID::GenericPep objects (with 'aaseq' and
  # 'prots') hashed by aminoacid sequence.  The prot will be the fasta object.
  def create_peptide_hash(fasta_obj)
    pep_aaseqs_ar = fasta_obj.map do |prot|
      @sample_enzyme.digest(prot.aaseq, @missed_cleavages)
    end
    prot_aaseqs = fasta_obj.map {|prot| prot.aaseq }
    passing_pep_seqs_ar = limit_sizes(prot_aaseqs, pep_aaseqs_ar, @min_mh_mass, @max_mh_mass, @mass_hash)

    pep_to_prots_hash = {}
    fasta_obj.each_with_index do |prot, i|
      passing_pep_seqs_ar[i].each do |pep_seq|
        # one peptide object per distinct sequence, shared between proteins
        pep_obj = (pep_to_prots_hash[pep_seq] ||= new_generic_pep(pep_seq))
        pep_obj.prots << prot
      end
    end
    pep_to_prots_hash
  end

  # min max are both in terms of the M+H(+) and are inclusive
  #
  # h_plus:
  # On this website:
  # http://db.systemsbiology.net:8080/proteomicsToolkit/FragIonServlet.html
  # They use the mass of 'H' not 'H+' to find the (M+H)+ weight.
  # (pass h_plus=true to use mass_hash[:h_plus] instead of mass_hash[:h])
  #
  # The prot_aaseq is used if the mass_hash contains the keys
  # :add_C_term_protein or :add_N_term_protein (not implemented yet, so
  # prot_aaseqs is currently not read)
  #
  # prot_aaseqs is parallel to pep_aaseqs_ar where each is a group of
  # peptides matching a protein aaseq
  # returns another parallel array of passing peptides
  #
  # Raises NotImplementedError if mass_hash carries protein terminal mods and
  # ArgumentError if a peptide contains a residue with no entry in mass_hash.
  def limit_sizes(prot_aaseqs, pep_aaseqs_ar, min_mh, max_mh, mass_hash, h_plus=false)
    if mass_hash.key?(:add_C_term_protein) || mass_hash.key?(:add_N_term_protein)
      raise NotImplementedError, "need to add ability to change weights of peptides from the ends of proteins"
    end

    # figure out how much must be added to each peptide
    # include the h2o, the h, and N and C terminal static mods
    h_key = h_plus ? :h_plus : :h
    final_add = mass_hash[:h2o] + mass_hash[h_key]
    [:add_N_term_peptide, :add_C_term_peptide].each do |sym|
      final_add += mass_hash[sym] if mass_hash.key?(sym)
    end

    # peptides are strings, so look masses up by one-letter string
    mass_by_letter = {}
    mass_hash.each {|key, mass| mass_by_letter[key.to_s] = mass }

    pep_aaseqs_ar.map do |pep_aaseqs|
      pep_aaseqs.select do |aaseq|
        sum = 0.0
        aaseq.split('').each do |let|
          mass = mass_by_letter[let]
          if mass.nil?
            raise ArgumentError, "no mass found for residue #{let.inspect} in peptide #{aaseq.inspect}"
          end
          sum += mass
        end
        mh_plus = sum + final_add
        (mh_plus >= min_mh) && (mh_plus <= max_mh)
      end
    end
  end

  private

  # builds an empty peptide object (no prots yet) for the given sequence
  def new_generic_pep(pep_seq)
    pep = SpecID::GenericPep.new
    pep.prots = []
    pep.aaseq = pep_seq
    pep
  end

end
@@ -0,0 +1,116 @@
1
+
2
# Residue and small-molecule masses keyed by symbol, plus a helper that
# layers static modifications on top of them.
class Mass
  # http://expasy.org/tools/findmod/findmod_masses.html
  # still need to add the modifications
  MONO = {
    :A => 71.03711,
    :R => 156.10111,
    :N => 114.04293,
    :D => 115.02694,
    :C => 103.00919,
    :E => 129.04259,
    :Q => 128.05858,
    :G => 57.02146,
    :H => 137.05891,
    :I => 113.08406,
    :L => 113.08406,
    :K => 128.09496,
    :M => 131.04049,
    :F => 147.06841,
    :P => 97.05276,
    :S => 87.03203,
    :T => 101.04768,
    :W => 186.07931,
    :Y => 163.06333,
    :V => 99.06841,

    # uncommon
    :B => 114.534935,  # average of aspartic acid and asparagine: (D + N) / 2
    :U => 150.95364,  # (selenocysteine) http://www.matrix-science.com/help/aa_help.html
    :X => 118.805716,  # the average of the mono masses of the 20 amino acids
    :* => 118.805716,  # same as X

    # elements etc.
    :h => 1.00783,
    :h_plus => 1.00728,
    :o => 15.9949146,
    :h2o => 18.01056,
  }

  AVG = {
    :A => 71.0788,
    :R => 156.1875,
    :N => 114.1038,
    :D => 115.0886,
    :C => 103.1388,
    :E => 129.1155,
    :Q => 128.1307,
    :G => 57.0519,
    :H => 137.1411,
    :I => 113.1594,
    :L => 113.1594,
    :K => 128.1741,
    :M => 131.1926,
    :F => 147.1766,
    :P => 97.1167,
    :S => 87.0782,
    :T => 101.1051,
    :W => 186.2132,
    :Y => 163.1760,
    :V => 99.1326,

    # uncommon
    :B => 114.5962,  # average of aspartic acid and asparagine: (D + N) / 2
    :U => 150.03,  # (selenocysteine) http://www.matrix-science.com/help/aa_help.html
    :X => 118.88603,  # the average of the masses of the 20 amino acids
    :* => 118.88603,  # same as X

    # elements etc.
    :h => 1.00794,
    :h_plus => 1.00739,
    :o => 15.9994,
    :h2o => 18.01524,
  }

  # returns a fresh hash (MONO if monoisotopic is truthy, otherwise AVG) where
  # it has been added to each amino acid the amount specified in static_mods
  # (objects responding to aminoacid and massdiff, e.g. from a
  # PepXML::Modifications object).  The constants themselves are not modified.
  #
  # if static_terminal_mods is given (objects responding to protein_terminus,
  # terminus and massdiff) then the following keys will be created as symbols
  # as necessary:
  #     add_C_term_protein
  #     add_C_term_peptide
  #     add_N_term_protein
  #     add_N_term_peptide
  # static_terminal_mods may be omitted or nil (no terminal keys are added).
  def self.add_static_masses(monoisotopic, static_mods, static_terminal_mods=nil)
    copy_hash = (monoisotopic ? Mass::MONO : Mass::AVG).dup

    static_mods.each do |mod|
      copy_hash[mod.aminoacid.to_sym] += mod.massdiff
    end

    # the default (nil) means "no terminal mods"
    (static_terminal_mods || []).each do |mod|
      key =
        if (protein_terminus = mod.protein_terminus)
          # its a protein terminus modification
          case protein_terminus
          when 'n' then :add_N_term_protein
          when 'c' then :add_C_term_protein
          end
        else
          # its a peptide terminus modification
          case mod.terminus
          when 'n' then :add_N_term_peptide
          when 'c' then :add_C_term_peptide
          end
        end
      # termini other than 'n'/'c' are ignored
      copy_hash[key] = mod.massdiff if key
    end

    copy_hash
  end
end
116
+
@@ -0,0 +1,236 @@
1
+ require 'xml_style_parser'
2
+ require 'spec_id/sequest/pepxml'
3
+
4
+
5
+ module SpecID ; end
6
+ module SpecID::Parser ; end
7
+
8
+
9
# Parses a PeptideProphet pepXML file into a spec_id object
# (a Proph::PepSummary unless one is supplied through opts[:spec_id]).
class SpecID::Parser::PepProph
  include XMLStyleParser

  # parse_type: stored in @method
  # version: pepxml version as a string (spec_id refuses anything below 3.0)
  #
  # Uses the first of AXML or LibXML reported by
  # XMLStyleParser.available_xml_parsers; raises NotImplementedError if
  # neither is available.
  def initialize(parse_type=:spec_id, version='3.0')
    @method = parse_type
    @version = version
    implemented = %w(AXML LibXML)
    klass_s = XMLStyleParser.available_xml_parsers.select {|v| implemented.include?(v) }.first
    case klass_s
    when 'AXML'
      @get_root_node_from_file = Proc.new do |file|
        AXML.parse_file(file)
      end
    when 'LibXML' # LibXML is buggy on some machines...
      @get_root_node_from_file = Proc.new do |file|
        doc = XML::Document.file(file)
        doc.root
      end
    else
      raise NotImplementedError, "Can only parse with #{implemented.join(', ')} right now"
    end
  end

  # returns the spec_id object
  #
  # opts[:spec_id] may hold an existing object to fill in.
  # Raises NotImplementedError if the version is nil or below 3.0.
  def spec_id(file, opts={})
    # compare numerically: as strings, '10.0' would sort before '3.0'
    if @version.nil? || @version.to_s.to_f < 3.0
      raise NotImplementedError, "cannot do #{@version} yet"
    end
    spec_id_obj = opts[:spec_id] || Proph::PepSummary.new

    msms_pipeline_analysis_n = @get_root_node_from_file.call(file)
    spec_id_obj.peptideprophet_summary = msms_pipeline_analysis_n.find_first("descendant::peptideprophet_summary")

    msms_run_summary_n = msms_pipeline_analysis_n.find_first('child::msms_run_summary')
    spec_id_obj.from_pepxml_node(msms_run_summary_n)
  end

end
48
+
49
# Parses a ProteinProphet protein summary XML file into a spec_id object
# (a Proph::ProtSummary unless one is supplied through opts[:spec_id]).
class SpecID::Parser::ProtProph
  include XMLStyleParser
  # unique_stripped_peptides is a '+' separated list
  Split_unique_stripped_peptides_re = /\+/

  # parse_type: name of the method that #parse dispatches to
  # version: only '4' is accepted by spec_id
  #
  # Uses the first of AXML or LibXML reported by
  # XMLStyleParser.available_xml_parsers; raises NotImplementedError if
  # neither is available.
  def initialize(parse_type=:spec_id, version='4')
    @method = parse_type
    @version = version

    implemented = %w(AXML LibXML)
    klass_s = XMLStyleParser.available_xml_parsers.select {|v| implemented.include?(v) }.first
    case klass_s
    when 'AXML'
      @get_root_node_from_file = Proc.new do |file|
        AXML.parse_file(file)
      end
    when 'LibXML' # LibXML is buggy on some machines...
      @get_root_node_from_file = Proc.new do |file|
        doc = XML::Document.file(file)
        doc.root
      end
    else
      raise NotImplementedError, "Can only parse with #{implemented.join(', ')} right now"
    end
  end

  # returns the spec_id object with peps, prots and prot_groups set
  #
  # opts[:spec_id] may hold an existing object to fill in.
  # Raises NotImplementedError unless the version is '4'.
  def spec_id(file, opts={})
    raise NotImplementedError, "cannot do #{@version} yet" if @version != '4'
    spec_id_obj = opts[:spec_id] || Proph::ProtSummary.new
    protein_summary_n = @get_root_node_from_file.call(file)

    pep_hash = {}
    prot_hash = {}
    protein_groups = []

    # the first child is taken to be the protein_summary_header and skipped
    # (could grab some of this info if we wanted...)
    # get all the proteins from inside protein groups
    seen_header = false
    protein_summary_n.each do |child_n|
      if !seen_header
        seen_header = true
      elsif child_n.name == 'protein_group'
        protein_groups << get_proteins(child_n, pep_hash, prot_hash)
      end
    end

    # need to finalize hash stuff: swap the protein names collected while
    # parsing for the protein objects that are now all known
    pep_hash.each do |_key, pep|
      resolved = []
      pep.prots.each do |prot_or_string|
        prot =
          if prot_or_string.is_a?(Proph::Prot)
            prot_or_string
          else
            prot_hash[prot_or_string]
          end
        # a name with no protein object is an indistinguishable protein: drop it
        resolved << prot unless prot.nil?
      end
      pep.prots = resolved
    end

    spec_id_obj.peps = pep_hash.values
    spec_id_obj.prots = prot_hash.values
    spec_id_obj.prot_groups = protein_groups
    spec_id_obj
  end

  # takes a Y or N and gives true/false (nil for anything else)
  def booleanize(string)
    case string
    when 'Y'
      true
    when 'N'
      false
    else
      nil
    end
  end

  # assumes that all the child nodes of the group are proteins
  # pep_hash is hashed on [aaseq OR modified peptide amino acid sequence (if
  # modified), charge] and is filled in as peptides are met
  # prot_hash is hashed on protein_name and is filled in as proteins are met
  # (as far as I can tell, all protein entries are unique!)
  # returns a ProtGroup object
  def get_proteins(protein_group_node, pep_hash, prot_hash)
    protein_group_proteins = []

    protein_group_node.each do |protein_n|
      raise(Exception, "not expecting anything but protein's, got: #{protein_n.name}") if protein_n.name != 'protein'
      # probability peps protein_name n_indistinguishable_proteins percent_coverage unique_stripped_peptides group_sibling_id total_number_peptides pct_spectrum_ids description

      # INITIALIZE the protein and set key
      # (the description slot stays nil until an annotation node is met)
      protein_name = protein_n['protein_name']
      peps = []
      protein = Proph::Prot.new( [protein_name, protein_n['probability'].to_f,
                                protein_n['n_indistinguishable_proteins'].to_i,
                                protein_n['percent_coverage'].to_f,
                                protein_n['unique_stripped_peptides'].split(Split_unique_stripped_peptides_re),
                                protein_n['group_sibling_id'], protein_n['total_number_peptides'].to_i,
                                protein_n['pct_spectrum_ids'].to_f, nil,
                                peps ])
      protein_group_proteins << protein
      prot_hash[protein_name] = protein

      # traverse through the peptides (and annotation)
      protein_n.each do |protein_sub_n|
        case protein_sub_n.name
        when 'annotation'
          protein.description = protein_sub_n['protein_description']
        when 'peptide'
          peps << get_peptide(protein_sub_n, protein, pep_hash)
        end
      end
    end

    Proph::ProtGroup.new(:prots => protein_group_proteins, :group_number => protein_group_node['group_number'].to_i, :probability => protein_group_node['probability'].to_f)
  end

  # dispatches to the method named at construction (parse_type)
  def parse(file, opts={})
    send(@method, file, opts)
  end

  private

  # returns the peptide object for a peptide node: the one already in
  # pep_hash under [sequence (modified if any), charge] or else a new one
  # (which is also stored in pep_hash)
  def get_peptide(peptide_n, protein, pep_hash)
    # peptide_sequence charge initial_probability nsp_adjusted_probability weight is_nondegenerate_evidence n_enzymatic_termini n_sibling_peptides n_sibling_peptides_bin n_instances is_contributing_evidence calc_neutral_pep_mass modification_info prots
    peptide_sequence = peptide_n['peptide_sequence']
    charge = peptide_n['charge'].to_i

    # GET list of all proteins and modifications
    proteins = [protein]
    mod_info = nil
    peptide_hash_string = peptide_sequence
    if peptide_n.child?
      peptide_n.each do |pep_sub_n|
        case pep_sub_n.name
        when 'peptide_parent_protein'
          # NOTE! the proteins list will have strings until the assoc.
          # prot is found!
          proteins << pep_sub_n['protein_name']
        when 'modification_info'
          masses = pep_sub_n.map do |mod_aa_mass_n|
            Sequest::PepXML::SearchHit::ModificationInfo::ModAminoacidMass.new([mod_aa_mass_n['position'].to_i, mod_aa_mass_n['mass'].to_f])
          end
          peptide_hash_string = pep_sub_n['modified_peptide']
          mod_info = Sequest::PepXML::SearchHit::ModificationInfo.new([peptide_hash_string, masses])
        end
      end
    end

    key = [peptide_hash_string, charge]
    return pep_hash[key] if pep_hash.key?(key)

    pep_hash[key] = Proph::Prot::Pep.new([peptide_sequence, charge,
      peptide_n['initial_probability'].to_f, peptide_n['nsp_adjusted_probability'].to_f,
      peptide_n['weight'].to_f, booleanize(peptide_n['is_nondegenerate_evidence']),
      peptide_n['n_enzymatic_termini'].to_i, peptide_n['n_sibling_peptides'].to_f,
      peptide_n['n_sibling_peptides_bin'].to_i, peptide_n['n_instances'].to_i,
      booleanize(peptide_n['is_contributing_evidence']),
      peptide_n['calc_neutral_pep_mass'].to_f, mod_info, proteins] )
  end

end