mspire 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233) hide show
  1. data/INSTALL +1 -0
  2. data/README +25 -0
  3. data/Rakefile +129 -40
  4. data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
  5. data/bin/bioworks_to_pepxml.rb +1 -0
  6. data/bin/fasta_shaker.rb +1 -96
  7. data/bin/filter_and_validate.rb +5 -0
  8. data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
  9. data/bin/prob_validate.rb +6 -0
  10. data/bin/raw_to_mzXML.rb +2 -2
  11. data/bin/srf_group.rb +1 -0
  12. data/bin/srf_to_sqt.rb +40 -0
  13. data/changelog.txt +68 -0
  14. data/lib/align/chams.rb +6 -6
  15. data/lib/align.rb +4 -3
  16. data/lib/bsearch.rb +120 -0
  17. data/lib/fasta.rb +318 -86
  18. data/lib/group_by.rb +10 -0
  19. data/lib/index_by.rb +11 -0
  20. data/lib/merge_deep.rb +21 -0
  21. data/lib/{spec → ms/converter}/mzxml.rb +77 -109
  22. data/lib/ms/gradient_program.rb +171 -0
  23. data/lib/ms/msrun.rb +209 -0
  24. data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
  25. data/lib/ms/parser/mzdata/axml.rb +12 -0
  26. data/lib/ms/parser/mzdata/dom.rb +160 -0
  27. data/lib/ms/parser/mzdata/libxml.rb +7 -0
  28. data/lib/ms/parser/mzdata.rb +25 -0
  29. data/lib/ms/parser/mzxml/axml.rb +11 -0
  30. data/lib/ms/parser/mzxml/dom.rb +159 -0
  31. data/lib/ms/parser/mzxml/hpricot.rb +253 -0
  32. data/lib/ms/parser/mzxml/libxml.rb +15 -0
  33. data/lib/ms/parser/mzxml/regexp.rb +122 -0
  34. data/lib/ms/parser/mzxml/rexml.rb +72 -0
  35. data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
  36. data/lib/ms/parser/mzxml.rb +175 -0
  37. data/lib/ms/parser.rb +108 -0
  38. data/lib/ms/precursor.rb +10 -0
  39. data/lib/ms/scan.rb +81 -0
  40. data/lib/ms/spectrum.rb +193 -0
  41. data/lib/ms.rb +10 -0
  42. data/lib/mspire.rb +4 -0
  43. data/lib/roc.rb +61 -1
  44. data/lib/sample_enzyme.rb +31 -8
  45. data/lib/scan_i.rb +21 -0
  46. data/lib/spec_id/aa_freqs.rb +7 -3
  47. data/lib/spec_id/bioworks.rb +20 -14
  48. data/lib/spec_id/digestor.rb +139 -0
  49. data/lib/spec_id/mass.rb +116 -0
  50. data/lib/spec_id/parser/proph.rb +236 -0
  51. data/lib/spec_id/precision/filter/cmdline.rb +209 -0
  52. data/lib/spec_id/precision/filter/interactive.rb +134 -0
  53. data/lib/spec_id/precision/filter/output.rb +147 -0
  54. data/lib/spec_id/precision/filter.rb +623 -0
  55. data/lib/spec_id/precision/output.rb +60 -0
  56. data/lib/spec_id/precision/prob/cmdline.rb +139 -0
  57. data/lib/spec_id/precision/prob/output.rb +88 -0
  58. data/lib/spec_id/precision/prob.rb +171 -0
  59. data/lib/spec_id/proph/pep_summary.rb +92 -0
  60. data/lib/spec_id/proph/prot_summary.rb +484 -0
  61. data/lib/spec_id/proph.rb +2 -466
  62. data/lib/spec_id/protein_summary.rb +2 -2
  63. data/lib/spec_id/sequest/params.rb +316 -0
  64. data/lib/spec_id/sequest/pepxml.rb +1513 -0
  65. data/lib/spec_id/sequest.rb +2 -1672
  66. data/lib/spec_id/srf.rb +445 -177
  67. data/lib/spec_id.rb +183 -95
  68. data/lib/spec_id_xml.rb +8 -10
  69. data/lib/transmem/phobius.rb +147 -0
  70. data/lib/transmem/toppred.rb +368 -0
  71. data/lib/transmem.rb +157 -0
  72. data/lib/validator/aa.rb +135 -0
  73. data/lib/validator/background.rb +73 -0
  74. data/lib/validator/bias.rb +95 -0
  75. data/lib/validator/cmdline.rb +260 -0
  76. data/lib/validator/decoy.rb +94 -0
  77. data/lib/validator/digestion_based.rb +69 -0
  78. data/lib/validator/probability.rb +48 -0
  79. data/lib/validator/prot_from_pep.rb +234 -0
  80. data/lib/validator/transmem.rb +272 -0
  81. data/lib/validator/true_pos.rb +46 -0
  82. data/lib/validator.rb +214 -0
  83. data/lib/xml.rb +38 -0
  84. data/lib/xml_style_parser.rb +105 -0
  85. data/lib/xmlparser_wrapper.rb +19 -0
  86. data/script/compile_and_plot_smriti_final.rb +97 -0
  87. data/script/extract_gradient_programs.rb +56 -0
  88. data/script/get_apex_values_rexml.rb +44 -0
  89. data/script/mzXML2timeIndex.rb +1 -1
  90. data/script/smriti_final_analysis.rb +103 -0
  91. data/script/toppred_to_yaml.rb +47 -0
  92. data/script/tpp_installer.rb +1 -1
  93. data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
  94. data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
  95. data/specs/bin/fasta_shaker_spec.rb +259 -0
  96. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
  97. data/specs/bin/filter_and_validate_spec.rb +124 -0
  98. data/specs/bin/ms_to_lmat_spec.rb +34 -0
  99. data/specs/bin/prob_validate_spec.rb +62 -0
  100. data/specs/bin/protein_summary_spec.rb +10 -0
  101. data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
  102. data/specs/gi_spec.rb +22 -0
  103. data/specs/load_bin_path.rb +7 -0
  104. data/specs/merge_deep_spec.rb +13 -0
  105. data/specs/ms/gradient_program_spec.rb +77 -0
  106. data/specs/ms/msrun_spec.rb +455 -0
  107. data/specs/ms/parser_spec.rb +92 -0
  108. data/specs/ms/spectrum_spec.rb +89 -0
  109. data/specs/roc_spec.rb +251 -0
  110. data/specs/rspec_autotest.rb +149 -0
  111. data/specs/sample_enzyme_spec.rb +41 -0
  112. data/specs/spec_helper.rb +133 -0
  113. data/specs/spec_id/aa_freqs_spec.rb +52 -0
  114. data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
  115. data/specs/spec_id/digestor_spec.rb +75 -0
  116. data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
  117. data/specs/spec_id/precision/filter/output_spec.rb +31 -0
  118. data/specs/spec_id/precision/filter_spec.rb +243 -0
  119. data/specs/spec_id/precision/prob_spec.rb +111 -0
  120. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  121. data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
  122. data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
  123. data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
  124. data/specs/spec_id/sequest/params_spec.rb +68 -0
  125. data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
  126. data/specs/spec_id/sqt_spec.rb +138 -0
  127. data/specs/spec_id/srf_spec.rb +209 -0
  128. data/specs/spec_id/srf_spec_helper.rb +302 -0
  129. data/specs/spec_id_helper.rb +33 -0
  130. data/specs/spec_id_spec.rb +361 -0
  131. data/specs/spec_id_xml_spec.rb +33 -0
  132. data/specs/transmem/phobius_spec.rb +423 -0
  133. data/specs/transmem/toppred_spec.rb +297 -0
  134. data/specs/transmem_spec.rb +60 -0
  135. data/specs/transmem_spec_shared.rb +64 -0
  136. data/specs/validator/aa_spec.rb +107 -0
  137. data/specs/validator/background_spec.rb +51 -0
  138. data/specs/validator/bias_spec.rb +146 -0
  139. data/specs/validator/decoy_spec.rb +51 -0
  140. data/specs/validator/fasta_helper.rb +26 -0
  141. data/specs/validator/prot_from_pep_spec.rb +141 -0
  142. data/specs/validator/transmem_spec.rb +145 -0
  143. data/specs/validator/true_pos_spec.rb +58 -0
  144. data/specs/validator_helper.rb +33 -0
  145. data/specs/xml_spec.rb +12 -0
  146. data/test_files/000_pepxml18_small.xml +206 -0
  147. data/test_files/020a.mzXML.timeIndex +4710 -0
  148. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
  149. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
  150. data/test_files/4-03-03_small-prot.xml +321 -0
  151. data/test_files/4-03-03_small.xml +3876 -0
  152. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  153. data/test_files/bioworks-3.3_10prots.xml +5999 -0
  154. data/test_files/bioworks31.params +77 -0
  155. data/test_files/bioworks32.params +62 -0
  156. data/test_files/bioworks33.params +63 -0
  157. data/test_files/bioworks_single_run_small.xml +7237 -0
  158. data/test_files/bioworks_small.fasta +212 -0
  159. data/test_files/bioworks_small.params +63 -0
  160. data/test_files/bioworks_small.phobius +109 -0
  161. data/test_files/bioworks_small.toppred.out +2847 -0
  162. data/test_files/bioworks_small.xml +5610 -0
  163. data/test_files/bioworks_with_INV_small.xml +3753 -0
  164. data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
  165. data/test_files/corrupted_900.srf +0 -0
  166. data/test_files/head_of_7MIX.srf +0 -0
  167. data/test_files/interact-opd1_mods_small-prot.xml +304 -0
  168. data/test_files/messups.fasta +297 -0
  169. data/test_files/opd1/000.my_answer.100lines.xml +101 -0
  170. data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
  171. data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
  172. data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
  173. data/test_files/opd1/000_020-prot.png +0 -0
  174. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
  175. data/test_files/opd1/000_020_3prots-prot.xml +62 -0
  176. data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
  177. data/test_files/opd1/sequest.3.1.params +77 -0
  178. data/test_files/opd1/sequest.3.2.params +62 -0
  179. data/test_files/opd1/twenty_scans.mzXML +418 -0
  180. data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
  181. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  182. data/test_files/opd1/twenty_scans_answ.lmata +9 -0
  183. data/test_files/opd1_020_beginning.RAW +0 -0
  184. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
  185. data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
  186. data/test_files/pepproph_small.xml +4691 -0
  187. data/test_files/phobius.small.noheader.txt +50 -0
  188. data/test_files/phobius.small.small.txt +53 -0
  189. data/test_files/s01_anC1_ld020mM.key.txt +25 -0
  190. data/test_files/s01_anC1_ld020mM.meth +0 -0
  191. data/test_files/small.fasta +297 -0
  192. data/test_files/smallraw.RAW +0 -0
  193. data/test_files/tf_bioworks2excel.bioXML +14340 -0
  194. data/test_files/tf_bioworks2excel.txt.actual +1035 -0
  195. data/test_files/toppred.small.out +416 -0
  196. data/test_files/toppred.xml.out +318 -0
  197. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
  198. data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
  199. data/test_files/yeast_gly_small-prot.xml +265 -0
  200. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
  201. data/test_files/yeast_gly_small.xml +3807 -0
  202. data/test_files/yeast_gly_small2.parentTimes +6 -0
  203. metadata +273 -57
  204. data/bin/filter.rb +0 -6
  205. data/bin/precision.rb +0 -5
  206. data/lib/spec/mzdata/parser.rb +0 -108
  207. data/lib/spec/mzdata.rb +0 -48
  208. data/lib/spec/mzxml/parser.rb +0 -449
  209. data/lib/spec/scan.rb +0 -55
  210. data/lib/spec_id/filter.rb +0 -797
  211. data/lib/spec_id/precision.rb +0 -421
  212. data/lib/toppred.rb +0 -18
  213. data/script/filter-peps.rb +0 -164
  214. data/test/tc_aa_freqs.rb +0 -59
  215. data/test/tc_fasta_shaker.rb +0 -149
  216. data/test/tc_filter.rb +0 -203
  217. data/test/tc_filter_peps.rb +0 -46
  218. data/test/tc_gi.rb +0 -17
  219. data/test/tc_id_class_anal.rb +0 -70
  220. data/test/tc_id_precision.rb +0 -89
  221. data/test/tc_msrun.rb +0 -88
  222. data/test/tc_mzxml.rb +0 -88
  223. data/test/tc_mzxml_to_lmat.rb +0 -36
  224. data/test/tc_peptide_parent_times.rb +0 -27
  225. data/test/tc_precision.rb +0 -60
  226. data/test/tc_roc.rb +0 -166
  227. data/test/tc_sample_enzyme.rb +0 -32
  228. data/test/tc_scan.rb +0 -26
  229. data/test/tc_sequest.rb +0 -336
  230. data/test/tc_spec.rb +0 -78
  231. data/test/tc_spec_id.rb +0 -201
  232. data/test/tc_spec_id_xml.rb +0 -36
  233. data/test/tc_srf.rb +0 -262
@@ -5,10 +5,11 @@ require 'xmlparser'
5
5
  require 'spec_id'
6
6
  require 'zlib'
7
7
  require 'hash_by'
8
- require 'set_from_hash'
9
8
  require 'array_class'
9
+ require 'fasta'
10
10
 
11
11
  ## have to pre-declare some guys
12
+ module ProteinReferenceable; end
12
13
  module SpecID; end
13
14
  module SpecID::Prot; end
14
15
  module SpecID::Pep; end
@@ -274,7 +275,7 @@ class Bioworks::XMLParser < XMLParser
274
275
  def endElement(name)
275
276
  case name
276
277
  when "peptide"
277
- @current_obj.set_from_hash(@current_hash)
278
+ @current_obj.set_from_hash_given_text(@current_hash)
278
279
  when "protein"
279
280
  else
280
281
  @current_hash[name] = @current_data
@@ -293,6 +294,7 @@ module Bioworks::XML
293
294
  end
294
295
 
295
296
  class Bioworks::Prot
297
+ include ProteinReferenceable
296
298
  include SpecID::Prot
297
299
  include Bioworks::XML
298
300
 
@@ -357,20 +359,20 @@ class Bioworks::Prot
357
359
  hash.delete("bioworksinfo")
358
360
  hash["sf"] = hash.delete("Sf")
359
361
  hash["pi"] = hash.delete("pI")
360
- set_from_hash(hash)
362
+ set_from_xml_hash(hash)
361
363
  end
362
364
 
363
365
  # changes the sf to Sf and pI to pi
364
366
  def set_from_xml_hash(hash)
365
367
  @reference = hash["reference"]
366
- @protein_probability = hash["protein_probability"]
367
- @probability = @protein_probability.to_f
368
- @consensus_score = hash["consensus_score"]
369
- @sf = hash["Sf"]
370
- @unified_score = hash["unified_score"]
371
- @coverage = hash["coverage"]
372
- @pi = hash["pI"]
373
- @weight = hash["weight"]
368
+ @protein_probability = hash["protein_probability"].to_f
369
+ #@probability = @protein_probability.to_f
370
+ @consensus_score = hash["consensus_score"].to_f
371
+ @sf = hash["Sf"].to_f
372
+ @unified_score = hash["unified_score"].to_f
373
+ @coverage = hash["coverage"].to_f
374
+ @pi = hash["pI"].to_f
375
+ @weight = hash["weight"].to_f
374
376
  @accession = hash["accession"]
375
377
  end
376
378
  end
@@ -392,6 +394,8 @@ class Bioworks::Pep
392
394
  ## NOTE! the mass is really the theoretical MH+!!!!
393
395
  ## NOTE! ALL values stored as strings, except peptide_probability!
394
396
 
397
+ #ions is a string 'x/y'
398
+
395
399
  ## other accessors:
396
400
  def probability ; self[15] end
397
401
  def mh ; self[1] end
@@ -449,14 +453,16 @@ class Bioworks::Pep
449
453
  end
450
454
  $VERBOSE = tmp_verb
451
455
 
456
+ undef_method :inspect
452
457
  def inspect
453
458
  "<Bioworks::Pep sequence: #{sequence}, mass: #{mass}, deltamass: #{deltamass}, charge: #{charge}, xcorr: #{xcorr}, deltacn: #{deltacn}, prots(count):#{prots.size}, base_name: #{base_name}, first_scan: #{first_scan}, last_scan: #{last_scan}, file: #{file}, peptide_probability: #{peptide_probability}, aaseq:#{aaseq}>"
454
459
 
455
460
 
456
461
  end
457
462
 
458
- def set_from_hash(hash)
459
- self[0,11] = [hash["sequence"], hash["mass"], hash["deltamass"], hash["charge"], hash["xcorr"], hash["deltacn"], hash["sp"], hash["rsp"], hash["ions"], hash["count"], hash["tic"]]
463
+ # sets the fields from a hash of text values, casting the numeric ones to Float/Integer ('sequence' and 'ions' are kept as strings)
464
+ def set_from_hash_given_text(hash)
465
+ self[0,11] = [hash["sequence"], hash["mass"].to_f, hash["deltamass"].to_f, hash["charge"].to_i, hash["xcorr"].to_f, hash["deltacn"].to_f, hash["sp"].to_f, hash["rsp"].to_i, hash["ions"], hash["count"].to_i, hash["tic"].to_i]
460
466
  self.file = hash["file"]
461
467
  self[15] = hash["peptide_probability"].to_f
462
468
  self[19] = SpecID::Pep.sequence_to_aaseq(self[0]) ## aaseq
@@ -470,7 +476,7 @@ class Bioworks::Pep
470
476
  hash[$1] = $2
471
477
  #puts "IN PEP: " + $1 + ": " + $2
472
478
  elsif line =~ @@end_pep_re
473
- set_from_hash(hash)
479
+ set_from_hash_given_text(hash)
474
480
  #puts "SELF[12]: #{self[12]}"
475
481
  #puts "SELF[12]: #{self[12]}"
476
482
  break
@@ -0,0 +1,139 @@
1
+
2
+ require 'spec_id/sequest/pepxml'
3
+ require 'spec_id/mass'
4
+
5
# Digests the proteins of a fasta object with a sample enzyme and keeps only
# the peptides whose (M+H)+ mass lies inside a configured window.
#
# A digestor must be able to respond to these methods:
class Digestor

  # min_mh_mass = min molecular mass of peptide (M+H)+
  attr_accessor :min_mh_mass
  # max_mh_mass = max molecular mass of peptide (M+H)+
  attr_accessor :max_mh_mass
  # the number of allowable missed cleavages
  attr_accessor :missed_cleavages
  # sample_enzyme = SampleEnzyme object
  attr_accessor :sample_enzyme
  # hash of masses to use (matching keys of Mass::AVG or Mass::MONO)
  # In addition, the following keys (as symbols) are recognized.
  #     add_C_term_protein
  #     add_C_term_peptide
  #     add_N_term_protein
  #     add_N_term_peptide
  attr_accessor :mass_hash

  # returns a list of peptide objects created from a digestion of the fasta
  # proteins using the sequest params (variable mods not supported yet)
  def self.digest(fasta_obj, params_obj)
    dig = self.new
    dig.set_from_params(params_obj)
    dig.create_peptide_hash(fasta_obj).values
  end

  def initialize
  end

  # takes a parameters object and fills in the necessary values
  # (sample_enzyme, missed_cleavages, min/max (M+H)+ and mass_hash).
  #
  # Only Sequest::Params objects are understood.
  # Raises NotImplementedError if include_variable_mods is true and
  # ArgumentError for any other kind of params object.
  def set_from_params(params_obj, include_variable_mods=false)
    raise NotImplementedError, "no variable mods yet" if include_variable_mods
    unless params_obj.is_a? Sequest::Params
      raise ArgumentError, "Don't recognize params object of type: #{params_obj.class}"
    end

    @sample_enzyme = params_obj.sample_enzyme
    @missed_cleavages = params_obj.max_num_internal_cleavage_sites.to_i
    # digest_mass_range is a space separated 'min max' string
    (@min_mh_mass, @max_mh_mass) = params_obj.digest_mass_range.split(' ').map {|v| v.to_f }
    (static_mods, static_terminal_mods) = Sequest::PepXML::Modifications.new.create_static_mods(params_obj)
    # '1' selects monoisotopic parent masses; anything else ('0' included)
    # falls back to average masses
    monoisotopic_parents = (params_obj.mass_type_parent == '1')

    @mass_hash = Mass.add_static_masses(monoisotopic_parents, static_mods, static_terminal_mods)
  end

  # aka 'digestion'
  # will return a hash of SpecID::GenericPep objects (with 'aaseq' and
  # 'prots') hashed by aminoacid sequence.  The prot will be the fasta object.
  #
  # A peptide produced by several proteins is represented once, with every
  # producing protein appended to its 'prots'.
  def create_peptide_hash(fasta_obj)
    pep_to_prots_hash = {}
    pep_aaseqs_ar = fasta_obj.map do |prot|
      @sample_enzyme.digest(prot.aaseq, @missed_cleavages)
    end
    prot_aaseqs = fasta_obj.map {|prot| prot.aaseq }
    passing_pep_seqs_ar = limit_sizes(prot_aaseqs, pep_aaseqs_ar, @min_mh_mass, @max_mh_mass, @mass_hash)

    fasta_obj.each_with_index do |prot, i|
      passing_pep_seqs_ar[i].each do |pep_seq|
        unless pep_to_prots_hash.key?(pep_seq)
          pep_ob = SpecID::GenericPep.new
          pep_ob.prots = []
          pep_ob.aaseq = pep_seq
          pep_to_prots_hash[pep_seq] = pep_ob
        end
        pep_to_prots_hash[pep_seq].prots << prot
      end
    end
    pep_to_prots_hash
  end

  # min max are both in terms of the M+H(+)
  #
  # h_plus:
  # On this website:
  # http://db.systemsbiology.net:8080/proteomicsToolkit/FragIonServlet.html
  # They use the mass of 'H' not 'H+' to find the (M+H)+ weight.
  # (pass h_plus=true to use the :h_plus mass instead of :h)
  #
  # The prot_aaseq is used if the mass_hash contains the keys
  # :add_C_term_protein or :add_N_term_protein
  # (not implemented yet: NotImplementedError is raised in that case)
  #
  # prot_aaseqs is parallel to pep_aaseqs_ar where each is a group of
  # peptides matching a protein aaseq
  # returns another parallel array holding, for each protein, the peptides
  # whose (M+H)+ is within [min_mh, max_mh] (both bounds inclusive)
  #
  # Raises ArgumentError if a peptide contains a residue that has no entry
  # in mass_hash.
  def limit_sizes(prot_aaseqs, pep_aaseqs_ar, min_mh, max_mh, mass_hash, h_plus=false)
    if mass_hash.key?(:add_C_term_protein) or mass_hash.key?(:add_N_term_protein)
      raise NotImplementedError, "need to add ability to change weights of peptides from the ends of proteins"
    end

    # figure out how much must be added to each peptide
    # include the h2o, the h, and N and C terminal static mods
    h_key = h_plus ? :h_plus : :h
    final_add = mass_hash[:h2o] + mass_hash[h_key]
    [:add_N_term_peptide, :add_C_term_peptide].each do |sym|
      final_add += mass_hash[sym] if mass_hash.key?(sym)
    end

    # residues arrive as one letter strings, the mass_hash is keyed by symbol
    hash_by_aa_string = {}
    mass_hash.each {|k,v| hash_by_aa_string[k.to_s] = v }

    pep_aaseqs_ar.map do |pep_aaseqs|
      pep_aaseqs.select do |aaseq|
        sum = 0.0
        aaseq.split('').each do |let|
          residue_mass = hash_by_aa_string[let]
          if residue_mass.nil?
            raise ArgumentError, "no mass known for residue #{let.inspect} in peptide #{aaseq.inspect}"
          end
          sum += residue_mass
        end
        mh_plus = sum + final_add
        (mh_plus >= min_mh) && (mh_plus <= max_mh)
      end
    end
  end

end
@@ -0,0 +1,116 @@
1
+
2
# Residue and element masses (monoisotopic and average) plus a helper that
# layers static modifications on top of them.
class Mass
  # http://expasy.org/tools/findmod/findmod_masses.html
  # still need to add the modifications
  MONO = {
    :A => 71.03711,
    :R => 156.10111,
    :N => 114.04293,
    :D => 115.02694,
    :C => 103.00919,
    :E => 129.04259,
    :Q => 128.05858,
    :G => 57.02146,
    :H => 137.05891,
    :I => 113.08406,
    :L => 113.08406,
    :K => 128.09496,
    :M => 131.04049,
    :F => 147.06841,
    :P => 97.05276,
    :S => 87.03203,
    :T => 101.04768,
    :W => 186.07931,
    :Y => 163.06333,
    :V => 99.06841,

    # uncommon
    :B => 114.534935,  # average of aspartic acid and asparagine: (D + N) / 2
    :U => 150.95364,   # (selenocysteine) http://www.matrix-science.com/help/aa_help.html
    :X => 118.805716,  # the average of the mono masses of the 20 amino acids
    :* => 118.805716,  # same as X

    # elements etc.
    :h => 1.00783,
    :h_plus => 1.00728,
    :o => 15.9949146,
    :h2o => 18.01056,

  }
  AVG = {
    :A => 71.0788,
    :R => 156.1875,
    :N => 114.1038,
    :D => 115.0886,
    :C => 103.1388,
    :E => 129.1155,
    :Q => 128.1307,
    :G => 57.0519,
    :H => 137.1411,
    :I => 113.1594,
    :L => 113.1594,
    :K => 128.1741,
    :M => 131.1926,
    :F => 147.1766,
    :P => 97.1167,
    :S => 87.0782,
    :T => 101.1051,
    :W => 186.2132,
    :Y => 163.1760,
    :V => 99.1326,

    # uncommon
    :B => 114.5962,   # average of aspartic acid and asparagine: (D + N) / 2
    :U => 150.03,     # (selenocysteine) http://www.matrix-science.com/help/aa_help.html
    :X => 118.88603,  # the average of the masses of the 20 amino acids
    :* => 118.88603,  # same as X

    # elements etc.
    :h => 1.00794,
    :h_plus => 1.00739,
    :o => 15.9994,
    :h2o => 18.01524,
  }

  # returns a fresh hash where it has been added to each amino acid the amount
  # specified in the array of a PepXML::Modifications object
  # (Mass::MONO is the starting point if monoisotopic is true, Mass::AVG
  # otherwise; the constants themselves are never modified)
  #
  # static_mods: objects responding to 'aminoacid' and 'massdiff'
  # static_terminal_mods: objects responding to 'protein_terminus',
  # 'terminus' and 'massdiff'; may be omitted or nil
  #
  # if static_terminal_mods given than will create the following keys as
  # symbols as necessary:
  #     add_C_term_protein
  #     add_C_term_peptide
  #     add_N_term_protein
  #     add_N_term_peptide
  def self.add_static_masses(monoisotopic, static_mods, static_terminal_mods=nil)
    copy_hash = (monoisotopic ? Mass::MONO : Mass::AVG).dup
    static_mods.each do |mod|
      copy_hash[mod.aminoacid.to_sym] += mod.massdiff
    end
    # the default for static_terminal_mods is nil: treat it as 'no mods'
    (static_terminal_mods || []).each do |mod|
      if x = mod.protein_terminus
        # its a protein terminus modification
        case x
        when 'n'
          copy_hash[:add_N_term_protein] = mod.massdiff
        when 'c'
          copy_hash[:add_C_term_protein] = mod.massdiff
        end
      else
        # its a peptide terminus modification
        case mod.terminus
        when 'n'
          copy_hash[:add_N_term_peptide] = mod.massdiff
        when 'c'
          copy_hash[:add_C_term_peptide] = mod.massdiff
        end
      end
    end
    copy_hash
  end
end
116
+
@@ -0,0 +1,236 @@
1
+ require 'xml_style_parser'
2
+ require 'spec_id/sequest/pepxml'
3
+
4
+
5
+ module SpecID ; end
6
+ module SpecID::Parser ; end
7
+
8
+
9
# Parser for PeptideProphet pepXML output.
class SpecID::Parser::PepProph
  include XMLStyleParser

  # parse_type: name of the parsing method (stored in @method)
  # version: version of the file format to parse
  #
  # Picks the first available XML backend among AXML and LibXML and stores a
  # Proc that turns a filename into that backend's root node.
  # Raises NotImplementedError if neither backend is available.
  def initialize(parse_type=:spec_id, version='3.0')
    @method = parse_type
    @version = version
    implemented = %w(AXML LibXML)
    klass_s = XMLStyleParser.available_xml_parsers.select {|v| implemented.include?(v) }.first
    case klass_s
    when 'AXML'
      @get_root_node_from_file = Proc.new do |file|
        AXML.parse_file(file)
      end
    when 'LibXML'  # LibXML is buggy on some machines...
      @get_root_node_from_file = Proc.new do |file|
        doc = XML::Document.file(file)
        doc.root
      end
    else
      raise NotImplementedError, "Can only parse with #{implemented.join(', ')} right now"
    end
  end

  # Fills a spec_id object from the pepXML file and returns whatever its
  # from_pepxml_node returns (presumably the spec_id object itself; confirm
  # against Proph::PepSummary).
  #
  # opts[:spec_id] supplies the object to fill; a new Proph::PepSummary is
  # created otherwise.
  # Raises NotImplementedError for a nil version or one below 3.0.
  def spec_id(file, opts={})
    # compare numerically: as Strings, '3' < '3.0' and '10.0' < '3.0' are
    # both true, which wrongly rejected those versions
    raise NotImplementedError, "cannot do #{@version} yet" if @version.nil? or @version.to_f < 3.0
    spec_id_obj = opts[:spec_id] || Proph::PepSummary.new
    msms_pipeline_analysis_n = @get_root_node_from_file.call(file)
    spec_id_obj.peptideprophet_summary = msms_pipeline_analysis_n.find_first("descendant::peptideprophet_summary")

    msms_run_summary_n = msms_pipeline_analysis_n.find_first('child::msms_run_summary')
    spec_id_obj.from_pepxml_node(msms_run_summary_n)
  end

end
48
+
49
# Parser for ProteinProphet protXML output (version '4' only).
class SpecID::Parser::ProtProph
  include XMLStyleParser
  # unique_stripped_peptides is a '+' separated list of sequences
  Split_unique_stripped_peptides_re = /\+/

  # parse_type: name of the method that #parse dispatches to
  # version: version of the file format to parse
  #
  # Picks the first available XML backend among AXML and LibXML and stores a
  # Proc that turns a filename into that backend's root node.
  # Raises NotImplementedError if neither backend is available.
  def initialize(parse_type=:spec_id, version='4')
    @method = parse_type
    @version = version

    implemented = %w(AXML LibXML)
    klass_s = XMLStyleParser.available_xml_parsers.select {|v| implemented.include?(v) }.first
    case klass_s
    when 'AXML'
      #puts "parsing with AXML (XMLParser based)" if $VERBOSE
      @get_root_node_from_file = Proc.new do |file|
        AXML.parse_file(file)
      end
    when 'LibXML'  # LibXML is buggy on some machines...
      #puts "parsing with LibXML" if $VERBOSE
      @get_root_node_from_file = Proc.new do |file|
        doc = XML::Document.file(file)
        doc.root
      end
    else
      raise NotImplementedError, "Can only parse with #{implemented.join(', ')} right now"
    end
  end

  # returns the spec_id object, with its peps, prots and prot_groups set
  # from the file.
  #
  # opts[:spec_id] supplies the object to fill; a new Proph::ProtSummary is
  # created otherwise.
  # Raises NotImplementedError unless the version is '4'.
  def spec_id(file, opts={})
    raise NotImplementedError, "cannot do #{@version} yet" if @version != '4'
    spec_id_obj = opts[:spec_id] || Proph::ProtSummary.new
    protein_summary_n = @get_root_node_from_file.call(file)

    pep_hash = {}
    prot_hash = {}
    protein_groups = []

    # get all the proteins from inside protein groups
    # The first child is taken to be the protein_summary_header and is
    # skipped (could grab some of this info if we wanted...)
    header_seen = false
    protein_summary_n.each do |child_n|
      if !header_seen
        header_seen = true
      elsif child_n.name == 'protein_group'
        protein_groups << get_proteins(child_n, pep_hash, prot_hash)
      end
    end

    # need to finalize hash stuff: parent proteins that were only known by
    # name while parsing are swapped for their Proph::Prot objects
    pep_hash.each do |k,pep|
      new_prots = []
      pep.prots.each do |prot_or_string|
        if prot_or_string.is_a?(Proph::Prot)
          new_prots << prot_or_string
        else
          prt = prot_hash[prot_or_string]
          # a name with no protein entry is an indistinguishable protein!
          # (it is dropped)
          new_prots << prt unless prt.nil?
        end
      end
      pep.prots = new_prots
    end

    spec_id_obj.peps = pep_hash.values
    spec_id_obj.prots = prot_hash.values
    spec_id_obj.prot_groups = protein_groups
    spec_id_obj
  end

  # takes a Y or N and gives true/false
  # (anything else gives nil)
  def booleanize(string)
    case string
    when 'Y'
      true
    when 'N'
      false
    else
      nil
    end
  end

  # assumes that all the rest of the nodes are protein_groups
  # pep_hash is hashed on aaseq OR modified peptide amino acid sequence (if
  # modified) + charge
  # (as far as I can tell, all protein entries are unique!)
  # prot_hash is hashed on protein_name; both hashes are added to in place
  # returns a ProtGroup object
  #
  # Raises Exception if the group holds a child that is not a 'protein'.
  def get_proteins(protein_group_node, pep_hash, prot_hash)

    protein_group_proteins = []

    protein_group_node.each do |protein_n|
      raise(Exception, "not expecting anything but protein's, got: #{protein_n.name}") if protein_n.name != 'protein'
      # probability peps protein_name n_indistinguishable_proteins percent_coverage unique_stripped_peptides group_sibling_id total_number_peptides pct_spectrum_ids description

      # INITIALIZE the protein and set key
      # (the description is filled in from the annotation child below)
      protein_name = protein_n['protein_name']
      peps = []
      protein = Proph::Prot.new( [protein_name, protein_n['probability'].to_f,
                                protein_n['n_indistinguishable_proteins'].to_i,
                                protein_n['percent_coverage'].to_f,
                                protein_n['unique_stripped_peptides'].split(Split_unique_stripped_peptides_re),
                                protein_n['group_sibling_id'], protein_n['total_number_peptides'].to_i,
                                protein_n['pct_spectrum_ids'].to_f, nil,
                                peps ])
      protein_group_proteins << protein
      prot_hash[protein_name] = protein

      # traverse through the peptides (and annotation)
      protein_n.each do |protein_sub_n|
        if protein_sub_n.name == 'annotation'
          protein.description = protein_sub_n['protein_description']
        end
        if protein_sub_n.name == 'peptide'
          peptide_n = protein_sub_n
          # peptide_sequence charge initial_probability nsp_adjusted_probability weight is_nondegenerate_evidence n_enzymatic_termini n_sibling_peptides n_sibling_peptides_bin n_instances is_contributing_evidence calc_neutral_pep_mass modification_info prots

          # create a proteins array for each peptide
          proteins = [protein]
          peptide_sequence = peptide_n['peptide_sequence']
          charge = peptide_n['charge'].to_i

          # GET list of all proteins and modifications

          mod_info = nil
          peptide_hash_string = peptide_sequence
          if peptide_n.child?
            peptide_n.each do |pep_sub_n|
              case pep_sub_n.name
              when 'peptide_parent_protein'
                # NOTE! the proteins list will have strings until the assoc.
                # prot is found!
                proteins << pep_sub_n['protein_name']
              when 'modification_info'
                masses = pep_sub_n.map do |mod_aa_mass_n|
                  Sequest::PepXML::SearchHit::ModificationInfo::ModAminoacidMass.new([mod_aa_mass_n['position'].to_i, mod_aa_mass_n['mass'].to_f])
                end
                # a modified peptide is hashed on its modified sequence
                peptide_hash_string = pep_sub_n['modified_peptide']
                mod_info = Sequest::PepXML::SearchHit::ModificationInfo.new([peptide_hash_string, masses])
              end
            end
          end

          key = [peptide_hash_string, charge]
          peptide =
            if pep_hash.key? key
              pep_hash[key]
            else
              # n_sibling_peptides_bin is read from its own attribute (it
              # used to be filled with a truncated n_sibling_peptides)
              pep = Proph::Prot::Pep.new([peptide_sequence, charge,
                                         peptide_n['initial_probability'].to_f, peptide_n['nsp_adjusted_probability'].to_f,
                                         peptide_n['weight'].to_f, booleanize(peptide_n['is_nondegenerate_evidence']),
                                         peptide_n['n_enzymatic_termini'].to_i, peptide_n['n_sibling_peptides'].to_f,
                                         peptide_n['n_sibling_peptides_bin'].to_i, peptide_n['n_instances'].to_i,
                                         booleanize(peptide_n['is_contributing_evidence']),
                                         peptide_n['calc_neutral_pep_mass'].to_f, mod_info, proteins] )
              pep_hash[key] = pep
              pep
            end
          peps << peptide
        end
      end  # end protein children
    end
    Proph::ProtGroup.new(:prots => protein_group_proteins, :group_number => protein_group_node['group_number'].to_i, :probability => protein_group_node['probability'].to_f)
  end

  # dispatches to the parsing method chosen at construction (parse_type)
  def parse(file, opts)
    send(@method, file, opts)
  end

end