mspire 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233) hide show
  1. data/INSTALL +1 -0
  2. data/README +25 -0
  3. data/Rakefile +129 -40
  4. data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
  5. data/bin/bioworks_to_pepxml.rb +1 -0
  6. data/bin/fasta_shaker.rb +1 -96
  7. data/bin/filter_and_validate.rb +5 -0
  8. data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
  9. data/bin/prob_validate.rb +6 -0
  10. data/bin/raw_to_mzXML.rb +2 -2
  11. data/bin/srf_group.rb +1 -0
  12. data/bin/srf_to_sqt.rb +40 -0
  13. data/changelog.txt +68 -0
  14. data/lib/align/chams.rb +6 -6
  15. data/lib/align.rb +4 -3
  16. data/lib/bsearch.rb +120 -0
  17. data/lib/fasta.rb +318 -86
  18. data/lib/group_by.rb +10 -0
  19. data/lib/index_by.rb +11 -0
  20. data/lib/merge_deep.rb +21 -0
  21. data/lib/{spec → ms/converter}/mzxml.rb +77 -109
  22. data/lib/ms/gradient_program.rb +171 -0
  23. data/lib/ms/msrun.rb +209 -0
  24. data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
  25. data/lib/ms/parser/mzdata/axml.rb +12 -0
  26. data/lib/ms/parser/mzdata/dom.rb +160 -0
  27. data/lib/ms/parser/mzdata/libxml.rb +7 -0
  28. data/lib/ms/parser/mzdata.rb +25 -0
  29. data/lib/ms/parser/mzxml/axml.rb +11 -0
  30. data/lib/ms/parser/mzxml/dom.rb +159 -0
  31. data/lib/ms/parser/mzxml/hpricot.rb +253 -0
  32. data/lib/ms/parser/mzxml/libxml.rb +15 -0
  33. data/lib/ms/parser/mzxml/regexp.rb +122 -0
  34. data/lib/ms/parser/mzxml/rexml.rb +72 -0
  35. data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
  36. data/lib/ms/parser/mzxml.rb +175 -0
  37. data/lib/ms/parser.rb +108 -0
  38. data/lib/ms/precursor.rb +10 -0
  39. data/lib/ms/scan.rb +81 -0
  40. data/lib/ms/spectrum.rb +193 -0
  41. data/lib/ms.rb +10 -0
  42. data/lib/mspire.rb +4 -0
  43. data/lib/roc.rb +61 -1
  44. data/lib/sample_enzyme.rb +31 -8
  45. data/lib/scan_i.rb +21 -0
  46. data/lib/spec_id/aa_freqs.rb +7 -3
  47. data/lib/spec_id/bioworks.rb +20 -14
  48. data/lib/spec_id/digestor.rb +139 -0
  49. data/lib/spec_id/mass.rb +116 -0
  50. data/lib/spec_id/parser/proph.rb +236 -0
  51. data/lib/spec_id/precision/filter/cmdline.rb +209 -0
  52. data/lib/spec_id/precision/filter/interactive.rb +134 -0
  53. data/lib/spec_id/precision/filter/output.rb +147 -0
  54. data/lib/spec_id/precision/filter.rb +623 -0
  55. data/lib/spec_id/precision/output.rb +60 -0
  56. data/lib/spec_id/precision/prob/cmdline.rb +139 -0
  57. data/lib/spec_id/precision/prob/output.rb +88 -0
  58. data/lib/spec_id/precision/prob.rb +171 -0
  59. data/lib/spec_id/proph/pep_summary.rb +92 -0
  60. data/lib/spec_id/proph/prot_summary.rb +484 -0
  61. data/lib/spec_id/proph.rb +2 -466
  62. data/lib/spec_id/protein_summary.rb +2 -2
  63. data/lib/spec_id/sequest/params.rb +316 -0
  64. data/lib/spec_id/sequest/pepxml.rb +1513 -0
  65. data/lib/spec_id/sequest.rb +2 -1672
  66. data/lib/spec_id/srf.rb +445 -177
  67. data/lib/spec_id.rb +183 -95
  68. data/lib/spec_id_xml.rb +8 -10
  69. data/lib/transmem/phobius.rb +147 -0
  70. data/lib/transmem/toppred.rb +368 -0
  71. data/lib/transmem.rb +157 -0
  72. data/lib/validator/aa.rb +135 -0
  73. data/lib/validator/background.rb +73 -0
  74. data/lib/validator/bias.rb +95 -0
  75. data/lib/validator/cmdline.rb +260 -0
  76. data/lib/validator/decoy.rb +94 -0
  77. data/lib/validator/digestion_based.rb +69 -0
  78. data/lib/validator/probability.rb +48 -0
  79. data/lib/validator/prot_from_pep.rb +234 -0
  80. data/lib/validator/transmem.rb +272 -0
  81. data/lib/validator/true_pos.rb +46 -0
  82. data/lib/validator.rb +214 -0
  83. data/lib/xml.rb +38 -0
  84. data/lib/xml_style_parser.rb +105 -0
  85. data/lib/xmlparser_wrapper.rb +19 -0
  86. data/script/compile_and_plot_smriti_final.rb +97 -0
  87. data/script/extract_gradient_programs.rb +56 -0
  88. data/script/get_apex_values_rexml.rb +44 -0
  89. data/script/mzXML2timeIndex.rb +1 -1
  90. data/script/smriti_final_analysis.rb +103 -0
  91. data/script/toppred_to_yaml.rb +47 -0
  92. data/script/tpp_installer.rb +1 -1
  93. data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
  94. data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
  95. data/specs/bin/fasta_shaker_spec.rb +259 -0
  96. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
  97. data/specs/bin/filter_and_validate_spec.rb +124 -0
  98. data/specs/bin/ms_to_lmat_spec.rb +34 -0
  99. data/specs/bin/prob_validate_spec.rb +62 -0
  100. data/specs/bin/protein_summary_spec.rb +10 -0
  101. data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
  102. data/specs/gi_spec.rb +22 -0
  103. data/specs/load_bin_path.rb +7 -0
  104. data/specs/merge_deep_spec.rb +13 -0
  105. data/specs/ms/gradient_program_spec.rb +77 -0
  106. data/specs/ms/msrun_spec.rb +455 -0
  107. data/specs/ms/parser_spec.rb +92 -0
  108. data/specs/ms/spectrum_spec.rb +89 -0
  109. data/specs/roc_spec.rb +251 -0
  110. data/specs/rspec_autotest.rb +149 -0
  111. data/specs/sample_enzyme_spec.rb +41 -0
  112. data/specs/spec_helper.rb +133 -0
  113. data/specs/spec_id/aa_freqs_spec.rb +52 -0
  114. data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
  115. data/specs/spec_id/digestor_spec.rb +75 -0
  116. data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
  117. data/specs/spec_id/precision/filter/output_spec.rb +31 -0
  118. data/specs/spec_id/precision/filter_spec.rb +243 -0
  119. data/specs/spec_id/precision/prob_spec.rb +111 -0
  120. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  121. data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
  122. data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
  123. data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
  124. data/specs/spec_id/sequest/params_spec.rb +68 -0
  125. data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
  126. data/specs/spec_id/sqt_spec.rb +138 -0
  127. data/specs/spec_id/srf_spec.rb +209 -0
  128. data/specs/spec_id/srf_spec_helper.rb +302 -0
  129. data/specs/spec_id_helper.rb +33 -0
  130. data/specs/spec_id_spec.rb +361 -0
  131. data/specs/spec_id_xml_spec.rb +33 -0
  132. data/specs/transmem/phobius_spec.rb +423 -0
  133. data/specs/transmem/toppred_spec.rb +297 -0
  134. data/specs/transmem_spec.rb +60 -0
  135. data/specs/transmem_spec_shared.rb +64 -0
  136. data/specs/validator/aa_spec.rb +107 -0
  137. data/specs/validator/background_spec.rb +51 -0
  138. data/specs/validator/bias_spec.rb +146 -0
  139. data/specs/validator/decoy_spec.rb +51 -0
  140. data/specs/validator/fasta_helper.rb +26 -0
  141. data/specs/validator/prot_from_pep_spec.rb +141 -0
  142. data/specs/validator/transmem_spec.rb +145 -0
  143. data/specs/validator/true_pos_spec.rb +58 -0
  144. data/specs/validator_helper.rb +33 -0
  145. data/specs/xml_spec.rb +12 -0
  146. data/test_files/000_pepxml18_small.xml +206 -0
  147. data/test_files/020a.mzXML.timeIndex +4710 -0
  148. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
  149. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
  150. data/test_files/4-03-03_small-prot.xml +321 -0
  151. data/test_files/4-03-03_small.xml +3876 -0
  152. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  153. data/test_files/bioworks-3.3_10prots.xml +5999 -0
  154. data/test_files/bioworks31.params +77 -0
  155. data/test_files/bioworks32.params +62 -0
  156. data/test_files/bioworks33.params +63 -0
  157. data/test_files/bioworks_single_run_small.xml +7237 -0
  158. data/test_files/bioworks_small.fasta +212 -0
  159. data/test_files/bioworks_small.params +63 -0
  160. data/test_files/bioworks_small.phobius +109 -0
  161. data/test_files/bioworks_small.toppred.out +2847 -0
  162. data/test_files/bioworks_small.xml +5610 -0
  163. data/test_files/bioworks_with_INV_small.xml +3753 -0
  164. data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
  165. data/test_files/corrupted_900.srf +0 -0
  166. data/test_files/head_of_7MIX.srf +0 -0
  167. data/test_files/interact-opd1_mods_small-prot.xml +304 -0
  168. data/test_files/messups.fasta +297 -0
  169. data/test_files/opd1/000.my_answer.100lines.xml +101 -0
  170. data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
  171. data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
  172. data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
  173. data/test_files/opd1/000_020-prot.png +0 -0
  174. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
  175. data/test_files/opd1/000_020_3prots-prot.xml +62 -0
  176. data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
  177. data/test_files/opd1/sequest.3.1.params +77 -0
  178. data/test_files/opd1/sequest.3.2.params +62 -0
  179. data/test_files/opd1/twenty_scans.mzXML +418 -0
  180. data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
  181. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  182. data/test_files/opd1/twenty_scans_answ.lmata +9 -0
  183. data/test_files/opd1_020_beginning.RAW +0 -0
  184. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
  185. data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
  186. data/test_files/pepproph_small.xml +4691 -0
  187. data/test_files/phobius.small.noheader.txt +50 -0
  188. data/test_files/phobius.small.small.txt +53 -0
  189. data/test_files/s01_anC1_ld020mM.key.txt +25 -0
  190. data/test_files/s01_anC1_ld020mM.meth +0 -0
  191. data/test_files/small.fasta +297 -0
  192. data/test_files/smallraw.RAW +0 -0
  193. data/test_files/tf_bioworks2excel.bioXML +14340 -0
  194. data/test_files/tf_bioworks2excel.txt.actual +1035 -0
  195. data/test_files/toppred.small.out +416 -0
  196. data/test_files/toppred.xml.out +318 -0
  197. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
  198. data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
  199. data/test_files/yeast_gly_small-prot.xml +265 -0
  200. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
  201. data/test_files/yeast_gly_small.xml +3807 -0
  202. data/test_files/yeast_gly_small2.parentTimes +6 -0
  203. metadata +273 -57
  204. data/bin/filter.rb +0 -6
  205. data/bin/precision.rb +0 -5
  206. data/lib/spec/mzdata/parser.rb +0 -108
  207. data/lib/spec/mzdata.rb +0 -48
  208. data/lib/spec/mzxml/parser.rb +0 -449
  209. data/lib/spec/scan.rb +0 -55
  210. data/lib/spec_id/filter.rb +0 -797
  211. data/lib/spec_id/precision.rb +0 -421
  212. data/lib/toppred.rb +0 -18
  213. data/script/filter-peps.rb +0 -164
  214. data/test/tc_aa_freqs.rb +0 -59
  215. data/test/tc_fasta_shaker.rb +0 -149
  216. data/test/tc_filter.rb +0 -203
  217. data/test/tc_filter_peps.rb +0 -46
  218. data/test/tc_gi.rb +0 -17
  219. data/test/tc_id_class_anal.rb +0 -70
  220. data/test/tc_id_precision.rb +0 -89
  221. data/test/tc_msrun.rb +0 -88
  222. data/test/tc_mzxml.rb +0 -88
  223. data/test/tc_mzxml_to_lmat.rb +0 -36
  224. data/test/tc_peptide_parent_times.rb +0 -27
  225. data/test/tc_precision.rb +0 -60
  226. data/test/tc_roc.rb +0 -166
  227. data/test/tc_sample_enzyme.rb +0 -32
  228. data/test/tc_scan.rb +0 -26
  229. data/test/tc_sequest.rb +0 -336
  230. data/test/tc_spec.rb +0 -78
  231. data/test/tc_spec_id.rb +0 -201
  232. data/test/tc_spec_id_xml.rb +0 -36
  233. data/test/tc_srf.rb +0 -262
data/lib/spec_id.rb CHANGED
@@ -1,75 +1,19 @@
1
1
  require 'ostruct'
2
2
  require 'set'
3
3
  require 'hash_by'
4
- require 'spec_id/precision'
5
4
  require 'roc'
6
5
  require 'sample_enzyme' # for others
7
6
  require 'spec_id/bioworks'
8
7
  require 'spec_id/sequest'
9
- require 'spec_id/proph'
8
+ require 'spec_id/proph/prot_summary'
10
9
  require 'spec_id_xml'
10
+ require 'spec_id/mass'
11
+ require 'fasta'
12
+
13
+ module ProteinReferenceable ; end
11
14
 
12
15
  class SampleEnzyme ; end
13
16
 
14
- class Mass
15
- # http://expasy.org/tools/findmod/findmod_masses.html
16
- # still need to add the modifications
17
- MONO = {
18
- :A => 71.03711,
19
- :R => 156.10111,
20
- :N => 114.04293,
21
- :D => 115.02694,
22
- :C => 103.00919,
23
- :E => 129.04259,
24
- :Q => 128.05858,
25
- :G => 57.02146,
26
- :H => 137.05891,
27
- :I => 113.08406,
28
- :L => 113.08406,
29
- :K => 128.09496,
30
- :M => 131.04049,
31
- :F => 147.06841,
32
- :P => 97.05276,
33
- :S => 87.03203,
34
- :T => 101.04768,
35
- :W => 186.07931,
36
- :Y => 163.06333,
37
- :V => 99.06841,
38
-
39
- :h => 1.00783,
40
- :h_plus => 1.00728,
41
- :o => 15.9949146,
42
- :h2o => 18.01056,
43
-
44
- }
45
- AVG = {
46
- :A => 71.0788,
47
- :R => 156.1875,
48
- :N => 114.1038,
49
- :D => 115.0886,
50
- :C => 103.1388,
51
- :E => 129.1155,
52
- :Q => 128.1307,
53
- :G => 57.0519,
54
- :H => 137.1411,
55
- :I => 113.1594,
56
- :L => 113.1594,
57
- :K => 128.1741,
58
- :M => 131.1926,
59
- :F => 147.1766,
60
- :P => 97.1167,
61
- :S => 87.0782,
62
- :T => 101.1051,
63
- :W => 186.2132,
64
- :Y => 163.1760,
65
- :V => 99.1326,
66
-
67
- :h => 1.00794,
68
- :h_plus => 1.00739,
69
- :o => 15.9994,
70
- :h2o => 18.01524,
71
- }
72
- end
73
17
 
74
18
  module SpecID ; end
75
19
 
@@ -91,8 +35,14 @@ module SpecID
91
35
  # Will return a SpecID object (really, the object corresponding to the
92
36
  # file type which mixes in SpecID [is_a?(SpecID) == true])
93
37
  # If no file is given, will return a GenericSpecID object.
38
+ # If file is an array, this is assumed to be a group of srf files which is
39
+ # converted into an SRFGroup Object and run.
94
40
  def self.new(file=nil, tp=nil)
95
- if file
41
+ # this will need to be specialized for other groups later
42
+ if file.is_a?(Array)
43
+ # takes an array of srf filenames
44
+ SRFGroup.new(file)
45
+ elsif file
96
46
  from_file(file, tp)
97
47
  else
98
48
  GenericSpecID.new
@@ -100,22 +50,27 @@ module SpecID
100
50
  end
101
51
 
102
52
  # tp = file_type
103
- # only takes an array if they are srf files!
53
+ # a single srf file will be packaged into an SRFGroup object
104
54
  def self.from_file(file, tp=nil)
105
55
  obj = nil
106
56
  unless tp
107
57
  tp = file_type(file)
108
58
  end
109
59
  obj = case tp
60
+ when 'srf'
61
+ #@hi_prob_best = false
62
+ SRFGroup.new([file])
110
63
  when 'srg'
111
- @hi_prob_best = false
64
+ #@hi_prob_best = false
112
65
  SRFGroup.new(file)
113
66
  when 'bioworks'
114
- @hi_prob_best = false
67
+ #@hi_prob_best = false
115
68
  Bioworks.new(file)
116
69
  when 'protproph'
117
- @hi_prob_best = true
70
+ #@hi_prob_best = true
118
71
  Proph::ProtSummary.new(file)
72
+ when 'pepproph'
73
+ Proph::PepSummary.new(file)
119
74
  else
120
75
  abort "UNRECOGNIZED file type for #{file}"
121
76
  end
@@ -123,9 +78,76 @@ module SpecID
123
78
  end
124
79
 
125
80
  def inspect
126
- "<#{self.class} #peps=\"#{peps.size}\">"
81
+ peps_string =
82
+ if peps
83
+ "peps(#)=#{peps.size}"
84
+ else
85
+ "peps=(nil)"
86
+ end
87
+ "<#{self.class} #{peps_string}>"
88
+ end
89
+
90
+ # given some list of SpecID::Pep based objects, returns the list of proteins
91
+ # associated with those peptides
92
+ # kind must be a symbol:
93
+ # :no_update (current proteins are returned, but their peps attribute
94
+ # is not updated)
95
+ # :update (current proteins returned with peps attribute updated)
96
+ # :new (new proteins are created complete with peps attribute)
97
+ def self.protein_list(pephits, kind=:no_update)
98
+
99
+ orig_pephits_prts = []
100
+ if kind == :new
101
+ new_prots = {}
102
+ pephits.each_with_index do |pep,i|
103
+ orig_pephits_prts[i] = pep.prots
104
+ peps_new_prts = pep.prots.map do |prt|
105
+ if new_prots.key? prt.reference
106
+ already_exists = new_prots[prt.reference]
107
+ else
108
+ np = prt.dup
109
+ np.peps = []
110
+ new_prots[np.reference] = np
111
+ np
112
+ end
113
+ end
114
+ pep.prots = peps_new_prts
115
+ end
116
+ end
117
+
118
+ if kind == :update
119
+ pephits.each do |pep|
120
+ pep.prots.each do |prt|
121
+ prt.peps = []
122
+ end
123
+ end
124
+ end
125
+
126
+ prot_set = {}
127
+ pephits.each do |pep|
128
+ prts = pep.prots
129
+ prts.each do |prt|
130
+ prot_set[ prt.reference ] = prt
131
+ end
132
+ if (kind == :update || kind == :new)
133
+ prts.each do |prt|
134
+ prt.peps << pep
135
+ end
136
+ end
137
+ end
138
+
139
+ ## Reset the original protein hits
140
+ if kind == :new
141
+ pephits.each_with_index do |pep,i|
142
+ pep.prots = orig_pephits_prts[i]
143
+ end
144
+ end
145
+
146
+ prot_set.values
127
147
  end
128
148
 
149
+
150
+
129
151
  # takes a comma separated list or array and extends the last to create an
130
152
  # array of desired size
131
153
  def self.extend_args(arg, desired_size)
@@ -193,13 +215,6 @@ module SpecID
193
215
  prot_triplets
194
216
  end
195
217
 
196
-
197
- ## basically, this is the command line wrapper
198
- def self.precision(argv)
199
- Prec.new.run_cmd_line(argv)
200
- end
201
-
202
-
203
218
  # returns number of true positives (array) and the specified output (as
204
219
  # parallel array). Requires the classification method and a sorted array of
205
220
  # tp values and an array fp values.
@@ -223,55 +238,100 @@ module SpecID
223
238
  pps
224
239
  end
225
240
 
226
- def classify_by_regex(items, regex, fp_on_match=true)
227
- case items
228
- when :prots
241
+ def self.prots?(ar)
242
+ ar.first.is_a? SpecID::Prot
243
+ end
244
+
245
+ def self.peps?(ar)
246
+ ar.first.is_a? SpecID::Pep
247
+ end
248
+
249
+ # for older stuff
250
+ def classify_by_regex(items, regex, decoy_on_match=true, ties=:both)
251
+ objects =
252
+ case items
253
+ when :prots
254
+ prots
255
+ when :peps
256
+ peps
257
+ end
258
+ SpecID.classify_by_prot(objects, regex, decoy_on_match, ties)
259
+ end
260
+
261
+ # includes the peptide hit in both
262
+ # returns (target, decoy)
263
+ # (for peps) ties can be :both, true (target wins), false (decoy wins)
264
+ # regardless of ties behavior, will partition out the proteins to be
265
+ # appropriate for the peptide
266
+ def self.classify_by_prot(items, regex, decoy_on_match=true, ties=:both)
267
+ if items.size == 0
268
+ return [[],[]]
269
+ elsif prots?(items)
229
270
  myproc = proc { |prt|
230
- if prt.reference =~ regex ; !fp_on_match
231
- else ; fp_on_match end
271
+ if prt.reference =~ regex ; !decoy_on_match
272
+ else ; decoy_on_match end
232
273
  }
233
274
  return classify(items, myproc)
234
- when :peps
275
+ elsif peps?(items)
235
276
  match = [] ; nomatch = []
236
- peps.each do |pep|
237
- match_prots = [] ; nomatch_prots = []
238
- (hit, nohit) = pep.prots.partition do |prot|
277
+ items.each do |pep|
278
+ (match_prots, nomatch_prots) = pep.prots.partition do |prot|
239
279
  prot.reference =~ regex
240
280
  end
241
- if hit.size == 0
281
+ if match_prots.size == 0
242
282
  nomatch << pep
243
- elsif nohit.size == 0
283
+ elsif nomatch_prots.size == 0
244
284
  match << pep
245
285
  else ## both have hits
246
286
  pep.prots = match_prots
247
287
  nomatch_pep = pep.dup
248
288
  nomatch_pep.prots = nomatch_prots
249
- match << pep
250
- nomatch << pep
289
+
290
+ # resolve ties
291
+ case ties
292
+ when true
293
+ if decoy_on_match
294
+ nomatch << pep
295
+ else
296
+ match << pep
297
+ end
298
+ when false
299
+ if decoy_on_match
300
+ match << pep
301
+ else
302
+ nomatch << pep
303
+ end
304
+ when :both
305
+ match << pep
306
+ nomatch << pep
307
+ else ; raise ArgumentError
308
+ end
251
309
  end
252
310
  end
253
- if fp_on_match
311
+ if decoy_on_match
254
312
  return [nomatch , match]
255
313
  else
256
314
  return [match, nomatch]
257
315
  end
258
316
  else
259
- abort "don't recognize "
317
+ raise ArgumentError, "arg1 is ar of objects descended from SpecID::Prot/Pep"
260
318
  end
261
319
  end
262
320
 
321
+
322
+
263
323
  # returns [tp, fp] based on the protein prefix for items where items =
264
324
  # (:prots|:peps)
265
325
  # this may result in a duplication of some peptides if they match both
266
326
  # normal and decoy proteins. In this case, the protein arrays are split,
267
327
  # too, so that each points only to its breed of protein.
268
- def classify_by_false_flag(items, flag, fp_on_match=true, prefix=false)
328
+ def classify_by_decoy_flag(items, flag, decoy_on_match=true, prefix=false)
269
329
  if prefix
270
330
  regex = /^#{Regexp.escape(flag)}/
271
331
  else
272
332
  regex = /#{Regexp.escape(flag)}/
273
333
  end
274
- classify_by_regex(items, regex, fp_on_match)
334
+ classify_by_regex(items, regex, decoy_on_match)
275
335
  end
276
336
 
277
337
  # Returns (match, nomatch)
@@ -303,7 +363,7 @@ module SpecID
303
363
  classify_item_by.call(item) ]
304
364
  end
305
365
  roc = ROC.new
306
- tp, fp = roc.prep_list(doublets)
366
+ tp, fp = roc.doublets_to_separate(doublets)
307
367
  return tp, fp
308
368
  end
309
369
 
@@ -393,11 +453,13 @@ module SpecID
393
453
  end
394
454
  File.open(file) do |fh|
395
455
  lines = ""
396
- 4.times { lines << fh.readline }
456
+ 8.times { lines << fh.readline }
397
457
  if lines =~ /<bioworksinfo>/
398
458
  return 'bioworks'
399
- elsif lines =~ /<protein_summary/ && lines =~ /xmlns="http:\/\/regis-web.systemsbiology.net\/protXML"/
459
+ elsif ((lines =~ /<protein_summary/) and ((lines =~ Proph::ProtSummary::Filetype_and_version_re_old) or (lines =~ Proph::ProtSummary::Filetype_and_version_re_new)))
400
460
  return 'protproph'
461
+ elsif lines =~ /<msms_pipeline_analysis.*<peptideprophet_summary/m
462
+ return 'pepproph'
401
463
  end
402
464
  end
403
465
  end
@@ -521,9 +583,10 @@ module SpecID
521
583
  end
522
584
  end
523
585
 
524
-
525
586
  # A Generic spectraID protein
526
587
  module SpecID::Prot
588
+ include ProteinReferenceable
589
+
527
590
  # probability is always a float!
528
591
  attr_accessor :probability, :reference, :peps
529
592
 
@@ -531,6 +594,14 @@ module SpecID::Prot
531
594
  self.reference <=> other.reference
532
595
  end
533
596
 
597
+ def inspect
598
+ pep_string =
599
+ if peps
600
+ ", @peps(#)=#{peps.size}"
601
+ end
602
+ "<#{self.class} @probability=#{probability}, @reference=#{reference}#{pep_string}>"
603
+ end
604
+
534
605
  end
535
606
 
536
607
  module SpecID::Pep
@@ -653,6 +724,23 @@ module SpecID::Pep
653
724
  when :mmu
654
725
  end
655
726
  end
727
+
728
+ # calls the method named by each argument and returns the results as an array
729
+ def values_at(*args)
730
+ args.map do |arg|
731
+ send(arg)
732
+ end
733
+ end
734
+
735
+ def inspect
736
+
737
+ prot_string =
738
+ if prots
739
+ ", @prots(#)=#{prots.size}"
740
+ end
741
+ "<#{self.class} @probability=#{probability}, @sequence=#{sequence}, @aaseq=#{aaseq}, @charge=#{charge}#{prot_string}>"
742
+ end
743
+
656
744
  end
657
745
 
658
746
  class SpecID::GenericProt
data/lib/spec_id_xml.rb CHANGED
@@ -6,7 +6,7 @@
6
6
  # concatenation into a file
7
7
  module SpecIDXML
8
8
 
9
- Special_chrs_hash = {
9
+ MSial_chrs_hash = {
10
10
  '"' => '&quot;',
11
11
  '&' => '&amp;',
12
12
  "'" => '&apos;',
@@ -17,8 +17,8 @@ module SpecIDXML
17
17
  # substitutes special xml chars
18
18
  def escape_special_chars(string)
19
19
  string.split('').map do |char|
20
- if Special_chrs_hash.key? char ; Special_chrs_hash[char]
21
- # if x = Special_chrs_hash[char] ; x # <-- that's slightly slower
20
+ if MSial_chrs_hash.key? char ; MSial_chrs_hash[char]
21
+ # if x = MSial_chrs_hash[char] ; x # <-- that's slightly slower
22
22
  else ; char end
23
23
  end.join
24
24
  end
@@ -33,13 +33,13 @@ module SpecIDXML
33
33
  end
34
34
 
35
35
 
36
- def param_xml(symbol)
37
- tabs + '<parameter name="' + "#{symbol}" + '" value="' + "#{send(symbol)}" + '"/>'
36
+ def param_xml(obj, symbol)
37
+ tabs + '<parameter name="' + "#{symbol}" + '" value="' + "#{obj.send(symbol)}" + '"/>'
38
38
  end
39
39
 
40
- def params_xml(*symbol_list)
40
+ def params_xml(obj, *symbol_list)
41
41
  symbol_list.collect { |sy|
42
- param_xml(sy)
42
+ param_xml(obj, sy)
43
43
  }.join("\n") + "\n"
44
44
  end
45
45
 
@@ -92,9 +92,7 @@ module SpecIDXML
92
92
  end
93
93
 
94
94
  def attrs_xml(list_of_symbols)
95
- list_of_symbols.collect {|sy|
96
- attr_xml(sy)
97
- }.join(" ")
95
+ list_of_symbols.collect {|sy| attr_xml(sy) }.join(" ")
98
96
  end
99
97
 
100
98
  end
@@ -0,0 +1,147 @@
1
+ require 'transmem'
2
+
3
+ class Phobius ; end
4
+
5
+ # This class will probably change its interface some in the future
6
+ # That's the web portal
7
+ # http://phobius.cgb.ki.se/
8
+ # How to run:
9
+ # Select output format as 'Short'
10
+ # then hit 'Submit Query'
11
+
12
+ # note: to implement some of the TransmemIndex features, the update_aaseq
13
+ # method must be called!
14
+ class Phobius::Index < Hash
15
+ include TransmemIndex
16
+
17
+ # will update_aaseq if given a fasta_obj
18
+ def initialize(file, fasta_obj = nil )
19
+ Phobius.default_index(file, self)
20
+ if fasta_obj
21
+ update_aaseq(fasta_obj)
22
+ end
23
+ end
24
+
25
+ # we need to match whatever function toppred uses to generate identifiers if
26
+ # we want derivative processes to be fast and accurate
27
+ def reference_to_key(reference)
28
+ if reference
29
+ if reference.size > 0
30
+ index = reference.index(' ')
31
+ string =
32
+ if index
33
+ reference[0...index]
34
+ else
35
+ reference
36
+ end
37
+ string.gsub('"','')
38
+ else
39
+ ''
40
+ end
41
+ else
42
+ nil
43
+ end
44
+ end
45
+
46
+ # adds an :aaseq key to each hash (necessary for avg_overlap method)
47
+ # these are shallow references to the aaseq in the fasta obj
48
+ def update_aaseq(fasta)
49
+ fasta.each do |prot|
50
+ self[reference_to_key(prot.reference)][:aaseq] = prot.aaseq
51
+ end
52
+ end
53
+
54
+ end
55
+
56
+ class Phobius
57
+ include TransmemIndex
58
+
59
+ # returns the default index
60
+ def self.default_index(file, index={})
61
+ parser = Phobius::Parser.new(:short)
62
+ parser.file_to_index(file, index)
63
+ end
64
+
65
+ end
66
+
67
+ module Phobius::Parser
68
+
69
+ def self.new(parser_type=:short)
70
+ klass =
71
+ case parser_type
72
+ when :short
73
+ Phobius::ParserShort
74
+ else
75
+ raise ArgumentError, "don't recognize parser type: #{parser_type}"
76
+ end
77
+ klass.new
78
+ end
79
+
80
+ def file_to_index(file, index={})
81
+ File.open(file) {|fh| to_index(fh, index) }
82
+ end
83
+
84
+ end
85
+
86
+
87
+ class Phobius::ParserShort
88
+ include Phobius::Parser
89
+
90
+ # takes a phobius prediction string (e.g., i12-31o37-56i63-84o96-116i123-143o149-169i)
91
+ # and returns an array of hashes with the keys :start and :stop
92
+ def prediction_to_array(string)
93
+ segments = []
94
+ string.scan(/[io](\d+)-(\d+)/) do |m1, m2|
95
+ segments << { :start => m1.to_i, :stop => m2.to_i }
96
+ end
97
+ segments
98
+ end
99
+
100
+ # returns a hash structure in this form: { identifier => {
101
+ # :num_certain_transmembrane_segments => Int, :signal_peptide => true|false|nil,
102
+ # :transmembrane_segments => [{:start => Int, :stop => Int}, ...]
103
+ # (only present when the segment count is > 0) } }
104
+ # can parse io even if there is no header to key in on.
105
+ def to_index(io, index={})
106
+ init_pos = io.pos
107
+ cnt = 0
108
+ found_header = false
109
+ loop do
110
+ if io.gets =~ /SEQENCE/
111
+ found_header = true
112
+ break
113
+ end
114
+ cnt += 1
115
+ break if cnt > 10
116
+ end
117
+ if !found_header
118
+ io.pos = init_pos
119
+ end
120
+ current_record = nil
121
+ io.each do |line|
122
+ line.chomp!
123
+ # grab values
124
+ ar = line.split(/\s+/)
125
+ next if ar.size != 4
126
+ (key, num_tms, signal_peptide, prediction) = ar
127
+ # cast the values
128
+ num_tms = num_tms.to_i
129
+ signal_peptide =
130
+ case signal_peptide
131
+ when 'Y'
132
+ true
133
+ when '0'
134
+ false
135
+ end
136
+ index[key] = {
137
+ :num_certain_transmembrane_segments => num_tms,
138
+ :signal_peptide => signal_peptide,
139
+ }
140
+ if num_tms > 0
141
+ index[key][:transmembrane_segments] = prediction_to_array(prediction)
142
+ end
143
+ end
144
+ index
145
+ end
146
+
147
+ end