mspire 0.2.4 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (233) hide show
  1. data/INSTALL +1 -0
  2. data/README +25 -0
  3. data/Rakefile +129 -40
  4. data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
  5. data/bin/bioworks_to_pepxml.rb +1 -0
  6. data/bin/fasta_shaker.rb +1 -96
  7. data/bin/filter_and_validate.rb +5 -0
  8. data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
  9. data/bin/prob_validate.rb +6 -0
  10. data/bin/raw_to_mzXML.rb +2 -2
  11. data/bin/srf_group.rb +1 -0
  12. data/bin/srf_to_sqt.rb +40 -0
  13. data/changelog.txt +68 -0
  14. data/lib/align/chams.rb +6 -6
  15. data/lib/align.rb +4 -3
  16. data/lib/bsearch.rb +120 -0
  17. data/lib/fasta.rb +318 -86
  18. data/lib/group_by.rb +10 -0
  19. data/lib/index_by.rb +11 -0
  20. data/lib/merge_deep.rb +21 -0
  21. data/lib/{spec → ms/converter}/mzxml.rb +77 -109
  22. data/lib/ms/gradient_program.rb +171 -0
  23. data/lib/ms/msrun.rb +209 -0
  24. data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
  25. data/lib/ms/parser/mzdata/axml.rb +12 -0
  26. data/lib/ms/parser/mzdata/dom.rb +160 -0
  27. data/lib/ms/parser/mzdata/libxml.rb +7 -0
  28. data/lib/ms/parser/mzdata.rb +25 -0
  29. data/lib/ms/parser/mzxml/axml.rb +11 -0
  30. data/lib/ms/parser/mzxml/dom.rb +159 -0
  31. data/lib/ms/parser/mzxml/hpricot.rb +253 -0
  32. data/lib/ms/parser/mzxml/libxml.rb +15 -0
  33. data/lib/ms/parser/mzxml/regexp.rb +122 -0
  34. data/lib/ms/parser/mzxml/rexml.rb +72 -0
  35. data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
  36. data/lib/ms/parser/mzxml.rb +175 -0
  37. data/lib/ms/parser.rb +108 -0
  38. data/lib/ms/precursor.rb +10 -0
  39. data/lib/ms/scan.rb +81 -0
  40. data/lib/ms/spectrum.rb +193 -0
  41. data/lib/ms.rb +10 -0
  42. data/lib/mspire.rb +4 -0
  43. data/lib/roc.rb +61 -1
  44. data/lib/sample_enzyme.rb +31 -8
  45. data/lib/scan_i.rb +21 -0
  46. data/lib/spec_id/aa_freqs.rb +7 -3
  47. data/lib/spec_id/bioworks.rb +20 -14
  48. data/lib/spec_id/digestor.rb +139 -0
  49. data/lib/spec_id/mass.rb +116 -0
  50. data/lib/spec_id/parser/proph.rb +236 -0
  51. data/lib/spec_id/precision/filter/cmdline.rb +209 -0
  52. data/lib/spec_id/precision/filter/interactive.rb +134 -0
  53. data/lib/spec_id/precision/filter/output.rb +147 -0
  54. data/lib/spec_id/precision/filter.rb +623 -0
  55. data/lib/spec_id/precision/output.rb +60 -0
  56. data/lib/spec_id/precision/prob/cmdline.rb +139 -0
  57. data/lib/spec_id/precision/prob/output.rb +88 -0
  58. data/lib/spec_id/precision/prob.rb +171 -0
  59. data/lib/spec_id/proph/pep_summary.rb +92 -0
  60. data/lib/spec_id/proph/prot_summary.rb +484 -0
  61. data/lib/spec_id/proph.rb +2 -466
  62. data/lib/spec_id/protein_summary.rb +2 -2
  63. data/lib/spec_id/sequest/params.rb +316 -0
  64. data/lib/spec_id/sequest/pepxml.rb +1513 -0
  65. data/lib/spec_id/sequest.rb +2 -1672
  66. data/lib/spec_id/srf.rb +445 -177
  67. data/lib/spec_id.rb +183 -95
  68. data/lib/spec_id_xml.rb +8 -10
  69. data/lib/transmem/phobius.rb +147 -0
  70. data/lib/transmem/toppred.rb +368 -0
  71. data/lib/transmem.rb +157 -0
  72. data/lib/validator/aa.rb +135 -0
  73. data/lib/validator/background.rb +73 -0
  74. data/lib/validator/bias.rb +95 -0
  75. data/lib/validator/cmdline.rb +260 -0
  76. data/lib/validator/decoy.rb +94 -0
  77. data/lib/validator/digestion_based.rb +69 -0
  78. data/lib/validator/probability.rb +48 -0
  79. data/lib/validator/prot_from_pep.rb +234 -0
  80. data/lib/validator/transmem.rb +272 -0
  81. data/lib/validator/true_pos.rb +46 -0
  82. data/lib/validator.rb +214 -0
  83. data/lib/xml.rb +38 -0
  84. data/lib/xml_style_parser.rb +105 -0
  85. data/lib/xmlparser_wrapper.rb +19 -0
  86. data/script/compile_and_plot_smriti_final.rb +97 -0
  87. data/script/extract_gradient_programs.rb +56 -0
  88. data/script/get_apex_values_rexml.rb +44 -0
  89. data/script/mzXML2timeIndex.rb +1 -1
  90. data/script/smriti_final_analysis.rb +103 -0
  91. data/script/toppred_to_yaml.rb +47 -0
  92. data/script/tpp_installer.rb +1 -1
  93. data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
  94. data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
  95. data/specs/bin/fasta_shaker_spec.rb +259 -0
  96. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
  97. data/specs/bin/filter_and_validate_spec.rb +124 -0
  98. data/specs/bin/ms_to_lmat_spec.rb +34 -0
  99. data/specs/bin/prob_validate_spec.rb +62 -0
  100. data/specs/bin/protein_summary_spec.rb +10 -0
  101. data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
  102. data/specs/gi_spec.rb +22 -0
  103. data/specs/load_bin_path.rb +7 -0
  104. data/specs/merge_deep_spec.rb +13 -0
  105. data/specs/ms/gradient_program_spec.rb +77 -0
  106. data/specs/ms/msrun_spec.rb +455 -0
  107. data/specs/ms/parser_spec.rb +92 -0
  108. data/specs/ms/spectrum_spec.rb +89 -0
  109. data/specs/roc_spec.rb +251 -0
  110. data/specs/rspec_autotest.rb +149 -0
  111. data/specs/sample_enzyme_spec.rb +41 -0
  112. data/specs/spec_helper.rb +133 -0
  113. data/specs/spec_id/aa_freqs_spec.rb +52 -0
  114. data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
  115. data/specs/spec_id/digestor_spec.rb +75 -0
  116. data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
  117. data/specs/spec_id/precision/filter/output_spec.rb +31 -0
  118. data/specs/spec_id/precision/filter_spec.rb +243 -0
  119. data/specs/spec_id/precision/prob_spec.rb +111 -0
  120. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  121. data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
  122. data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
  123. data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
  124. data/specs/spec_id/sequest/params_spec.rb +68 -0
  125. data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
  126. data/specs/spec_id/sqt_spec.rb +138 -0
  127. data/specs/spec_id/srf_spec.rb +209 -0
  128. data/specs/spec_id/srf_spec_helper.rb +302 -0
  129. data/specs/spec_id_helper.rb +33 -0
  130. data/specs/spec_id_spec.rb +361 -0
  131. data/specs/spec_id_xml_spec.rb +33 -0
  132. data/specs/transmem/phobius_spec.rb +423 -0
  133. data/specs/transmem/toppred_spec.rb +297 -0
  134. data/specs/transmem_spec.rb +60 -0
  135. data/specs/transmem_spec_shared.rb +64 -0
  136. data/specs/validator/aa_spec.rb +107 -0
  137. data/specs/validator/background_spec.rb +51 -0
  138. data/specs/validator/bias_spec.rb +146 -0
  139. data/specs/validator/decoy_spec.rb +51 -0
  140. data/specs/validator/fasta_helper.rb +26 -0
  141. data/specs/validator/prot_from_pep_spec.rb +141 -0
  142. data/specs/validator/transmem_spec.rb +145 -0
  143. data/specs/validator/true_pos_spec.rb +58 -0
  144. data/specs/validator_helper.rb +33 -0
  145. data/specs/xml_spec.rb +12 -0
  146. data/test_files/000_pepxml18_small.xml +206 -0
  147. data/test_files/020a.mzXML.timeIndex +4710 -0
  148. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
  149. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
  150. data/test_files/4-03-03_small-prot.xml +321 -0
  151. data/test_files/4-03-03_small.xml +3876 -0
  152. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  153. data/test_files/bioworks-3.3_10prots.xml +5999 -0
  154. data/test_files/bioworks31.params +77 -0
  155. data/test_files/bioworks32.params +62 -0
  156. data/test_files/bioworks33.params +63 -0
  157. data/test_files/bioworks_single_run_small.xml +7237 -0
  158. data/test_files/bioworks_small.fasta +212 -0
  159. data/test_files/bioworks_small.params +63 -0
  160. data/test_files/bioworks_small.phobius +109 -0
  161. data/test_files/bioworks_small.toppred.out +2847 -0
  162. data/test_files/bioworks_small.xml +5610 -0
  163. data/test_files/bioworks_with_INV_small.xml +3753 -0
  164. data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
  165. data/test_files/corrupted_900.srf +0 -0
  166. data/test_files/head_of_7MIX.srf +0 -0
  167. data/test_files/interact-opd1_mods_small-prot.xml +304 -0
  168. data/test_files/messups.fasta +297 -0
  169. data/test_files/opd1/000.my_answer.100lines.xml +101 -0
  170. data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
  171. data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
  172. data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
  173. data/test_files/opd1/000_020-prot.png +0 -0
  174. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
  175. data/test_files/opd1/000_020_3prots-prot.xml +62 -0
  176. data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
  177. data/test_files/opd1/sequest.3.1.params +77 -0
  178. data/test_files/opd1/sequest.3.2.params +62 -0
  179. data/test_files/opd1/twenty_scans.mzXML +418 -0
  180. data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
  181. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  182. data/test_files/opd1/twenty_scans_answ.lmata +9 -0
  183. data/test_files/opd1_020_beginning.RAW +0 -0
  184. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
  185. data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
  186. data/test_files/pepproph_small.xml +4691 -0
  187. data/test_files/phobius.small.noheader.txt +50 -0
  188. data/test_files/phobius.small.small.txt +53 -0
  189. data/test_files/s01_anC1_ld020mM.key.txt +25 -0
  190. data/test_files/s01_anC1_ld020mM.meth +0 -0
  191. data/test_files/small.fasta +297 -0
  192. data/test_files/smallraw.RAW +0 -0
  193. data/test_files/tf_bioworks2excel.bioXML +14340 -0
  194. data/test_files/tf_bioworks2excel.txt.actual +1035 -0
  195. data/test_files/toppred.small.out +416 -0
  196. data/test_files/toppred.xml.out +318 -0
  197. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
  198. data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
  199. data/test_files/yeast_gly_small-prot.xml +265 -0
  200. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
  201. data/test_files/yeast_gly_small.xml +3807 -0
  202. data/test_files/yeast_gly_small2.parentTimes +6 -0
  203. metadata +273 -57
  204. data/bin/filter.rb +0 -6
  205. data/bin/precision.rb +0 -5
  206. data/lib/spec/mzdata/parser.rb +0 -108
  207. data/lib/spec/mzdata.rb +0 -48
  208. data/lib/spec/mzxml/parser.rb +0 -449
  209. data/lib/spec/scan.rb +0 -55
  210. data/lib/spec_id/filter.rb +0 -797
  211. data/lib/spec_id/precision.rb +0 -421
  212. data/lib/toppred.rb +0 -18
  213. data/script/filter-peps.rb +0 -164
  214. data/test/tc_aa_freqs.rb +0 -59
  215. data/test/tc_fasta_shaker.rb +0 -149
  216. data/test/tc_filter.rb +0 -203
  217. data/test/tc_filter_peps.rb +0 -46
  218. data/test/tc_gi.rb +0 -17
  219. data/test/tc_id_class_anal.rb +0 -70
  220. data/test/tc_id_precision.rb +0 -89
  221. data/test/tc_msrun.rb +0 -88
  222. data/test/tc_mzxml.rb +0 -88
  223. data/test/tc_mzxml_to_lmat.rb +0 -36
  224. data/test/tc_peptide_parent_times.rb +0 -27
  225. data/test/tc_precision.rb +0 -60
  226. data/test/tc_roc.rb +0 -166
  227. data/test/tc_sample_enzyme.rb +0 -32
  228. data/test/tc_scan.rb +0 -26
  229. data/test/tc_sequest.rb +0 -336
  230. data/test/tc_spec.rb +0 -78
  231. data/test/tc_spec_id.rb +0 -201
  232. data/test/tc_spec_id_xml.rb +0 -36
  233. data/test/tc_srf.rb +0 -262
data/lib/spec_id.rb CHANGED
@@ -1,75 +1,19 @@
1
1
  require 'ostruct'
2
2
  require 'set'
3
3
  require 'hash_by'
4
- require 'spec_id/precision'
5
4
  require 'roc'
6
5
  require 'sample_enzyme' # for others
7
6
  require 'spec_id/bioworks'
8
7
  require 'spec_id/sequest'
9
- require 'spec_id/proph'
8
+ require 'spec_id/proph/prot_summary'
10
9
  require 'spec_id_xml'
10
+ require 'spec_id/mass'
11
+ require 'fasta'
12
+
13
+ module ProteinReferenceable ; end
11
14
 
12
15
  class SampleEnzyme ; end
13
16
 
14
- class Mass
15
- # http://expasy.org/tools/findmod/findmod_masses.html
16
- # still need to add the modifications
17
- MONO = {
18
- :A => 71.03711,
19
- :R => 156.10111,
20
- :N => 114.04293,
21
- :D => 115.02694,
22
- :C => 103.00919,
23
- :E => 129.04259,
24
- :Q => 128.05858,
25
- :G => 57.02146,
26
- :H => 137.05891,
27
- :I => 113.08406,
28
- :L => 113.08406,
29
- :K => 128.09496,
30
- :M => 131.04049,
31
- :F => 147.06841,
32
- :P => 97.05276,
33
- :S => 87.03203,
34
- :T => 101.04768,
35
- :W => 186.07931,
36
- :Y => 163.06333,
37
- :V => 99.06841,
38
-
39
- :h => 1.00783,
40
- :h_plus => 1.00728,
41
- :o => 15.9949146,
42
- :h2o => 18.01056,
43
-
44
- }
45
- AVG = {
46
- :A => 71.0788,
47
- :R => 156.1875,
48
- :N => 114.1038,
49
- :D => 115.0886,
50
- :C => 103.1388,
51
- :E => 129.1155,
52
- :Q => 128.1307,
53
- :G => 57.0519,
54
- :H => 137.1411,
55
- :I => 113.1594,
56
- :L => 113.1594,
57
- :K => 128.1741,
58
- :M => 131.1926,
59
- :F => 147.1766,
60
- :P => 97.1167,
61
- :S => 87.0782,
62
- :T => 101.1051,
63
- :W => 186.2132,
64
- :Y => 163.1760,
65
- :V => 99.1326,
66
-
67
- :h => 1.00794,
68
- :h_plus => 1.00739,
69
- :o => 15.9994,
70
- :h2o => 18.01524,
71
- }
72
- end
73
17
 
74
18
  module SpecID ; end
75
19
 
@@ -91,8 +35,14 @@ module SpecID
91
35
  # Will return a SpecID object (really, the object corresponding to the
92
36
  # file type which mixes in SpecID [is_a?(SpecID) == true])
93
37
  # If no file is given, will return a GenericSpecID object.
38
+ # If file is an array, this is assumed to be a group of srf files which is
39
+ # converted into an SRFGroup Object and run.
94
40
  def self.new(file=nil, tp=nil)
95
- if file
41
+ # this will need to be specialized for other groups later
42
+ if file.is_a?(Array)
43
+ # takes an array of srf filenames
44
+ SRFGroup.new(file)
45
+ elsif file
96
46
  from_file(file, tp)
97
47
  else
98
48
  GenericSpecID.new
@@ -100,22 +50,27 @@ module SpecID
100
50
  end
101
51
 
102
52
  # tp = file_type
103
- # only takes an array if they are srf files!
53
+ # a single srf file will be packaged into an SRFGroup object
104
54
  def self.from_file(file, tp=nil)
105
55
  obj = nil
106
56
  unless tp
107
57
  tp = file_type(file)
108
58
  end
109
59
  obj = case tp
60
+ when 'srf'
61
+ #@hi_prob_best = false
62
+ SRFGroup.new([file])
110
63
  when 'srg'
111
- @hi_prob_best = false
64
+ #@hi_prob_best = false
112
65
  SRFGroup.new(file)
113
66
  when 'bioworks'
114
- @hi_prob_best = false
67
+ #@hi_prob_best = false
115
68
  Bioworks.new(file)
116
69
  when 'protproph'
117
- @hi_prob_best = true
70
+ #@hi_prob_best = true
118
71
  Proph::ProtSummary.new(file)
72
+ when 'pepproph'
73
+ Proph::PepSummary.new(file)
119
74
  else
120
75
  abort "UNRECOGNIZED file type for #{file}"
121
76
  end
@@ -123,9 +78,76 @@ module SpecID
123
78
  end
124
79
 
125
80
  def inspect
126
- "<#{self.class} #peps=\"#{peps.size}\">"
81
+ peps_string =
82
+ if peps
83
+ "peps(#)=#{peps.size}"
84
+ else
85
+ "peps=(nil)"
86
+ end
87
+ "<#{self.class} #{peps_string}>"
88
+ end
89
+
90
+ # given some list of SpecID::Pep based objects, returns the list of proteins
91
+ # associated with those peptides
92
+ # kind must be a symbol:
93
+ # :no_update (current proteins are returned, but their peps attribute
94
+ # is not updated)
95
+ # :update (current proteins returned with peps attribute updated)
96
+ # :new (new proteins are created complete with peps attribute)
97
+ def self.protein_list(pephits, kind=:no_update)
98
+
99
+ orig_pephits_prts = []
100
+ if kind == :new
101
+ new_prots = {}
102
+ pephits.each_with_index do |pep,i|
103
+ orig_pephits_prts[i] = pep.prots
104
+ peps_new_prts = pep.prots.map do |prt|
105
+ if new_prots.key? prt.reference
106
+ already_exists = new_prots[prt.reference]
107
+ else
108
+ np = prt.dup
109
+ np.peps = []
110
+ new_prots[np.reference] = np
111
+ np
112
+ end
113
+ end
114
+ pep.prots = peps_new_prts
115
+ end
116
+ end
117
+
118
+ if kind == :update
119
+ pephits.each do |pep|
120
+ pep.prots.each do |prt|
121
+ prt.peps = []
122
+ end
123
+ end
124
+ end
125
+
126
+ prot_set = {}
127
+ pephits.each do |pep|
128
+ prts = pep.prots
129
+ prts.each do |prt|
130
+ prot_set[ prt.reference ] = prt
131
+ end
132
+ if (kind == :update || kind == :new)
133
+ prts.each do |prt|
134
+ prt.peps << pep
135
+ end
136
+ end
137
+ end
138
+
139
+ ## Reset the original protein hits
140
+ if kind == :new
141
+ pephits.each_with_index do |pep,i|
142
+ pep.prots = orig_pephits_prts[i]
143
+ end
144
+ end
145
+
146
+ prot_set.values
127
147
  end
128
148
 
149
+
150
+
129
151
  # takes a comma separated list or array and extends the last to create an
130
152
  # array of desired size
131
153
  def self.extend_args(arg, desired_size)
@@ -193,13 +215,6 @@ module SpecID
193
215
  prot_triplets
194
216
  end
195
217
 
196
-
197
- ## basically, this is the command line wrapper
198
- def self.precision(argv)
199
- Prec.new.run_cmd_line(argv)
200
- end
201
-
202
-
203
218
  # returns number of true positives (array) and the specified output (as
204
219
  # parallel array). Requires the classification method and a sorted array of
205
220
  # tp values and an array fp values.
@@ -223,55 +238,100 @@ module SpecID
223
238
  pps
224
239
  end
225
240
 
226
- def classify_by_regex(items, regex, fp_on_match=true)
227
- case items
228
- when :prots
241
+ def self.prots?(ar)
242
+ ar.first.is_a? SpecID::Prot
243
+ end
244
+
245
+ def self.peps?(ar)
246
+ ar.first.is_a? SpecID::Pep
247
+ end
248
+
249
+ # for older stuff
250
+ def classify_by_regex(items, regex, decoy_on_match=true, ties=:both)
251
+ objects =
252
+ case items
253
+ when :prots
254
+ prots
255
+ when :peps
256
+ peps
257
+ end
258
+ SpecID.classify_by_prot(objects, regex, decoy_on_match, ties)
259
+ end
260
+
261
+ # includes the peptide hit in both
262
+ # returns (target, decoy)
263
+ # (for peps) ties can be :both, true (target wins), false (decoy wins)
264
+ # regardless of ties behavior, will partition out the proteins to be
265
+ # appropriate for the peptide
266
+ def self.classify_by_prot(items, regex, decoy_on_match=true, ties=:both)
267
+ if items.size == 0
268
+ return [[],[]]
269
+ elsif prots?(items)
229
270
  myproc = proc { |prt|
230
- if prt.reference =~ regex ; !fp_on_match
231
- else ; fp_on_match end
271
+ if prt.reference =~ regex ; !decoy_on_match
272
+ else ; decoy_on_match end
232
273
  }
233
274
  return classify(items, myproc)
234
- when :peps
275
+ elsif peps?(items)
235
276
  match = [] ; nomatch = []
236
- peps.each do |pep|
237
- match_prots = [] ; nomatch_prots = []
238
- (hit, nohit) = pep.prots.partition do |prot|
277
+ items.each do |pep|
278
+ (match_prots, nomatch_prots) = pep.prots.partition do |prot|
239
279
  prot.reference =~ regex
240
280
  end
241
- if hit.size == 0
281
+ if match_prots.size == 0
242
282
  nomatch << pep
243
- elsif nohit.size == 0
283
+ elsif nomatch_prots.size == 0
244
284
  match << pep
245
285
  else ## both have hits
246
286
  pep.prots = match_prots
247
287
  nomatch_pep = pep.dup
248
288
  nomatch_pep.prots = nomatch_prots
249
- match << pep
250
- nomatch << pep
289
+
290
+ # resolve ties
291
+ case ties
292
+ when true
293
+ if decoy_on_match
294
+ nomatch << pep
295
+ else
296
+ match << pep
297
+ end
298
+ when false
299
+ if decoy_on_match
300
+ match << pep
301
+ else
302
+ nomatch << pep
303
+ end
304
+ when :both
305
+ match << pep
306
+ nomatch << pep
307
+ else ; raise ArgumentError
308
+ end
251
309
  end
252
310
  end
253
- if fp_on_match
311
+ if decoy_on_match
254
312
  return [nomatch , match]
255
313
  else
256
314
  return [match, nomatch]
257
315
  end
258
316
  else
259
- abort "don't recognize "
317
+ raise ArgumentError, "arg1 is ar of objects descended from SpecID::Prot/Pep"
260
318
  end
261
319
  end
262
320
 
321
+
322
+
263
323
  # returns [tp, fp] based on the protein prefix for items where items =
264
324
  # (:prots|:peps)
265
325
  # this may result in a duplication of some peptides if they match both
266
326
  # normal and decoy proteins. In this case, the protein arrays are split,
267
327
  # too, so that each points only to its breed of protein.
268
- def classify_by_false_flag(items, flag, fp_on_match=true, prefix=false)
328
+ def classify_by_decoy_flag(items, flag, decoy_on_match=true, prefix=false)
269
329
  if prefix
270
330
  regex = /^#{Regexp.escape(flag)}/
271
331
  else
272
332
  regex = /#{Regexp.escape(flag)}/
273
333
  end
274
- classify_by_regex(items, regex, fp_on_match)
334
+ classify_by_regex(items, regex, decoy_on_match)
275
335
  end
276
336
 
277
337
  # Returns (match, nomatch)
@@ -303,7 +363,7 @@ module SpecID
303
363
  classify_item_by.call(item) ]
304
364
  end
305
365
  roc = ROC.new
306
- tp, fp = roc.prep_list(doublets)
366
+ tp, fp = roc.doublets_to_separate(doublets)
307
367
  return tp, fp
308
368
  end
309
369
 
@@ -393,11 +453,13 @@ module SpecID
393
453
  end
394
454
  File.open(file) do |fh|
395
455
  lines = ""
396
- 4.times { lines << fh.readline }
456
+ 8.times { lines << fh.readline }
397
457
  if lines =~ /<bioworksinfo>/
398
458
  return 'bioworks'
399
- elsif lines =~ /<protein_summary/ && lines =~ /xmlns="http:\/\/regis-web.systemsbiology.net\/protXML"/
459
+ elsif ((lines =~ /<protein_summary/) and ((lines =~ Proph::ProtSummary::Filetype_and_version_re_old) or (lines =~ Proph::ProtSummary::Filetype_and_version_re_new)))
400
460
  return 'protproph'
461
+ elsif lines =~ /<msms_pipeline_analysis.*<peptideprophet_summary/m
462
+ return 'pepproph'
401
463
  end
402
464
  end
403
465
  end
@@ -521,9 +583,10 @@ module SpecID
521
583
  end
522
584
  end
523
585
 
524
-
525
586
  # A Generic spectraID protein
526
587
  module SpecID::Prot
588
+ include ProteinReferenceable
589
+
527
590
  # probability is always a float!
528
591
  attr_accessor :probability, :reference, :peps
529
592
 
@@ -531,6 +594,14 @@ module SpecID::Prot
531
594
  self.reference <=> other.reference
532
595
  end
533
596
 
597
+ def inspect
598
+ pep_string =
599
+ if peps
600
+ ", @peps(#)=#{peps.size}"
601
+ end
602
+ "<#{self.class} @probability=#{probability}, @reference=#{reference}#{pep_string}>"
603
+ end
604
+
534
605
  end
535
606
 
536
607
  module SpecID::Pep
@@ -653,6 +724,23 @@ module SpecID::Pep
653
724
  when :mmu
654
725
  end
655
726
  end
727
+
728
+ # calls the method associated with each key and returns the value
729
+ def values_at(*args)
730
+ args.map do |arg|
731
+ send(arg)
732
+ end
733
+ end
734
+
735
+ def inspect
736
+
737
+ prot_string =
738
+ if prots
739
+ ", @prots(#)=#{prots.size}"
740
+ end
741
+ "<#{self.class} @probability=#{probability}, @sequence=#{sequence}, @aaseq=#{aaseq}, @charge=#{charge}#{prot_string}>"
742
+ end
743
+
656
744
  end
657
745
 
658
746
  class SpecID::GenericProt
data/lib/spec_id_xml.rb CHANGED
@@ -6,7 +6,7 @@
6
6
  # concatenation into a file
7
7
  module SpecIDXML
8
8
 
9
- Special_chrs_hash = {
9
+ MSial_chrs_hash = {
10
10
  '"' => '&quot;',
11
11
  '&' => '&amp;',
12
12
  "'" => '&apos;',
@@ -17,8 +17,8 @@ module SpecIDXML
17
17
  # substitutes special xml chars
18
18
  def escape_special_chars(string)
19
19
  string.split('').map do |char|
20
- if Special_chrs_hash.key? char ; Special_chrs_hash[char]
21
- # if x = Special_chrs_hash[char] ; x # <-- that's slightly slower
20
+ if MSial_chrs_hash.key? char ; MSial_chrs_hash[char]
21
+ # if x = MSial_chrs_hash[char] ; x # <-- that's slightly slower
22
22
  else ; char end
23
23
  end.join
24
24
  end
@@ -33,13 +33,13 @@ module SpecIDXML
33
33
  end
34
34
 
35
35
 
36
- def param_xml(symbol)
37
- tabs + '<parameter name="' + "#{symbol}" + '" value="' + "#{send(symbol)}" + '"/>'
36
+ def param_xml(obj, symbol)
37
+ tabs + '<parameter name="' + "#{symbol}" + '" value="' + "#{obj.send(symbol)}" + '"/>'
38
38
  end
39
39
 
40
- def params_xml(*symbol_list)
40
+ def params_xml(obj, *symbol_list)
41
41
  symbol_list.collect { |sy|
42
- param_xml(sy)
42
+ param_xml(obj, sy)
43
43
  }.join("\n") + "\n"
44
44
  end
45
45
 
@@ -92,9 +92,7 @@ module SpecIDXML
92
92
  end
93
93
 
94
94
  def attrs_xml(list_of_symbols)
95
- list_of_symbols.collect {|sy|
96
- attr_xml(sy)
97
- }.join(" ")
95
+ list_of_symbols.collect {|sy| attr_xml(sy) }.join(" ")
98
96
  end
99
97
 
100
98
  end
@@ -0,0 +1,147 @@
1
+ require 'transmem'
2
+
3
+ class Phobius ; end
4
+
5
+ # This class will probably change its interface some in the future
6
+ # That's the web portal
7
+ # http://phobius.cgb.ki.se/
8
+ # How to run:
9
+ # Select output format as 'Short'
10
+ # then hit 'Submit Query'
11
+
12
+ # note: to implement some of the TransmemIndex features, the update_aaseq
13
+ # method must be called!
14
+ class Phobius::Index < Hash
15
+ include TransmemIndex
16
+
17
+ # will update_aaseq if given a fasta_obj
18
+ def initialize(file, fasta_obj = nil )
19
+ Phobius.default_index(file, self)
20
+ if fasta_obj
21
+ update_aaseq(fasta_obj)
22
+ end
23
+ end
24
+
25
+ # we need to match whatever function toppred uses to generate identifiers if
26
+ # we want derivative processes to be fast and accurate
27
+ def reference_to_key(reference)
28
+ if reference
29
+ if reference.size > 0
30
+ index = reference.index(' ')
31
+ string =
32
+ if index
33
+ reference[0...index]
34
+ else
35
+ reference
36
+ end
37
+ string.gsub('"','')
38
+ else
39
+ ''
40
+ end
41
+ else
42
+ nil
43
+ end
44
+ end
45
+
46
+ # adds an :aaseq key to each hash (necessary for avg_overlap method)
47
+ # these are shallow references to the aaseq in the fasta obj
48
+ def update_aaseq(fasta)
49
+ fasta.each do |prot|
50
+ self[reference_to_key(prot.reference)][:aaseq] = prot.aaseq
51
+ end
52
+ end
53
+
54
+ end
55
+
56
+ class Phobius
57
+ include TransmemIndex
58
+
59
+ # returns the default index
60
+ def self.default_index(file, index={})
61
+ parser = Phobius::Parser.new(:short)
62
+ parser.file_to_index(file, index)
63
+ end
64
+
65
+ end
66
+
67
+ module Phobius::Parser
68
+
69
+ def self.new(parser_type=:short)
70
+ klass =
71
+ case parser_type
72
+ when :short
73
+ Phobius::ParserShort
74
+ else
75
+ raise ArgumentError, "don't recognize parser type: #{parser_type}"
76
+ end
77
+ klass.new
78
+ end
79
+
80
+ def file_to_index(file, index={})
81
+ File.open(file) {|fh| to_index(fh, index) }
82
+ end
83
+
84
+ end
85
+
86
+
87
+ class Phobius::ParserShort
88
+ include Phobius::Parser
89
+
90
+ # takes a phobius prediction string (e.g., i12-31o37-56i63-84o96-116i123-143o149-169i)
91
+ # and returns an array of hashes with the keys :start and :stop
92
+ def prediction_to_array(string)
93
+ segments = []
94
+ string.scan(/[io](\d+)-(\d+)/) do |m1, m2|
95
+ segments << { :start => m1.to_i, :stop => m2.to_i }
96
+ end
97
+ segments
98
+ end
99
+
100
+ # returns a hash structure in this form: { identifier => {
101
+ # :num_certain_transmembrane_segments => Int, :signal_peptide => true|false|nil,
102
+ # :transmembrane_segments => [{:start => Int, :stop
103
+ # => Int}, ...] } } (:transmembrane_segments only when Int > 0)
104
+ # can parse io even if there is no header to key in on.
105
+ def to_index(io, index={})
106
+ init_pos = io.pos
107
+ cnt = 0
108
+ found_header = false
109
+ loop do
110
+ if io.gets =~ /SEQENCE/
111
+ found_header = true
112
+ break
113
+ end
114
+ cnt += 1
115
+ break if cnt > 10
116
+ end
117
+ if !found_header
118
+ io.pos = init_pos
119
+ end
120
+ current_record = nil
121
+ io.each do |line|
122
+ line.chomp!
123
+ # grab values
124
+ ar = line.split(/\s+/)
125
+ next if ar.size != 4
126
+ (key, num_tms, signal_peptide, prediction) = ar
127
+ # cast the values
128
+ num_tms = num_tms.to_i
129
+ signal_peptide =
130
+ case signal_peptide
131
+ when 'Y'
132
+ true
133
+ when '0'
134
+ false
135
+ end
136
+ index[key] = {
137
+ :num_certain_transmembrane_segments => num_tms,
138
+ :signal_peptide => signal_peptide,
139
+ }
140
+ if num_tms > 0
141
+ index[key][:transmembrane_segments] = prediction_to_array(prediction)
142
+ end
143
+ end
144
+ index
145
+ end
146
+
147
+ end