mspire 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233)
  1. data/INSTALL +1 -0
  2. data/README +25 -0
  3. data/Rakefile +129 -40
  4. data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
  5. data/bin/bioworks_to_pepxml.rb +1 -0
  6. data/bin/fasta_shaker.rb +1 -96
  7. data/bin/filter_and_validate.rb +5 -0
  8. data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
  9. data/bin/prob_validate.rb +6 -0
  10. data/bin/raw_to_mzXML.rb +2 -2
  11. data/bin/srf_group.rb +1 -0
  12. data/bin/srf_to_sqt.rb +40 -0
  13. data/changelog.txt +68 -0
  14. data/lib/align/chams.rb +6 -6
  15. data/lib/align.rb +4 -3
  16. data/lib/bsearch.rb +120 -0
  17. data/lib/fasta.rb +318 -86
  18. data/lib/group_by.rb +10 -0
  19. data/lib/index_by.rb +11 -0
  20. data/lib/merge_deep.rb +21 -0
  21. data/lib/{spec → ms/converter}/mzxml.rb +77 -109
  22. data/lib/ms/gradient_program.rb +171 -0
  23. data/lib/ms/msrun.rb +209 -0
  24. data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
  25. data/lib/ms/parser/mzdata/axml.rb +12 -0
  26. data/lib/ms/parser/mzdata/dom.rb +160 -0
  27. data/lib/ms/parser/mzdata/libxml.rb +7 -0
  28. data/lib/ms/parser/mzdata.rb +25 -0
  29. data/lib/ms/parser/mzxml/axml.rb +11 -0
  30. data/lib/ms/parser/mzxml/dom.rb +159 -0
  31. data/lib/ms/parser/mzxml/hpricot.rb +253 -0
  32. data/lib/ms/parser/mzxml/libxml.rb +15 -0
  33. data/lib/ms/parser/mzxml/regexp.rb +122 -0
  34. data/lib/ms/parser/mzxml/rexml.rb +72 -0
  35. data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
  36. data/lib/ms/parser/mzxml.rb +175 -0
  37. data/lib/ms/parser.rb +108 -0
  38. data/lib/ms/precursor.rb +10 -0
  39. data/lib/ms/scan.rb +81 -0
  40. data/lib/ms/spectrum.rb +193 -0
  41. data/lib/ms.rb +10 -0
  42. data/lib/mspire.rb +4 -0
  43. data/lib/roc.rb +61 -1
  44. data/lib/sample_enzyme.rb +31 -8
  45. data/lib/scan_i.rb +21 -0
  46. data/lib/spec_id/aa_freqs.rb +7 -3
  47. data/lib/spec_id/bioworks.rb +20 -14
  48. data/lib/spec_id/digestor.rb +139 -0
  49. data/lib/spec_id/mass.rb +116 -0
  50. data/lib/spec_id/parser/proph.rb +236 -0
  51. data/lib/spec_id/precision/filter/cmdline.rb +209 -0
  52. data/lib/spec_id/precision/filter/interactive.rb +134 -0
  53. data/lib/spec_id/precision/filter/output.rb +147 -0
  54. data/lib/spec_id/precision/filter.rb +623 -0
  55. data/lib/spec_id/precision/output.rb +60 -0
  56. data/lib/spec_id/precision/prob/cmdline.rb +139 -0
  57. data/lib/spec_id/precision/prob/output.rb +88 -0
  58. data/lib/spec_id/precision/prob.rb +171 -0
  59. data/lib/spec_id/proph/pep_summary.rb +92 -0
  60. data/lib/spec_id/proph/prot_summary.rb +484 -0
  61. data/lib/spec_id/proph.rb +2 -466
  62. data/lib/spec_id/protein_summary.rb +2 -2
  63. data/lib/spec_id/sequest/params.rb +316 -0
  64. data/lib/spec_id/sequest/pepxml.rb +1513 -0
  65. data/lib/spec_id/sequest.rb +2 -1672
  66. data/lib/spec_id/srf.rb +445 -177
  67. data/lib/spec_id.rb +183 -95
  68. data/lib/spec_id_xml.rb +8 -10
  69. data/lib/transmem/phobius.rb +147 -0
  70. data/lib/transmem/toppred.rb +368 -0
  71. data/lib/transmem.rb +157 -0
  72. data/lib/validator/aa.rb +135 -0
  73. data/lib/validator/background.rb +73 -0
  74. data/lib/validator/bias.rb +95 -0
  75. data/lib/validator/cmdline.rb +260 -0
  76. data/lib/validator/decoy.rb +94 -0
  77. data/lib/validator/digestion_based.rb +69 -0
  78. data/lib/validator/probability.rb +48 -0
  79. data/lib/validator/prot_from_pep.rb +234 -0
  80. data/lib/validator/transmem.rb +272 -0
  81. data/lib/validator/true_pos.rb +46 -0
  82. data/lib/validator.rb +214 -0
  83. data/lib/xml.rb +38 -0
  84. data/lib/xml_style_parser.rb +105 -0
  85. data/lib/xmlparser_wrapper.rb +19 -0
  86. data/script/compile_and_plot_smriti_final.rb +97 -0
  87. data/script/extract_gradient_programs.rb +56 -0
  88. data/script/get_apex_values_rexml.rb +44 -0
  89. data/script/mzXML2timeIndex.rb +1 -1
  90. data/script/smriti_final_analysis.rb +103 -0
  91. data/script/toppred_to_yaml.rb +47 -0
  92. data/script/tpp_installer.rb +1 -1
  93. data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
  94. data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
  95. data/specs/bin/fasta_shaker_spec.rb +259 -0
  96. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
  97. data/specs/bin/filter_and_validate_spec.rb +124 -0
  98. data/specs/bin/ms_to_lmat_spec.rb +34 -0
  99. data/specs/bin/prob_validate_spec.rb +62 -0
  100. data/specs/bin/protein_summary_spec.rb +10 -0
  101. data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
  102. data/specs/gi_spec.rb +22 -0
  103. data/specs/load_bin_path.rb +7 -0
  104. data/specs/merge_deep_spec.rb +13 -0
  105. data/specs/ms/gradient_program_spec.rb +77 -0
  106. data/specs/ms/msrun_spec.rb +455 -0
  107. data/specs/ms/parser_spec.rb +92 -0
  108. data/specs/ms/spectrum_spec.rb +89 -0
  109. data/specs/roc_spec.rb +251 -0
  110. data/specs/rspec_autotest.rb +149 -0
  111. data/specs/sample_enzyme_spec.rb +41 -0
  112. data/specs/spec_helper.rb +133 -0
  113. data/specs/spec_id/aa_freqs_spec.rb +52 -0
  114. data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
  115. data/specs/spec_id/digestor_spec.rb +75 -0
  116. data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
  117. data/specs/spec_id/precision/filter/output_spec.rb +31 -0
  118. data/specs/spec_id/precision/filter_spec.rb +243 -0
  119. data/specs/spec_id/precision/prob_spec.rb +111 -0
  120. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  121. data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
  122. data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
  123. data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
  124. data/specs/spec_id/sequest/params_spec.rb +68 -0
  125. data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
  126. data/specs/spec_id/sqt_spec.rb +138 -0
  127. data/specs/spec_id/srf_spec.rb +209 -0
  128. data/specs/spec_id/srf_spec_helper.rb +302 -0
  129. data/specs/spec_id_helper.rb +33 -0
  130. data/specs/spec_id_spec.rb +361 -0
  131. data/specs/spec_id_xml_spec.rb +33 -0
  132. data/specs/transmem/phobius_spec.rb +423 -0
  133. data/specs/transmem/toppred_spec.rb +297 -0
  134. data/specs/transmem_spec.rb +60 -0
  135. data/specs/transmem_spec_shared.rb +64 -0
  136. data/specs/validator/aa_spec.rb +107 -0
  137. data/specs/validator/background_spec.rb +51 -0
  138. data/specs/validator/bias_spec.rb +146 -0
  139. data/specs/validator/decoy_spec.rb +51 -0
  140. data/specs/validator/fasta_helper.rb +26 -0
  141. data/specs/validator/prot_from_pep_spec.rb +141 -0
  142. data/specs/validator/transmem_spec.rb +145 -0
  143. data/specs/validator/true_pos_spec.rb +58 -0
  144. data/specs/validator_helper.rb +33 -0
  145. data/specs/xml_spec.rb +12 -0
  146. data/test_files/000_pepxml18_small.xml +206 -0
  147. data/test_files/020a.mzXML.timeIndex +4710 -0
  148. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
  149. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
  150. data/test_files/4-03-03_small-prot.xml +321 -0
  151. data/test_files/4-03-03_small.xml +3876 -0
  152. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  153. data/test_files/bioworks-3.3_10prots.xml +5999 -0
  154. data/test_files/bioworks31.params +77 -0
  155. data/test_files/bioworks32.params +62 -0
  156. data/test_files/bioworks33.params +63 -0
  157. data/test_files/bioworks_single_run_small.xml +7237 -0
  158. data/test_files/bioworks_small.fasta +212 -0
  159. data/test_files/bioworks_small.params +63 -0
  160. data/test_files/bioworks_small.phobius +109 -0
  161. data/test_files/bioworks_small.toppred.out +2847 -0
  162. data/test_files/bioworks_small.xml +5610 -0
  163. data/test_files/bioworks_with_INV_small.xml +3753 -0
  164. data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
  165. data/test_files/corrupted_900.srf +0 -0
  166. data/test_files/head_of_7MIX.srf +0 -0
  167. data/test_files/interact-opd1_mods_small-prot.xml +304 -0
  168. data/test_files/messups.fasta +297 -0
  169. data/test_files/opd1/000.my_answer.100lines.xml +101 -0
  170. data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
  171. data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
  172. data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
  173. data/test_files/opd1/000_020-prot.png +0 -0
  174. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
  175. data/test_files/opd1/000_020_3prots-prot.xml +62 -0
  176. data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
  177. data/test_files/opd1/sequest.3.1.params +77 -0
  178. data/test_files/opd1/sequest.3.2.params +62 -0
  179. data/test_files/opd1/twenty_scans.mzXML +418 -0
  180. data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
  181. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  182. data/test_files/opd1/twenty_scans_answ.lmata +9 -0
  183. data/test_files/opd1_020_beginning.RAW +0 -0
  184. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
  185. data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
  186. data/test_files/pepproph_small.xml +4691 -0
  187. data/test_files/phobius.small.noheader.txt +50 -0
  188. data/test_files/phobius.small.small.txt +53 -0
  189. data/test_files/s01_anC1_ld020mM.key.txt +25 -0
  190. data/test_files/s01_anC1_ld020mM.meth +0 -0
  191. data/test_files/small.fasta +297 -0
  192. data/test_files/smallraw.RAW +0 -0
  193. data/test_files/tf_bioworks2excel.bioXML +14340 -0
  194. data/test_files/tf_bioworks2excel.txt.actual +1035 -0
  195. data/test_files/toppred.small.out +416 -0
  196. data/test_files/toppred.xml.out +318 -0
  197. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
  198. data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
  199. data/test_files/yeast_gly_small-prot.xml +265 -0
  200. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
  201. data/test_files/yeast_gly_small.xml +3807 -0
  202. data/test_files/yeast_gly_small2.parentTimes +6 -0
  203. metadata +273 -57
  204. data/bin/filter.rb +0 -6
  205. data/bin/precision.rb +0 -5
  206. data/lib/spec/mzdata/parser.rb +0 -108
  207. data/lib/spec/mzdata.rb +0 -48
  208. data/lib/spec/mzxml/parser.rb +0 -449
  209. data/lib/spec/scan.rb +0 -55
  210. data/lib/spec_id/filter.rb +0 -797
  211. data/lib/spec_id/precision.rb +0 -421
  212. data/lib/toppred.rb +0 -18
  213. data/script/filter-peps.rb +0 -164
  214. data/test/tc_aa_freqs.rb +0 -59
  215. data/test/tc_fasta_shaker.rb +0 -149
  216. data/test/tc_filter.rb +0 -203
  217. data/test/tc_filter_peps.rb +0 -46
  218. data/test/tc_gi.rb +0 -17
  219. data/test/tc_id_class_anal.rb +0 -70
  220. data/test/tc_id_precision.rb +0 -89
  221. data/test/tc_msrun.rb +0 -88
  222. data/test/tc_mzxml.rb +0 -88
  223. data/test/tc_mzxml_to_lmat.rb +0 -36
  224. data/test/tc_peptide_parent_times.rb +0 -27
  225. data/test/tc_precision.rb +0 -60
  226. data/test/tc_roc.rb +0 -166
  227. data/test/tc_sample_enzyme.rb +0 -32
  228. data/test/tc_scan.rb +0 -26
  229. data/test/tc_sequest.rb +0 -336
  230. data/test/tc_spec.rb +0 -78
  231. data/test/tc_spec_id.rb +0 -201
  232. data/test/tc_spec_id_xml.rb +0 -36
  233. data/test/tc_srf.rb +0 -262
data/lib/spec_id/proph.rb CHANGED
@@ -1,468 +1,4 @@
1
1
 
2
- require 'rexml/document'
3
- require 'hash_by'
4
- require 'instance_var_set_from_hash'
5
- require 'axml'
6
- require 'spec_id'
2
+ #require 'spec_id/proph/prot_summary'
3
+ #require 'spec_id/proph/pep_summary'
7
4
 
8
-
9
- module SpecID ; end
10
- module SpecID::Prot ; end
11
- module SpecID::Pep ; end
12
-
13
- class Proph
14
-
15
-
16
- ################ --BEGIN
17
-
18
-
19
- class Parser
20
- def root_el(file)
21
- AXML.parse_file(file)
22
- end
23
- end
24
-
25
-
26
- class ProtSummary
27
- include SpecID
28
-
29
- attr_writer :prots
30
- attr_accessor :prot_groups
31
-
32
- def hi_prob_best ; true end
33
-
34
- def initialize(file=nil)
35
- @prots = nil
36
- if file
37
- @prot_groups = ProtSummary::Parser.new.parse_file(file)
38
- end
39
- end
40
-
41
- def prots
42
- if @prots ; @prots
43
- else
44
- @prots = unique_prots(@prot_groups)
45
- @prots
46
- end
47
- end
48
-
49
- # returns a set of unique proteins
50
- def unique_prots(prot_groups)
51
- all_prots = []
52
- prot_groups.each do |pg|
53
- pg.prots.each do |prt|
54
- all_prots << prt
55
- end
56
- end
57
- all_prots.hash_by(:protein_name).map{|name,prot_arr| prot_arr.first }
58
- end
59
-
60
- end
61
-
62
- class ProtSummary::Parser < Parser
63
- attr_accessor :prot_groups
64
- def initialize(file=nil, with_peps=false, tp='axml')
65
- if file
66
- @prot_groups = parse_file(file, with_peps, tp)
67
- end
68
- end
69
-
70
- # returns an array of protein_groups
71
- def parse_file(file, with_peps=false, tp='axml')
72
- File.open(file) do |fh|
73
- @prot_groups = _parse_for_prot_groups(fh, with_peps, tp)
74
- end
75
- @prot_groups
76
- end
77
-
78
- # returns an array of ProtGroup objects
79
- def _parse_for_prot_groups(stream, with_peps=false, tp='axml')
80
- prtgrps = []
81
- case tp
82
- when 'axml'
83
- root = AXML.parse(stream)
84
- root.protein_group.each do |protein_group|
85
- pg = ProtGroup.new(protein_group.attrs) do
86
- protein_group.map do |protein|
87
- Prot.new(protein.attrs)
88
- end
89
- end
90
- prtgrps << pg
91
- end
92
- end
93
- prtgrps
94
- end
95
- end # ProtSummary::Parser
96
-
97
-
98
- class ProtGroup
99
- attr_accessor :group_number, :probability, :prots
100
- def initialize(args=nil)
101
- @prots = []
102
- if args
103
- instance_var_set_from_hash(args)
104
- end
105
- if block_given?
106
- @prots = yield
107
- end
108
- end
109
- end
110
-
111
- class Prot
112
- include SpecID::Prot
113
-
114
- ## probability and reference accessors are inherited
115
- attr_accessor :peps, :protein_name, :cutoff, :group_sibling_id, :n_indistinguishable_proteins, :percent_coverage, :unique_stripped_peptides, :total_number_peptides, :pct_spectrum_ids, :description
116
-
117
- # returns protein_name
118
- def name ; @protein_name end
119
- def reference ; @protein_name end
120
-
121
- def initialize(args)
122
- self.instance_var_set_from_hash(args)
123
- if @probability ; @probability = @probability.to_f end
124
- end
125
-
126
- # def self.uniq_prots_with_prob_and_reference(file)
127
- # root = Parser.root_el(file)
128
- # prots = []
129
- # root.protein_group.each do |group|
130
- # group.protein.each do |prt|
131
- # #prots << prt
132
- # prots <<
133
- # end
134
- # end
135
- #
136
- # un_prots = prots.hash_by(:_protein_name).map{|name,prot_arr| prot_arr.first }
137
- #
138
- # end
139
-
140
- def to_s
141
- '<Prot: protein_name=' + @protein_name + ' ' + 'probability=' + @probability.to_s + '>'
142
- end
143
-
144
- end # class Prot
145
-
146
- class Pep
147
- include SpecID::Pep
148
-
149
- attr_accessor :sequence, :probability, :filenames, :charge, :precursor_neutral_mass, :nsp_cutoff, :scans
150
- attr_writer :arithmetic_avg_scan_by_parent_time
151
-
152
- def initialize(args=nil)
153
- if args
154
- @sequence = args[:sequence]
155
- @probability = args[:probability] ## nsp prob
156
- @filenames = args[:filenames]
157
- @charge = args[:charge]
158
- @nsp_cutoff = args[:nsp_cutoff]
159
- if args.key?(:scans)
160
- @scans = args[:scans]
161
- else
162
- @scans = [] ## this is set later if needed
163
- end
164
- else
165
- @scans = []
166
- end
167
- end
168
-
169
- # filter peptides based on the number of scans
170
- # if a peptide has more than max_dups scans, the peptide is tossed
171
- # note that multiple scans that were used as a single dtafile scan
172
- # will be counted as a single scan for these purposes!
173
- # (easy, since they are stored as a single item in the array of scans)
174
- def self.filter_by_max_dup_scans(max_dups=nil, peps=nil)
175
- if max_dups
176
- new_peps = []
177
- peps.each do |pep|
178
- unless pep.scans.size > max_dups
179
- new_peps << pep
180
- end
181
- end
182
- new_peps
183
- else
184
- peps.dup
185
- end
186
- end
187
-
188
- ## from the list of scans, creates a scan object whose time is the
189
- ## arithmetic mean of the parent scans (based on prec_inten) and whose
190
- ## prec_mz is the avg of all prec_mz's. num is nil, charge is the first
191
- def arithmetic_avg_scan_by_parent_time
192
- unless @arithmetic_avg_scan_by_parent_time
193
- flat_scans = @scans.flatten
194
-
195
- # new_prec_mz
196
- prec_mz_sum = 0.0
197
- prec_inten_sum = 0.0
198
- times = []
199
- intens = []
200
- tot_inten = 0.0
201
- flat_scans.each do |c|
202
- prec_inten = c.prec_inten
203
- prec_inten_sum += prec_inten
204
- prec_mz_sum += c.prec_mz
205
- tot_inten += prec_inten
206
- times << c.parent.time
207
- intens << prec_inten
208
- end
209
- new_prec_mz = prec_mz_sum / flat_scans.size
210
- new_prec_inten = prec_inten_sum / flat_scans.size
211
-
212
- fraction_inten = []
213
- intens.each do |inten|
214
- fraction_inten.push( inten/tot_inten )
215
- end
216
-
217
- new_time = 0.0
218
- (0...times.size).each do |i|
219
- new_time += times[i] * fraction_inten[i]
220
- end
221
-
222
- @arithmetic_avg_scan_by_parent_time = Spec::Scan.new( nil, @scans.first.ms_level, new_time, new_prec_mz, new_prec_inten )
223
-
224
- end
225
- @arithmetic_avg_scan_by_parent_time
226
- end
227
-
228
- def to_s
229
- '<Pep seq=' + @sequence + ' ' + 'prob=' + @probability.to_s + ' charge=' + @charge + '>'
230
- end
231
-
232
- def has_dta?(dta_filename)
233
- if @filenames
234
- @filenames.each do |fn|
235
- if dta_filename == fn
236
- return true
237
- end
238
- end
239
- end
240
- return false
241
- end
242
-
243
- # Given a list of peptides, returns only those unique based on
244
- # sequence/charge
245
- def self.uniq_by_seqcharge(peptides)
246
- # @TODO: this could be done with one fewer traversals, but it is beautiful
247
- peptides.hash_by(:sequence, :charge).collect do |k,v|
248
- v.first
249
- end
250
- end
251
-
252
- end # class Pep
253
-
254
-
255
- # Class for parsing the peptide prophet output files in various ways
256
- class Pep::Parser < Parser
257
-
258
- # parse_type = "rexml" | "regex"
259
- # regex's are about 50 times faster but are not guaranteed to work
260
- # seq charge hash is keyed on an array -> [sequence,charge]
261
- # @TODO: implement parsing on this with xmlparser
262
- def dta_filenames_by_seq_charge(pep_xml_file, parse_type="rexml")
263
- seq_charge_hash = Hash.new {|hash,key| hash[key] = [] }
264
- case parse_type
265
- when "rexml"
266
- #puts "READING: " + pep_xml_file + " ..."
267
- doc = REXML::Document.new File.new(pep_xml_file)
268
-
269
- ## Create a hash of peptides based on sequence_charge (takes an array)
270
- doc.elements.each("msms_pipeline_analysis/msms_run_summary/search_result") do |result|
271
- pep_charge = result.attributes['assumed_charge']
272
- filename = result.attributes['spectrum']
273
- result.elements.to_a('search_hit').each do |hit|
274
- pep_seq = hit.attributes['peptide']
275
- seq_charge = [pep_seq, pep_charge]
276
- seq_charge_hash[seq_charge] << filename
277
- end
278
- end
279
- seq_charge_hash
280
- when "regex"
281
- #puts "READING: " + pep_xml_file + " ..."
282
- ## Create a hash of peptides based on sequence_charge (takes an array)
283
-
284
- ## file from peptideAtlas:
285
- search_result_regex1 = /<spectrum_query spectrum="(.*\.\d+\.\d+\.\d)".* assumed_charge="(\d)"/o
286
- search_result_regex2 = /<search_result sxpectrum="(.*\.\d+\.\d+\.\d)".* assumed_charge="(\d)"/o
287
- search_hit_regex = /<search_hit .*peptide="(\w+)" /o
288
-
289
- peptide_h = {}
290
- filename = nil
291
- pep_charge = nil
292
- File.open(pep_xml_file).each do |line|
293
- if line =~ search_result_regex1
294
- filename = $1.dup
295
- pep_charge = $2.dup
296
- elsif line =~ search_result_regex2
297
- filename = $1.dup
298
- pep_charge = $2.dup
299
- end
300
- if line =~ search_hit_regex
301
- pep_seq = $1.dup
302
- seq_charge = [pep_seq, pep_charge]
303
- seq_charge_hash[seq_charge] << filename
304
- end
305
- end
306
- end
307
- seq_charge_hash
308
- end
309
-
310
- # drops all search_hits that have peptideprophet probability < min_val
311
- # and drops any search_results that end up with 0 search_hits
312
- def filter_by_min_pep_prob(file, outfile, min_val)
313
- root = root_el(file)
314
-
315
- d_search_hit = nil
316
- d_search_result = nil
317
- root.children.each do |child1|
318
- if child1.name == 'msms_run_summary'
319
- d_search_result = []
320
- child1.children.each do |child2|
321
- if child2.name == 'search_result'
322
- #puts "size before: " + child2.size.to_s
323
- d_search_hit = []
324
- child2.children.each do |child3|
325
- if child3.name == 'search_hit'
326
- child3.children.each do |child4|
327
- if child4.name == 'peptideprophet_result'
328
- if child4.attrs["probability"].to_f < min_val
329
- #puts "dropping probability: #{child4.attrs["probability"]}"
330
- d_search_hit << child3
331
- else
332
- #puts "keeping probability: #{child4.attrs["probability"]}"
333
- end
334
- end
335
- end
336
- end
337
- end
338
- d_search_hit.each do |to_drop|
339
- to_drop.drop
340
- end
341
- #puts "size after: " + child2.size.to_s
342
- if child2.size == 0
343
- d_search_result << child2
344
- end
345
- end
346
- end
347
- d_search_result.each do |to_drop|
348
- to_drop.drop
349
- end
350
- end
351
- end
352
-
353
- File.open(outfile, "w") do |fh|
354
- fh.print root.to_s
355
- end
356
- end
357
- end # Pep::Parser
358
-
359
-
360
- # Class for parsing the '*-prot.xml' files in different ways
361
- class Prot::Parser < Parser
362
-
363
- attr_accessor :prots
364
- attr_writer :peps
365
-
366
- def initialize
367
- @prots = []
368
- end
369
-
370
- # returns all the peptides from prots
371
- def peps
372
- unless @peps
373
- @peps = []
374
- @prots.each do |prot|
375
- @peps.push(*(prot.peps))
376
- end
377
- end
378
- @peps
379
- end
380
-
381
-
382
- # sets and returns an array of Prot objects
383
- # parse_type = "rexml" | "regex"
384
- def get_prots_and_peps(protxmlfile, prot_prob_cutoff=1.0, pep_init_prob_cutoff=1.0, pep_nsp_prob_cutoff=1.0, parse_type="rexml")
385
- ## ensure these are all floats
386
- (prot_prob_cutoff, pep_init_prob_cutoff, pep_nsp_prob_cutoff) = [prot_prob_cutoff, pep_init_prob_cutoff, pep_nsp_prob_cutoff].collect do |cutoff|
387
- cutoff.to_f
388
- end
389
-
390
- case parse_type
391
- when "rexml"
392
- doc = REXML::Document.new File.new(protxmlfile)
393
- doc.elements.each("protein_summary/protein_group/protein") do |elem|
394
- if elem.attributes['probability'].to_f >= prot_prob_cutoff
395
- prob = elem.attributes['probability'].to_f
396
- name= elem.attributes['protein_name']
397
- curr_prot = Prot.new({:probability => prob, :protein_name => name, :cutoff => prot_prob_cutoff})
398
- peptides = []
399
- elem.elements.to_a('peptide').each do |pep|
400
- if pep.attributes['nsp_adjusted_probability'].to_f >= pep_nsp_prob_cutoff && pep.attributes['initial_probability'].to_f >= pep_init_prob_cutoff
401
- nsp_prob = pep.attributes['nsp_adjusted_probability'].to_f
402
- sequence = pep.attributes['peptide_sequence']
403
- charge = pep.attributes['charge']
404
- pnm = pep.attributes['precursor_neutral_mass']
405
- peptides.push(Pep.new(:probability => nsp_prob, :sequence => sequence, :charge => charge, :precursor_neutral_mass => pnm, :nsp_cutoff => pep_nsp_prob_cutoff))
406
- end
407
- ## Only take proteins with peptides!
408
- if peptides.size > 0
409
- curr_prot.peps = peptides
410
- @prots << curr_prot
411
- end
412
- end
413
- end
414
- end
415
- when "regex"
416
- prot_regex = /<protein protein_name="(.*)?" n_indistinguishable_proteins(.*)/o
417
- prot_prob_regex = /probability="([\d\.]+)"/o
418
- pep_regex = /<peptide peptide_sequence="(\w+)?"(.*)/o
419
- pep_else_regex = /charge="(\d)" initial_probability="([\d\.]+)" nsp_adjusted_probability="([\d\.]+)"/o
420
-
421
- curr_prot = nil
422
- peptides = []
423
- File.open(protxmlfile).each do |line|
424
- if line =~ prot_regex
425
- prob = nil
426
- name = $1.dup
427
- rest = $2
428
- if rest =~ prot_prob_regex
429
- prob = $1.dup
430
- end
431
- if curr_prot
432
- if curr_prot.probability.to_f >= prot_prob_cutoff
433
- if peptides.size > 0
434
- curr_prot.peps = peptides
435
- @prots.push(curr_prot)
436
- end
437
- end
438
- end
439
- curr_prot = Prot.new({:probability => prob, :protein_name => name, :cutoff => prot_prob_cutoff})
440
- peptides = []
441
- end
442
- if line =~ pep_regex
443
- sequence = $1.dup
444
- rest = $2
445
- if rest =~ pep_else_regex
446
- charge = $1
447
- init_prob = $2
448
- nsp_prob = $3
449
- if nsp_prob.to_f >= pep_nsp_prob_cutoff && init_prob.to_f >= pep_init_prob_cutoff
450
- peptides.push(Pep.new(:probability => nsp_prob, :sequence => sequence, :charge => charge, :nsp_cutoff => pep_nsp_prob_cutoff))
451
- end
452
- end
453
- end
454
- # get the last one:
455
- if curr_prot && curr_prot.probability.to_f > prot_prob_cutoff && peptides.size > 0
456
- curr_prot.peps = peptides
457
- @prots.push(curr_prot)
458
- end
459
- end
460
- end
461
- @prots
462
- end
463
-
464
- end # Prot::Parser
465
-
466
- ################ --END
467
-
468
- end # Proph
@@ -5,7 +5,7 @@ require 'hash_by'
5
5
  require 'optparse'
6
6
  require 'ostruct'
7
7
  require 'spec_id'
8
- require 'spec_id/precision'
8
+ #require 'spec_id/precision' # gone now
9
9
  require 'gi'
10
10
 
11
11
  #############################################################
@@ -428,7 +428,7 @@ class ProteinSummary
428
428
  op.on("--#{PRECISION_PROGRAM_BASE}", "include output of #{PRECISION_PROGRAM_BASE}.rb,") {|v| opt.precision = v}
429
429
  op.separator(" type '#{PRECISION_PROGRAM_BASE}.rb' for details")
430
430
  op.separator ""
431
- op.separator "Specific to ProteinProphet (with no concatenated DB):"
431
+ op.separator "MSific to ProteinProphet (with no concatenated DB):"
432
432
  op.on("-c", "--cutoff percent", "false positive predictive rate (FPPR)% for given cutoff") {|v| opt.c = v }
433
433
  op.on("--cut_at percent", "only reports proteins within FPPR %") {|v| opt.cut_at = v }
434
434
  op.on("--get_annotation", "retrieves annotation by gi code") {|v| opt.get_annotation = v}