mspire 0.2.4 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (233) hide show
  1. data/INSTALL +1 -0
  2. data/README +25 -0
  3. data/Rakefile +129 -40
  4. data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
  5. data/bin/bioworks_to_pepxml.rb +1 -0
  6. data/bin/fasta_shaker.rb +1 -96
  7. data/bin/filter_and_validate.rb +5 -0
  8. data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
  9. data/bin/prob_validate.rb +6 -0
  10. data/bin/raw_to_mzXML.rb +2 -2
  11. data/bin/srf_group.rb +1 -0
  12. data/bin/srf_to_sqt.rb +40 -0
  13. data/changelog.txt +68 -0
  14. data/lib/align/chams.rb +6 -6
  15. data/lib/align.rb +4 -3
  16. data/lib/bsearch.rb +120 -0
  17. data/lib/fasta.rb +318 -86
  18. data/lib/group_by.rb +10 -0
  19. data/lib/index_by.rb +11 -0
  20. data/lib/merge_deep.rb +21 -0
  21. data/lib/{spec → ms/converter}/mzxml.rb +77 -109
  22. data/lib/ms/gradient_program.rb +171 -0
  23. data/lib/ms/msrun.rb +209 -0
  24. data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
  25. data/lib/ms/parser/mzdata/axml.rb +12 -0
  26. data/lib/ms/parser/mzdata/dom.rb +160 -0
  27. data/lib/ms/parser/mzdata/libxml.rb +7 -0
  28. data/lib/ms/parser/mzdata.rb +25 -0
  29. data/lib/ms/parser/mzxml/axml.rb +11 -0
  30. data/lib/ms/parser/mzxml/dom.rb +159 -0
  31. data/lib/ms/parser/mzxml/hpricot.rb +253 -0
  32. data/lib/ms/parser/mzxml/libxml.rb +15 -0
  33. data/lib/ms/parser/mzxml/regexp.rb +122 -0
  34. data/lib/ms/parser/mzxml/rexml.rb +72 -0
  35. data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
  36. data/lib/ms/parser/mzxml.rb +175 -0
  37. data/lib/ms/parser.rb +108 -0
  38. data/lib/ms/precursor.rb +10 -0
  39. data/lib/ms/scan.rb +81 -0
  40. data/lib/ms/spectrum.rb +193 -0
  41. data/lib/ms.rb +10 -0
  42. data/lib/mspire.rb +4 -0
  43. data/lib/roc.rb +61 -1
  44. data/lib/sample_enzyme.rb +31 -8
  45. data/lib/scan_i.rb +21 -0
  46. data/lib/spec_id/aa_freqs.rb +7 -3
  47. data/lib/spec_id/bioworks.rb +20 -14
  48. data/lib/spec_id/digestor.rb +139 -0
  49. data/lib/spec_id/mass.rb +116 -0
  50. data/lib/spec_id/parser/proph.rb +236 -0
  51. data/lib/spec_id/precision/filter/cmdline.rb +209 -0
  52. data/lib/spec_id/precision/filter/interactive.rb +134 -0
  53. data/lib/spec_id/precision/filter/output.rb +147 -0
  54. data/lib/spec_id/precision/filter.rb +623 -0
  55. data/lib/spec_id/precision/output.rb +60 -0
  56. data/lib/spec_id/precision/prob/cmdline.rb +139 -0
  57. data/lib/spec_id/precision/prob/output.rb +88 -0
  58. data/lib/spec_id/precision/prob.rb +171 -0
  59. data/lib/spec_id/proph/pep_summary.rb +92 -0
  60. data/lib/spec_id/proph/prot_summary.rb +484 -0
  61. data/lib/spec_id/proph.rb +2 -466
  62. data/lib/spec_id/protein_summary.rb +2 -2
  63. data/lib/spec_id/sequest/params.rb +316 -0
  64. data/lib/spec_id/sequest/pepxml.rb +1513 -0
  65. data/lib/spec_id/sequest.rb +2 -1672
  66. data/lib/spec_id/srf.rb +445 -177
  67. data/lib/spec_id.rb +183 -95
  68. data/lib/spec_id_xml.rb +8 -10
  69. data/lib/transmem/phobius.rb +147 -0
  70. data/lib/transmem/toppred.rb +368 -0
  71. data/lib/transmem.rb +157 -0
  72. data/lib/validator/aa.rb +135 -0
  73. data/lib/validator/background.rb +73 -0
  74. data/lib/validator/bias.rb +95 -0
  75. data/lib/validator/cmdline.rb +260 -0
  76. data/lib/validator/decoy.rb +94 -0
  77. data/lib/validator/digestion_based.rb +69 -0
  78. data/lib/validator/probability.rb +48 -0
  79. data/lib/validator/prot_from_pep.rb +234 -0
  80. data/lib/validator/transmem.rb +272 -0
  81. data/lib/validator/true_pos.rb +46 -0
  82. data/lib/validator.rb +214 -0
  83. data/lib/xml.rb +38 -0
  84. data/lib/xml_style_parser.rb +105 -0
  85. data/lib/xmlparser_wrapper.rb +19 -0
  86. data/script/compile_and_plot_smriti_final.rb +97 -0
  87. data/script/extract_gradient_programs.rb +56 -0
  88. data/script/get_apex_values_rexml.rb +44 -0
  89. data/script/mzXML2timeIndex.rb +1 -1
  90. data/script/smriti_final_analysis.rb +103 -0
  91. data/script/toppred_to_yaml.rb +47 -0
  92. data/script/tpp_installer.rb +1 -1
  93. data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
  94. data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
  95. data/specs/bin/fasta_shaker_spec.rb +259 -0
  96. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
  97. data/specs/bin/filter_and_validate_spec.rb +124 -0
  98. data/specs/bin/ms_to_lmat_spec.rb +34 -0
  99. data/specs/bin/prob_validate_spec.rb +62 -0
  100. data/specs/bin/protein_summary_spec.rb +10 -0
  101. data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
  102. data/specs/gi_spec.rb +22 -0
  103. data/specs/load_bin_path.rb +7 -0
  104. data/specs/merge_deep_spec.rb +13 -0
  105. data/specs/ms/gradient_program_spec.rb +77 -0
  106. data/specs/ms/msrun_spec.rb +455 -0
  107. data/specs/ms/parser_spec.rb +92 -0
  108. data/specs/ms/spectrum_spec.rb +89 -0
  109. data/specs/roc_spec.rb +251 -0
  110. data/specs/rspec_autotest.rb +149 -0
  111. data/specs/sample_enzyme_spec.rb +41 -0
  112. data/specs/spec_helper.rb +133 -0
  113. data/specs/spec_id/aa_freqs_spec.rb +52 -0
  114. data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
  115. data/specs/spec_id/digestor_spec.rb +75 -0
  116. data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
  117. data/specs/spec_id/precision/filter/output_spec.rb +31 -0
  118. data/specs/spec_id/precision/filter_spec.rb +243 -0
  119. data/specs/spec_id/precision/prob_spec.rb +111 -0
  120. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  121. data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
  122. data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
  123. data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
  124. data/specs/spec_id/sequest/params_spec.rb +68 -0
  125. data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
  126. data/specs/spec_id/sqt_spec.rb +138 -0
  127. data/specs/spec_id/srf_spec.rb +209 -0
  128. data/specs/spec_id/srf_spec_helper.rb +302 -0
  129. data/specs/spec_id_helper.rb +33 -0
  130. data/specs/spec_id_spec.rb +361 -0
  131. data/specs/spec_id_xml_spec.rb +33 -0
  132. data/specs/transmem/phobius_spec.rb +423 -0
  133. data/specs/transmem/toppred_spec.rb +297 -0
  134. data/specs/transmem_spec.rb +60 -0
  135. data/specs/transmem_spec_shared.rb +64 -0
  136. data/specs/validator/aa_spec.rb +107 -0
  137. data/specs/validator/background_spec.rb +51 -0
  138. data/specs/validator/bias_spec.rb +146 -0
  139. data/specs/validator/decoy_spec.rb +51 -0
  140. data/specs/validator/fasta_helper.rb +26 -0
  141. data/specs/validator/prot_from_pep_spec.rb +141 -0
  142. data/specs/validator/transmem_spec.rb +145 -0
  143. data/specs/validator/true_pos_spec.rb +58 -0
  144. data/specs/validator_helper.rb +33 -0
  145. data/specs/xml_spec.rb +12 -0
  146. data/test_files/000_pepxml18_small.xml +206 -0
  147. data/test_files/020a.mzXML.timeIndex +4710 -0
  148. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
  149. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
  150. data/test_files/4-03-03_small-prot.xml +321 -0
  151. data/test_files/4-03-03_small.xml +3876 -0
  152. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  153. data/test_files/bioworks-3.3_10prots.xml +5999 -0
  154. data/test_files/bioworks31.params +77 -0
  155. data/test_files/bioworks32.params +62 -0
  156. data/test_files/bioworks33.params +63 -0
  157. data/test_files/bioworks_single_run_small.xml +7237 -0
  158. data/test_files/bioworks_small.fasta +212 -0
  159. data/test_files/bioworks_small.params +63 -0
  160. data/test_files/bioworks_small.phobius +109 -0
  161. data/test_files/bioworks_small.toppred.out +2847 -0
  162. data/test_files/bioworks_small.xml +5610 -0
  163. data/test_files/bioworks_with_INV_small.xml +3753 -0
  164. data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
  165. data/test_files/corrupted_900.srf +0 -0
  166. data/test_files/head_of_7MIX.srf +0 -0
  167. data/test_files/interact-opd1_mods_small-prot.xml +304 -0
  168. data/test_files/messups.fasta +297 -0
  169. data/test_files/opd1/000.my_answer.100lines.xml +101 -0
  170. data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
  171. data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
  172. data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
  173. data/test_files/opd1/000_020-prot.png +0 -0
  174. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
  175. data/test_files/opd1/000_020_3prots-prot.xml +62 -0
  176. data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
  177. data/test_files/opd1/sequest.3.1.params +77 -0
  178. data/test_files/opd1/sequest.3.2.params +62 -0
  179. data/test_files/opd1/twenty_scans.mzXML +418 -0
  180. data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
  181. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  182. data/test_files/opd1/twenty_scans_answ.lmata +9 -0
  183. data/test_files/opd1_020_beginning.RAW +0 -0
  184. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
  185. data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
  186. data/test_files/pepproph_small.xml +4691 -0
  187. data/test_files/phobius.small.noheader.txt +50 -0
  188. data/test_files/phobius.small.small.txt +53 -0
  189. data/test_files/s01_anC1_ld020mM.key.txt +25 -0
  190. data/test_files/s01_anC1_ld020mM.meth +0 -0
  191. data/test_files/small.fasta +297 -0
  192. data/test_files/smallraw.RAW +0 -0
  193. data/test_files/tf_bioworks2excel.bioXML +14340 -0
  194. data/test_files/tf_bioworks2excel.txt.actual +1035 -0
  195. data/test_files/toppred.small.out +416 -0
  196. data/test_files/toppred.xml.out +318 -0
  197. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
  198. data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
  199. data/test_files/yeast_gly_small-prot.xml +265 -0
  200. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
  201. data/test_files/yeast_gly_small.xml +3807 -0
  202. data/test_files/yeast_gly_small2.parentTimes +6 -0
  203. metadata +273 -57
  204. data/bin/filter.rb +0 -6
  205. data/bin/precision.rb +0 -5
  206. data/lib/spec/mzdata/parser.rb +0 -108
  207. data/lib/spec/mzdata.rb +0 -48
  208. data/lib/spec/mzxml/parser.rb +0 -449
  209. data/lib/spec/scan.rb +0 -55
  210. data/lib/spec_id/filter.rb +0 -797
  211. data/lib/spec_id/precision.rb +0 -421
  212. data/lib/toppred.rb +0 -18
  213. data/script/filter-peps.rb +0 -164
  214. data/test/tc_aa_freqs.rb +0 -59
  215. data/test/tc_fasta_shaker.rb +0 -149
  216. data/test/tc_filter.rb +0 -203
  217. data/test/tc_filter_peps.rb +0 -46
  218. data/test/tc_gi.rb +0 -17
  219. data/test/tc_id_class_anal.rb +0 -70
  220. data/test/tc_id_precision.rb +0 -89
  221. data/test/tc_msrun.rb +0 -88
  222. data/test/tc_mzxml.rb +0 -88
  223. data/test/tc_mzxml_to_lmat.rb +0 -36
  224. data/test/tc_peptide_parent_times.rb +0 -27
  225. data/test/tc_precision.rb +0 -60
  226. data/test/tc_roc.rb +0 -166
  227. data/test/tc_sample_enzyme.rb +0 -32
  228. data/test/tc_scan.rb +0 -26
  229. data/test/tc_sequest.rb +0 -336
  230. data/test/tc_spec.rb +0 -78
  231. data/test/tc_spec_id.rb +0 -201
  232. data/test/tc_spec_id_xml.rb +0 -36
  233. data/test/tc_srf.rb +0 -262
data/lib/spec_id/proph.rb CHANGED
@@ -1,468 +1,4 @@
1
1
 
2
- require 'rexml/document'
3
- require 'hash_by'
4
- require 'instance_var_set_from_hash'
5
- require 'axml'
6
- require 'spec_id'
2
+ #require 'spec_id/proph/prot_summary'
3
+ #require 'spec_id/proph/pep_summary'
7
4
 
8
-
9
- module SpecID ; end
10
- module SpecID::Prot ; end
11
- module SpecID::Pep ; end
12
-
13
- class Proph
14
-
15
-
16
- ################ --BEGIN
17
-
18
-
19
- class Parser
20
- def root_el(file)
21
- AXML.parse_file(file)
22
- end
23
- end
24
-
25
-
26
- class ProtSummary
27
- include SpecID
28
-
29
- attr_writer :prots
30
- attr_accessor :prot_groups
31
-
32
- def hi_prob_best ; true end
33
-
34
- def initialize(file=nil)
35
- @prots = nil
36
- if file
37
- @prot_groups = ProtSummary::Parser.new.parse_file(file)
38
- end
39
- end
40
-
41
- def prots
42
- if @prots ; @prots
43
- else
44
- @prots = unique_prots(@prot_groups)
45
- @prots
46
- end
47
- end
48
-
49
- # returns a set of unique proteins
50
- def unique_prots(prot_groups)
51
- all_prots = []
52
- prot_groups.each do |pg|
53
- pg.prots.each do |prt|
54
- all_prots << prt
55
- end
56
- end
57
- all_prots.hash_by(:protein_name).map{|name,prot_arr| prot_arr.first }
58
- end
59
-
60
- end
61
-
62
- class ProtSummary::Parser < Parser
63
- attr_accessor :prot_groups
64
- def initialize(file=nil, with_peps=false, tp='axml')
65
- if file
66
- @prot_groups = parse_file(file, with_peps, tp)
67
- end
68
- end
69
-
70
- # returns an array of protein_groups
71
- def parse_file(file, with_peps=false, tp='axml')
72
- File.open(file) do |fh|
73
- @prot_groups = _parse_for_prot_groups(fh, with_peps, tp)
74
- end
75
- @prot_groups
76
- end
77
-
78
- # returns an array of ProtGroup objects
79
- def _parse_for_prot_groups(stream, with_peps=false, tp='axml')
80
- prtgrps = []
81
- case tp
82
- when 'axml'
83
- root = AXML.parse(stream)
84
- root.protein_group.each do |protein_group|
85
- pg = ProtGroup.new(protein_group.attrs) do
86
- protein_group.map do |protein|
87
- Prot.new(protein.attrs)
88
- end
89
- end
90
- prtgrps << pg
91
- end
92
- end
93
- prtgrps
94
- end
95
- end # ProtSummary::Parser
96
-
97
-
98
- class ProtGroup
99
- attr_accessor :group_number, :probability, :prots
100
- def initialize(args=nil)
101
- @prots = []
102
- if args
103
- instance_var_set_from_hash(args)
104
- end
105
- if block_given?
106
- @prots = yield
107
- end
108
- end
109
- end
110
-
111
- class Prot
112
- include SpecID::Prot
113
-
114
- ## probability and reference accessors are inherited
115
- attr_accessor :peps, :protein_name, :cutoff, :group_sibling_id, :n_indistinguishable_proteins, :percent_coverage, :unique_stripped_peptides, :total_number_peptides, :pct_spectrum_ids, :description
116
-
117
- # returns protein_name
118
- def name ; @protein_name end
119
- def reference ; @protein_name end
120
-
121
- def initialize(args)
122
- self.instance_var_set_from_hash(args)
123
- if @probability ; @probability = @probability.to_f end
124
- end
125
-
126
- # def self.uniq_prots_with_prob_and_reference(file)
127
- # root = Parser.root_el(file)
128
- # prots = []
129
- # root.protein_group.each do |group|
130
- # group.protein.each do |prt|
131
- # #prots << prt
132
- # prots <<
133
- # end
134
- # end
135
- #
136
- # un_prots = prots.hash_by(:_protein_name).map{|name,prot_arr| prot_arr.first }
137
- #
138
- # end
139
-
140
- def to_s
141
- '<Prot: protein_name=' + @protein_name + ' ' + 'probability=' + @probability.to_s + '>'
142
- end
143
-
144
- end # class Prot
145
-
146
- class Pep
147
- include SpecID::Pep
148
-
149
- attr_accessor :sequence, :probability, :filenames, :charge, :precursor_neutral_mass, :nsp_cutoff, :scans
150
- attr_writer :arithmetic_avg_scan_by_parent_time
151
-
152
- def initialize(args=nil)
153
- if args
154
- @sequence = args[:sequence]
155
- @probability = args[:probability] ## nsp prob
156
- @filenames = args[:filenames]
157
- @charge = args[:charge]
158
- @nsp_cutoff = args[:nsp_cutoff]
159
- if args.key?(:scans)
160
- @scans = args[:scans]
161
- else
162
- @scans = [] ## this is set later if needed
163
- end
164
- else
165
- @scans = []
166
- end
167
- end
168
-
169
- # filter peptides based on the number of scans
170
- # if a peptide has more than max_dups scans, the peptide is tossed
171
- # note that multiple scans that were used as a single dtafile scan
172
- # will be counted as a single scan for these purposes!
173
- # (easy, since they are stored as a single item in the array of scans)
174
- def self.filter_by_max_dup_scans(max_dups=nil, peps=nil)
175
- if max_dups
176
- new_peps = []
177
- peps.each do |pep|
178
- unless pep.scans.size > max_dups
179
- new_peps << pep
180
- end
181
- end
182
- new_peps
183
- else
184
- peps.dup
185
- end
186
- end
187
-
188
- ## from the list of scans, creates a scan object whose time is the
189
- ## arithmetic mean of the parent scans (based on prec_inten) and whose
190
- ## prec_mz is the avg of all prec_mz's. num is nil, charge is the first
191
- def arithmetic_avg_scan_by_parent_time
192
- unless @arithmetic_avg_scan_by_parent_time
193
- flat_scans = @scans.flatten
194
-
195
- # new_prec_mz
196
- prec_mz_sum = 0.0
197
- prec_inten_sum = 0.0
198
- times = []
199
- intens = []
200
- tot_inten = 0.0
201
- flat_scans.each do |c|
202
- prec_inten = c.prec_inten
203
- prec_inten_sum += prec_inten
204
- prec_mz_sum += c.prec_mz
205
- tot_inten += prec_inten
206
- times << c.parent.time
207
- intens << prec_inten
208
- end
209
- new_prec_mz = prec_mz_sum / flat_scans.size
210
- new_prec_inten = prec_inten_sum / flat_scans.size
211
-
212
- fraction_inten = []
213
- intens.each do |inten|
214
- fraction_inten.push( inten/tot_inten )
215
- end
216
-
217
- new_time = 0.0
218
- (0...times.size).each do |i|
219
- new_time += times[i] * fraction_inten[i]
220
- end
221
-
222
- @arithmetic_avg_scan_by_parent_time = Spec::Scan.new( nil, @scans.first.ms_level, new_time, new_prec_mz, new_prec_inten )
223
-
224
- end
225
- @arithmetic_avg_scan_by_parent_time
226
- end
227
-
228
- def to_s
229
- '<Pep seq=' + @sequence + ' ' + 'prob=' + @probability.to_s + ' charge=' + @charge + '>'
230
- end
231
-
232
- def has_dta?(dta_filename)
233
- if @filenames
234
- @filenames.each do |fn|
235
- if dta_filename == fn
236
- return true
237
- end
238
- end
239
- end
240
- return false
241
- end
242
-
243
- # Given a list of peptides, returns only those unique based on
244
- # sequence/charge
245
- def self.uniq_by_seqcharge(peptides)
246
- # @TODO: this could be done with one fewer traversals, but it is beautiful
247
- peptides.hash_by(:sequence, :charge).collect do |k,v|
248
- v.first
249
- end
250
- end
251
-
252
- end # class Pep
253
-
254
-
255
- # Class for parsing the peptide prophet output files in various ways
256
- class Pep::Parser < Parser
257
-
258
- # parse_type = "rexml" | "regex"
259
- # regex's are about 50 times faster but are not guaranteed to work
260
- # seq charge hash is keyed on an array -> [sequence,charge]
261
- # @TODO: implement parsing on this with xmlparser
262
- def dta_filenames_by_seq_charge(pep_xml_file, parse_type="rexml")
263
- seq_charge_hash = Hash.new {|hash,key| hash[key] = [] }
264
- case parse_type
265
- when "rexml"
266
- #puts "READING: " + pep_xml_file + " ..."
267
- doc = REXML::Document.new File.new(pep_xml_file)
268
-
269
- ## Create a hash of peptides based on sequence_charge (takes an array)
270
- doc.elements.each("msms_pipeline_analysis/msms_run_summary/search_result") do |result|
271
- pep_charge = result.attributes['assumed_charge']
272
- filename = result.attributes['spectrum']
273
- result.elements.to_a('search_hit').each do |hit|
274
- pep_seq = hit.attributes['peptide']
275
- seq_charge = [pep_seq, pep_charge]
276
- seq_charge_hash[seq_charge] << filename
277
- end
278
- end
279
- seq_charge_hash
280
- when "regex"
281
- #puts "READING: " + pep_xml_file + " ..."
282
- ## Create a hash of peptides based on sequence_charge (takes an array)
283
-
284
- ## file from peptideAtlas:
285
- search_result_regex1 = /<spectrum_query spectrum="(.*\.\d+\.\d+\.\d)".* assumed_charge="(\d)"/o
286
- search_result_regex2 = /<search_result sxpectrum="(.*\.\d+\.\d+\.\d)".* assumed_charge="(\d)"/o
287
- search_hit_regex = /<search_hit .*peptide="(\w+)" /o
288
-
289
- peptide_h = {}
290
- filename = nil
291
- pep_charge = nil
292
- File.open(pep_xml_file).each do |line|
293
- if line =~ search_result_regex1
294
- filename = $1.dup
295
- pep_charge = $2.dup
296
- elsif line =~ search_result_regex2
297
- filename = $1.dup
298
- pep_charge = $2.dup
299
- end
300
- if line =~ search_hit_regex
301
- pep_seq = $1.dup
302
- seq_charge = [pep_seq, pep_charge]
303
- seq_charge_hash[seq_charge] << filename
304
- end
305
- end
306
- end
307
- seq_charge_hash
308
- end
309
-
310
- # drops all search_hits that have peptideprophet probability < min_val
311
- # and drops any search_results that end up with 0 search_hits
312
- def filter_by_min_pep_prob(file, outfile, min_val)
313
- root = root_el(file)
314
-
315
- d_search_hit = nil
316
- d_search_result = nil
317
- root.children.each do |child1|
318
- if child1.name == 'msms_run_summary'
319
- d_search_result = []
320
- child1.children.each do |child2|
321
- if child2.name == 'search_result'
322
- #puts "size before: " + child2.size.to_s
323
- d_search_hit = []
324
- child2.children.each do |child3|
325
- if child3.name == 'search_hit'
326
- child3.children.each do |child4|
327
- if child4.name == 'peptideprophet_result'
328
- if child4.attrs["probability"].to_f < min_val
329
- #puts "dropping probability: #{child4.attrs["probability"]}"
330
- d_search_hit << child3
331
- else
332
- #puts "keeping probability: #{child4.attrs["probability"]}"
333
- end
334
- end
335
- end
336
- end
337
- end
338
- d_search_hit.each do |to_drop|
339
- to_drop.drop
340
- end
341
- #puts "size after: " + child2.size.to_s
342
- if child2.size == 0
343
- d_search_result << child2
344
- end
345
- end
346
- end
347
- d_search_result.each do |to_drop|
348
- to_drop.drop
349
- end
350
- end
351
- end
352
-
353
- File.open(outfile, "w") do |fh|
354
- fh.print root.to_s
355
- end
356
- end
357
- end # Pep::Parser
358
-
359
-
360
- # Class for parsing the '*-prot.xml' files in different ways
361
- class Prot::Parser < Parser
362
-
363
- attr_accessor :prots
364
- attr_writer :peps
365
-
366
- def initialize
367
- @prots = []
368
- end
369
-
370
- # returns all the peptides from prots
371
- def peps
372
- unless @peps
373
- @peps = []
374
- @prots.each do |prot|
375
- @peps.push(*(prot.peps))
376
- end
377
- end
378
- @peps
379
- end
380
-
381
-
382
- # sets and returns an array of Prot objects
383
- # parse_type = "rexml" | "regex"
384
- def get_prots_and_peps(protxmlfile, prot_prob_cutoff=1.0, pep_init_prob_cutoff=1.0, pep_nsp_prob_cutoff=1.0, parse_type="rexml")
385
- ## ensure these are all floats
386
- (prot_prob_cutoff, pep_init_prob_cutoff, pep_nsp_prob_cutoff) = [prot_prob_cutoff, pep_init_prob_cutoff, pep_nsp_prob_cutoff].collect do |cutoff|
387
- cutoff.to_f
388
- end
389
-
390
- case parse_type
391
- when "rexml"
392
- doc = REXML::Document.new File.new(protxmlfile)
393
- doc.elements.each("protein_summary/protein_group/protein") do |elem|
394
- if elem.attributes['probability'].to_f >= prot_prob_cutoff
395
- prob = elem.attributes['probability'].to_f
396
- name= elem.attributes['protein_name']
397
- curr_prot = Prot.new({:probability => prob, :protein_name => name, :cutoff => prot_prob_cutoff})
398
- peptides = []
399
- elem.elements.to_a('peptide').each do |pep|
400
- if pep.attributes['nsp_adjusted_probability'].to_f >= pep_nsp_prob_cutoff && pep.attributes['initial_probability'].to_f >= pep_init_prob_cutoff
401
- nsp_prob = pep.attributes['nsp_adjusted_probability'].to_f
402
- sequence = pep.attributes['peptide_sequence']
403
- charge = pep.attributes['charge']
404
- pnm = pep.attributes['precursor_neutral_mass']
405
- peptides.push(Pep.new(:probability => nsp_prob, :sequence => sequence, :charge => charge, :precursor_neutral_mass => pnm, :nsp_cutoff => pep_nsp_prob_cutoff))
406
- end
407
- ## Only take proteins with peptides!
408
- if peptides.size > 0
409
- curr_prot.peps = peptides
410
- @prots << curr_prot
411
- end
412
- end
413
- end
414
- end
415
- when "regex"
416
- prot_regex = /<protein protein_name="(.*)?" n_indistinguishable_proteins(.*)/o
417
- prot_prob_regex = /probability="([\d\.]+)"/o
418
- pep_regex = /<peptide peptide_sequence="(\w+)?"(.*)/o
419
- pep_else_regex = /charge="(\d)" initial_probability="([\d\.]+)" nsp_adjusted_probability="([\d\.]+)"/o
420
-
421
- curr_prot = nil
422
- peptides = []
423
- File.open(protxmlfile).each do |line|
424
- if line =~ prot_regex
425
- prob = nil
426
- name = $1.dup
427
- rest = $2
428
- if rest =~ prot_prob_regex
429
- prob = $1.dup
430
- end
431
- if curr_prot
432
- if curr_prot.probability.to_f >= prot_prob_cutoff
433
- if peptides.size > 0
434
- curr_prot.peps = peptides
435
- @prots.push(curr_prot)
436
- end
437
- end
438
- end
439
- curr_prot = Prot.new({:probability => prob, :protein_name => name, :cutoff => prot_prob_cutoff})
440
- peptides = []
441
- end
442
- if line =~ pep_regex
443
- sequence = $1.dup
444
- rest = $2
445
- if rest =~ pep_else_regex
446
- charge = $1
447
- init_prob = $2
448
- nsp_prob = $3
449
- if nsp_prob.to_f >= pep_nsp_prob_cutoff && init_prob.to_f >= pep_init_prob_cutoff
450
- peptides.push(Pep.new(:probability => nsp_prob, :sequence => sequence, :charge => charge, :nsp_cutoff => pep_nsp_prob_cutoff))
451
- end
452
- end
453
- end
454
- # get the last one:
455
- if curr_prot && curr_prot.probability.to_f > prot_prob_cutoff && peptides.size > 0
456
- curr_prot.peps = peptides
457
- @prots.push(curr_prot)
458
- end
459
- end
460
- end
461
- @prots
462
- end
463
-
464
- end # Prot::Parser
465
-
466
- ################ --END
467
-
468
- end # Proph
@@ -5,7 +5,7 @@ require 'hash_by'
5
5
  require 'optparse'
6
6
  require 'ostruct'
7
7
  require 'spec_id'
8
- require 'spec_id/precision'
8
+ #require 'spec_id/precision' # gone now
9
9
  require 'gi'
10
10
 
11
11
  #############################################################
@@ -428,7 +428,7 @@ class ProteinSummary
428
428
  op.on("--#{PRECISION_PROGRAM_BASE}", "include output of #{PRECISION_PROGRAM_BASE}.rb,") {|v| opt.precision = v}
429
429
  op.separator(" type '#{PRECISION_PROGRAM_BASE}.rb' for details")
430
430
  op.separator ""
431
- op.separator "Specific to ProteinProphet (with no concatenated DB):"
431
+ op.separator "MSific to ProteinProphet (with no concatenated DB):"
432
432
  op.on("-c", "--cutoff percent", "false positive predictive rate (FPPR)% for given cutoff") {|v| opt.c = v }
433
433
  op.on("--cut_at percent", "only reports proteins within FPPR %") {|v| opt.cut_at = v }
434
434
  op.on("--get_annotation", "retrieves annotation by gi code") {|v| opt.get_annotation = v}