mspire 0.4.9 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (255)
  1. data/README +27 -17
  2. data/changelog.txt +31 -62
  3. data/lib/ms/calc.rb +32 -0
  4. data/lib/ms/data/interleaved.rb +60 -0
  5. data/lib/ms/data/lazy_io.rb +73 -0
  6. data/lib/ms/data/lazy_string.rb +15 -0
  7. data/lib/ms/data/simple.rb +59 -0
  8. data/lib/ms/data/transposed.rb +41 -0
  9. data/lib/ms/data.rb +57 -0
  10. data/lib/ms/format/format_error.rb +12 -0
  11. data/lib/ms/spectrum.rb +25 -384
  12. data/lib/ms/support/binary_search.rb +126 -0
  13. data/lib/ms.rb +10 -10
  14. metadata +38 -350
  15. data/INSTALL +0 -58
  16. data/README.rdoc +0 -18
  17. data/Rakefile +0 -330
  18. data/bin/aafreqs.rb +0 -23
  19. data/bin/bioworks2excel.rb +0 -14
  20. data/bin/bioworks_to_pepxml.rb +0 -148
  21. data/bin/bioworks_to_pepxml_gui.rb +0 -225
  22. data/bin/fasta_shaker.rb +0 -5
  23. data/bin/filter_and_validate.rb +0 -5
  24. data/bin/gi2annot.rb +0 -14
  25. data/bin/id_class_anal.rb +0 -112
  26. data/bin/id_precision.rb +0 -172
  27. data/bin/ms_to_lmat.rb +0 -67
  28. data/bin/pepproph_filter.rb +0 -16
  29. data/bin/prob_validate.rb +0 -6
  30. data/bin/protein_summary.rb +0 -6
  31. data/bin/protxml2prots_peps.rb +0 -32
  32. data/bin/raw_to_mzXML.rb +0 -55
  33. data/bin/run_percolator.rb +0 -122
  34. data/bin/sqt_group.rb +0 -26
  35. data/bin/srf_group.rb +0 -27
  36. data/bin/srf_to_sqt.rb +0 -40
  37. data/lib/align/chams.rb +0 -78
  38. data/lib/align.rb +0 -154
  39. data/lib/archive/targz.rb +0 -94
  40. data/lib/bsearch.rb +0 -120
  41. data/lib/core_extensions.rb +0 -16
  42. data/lib/fasta.rb +0 -626
  43. data/lib/gi.rb +0 -124
  44. data/lib/group_by.rb +0 -10
  45. data/lib/index_by.rb +0 -11
  46. data/lib/merge_deep.rb +0 -21
  47. data/lib/ms/converter/mzxml.rb +0 -77
  48. data/lib/ms/gradient_program.rb +0 -170
  49. data/lib/ms/msrun.rb +0 -244
  50. data/lib/ms/msrun_index.rb +0 -108
  51. data/lib/ms/parser/mzdata/axml.rb +0 -67
  52. data/lib/ms/parser/mzdata/dom.rb +0 -175
  53. data/lib/ms/parser/mzdata/libxml.rb +0 -7
  54. data/lib/ms/parser/mzdata.rb +0 -31
  55. data/lib/ms/parser/mzxml/axml.rb +0 -70
  56. data/lib/ms/parser/mzxml/dom.rb +0 -182
  57. data/lib/ms/parser/mzxml/hpricot.rb +0 -253
  58. data/lib/ms/parser/mzxml/libxml.rb +0 -19
  59. data/lib/ms/parser/mzxml/regexp.rb +0 -122
  60. data/lib/ms/parser/mzxml/rexml.rb +0 -72
  61. data/lib/ms/parser/mzxml/xmlparser.rb +0 -248
  62. data/lib/ms/parser/mzxml.rb +0 -282
  63. data/lib/ms/parser.rb +0 -108
  64. data/lib/ms/precursor.rb +0 -25
  65. data/lib/ms/scan.rb +0 -81
  66. data/lib/mspire.rb +0 -4
  67. data/lib/pi_zero.rb +0 -244
  68. data/lib/qvalue.rb +0 -161
  69. data/lib/roc.rb +0 -187
  70. data/lib/sample_enzyme.rb +0 -160
  71. data/lib/scan_i.rb +0 -21
  72. data/lib/spec_id/aa_freqs.rb +0 -170
  73. data/lib/spec_id/bioworks.rb +0 -497
  74. data/lib/spec_id/digestor.rb +0 -138
  75. data/lib/spec_id/mass.rb +0 -179
  76. data/lib/spec_id/parser/proph.rb +0 -335
  77. data/lib/spec_id/precision/filter/cmdline.rb +0 -218
  78. data/lib/spec_id/precision/filter/interactive.rb +0 -134
  79. data/lib/spec_id/precision/filter/output.rb +0 -148
  80. data/lib/spec_id/precision/filter.rb +0 -637
  81. data/lib/spec_id/precision/output.rb +0 -60
  82. data/lib/spec_id/precision/prob/cmdline.rb +0 -160
  83. data/lib/spec_id/precision/prob/output.rb +0 -94
  84. data/lib/spec_id/precision/prob.rb +0 -249
  85. data/lib/spec_id/proph/pep_summary.rb +0 -104
  86. data/lib/spec_id/proph/prot_summary.rb +0 -484
  87. data/lib/spec_id/proph.rb +0 -4
  88. data/lib/spec_id/protein_summary.rb +0 -489
  89. data/lib/spec_id/sequest/params.rb +0 -316
  90. data/lib/spec_id/sequest/pepxml.rb +0 -1458
  91. data/lib/spec_id/sequest.rb +0 -33
  92. data/lib/spec_id/sqt.rb +0 -349
  93. data/lib/spec_id/srf.rb +0 -973
  94. data/lib/spec_id.rb +0 -778
  95. data/lib/spec_id_xml.rb +0 -99
  96. data/lib/transmem/phobius.rb +0 -147
  97. data/lib/transmem/toppred.rb +0 -368
  98. data/lib/transmem.rb +0 -157
  99. data/lib/validator/aa.rb +0 -48
  100. data/lib/validator/aa_est.rb +0 -112
  101. data/lib/validator/background.rb +0 -77
  102. data/lib/validator/bias.rb +0 -95
  103. data/lib/validator/cmdline.rb +0 -431
  104. data/lib/validator/decoy.rb +0 -107
  105. data/lib/validator/digestion_based.rb +0 -70
  106. data/lib/validator/probability.rb +0 -51
  107. data/lib/validator/prot_from_pep.rb +0 -234
  108. data/lib/validator/q_value.rb +0 -32
  109. data/lib/validator/transmem.rb +0 -272
  110. data/lib/validator/true_pos.rb +0 -46
  111. data/lib/validator.rb +0 -197
  112. data/lib/xml.rb +0 -38
  113. data/lib/xml_style_parser.rb +0 -119
  114. data/lib/xmlparser_wrapper.rb +0 -19
  115. data/release_notes.txt +0 -2
  116. data/script/compile_and_plot_smriti_final.rb +0 -97
  117. data/script/create_little_pepxml.rb +0 -61
  118. data/script/degenerate_peptides.rb +0 -47
  119. data/script/estimate_fpr_by_cysteine.rb +0 -226
  120. data/script/extract_gradient_programs.rb +0 -56
  121. data/script/find_cysteine_background.rb +0 -137
  122. data/script/genuine_tps_and_probs.rb +0 -136
  123. data/script/get_apex_values_rexml.rb +0 -44
  124. data/script/histogram_probs.rb +0 -61
  125. data/script/mascot_fix_pepxml.rb +0 -123
  126. data/script/msvis.rb +0 -42
  127. data/script/mzXML2timeIndex.rb +0 -25
  128. data/script/peps_per_bin.rb +0 -67
  129. data/script/prep_dir.rb +0 -121
  130. data/script/simple_protein_digestion.rb +0 -27
  131. data/script/smriti_final_analysis.rb +0 -103
  132. data/script/sqt_to_meta.rb +0 -24
  133. data/script/top_hit_per_scan.rb +0 -67
  134. data/script/toppred_to_yaml.rb +0 -47
  135. data/script/tpp_installer.rb +0 -249
  136. data/specs/align_spec.rb +0 -79
  137. data/specs/bin/bioworks_to_pepxml_spec.rb +0 -79
  138. data/specs/bin/fasta_shaker_spec.rb +0 -259
  139. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +0 -199
  140. data/specs/bin/filter_and_validate_spec.rb +0 -180
  141. data/specs/bin/ms_to_lmat_spec.rb +0 -34
  142. data/specs/bin/prob_validate_spec.rb +0 -86
  143. data/specs/bin/protein_summary_spec.rb +0 -14
  144. data/specs/fasta_spec.rb +0 -354
  145. data/specs/gi_spec.rb +0 -22
  146. data/specs/load_bin_path.rb +0 -7
  147. data/specs/merge_deep_spec.rb +0 -13
  148. data/specs/ms/gradient_program_spec.rb +0 -77
  149. data/specs/ms/msrun_spec.rb +0 -498
  150. data/specs/ms/parser_spec.rb +0 -92
  151. data/specs/ms/spectrum_spec.rb +0 -87
  152. data/specs/pi_zero_spec.rb +0 -115
  153. data/specs/qvalue_spec.rb +0 -39
  154. data/specs/roc_spec.rb +0 -251
  155. data/specs/rspec_autotest.rb +0 -149
  156. data/specs/sample_enzyme_spec.rb +0 -126
  157. data/specs/spec_helper.rb +0 -135
  158. data/specs/spec_id/aa_freqs_spec.rb +0 -52
  159. data/specs/spec_id/bioworks_spec.rb +0 -148
  160. data/specs/spec_id/digestor_spec.rb +0 -75
  161. data/specs/spec_id/precision/filter/cmdline_spec.rb +0 -20
  162. data/specs/spec_id/precision/filter/output_spec.rb +0 -31
  163. data/specs/spec_id/precision/filter_spec.rb +0 -246
  164. data/specs/spec_id/precision/prob_spec.rb +0 -44
  165. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  166. data/specs/spec_id/proph/pep_summary_spec.rb +0 -98
  167. data/specs/spec_id/proph/prot_summary_spec.rb +0 -128
  168. data/specs/spec_id/protein_summary_spec.rb +0 -189
  169. data/specs/spec_id/sequest/params_spec.rb +0 -68
  170. data/specs/spec_id/sequest/pepxml_spec.rb +0 -374
  171. data/specs/spec_id/sequest_spec.rb +0 -38
  172. data/specs/spec_id/sqt_spec.rb +0 -246
  173. data/specs/spec_id/srf_spec.rb +0 -172
  174. data/specs/spec_id/srf_spec_helper.rb +0 -139
  175. data/specs/spec_id_helper.rb +0 -33
  176. data/specs/spec_id_spec.rb +0 -366
  177. data/specs/spec_id_xml_spec.rb +0 -33
  178. data/specs/transmem/phobius_spec.rb +0 -425
  179. data/specs/transmem/toppred_spec.rb +0 -298
  180. data/specs/transmem_spec.rb +0 -60
  181. data/specs/transmem_spec_shared.rb +0 -64
  182. data/specs/validator/aa_est_spec.rb +0 -66
  183. data/specs/validator/aa_spec.rb +0 -40
  184. data/specs/validator/background_spec.rb +0 -67
  185. data/specs/validator/bias_spec.rb +0 -122
  186. data/specs/validator/decoy_spec.rb +0 -51
  187. data/specs/validator/fasta_helper.rb +0 -26
  188. data/specs/validator/prot_from_pep_spec.rb +0 -141
  189. data/specs/validator/transmem_spec.rb +0 -146
  190. data/specs/validator/true_pos_spec.rb +0 -58
  191. data/specs/validator_helper.rb +0 -33
  192. data/specs/xml_spec.rb +0 -12
  193. data/test_files/000_pepxml18_small.xml +0 -206
  194. data/test_files/020a.mzXML.timeIndex +0 -4710
  195. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +0 -3973
  196. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +0 -3872
  197. data/test_files/4-03-03_small-prot.xml +0 -321
  198. data/test_files/4-03-03_small.xml +0 -3876
  199. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  200. data/test_files/bioworks-3.3_10prots.xml +0 -5999
  201. data/test_files/bioworks31.params +0 -77
  202. data/test_files/bioworks32.params +0 -62
  203. data/test_files/bioworks33.params +0 -63
  204. data/test_files/bioworks_single_run_small.xml +0 -7237
  205. data/test_files/bioworks_small.fasta +0 -212
  206. data/test_files/bioworks_small.params +0 -63
  207. data/test_files/bioworks_small.phobius +0 -109
  208. data/test_files/bioworks_small.toppred.out +0 -2847
  209. data/test_files/bioworks_small.xml +0 -5610
  210. data/test_files/bioworks_with_INV_small.xml +0 -3753
  211. data/test_files/bioworks_with_SHUFF_small.xml +0 -2503
  212. data/test_files/corrupted_900.srf +0 -0
  213. data/test_files/head_of_7MIX.srf +0 -0
  214. data/test_files/interact-opd1_mods_small-prot.xml +0 -304
  215. data/test_files/messups.fasta +0 -297
  216. data/test_files/opd1/000.my_answer.100lines.xml +0 -101
  217. data/test_files/opd1/000.tpp_1.2.3.first10.xml +0 -115
  218. data/test_files/opd1/000.tpp_2.9.2.first10.xml +0 -126
  219. data/test_files/opd1/000.v2.1.mzXML.timeIndex +0 -3748
  220. data/test_files/opd1/000_020-prot.png +0 -0
  221. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +0 -62
  222. data/test_files/opd1/000_020_3prots-prot.xml +0 -62
  223. data/test_files/opd1/opd1_cat_inv_small-prot.xml +0 -139
  224. data/test_files/opd1/sequest.3.1.params +0 -77
  225. data/test_files/opd1/sequest.3.2.params +0 -62
  226. data/test_files/opd1/twenty_scans.mzXML +0 -418
  227. data/test_files/opd1/twenty_scans.v2.1.mzXML +0 -382
  228. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  229. data/test_files/opd1/twenty_scans_answ.lmata +0 -9
  230. data/test_files/opd1_020_beginning.RAW +0 -0
  231. data/test_files/opd1_2runs_2mods/data/020.mzData.xml +0 -683
  232. data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +0 -382
  233. data/test_files/opd1_2runs_2mods/data/040.mzData.xml +0 -683
  234. data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +0 -382
  235. data/test_files/opd1_2runs_2mods/data/README.txt +0 -6
  236. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +0 -753
  237. data/test_files/orbitrap_mzData/000_cut.xml +0 -1920
  238. data/test_files/pepproph_small.xml +0 -4691
  239. data/test_files/phobius.small.noheader.txt +0 -50
  240. data/test_files/phobius.small.small.txt +0 -53
  241. data/test_files/s01_anC1_ld020mM.key.txt +0 -25
  242. data/test_files/s01_anC1_ld020mM.meth +0 -0
  243. data/test_files/small.fasta +0 -297
  244. data/test_files/small.sqt +0 -87
  245. data/test_files/smallraw.RAW +0 -0
  246. data/test_files/tf_bioworks2excel.bioXML +0 -14340
  247. data/test_files/tf_bioworks2excel.txt.actual +0 -1035
  248. data/test_files/toppred.small.out +0 -416
  249. data/test_files/toppred.xml.out +0 -318
  250. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +0 -7
  251. data/test_files/validator_hits_separate/bioworks_small_HS.xml +0 -5651
  252. data/test_files/yeast_gly_small-prot.xml +0 -265
  253. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +0 -6
  254. data/test_files/yeast_gly_small.xml +0 -3807
  255. data/test_files/yeast_gly_small2.parentTimes +0 -6
@@ -1,484 +0,0 @@
1
- require 'rexml/document'
2
- require 'hash_by'
3
- require 'instance_var_set_from_hash'
4
- require 'axml'
5
- require 'spec_id'
6
- require 'arrayclass'
7
-
8
- require 'spec_id/parser/proph'
9
-
10
-
11
- module SpecID ; end
12
- module SpecID::Prot ; end
13
- module SpecID::Pep ; end
14
-
15
- module Proph
16
-
17
- class ProtSummary
18
- include SpecID
19
-
20
- # if you get this match it's a protein prophet file and the version is the
21
- # first match!
22
- Filetype_and_version_re_old = /ProteinProphet_v([\.\d]+)\.dtd/ # gives 1.9 or what else?
23
- Filetype_and_version_re_new = /protXML_v([\.\d]+)\.xsd/ # gives 4 right now
24
- # inherits prots and peps
25
-
26
- # the protein groups
27
- attr_accessor :prot_groups
28
- attr_accessor :version
29
-
30
- def hi_prob_best ; true end
31
-
32
- def get_version(file)
33
- answer = nil
34
- File.open(file) do |fh|
35
- 5.times do
36
- line = fh.gets
37
- answer =
38
- if line =~ Filetype_and_version_re_new
39
- $1.dup
40
- elsif line =~ Filetype_and_version_re_old
41
- $1.dup
42
- end
43
- break if answer
44
- end
45
- end
46
- raise(ArgumentError, "couldn't detect version in #{file}") unless answer
47
- answer
48
- end
49
-
50
- def initialize(file=nil)
51
- @prots = nil
52
- if file
53
- @version = get_version(file)
54
- #@prot_groups = ProtSummary::Parser.new.parse_file(file)
55
- SpecID::Parser::ProtProph.new(:spec_id).parse(file, :spec_id => self)
56
- end
57
- end
58
-
59
- # returns a set of unique proteins
60
- def unique_prots(prot_groups)
61
- all_prots = []
62
- prot_groups.each do |pg|
63
- pg.prots.each do |prt|
64
- all_prots << prt
65
- end
66
- end
67
- all_prots.hash_by(:protein_name).map{|name,prot_arr| prot_arr.first }
68
- end
69
-
70
- end
71
-
72
- class ProtSummary::Parser
73
- attr_accessor :prot_groups
74
- def initialize(file=nil, with_peps=false, tp='axml')
75
- if file
76
- @prot_groups = parse_file(file, with_peps, tp)
77
- end
78
- end
79
-
80
- # returns an array of protein_groups
81
- def parse_file(file, with_peps=false, tp='axml')
82
- File.open(file) do |fh|
83
- @prot_groups = _parse_for_prot_groups(fh, with_peps, tp)
84
- end
85
- @prot_groups
86
- end
87
-
88
- # returns an array of ProtGroup objects
89
- def _parse_for_prot_groups(stream, with_peps=false, tp='axml')
90
- prtgrps = []
91
- case tp
92
- when 'axml'
93
- root = AXML.parse(stream)
94
- root.protein_group.each do |protein_group|
95
- pg = ProtGroup.new(protein_group.attrs) do
96
- protein_group.map do |protein|
97
- Prot.new(protein.attrs)
98
- end
99
- end
100
- prtgrps << pg
101
- end
102
- end
103
- prtgrps
104
- end
105
- end # ProtSummary::Parser
106
-
107
-
108
- class ProtGroup
109
- attr_accessor :group_number, :probability, :prots
110
- def initialize(args=nil)
111
- @prots = []
112
- if args
113
- instance_var_set_from_hash(args)
114
- end
115
- if block_given?
116
- @prots = yield
117
- end
118
- end
119
- end
120
-
121
- end # Proph
122
-
123
-
124
-
125
- Proph::Prot = Arrayclass.new(%w(protein_name probability n_indistinguishable_proteins percent_coverage unique_stripped_peptides group_sibling_id total_number_peptides pct_spectrum_ids description peps q_value))
126
-
127
- # note that 'description' is found in the element 'annotation', attribute 'protein_description'
128
- # NOTE!: unique_stripped peptides is an array rather than + joined string
129
- class Proph::Prot
130
- include SpecID::Prot
131
-
132
- # returns protein_name
133
- def name ; self[0] end
134
- def reference ; self[0] end
135
- def first_entry ; self[0] end # the name is also the first_entry
136
-
137
- end
138
-
139
- #def to_s
140
- # '<Prot: protein_name=' + @protein_name + ' ' + 'probability=' + @probability.to_s + '>'
141
- #end
142
-
143
- # this is a pep from a -prot.xml file
144
-
145
- Proph::Prot::Pep = Arrayclass.new(%w(peptide_sequence charge initial_probability nsp_adjusted_probability weight is_nondegenerate_evidence n_enzymatic_termini n_sibling_peptides n_sibling_peptides_bin n_instances is_contributing_evidence calc_neutral_pep_mass modification_info prots q_value))
146
-
147
- class Proph::Prot::Pep
148
- include SpecID::Pep
149
-
150
- alias_method :mod_info, :modification_info
151
- alias_method :mod_info=, :modification_info=
152
-
153
- def aaseq ; self[0] end
154
- def probability ; self[3] end
155
-
156
- end # class Pep
157
-
158
- =begin
159
- #attr_accessor :sequence, :probability, :filenames, :charge, :precursor_neutral_mass, :nsp_cutoff, :scans
160
- #attr_writer :arithmetic_avg_scan_by_parent_time
161
-
162
- #def initialize(args=nil)
163
- # if args
164
- # @sequence = args[:sequence]
165
- # @probability = args[:probability] ## nsp prob
166
- # @filenames = args[:filenames]
167
- # @charge = args[:charge]
168
- # @nsp_cutoff = args[:nsp_cutoff]
169
- # if args.key?(:scans)
170
- # @scans = args[:scans]
171
- # else
172
- # @scans = [] ## this is set later if needed
173
- # end
174
- # else
175
- # @scans = []
176
- # end
177
- #end
178
-
179
- # filter peptides based on the number of scans
180
- # if a peptide has more than max_dups scans, the peptide is tossed
181
- # note that multiple scans that were used as a single dtafile scan
182
- # will be counted as a single scan for these purposes!
183
- # (easy, since they are stored as a single item in the array of scans)
184
- def self.filter_by_max_dup_scans(max_dups=nil, peps=nil)
185
- if max_dups
186
- new_peps = []
187
- peps.each do |pep|
188
- unless pep.scans.size > max_dups
189
- new_peps << pep
190
- end
191
- end
192
- new_peps
193
- else
194
- peps.dup
195
- end
196
- end
197
-
198
-
199
- ## from the list of scans, creates a scan object whose time is the
200
- ## arithmetic mean of the parent scans (based on prec_inten) and whose
201
- ## prec_mz is the avg of all prec_mz's. num is nil, charge is the first
202
- def arithmetic_avg_scan_by_parent_time
203
- unless @arithmetic_avg_scan_by_parent_time
204
- flat_scans = @scans.flatten
205
-
206
- # new_prec_mz
207
- prec_mz_sum = 0.0
208
- prec_inten_sum = 0.0
209
- times = []
210
- intens = []
211
- tot_inten = 0.0
212
- flat_scans.each do |c|
213
- prec_inten = c.prec_inten
214
- prec_inten_sum += prec_inten
215
- prec_mz_sum += c.prec_mz
216
- tot_inten += prec_inten
217
- times << c.parent.time
218
- intens << prec_inten
219
- end
220
- new_prec_mz = prec_mz_sum / flat_scans.size
221
- new_prec_inten = prec_inten_sum / flat_scans.size
222
-
223
- fraction_inten = []
224
- intens.each do |inten|
225
- fraction_inten.push( inten/tot_inten )
226
- end
227
-
228
- new_time = 0.0
229
- (0...times.size).each do |i|
230
- new_time += times[i] * fraction_inten[i]
231
- end
232
-
233
- @arithmetic_avg_scan_by_parent_time = MS::Scan.new( nil, @scans.first.ms_level, new_time, new_prec_mz, new_prec_inten )
234
-
235
- end
236
- @arithmetic_avg_scan_by_parent_time
237
- end
238
-
239
- def to_s
240
- '<Pep seq=' + @sequence + ' ' + 'prob=' + @probability.to_s + ' charge=' + @charge + '>'
241
- end
242
-
243
- def has_dta?(dta_filename)
244
- if @filenames
245
- @filenames.each do |fn|
246
- if dta_filename == fn
247
- return true
248
- end
249
- end
250
- end
251
- return false
252
- end
253
-
254
-
255
- # Given a list of peptides, returns only those unique based on
256
- # sequence/charge
257
- def self.uniq_by_seqcharge(peptides)
258
- # @TODO: this could be done with one fewer traversals, but it is beautiful
259
- peptides.hash_by(:sequence, :charge).collect do |k,v|
260
- v.first
261
- end
262
- end
263
- =end
264
-
265
-
266
-
267
-
268
-
269
- =begin
270
-
271
- # Class for parsing the peptide prophet output files in various ways
272
- class Proph::Pep::Parser < Parser
273
-
274
- # parse_type = "rexml" | "regex"
275
- # regex's are about 50 times faster but are not guaranteed to work
276
- # seq charge hash is keyed on an array -> [sequence,charge]
277
- # @TODO: implement parsing on this with xmlparser
278
- def dta_filenames_by_seq_charge(pep_xml_file, parse_type="rexml")
279
- seq_charge_hash = Hash.new {|hash,key| hash[key] = [] }
280
- case parse_type
281
- when "rexml"
282
- #puts "READING: " + pep_xml_file + " ..."
283
- doc = REXML::Document.new File.new(pep_xml_file)
284
-
285
- ## Create a hash of peptides based on sequence_charge (takes an array)
286
- doc.elements.each("msms_pipeline_analysis/msms_run_summary/search_result") do |result|
287
- pep_charge = result.attributes['assumed_charge']
288
- filename = result.attributes['spectrum']
289
- result.elements.to_a('search_hit').each do |hit|
290
- pep_seq = hit.attributes['peptide']
291
- seq_charge = [pep_seq, pep_charge]
292
- seq_charge_hash[seq_charge] << filename
293
- end
294
- end
295
- seq_charge_hash
296
- when "regex"
297
- #puts "READING: " + pep_xml_file + " ..."
298
- ## Create a hash of peptides based on sequence_charge (takes an array)
299
-
300
- ## file from peptideAtlas:
301
- search_result_regex1 = /<spectrum_query spectrum="(.*\.\d+\.\d+\.\d)".* assumed_charge="(\d)"/o
302
- search_result_regex2 = /<search_result sxpectrum="(.*\.\d+\.\d+\.\d)".* assumed_charge="(\d)"/o
303
- search_hit_regex = /<search_hit .*peptide="(\w+)" /o
304
-
305
- peptide_h = {}
306
- filename = nil
307
- pep_charge = nil
308
- File.open(pep_xml_file).each do |line|
309
- if line =~ search_result_regex1
310
- filename = $1.dup
311
- pep_charge = $2.dup
312
- elsif line =~ search_result_regex2
313
- filename = $1.dup
314
- pep_charge = $2.dup
315
- end
316
- if line =~ search_hit_regex
317
- pep_seq = $1.dup
318
- seq_charge = [pep_seq, pep_charge]
319
- seq_charge_hash[seq_charge] << filename
320
- end
321
- end
322
- end
323
- seq_charge_hash
324
- end
325
-
326
- # drops all search_hits that have peptideprophet probability < min_val
327
- # and drops any search_results that end up with 0 search_hits
328
- def filter_by_min_pep_prob(file, outfile, min_val)
329
- root = root_el(file)
330
-
331
- d_search_hit = nil
332
- d_search_result = nil
333
- root.children.each do |child1|
334
- if child1.name == 'msms_run_summary'
335
- d_search_result = []
336
- child1.children.each do |child2|
337
- if child2.name == 'search_result'
338
- #puts "size before: " + child2.size.to_s
339
- d_search_hit = []
340
- child2.children.each do |child3|
341
- if child3.name == 'search_hit'
342
- child3.children.each do |child4|
343
- if child4.name == 'peptideprophet_result'
344
- if child4.attrs["probability"].to_f < min_val
345
- #puts "dropping probability: #{child4.attrs["probability"]}"
346
- d_search_hit << child3
347
- else
348
- #puts "keeping probability: #{child4.attrs["probability"]}"
349
- end
350
- end
351
- end
352
- end
353
- end
354
- d_search_hit.each do |to_drop|
355
- to_drop.drop
356
- end
357
- #puts "size after: " + child2.size.to_s
358
- if child2.size == 0
359
- d_search_result << child2
360
- end
361
- end
362
- end
363
- d_search_result.each do |to_drop|
364
- to_drop.drop
365
- end
366
- end
367
- end
368
-
369
- File.open(outfile, "w") do |fh|
370
- fh.print root.to_s
371
- end
372
- end
373
- end # Pep::Parser
374
-
375
-
376
- # Class for parsing the '*-prot.xml' files in different ways
377
- class Proph::Prot::Parser < Parser
378
-
379
- attr_accessor :prots
380
- attr_writer :peps
381
-
382
- def initialize
383
- @prots = []
384
- end
385
-
386
- # returns all the peptides from prots
387
- def peps
388
- unless @peps
389
- @peps = []
390
- @prots.each do |prot|
391
- @peps.push(*(prot.peps))
392
- end
393
- end
394
- @peps
395
- end
396
-
397
-
398
- # sets and returns an array of Prot objects
399
- # parse_type = "rexml" | "regex"
400
- def get_prots_and_peps(protxmlfile, prot_prob_cutoff=1.0, pep_init_prob_cutoff=1.0, pep_nsp_prob_cutoff=1.0, parse_type="rexml")
401
- ## ensure these are all floats
402
- (prot_prob_cutoff, pep_init_prob_cutoff, pep_nsp_prob_cutoff) = [prot_prob_cutoff, pep_init_prob_cutoff, pep_nsp_prob_cutoff].collect do |cutoff|
403
- cutoff.to_f
404
- end
405
-
406
- case parse_type
407
- when "rexml"
408
- doc = REXML::Document.new File.new(protxmlfile)
409
- doc.elements.each("protein_summary/protein_group/protein") do |elem|
410
- if elem.attributes['probability'].to_f >= prot_prob_cutoff
411
- prob = elem.attributes['probability'].to_f
412
- name= elem.attributes['protein_name']
413
- curr_prot = Prot.new({:probability => prob, :protein_name => name, :cutoff => prot_prob_cutoff})
414
- peptides = []
415
- elem.elements.to_a('peptide').each do |pep|
416
- if pep.attributes['nsp_adjusted_probability'].to_f >= pep_nsp_prob_cutoff && pep.attributes['initial_probability'].to_f >= pep_init_prob_cutoff
417
- nsp_prob = pep.attributes['nsp_adjusted_probability'].to_f
418
- sequence = pep.attributes['peptide_sequence']
419
- charge = pep.attributes['charge']
420
- pnm = pep.attributes['precursor_neutral_mass']
421
- peptides.push(Pep.new(:probability => nsp_prob, :sequence => sequence, :charge => charge, :precursor_neutral_mass => pnm, :nsp_cutoff => pep_nsp_prob_cutoff))
422
- end
423
- ## Only take proteins with peptides!
424
- if peptides.size > 0
425
- curr_prot.peps = peptides
426
- @prots << curr_prot
427
- end
428
- end
429
- end
430
- end
431
- when "regex"
432
- prot_regex = /<protein protein_name="(.*)?" n_indistinguishable_proteins(.*)/o
433
- prot_prob_regex = /probability="([\d\.]+)"/o
434
- pep_regex = /<peptide peptide_sequence="(\w+)?"(.*)/o
435
- pep_else_regex = /charge="(\d)" initial_probability="([\d\.]+)" nsp_adjusted_probability="([\d\.]+)"/o
436
-
437
- curr_prot = nil
438
- peptides = []
439
- File.open(protxmlfile).each do |line|
440
- if line =~ prot_regex
441
- prob = nil
442
- name = $1.dup
443
- rest = $2
444
- if rest =~ prot_prob_regex
445
- prob = $1.dup
446
- end
447
- if curr_prot
448
- if curr_prot.probability.to_f >= prot_prob_cutoff
449
- if peptides.size > 0
450
- curr_prot.peps = peptides
451
- @prots.push(curr_prot)
452
- end
453
- end
454
- end
455
- curr_prot = Prot.new({:probability => prob, :protein_name => name, :cutoff => prot_prob_cutoff})
456
- peptides = []
457
- end
458
- if line =~ pep_regex
459
- sequence = $1.dup
460
- rest = $2
461
- if rest =~ pep_else_regex
462
- charge = $1
463
- init_prob = $2
464
- nsp_prob = $3
465
- if nsp_prob.to_f >= pep_nsp_prob_cutoff && init_prob.to_f >= pep_init_prob_cutoff
466
- peptides.push(Pep.new(:probability => nsp_prob, :sequence => sequence, :charge => charge, :nsp_cutoff => pep_nsp_prob_cutoff))
467
- end
468
- end
469
- end
470
- # get the last one:
471
- if curr_prot && curr_prot.probability.to_f > prot_prob_cutoff && peptides.size > 0
472
- curr_prot.peps = peptides
473
- @prots.push(curr_prot)
474
- end
475
- end
476
- end
477
- @prots
478
- end
479
-
480
- end # Prot::Parser
481
-
482
- ################ --END
483
-
484
- =end
data/lib/spec_id/proph.rb DELETED
@@ -1,4 +0,0 @@
1
-
2
- #require 'spec_id/proph/prot_summary'
3
- #require 'spec_id/proph/pep_summary'
4
-