mspire 0.4.9 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (255) hide show
  1. data/README +27 -17
  2. data/changelog.txt +31 -62
  3. data/lib/ms/calc.rb +32 -0
  4. data/lib/ms/data/interleaved.rb +60 -0
  5. data/lib/ms/data/lazy_io.rb +73 -0
  6. data/lib/ms/data/lazy_string.rb +15 -0
  7. data/lib/ms/data/simple.rb +59 -0
  8. data/lib/ms/data/transposed.rb +41 -0
  9. data/lib/ms/data.rb +57 -0
  10. data/lib/ms/format/format_error.rb +12 -0
  11. data/lib/ms/spectrum.rb +25 -384
  12. data/lib/ms/support/binary_search.rb +126 -0
  13. data/lib/ms.rb +10 -10
  14. metadata +38 -350
  15. data/INSTALL +0 -58
  16. data/README.rdoc +0 -18
  17. data/Rakefile +0 -330
  18. data/bin/aafreqs.rb +0 -23
  19. data/bin/bioworks2excel.rb +0 -14
  20. data/bin/bioworks_to_pepxml.rb +0 -148
  21. data/bin/bioworks_to_pepxml_gui.rb +0 -225
  22. data/bin/fasta_shaker.rb +0 -5
  23. data/bin/filter_and_validate.rb +0 -5
  24. data/bin/gi2annot.rb +0 -14
  25. data/bin/id_class_anal.rb +0 -112
  26. data/bin/id_precision.rb +0 -172
  27. data/bin/ms_to_lmat.rb +0 -67
  28. data/bin/pepproph_filter.rb +0 -16
  29. data/bin/prob_validate.rb +0 -6
  30. data/bin/protein_summary.rb +0 -6
  31. data/bin/protxml2prots_peps.rb +0 -32
  32. data/bin/raw_to_mzXML.rb +0 -55
  33. data/bin/run_percolator.rb +0 -122
  34. data/bin/sqt_group.rb +0 -26
  35. data/bin/srf_group.rb +0 -27
  36. data/bin/srf_to_sqt.rb +0 -40
  37. data/lib/align/chams.rb +0 -78
  38. data/lib/align.rb +0 -154
  39. data/lib/archive/targz.rb +0 -94
  40. data/lib/bsearch.rb +0 -120
  41. data/lib/core_extensions.rb +0 -16
  42. data/lib/fasta.rb +0 -626
  43. data/lib/gi.rb +0 -124
  44. data/lib/group_by.rb +0 -10
  45. data/lib/index_by.rb +0 -11
  46. data/lib/merge_deep.rb +0 -21
  47. data/lib/ms/converter/mzxml.rb +0 -77
  48. data/lib/ms/gradient_program.rb +0 -170
  49. data/lib/ms/msrun.rb +0 -244
  50. data/lib/ms/msrun_index.rb +0 -108
  51. data/lib/ms/parser/mzdata/axml.rb +0 -67
  52. data/lib/ms/parser/mzdata/dom.rb +0 -175
  53. data/lib/ms/parser/mzdata/libxml.rb +0 -7
  54. data/lib/ms/parser/mzdata.rb +0 -31
  55. data/lib/ms/parser/mzxml/axml.rb +0 -70
  56. data/lib/ms/parser/mzxml/dom.rb +0 -182
  57. data/lib/ms/parser/mzxml/hpricot.rb +0 -253
  58. data/lib/ms/parser/mzxml/libxml.rb +0 -19
  59. data/lib/ms/parser/mzxml/regexp.rb +0 -122
  60. data/lib/ms/parser/mzxml/rexml.rb +0 -72
  61. data/lib/ms/parser/mzxml/xmlparser.rb +0 -248
  62. data/lib/ms/parser/mzxml.rb +0 -282
  63. data/lib/ms/parser.rb +0 -108
  64. data/lib/ms/precursor.rb +0 -25
  65. data/lib/ms/scan.rb +0 -81
  66. data/lib/mspire.rb +0 -4
  67. data/lib/pi_zero.rb +0 -244
  68. data/lib/qvalue.rb +0 -161
  69. data/lib/roc.rb +0 -187
  70. data/lib/sample_enzyme.rb +0 -160
  71. data/lib/scan_i.rb +0 -21
  72. data/lib/spec_id/aa_freqs.rb +0 -170
  73. data/lib/spec_id/bioworks.rb +0 -497
  74. data/lib/spec_id/digestor.rb +0 -138
  75. data/lib/spec_id/mass.rb +0 -179
  76. data/lib/spec_id/parser/proph.rb +0 -335
  77. data/lib/spec_id/precision/filter/cmdline.rb +0 -218
  78. data/lib/spec_id/precision/filter/interactive.rb +0 -134
  79. data/lib/spec_id/precision/filter/output.rb +0 -148
  80. data/lib/spec_id/precision/filter.rb +0 -637
  81. data/lib/spec_id/precision/output.rb +0 -60
  82. data/lib/spec_id/precision/prob/cmdline.rb +0 -160
  83. data/lib/spec_id/precision/prob/output.rb +0 -94
  84. data/lib/spec_id/precision/prob.rb +0 -249
  85. data/lib/spec_id/proph/pep_summary.rb +0 -104
  86. data/lib/spec_id/proph/prot_summary.rb +0 -484
  87. data/lib/spec_id/proph.rb +0 -4
  88. data/lib/spec_id/protein_summary.rb +0 -489
  89. data/lib/spec_id/sequest/params.rb +0 -316
  90. data/lib/spec_id/sequest/pepxml.rb +0 -1458
  91. data/lib/spec_id/sequest.rb +0 -33
  92. data/lib/spec_id/sqt.rb +0 -349
  93. data/lib/spec_id/srf.rb +0 -973
  94. data/lib/spec_id.rb +0 -778
  95. data/lib/spec_id_xml.rb +0 -99
  96. data/lib/transmem/phobius.rb +0 -147
  97. data/lib/transmem/toppred.rb +0 -368
  98. data/lib/transmem.rb +0 -157
  99. data/lib/validator/aa.rb +0 -48
  100. data/lib/validator/aa_est.rb +0 -112
  101. data/lib/validator/background.rb +0 -77
  102. data/lib/validator/bias.rb +0 -95
  103. data/lib/validator/cmdline.rb +0 -431
  104. data/lib/validator/decoy.rb +0 -107
  105. data/lib/validator/digestion_based.rb +0 -70
  106. data/lib/validator/probability.rb +0 -51
  107. data/lib/validator/prot_from_pep.rb +0 -234
  108. data/lib/validator/q_value.rb +0 -32
  109. data/lib/validator/transmem.rb +0 -272
  110. data/lib/validator/true_pos.rb +0 -46
  111. data/lib/validator.rb +0 -197
  112. data/lib/xml.rb +0 -38
  113. data/lib/xml_style_parser.rb +0 -119
  114. data/lib/xmlparser_wrapper.rb +0 -19
  115. data/release_notes.txt +0 -2
  116. data/script/compile_and_plot_smriti_final.rb +0 -97
  117. data/script/create_little_pepxml.rb +0 -61
  118. data/script/degenerate_peptides.rb +0 -47
  119. data/script/estimate_fpr_by_cysteine.rb +0 -226
  120. data/script/extract_gradient_programs.rb +0 -56
  121. data/script/find_cysteine_background.rb +0 -137
  122. data/script/genuine_tps_and_probs.rb +0 -136
  123. data/script/get_apex_values_rexml.rb +0 -44
  124. data/script/histogram_probs.rb +0 -61
  125. data/script/mascot_fix_pepxml.rb +0 -123
  126. data/script/msvis.rb +0 -42
  127. data/script/mzXML2timeIndex.rb +0 -25
  128. data/script/peps_per_bin.rb +0 -67
  129. data/script/prep_dir.rb +0 -121
  130. data/script/simple_protein_digestion.rb +0 -27
  131. data/script/smriti_final_analysis.rb +0 -103
  132. data/script/sqt_to_meta.rb +0 -24
  133. data/script/top_hit_per_scan.rb +0 -67
  134. data/script/toppred_to_yaml.rb +0 -47
  135. data/script/tpp_installer.rb +0 -249
  136. data/specs/align_spec.rb +0 -79
  137. data/specs/bin/bioworks_to_pepxml_spec.rb +0 -79
  138. data/specs/bin/fasta_shaker_spec.rb +0 -259
  139. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +0 -199
  140. data/specs/bin/filter_and_validate_spec.rb +0 -180
  141. data/specs/bin/ms_to_lmat_spec.rb +0 -34
  142. data/specs/bin/prob_validate_spec.rb +0 -86
  143. data/specs/bin/protein_summary_spec.rb +0 -14
  144. data/specs/fasta_spec.rb +0 -354
  145. data/specs/gi_spec.rb +0 -22
  146. data/specs/load_bin_path.rb +0 -7
  147. data/specs/merge_deep_spec.rb +0 -13
  148. data/specs/ms/gradient_program_spec.rb +0 -77
  149. data/specs/ms/msrun_spec.rb +0 -498
  150. data/specs/ms/parser_spec.rb +0 -92
  151. data/specs/ms/spectrum_spec.rb +0 -87
  152. data/specs/pi_zero_spec.rb +0 -115
  153. data/specs/qvalue_spec.rb +0 -39
  154. data/specs/roc_spec.rb +0 -251
  155. data/specs/rspec_autotest.rb +0 -149
  156. data/specs/sample_enzyme_spec.rb +0 -126
  157. data/specs/spec_helper.rb +0 -135
  158. data/specs/spec_id/aa_freqs_spec.rb +0 -52
  159. data/specs/spec_id/bioworks_spec.rb +0 -148
  160. data/specs/spec_id/digestor_spec.rb +0 -75
  161. data/specs/spec_id/precision/filter/cmdline_spec.rb +0 -20
  162. data/specs/spec_id/precision/filter/output_spec.rb +0 -31
  163. data/specs/spec_id/precision/filter_spec.rb +0 -246
  164. data/specs/spec_id/precision/prob_spec.rb +0 -44
  165. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  166. data/specs/spec_id/proph/pep_summary_spec.rb +0 -98
  167. data/specs/spec_id/proph/prot_summary_spec.rb +0 -128
  168. data/specs/spec_id/protein_summary_spec.rb +0 -189
  169. data/specs/spec_id/sequest/params_spec.rb +0 -68
  170. data/specs/spec_id/sequest/pepxml_spec.rb +0 -374
  171. data/specs/spec_id/sequest_spec.rb +0 -38
  172. data/specs/spec_id/sqt_spec.rb +0 -246
  173. data/specs/spec_id/srf_spec.rb +0 -172
  174. data/specs/spec_id/srf_spec_helper.rb +0 -139
  175. data/specs/spec_id_helper.rb +0 -33
  176. data/specs/spec_id_spec.rb +0 -366
  177. data/specs/spec_id_xml_spec.rb +0 -33
  178. data/specs/transmem/phobius_spec.rb +0 -425
  179. data/specs/transmem/toppred_spec.rb +0 -298
  180. data/specs/transmem_spec.rb +0 -60
  181. data/specs/transmem_spec_shared.rb +0 -64
  182. data/specs/validator/aa_est_spec.rb +0 -66
  183. data/specs/validator/aa_spec.rb +0 -40
  184. data/specs/validator/background_spec.rb +0 -67
  185. data/specs/validator/bias_spec.rb +0 -122
  186. data/specs/validator/decoy_spec.rb +0 -51
  187. data/specs/validator/fasta_helper.rb +0 -26
  188. data/specs/validator/prot_from_pep_spec.rb +0 -141
  189. data/specs/validator/transmem_spec.rb +0 -146
  190. data/specs/validator/true_pos_spec.rb +0 -58
  191. data/specs/validator_helper.rb +0 -33
  192. data/specs/xml_spec.rb +0 -12
  193. data/test_files/000_pepxml18_small.xml +0 -206
  194. data/test_files/020a.mzXML.timeIndex +0 -4710
  195. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +0 -3973
  196. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +0 -3872
  197. data/test_files/4-03-03_small-prot.xml +0 -321
  198. data/test_files/4-03-03_small.xml +0 -3876
  199. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  200. data/test_files/bioworks-3.3_10prots.xml +0 -5999
  201. data/test_files/bioworks31.params +0 -77
  202. data/test_files/bioworks32.params +0 -62
  203. data/test_files/bioworks33.params +0 -63
  204. data/test_files/bioworks_single_run_small.xml +0 -7237
  205. data/test_files/bioworks_small.fasta +0 -212
  206. data/test_files/bioworks_small.params +0 -63
  207. data/test_files/bioworks_small.phobius +0 -109
  208. data/test_files/bioworks_small.toppred.out +0 -2847
  209. data/test_files/bioworks_small.xml +0 -5610
  210. data/test_files/bioworks_with_INV_small.xml +0 -3753
  211. data/test_files/bioworks_with_SHUFF_small.xml +0 -2503
  212. data/test_files/corrupted_900.srf +0 -0
  213. data/test_files/head_of_7MIX.srf +0 -0
  214. data/test_files/interact-opd1_mods_small-prot.xml +0 -304
  215. data/test_files/messups.fasta +0 -297
  216. data/test_files/opd1/000.my_answer.100lines.xml +0 -101
  217. data/test_files/opd1/000.tpp_1.2.3.first10.xml +0 -115
  218. data/test_files/opd1/000.tpp_2.9.2.first10.xml +0 -126
  219. data/test_files/opd1/000.v2.1.mzXML.timeIndex +0 -3748
  220. data/test_files/opd1/000_020-prot.png +0 -0
  221. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +0 -62
  222. data/test_files/opd1/000_020_3prots-prot.xml +0 -62
  223. data/test_files/opd1/opd1_cat_inv_small-prot.xml +0 -139
  224. data/test_files/opd1/sequest.3.1.params +0 -77
  225. data/test_files/opd1/sequest.3.2.params +0 -62
  226. data/test_files/opd1/twenty_scans.mzXML +0 -418
  227. data/test_files/opd1/twenty_scans.v2.1.mzXML +0 -382
  228. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  229. data/test_files/opd1/twenty_scans_answ.lmata +0 -9
  230. data/test_files/opd1_020_beginning.RAW +0 -0
  231. data/test_files/opd1_2runs_2mods/data/020.mzData.xml +0 -683
  232. data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +0 -382
  233. data/test_files/opd1_2runs_2mods/data/040.mzData.xml +0 -683
  234. data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +0 -382
  235. data/test_files/opd1_2runs_2mods/data/README.txt +0 -6
  236. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +0 -753
  237. data/test_files/orbitrap_mzData/000_cut.xml +0 -1920
  238. data/test_files/pepproph_small.xml +0 -4691
  239. data/test_files/phobius.small.noheader.txt +0 -50
  240. data/test_files/phobius.small.small.txt +0 -53
  241. data/test_files/s01_anC1_ld020mM.key.txt +0 -25
  242. data/test_files/s01_anC1_ld020mM.meth +0 -0
  243. data/test_files/small.fasta +0 -297
  244. data/test_files/small.sqt +0 -87
  245. data/test_files/smallraw.RAW +0 -0
  246. data/test_files/tf_bioworks2excel.bioXML +0 -14340
  247. data/test_files/tf_bioworks2excel.txt.actual +0 -1035
  248. data/test_files/toppred.small.out +0 -416
  249. data/test_files/toppred.xml.out +0 -318
  250. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +0 -7
  251. data/test_files/validator_hits_separate/bioworks_small_HS.xml +0 -5651
  252. data/test_files/yeast_gly_small-prot.xml +0 -265
  253. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +0 -6
  254. data/test_files/yeast_gly_small.xml +0 -3807
  255. data/test_files/yeast_gly_small2.parentTimes +0 -6
@@ -1,497 +0,0 @@
1
-
2
-
3
- require 'sample_enzyme'
4
- require 'xmlparser'
5
- require 'spec_id'
6
- require 'zlib'
7
- require 'hash_by'
8
- require 'arrayclass'
9
- require 'fasta'
10
-
11
- ## have to pre-declare some guys
12
- module ProteinReferenceable; end
13
- module SpecID; end
14
- module SpecID::Prot; end
15
- module SpecID::Pep; end
16
- module SpecIDXML; end
17
-
18
- class Bioworks
19
- include SpecID
20
-
21
- # Regular expressions
22
- @@bioworksinfo_re = /<bioworksinfo>(.*)<\/bioworksinfo>/o
23
- @@modifications_re = /<modifications>(.*)<\/modifications>/o
24
- @@protein_re = /<protein>/o
25
- @@origfilename_re = /<origfilename>(.*)<\/origfilename>/o
26
- @@origfilepath_re = /<origfilepath>(.*)<\/origfilepath>/o
27
-
28
-
29
- attr_accessor :peps, :prots, :version, :global_filename, :origfilename, :origfilepath
30
- # a string of modifications e.g., "(M* +15.99491) (S@ +14.9322) "
31
- attr_accessor :modifications
32
-
33
- def hi_prob_best ; false end
34
-
35
- # -> prints to file filename1.sqt, filename2.sqt
36
- # @TODO: sqt file output
37
- def to_sqt(params_file)
38
- ## hash peps by filename
39
- ## hash prots by peptide
40
- end
41
-
42
- # returns the number of prots. Raises an Exception if open and closing xml
43
- # tags don't agree
44
- def num_prots(file)
45
- re = /(<protein>)|(<\/protein>)/mo
46
- begin_tags = 0
47
- end_tags = 0
48
- IO.read(file).scan(re) do |match|
49
- if match.first
50
- begin_tags += 1
51
- else
52
- end_tags += 1
53
- end
54
- end
55
- if begin_tags != end_tags
56
- puts "WARNING: #{file} doesn't have matching closing tags"
57
- puts "for the <protein> tag. Returning # of beginning tags."
58
- end
59
- begin_tags
60
- end
61
-
62
-
63
-
64
- # Outputs the bioworks browser excel format (tab delimited) to file.
65
- # Useful if you have more than ~65,000 lines (can export bioworks.xml
66
- # and then convert to excel format).
67
- # Currently, the only things not precisely identical are:
68
- # 1. The peptide hit counts (although the first number [total # peptides] is accurate)
69
- # 2. The precise ordering of peptides within each protein. When dealing with output from multiple runs, peptides with runs with exactly the same scan numbers are not guaranteed to be in the same order.
70
- def to_excel(file)
71
- update_peptide_hit_counts
72
- arr = []
73
- arr << ['', 'Reference', '', '', '', 'Score', 'Coverage', 'MW', 'Accession', 'Peptide (Hits)', '', ' ']
74
- arr << ['', '"File, Scan(s)"', 'Peptide', 'MH+', 'z', 'XC', 'DeltaCn', 'Sp', 'RSp', 'Ions', 'Count', ' ']
75
- @prots.each_with_index do |prot,index|
76
- line_arr = prot.get(:consensus_score, :coverage, :weight, :accession)
77
- if line_arr[1] == "0.0" then line_arr[1] = "" end
78
- line_arr.unshift('', '', '')
79
- line_arr.unshift('"' + prot.reference.split('|')[-1] + '"')
80
- line_arr.unshift(index+1)
81
- pep_hit_counts = prot.peptide_hit_counts
82
- pep_hit_counts_string = pep_hit_counts[0].to_s + ' (' + pep_hit_counts[1..-1].join(" ") + ')'
83
- line_arr.push( pep_hit_counts_string )
84
- line_arr.push("")
85
- line_arr.push(" ")
86
- arr.push( line_arr )
87
- prot.peps.sort_by{|obj| [obj.first_scan.to_i, obj.last_scan.to_i] }.each do |pep|
88
-
89
- pep_arr = pep.get(:sequence, :mass, :charge, :xcorr, :deltacn, :sp, :rsp, :ions)
90
- count = pep.count
91
- if count == '0' then count = "" end
92
- pep_arr.push(count)
93
- pep_arr.push(' ')
94
- pep_arr.unshift('"' + pep.file + '"')
95
- pep_arr.unshift( '' )
96
- arr.push( pep_arr )
97
- end
98
- end
99
- File.open(file, "w") do |out|
100
- arr.each do |line|
101
- out.print(line.join("\t"), "\n")
102
- end
103
- end
104
-
105
- end
106
-
107
- # for output to excel format or other things, updates each protein
108
- # with a peptide hit count array based on ranking of xcorr per dta file
109
- # where each array is the total number of peptide hits, then rank 1,2,3,4,5
110
- # @TODO: Can't get this to check out yet. Perhaps they use normalized
111
- # Xcorr?
112
- def update_peptide_hit_counts
113
- @prots.each do |prot|
114
- prot.peptide_hit_counts[0] = prot.peps.size
115
- end
116
- hash = peps.hash_by(:file)
117
- hash.sort.each do |k,v|
118
- sorted = v.sort_by {|obj| obj.xcorr.to_f }
119
- peps, prot_groups = _uniq_peps_by_sequence_charge(sorted) ## but not on prot!!!!!uniq_peps_by_sequence_charge!
120
-
121
- prot_groups.each_with_index do |prot_group, i|
122
- prot_group.each do |prot|
123
- prot.peptide_hit_counts[i+1] += 1 if prot.peptide_hit_counts[i+1]
124
- end
125
- end
126
- end
127
- end
128
-
129
- # returns (peptides, proteins) where peptides is the unique list of peps
130
- # and proteins is a parallel array of arrays of represented proteins
131
- # note that each pep will contain its original prot it belongs to, even
132
- # though the parallel protein actually represents the proteins it belongs
133
- # to.
134
- # assumes that each peptide points to all its proteins in pep.prots
135
- def _uniq_peps_by_sequence_charge(peps)
136
- new_arr = []
137
- prot_arr = []
138
- index_accounted_for = []
139
- (0...peps.size).each do |i|
140
- next if index_accounted_for.include?(i)
141
- new_arr << peps[i]
142
- prot_arr.push( peps[i].prots )
143
- ((i+1)...peps.size).each do |j|
144
- pep1, pep2 = peps[i], peps[j]
145
- if pep1.sequence == pep2.sequence && pep1.charge == pep2.charge
146
- prot_arr.last.push( *(pep2.prots) )
147
- index_accounted_for << j
148
- end
149
- end
150
- end
151
- return new_arr, prot_arr
152
- end
153
-
154
- def initialize(file=nil)
155
- @peps = nil
156
- if file
157
- @filename = file
158
- parse_xml(file)
159
- #parse_xml_by_xmlparser(file)
160
- end
161
- end
162
-
163
- def parse_xml_by_xmlparser(file)
164
- parser = Bioworks::XMLParser.new
165
- File.open(file) do |fh|
166
- #3.times do fh.gets end ## TEMPFIX
167
- parser.parse(fh)
168
- end
169
- #puts "ETETWSST"
170
- #p parser.prots
171
- @prots = parser.prots
172
- end
173
-
174
- # This is highly specific to Bioworks 3.2 xml export. In other words,
175
- # unless the newlines, etc. are duplicated, this parser will fail! Not
176
- # robust, but it is faster than xmlparser (which is based on the speedy
177
- # expat)
178
- def parse_xml(file)
179
- fh = nil
180
- if file =~ /\.gz$/
181
- fh = Zlib::GzipReader.open(file)
182
- else
183
- fh = File.open(file)
184
- end
185
- @origfilename = get_regex_val(fh, @@origfilename_re)
186
- @origfilepath = get_regex_val(fh, @@origfilepath_re)
187
- if @origfilename
188
- @global_filename = @origfilename.gsub(File.extname(@origfilename), "")
189
- end
190
- @version = get_regex_val(fh, @@bioworksinfo_re)
191
- @modifications = get_regex_val(fh, @@modifications_re)
192
- @prots, @peps = get_prots_from_xml_stream(fh)
193
- fh.close
194
- end
195
-
196
- ## returns proteins and peptides
197
- def get_prots_from_xml_stream(fh)
198
- uniq_pephit_hash = {}
199
- prots = []
200
- while line = fh.gets
201
- if line =~ @@protein_re
202
- prot = Bioworks::Prot.new
203
- prot.bioworks = self
204
- prot.set_from_xml_stream(fh, uniq_pephit_hash)
205
- prots << prot
206
- end
207
- end
208
- [prots, uniq_pephit_hash.values]
209
- end
210
-
211
- # gets the regex and stops (and rewinds if it hits a protein)
212
- # if no regex is found, returns nil and rewinds the filehandle
213
- def get_regex_val(fh, regex)
214
- ver = nil
215
- last_pos = fh.pos
216
- while line = fh.gets
217
- if line =~ regex
218
- ver = $1.dup
219
- break
220
- elsif line =~ @@protein_re
221
- fh.seek last_pos
222
- break
223
- end
224
- last_pos = fh.pos
225
- end
226
- unless ver then fh.rewind end
227
- ver
228
- end
229
-
230
- # Outputs sequest xml files (pepxml) for the trans-proteomics pipeline
231
- def to_pepxml
232
- string = xml_version
233
- string
234
- end
235
-
236
- end
237
-
238
- # Implements fast parsing via XMLParser (wrapper around Expat)
239
- # It is actually slower (about %25 slower) than regular expression parsing
240
- class Bioworks::XMLParser < XMLParser
241
- @@at = '@'
242
- attr_accessor :prots
243
-
244
- def initialize
245
- @current_obj = nil
246
- @current_hash = {}
247
- @current_name = nil
248
- @current_data = nil
249
- @prots = []
250
- end
251
-
252
- def startElement(name, attrs)
253
- case name
254
- when "peptide"
255
- curr_prot = @current_obj
256
- if @current_obj.class == Bioworks::Prot
257
- @current_obj.set_from_xml_hash_xmlparser(@current_hash)
258
- else
259
- curr_prot = @current_obj.prot ## unless previous was a peptide
260
- end
261
- peptide = Bioworks::Pep.new
262
- peptide.prot = curr_prot
263
- curr_prot.peps << peptide
264
- @current_obj = peptide
265
- @current_hash = {}
266
- when "protein"
267
- @current_obj = Bioworks::Prot.new
268
- @current_hash = {}
269
- @prots << @current_obj
270
- else
271
- @current_name = name
272
- end
273
- end
274
-
275
- def endElement(name)
276
- case name
277
- when "peptide"
278
- @current_obj.set_from_hash_given_text(@current_hash)
279
- when "protein"
280
- else
281
- @current_hash[name] = @current_data
282
- end
283
- end
284
-
285
- def character(data)
286
- @current_data = data
287
- end
288
-
289
- end
290
-
291
- module Bioworks::XML
292
- # The regular expression to grab attributes from the bioworks xml format
293
- @@att_re = /<([\w]+)>(.*)<\/[\w]+>/o
294
- end
295
-
296
- class Bioworks::Prot
297
- include ProteinReferenceable
298
- include SpecID::Prot
299
- include Bioworks::XML
300
-
301
- @@end_prot_re = /<\/protein>/o
302
- @@pep_re = /<peptide>/o
303
- @@atts = %w(reference protein_probability consensus_score sf unified_score coverage pi weight accession peps)
304
- attr_accessor :reference, :protein_probability, :consensus_score, :sf, :unified_score, :coverage, :pi, :weight, :accession, :peps, :bioworks, :peptide_hit_counts
305
-
306
- def initialize
307
- @peps = []
308
- @peptide_hit_counts = [0,0,0,0,0,0]
309
- end
310
-
311
-
312
- # returns array of values of the attributes given (as symbols)
313
- def get(*args)
314
- args.collect do |arg|
315
- send(arg)
316
- end
317
- end
318
-
319
- def set_from_xml_stream(fh, uniq_pephit_hash)
320
- hash = {}
321
- @peps = []
322
- while line = fh.gets
323
- if line =~ @@att_re
324
- hash[$1] = $2
325
- elsif line =~ @@pep_re
326
- ## Could do a look ahead to grab the file and sequence to check
327
- ## uniqueness to increase speed here.
328
- pep = Bioworks::Pep.new.set_from_xml_stream(fh)
329
- # normal search results files have a global filename
330
- # while multi-consensus do not
331
- pep[12] ||= bioworks.global_filename
332
-
333
- ## figure out uniqueness
334
- ky = [pep.base_name, pep.first_scan, pep.charge, pep.sequence]
335
- if uniq_pephit_hash.key? ky
336
- pep = uniq_pephit_hash[ky]
337
- else
338
- ## insert the new protein
339
- pep.prots = []
340
- uniq_pephit_hash[ky] = pep
341
- end
342
- pep.prots << self
343
- @peps << pep
344
-
345
- elsif line =~ @@end_prot_re
346
- set_from_xml_hash(hash)
347
- break
348
- else
349
- puts "Bad parsing on: #{line}"
350
- puts "EXITING!"
351
- exit
352
- end
353
- end
354
- self
355
- end
356
-
357
- def set_from_xml_hash_xmlparser(hash)
358
- hash.delete("sequestresults")
359
- hash.delete("bioworksinfo")
360
- hash["sf"] = hash.delete("Sf")
361
- hash["pi"] = hash.delete("pI")
362
- set_from_xml_hash(hash)
363
- end
364
-
365
- # changes the sf to Sf and pI to pi
366
- def set_from_xml_hash(hash)
367
- @reference = hash["reference"]
368
- @protein_probability = hash["protein_probability"].to_f
369
- #@probability = @protein_probability.to_f
370
- @consensus_score = hash["consensus_score"].to_f
371
- @sf = hash["Sf"].to_f
372
- @unified_score = hash["unified_score"].to_f
373
- @coverage = hash["coverage"].to_f
374
- @pi = hash["pI"].to_f
375
- @weight = hash["weight"].to_f
376
- @accession = hash["accession"]
377
- end
378
- end
379
-
380
- Bioworks::Pep = Arrayclass.new( %w(sequence mass deltamass charge xcorr deltacn sp rsp ions count tic prots base_name first_scan last_scan peptide_probability file _num_prots _first_prot aaseq) )
381
- # 0=sequence 1=mass 2=deltamass 3=charge 4=xcorr 5=deltacn 6=sp 7=rsp 8=ions 9=count 10=tic 11=prots 12=base_name 13=first_scan 14=last_scan 15=peptide_probability 16=file 17=_num_prots 18=_first_prot 19=aaseq
382
-
383
- class Bioworks::Pep
384
- include SpecID::Pep
385
- include Bioworks::XML
386
- include SpecIDXML
387
-
388
- @@file_split_first_re = /, /o
389
- @@file_split_second_re = / - /o
390
- #@@att_re = /<(.*)>(.*)<\/(.*)>/
391
- @@end_pep_re = /<\/peptide>/o
392
- @@file_one_scan_re = /(.*), (\d+)/o
393
- @@file_mult_scan_re = /(.*), (\d+) - (\d+)/o
394
- ## NOTE! the mass is really the theoretical MH+!!!!
395
- ## NOTE! ALL values stored as strings, except peptide_probability!
396
-
397
- #ions is a string 'x/y'
398
-
399
- ## other accessors:
400
- def probability ; self[15] end
401
- def mh ; self[1] end
402
-
403
- # This is not a true ppm since it should be divided by the actual mh instead
404
- # of the theoretical (but it is as close as we can get for this object)
405
- def ppm
406
- 1.0e6 * (self[2].abs/self[1])
407
- #1.0e6 * (self.deltamass.abs/self.mh)
408
- end
409
-
410
- # returns array of values of the attributes given (as symbols)
411
- def get(*args)
412
- args.collect do |arg|
413
- send(arg)
414
- end
415
- end
416
-
417
-
418
-
419
-
420
- #def peptide_probability=(prob)
421
- # @peptide_probability = prob.to_f
422
- #end
423
-
424
- # takes arguments in one of two forms:
425
- # 1. file, first_scan[ - last_scan]
426
- # 2. scan[ - last_scan]
427
- # returns base_name, first_scan, last_scan
428
- # base_name will be set for #1, nil for #2
429
- def self.extract_file_info(arg)
430
- last_scan = nil
431
- (base_name, first_scan) = arg.split(@@file_split_first_re)
432
- unless first_scan
433
- first_scan = base_name
434
- base_name = nil
435
- end
436
- first_scan = first_scan.split(@@file_split_second_re)
437
- if first_scan.size > 1
438
- (first_scan, last_scan) = first_scan
439
- else
440
- first_scan = first_scan[0]
441
- last_scan = first_scan
442
- end
443
- [base_name, first_scan, last_scan]
444
- end
445
-
446
- tmp_verb = $VERBOSE
447
- $VERBOSE = nil
448
- def file=(arg)
449
- ## Set these vals by index:
450
- #puts "AERRG: #{arg}"
451
- self[16] = arg
452
- self[12,3] = self.class.extract_file_info(arg)
453
- end
454
- $VERBOSE = tmp_verb
455
-
456
- undef_method :inspect
457
- def inspect
458
- "<Bioworks::Pep sequence: #{sequence}, mass: #{mass}, deltamass: #{deltamass}, charge: #{charge}, xcorr: #{xcorr}, deltacn: #{deltacn}, prots(count):#{prots.size}, base_name: #{base_name}, first_scan: #{first_scan}, last_scan: #{last_scan}, file: #{file}, peptide_probability: #{peptide_probability}, aaseq:#{aaseq}>"
459
-
460
-
461
- end
462
-
463
- # if cast == true, then all the data will be cast
464
- def set_from_hash_given_text(hash)
465
- self[0,11] = [hash["sequence"], hash["mass"].to_f, hash["deltamass"].to_f, hash["charge"].to_i, hash["xcorr"].to_f, hash["deltacn"].to_f, hash["sp"].to_f, hash["rsp"].to_i, hash["ions"], hash["count"].to_i, hash["tic"].to_i]
466
- self.file = hash["file"]
467
- self[15] = hash["peptide_probability"].to_f
468
- self[19] = SpecID::Pep.sequence_to_aaseq(self[0]) ## aaseq
469
- end
470
-
471
- def set_from_xml_stream(fh)
472
- hash = {}
473
- while line = fh.gets
474
- if line =~ @@att_re
475
- #hash[$1] = $2.dup
476
- hash[$1] = $2
477
- #puts "IN PEP: " + $1 + ": " + $2
478
- elsif line =~ @@end_pep_re
479
- set_from_hash_given_text(hash)
480
- #puts "SELF[12]: #{self[12]}"
481
- #puts "SELF[12]: #{self[12]}"
482
- break
483
- else
484
- puts "Bad parsing on: #{line}"
485
- puts "EXITING!"
486
- exit
487
- end
488
- end
489
- self
490
- end
491
-
492
- end
493
-
494
-
495
-
496
-
497
-
@@ -1,138 +0,0 @@
1
-
2
- require 'spec_id/sequest/pepxml'
3
- require 'spec_id/mass'
4
-
5
- # A digestor must be able to respond to these methods:
6
- class Digestor
7
-
8
- # min_mh_mass = min molecular mass of peptide (M+H)+
9
- attr_accessor :min_mh_mass
10
- # max_mh_mass = max molecular mass of peptide (M+H)+
11
- attr_accessor :max_mh_mass
12
- # the number of allowable missed cleavages
13
- attr_accessor :missed_cleavages
14
- # sample_enzyme = SampleEnzyme object
15
- attr_accessor :sample_enzyme
16
- # hash of masses to use (matching keys of Mass::AVG or Mass::MONO)
17
- # In addition, the following keys (as symbols) are recognized.
18
- # add_C_term_protein
19
- # add_C_term_peptide
20
- # add_N_term_protein
21
- # add_N_term_peptide
22
- attr_accessor :mass_hash
23
-
24
- # returns a list of peptide objects created from a digestion of the fasta
25
- # proteins using the sequest params (variable mods not supported yet)
26
- def self.digest(fasta_obj, params_obj)
27
- dig = self.new
28
- dig.set_from_params(params_obj)
29
- dig.create_peptide_hash(fasta_obj).values
30
- end
31
-
32
- def initialize
33
- end
34
-
35
- # takes a parameters object and fills in the necessary values
36
- def set_from_params(params_obj, include_variable_mods=false)
37
- raise NotImplementedError, "no variable mods yet" if include_variable_mods
38
- if params_obj.is_a? Sequest::Params
39
- @sample_enzyme = params_obj.sample_enzyme
40
- @missed_cleavages = params_obj.max_num_internal_cleavage_sites.to_i
41
- (@min_mh_mass, @max_mh_mass) = params_obj.digest_mass_range.split(' ').map {|v| v.to_f }
42
- (static_mods, static_terminal_mods) = Sequest::PepXML::Modifications.new.create_static_mods(params_obj)
43
- monoisotopic_parents = case params_obj.mass_type_parent
44
- when '0' ; false
45
- when '1' ; true
46
- end
47
-
48
- @mass_hash = Mass.add_static_masses(monoisotopic_parents, static_mods, static_terminal_mods)
49
- else
50
- raise ArgumentError, "Don't recognize params object of type: #{params_obj.class}"
51
- end
52
- end
53
-
54
- # aka 'digestion'
55
- # will return a hash of SpecID::GenericPep objects (with 'aaseq' and
56
- # 'prots') hashed by aminoacid sequence. The prot will be the fasta object.
57
- def create_peptide_hash(fasta_obj)
58
- pep_to_prots_hash = {}
59
- pep_objs = nil
60
- pep_aaseqs_ar = fasta_obj.map do |prot|
61
- @sample_enzyme.digest(prot.aaseq, @missed_cleavages)
62
- end
63
- prot_aaseqs = fasta_obj.map {|prot| prot.aaseq }
64
- passing_pep_seqs_ar = limit_sizes(prot_aaseqs, pep_aaseqs_ar, @min_mh_mass, @max_mh_mass, @mass_hash)
65
- #pep_aaseqs_ar.each_with_index do |before_peps,i|
66
- # after_peps = passing_pep_seqs_ar[i]
67
- # puts "before: #{before_peps.size} after: #{after_peps.size}"
68
- # puts "Losing: #{(before_peps - after_peps).inspect}"
69
- # puts "Keeping: #{after_peps.inspect}"
70
- #end
71
- fasta_obj.each_with_index do |prot, i|
72
- pep_seqs = passing_pep_seqs_ar[i]
73
- pep_seqs.each do |pep_seq|
74
- pep_obj =
75
- if pep_to_prots_hash.key?(pep_seq)
76
- pep_to_prots_hash[pep_seq]
77
- else
78
- pep_ob = SpecID::GenericPep.new
79
- pep_ob.prots = []
80
- pep_ob.aaseq = pep_seq
81
- pep_to_prots_hash[pep_seq] = pep_ob
82
- end
83
- pep_obj.prots << prot
84
- end
85
- end
86
- #pep_to_prots_hash.each do |k,v|
87
- # p v.aaseq
88
- # puts v.prots.size
89
- #end
90
- pep_to_prots_hash
91
- end
92
-
93
- # min max are both in terms of the M+H(+)
94
- #
95
- # h_plus:
96
- # On this website:
97
- # http://db.systemsbiology.net:8080/proteomicsToolkit/FragIonServlet.html
98
- # They use the mass of 'H' not 'H+' to find the (M+H)+ weight.
99
- #
100
- # The prot_aaseq is used if the mass_hash contains the keys
101
- # :add_C_term_protein or :add_N_term_protein
102
- #
103
- # mass_hash requires the key :h_plus or :h depending on h_plus option.
104
- # prot_aaseqs is parallel to pep_aaseqs_ar where each is a group of
105
- # peptides matching a protein aaseq
106
- # returns another parallel array of passing peptides per protein
107
- def limit_sizes(prot_aaseqs, pep_aaseqs_ar, min_mh, max_mh, mass_hash, h_plus=false)
108
- if mass_hash.key?(:add_C_term_protein) or mass_hash.key?(:add_N_term_protein)
109
- raise NotImplementedError, "need to add ability to change weights of peptides from the ends of proteins"
110
- else
111
- # figure out how much must be added to each peptide
112
- # include the h2o, the h, and N and C terminal static mods
113
- h_plus_key = h_plus ? :h_plus : :h
114
- extra_add = mass_hash[h_plus_key]
115
- [:add_N_term_peptide, :add_C_term_peptide].each do |sym|
116
- if mass_hash.key?(sym)
117
- extra_add += mass_hash[sym]
118
- end
119
- end
120
- mc = Mass::Calculator.new(mass_hash, extra_add)
121
-
122
- masses_per_group = pep_aaseqs_ar.map do |pep_aaseqs|
123
- mc.masses(pep_aaseqs)
124
- end
125
-
126
- masses_per_group.zip(pep_aaseqs_ar).map do |masses, aaseqs|
127
- passing = []
128
- aaseqs.zip(masses) do |aaseq, mh_plus|
129
- if ( (mh_plus >= min_mh) and (mh_plus <= max_mh) )
130
- passing << aaseq
131
- end
132
- end
133
- passing
134
- end
135
- end
136
- end
137
-
138
- end