mspire 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233) hide show
  1. data/INSTALL +1 -0
  2. data/README +25 -0
  3. data/Rakefile +129 -40
  4. data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
  5. data/bin/bioworks_to_pepxml.rb +1 -0
  6. data/bin/fasta_shaker.rb +1 -96
  7. data/bin/filter_and_validate.rb +5 -0
  8. data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
  9. data/bin/prob_validate.rb +6 -0
  10. data/bin/raw_to_mzXML.rb +2 -2
  11. data/bin/srf_group.rb +1 -0
  12. data/bin/srf_to_sqt.rb +40 -0
  13. data/changelog.txt +68 -0
  14. data/lib/align/chams.rb +6 -6
  15. data/lib/align.rb +4 -3
  16. data/lib/bsearch.rb +120 -0
  17. data/lib/fasta.rb +318 -86
  18. data/lib/group_by.rb +10 -0
  19. data/lib/index_by.rb +11 -0
  20. data/lib/merge_deep.rb +21 -0
  21. data/lib/{spec → ms/converter}/mzxml.rb +77 -109
  22. data/lib/ms/gradient_program.rb +171 -0
  23. data/lib/ms/msrun.rb +209 -0
  24. data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
  25. data/lib/ms/parser/mzdata/axml.rb +12 -0
  26. data/lib/ms/parser/mzdata/dom.rb +160 -0
  27. data/lib/ms/parser/mzdata/libxml.rb +7 -0
  28. data/lib/ms/parser/mzdata.rb +25 -0
  29. data/lib/ms/parser/mzxml/axml.rb +11 -0
  30. data/lib/ms/parser/mzxml/dom.rb +159 -0
  31. data/lib/ms/parser/mzxml/hpricot.rb +253 -0
  32. data/lib/ms/parser/mzxml/libxml.rb +15 -0
  33. data/lib/ms/parser/mzxml/regexp.rb +122 -0
  34. data/lib/ms/parser/mzxml/rexml.rb +72 -0
  35. data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
  36. data/lib/ms/parser/mzxml.rb +175 -0
  37. data/lib/ms/parser.rb +108 -0
  38. data/lib/ms/precursor.rb +10 -0
  39. data/lib/ms/scan.rb +81 -0
  40. data/lib/ms/spectrum.rb +193 -0
  41. data/lib/ms.rb +10 -0
  42. data/lib/mspire.rb +4 -0
  43. data/lib/roc.rb +61 -1
  44. data/lib/sample_enzyme.rb +31 -8
  45. data/lib/scan_i.rb +21 -0
  46. data/lib/spec_id/aa_freqs.rb +7 -3
  47. data/lib/spec_id/bioworks.rb +20 -14
  48. data/lib/spec_id/digestor.rb +139 -0
  49. data/lib/spec_id/mass.rb +116 -0
  50. data/lib/spec_id/parser/proph.rb +236 -0
  51. data/lib/spec_id/precision/filter/cmdline.rb +209 -0
  52. data/lib/spec_id/precision/filter/interactive.rb +134 -0
  53. data/lib/spec_id/precision/filter/output.rb +147 -0
  54. data/lib/spec_id/precision/filter.rb +623 -0
  55. data/lib/spec_id/precision/output.rb +60 -0
  56. data/lib/spec_id/precision/prob/cmdline.rb +139 -0
  57. data/lib/spec_id/precision/prob/output.rb +88 -0
  58. data/lib/spec_id/precision/prob.rb +171 -0
  59. data/lib/spec_id/proph/pep_summary.rb +92 -0
  60. data/lib/spec_id/proph/prot_summary.rb +484 -0
  61. data/lib/spec_id/proph.rb +2 -466
  62. data/lib/spec_id/protein_summary.rb +2 -2
  63. data/lib/spec_id/sequest/params.rb +316 -0
  64. data/lib/spec_id/sequest/pepxml.rb +1513 -0
  65. data/lib/spec_id/sequest.rb +2 -1672
  66. data/lib/spec_id/srf.rb +445 -177
  67. data/lib/spec_id.rb +183 -95
  68. data/lib/spec_id_xml.rb +8 -10
  69. data/lib/transmem/phobius.rb +147 -0
  70. data/lib/transmem/toppred.rb +368 -0
  71. data/lib/transmem.rb +157 -0
  72. data/lib/validator/aa.rb +135 -0
  73. data/lib/validator/background.rb +73 -0
  74. data/lib/validator/bias.rb +95 -0
  75. data/lib/validator/cmdline.rb +260 -0
  76. data/lib/validator/decoy.rb +94 -0
  77. data/lib/validator/digestion_based.rb +69 -0
  78. data/lib/validator/probability.rb +48 -0
  79. data/lib/validator/prot_from_pep.rb +234 -0
  80. data/lib/validator/transmem.rb +272 -0
  81. data/lib/validator/true_pos.rb +46 -0
  82. data/lib/validator.rb +214 -0
  83. data/lib/xml.rb +38 -0
  84. data/lib/xml_style_parser.rb +105 -0
  85. data/lib/xmlparser_wrapper.rb +19 -0
  86. data/script/compile_and_plot_smriti_final.rb +97 -0
  87. data/script/extract_gradient_programs.rb +56 -0
  88. data/script/get_apex_values_rexml.rb +44 -0
  89. data/script/mzXML2timeIndex.rb +1 -1
  90. data/script/smriti_final_analysis.rb +103 -0
  91. data/script/toppred_to_yaml.rb +47 -0
  92. data/script/tpp_installer.rb +1 -1
  93. data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
  94. data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
  95. data/specs/bin/fasta_shaker_spec.rb +259 -0
  96. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
  97. data/specs/bin/filter_and_validate_spec.rb +124 -0
  98. data/specs/bin/ms_to_lmat_spec.rb +34 -0
  99. data/specs/bin/prob_validate_spec.rb +62 -0
  100. data/specs/bin/protein_summary_spec.rb +10 -0
  101. data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
  102. data/specs/gi_spec.rb +22 -0
  103. data/specs/load_bin_path.rb +7 -0
  104. data/specs/merge_deep_spec.rb +13 -0
  105. data/specs/ms/gradient_program_spec.rb +77 -0
  106. data/specs/ms/msrun_spec.rb +455 -0
  107. data/specs/ms/parser_spec.rb +92 -0
  108. data/specs/ms/spectrum_spec.rb +89 -0
  109. data/specs/roc_spec.rb +251 -0
  110. data/specs/rspec_autotest.rb +149 -0
  111. data/specs/sample_enzyme_spec.rb +41 -0
  112. data/specs/spec_helper.rb +133 -0
  113. data/specs/spec_id/aa_freqs_spec.rb +52 -0
  114. data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
  115. data/specs/spec_id/digestor_spec.rb +75 -0
  116. data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
  117. data/specs/spec_id/precision/filter/output_spec.rb +31 -0
  118. data/specs/spec_id/precision/filter_spec.rb +243 -0
  119. data/specs/spec_id/precision/prob_spec.rb +111 -0
  120. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  121. data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
  122. data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
  123. data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
  124. data/specs/spec_id/sequest/params_spec.rb +68 -0
  125. data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
  126. data/specs/spec_id/sqt_spec.rb +138 -0
  127. data/specs/spec_id/srf_spec.rb +209 -0
  128. data/specs/spec_id/srf_spec_helper.rb +302 -0
  129. data/specs/spec_id_helper.rb +33 -0
  130. data/specs/spec_id_spec.rb +361 -0
  131. data/specs/spec_id_xml_spec.rb +33 -0
  132. data/specs/transmem/phobius_spec.rb +423 -0
  133. data/specs/transmem/toppred_spec.rb +297 -0
  134. data/specs/transmem_spec.rb +60 -0
  135. data/specs/transmem_spec_shared.rb +64 -0
  136. data/specs/validator/aa_spec.rb +107 -0
  137. data/specs/validator/background_spec.rb +51 -0
  138. data/specs/validator/bias_spec.rb +146 -0
  139. data/specs/validator/decoy_spec.rb +51 -0
  140. data/specs/validator/fasta_helper.rb +26 -0
  141. data/specs/validator/prot_from_pep_spec.rb +141 -0
  142. data/specs/validator/transmem_spec.rb +145 -0
  143. data/specs/validator/true_pos_spec.rb +58 -0
  144. data/specs/validator_helper.rb +33 -0
  145. data/specs/xml_spec.rb +12 -0
  146. data/test_files/000_pepxml18_small.xml +206 -0
  147. data/test_files/020a.mzXML.timeIndex +4710 -0
  148. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
  149. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
  150. data/test_files/4-03-03_small-prot.xml +321 -0
  151. data/test_files/4-03-03_small.xml +3876 -0
  152. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  153. data/test_files/bioworks-3.3_10prots.xml +5999 -0
  154. data/test_files/bioworks31.params +77 -0
  155. data/test_files/bioworks32.params +62 -0
  156. data/test_files/bioworks33.params +63 -0
  157. data/test_files/bioworks_single_run_small.xml +7237 -0
  158. data/test_files/bioworks_small.fasta +212 -0
  159. data/test_files/bioworks_small.params +63 -0
  160. data/test_files/bioworks_small.phobius +109 -0
  161. data/test_files/bioworks_small.toppred.out +2847 -0
  162. data/test_files/bioworks_small.xml +5610 -0
  163. data/test_files/bioworks_with_INV_small.xml +3753 -0
  164. data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
  165. data/test_files/corrupted_900.srf +0 -0
  166. data/test_files/head_of_7MIX.srf +0 -0
  167. data/test_files/interact-opd1_mods_small-prot.xml +304 -0
  168. data/test_files/messups.fasta +297 -0
  169. data/test_files/opd1/000.my_answer.100lines.xml +101 -0
  170. data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
  171. data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
  172. data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
  173. data/test_files/opd1/000_020-prot.png +0 -0
  174. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
  175. data/test_files/opd1/000_020_3prots-prot.xml +62 -0
  176. data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
  177. data/test_files/opd1/sequest.3.1.params +77 -0
  178. data/test_files/opd1/sequest.3.2.params +62 -0
  179. data/test_files/opd1/twenty_scans.mzXML +418 -0
  180. data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
  181. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  182. data/test_files/opd1/twenty_scans_answ.lmata +9 -0
  183. data/test_files/opd1_020_beginning.RAW +0 -0
  184. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
  185. data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
  186. data/test_files/pepproph_small.xml +4691 -0
  187. data/test_files/phobius.small.noheader.txt +50 -0
  188. data/test_files/phobius.small.small.txt +53 -0
  189. data/test_files/s01_anC1_ld020mM.key.txt +25 -0
  190. data/test_files/s01_anC1_ld020mM.meth +0 -0
  191. data/test_files/small.fasta +297 -0
  192. data/test_files/smallraw.RAW +0 -0
  193. data/test_files/tf_bioworks2excel.bioXML +14340 -0
  194. data/test_files/tf_bioworks2excel.txt.actual +1035 -0
  195. data/test_files/toppred.small.out +416 -0
  196. data/test_files/toppred.xml.out +318 -0
  197. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
  198. data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
  199. data/test_files/yeast_gly_small-prot.xml +265 -0
  200. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
  201. data/test_files/yeast_gly_small.xml +3807 -0
  202. data/test_files/yeast_gly_small2.parentTimes +6 -0
  203. metadata +273 -57
  204. data/bin/filter.rb +0 -6
  205. data/bin/precision.rb +0 -5
  206. data/lib/spec/mzdata/parser.rb +0 -108
  207. data/lib/spec/mzdata.rb +0 -48
  208. data/lib/spec/mzxml/parser.rb +0 -449
  209. data/lib/spec/scan.rb +0 -55
  210. data/lib/spec_id/filter.rb +0 -797
  211. data/lib/spec_id/precision.rb +0 -421
  212. data/lib/toppred.rb +0 -18
  213. data/script/filter-peps.rb +0 -164
  214. data/test/tc_aa_freqs.rb +0 -59
  215. data/test/tc_fasta_shaker.rb +0 -149
  216. data/test/tc_filter.rb +0 -203
  217. data/test/tc_filter_peps.rb +0 -46
  218. data/test/tc_gi.rb +0 -17
  219. data/test/tc_id_class_anal.rb +0 -70
  220. data/test/tc_id_precision.rb +0 -89
  221. data/test/tc_msrun.rb +0 -88
  222. data/test/tc_mzxml.rb +0 -88
  223. data/test/tc_mzxml_to_lmat.rb +0 -36
  224. data/test/tc_peptide_parent_times.rb +0 -27
  225. data/test/tc_precision.rb +0 -60
  226. data/test/tc_roc.rb +0 -166
  227. data/test/tc_sample_enzyme.rb +0 -32
  228. data/test/tc_scan.rb +0 -26
  229. data/test/tc_sequest.rb +0 -336
  230. data/test/tc_spec.rb +0 -78
  231. data/test/tc_spec_id.rb +0 -201
  232. data/test/tc_spec_id_xml.rb +0 -36
  233. data/test/tc_srf.rb +0 -262
data/lib/ms/parser.rb ADDED
@@ -0,0 +1,108 @@
1
+ require 'xml_style_parser'
2
+
3
+ module MS; end
4
+
5
+ module MS::Parser
6
+ # inherits attr_accessor :method, :default_parser, and parse (which should
7
+ # be overridden)
8
+ include XMLStyleParser
9
+
10
+ Mzxml_regexp = /http:\/\/sashimi.sourceforge.net\/schema(_revision)?\/([\w\d_\.]+)/o
11
+ # 'http://sashimi.sourceforge.net/schema/MsXML.xsd' # version 1
12
+ # 'http://sashimi.sourceforge.net/schema_revision/mzXML_X.X' # others
13
+ Mzdata_regexp = /<mzData.*version="([\d\.]+)"/m
14
+
15
+ attr_accessor :version
16
+
17
+ ############################################
18
+ # POINTERS (to create META MAGIC)
19
+ ############################################
20
+
21
+ @@filetypes_to_upcase = {
22
+ :mzxml => 'MzXML',
23
+ :mzdata => 'MzData',
24
+ :mzml => 'MzML',
25
+ :raw => 'Raw',
26
+ }
27
+
28
+ @@filetypes_to_require = {}
29
+ @@filetypes_to_constant = {}
30
+
31
+ abbrevs = Dir.chdir(File.dirname(__FILE__) + "/parser") do
32
+ Dir["*.rb"].map {|f| f.sub(/\.rb$/,'') }
33
+ end
34
+ abbrevs.each do |abbr|
35
+ abb = abbr.to_sym
36
+ req = ['ms', 'parser', abbr].join("/")
37
+ @@filetypes_to_require[abb] = req
38
+ @@filetypes_to_constant[abb] = ['MS', 'Parser', @@filetypes_to_upcase[abb]].join("::")
39
+ end
40
+
41
+ ############################################
42
+ # END POINTERS
43
+ ############################################
44
+
45
# Determines the filetype and version of a mass spec data file.
#
# fh_or_filename is either a handle positioned at the beginning (anything
# responding to read, gets and rewind, e.g. an IO or StringIO) or the name
# of a file to open.
#
# Returns [filetype, version]:
#   [:raw, nil]        when bytes 2, 4, ..., 16 of the file spell 'Finnigan'
#   [:mzxml, version]  when a line matches Mzxml_regexp
#   [:mzdata, version] when a line matches Mzdata_regexp
# Returns nil if filetype and version could not be determined.
#
# A handle passed in is always rewound to the beginning before returning
# (including when a RAW file is detected).  Aborts if an mzXML schema line
# is found but its version cannot be recognized.
def self.filetype_and_version(fh_or_filename)
  handle_like = [:read, :gets, :rewind].all? { |meth| fh_or_filename.respond_to?(meth) }
  unless handle_like
    return File.open(fh_or_filename) { |fh| filetype_and_version(fh) }
  end
  fh = fh_or_filename

  # Test for RAW file.  The unpack template needs at least 17 bytes; empty
  # or very short files simply cannot be RAW files.
  header = fh.read(18)
  fh.rewind
  if header && header.size >= 17 && header.unpack('@2axaxaxaxaxaxaxa').join == 'Finnigan'
    return [:raw, nil]
  end

  found = nil
  while (line = fh.gets)
    found =
      case line
      when Mzxml_regexp
        schema = $2.dup
        case schema
        when /mzXML_([\d\.]+)/
          [:mzxml, $1.dup]
        when /MsXML/
          # 'http://sashimi.sourceforge.net/schema/MsXML.xsd' is version 1
          [:mzxml, '1.0']
        else
          abort "Cannot determine mzXML version!"
        end
      when Mzdata_regexp
        [:mzdata, $1.dup]
      end
    break if found
  end
  fh.rewind
  found
end
86
+
87
# Builds a parser for the given kind of file.
#
# filetype_version:: either a two element array [type, version] or an
#                    example file (anything filetype_and_version accepts)
#                    from which type and version are detected.
# parse_type::       the information to be gleaned (as symbol).
def self.new(filetype_version, parse_type)
  type_and_version =
    if filetype_version.is_a?(Array)
      filetype_version
    else
      filetype_and_version(filetype_version)
    end
  require_and_create_parser(type_and_version, parse_type)
end
95
+
96
+ private
97
+
98
# Requires the parser library registered for the filetype and instantiates
# the matching parser class.
#
# filetype_version:: [filetype, version] (e.g. as returned by
#                    filetype_and_version); nil is treated as an unknown
#                    filetype.
# parse_type::       the information to be gleaned (as symbol).
#
# Returns a working parser (parser_class.new(parse_type, version)).
# Raises ArgumentError if no parser library or class name is registered for
# the filetype, rather than failing inside require with a TypeError.
def self.require_and_create_parser(filetype_version, parse_type)
  (filetype, version) = filetype_version
  library = @@filetypes_to_require[filetype]
  class_name = @@filetypes_to_upcase[filetype]
  unless library && class_name
    raise ArgumentError, "no parser available for filetype: #{filetype.inspect}"
  end
  require library
  MS::Parser.const_get(class_name).new(parse_type, version)
end
107
+
108
+ end
@@ -0,0 +1,10 @@
1
+ require 'array_class'
2
+
3
+ module MS; end
4
+
5
+ # parent == spectrumRef references a scan
6
+ # 0 1 2 3 4
7
+ MS::Precursor = ArrayClass.new(%w(mz intensity parent ms_level charge_states))
8
+
9
+ class MS::Precursor
10
+ end
data/lib/ms/scan.rb ADDED
@@ -0,0 +1,81 @@
1
+ require 'array_class'
2
+ require 'ms/precursor'
3
+
4
+ module MS ; end
5
+
6
+ # 0 1 2 3 4 5 6
7
+ MS::Scan = ArrayClass.new( %w(num ms_level time start_mz end_mz precursors spectrum) )
8
+
9
+ # time in seconds
10
+ # everything else in float/int or as array (precursors)
11
+
12
+ class MS::Scan
13
+ #@@order = %w(num ms_level time start_mz end_mz prec_mz prec_inten parent spectrum)
14
+ #attr_accessor :num, :ms_level, :time, :start_mz, :end_mz, :prec_mz, :prec_inten, :parent, :spectrum
15
+
16
+ #def initialize(ar=nil)
17
+ # @@order.zip(ar) do |x,v|
18
+ # send((x+'=').to_sym, v)
19
+ # end
20
+ #end
21
+
22
+ def to_s
23
+ "<Scan num=#{num} ms_level=#{ms_level} time=#{time}>"
24
+ end
25
+
26
+ undef_method :inspect
27
# Compact representation of the scan: the scalar attributes that are set,
# then the precursors, then the number of m/z values in the spectrum (blank
# when there is no spectrum).  All fields are separated by ", " so the
# output does not run attribute values into the next attribute name.
def inspect
  fields = %w(num ms_level time start_mz end_mz).map do |att|
    val = send(att.to_sym)
    "@#{att}=#{val}" if val
  end.compact
  spectrum_size = spectrum ? spectrum.mz.size : nil
  fields << "@precursors=#{precursors.inspect}"
  fields << "@spectrum=size:#{spectrum_size}"
  "<MS::Scan:#{__id__} #{fields.join(', ')}>"
end
45
+
46
# returns the string (space delimited): "ms_level num time [prec_mz prec_inten]"
# The precursor m/z is appended when the scan has at least one precursor
# and the precursor intensity only when it is set.  Scans whose precursors
# are nil or empty yield just "ms_level num time".
def to_index_file_string
  fields = [ms_level, num, time]
  first_precursor = precursors && precursors.first
  if first_precursor
    fields << first_precursor.mz
    # MS::Precursor names this attribute 'intensity' (there is no 'inten')
    inten = first_precursor.intensity
    fields << inten if inten
  end
  fields.join(" ")
end
53
+
54
# Sets the parent attribute on every non-nil scan in scans.
# A stack of candidate parents is kept (bottom entry nil): when the ms level
# rises relative to the previous scan, the previous scan is pushed as the
# new parent; when it falls, one entry is popped per level dropped.
# (e.g. level 1 = no parent; level 2 = previous level 1 scan, etc.)
# nil entries in scans are skipped.
def self.add_parent_scan(scans)
  # start out at the ms level of the first real scan (defaults to 1)
  last_level = 1
  scans.each do |scan|
    next unless scan
    last_level = scan.ms_level
    break
  end

  ancestors = [nil]
  last_scan = nil
  scans.each do |scan|
    next unless scan ## the first one is nil, (others?)
    level = scan.ms_level
    if level > last_level
      ancestors.unshift(last_scan)
    elsif level < last_level
      (last_level - level).times { ancestors.shift }
    end
    scan.parent = ancestors.first
    last_level = level
    last_scan = scan
  end
end
78
+
79
+ end
80
+
81
+
@@ -0,0 +1,193 @@
1
+ require 'base64'
2
+ require 'bsearch'
3
+ require 'ms'
4
+
5
+ class MS::Spectrum
6
+
7
+ Unpack_network_float = 'g*'
8
+ Unpack_network_double = 'G*'
9
+ Unpack_little_endian_float = 'e*'
10
+ Unpack_little_endian_double = 'E*'
11
+
12
+ # m/z's
13
+ attr_accessor :mz
14
+ # intensities
15
+ attr_accessor :intensity
16
+
17
+ def initialize(mz=[], intensity=[])
18
+ @mz = mz
19
+ @intensity = intensity
20
+ end
21
+
22
+
23
# true when there is at least one m/z value and the first one is Numeric
def has_mz_data?
  return false if @mz.empty?
  @mz.first.is_a?(Numeric)
end
26
+
27
# true when there is at least one intensity and the first one is Numeric
def has_intensity_data?
  return false if @intensity.empty?
  @intensity.first.is_a?(Numeric)
end
30
+
31
# Decodes a base64 string of packed floating point numbers into an array.
#
# string::        the base64 encoded data
# precision::     32 (single precision floats) or 64 (doubles)
# network_order:: true for network (big endian) byte order, otherwise the
#                 data is unpacked as little endian
#
# Returns an array of Floats.
# Raises ArgumentError for any other precision (previously this surfaced
# as a TypeError from unpack being handed nil).
def self.base64_to_array(string, precision=32, network_order=true)
  unpack_code =
    case precision
    when 32
      network_order ? Unpack_network_float : Unpack_little_endian_float
    when 64
      network_order ? Unpack_network_double : Unpack_little_endian_double
    else
      raise ArgumentError, "precision must be 32 or 64 (got #{precision.inspect})"
    end
  Base64.decode64(string).unpack(unpack_code)
end
50
+
51
# Creates a spectrum from two separately encoded base64 strings, one holding
# the m/z values and one the intensities, each with its own precision and
# byte order (see base64_to_array for the meaning of those arguments).
def self.from_base64_pair(mz_string, mz_precision, mz_network_order, inten_string, inten_precision, inten_network_order)
  decoded = [
    [mz_string, mz_precision, mz_network_order],
    [inten_string, inten_precision, inten_network_order],
  ].map { |args| base64_to_array(*args) }
  new(*decoded)
end
56
+
57
# Creates a spectrum from a base64 peaks string in which m/z and intensity
# values alternate (mz, intensity, mz, intensity, ...).
# precision and network_order are handed to base64_to_array.
# Returns the new spectrum object.
def self.from_base64_peaks(string, precision=32, network_order=true)
  mz = []
  intensity = []
  base64_to_array(string, precision, network_order).each_slice(2) do |pair|
    mz << pair[0]
    # a trailing value without a partner is kept as an m/z only
    intensity << pair[1] if pair.size == 2
  end
  new(mz, intensity)
end
76
+
77
+
78
+
79
+
80
+ ######
81
+ # NOT REALLY USING RIGHT NOW:
82
+ ######
83
+
84
# takes a base64 peaks string and returns an array of [m/z,intens] doublets
# The string is decoded as network ordered (as mzXML is) with the class
# level base64_to_array; the method previously called here,
# base64_peaks_to_array, is not defined in this class.
# A trailing m/z without an intensity yields the doublet [m/z, nil].
def base64_peaks_to_pairs(string, precision=32)
  data = self.class.base64_to_array(string, precision)
  data.each_slice(2).map { |mz, inten| [mz, inten] }
end
100
+
101
# Returns the index of an m/z value matching the argument, or nil.
# The argument may be given less precisely than the stored m/z: a stored
# value matches when it equals the argument after rounding to the
# argument's precision (see equal_after_rounding?).
# The lower boundary found by binary search is tried first (exactly, then
# after rounding); after that the neighbors above and then below are
# examined until they lie 2 or more integers away from the argument.
def index(mz)
  start = @mz.bsearch_lower_boundary { |x| x <=> mz }
  return start if @mz[start] == mz

  target = mz.to_f
  count = @mz.size
  return start if (start < count) && equal_after_rounding?(@mz[start], target)

  # look upward from the boundary
  (start + 1).upto(count - 1) do |i|
    candidate = @mz[i]
    break if candidate.ceil - target.ceil >= 2
    return i if equal_after_rounding?(candidate, target)
  end

  # then downward
  (start - 1).downto(0) do |i|
    candidate = @mz[i]
    break if target.floor - candidate.floor >= 2
    return i if equal_after_rounding?(candidate, target)
  end

  nil
end
155
+
156
# Looks up mz with the index method and returns the intensity stored at
# that position, or nil when index finds no matching m/z.
def intensity_at_mz(mz)
  position = index(mz)
  position ? @intensity[position] : nil
end
164
+
165
# precise should be a float
# less_precise should be a float
# true if precise, rounded to the number of decimal places that
# less_precise carries (see precision_as_neg_int), equals less_precise
def equal_after_rounding?(precise, less_precise)
  scale = precision_as_neg_int(less_precise)
  rounded_precise = (precise * scale).round
  rounded_less_precise = (less_precise * scale).round
  rounded_precise == rounded_less_precise
end
176
+
177
# Returns the power of ten describing the precision of float:
# 1 for ones place, 10 for tenths, 100 for hundredths, ...
# It is the smallest power of ten for which float times that power lies
# within 1e-6 of a whole number.
def precision_as_neg_int(float)
  scale = 1
  scale *= 10 until ((float * scale) - (float * scale).round).abs <= 1e-6
  scale
end
191
+
192
+ end
193
+
data/lib/ms.rb ADDED
@@ -0,0 +1,10 @@
1
+
2
+
3
+ module MS
4
+ attr_accessor :spectra
5
+
6
+ # should
7
+ def new(file=nil)
8
+ end
9
+
10
+ end
data/lib/mspire.rb ADDED
@@ -0,0 +1,4 @@
1
+
2
+ module Mspire
3
+ Version = '0.3.0'
4
+ end
data/lib/roc.rb CHANGED
@@ -42,10 +42,19 @@ class ROC
42
42
  area
43
43
  end
44
44
 
45
# Merges a list of true positive values and a list of false positive values
# into one sorted list of doublets [[val, boolean],...] (true for values
# from tps, false for values from fps).  For equal values the true
# doublets sort ahead of the false ones.
def separate_to_doublets(tps, fps)
  # tag with 0/1 (rather than true/false) so that the doublets are sortable
  tagged = tps.map { |val| [val, 0] } + fps.map { |val| [val, 1] }
  tagged.sort.map { |val, tag| [val, tag == 0] }
end
53
+
45
54
  # given an array of doublets where each doublet is a value and a boolean,
46
55
  # sorts the list and divides it into two arrays (tps, fps) of the values.
47
56
  # The output can then be fed into many of the other routines.
48
- def prep_list(list)
57
+ def doublets_to_separate(list)
49
58
  tp = []; fp = []
50
59
  list.each do |dbl|
51
60
  if dbl[1]
@@ -85,6 +94,27 @@ class ROC
85
94
  end
86
95
  return x, y
87
96
  end
97
+
98
# takes previously sorted doublets [value, boolean]
# Returns [num_hits_array, ppv_array]: walking down the list, a data point
# is emitted at the last doublet of each run of equal values, holding the
# number of doublets seen so far and the fraction of them that were true.
def numhits_and_ppv(doublets)
  hit_counts = []
  ppvs = []
  true_count = 0
  seen = 0
  final_index = doublets.size - 1
  doublets.each_with_index do |(value, is_true), i|
    true_count += 1 if is_true
    seen += 1
    # only report once a run of tied values is complete
    next unless i == final_index || value != doublets[i + 1][0]
    hit_counts << seen
    ppvs << true_count.to_f / seen
  end
  [hit_counts, ppvs]
end
116
+
117
+
88
118
  end
89
119
 
90
120
  # For calculating precision given lists of hits and decoy hits. The hits are
@@ -124,4 +154,34 @@ class DecoyROC < ROC
124
154
  [num_hits_ar, num_tps_ar, ppv_ar]
125
155
  end
126
156
 
157
# returns [num_hits, precision] as a function of num hits.  decoy hits are
# seen merely as indicators of the number of false hits in the dataset.
# For each hit, the decoys with a value <= that hit are counted as false
# positives; a data point is emitted at the last hit of each run of equal
# values.  This is the same algorithm as pred_and_tps_and_ppv, just
# without the unneeded calculations.
# NOTE(review): both lists look like they are expected in ascending
# order -- confirm against callers.
def pred_and_ppv(hits, decoy_hits)
  decoys_passed = 0
  hit_counts = []
  precisions = []

  hits.each_with_index do |hit, idx|
    while decoys_passed < decoy_hits.size && hit >= decoy_hits[decoys_passed]
      decoys_passed += 1
    end
    # wait for the end of a run of tied hits
    next if hit == hits[idx + 1]

    total = idx + 1
    hit_counts << total
    precisions << ((total - decoys_passed).to_f / total)
  end

  [hit_counts, precisions]
end
186
+
127
187
  end
data/lib/sample_enzyme.rb CHANGED
@@ -1,9 +1,12 @@
1
1
 
2
2
  module SpecIDXML; end
3
3
 
4
- require 'spec_id_xml'
5
4
  require 'strscan'
6
5
 
6
+ require 'spec_id_xml'
7
+ require 'spec_id'
8
+
9
+
7
10
  class SampleEnzyme
8
11
  include SpecIDXML
9
12
 
@@ -18,6 +21,7 @@ class SampleEnzyme
18
21
  # Currently, recognize:
19
22
  # trypsin
20
23
  # For other enzymes, you must set :cut, :no_cut, :name, and :sense
24
+ # will yield the object if you want to set the values that way
21
25
  def initialize(name=nil)
22
26
  @sense = nil
23
27
  @cut = nil
@@ -25,11 +29,14 @@ class SampleEnzyme
25
29
  @name = name
26
30
  if @name
27
31
  # set the values if we recognize this name
28
- send(@name.to_sym)
32
+ send("set_#{@name}".to_sym)
33
+ end
34
+ if block_given?
35
+ yield(self)
29
36
  end
30
37
  end
31
38
 
32
- def trypsin
39
+ def set_trypsin
33
40
  @sense = 'C'
34
41
  @cut = 'KR'
35
42
  @no_cut = 'P'
@@ -41,10 +48,26 @@ class SampleEnzyme
41
48
  end
42
49
  end
43
50
 
51
# Fills in this enzyme from a pepxml node: name comes from the node's
# 'name' attribute; cut, no_cut and sense come from the same-named
# attributes of node.child.
# returns self
def from_pepxml_node(node)
  self.name = node['name']
  specificity = node.child
  %w(cut no_cut sense).each do |att|
    send("#{att}=", specificity[att])
  end
  self
end
60
+
61
# Creates a new enzyme (no name given) and fills it in from a pepxml node
# with the instance method of the same name.
def self.from_pepxml_node(node)
  enzyme = new
  enzyme.from_pepxml_node(node)
end
64
+
44
65
  # returns all peptides of missed cleavages <= 'missed_cleavages'
45
66
  # so 2 missed cleavages will return all no missed cleavage peptides
46
67
  # all 1 missed cleavages and all 2 missed cleavages.
47
- def digest(string, missed_cleavages=0)
68
+ # options:
69
+ def digest(string, missed_cleavages=0, options={})
70
+ raise NotImplementedError if @sense == 'N'
48
71
  s = StringScanner.new(string)
49
72
  no_cut_regex = Regexp.new("[#{@no_cut}]")
50
73
  regex = Regexp.new("[#{@cut}]")
@@ -75,7 +98,7 @@ class SampleEnzyme
75
98
  end
76
99
  ## LOOP through and grab each set of missed cleavages from num down to 0
77
100
  all_sets_of_peps = []
78
- (0..missed_cleavages).to_a.reverse.map do |num_mc|
101
+ (0..missed_cleavages).to_a.reverse.each do |num_mc|
79
102
  all_sets_of_peps.push( *(get_missed_cleavages(peps, num_mc)) )
80
103
  end
81
104
  all_sets_of_peps
@@ -85,9 +108,9 @@ class SampleEnzyme
85
108
  # cleavages
86
109
  # DOES NOT contain peptides that contain < num of missed cleavages
87
110
  # (i.e., will not return missed cleavages of 1 or 2 if num == 3)
88
- def get_missed_cleavages(tryptic_peps, num)
89
- (0...(tryptic_peps.size - num)).to_a.map do |i|
90
- tryptic_peps[i,num+1].join
111
+ def get_missed_cleavages(ar_of_peptide_seqs, num)
112
+ (0...(ar_of_peptide_seqs.size - num)).to_a.map do |i|
113
+ ar_of_peptide_seqs[i,num+1].join
91
114
  end
92
115
  end
93
116
 
data/lib/scan_i.rb ADDED
@@ -0,0 +1,21 @@
1
+
2
# http://groups.google.com/group/comp.lang.ruby/browse_thread/thread/7370f94e852c0fae/4068c8c1c1c158ee
class String
  # Returns an array with the start index of every non-overlapping
  # occurrence of the string seq, scanning from left to right.
  # An empty seq matches at every position, 0 through length.
  def scan_i(seq)
    # always advance at least one character: an empty seq matches without
    # consuming anything and would otherwise be found at the same position
    # forever
    step = [seq.length, 1].max
    ndx = []
    pos = 0
    while (i = index(seq, pos))
      ndx << i
      pos = i + step
    end
    ndx
  end

  #def scan_enum seq
  #  self.enum_for(:scan, seq).map do
  #    $~.offset(0)[0]
  #  end
  #end
end
21
+
@@ -9,9 +9,10 @@ class SpecID::AAFreqs
9
9
  # seeing that amino acid. Frequencies should add to 1.
10
10
  attr_accessor :aafreqs
11
11
 
12
- def initialize(fasta_file=nil)
13
- if fasta_file
14
- @fasta = Fasta.new.read_file(fasta_file)
12
+ # fasta is fasta object!
13
+ def initialize(fasta=nil)
14
+ @fasta = fasta
15
+ if @fasta
15
16
  @aafreqs = calculate_frequencies(@fasta)
16
17
  end
17
18
  end
@@ -64,6 +65,9 @@ class SpecID::AAFreqs
64
65
  # returns two numbers in array [actual, expected]
65
66
  # expected is a Float!!!
66
67
  def actual_and_expected_number(peptide_aaseqs, amino_acid=:C, at_least=1)
68
+ if at_least > 1
69
+ raise NotImplementedError, "can only do at_least=1 right now!"
70
+ end
67
71
  one_minus_freq = 1.0 - @aafreqs[amino_acid.to_sym]
68
72
  amino_acid_as_st = amino_acid.to_s
69
73
  probs = []