mspire 0.5.0 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107) hide show
  1. data/README.rdoc +24 -0
  2. data/Rakefile +51 -0
  3. data/VERSION +1 -0
  4. data/lib/cv/description.rb +18 -0
  5. data/lib/cv/param.rb +33 -0
  6. data/lib/cv.rb +3 -0
  7. data/lib/io/bookmark.rb +13 -0
  8. data/lib/merge.rb +7 -0
  9. data/lib/ms/cvlist.rb +76 -0
  10. data/lib/ms/digester.rb +245 -0
  11. data/lib/ms/fasta.rb +86 -0
  12. data/lib/ms/ident/peptide/db.rb +243 -0
  13. data/lib/ms/ident/peptide.rb +72 -0
  14. data/lib/ms/ident/peptide_hit/qvalue.rb +56 -0
  15. data/lib/ms/ident/peptide_hit.rb +26 -0
  16. data/lib/ms/ident/pepxml/modifications.rb +83 -0
  17. data/lib/ms/ident/pepxml/msms_pipeline_analysis.rb +70 -0
  18. data/lib/ms/ident/pepxml/msms_run_summary.rb +82 -0
  19. data/lib/ms/ident/pepxml/parameters.rb +14 -0
  20. data/lib/ms/ident/pepxml/sample_enzyme.rb +165 -0
  21. data/lib/ms/ident/pepxml/search_database.rb +49 -0
  22. data/lib/ms/ident/pepxml/search_hit/modification_info.rb +79 -0
  23. data/lib/ms/ident/pepxml/search_hit.rb +144 -0
  24. data/lib/ms/ident/pepxml/search_result.rb +35 -0
  25. data/lib/ms/ident/pepxml/search_summary.rb +92 -0
  26. data/lib/ms/ident/pepxml/spectrum_query.rb +85 -0
  27. data/lib/ms/ident/pepxml.rb +112 -0
  28. data/lib/ms/ident/protein.rb +33 -0
  29. data/lib/ms/ident/protein_group.rb +80 -0
  30. data/lib/ms/ident/search.rb +114 -0
  31. data/lib/ms/ident.rb +37 -0
  32. data/lib/ms/isotope/aa.rb +59 -0
  33. data/lib/ms/mascot.rb +6 -0
  34. data/lib/ms/mass/aa.rb +79 -0
  35. data/lib/ms/mass.rb +55 -0
  36. data/lib/ms/mzml/index_list.rb +98 -0
  37. data/lib/ms/mzml/plms1.rb +34 -0
  38. data/lib/ms/mzml.rb +197 -0
  39. data/lib/ms/obo.rb +38 -0
  40. data/lib/ms/plms1.rb +156 -0
  41. data/lib/ms/quant/qspec/protein_group_comparison.rb +22 -0
  42. data/lib/ms/quant/qspec.rb +112 -0
  43. data/lib/ms/spectrum.rb +154 -8
  44. data/lib/ms.rb +3 -10
  45. data/lib/msplat.rb +2 -0
  46. data/lib/obo/ims.rb +5 -0
  47. data/lib/obo/ms.rb +7 -0
  48. data/lib/obo/ontology.rb +41 -0
  49. data/lib/obo/unit.rb +5 -0
  50. data/lib/openany.rb +23 -0
  51. data/lib/write_file_or_string.rb +18 -0
  52. data/obo/ims.obo +562 -0
  53. data/obo/ms.obo +11677 -0
  54. data/obo/unit.obo +2563 -0
  55. data/spec/ms/cvlist_spec.rb +60 -0
  56. data/spec/ms/digester_spec.rb +351 -0
  57. data/spec/ms/fasta_spec.rb +100 -0
  58. data/spec/ms/ident/peptide/db_spec.rb +108 -0
  59. data/spec/ms/ident/pepxml/sample_enzyme_spec.rb +181 -0
  60. data/spec/ms/ident/pepxml/search_hit/modification_info_spec.rb +37 -0
  61. data/spec/ms/ident/pepxml_spec.rb +442 -0
  62. data/spec/ms/ident/protein_group_spec.rb +68 -0
  63. data/spec/ms/mass_spec.rb +8 -0
  64. data/spec/ms/mzml/index_list_spec.rb +122 -0
  65. data/spec/ms/mzml/plms1_spec.rb +62 -0
  66. data/spec/ms/mzml_spec.rb +50 -0
  67. data/spec/ms/plms1_spec.rb +38 -0
  68. data/spec/ms/quant/qspec_spec.rb +25 -0
  69. data/spec/msplat_spec.rb +24 -0
  70. data/spec/obo_spec.rb +25 -0
  71. data/spec/spec_helper.rb +25 -0
  72. data/spec/testfiles/ms/ident/peptide/db/uni_11_sp_tr.fasta +69 -0
  73. data/spec/testfiles/ms/ident/peptide/db/uni_11_sp_tr.msd_clvg2.min_aaseq4.yml +728 -0
  74. data/spec/testfiles/ms/mzml/j24z.idx_comp.3.mzML +271 -0
  75. data/spec/testfiles/ms/mzml/openms.noidx_nocomp.12.mzML +330 -0
  76. data/spec/testfiles/ms/quant/kill_extra_tabs.rb +13 -0
  77. data/spec/testfiles/ms/quant/max_quant_output.provenance.txt +15 -0
  78. data/spec/testfiles/ms/quant/max_quant_output.txt +199 -0
  79. data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv +199 -0
  80. data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv_qspecgp +199 -0
  81. data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv_qspecgp.csv +199 -0
  82. data/spec/testfiles/ms/quant/pdcd5_final.txt +199 -0
  83. data/spec/testfiles/ms/quant/pdcd5_final.txt_qspecgp +0 -0
  84. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.CSV.csv +199 -0
  85. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.csv +199 -0
  86. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.csv +199 -0
  87. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv +199 -0
  88. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv_qspecgp +199 -0
  89. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv_qspecgp.csv +199 -0
  90. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.txt +199 -0
  91. data/spec/testfiles/ms/quant/pdcd5_lfq_tabdel.txt +134 -0
  92. data/spec/testfiles/ms/quant/pdcd5_lfq_tabdel.txt_qspecgp +134 -0
  93. data/spec/testfiles/ms/quant/remove_rest_of_proteins.rb +13 -0
  94. data/spec/testfiles/ms/quant/unlog_transform.rb +13 -0
  95. data/spec/testfiles/plms1/output.key +0 -0
  96. metadata +157 -40
  97. data/README +0 -77
  98. data/changelog.txt +0 -196
  99. data/lib/ms/calc.rb +0 -32
  100. data/lib/ms/data/interleaved.rb +0 -60
  101. data/lib/ms/data/lazy_io.rb +0 -73
  102. data/lib/ms/data/lazy_string.rb +0 -15
  103. data/lib/ms/data/simple.rb +0 -59
  104. data/lib/ms/data/transposed.rb +0 -41
  105. data/lib/ms/data.rb +0 -57
  106. data/lib/ms/format/format_error.rb +0 -12
  107. data/lib/ms/support/binary_search.rb +0 -126
@@ -0,0 +1,112 @@
1
+ require 'nokogiri'
2
+ require 'ms/ident'
3
+ require 'ms/ident/pepxml/msms_pipeline_analysis'
4
+
5
+ require 'ostruct'
6
+
7
+ module MS ; module Ident ; end ; end
8
+
9
class Numeric
  # Renders the number with an explicit sign: values that compare >= 0 get a
  # leading '+', everything else is returned exactly as #to_s prints it
  # (negative numbers already carry their '-').
  def to_plus_minus_string
    self >= 0 ? "+#{self}" : to_s
  end
end
19
+
20
class MS::Ident::Pepxml
  XML_STYLESHEET_LOCATION = '/tools/bin/TPP/tpp/schema/pepXML_std.xsl'
  DEFAULT_PEPXML_VERSION = MsmsPipelineAnalysis::PEPXML_VERSION
  XML_ENCODING = 'UTF-8'

  attr_accessor :msms_pipeline_analysis

  # Reads the pepXML +file+ and returns an array of
  # MS::Ident::Pepxml::SearchHit::Simple structs, one per <search_hit>
  # element, in document order.
  #
  # Each struct is built from: an id of the form "hit_<index>", a fresh
  # MS::Ident::Search whose id is the filename without its extension, the
  # hit's 'peptide' attribute, the 'assumed_charge' of the hit's grandparent
  # element (as an Integer), and a hash of the hit's <search_score> children
  # (name as Symbol => value as Float).
  def self.simple_search_hits(file)
    File.open(file) do |io|
      parse_options = Nokogiri::XML::ParseOptions::DEFAULT_XML |
        Nokogiri::XML::ParseOptions::NOBLANKS |
        Nokogiri::XML::ParseOptions::STRICT
      doc = Nokogiri::XML.parse(io, nil, nil, parse_options)
      # namespaces are stripped so the xpath below can use bare element names
      doc.remove_namespaces!
      doc.root.xpath('//search_hit').each_with_index.map do |search_hit, index|
        aaseq = search_hit['peptide']
        charge = search_hit.parent.parent['assumed_charge'].to_i
        search_scores = {}
        search_hit.children.each do |node|
          next unless node.name == 'search_score'
          search_scores[node['name'].to_sym] = node['value'].to_f
        end
        search = MS::Ident::Search.new(file.chomp(File.extname(file)))
        MS::Ident::Pepxml::SearchHit::Simple.new("hit_#{index}", search, aaseq, charge, search_scores)
      end
    end
  end

  # the pepxml version reported by the underlying msms_pipeline_analysis
  def pepxml_version
    msms_pipeline_analysis.pepxml_version
  end

  # returns the array of spectrum queries held by the msms_run_summary
  def spectrum_queries
    msms_pipeline_analysis.msms_run_summary.spectrum_queries
  end

  # When a block is given, creates a new MsmsPipelineAnalysis, stores it in
  # #msms_pipeline_analysis and passes it to the block. Without a block
  # nothing is created and #msms_pipeline_analysis stays nil.
  def initialize(&block)
    return unless block
    @msms_pipeline_analysis = MsmsPipelineAnalysis.new
    block.call(@msms_pipeline_analysis)
  end

  # Inserts an xml-stylesheet processing instruction pointing at +location+
  # immediately before the root element of +doc+ and returns +doc+.
  def add_stylesheet(doc, location)
    instruction_body = %Q{type="text/xsl" href="#{location}"}
    xml_stylesheet = Nokogiri::XML::ProcessingInstruction.new(doc, "xml-stylesheet", instruction_body)
    doc.root.add_previous_sibling(xml_stylesheet)
    doc
  end

  # Serializes the msms_pipeline_analysis to pepXML.
  #
  # With no options the xml string is returned. If either :outdir or :outfile
  # is given, the xml is written to disk and the output filename is returned.
  # A single String argument is treated as :outfile when it ends in '.xml'
  # and as :outdir otherwise (:update_summary_xml keeps its default of true
  # in that case).
  #
  # options:
  #
  #     arg                    default
  #     :outdir             => nil   write to disk using this outdir with summary_xml basename
  #     :outfile            => nil   write to this filename (overrides outdir)
  #     :update_summary_xml => true  update summary_xml attribute to point to the output file true/false
  #
  # set outdir to
  # File.dirname(pepxml_obj.msms_pipeline_analysis.msms_run_summary.base_name)
  # to write to the same directory as the input search file.
  def to_xml(opts={})
    opts ||= {}
    opts = (opts.match(/\.xml$/) ? {:outfile => opts} : {:outdir => opts}) if opts.is_a?(String)
    opt = {:update_summary_xml => true, :outdir => nil, :outfile => nil}.merge(opts)

    # an explicit :outfile wins; otherwise reuse the summary_xml basename
    # (split on either path separator) inside :outdir
    outfile = opt[:outfile] ||
      (opt[:outdir] && File.join(opt[:outdir], msms_pipeline_analysis.summary_xml.split(/[\/\\]/).last))

    if opt[:update_summary_xml] && outfile
      msms_pipeline_analysis.summary_xml = File.expand_path(outfile)
    end

    builder = Nokogiri::XML::Builder.new(:encoding => XML_ENCODING)
    msms_pipeline_analysis.to_xml(builder)
    add_stylesheet(builder.doc, MS::Ident::Pepxml::XML_STYLESHEET_LOCATION)
    xml = builder.doc.to_xml

    return xml unless outfile

    File.open(outfile, 'w') {|out| out.print(xml) }
    outfile
  end
end
111
+
112
+
@@ -0,0 +1,33 @@
1
+ require 'andand'
2
+
3
module MS ; end
module MS::Ident
  # Mixin providing the basic attributes of a protein: an id, a sequence
  # (also reachable as #seq / #seq=) and a free-text description.
  module ProteinLike
    # an id for the protein
    attr_accessor :id

    # the protein sequence
    attr_accessor :sequence
    alias_method :seq, :sequence
    alias_method :seq=, :sequence=

    # a description of the protein
    attr_accessor :description

    # Returns the word captured by / GN=(\w+) ?/ in the description (the
    # first match), or nil when the description is nil or contains no such
    # token.
    def gene_id
      text = description
      return nil unless text
      # a failed match returns nil, so guard before indexing the capture
      md = text.match(/ GN=(\w+) ?/)
      md && md[1]
    end
  end

  # a generic protein class that is ProteinLike
  class Protein
    include ProteinLike

    # Both arguments are optional; description is left unset (nil).
    def initialize(id=nil, sequence=nil)
      @id = id
      @sequence = sequence
    end
  end
end
33
+
@@ -0,0 +1,80 @@
1
+ require 'set'
2
+
3
module MS
  module Ident
    # Represents a group of proteins (the Array elements), typically
    # indistinguishable in the experiment, together with the peptide hits
    # that support the group.
    class ProteinGroup < Array
      attr_accessor :peptide_hits

      # Default ranking used by peptide_hits_to_protein_groups. Receives a
      # [protein_group, peptide_hits] doublet and returns
      # [# uniq aaseqs, # uniq aaseq+charge, # peptide_hits].
      PRIORITIZE_PROTEINS = lambda do |protein_group_and_peptide_hits|
        hits = protein_group_and_peptide_hits.last
        [
          hits.map {|hit| hit.aaseq }.uniq.size,
          hits.map {|hit| [hit.aaseq, hit.charge] }.uniq.size,
          hits.size
        ]
      end

      # Greedy algorithm mapping a collection of peptide_hits to protein
      # groups. Each peptide hit should respond to :aaseq, :charge and
      # :proteins.
      #
      # If a block is given it is yielded a doublet of protein_group and
      # peptide set and must return a metric (or array) to sort by; the
      # greediest groups should sort to the back. Without a block the groups
      # are ranked by PRIORITIZE_PROTEINS.
      #
      # Sets of peptide_hits and the objects returned by peptide_hit#proteins
      # are used as hash keys. As long as each peptide hit has a unique
      # signature (like an id) any object will work. If they are Struct
      # objects, you might consider redefining #hash to be object_id for
      # performance and accuracy.
      #
      # Returns an array of ProteinGroup objects, each set with
      # :peptide_hits. Groups whose peptide hits are all claimed by greedier
      # groups are discarded.
      #
      # If update_peptide_hits is true, :protein_groups= is called on each
      # peptide hit of each kept group; a symbol may be passed instead to
      # call that method.
      # NOTE(review): the method is invoked once per (group, hit) pair with a
      # single group as its argument, so with the default setter a hit shared
      # by several kept groups ends up holding only the last one -- confirm
      # whether an array of groups was intended.
      def self.peptide_hits_to_protein_groups(peptide_hits, update_peptide_hits=false, &sort_by)
        setter = update_peptide_hits
        setter = 'protein_groups='.to_sym if setter == true
        ranking = sort_by || PRIORITIZE_PROTEINS

        # protein => Set of every peptide hit that lists it
        hits_by_protein = Hash.new {|hash, protein| hash[protein] = Set.new }
        peptide_hits.each do |peptide_hit|
          peptide_hit.proteins.each {|protein| hits_by_protein[protein] << peptide_hit }
        end

        # proteins backed by exactly the same set of hits fall into one group
        proteins_by_hit_set = Hash.new {|hash, hit_set| hash[hit_set] = [] }
        hits_by_protein.each {|protein, hit_set| proteins_by_hit_set[hit_set] << protein }

        group_and_hits = proteins_by_hit_set.map do |hit_set, proteins|
          group = MS::Ident::ProteinGroup.new(proteins)
          group.peptide_hits = hit_set
          [group, hit_set]
        end
        greedy_first = group_and_hits.sort_by(&ranking).reverse

        # Walk from greediest to least greedy, keeping a group only if it
        # brings at least one hit nobody has claimed yet. Every hit of the
        # group is visited (no short-circuit) so all of them get claimed.
        claimed = Set.new
        kept = greedy_first.select do |group, hit_set|
          newly_claimed = hit_set.count {|peptide_hit| claimed.add?(peptide_hit) }
          keep = newly_claimed > 0
          group.peptide_hits = hit_set if keep
          keep
        end

        if setter
          kept.each do |group, hit_set|
            hit_set.each {|peptide_hit| peptide_hit.send(setter, group) }
          end
        end
        kept.map(&:first)
      end

    end
  end
end
@@ -0,0 +1,114 @@
1
+
2
module MS
  module Ident

    # Mixin for an object representing a single search: an id plus its
    # peptide hits (also reachable as #hits / #hits=).
    module SearchLike
      attr_accessor :id
      attr_accessor :peptide_hits
      alias_method :hits, :peptide_hits
      alias_method :hits=, :peptide_hits=

      # (disabled implementation, kept for reference)
      #
      # returns an array of peptide_hits and protein_hits that are linked to
      # one another.  NOTE: this will update peptide and protein
      # hits :proteins and :peptide_hits attributes respectively).  Assumes that each search
      # responds to :peptide_hits, each peptide responds to :proteins and each protein to
      # :peptide_hits.  Can be done on a single file to restore protein/peptide
      # linkages to their original single-file state.
      # Assumes the protein is initialized with (reference, peptide_ar)
      #
      # yields the protein that will become the template for a new protein
      # and expects a new protein hit
      #def merge!(ar_of_peptide_hit_arrays)
      #  all_peptide_hits = []
      #  reference_hash = {}
      #  ar_of_peptide_hit_arrays.each do |peptide_hits|
      #    all_peptide_hits.push(*peptide_hits)
      #    peptide_hits.each do |peptide|
      #      peptide.proteins.each do |protein|
      #        id = protein.id
      #        if reference_hash.key?(id)
      #          reference_hash[id].peptide_hits << peptide
      #          reference_hash[id]
      #        else
      #          reference_hash[id] = yield(protein, [peptide])
      #        end
      #      end
      #    end
      #  end
      #  [all_peptide_hits, reference_hash.values]
      #end
    end

    # a generic search class that is SearchLike
    class Search
      include SearchLike

      def initialize(id=nil, peptide_hits=[])
        @id = id
        @peptide_hits = peptide_hits
      end
    end


    # Mixin for an object that holds a collection of searches, built from a
    # group file, a list of search filenames, or ready-made search objects.
    module SearchGroup

      # an array of search objects
      attr_accessor :searches

      # the group's file extension (with no leading period)
      def extension
        'grp'
      end

      # the class instantiated (with the filename) for each search file
      def search_class
        Search
      end

      # Reads a simple formatted file with paths to the search files and
      # returns the paths: lines without any word character and lines
      # starting with '#' are skipped; line endings are removed.
      def to_paths(file)
        IO.readlines(file).grep(/\w/).reject {|v| v =~ /^#/}.map {|v| v.chomp }
      end

      # loads the searches listed in the group file +file+
      def from_file(file)
        from_filenames(to_paths(file))
      end

      # Appends one search_class object per filename to #searches.
      # Aborts the process with a message if a file does not exist.
      def from_filenames(filenames)
        filenames.each do |file|
          if !File.exist? file
            message = "File: #{file} does not exist!\n"
            message << "perhaps you need to modify the file with file paths"
            abort message
          end
          @searches << search_class.new(file)
        end
      end


      # Takes one of:
      #
      # * a String naming a group file (ending in ".#{extension}"), whose
      #   listed search files are loaded,
      # * an Array of search filenames, or
      # * an Array of search objects, which is used as #searches directly.
      #
      # Anything else raises ArgumentError; nil leaves the group empty.
      # +opts+ is accepted for compatibility but not currently used.
      # The optional block yields the object for further processing.
      def initialize(arg=nil, opts={})
        @peptide_hits = []
        @reference_hash = {}
        @searches = []

        if arg
          if arg.is_a?(String) && arg =~ /\.#{Regexp.escape(extension)}$/
            from_file(arg)
          elsif arg.is_a?(Array) && arg.first.is_a?(String)
            from_filenames(arg)
          elsif arg.is_a?(Array)
            @searches = arg
          else
            raise ArgumentError, "must be file, array of filenames, or array of objs"
          end
        end
        yield(self) if block_given?
      end

    end
  end
end
data/lib/ms/ident.rb ADDED
@@ -0,0 +1,37 @@
1
+
2
+ require 'ms/ident/protein_group'
3
+ require 'ms/ident/protein'
4
+ require 'ms/ident/peptide_hit'
5
+
6
module MS

  # An MS::Ident::ProteinGroup is an array of proteins that responds to
  # :peptide_hits.  All protein level identifications should be stored in a
  # proteingroup object.
  #
  # An MS::Ident::Protein is an object representing a protein (:id,
  # :sequence, :description).  Note, it is not a protein hit (use a
  # ProteinGroup)
  #
  # An MS::Ident::PeptideHit is an object representing a match between an
  # amino acid sequence and a spectrum.
  #
  # Typical usage:
  #
  #     require 'ms/ident'
  #
  #     hit1 = PeptideHit.new(:id => 1, :aaseq => 'PEPTIDE', :search =>
  #       MS::Ident::Search.new, etc...)
  #     peptide_hits = [hit1, hit2, ...]
  #
  #     protein_groups = MS::Ident::ProteinGroup.peptide_hits_to_protein_groups(peptide_hits)
  #     protein_groups.first.peptide_hits  # => the peptide hits in that group
  module Ident
    # Returns the filetype as a Symbol when it can be recognized from the
    # filename, otherwise nil. Only a '.srf' ending (any case) is
    # recognized, giving :srf.
    def self.filetype(file)
      :srf if file =~ /\.srf$/i
    end
  end
end
@@ -0,0 +1,59 @@
1
module MS
  module Isotope
    module AA
      # Atom counts per amino acid, keyed by upper-case one-letter String.
      # Each value maps a lower-case element symbol (:c, :h, :o, :n, :s, :p,
      # :se) to the number of atoms of that element.
      # NOTE(review): the counts look like those of the free (un-condensed)
      # amino acid, e.g. 'G' is C2H5NO2 -- confirm before using them as
      # residue compositions.
      ATOM_COUNTS_STR = {
        'A' => { :c =>3, :h =>7 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'R' => { :c =>6, :h =>14 , :o =>2 , :n =>4 , :s =>0 , :p =>0, :se =>0 },
        'N' => { :c =>4, :h =>8 , :o =>3 , :n =>2 , :s =>0 , :p =>0, :se =>0 },
        'D' => { :c =>4, :h =>7 , :o =>4 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'C' => { :c =>3, :h =>7 , :o =>2 , :n =>1 , :s =>1 , :p =>0, :se =>0 },
        'E' => { :c =>5, :h =>9 , :o =>4 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'Q' => { :c =>5, :h =>10 , :o =>3 , :n =>2 , :s =>0 , :p =>0, :se =>0 },
        'G' => { :c =>2, :h =>5 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'H' => { :c =>6, :h =>9 , :o =>2 , :n =>3 , :s =>0 , :p =>0, :se =>0 },
        'I' => { :c =>6, :h =>13 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'L' => { :c =>6, :h =>13 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'K' => { :c =>6, :h =>14 , :o =>2 , :n =>2 , :s =>0 , :p =>0, :se =>0 },
        'M' => { :c =>5, :h =>11 , :o =>2 , :n =>1 , :s =>1 , :p =>0, :se =>0 },
        'F' => { :c =>9, :h =>11 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'P' => { :c =>5, :h =>9 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'S' => { :c =>3, :h =>7 , :o =>3 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'T' => { :c =>4, :h =>9 , :o =>3 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'W' => { :c =>11, :h =>12 , :o =>2 , :n =>2 , :s =>0 , :p =>0, :se =>0 },
        'Y' => { :c =>9, :h =>11 , :o =>3 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'V' => { :c =>5, :h =>11 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'U' => { :c =>3, :h =>7 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>1 },
        'O' => { :c =>12, :h =>21 , :o =>3 , :n =>3 , :s =>0 , :p =>0, :se =>0 }
      }

      # The same table keyed by Symbol (:A, :R, ...). Derived from
      # ATOM_COUNTS_STR so the two can never drift apart; each count hash is
      # duplicated so the two tables do not share value objects.
      ATOM_COUNTS_SYM = ATOM_COUNTS_STR.each_with_object({}) do |(aa, counts), by_symbol|
        by_symbol[aa.to_sym] = counts.dup
      end

      # string and symbol access of amino acid (atoms are all lower case
      # symbols)
      ATOM_COUNTS = ATOM_COUNTS_SYM.merge ATOM_COUNTS_STR
    end
  end
end
data/lib/ms/mascot.rb ADDED
@@ -0,0 +1,6 @@
1
+
2
module MS
  module Mascot
    # Mass value for H+ kept under the Mascot namespace.
    # NOTE(review): presumably the proton mass as used by the Mascot search
    # engine -- confirm the source of this value.
    H_PLUS = 1.007_276
  end
end
data/lib/ms/mass/aa.rb ADDED
@@ -0,0 +1,79 @@
1
+ require 'ms/mass'
2
+
3
module MS
  module Mass
    module AA
      # Average masses keyed by upper-case one-letter amino acid Strings
      # (plus the codes '*', 'B', 'X' and 'Z').
      # NOTE(review): the values look like residue masses (e.g. 'G' is
      # 57.05) -- confirm.
      AVG_STRING = {
        "*"=>118.88603,
        "A"=>71.0779,
        "B"=>172.1405,
        "C"=>103.1429,
        "D"=>115.0874,
        "E"=>129.11398,
        "F"=>147.17386,
        "G"=>57.05132,
        "H"=>137.13928,
        "I"=>113.15764,
        "K"=>128.17228,
        "L"=>113.15764,
        "M"=>131.19606,
        "N"=>114.10264,
        "O"=>211.28076,
        "P"=>97.11518,
        "Q"=>128.12922,
        "R"=>156.18568,
        "S"=>87.0773,
        "T"=>101.10388,
        "U"=>150.0379,
        "V"=>99.13106,
        "W"=>186.2099,
        "X"=>118.88603,
        "Y"=>163.17326,
        "Z"=>128.6231
      }

      # Monoisotopic masses, keyed the same way as AVG_STRING.
      MONO_STRING = {
        "*"=>118.805716,
        "A"=>71.0371137878,
        "B"=>172.048405,
        "C"=>103.0091844778,
        "D"=>115.026943032,
        "E"=>129.0425930962,
        "F"=>147.0684139162,
        "G"=>57.0214637236,
        "H"=>137.0589118624,
        "I"=>113.0840639804,
        "K"=>128.0949630177,
        "L"=>113.0840639804,
        "M"=>131.0404846062,
        "N"=>114.0429274472,
        "O"=>211.1446528645,
        "P"=>97.052763852,
        "Q"=>128.0585775114,
        "R"=>156.1011110281,
        "S"=>87.0320284099,
        "T"=>101.0476784741,
        "U"=>150.9536355878,
        "V"=>99.0684139162,
        "W"=>186.0793129535,
        "X"=>118.805716,
        "Y"=>163.0633285383,
        "Z"=>128.550585
      }

      # Monoisotopic masses keyed by Symbol instead of String.
      MONO_SYM = MONO_STRING.each_with_object({}) do |(aa, mass), by_symbol|
        by_symbol[aa.to_sym] = mass
      end

      # Average masses keyed by Symbol instead of String.
      AVG_SYM = AVG_STRING.each_with_object({}) do |(aa, mass), by_symbol|
        by_symbol[aa.to_sym] = mass
      end

      # Monoisotopic amino acid masses keyed as symbols and also strings (all
      # upper case).  Also includes MS::Mass::MONO for things like protons ('h+')
      MONO = MONO_SYM.merge(MONO_STRING).merge(MS::Mass::MONO)

      # Average amino acid masses keyed as symbols and also strings (all
      # upper case).  Also includes MS::Mass::AVG for things like protons ('h+')
      AVG = AVG_SYM.merge(AVG_STRING).merge(MS::Mass::AVG)
    end
  end
end
data/lib/ms/mass.rb ADDED
@@ -0,0 +1,55 @@
1
+
2
module MS
  module Mass

    # Computes the monoisotopic mass of a chemical formula written in this
    # format: C2BrH12O (element symbols with an upper-case first letter,
    # each optionally followed by a count; a missing count means 1).
    #
    # Element symbols are looked up lower-cased in MONO. Raises
    # ArgumentError when the formula names an element MONO does not contain.
    # Returns nil when the formula contains no element tokens at all.
    def self.formula_to_exact_mass(formula)
      # TODO: add other input methods
      formula.scan(/([A-Z][a-z]?)(\d*)/).map do |element, count|
        mass = MONO[element.downcase]
        unless mass
          raise ArgumentError, "unknown element '#{element}' in formula '#{formula}'"
        end
        mass * (count == '' ? 1 : count.to_i)
      end.reduce(:+)
    end

    H_PLUS = 1.00727646677
    # + http://www.unimod.org/masses.html
    MONO_STR = {
      'c' => 12.0,  # +
      'br' => 78.9183361,  # +
      'd' => 2.014101779,  # +
      'f' => 18.99840322,  # +
      'n' => 14.003074,  # +
      'o' => 15.99491463,  # +
      'na' => 22.9897677,  # +
      'p' => 30.973762,  # +
      's' => 31.9720707,  # +
      'li' => 7.016003,  # +
      'cl' => 34.96885272,  # +
      'k' => 38.9637074,  # +
      'si' => 27.9769265325, # http://physics.nist.gov/cgi-bin/Compositions/stand_alone.pl?ele=Si&ascii=html&isotype=some
      'i' => 126.904473,  # +
      'h+' => 1.00727646677,
      'h' => 1.007825035,  # +
      'h2o' => 18.0105647,
      'oh' => 17.002739665,
    }
    AVG_STR = {
      'h+' => 1.007276, # using Mascot_H_plus mass (is this right for AVG??)
      'h' => 1.00794,
      'h2o' => 18.01528,
      'oh' => 17.00734,
    }

    # Symbol-keyed copies of the string tables ...
    MONO_SYM = MONO_STR.each_with_object({}) {|(name, mass), by_symbol| by_symbol[name.to_sym] = mass }
    AVG_SYM = AVG_STR.each_with_object({}) {|(name, mass), by_symbol| by_symbol[name.to_sym] = mass }

    # ... and the combined tables answering to both String and Symbol keys.
    MONO = MONO_STR.merge(MONO_SYM)
    AVG = AVG_STR.merge(AVG_SYM)
  end
end
54
+
55
+
@@ -0,0 +1,98 @@
1
module MS
  class Mzml
    # A simple array of indices but #[] has been overloaded to find an index
    # by name
    #
    #     index_list[0]               # the first index
    #     index_list.map(&:name)      # -> [:spectrum, :chromatogram]
    #     index_list[:spectrum]       # the spectrum index
    #     index_list[:chromatogram]   # the chromatogram index
    class IndexList < Array
      alias_method :old_bracket_slice, :'[]'

      # @param [Object] an Integer (index number) or a Symbol (:spectrum or
      #   :chromatogram). A Range, or an Integer start followed by a length,
      #   is handled exactly like Array#[].
      # @return [MS::Mzml::Index] an index object (nil if no index has the
      #   given name)
      def [](int_or_symbol, *rest)
        positional = int_or_symbol.is_a?(Integer) || int_or_symbol.is_a?(Range) || !rest.empty?
        if positional
          old_bracket_slice(int_or_symbol, *rest)
        else
          self.find {|index| index.name == int_or_symbol }
        end
      end
    end

    # the array holds start bytes
    class Index < Array

      class << self
        # returns an Integer or nil if not found
        # does a single jump backwards from the tail of the file looking for
        # an xml element based on tag.  If it is not found, returns nil
        def index_offset(io, tag='indexListOffset', bytes_backwards=200)
          tag_re = %r{<#{tag}>([\-\d]+)</#{tag}>}
          # never seek before the start: short inputs are scanned in full
          io.pos = [(io.size - 1) - bytes_backwards, 0].max
          io.readlines("\n").each do |line|
            md = line.match(tag_re)
            return md[1].to_i if md
          end
          nil
        end
      end

      # an index indexed by scan number
      attr_accessor :by_scans

      # the name of the index (as a symbol)
      attr_accessor :name

      # a parallel array of ids (idRef's)
      attr_accessor :ids

      # returns the [start_byte, id] pair at index number +int+
      def start_byte_and_id(int)
        [self[int], ids[int]]
      end

      # returns hash of id to start_byte
      def create_id_index
        Hash[self.ids.zip(self)]
      end

      # @return [Integer] the start byte of the spectrum
      # @param [Object] an Integer (the index number) or String (an id string)
      #
      # The id lookup table is built on first use and then reused; any other
      # argument type returns nil.
      def start_byte(arg)
        case arg
        when Integer
          self[arg]
        when String
          @id_index ||= create_id_index
          @id_index[arg]
        end
      end

      # generates a scan to index hash that points from scan number to the
      # spectrum index number and stores it in #by_scans.  returns the index,
      # nil if the scan ids are not present and spectra are, or false if they
      # are not unique.  (#by_scans is only set when scan numbers were found.)
      def create_scan_to_index
        scan_re = /scan=(\d+)/
        scan_to_index = {}
        ids.each_with_index do |id, index|
          md = id.match(scan_re)
          next unless md
          scan_num = md[1].to_i
          return false if scan_to_index.key?(scan_num)
          scan_to_index[scan_num] = index
        end
        if scan_to_index.size > 0
          # explicit receiver: a bare `by_scans = ...` would only create a local
          self.by_scans = scan_to_index
        elsif ids.size > 0
          nil  # there are scans, but we did not find scan numbers
        else
          scan_to_index
        end
      end
    end
  end
end
98
+