mspire 0.5.0 → 0.6.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (107) hide show
  1. data/README.rdoc +24 -0
  2. data/Rakefile +51 -0
  3. data/VERSION +1 -0
  4. data/lib/cv/description.rb +18 -0
  5. data/lib/cv/param.rb +33 -0
  6. data/lib/cv.rb +3 -0
  7. data/lib/io/bookmark.rb +13 -0
  8. data/lib/merge.rb +7 -0
  9. data/lib/ms/cvlist.rb +76 -0
  10. data/lib/ms/digester.rb +245 -0
  11. data/lib/ms/fasta.rb +86 -0
  12. data/lib/ms/ident/peptide/db.rb +243 -0
  13. data/lib/ms/ident/peptide.rb +72 -0
  14. data/lib/ms/ident/peptide_hit/qvalue.rb +56 -0
  15. data/lib/ms/ident/peptide_hit.rb +26 -0
  16. data/lib/ms/ident/pepxml/modifications.rb +83 -0
  17. data/lib/ms/ident/pepxml/msms_pipeline_analysis.rb +70 -0
  18. data/lib/ms/ident/pepxml/msms_run_summary.rb +82 -0
  19. data/lib/ms/ident/pepxml/parameters.rb +14 -0
  20. data/lib/ms/ident/pepxml/sample_enzyme.rb +165 -0
  21. data/lib/ms/ident/pepxml/search_database.rb +49 -0
  22. data/lib/ms/ident/pepxml/search_hit/modification_info.rb +79 -0
  23. data/lib/ms/ident/pepxml/search_hit.rb +144 -0
  24. data/lib/ms/ident/pepxml/search_result.rb +35 -0
  25. data/lib/ms/ident/pepxml/search_summary.rb +92 -0
  26. data/lib/ms/ident/pepxml/spectrum_query.rb +85 -0
  27. data/lib/ms/ident/pepxml.rb +112 -0
  28. data/lib/ms/ident/protein.rb +33 -0
  29. data/lib/ms/ident/protein_group.rb +80 -0
  30. data/lib/ms/ident/search.rb +114 -0
  31. data/lib/ms/ident.rb +37 -0
  32. data/lib/ms/isotope/aa.rb +59 -0
  33. data/lib/ms/mascot.rb +6 -0
  34. data/lib/ms/mass/aa.rb +79 -0
  35. data/lib/ms/mass.rb +55 -0
  36. data/lib/ms/mzml/index_list.rb +98 -0
  37. data/lib/ms/mzml/plms1.rb +34 -0
  38. data/lib/ms/mzml.rb +197 -0
  39. data/lib/ms/obo.rb +38 -0
  40. data/lib/ms/plms1.rb +156 -0
  41. data/lib/ms/quant/qspec/protein_group_comparison.rb +22 -0
  42. data/lib/ms/quant/qspec.rb +112 -0
  43. data/lib/ms/spectrum.rb +154 -8
  44. data/lib/ms.rb +3 -10
  45. data/lib/msplat.rb +2 -0
  46. data/lib/obo/ims.rb +5 -0
  47. data/lib/obo/ms.rb +7 -0
  48. data/lib/obo/ontology.rb +41 -0
  49. data/lib/obo/unit.rb +5 -0
  50. data/lib/openany.rb +23 -0
  51. data/lib/write_file_or_string.rb +18 -0
  52. data/obo/ims.obo +562 -0
  53. data/obo/ms.obo +11677 -0
  54. data/obo/unit.obo +2563 -0
  55. data/spec/ms/cvlist_spec.rb +60 -0
  56. data/spec/ms/digester_spec.rb +351 -0
  57. data/spec/ms/fasta_spec.rb +100 -0
  58. data/spec/ms/ident/peptide/db_spec.rb +108 -0
  59. data/spec/ms/ident/pepxml/sample_enzyme_spec.rb +181 -0
  60. data/spec/ms/ident/pepxml/search_hit/modification_info_spec.rb +37 -0
  61. data/spec/ms/ident/pepxml_spec.rb +442 -0
  62. data/spec/ms/ident/protein_group_spec.rb +68 -0
  63. data/spec/ms/mass_spec.rb +8 -0
  64. data/spec/ms/mzml/index_list_spec.rb +122 -0
  65. data/spec/ms/mzml/plms1_spec.rb +62 -0
  66. data/spec/ms/mzml_spec.rb +50 -0
  67. data/spec/ms/plms1_spec.rb +38 -0
  68. data/spec/ms/quant/qspec_spec.rb +25 -0
  69. data/spec/msplat_spec.rb +24 -0
  70. data/spec/obo_spec.rb +25 -0
  71. data/spec/spec_helper.rb +25 -0
  72. data/spec/testfiles/ms/ident/peptide/db/uni_11_sp_tr.fasta +69 -0
  73. data/spec/testfiles/ms/ident/peptide/db/uni_11_sp_tr.msd_clvg2.min_aaseq4.yml +728 -0
  74. data/spec/testfiles/ms/mzml/j24z.idx_comp.3.mzML +271 -0
  75. data/spec/testfiles/ms/mzml/openms.noidx_nocomp.12.mzML +330 -0
  76. data/spec/testfiles/ms/quant/kill_extra_tabs.rb +13 -0
  77. data/spec/testfiles/ms/quant/max_quant_output.provenance.txt +15 -0
  78. data/spec/testfiles/ms/quant/max_quant_output.txt +199 -0
  79. data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv +199 -0
  80. data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv_qspecgp +199 -0
  81. data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv_qspecgp.csv +199 -0
  82. data/spec/testfiles/ms/quant/pdcd5_final.txt +199 -0
  83. data/spec/testfiles/ms/quant/pdcd5_final.txt_qspecgp +0 -0
  84. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.CSV.csv +199 -0
  85. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.csv +199 -0
  86. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.csv +199 -0
  87. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv +199 -0
  88. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv_qspecgp +199 -0
  89. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv_qspecgp.csv +199 -0
  90. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.txt +199 -0
  91. data/spec/testfiles/ms/quant/pdcd5_lfq_tabdel.txt +134 -0
  92. data/spec/testfiles/ms/quant/pdcd5_lfq_tabdel.txt_qspecgp +134 -0
  93. data/spec/testfiles/ms/quant/remove_rest_of_proteins.rb +13 -0
  94. data/spec/testfiles/ms/quant/unlog_transform.rb +13 -0
  95. data/spec/testfiles/plms1/output.key +0 -0
  96. metadata +157 -40
  97. data/README +0 -77
  98. data/changelog.txt +0 -196
  99. data/lib/ms/calc.rb +0 -32
  100. data/lib/ms/data/interleaved.rb +0 -60
  101. data/lib/ms/data/lazy_io.rb +0 -73
  102. data/lib/ms/data/lazy_string.rb +0 -15
  103. data/lib/ms/data/simple.rb +0 -59
  104. data/lib/ms/data/transposed.rb +0 -41
  105. data/lib/ms/data.rb +0 -57
  106. data/lib/ms/format/format_error.rb +0 -12
  107. data/lib/ms/support/binary_search.rb +0 -126
@@ -0,0 +1,112 @@
1
+ require 'nokogiri'
2
+ require 'ms/ident'
3
+ require 'ms/ident/pepxml/msms_pipeline_analysis'
4
+
5
+ require 'ostruct'
6
+
7
+ module MS ; module Ident ; end ; end
8
+
9
class Numeric
  # Formats the number with an explicit leading sign.
  #
  # Non-negative values get a '+' prepended; negative values are returned
  # as their normal string form (which already starts with '-').
  #
  # A fresh string is built rather than appending to a string literal, so
  # the method keeps working when string literals are frozen. Negative zero
  # (-0.0) compares >= 0 but already prints with a '-', so it is returned
  # unchanged instead of becoming "+-0.0".
  #
  # @return [String] the number as a string with a leading '+' or '-'
  def to_plus_minus_string
    text = to_s
    (self >= 0 && !text.start_with?('-')) ? "+#{text}" : text
  end
end
19
+
20
class MS::Ident::Pepxml
  XML_STYLESHEET_LOCATION = '/tools/bin/TPP/tpp/schema/pepXML_std.xsl'
  DEFAULT_PEPXML_VERSION = MsmsPipelineAnalysis::PEPXML_VERSION
  XML_ENCODING = 'UTF-8'

  attr_accessor :msms_pipeline_analysis

  # Parses a pepxml file and returns an array of
  # MS::Ident::Pepxml::SearchHit::Simple structs, one per <search_hit>
  # element (namespaces are stripped before searching).
  #
  # Each struct is built from: an id ("hit_<index>"), a new
  # MS::Ident::Search whose id is the filename minus its extension, the
  # 'peptide' attribute, the 'assumed_charge' of the hit's grandparent
  # element, and a hash of search scores (name as Symbol => value as Float).
  def self.simple_search_hits(file)
    parse_options = Nokogiri::XML::ParseOptions::DEFAULT_XML |
      Nokogiri::XML::ParseOptions::NOBLANKS |
      Nokogiri::XML::ParseOptions::STRICT

    File.open(file) do |io|
      doc = Nokogiri::XML.parse(io, nil, nil, parse_options)
      # we can work with namespaces, or just remove them ...
      doc.remove_namespaces!
      doc.root.xpath('//search_hit').each_with_index.map do |search_hit, index|
        peptide = search_hit['peptide']
        assumed_charge = search_hit.parent.parent['assumed_charge'].to_i
        scores = {}
        search_hit.children.each do |child|
          next unless child.name == 'search_score'
          scores[child['name'].to_sym] = child['value'].to_f
        end
        search = MS::Ident::Search.new(file.chomp(File.extname(file)))
        MS::Ident::Pepxml::SearchHit::Simple.new("hit_#{index}", search, peptide, assumed_charge, scores)
      end
    end
  end

  # the pepxml version reported by the msms_pipeline_analysis object
  def pepxml_version
    msms_pipeline_analysis.pepxml_version
  end

  # returns an array of spectrum queries
  def spectrum_queries
    msms_pipeline_analysis.msms_run_summary.spectrum_queries
  end

  # If a block is given, creates a new MsmsPipelineAnalysis object, stores
  # it in msms_pipeline_analysis and yields it. Without a block nothing is
  # created.
  def initialize(&block)
    return unless block
    @msms_pipeline_analysis = MsmsPipelineAnalysis.new
    block.call(@msms_pipeline_analysis)
  end

  # Inserts an xml-stylesheet processing instruction (pointing at location)
  # directly before the root node of doc. Returns doc.
  def add_stylesheet(doc, location)
    instruction = Nokogiri::XML::ProcessingInstruction.new(
      doc, "xml-stylesheet", %Q{type="text/xsl" href="#{location}"}
    )
    doc.root.add_previous_sibling(instruction)
    doc
  end

  # if no options are given, an xml string is returned. If either :outdir or
  # :outfile is given, the xml is written to file and the output filename is
  # returned. A single string argument will be interpreted as :outfile if it
  # ends in '.xml' and as :outdir otherwise. In this case,
  # update_summary_xml is still true.
  #
  # options:
  #
  #     arg                    default
  #     :outdir             => nil   write to disk using this outdir with summary_xml basename
  #     :outfile            => nil   write to this filename (overrides outdir)
  #     :update_summary_xml => true  update summary_xml attribute to point to the output file true/false
  #
  # set outdir to
  # File.dirname(pepxml_obj.msms_pipeline_analysis.msms_run_summary.base_name)
  # to write to the same directory as the input search file.
  def to_xml(opts={})
    given =
      case opts
      when nil, false then {}
      when String then (opts =~ /\.xml$/) ? {:outfile => opts} : {:outdir => opts }
      else opts
      end
    settings = {:update_summary_xml => true, :outdir => nil, :outfile => nil}.merge(given)

    outfile =
      if settings[:outfile]
        settings[:outfile]
      elsif settings[:outdir]
        # keep only the basename of summary_xml (either path separator)
        basename = msms_pipeline_analysis.summary_xml.split(/[\/\\]/).last
        File.join(settings[:outdir], basename)
      end

    if settings[:update_summary_xml] && outfile
      self.msms_pipeline_analysis.summary_xml = File.expand_path(outfile)
    end

    builder = Nokogiri::XML::Builder.new(:encoding => XML_ENCODING)
    msms_pipeline_analysis.to_xml(builder)
    add_stylesheet(builder.doc, MS::Ident::Pepxml::XML_STYLESHEET_LOCATION)
    xml_string = builder.doc.to_xml

    return xml_string unless outfile

    File.open(outfile,'w') {|out| out.print(xml_string) }
    outfile
  end
end
111
+
112
+
@@ -0,0 +1,33 @@
1
+ require 'andand'
2
+
3
module MS ; end
module MS::Ident
  # Mixin giving an object the basic attributes of a protein.
  module ProteinLike
    # an id for the protein
    attr_accessor :id

    # the protein sequence
    attr_accessor :sequence
    alias_method :seq, :sequence
    alias_method :seq=, :sequence=

    # a description of the protein
    attr_accessor :description

    # Extracts the gene id from the description.
    #
    # Looks for " GN=<word characters>" (regexp: / GN=(\w+) ?/) and returns
    # the captured text. Only word characters are captured, so a name such
    # as "HLA-A" is returned as "HLA".
    #
    # @return [String, nil] the gene id, or nil if the description is nil or
    #   contains no GN= tag
    def gene_id
      # guard both the missing description and the failed match; indexing
      # the result of a failed match raised NoMethodError on nil
      matched = description && description.match(/ GN=(\w+) ?/)
      matched && matched[1]
    end
  end

  # a generic protein class that is ProteinLike
  class Protein
    include ProteinLike

    # @param id [Object] an id for the protein
    # @param sequence [String, nil] the protein sequence
    def initialize(id=nil, sequence=nil)
      (@id, @sequence) = id, sequence
    end
  end
end
33
+
@@ -0,0 +1,80 @@
1
+ require 'set'
2
+
3
module MS
  module Ident
    # represents a group of proteins, typically indistinguishable in the
    # experiment.
    class ProteinGroup < Array
      attr_accessor :peptide_hits

      # Default ranking key for a [protein_group, peptide_hits] doublet:
      # [# uniq aaseqs, # uniq aaseq+charge, # peptide_hits]
      PRIORITIZE_PROTEINS = lambda do |protein_group_and_peptide_hits|
        hits = protein_group_and_peptide_hits.last
        aaseqs = hits.map {|hit| hit.aaseq }.uniq
        aaseqs_with_charge = hits.map {|hit| [hit.aaseq, hit.charge] }.uniq
        [aaseqs.size, aaseqs_with_charge.size, hits.size]
      end

      # Greedy algorithm mapping peptide_hits to protein groups. Each peptide
      # hit should respond to :aaseq, :charge and :proteins.
      #
      # Proteins that share exactly the same set of peptide hits form one
      # group. Groups are ranked (greediest last when sorted, so the list is
      # reversed before use) and walked from greediest to least greedy; a
      # group is kept only if it contains at least one peptide hit that no
      # greedier group has already claimed. Subsumed groups are discarded.
      #
      # If a block is given it receives a doublet of [protein_group,
      # peptide set] and must return a metric (or array) to sort by. With no
      # block, PRIORITIZE_PROTEINS is used.
      #
      # Sets of peptide_hits and the objects returned by peptide_hit#proteins
      # are used as hash keys. As long as each peptide hit has a unique
      # signature (like an id) any object will work. If they are Struct
      # objects, consider redefining #hash to be object_id for performance
      # and accuracy.
      #
      # If update_peptide_hits is true, every peptide hit of every kept group
      # is sent :protein_groups= with that group. A symbol can be passed
      # instead, and that method will be called.
      #
      # returns an array of ProteinGroup objects, each set with :peptide_hits
      def self.peptide_hits_to_protein_groups(peptide_hits, update_peptide_hits=false, &sort_by)
        update_peptide_hits = :protein_groups= if update_peptide_hits == true
        ranker = sort_by || PRIORITIZE_PROTEINS

        # protein => set of peptide hits that reference it
        hits_by_protein = Hash.new {|hash, protein| hash[protein] = Set.new }
        peptide_hits.each do |hit|
          hit.proteins.each {|protein| hits_by_protein[protein] << hit }
        end

        # identical peptide sets => proteins that cannot be told apart
        proteins_by_hit_set = Hash.new {|hash, hit_set| hash[hit_set] = [] }
        hits_by_protein.each do |protein, hit_set|
          proteins_by_hit_set[hit_set] << protein
        end

        hit_sets_by_group = {}
        proteins_by_hit_set.each do |hit_set, proteins|
          group = MS::Ident::ProteinGroup.new(proteins)
          group.peptide_hits = hit_set
          hit_sets_by_group[group] = hit_set
        end

        greediest_first = hit_sets_by_group.sort_by(&ranker).reverse

        # we are discarding the subsumed sets, but we could get them with
        # partition
        claimed = Set.new
        kept = greediest_first.select do |group, hit_set|
          unclaimed = hit_set.reject {|hit| claimed.include?(hit) }
          unclaimed.each {|hit| claimed.add(hit) }
          if unclaimed.empty?
            false
          else
            group.peptide_hits = hit_set
            true
          end
        end

        if update_peptide_hits
          kept.each do |group, hit_set|
            hit_set.each {|hit| hit.send(update_peptide_hits, group) }
          end
        end
        kept.map {|group, _hit_set| group }
      end

    end
  end
end
@@ -0,0 +1,114 @@
1
+
2
module MS
  module Ident

    # Mixin for objects that represent a single search: an id plus the
    # peptide hits it produced.
    module SearchLike
      attr_accessor :id
      attr_accessor :peptide_hits
      alias_method :hits, :peptide_hits
      alias_method :hits=, :peptide_hits=

      # returns an array of peptide_hits and protein_hits that are linked to
      # one another.  NOTE: this will update peptide and protein
      # hits :proteins and :peptide_hits attributes respectively).  Assumes that each search
      # responds to :peptide_hits, each peptide responds to :proteins and each protein to
      # :peptide_hits.  Can be done on a single file to restore protein/peptide
      # linkages to their original single-file state.
      # Assumes the protein is initialized with (reference, peptide_ar)
      #
      # yields the protein that will become the template for a new protein
      # and expects a new protein hit
      #def merge!(ar_of_peptide_hit_arrays)
      #  all_peptide_hits = []
      #  reference_hash = {}
      #  ar_of_peptide_hit_arrays.each do |peptide_hits|
      #    all_peptide_hits.push(*peptide_hits)
      #    peptide_hits.each do |peptide|
      #      peptide.proteins.each do |protein|
      #        id = protein.id
      #        if reference_hash.key?(id)
      #          reference_hash[id].peptide_hits << peptide
      #          reference_hash[id]
      #        else
      #          reference_hash[id] = yield(protein, [peptide])
      #        end
      #      end
      #    end
      #  end
      #  [all_peptide_hits, reference_hash.values]
      #end
    end

    # a generic search class that is SearchLike
    class Search
      include SearchLike

      # @param id [Object] an identifier for the search
      # @param peptide_hits [Array] the peptide hits belonging to the search
      def initialize(id=nil, peptide_hits=[])
        @id = id
        @peptide_hits = peptide_hits
      end
    end


    # Mixin for objects that hold a collection of searches.
    module SearchGroup

      # an array of search objects
      attr_accessor :searches

      # the group's file extension (with no leading period)
      def extension
        'grp'
      end

      # the class instantiated (with the filename) for each search file
      def search_class
        Search
      end

      # Reads a simple formatted file with paths to the search files: one
      # path per line. Lines containing no word characters and lines starting
      # with '#' are skipped.
      #
      # @return [Array<String>] the paths, without trailing newlines
      def to_paths(file)
        IO.readlines(file).grep(/\w/).reject {|v| v =~ /^#/}.map {|v| v.chomp }
      end

      # adds one search per path listed in the group file
      def from_file(file)
        from_filenames(to_paths(file))
      end


      # Appends a search_class object (created with the filename) to
      # searches for each filename. Aborts the program with a message if
      # any file does not exist.
      def from_filenames(filenames)
        filenames.each do |file|
          if !File.exist? file
            message = "File: #{file} does not exist!\n"
            message << "perhaps you need to modify the file with file paths"
            abort message
          end
          @searches << search_class.new(file)
        end
      end


      # Takes a single group filename (ending in the extension defined by
      # #extension), an array of filenames, or an array of search objects.
      # The optional block yields the object for further processing.
      #
      # Three defects that made every non-nil argument raise were fixed:
      # the misspelled Regexp.escap, the undefined local `array` in the
      # array-of-objects branch, and a trailing
      # `@searches << search_class.new(file, opts)` that referenced an
      # undefined `file` (the searches are already created by
      # #from_filenames).
      #
      # @param arg [String, Array, nil] group file, filenames or search objects
      # @param opts [Hash] accepted for interface compatibility; it is not
      #   forwarded to the search initializers here
      # @raise [ArgumentError] if arg is none of the accepted forms
      def initialize(arg=nil, opts={})
        @peptide_hits = []
        @reference_hash = {}
        @searches = []

        if arg
          if arg.is_a?(String) && arg =~ /\.#{Regexp.escape(extension)}$/
            from_file(arg)
          elsif arg.is_a?(Array) && arg.first.is_a?(String)
            from_filenames(arg)
          elsif arg.is_a?(Array)
            @searches = arg
          else
            raise ArgumentError, "must be file, array of filenames, or array of objs"
          end
        end
        yield(self) if block_given?
      end

    end
  end
end
data/lib/ms/ident.rb ADDED
@@ -0,0 +1,37 @@
1
+
2
+ require 'ms/ident/protein_group'
3
+ require 'ms/ident/protein'
4
+ require 'ms/ident/peptide_hit'
5
+
6
module MS

  # An MS::Ident::ProteinGroup is an array of proteins that responds to
  # :peptide_hits.  All protein level identifications should be stored in a
  # ProteinGroup object.
  #
  # An MS::Ident::Protein is an object representing a protein (:id,
  # :sequence, :description).  Note that it is not a protein hit (use a
  # ProteinGroup for that).
  #
  # An MS::Ident::PeptideHit is an object representing a match between an
  # amino acid sequence and a spectrum.
  #
  # Typical usage:
  #
  #     require 'ms/ident'
  #
  #     hit1 = PeptideHit.new(:id => 1, :aaseq => 'PEPTIDE', :search =>
  #       MS::Ident::Search.new, etc...)
  #     peptide_hits = [hit1, hit2, ...]
  #
  #     protein_groups = MS::Ident::ProteinGroup.peptide_hits_to_protein_groups(peptide_hits)
  #     protein_groups.first.peptide_hits  # => the peptide hits in that group
  module Ident
    # Returns the filetype as a symbol when it can be determined from the
    # filename, otherwise nil. Only '.srf' (case insensitive) is recognized.
    def self.filetype(file)
      (file =~ /\.srf$/i) ? :srf : nil
    end
  end
end
@@ -0,0 +1,59 @@
1
module MS
  module Isotope
    # Atom counts for each amino acid, keyed by one-letter code. The inner
    # hashes are keyed by lower case atom symbols
    # (:c, :h, :o, :n, :s, :p, :se).
    module AA
      # amino acid one-letter codes as strings
      ATOM_COUNTS_STR = {
        'A' => { :c =>3, :h =>7 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'R' => { :c =>6, :h =>14 , :o =>2 , :n =>4 , :s =>0 , :p =>0, :se =>0 },
        'N' => { :c =>4, :h =>8 , :o =>3 , :n =>2 , :s =>0 , :p =>0, :se =>0 },
        'D' => { :c =>4, :h =>7 , :o =>4 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'C' => { :c =>3, :h =>7 , :o =>2 , :n =>1 , :s =>1 , :p =>0, :se =>0 },
        'E' => { :c =>5, :h =>9 , :o =>4 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'Q' => { :c =>5, :h =>10 , :o =>3 , :n =>2 , :s =>0 , :p =>0, :se =>0 },
        'G' => { :c =>2, :h =>5 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'H' => { :c =>6, :h =>9 , :o =>2 , :n =>3 , :s =>0 , :p =>0, :se =>0 },
        'I' => { :c =>6, :h =>13 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'L' => { :c =>6, :h =>13 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'K' => { :c =>6, :h =>14 , :o =>2 , :n =>2 , :s =>0 , :p =>0, :se =>0 },
        'M' => { :c =>5, :h =>11 , :o =>2 , :n =>1 , :s =>1 , :p =>0, :se =>0 },
        'F' => { :c =>9, :h =>11 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'P' => { :c =>5, :h =>9 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'S' => { :c =>3, :h =>7 , :o =>3 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'T' => { :c =>4, :h =>9 , :o =>3 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'W' => { :c =>11, :h =>12 , :o =>2 , :n =>2 , :s =>0 , :p =>0, :se =>0 },
        'Y' => { :c =>9, :h =>11 , :o =>3 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'V' => { :c =>5, :h =>11 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>0 },
        'U' => { :c =>3, :h =>7 , :o =>2 , :n =>1 , :s =>0 , :p =>0, :se =>1 },
        'O' => { :c =>12, :h =>21 , :o =>3 , :n =>3 , :s =>0 , :p =>0, :se =>0 }
      }

      # amino acid one-letter codes as symbols. Derived from
      # ATOM_COUNTS_STR (the two hand-written tables were identical); each
      # inner hash is copied so the two tables stay independent objects.
      #
      # This replaces a statement that wrote into ATOM_COUNTS before that
      # constant was defined, which raised NameError while loading the file.
      ATOM_COUNTS_SYM = Hash[ATOM_COUNTS_STR.map {|aa, counts| [aa.to_sym, counts.dup] }]

      # string and symbol access of amino acid (atoms are all lower case
      # symbols)
      ATOM_COUNTS = ATOM_COUNTS_SYM.merge ATOM_COUNTS_STR
    end
  end
end
data/lib/ms/mascot.rb ADDED
@@ -0,0 +1,6 @@
1
+
2
module MS
  # Constants associated with Mascot.
  module Mascot
    # the H+ mass value kept under the Mascot namespace
    H_PLUS = 1.007276
  end
end
data/lib/ms/mass/aa.rb ADDED
@@ -0,0 +1,79 @@
1
+ require 'ms/mass'
2
+
3
module MS
  module Mass
    # Amino acid mass tables keyed by one-letter code.
    module AA
      # amino_acids keys as strings, average masses
      AVG_STRING = {
        "*"=>118.88603,
        "A"=>71.0779,
        "B"=>172.1405,
        "C"=>103.1429,
        "D"=>115.0874,
        "E"=>129.11398,
        "F"=>147.17386,
        "G"=>57.05132,
        "H"=>137.13928,
        "I"=>113.15764,
        "K"=>128.17228,
        "L"=>113.15764,
        "M"=>131.19606,
        "N"=>114.10264,
        "O"=>211.28076,
        "P"=>97.11518,
        "Q"=>128.12922,
        "R"=>156.18568,
        "S"=>87.0773,
        "T"=>101.10388,
        "U"=>150.0379,
        "V"=>99.13106,
        "W"=>186.2099,
        "X"=>118.88603,
        "Y"=>163.17326,
        "Z"=>128.6231
      }
      # amino_acids keys as strings, monoisotopic masses
      MONO_STRING = {
        "*"=>118.805716,
        "A"=>71.0371137878,
        "B"=>172.048405,
        "C"=>103.0091844778,
        "D"=>115.026943032,
        "E"=>129.0425930962,
        "F"=>147.0684139162,
        "G"=>57.0214637236,
        "H"=>137.0589118624,
        "I"=>113.0840639804,
        "K"=>128.0949630177,
        "L"=>113.0840639804,
        "M"=>131.0404846062,
        "N"=>114.0429274472,
        "O"=>211.1446528645,
        "P"=>97.052763852,
        "Q"=>128.0585775114,
        "R"=>156.1011110281,
        "S"=>87.0320284099,
        "T"=>101.0476784741,
        "U"=>150.9536355878,
        "V"=>99.0684139162,
        "W"=>186.0793129535,
        "X"=>118.805716,
        "Y"=>163.0633285383,
        "Z"=>128.550585
      }

      # amino_acids keys as symbols, monoisotopic masses
      MONO_SYM = MONO_STRING.each_with_object({}) do |(residue, mass), table|
        table[residue.to_sym] = mass
      end

      # amino_acids keys as symbols, average masses
      AVG_SYM = AVG_STRING.each_with_object({}) do |(residue, mass), table|
        table[residue.to_sym] = mass
      end

      # Monoisotopic amino acid masses keyed as symbols and also strings (all
      # upper case).  Also includes MS::Mass::MONO for things like protons ('h+')
      MONO = [MONO_SYM, MONO_STRING, MS::Mass::MONO].reduce(:merge)
      # Average amino acid masses keyed as symbols and also strings (all
      # upper case).  Also includes MS::Mass::AVG for things like protons ('h+')
      AVG = [AVG_SYM, AVG_STRING, MS::Mass::AVG].reduce(:merge)
    end
  end
end
data/lib/ms/mass.rb ADDED
@@ -0,0 +1,55 @@
1
+
2
module MS
  # Element and small-group masses plus a formula-to-mass helper.
  module Mass

    # Computes the monoisotopic mass of a chemical formula.
    #
    # The formula is a run of element symbols (one upper case letter,
    # optionally followed by one lower case letter), each optionally
    # followed by a count, in this format: C2BrH12O. A missing count means 1.
    # Element symbols are looked up lower-cased in MONO.
    #
    # @param formula [String] the chemical formula
    # @return [Float, nil] the summed mass, or nil if the formula contains
    #   no element symbols
    # @raise [ArgumentError] if an element has no entry in MONO (previously
    #   this surfaced as a NoMethodError on nil)
    def self.formula_to_exact_mass(formula)
      # TODO: add other input methods
      formula.scan(/([A-Z][a-z]?)(\d*)/).map do |element, count|
        mass = MONO[element.downcase]
        unless mass
          raise ArgumentError, "unknown element '#{element}' in formula '#{formula}'"
        end
        mass * (count == '' ? 1 : count.to_i)
      end.reduce(:+)
    end

    H_PLUS = 1.00727646677
    # + http://www.unimod.org/masses.html
    MONO_STR = {
      'c' => 12.0,  # +
      'br' => 78.9183361,  # +
      'd' => 2.014101779,  # +
      'f' => 18.99840322,  # +
      'n' => 14.003074,  # +
      'o' => 15.99491463,  # +
      'na' => 22.9897677,  # +
      'p' => 30.973762,  # +
      's' => 31.9720707,  # +
      'li' => 7.016003,  # +
      'cl' => 34.96885272,  # +
      'k' => 38.9637074,  # +
      'si' => 27.9769265325, # http://physics.nist.gov/cgi-bin/Compositions/stand_alone.pl?ele=Si&ascii=html&isotype=some
      'i' => 126.904473,  # +
      'h+' => 1.00727646677,
      'h' => 1.007825035,  # +
      'h2o' => 18.0105647,
      'oh' => 17.002739665,
    }
    AVG_STR = {
      'h+' => 1.007276, # using Mascot_H_plus mass (is this right for AVG??)
      'h' => 1.00794,
      'h2o' => 18.01528,
      'oh' => 17.00734,
    }
    # sets MONO_SYM, MONO, AVG_SYM, and AVG: the *_SYM tables are the *_STR
    # tables with symbol keys, and MONO/AVG hold both string and symbol keys
    %w(MONO AVG).each do |type|
      const_set "#{type}_SYM", Hash[ const_get("#{type}_STR").map {|k,v| [k.to_sym, v] } ]
      const_set type, const_get("#{type}_STR").merge( const_get("#{type}_SYM") )
    end
  end
end
54
+
55
+
@@ -0,0 +1,98 @@
1
module MS
  class Mzml
    # A simple array of indices but #[] has been overloaded to find an index
    # by name
    #
    #     index_list[0]                 # the first index
    #     index_list.map(&:name)        # -> [:spectrum, :chromatogram]
    #     index_list[:spectrum]         # the spectrum index
    #     index_list[:chromatogram]     # the chromatogram index
    class IndexList < Array
      alias_method :old_bracket_slice, :'[]'

      # Integer, Range and (start, length) arguments keep the normal Array
      # behavior; any other single argument is treated as an index name.
      #
      # @param [Object] an Integer (index number) or a Symbol (:spectrum or
      #   :chromatogram)
      # @return [MS::Mzml::Index] an index object
      def [](int_or_symbol, *rest)
        positional = int_or_symbol.is_a?(Integer) || int_or_symbol.is_a?(Range)
        if positional || !rest.empty?
          old_bracket_slice(int_or_symbol, *rest)
        else
          self.find {|index| index.name == int_or_symbol }
        end
      end
    end

    # the array holds start bytes
    class Index < Array

      class << self
        # returns an Integer or nil if not found
        # does a single jump backwards from the tail of the file looking for
        # an xml element based on tag.  If it is not found, returns nil
        #
        # If the io is shorter than bytes_backwards the search starts at the
        # beginning instead of seeking to a negative position.
        def index_offset(io, tag='indexListOffset', bytes_backwards=200)
          tag_re = %r{<#{tag}>([\-\d]+)</#{tag}>}
          io.pos = [(io.size - 1) - bytes_backwards, 0].max
          md = io.readlines("\n").map {|line| line.match(tag_re) }.compact.shift
          md[1].to_i if md
        end
      end

      # an index indexed by scan number
      attr_accessor :by_scans

      # the name of the index (as a symbol)
      attr_accessor :name

      # a parallel array of ids (idRef's)
      attr_accessor :ids

      # @return [Array] doublet of the start byte and the id at position int
      def start_byte_and_id(int)
        [self[int], ids[int]]
      end

      # returns hash of id to start_byte
      def create_id_index
        Hash[self.ids.zip(self)]
      end

      # The id lookup table is built on first use and then cached. Returns
      # nil for any other argument type.
      #
      # @return [Integer] the start byte of the spectrum
      # @param [Object] an Integer (the index number) or String (an id string)
      def start_byte(arg)
        case arg
        when Integer
          self[arg]
        when String
          @id_index ||= create_id_index
          @id_index[arg]
        end
      end

      # generates a scan to index hash that points from scan number to the
      # spectrum index number and stores it in by_scans.  returns the index,
      # nil if the scan ids are not present and spectra are, or false if they
      # are not unique. Scan numbers are read from "scan=<digits>" in each id.
      def create_scan_to_index
        scan_re = /scan=(\d+)/
        scan_to_index = {}
        ids.each_with_index do |id, index|
          md = id.match(scan_re)
          scan_num = md[1].to_i if md
          if scan_num
            if scan_to_index.key?(scan_num)
              return false
            else
              scan_to_index[scan_num] = index
            end
          end
        end
        if scan_to_index.size > 0
          # self. is required: a bare `by_scans = ...` only creates a local
          # variable and never reaches the attribute writer
          self.by_scans = scan_to_index
        elsif ids.size > 0
          nil  # there are scans, but we did not find scan numbers
        else
          scan_to_index
        end
      end
    end
  end
end
98
+