mspire 0.5.0 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107) hide show
  1. data/README.rdoc +24 -0
  2. data/Rakefile +51 -0
  3. data/VERSION +1 -0
  4. data/lib/cv/description.rb +18 -0
  5. data/lib/cv/param.rb +33 -0
  6. data/lib/cv.rb +3 -0
  7. data/lib/io/bookmark.rb +13 -0
  8. data/lib/merge.rb +7 -0
  9. data/lib/ms/cvlist.rb +76 -0
  10. data/lib/ms/digester.rb +245 -0
  11. data/lib/ms/fasta.rb +86 -0
  12. data/lib/ms/ident/peptide/db.rb +243 -0
  13. data/lib/ms/ident/peptide.rb +72 -0
  14. data/lib/ms/ident/peptide_hit/qvalue.rb +56 -0
  15. data/lib/ms/ident/peptide_hit.rb +26 -0
  16. data/lib/ms/ident/pepxml/modifications.rb +83 -0
  17. data/lib/ms/ident/pepxml/msms_pipeline_analysis.rb +70 -0
  18. data/lib/ms/ident/pepxml/msms_run_summary.rb +82 -0
  19. data/lib/ms/ident/pepxml/parameters.rb +14 -0
  20. data/lib/ms/ident/pepxml/sample_enzyme.rb +165 -0
  21. data/lib/ms/ident/pepxml/search_database.rb +49 -0
  22. data/lib/ms/ident/pepxml/search_hit/modification_info.rb +79 -0
  23. data/lib/ms/ident/pepxml/search_hit.rb +144 -0
  24. data/lib/ms/ident/pepxml/search_result.rb +35 -0
  25. data/lib/ms/ident/pepxml/search_summary.rb +92 -0
  26. data/lib/ms/ident/pepxml/spectrum_query.rb +85 -0
  27. data/lib/ms/ident/pepxml.rb +112 -0
  28. data/lib/ms/ident/protein.rb +33 -0
  29. data/lib/ms/ident/protein_group.rb +80 -0
  30. data/lib/ms/ident/search.rb +114 -0
  31. data/lib/ms/ident.rb +37 -0
  32. data/lib/ms/isotope/aa.rb +59 -0
  33. data/lib/ms/mascot.rb +6 -0
  34. data/lib/ms/mass/aa.rb +79 -0
  35. data/lib/ms/mass.rb +55 -0
  36. data/lib/ms/mzml/index_list.rb +98 -0
  37. data/lib/ms/mzml/plms1.rb +34 -0
  38. data/lib/ms/mzml.rb +197 -0
  39. data/lib/ms/obo.rb +38 -0
  40. data/lib/ms/plms1.rb +156 -0
  41. data/lib/ms/quant/qspec/protein_group_comparison.rb +22 -0
  42. data/lib/ms/quant/qspec.rb +112 -0
  43. data/lib/ms/spectrum.rb +154 -8
  44. data/lib/ms.rb +3 -10
  45. data/lib/msplat.rb +2 -0
  46. data/lib/obo/ims.rb +5 -0
  47. data/lib/obo/ms.rb +7 -0
  48. data/lib/obo/ontology.rb +41 -0
  49. data/lib/obo/unit.rb +5 -0
  50. data/lib/openany.rb +23 -0
  51. data/lib/write_file_or_string.rb +18 -0
  52. data/obo/ims.obo +562 -0
  53. data/obo/ms.obo +11677 -0
  54. data/obo/unit.obo +2563 -0
  55. data/spec/ms/cvlist_spec.rb +60 -0
  56. data/spec/ms/digester_spec.rb +351 -0
  57. data/spec/ms/fasta_spec.rb +100 -0
  58. data/spec/ms/ident/peptide/db_spec.rb +108 -0
  59. data/spec/ms/ident/pepxml/sample_enzyme_spec.rb +181 -0
  60. data/spec/ms/ident/pepxml/search_hit/modification_info_spec.rb +37 -0
  61. data/spec/ms/ident/pepxml_spec.rb +442 -0
  62. data/spec/ms/ident/protein_group_spec.rb +68 -0
  63. data/spec/ms/mass_spec.rb +8 -0
  64. data/spec/ms/mzml/index_list_spec.rb +122 -0
  65. data/spec/ms/mzml/plms1_spec.rb +62 -0
  66. data/spec/ms/mzml_spec.rb +50 -0
  67. data/spec/ms/plms1_spec.rb +38 -0
  68. data/spec/ms/quant/qspec_spec.rb +25 -0
  69. data/spec/msplat_spec.rb +24 -0
  70. data/spec/obo_spec.rb +25 -0
  71. data/spec/spec_helper.rb +25 -0
  72. data/spec/testfiles/ms/ident/peptide/db/uni_11_sp_tr.fasta +69 -0
  73. data/spec/testfiles/ms/ident/peptide/db/uni_11_sp_tr.msd_clvg2.min_aaseq4.yml +728 -0
  74. data/spec/testfiles/ms/mzml/j24z.idx_comp.3.mzML +271 -0
  75. data/spec/testfiles/ms/mzml/openms.noidx_nocomp.12.mzML +330 -0
  76. data/spec/testfiles/ms/quant/kill_extra_tabs.rb +13 -0
  77. data/spec/testfiles/ms/quant/max_quant_output.provenance.txt +15 -0
  78. data/spec/testfiles/ms/quant/max_quant_output.txt +199 -0
  79. data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv +199 -0
  80. data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv_qspecgp +199 -0
  81. data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv_qspecgp.csv +199 -0
  82. data/spec/testfiles/ms/quant/pdcd5_final.txt +199 -0
  83. data/spec/testfiles/ms/quant/pdcd5_final.txt_qspecgp +0 -0
  84. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.CSV.csv +199 -0
  85. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.csv +199 -0
  86. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.csv +199 -0
  87. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv +199 -0
  88. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv_qspecgp +199 -0
  89. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv_qspecgp.csv +199 -0
  90. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.txt +199 -0
  91. data/spec/testfiles/ms/quant/pdcd5_lfq_tabdel.txt +134 -0
  92. data/spec/testfiles/ms/quant/pdcd5_lfq_tabdel.txt_qspecgp +134 -0
  93. data/spec/testfiles/ms/quant/remove_rest_of_proteins.rb +13 -0
  94. data/spec/testfiles/ms/quant/unlog_transform.rb +13 -0
  95. data/spec/testfiles/plms1/output.key +0 -0
  96. metadata +157 -40
  97. data/README +0 -77
  98. data/changelog.txt +0 -196
  99. data/lib/ms/calc.rb +0 -32
  100. data/lib/ms/data/interleaved.rb +0 -60
  101. data/lib/ms/data/lazy_io.rb +0 -73
  102. data/lib/ms/data/lazy_string.rb +0 -15
  103. data/lib/ms/data/simple.rb +0 -59
  104. data/lib/ms/data/transposed.rb +0 -41
  105. data/lib/ms/data.rb +0 -57
  106. data/lib/ms/format/format_error.rb +0 -12
  107. data/lib/ms/support/binary_search.rb +0 -126
data/README.rdoc ADDED
@@ -0,0 +1,24 @@
1
+ = mspire
2
+
3
+ Tools for working with mass spectrometry data in Ruby.
4
+
5
+ == Examples
6
+
7
+ === mzml
8
+
9
+ require 'ms/mzml'
10
+
11
+ MS::Mzml.open("somefile.mzml") do |mzml|
12
+ spectrum = mzml[0] # the first spectrum ( same as mzml.spectrum(0) )
13
+ spectrum = mzml["controllerType=0 controllerNumber=1 scan=2"] # query by id string
14
+ mzml.spectrum_from_scan_num(23) # raises ScanNumbersNotFound or ScanNumbersNotUnique errors if problems
15
+ end
16
+
17
+ require 'ms/mass/aa'
18
+
19
+ MS::Mass::AA::MONO['A'] # or access by symbol
20
+
21
+ == Copyright
22
+
23
+ See LICENSE (MIT)
24
+
data/Rakefile ADDED
@@ -0,0 +1,51 @@
1
require 'rubygems'
require 'rake'
require 'rspec/core/rake_task'

# Gem packaging and release tasks (jeweler derives the gemspec from this block).
require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "mspire"
  gem.homepage = "http://github.com/princelab/mspire"
  gem.license = "MIT"
  gem.summary = %Q{mass spectrometry proteomics, lipidomics, and tools}
  gem.description = %Q{mass spectrometry proteomics, lipidomics, and tools, a rewrite of mspire, merging of ms-* gems}
  gem.email = "jtprince@gmail.com"
  gem.authors = ["John T. Prince", "Simon Chiang"]
  gem.add_dependency "nokogiri", "~> 1.5"
  gem.add_development_dependency "rspec", "~> 2.6"
  gem.add_development_dependency "jeweler", "~> 1.5.2"
  gem.add_development_dependency "rcov", ">= 0"
  gem.add_development_dependency "obo", ">= 0.1.0"
end
Jeweler::RubygemsDotOrgTasks.new

# `rake spec` runs every *_spec.rb file under spec/.
require 'rspec/core'
RSpec::Core::RakeTask.new(:spec) do |spec|
  spec.pattern = FileList['spec/**/*_spec.rb']
end

# `rake rcov` runs the same specs with the rcov flag switched on.
RSpec::Core::RakeTask.new(:rcov) do |spec|
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end

#require 'rcov/rcovtask'
#Rcov::RcovTask.new do |spec|
#  spec.libs << 'spec'
#  spec.pattern = 'spec/**/*_spec.rb'
#  spec.verbose = true
#end

task :default => :spec

# `rake rdoc` builds the API documentation into rdoc/.
require 'rdoc/task'
Rake::RDocTask.new do |rdoc|
  # strip so that a trailing newline in the VERSION file does not end up in the title
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "mspire #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.6.1
@@ -0,0 +1,18 @@
1
+
2
module CV
  # An Array of parameter objects that can be filled in with a small DSL:
  #
  #     desc = CV::Description.new do
  #       param 'MS', 'MS:1000004', 'sample mass', 5
  #     end
  class Description < Array
    # Seeds the array with +args+.  When a block is given it is evaluated in
    # the context of the new object, so +param+ may be called without a
    # receiver inside it.
    def initialize(*args, &block)
      super(args)
      # instance_eval raises ArgumentError when handed a nil block, so only
      # evaluate when a block was actually supplied.
      instance_eval(&block) if block
    end

    # pushes a CV::Param object onto the description array
    def param(*args)
      push CV::Param.new(*args)
    end

    # Calls +to_xml(xml)+ on every element, in order.
    def to_xml(xml)
      each {|param| param.to_xml(xml) }
    end
  end
end
data/lib/cv/param.rb ADDED
@@ -0,0 +1,33 @@
1
+
2
module CV
  # A single controlled-vocabulary parameter: a reference to the vocabulary
  # (+cv_ref+), the term's +accession+ and +name+, an optional +value+ and an
  # optional +unit+.
  class Param
    attr_accessor :cv_ref, :accession, :name, :value
    # A valueless CV::Param object that describes the units being used
    attr_accessor :unit

    def initialize(cv_ref, accession, name, value=nil)
      @cv_ref = cv_ref
      @accession = accession
      @name = name
      @value = value
    end

    # Emits this param by calling the method +name+ (default :cvParam) on
    # +xml+ with a single attribute hash.  :value is only included when the
    # value is truthy; the unit* attributes only when a unit is set.
    def to_xml(xml, name=:cvParam)
      # NOTE: the instance variable is @cv_ref (see initialize); reading
      # @cvref here would always yield nil.
      attributes = {:cvRef => @cv_ref, :accession => @accession, :name => @name}
      attributes[:value] = @value if @value
      if unit
        attributes[:unitCvRef] = unit.cv_ref
        attributes[:unitAccession] = unit.accession
        attributes[:unitName] = unit.name
      end
      xml.send(name, attributes)
    end

    # Two params are equal when +other+ is a CV::Param and cv_ref, accession,
    # name, value and unit all compare equal.
    def ==(other)
      return false unless other.is_a?(CV::Param)
      [:cv_ref, :accession, :name, :value, :unit].all? do |attribute|
        send(attribute) == other.send(attribute)
      end
    end
  end
end
33
+
data/lib/cv.rb ADDED
@@ -0,0 +1,3 @@
1
+
2
+ require 'cv/description'
3
+ require 'cv/param'
@@ -0,0 +1,13 @@
1
+
2
class IO
  # Remembers the current position, calls the block with the io object and
  # then seeks back to the remembered position.  Returns the block's reply.
  # If +rewind+ is true, io.rewind is called before the block is invoked.
  #
  # The position is restored in an ensure clause, so it is also put back when
  # the block raises.
  def bookmark(rewind=false, &block)
    start = self.pos
    begin
      self.rewind if rewind
      block.call(self)
    ensure
      self.pos = start
    end
  end
end
data/lib/merge.rb ADDED
@@ -0,0 +1,7 @@
1
module Merge
  # Assigns each key/value pair of +hash+ through the matching "key=" writer
  # on the receiver.  When a block is given it is then called with the result
  # of +block_arg+ and its reply is returned; without a block nil is returned.
  #
  # NOTE(review): +block_arg+ is not defined in this module -- presumably the
  # including class is expected to provide it; confirm against the includers.
  def merge!(hash={}, &block)
    hash.each do |attribute, value|
      send("#{attribute}=", value)
    end
    return nil unless block
    block.call(block_arg)
  end
end
data/lib/ms/cvlist.rb ADDED
@@ -0,0 +1,76 @@
1
+
2
+ require 'cv'
3
+ require 'obo/ms'
4
+ require 'obo/ims'
5
+ require 'obo/unit'
6
+
7
module MS
  module CV
    # Maps an ontology prefix (the part of an accession before the colon) to
    # the id => name lookup of that ontology.
    Obo = {
      'MS' => Obo::MS.id_to_name,
      'IMS' => Obo::IMS.id_to_name,
      'UO' => Obo::Unit.id_to_name,
    }

    class Param < ::CV::Param
      # takes a variety of arguments (acc = accession):
      #
      #     acc#
      #     acc#, value
      #     acc#, unit_acc# or CV::Param object
      #     acc#, value, unit_acc# or CV::Param object
      #     cvref, acc#, name
      #     cvref, acc#, name, value
      #     cvref, acc#, name, unit_acc# or CV::Param object
      #     cvref, acc#, name, value, unit_acc# or CV::Param object
      #
      # A trailing argument is taken to be the unit when it is a CV::Param or
      # a String that looks like an accession ('UO:0000010').  With any other
      # number of leading arguments all four attributes are left nil.
      def initialize(*args)
        last = args.last
        # Only Strings are matched against the accession pattern: values may
        # be numbers, and Object#=~ is not available on every Ruby version.
        looks_like_unit = last.is_a?(::CV::Param) ||
          (last.is_a?(String) && last =~ /[A-Za-z]+:\d+/)
        @unit =
          if args.size > 1 && looks_like_unit
            unit_arg = args.pop
            unit_arg.is_a?(::CV::Param) ? unit_arg : self.class.new(unit_arg)
          end
        (@cv_ref, @accession, @name, @value) =
          case args.size
          when 1..2 # accession number (maybe with value)
            # the prefix selects the ontology, the full accession looks up the name
            obo_type = args.first.split(':').first
            [obo_type, args.first, MS::CV::Obo[obo_type][args.first], args[1]]
          when 3..4 # they have fully specified the object
            args
          end
      end
    end
  end

  # A list of CV::Param objects.
  #
  #     CVList.new( <CV::Param> )
  #     CVList['MS:1000004', ['MS:1000004'], ['IMS:1000042', 23], param_obj, args]
  #     CVList.new do
  #       param 'MS:1000004'
  #       param 'MS:1000042', 23
  #     end
  class CVList < Array

    # ensures that each argument is an argument that can be handled by
    # CV::Param. Returns the CVList object it creates
    def self.[](*args)
      list = self.new
      args.each do |arg|
        arg.is_a?(Array) ? list.param(*arg) : list.param(arg)
      end
      list
    end

    # takes a list of valid CV::Param objects, or they can be set in the block
    # using param
    def initialize(*args, &block)
      args.each {|arg| param(arg) }
      instance_eval(&block) if block
    end

    # if the first object is a CV::Param it is just pushed onto the list,
    # otherwise the arguments are sent in to initialize a fresh MS::CV::Param,
    # and this object is pushed onto the list.
    def param(*args)
      push args.first.is_a?(::CV::Param) ? args.first : MS::CV::Param.new(*args)
    end
  end
end
@@ -0,0 +1,245 @@
1
+ require 'strscan'
2
+
3
module MS

  # A Digester splits a protein sequence into peptides at specified sites.
  #
  #   trypsin = MS::Digester[:trypsin]
  #
  #   trypsin.digest('MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG')
  #   # => ['MIVIGR', 'SIVHPYITNEYEPFAAEK', 'QQILSIMAG']
  #
  # With 1 missed cleavage:
  #
  #   trypsin.digest('MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG', 1)
  #   # => ['MIVIGR','MIVIGRSIVHPYITNEYEPFAAEK','SIVHPYITNEYEPFAAEK',
  #   #     'SIVHPYITNEYEPFAAEKQQILSIMAG', 'QQILSIMAG']
  #
  # Return the start and end sites of digestion:
  #
  #   trypsin.site_digest('MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG', 1)
  #   # => [[0,6],[0,24],[6,24],[6,33],[24,33]]
  class Digester

    # The name of the digester
    attr_reader :name

    # A string of residues at which cleavage occurs
    attr_reader :cleave_str

    # A c-terminal restriction residue which prevents
    # cleavage at a potential cleavage site (optional).
    attr_reader :cterm_exception

    # True if cleavage occurs at the c-terminus of a
    # cleavage residue, false if cleavage occurs at
    # the n-terminus.
    attr_reader :cterm_cleavage

    MULTILINE_WHITESPACE = /\s*/m

    # name::            the name of the digester
    # cleave_str::      every character is a residue at which cleavage occurs
    # cterm_exception:: nil, an empty string (both meaning "none") or a single
    #                   residue; anything longer raises ArgumentError
    # cterm_cleavage::  see the attribute of the same name
    def initialize(name, cleave_str, cterm_exception=nil, cterm_cleavage=true)
      @name = name
      @cleave_str = cleave_str
      # alternation of the individual residues, e.g. 'KR' => /K|R/
      @cleave_regexp = Regexp.new(cleave_str.split('').join('|'))
      @cterm_exception =
        if cterm_exception.nil? || cterm_exception.empty?
          nil
        elsif cterm_exception.length == 1
          cterm_exception[0]
        else
          raise ArgumentError, "cterm exceptions must be a single residue: #{cterm_exception}"
        end

      @cterm_cleavage = cterm_cleavage
      @scanner = StringScanner.new('')
    end

    # Returns digestion sites in sequence, as determined by the
    # cleave_regexp boundaries.  The digestion sites correspond to the
    # positions where a peptide begins and ends, such that
    # [sites[n], sites[n+1] - sites[n]] corresponds to the [index, length]
    # for peptide n.
    #
    #   d = Digester.new('Trypsin', 'KR', 'P')
    #   seq = "AARGGR"
    #   sites = d.cleavage_sites(seq)              # => [0, 3, 6]
    #
    #   seq[sites[0], sites[0+1] - sites[0]]       # => "AAR"
    #   seq[sites[1], sites[1+1] - sites[1]]       # => "GGR"
    #
    # Trailing whitespace is included in the fragment.
    #
    #   seq = "AAR  \n  GGR"
    #   sites = d.cleavage_sites(seq)              # => [0, 8, 11]
    #
    #   seq[sites[0], sites[0+1] - sites[0]]       # => "AAR  \n  "
    #   seq[sites[1], sites[1+1] - sites[1]]       # => "GGR"
    #
    # The digested section of sequence may be specified using offset
    # and length.
    def cleavage_sites(seq, offset=0, length=seq.length-offset)
      return [0, 1] if seq.size == 1 # adding exceptions is lame--algorithm should just work

      # for n-terminal cleavage the site lies before the matched residue
      adjustment = cterm_cleavage ? 0 : 1
      limit = offset + length

      positions = [offset]
      last_pos = scan(seq, offset, limit) do |site|
        positions << (site - adjustment)
      end

      # add the final position
      if (last_pos < limit) || (positions.length == 1)
        positions << limit
      end
      # adding exceptions is lame.. this code probably needs to be
      # refactored (corrected).
      if !cterm_cleavage && last_pos == limit
        positions << limit
      end
      positions
    end

    # Returns digestion sites of sequence as [start_index, end_index] pairs,
    # allowing for missed cleavages.  Digestion sites are determined using
    # cleavage_sites; as in that method, the digested section of sequence
    # may be specified using offset and length.
    #
    # Each [start_index, end_index] pair is yielded to the block, if given,
    # and the collected results are returned.
    def site_digest(seq, max_misses=0, offset=0, length=seq.length-offset, &block) # :yields: start_index, end_index
      frag_sites = cleavage_sites(seq, offset, length)

      overlay(frag_sites.length, max_misses, 1) do |start_index, end_index|
        start_index = frag_sites[start_index]
        end_index = frag_sites[end_index]

        block ? block.call(start_index, end_index) : [start_index, end_index]
      end
    end

    # Returns an array of peptides produced by digesting sequence, allowing for
    # missed cleavage sites.  Digestion sites are determined using cleavage_sites;
    # as in that method, the digested section of sequence may be specified using
    # offset and length.
    def digest(seq, max_misses=0, offset=0, length=seq.length-offset)
      site_digest(seq, max_misses, offset, length).map do |s, e|
        seq[s, e-s]
      end
    end

    protected

    # The cleavage regexp used to identify cleavage sites
    attr_reader :cleave_regexp # :nodoc:

    # The scanner used to digest strings.
    attr_reader :scanner # :nodoc:

    # Scans seq between offset and limit for the cleave_regexp, skipping whitespace
    # and being mindful of exception characters.  The positions of the scanner at
    # each match are yielded to the block.  Returns the final scanner position.
    def scan(seq, offset, limit, &block) # :nodoc:
      scanner.string = seq
      scanner.pos = offset

      while scanner.search_full(cleave_regexp, true, false)
        # swallow whitespace following the residue so it stays with this fragment
        scanner.search_full(MULTILINE_WHITESPACE, true, false)
        pos = scanner.pos

        # skip if the next character is the exception character
        next if !cterm_exception.nil? && seq[pos] == cterm_exception

        # break if you scanned past the upper limit
        break if pos > limit

        block.call(pos)
      end

      scanner.pos
    end

    # Performs an overlap-collect algorithm providing the start and end
    # indices of spans skipping up to max_misses boundaries.
    def overlay(n, max_misses, offset, &block) # :nodoc:
      results = []
      0.upto(n-1) do |start_index|
        0.upto(max_misses) do |n_miss|
          end_index = start_index + offset + n_miss
          break if end_index == n

          results << block.call(start_index, end_index)
        end
      end
      results
    end

    #
    # Enzymes adapted from the default Mascot enzyme list.
    #

    class << self
      # takes the name of the enzyme in any case (symbol or string)
      # and accesses the constant (returns nil if none found)
      def [](enzyme_name)
        ENZYMES[ enzyme_name.to_s.downcase.gsub(/\W+/,'_').to_sym ]
      end

      # Utility method to parse a mascot enzyme configuration
      # string (tab separated) into a Digester.  The fields are: name, sense
      # ('C-Term' or 'N-Term'), cleavage residues, c-terminal exception
      # (may be empty), and two trailing fields that are not used here.
      # Raises ArgumentError for an unknown sense.
      def mascot_parse(str) # :nodoc:
        name, sense, cleave_str, cterm_exception = str.split(/ *\t */)
        cterm_cleavage = case sense
        when 'C-Term' then true
        when 'N-Term' then false
        else raise ArgumentError, "unknown sense: #{sense}"
        end

        new(name, cleave_str, cterm_exception, cterm_cleavage)
      end
    end

    # Tab separated mascot configuration strings (see mascot_parse), keyed by
    # the symbol under which the enzyme is reachable through Digester[].  The
    # tabs are written as explicit \t escapes so the field boundaries
    # (including empty exception fields) survive reformatting.
    #
    #   ENZYMES[:arg_c] = <'Arg-C' enzyme>
    MASCOT_ENZYME_CONFIG_STRINGS = {
      :arg_c         => "Arg-C\tC-Term\tR\tP\tno\tno",
      :asp_n         => "Asp-N\tN-Term\tBD\t\tno\tno",
      :asp_n_ambic   => "Asp-N_ambic\tN-Term\tDE\t\tno\tno",
      :chymotrypsin  => "Chymotrypsin\tC-Term\tFLWY\tP\tno\tno",
      :cnbr          => "CNBr\tC-Term\tM\t\tno\tno",
      :lys_c         => "Lys-C\tC-Term\tK\tP\tno\tno",
      :lys_c_p       => "Lys-C/P\tC-Term\tK\t\tno\tno",
      :pepsin_a      => "PepsinA\tC-Term\tFL\t\tno\tno",
      :tryp_cnbr     => "Tryp-CNBr\tC-Term\tKMR\tP\tno\tno",
      :tryp_chymo    => "TrypChymo\tC-Term\tFKLRWY\tP\tno\tno",
      :trypsin_p     => "Trypsin/P\tC-Term\tKR\t\tno\tno",
      :v8_de         => "V8-DE\tC-Term\tBDEZ\tP\tno\tno",
      :v8_e          => "V8-E\tC-Term\tEZ\tP\tno\tno",
      :trypsin       => "Trypsin\tC-Term\tKR\tP\tno\tno",
      :v8_e_trypsin  => "V8-E+Trypsin\tC-Term\tEKRZ\tP\tno\tno",
      :v8_de_trypsin => "V8-DE+Trypsin\tC-Term\tBDEKRZ\tP\tno\tno",
    }

    # One Digester per configuration string, under the same keys.
    ENZYMES = MASCOT_ENZYME_CONFIG_STRINGS.inject(Hash.new) do |hash,(k,v)|
      hash[k] = mascot_parse(v)
      hash
    end
  end
end
data/lib/ms/fasta.rb ADDED
@@ -0,0 +1,86 @@
1
+ require 'bio'
2
+ require 'stringio'
3
+
4
# Reopens Bio::FlatFile to mix in Enumerable, so that map, partition and the
# other Enumerable methods can be called directly on a flat file object.
class Bio::FlatFile
  include Enumerable
end
7
+
8
# Reopens Bio::FastaFormat to add two aliases:
#
#   entry.header    # same as entry.definition
#   entry.sequence  # same as entry.seq
class Bio::FastaFormat
  alias header definition
  alias sequence seq
end
12
+
13
module MS
  # A convenience class for working with fasta formatted sequence databases.
  # the file which includes this class also includes Enumerable with
  # Bio::FlatFile so you can do things like this:
  #
  #     accessions = MS::Fasta.open("file.fasta") do |fasta|
  #       fasta.map(&:accession)
  #     end
  #
  # A few aliases are added to Bio::FastaFormat
  #
  #     entry.header == entry.definition
  #     entry.sequence == entry.seq
  #
  # MS::Fasta.new accepts both an IO object or a String (a fasta formatted
  # string itself)
  #
  #     # taking an io object:
  #     File.open("file.fasta") do |io|
  #       fasta = MS::Fasta.new(io)
  #       ... do something with it
  #     end
  #     # taking a string
  #     string = ">id1 a simple header\nAAASDDEEEDDD\n>id2 header again\nPPPPPPWWWWWWTTTTYY\n"
  #     fasta = MS::Fasta.new(string)
  #     (simple, not_simple) = fasta.partition {|entry| entry.header =~ /simple/ }
  module Fasta

    # opens the flatfile and yields a Bio::FlatFile object
    def self.open(file, &block)
      Bio::FlatFile.open(Bio::FastaFormat, file, &block)
    end

    # yields each Bio::FastaFormat object in turn
    def self.foreach(file, &block)
      Bio::FlatFile.open(Bio::FastaFormat, file) do |fasta|
        fasta.each(&block)
      end
    end

    # takes an IO object or a string that is the fasta data itself
    def self.new(io)
      io = StringIO.new(io) if io.is_a?(String)
      Bio::FlatFile.new(Bio::FastaFormat, io)
    end

    # returns two hashes [id_to_length, id_to_description]
    # faster (~4x) than official route.
    #
    # The id is the run of non-whitespace characters directly after '>'; the
    # description is whatever follows the single space after the id (an empty
    # string when the header has no description).  Every other line adds its
    # chomped length to the most recent id.  Lines that appear before the
    # first header belong to no entry and are ignored.
    def self.protein_lengths_and_descriptions(file)
      # the description part is optional so that a bare ">id" line is still
      # recognized as a header instead of being counted as sequence
      header_re = /^>([^\s]+)(?: (.*))?/
      protid_to_length = {}
      protid_to_description = {}
      current_id = nil
      IO.foreach(file) do |line|
        line.chomp!
        if md = header_re.match(line)
          current_id = md[1]
          protid_to_length[current_id] = 0
          protid_to_description[current_id] = md[2].to_s
        elsif current_id
          protid_to_length[current_id] += line.size
        end
      end
      [protid_to_length, protid_to_description]
    end

  end
end