mspire 0.5.0 → 0.6.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (107) hide show
  1. data/README.rdoc +24 -0
  2. data/Rakefile +51 -0
  3. data/VERSION +1 -0
  4. data/lib/cv/description.rb +18 -0
  5. data/lib/cv/param.rb +33 -0
  6. data/lib/cv.rb +3 -0
  7. data/lib/io/bookmark.rb +13 -0
  8. data/lib/merge.rb +7 -0
  9. data/lib/ms/cvlist.rb +76 -0
  10. data/lib/ms/digester.rb +245 -0
  11. data/lib/ms/fasta.rb +86 -0
  12. data/lib/ms/ident/peptide/db.rb +243 -0
  13. data/lib/ms/ident/peptide.rb +72 -0
  14. data/lib/ms/ident/peptide_hit/qvalue.rb +56 -0
  15. data/lib/ms/ident/peptide_hit.rb +26 -0
  16. data/lib/ms/ident/pepxml/modifications.rb +83 -0
  17. data/lib/ms/ident/pepxml/msms_pipeline_analysis.rb +70 -0
  18. data/lib/ms/ident/pepxml/msms_run_summary.rb +82 -0
  19. data/lib/ms/ident/pepxml/parameters.rb +14 -0
  20. data/lib/ms/ident/pepxml/sample_enzyme.rb +165 -0
  21. data/lib/ms/ident/pepxml/search_database.rb +49 -0
  22. data/lib/ms/ident/pepxml/search_hit/modification_info.rb +79 -0
  23. data/lib/ms/ident/pepxml/search_hit.rb +144 -0
  24. data/lib/ms/ident/pepxml/search_result.rb +35 -0
  25. data/lib/ms/ident/pepxml/search_summary.rb +92 -0
  26. data/lib/ms/ident/pepxml/spectrum_query.rb +85 -0
  27. data/lib/ms/ident/pepxml.rb +112 -0
  28. data/lib/ms/ident/protein.rb +33 -0
  29. data/lib/ms/ident/protein_group.rb +80 -0
  30. data/lib/ms/ident/search.rb +114 -0
  31. data/lib/ms/ident.rb +37 -0
  32. data/lib/ms/isotope/aa.rb +59 -0
  33. data/lib/ms/mascot.rb +6 -0
  34. data/lib/ms/mass/aa.rb +79 -0
  35. data/lib/ms/mass.rb +55 -0
  36. data/lib/ms/mzml/index_list.rb +98 -0
  37. data/lib/ms/mzml/plms1.rb +34 -0
  38. data/lib/ms/mzml.rb +197 -0
  39. data/lib/ms/obo.rb +38 -0
  40. data/lib/ms/plms1.rb +156 -0
  41. data/lib/ms/quant/qspec/protein_group_comparison.rb +22 -0
  42. data/lib/ms/quant/qspec.rb +112 -0
  43. data/lib/ms/spectrum.rb +154 -8
  44. data/lib/ms.rb +3 -10
  45. data/lib/msplat.rb +2 -0
  46. data/lib/obo/ims.rb +5 -0
  47. data/lib/obo/ms.rb +7 -0
  48. data/lib/obo/ontology.rb +41 -0
  49. data/lib/obo/unit.rb +5 -0
  50. data/lib/openany.rb +23 -0
  51. data/lib/write_file_or_string.rb +18 -0
  52. data/obo/ims.obo +562 -0
  53. data/obo/ms.obo +11677 -0
  54. data/obo/unit.obo +2563 -0
  55. data/spec/ms/cvlist_spec.rb +60 -0
  56. data/spec/ms/digester_spec.rb +351 -0
  57. data/spec/ms/fasta_spec.rb +100 -0
  58. data/spec/ms/ident/peptide/db_spec.rb +108 -0
  59. data/spec/ms/ident/pepxml/sample_enzyme_spec.rb +181 -0
  60. data/spec/ms/ident/pepxml/search_hit/modification_info_spec.rb +37 -0
  61. data/spec/ms/ident/pepxml_spec.rb +442 -0
  62. data/spec/ms/ident/protein_group_spec.rb +68 -0
  63. data/spec/ms/mass_spec.rb +8 -0
  64. data/spec/ms/mzml/index_list_spec.rb +122 -0
  65. data/spec/ms/mzml/plms1_spec.rb +62 -0
  66. data/spec/ms/mzml_spec.rb +50 -0
  67. data/spec/ms/plms1_spec.rb +38 -0
  68. data/spec/ms/quant/qspec_spec.rb +25 -0
  69. data/spec/msplat_spec.rb +24 -0
  70. data/spec/obo_spec.rb +25 -0
  71. data/spec/spec_helper.rb +25 -0
  72. data/spec/testfiles/ms/ident/peptide/db/uni_11_sp_tr.fasta +69 -0
  73. data/spec/testfiles/ms/ident/peptide/db/uni_11_sp_tr.msd_clvg2.min_aaseq4.yml +728 -0
  74. data/spec/testfiles/ms/mzml/j24z.idx_comp.3.mzML +271 -0
  75. data/spec/testfiles/ms/mzml/openms.noidx_nocomp.12.mzML +330 -0
  76. data/spec/testfiles/ms/quant/kill_extra_tabs.rb +13 -0
  77. data/spec/testfiles/ms/quant/max_quant_output.provenance.txt +15 -0
  78. data/spec/testfiles/ms/quant/max_quant_output.txt +199 -0
  79. data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv +199 -0
  80. data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv_qspecgp +199 -0
  81. data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv_qspecgp.csv +199 -0
  82. data/spec/testfiles/ms/quant/pdcd5_final.txt +199 -0
  83. data/spec/testfiles/ms/quant/pdcd5_final.txt_qspecgp +0 -0
  84. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.CSV.csv +199 -0
  85. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.csv +199 -0
  86. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.csv +199 -0
  87. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv +199 -0
  88. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv_qspecgp +199 -0
  89. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv_qspecgp.csv +199 -0
  90. data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.txt +199 -0
  91. data/spec/testfiles/ms/quant/pdcd5_lfq_tabdel.txt +134 -0
  92. data/spec/testfiles/ms/quant/pdcd5_lfq_tabdel.txt_qspecgp +134 -0
  93. data/spec/testfiles/ms/quant/remove_rest_of_proteins.rb +13 -0
  94. data/spec/testfiles/ms/quant/unlog_transform.rb +13 -0
  95. data/spec/testfiles/plms1/output.key +0 -0
  96. metadata +157 -40
  97. data/README +0 -77
  98. data/changelog.txt +0 -196
  99. data/lib/ms/calc.rb +0 -32
  100. data/lib/ms/data/interleaved.rb +0 -60
  101. data/lib/ms/data/lazy_io.rb +0 -73
  102. data/lib/ms/data/lazy_string.rb +0 -15
  103. data/lib/ms/data/simple.rb +0 -59
  104. data/lib/ms/data/transposed.rb +0 -41
  105. data/lib/ms/data.rb +0 -57
  106. data/lib/ms/format/format_error.rb +0 -12
  107. data/lib/ms/support/binary_search.rb +0 -126
data/README.rdoc ADDED
@@ -0,0 +1,24 @@
1
+ = mspire
2
+
3
+ Tools for working with mass spectrometry data in ruby.
4
+
5
+ == Examples
6
+
7
+ === mzml
8
+
9
+ require 'ms/mzml'
10
+
11
+ MS::Mzml.open("somefile.mzml") do |mzml|
12
+ spectrum = mzml[0] # the first spectrum ( same as mzml.spectrum(0) )
13
+ spectrum = mzml["controllerType=0 controllerNumber=1 scan=2"] # query by id string
14
+ mzml.spectrum_from_scan_num(23) # raises ScanNumbersNotFound or ScanNumbersNotUnique errors if problems
15
+ end
16
+
17
+ require 'ms/mass/aa'
18
+
19
+ MS::Mass::AA::MONO['A'] # or access by symbol
20
+
21
+ == Copyright
22
+
23
+ See LICENSE (MIT)
24
+
data/Rakefile ADDED
@@ -0,0 +1,51 @@
1
require 'rubygems'
require 'rake'

# Gem specification and packaging tasks, provided by jeweler.
require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "mspire"
  gem.homepage = "http://github.com/princelab/mspire"
  gem.license = "MIT"
  gem.summary = %Q{mass spectrometry proteomics, lipidomics, and tools}
  gem.description = %Q{mass spectrometry proteomics, lipidomics, and tools, a rewrite of mspire, merging of ms-* gems}
  gem.email = "jtprince@gmail.com"
  gem.authors = ["John T. Prince", "Simon Chiang"]
  gem.add_dependency "nokogiri", "~> 1.5"
  gem.add_development_dependency "rspec", "~> 2.6"
  gem.add_development_dependency "jeweler", "~> 1.5.2"
  gem.add_development_dependency "rcov", ">= 0"
  gem.add_development_dependency "obo", ">= 0.1.0"
end
Jeweler::RubygemsDotOrgTasks.new

# Spec tasks.  (rspec/core/rake_task used to be required twice; once is enough.)
require 'rspec/core'
require 'rspec/core/rake_task'

# rake spec -- runs every *_spec.rb file under spec/
RSpec::Core::RakeTask.new(:spec) do |spec|
  spec.pattern = FileList['spec/**/*_spec.rb']
end

# rake rcov -- same specs, with the task's rcov flag switched on
RSpec::Core::RakeTask.new(:rcov) do |spec|
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end

#require 'rcov/rcovtask'
#Rcov::RcovTask.new do |spec|
#  spec.libs << 'spec'
#  spec.pattern = 'spec/**/*_spec.rb'
#  spec.verbose = true
#end

task :default => :spec

# rake rdoc -- API documentation built from the README and everything in lib/
require 'rdoc/task'
Rake::RDocTask.new do |rdoc|
  # strip the trailing newline of the VERSION file so it does not end up in
  # the documentation title
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "mspire #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.6.1
@@ -0,0 +1,18 @@
1
+
2
module CV
  # An Array whose elements are expected to respond to to_xml(xml)
  # (normally CV::Param objects).  It can be filled from constructor
  # arguments and/or from a block that is evaluated against the new object:
  #
  #   CV::Description.new do
  #     param cv_ref, accession, name, value
  #   end
  class Description < Array
    # Seeds the array with +args+ and, when a block is given, evaluates it
    # in the context of the new object so that +param+ (and any other Array
    # method) can be called without a receiver.
    def initialize(*args, &block)
      super(args)
      # instance_eval raises ArgumentError when handed a nil block, so the
      # block has to be optional here
      instance_eval(&block) if block
    end

    # pushes a CV::Param object onto the description array.  The arguments
    # are passed straight through to CV::Param.new.
    def param(*args)
      push CV::Param.new(*args)
    end

    # Calls to_xml(xml) on each element, in order.
    def to_xml(xml)
      each { |param| param.to_xml(xml) }
    end
  end
end
data/lib/cv/param.rb ADDED
@@ -0,0 +1,33 @@
1
+
2
module CV
  # A single controlled-vocabulary parameter: the CV it belongs to, its
  # accession, its name, an optional value and an optional unit.
  class Param
    attr_accessor :cv_ref, :accession, :name, :value
    # A valueless CV::Param object that describes the units being used
    attr_accessor :unit

    def initialize(cv_ref, accession, name, value=nil)
      @cv_ref, @accession, @name, @value = cv_ref, accession, name, value
    end

    # Emits this param by calling xml.<name>(attributes), where +name+
    # defaults to :cvParam.  The attribute hash always carries :cvRef,
    # :accession and :name; :value is added only when a value is set, and
    # the three :unit* keys only when a unit is set.
    def to_xml(xml, name=:cvParam)
      # the instance variable is @cv_ref (see initialize); reading the
      # misspelled @cvref always produced a nil cvRef attribute
      attributes = { :cvRef => @cv_ref, :accession => @accession, :name => @name }
      attributes[:value] = @value if @value
      if unit
        attributes.merge!( :unitCvRef => unit.cv_ref,
                           :unitAccession => unit.accession,
                           :unitName => unit.name )
      end
      xml.send(name, attributes)
    end

    # Two params are equal when +other+ is a CV::Param and cv_ref,
    # accession, name, value and unit all compare equal.
    def ==(other)
      return false unless other.is_a?(CV::Param)
      [:cv_ref, :accession, :name, :value, :unit].all? do |attribute|
        send(attribute) == other.send(attribute)
      end
    end
  end
end
33
+
data/lib/cv.rb ADDED
@@ -0,0 +1,3 @@
1
+
2
+ require 'cv/description'
3
+ require 'cv/param'
@@ -0,0 +1,13 @@
1
+
2
class IO
  # saves the position and returns to it after the block
  # is executed. Returns the block's reply. if rewind, io.rewind is called
  # before handing the io object to the block.
  #
  # The saved position is restored even when the block raises; the
  # exception then propagates to the caller.  If reading the position
  # itself fails, that error propagates and nothing is restored.
  def bookmark(rewind=false, &block)
    start = pos
    # +rewind+ the local is the flag; self.rewind is IO#rewind
    self.rewind if rewind
    block.call(self)
  ensure
    # start is still nil if reading the position raised above
    self.pos = start if start
  end
end
data/lib/merge.rb ADDED
@@ -0,0 +1,7 @@
1
module Merge
  # allows object attributes to be set from a hash: every key/value pair is
  # applied through the matching writer, i.e. send("<key>=", value).
  #
  # If a block is given it is called once afterwards and its result is
  # returned (nil is returned without a block).  The block receives the
  # result of +block_arg+ when the object has such a method.
  # NOTE(review): +block_arg+ is not defined in this module; presumably
  # including classes supply it -- confirm against the includers.
  # Objects without +block_arg+ have +self+ yielded instead of raising
  # NameError.
  def merge!(hash={}, &block)
    hash.each { |key, value| send("#{key}=", value) }
    return unless block
    # second argument: also find a private/protected block_arg
    block.call(respond_to?(:block_arg, true) ? block_arg : self)
  end
end
data/lib/ms/cvlist.rb ADDED
@@ -0,0 +1,76 @@
1
+
2
+ require 'cv'
3
+ require 'obo/ms'
4
+ require 'obo/ims'
5
+ require 'obo/unit'
6
+
7
module MS
  module CV
    # Maps the prefix of an accession ('MS', 'IMS', 'UO') to the id => name
    # lookup of the matching ontology.
    Obo = {
      'MS' => Obo::MS.id_to_name,
      'IMS' => Obo::IMS.id_to_name,
      'UO' => Obo::Unit.id_to_name,
    }

    class Param < ::CV::Param
      # takes a variety of arguments (acc = accession):
      #
      #     acc#
      #     acc#, value
      #     acc#, unit_acc# or CV::Param object
      #     acc#, value, unit_acc# or CV::Param object
      #     cvref, acc#, name
      #     cvref, acc#, name, value
      #     cvref, acc#, name, unit_acc# or CV::Param object
      #     cvref, acc#, name, value, unit_acc# or CV::Param object
      #
      # A trailing argument is taken as the unit when there is more than
      # one argument and the last one is a CV::Param or matches
      # /[A-Za-z]+:\d+/.  With one or two remaining arguments the cv_ref is
      # the text before the ':' of the accession and the name is looked up
      # in MS::CV::Obo.  Any other argument count leaves cv_ref, accession,
      # name and value nil.
      def initialize(*args)
        @unit =
          if args.size > 1 && unit_argument?(args.last)
            unit_arg = args.pop
            unit_arg.is_a?(::CV::Param) ? unit_arg : self.class.new(unit_arg)
          end
        (@cv_ref, @accession, @name, @value) =
          case args.size
          when 1..2 # accession number (maybe with value)
            accession = args.first
            obo_type = accession.split(':').first
            [obo_type, accession, MS::CV::Obo[obo_type][accession], args[1]]
          when 3..4 # they have fully specified the object
            args
          end
      end

      private

      # True if +arg+ is a CV::Param or looks like an accession.
      # Regexp#=== is false for values that cannot be matched (numbers,
      # nil, ...), whereas `23 =~ /re/` raises NoMethodError on Ruby >= 3.2,
      # where Object#=~ no longer exists.
      def unit_argument?(arg)
        arg.is_a?(::CV::Param) || /[A-Za-z]+:\d+/ === arg
      end
    end
  end

  # CVList.new( <CV::Param> )
  # CVList['MS:1000004', ['MS:1000004'], ['IMS:1000042', 23], param_obj, args]
  # CVList.new do
  #   param 'MS:1000004'
  #   param 'MS:1000042', 23
  # end
  class CVList < Array

    # ensures that each argument is an argument that can be handled by
    # CV::Param. Returns the CVList object it creates.  An Array argument
    # is splatted into a single param call.
    def self.[](*args)
      list = new
      args.each do |arg|
        arg.is_a?(Array) ? list.param(*arg) : list.param(arg)
      end
      list
    end

    # takes a list of valid CV::Param objects, or they can be set in the block
    # using param
    def initialize(*args, &block)
      args.each { |arg| param(arg) }
      instance_eval(&block) if block
    end

    # if the first object is a CV::Param it is just pushed onto the list,
    # otherwise the arguments are sent in to initialize a fresh MS::CV::Param,
    # and this object is pushed onto the list.
    def param(*args)
      push args.first.is_a?(::CV::Param) ? args.first : MS::CV::Param.new(*args)
    end
  end
end
@@ -0,0 +1,245 @@
1
+ require 'strscan'
2
+
3
module MS

  # A Digester splits a protein sequence into peptides at specified sites.
  #
  #   trypsin = MS::Digester[:trypsin]
  #
  #   trypsin.digest('MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG')
  #   # => ['MIVIGR', 'SIVHPYITNEYEPFAAEK', 'QQILSIMAG']
  #
  # With 1 missed cleavage:
  #
  #   trypsin.digest('MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG', 1)
  #   # => ['MIVIGR','MIVIGRSIVHPYITNEYEPFAAEK','SIVHPYITNEYEPFAAEK',
  #   #     'SIVHPYITNEYEPFAAEKQQILSIMAG', 'QQILSIMAG']
  #
  # Return the start and end sites of digestion:
  #
  #   trypsin.site_digest('MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG', 1)
  #   # => [[0,6],[0,24],[6,24],[6,33],[24,33]]
  class Digester

    # The name of the digester
    attr_reader :name

    # A string of residues at which cleavage occurs
    attr_reader :cleave_str

    # A c-terminal restriction residue which prevents
    # cleavage at a potential cleavage site (optional).
    attr_reader :cterm_exception

    # True if cleavage occurs at the c-terminus of a
    # cleavage residue, false if cleavage occurs at
    # the n-terminus.
    attr_reader :cterm_cleavage

    MULTILINE_WHITESPACE = /\s*/m

    # name::            the name of the digester
    # cleave_str::      every character is a residue at which cleavage occurs
    # cterm_exception:: nil, an empty string (both mean "no exception") or a
    #                   single residue; anything longer raises ArgumentError
    # cterm_cleavage::  see the attribute of the same name
    def initialize(name, cleave_str, cterm_exception=nil, cterm_cleavage=true)
      @name = name
      @cleave_str = cleave_str
      # one alternative per residue, e.g. 'KR' => /K|R/
      @cleave_regexp = Regexp.new(cleave_str.split('').join('|'))
      @cterm_exception =
        if cterm_exception.nil? || cterm_exception.empty?
          nil
        elsif cterm_exception.length == 1
          cterm_exception[0]
        else
          raise ArgumentError, "cterm exceptions must be a single residue: #{cterm_exception}"
        end

      @cterm_cleavage = cterm_cleavage
      # one scanner per digester, re-pointed at each sequence by #scan
      @scanner = StringScanner.new('')
    end

    # Returns digestion sites in sequence, as determined by the
    # cleave_regexp boundaries. The digestion sites correspond to the
    # positions where a peptide begins and ends, such that
    # [sites[n], sites[n+1] - sites[n]] corresponds to the [index, length]
    # for peptide n.
    #
    #   d = Digester.new('Trypsin', 'KR', 'P')
    #   seq = "AARGGR"
    #   sites = d.cleavage_sites(seq)                  # => [0, 3, 6]
    #
    #   seq[sites[0], sites[0+1] - sites[0]]           # => "AAR"
    #   seq[sites[1], sites[1+1] - sites[1]]           # => "GGR"
    #
    # Trailing whitespace is included in the fragment.
    #
    #   seq = "AAR  \n  GGR"
    #   sites = d.cleavage_sites(seq)                  # => [0, 8, 11]
    #
    #   seq[sites[0], sites[0+1] - sites[0]]           # => "AAR  \n  "
    #   seq[sites[1], sites[1+1] - sites[1]]           # => "GGR"
    #
    # The digested section of sequence may be specified using offset
    # and length.
    def cleavage_sites(seq, offset=0, length=seq.length-offset)
      return [0, 1] if seq.size == 1  # adding exceptions is lame--algorithm should just work

      # scan reports the position *after* the matched residue; n-terminal
      # cleavage cuts before it
      adjustment = cterm_cleavage ? 0 : 1
      limit = offset + length

      positions = [offset]
      last_pos = scan(seq, offset, limit) do |site|
        positions << (site - adjustment)
      end

      # add the final position
      if (last_pos < limit) || (positions.length == 1)
        positions << limit
      end
      # adding exceptions is lame.. this code probably needs to be
      # refactored (corrected).
      if !cterm_cleavage && last_pos == limit
        positions << limit
      end
      positions
    end

    # Returns digestion sites of sequence as [start_index, end_index] pairs,
    # allowing for missed cleavages.  Digestion sites are determined using
    # cleavage_sites; as in that method, the digested section of sequence
    # may be specified using offset and length.
    #
    # Each [start_index, end_index] pair is yielded to the block, if given,
    # and the collected results are returned.
    def site_digest(seq, max_misses=0, offset=0, length=seq.length-offset, &block) # :yields: start_index, end_index
      frag_sites = cleavage_sites(seq, offset, length)

      overlay(frag_sites.length, max_misses, 1) do |start_index, end_index|
        start_site = frag_sites[start_index]
        end_site = frag_sites[end_index]

        block ? block.call(start_site, end_site) : [start_site, end_site]
      end
    end

    # Returns an array of peptides produced by digesting sequence, allowing for
    # missed cleavage sites.  Digestion sites are determined using cleavage_sites;
    # as in that method, the digested section of sequence may be specified using
    # offset and length.
    def digest(seq, max_misses=0, offset=0, length=seq.length-offset)
      site_digest(seq, max_misses, offset, length).map do |start_site, end_site|
        seq[start_site, end_site - start_site]
      end
    end

    protected

    # The cleavage regexp used to identify cleavage sites
    attr_reader :cleave_regexp # :nodoc:

    # The scanner used to digest strings.
    attr_reader :scanner # :nodoc:

    # Scans seq between offset and limit for the cleave_regexp, skipping whitespace
    # and being mindful of exception characters.  The positions of the scanner at
    # each match are yielded to the block.  Returns the final scanner position.
    def scan(seq, offset, limit, &block) # :nodoc:
      scanner.string = seq
      scanner.pos = offset

      while scanner.search_full(cleave_regexp, true, false)
        # swallow whitespace following the residue so it stays with the
        # fragment that ends here
        scanner.search_full(MULTILINE_WHITESPACE, true, false)
        pos = scanner.pos

        # skip if the next character is the exception character
        next if cterm_exception && seq[pos] == cterm_exception

        # break if you scanned past the upper limit
        break if pos > limit

        block.call(pos)
      end

      scanner.pos
    end

    # Performs an overlap-collect algorithm providing the start and end
    # indices of spans skipping up to max_misses boundaries.
    def overlay(n, max_misses, offset, &block) # :nodoc:
      results = []
      0.upto(n-1) do |start_index|
        0.upto(max_misses) do |n_miss|
          end_index = start_index + offset + n_miss
          break if end_index == n

          results << block.call(start_index, end_index)
        end
      end
      results
    end

    #
    # Enzymes adapted from the default Mascot enzyme list.
    #

    class << self
      # takes the name of the enzyme in any case (symbol or string)
      # and accesses the constant (returns nil if none found)
      def [](enzyme_name)
        ENZYMES[ enzyme_name.to_s.downcase.gsub(/\W+/,'_').to_sym ]
      end

      # Utility method to parse a mascot enzyme configuration
      # string (tab separated) into a Digester.  The fields are: name,
      # sense ('C-Term' or 'N-Term'), cleavage residues, c-terminal
      # exception (may be empty) and two further fields that are not used.
      # Raises ArgumentError for any other sense.
      def mascot_parse(str) # :nodoc:
        name, sense, cleave_str, cterm_exception, _independent, _semi_specific = str.split(/ *\t */)
        cterm_cleavage = case sense
          when 'C-Term' then true
          when 'N-Term' then false
          else raise ArgumentError, "unknown sense: #{sense}"
          end

        new(name, cleave_str, cterm_exception, cterm_cleavage)
      end
    end

    # ARG_C = mascot_parse("Arg-C\tC-Term\tR\tP\tno\tno")
    # ENZYMES[:arg_c] = <'Arg-C' enzyme>
    #
    # The field separators are written as explicit \t escapes because
    # mascot_parse only splits on tab characters; an enzyme without a
    # c-terminal exception has an empty fourth field (two tabs in a row).
    MASCOT_ENZYME_CONFIG_STRINGS = {
      :arg_c         => "Arg-C\tC-Term\tR\tP\tno\tno",
      :asp_n         => "Asp-N\tN-Term\tBD\t\tno\tno",
      :asp_n_ambic   => "Asp-N_ambic\tN-Term\tDE\t\tno\tno",
      :chymotrypsin  => "Chymotrypsin\tC-Term\tFLWY\tP\tno\tno",
      :cnbr          => "CNBr\tC-Term\tM\t\tno\tno",
      :lys_c         => "Lys-C\tC-Term\tK\tP\tno\tno",
      :lys_c_p       => "Lys-C/P\tC-Term\tK\t\tno\tno",
      :pepsin_a      => "PepsinA\tC-Term\tFL\t\tno\tno",
      :tryp_cnbr     => "Tryp-CNBr\tC-Term\tKMR\tP\tno\tno",
      :tryp_chymo    => "TrypChymo\tC-Term\tFKLRWY\tP\tno\tno",
      :trypsin_p     => "Trypsin/P\tC-Term\tKR\t\tno\tno",
      :v8_de         => "V8-DE\tC-Term\tBDEZ\tP\tno\tno",
      :v8_e          => "V8-E\tC-Term\tEZ\tP\tno\tno",
      :trypsin       => "Trypsin\tC-Term\tKR\tP\tno\tno",
      :v8_e_trypsin  => "V8-E+Trypsin\tC-Term\tEKRZ\tP\tno\tno",
      :v8_de_trypsin => "V8-DE+Trypsin\tC-Term\tBDEKRZ\tP\tno\tno",
    }

    # enzyme key => Digester, built once when the class is loaded
    ENZYMES = MASCOT_ENZYME_CONFIG_STRINGS.inject(Hash.new) do |hash,(k,v)|
      hash[k] = mascot_parse(v)
      hash
    end
  end
end
data/lib/ms/fasta.rb ADDED
@@ -0,0 +1,86 @@
1
+ require 'bio'
2
+ require 'stringio'
3
+
4
# Mixes Enumerable into Bio::FlatFile (map, select, partition, ... then
# work through the class's own #each).
Bio::FlatFile.class_eval { include Enumerable }
7
+
8
# Adds two alternative names to Bio::FastaFormat entries:
# entry.header for entry.definition and entry.sequence for entry.seq.
class Bio::FastaFormat
  alias header definition
  alias sequence seq
end
12
+
13
module MS
  # A convenience class for working with fasta formatted sequence databases.
  # the file which includes this class also includes Enumerable with
  # Bio::FlatFile so you can do things like this:
  #
  #     accessions = MS::Fasta.open("file.fasta") do |fasta|
  #       fasta.map(&:accession)
  #     end
  #
  # A few aliases are added to Bio::FastaFormat
  #
  #     entry.header == entry.definition
  #     entry.sequence == entry.seq
  #
  # MS::Fasta.new accepts both an IO object or a String (a fasta formatted
  # string itself)
  #
  #     # taking an io object:
  #     File.open("file.fasta") do |io|
  #       fasta = MS::Fasta.new(io)
  #       ... do something with it
  #     end
  #     # taking a string
  #     string = ">id1 a simple header\nAAASDDEEEDDD\n>id2 header again\nPPPPPPWWWWWWTTTTYY\n"
  #     fasta = MS::Fasta.new(string)
  #     (simple, not_simple) = fasta.partition {|entry| entry.header =~ /simple/ }
  module Fasta

    # opens the flatfile and yields a Bio::FlatFile object
    def self.open(file, &block)
      Bio::FlatFile.open(Bio::FastaFormat, file, &block)
    end

    # yields each Bio::FastaFormat object in turn
    def self.foreach(file, &block)
      Bio::FlatFile.open(Bio::FastaFormat, file) do |fasta|
        fasta.each(&block)
      end
    end

    # takes an IO object or a string that is the fasta data itself
    def self.new(io)
      io = StringIO.new(io) if io.is_a?(String)
      Bio::FlatFile.new(Bio::FastaFormat, io)
    end

    # returns two hashes [id_to_length, id_to_description]
    # faster (~4x) than official route.
    #
    # The id is the header text between '>' and the first whitespace; the
    # description is whatever follows one separating space (an empty string
    # for a header that consists of the id only).  The length is the summed
    # size of the chomped lines up to the next header.  Lines that come
    # before the first header are ignored.
    def self.protein_lengths_and_descriptions(file)
      # the space is optional so that a bare ">id" line is still a header
      header_re = /^>([^\s]+) ?(.*)/
      id_to_length = {}
      id_to_description = {}
      current_id = nil
      # File.foreach rather than IO.foreach: +file+ is always meant to be a path
      File.foreach(file) do |line|
        line.chomp!
        if (md = header_re.match(line))
          current_id = md[1]
          id_to_length[current_id] = 0
          id_to_description[current_id] = md[2]
        elsif current_id
          id_to_length[current_id] += line.size
        end
      end
      [id_to_length, id_to_description]
    end

  end
end