RubyGems - mspire - Versions diffs - 0.5.0 → 0.6.1 - Mend

mspire 0.5.0 → 0.6.1

Files changed (107) hide show

data/README.rdoc +24 -0
data/Rakefile +51 -0
data/VERSION +1 -0
data/lib/cv/description.rb +18 -0
data/lib/cv/param.rb +33 -0
data/lib/cv.rb +3 -0
data/lib/io/bookmark.rb +13 -0
data/lib/merge.rb +7 -0
data/lib/ms/cvlist.rb +76 -0
data/lib/ms/digester.rb +245 -0
data/lib/ms/fasta.rb +86 -0
data/lib/ms/ident/peptide/db.rb +243 -0
data/lib/ms/ident/peptide.rb +72 -0
data/lib/ms/ident/peptide_hit/qvalue.rb +56 -0
data/lib/ms/ident/peptide_hit.rb +26 -0
data/lib/ms/ident/pepxml/modifications.rb +83 -0
data/lib/ms/ident/pepxml/msms_pipeline_analysis.rb +70 -0
data/lib/ms/ident/pepxml/msms_run_summary.rb +82 -0
data/lib/ms/ident/pepxml/parameters.rb +14 -0
data/lib/ms/ident/pepxml/sample_enzyme.rb +165 -0
data/lib/ms/ident/pepxml/search_database.rb +49 -0
data/lib/ms/ident/pepxml/search_hit/modification_info.rb +79 -0
data/lib/ms/ident/pepxml/search_hit.rb +144 -0
data/lib/ms/ident/pepxml/search_result.rb +35 -0
data/lib/ms/ident/pepxml/search_summary.rb +92 -0
data/lib/ms/ident/pepxml/spectrum_query.rb +85 -0
data/lib/ms/ident/pepxml.rb +112 -0
data/lib/ms/ident/protein.rb +33 -0
data/lib/ms/ident/protein_group.rb +80 -0
data/lib/ms/ident/search.rb +114 -0
data/lib/ms/ident.rb +37 -0
data/lib/ms/isotope/aa.rb +59 -0
data/lib/ms/mascot.rb +6 -0
data/lib/ms/mass/aa.rb +79 -0
data/lib/ms/mass.rb +55 -0
data/lib/ms/mzml/index_list.rb +98 -0
data/lib/ms/mzml/plms1.rb +34 -0
data/lib/ms/mzml.rb +197 -0
data/lib/ms/obo.rb +38 -0
data/lib/ms/plms1.rb +156 -0
data/lib/ms/quant/qspec/protein_group_comparison.rb +22 -0
data/lib/ms/quant/qspec.rb +112 -0
data/lib/ms/spectrum.rb +154 -8
data/lib/ms.rb +3 -10
data/lib/msplat.rb +2 -0
data/lib/obo/ims.rb +5 -0
data/lib/obo/ms.rb +7 -0
data/lib/obo/ontology.rb +41 -0
data/lib/obo/unit.rb +5 -0
data/lib/openany.rb +23 -0
data/lib/write_file_or_string.rb +18 -0
data/obo/ims.obo +562 -0
data/obo/ms.obo +11677 -0
data/obo/unit.obo +2563 -0
data/spec/ms/cvlist_spec.rb +60 -0
data/spec/ms/digester_spec.rb +351 -0
data/spec/ms/fasta_spec.rb +100 -0
data/spec/ms/ident/peptide/db_spec.rb +108 -0
data/spec/ms/ident/pepxml/sample_enzyme_spec.rb +181 -0
data/spec/ms/ident/pepxml/search_hit/modification_info_spec.rb +37 -0
data/spec/ms/ident/pepxml_spec.rb +442 -0
data/spec/ms/ident/protein_group_spec.rb +68 -0
data/spec/ms/mass_spec.rb +8 -0
data/spec/ms/mzml/index_list_spec.rb +122 -0
data/spec/ms/mzml/plms1_spec.rb +62 -0
data/spec/ms/mzml_spec.rb +50 -0
data/spec/ms/plms1_spec.rb +38 -0
data/spec/ms/quant/qspec_spec.rb +25 -0
data/spec/msplat_spec.rb +24 -0
data/spec/obo_spec.rb +25 -0
data/spec/spec_helper.rb +25 -0
data/spec/testfiles/ms/ident/peptide/db/uni_11_sp_tr.fasta +69 -0
data/spec/testfiles/ms/ident/peptide/db/uni_11_sp_tr.msd_clvg2.min_aaseq4.yml +728 -0
data/spec/testfiles/ms/mzml/j24z.idx_comp.3.mzML +271 -0
data/spec/testfiles/ms/mzml/openms.noidx_nocomp.12.mzML +330 -0
data/spec/testfiles/ms/quant/kill_extra_tabs.rb +13 -0
data/spec/testfiles/ms/quant/max_quant_output.provenance.txt +15 -0
data/spec/testfiles/ms/quant/max_quant_output.txt +199 -0
data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv +199 -0
data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv_qspecgp +199 -0
data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv_qspecgp.csv +199 -0
data/spec/testfiles/ms/quant/pdcd5_final.txt +199 -0
data/spec/testfiles/ms/quant/pdcd5_final.txt_qspecgp +0 -0
data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.CSV.csv +199 -0
data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.csv +199 -0
data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.csv +199 -0
data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv +199 -0
data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv_qspecgp +199 -0
data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv_qspecgp.csv +199 -0
data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.txt +199 -0
data/spec/testfiles/ms/quant/pdcd5_lfq_tabdel.txt +134 -0
data/spec/testfiles/ms/quant/pdcd5_lfq_tabdel.txt_qspecgp +134 -0
data/spec/testfiles/ms/quant/remove_rest_of_proteins.rb +13 -0
data/spec/testfiles/ms/quant/unlog_transform.rb +13 -0
data/spec/testfiles/plms1/output.key +0 -0
metadata +157 -40
data/README +0 -77
data/changelog.txt +0 -196
data/lib/ms/calc.rb +0 -32
data/lib/ms/data/interleaved.rb +0 -60
data/lib/ms/data/lazy_io.rb +0 -73
data/lib/ms/data/lazy_string.rb +0 -15
data/lib/ms/data/simple.rb +0 -59
data/lib/ms/data/transposed.rb +0 -41
data/lib/ms/data.rb +0 -57
data/lib/ms/format/format_error.rb +0 -12
data/lib/ms/support/binary_search.rb +0 -126

data/README.rdoc ADDED Viewed

@@ -0,0 +1,24 @@
+= mspire
+Tools for working with mass spectrometry data in ruby.
+== Examples
+=== mzml
+    require 'ms/mzml'
+    MS::Mzml.open("somefile.mzml") do |mzml|
+      spectrum = mzml[0]   # the first spectrum ( same as mzml.spectrum(0) )
+      spectrum = mzml["controllerType=0 controllerNumber=1 scan=2"]  # query by id string
+      mzml.spectrum_from_scan_num(23) # raises ScanNumbersNotFound or ScanNumbersNotUnique errors if problems
+    end
+    require 'ms/mass/aa'
+    MS::Mass::AA::MONO['A'] # or access by symbol
+== Copyright
+See LICENSE (MIT)

data/Rakefile ADDED Viewed

@@ -0,0 +1,51 @@
+require 'rubygems'
+require 'rake'
+require 'rspec/core/rake_task'
+require 'jeweler'
+Jeweler::Tasks.new do |gem|
+  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
+  gem.name = "mspire"
+  gem.homepage = "http://github.com/princelab/mspire"
+  gem.license = "MIT"
+  gem.summary = %Q{mass spectrometry proteomics, lipidomics, and tools}
+  gem.description = %Q{mass spectrometry proteomics, lipidomics, and tools, a rewrite of mspire, merging of ms-* gems}
+  gem.email = "jtprince@gmail.com"
+  gem.authors = ["John T. Prince", "Simon Chiang"]
+  gem.add_dependency "nokogiri", "~> 1.5"
+  gem.add_development_dependency "rspec", "~> 2.6"
+  gem.add_development_dependency "jeweler", "~> 1.5.2"
+  gem.add_development_dependency "rcov", ">= 0"
+  gem.add_development_dependency "obo", ">= 0.1.0"
+end
+Jeweler::RubygemsDotOrgTasks.new
+require 'rspec/core'
+require 'rspec/core/rake_task'
+RSpec::Core::RakeTask.new(:spec) do |spec|
+  spec.pattern = FileList['spec/**/*_spec.rb']
+end
+RSpec::Core::RakeTask.new(:rcov) do |spec|
+  spec.pattern = 'spec/**/*_spec.rb'
+  spec.rcov = true
+end
+#require 'rcov/rcovtask'
+#Rcov::RcovTask.new do |spec|
+#  spec.libs << 'spec'
+#  spec.pattern = 'spec/**/*_spec.rb'
+#  spec.verbose = true
+#end
+task :default => :spec
+require 'rdoc/task'
+Rake::RDocTask.new do |rdoc|
+  version = File.exist?('VERSION') ? File.read('VERSION') : ""
+  rdoc.rdoc_dir = 'rdoc'
+  rdoc.title = "mspire #{version}"
+  rdoc.rdoc_files.include('README*')
+  rdoc.rdoc_files.include('lib/**/*.rb')
+end

data/VERSION ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0.6.1

data/lib/cv/description.rb ADDED Viewed

@@ -0,0 +1,18 @@
+module CV
+  class Description < Array
+    def initialize(*args, &block)
+      super(args)
+      self.instance_eval &block
+    end
+    # pushes a CV::Param object onto the description array
+    def param(*args)
+      push CV::Param.new(*args)
+    end
+    def to_xml(xml)
+      each {|param| param.to_xml(xml) }
+    end
+  end
+end

data/lib/cv/param.rb ADDED Viewed

@@ -0,0 +1,33 @@
+module CV
+  class Param
+    attr_accessor :cv_ref, :accession, :name, :value
+    # A valueless CV::Param object that describes the units being used
+    attr_accessor :unit
+    def initialize(cv_ref, accession, name, value=nil)
+      (@cv_ref, @accession, @name, @value) = [cv_ref, accession, name, value]
+    end
+    def to_xml(xml, name=:cvParam)
+      hash_to_send = {:cvRef => @cvref, :accession => @accession, :name => @name}
+      hash_to_send[:value] = @value if @value
+      if unit
+        hash_to_send.merge!( { :unitCvRef => unit.cv_ref,
+                            :unitAccession => unit.accession,
+                            :unitName => unit.name } )
+      end
+      xml.send(name, hash_to_send)
+    end
+    def ==(other)
+      if !other.nil? && other.is_a?(CV::Param)
+        [:cv_ref, :accession, :name, :value, :unit].inject(true) do |bool, mthd|
+          bool && (self.send(mthd) == other.send(mthd))
+        end
+      else ; false
+      end
+    end
+  end
+end

data/lib/cv.rb ADDED Viewed

@@ -0,0 +1,3 @@
+require 'cv/description'
+require 'cv/param'

data/lib/io/bookmark.rb ADDED Viewed

@@ -0,0 +1,13 @@
+class IO
+  # saves the position and returns to it after the block
+  # is executed. Returns the block's reply.  if rewind, io.rewind is called
+  # before handing the io object to the block.
+  def bookmark(rewind=false, &block)
+    start = self.pos
+    self.rewind if rewind
+    reply = block.call(self)
+    self.pos = start
+    reply
+  end
+end

data/lib/merge.rb ADDED Viewed

@@ -0,0 +1,7 @@
+module Merge
+  # allows object attributes to be set from a hash
+  #
+  # For every key/value pair in hash the writer method "key=" is called on
+  # the receiver with the value.  If a block is given, it is then called
+  # with the result of block_arg.
+  #
+  # NOTE(review): block_arg is not defined in this module -- presumably the
+  # including class is expected to provide it; confirm against the includers.
+  def merge!(hash={}, &block)
+    hash.each {|k,v| send("#{k}=",v) }
+    block.call(block_arg) if block
+  end
+end

data/lib/ms/cvlist.rb ADDED Viewed

@@ -0,0 +1,76 @@
+require 'cv'
+require 'obo/ms'
+require 'obo/ims'
+require 'obo/unit'
+module MS
+  module CV
+    Obo = {
+      'MS' => Obo::MS.id_to_name,
+      'IMS' => Obo::IMS.id_to_name,
+      'UO' => Obo::Unit.id_to_name,
+    }
+    class Param < ::CV::Param
+      # takes a variety of arguments (acc = accession):
+      #
+      #     acc#
+      #     acc#, value
+      #     acc#, unit_acc# or CV::Param object
+      #     acc#, value, unit_acc# or CV::Param object
+      #     cvref, acc#, name
+      #     cvref, acc#, name, value
+      #     cvref, acc#, name, unit_acc# or CV::Param object
+      #     cvref, acc#, name, value, unit_acc# or CV::Param object
+      def initialize(*args)
+        @unit =
+          if args.size > 1 && ((args.last.is_a?(::CV::Param) || args.last =~ /[A-Za-z]+:\d+/))
+            unit_arg = args.pop
+            unit_arg.is_a?(::CV::Param) ? unit_arg : self.class.new(unit_arg)
+          end
+        (@cv_ref, @accession, @name, @value) =
+          case args.size
+          when 1..2  # accession number (maybe with value)
+            (obo_type, accnum) = args.first.split(':')
+            [obo_type, args.first, MS::CV::Obo[obo_type][args.first], args[1]]
+          when 3..4  # they have fully specified the object
+            args
+          end
+      end
+    end
+  end
+  #     CVList.new( <CV::Param> )
+  #     CVList['MS:1000004', ['MS:1000004'], ['IMS:1000042', 23], param_obj, args]
+  #     CVList.new do
+  #       param MS:1000004
+  #       param MS:1000042, 23
+  #     end
+  class CVList < Array
+    # ensures that each argument is an argument that can be handled by
+    # CV::Param. Returns the CVList object it creates
+    def self.[](*args)
+      list = self.new
+      args.each do |arg|
+        arg.is_a?(Array) ? list.param(*arg) : list.param(arg)
+      end
+      list
+    end
+    # takes a list of valid CV::Param objects, or they can be set in the block
+    # using param
+    def initialize(*args, &block)
+      args.each {|arg| param(arg) }
+      instance_eval &block if block
+    end
+    # if the first object is a MS::CV::Param it is just pushed onto the list,
+    # otherwise the arguments are sent in to initialize a fresh MS::CV::Param,
+    # and this object is pushed onto the list.
+    def param(*args)
+      push args.first.is_a?(::CV::Param) ? args.first : MS::CV::Param.new(*args)
+    end
+  end
+end

data/lib/ms/digester.rb ADDED Viewed

@@ -0,0 +1,245 @@
+require 'strscan'
+module MS
+  # A Digester splits a protein sequence into peptides at specified sites.
+  #
+  #     trypsin = MS::Digester[:trypsin]
+  #
+  #     trypsin.digest('MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG')
+  #     # => ['MIVIGR', 'SIVHPYITNEYEPFAAEK', 'QQILSIMAG']
+  #
+  # With 1 missed cleavage:
+  #
+  #     trypsin.digest('MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG', 1)
+  #     # => ['MIVIGR','MIVIGRSIVHPYITNEYEPFAAEK','SIVHPYITNEYEPFAAEK',
+  #     #     'SIVHPYITNEYEPFAAEKQQILSIMAG', 'QQILSIMAG']
+  #
+  # Return the start and end sites of digestion:
+  #
+  #   trypsin.site_digest('MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG', 1)
+  #   # => [[0,6],[0,24],[6,24],[6,33],[24,33]]
+  class Digester
+    # The name of the digester
+    attr_reader :name
+    # A string of residues at which cleavage occurs
+    attr_reader :cleave_str
+    # A c-terminal restriction residue which prevents
+    # cleavage at a potential cleavage site (optional).
+    attr_reader :cterm_exception
+    # True if cleavage occurs at the c-terminus of a
+    # cleavage residue, false if cleavage occurs at
+    # the n-terminus.
+    attr_reader :cterm_cleavage
+    MULTILINE_WHITESPACE = /\s*/m
+    def initialize(name, cleave_str, cterm_exception=nil, cterm_cleavage=true)
+      regexp = []
+      0.upto(cleave_str.length - 1) {|i| regexp << cleave_str[i, 1] }
+      @name = name
+      @cleave_str = cleave_str
+      @cleave_regexp = Regexp.new(regexp.join('|'))
+      @cterm_exception = case
+                         when cterm_exception == nil || cterm_exception.empty? then nil
+                         when cterm_exception.length == 1 then cterm_exception[0]
+                         else
+                           raise ArgumentError, "cterm exceptions must be a single residue: #{cterm_exception}"
+                         end
+      @cterm_cleavage = cterm_cleavage
+      @scanner = StringScanner.new('')
+    end
+    # Returns digestion sites in sequence, as determined by the
+    # cleave_regexp boundaries.  The digestion sites correspond to the
+    # positions where a peptide begins and ends, such that [n, (n+1) - n]
+    # corresponds to the [index, length] for peptide n.
+    #
+    #   d = Digester.new('Trypsin', 'KR', 'P')
+    #   seq = "AARGGR"
+    #   sites = d.cleavage_sites(seq)                 # => [0, 3, 6]
+    #
+    #   seq[sites[0], sites[0+1] - sites[0]]          # => "AAR"
+    #   seq[sites[1], sites[1+1] - sites[1]]          # => "GGR"
+    #
+    # Trailing whitespace is included in the fragment.
+    #
+    #   seq = "AAR  \n  GGR"
+    #   sites = d.cleavage_sites(seq)                 # => [0, 8, 11]
+    #
+    #   seq[sites[0], sites[0+1] - sites[0]]          # => "AAR  \n  "
+    #   seq[sites[1], sites[1+1] - sites[1]]          # => "GGR"
+    #
+    # The digested section of sequence may be specified using offset
+    # and length.
+    def cleavage_sites(seq, offset=0, length=seq.length-offset)
+      return [0, 1] if seq.size == 1  # adding exceptions is lame--algorithm should just work
+      adjustment = cterm_cleavage ? 0 : 1
+      limit = offset + length
+      positions = [offset]
+      pos = scan(seq, offset, limit) do |pos|
+        positions << (pos - adjustment)
+      end
+      # add the final position
+      if (pos < limit) || (positions.length == 1)
+        positions << limit
+      end
+      # adding exceptions is lame.. this code probably needs to be
+      # refactored (corrected).
+      if !cterm_cleavage && pos == limit
+        positions << limit
+      end
+      positions
+    end
+    # Returns digestion sites of sequence as [start_index, end_index] pairs,
+    # allowing for missed cleavages.  Digestion sites are determined using
+    # cleavage_sites; as in that method, the digested section of sequence
+    # may be specified using offset and length.
+    #
+    # Each [start_index, end_index] pair is yielded to the block, if given,
+    # and the collected results are returned.
+    def site_digest(seq, max_misses=0, offset=0, length=seq.length-offset, &block) # :yields: start_index, end_index
+      frag_sites = cleavage_sites(seq, offset, length)
+      overlay(frag_sites.length, max_misses, 1) do |start_index, end_index|
+        start_index = frag_sites[start_index]
+        end_index = frag_sites[end_index]
+        block ? block.call(start_index, end_index) : [start_index, end_index]
+      end
+    end
+    # Returns an array of peptides produced by digesting sequence, allowing for
+    # missed cleavage sites. Digestion sites are determined using cleavage_sites;
+    # as in that method, the digested section of sequence may be specified using
+    # offset and length.
+    def digest(seq, max_misses=0, offset=0, length=seq.length-offset)
+      site_digest(seq, max_misses, offset, length).map do |s, e|
+        seq[s, e-s]
+      end
+    end
+    protected
+    # The cleavage regexp used to identify cleavage sites
+    attr_reader :cleave_regexp # :nodoc:
+    # The scanner used to digest strings.
+    attr_reader :scanner # :nodoc:
+    # Scans seq between offset and limit for the cleave_regexp, skipping whitespace
+    # and being mindful of exception characters. The positions of the scanner at
+    # each match are yielded to the block.
+    def scan(seq, offset, limit, &block) # :nodoc:
+      scanner.string = seq
+      scanner.pos = offset
+      while scanner.search_full(cleave_regexp, true, false)
+        scanner.search_full(MULTILINE_WHITESPACE, true, false)
+        pos = scanner.pos
+        # skip if the next character is the exception character
+        next if cterm_exception != nil && seq[pos] == cterm_exception
+        # break if you scanned past the upper limit
+        break if pos > limit
+        block.call(pos)
+      end
+      scanner.pos
+    end
+    # Performs an overlap-collect algorithm providing the start and end
+    # indices of spans skipping up to max_misses boundaries.
+    def overlay(n, max_misses, offset, &block) # :nodoc:
+      results = []
+      0.upto(n-1) do |start_index|
+        0.upto(max_misses) do |n_miss|
+          end_index = start_index + offset + n_miss
+          break if end_index == n
+          results << block.call(start_index, end_index)
+        end
+      end
+      results
+    end
+    #
+    # Enzymes adapted from the default Mascot enzyme list.
+    #
+    class << self
+      # takes the name of the enzyme in any case (symbol or string)
+      # and accesses the constant (returns nil if none found)
+      def [](enzyme_name)
+        ENZYMES[ enzyme_name.to_s.downcase.gsub(/\W+/,'_').to_sym ]
+      end
+      # Utility method to parse a mascot enzyme configuration
+      # string (tab separated) into a Digester.
+      def mascot_parse(str) # :nodoc:
+        name, sense, cleave_str, cterm_exception, independent, semi_specific = str.split(/ *\t */)
+        cterm_cleavage = case sense
+                         when 'C-Term' then true
+                         when 'N-Term' then false
+                         else raise ArgumentError, "unknown sense: #{sense}"
+                         end
+        new(name, cleave_str, cterm_exception, cterm_cleavage)
+      end
+    end
+    # ARG_C = mascot_parse('Arg-C 	C-Term 	R 	P 	 no 	 no')
+    # ENZYMES[:arg_c] = <'Arg-C' enzyme>
+    MASCOT_ENZYME_CONFIG_STRINGS = {
+      :arg_c => 'Arg-C 	C-Term 	R 	P 	 no 	 no',
+      :asp_n => 'Asp-N 	N-Term 	BD 	  	no 	no',
+      :asp_n_ambic => 'Asp-N_ambic 	N-Term 	DE 	  	no 	no',
+      :chymotrypsin => 'Chymotrypsin 	C-Term 	FLWY 	P 	no 	no',
+      :cnbr => 'CNBr 	C-Term 	M 	  	no 	no',
+      :lys_c => 'Lys-C 	C-Term 	K 	P 	no 	no',
+      :lys_c_p => 'Lys-C/P 	C-Term 	K 	  	no 	no',
+      :pepsin_a => 'PepsinA 	C-Term 	FL 	  	no 	no',
+      :tryp_cnbr => 'Tryp-CNBr 	C-Term 	KMR 	P 	no 	no',
+      :tryp_chymo => 'TrypChymo 	C-Term 	FKLRWY 	P 	no 	no',
+      :trypsin_p => 'Trypsin/P 	C-Term 	KR 	  	no 	no',
+      :v8_de => 'V8-DE 	C-Term 	BDEZ 	P 	no 	no',
+      :v8_e => 'V8-E 	C-Term 	EZ 	P 	no 	no',
+      :trypsin => 'Trypsin 	C-Term	KR 	P 	no 	no',
+      :v8_e_trypsin => 'V8-E+Trypsin 	C-Term 	EKRZ 	P 	no 	no',
+      :v8_de_trypsin => 'V8-DE+Trypsin 	C-Term 	BDEKRZ 	P 	no 	no',
+      :arg_c => 'Arg-C 	C-Term 	R 	P 	 no 	 no',
+      :asp_n => 'Asp-N 	N-Term 	BD 	  	no 	no',
+      :asp_n_ambic => 'Asp-N_ambic 	N-Term 	DE 	  	no 	no',
+      :chymotrypsin => 'Chymotrypsin 	C-Term 	FLWY 	P 	no 	no',
+      :cnbr => 'CNBr 	C-Term 	M 	  	no 	no',
+      :lys_c => 'Lys-C 	C-Term 	K 	P 	no 	no',
+      :lys_c_p => 'Lys-C/P 	C-Term 	K 	  	no 	no',
+      :pepsin_a => 'PepsinA 	C-Term 	FL 	  	no 	no',
+      :tryp_cnbr => 'Tryp-CNBr 	C-Term 	KMR 	P 	no 	no',
+      :tryp_chymo => 'TrypChymo 	C-Term 	FKLRWY 	P 	no 	no',
+      :trypsin_p => 'Trypsin/P 	C-Term 	KR 	  	no 	no',
+      :v8_de => 'V8-DE 	C-Term 	BDEZ 	P 	no 	no',
+      :v8_e => 'V8-E 	C-Term 	EZ 	P 	no 	no',
+      :trypsin => 'Trypsin 	C-Term	KR 	P 	no 	no',
+      :v8_e_trypsin => 'V8-E+Trypsin 	C-Term 	EKRZ 	P 	no 	no',
+      :v8_de_trypsin => 'V8-DE+Trypsin 	C-Term 	BDEKRZ 	P 	no 	no',
+    }
+    ENZYMES = MASCOT_ENZYME_CONFIG_STRINGS.inject(Hash.new) do |hash,(k,v)|
+      hash[k] = mascot_parse(v)
+      hash
+    end
+  end
+end

data/lib/ms/fasta.rb ADDED Viewed

@@ -0,0 +1,86 @@
+require 'bio'
+require 'stringio'
+class Bio::FlatFile
+  include Enumerable
+end
+class Bio::FastaFormat
+  alias_method :header, :definition
+  alias_method :sequence, :seq
+end
+module MS
+  # A convenience module for working with fasta formatted sequence databases.
+  # Requiring this file also mixes Enumerable into Bio::FlatFile, so you can
+  # do things like this:
+  #
+  #     accessions = MS::Fasta.open("file.fasta") do |fasta|
+  #       fasta.map(&:accession)
+  #     end
+  #
+  # A few aliases are added to Bio::FastaFormat
+  #
+  #     entry.header == entry.definition
+  #     entry.sequence == entry.seq
+  #
+  # MS::Fasta.new accepts either an IO object or a String (a fasta formatted
+  # string itself)
+  #
+  #     # taking an io object:
+  #     File.open("file.fasta") do |io|
+  #       fasta = MS::Fasta.new(io)
+  #       ... do something with it
+  #     end
+  #     # taking a string
+  #     string = ">id1 a simple header\nAAASDDEEEDDD\n>id2 header again\nPPPPPPWWWWWWTTTTYY\n"
+  #     fasta = MS::Fasta.new(string)
+  #     (simple, not_simple) = fasta.partition {|entry| entry.header =~ /simple/ }
+  module Fasta
+    # opens the flatfile and yields a Bio::FlatFile object
+    def self.open(file, &block)
+      Bio::FlatFile.open(Bio::FastaFormat, file, &block)
+    end
+    # yields each Bio::FastaFormat object in turn
+    def self.foreach(file, &block)
+      Bio::FlatFile.open(Bio::FastaFormat, file) do |fasta|
+        fasta.each(&block)
+      end
+    end
+    # takes an IO object or a string that is the fasta data itself
+    def self.new(io)
+      io = StringIO.new(io) if io.is_a?(String)
+      Bio::FlatFile.new(Bio::FastaFormat, io)
+    end
+    # returns two hashes [id_to_length, id_to_description]
+    # faster (~4x) than official route.
+    def self.protein_lengths_and_descriptions(file)
+      protid_to_description = {}
+      protid_to_length = {}
+      re = /^>([^\s]+) (.*)/
+        ids = []
+      lengths = []
+      current_length = nil
+      IO.foreach(file) do |line|
+        line.chomp!
+        if md=re.match(line)
+          lengths << current_length
+          current_id = md[1]
+          ids << current_id
+          current_length = 0
+          protid_to_description[current_id] = md[2]
+        else
+          current_length += line.size
+        end
+      end
+      lengths << current_length
+      lengths.shift # remove the first nil entry
+      [Hash[ids.zip(lengths).to_a], protid_to_description]
+    end
+  end
+end