RubyGems - mspire - Versions diffs - 0.5.0 → 0.6.1 - Mend

mspire 0.5.0 → 0.6.1

Files changed (107) hide show

data/README.rdoc +24 -0
data/Rakefile +51 -0
data/VERSION +1 -0
data/lib/cv/description.rb +18 -0
data/lib/cv/param.rb +33 -0
data/lib/cv.rb +3 -0
data/lib/io/bookmark.rb +13 -0
data/lib/merge.rb +7 -0
data/lib/ms/cvlist.rb +76 -0
data/lib/ms/digester.rb +245 -0
data/lib/ms/fasta.rb +86 -0
data/lib/ms/ident/peptide/db.rb +243 -0
data/lib/ms/ident/peptide.rb +72 -0
data/lib/ms/ident/peptide_hit/qvalue.rb +56 -0
data/lib/ms/ident/peptide_hit.rb +26 -0
data/lib/ms/ident/pepxml/modifications.rb +83 -0
data/lib/ms/ident/pepxml/msms_pipeline_analysis.rb +70 -0
data/lib/ms/ident/pepxml/msms_run_summary.rb +82 -0
data/lib/ms/ident/pepxml/parameters.rb +14 -0
data/lib/ms/ident/pepxml/sample_enzyme.rb +165 -0
data/lib/ms/ident/pepxml/search_database.rb +49 -0
data/lib/ms/ident/pepxml/search_hit/modification_info.rb +79 -0
data/lib/ms/ident/pepxml/search_hit.rb +144 -0
data/lib/ms/ident/pepxml/search_result.rb +35 -0
data/lib/ms/ident/pepxml/search_summary.rb +92 -0
data/lib/ms/ident/pepxml/spectrum_query.rb +85 -0
data/lib/ms/ident/pepxml.rb +112 -0
data/lib/ms/ident/protein.rb +33 -0
data/lib/ms/ident/protein_group.rb +80 -0
data/lib/ms/ident/search.rb +114 -0
data/lib/ms/ident.rb +37 -0
data/lib/ms/isotope/aa.rb +59 -0
data/lib/ms/mascot.rb +6 -0
data/lib/ms/mass/aa.rb +79 -0
data/lib/ms/mass.rb +55 -0
data/lib/ms/mzml/index_list.rb +98 -0
data/lib/ms/mzml/plms1.rb +34 -0
data/lib/ms/mzml.rb +197 -0
data/lib/ms/obo.rb +38 -0
data/lib/ms/plms1.rb +156 -0
data/lib/ms/quant/qspec/protein_group_comparison.rb +22 -0
data/lib/ms/quant/qspec.rb +112 -0
data/lib/ms/spectrum.rb +154 -8
data/lib/ms.rb +3 -10
data/lib/msplat.rb +2 -0
data/lib/obo/ims.rb +5 -0
data/lib/obo/ms.rb +7 -0
data/lib/obo/ontology.rb +41 -0
data/lib/obo/unit.rb +5 -0
data/lib/openany.rb +23 -0
data/lib/write_file_or_string.rb +18 -0
data/obo/ims.obo +562 -0
data/obo/ms.obo +11677 -0
data/obo/unit.obo +2563 -0
data/spec/ms/cvlist_spec.rb +60 -0
data/spec/ms/digester_spec.rb +351 -0
data/spec/ms/fasta_spec.rb +100 -0
data/spec/ms/ident/peptide/db_spec.rb +108 -0
data/spec/ms/ident/pepxml/sample_enzyme_spec.rb +181 -0
data/spec/ms/ident/pepxml/search_hit/modification_info_spec.rb +37 -0
data/spec/ms/ident/pepxml_spec.rb +442 -0
data/spec/ms/ident/protein_group_spec.rb +68 -0
data/spec/ms/mass_spec.rb +8 -0
data/spec/ms/mzml/index_list_spec.rb +122 -0
data/spec/ms/mzml/plms1_spec.rb +62 -0
data/spec/ms/mzml_spec.rb +50 -0
data/spec/ms/plms1_spec.rb +38 -0
data/spec/ms/quant/qspec_spec.rb +25 -0
data/spec/msplat_spec.rb +24 -0
data/spec/obo_spec.rb +25 -0
data/spec/spec_helper.rb +25 -0
data/spec/testfiles/ms/ident/peptide/db/uni_11_sp_tr.fasta +69 -0
data/spec/testfiles/ms/ident/peptide/db/uni_11_sp_tr.msd_clvg2.min_aaseq4.yml +728 -0
data/spec/testfiles/ms/mzml/j24z.idx_comp.3.mzML +271 -0
data/spec/testfiles/ms/mzml/openms.noidx_nocomp.12.mzML +330 -0
data/spec/testfiles/ms/quant/kill_extra_tabs.rb +13 -0
data/spec/testfiles/ms/quant/max_quant_output.provenance.txt +15 -0
data/spec/testfiles/ms/quant/max_quant_output.txt +199 -0
data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv +199 -0
data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv_qspecgp +199 -0
data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv_qspecgp.csv +199 -0
data/spec/testfiles/ms/quant/pdcd5_final.txt +199 -0
data/spec/testfiles/ms/quant/pdcd5_final.txt_qspecgp +0 -0
data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.CSV.csv +199 -0
data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.csv +199 -0
data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.csv +199 -0
data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv +199 -0
data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv_qspecgp +199 -0
data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv_qspecgp.csv +199 -0
data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.txt +199 -0
data/spec/testfiles/ms/quant/pdcd5_lfq_tabdel.txt +134 -0
data/spec/testfiles/ms/quant/pdcd5_lfq_tabdel.txt_qspecgp +134 -0
data/spec/testfiles/ms/quant/remove_rest_of_proteins.rb +13 -0
data/spec/testfiles/ms/quant/unlog_transform.rb +13 -0
data/spec/testfiles/plms1/output.key +0 -0
metadata +157 -40
data/README +0 -77
data/changelog.txt +0 -196
data/lib/ms/calc.rb +0 -32
data/lib/ms/data/interleaved.rb +0 -60
data/lib/ms/data/lazy_io.rb +0 -73
data/lib/ms/data/lazy_string.rb +0 -15
data/lib/ms/data/simple.rb +0 -59
data/lib/ms/data/transposed.rb +0 -41
data/lib/ms/data.rb +0 -57
data/lib/ms/format/format_error.rb +0 -12
data/lib/ms/support/binary_search.rb +0 -126

data/spec/ms/cvlist_spec.rb ADDED Viewed

@@ -0,0 +1,60 @@
+require 'spec_helper'
+require 'ms/cvlist'
+require 'cv'
+describe 'appending CV params objects to an MS::CVList' do
+  describe 'intelligently appending params with #param' do
+    before do
+      @cv = MS::CVList.new
+    end
+    it 'sends detailed descriptions to CV::Param.new' do
+      arglist = [
+        ['IMS', 'IMS:1000052', 'position z', 22],
+        ['IMS', 'IMS:1000030', 'continuous'],
+        ['IMS', 'IMS:1000052', 'position z', 22, 'UO:0000008'],
+        ['IMS', 'IMS:1000030', 'continuous', 'UO:0000008'],
+        ['IMS', 'IMS:1000052', 'position z', 22, MS::CV::Param.new('UO:0000008')],
+        ['IMS', 'IMS:1000030', 'continuous', MS::CV::Param.new('UO:0000008')],
+      ]
+      arglist.each do |args|
+        @cv.param *args
+      end
+      @cv.size.should == arglist.size
+      arglist.each_with_index do |args, i|
+        @cv[i].should == MS::CV::Param.new(*args)
+      end
+    end
+    it 'deciphers short accession descriptions' do
+      @cv.param 'MS:1000004'  # sample mass
+      @cv.param 'IMS:1000042', 23 # max count of pixels x
+      {cv_ref: 'MS', accession: 'MS:1000004', name: 'sample mass', value: nil}.each do |key,val|
+        @cv[0].send(key).should == val
+      end
+      {cv_ref: 'IMS', accession: 'IMS:1000042', name: 'max count of pixels x', value: 23}.each do |key,val|
+        @cv[1].send(key).should == val
+      end
+    end
+    describe 'appending on initialization' do
+      it 'can be done with a block' do
+        cvlist = MS::CVList.new do
+          param 'MS:1000004'  # sample mass
+          param 'IMS:1000042', 23 # max count of pixels of y
+        end
+        cvlist.size.should == 2
+      end
+    end
+    it 'can be done with brackets' do
+      args = ['IMS', 'IMS:1000052', 'position z', 22]
+      param_obj = CV::Param.new(*args)
+      cvlist = MS::CVList['MS:1000004', ['MS:1000004'], ['IMS:1000042', 23], param_obj, args]
+      cvlist.size.should == 5
+      cvlist[0].should == cvlist[1]
+      cvlist.each do |param|
+        param.accession.should_not be_nil
+        param.name.should_not be_nil
+        param.cv_ref.should_not be_nil
+      end
+    end
+  end
+end

data/spec/ms/digester_spec.rb ADDED Viewed

@@ -0,0 +1,351 @@
+require 'spec_helper.rb'
+require 'ms/digester'
+require 'pp'
+describe 'a digester' do
+  before do
+    @digester = MS::Digester.new('arg', 'R')
+  end
+  def spp(input, str="")
+    PP.singleline_pp(input, str)
+  end
+  def nk_string(n, split)
+    str = []
+    count = 0
+    (n * 1000).times do
+      count += 1
+      if count < split
+        str << 'A'
+      else
+        count = 0
+        str << 'R'
+      end
+    end
+    str.join('')
+  end
+  it 'finds cleavage site indices' do
+    {
+      "" => [0,0],
+      "A" => [0,1],
+      "R" => [0,1],
+      "AAA" => [0,3],
+      "RAA" => [0,1,3],
+      "ARA" => [0,2,3],
+      "AAR" => [0,3],
+      "RRA" => [0,1,2,3],
+      "RAR" => [0,1,3],
+      "RRR" => [0,1,2,3],
+      "R\nR\nR" => [0,2,4,5],
+      "R\n\n\nR\nR\n\n" => [0,4,6,9]
+   }.each do |sequence, expected|
+       @digester.cleavage_sites(sequence).should == expected
+    end
+  end
+  it 'finds cleavage sites with exception' do
+    @digester = MS::Digester.new('argp', 'R', 'P')
+    {
+      "" => [0,0],
+      "A" => [0,1],
+      "R" => [0,1],
+      "AAA" => [0,3],
+      "RAA" => [0,1,3],
+      "ARA" => [0,2,3],
+      "AAR" => [0,3],
+      "RRA" => [0,1,2,3],
+      "RAR" => [0,1,3],
+      "RRR" => [0,1,2,3],
+      "PR" => [0,1,2],
+      "PR" => [0,2],
+      "PRR" => [0,2,3],
+      "RPR" => [0,3],
+      "RRP" => [0,1,3],
+      "APRA" => [0,3,4],
+      "ARPA" => [0,4],
+      "ARPARA" => [0,5,6],
+      "R\nPR\nR" => [0,5,6],
+      "RP\nR\nR" => [0,5,6],
+      "RP\nR\nR\n" => [0,5,7]
+    }.each do |sequence, expected|
+       @digester.cleavage_sites(sequence).should == expected
+    end
+  end
+  it 'finds cleavage sites with offset and limit' do
+    {
+      "RxxR" => [2,4],
+      "RxAxR" => [2,4],
+      "RxAAAxR" => [2,4],
+      "RxRRRxR" => [2,3,4]
+    }.each do |sequence, expected|
+       @digester.cleavage_sites(sequence, 2, 2).should == expected
+    end
+  end
+  it 'finds cleavage sites fast' do
+    str = nk_string(10, 1000)
+     @digester.cleavage_sites(str).length.should == 11
+    benchmark(20) do |x|
+      x.report("10kx - fragments") do
+        10000.times { @digester.cleavage_sites(str) }
+      end
+    end
+  end
+  it 'digests proteins' do
+    {
+      "" => [''],
+      "A" => ["A"],
+      "R" => ["R"],
+      "AAA" => ["AAA"],
+      "RAA" => ["R", "AA"],
+      "ARA" => ["AR", "A"],
+      "AAR" => ["AAR"],
+      "RRA" => ["R", "R", "A"],
+      "RAR" => ["R", "AR"],
+      "RRR" => ["R", "R", "R"]
+    }.each do |sequence, expected|
+      # spp(sequence)
+       @digester.digest(sequence).should == expected
+       #@digester.digest(sequence) {|frag, s, e| frag}.should == expected
+    end
+  end
+  it 'digests with missed cleavages' do
+    {
+      "" => [''],
+      "A" => ["A"],
+      "R" => ["R"],
+      "AAA" => ["AAA"],
+      "RAA" => ["R", "RAA", "AA"],
+      "ARA" => ["AR", "ARA", "A"],
+      "AAR" => ["AAR"],
+      "RRA" => ["R", "RR", "R", "RA", "A"],
+      "RAR" => ["R", "RAR", "AR"],
+      "RRR" => ["R", "RR", "R", "RR", "R"]
+    }.each do |sequence, expected|
+       @digester.digest(sequence, 1).should == expected
+       #@digester.digest(sequence, 1) {|frag, s, e| frag}.should == expected
+    end
+  end
+  it 'digests with two missed cleavages' do
+    {
+      "" => [''],
+      "A" => ["A"],
+      "R" => ["R"],
+      "AAA" => ["AAA"],
+      "RAA" => ["R", "RAA", "AA"],
+      "ARA" => ["AR", "ARA", "A"],
+      "AAR" => ["AAR"],
+      "RRA" => ["R", "RR", "RRA", "R", "RA", "A"],
+      "RAR" => ["R", "RAR", "AR"],
+      "RRR" => ["R", "RR", "RRR", "R", "RR", "R"]
+    }.each do |sequence, expected|
+       @digester.digest(sequence, 2).should == expected
+       #@digester.digest(sequence, 2) {|frag, s, e| frag}.should == expected
+    end
+  end
+  it 'digests fast' do
+    str = nk_string(10, 1000)
+     @digester.digest(str).length.should == 10
+    benchmark(20) do |x|
+      x.report("10kx - fragments") do
+        10000.times { @digester.digest(str) }
+      end
+    end
+  end
+  it 'finds sites to be digested' do
+    {
+      "" => [[0,0]],
+      "A" => [[0,1]],
+      "R" => [[0,1]],
+      "AAA" => [[0,3]],
+      "RAA" => [[0,1],[1,3]],
+      "ARA" => [[0,2],[2,3]],
+      "AAR" => [[0,3]],
+      "RRA" => [[0,1],[1,2],[2,3]],
+      "RAR" => [[0,1],[1,3]],
+      "RRR" => [[0,1],[1,2],[2,3]]
+    }.each do |sequence, expected|
+       @digester.site_digest(sequence).should == expected
+    end
+  end
+  it 'finds sites to be digested with missed cleavages' do
+    {
+      "" => [[0,0]],
+      "A" => [[0,1]],
+      "R" => [[0,1]],
+      "AAA" => [[0,3]],
+      "RAA" => [[0,1],[0,3],[1,3]],
+      "ARA" => [[0,2],[0,3],[2,3]],
+      "AAR" => [[0,3]],
+      "RRA" => [[0,1],[0,2],[1,2],[1,3],[2,3]],
+      "RAR" => [[0,1],[0,3],[1,3]],
+      "RRR" => [[0,1],[0,2],[1,2],[1,3],[2,3]]
+    }.each do |sequence, expected|
+       @digester.site_digest(sequence, 1).should == expected
+    end
+  end
+  it 'finds sites to be digested with two missed cleavages' do
+    {
+      "" => [[0,0]],
+      "A" => [[0,1]],
+      "R" => [[0,1]],
+      "AAA" => [[0,3]],
+      "RAA" => [[0,1],[0,3],[1,3]],
+      "ARA" => [[0,2],[0,3],[2,3]],
+      "AAR" => [[0,3]],
+      "RRA" => [[0,1],[0,2],[0,3],[1,2],[1,3],[2,3]],
+      "RAR" => [[0,1],[0,3],[1,3]],
+      "RRR" => [[0,1],[0,2],[0,3],[1,2],[1,3],[2,3]]
+    }.each do |sequence, expected|
+       @digester.site_digest(sequence, 2).should == expected
+    end
+  end
+  it 'does site digestion fast' do
+    str = nk_string(10, 1000)
+     @digester.site_digest(str).length.should == 10
+    benchmark(20) do |x|
+      x.report("10kx - fragments") do
+        10000.times { @digester.site_digest(str) }
+      end
+    end
+  end
+end
+describe 'performs as documented in readme' do
+ it 'runs cleavage sites documentation' do
+    d = MS::Digester.new('Trypsin', 'KR', 'P')
+    seq = "AARGGR"
+    sites = d.cleavage_sites(seq)
+    sites.should == [0, 3, 6]
+    seq[sites[0], sites[0+1] - sites[0]].should == "AAR"
+    seq[sites[1], sites[1+1] - sites[1]].should == "GGR"
+    seq = "AAR  \n  GGR"
+    sites = d.cleavage_sites(seq)
+    sites.should == [0, 8, 11]
+    seq[sites[0], sites[0+1] - sites[0]].should == "AAR  \n  "
+    seq[sites[1], sites[1+1] - sites[1]].should == "GGR"
+  end
+end
+describe 'basic trypsin digestion' do
+  it 'performs digestion and can specify sites of digestion' do
+    trypsin = MS::Digester['Trypsin']
+    expected = [
+    'MIVIGR',
+    'SIVHPYITNEYEPFAAEK',
+    'QQILSIMAG']
+    trypsin.digest('MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG').should == expected
+    expected =  [
+    'MIVIGR',
+    'MIVIGRSIVHPYITNEYEPFAAEK',
+    'SIVHPYITNEYEPFAAEK',
+    'SIVHPYITNEYEPFAAEKQQILSIMAG',
+    'QQILSIMAG']
+    trypsin.digest('MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG', 1).should == expected
+    expected = [
+    [0,6],
+    [0,24],
+    [6,24],
+    [6,33],
+    [24,33]]
+    trypsin.site_digest('MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG', 1).should == expected
+  end
+  it 'completely ignores whitespace inside protein sequences' do
+    expected = [
+    "\tMIVIGR",
+    "SIVHP\nYITNEYEPFAAE K",
+    "QQILSI\rMAG"]
+    MS::Digester['Trypsin'].digest("\tMIVIGRSIVHP\nYITNEYEPFAAE KQQILSI\rMAG").should == expected
+  end
+  it 'does a trypsin digest' do
+    trypsin = MS::Digester[:trypsin]
+    {
+      "" => [''],
+      "A" => ["A"],
+      "R" => ["R"],
+      "AAA" => ["AAA"],
+      "RAA" => ["R", "AA"],
+      "ARA" => ["AR", "A"],
+      "AAR" => ["AAR"],
+      "RRA" => ["R", "R", "A"],
+      "RAR" => ["R", "AR"],
+      "RRR" => ["R", "R", "R"],
+      "RKR" => ["R", "K", "R"],
+      "ARP" => ["ARP"],
+      "PRA" => ["PR","A"],
+      "ARPARAA" => ["ARPAR", "AA"],
+      "RPRRR" => ["RPR", "R", "R"]
+    }.each do |sequence, expected|
+       trypsin.digest(sequence).should == expected
+    end
+  end
+end
+describe 'digestion with other enzymes' do
+  # This is how to access the already created enzyme:
+  # MS::Digester['Arg-C']  (or :arg_c, 'ARG-C', :ARG_C')
+  {
+      ['Arg-C', :arg_c] => {
+      "AARC" => ["AAR", "C"],
+      "AARP" => ["AARP"]
+    },
+      ['Asp-N', :asp_n] => {
+      "AABDS" => ["AA", "B", "DS"],
+      "ADZBS" => ["A", "DZ", "BS"],
+      "B" => %w(B),
+      "A" => %w(A),
+      "ABD" => %w(A B D),
+    },
+    ['Asp-N_ambic', :asp_n_ambic] => {
+      "AAEDS" => ["AA", "E", "DS"],
+      "ADZES" => ["A", "DZ", "ES"],
+      "AED" => %w(A E D),
+      "GDE" => %w(G D E),
+      "AAECCDGG" => %w(AA ECC DGG),
+    }
+  }.each do |enzyme_names, test_hash|
+    it "digests with '#{enzyme_names.first}'" do
+      digester = MS::Digester[enzyme_names.first]
+      digester.should == MS::Digester[enzyme_names.last]
+      digester.name.should == enzyme_names.first
+      test_hash.each do |sequence, expected|
+        digester.digest(sequence).should == expected
+      end
+    end
+  end
+end

data/spec/ms/fasta_spec.rb ADDED Viewed

@@ -0,0 +1,100 @@
+require 'spec_helper'
+require 'ms/fasta'
+describe 'basic fasta operations' do
+  before do
+    @headers = [">gi|5524211 [hello]", ">another B", ">again C"]
+    @entries = ["LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLV\nGLMPFLHTSKHRSMMLRPLSQALFWTLTMDLLTLTWIGSQPVEYPYTIIGQMASILYFSIILAFLPIAGX\nIENY", "ABCDEF\nGHIJK", "ABCD"]
+    @sequences = @entries.map {|v| v.gsub("\n", '') }
+    @data = {}
+    @data['newlines'] = @headers.zip(@entries).map do |header, data|
+      header + "\n" + data
+    end.join("\n")
+    @data['carriage_returns_and_newlines'] = @data['newlines'].gsub("\n", "\r\n")
+    file_key_to_filename_pairs = @data.map do |k,v|
+      file_key = k + '_file'
+      filename = k + '.tmp'
+      File.open(filename, 'w') {|out| out.print v }
+      [file_key, filename]
+    end
+    file_key_to_filename_pairs.each {|k,v| @data[k] = v }
+  end
+  after do
+    @data.select {|k,v| k =~ /_file$/ }.each do |k,filename|
+      index = filename.sub('.tmp', '.index')
+      [filename, index].each do |fn|
+        File.unlink(fn) if File.exist? fn
+      end
+    end
+  end
+  def fasta_correct?(fasta)
+    entries = fasta.map
+    @headers.size.times.zip(entries) do |i,entry|
+      header, sequence, entry = @headers[i], @sequences[i], entry
+      entry.header.should_not == nil
+      entry.sequence.should_not == nil
+      entry.header.should == header[1..-1]
+      entry.sequence.should == sequence
+    end
+  end
+  xit 'can deliver length and description hashes' do
+    # need to test
+  end
+  it 'can read a file' do
+    %w(newlines_file carriage_returns_and_newlines_file).each do |file|
+      MS::Fasta.open(@data[file]) do |fasta|
+        fasta_correct? fasta
+      end
+    end
+  end
+  it 'can read an IO object' do
+    %w(newlines_file carriage_returns_and_newlines_file).each do |file|
+      File.open(@data[file]) do |io|
+        fasta = MS::Fasta.new(io)
+        fasta_correct? fasta
+      end
+    end
+  end
+  it 'can read a string' do
+    %w(newlines carriage_returns_and_newlines).each do |key|
+      fasta = MS::Fasta.new @data[key]
+      fasta_correct? fasta
+    end
+  end
+  it 'iterates entries with foreach' do
+    %w(newlines_file carriage_returns_and_newlines_file).each do |file|
+      MS::Fasta.foreach(@data[file]) do |entry|
+        entry.should be_an_instance_of Bio::FastaFormat
+      end
+    end
+  end
+  it 'runs the documentation' do
+    fasta_file = @data['newlines_file']
+    ids = MS::Fasta.open(fasta_file) do |fasta|
+      fasta.map(&:entry_id)
+    end
+    ids.is_a?(Array)
+    ids.should == %w(gi|5524211 another again)
+    # this code is already tested above
+    # File.open(fasta_file) do |io|
+    #   fasta = MS::Fasta.new(io)
+    # end
+    # taking a string
+    string = ">id1 a simple header\nAAASDDEEEDDD\n>id2 header again\nPPPPPPWWWWWWTTTTYY\n"
+    fasta = MS::Fasta.new(string)
+    (simple, not_simple) = fasta.partition {|entry| entry.header =~ /simple/ }
+    simple.first.header.include?("simple").should == true
+    not_simple.first.header.include?("simple").should == false
+  end
+end

data/spec/ms/ident/peptide/db_spec.rb ADDED Viewed

@@ -0,0 +1,108 @@
+require 'spec_helper'
+require 'yaml'
+path = 'ms/ident/peptide/db'
+require path
+module Kernel
+  def capture_stdout
+    out = StringIO.new
+    $stdout = out
+    yield
+    out.rewind
+    return out.read
+  ensure
+    $stdout = STDOUT
+  end
+end
+describe 'a uniprot fasta file' do
+  before do
+    @fasta_file = [TESTFILES, path, 'uni_11_sp_tr.fasta'].join('/')
+  end
+  describe 'amino acid expansion' do
+    it 'can expand out wildcard amino acid combinations' do
+      array = MS::Ident::Peptide::Db.expand_peptides('ALXX', 'X' =>  %w(* % &), 'L' => %w(P Q) )
+      array.sort.should == %w(AP** AP*% AP*& AP%* AP%% AP%& AP&* AP&% AP&& AQ** AQ*% AQ*& AQ%* AQ%% AQ%& AQ&* AQ&% AQ&&).sort
+    end
+    it 'will not expand explosive combinations (>MAX_NUM_AA_EXPANSION)' do
+      # this is from real data
+      worst_case = 'LTLLRPEKHEAATGVDTICTHRVDPIGPGLXXEXLYWELSXLTXXIXELGPYTLDR'
+      MS::Ident::Peptide::Db.expand_peptides(worst_case, 'X' =>  %w(* % &)).nil?.should == true
+    end
+    it 'returns the peptide in the array if no expansion' do
+      array = MS::Ident::Peptide::Db.expand_peptides('ZZZZZ', 'X' =>  %w(* % &), 'L' => %w(P Q) )
+      array.should == ['ZZZZZ']
+    end
+  end
+  describe 'creating a peptide centric database' do
+    before do
+      #@output_file = [TESTFILES, path, 'uni_11_sp_tr.'].join('/')
+      @output_file = [TESTFILES, path, "uni_11_sp_tr.msd_clvg2.min_aaseq4.yml"].join('/')
+    end
+    it 'converts a fasta file into peptide centric db' do
+      output_files = MS::Ident::Peptide::Db.cmdline([@fasta_file])
+      output_files.first.should == File.expand_path(@output_file)
+      File.exist?(@output_file).should == true
+      hash = {}
+      YAML.load_file(@output_file).each do |k,v|
+        hash[k] = v.split("\t")
+      end
+      sorted = hash.sort
+      # these are merely frozen, not perfectly defined
+      sorted.first.should == ["AAFDDAIAELDTLSEESYK", ["sp|P62258|1433E_HUMAN"]]
+      sorted.last.should == ["YWCRLGPPRWICQTIVSTNQYTHHR", ["tr|D2KTA8|D2KTA8_HUMAN"]]
+      sorted.size.should == 728
+      File.unlink(@output_file)
+    end
+    it 'lists approved enzymes and exits' do
+      output = capture_stdout do
+        begin
+          MS::Ident::Peptide::Db.cmdline(['--list-enzymes'])
+        rescue SystemExit
+          1.should == 1 # we exited
+        end
+      end
+      lines = output.split("\n")
+      lines.include?("trypsin").should == true
+      lines.include?("chymotrypsin").should == true
+    end
+  end
+  describe 'reading a peptide centric database' do
+    before do
+      outfiles = MS::Ident::Peptide::Db.cmdline([@fasta_file])
+      @outfile = outfiles.first
+    end
+    it 'creates a hash that can retrieve peptides as an array' do
+      hash = MS::Ident::Peptide::Db.new(@outfile)
+      hash["AVTEQGHELSNEER"].should == %w(sp|P31946|1433B_HUMAN	sp|P31946-2|1433B_HUMAN)
+      hash["VRAAR"].should == ["tr|D3DX18|D3DX18_HUMAN"]
+    end
+    it 'reads the file on disk with random access or is enumerable' do
+      MS::Ident::Peptide::Db::IO.open(@outfile) do |io|
+        io["AVTEQGHELSNEER"].should == %w(sp|P31946|1433B_HUMAN	sp|P31946-2|1433B_HUMAN)
+        io["VRAAR"].should == ["tr|D3DX18|D3DX18_HUMAN"]
+        io.each_with_index do |key_prots, i|
+          key_prots.first.should be_an_instance_of String
+          key_prots.last.should be_a_kind_of Array
+        end
+      end
+    end
+  end
+end