RubyGems - mspire - Versions diffs - 0.1.5 → 0.1.7 - Mend

mspire 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47) hide show

data/Rakefile +5 -2
data/bin/bioworks_to_pepxml.rb +84 -40
data/bin/fasta_shaker.rb +100 -0
data/bin/filter_spec_id.rb +185 -23
data/bin/gi2annot.rb +2 -110
data/bin/id_class_anal.rb +31 -21
data/bin/id_precision.rb +12 -8
data/bin/{false_positive_rate.rb → precision.rb} +1 -1
data/bin/protein_summary.rb +55 -62
data/changelog.txt +34 -0
data/lib/align.rb +0 -1
data/lib/fasta.rb +88 -24
data/lib/gi.rb +114 -0
data/lib/roc.rb +64 -58
data/lib/spec_id/aa_freqs.rb +166 -0
data/lib/spec_id/bioworks.rb +5 -1
data/lib/spec_id/precision.rb +427 -0
data/lib/spec_id/proph.rb +2 -2
data/lib/spec_id/sequest.rb +810 -113
data/lib/spec_id/srf.rb +486 -0
data/lib/spec_id.rb +107 -23
data/release_notes.txt +11 -0
data/script/estimate_fpr_by_cysteine.rb +226 -0
data/script/filter-peps.rb +3 -3
data/script/find_cysteine_background.rb +137 -0
data/script/gen_database_searching.rb +11 -7
data/script/genuine_tps_and_probs.rb +136 -0
data/script/top_hit_per_scan.rb +5 -2
data/test/tc_aa_freqs.rb +59 -0
data/test/tc_bioworks.rb +6 -1
data/test/tc_bioworks_to_pepxml.rb +25 -18
data/test/tc_fasta.rb +81 -3
data/test/tc_fasta_shaker.rb +147 -0
data/test/tc_gi.rb +20 -0
data/test/tc_id_class_anal.rb +9 -12
data/test/tc_id_precision.rb +12 -11
data/test/{tc_false_positive_rate.rb → tc_precision.rb} +13 -22
data/test/tc_protein_summary.rb +31 -22
data/test/tc_roc.rb +95 -50
data/test/tc_sequest.rb +212 -145
data/test/tc_spec.rb +10 -5
data/test/tc_spec_id.rb +0 -2
data/test/tc_spec_id_xml.rb +36 -0
data/test/tc_srf.rb +216 -0
metadata +35 -21
data/lib/spec_id/false_positive_rate.rb +0 -476
data/test/tc_gi2annot.rb +0 -12

data/lib/fasta.rb CHANGED Viewed

@@ -1,3 +1,4 @@
+require 'sample_enzyme'
 class String
@@ -7,8 +8,10 @@ class String
     end
   end
+  # modifies and returns self
   def shuffle!
     each_index {|j| i = rand(size-j); self[j], self[j+i] = self[j+i], self[j]}
+    self
   end
   def shuffle
@@ -44,6 +47,7 @@ class Fasta
   # Checks that the first character per line is '>' or character class [A-Za-z*]
   # returns a fasta object for stringing commands
   def read_file(fn)
+    first_char_re = /[A-Za-z*]/o
     obj = nil
     regex = /(\r\n)|\n/o
     fh = File.new(fn).binmode
@@ -57,7 +61,7 @@ class Fasta
           obj = Prot.new
           @prots << obj
           obj.header = line.dup
-        elsif first_char =~ /[A-Za-z*]/
+        elsif first_char =~ first_char_re
           obj.aaseq << line.chomp
         else
           raise "Line not in fasta format (between arrows): -->#{line}<--"
@@ -105,26 +109,40 @@ class Fasta
   # returns a new fasta object using some fraction of proteins randomly
   # selected (fraction may be > 1).  Always rounds up.  Will not choose a
   # protein twice unless all other proteins have been chosen
-  def fraction_of_prots(fraction=1)
-    fasta_fraction = nil
-    if fraction == 1
-      fasta_fraction = self.dup
-    else
-      new_num = (fraction.to_f * self.prots.size).ceil
-      arr = []
-      prots.each_with_index do |prot,i|
-        arr << i << prot
+  #
+  # prefix_proc ensures that a unique header is given even if multiple
+  # fractions of proteins are being created.  It is called with frac_cnt,
+  # where frac_cnt = (num_prots_chosen_so_far/num_prots).floor
+  # so for the first n proteins chosen (n = num_prots), it will be 0,
+  # for the next n proteins it will be 1, etc.
+  # e.g. prefix_proc = proc {|frac_cnt| "f#{frac_cnt}_" }
+  # would give headers like this: >f0_<some_real_header>,
+  # >f1_<some_real_header>, ...
+  def fraction_of_prots(fraction=1, prefix_proc=nil)
+    new_num = (fraction.to_f * self.prots.size).ceil
+    arr = []
+    orig_num_prots = @prots.size
+    # initialize
+    new_prots = @prots.map {|prt| prt.dup }
+    frac_cnt = 0
+    ind_cnt = 0
+    prt_cnt = orig_num_prots
+    while ind_cnt < new_num
+      arr << new_prots.delete_at(rand(new_prots.size))
+      if prefix_proc
+        prefix = prefix_proc.call(frac_cnt)
+        arr.last.header_prefix!(prefix)
       end
-      hash = Hash[*arr]
-      size = prots.size
-      new_arr = []
-      while new_arr.size <= new_num
-        new_arr.push( hash.delete( rand(hash.size/2) ) )
-        if hash.size == 0 then hash = Hash[*arr] end
+      prt_cnt -= 1  # index
+      if prt_cnt == 0
+        frac_cnt += 1
+        new_prots = @prots.map {|prt| prt.dup }
+        prt_cnt = orig_num_prots
       end
-      fasta_fraction = Fasta.new(new_arr)
+      ind_cnt += 1
     end
-    fasta_fraction
+    fasta_fraction = Fasta.new(arr)
   end
   # Convenience method for modifying some fraction of the proteins of a file
@@ -204,18 +222,32 @@ class Fasta
     other
   end
+  # method = :shuffle! | :reverse!
+  def aaseq!(method_as_symbol=:shuffle!, tryptic_peptides=false)
+    if tryptic_peptides
+      @prots.each {|prot| prot.tryptic_peptides!( method_as_symbol) }
+    else
+      @prots.each {|prot| prot.aaseq!(method_as_symbol) }
+    end
+  end
   # shuffles the aa sequence of each protein (each protein within itself)
   def aaseq_shuffle!
-    @prots.each do |prot|
-      prot.shuffle!
-    end
+    @prots.each {|prot| prot.shuffle! }
   end
   # inverts (reverses) the aa sequence of each protein (each protein within itself)
   def aaseq_invert!
-    @prots.each do |prot|
-      prot.invert!
-    end
+    @prots.each {|prot| prot.invert! }
+  end
+  def aaseq_invert_tryptic_peptides!
+    @prots.each {|prot| prot.invert_tryptic_peptides! }
+  end
+  def aaseq_shuffle_tryptic_peptides!
+    @prots.each {|prot| prot.invert_tryptic_peptides! }
   end
   def header_prefix!(prefix)
@@ -264,6 +296,37 @@ class Fasta::Prot
     end
   end
+  # convenience
+  def invert_tryptic_peptides! ; tryptic_peptides!(:reverse) end
+  def shuffle_tryptic_peptides! ; tryptic_peptides!(:shuffle) end
+  # modifies tryptic peptides as given by SampleEnzyme.tryptic(@aaseq)
+  # [cuts after K or R but not if followed by a P]
+  # method_as_symbol may be :reverse | :shuffle OR :reverse! | :shuffle!
+  # the final K/R of each peptide stays in place; e.g., with :reverse
+  #  aaseq = 'ABCKCDERDEKDGEKWXYRRKDER'
+  #  -> 'CBAKEDCREDKEGDKYXWRRKEDR'
+  def tryptic_peptides!(method_as_symbol)
+    peps = SampleEnzyme.tryptic(@aaseq)
+    ends_in_RK = /[KR]/o
+    ## if the last peptide doesn't end in R or K we want to flip it completely
+    last_pep_special = nil
+    if peps.last[-1,1] !~ /[KR]/
+      last_pep_special = peps.pop
+    end
+    rev_peps = peps.map{|pep| pep[0..-2].send(method_as_symbol) << pep[-1]}
+    if last_pep_special
+      rev_peps << last_pep_special.send(method_as_symbol)
+    end
+    @aaseq = rev_peps.join
+  end
+  # takes :reverse! | :shuffle!
+  def aaseq!(method_as_symbol)
+    @aaseq.send(method_as_symbol)
+  end
   def invert!
     @aaseq.reverse!
   end
@@ -323,3 +386,4 @@ end
 #    end
 #  end
 #end

data/lib/gi.rb ADDED Viewed

@@ -0,0 +1,114 @@
+require 'open-uri'
+require 'rexml/document'
+require 'rexml/streamlistener'
+$ANNOTS = []
+class GIListener
+  include REXML
+  include StreamListener
+  attr_accessor :annotations
+  def initialize
+    @get_title = false
+    @annotations = []
+  end
+  def tag_start(name, attributes)
+    #puts "NAME" + name
+    #p attributes
+    if name == "Item" && attributes["Name"] == "Title"
+      @get_title = true
+    end
+  end
+  def text(text)
+    #puts "TEXT: " + text + @get_title.to_s
+    if @get_title
+      #puts "GETTING TITLE!"
+      @annotations.push text.chomp
+      @get_title = false
+    end
+  end
+end
+class GI
+  BATCH_SIZE = 500
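+  # takes an array of gi numbers and is meant to return an array of annotations
+  # This allows use of the batch search mode on NCBI
+  # NOTE(review): as written, slice! empties the caller's array and the bare
+  # break makes the loop (and so the method) return nil -- confirm intent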
+  def self.gi2annot(list_of_gi_numbers)
+    loop do
+      batch = list_of_gi_numbers.slice!(0..BATCH_SIZE)
+      if batch.size == 0 then break end
+      string = batch.join(",")
+      url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=protein&retmode=xml&id=#{string}"
+      #puts url
+      annots = []
+      open(url) do |handle|
+        annots = parse_etool_output(handle)
+      end
+      annots
+    end
+  end
+  protected
+  # Returns a list of Annotation strings
+  def self.parse_etool_output(handle)
+    listener = GIListener.new
+    parser = REXML::Parsers::StreamParser.new(handle, listener)
+    parser.parse
+    listener.annotations
+  end
+end
+=begin
+<?xml version="1.0" encoding="ISO-8859-1"?>
+<!DOCTYPE eSummaryResult PUBLIC "-//NLM//DTD eSummaryResult, 11 May 2002//EN" "http://www.ncbi.nlm.nih.gov/entrez/query/DTD/eSummary_041029.dtd">
+<eSummaryResult>
+<DocSum>
+<Id>24115498</Id>
+<Item Name="Caption" Type="String">NP_710008</Item>
+<Item Name="Title" Type="String">chaperonin GroEL [Shigella flexneri 2a str. 301]</Item>
+<Item Name="Extra" Type="String">gi|24115498|ref|NP_710008.1|[24115498]</Item>
+<Item Name="Gi" Type="Integer">24115498</Item>
+<Item Name="CreateDate" Type="String">2002/10/16</Item>
+<Item Name="UpdateDate" Type="String">2006/04/03</Item>
+<Item Name="Flags" Type="Integer">512</Item>
+<Item Name="TaxId" Type="Integer">198214</Item>
+<Item Name="Status" Type="String">live</Item>
+<Item Name="ReplacedBy" Type="String"></Item>
+<Item Name="Comment" Type="String"><![CDATA[  ]]></Item>
+</DocSum>
+<DocSum>
+<Id>434011</Id>
+<Item Name="Caption" Type="String">CAA24741</Item>
+<Item Name="Title" Type="String">unnamed protein product [Escherichia coli]</Item>
+<Item Name="Extra" Type="String">gi|434011|emb|CAA24741.1|[434011]</Item>
+<Item Name="Gi" Type="Integer">434011</Item>
+<Item Name="CreateDate" Type="String">1983/12/06</Item>
+<Item Name="UpdateDate" Type="String">2005/04/18</Item>
+<Item Name="Flags" Type="Integer">0</Item>
+<Item Name="TaxId" Type="Integer">562</Item>
+<Item Name="Status" Type="String">live</Item>
+<Item Name="ReplacedBy" Type="String"></Item>
+<Item Name="Comment" Type="String"><![CDATA[  ]]></Item>
+</DocSum>
+</eSummaryResult>
+=end

data/lib/roc.rb CHANGED Viewed

@@ -6,18 +6,22 @@
 # receiver-operator-characteristics, precision-recall, etc..  Some definitions
 # from (Davis & Goadrich. Proceedings of the 23rd
 # International Conference on Machine Learning, Pittsburgh, PA, 2006):
-#   Recall              = TP/(TP+FN)
-#   Precision           = TP/(TP+FP)
+#   Recall              = TP/(TP+FN) [aka, Sensitivity]
+#   Precision           = TP/(TP+FP) [aka, Positive Predictive Value]
 #   True Positive Rate  = TP/(TP+FN)
 #   False Positive Rate = FP/(FP+TN)
 #
 # Keys to some abbreviations used in this class:
+#   pred = number predicted to be correct
 #   tps = number of true positives
-#   fpr = false positive rate
-#   fpr2 = false positive rate calculated as: FP/(FP+TP)
+#   ppv = positive predictive value
+#   om_ppv = one minus positive predictive value = FP/(TP+FP)
 #
 # NOTE: this class assumes that lower scores are better.  Negate your scores
 # if this is not the case.
+#
+# For estimation of false positive rates using a decoy database strategy, see
+# the DecoyROC class.
 class ROC
@@ -38,82 +42,84 @@ class ROC
     area
   end
-  # Returns (#tp, #yval) where #tp = number of true positives and yval is the
-  # type of classification analysis (as symbol) (accepts: precision, fpr2,
-  # fpr2_times2)
-  def by_tps(yval, tp, fp)
-    new_method = "tps_and_#{yval}".to_sym
-    send(new_method, tp, fp)
-  end
-  # Returns (num_true_positives(ints), precision_arr(floats))
-  # gives the precision TP/(TP+FP) as a function of number of true positives.
-  # True positive values that are equal will cause jumps in the array values
-  # of true positives returned.  If false negatives are known, then a
-  # recall-precision plot could be made (recall is TP/(TP+FN).
-  #   e.g. tps = [1,2,4] # -> jumps from 2 to 4
-  def tps_and_precision(tp, fp)
-    prc = proc {|tp_i, fp_i| (tp_i+1).to_f/((tp_i+1).to_f + fp_i.to_f) }
-    _tps_calc(tp, fp, prc)
-  end
-  # Returns (num_true_positives(ints), false_positive_rate(floats))
-  # calculated as ( FP/(FP+TP) ) as a function of number of true positives
-  # true positive values that are equal will cause jumps in the array values
-  # of true positives returned
-  #   e.g. tps = [1,2,4] # -> jumps from 2 to 4
-  def tps_and_fpr2(tp, fp)
-    prc = proc {|tp_i,fp_i| (fp_i).to_f/((tp_i+1).to_f + fp_i.to_f) }
-    _tps_calc(tp, fp, prc)
+  # given an array of doublets where each doublet is a value and a boolean,
+  # sorts the list and divides it into two arrays (tps, fps) of the values.
+  # The output can then be fed into many of the other routines.
+  def prep_list(list)
+    tp = []; fp = []
+    list.each do |dbl|
+      if dbl[1]
+        tp << dbl
+      else
+        fp << dbl
+      end
+    end
+    [tp,fp].collect do |arr|
+      arr.collect! {|dbl| dbl[0] }
+      arr.sort
+    end
   end
   # Base function for tps calculations
-  def _tps_calc(tp, fp, prc)
+  def tps_and_ppv(tp, fp)
     tp_i = 0
     fp_i = 0
     x = []
     y = []
+    num_tps = 0
     while tp_i < tp.size
       while fp_i < fp.size && tp[tp_i] >= fp[fp_i]
         fp_i += 1
       end
       unless tp[tp_i] == tp[tp_i+1]
-        x << tp_i+1
-        #y << (fp_i+1).to_f/((tp_i+1).to_f + fp_i.to_f)
-        y << prc.call(tp_i, fp_i)
+        # get the correct number of each
+        num_tps = tp_i + 1
+        num_fps = fp_i
+        x << num_tps
+        y << num_tps.to_f/(num_tps+num_fps)
       end
       tp_i += 1
     end
     return x, y
   end
+end
-  # Calculates the fpr based on Peng et. al. J. Proteome Res. 2003, 2, 43-50.
-  # fpr = 2[#rev/(#rev+#real) == 2[FP/(FP+TP)]
-  # This merely multiplies the fpr by 2.
-  def tps_and_fpr2_times2(tp, fp)
-    x, y = tps_and_fpr2(tp,fp)
-    y.collect! {|v| v*2 }
-    return x, y
-  end
+# For calculating precision given lists of hits and decoy hits.  The hits are
+# assumed to have false positives within them that can be estimated from the
+# number of decoy hits at the same rate
+class DecoyROC < ROC
-  # given an array of doublets where each doublet is a value and a boolean,
-  # sorts the list and divides it into two arrays (tps, fps) of the values.
-  # The output can then be fed into many of the other routines.
-  def prep_list(list)
-    tp = []; fp = []
-    list.each do |dbl|
-      if dbl[1]
-        tp << dbl
-      else
-        fp << dbl
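+  # returns [num_hits, num_tps, ppv] (three parallel arrays) as a function of
+  # the number of hits (both lists presumably sorted ascending).  num_tps is
+  # the number of hits minus the number of decoy hits scoring <= the current
+  # hit, and ppv (precision) is num_tps/num_hits.  Method will return
+  # precisely what is calculated (meaning some answers may seem bizarre if
+  # you have better decoy hits than real).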
+  def pred_and_tps_and_ppv(hits, decoy_hits)
+    hits_i = 0
+    decoy_i = 0
+    num_hits_ar = []
+    num_tps_ar = []
+    ppv_ar = []
+    while hits_i < hits.size
+      while decoy_i < decoy_hits.size && hits[hits_i] >= decoy_hits[decoy_i]
+        decoy_i += 1
       end
+      unless hits[hits_i] == hits[hits_i+1]
+        ## determine the number of false positives
+        tot_num_hits = hits_i+1
+        num_tps = tot_num_hits - decoy_i
+        num_hits_ar << tot_num_hits
+        num_tps_ar << num_tps
+        ppv_ar << ( num_tps.to_f/tot_num_hits )
+      end
+      hits_i += 1
     end
-    [tp,fp].collect do |arr|
-      arr.collect! {|dbl| dbl[0] }
-      arr.sort
-    end
+    [num_hits_ar, num_tps_ar, ppv_ar]
   end
 end

data/lib/spec_id/aa_freqs.rb ADDED Viewed

@@ -0,0 +1,166 @@
+require 'fasta'
+class SpecID::AAFreqs
+  # a fasta object
+  attr_accessor :fasta
+  # hash by capital one-letter amino acid symbols giving the frequency of
+  # seeing that amino acid.  Frequencies should add to 1.
+  attr_accessor :aafreqs
+  def initialize(fasta_file=nil)
+    if fasta_file
+      @fasta = Fasta.new.read_file(fasta_file)
+      @aafreqs = calculate_frequencies(@fasta)
+    end
+  end
+  # creates an aafreqs hash based on fasta object
+  def calculate_frequencies(fasta)
+    hash = {}
+    total_aas = 0
+    ('A'..'Z').each do |x|
+      hash[x] = 0
+    end
+    hash['*'] = 0
+    fasta.prots.each do |prot|
+      aaseq = prot.aaseq
+      total_aas += aaseq.size
+      aaseq.split('').each do |x|
+        hash[x] += 1
+      end
+    end
+    # normalize by total amount:
+    hash.each do |k,v|
+      hash[k] = hash[k].to_f / total_aas
+    end
+    # convert all strings to symbols:
+    hash.each do |k,v|
+      hash[k.to_sym] = hash.delete(k)
+    end
+    hash
+  end
+  # The expected probability for seeing that amino acid in a given length.
+  # This calculates a lookup table (array) indexed from 0 to max_length of
+  # the probability of seeing at least one of that amino acid in a sequence
+  # of that length (given its frequency, where frequency is from 0 to 1)
+  def self.probability_of_length_table(frequency, max_length)
+    one_minus_freq = 1.0 - frequency.to_f
+    lookup = Array.new(max_length + 1)
+    (0..max_length).each do |len|
+      lookup[len] =  1.0 - (one_minus_freq**len);
+    end
+    lookup
+  end
+  # takes an array of peptide strings
+  # gives the actual number of peptides with at least one amino_acid
+  # gives the expected number of such peptides given the frequency of
+  # amino_acid in @aafreqs and the length of each peptide.
+  # currently ONLY takes at_least = 1
+  # depends on @aafreqs
+  # returns two numbers in array [actual, expected]
+  # expected is a Float!!!
+  def actual_and_expected_number(peptide_aaseqs, amino_acid=:C, at_least=1)
+    one_minus_freq = 1.0 - @aafreqs[amino_acid.to_sym]
+    amino_acid_as_st = amino_acid.to_s
+    probs = []
+    actual = 0
+    expected = 0.0
+    peptide_aaseqs.each do |pep|
+      expected += (1.0 - (one_minus_freq**pep.size))
+      if pep.include?(amino_acid_as_st)
+        actual += 1
+      end
+    end
+    [actual, expected]
+  end
+  # pep_objs respond to sequence?
+  def actual_and_expected_number_containing_cysteines(pep_objs, cyst_freq)
+    @aafreqs ||= {}
+    @aafreqs[:C] = cyst_freq
+    seqs = pep_objs.map do |v|
+      if v.sequence =~ /\.([\w\*]+)\./
+        $1
+      else
+        abort v.sequence.to_s + " could not be matched!"
+      end
+    end
+    actual_and_expected_number(seqs, :C, 1)
+  end
+  ##
+=begin
+  foreach my $pep (@$peps) {
+        unless ($pep->prob() >= $prob_cutoff) {next;}
+        my %freq = ();
+        my $aa = $pep->AA_sequence();
+        my $len = length($aa);
+        ## EXPECTED probability for each length
+        for (my $i = 0; $i < 20; $i++) {
+            ## rolling at least one 6 in n rolls is 1 - (5/6)^n.
+            $expected[$cnt][$i] = 1 - (($freqs_inv[$i])**$len);
+        }
+        ## FILTER any peptides we've already seen
+        if ($seen{$aa}) { next; }
+        else { $seen{$aa}++; }
+        ## Fill in these values with zeroes:
+        for (my $a = 0; $a < 20; $a++) { $pepc[$cnt][$a] = 0; }
+        ## get the frequencies for each AA in each peptide:
+        for (my $i = 0; $i < $len; $i++) {
+            my $let = substr($aa, $i, 1);
+            $tot_freq{$let}++;
+            $pepc[$cnt][$an{$let}]++;
+        }
+        $cnt++;
+    }
+##############################################################
+# ANALYSIS 2: Fraction of Peptides containing X Amino Acid
+##############################################################
+## What is the percentage of peptides containing at least 1 cysteine?
+    my $atleast = 1;
+    my @has;
+## initialize
+    for (my $i = 0; $i < 20; $i++) { $has[$i] = 0; }
+    my $tot = scalar(@pepc);
+    foreach my $pep (@pepc) {
+        for (my $index = 0; $index < 20; $index++) {
+            if ($pep->[$index] >= $atleast) {
+                $has[$index]++;
+            }
+        }
+    }
+    my @exp_sum = ();  ## The total number of peptides I'd expect
+## WE simply add up the peptides' probabilities
+## can think of it like this avg(peptide_prob) * #peptides = sum(pep_prob)
+    foreach my $pep (@expected) {
+        for (my $i = 0; $i < 20; $i++) {
+            $exp_sum[$i] += $pep->[$i];
+        }
+    }
+    my @obs = map { $_/$tot } @has;
+    my @exp = map { $_/$tot } @exp_sum;
+    print STDERR "*********************************************\n";
+    print "Fraction of peptides (obs and expected)\nwith at least one of the AA:\n";
+    print "[AA] [Observed] [Predicted]\n";
+    for (my $i = 0; $i < 20; $i++) {
+        print "$AA[$i] $obs[$i] $exp[$i]\n";
+    }
+    print STDERR "*********************************************\n";
+=end
+end

data/lib/spec_id/bioworks.rb CHANGED Viewed

@@ -15,12 +15,15 @@ module SpecIDXML; end
 class SpecID::Bioworks
   # Regular expressions
   @@bioworksinfo_re = /<bioworksinfo>(.*)<\/bioworksinfo>/o
+  @@modifications_re = /<modifications>(.*)<\/modifications>/o
   @@protein_re = /<protein>/o
   @@origfilename_re = /<origfilename>(.*)<\/origfilename>/o
   @@origfilepath_re = /<origfilepath>(.*)<\/origfilepath>/o
   attr_accessor :prots, :version, :global_filename, :origfilename, :origfilepath
+  # a string of modifications e.g., "(M* +15.99491) (S@ +14.9322) "
+  attr_accessor :modifications
   attr_writer :peps
   def hi_prob_best ; false end
@@ -196,6 +199,7 @@ class SpecID::Bioworks
       @global_filename = @origfilename.gsub(File.extname(@origfilename), "")
     end
     @version = get_regex_val(fh, @@bioworksinfo_re)
+    @modifications = get_regex_val(fh, @@modifications_re)
     @prots = get_prots(fh, self)
     fh.close
   end
@@ -456,7 +460,7 @@ class SpecID::Bioworks::Pep < Array
       first_scan = first_scan[0]
       last_scan = first_scan
     end
-    return base_name, first_scan, last_scan
+    [base_name, first_scan, last_scan]
   end
   def file=(arg)