RubyGems - mspire - Versions diffs - 0.2.4 → 0.3.0 - Mend

mspire 0.2.4 → 0.3.0

Files changed (233) hide show

data/INSTALL +1 -0
data/README +25 -0
data/Rakefile +129 -40
data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
data/bin/bioworks_to_pepxml.rb +1 -0
data/bin/fasta_shaker.rb +1 -96
data/bin/filter_and_validate.rb +5 -0
data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
data/bin/prob_validate.rb +6 -0
data/bin/raw_to_mzXML.rb +2 -2
data/bin/srf_group.rb +1 -0
data/bin/srf_to_sqt.rb +40 -0
data/changelog.txt +68 -0
data/lib/align/chams.rb +6 -6
data/lib/align.rb +4 -3
data/lib/bsearch.rb +120 -0
data/lib/fasta.rb +318 -86
data/lib/group_by.rb +10 -0
data/lib/index_by.rb +11 -0
data/lib/merge_deep.rb +21 -0
data/lib/{spec → ms/converter}/mzxml.rb +77 -109
data/lib/ms/gradient_program.rb +171 -0
data/lib/ms/msrun.rb +209 -0
data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
data/lib/ms/parser/mzdata/axml.rb +12 -0
data/lib/ms/parser/mzdata/dom.rb +160 -0
data/lib/ms/parser/mzdata/libxml.rb +7 -0
data/lib/ms/parser/mzdata.rb +25 -0
data/lib/ms/parser/mzxml/axml.rb +11 -0
data/lib/ms/parser/mzxml/dom.rb +159 -0
data/lib/ms/parser/mzxml/hpricot.rb +253 -0
data/lib/ms/parser/mzxml/libxml.rb +15 -0
data/lib/ms/parser/mzxml/regexp.rb +122 -0
data/lib/ms/parser/mzxml/rexml.rb +72 -0
data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
data/lib/ms/parser/mzxml.rb +175 -0
data/lib/ms/parser.rb +108 -0
data/lib/ms/precursor.rb +10 -0
data/lib/ms/scan.rb +81 -0
data/lib/ms/spectrum.rb +193 -0
data/lib/ms.rb +10 -0
data/lib/mspire.rb +4 -0
data/lib/roc.rb +61 -1
data/lib/sample_enzyme.rb +31 -8
data/lib/scan_i.rb +21 -0
data/lib/spec_id/aa_freqs.rb +7 -3
data/lib/spec_id/bioworks.rb +20 -14
data/lib/spec_id/digestor.rb +139 -0
data/lib/spec_id/mass.rb +116 -0
data/lib/spec_id/parser/proph.rb +236 -0
data/lib/spec_id/precision/filter/cmdline.rb +209 -0
data/lib/spec_id/precision/filter/interactive.rb +134 -0
data/lib/spec_id/precision/filter/output.rb +147 -0
data/lib/spec_id/precision/filter.rb +623 -0
data/lib/spec_id/precision/output.rb +60 -0
data/lib/spec_id/precision/prob/cmdline.rb +139 -0
data/lib/spec_id/precision/prob/output.rb +88 -0
data/lib/spec_id/precision/prob.rb +171 -0
data/lib/spec_id/proph/pep_summary.rb +92 -0
data/lib/spec_id/proph/prot_summary.rb +484 -0
data/lib/spec_id/proph.rb +2 -466
data/lib/spec_id/protein_summary.rb +2 -2
data/lib/spec_id/sequest/params.rb +316 -0
data/lib/spec_id/sequest/pepxml.rb +1513 -0
data/lib/spec_id/sequest.rb +2 -1672
data/lib/spec_id/srf.rb +445 -177
data/lib/spec_id.rb +183 -95
data/lib/spec_id_xml.rb +8 -10
data/lib/transmem/phobius.rb +147 -0
data/lib/transmem/toppred.rb +368 -0
data/lib/transmem.rb +157 -0
data/lib/validator/aa.rb +135 -0
data/lib/validator/background.rb +73 -0
data/lib/validator/bias.rb +95 -0
data/lib/validator/cmdline.rb +260 -0
data/lib/validator/decoy.rb +94 -0
data/lib/validator/digestion_based.rb +69 -0
data/lib/validator/probability.rb +48 -0
data/lib/validator/prot_from_pep.rb +234 -0
data/lib/validator/transmem.rb +272 -0
data/lib/validator/true_pos.rb +46 -0
data/lib/validator.rb +214 -0
data/lib/xml.rb +38 -0
data/lib/xml_style_parser.rb +105 -0
data/lib/xmlparser_wrapper.rb +19 -0
data/script/compile_and_plot_smriti_final.rb +97 -0
data/script/extract_gradient_programs.rb +56 -0
data/script/get_apex_values_rexml.rb +44 -0
data/script/mzXML2timeIndex.rb +1 -1
data/script/smriti_final_analysis.rb +103 -0
data/script/toppred_to_yaml.rb +47 -0
data/script/tpp_installer.rb +1 -1
data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
data/specs/bin/fasta_shaker_spec.rb +259 -0
data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
data/specs/bin/filter_and_validate_spec.rb +124 -0
data/specs/bin/ms_to_lmat_spec.rb +34 -0
data/specs/bin/prob_validate_spec.rb +62 -0
data/specs/bin/protein_summary_spec.rb +10 -0
data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
data/specs/gi_spec.rb +22 -0
data/specs/load_bin_path.rb +7 -0
data/specs/merge_deep_spec.rb +13 -0
data/specs/ms/gradient_program_spec.rb +77 -0
data/specs/ms/msrun_spec.rb +455 -0
data/specs/ms/parser_spec.rb +92 -0
data/specs/ms/spectrum_spec.rb +89 -0
data/specs/roc_spec.rb +251 -0
data/specs/rspec_autotest.rb +149 -0
data/specs/sample_enzyme_spec.rb +41 -0
data/specs/spec_helper.rb +133 -0
data/specs/spec_id/aa_freqs_spec.rb +52 -0
data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
data/specs/spec_id/digestor_spec.rb +75 -0
data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
data/specs/spec_id/precision/filter/output_spec.rb +31 -0
data/specs/spec_id/precision/filter_spec.rb +243 -0
data/specs/spec_id/precision/prob_spec.rb +111 -0
data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
data/specs/spec_id/sequest/params_spec.rb +68 -0
data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
data/specs/spec_id/sqt_spec.rb +138 -0
data/specs/spec_id/srf_spec.rb +209 -0
data/specs/spec_id/srf_spec_helper.rb +302 -0
data/specs/spec_id_helper.rb +33 -0
data/specs/spec_id_spec.rb +361 -0
data/specs/spec_id_xml_spec.rb +33 -0
data/specs/transmem/phobius_spec.rb +423 -0
data/specs/transmem/toppred_spec.rb +297 -0
data/specs/transmem_spec.rb +60 -0
data/specs/transmem_spec_shared.rb +64 -0
data/specs/validator/aa_spec.rb +107 -0
data/specs/validator/background_spec.rb +51 -0
data/specs/validator/bias_spec.rb +146 -0
data/specs/validator/decoy_spec.rb +51 -0
data/specs/validator/fasta_helper.rb +26 -0
data/specs/validator/prot_from_pep_spec.rb +141 -0
data/specs/validator/transmem_spec.rb +145 -0
data/specs/validator/true_pos_spec.rb +58 -0
data/specs/validator_helper.rb +33 -0
data/specs/xml_spec.rb +12 -0
data/test_files/000_pepxml18_small.xml +206 -0
data/test_files/020a.mzXML.timeIndex +4710 -0
data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
data/test_files/4-03-03_small-prot.xml +321 -0
data/test_files/4-03-03_small.xml +3876 -0
data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
data/test_files/bioworks-3.3_10prots.xml +5999 -0
data/test_files/bioworks31.params +77 -0
data/test_files/bioworks32.params +62 -0
data/test_files/bioworks33.params +63 -0
data/test_files/bioworks_single_run_small.xml +7237 -0
data/test_files/bioworks_small.fasta +212 -0
data/test_files/bioworks_small.params +63 -0
data/test_files/bioworks_small.phobius +109 -0
data/test_files/bioworks_small.toppred.out +2847 -0
data/test_files/bioworks_small.xml +5610 -0
data/test_files/bioworks_with_INV_small.xml +3753 -0
data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
data/test_files/corrupted_900.srf +0 -0
data/test_files/head_of_7MIX.srf +0 -0
data/test_files/interact-opd1_mods_small-prot.xml +304 -0
data/test_files/messups.fasta +297 -0
data/test_files/opd1/000.my_answer.100lines.xml +101 -0
data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
data/test_files/opd1/000_020-prot.png +0 -0
data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
data/test_files/opd1/000_020_3prots-prot.xml +62 -0
data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
data/test_files/opd1/sequest.3.1.params +77 -0
data/test_files/opd1/sequest.3.2.params +62 -0
data/test_files/opd1/twenty_scans.mzXML +418 -0
data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
data/test_files/opd1/twenty_scans_answ.lmat +0 -0
data/test_files/opd1/twenty_scans_answ.lmata +9 -0
data/test_files/opd1_020_beginning.RAW +0 -0
data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
data/test_files/pepproph_small.xml +4691 -0
data/test_files/phobius.small.noheader.txt +50 -0
data/test_files/phobius.small.small.txt +53 -0
data/test_files/s01_anC1_ld020mM.key.txt +25 -0
data/test_files/s01_anC1_ld020mM.meth +0 -0
data/test_files/small.fasta +297 -0
data/test_files/smallraw.RAW +0 -0
data/test_files/tf_bioworks2excel.bioXML +14340 -0
data/test_files/tf_bioworks2excel.txt.actual +1035 -0
data/test_files/toppred.small.out +416 -0
data/test_files/toppred.xml.out +318 -0
data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
data/test_files/yeast_gly_small-prot.xml +265 -0
data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
data/test_files/yeast_gly_small.xml +3807 -0
data/test_files/yeast_gly_small2.parentTimes +6 -0
metadata +273 -57
data/bin/filter.rb +0 -6
data/bin/precision.rb +0 -5
data/lib/spec/mzdata/parser.rb +0 -108
data/lib/spec/mzdata.rb +0 -48
data/lib/spec/mzxml/parser.rb +0 -449
data/lib/spec/scan.rb +0 -55
data/lib/spec_id/filter.rb +0 -797
data/lib/spec_id/precision.rb +0 -421
data/lib/toppred.rb +0 -18
data/script/filter-peps.rb +0 -164
data/test/tc_aa_freqs.rb +0 -59
data/test/tc_fasta_shaker.rb +0 -149
data/test/tc_filter.rb +0 -203
data/test/tc_filter_peps.rb +0 -46
data/test/tc_gi.rb +0 -17
data/test/tc_id_class_anal.rb +0 -70
data/test/tc_id_precision.rb +0 -89
data/test/tc_msrun.rb +0 -88
data/test/tc_mzxml.rb +0 -88
data/test/tc_mzxml_to_lmat.rb +0 -36
data/test/tc_peptide_parent_times.rb +0 -27
data/test/tc_precision.rb +0 -60
data/test/tc_roc.rb +0 -166
data/test/tc_sample_enzyme.rb +0 -32
data/test/tc_scan.rb +0 -26
data/test/tc_sequest.rb +0 -336
data/test/tc_spec.rb +0 -78
data/test/tc_spec_id.rb +0 -201
data/test/tc_spec_id_xml.rb +0 -36
data/test/tc_srf.rb +0 -262

data/lib/bsearch.rb ADDED Viewed

@@ -0,0 +1,120 @@
+#
+# Ruby/Bsearch - a binary search library for Ruby.
+#
+# Copyright (C) 2001 Satoru Takabayashi <satoru@namazu.org>
+#     All rights reserved.
+#     This is free software with ABSOLUTELY NO WARRANTY.
+#
+# You can redistribute it and/or modify it under the terms of
+# the Ruby's licence.
+#
+# Example:
+#
+#  % irb -r ./bsearch.rb
+#  >> %w(a b c c c d e f).bsearch_first {|x| x <=> "c"}
+#  => 2
+#  >> %w(a b c c c d e f).bsearch_last {|x| x <=> "c"}
+#  => 4
+#  >> %w(a b c e f).bsearch_first {|x| x <=> "c"}
+#  => 2
+#  >> %w(a b e f).bsearch_first {|x| x <=> "c"}
+#  => nil
+#  >> %w(a b e f).bsearch_last {|x| x <=> "c"}
+#  => nil
+#  >> %w(a b e f).bsearch_lower_boundary {|x| x <=> "c"}
+#  => 2
+#  >> %w(a b e f).bsearch_upper_boundary {|x| x <=> "c"}
+#  => 2
+#  >> %w(a b c c c d e f).bsearch_range {|x| x <=> "c"}
+#  => 2...5
+#  >> %w(a b c d e f).bsearch_range {|x| x <=> "c"}
+#  => 2...3
+#  >> %w(a b d e f).bsearch_range {|x| x <=> "c"}
+#  => 2...2
+module Bsearch
+  VERSION = '1.5'
+end
+class Array
+  #
+  # The binary search algorithm is extracted from Jon Bentley's
+  # Programming Pearls 2nd ed. p.93
+  #
+  #
+  # Return the lower boundary. (inside)
+  #
+  def bsearch_lower_boundary (range = 0 ... self.length, &block)
+    lower  = range.first() -1
+    upper = if range.exclude_end? then range.last else range.last + 1 end
+    while lower + 1 != upper
+      mid = ((lower + upper) / 2).to_i # for working with mathn.rb (Rational)
+      if yield(self[mid]) < 0
+	lower = mid
+      else
+	upper = mid
+      end
+    end
+    return upper
+  end
+  #
+  # This method searches for the FIRST occurrence which satisfies a
+  # condition given by a block in binary fashion and returns the
+  # index of the first occurrence. Returns nil if not found.
+  #
+  def bsearch_first (range = 0 ... self.length, &block)
+    boundary = bsearch_lower_boundary(range, &block)
+    if boundary >= self.length || yield(self[boundary]) != 0
+      return nil
+    else
+      return boundary
+    end
+  end
+  alias bsearch bsearch_first
+  #
+  # Return the upper boundary. (outside)
+  #
+  def bsearch_upper_boundary (range = 0 ... self.length, &block)
+    lower  = range.first() -1
+    upper = if range.exclude_end? then range.last else range.last + 1 end
+    while lower + 1 != upper
+      mid = ((lower + upper) / 2).to_i # for working with mathn.rb (Rational)
+      if yield(self[mid]) <= 0
+	lower = mid
+      else
+	upper = mid
+      end
+    end
+    return lower + 1 # outside of the matching range.
+  end
+  #
+  # This method searches for the LAST occurrence which satisfies a
+  # condition given by a block in binary fashion and returns the
+  # index of the last occurrence. Returns nil if not found.
+  #
+  def bsearch_last (range = 0 ... self.length, &block)
+    # `- 1' for canceling `lower + 1' in bsearch_upper_boundary.
+    boundary = bsearch_upper_boundary(range, &block) - 1
+    if (boundary <= -1 || yield(self[boundary]) != 0)
+      return nil
+    else
+      return boundary
+    end
+  end
+  #
+  # Return the search result as a Range object.
+  #
+  def bsearch_range (range = 0 ... self.length, &block)
+    lower = bsearch_lower_boundary(range, &block)
+    upper = bsearch_upper_boundary(range, &block)
+    return lower ... upper
+  end
+end

data/lib/fasta.rb CHANGED Viewed

@@ -1,5 +1,9 @@
 require 'sample_enzyme'
 require 'each_index'
+require 'optparse'
+require 'delegate'
+require 'hash_by'
+require 'digest/md5'
 tmp = $VERBOSE ; $VERBOSE = nil
@@ -27,8 +31,10 @@ end
 $VERBOSE = tmp
+module FastaManipulation ; end
-class Fasta
+class Fasta < DelegateClass(Array)
+  include FastaManipulation
   SHUFF_PREFIX = "SHUFF_"
   SHUFF_FILE_POSTFIX = "_SHUFF"
   CAT_SHUFF_FILE_POSTFIX = "_CAT_SHUFF"
@@ -37,21 +43,124 @@ class Fasta
   INV_FILE_POSTFIX = "_INV"
   CAT_INV_FILE_POSTFIX = "_CAT_INV"
-  attr_accessor :prots
+  attr_writer :prots
+  # this will probably be relative
+  attr_accessor :filename
-  def initialize(prots=nil)
-    if prots
-      @prots = prots
+  # for backwards compatibility
+  def prots
+    @prots
+  end
+  def self.to_fasta(file_or_obj)
+    if file_or_obj.is_a? Fasta
+      file_or_obj
+    else
+      Fasta.new(file_or_obj)
+    end
+  end
+  # arg can be:
+  #   Fasta::Prot objects (Array)
+  #   filename (String)
+  #   Another Fasta object (Fasta) (shallow copy!)
+  def initialize(arg=nil, filename=nil)
+    @filename = filename
+    @prots = []
+    if arg
+      if arg.is_a? Fasta
+        self.prots = arg.prots
+        self.filename = arg.filename
+      elsif arg.is_a? Array
+        @prots = arg
+      else
+        read_file(arg)
+      end
+    end
+    super(@prots)
+  end
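+  # returns the md5 sum (hex digest) of the file named by :filename, or nil if no such file exists
+  # (NOTE(review): a nil :filename is not checked before the File.exist? call -- confirm it is handled)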
+  def md5_sum
+    if File.exist?(@filename)
+      Digest::MD5.hexdigest(File.read(@filename))
     else
-      @prots = []
+      nil
     end
   end
+  # returns the length of the file (in terms of the total number of amino
+  # acids represented)
+  def aa_seq_length
+    tot = 0
+    self.each do |prot|
+      tot += prot.aaseq.size
+    end
+    tot
+  end
+  # searches proteins for an exact match to the given sequence and returns
+  # the matching protein headers (each with '>' & no newline) as an Array,
+  # even when there is only one exact match. nil if no matches
+  def header_from_exact_sequence(aaseq)
+    hash = self.hash_by(:aaseq)
+    answ = hash[aaseq].map{|v| v.header}
+    if answ.size == 1
+      answ
+    elsif answ.size == 0
+      nil
+    else
+      answ
+    end
+  end
+  # searches all headers to see if they include input string
+  # returns true if one matches, false otherwise
+  # (remember that headers are not stored with newline chars but do contain
+  # the beginning '>')
+  def included_in_header?(input)
+    @prots.any? do |prot|
+      prot.header.include? input
+    end
+  end
+  # takes an io object or string (which is the fasta data) This is not as
+  # stringent as 'read_file' which is recommended for industrial type use. For
+  # instance, this will fail if your newlines are different in your file from
+  # those defined on your operating system.  If you have a string, simply pass
+  # in StringIO.new(your_string) to be read.
+  # returns self
+  def load(io)
+    current_prot = nil
+    current_aaseq = nil
+    @prots.clear
+    io.each do |line|
+      if line[0,1] == '>'
+        current_prot = Prot.new
+        @prots << current_prot
+        current_prot.header = line.chomp
+        current_aaseq = ''
+        current_prot.aaseq = current_aaseq
+      elsif (line =~ /[^ ]/) && (line.size > 1)
+        current_aaseq << line.chomp
+      end
+    end
+    self
+  end
+  # uses 'load' to create a fasta object from a fasta string
+  def self.from_string(string)
+    Fasta.new.load(StringIO.new(string))
+  end
   # Reads fasta files (under windows or unix newlines)
   # Always outputs LF separated files
   # Checks that the first character per line is '>' or character class [A-Za-z*]
   # returns a fasta object for stringing commands
-  def read_file(fn)
+  # if fn not given, will read the :filename attribute
+  # will set :filename to fn if given
+  def read_file(fn=nil)
+    @filename = fn if fn
     first_char_re = /[A-Za-z*]/o
     obj = nil
     regex = /(\r\n)|\n/o
@@ -76,14 +185,151 @@ class Fasta
     self
   end
-  # Returns filename with the extension (including the '.' prefixed with
-  # the extension_prefix (given as a string)
-  def self.prefix_extension(filename, extension_prefix)
-    ext = File.extname(filename)
-    ext_regex = /#{Regexp.escape(ext)}$/o
-    new_filename = filename.gsub(ext_regex, extension_prefix + ext)
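+  # if no fn is given, falls back to @out (NOTE(review): @out is not assigned in the visible code; the :filename attribute was probably intended -- confirm)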
+  def write_file(fn=nil)
+    fn = @out unless fn
+    File.open(fn, "wb") do |out|
+      @prots.each do |prot|
+        out.print(prot.to_s)
+      end
+    end
+  end
+  # duplicates the object (deep copy)
+  def dup
+    other = self.class.new
+    other.filename = self.filename
+    self.prots.each do |prot|
+      other.prots << prot.dup
+    end
+    other
+  end
+end
+class FastaShaker
+  def reverse(fasta_file_or_obj, opts={})
+    shake_it(:reverse, fasta_file_or_obj, opts)
+  end
+  def shuffle(fasta_file_or_obj, opts={})
+    shake_it(:shuffle, fasta_file_or_obj, opts)
+  end
+  # builds and returns the outbound filename from the fasta's filename, the method, and opts
+  def create_filename(fasta, method, opts={})
+    file = fasta.filename || 'fasta'
+    filebase = file.sub(/\..*$/,'')
+    parts = [filebase]
+    parts << 'cat' if opts[:cat]
+    parts << method
+    parts << 'prefix' << opts[:prefix] if opts[:prefix]
+    parts << 'fraction' << opts[:fraction] if opts[:fraction]
+    parts << 'tryptic_peptides' if opts[:tryptic_peptides]
+    parts.join("_") << ".fasta"
+  end
+  protected
+  def shake_it(method, fasta_file_or_obj, opt)
+    fasta = Fasta.to_fasta(fasta_file_or_obj)
+    if opt[:cat] && !opt[:prefix]
+      message = "WARNING: concatenated proteins don't have unique headers\n[you probably wanted to use the '--prefix' option!]"
+      warn message
+    end
+    unless opt[:out]
+      opt[:out] = create_filename(fasta, method, opt)
+    end
+    ## CAT (save an original copy)
+    fasta_orig = fasta.dup if opt[:cat]
+    ## FRACTION the proteins
+    if f = opt[:fraction]
+      prefix = nil
+      if f > 1.0
+        prefix = proc {|cnt| "f#{cnt}_" }
+      end
+      fasta = fasta.fraction_of_prots(f, prefix)
+    end
+    ## PREFIX the proteins
+    if pre = opt[:prefix]
+      fasta.header_prefix!(pre)
+    end
+    ## MODIFY the proteins
+    fasta.aaseq!((method.to_s + '!').to_sym, opt[:tryptic_peptides])
+    ## CAT (finish it up)
+    if opt[:cat]
+      fasta_orig << fasta
+      fasta = fasta_orig
+    end
+    ## WRITE out the file
+    fasta.write_file(opt[:out])
+  end
+  #############################################
+  # END MAIN METHODS
+  #############################################
+  # takes command line input, and sends it to shake
+  def FastaShaker.shake_from_argv(argv)
+    opt = {}
+    opts = OptionParser.new do |op|
+      prog = File.basename(__FILE__)
+      op.banner = "USAGE: #{prog} <method> [OPTIONS] <file>.fasta"
+      op.separator "   <method> = reverse | shuffle"
+      op.separator ""
+      op.separator "fasta_shaker is kind of like a salt shaker:"
+      op.separator "shake up your fasta proteins and let them"
+      op.separator "season your dinner (hopefully a protein dinner).  Mmmm."
+      op.separator "false identification rates never tasted so good :)"
+      op.separator ""
+      op.on("-c", "--cat", "catenates the output to copy of original") {|v| opt[:cat] = v }
+      op.on("-o", "--out <string>", "name of output file (default is descriptive)") {|v| opt[:out] = v }
+      op.on("-p", "--prefix <string>", "give a header prefix to modified prots") {|v| opt[:prefix] = v }
+      op.on("-f", "--fraction <float>", Float, "creates some fraction of proteins") {|v| opt[:fraction] = v }
+      op.separator "        [if fraction > 1 then the tag 'f<frac#>_' prefixed to proteins"
+      op.separator "         (after any given prefix) so that proteins are unique]"
+      op.on("--tryptic_peptides", "applies method to [KR][^P] peptides") {|v| opt[:tryptic_peptides] = v }
+      op.separator ""
+      op.separator "EXAMPLES: "
+      op.separator "   #{prog} reverse file.fasta -o protein_aa_sequence_reversed.fasta"
+      op.separator "   #{prog} shuffle file.fasta -o protein_aa_sequence_shuffled.fasta"
+      op.separator "   #{prog} shuffle file.fasta -c -p SH_ -o normal_cat_shuffled_with_prefix.fasta"
+      op.separator "   #{prog} reverse file.fasta --tryptic_peptides tryptic_peptides_reversed.fasta"
+    end
+    #p argv
+    opts.parse!(argv)
+    if argv.size < 2
+      puts opts
+      exit
+    end
+    (method, file) = argv
+    fs = FastaShaker.new
+    fs.send(method.to_sym, file, opt)
   end
+  private
+end
+module FastaManipulation
   # concatenates the filenames like this:
   #   cat_filenames('fn1.ext1', 'fn2.ext2', '__') # -> 'fn1__fn2.ext1'
   #   the path and extension of the first filename are kept intact.
@@ -99,18 +345,6 @@ class Fasta
     fn1.gsub(/#{Regexp.escape(fn1_ext)}$/, connector + con_filenames + fn1_ext)
   end
-  # Convenience method for creating a modified file with a particular method
-  # from Fasta.  Returns the name of the output file.
-  def self.modify_file(file, method, file_postfix="", prot_header_prefix=nil)
-    file_out = prefix_extension(file, file_postfix)
-    fasta = Fasta.new
-    fasta.read_file(file)
-    fasta.send(method)
-    fasta.header_prefix!(prot_header_prefix) if prot_header_prefix
-    fasta.write_file(file_out)
-    file_out
-  end
   # returns a new fasta object using some fraction of proteins randomly
   # selected (fraction may be > 1).  Always rounds up.  Will not choose a
   # protein twice unless all other proteins have been chosen
@@ -150,30 +384,6 @@ class Fasta
     fasta_fraction = Fasta.new(arr)
   end
-  # Convenience method for modifying some fraction of the proteins of a file
-  # and concatenating it to a copy of the original.  Returns the name of the
-  # output file.
-  def self.modify_fraction_and_cat_to_file(file, method, fraction=1, file_postfix=nil, prot_header_prefix=nil)
-    #puts [file, method, fraction, file_postfix, prot_header_prefix].join("*")
-    file_postfix = "" unless file_postfix
-    fasta = Fasta.new
-    fasta.read_file(file)
-    outfile = prefix_extension(file, file_postfix)
-    other_fasta = fasta.fraction_of_prots(fraction)
-    other_fasta.send(method)
-    other_fasta.header_prefix!(prot_header_prefix) if prot_header_prefix
-    fasta << other_fasta
-    fasta.write_file(outfile)
-    return outfile
-  end
-  # Convenience method for modifying a file and concatenating it to a copy of
-  # the original.  Returns th name of the output file.
-  def self.modify_and_cat_to_file(file, method, file_postfix=nil, prot_header_prefix=nil)
-    fraction = 1
-    modify_fraction_and_cat_to_file(file, method, fraction, file_postfix, prot_header_prefix)
-  end
   # Convenience method to concatenate an array of fasta files.  Filenames are
   # concatenated according to 'cat_filenames') and prefixes the proteins
   # according to the values in 'file_prot_header_prefixes' array
@@ -196,35 +406,13 @@ class Fasta
   end
   def <<(other)
-    @prots.push(*(other.prots))
-  end
-  # @TODO: this should be in terms of sets, right now depends on order!!
-  def ==(other)
-    other_prots = other.prots
-    @prots.each_with_index do |prot, index|
-      if other_prots[index] != prot
-        return false
-      end
-    end
-    return true
-  end
-  def write_file(fn)
-    File.open(fn, "wb") do |out|
-      @prots.each do |prot|
-        out.print(prot.to_s)
-      end
-    end
-  end
-  # duplicates the object (deep copy)
-  def dup
-    other = self.class.new
-    self.prots.each do |prot|
-      other.prots << prot.dup
+    # case when with class names uses === operator
+    case other
+    when Fasta
+      @prots.push(*(other.prots))
+    when Fasta::Prot
+      @prots.push(other)
     end
-    other
   end
   # method = :shuffle! | :reverse!
@@ -260,10 +448,39 @@ class Fasta
       prot.header_prefix!(prefix)
     end
   end
+end
+# requires that object respond_to? :reference
+module ProteinReferenceable
+  # gives the string up to the first space (without the leading '>')
+  def first_entry
+    ref = reference
+    if ref
+      if ref.size > 1
+        ls_ref = ref.lstrip
+        index = ls_ref.index(' ')
+        if index
+          ls_ref[0...index]
+        else
+          ls_ref.dup
+        end
+      else
+        ''
+      end
+    else
+      nil
+    end
+  end
 end
 class Fasta::Prot
+  include ProteinReferenceable
   # header given as full line with starting '>' (but no newline chars!).
   # aaseq also given without any newline chars
   attr_accessor :header, :aaseq
@@ -280,15 +497,30 @@ class Fasta::Prot
     other && other.class == self.class && other.aaseq == self.aaseq && other.header == self.header
   end
-  # returns the fasta header information without the leading '>'
-  def reference
-    if @header =~ /^>(.*)/
-      $1.dup
+  # gives the string up to the first space (without the leading '>')
+  def first_entry
+    if @header
+      if @header.size > 1
+        index = @header.index(' ')
+        if index
+          @header[1...index]
+        else
+          @header[1..-1]
+        end
+      else
+        ''
+      end
     else
-      @header
+      nil
     end
   end
+  # returns the fasta header information without the leading '>'
+  def reference
+    @header[1..-1]
+  end
   # returns the value after the first '|' and before the second '|'
   # according to this regexp: /\|(.*?)\|/
   # This will typically be the gi code
@@ -314,7 +546,7 @@ class Fasta::Prot
   def tryptic_peptides!(method_as_symbol)
     peps = SampleEnzyme.tryptic(@aaseq)
     ends_in_RK = /[KR]/o
     ## if the last peptide doesn't end in R or K we want to flip it completely
     last_pep_special = nil
     if peps.last[-1,1] !~ /[KR]/
@@ -360,7 +592,7 @@ class Fasta::Prot
 end
 # For reference, my code is about 15X faster than the first code I wrote
 # below!  It turns out that the major slowdown is in the randomize routine.
 # Using my own randomize routine with the below way of reading fasta
@@ -391,4 +623,4 @@ end
 #    end
 #  end
 #end
+by=:protein, num=1

data/lib/group_by.rb ADDED Viewed

@@ -0,0 +1,10 @@
+#taken from rails, will be in Ruby 1.9
+module Enumerable
+  def group_by
+    inject({}) do |groups, element|
+      (groups[yield(element)] ||= []) << element
+      groups
+    end
+  end
+end

data/lib/index_by.rb ADDED Viewed

@@ -0,0 +1,11 @@
+# taken from rails (will be in Ruby 1.9??)
+module Enumerable
+  def index_by
+    inject({}) do |accum, elem|
+      accum[yield(elem)] = elem
+      accum
+    end
+  end
+end

data/lib/merge_deep.rb ADDED Viewed

@@ -0,0 +1,21 @@
+class Hash
+  # any hashes within the hash will also be merged to the level specified
+  def merge_deep(hash2, level=1)
+    if level == 1
+      tmp_opts = {}
+      self.each do |k,v|
+        if (v.is_a?(Hash) and hash2[k].is_a?(Hash))
+          tmp_opts[k] = v.merge(hash2[k])
+        end
+      end
+      opts = self.merge(hash2)
+      opts.merge!(tmp_opts)
+      opts
+    else
+      raise NotImplementedError, "need to implement level > 1"
+    end
+  end
+end