RubyGems - mspire - Versions diffs - 0.4.9 → 0.5.0 - Mend

mspire 0.4.9 → 0.5.0

Files changed (255) hide show

data/README +27 -17
data/changelog.txt +31 -62
data/lib/ms/calc.rb +32 -0
data/lib/ms/data/interleaved.rb +60 -0
data/lib/ms/data/lazy_io.rb +73 -0
data/lib/ms/data/lazy_string.rb +15 -0
data/lib/ms/data/simple.rb +59 -0
data/lib/ms/data/transposed.rb +41 -0
data/lib/ms/data.rb +57 -0
data/lib/ms/format/format_error.rb +12 -0
data/lib/ms/spectrum.rb +25 -384
data/lib/ms/support/binary_search.rb +126 -0
data/lib/ms.rb +10 -10
metadata +38 -350
data/INSTALL +0 -58
data/README.rdoc +0 -18
data/Rakefile +0 -330
data/bin/aafreqs.rb +0 -23
data/bin/bioworks2excel.rb +0 -14
data/bin/bioworks_to_pepxml.rb +0 -148
data/bin/bioworks_to_pepxml_gui.rb +0 -225
data/bin/fasta_shaker.rb +0 -5
data/bin/filter_and_validate.rb +0 -5
data/bin/gi2annot.rb +0 -14
data/bin/id_class_anal.rb +0 -112
data/bin/id_precision.rb +0 -172
data/bin/ms_to_lmat.rb +0 -67
data/bin/pepproph_filter.rb +0 -16
data/bin/prob_validate.rb +0 -6
data/bin/protein_summary.rb +0 -6
data/bin/protxml2prots_peps.rb +0 -32
data/bin/raw_to_mzXML.rb +0 -55
data/bin/run_percolator.rb +0 -122
data/bin/sqt_group.rb +0 -26
data/bin/srf_group.rb +0 -27
data/bin/srf_to_sqt.rb +0 -40
data/lib/align/chams.rb +0 -78
data/lib/align.rb +0 -154
data/lib/archive/targz.rb +0 -94
data/lib/bsearch.rb +0 -120
data/lib/core_extensions.rb +0 -16
data/lib/fasta.rb +0 -626
data/lib/gi.rb +0 -124
data/lib/group_by.rb +0 -10
data/lib/index_by.rb +0 -11
data/lib/merge_deep.rb +0 -21
data/lib/ms/converter/mzxml.rb +0 -77
data/lib/ms/gradient_program.rb +0 -170
data/lib/ms/msrun.rb +0 -244
data/lib/ms/msrun_index.rb +0 -108
data/lib/ms/parser/mzdata/axml.rb +0 -67
data/lib/ms/parser/mzdata/dom.rb +0 -175
data/lib/ms/parser/mzdata/libxml.rb +0 -7
data/lib/ms/parser/mzdata.rb +0 -31
data/lib/ms/parser/mzxml/axml.rb +0 -70
data/lib/ms/parser/mzxml/dom.rb +0 -182
data/lib/ms/parser/mzxml/hpricot.rb +0 -253
data/lib/ms/parser/mzxml/libxml.rb +0 -19
data/lib/ms/parser/mzxml/regexp.rb +0 -122
data/lib/ms/parser/mzxml/rexml.rb +0 -72
data/lib/ms/parser/mzxml/xmlparser.rb +0 -248
data/lib/ms/parser/mzxml.rb +0 -282
data/lib/ms/parser.rb +0 -108
data/lib/ms/precursor.rb +0 -25
data/lib/ms/scan.rb +0 -81
data/lib/mspire.rb +0 -4
data/lib/pi_zero.rb +0 -244
data/lib/qvalue.rb +0 -161
data/lib/roc.rb +0 -187
data/lib/sample_enzyme.rb +0 -160
data/lib/scan_i.rb +0 -21
data/lib/spec_id/aa_freqs.rb +0 -170
data/lib/spec_id/bioworks.rb +0 -497
data/lib/spec_id/digestor.rb +0 -138
data/lib/spec_id/mass.rb +0 -179
data/lib/spec_id/parser/proph.rb +0 -335
data/lib/spec_id/precision/filter/cmdline.rb +0 -218
data/lib/spec_id/precision/filter/interactive.rb +0 -134
data/lib/spec_id/precision/filter/output.rb +0 -148
data/lib/spec_id/precision/filter.rb +0 -637
data/lib/spec_id/precision/output.rb +0 -60
data/lib/spec_id/precision/prob/cmdline.rb +0 -160
data/lib/spec_id/precision/prob/output.rb +0 -94
data/lib/spec_id/precision/prob.rb +0 -249
data/lib/spec_id/proph/pep_summary.rb +0 -104
data/lib/spec_id/proph/prot_summary.rb +0 -484
data/lib/spec_id/proph.rb +0 -4
data/lib/spec_id/protein_summary.rb +0 -489
data/lib/spec_id/sequest/params.rb +0 -316
data/lib/spec_id/sequest/pepxml.rb +0 -1458
data/lib/spec_id/sequest.rb +0 -33
data/lib/spec_id/sqt.rb +0 -349
data/lib/spec_id/srf.rb +0 -973
data/lib/spec_id.rb +0 -778
data/lib/spec_id_xml.rb +0 -99
data/lib/transmem/phobius.rb +0 -147
data/lib/transmem/toppred.rb +0 -368
data/lib/transmem.rb +0 -157
data/lib/validator/aa.rb +0 -48
data/lib/validator/aa_est.rb +0 -112
data/lib/validator/background.rb +0 -77
data/lib/validator/bias.rb +0 -95
data/lib/validator/cmdline.rb +0 -431
data/lib/validator/decoy.rb +0 -107
data/lib/validator/digestion_based.rb +0 -70
data/lib/validator/probability.rb +0 -51
data/lib/validator/prot_from_pep.rb +0 -234
data/lib/validator/q_value.rb +0 -32
data/lib/validator/transmem.rb +0 -272
data/lib/validator/true_pos.rb +0 -46
data/lib/validator.rb +0 -197
data/lib/xml.rb +0 -38
data/lib/xml_style_parser.rb +0 -119
data/lib/xmlparser_wrapper.rb +0 -19
data/release_notes.txt +0 -2
data/script/compile_and_plot_smriti_final.rb +0 -97
data/script/create_little_pepxml.rb +0 -61
data/script/degenerate_peptides.rb +0 -47
data/script/estimate_fpr_by_cysteine.rb +0 -226
data/script/extract_gradient_programs.rb +0 -56
data/script/find_cysteine_background.rb +0 -137
data/script/genuine_tps_and_probs.rb +0 -136
data/script/get_apex_values_rexml.rb +0 -44
data/script/histogram_probs.rb +0 -61
data/script/mascot_fix_pepxml.rb +0 -123
data/script/msvis.rb +0 -42
data/script/mzXML2timeIndex.rb +0 -25
data/script/peps_per_bin.rb +0 -67
data/script/prep_dir.rb +0 -121
data/script/simple_protein_digestion.rb +0 -27
data/script/smriti_final_analysis.rb +0 -103
data/script/sqt_to_meta.rb +0 -24
data/script/top_hit_per_scan.rb +0 -67
data/script/toppred_to_yaml.rb +0 -47
data/script/tpp_installer.rb +0 -249
data/specs/align_spec.rb +0 -79
data/specs/bin/bioworks_to_pepxml_spec.rb +0 -79
data/specs/bin/fasta_shaker_spec.rb +0 -259
data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +0 -199
data/specs/bin/filter_and_validate_spec.rb +0 -180
data/specs/bin/ms_to_lmat_spec.rb +0 -34
data/specs/bin/prob_validate_spec.rb +0 -86
data/specs/bin/protein_summary_spec.rb +0 -14
data/specs/fasta_spec.rb +0 -354
data/specs/gi_spec.rb +0 -22
data/specs/load_bin_path.rb +0 -7
data/specs/merge_deep_spec.rb +0 -13
data/specs/ms/gradient_program_spec.rb +0 -77
data/specs/ms/msrun_spec.rb +0 -498
data/specs/ms/parser_spec.rb +0 -92
data/specs/ms/spectrum_spec.rb +0 -87
data/specs/pi_zero_spec.rb +0 -115
data/specs/qvalue_spec.rb +0 -39
data/specs/roc_spec.rb +0 -251
data/specs/rspec_autotest.rb +0 -149
data/specs/sample_enzyme_spec.rb +0 -126
data/specs/spec_helper.rb +0 -135
data/specs/spec_id/aa_freqs_spec.rb +0 -52
data/specs/spec_id/bioworks_spec.rb +0 -148
data/specs/spec_id/digestor_spec.rb +0 -75
data/specs/spec_id/precision/filter/cmdline_spec.rb +0 -20
data/specs/spec_id/precision/filter/output_spec.rb +0 -31
data/specs/spec_id/precision/filter_spec.rb +0 -246
data/specs/spec_id/precision/prob_spec.rb +0 -44
data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
data/specs/spec_id/proph/pep_summary_spec.rb +0 -98
data/specs/spec_id/proph/prot_summary_spec.rb +0 -128
data/specs/spec_id/protein_summary_spec.rb +0 -189
data/specs/spec_id/sequest/params_spec.rb +0 -68
data/specs/spec_id/sequest/pepxml_spec.rb +0 -374
data/specs/spec_id/sequest_spec.rb +0 -38
data/specs/spec_id/sqt_spec.rb +0 -246
data/specs/spec_id/srf_spec.rb +0 -172
data/specs/spec_id/srf_spec_helper.rb +0 -139
data/specs/spec_id_helper.rb +0 -33
data/specs/spec_id_spec.rb +0 -366
data/specs/spec_id_xml_spec.rb +0 -33
data/specs/transmem/phobius_spec.rb +0 -425
data/specs/transmem/toppred_spec.rb +0 -298
data/specs/transmem_spec.rb +0 -60
data/specs/transmem_spec_shared.rb +0 -64
data/specs/validator/aa_est_spec.rb +0 -66
data/specs/validator/aa_spec.rb +0 -40
data/specs/validator/background_spec.rb +0 -67
data/specs/validator/bias_spec.rb +0 -122
data/specs/validator/decoy_spec.rb +0 -51
data/specs/validator/fasta_helper.rb +0 -26
data/specs/validator/prot_from_pep_spec.rb +0 -141
data/specs/validator/transmem_spec.rb +0 -146
data/specs/validator/true_pos_spec.rb +0 -58
data/specs/validator_helper.rb +0 -33
data/specs/xml_spec.rb +0 -12
data/test_files/000_pepxml18_small.xml +0 -206
data/test_files/020a.mzXML.timeIndex +0 -4710
data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +0 -3973
data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +0 -3872
data/test_files/4-03-03_small-prot.xml +0 -321
data/test_files/4-03-03_small.xml +0 -3876
data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
data/test_files/bioworks-3.3_10prots.xml +0 -5999
data/test_files/bioworks31.params +0 -77
data/test_files/bioworks32.params +0 -62
data/test_files/bioworks33.params +0 -63
data/test_files/bioworks_single_run_small.xml +0 -7237
data/test_files/bioworks_small.fasta +0 -212
data/test_files/bioworks_small.params +0 -63
data/test_files/bioworks_small.phobius +0 -109
data/test_files/bioworks_small.toppred.out +0 -2847
data/test_files/bioworks_small.xml +0 -5610
data/test_files/bioworks_with_INV_small.xml +0 -3753
data/test_files/bioworks_with_SHUFF_small.xml +0 -2503
data/test_files/corrupted_900.srf +0 -0
data/test_files/head_of_7MIX.srf +0 -0
data/test_files/interact-opd1_mods_small-prot.xml +0 -304
data/test_files/messups.fasta +0 -297
data/test_files/opd1/000.my_answer.100lines.xml +0 -101
data/test_files/opd1/000.tpp_1.2.3.first10.xml +0 -115
data/test_files/opd1/000.tpp_2.9.2.first10.xml +0 -126
data/test_files/opd1/000.v2.1.mzXML.timeIndex +0 -3748
data/test_files/opd1/000_020-prot.png +0 -0
data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +0 -62
data/test_files/opd1/000_020_3prots-prot.xml +0 -62
data/test_files/opd1/opd1_cat_inv_small-prot.xml +0 -139
data/test_files/opd1/sequest.3.1.params +0 -77
data/test_files/opd1/sequest.3.2.params +0 -62
data/test_files/opd1/twenty_scans.mzXML +0 -418
data/test_files/opd1/twenty_scans.v2.1.mzXML +0 -382
data/test_files/opd1/twenty_scans_answ.lmat +0 -0
data/test_files/opd1/twenty_scans_answ.lmata +0 -9
data/test_files/opd1_020_beginning.RAW +0 -0
data/test_files/opd1_2runs_2mods/data/020.mzData.xml +0 -683
data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +0 -382
data/test_files/opd1_2runs_2mods/data/040.mzData.xml +0 -683
data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +0 -382
data/test_files/opd1_2runs_2mods/data/README.txt +0 -6
data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +0 -753
data/test_files/orbitrap_mzData/000_cut.xml +0 -1920
data/test_files/pepproph_small.xml +0 -4691
data/test_files/phobius.small.noheader.txt +0 -50
data/test_files/phobius.small.small.txt +0 -53
data/test_files/s01_anC1_ld020mM.key.txt +0 -25
data/test_files/s01_anC1_ld020mM.meth +0 -0
data/test_files/small.fasta +0 -297
data/test_files/small.sqt +0 -87
data/test_files/smallraw.RAW +0 -0
data/test_files/tf_bioworks2excel.bioXML +0 -14340
data/test_files/tf_bioworks2excel.txt.actual +0 -1035
data/test_files/toppred.small.out +0 -416
data/test_files/toppred.xml.out +0 -318
data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +0 -7
data/test_files/validator_hits_separate/bioworks_small_HS.xml +0 -5651
data/test_files/yeast_gly_small-prot.xml +0 -265
data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +0 -6
data/test_files/yeast_gly_small.xml +0 -3807
data/test_files/yeast_gly_small2.parentTimes +0 -6

data/lib/fasta.rb DELETED Viewed

@@ -1,626 +0,0 @@
require 'delegate'
require 'digest/md5'
require 'optparse'
require 'stringio'

require 'sample_enzyme'
require 'each_index'
require 'hash_by'
# Warnings are switched off while String is reopened (presumably to hide
# "method redefined" noise) and restored right after the class body.
tmp = $VERBOSE
$VERBOSE = nil
class String
  # Yields every character position of the receiver, lowest first.
  def each_index
    (0...size).each { |pos| yield pos }
  end

  # Randomly permutes the characters in place.
  # modifies and returns self
  def shuffle!
    each_index do |pos|
      offset = rand(size - pos)
      self[pos], self[pos + offset] = self[pos + offset], self[pos]
    end
    self
  end

  # Returns a shuffled copy; the receiver itself is not modified.
  def shuffle
    dup.shuffle!
  end
end
$VERBOSE = tmp
# Forward declaration so that Fasta can mix the module in before the module's
# methods are defined further down the file.
module FastaManipulation ; end

# A list of Fasta::Prot entries that behaves like an Array (all Array methods
# are delegated to the protein list) and remembers the file it came from.
class Fasta < DelegateClass(Array)
  include FastaManipulation

  SHUFF_PREFIX = "SHUFF_"
  SHUFF_FILE_POSTFIX = "_SHUFF"
  CAT_SHUFF_FILE_POSTFIX = "_CAT_SHUFF"
  FILE_CONNECTOR = "__"
  INV_PREFIX = "INV_"
  INV_FILE_POSTFIX = "_INV"
  CAT_INV_FILE_POSTFIX = "_CAT_INV"

  # this will probably be relative
  attr_accessor :filename

  # for backwards compatibility
  attr_reader :prots

  # Replaces the protein list.  The Array delegation is re-pointed as well so
  # that size/each/[] keep reflecting the list returned by #prots.
  def prots=(new_prots)
    @prots = new_prots
    __setobj__(new_prots)
  end

  # Returns file_or_obj itself when it already is a Fasta, otherwise builds a
  # new Fasta from it (see #initialize for the accepted arguments).
  def self.to_fasta(file_or_obj)
    file_or_obj.is_a?(Fasta) ? file_or_obj : Fasta.new(file_or_obj)
  end

  # arg can be:
  #   Fasta::Prot objects (Array)
  #   filename (String)
  #   Another Fasta object (Fasta) (shallow copy!)
  def initialize(arg=nil, filename=nil)
    @filename = filename
    @prots = []
    super(@prots)
    if arg
      case arg
      when Fasta
        self.prots = arg.prots
        self.filename = arg.filename
      when Array
        self.prots = arg
      else
        read_file(arg)
      end
    end
  end

  # Returns the md5 hex digest of the file named by :filename, or nil when no
  # filename is set or the file does not exist.
  def md5_sum
    return nil unless @filename && File.exist?(@filename)
    Digest::MD5.hexdigest(File.read(@filename))
  end

  # returns the length of the file (in terms of the total number of amino
  # acids represented)
  def aa_seq_length
    @prots.inject(0) { |total, prot| total + prot.aaseq.size }
  end

  # Searches the proteins for sequences exactly equal to aaseq.
  # Returns an Array with the header (with > & no newline) of every matching
  # protein, or nil if nothing matches.
  def header_from_exact_sequence(aaseq)
    headers = @prots.select { |prot| prot.aaseq == aaseq }.map { |prot| prot.header }
    headers.empty? ? nil : headers
  end

  # searches all headers to see if they include input string
  # returns true if one matches, false otherwise
  # (remember that headers are not stored with newline chars but do contain
  # the beginning '>')
  def included_in_header?(input)
    @prots.any? { |prot| prot.header.include?(input) }
  end

  # takes an io object (anything whose #each yields lines) holding fasta data.
  # This is not as stringent as 'read_file' which is recommended for
  # industrial type use: lines are split by the io itself, so unusual newline
  # conventions are not normalized.  If you have a string, simply pass in
  # StringIO.new(your_string) to be read.
  # Any proteins already held are discarded.  Whitespace-only lines are
  # skipped.  Raises RuntimeError if sequence data precedes the first header.
  # returns self
  def load(io)
    current_aaseq = nil
    @prots.clear
    io.each do |line|
      if line[0,1] == '>'
        current_prot = Prot.new
        @prots << current_prot
        current_prot.header = line.chomp
        # fresh mutable string shared with the protein so later lines append
        current_aaseq = ''.dup
        current_prot.aaseq = current_aaseq
      elsif line =~ /\S/
        raise "Sequence data before first fasta header: -->#{line.chomp}<--" unless current_aaseq
        current_aaseq << line.chomp
      end
    end
    self
  end

  # uses 'load' to create a fasta object from a fasta string
  def self.from_string(string)
    Fasta.new.load(StringIO.new(string))
  end

  # Reads fasta files (under windows or unix newlines)
  # Checks that the first character per line is '>' or character class [A-Za-z*]
  # and raises RuntimeError otherwise (also when sequence precedes any header).
  # returns a fasta object for stringing commands
  # if fn not given, will read the :filename attribute
  # will set :filename to fn if given
  # Parsed proteins are appended to the ones already held.
  def read_file(fn=nil)
    @filename = fn if fn
    raise ArgumentError, "no filename given and :filename is not set" unless @filename
    # block form closes the handle even if reading raises
    contents = File.open(@filename, 'rb') { |fh| fh.read }
    current = nil
    contents.split(/\r?\n/).each do |line|
      next unless line =~ /[^ \n\r]/
      first_char = line[0,1]
      if first_char == '>'
        current = Prot.new
        @prots << current
        current.header = line.dup
      elsif current && first_char =~ /[A-Za-z*]/
        current.aaseq << line.chomp
      else
        raise "Line not in fasta format (between arrows): -->#{line}<--"
      end
    end
    self
  end

  # Writes every protein (via its #to_s) to fn in binary mode.
  # if no fn, will write to :filename attribute
  def write_file(fn=nil)
    fn ||= @filename
    raise ArgumentError, "no filename given and :filename is not set" unless fn
    File.open(fn, "wb") do |out|
      @prots.each { |prot| out.print(prot.to_s) }
    end
  end

  # duplicates the object (deep copy)
  def dup
    other = self.class.new
    other.filename = filename
    @prots.each { |prot| other.prots << prot.dup }
    other
  end
end
# Produces modified copies of a fasta file in which each protein sequence has
# been reversed or shuffled, optionally prefixed, sub-sampled and/or
# concatenated onto the original proteins.
class FastaShaker
  # The only transformations that may be requested from the command line.
  METHODS = %w[reverse shuffle].freeze

  # takes command line input, and sends it to shake
  #
  # argv is consumed by the option parser.  Prints the usage and exits when
  # fewer than two positional arguments remain.  Raises ArgumentError when
  # <method> is not one of METHODS; the name comes straight from the user, so
  # it must never be dispatched unchecked.
  def self.shake_from_argv(argv)
    opt = {}
    opts = OptionParser.new do |op|
      # NOTE(review): __FILE__ is this library file, not the launching script
      prog = File.basename(__FILE__)
      op.banner = "USAGE: #{prog} <method> [OPTIONS] <file>.fasta"
      op.separator "   <method> = reverse | shuffle"
      op.separator ""
      op.separator "fasta_shaker is kind of like a salt shaker:"
      op.separator "shake up your fasta proteins and let them"
      op.separator "season your dinner (hopefully a protein dinner).  Mmmm."
      op.separator "false identification rates never tasted so good :)"
      op.separator ""
      op.on("-c", "--cat", "catenates the output to copy of original") {|v| opt[:cat] = v }
      op.on("-o", "--out <string>", "name of output file (default is descriptive)") {|v| opt[:out] = v }
      op.on("-p", "--prefix <string>", "give a header prefix to modified prots") {|v| opt[:prefix] = v }
      op.on("-f", "--fraction <float>", Float, "creates some fraction of proteins") {|v| opt[:fraction] = v }
      op.separator "        [if fraction > 1 then the tag 'f<frac#>_' prefixed to proteins"
      op.separator "         (after any given prefix) so that proteins are unique]"
      op.on("--tryptic_peptides", "applies method to [KR][^P] peptides") {|v| opt[:tryptic_peptides] = v }
      op.separator ""
      op.separator "EXAMPLES: "
      op.separator "   #{prog} reverse file.fasta -o protein_aa_sequence_reversed.fasta"
      op.separator "   #{prog} shuffle file.fasta -o protein_aa_sequence_shuffled.fasta"
      op.separator "   #{prog} shuffle file.fasta -c -p SH_ -o normal_cat_shuffled_with_prefix.fasta"
      op.separator "   #{prog} reverse file.fasta --tryptic_peptides tryptic_peptides_reversed.fasta"
    end
    opts.parse!(argv)
    if argv.size < 2
      puts opts
      exit
    end
    (method, file) = argv
    unless METHODS.include?(method)
      raise ArgumentError, "unknown method '#{method}' (expected #{METHODS.join(' | ')})"
    end
    new.send(method.to_sym, file, opt)
  end

  # Writes a copy of the fasta with every sequence reversed.
  # fasta_file_or_obj is a filename or a Fasta object; see shake_it for opts.
  def reverse(fasta_file_or_obj, opts={})
    shake_it(:reverse, fasta_file_or_obj, opts)
  end

  # Writes a copy of the fasta with every sequence shuffled.
  # fasta_file_or_obj is a filename or a Fasta object; see shake_it for opts.
  def shuffle(fasta_file_or_obj, opts={})
    shake_it(:shuffle, fasta_file_or_obj, opts)
  end

  # Returns a descriptive output filename built from the fasta's filename
  # ('fasta' when it has none), the method and the options in use.  Everything
  # from the first '.' of the file's basename onward is dropped; the directory
  # part is kept as is.
  def create_filename(fasta, method, opts={})
    file = fasta.filename || 'fasta'
    base = File.basename(file)
    stem = base.sub(/\..*\z/, '')
    # only re-attach a directory when the name actually carried one
    filebase = (base == file) ? stem : File.join(File.dirname(file), stem)
    parts = [filebase]
    parts << 'cat' if opts[:cat]
    parts << method
    parts << 'prefix' << opts[:prefix] if opts[:prefix]
    parts << 'fraction' << opts[:fraction] if opts[:fraction]
    parts << 'tryptic_peptides' if opts[:tryptic_peptides]
    parts.join("_") << ".fasta"
  end

  protected

  # Does the actual work for #reverse and #shuffle.
  #
  # opt keys: :cat, :out, :prefix, :fraction, :tryptic_peptides.
  # When :out is missing it is filled in (in the caller's hash) with the name
  # from create_filename, so the caller can find the file that was written.
  # A Fasta object passed in is modified in place unless :fraction is given.
  def shake_it(method, fasta_file_or_obj, opt)
    fasta = Fasta.to_fasta(fasta_file_or_obj)
    if opt[:cat] && !opt[:prefix]
      message = "WARNING: concatenated proteins don't have unique headers\n[you probably wanted to use the '--prefix' option!]"
      warn message
    end
    opt[:out] ||= create_filename(fasta, method, opt)

    ## CAT (save an original copy)
    fasta_orig = fasta.dup if opt[:cat]

    ## FRACTION the proteins
    if f = opt[:fraction]
      # more than one full pass needs a per-pass tag to keep headers unique
      prefix = (f > 1.0) ? proc {|cnt| "f#{cnt}_" } : nil
      fasta = fasta.fraction_of_prots(f, prefix)
    end

    ## PREFIX the proteins
    if pre = opt[:prefix]
      fasta.header_prefix!(pre)
    end

    ## MODIFY the proteins
    fasta.aaseq!((method.to_s + '!').to_sym, opt[:tryptic_peptides])

    ## CAT (finish it up)
    if opt[:cat]
      fasta_orig << fasta
      fasta = fasta_orig
    end

    ## WRITE out the file
    fasta.write_file(opt[:out])
  end
end
# Bulk operations for Fasta objects.  The instance methods expect the
# including object to keep its proteins in @prots.
module FastaManipulation
  # concatenates the filenames like this:
  #   cat_filenames(['fn1.ext1', 'fn2.ext2'], '__') # -> 'fn1__fn2.ext1'
  #   the path and extension of the first filename are kept intact.
  #   other files only use the basename (with no extension)
  # The filenames array is not modified; a nil connector counts as "".
  # Raises ArgumentError when filenames is empty.
  def self.cat_filenames(filenames, connector="")
    raise ArgumentError, "need at least one filename" if filenames.empty?
    connector = connector.to_s
    first, *others = filenames
    ext = File.extname(first)
    stem = first[0, first.size - ext.size]
    other_names = others.map { |fn| File.basename(fn, File.extname(fn)) }
    stem + connector + other_names.join(connector) + ext
  end

  # returns a new fasta object using some fraction of proteins randomly
  # selected (fraction may be > 1).  Always rounds up.  Will not choose a
  # protein twice unless all other proteins have been chosen
  #
  # prefix_proc ensures that a unique header is given even if multiple
  # fractions of proteins are being created.  It is called with the number of
  # complete passes already made over the protein list:
  # so for the first n proteins, it will be 0,
  # for the second n proteins it will be 1, etc.
  # e.g. prefix_proc = proc {|frac_cnt| "f#{frac_cnt}_" }
  # would give headers like this: >f0_<some_real_header>,
  # >f1_<some_real_header>, ...
  # The returned proteins are copies; the receiver's proteins are untouched.
  def fraction_of_prots(fraction=1, prefix_proc=nil)
    wanted = (fraction.to_f * @prots.size).ceil
    chosen = []
    pool = []
    pass = -1
    wanted.times do
      if pool.empty?
        # start a new pass over a fresh copy of every protein
        pool = @prots.map {|prt| prt.dup }
        pass += 1
      end
      pick = pool.delete_at(rand(pool.size))
      pick.header_prefix!(prefix_proc.call(pass)) if prefix_proc
      chosen << pick
    end
    Fasta.new(chosen)
  end

  # Convenience method to concatenate an array of fasta files.  Filenames are
  # concatenated (according to 'cat_filenames') and the proteins of each file
  # are prefixed according to the values in the 'file_prot_header_prefixes'
  # array (nil entries leave that file's headers alone).
  # Writes the combined file and returns its name.
  def self.cat_and_prefix(files, file_prot_header_prefixes=nil, file_connector=nil)
    fastas = files.map { |file| Fasta.new.read_file(file) }
    outfile = cat_filenames(files, file_connector)
    if file_prot_header_prefixes
      file_prot_header_prefixes.each_with_index do |prefix,i|
        fastas[i].header_prefix!(prefix) if prefix
      end
    end
    combined = fastas.shift
    fastas.each { |fasta| combined << fasta }
    combined.write_file(outfile)
    outfile
  end

  # Appends all proteins of another Fasta, or a single Fasta::Prot.
  # Anything else is ignored.  Returns self so that appends can be chained.
  def <<(other)
    # case when with class names uses === operator
    case other
    when Fasta
      @prots.push(*(other.prots))
    when Fasta::Prot
      @prots.push(other)
    end
    self
  end

  # Applies the given in-place method to every protein sequence, or to each
  # tryptic peptide of every protein when tryptic_peptides is true.
  # method = :shuffle! | :reverse!
  def aaseq!(method_as_symbol=:shuffle!, tryptic_peptides=false)
    if tryptic_peptides
      @prots.each {|prot| prot.tryptic_peptides!(method_as_symbol) }
    else
      @prots.each {|prot| prot.aaseq!(method_as_symbol) }
    end
  end

  # shuffles the aa sequence of each protein (each protein within itself)
  def aaseq_shuffle!
    @prots.each {|prot| prot.shuffle! }
  end

  # inverts the aa sequence of each protein (each protein within itself)
  def aaseq_invert!
    @prots.each {|prot| prot.invert! }
  end

  # inverts each tryptic peptide of each protein
  def aaseq_invert_tryptic_peptides!
    @prots.each {|prot| prot.invert_tryptic_peptides! }
  end

  # shuffles each tryptic peptide of each protein
  def aaseq_shuffle_tryptic_peptides!
    @prots.each {|prot| prot.shuffle_tryptic_peptides! }
  end

  # adds the prefix to the header of every protein
  def header_prefix!(prefix)
    @prots.each {|prot| prot.header_prefix!(prefix) }
  end
end
# requires that object respond_to? :reference
module ProteinReferenceable
  # Returns the reference up to (not including) its first space, ignoring any
  # leading whitespace.  The whole (left-stripped) reference is returned when
  # it holds no space; nil is returned when the reference itself is nil.
  def first_entry
    ref = reference
    return nil unless ref
    trimmed = ref.lstrip
    stop = trimmed.index(' ')
    stop ? trimmed[0...stop] : trimmed
  end
end
# One fasta entry: a header line plus its amino acid sequence.
class Fasta::Prot
  include ProteinReferenceable

  # header given as full line with starting '>' (but no newline chars!).
  # aaseq also given without any newline chars
  attr_accessor :header, :aaseq

  # Both arguments are optional; each defaults to its own empty, mutable
  # string (the sequence is appended to in place while a file is parsed).
  def initialize(header=nil, aaseq=nil)
    @header = header || ''.dup
    @aaseq = aaseq || ''.dup
  end

  # Two proteins are equal when they are of the same class and have equal
  # header and sequence.
  def ==(other)
    other && other.class == self.class && other.aaseq == self.aaseq && other.header == self.header
  end

  # gives the string up to the first space (without the leading '>')
  # '' for a header of at most one character, nil when there is no header
  def first_entry
    return nil unless @header
    return '' unless @header.size > 1
    stop = @header.index(' ')
    stop ? @header[1...stop] : @header[1..-1]
  end

  # returns the fasta header information without the leading '>'
  def reference
    @header[1..-1]
  end

  # returns the value after the first '|' and before the second '|'
  # according to this regexp: /\|(.*?)\|/
  # This will typically be the gi code
  # Returns nil if it doesn't match
  def gi
    @header =~ /\|(.*?)\|/ ? $1.dup : nil
  end

  # convenience
  def invert_tryptic_peptides! ; tryptic_peptides!(:reverse) end
  def shuffle_tryptic_peptides! ; tryptic_peptides!(:shuffle) end

  # modifies tryptic peptides as given by SampleEnzyme.tryptic(@aaseq)
  # [cuts after K or R but not if followed by a P]
  # The final K/R of each peptide stays in place; a last peptide that does not
  # end in K or R is modified as a whole.
  # method_as_symbol: :reverse | :shuffle OR :reverse! | :shuffle!
  # e.g. with :reverse, assuming the cut rule above:
  #  aaseq = 'ABCKCDERDEKDGEKWXYRRKDER'
  #  -> 'CBAKEDCREDKEGDKYXWRRKEDR'
  # Sets and returns the new sequence; a protein that yields no peptides is
  # left unchanged.
  def tryptic_peptides!(method_as_symbol)
    peps = SampleEnzyme.tryptic(@aaseq)
    return @aaseq if peps.empty?

    ## if the last peptide doesn't end in R or K we want to flip it completely
    last_pep_special = nil
    last_pep_special = peps.pop if peps.last[-1,1] !~ /[KR]/

    # [-1,1] (not [-1]) so the cleavage residue is a String on every Ruby
    rev_peps = peps.map {|pep| pep[0..-2].send(method_as_symbol) << pep[-1,1] }
    rev_peps << last_pep_special.send(method_as_symbol) if last_pep_special
    @aaseq = rev_peps.join
  end

  # Applies an in-place String method to the sequence.
  # takes :reverse! | :shuffle!
  def aaseq!(method_as_symbol)
    @aaseq.send(method_as_symbol)
  end

  # reverses the sequence in place
  def invert!
    @aaseq.reverse!
  end

  # shuffles the sequence in place (needs String#shuffle!)
  def shuffle!
    @aaseq.shuffle!
  end

  # adds a prefix to the protein header (which comes after the '>' char) if
  # one is not already there.
  def header_prefix!(prefix)
    unless @header =~ /^>#{Regexp.escape(prefix)}/
      @header.gsub!(/^>/, ">#{prefix}")
    end
  end

  # copy with its own header and sequence strings
  def dup
    self.class.new(@header.dup, @aaseq.dup)
  end

  # returns the header line and aaseq with trailing newlines as one might find
  # in a fasta file
  def to_s
    @header + "\n" + @aaseq + "\n"
  end
end
-# For reference, my code is about 15X faster than the first code I wrote
-# below!  It turns out that the major slowdown is in the randomize routine.
-# Using my own randomize routine with the below way of reading fasta
-# files is 2X faster than below (in other words, my reader is 2X as fasta).
-#
-##!/usr/bin/ruby -w
-#
-#require 'bio'
-#
-#SHUFF_EXT = "_shuffled"
-#
-#if ARGV.size < 1
-#  puts <<END
-#usage: #{File.basename(__FILE__)} file.fasta ...  # -> file#{SHUFF_EXT}.fasta ...
-#Shuffles the amino acid sequence of each protein.
-#END
-#  exit
-#end
-#
-#ARGV.each do |fn|
-#  fn_ext = File.extname(fn)
-#  fn_out = fn.gsub(fn_ext, SHUFF_EXT + fn_ext)
-#  File.open(fn_out, "w") do |fh|
-#    f = Bio::FlatFile.auto(fn)
-#    f.each_entry do |e|
-#      fh.puts '>' + e.definition
-#      fh.puts e.aaseq.randomize
-#    end
-#  end
-#end
-by=:protein, num=1