RubyGems - mspire - Versions diffs - 0.8.5 → 0.8.6 - Mend

mspire 0.8.5 → 0.8.6

Files changed (23) hide show

data/VERSION +1 -1
data/lib/hash/inverse.rb +15 -0
data/lib/mspire/error_rate/qvalue.rb +5 -5
data/lib/mspire/fasta.rb +2 -0
data/lib/mspire/ident/peptide/db/creator.rb +48 -58
data/lib/mspire/ident/peptide/db/io.rb +5 -0
data/lib/mspire/ident/peptide_hit/qvalue.rb +2 -2
data/lib/mspire/ident/peptide_hit.rb +2 -2
data/lib/mspire/ident/protein_group.rb +4 -2
data/lib/mspire/isotope/aa.rb +10 -10
data/lib/mspire/mzml/instrument_configuration.rb +10 -3
data/lib/mspire/quant/cmdline.rb +42 -0
data/lib/mspire/quant/protein_group_comparison.rb +29 -0
data/lib/mspire/quant/spectral_counts.rb +42 -0
data/script/fasta_to_peptide_centric_db.rb +5 -0
data/script/mascot_dat_to_peptide_hit_qvalues.rb +37 -45
data/script/mass_correct.rb +118 -0
data/script/minimal_protein_set.rb +345 -0
data/script/mzml_to_mgf.rb +46 -0
data/script/peptide_hit_qvalues_to_spectral_counts_table.rb +275 -0
data/spec/mspire/ident/peptide/db/creator_spec.rb +11 -0
data/spec/testfiles/mspire/ident/peptide/db/uni_11_sp_tr.msd_clvg2.min_aaseq4.yml +157 -157
metadata +11 -2

data/script/mass_correct.rb ADDED Viewed

@@ -0,0 +1,118 @@
+#!/usr/bin/env ruby
+require 'rserve/simpler/R'
+require 'runarray/narray'
+MzDiffs = Struct.new(:mz, :intensity, :spectrum_id, :dev) do
+  def abs_dev
+    self.dev.abs
+  end
+end
+# returns an array of spectrum_id => shift
+def find_spectral_shifts(mz_theor, mz_diffs, dev_cutoff = 0.5)
+  spec_id_to_shift = {}
+  (close_diffs, far_diffs) = mz_diffs.partition {|diff| diff.abs_dev < dev_cutoff }
+  close_mz_vals = close_diffs.map(&:mz)
+  runarray = Runarray::NArray.new(close_mz_vals)
+  outlier_indices = runarray.outliers_iteratively(3)
+  # need the global shift
+  tight_mz_vals = close_mz_vals.reject.with_index do |mz, i|
+    outlier_indices.include?(i)
+  end
+  (mean, sd) = Runarray::NArray.new(tight_mz_vals).sample_stats
+  global_shift = mean - mz_theor
+  close_diffs.zip(close_mz_vals).each.with_index do |(mz_diff, mz_val),i|
+  spec_id_to_shift[mz_diff.spectrum_id] =
+    if outlier_indices.include?(i)
+      global_shift
+    else
+      global_shift + (mz_val - mean)
+    end
+  end
+  far_diffs.each {|mz_diff| spec_id_to_shift[mz_diff.spectrum_id] = global_shift }
+  #pvalue = R.converse( mz_diffs: close_mz_vals ) do
+  #  "shapiro.test(mz_diffs)$p.value"
+  #end
+  spec_id_to_shift
+end
+require 'optparse'
+require 'mspire/mzml'
+ext = ".massCorrected.mzML"
+opt = {}
+opts = OptionParser.new do |op|
+  op.banner = "usage: #{File.basename($0)} [OPTS] <m/z> <file>.mzML ..."
+  op.separator "output: <file>#{ext}"
+  op.separator "finds the nearest m/z to <m/z> and shifts m/z values"
+  op.separator "prints the corrected deviation to stdout"
+  op.separator ""
+  op.separator "options:"
+  op.on("-t", "--threshold <Float>", Float, 'intensity must be above threshold') {|v| opt[:threshold] = v }
+  op.on("-f", "--filter-string-regex <regex-no-slashes>", 'only match and calibrate if matches filter string') {|v| opt[:filter_string_regex] = Regexp.new(Regexp.escape(v)) }
+end
+opts.parse!
+if ARGV.size == 0
+  puts opts
+  exit
+end
+threshold = opt[:threshold] || 0.0
+filter_string_regex = opt[:filter_string_regex]
+mz_theor = ARGV.shift.to_f
+ARGV.each do |file|
+  base = file.chomp(File.extname(file))
+  outfile = base + ext
+  mz_diffs = []
+  Mspire::Mzml.open(file) do |mzml|
+    #Finding the deviation
+    mzml.each do |spectrum|
+      if spectrum.ms_level == 1
+        if filter_string_regex
+          next unless filter_string_regex.match(spectrum.scan_list.first.fetch_by_acc('MS:1000512'))
+        end
+        indices = spectrum.find_all_nearest_index(mz_theor)
+        best_index = indices.max {|i| spectrum.intensities[i] }
+        closest_mz = spectrum.mzs[best_index]
+        mz_diffs << MzDiffs.new(closest_mz, spectrum.intensities[best_index], spectrum.id, closest_mz - mz_theor)
+      end
+    end
+    spectral_shifts = find_spectral_shifts(mz_theor, mz_diffs)
+    #correcting the masses
+    spectra = mzml.map do |spectrum|
+      if spectrum.ms_level == 1
+        spectrum.mzs.map! do|mz|
+          if (shift=spectral_shifts[spectrum.id])
+            mz + shift
+          else
+            mz
+          end
+        end
+        spectrum
+      else
+        spectrum
+      end
+    end
+    data_processing = Mspire::Mzml::DataProcessing.new("Corrected_Mass")
+    mzml.data_processing_list << data_processing
+    mzml.run.spectrum_list = Mspire::Mzml::SpectrumList.new(data_processing, spectra)
+    mzml.write(outfile)
+  end
+end

data/script/minimal_protein_set.rb ADDED Viewed

@@ -0,0 +1,345 @@
+#!/usr/bin/ruby
+require 'yaml'
+require 'set'
+require 'optparse'
+require 'mspire/fasta'
+require 'mspire/ident/peptide/db/io'
+SET_RE = /Set\s+(.*)/i
+QVALUE_EXT = ".phq.tsv"
+# returns [sets_to_paths_hash, sets_order]
+  def sets_compare_to_paths(file, ext=QVALUE_EXT)
+  dirname = File.dirname(File.expand_path(file))
+  lines = IO.readlines(file).map {|v| v.chomp }.select {|v| v =~ /\w/}
+  sets = {}
+  current_set = nil
+  sets_order = []
+  lines.each do |line|
+    if line =~ SET_RE
+      current_set = $1.dup
+      sets[current_set] = []
+      sets_order << current_set
+    else
+      full_path = (File.join(dirname,(line + ext)))
+      raise RuntimeError, "file #{full_path} does not exist!!" unless File.exist?(full_path)
+      sets[current_set] << full_path
+    end
+  end
+  [sets, sets_order]
+end
+# returns [minimal_protein_to_uniq_peps_hash, indistinguishable_protein_hash]
+# takes a hash of proteins to aaseqs. Uses a greedy algorithm where
+# things are sorted first by the number of uniq amino acid sequences and total
+# aa length.  if a block is given, then will yield the prot and the
+# peptide_array and sort by the returned value.  The greedy algorithm acts on
+# the REVERSE of the sorted proteins.  indistinguishable_protein_hash is keyed
+# on the proteins in the minimal_protein_array and gives an array of other
+# proteins.
+def minimal_protein_set(proteins_to_aaseqs)
+  blk_given = block_given?
+  #STDERR.puts "using block for minimal_protein_set" if blk_given
+  proteins_and_uniq_peps = []
+  sorted_most_to_least = proteins_to_aaseqs.sort_by do |k,v|
+    if blk_given
+      yield(k,v)
+    else
+      [ v.size, v.inject(0){|m,s| m+s.size} ]
+    end
+  end.reverse
+  found_seq = Set.new
+  same_peptide_hits = {}
+  last_peps = nil
+  last_uniq_prot = nil
+  sorted_most_to_least.each do |prot, peps|
+    sorted_peps = peps.sort # is it necessary to SORT?????????
+    uniq_peps = peps.select do |pep|
+      if found_seq.include?(pep)
+        false
+      else
+        found_seq.add pep
+        true
+      end
+    end
+    if uniq_peps.size > 0
+      proteins_and_uniq_peps << [prot, uniq_peps]
+      same_peptide_hits[prot] = []
+      last_peps = sorted_peps
+      last_uniq_prot = prot
+    else
+      if sorted_peps == last_peps
+        same_peptide_hits[last_uniq_prot] << prot
+      end
+    end
+  end
+  prot_to_uniq_peps_hash = {}
+  proteins_and_uniq_peps.each do |prot, uniq_peps|
+    prot_to_uniq_peps_hash[prot] = uniq_peps
+  end
+  [prot_to_uniq_peps_hash, same_peptide_hits]
+end
+def cutoffs_to_floats(ar)
+  ar.map do |v|
+    if v == 'nil' || v == '-'
+      nil
+    else
+      answ = v.to_f
+    end
+  end
+end
+# returns a hash keyed on protein id that yields an array:
+#   [#aaseq, #aaseq_and_charge, #total_hits]
+def stats_per_prot(prot_to_peps, seq_to_hits)
+  per_protein_hash = {}
+  prot_to_peps.each do |prot, uniq_pep_seqs|
+    all = Set.new
+    aaseqcharges = Set.new
+    aaseqs = Set.new
+    uniq_pep_seqs.each do |pep_seq|
+      all_hits = seq_to_hits[pep_seq]
+      all.merge( all_hits )
+      all_hits.each do |hit|
+        aaseq = hit.sequence
+        aaseqs.add( aaseq )
+        aaseqcharges.add( aaseq + '_' + hit.charge.to_s )
+      end
+      per_protein_hash[prot] = [aaseqs.size, aaseqcharges.size, all.size]
+    end
+  end
+  per_protein_hash
+end
+opt = {
+  :cutoffs => [nil],
+  :outfile => "summary.yml",
+}
+opts = OptionParser.new do |op|
+  op.banner = "usage: #{File.basename(__FILE__)} pepcentric_db.yml sets_compare.txt"
+  op.separator "output: #{opt[:outfile]}"
+  op.separator ""
+  op.separator "input: "
+  op.separator "    each <file> referenced in sets_compare.txt should have a"
+  op.separator "    <file>.phq.tsv file"
+  op.separator ""
+  op.separator "options:"
+  op.on("-q", "--qvalue <0-1[,...]>", Array, "only take qvalues < given ['-' for no threshold]") {|v| opt[:cutoffs] = cutoffs_to_floats(v)}
+  op.separator ""
+  op.separator "formats:"
+  op.on("--output-format", "prints the output yaml scheme and exits") {|v| opt[:output_format] = v }
+  op.on("--input-format", "prints sets_compare.txt format and exits") {|v| opt[:input_format] = v }
+  op.on("--pepcentric-db-format", "prints peptide centric db format and exits") {|v| opt[:pepcentric_db_format] = v }
+end
+# later on we could implement full isoform resolution like IsoformResolver
+# for now we will generate a report, realizing that some isoforms may not be
+# reported
+# it is implemented by using a pre-made map from sequence to protein groups
+# then, a set of sequences allows one to deduce all the relationships from the
+# protein groups.
+opts.parse!
+pd = Mspire::Ident::Peptide::Db::PROTEIN_DELIMITER
+kvd  = Mspire::Ident::Peptide::Db::KEY_VALUE_DELIMITER
+if opt[:pepcentric_db_format]
+  puts "pepcentric_db.yml needs to be in the format:"
+  puts "<PEPTIDE>#{kvd.inspect}<ID>#{pd.inspect}<ID>#{pd.inspect}<ID>"
+  puts "(The delimiters are shown with #inspect)"
+end
+if opt[:output_format]
+  yaml = <<SKEL
+results:
+- qvalue_cutoff: <Float>
+  sets:
+    <set_name>:
+      num_uniq_aaseqs: <Integer>
+      num_aaseqs_not_in_pep_db: <Integer>
+      num_uniq_aaseqs_charge: <Integer>
+      proteins:
+        <protein_id>:
+          num_hits_all:
+          - <Integer> # total num aaseqs
+          - <Integer> # total num aaseq+charge "prints sets_compare.txt format and exits") {|v| opt[:input_format] = v }
+  op.on("--pepcentric-db-
+          - <Integer> # total num hits
+          num_hits_minimal:
+          - <Integer> # total num aaseqs
+          - <Integer> # total num aaseq+charge
+          - <Integer> # total num hits
+          indistinguishable:
+          - <protein_id>
+          - <protein_id>
+          aaseqs:
+          - <String>
+          - <String>
+sets_order:
+- <String>
+- <String>
+SKEL
+  print yaml
+end
+if opt[:input_format]
+  string =<<EXPLANATION
+# the sets_compare.txt format is very simple:
+Set <some_name_for_set1>
+filename1_no_ext
+filename2_no_ext
+Set <some_name_for_set2>
+filename3_no_ext
+filename4_no_ext
+...
+EXPLANATION
+  puts string
+end
+exit if opt.keys.any? {|key| key.to_s =~ /_format/ }
+if ARGV.size != 2
+  p opts
+  puts opts.to_s
+  exit
+end
+(pepcentric_fn, sets_compare_fn) = ARGV
+results = {}
+results['results'] = []
+(sets_hash, sets_order) = sets_compare_to_paths(sets_compare_fn)
+results['sets_order'] = sets_order
+STDERR.print "Loading peptide centric DB (this takes about a minute)..."
+start = Time.now
+Mspire::Ident::Peptide::Db::IO.open(pepcentric_fn) do |pep_to_prots|
+  STDERR.puts "#{Time.now - start} seconds."
+  opt[:cutoffs].each do |cutoff|
+    cutoff_results = {'qvalue_cutoff' => cutoff}
+    results_sets_hash = {}
+    cutoff_results['sets'] = results_sets_hash
+    results['results'] << cutoff_results
+    #########################
+    # FOR EACH SET:
+    #########################
+    pep_klass = nil
+    sets_hash.each do |set, files|
+      set_results = {}
+      results_sets_hash[set] = set_results
+      # assumes the indices are the same into each data file
+      # get the complete set of passing hits
+      all_passing_hits = files.inject([]) do |all_passing_hits, file|
+        hash = YAML.load_file(file)
+        header_hash = hash['headers']
+        pep_klass ||= Struct.new(*(header_hash.map {|v| v.to_sym }))
+        hits = hash['data'].map {|v| pep_klass.new(*v) }
+        passing_hits =
+          if cutoff
+            # assumes monotonic qvalues values!
+            (above, below) = hits.partition {|hit| hit.qvalue <= cutoff }
+            above
+          else
+            hits
+          end
+        all_passing_hits.push(*passing_hits)
+      end
+      # create an index from aaseq to hits
+      seq_to_hits = Hash.new {|h,k| h[k] = []}
+      uniq_seqcharge = Set.new
+      all_passing_hits.each do |hit|
+        seq_to_hits[hit.sequence] << hit
+        uniq_seqcharge.add( hit.sequence + '_' + hit.charge.to_s )
+      end
+      # determine the number of uniq aaseqs
+      uniq_seqs = seq_to_hits.size
+      num_uniq_seqcharges = uniq_seqcharge.size
+      set_results.merge!( { 'num_peptide_hits' => all_passing_hits.size,
+                         'num_uniq_aaseqs' => uniq_seqs,
+                         'num_uniq_aaseqs_charge' => num_uniq_seqcharges,
+      })
+      # create an index from proteins to peptides
+      prots_to_peps = Hash.new {|h,k| h[k] = [] }
+      peptides_not_found = []
+      seq_to_hits.keys.each do |seq|
+        if pep_db.key?(seq)
+          pep_db[seq].each do |prot|
+            prots_to_peps[prot] << seq
+          end
+        else
+          peptides_not_found << seq
+        end
+      end
+      # Determine the number of 1) hits, 2) aaseqs, 3) aaseqcharges per protein BEFORE minimization
+      stats_per_protein_before = stats_per_prot(prots_to_peps, seq_to_hits)
+      # get the minimal protein set
+      (prot_to_uniq_peps_hash, indistinguishable_protein_hash) = minimal_protein_set(prots_to_peps) do |prot,peps|
+        # will sort with lowest
+        [ peps.size, peps.inject(0){|m,s| m+s.size}, -(prot_sizes_hash[prot])]
+      end
+      prot_to_uniq_peps_hash.each do |prot, peps|
+        [prot, *indistinguishable_protein_hash[prot]].each do |prot|
+          protein_info[prot] = prot_header_hash[prot]
+        end
+      end
+      stats_per_protein_minimal = stats_per_prot(prot_to_uniq_peps_hash, seq_to_hits)
+      # create a hash of data for each protein
+      protein_data_hashes_hash = {}
+      prot_to_uniq_peps_hash.each do |prot, peps|
+        protein_data_hashes_hash[prot] = {
+          'aaseqs' => peps,
+          # this will be a triplet
+          'num_hits_minimal' => stats_per_protein_minimal[prot],
+          'indistinguishable' => indistinguishable_protein_hash[prot],
+          'num_hits_all' => stats_per_protein_before[prot],
+        }
+      end
+      set_results['proteins'] = protein_data_hashes_hash
+      set_results['num_proteins'] = prot_to_uniq_peps_hash.size
+      set_results['num_aaseqs_not_in_pep_db'] = peptides_not_found.size
+      if peptides_not_found.size > 0
+        warn "Did not find in peptide centric db: #{peptides_not_found.join(', ')}"
+      end
+    end
+  end
+  File.open(opt[:outfile], 'w') do |out|
+    out.print results.to_yaml
+  end
+end

data/script/mzml_to_mgf.rb ADDED Viewed

@@ -0,0 +1,46 @@
+#!/usr/bin/env ruby
+require 'mspire/mzml'
+require 'optparse'
+opt = {
+  filter_zero_intensity: true,
+  retention_times: true,
+}
+opts = OptionParser.new do |op|
+  op.banner = "usage: #{File.basename($0)} <file>.mzML ..."
+  op.separator "outputs: <file>.mgf"
+  #op.on("--no-filter-zeros", "won't remove values with zero intensity") {|v| opt[:filter_zero_intensity] = false }
+  # the default is set in ms/msrun/search.rb -> set_opts
+  op.on("--no-retention-times", "won't include RT even if available") {|v| opt[:retention_times] = false }
+end
+opts.parse!
+if ARGV.size == 0
+  puts opts
+  exit
+end
+ARGV.each do |file|
+  if File.exist?(file)
+    Mspire::Mzml.foreach(file).with_index do |spectrum,i|
+      next unless spectrum.ms_level > 1
+      puts "BEGIN IONS"
+      # id, spectrumid,
+      rt = spectrum.retention_time
+      title = [i, "id_#{spectrum.id}", "rt_#{rt.round}"].join('.')
+      puts "TITLE=#{title}"
+      puts "RTINSECONDS=#{rt}" if opt[:retention_times]
+      puts "PEPMASS=#{spectrum.precursor_mz}"
+      puts "CHARGE=#{spectrum.precursor_charge}+"
+      spectrum.each do |mz,int|
+        puts [mz, int].join(" ")
+      end
+      puts "END IONS"
+      puts ""
+    end
+  else
+    puts "missing file: #{file} [skipping]"
+  end
+end