RubyGems - mspire - Versions diffs - 0.10.7.1 → 0.10.7.2 - Mend

mspire 0.10.7.1 → 0.10.7.2

Files changed (8) hide show

checksums.yaml +4 -4
data/README.md +12 -1
data/lib/mspire/mzml/index_list.rb +2 -1
data/lib/mspire/mzml/io_index.rb +2 -1
data/lib/mspire/quant/qspec.rb +63 -23
data/lib/mspire/version.rb +1 -1
data/script/peptide_hit_qvalues_to_spectral_counts_table.rb +40 -49
metadata +2 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: eb806c3b3fc8c31258494541f15be3064f9e8a15
-  data.tar.gz: 9666168614b20a6a8ac974c5c6cd29c715b87b3e
+  metadata.gz: 7c3d6fd2ccef3ca83f802127523c4518115d55d3
+  data.tar.gz: 99910a0e278af6f0d096c3fb9f06902977681e3b
 SHA512:
-  metadata.gz: f8d798ed8a3efd8b0483c4b957b139eb4a040b76f598eecc9615cfe8332d8bdfbd0b66185e3416d5e98e57fc8f8705af44e479a3d7e263f2c90d138cd0098223
-  data.tar.gz: db2e325aef76a22747cca13f5a25e9357b1e76c1a253ef9b71a10fae7b7df6fd8f1faded2c1f5352112a37ab65e54fe3df6099e1c7b717a4e89b70db80f70075
+  metadata.gz: b778a89bbe03de755756a267b772006aa26ced9780384519b194edbb40a2efb54055137e3f5ac81a10044a5d8f53d12ad514b17599d7923cb4bb85b78dc5cf6f
+  data.tar.gz: 0692eef670311afebe4549e8d551ec621c0f5dcc366b7a87e4fa71e129fd165a32fd28293b11e071e99bb7b74787cb9cd4914ba61a77b3d0485f204c05ffe875

data/README.md CHANGED

@@ -56,12 +56,23 @@ objects associated with Mzml files.
 ```ruby
 require 'mspire/mzml'
+# get the intensity of the highest peak from each spectrum
+intensities = Mspire::Mzml.foreach(mzml_file).map do |spectrum|
+  spectrum.intensities.max
+end
+# open the file for other operations
 Mspire::Mzml.open(mzml_file) do |mzml|
+  # read each spectrum
+  mzml.each do |spectrum|
+    # do something with each spectrum ...
+  end
-  # random access by index or id (even if file wasn't indexed)
+  # or random access by index or id (even if file wasn't indexed)
   spectrum = mzml[0]
   spectrum = mzml["controllerType=0 controllerNumber=1 scan=2"]
+  # some things to do with a spectrum
   spectrum.mzs
   spectrum.intensities

data/lib/mspire/mzml/index_list.rb CHANGED

@@ -55,7 +55,8 @@ module Mspire
         def read_index_list(io)
           if (offset = index_offset(io))
             io.seek(offset)
-            xml = Nokogiri::XML.parse(io.read, nil, @encoding, Parser::NOBLANKS)
+            # TODO: pass in encoding (as second nil)
+            xml = Nokogiri::XML.parse(io.read, nil, nil, Parser::NOBLANKS)
             index_list = xml.root
             num_indices = index_list['count'].to_i
             array = index_list.children.map do |index_n|

data/lib/mspire/mzml/io_index.rb CHANGED

@@ -66,8 +66,9 @@ module Mspire
       end
       def xml_node_from_start_byte(start_byte)
+        # consider passing in @encoding from upstream object (as second nil):
         xml = get_xml_string(start_byte)
-        Nokogiri::XML.parse(xml, nil, @encoding, Parser::NOBLANKS).root
+        Nokogiri::XML.parse(xml, nil, nil, Parser::NOBLANKS).root
       end
       def fetch_xml_node(index)

data/lib/mspire/quant/qspec.rb CHANGED

@@ -2,6 +2,26 @@ module Mspire ; end
 module Mspire::Quant ; end
 class Mspire::Quant::Qspec
+  # This is my current best guess based on the behavior of the original QSpec
+  # and going into the source code and looking at the paired and param
+  # versions.
+  # qspec: discrete spectral count data
+  # qprot: continuous protein abundance data (could be non-discrete spectral
+  # counts or quantitation data)
+  # paired: one sample against another sample
+  # param: one sample against another sample but with one or more replicates
+  EXE = {
+    qspec: {
+      paired: 'qspec-paired',  # <- the old qspec (use qspec here if you have old software)
+      param: 'qspec-param',    # <  the old qspecgp (use qspecgp if you have old software)
+    },
+    qprot: {
+      paired: 'qprot-paired',
+      param: 'qprot-param',
+    },
+    getfdr: 'getfdr',
+  }
   # personal communication with Hyungwon Choi: "We typically use nburn=2000,
   # niter=10000, which is quite sufficient to guarantee the reproducibility of
@@ -11,8 +31,6 @@ class Mspire::Quant::Qspec
   INIT_HEADER = %w(protid protLen)
   DELIMITER = "\t"
-  SUBMITTED_TO_QSPEC = 'submitted_to_qspec.txt'
   # takes an ordered list of conditions ['cond1', 'cond1', 'cond2', 'cond2'] and
   # returns an array of ints [0,0,0,1,1,1...]
   def self.conditions_to_ints(conditions)
@@ -29,30 +47,35 @@ class Mspire::Quant::Qspec
   end
   # returns an array of Results structs which is each row of the returned file
-  # works with V2 of QSpec
+  # works with version 1.2.2 of Qprot
   def self.results_array(resultsfile)
     rows = IO.readlines(resultsfile).map {|line| line.chomp.split("\t") }
     headers = rows.shift
-    start_bayes = headers.index {|v| v =~ /BayesFactor/i }
+    start_log_fold = headers.index {|v| v =~ /LogFoldChange/i }
     rows.map do |row|
       data = [row[0]]
-      data.push( row[1...start_bayes].map(&:to_f) )
-      data.push( *row[start_bayes,4].map(&:to_f) )
-      data.push( row[start_bayes+4] )
+      data.push( row[1...start_log_fold].map(&:to_f) )
+      data.push( *row[start_log_fold,5].map(&:to_f) )
       Results.new(*data)
     end
   end
   # returns the right executable based on the array of conditions
-  def self.executable(conditions)
+  def executable
     biggest_size = conditions.group_by {|v| v }.values.map(&:size).max
-    (biggest_size >= 3) ? 'qspecgp' : 'qspec'
+    EXE[@protnames ? :qprot : :qspec][(biggest_size >= 3) ? :param : :paired]
   end
-  # protname_length_pairs is an array of doublets: [protname, length]
+  # protnames is a list of protein names.
+  # by default, qprot will be run.  If you really want qspec to be run, then
+  # supply a [protname, length] doublet in place of each protname.
   # condition_to_count_array is an array of doublets: [condition, array_of_counts]
-  def initialize(protname_length_pairs, condition_to_count_array)
-    @protname_length_pairs = protname_length_pairs
+  def initialize(protnames, condition_to_count_array)
+    @protnames = protnames
+    if @protnames.first.is_a?(Array)
+      @protname_length_pairs = @protnames
+      @protnames = nil
+    end
     @condition_to_count_array = condition_to_count_array
   end
@@ -62,9 +85,11 @@ class Mspire::Quant::Qspec
   # writes a qspec formatted file to filename
   def write(filename)
-    ints = Mspire::Quant::Qspec.conditions_to_ints(conditions)
-    header_cats = INIT_HEADER + ints
-    rows = @protname_length_pairs.map {|pair| pair.map.to_a }
+    header_cats = %w(protid)
+    header_cats << 'protLen' if @protname_length_pairs
+    header_cats.push(*Mspire::Quant::Qspec.conditions_to_ints(conditions))
+    ar = @protnames || @protname_length_pairs
+    rows = ar.map {|obj| Array(obj) }
     @condition_to_count_array.each do |cond,counts|
       rows.zip(counts) {|row,cnt| row << cnt }
     end
@@ -77,16 +102,19 @@ class Mspire::Quant::Qspec
   # returns an array of Qspec::Results objects (each object can be considered
   # a row of data)
   def run(normalize=true, opts={})
+    exe = executable
+    puts "using #{exe}" if $VERBOSE
+    executable_base = exe.split('-')[0]
     puts "normalize: #{normalize}" if $VERBOSE
-    tfile = Tempfile.new("qspec")
+    tfile = Tempfile.new(executable_base)
     write(tfile.path)
     if opts[:keep]
       local_file = File.join(Dir.pwd,File.basename(tfile.path))
       FileUtils.cp(tfile.path, local_file, :verbose => $VERBOSE)
-      puts "(copy of) file submitted to qspec: #{local_file}" if $VERBOSE
+      puts "(copy of) file submitted to #{exe}: #{local_file}" if $VERBOSE
     end
-    qspec_exe = self.class.executable(conditions)
-    cmd = [qspec_exe, tfile.path, NBURNIN, NITER, (normalize ? 1 : 0)].join(' ')
+    cmd = [exe, tfile.path, NBURNIN, NITER, (normalize ? 1 : 0)].join(' ')
     if $VERBOSE
       puts "running #{cmd}" if $VERBOSE
     else
@@ -94,12 +122,20 @@ class Mspire::Quant::Qspec
     end
     reply = `#{cmd}`
     puts reply if $VERBOSE
-    outfile = tfile.path + '_' + qspec_exe
-    results = self.class.results_array(outfile)
+    outfile = tfile.path + '_' + executable_base
+    system EXE[:getfdr], outfile
+    fdr_file = outfile + "_fdr"
+    puts "FDR_FILE: #{fdr_file} exists? #{fdr_file}" if $VERBOSE
+    results = self.class.results_array(fdr_file)
     if opts[:keep]
       local_outfile = File.join(Dir.pwd, File.basename(outfile))
+      local_fdrfile = File.join(Dir.pwd, File.basename(fdr_file))
       FileUtils.cp(outfile, local_outfile, :verbose => $VERBOSE)
-      puts "(copy of) file returned from qspec: #{outfile}"
+      FileUtils.cp(fdr_file, local_fdrfile, :verbose => $VERBOSE)
+      if $VERBOSE
+        puts "(copy of) file returned from qspec: #{outfile}"
+        puts "(copy of) file returned from qspec: #{fdr_file}"
+      end
     end
     tfile.unlink
     results
@@ -107,6 +143,10 @@ class Mspire::Quant::Qspec
   # for version 2 of QSpec
   # counts array is parallel to the experiment names passed in originally
-  Results = Struct.new(:protid, :counts_array, :bayes_factor, :fold_change, :rb_stat, :fdr, :flag)
+  #Results = Struct.new(:protid, :counts_array, :bayes_factor, :fold_change, :rb_stat, :fdr, :flag)
+  # for version 1.2.2 of QProt
+  # counts array is parallel to the experiment names passed in originally
+  Results = Struct.new(:protid, :counts_array, :log_fold_change, :z_statistic, :fdr, :fdr_up, :fdr_down)
 end

data/lib/mspire/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Mspire
-  VERSION = "0.10.7.1"
+  VERSION = "0.10.7.2"
 end

data/script/peptide_hit_qvalues_to_spectral_counts_table.rb CHANGED

@@ -17,7 +17,6 @@ require 'mspire/quant/qspec'
 require 'mspire/quant/cmdline'
 require 'mspire/fasta'
 require 'yaml'
 require 'tempfile'
@@ -53,7 +52,7 @@ class Ruport::Data::Table
     File.open(file,'w') do |out|
       opt[:header].each {|line| out.puts "# #{line}" } if opt[:header]
       out.puts self.column_names.join(delimiter)
-      self.data.each do |row|
+      self.sort_rows_by(:fdr).data.each do |row|
         out.puts row.to_a.join(delimiter)
       end
       opt[:footer].each {|line| out.puts "# #{line}" } if opt[:footer]
@@ -87,16 +86,16 @@ writes to #{outfile}
 group names can be arbitrarily defined
 }
   opt :fdr_percent, "%FDR as cutoff", :default => 1.0
-  opt :qspec, "return qspec results (executes qspec or qspecgp). Requires :fasta.  Only 2 groups currently allowed", :default => false
+  opt :qprot, "return qprot results (executes qprot-param or qprot-paired). Requires :fasta.  Only 2 groups currently allowed", :default => false
   opt :descriptions, "include descriptions of proteins, requires :fasta", :default => false
-  opt :fasta, "the fasta file.  Required for :qspec and :descriptions", :type => String
+  opt :fasta, "the fasta file.  Required for :descriptions", :type => String
   opt :outfile, "the to which file data are written", :default => outfile
   opt :peptides, "also write peptide hits (to: #{pephits_outfile})", :default => false
   opt :verbose, "speak up", :default => false
   opt :count_type, "type of spectral counts (<spectral|aaseqcharge|aaseq>)", :default => 'spectral'
-  opt :qspec_decibans, "report bayesfactor in decibans"
-  opt :qspec_normalize, "normalize spectral counts per run", :default => false
-  opt :qspec_keep_files, "keep a copy of the files submitted and returned from Qspec", :default => false
+  opt :qprot_normalize, "normalize spectral counts per run", :default => false
+  opt :qprot_keep_files, "keep a copy of the files submitted and returned from Qprot", :default => false
+  opt :qprot_remove_sparse_rows, "remove any row with only one non-zero value", :default => false
   opt :version_tag, "pass in a version tag (e.g. pass in git describe --tags) for version record", :type => String
   opt :write_subset, "(dev use only) write subset db", :default => false
 end
@@ -112,8 +111,8 @@ if ARGV.size < 2
   opts.educate && exit
 end
-if (opt[:qspec] || opt[:descriptions]) && !opt[:fasta]
-  puts "You must provide a fasta file with --fasta to use qspec or descriptions!!"
+if opt[:descriptions] && !opt[:fasta]
+  puts "You must provide a fasta file with --fasta to use descriptions!!"
   opts.educate && exit
 end
@@ -125,7 +124,7 @@ putsv "using: #{peptide_centric_db_file} as peptide centric db"
 (samplename_to_filename, condition_to_samplenames, samplename_to_condition) = Mspire::Quant::Cmdline.args_to_hashes(ARGV)
-raise ArgumentError, "must have 2 conditions for qspec!" if opt[:qspec] && condition_to_samplenames.size != 2
+raise ArgumentError, "must have 2 conditions for qprot to work!" if opt[:qprot] && condition_to_samplenames.size != 2
 samplenames = samplename_to_filename.keys
@@ -134,22 +133,20 @@ class Mspire::Ident::PeptideHit
   attr_accessor :protein_groups
 end
-class Mspire::Ident::Protein
-  attr_accessor :length
-end
+#class Mspire::Ident::Protein
+#  attr_accessor :length
+#end
 fdr_cutoff = opt[:fdr_percent] / 100
-if opt[:qspec] || opt[:descriptions]
-  putsv "reading lengths and descriptions from #{opt[:fasta]}"
+if opt[:descriptions]
+  putsv "reading descriptions from #{opt[:fasta]}"
   #Mspire::Fasta.protein_lengths_and_descriptions(opt[:fasta])
-  id_to_length = {}
   id_to_desc = {}
   Mspire::Fasta.foreach(opt[:fasta]) do |entry|
     #acc = Mspire::Fasta.uniprot_id(entry.header)
     acc = entry.accession
-    id_to_length[acc] = entry.length
     id_to_desc[acc] = entry.definition[/^\S+\s(.*)/,1]
   end
 end
@@ -170,7 +167,6 @@ Mspire::Ident::Peptide::Db::IO.open(peptide_centric_db_file) do |peptide_to_prot
       # update each peptide with its protein hits
       protein_hits = peptide_to_proteins[hit.aaseq].map do |id|
         protein = all_protein_hits[id]
-        protein.length = id_to_length[id] if id_to_length
         protein.description = id_to_desc[id] if id_to_desc
         protein
       end
@@ -218,48 +214,43 @@ end
 # each cell holds a SpectralCounts object, which has 3 types of count data
 counts_table = Ruport::Data::Table.new(:data => counts_data, :column_names => samplenames)
+counts_table.add_columns( [:name, :ids, :description, :qprot_protname] )
+counts_table.data.zip(protein_groups) do |row, pg|
+  best_id = pg.first   # pg.sort_by {|prot| [prot.id, prot.length] }.first
+  row.name = best_id.description.andand.match(/ GN=([^\s]+) ?/).andand[1] || best_id.id
+  row.ids = pg.map(&:id).join(',')
+  row.description = best_id.description
+  row.qprot_protname = pg.map(&:id).join(":")
+end
 # return a list of ProteinGroupComparisons
-if opt[:qspec]
+if opt[:qprot]
-  # prepare data for qspec
-  condition_to_count_array = counts_table.column_names.map do |name|
-    [samplename_to_condition[name], counts_table.column(name)]
+  if opt[:qprot_remove_sparse_rows]
+    newrows = counts_table.data.select do |row|
+      row.to_a[0,samplenames.size].select {|v| v > 0 }.size >= 2
+    end
+    counts_table = Ruport::Data::Table.new(:data => newrows, :column_names => counts_table.column_names)
   end
-  # average length of the proteins in the group
-  name_length_pairs = protein_groups.map do |pg|
-    [pg.map(&:id).join(":"), pg.map(&:length).reduce(:+)./(pg.size).round]
+  # prepare data for qprot
+  condition_to_count_array = counts_table.column_names.select {|name| name.is_a?(String) }.map do |name|
+    [samplename_to_condition[name], counts_table.column(name)]
   end
-  qspec_results = Mspire::Quant::Qspec.new(name_length_pairs, condition_to_count_array).run(opt[:qspec_normalize], :keep => opt[:qspec_keep_files])
+  qprot_results = Mspire::Quant::Qspec.new(counts_table.column(:qprot_protname), condition_to_count_array).run(opt[:qprot_normalize], :keep => opt[:qprot_keep_files])
-  cols_to_add = [:bayes_factor, :fold_change, :fdr]
-  to_add_as_headers = cols_to_add.map do |v|
-    if opt[:qspec_decibans] && v == :bayes_factor
-      :decibans
-    else
-      v
-    end
-  end
-  counts_table.add_columns to_add_as_headers
-  counts_table.data.zip(qspec_results) do |row, qspec_result|
+  cols_to_add = [:log_fold_change, :fdr, :fdr_up, :fdr_down]
+  counts_table.add_columns cols_to_add
+  counts_table.data.zip(qprot_results) do |row, qprot_result|
     cols_to_add.each do |cat|
-      if cat == :bayes_factor && opt[:qspec_decibans]
-        row[:decibans] = 10 * Math.log10(qspec_result[cat])
-      else
-        row[cat] = qspec_result[cat]
-      end
+      row[cat] = qprot_result[cat]
     end
   end
 end
-counts_table.add_columns( [:name, :ids, :description] )
-counts_table.data.zip(protein_groups) do |row, pg|
-  best_id = pg.sort_by {|prot| [prot.id, prot.length] }.first
-  row.name = best_id.description.andand.match(/ GN=([^\s]+) ?/).andand[1] || best_id.id
-  row.ids = pg.map(&:id).join(',')
-  row.description = best_id.description
-end
+counts_table.remove_column(:qprot_protname)
 if opt[:peptides]
   hits_table.each do |record|

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: mspire
 version: !ruby/object:Gem::Version
-  version: 0.10.7.1
+  version: 0.10.7.2
 platform: ruby
 authors:
 - John T. Prince
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-03-21 00:00:00.000000000 Z
+date: 2014-05-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri