RubyGems - mspire - Versions diffs - 0.10.7.1 → 0.10.7.2 - Mend

mspire 0.10.7.1 → 0.10.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

checksums.yaml +4 -4
data/README.md +12 -1
data/lib/mspire/mzml/index_list.rb +2 -1
data/lib/mspire/mzml/io_index.rb +2 -1
data/lib/mspire/quant/qspec.rb +63 -23
data/lib/mspire/version.rb +1 -1
data/script/peptide_hit_qvalues_to_spectral_counts_table.rb +40 -49
metadata +2 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: eb806c3b3fc8c31258494541f15be3064f9e8a15
-  data.tar.gz: 9666168614b20a6a8ac974c5c6cd29c715b87b3e
+  metadata.gz: 7c3d6fd2ccef3ca83f802127523c4518115d55d3
+  data.tar.gz: 99910a0e278af6f0d096c3fb9f06902977681e3b
 SHA512:
-  metadata.gz: f8d798ed8a3efd8b0483c4b957b139eb4a040b76f598eecc9615cfe8332d8bdfbd0b66185e3416d5e98e57fc8f8705af44e479a3d7e263f2c90d138cd0098223
-  data.tar.gz: db2e325aef76a22747cca13f5a25e9357b1e76c1a253ef9b71a10fae7b7df6fd8f1faded2c1f5352112a37ab65e54fe3df6099e1c7b717a4e89b70db80f70075
+  metadata.gz: b778a89bbe03de755756a267b772006aa26ced9780384519b194edbb40a2efb54055137e3f5ac81a10044a5d8f53d12ad514b17599d7923cb4bb85b78dc5cf6f
+  data.tar.gz: 0692eef670311afebe4549e8d551ec621c0f5dcc366b7a87e4fa71e129fd165a32fd28293b11e071e99bb7b74787cb9cd4914ba61a77b3d0485f204c05ffe875

data/README.md CHANGED

@@ -56,12 +56,23 @@ objects associated with Mzml files.
 ```ruby
 require 'mspire/mzml'
+# get the intensity of the highest peak from each spectrum
+intensities = Mspire::Mzml.foreach(mzml_file).map do |spectrum|
+  spectrum.intensities.max
+end
+# open the file for other operations
 Mspire::Mzml.open(mzml_file) do |mzml|
+  # read each spectra
+  mzml.each do |spectrum|
+    # do something with each spectrum ...
+  end
-  # random access by index or id (even if file wasn't indexed)
+  # or random access by index or id (even if file wasn't indexed)
   spectrum = mzml[0]
   spectrum = mzml["controllerType=0 controllerNumber=1 scan=2"]
+  # some things to do with a spectrum
   spectrum.mzs
   spectrum.intensities

data/lib/mspire/mzml/index_list.rb CHANGED

@@ -55,7 +55,8 @@ module Mspire
         def read_index_list(io)
           if (offset = index_offset(io))
             io.seek(offset)
-            xml = Nokogiri::XML.parse(io.read, nil, @encoding, Parser::NOBLANKS)
+            # TODO: pass in encoding (as second nil)
+            xml = Nokogiri::XML.parse(io.read, nil, nil, Parser::NOBLANKS)
             index_list = xml.root
             num_indices = index_list['count'].to_i
             array = index_list.children.map do |index_n|

data/lib/mspire/mzml/io_index.rb CHANGED

@@ -66,8 +66,9 @@ module Mspire
       end
       def xml_node_from_start_byte(start_byte)
+        # consider passing in @encoding from upstream object (as second nil):
         xml = get_xml_string(start_byte)
-        Nokogiri::XML.parse(xml, nil, @encoding, Parser::NOBLANKS).root
+        Nokogiri::XML.parse(xml, nil, nil, Parser::NOBLANKS).root
       end
       def fetch_xml_node(index)

data/lib/mspire/quant/qspec.rb CHANGED

@@ -2,6 +2,26 @@ module Mspire ; end
 module Mspire::Quant ; end
 class Mspire::Quant::Qspec
+  # This is my current best guess based on the behavior of the original QSpec
+  # and going into the source code and looking at the paired and param
+  # versions.
+  # qspec: discrete spectral count data
+  # qprot: continuous protein abundance data (could be non-discrete spectral
+  # counts or quantitation data)
+  # paired: one sample against another sample
+  # param: one sample against another sample but with one or more replicates
+  EXE = {
+    qspec: {
+      paired: 'qspec-paired',  # <- the old qspec (use qspec here if you have old software)
+      param: 'qspec-param',    # <  the old qspecgp (use qspecgp if you have old software)
+    },
+    qprot: {
+      paired: 'qprot-paired',
+      param: 'qprot-param',
+    },
+    getfdr: 'getfdr',
+  }
   # personal communication with Hyungwon Choi: "We typically use nburn=2000,
   # niter=10000, which is quite sufficient to guarantee the reproducibility of
@@ -11,8 +31,6 @@ class Mspire::Quant::Qspec
   INIT_HEADER = %w(protid protLen)
   DELIMITER = "\t"
-  SUBMITTED_TO_QSPEC = 'submitted_to_qspec.txt'
   # takes an ordered list of conditions ['cond1', 'cond1', 'cond2', 'cond2'] and
   # returns an array of ints [0,0,0,1,1,1...]
   def self.conditions_to_ints(conditions)
@@ -29,30 +47,35 @@ class Mspire::Quant::Qspec
   end
   # returns an array of Results structs which is each row of the returned file
-  # works with V2 of QSpec
+  # works with version 1.2.2 of Qprot
   def self.results_array(resultsfile)
     rows = IO.readlines(resultsfile).map {|line| line.chomp.split("\t") }
     headers = rows.shift
-    start_bayes = headers.index {|v| v =~ /BayesFactor/i }
+    start_log_fold = headers.index {|v| v =~ /LogFoldChange/i }
     rows.map do |row|
       data = [row[0]]
-      data.push( row[1...start_bayes].map(&:to_f) )
-      data.push( *row[start_bayes,4].map(&:to_f) )
-      data.push( row[start_bayes+4] )
+      data.push( row[1...start_log_fold].map(&:to_f) )
+      data.push( *row[start_log_fold,5].map(&:to_f) )
       Results.new(*data)
     end
   end
   # returns the right executable based on the array of conditions
-  def self.executable(conditions)
+  def executable
     biggest_size = conditions.group_by {|v| v }.values.map(&:size).max
-    (biggest_size >= 3) ? 'qspecgp' : 'qspec'
+    EXE[@protnames ? :qprot : :qspec][(biggest_size >= 3) ? :param : :paired]
   end
-  # protname_length_pairs is an array of doublets: [protname, length]
+  # protname is a list of protein names.
+  # by default, qprot will be run.  If you really want qspec to be run, then
+  # supply a [protname, length] doublet in place of each protname.
   # condition_to_count_array is an array doublets: [condition, array_of_counts]
-  def initialize(protname_length_pairs, condition_to_count_array)
-    @protname_length_pairs = protname_length_pairs
+  def initialize(protnames, condition_to_count_array)
+    @protnames = protnames
+    if @protnames.first.is_a?(Array)
+      @protname_length_pairs = @protnames
+      @protnames = nil
+    end
     @condition_to_count_array = condition_to_count_array
   end
@@ -62,9 +85,11 @@ class Mspire::Quant::Qspec
   # writes a qspec formatted file to filename
   def write(filename)
-    ints = Mspire::Quant::Qspec.conditions_to_ints(conditions)
-    header_cats = INIT_HEADER + ints
-    rows = @protname_length_pairs.map {|pair| pair.map.to_a }
+    header_cats = %w(protid)
+    header_cats << 'protLen' if @protname_length_pairs
+    header_cats.push(*Mspire::Quant::Qspec.conditions_to_ints(conditions))
+    ar = @protnames || @protname_length_pairs
+    rows = ar.map {|obj| Array(obj) }
     @condition_to_count_array.each do |cond,counts|
       rows.zip(counts) {|row,cnt| row << cnt }
     end
@@ -77,16 +102,19 @@ class Mspire::Quant::Qspec
   # returns an array of Qspec::Results objects (each object can be considered
   # a row of data)
   def run(normalize=true, opts={})
+    exe = executable
+    puts "using #{exe}" if $VERBOSE
+    executable_base = exe.split('-')[0]
     puts "normalize: #{normalize}" if $VERBOSE
-    tfile = Tempfile.new("qspec")
+    tfile = Tempfile.new(executable_base)
     write(tfile.path)
     if opts[:keep]
       local_file = File.join(Dir.pwd,File.basename(tfile.path))
       FileUtils.cp(tfile.path, local_file, :verbose => $VERBOSE)
-      puts "(copy of) file submitted to qspec: #{local_file}" if $VERBOSE
+      puts "(copy of) file submitted to #{exe}: #{local_file}" if $VERBOSE
     end
-    qspec_exe = self.class.executable(conditions)
-    cmd = [qspec_exe, tfile.path, NBURNIN, NITER, (normalize ? 1 : 0)].join(' ')
+    cmd = [exe, tfile.path, NBURNIN, NITER, (normalize ? 1 : 0)].join(' ')
     if $VERBOSE
       puts "running #{cmd}" if $VERBOSE
     else
@@ -94,12 +122,20 @@ class Mspire::Quant::Qspec
     end
     reply = `#{cmd}`
     puts reply if $VERBOSE
-    outfile = tfile.path + '_' + qspec_exe
-    results = self.class.results_array(outfile)
+    outfile = tfile.path + '_' + executable_base
+    system EXE[:getfdr], outfile
+    fdr_file = outfile + "_fdr"
+    puts "FDR_FILE: #{fdr_file} exists? #{fdr_file}" if $VERBOSE
+    results = self.class.results_array(fdr_file)
     if opts[:keep]
       local_outfile = File.join(Dir.pwd, File.basename(outfile))
+      local_fdrfile = File.join(Dir.pwd, File.basename(fdr_file))
       FileUtils.cp(outfile, local_outfile, :verbose => $VERBOSE)
-      puts "(copy of) file returned from qspec: #{outfile}"
+      FileUtils.cp(fdr_file, local_fdrfile, :verbose => $VERBOSE)
+      if $VERBOSE
+        puts "(copy of) file returned from qspec: #{outfile}"
+        puts "(copy of) file returned from qspec: #{fdr_file}"
+      end
     end
     tfile.unlink
     results
@@ -107,6 +143,10 @@ class Mspire::Quant::Qspec
   # for version 2 of QSpec
   # counts array is parallel to the experiment names passed in originally
-  Results = Struct.new(:protid, :counts_array, :bayes_factor, :fold_change, :rb_stat, :fdr, :flag)
+  #Results = Struct.new(:protid, :counts_array, :bayes_factor, :fold_change, :rb_stat, :fdr, :flag)
+  # for version 1.2.2 of QProt
+  # counts array is parallel to the experiment names passed in originally
+  Results = Struct.new(:protid, :counts_array, :log_fold_change, :z_statistic, :fdr, :fdr_up, :fdr_down)
 end

data/lib/mspire/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Mspire
-  VERSION = "0.10.7.1"
+  VERSION = "0.10.7.2"
 end

data/script/peptide_hit_qvalues_to_spectral_counts_table.rb CHANGED

@@ -17,7 +17,6 @@ require 'mspire/quant/qspec'
 require 'mspire/quant/cmdline'
 require 'mspire/fasta'
 require 'yaml'
 require 'tempfile'
@@ -53,7 +52,7 @@ class Ruport::Data::Table
     File.open(file,'w') do |out|
       opt[:header].each {|line| out.puts "# #{line}" } if opt[:header]
       out.puts self.column_names.join(delimiter)
-      self.data.each do |row|
+      self.sort_rows_by(:fdr).data.each do |row|
         out.puts row.to_a.join(delimiter)
       end
       opt[:footer].each {|line| out.puts "# #{line}" } if opt[:footer]
@@ -87,16 +86,16 @@ writes to #{outfile}
 group names can be arbitrarily defined
 }
   opt :fdr_percent, "%FDR as cutoff", :default => 1.0
-  opt :qspec, "return qspec results (executes qspec or qspecgp). Requires :fasta.  Only 2 groups currently allowed", :default => false
+  opt :qprot, "return qprot results (executes qprot-param or qprot-paired). Requires :fasta.  Only 2 groups currently allowed", :default => false
   opt :descriptions, "include descriptions of proteins, requires :fasta", :default => false
-  opt :fasta, "the fasta file.  Required for :qspec and :descriptions", :type => String
+  opt :fasta, "the fasta file.  Required for :descriptions", :type => String
   opt :outfile, "the to which file data are written", :default => outfile
   opt :peptides, "also write peptide hits (to: #{pephits_outfile})", :default => false
   opt :verbose, "speak up", :default => false
   opt :count_type, "type of spectral counts (<spectral|aaseqcharge|aaseq>)", :default => 'spectral'
-  opt :qspec_decibans, "report bayesfactor in decibans"
-  opt :qspec_normalize, "normalize spectral counts per run", :default => false
-  opt :qspec_keep_files, "keep a copy of the files submitted and returned from Qspec", :default => false
+  opt :qprot_normalize, "normalize spectral counts per run", :default => false
+  opt :qprot_keep_files, "keep a copy of the files submitted and returned from Qprot", :default => false
+  opt :qprot_remove_sparse_rows, "remove any row with only one non-zero value", :default => false
   opt :version_tag, "pass in a version tag (e.g. pass in git describe --tags) for version record", :type => String
   opt :write_subset, "(dev use only) write subset db", :default => false
 end
@@ -112,8 +111,8 @@ if ARGV.size < 2
   opts.educate && exit
 end
-if (opt[:qspec] || opt[:descriptions]) && !opt[:fasta]
-  puts "You must provide a fasta file with --fasta to use qspec or descriptions!!"
+if opt[:descriptions] && !opt[:fasta]
+  puts "You must provide a fasta file with --fasta to use descriptions!!"
   opts.educate && exit
 end
@@ -125,7 +124,7 @@ putsv "using: #{peptide_centric_db_file} as peptide centric db"
 (samplename_to_filename, condition_to_samplenames, samplename_to_condition) = Mspire::Quant::Cmdline.args_to_hashes(ARGV)
-raise ArgumentError, "must have 2 conditions for qspec!" if opt[:qspec] && condition_to_samplenames.size != 2
+raise ArgumentError, "must have 2 conditions for qprot to work!" if opt[:qprot] && condition_to_samplenames.size != 2
 samplenames = samplename_to_filename.keys
@@ -134,22 +133,20 @@ class Mspire::Ident::PeptideHit
   attr_accessor :protein_groups
 end
-class Mspire::Ident::Protein
-  attr_accessor :length
-end
+#class Mspire::Ident::Protein
+#  attr_accessor :length
+#end
 fdr_cutoff = opt[:fdr_percent] / 100
-if opt[:qspec] || opt[:descriptions]
-  putsv "reading lengths and descriptions from #{opt[:fasta]}"
+if opt[:descriptions]
+  putsv "reading descriptions from #{opt[:fasta]}"
   #Mspire::Fasta.protein_lengths_and_descriptions(opt[:fasta])
-  id_to_length = {}
   id_to_desc = {}
   Mspire::Fasta.foreach(opt[:fasta]) do |entry|
     #acc = Mspire::Fasta.uniprot_id(entry.header)
     acc = entry.accession
-    id_to_length[acc] = entry.length
     id_to_desc[acc] = entry.definition[/^\S+\s(.*)/,1]
   end
 end
@@ -170,7 +167,6 @@ Mspire::Ident::Peptide::Db::IO.open(peptide_centric_db_file) do |peptide_to_prot
       # update each peptide with its protein hits
       protein_hits = peptide_to_proteins[hit.aaseq].map do |id|
         protein = all_protein_hits[id]
-        protein.length = id_to_length[id] if id_to_length
         protein.description = id_to_desc[id] if id_to_desc
         protein
       end
@@ -218,48 +214,43 @@ end
 # each cell holds a SpectralCounts object, which hash 3 types of count data
 counts_table = Ruport::Data::Table.new(:data => counts_data, :column_names => samplenames)
+counts_table.add_columns( [:name, :ids, :description, :qprot_protname] )
+counts_table.data.zip(protein_groups) do |row, pg|
+  best_id = pg.first   # pg.sort_by {|prot| [prot.id, prot.length] }.first
+  row.name = best_id.description.andand.match(/ GN=([^\s]+) ?/).andand[1] || best_id.id
+  row.ids = pg.map(&:id).join(',')
+  row.description = best_id.description
+  row.qprot_protname = pg.map(&:id).join(":")
+end
 # return a list of ProteinGroupComparisons
-if opt[:qspec]
+if opt[:qprot]
-  # prepare data for qspec
-  condition_to_count_array = counts_table.column_names.map do |name|
-    [samplename_to_condition[name], counts_table.column(name)]
+  if opt[:qprot_remove_sparse_rows]
+    newrows = counts_table.data.select do |row|
+      row.to_a[0,samplenames.size].select {|v| v > 0 }.size >= 2
+    end
+    counts_table = Ruport::Data::Table.new(:data => newrows, :column_names => counts_table.column_names)
   end
-  # average length of the proteins in the group
-  name_length_pairs = protein_groups.map do |pg|
-    [pg.map(&:id).join(":"), pg.map(&:length).reduce(:+)./(pg.size).round]
+  # prepare data for qprot
+  condition_to_count_array = counts_table.column_names.select {|name| name.is_a?(String) }.map do |name|
+    [samplename_to_condition[name], counts_table.column(name)]
   end
-  qspec_results = Mspire::Quant::Qspec.new(name_length_pairs, condition_to_count_array).run(opt[:qspec_normalize], :keep => opt[:qspec_keep_files])
+  qprot_results = Mspire::Quant::Qspec.new(counts_table.column(:qprot_protname), condition_to_count_array).run(opt[:qprot_normalize], :keep => opt[:qprot_keep_files])
-  cols_to_add = [:bayes_factor, :fold_change, :fdr]
-  to_add_as_headers = cols_to_add.map do |v|
-    if opt[:qspec_decibans] && v == :bayes_factor
-      :decibans
-    else
-      v
-    end
-  end
-  counts_table.add_columns to_add_as_headers
-  counts_table.data.zip(qspec_results) do |row, qspec_result|
+  cols_to_add = [:log_fold_change, :fdr, :fdr_up, :fdr_down]
+  counts_table.add_columns cols_to_add
+  counts_table.data.zip(qprot_results) do |row, qprot_result|
     cols_to_add.each do |cat|
-      if cat == :bayes_factor && opt[:qspec_decibans]
-        row[:decibans] = 10 * Math.log10(qspec_result[cat])
-      else
-        row[cat] = qspec_result[cat]
-      end
+      row[cat] = qprot_result[cat]
     end
   end
 end
-counts_table.add_columns( [:name, :ids, :description] )
-counts_table.data.zip(protein_groups) do |row, pg|
-  best_id = pg.sort_by {|prot| [prot.id, prot.length] }.first
-  row.name = best_id.description.andand.match(/ GN=([^\s]+) ?/).andand[1] || best_id.id
-  row.ids = pg.map(&:id).join(',')
-  row.description = best_id.description
-end
+counts_table.remove_column(:qprot_protname)
 if opt[:peptides]
   hits_table.each do |record|

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: mspire
 version: !ruby/object:Gem::Version
-  version: 0.10.7.1
+  version: 0.10.7.2
 platform: ruby
 authors:
 - John T. Prince
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-03-21 00:00:00.000000000 Z
+date: 2014-05-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri