mspire 0.10.7.1 → 0.10.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: eb806c3b3fc8c31258494541f15be3064f9e8a15
4
- data.tar.gz: 9666168614b20a6a8ac974c5c6cd29c715b87b3e
3
+ metadata.gz: 7c3d6fd2ccef3ca83f802127523c4518115d55d3
4
+ data.tar.gz: 99910a0e278af6f0d096c3fb9f06902977681e3b
5
5
  SHA512:
6
- metadata.gz: f8d798ed8a3efd8b0483c4b957b139eb4a040b76f598eecc9615cfe8332d8bdfbd0b66185e3416d5e98e57fc8f8705af44e479a3d7e263f2c90d138cd0098223
7
- data.tar.gz: db2e325aef76a22747cca13f5a25e9357b1e76c1a253ef9b71a10fae7b7df6fd8f1faded2c1f5352112a37ab65e54fe3df6099e1c7b717a4e89b70db80f70075
6
+ metadata.gz: b778a89bbe03de755756a267b772006aa26ced9780384519b194edbb40a2efb54055137e3f5ac81a10044a5d8f53d12ad514b17599d7923cb4bb85b78dc5cf6f
7
+ data.tar.gz: 0692eef670311afebe4549e8d551ec621c0f5dcc366b7a87e4fa71e129fd165a32fd28293b11e071e99bb7b74787cb9cd4914ba61a77b3d0485f204c05ffe875
data/README.md CHANGED
@@ -56,12 +56,23 @@ objects associated with Mzml files.
56
56
  ```ruby
57
57
  require 'mspire/mzml'
58
58
 
59
+ # get the intensity of the highest peak from each spectrum
60
+ intensities = Mspire::Mzml.foreach(mzml_file).map do |spectrum|
61
+ spectrum.intensities.max
62
+ end
63
+
64
+ # open the file for other operations
59
65
  Mspire::Mzml.open(mzml_file) do |mzml|
66
+ # read each spectrum
67
+ mzml.each do |spectrum|
68
+ # do something with each spectrum ...
69
+ end
60
70
 
61
- # random access by index or id (even if file wasn't indexed)
71
+ # or random access by index or id (even if file wasn't indexed)
62
72
  spectrum = mzml[0]
63
73
  spectrum = mzml["controllerType=0 controllerNumber=1 scan=2"]
64
74
 
75
+ # some things to do with a spectrum
65
76
  spectrum.mzs
66
77
  spectrum.intensities
67
78
 
@@ -55,7 +55,8 @@ module Mspire
55
55
  def read_index_list(io)
56
56
  if (offset = index_offset(io))
57
57
  io.seek(offset)
58
- xml = Nokogiri::XML.parse(io.read, nil, @encoding, Parser::NOBLANKS)
58
+ # TODO: pass in encoding (as second nil)
59
+ xml = Nokogiri::XML.parse(io.read, nil, nil, Parser::NOBLANKS)
59
60
  index_list = xml.root
60
61
  num_indices = index_list['count'].to_i
61
62
  array = index_list.children.map do |index_n|
@@ -66,8 +66,9 @@ module Mspire
66
66
  end
67
67
 
68
68
  def xml_node_from_start_byte(start_byte)
69
+ # consider passing in @encoding from upstream object (as second nil):
69
70
  xml = get_xml_string(start_byte)
70
- Nokogiri::XML.parse(xml, nil, @encoding, Parser::NOBLANKS).root
71
+ Nokogiri::XML.parse(xml, nil, nil, Parser::NOBLANKS).root
71
72
  end
72
73
 
73
74
  def fetch_xml_node(index)
@@ -2,6 +2,26 @@ module Mspire ; end
2
2
  module Mspire::Quant ; end
3
3
 
4
4
  class Mspire::Quant::Qspec
5
+ # This is my current best guess based on the behavior of the original QSpec
6
+ # and going into the source code and looking at the paired and param
7
+ # versions.
8
+
9
+ # qspec: discrete spectral count data
10
+ # qprot: continuous protein abundance data (could be non-discrete spectral
11
+ # counts or quantitation data)
12
+ # paired: one sample against another sample
13
+ # param: one sample against another sample but with one or more replicates
14
+ EXE = {
15
+ qspec: {
16
+ paired: 'qspec-paired', # <- the old qspec (use qspec here if you have old software)
17
+ param: 'qspec-param', # < the old qspecgp (use qspecgp if you have old software)
18
+ },
19
+ qprot: {
20
+ paired: 'qprot-paired',
21
+ param: 'qprot-param',
22
+ },
23
+ getfdr: 'getfdr',
24
+ }
5
25
 
6
26
  # personal communication with Hyungwon Choi: "We typically use nburn=2000,
7
27
  # niter=10000, which is quite sufficient to guarantee the reproducibility of
@@ -11,8 +31,6 @@ class Mspire::Quant::Qspec
11
31
  INIT_HEADER = %w(protid protLen)
12
32
  DELIMITER = "\t"
13
33
 
14
- SUBMITTED_TO_QSPEC = 'submitted_to_qspec.txt'
15
-
16
34
  # takes an ordered list of conditions ['cond1', 'cond1', 'cond2', 'cond2'] and
17
35
  # returns an array of ints [0,0,0,1,1,1...]
18
36
  def self.conditions_to_ints(conditions)
@@ -29,30 +47,35 @@ class Mspire::Quant::Qspec
29
47
  end
30
48
 
31
49
  # returns an array of Results structs which is each row of the returned file
32
- # works with V2 of QSpec
50
+ # works with version 1.2.2 of Qprot
33
51
  def self.results_array(resultsfile)
34
52
  rows = IO.readlines(resultsfile).map {|line| line.chomp.split("\t") }
35
53
  headers = rows.shift
36
- start_bayes = headers.index {|v| v =~ /BayesFactor/i }
54
+ start_log_fold = headers.index {|v| v =~ /LogFoldChange/i }
37
55
  rows.map do |row|
38
56
  data = [row[0]]
39
- data.push( row[1...start_bayes].map(&:to_f) )
40
- data.push( *row[start_bayes,4].map(&:to_f) )
41
- data.push( row[start_bayes+4] )
57
+ data.push( row[1...start_log_fold].map(&:to_f) )
58
+ data.push( *row[start_log_fold,5].map(&:to_f) )
42
59
  Results.new(*data)
43
60
  end
44
61
  end
45
62
 
46
63
  # returns the right executable based on the array of conditions
47
- def self.executable(conditions)
64
+ def executable
48
65
  biggest_size = conditions.group_by {|v| v }.values.map(&:size).max
49
- (biggest_size >= 3) ? 'qspecgp' : 'qspec'
66
+ EXE[@protnames ? :qprot : :qspec][(biggest_size >= 3) ? :param : :paired]
50
67
  end
51
68
 
52
- # protname_length_pairs is an array of doublets: [protname, length]
69
+ # protnames is a list of protein names.
70
+ # by default, qprot will be run. If you really want qspec to be run, then
71
+ # supply a [protname, length] doublet in place of each protname.
53
72
  # condition_to_count_array is an array of doublets: [condition, array_of_counts]
54
- def initialize(protname_length_pairs, condition_to_count_array)
55
- @protname_length_pairs = protname_length_pairs
73
+ def initialize(protnames, condition_to_count_array)
74
+ @protnames = protnames
75
+ if @protnames.first.is_a?(Array)
76
+ @protname_length_pairs = @protnames
77
+ @protnames = nil
78
+ end
56
79
  @condition_to_count_array = condition_to_count_array
57
80
  end
58
81
 
@@ -62,9 +85,11 @@ class Mspire::Quant::Qspec
62
85
 
63
86
  # writes a qspec formatted file to filename
64
87
  def write(filename)
65
- ints = Mspire::Quant::Qspec.conditions_to_ints(conditions)
66
- header_cats = INIT_HEADER + ints
67
- rows = @protname_length_pairs.map {|pair| pair.map.to_a }
88
+ header_cats = %w(protid)
89
+ header_cats << 'protLen' if @protname_length_pairs
90
+ header_cats.push(*Mspire::Quant::Qspec.conditions_to_ints(conditions))
91
+ ar = @protnames || @protname_length_pairs
92
+ rows = ar.map {|obj| Array(obj) }
68
93
  @condition_to_count_array.each do |cond,counts|
69
94
  rows.zip(counts) {|row,cnt| row << cnt }
70
95
  end
@@ -77,16 +102,19 @@ class Mspire::Quant::Qspec
77
102
  # returns an array of Qspec::Results objects (each object can be considered
78
103
  # a row of data)
79
104
  def run(normalize=true, opts={})
105
+ exe = executable
106
+ puts "using #{exe}" if $VERBOSE
107
+ executable_base = exe.split('-')[0]
108
+
80
109
  puts "normalize: #{normalize}" if $VERBOSE
81
- tfile = Tempfile.new("qspec")
110
+ tfile = Tempfile.new(executable_base)
82
111
  write(tfile.path)
83
112
  if opts[:keep]
84
113
  local_file = File.join(Dir.pwd,File.basename(tfile.path))
85
114
  FileUtils.cp(tfile.path, local_file, :verbose => $VERBOSE)
86
- puts "(copy of) file submitted to qspec: #{local_file}" if $VERBOSE
115
+ puts "(copy of) file submitted to #{exe}: #{local_file}" if $VERBOSE
87
116
  end
88
- qspec_exe = self.class.executable(conditions)
89
- cmd = [qspec_exe, tfile.path, NBURNIN, NITER, (normalize ? 1 : 0)].join(' ')
117
+ cmd = [exe, tfile.path, NBURNIN, NITER, (normalize ? 1 : 0)].join(' ')
90
118
  if $VERBOSE
91
119
  puts "running #{cmd}" if $VERBOSE
92
120
  else
@@ -94,12 +122,20 @@ class Mspire::Quant::Qspec
94
122
  end
95
123
  reply = `#{cmd}`
96
124
  puts reply if $VERBOSE
97
- outfile = tfile.path + '_' + qspec_exe
98
- results = self.class.results_array(outfile)
125
+ outfile = tfile.path + '_' + executable_base
126
+ system EXE[:getfdr], outfile
127
+ fdr_file = outfile + "_fdr"
128
+ puts "FDR_FILE: #{fdr_file} exists? #{fdr_file}" if $VERBOSE
129
+ results = self.class.results_array(fdr_file)
99
130
  if opts[:keep]
100
131
  local_outfile = File.join(Dir.pwd, File.basename(outfile))
132
+ local_fdrfile = File.join(Dir.pwd, File.basename(fdr_file))
101
133
  FileUtils.cp(outfile, local_outfile, :verbose => $VERBOSE)
102
- puts "(copy of) file returned from qspec: #{outfile}"
134
+ FileUtils.cp(fdr_file, local_fdrfile, :verbose => $VERBOSE)
135
+ if $VERBOSE
136
+ puts "(copy of) file returned from qspec: #{outfile}"
137
+ puts "(copy of) file returned from qspec: #{fdr_file}"
138
+ end
103
139
  end
104
140
  tfile.unlink
105
141
  results
@@ -107,6 +143,10 @@ class Mspire::Quant::Qspec
107
143
 
108
144
  # for version 2 of QSpec
109
145
  # counts array is parallel to the experiment names passed in originally
110
- Results = Struct.new(:protid, :counts_array, :bayes_factor, :fold_change, :rb_stat, :fdr, :flag)
146
+ #Results = Struct.new(:protid, :counts_array, :bayes_factor, :fold_change, :rb_stat, :fdr, :flag)
147
+
148
+ # for version 1.2.2 of QProt
149
+ # counts array is parallel to the experiment names passed in originally
150
+ Results = Struct.new(:protid, :counts_array, :log_fold_change, :z_statistic, :fdr, :fdr_up, :fdr_down)
111
151
  end
112
152
 
@@ -1,3 +1,3 @@
1
1
  module Mspire
2
- VERSION = "0.10.7.1"
2
+ VERSION = "0.10.7.2"
3
3
  end
@@ -17,7 +17,6 @@ require 'mspire/quant/qspec'
17
17
  require 'mspire/quant/cmdline'
18
18
  require 'mspire/fasta'
19
19
 
20
-
21
20
  require 'yaml'
22
21
  require 'tempfile'
23
22
 
@@ -53,7 +52,7 @@ class Ruport::Data::Table
53
52
  File.open(file,'w') do |out|
54
53
  opt[:header].each {|line| out.puts "# #{line}" } if opt[:header]
55
54
  out.puts self.column_names.join(delimiter)
56
- self.data.each do |row|
55
+ self.sort_rows_by(:fdr).data.each do |row|
57
56
  out.puts row.to_a.join(delimiter)
58
57
  end
59
58
  opt[:footer].each {|line| out.puts "# #{line}" } if opt[:footer]
@@ -87,16 +86,16 @@ writes to #{outfile}
87
86
  group names can be arbitrarily defined
88
87
  }
89
88
  opt :fdr_percent, "%FDR as cutoff", :default => 1.0
90
- opt :qspec, "return qspec results (executes qspec or qspecgp). Requires :fasta. Only 2 groups currently allowed", :default => false
89
+ opt :qprot, "return qprot results (executes qprot-param or qprot-paired). Requires :fasta. Only 2 groups currently allowed", :default => false
91
90
  opt :descriptions, "include descriptions of proteins, requires :fasta", :default => false
92
- opt :fasta, "the fasta file. Required for :qspec and :descriptions", :type => String
91
+ opt :fasta, "the fasta file. Required for :descriptions", :type => String
93
92
  opt :outfile, "the to which file data are written", :default => outfile
94
93
  opt :peptides, "also write peptide hits (to: #{pephits_outfile})", :default => false
95
94
  opt :verbose, "speak up", :default => false
96
95
  opt :count_type, "type of spectral counts (<spectral|aaseqcharge|aaseq>)", :default => 'spectral'
97
- opt :qspec_decibans, "report bayesfactor in decibans"
98
- opt :qspec_normalize, "normalize spectral counts per run", :default => false
99
- opt :qspec_keep_files, "keep a copy of the files submitted and returned from Qspec", :default => false
96
+ opt :qprot_normalize, "normalize spectral counts per run", :default => false
97
+ opt :qprot_keep_files, "keep a copy of the files submitted and returned from Qprot", :default => false
98
+ opt :qprot_remove_sparse_rows, "remove any row with only one non-zero value", :default => false
100
99
  opt :version_tag, "pass in a version tag (e.g. pass in git describe --tags) for version record", :type => String
101
100
  opt :write_subset, "(dev use only) write subset db", :default => false
102
101
  end
@@ -112,8 +111,8 @@ if ARGV.size < 2
112
111
  opts.educate && exit
113
112
  end
114
113
 
115
- if (opt[:qspec] || opt[:descriptions]) && !opt[:fasta]
116
- puts "You must provide a fasta file with --fasta to use qspec or descriptions!!"
114
+ if opt[:descriptions] && !opt[:fasta]
115
+ puts "You must provide a fasta file with --fasta to use descriptions!!"
117
116
  opts.educate && exit
118
117
  end
119
118
 
@@ -125,7 +124,7 @@ putsv "using: #{peptide_centric_db_file} as peptide centric db"
125
124
 
126
125
  (samplename_to_filename, condition_to_samplenames, samplename_to_condition) = Mspire::Quant::Cmdline.args_to_hashes(ARGV)
127
126
 
128
- raise ArgumentError, "must have 2 conditions for qspec!" if opt[:qspec] && condition_to_samplenames.size != 2
127
+ raise ArgumentError, "must have 2 conditions for qprot to work!" if opt[:qprot] && condition_to_samplenames.size != 2
129
128
 
130
129
  samplenames = samplename_to_filename.keys
131
130
 
@@ -134,22 +133,20 @@ class Mspire::Ident::PeptideHit
134
133
  attr_accessor :protein_groups
135
134
  end
136
135
 
137
- class Mspire::Ident::Protein
138
- attr_accessor :length
139
- end
136
+ #class Mspire::Ident::Protein
137
+ # attr_accessor :length
138
+ #end
140
139
 
141
140
 
142
141
  fdr_cutoff = opt[:fdr_percent] / 100
143
142
 
144
- if opt[:qspec] || opt[:descriptions]
145
- putsv "reading lengths and descriptions from #{opt[:fasta]}"
143
+ if opt[:descriptions]
144
+ putsv "reading descriptions from #{opt[:fasta]}"
146
145
  #Mspire::Fasta.protein_lengths_and_descriptions(opt[:fasta])
147
- id_to_length = {}
148
146
  id_to_desc = {}
149
147
  Mspire::Fasta.foreach(opt[:fasta]) do |entry|
150
148
  #acc = Mspire::Fasta.uniprot_id(entry.header)
151
149
  acc = entry.accession
152
- id_to_length[acc] = entry.length
153
150
  id_to_desc[acc] = entry.definition[/^\S+\s(.*)/,1]
154
151
  end
155
152
  end
@@ -170,7 +167,6 @@ Mspire::Ident::Peptide::Db::IO.open(peptide_centric_db_file) do |peptide_to_prot
170
167
  # update each peptide with its protein hits
171
168
  protein_hits = peptide_to_proteins[hit.aaseq].map do |id|
172
169
  protein = all_protein_hits[id]
173
- protein.length = id_to_length[id] if id_to_length
174
170
  protein.description = id_to_desc[id] if id_to_desc
175
171
  protein
176
172
  end
@@ -218,48 +214,43 @@ end
218
214
  # each cell holds a SpectralCounts object, which has 3 types of count data
219
215
  counts_table = Ruport::Data::Table.new(:data => counts_data, :column_names => samplenames)
220
216
 
217
+ counts_table.add_columns( [:name, :ids, :description, :qprot_protname] )
218
+ counts_table.data.zip(protein_groups) do |row, pg|
219
+ best_id = pg.first # pg.sort_by {|prot| [prot.id, prot.length] }.first
220
+ row.name = best_id.description.andand.match(/ GN=([^\s]+) ?/).andand[1] || best_id.id
221
+ row.ids = pg.map(&:id).join(',')
222
+ row.description = best_id.description
223
+ row.qprot_protname = pg.map(&:id).join(":")
224
+ end
225
+
221
226
  # return a list of ProteinGroupComparisons
222
- if opt[:qspec]
227
+ if opt[:qprot]
223
228
 
224
- # prepare data for qspec
225
- condition_to_count_array = counts_table.column_names.map do |name|
226
- [samplename_to_condition[name], counts_table.column(name)]
229
+ if opt[:qprot_remove_sparse_rows]
230
+ newrows = counts_table.data.select do |row|
231
+ row.to_a[0,samplenames.size].select {|v| v > 0 }.size >= 2
232
+ end
233
+ counts_table = Ruport::Data::Table.new(:data => newrows, :column_names => counts_table.column_names)
227
234
  end
228
- # average length of the proteins in the group
229
- name_length_pairs = protein_groups.map do |pg|
230
- [pg.map(&:id).join(":"), pg.map(&:length).reduce(:+)./(pg.size).round]
235
+
236
+ # prepare data for qprot
237
+ condition_to_count_array = counts_table.column_names.select {|name| name.is_a?(String) }.map do |name|
238
+ [samplename_to_condition[name], counts_table.column(name)]
231
239
  end
232
240
 
233
- qspec_results = Mspire::Quant::Qspec.new(name_length_pairs, condition_to_count_array).run(opt[:qspec_normalize], :keep => opt[:qspec_keep_files])
241
+ qprot_results = Mspire::Quant::Qspec.new(counts_table.column(:qprot_protname), condition_to_count_array).run(opt[:qprot_normalize], :keep => opt[:qprot_keep_files])
234
242
 
235
- cols_to_add = [:bayes_factor, :fold_change, :fdr]
236
- to_add_as_headers = cols_to_add.map do |v|
237
- if opt[:qspec_decibans] && v == :bayes_factor
238
- :decibans
239
- else
240
- v
241
- end
242
- end
243
- counts_table.add_columns to_add_as_headers
244
- counts_table.data.zip(qspec_results) do |row, qspec_result|
243
+ cols_to_add = [:log_fold_change, :fdr, :fdr_up, :fdr_down]
244
+
245
+ counts_table.add_columns cols_to_add
246
+ counts_table.data.zip(qprot_results) do |row, qprot_result|
245
247
  cols_to_add.each do |cat|
246
- if cat == :bayes_factor && opt[:qspec_decibans]
247
- row[:decibans] = 10 * Math.log10(qspec_result[cat])
248
- else
249
- row[cat] = qspec_result[cat]
250
- end
248
+ row[cat] = qprot_result[cat]
251
249
  end
252
250
  end
253
251
  end
254
252
 
255
- counts_table.add_columns( [:name, :ids, :description] )
256
- counts_table.data.zip(protein_groups) do |row, pg|
257
- best_id = pg.sort_by {|prot| [prot.id, prot.length] }.first
258
- row.name = best_id.description.andand.match(/ GN=([^\s]+) ?/).andand[1] || best_id.id
259
- row.ids = pg.map(&:id).join(',')
260
- row.description = best_id.description
261
- end
262
-
253
+ counts_table.remove_column(:qprot_protname)
263
254
 
264
255
  if opt[:peptides]
265
256
  hits_table.each do |record|
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: mspire
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.10.7.1
4
+ version: 0.10.7.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - John T. Prince
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2014-03-21 00:00:00.000000000 Z
12
+ date: 2014-05-05 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri