mspire 0.10.7.1 → 0.10.7.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: eb806c3b3fc8c31258494541f15be3064f9e8a15
4
- data.tar.gz: 9666168614b20a6a8ac974c5c6cd29c715b87b3e
3
+ metadata.gz: 7c3d6fd2ccef3ca83f802127523c4518115d55d3
4
+ data.tar.gz: 99910a0e278af6f0d096c3fb9f06902977681e3b
5
5
  SHA512:
6
- metadata.gz: f8d798ed8a3efd8b0483c4b957b139eb4a040b76f598eecc9615cfe8332d8bdfbd0b66185e3416d5e98e57fc8f8705af44e479a3d7e263f2c90d138cd0098223
7
- data.tar.gz: db2e325aef76a22747cca13f5a25e9357b1e76c1a253ef9b71a10fae7b7df6fd8f1faded2c1f5352112a37ab65e54fe3df6099e1c7b717a4e89b70db80f70075
6
+ metadata.gz: b778a89bbe03de755756a267b772006aa26ced9780384519b194edbb40a2efb54055137e3f5ac81a10044a5d8f53d12ad514b17599d7923cb4bb85b78dc5cf6f
7
+ data.tar.gz: 0692eef670311afebe4549e8d551ec621c0f5dcc366b7a87e4fa71e129fd165a32fd28293b11e071e99bb7b74787cb9cd4914ba61a77b3d0485f204c05ffe875
data/README.md CHANGED
@@ -56,12 +56,23 @@ objects associated with Mzml files.
56
56
  ```ruby
57
57
  require 'mspire/mzml'
58
58
 
59
+ # get the intensity of the highest peak from each spectrum
60
+ intensities = Mspire::Mzml.foreach(mzml_file).map do |spectrum|
61
+ spectrum.intensities.max
62
+ end
63
+
64
+ # open the file for other operations
59
65
  Mspire::Mzml.open(mzml_file) do |mzml|
66
+ # read each spectrum
67
+ mzml.each do |spectrum|
68
+ # do something with each spectrum ...
69
+ end
60
70
 
61
- # random access by index or id (even if file wasn't indexed)
71
+ # or random access by index or id (even if file wasn't indexed)
62
72
  spectrum = mzml[0]
63
73
  spectrum = mzml["controllerType=0 controllerNumber=1 scan=2"]
64
74
 
75
+ # some things to do with a spectrum
65
76
  spectrum.mzs
66
77
  spectrum.intensities
67
78
 
@@ -55,7 +55,8 @@ module Mspire
55
55
  def read_index_list(io)
56
56
  if (offset = index_offset(io))
57
57
  io.seek(offset)
58
- xml = Nokogiri::XML.parse(io.read, nil, @encoding, Parser::NOBLANKS)
58
+ # TODO: pass in encoding (as second nil)
59
+ xml = Nokogiri::XML.parse(io.read, nil, nil, Parser::NOBLANKS)
59
60
  index_list = xml.root
60
61
  num_indices = index_list['count'].to_i
61
62
  array = index_list.children.map do |index_n|
@@ -66,8 +66,9 @@ module Mspire
66
66
  end
67
67
 
68
68
  def xml_node_from_start_byte(start_byte)
69
+ # consider passing in @encoding from upstream object (as second nil):
69
70
  xml = get_xml_string(start_byte)
70
- Nokogiri::XML.parse(xml, nil, @encoding, Parser::NOBLANKS).root
71
+ Nokogiri::XML.parse(xml, nil, nil, Parser::NOBLANKS).root
71
72
  end
72
73
 
73
74
  def fetch_xml_node(index)
@@ -2,6 +2,26 @@ module Mspire ; end
2
2
  module Mspire::Quant ; end
3
3
 
4
4
  class Mspire::Quant::Qspec
5
+ # This is my current best guess based on the behavior of the original QSpec
6
+ # and going into the source code and looking at the paired and param
7
+ # versions.
8
+
9
+ # qspec: discrete spectral count data
10
+ # qprot: continuous protein abundance data (could be non-discrete spectral
11
+ # counts or quantitation data)
12
+ # paired: one sample against another sample
13
+ # param: one sample against another sample but with one or more replicates
14
+ EXE = {
15
+ qspec: {
16
+ paired: 'qspec-paired', # <- the old qspec (use qspec here if you have old software)
17
+ param: 'qspec-param', # <- the old qspecgp (use qspecgp if you have old software)
18
+ },
19
+ qprot: {
20
+ paired: 'qprot-paired',
21
+ param: 'qprot-param',
22
+ },
23
+ getfdr: 'getfdr',
24
+ }
5
25
 
6
26
  # personal communication with Hyungwon Choi: "We typically use nburn=2000,
7
27
  # niter=10000, which is quite sufficient to guarantee the reproducibility of
@@ -11,8 +31,6 @@ class Mspire::Quant::Qspec
11
31
  INIT_HEADER = %w(protid protLen)
12
32
  DELIMITER = "\t"
13
33
 
14
- SUBMITTED_TO_QSPEC = 'submitted_to_qspec.txt'
15
-
16
34
  # takes an ordered list of conditions ['cond1', 'cond1', 'cond2', 'cond2'] and
17
35
  # returns an array of ints [0,0,0,1,1,1...]
18
36
  def self.conditions_to_ints(conditions)
@@ -29,30 +47,35 @@ class Mspire::Quant::Qspec
29
47
  end
30
48
 
31
49
  # returns an array of Results structs which is each row of the returned file
32
- # works with V2 of QSpec
50
+ # works with version 1.2.2 of Qprot
33
51
  def self.results_array(resultsfile)
34
52
  rows = IO.readlines(resultsfile).map {|line| line.chomp.split("\t") }
35
53
  headers = rows.shift
36
- start_bayes = headers.index {|v| v =~ /BayesFactor/i }
54
+ start_log_fold = headers.index {|v| v =~ /LogFoldChange/i }
37
55
  rows.map do |row|
38
56
  data = [row[0]]
39
- data.push( row[1...start_bayes].map(&:to_f) )
40
- data.push( *row[start_bayes,4].map(&:to_f) )
41
- data.push( row[start_bayes+4] )
57
+ data.push( row[1...start_log_fold].map(&:to_f) )
58
+ data.push( *row[start_log_fold,5].map(&:to_f) )
42
59
  Results.new(*data)
43
60
  end
44
61
  end
45
62
 
46
63
  # returns the right executable based on the array of conditions
47
- def self.executable(conditions)
64
+ def executable
48
65
  biggest_size = conditions.group_by {|v| v }.values.map(&:size).max
49
- (biggest_size >= 3) ? 'qspecgp' : 'qspec'
66
+ EXE[@protnames ? :qprot : :qspec][(biggest_size >= 3) ? :param : :paired]
50
67
  end
51
68
 
52
- # protname_length_pairs is an array of doublets: [protname, length]
69
+ # protnames is a list of protein names.
70
+ # by default, qprot will be run. If you really want qspec to be run, then
71
+ # supply a [protname, length] doublet in place of each protname.
53
72
  # condition_to_count_array is an array of doublets: [condition, array_of_counts]
54
- def initialize(protname_length_pairs, condition_to_count_array)
55
- @protname_length_pairs = protname_length_pairs
73
+ def initialize(protnames, condition_to_count_array)
74
+ @protnames = protnames
75
+ if @protnames.first.is_a?(Array)
76
+ @protname_length_pairs = @protnames
77
+ @protnames = nil
78
+ end
56
79
  @condition_to_count_array = condition_to_count_array
57
80
  end
58
81
 
@@ -62,9 +85,11 @@ class Mspire::Quant::Qspec
62
85
 
63
86
  # writes a qspec formatted file to filename
64
87
  def write(filename)
65
- ints = Mspire::Quant::Qspec.conditions_to_ints(conditions)
66
- header_cats = INIT_HEADER + ints
67
- rows = @protname_length_pairs.map {|pair| pair.map.to_a }
88
+ header_cats = %w(protid)
89
+ header_cats << 'protLen' if @protname_length_pairs
90
+ header_cats.push(*Mspire::Quant::Qspec.conditions_to_ints(conditions))
91
+ ar = @protnames || @protname_length_pairs
92
+ rows = ar.map {|obj| Array(obj) }
68
93
  @condition_to_count_array.each do |cond,counts|
69
94
  rows.zip(counts) {|row,cnt| row << cnt }
70
95
  end
@@ -77,16 +102,19 @@ class Mspire::Quant::Qspec
77
102
  # returns an array of Qspec::Results objects (each object can be considered
78
103
  # a row of data)
79
104
  def run(normalize=true, opts={})
105
+ exe = executable
106
+ puts "using #{exe}" if $VERBOSE
107
+ executable_base = exe.split('-')[0]
108
+
80
109
  puts "normalize: #{normalize}" if $VERBOSE
81
- tfile = Tempfile.new("qspec")
110
+ tfile = Tempfile.new(executable_base)
82
111
  write(tfile.path)
83
112
  if opts[:keep]
84
113
  local_file = File.join(Dir.pwd,File.basename(tfile.path))
85
114
  FileUtils.cp(tfile.path, local_file, :verbose => $VERBOSE)
86
- puts "(copy of) file submitted to qspec: #{local_file}" if $VERBOSE
115
+ puts "(copy of) file submitted to #{exe}: #{local_file}" if $VERBOSE
87
116
  end
88
- qspec_exe = self.class.executable(conditions)
89
- cmd = [qspec_exe, tfile.path, NBURNIN, NITER, (normalize ? 1 : 0)].join(' ')
117
+ cmd = [exe, tfile.path, NBURNIN, NITER, (normalize ? 1 : 0)].join(' ')
90
118
  if $VERBOSE
91
119
  puts "running #{cmd}" if $VERBOSE
92
120
  else
@@ -94,12 +122,20 @@ class Mspire::Quant::Qspec
94
122
  end
95
123
  reply = `#{cmd}`
96
124
  puts reply if $VERBOSE
97
- outfile = tfile.path + '_' + qspec_exe
98
- results = self.class.results_array(outfile)
125
+ outfile = tfile.path + '_' + executable_base
126
+ system EXE[:getfdr], outfile
127
+ fdr_file = outfile + "_fdr"
128
+ puts "FDR_FILE: #{fdr_file} exists? #{fdr_file}" if $VERBOSE
129
+ results = self.class.results_array(fdr_file)
99
130
  if opts[:keep]
100
131
  local_outfile = File.join(Dir.pwd, File.basename(outfile))
132
+ local_fdrfile = File.join(Dir.pwd, File.basename(fdr_file))
101
133
  FileUtils.cp(outfile, local_outfile, :verbose => $VERBOSE)
102
- puts "(copy of) file returned from qspec: #{outfile}"
134
+ FileUtils.cp(fdr_file, local_fdrfile, :verbose => $VERBOSE)
135
+ if $VERBOSE
136
+ puts "(copy of) file returned from qspec: #{outfile}"
137
+ puts "(copy of) file returned from qspec: #{fdr_file}"
138
+ end
103
139
  end
104
140
  tfile.unlink
105
141
  results
@@ -107,6 +143,10 @@ class Mspire::Quant::Qspec
107
143
 
108
144
  # for version 2 of QSpec
109
145
  # counts array is parallel to the experiment names passed in originally
110
- Results = Struct.new(:protid, :counts_array, :bayes_factor, :fold_change, :rb_stat, :fdr, :flag)
146
+ #Results = Struct.new(:protid, :counts_array, :bayes_factor, :fold_change, :rb_stat, :fdr, :flag)
147
+
148
+ # for version 1.2.2 of QProt
149
+ # counts array is parallel to the experiment names passed in originally
150
+ Results = Struct.new(:protid, :counts_array, :log_fold_change, :z_statistic, :fdr, :fdr_up, :fdr_down)
111
151
  end
112
152
 
@@ -1,3 +1,3 @@
1
1
  module Mspire
2
- VERSION = "0.10.7.1"
2
+ VERSION = "0.10.7.2"
3
3
  end
@@ -17,7 +17,6 @@ require 'mspire/quant/qspec'
17
17
  require 'mspire/quant/cmdline'
18
18
  require 'mspire/fasta'
19
19
 
20
-
21
20
  require 'yaml'
22
21
  require 'tempfile'
23
22
 
@@ -53,7 +52,7 @@ class Ruport::Data::Table
53
52
  File.open(file,'w') do |out|
54
53
  opt[:header].each {|line| out.puts "# #{line}" } if opt[:header]
55
54
  out.puts self.column_names.join(delimiter)
56
- self.data.each do |row|
55
+ self.sort_rows_by(:fdr).data.each do |row|
57
56
  out.puts row.to_a.join(delimiter)
58
57
  end
59
58
  opt[:footer].each {|line| out.puts "# #{line}" } if opt[:footer]
@@ -87,16 +86,16 @@ writes to #{outfile}
87
86
  group names can be arbitrarily defined
88
87
  }
89
88
  opt :fdr_percent, "%FDR as cutoff", :default => 1.0
90
- opt :qspec, "return qspec results (executes qspec or qspecgp). Requires :fasta. Only 2 groups currently allowed", :default => false
89
+ opt :qprot, "return qprot results (executes qprot-param or qprot-paired). Requires :fasta. Only 2 groups currently allowed", :default => false
91
90
  opt :descriptions, "include descriptions of proteins, requires :fasta", :default => false
92
- opt :fasta, "the fasta file. Required for :qspec and :descriptions", :type => String
91
+ opt :fasta, "the fasta file. Required for :descriptions", :type => String
93
92
  opt :outfile, "the to which file data are written", :default => outfile
94
93
  opt :peptides, "also write peptide hits (to: #{pephits_outfile})", :default => false
95
94
  opt :verbose, "speak up", :default => false
96
95
  opt :count_type, "type of spectral counts (<spectral|aaseqcharge|aaseq>)", :default => 'spectral'
97
- opt :qspec_decibans, "report bayesfactor in decibans"
98
- opt :qspec_normalize, "normalize spectral counts per run", :default => false
99
- opt :qspec_keep_files, "keep a copy of the files submitted and returned from Qspec", :default => false
96
+ opt :qprot_normalize, "normalize spectral counts per run", :default => false
97
+ opt :qprot_keep_files, "keep a copy of the files submitted and returned from Qprot", :default => false
98
+ opt :qprot_remove_sparse_rows, "remove any row with only one non-zero value", :default => false
100
99
  opt :version_tag, "pass in a version tag (e.g. pass in git describe --tags) for version record", :type => String
101
100
  opt :write_subset, "(dev use only) write subset db", :default => false
102
101
  end
@@ -112,8 +111,8 @@ if ARGV.size < 2
112
111
  opts.educate && exit
113
112
  end
114
113
 
115
- if (opt[:qspec] || opt[:descriptions]) && !opt[:fasta]
116
- puts "You must provide a fasta file with --fasta to use qspec or descriptions!!"
114
+ if opt[:descriptions] && !opt[:fasta]
115
+ puts "You must provide a fasta file with --fasta to use descriptions!!"
117
116
  opts.educate && exit
118
117
  end
119
118
 
@@ -125,7 +124,7 @@ putsv "using: #{peptide_centric_db_file} as peptide centric db"
125
124
 
126
125
  (samplename_to_filename, condition_to_samplenames, samplename_to_condition) = Mspire::Quant::Cmdline.args_to_hashes(ARGV)
127
126
 
128
- raise ArgumentError, "must have 2 conditions for qspec!" if opt[:qspec] && condition_to_samplenames.size != 2
127
+ raise ArgumentError, "must have 2 conditions for qprot to work!" if opt[:qprot] && condition_to_samplenames.size != 2
129
128
 
130
129
  samplenames = samplename_to_filename.keys
131
130
 
@@ -134,22 +133,20 @@ class Mspire::Ident::PeptideHit
134
133
  attr_accessor :protein_groups
135
134
  end
136
135
 
137
- class Mspire::Ident::Protein
138
- attr_accessor :length
139
- end
136
+ #class Mspire::Ident::Protein
137
+ # attr_accessor :length
138
+ #end
140
139
 
141
140
 
142
141
  fdr_cutoff = opt[:fdr_percent] / 100
143
142
 
144
- if opt[:qspec] || opt[:descriptions]
145
- putsv "reading lengths and descriptions from #{opt[:fasta]}"
143
+ if opt[:descriptions]
144
+ putsv "reading descriptions from #{opt[:fasta]}"
146
145
  #Mspire::Fasta.protein_lengths_and_descriptions(opt[:fasta])
147
- id_to_length = {}
148
146
  id_to_desc = {}
149
147
  Mspire::Fasta.foreach(opt[:fasta]) do |entry|
150
148
  #acc = Mspire::Fasta.uniprot_id(entry.header)
151
149
  acc = entry.accession
152
- id_to_length[acc] = entry.length
153
150
  id_to_desc[acc] = entry.definition[/^\S+\s(.*)/,1]
154
151
  end
155
152
  end
@@ -170,7 +167,6 @@ Mspire::Ident::Peptide::Db::IO.open(peptide_centric_db_file) do |peptide_to_prot
170
167
  # update each peptide with its protein hits
171
168
  protein_hits = peptide_to_proteins[hit.aaseq].map do |id|
172
169
  protein = all_protein_hits[id]
173
- protein.length = id_to_length[id] if id_to_length
174
170
  protein.description = id_to_desc[id] if id_to_desc
175
171
  protein
176
172
  end
@@ -218,48 +214,43 @@ end
218
214
  # each cell holds a SpectralCounts object, which hash 3 types of count data
219
215
  counts_table = Ruport::Data::Table.new(:data => counts_data, :column_names => samplenames)
220
216
 
217
+ counts_table.add_columns( [:name, :ids, :description, :qprot_protname] )
218
+ counts_table.data.zip(protein_groups) do |row, pg|
219
+ best_id = pg.first # pg.sort_by {|prot| [prot.id, prot.length] }.first
220
+ row.name = best_id.description.andand.match(/ GN=([^\s]+) ?/).andand[1] || best_id.id
221
+ row.ids = pg.map(&:id).join(',')
222
+ row.description = best_id.description
223
+ row.qprot_protname = pg.map(&:id).join(":")
224
+ end
225
+
221
226
  # return a list of ProteinGroupComparisons
222
- if opt[:qspec]
227
+ if opt[:qprot]
223
228
 
224
- # prepare data for qspec
225
- condition_to_count_array = counts_table.column_names.map do |name|
226
- [samplename_to_condition[name], counts_table.column(name)]
229
+ if opt[:qprot_remove_sparse_rows]
230
+ newrows = counts_table.data.select do |row|
231
+ row.to_a[0,samplenames.size].select {|v| v > 0 }.size >= 2
232
+ end
233
+ counts_table = Ruport::Data::Table.new(:data => newrows, :column_names => counts_table.column_names)
227
234
  end
228
- # average length of the proteins in the group
229
- name_length_pairs = protein_groups.map do |pg|
230
- [pg.map(&:id).join(":"), pg.map(&:length).reduce(:+)./(pg.size).round]
235
+
236
+ # prepare data for qprot
237
+ condition_to_count_array = counts_table.column_names.select {|name| name.is_a?(String) }.map do |name|
238
+ [samplename_to_condition[name], counts_table.column(name)]
231
239
  end
232
240
 
233
- qspec_results = Mspire::Quant::Qspec.new(name_length_pairs, condition_to_count_array).run(opt[:qspec_normalize], :keep => opt[:qspec_keep_files])
241
+ qprot_results = Mspire::Quant::Qspec.new(counts_table.column(:qprot_protname), condition_to_count_array).run(opt[:qprot_normalize], :keep => opt[:qprot_keep_files])
234
242
 
235
- cols_to_add = [:bayes_factor, :fold_change, :fdr]
236
- to_add_as_headers = cols_to_add.map do |v|
237
- if opt[:qspec_decibans] && v == :bayes_factor
238
- :decibans
239
- else
240
- v
241
- end
242
- end
243
- counts_table.add_columns to_add_as_headers
244
- counts_table.data.zip(qspec_results) do |row, qspec_result|
243
+ cols_to_add = [:log_fold_change, :fdr, :fdr_up, :fdr_down]
244
+
245
+ counts_table.add_columns cols_to_add
246
+ counts_table.data.zip(qprot_results) do |row, qprot_result|
245
247
  cols_to_add.each do |cat|
246
- if cat == :bayes_factor && opt[:qspec_decibans]
247
- row[:decibans] = 10 * Math.log10(qspec_result[cat])
248
- else
249
- row[cat] = qspec_result[cat]
250
- end
248
+ row[cat] = qprot_result[cat]
251
249
  end
252
250
  end
253
251
  end
254
252
 
255
- counts_table.add_columns( [:name, :ids, :description] )
256
- counts_table.data.zip(protein_groups) do |row, pg|
257
- best_id = pg.sort_by {|prot| [prot.id, prot.length] }.first
258
- row.name = best_id.description.andand.match(/ GN=([^\s]+) ?/).andand[1] || best_id.id
259
- row.ids = pg.map(&:id).join(',')
260
- row.description = best_id.description
261
- end
262
-
253
+ counts_table.remove_column(:qprot_protname)
263
254
 
264
255
  if opt[:peptides]
265
256
  hits_table.each do |record|
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: mspire
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.10.7.1
4
+ version: 0.10.7.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - John T. Prince
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2014-03-21 00:00:00.000000000 Z
12
+ date: 2014-05-05 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri