mspire 0.10.7.1 → 0.10.7.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +12 -1
- data/lib/mspire/mzml/index_list.rb +2 -1
- data/lib/mspire/mzml/io_index.rb +2 -1
- data/lib/mspire/quant/qspec.rb +63 -23
- data/lib/mspire/version.rb +1 -1
- data/script/peptide_hit_qvalues_to_spectral_counts_table.rb +40 -49
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7c3d6fd2ccef3ca83f802127523c4518115d55d3
|
4
|
+
data.tar.gz: 99910a0e278af6f0d096c3fb9f06902977681e3b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b778a89bbe03de755756a267b772006aa26ced9780384519b194edbb40a2efb54055137e3f5ac81a10044a5d8f53d12ad514b17599d7923cb4bb85b78dc5cf6f
|
7
|
+
data.tar.gz: 0692eef670311afebe4549e8d551ec621c0f5dcc366b7a87e4fa71e129fd165a32fd28293b11e071e99bb7b74787cb9cd4914ba61a77b3d0485f204c05ffe875
|
data/README.md
CHANGED
@@ -56,12 +56,23 @@ objects associated with Mzml files.
|
|
56
56
|
```ruby
|
57
57
|
require 'mspire/mzml'
|
58
58
|
|
59
|
+
# get the intensity of the highest peak from each spectrum
|
60
|
+
intensities = Mspire::Mzml.foreach(mzml_file).map do |spectrum|
|
61
|
+
spectrum.intensities.max
|
62
|
+
end
|
63
|
+
|
64
|
+
# open the file for other operations
|
59
65
|
Mspire::Mzml.open(mzml_file) do |mzml|
|
66
|
+
# read each spectra
|
67
|
+
mzml.each do |spectrum|
|
68
|
+
# do something with each spectrum ...
|
69
|
+
end
|
60
70
|
|
61
|
-
# random access by index or id (even if file wasn't indexed)
|
71
|
+
# or random access by index or id (even if file wasn't indexed)
|
62
72
|
spectrum = mzml[0]
|
63
73
|
spectrum = mzml["controllerType=0 controllerNumber=1 scan=2"]
|
64
74
|
|
75
|
+
# some things to do with a spectrum
|
65
76
|
spectrum.mzs
|
66
77
|
spectrum.intensities
|
67
78
|
|
data/lib/mspire/mzml/index_list.rb
CHANGED
@@ -55,7 +55,8 @@ module Mspire
|
|
55
55
|
def read_index_list(io)
|
56
56
|
if (offset = index_offset(io))
|
57
57
|
io.seek(offset)
|
58
|
-
|
58
|
+
# TODO: pass in encoding (as second nil)
|
59
|
+
xml = Nokogiri::XML.parse(io.read, nil, nil, Parser::NOBLANKS)
|
59
60
|
index_list = xml.root
|
60
61
|
num_indices = index_list['count'].to_i
|
61
62
|
array = index_list.children.map do |index_n|
|
data/lib/mspire/mzml/io_index.rb
CHANGED
@@ -66,8 +66,9 @@ module Mspire
|
|
66
66
|
end
|
67
67
|
|
68
68
|
def xml_node_from_start_byte(start_byte)
|
69
|
+
# consider passing in @encoding from upstream object (as second nil):
|
69
70
|
xml = get_xml_string(start_byte)
|
70
|
-
Nokogiri::XML.parse(xml, nil,
|
71
|
+
Nokogiri::XML.parse(xml, nil, nil, Parser::NOBLANKS).root
|
71
72
|
end
|
72
73
|
|
73
74
|
def fetch_xml_node(index)
|
data/lib/mspire/quant/qspec.rb
CHANGED
@@ -2,6 +2,26 @@ module Mspire ; end
|
|
2
2
|
module Mspire::Quant ; end
|
3
3
|
|
4
4
|
class Mspire::Quant::Qspec
|
5
|
+
# This is my current best guess based on the behavior of the original QSpec
|
6
|
+
# and going into the source code and looking at the paired and param
|
7
|
+
# versions.
|
8
|
+
|
9
|
+
# qspec: discrete spectral count data
|
10
|
+
# qprot: continuous protein abundance data (could be non-discrete spectral
|
11
|
+
# counts or quantitation data)
|
12
|
+
# paired: one sample against another sample
|
13
|
+
# param: one sample against another sample but with one or more replicates
|
14
|
+
EXE = {
|
15
|
+
qspec: {
|
16
|
+
paired: 'qspec-paired', # <- the old qspec (use qspec here if you have old software)
|
17
|
+
param: 'qspec-param', # < the old qspecgp (use qspecgp if you have old software)
|
18
|
+
},
|
19
|
+
qprot: {
|
20
|
+
paired: 'qprot-paired',
|
21
|
+
param: 'qprot-param',
|
22
|
+
},
|
23
|
+
getfdr: 'getfdr',
|
24
|
+
}
|
5
25
|
|
6
26
|
# personal communication with Hyungwon Choi: "We typically use nburn=2000,
|
7
27
|
# niter=10000, which is quite sufficient to guarantee the reproducibility of
|
@@ -11,8 +31,6 @@ class Mspire::Quant::Qspec
|
|
11
31
|
INIT_HEADER = %w(protid protLen)
|
12
32
|
DELIMITER = "\t"
|
13
33
|
|
14
|
-
SUBMITTED_TO_QSPEC = 'submitted_to_qspec.txt'
|
15
|
-
|
16
34
|
# takes an ordered list of conditions ['cond1', 'cond1', 'cond2', 'cond2'] and
|
17
35
|
# returns an array of ints [0,0,0,1,1,1...]
|
18
36
|
def self.conditions_to_ints(conditions)
|
@@ -29,30 +47,35 @@ class Mspire::Quant::Qspec
|
|
29
47
|
end
|
30
48
|
|
31
49
|
# returns an array of Results structs which is each row of the returned file
|
32
|
-
# works with
|
50
|
+
# works with version 1.2.2 of Qprot
|
33
51
|
def self.results_array(resultsfile)
|
34
52
|
rows = IO.readlines(resultsfile).map {|line| line.chomp.split("\t") }
|
35
53
|
headers = rows.shift
|
36
|
-
|
54
|
+
start_log_fold = headers.index {|v| v =~ /LogFoldChange/i }
|
37
55
|
rows.map do |row|
|
38
56
|
data = [row[0]]
|
39
|
-
data.push( row[1...
|
40
|
-
data.push( *row[
|
41
|
-
data.push( row[start_bayes+4] )
|
57
|
+
data.push( row[1...start_log_fold].map(&:to_f) )
|
58
|
+
data.push( *row[start_log_fold,5].map(&:to_f) )
|
42
59
|
Results.new(*data)
|
43
60
|
end
|
44
61
|
end
|
45
62
|
|
46
63
|
# returns the right executable based on the array of conditions
|
47
|
-
def
|
64
|
+
def executable
|
48
65
|
biggest_size = conditions.group_by {|v| v }.values.map(&:size).max
|
49
|
-
(biggest_size >= 3) ?
|
66
|
+
EXE[@protnames ? :qprot : :qspec][(biggest_size >= 3) ? :param : :paired]
|
50
67
|
end
|
51
68
|
|
52
|
-
#
|
69
|
+
# protname is a list of protein names.
|
70
|
+
# by default, qprot will be run. If you really want qspec to be run, then
|
71
|
+
# supply a [protname, length] doublet in place of each protname.
|
53
72
|
# condition_to_count_array is an array doublets: [condition, array_of_counts]
|
54
|
-
def initialize(
|
55
|
-
@
|
73
|
+
def initialize(protnames, condition_to_count_array)
|
74
|
+
@protnames = protnames
|
75
|
+
if @protnames.first.is_a?(Array)
|
76
|
+
@protname_length_pairs = @protnames
|
77
|
+
@protnames = nil
|
78
|
+
end
|
56
79
|
@condition_to_count_array = condition_to_count_array
|
57
80
|
end
|
58
81
|
|
@@ -62,9 +85,11 @@ class Mspire::Quant::Qspec
|
|
62
85
|
|
63
86
|
# writes a qspec formatted file to filename
|
64
87
|
def write(filename)
|
65
|
-
|
66
|
-
header_cats
|
67
|
-
|
88
|
+
header_cats = %w(protid)
|
89
|
+
header_cats << 'protLen' if @protname_length_pairs
|
90
|
+
header_cats.push(*Mspire::Quant::Qspec.conditions_to_ints(conditions))
|
91
|
+
ar = @protnames || @protname_length_pairs
|
92
|
+
rows = ar.map {|obj| Array(obj) }
|
68
93
|
@condition_to_count_array.each do |cond,counts|
|
69
94
|
rows.zip(counts) {|row,cnt| row << cnt }
|
70
95
|
end
|
@@ -77,16 +102,19 @@ class Mspire::Quant::Qspec
|
|
77
102
|
# returns an array of Qspec::Results objects (each object can be considered
|
78
103
|
# a row of data)
|
79
104
|
def run(normalize=true, opts={})
|
105
|
+
exe = executable
|
106
|
+
puts "using #{exe}" if $VERBOSE
|
107
|
+
executable_base = exe.split('-')[0]
|
108
|
+
|
80
109
|
puts "normalize: #{normalize}" if $VERBOSE
|
81
|
-
tfile = Tempfile.new(
|
110
|
+
tfile = Tempfile.new(executable_base)
|
82
111
|
write(tfile.path)
|
83
112
|
if opts[:keep]
|
84
113
|
local_file = File.join(Dir.pwd,File.basename(tfile.path))
|
85
114
|
FileUtils.cp(tfile.path, local_file, :verbose => $VERBOSE)
|
86
|
-
puts "(copy of) file submitted to
|
115
|
+
puts "(copy of) file submitted to #{exe}: #{local_file}" if $VERBOSE
|
87
116
|
end
|
88
|
-
|
89
|
-
cmd = [qspec_exe, tfile.path, NBURNIN, NITER, (normalize ? 1 : 0)].join(' ')
|
117
|
+
cmd = [exe, tfile.path, NBURNIN, NITER, (normalize ? 1 : 0)].join(' ')
|
90
118
|
if $VERBOSE
|
91
119
|
puts "running #{cmd}" if $VERBOSE
|
92
120
|
else
|
@@ -94,12 +122,20 @@ class Mspire::Quant::Qspec
|
|
94
122
|
end
|
95
123
|
reply = `#{cmd}`
|
96
124
|
puts reply if $VERBOSE
|
97
|
-
outfile = tfile.path + '_' +
|
98
|
-
|
125
|
+
outfile = tfile.path + '_' + executable_base
|
126
|
+
system EXE[:getfdr], outfile
|
127
|
+
fdr_file = outfile + "_fdr"
|
128
|
+
puts "FDR_FILE: #{fdr_file} exists? #{fdr_file}" if $VERBOSE
|
129
|
+
results = self.class.results_array(fdr_file)
|
99
130
|
if opts[:keep]
|
100
131
|
local_outfile = File.join(Dir.pwd, File.basename(outfile))
|
132
|
+
local_fdrfile = File.join(Dir.pwd, File.basename(fdr_file))
|
101
133
|
FileUtils.cp(outfile, local_outfile, :verbose => $VERBOSE)
|
102
|
-
|
134
|
+
FileUtils.cp(fdr_file, local_fdrfile, :verbose => $VERBOSE)
|
135
|
+
if $VERBOSE
|
136
|
+
puts "(copy of) file returned from qspec: #{outfile}"
|
137
|
+
puts "(copy of) file returned from qspec: #{fdr_file}"
|
138
|
+
end
|
103
139
|
end
|
104
140
|
tfile.unlink
|
105
141
|
results
|
@@ -107,6 +143,10 @@ class Mspire::Quant::Qspec
|
|
107
143
|
|
108
144
|
# for version 2 of QSpec
|
109
145
|
# counts array is parallel to the experiment names passed in originally
|
110
|
-
Results = Struct.new(:protid, :counts_array, :bayes_factor, :fold_change, :rb_stat, :fdr, :flag)
|
146
|
+
#Results = Struct.new(:protid, :counts_array, :bayes_factor, :fold_change, :rb_stat, :fdr, :flag)
|
147
|
+
|
148
|
+
# for version 1.2.2 of QProt
|
149
|
+
# counts array is parallel to the experiment names passed in originally
|
150
|
+
Results = Struct.new(:protid, :counts_array, :log_fold_change, :z_statistic, :fdr, :fdr_up, :fdr_down)
|
111
151
|
end
|
112
152
|
|
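data/lib/mspire/version.rb
CHANGED
(hunk not captured in this extract; the summary above lists +1 -1)
data/script/peptide_hit_qvalues_to_spectral_counts_table.rb
CHANGED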
@@ -17,7 +17,6 @@ require 'mspire/quant/qspec'
|
|
17
17
|
require 'mspire/quant/cmdline'
|
18
18
|
require 'mspire/fasta'
|
19
19
|
|
20
|
-
|
21
20
|
require 'yaml'
|
22
21
|
require 'tempfile'
|
23
22
|
|
@@ -53,7 +52,7 @@ class Ruport::Data::Table
|
|
53
52
|
File.open(file,'w') do |out|
|
54
53
|
opt[:header].each {|line| out.puts "# #{line}" } if opt[:header]
|
55
54
|
out.puts self.column_names.join(delimiter)
|
56
|
-
self.data.each do |row|
|
55
|
+
self.sort_rows_by(:fdr).data.each do |row|
|
57
56
|
out.puts row.to_a.join(delimiter)
|
58
57
|
end
|
59
58
|
opt[:footer].each {|line| out.puts "# #{line}" } if opt[:footer]
|
@@ -87,16 +86,16 @@ writes to #{outfile}
|
|
87
86
|
group names can be arbitrarily defined
|
88
87
|
}
|
89
88
|
opt :fdr_percent, "%FDR as cutoff", :default => 1.0
|
90
|
-
opt :
|
89
|
+
opt :qprot, "return qprot results (executes qprot-param or qprot-paired). Requires :fasta. Only 2 groups currently allowed", :default => false
|
91
90
|
opt :descriptions, "include descriptions of proteins, requires :fasta", :default => false
|
92
|
-
opt :fasta, "the fasta file. Required for :
|
91
|
+
opt :fasta, "the fasta file. Required for :descriptions", :type => String
|
93
92
|
opt :outfile, "the to which file data are written", :default => outfile
|
94
93
|
opt :peptides, "also write peptide hits (to: #{pephits_outfile})", :default => false
|
95
94
|
opt :verbose, "speak up", :default => false
|
96
95
|
opt :count_type, "type of spectral counts (<spectral|aaseqcharge|aaseq>)", :default => 'spectral'
|
97
|
-
opt :
|
98
|
-
opt :
|
99
|
-
opt :
|
96
|
+
opt :qprot_normalize, "normalize spectral counts per run", :default => false
|
97
|
+
opt :qprot_keep_files, "keep a copy of the files submitted and returned from Qprot", :default => false
|
98
|
+
opt :qprot_remove_sparse_rows, "remove any row with only one non-zero value", :default => false
|
100
99
|
opt :version_tag, "pass in a version tag (e.g. pass in git describe --tags) for version record", :type => String
|
101
100
|
opt :write_subset, "(dev use only) write subset db", :default => false
|
102
101
|
end
|
@@ -112,8 +111,8 @@ if ARGV.size < 2
|
|
112
111
|
opts.educate && exit
|
113
112
|
end
|
114
113
|
|
115
|
-
if
|
116
|
-
puts "You must provide a fasta file with --fasta to use
|
114
|
+
if opt[:descriptions] && !opt[:fasta]
|
115
|
+
puts "You must provide a fasta file with --fasta to use descriptions!!"
|
117
116
|
opts.educate && exit
|
118
117
|
end
|
119
118
|
|
@@ -125,7 +124,7 @@ putsv "using: #{peptide_centric_db_file} as peptide centric db"
|
|
125
124
|
|
126
125
|
(samplename_to_filename, condition_to_samplenames, samplename_to_condition) = Mspire::Quant::Cmdline.args_to_hashes(ARGV)
|
127
126
|
|
128
|
-
raise ArgumentError, "must have 2 conditions for
|
127
|
+
raise ArgumentError, "must have 2 conditions for qprot to work!" if opt[:qprot] && condition_to_samplenames.size != 2
|
129
128
|
|
130
129
|
samplenames = samplename_to_filename.keys
|
131
130
|
|
@@ -134,22 +133,20 @@ class Mspire::Ident::PeptideHit
|
|
134
133
|
attr_accessor :protein_groups
|
135
134
|
end
|
136
135
|
|
137
|
-
class Mspire::Ident::Protein
|
138
|
-
attr_accessor :length
|
139
|
-
end
|
136
|
+
#class Mspire::Ident::Protein
|
137
|
+
# attr_accessor :length
|
138
|
+
#end
|
140
139
|
|
141
140
|
|
142
141
|
fdr_cutoff = opt[:fdr_percent] / 100
|
143
142
|
|
144
|
-
if opt[:
|
145
|
-
putsv "reading
|
143
|
+
if opt[:descriptions]
|
144
|
+
putsv "reading descriptions from #{opt[:fasta]}"
|
146
145
|
#Mspire::Fasta.protein_lengths_and_descriptions(opt[:fasta])
|
147
|
-
id_to_length = {}
|
148
146
|
id_to_desc = {}
|
149
147
|
Mspire::Fasta.foreach(opt[:fasta]) do |entry|
|
150
148
|
#acc = Mspire::Fasta.uniprot_id(entry.header)
|
151
149
|
acc = entry.accession
|
152
|
-
id_to_length[acc] = entry.length
|
153
150
|
id_to_desc[acc] = entry.definition[/^\S+\s(.*)/,1]
|
154
151
|
end
|
155
152
|
end
|
@@ -170,7 +167,6 @@ Mspire::Ident::Peptide::Db::IO.open(peptide_centric_db_file) do |peptide_to_prot
|
|
170
167
|
# update each peptide with its protein hits
|
171
168
|
protein_hits = peptide_to_proteins[hit.aaseq].map do |id|
|
172
169
|
protein = all_protein_hits[id]
|
173
|
-
protein.length = id_to_length[id] if id_to_length
|
174
170
|
protein.description = id_to_desc[id] if id_to_desc
|
175
171
|
protein
|
176
172
|
end
|
@@ -218,48 +214,43 @@ end
|
|
218
214
|
# each cell holds a SpectralCounts object, which hash 3 types of count data
|
219
215
|
counts_table = Ruport::Data::Table.new(:data => counts_data, :column_names => samplenames)
|
220
216
|
|
217
|
+
counts_table.add_columns( [:name, :ids, :description, :qprot_protname] )
|
218
|
+
counts_table.data.zip(protein_groups) do |row, pg|
|
219
|
+
best_id = pg.first # pg.sort_by {|prot| [prot.id, prot.length] }.first
|
220
|
+
row.name = best_id.description.andand.match(/ GN=([^\s]+) ?/).andand[1] || best_id.id
|
221
|
+
row.ids = pg.map(&:id).join(',')
|
222
|
+
row.description = best_id.description
|
223
|
+
row.qprot_protname = pg.map(&:id).join(":")
|
224
|
+
end
|
225
|
+
|
221
226
|
# return a list of ProteinGroupComparisons
|
222
|
-
if opt[:
|
227
|
+
if opt[:qprot]
|
223
228
|
|
224
|
-
|
225
|
-
|
226
|
-
|
229
|
+
if opt[:qprot_remove_sparse_rows]
|
230
|
+
newrows = counts_table.data.select do |row|
|
231
|
+
row.to_a[0,samplenames.size].select {|v| v > 0 }.size >= 2
|
232
|
+
end
|
233
|
+
counts_table = Ruport::Data::Table.new(:data => newrows, :column_names => counts_table.column_names)
|
227
234
|
end
|
228
|
-
|
229
|
-
|
230
|
-
|
235
|
+
|
236
|
+
# prepare data for qprot
|
237
|
+
condition_to_count_array = counts_table.column_names.select {|name| name.is_a?(String) }.map do |name|
|
238
|
+
[samplename_to_condition[name], counts_table.column(name)]
|
231
239
|
end
|
232
240
|
|
233
|
-
|
241
|
+
qprot_results = Mspire::Quant::Qspec.new(counts_table.column(:qprot_protname), condition_to_count_array).run(opt[:qprot_normalize], :keep => opt[:qprot_keep_files])
|
234
242
|
|
235
|
-
cols_to_add = [:
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
else
|
240
|
-
v
|
241
|
-
end
|
242
|
-
end
|
243
|
-
counts_table.add_columns to_add_as_headers
|
244
|
-
counts_table.data.zip(qspec_results) do |row, qspec_result|
|
243
|
+
cols_to_add = [:log_fold_change, :fdr, :fdr_up, :fdr_down]
|
244
|
+
|
245
|
+
counts_table.add_columns cols_to_add
|
246
|
+
counts_table.data.zip(qprot_results) do |row, qprot_result|
|
245
247
|
cols_to_add.each do |cat|
|
246
|
-
|
247
|
-
row[:decibans] = 10 * Math.log10(qspec_result[cat])
|
248
|
-
else
|
249
|
-
row[cat] = qspec_result[cat]
|
250
|
-
end
|
248
|
+
row[cat] = qprot_result[cat]
|
251
249
|
end
|
252
250
|
end
|
253
251
|
end
|
254
252
|
|
255
|
-
counts_table.
|
256
|
-
counts_table.data.zip(protein_groups) do |row, pg|
|
257
|
-
best_id = pg.sort_by {|prot| [prot.id, prot.length] }.first
|
258
|
-
row.name = best_id.description.andand.match(/ GN=([^\s]+) ?/).andand[1] || best_id.id
|
259
|
-
row.ids = pg.map(&:id).join(',')
|
260
|
-
row.description = best_id.description
|
261
|
-
end
|
262
|
-
|
253
|
+
counts_table.remove_column(:qprot_protname)
|
263
254
|
|
264
255
|
if opt[:peptides]
|
265
256
|
hits_table.each do |record|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: mspire
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.10.7.
|
4
|
+
version: 0.10.7.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- John T. Prince
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2014-
|
12
|
+
date: 2014-05-05 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|