mspire 0.10.7.1 → 0.10.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +12 -1
- data/lib/mspire/mzml/index_list.rb +2 -1
- data/lib/mspire/mzml/io_index.rb +2 -1
- data/lib/mspire/quant/qspec.rb +63 -23
- data/lib/mspire/version.rb +1 -1
- data/script/peptide_hit_qvalues_to_spectral_counts_table.rb +40 -49
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7c3d6fd2ccef3ca83f802127523c4518115d55d3
|
4
|
+
data.tar.gz: 99910a0e278af6f0d096c3fb9f06902977681e3b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b778a89bbe03de755756a267b772006aa26ced9780384519b194edbb40a2efb54055137e3f5ac81a10044a5d8f53d12ad514b17599d7923cb4bb85b78dc5cf6f
|
7
|
+
data.tar.gz: 0692eef670311afebe4549e8d551ec621c0f5dcc366b7a87e4fa71e129fd165a32fd28293b11e071e99bb7b74787cb9cd4914ba61a77b3d0485f204c05ffe875
|
data/README.md
CHANGED
@@ -56,12 +56,23 @@ objects associated with Mzml files.
|
|
56
56
|
```ruby
|
57
57
|
require 'mspire/mzml'
|
58
58
|
|
59
|
+
# get the intensity of the highest peak from each spectrum
|
60
|
+
intensities = Mspire::Mzml.foreach(mzml_file).map do |spectrum|
|
61
|
+
spectrum.intensities.max
|
62
|
+
end
|
63
|
+
|
64
|
+
# open the file for other operations
|
59
65
|
Mspire::Mzml.open(mzml_file) do |mzml|
|
66
|
+
# read each spectra
|
67
|
+
mzml.each do |spectrum|
|
68
|
+
# do something with each spectrum ...
|
69
|
+
end
|
60
70
|
|
61
|
-
# random access by index or id (even if file wasn't indexed)
|
71
|
+
# or random access by index or id (even if file wasn't indexed)
|
62
72
|
spectrum = mzml[0]
|
63
73
|
spectrum = mzml["controllerType=0 controllerNumber=1 scan=2"]
|
64
74
|
|
75
|
+
# some things to do with a spectrum
|
65
76
|
spectrum.mzs
|
66
77
|
spectrum.intensities
|
67
78
|
|
data/lib/mspire/mzml/index_list.rb
CHANGED
@@ -55,7 +55,8 @@ module Mspire
|
|
55
55
|
def read_index_list(io)
|
56
56
|
if (offset = index_offset(io))
|
57
57
|
io.seek(offset)
|
58
|
-
|
58
|
+
# TODO: pass in encoding (as second nil)
|
59
|
+
xml = Nokogiri::XML.parse(io.read, nil, nil, Parser::NOBLANKS)
|
59
60
|
index_list = xml.root
|
60
61
|
num_indices = index_list['count'].to_i
|
61
62
|
array = index_list.children.map do |index_n|
|
data/lib/mspire/mzml/io_index.rb
CHANGED
@@ -66,8 +66,9 @@ module Mspire
|
|
66
66
|
end
|
67
67
|
|
68
68
|
def xml_node_from_start_byte(start_byte)
|
69
|
+
# consider passing in @encoding from upstream object (as second nil):
|
69
70
|
xml = get_xml_string(start_byte)
|
70
|
-
Nokogiri::XML.parse(xml, nil,
|
71
|
+
Nokogiri::XML.parse(xml, nil, nil, Parser::NOBLANKS).root
|
71
72
|
end
|
72
73
|
|
73
74
|
def fetch_xml_node(index)
|
data/lib/mspire/quant/qspec.rb
CHANGED
@@ -2,6 +2,26 @@ module Mspire ; end
|
|
2
2
|
module Mspire::Quant ; end
|
3
3
|
|
4
4
|
class Mspire::Quant::Qspec
|
5
|
+
# This is my current best guess based on the behavior of the original QSpec
|
6
|
+
# and going into the source code and looking at the paired and param
|
7
|
+
# versions.
|
8
|
+
|
9
|
+
# qspec: discrete spectral count data
|
10
|
+
# qprot: continuous protein abundance data (could be non-discrete spectral
|
11
|
+
# counts or quantitation data)
|
12
|
+
# paired: one sample against another sample
|
13
|
+
# param: one sample against another sample but with one or more replicates
|
14
|
+
EXE = {
|
15
|
+
qspec: {
|
16
|
+
paired: 'qspec-paired', # <- the old qspec (use qspec here if you have old software)
|
17
|
+
param: 'qspec-param', # < the old qspecgp (use qspecgp if you have old software)
|
18
|
+
},
|
19
|
+
qprot: {
|
20
|
+
paired: 'qprot-paired',
|
21
|
+
param: 'qprot-param',
|
22
|
+
},
|
23
|
+
getfdr: 'getfdr',
|
24
|
+
}
|
5
25
|
|
6
26
|
# personal communication with Hyungwon Choi: "We typically use nburn=2000,
|
7
27
|
# niter=10000, which is quite sufficient to guarantee the reproducibility of
|
@@ -11,8 +31,6 @@ class Mspire::Quant::Qspec
|
|
11
31
|
INIT_HEADER = %w(protid protLen)
|
12
32
|
DELIMITER = "\t"
|
13
33
|
|
14
|
-
SUBMITTED_TO_QSPEC = 'submitted_to_qspec.txt'
|
15
|
-
|
16
34
|
# takes an ordered list of conditions ['cond1', 'cond1', 'cond2', 'cond2'] and
|
17
35
|
# returns an array of ints [0,0,0,1,1,1...]
|
18
36
|
def self.conditions_to_ints(conditions)
|
@@ -29,30 +47,35 @@ class Mspire::Quant::Qspec
|
|
29
47
|
end
|
30
48
|
|
31
49
|
# returns an array of Results structs which is each row of the returned file
|
32
|
-
# works with
|
50
|
+
# works with version 1.2.2 of Qprot
|
33
51
|
def self.results_array(resultsfile)
|
34
52
|
rows = IO.readlines(resultsfile).map {|line| line.chomp.split("\t") }
|
35
53
|
headers = rows.shift
|
36
|
-
|
54
|
+
start_log_fold = headers.index {|v| v =~ /LogFoldChange/i }
|
37
55
|
rows.map do |row|
|
38
56
|
data = [row[0]]
|
39
|
-
data.push( row[1...
|
40
|
-
data.push( *row[
|
41
|
-
data.push( row[start_bayes+4] )
|
57
|
+
data.push( row[1...start_log_fold].map(&:to_f) )
|
58
|
+
data.push( *row[start_log_fold,5].map(&:to_f) )
|
42
59
|
Results.new(*data)
|
43
60
|
end
|
44
61
|
end
|
45
62
|
|
46
63
|
# returns the right executable based on the array of conditions
|
47
|
-
def
|
64
|
+
def executable
|
48
65
|
biggest_size = conditions.group_by {|v| v }.values.map(&:size).max
|
49
|
-
(biggest_size >= 3) ?
|
66
|
+
EXE[@protnames ? :qprot : :qspec][(biggest_size >= 3) ? :param : :paired]
|
50
67
|
end
|
51
68
|
|
52
|
-
#
|
69
|
+
# protname is a list of protein names.
|
70
|
+
# by default, qprot will be run. If you really want qspec to be run, then
|
71
|
+
# supply a [protname, length] doublet in place of each protname.
|
53
72
|
# condition_to_count_array is an array doublets: [condition, array_of_counts]
|
54
|
-
def initialize(
|
55
|
-
@
|
73
|
+
def initialize(protnames, condition_to_count_array)
|
74
|
+
@protnames = protnames
|
75
|
+
if @protnames.first.is_a?(Array)
|
76
|
+
@protname_length_pairs = @protnames
|
77
|
+
@protnames = nil
|
78
|
+
end
|
56
79
|
@condition_to_count_array = condition_to_count_array
|
57
80
|
end
|
58
81
|
|
@@ -62,9 +85,11 @@ class Mspire::Quant::Qspec
|
|
62
85
|
|
63
86
|
# writes a qspec formatted file to filename
|
64
87
|
def write(filename)
|
65
|
-
|
66
|
-
header_cats
|
67
|
-
|
88
|
+
header_cats = %w(protid)
|
89
|
+
header_cats << 'protLen' if @protname_length_pairs
|
90
|
+
header_cats.push(*Mspire::Quant::Qspec.conditions_to_ints(conditions))
|
91
|
+
ar = @protnames || @protname_length_pairs
|
92
|
+
rows = ar.map {|obj| Array(obj) }
|
68
93
|
@condition_to_count_array.each do |cond,counts|
|
69
94
|
rows.zip(counts) {|row,cnt| row << cnt }
|
70
95
|
end
|
@@ -77,16 +102,19 @@ class Mspire::Quant::Qspec
|
|
77
102
|
# returns an array of Qspec::Results objects (each object can be considered
|
78
103
|
# a row of data)
|
79
104
|
def run(normalize=true, opts={})
|
105
|
+
exe = executable
|
106
|
+
puts "using #{exe}" if $VERBOSE
|
107
|
+
executable_base = exe.split('-')[0]
|
108
|
+
|
80
109
|
puts "normalize: #{normalize}" if $VERBOSE
|
81
|
-
tfile = Tempfile.new(
|
110
|
+
tfile = Tempfile.new(executable_base)
|
82
111
|
write(tfile.path)
|
83
112
|
if opts[:keep]
|
84
113
|
local_file = File.join(Dir.pwd,File.basename(tfile.path))
|
85
114
|
FileUtils.cp(tfile.path, local_file, :verbose => $VERBOSE)
|
86
|
-
puts "(copy of) file submitted to
|
115
|
+
puts "(copy of) file submitted to #{exe}: #{local_file}" if $VERBOSE
|
87
116
|
end
|
88
|
-
|
89
|
-
cmd = [qspec_exe, tfile.path, NBURNIN, NITER, (normalize ? 1 : 0)].join(' ')
|
117
|
+
cmd = [exe, tfile.path, NBURNIN, NITER, (normalize ? 1 : 0)].join(' ')
|
90
118
|
if $VERBOSE
|
91
119
|
puts "running #{cmd}" if $VERBOSE
|
92
120
|
else
|
@@ -94,12 +122,20 @@ class Mspire::Quant::Qspec
|
|
94
122
|
end
|
95
123
|
reply = `#{cmd}`
|
96
124
|
puts reply if $VERBOSE
|
97
|
-
outfile = tfile.path + '_' +
|
98
|
-
|
125
|
+
outfile = tfile.path + '_' + executable_base
|
126
|
+
system EXE[:getfdr], outfile
|
127
|
+
fdr_file = outfile + "_fdr"
|
128
|
+
puts "FDR_FILE: #{fdr_file} exists? #{fdr_file}" if $VERBOSE
|
129
|
+
results = self.class.results_array(fdr_file)
|
99
130
|
if opts[:keep]
|
100
131
|
local_outfile = File.join(Dir.pwd, File.basename(outfile))
|
132
|
+
local_fdrfile = File.join(Dir.pwd, File.basename(fdr_file))
|
101
133
|
FileUtils.cp(outfile, local_outfile, :verbose => $VERBOSE)
|
102
|
-
|
134
|
+
FileUtils.cp(fdr_file, local_fdrfile, :verbose => $VERBOSE)
|
135
|
+
if $VERBOSE
|
136
|
+
puts "(copy of) file returned from qspec: #{outfile}"
|
137
|
+
puts "(copy of) file returned from qspec: #{fdr_file}"
|
138
|
+
end
|
103
139
|
end
|
104
140
|
tfile.unlink
|
105
141
|
results
|
@@ -107,6 +143,10 @@ class Mspire::Quant::Qspec
|
|
107
143
|
|
108
144
|
# for version 2 of QSpec
|
109
145
|
# counts array is parallel to the experiment names passed in originally
|
110
|
-
Results = Struct.new(:protid, :counts_array, :bayes_factor, :fold_change, :rb_stat, :fdr, :flag)
|
146
|
+
#Results = Struct.new(:protid, :counts_array, :bayes_factor, :fold_change, :rb_stat, :fdr, :flag)
|
147
|
+
|
148
|
+
# for version 1.2.2 of QProt
|
149
|
+
# counts array is parallel to the experiment names passed in originally
|
150
|
+
Results = Struct.new(:protid, :counts_array, :log_fold_change, :z_statistic, :fdr, :fdr_up, :fdr_down)
|
111
151
|
end
|
112
152
|
|
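data/lib/mspire/version.rb
CHANGED
(hunk for this file was not captured; the file list above records a +1 -1 change)
data/script/peptide_hit_qvalues_to_spectral_counts_table.rb
CHANGED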
@@ -17,7 +17,6 @@ require 'mspire/quant/qspec'
|
|
17
17
|
require 'mspire/quant/cmdline'
|
18
18
|
require 'mspire/fasta'
|
19
19
|
|
20
|
-
|
21
20
|
require 'yaml'
|
22
21
|
require 'tempfile'
|
23
22
|
|
@@ -53,7 +52,7 @@ class Ruport::Data::Table
|
|
53
52
|
File.open(file,'w') do |out|
|
54
53
|
opt[:header].each {|line| out.puts "# #{line}" } if opt[:header]
|
55
54
|
out.puts self.column_names.join(delimiter)
|
56
|
-
self.data.each do |row|
|
55
|
+
self.sort_rows_by(:fdr).data.each do |row|
|
57
56
|
out.puts row.to_a.join(delimiter)
|
58
57
|
end
|
59
58
|
opt[:footer].each {|line| out.puts "# #{line}" } if opt[:footer]
|
@@ -87,16 +86,16 @@ writes to #{outfile}
|
|
87
86
|
group names can be arbitrarily defined
|
88
87
|
}
|
89
88
|
opt :fdr_percent, "%FDR as cutoff", :default => 1.0
|
90
|
-
opt :
|
89
|
+
opt :qprot, "return qprot results (executes qprot-param or qprot-paired). Requires :fasta. Only 2 groups currently allowed", :default => false
|
91
90
|
opt :descriptions, "include descriptions of proteins, requires :fasta", :default => false
|
92
|
-
opt :fasta, "the fasta file. Required for :
|
91
|
+
opt :fasta, "the fasta file. Required for :descriptions", :type => String
|
93
92
|
opt :outfile, "the to which file data are written", :default => outfile
|
94
93
|
opt :peptides, "also write peptide hits (to: #{pephits_outfile})", :default => false
|
95
94
|
opt :verbose, "speak up", :default => false
|
96
95
|
opt :count_type, "type of spectral counts (<spectral|aaseqcharge|aaseq>)", :default => 'spectral'
|
97
|
-
opt :
|
98
|
-
opt :
|
99
|
-
opt :
|
96
|
+
opt :qprot_normalize, "normalize spectral counts per run", :default => false
|
97
|
+
opt :qprot_keep_files, "keep a copy of the files submitted and returned from Qprot", :default => false
|
98
|
+
opt :qprot_remove_sparse_rows, "remove any row with only one non-zero value", :default => false
|
100
99
|
opt :version_tag, "pass in a version tag (e.g. pass in git describe --tags) for version record", :type => String
|
101
100
|
opt :write_subset, "(dev use only) write subset db", :default => false
|
102
101
|
end
|
@@ -112,8 +111,8 @@ if ARGV.size < 2
|
|
112
111
|
opts.educate && exit
|
113
112
|
end
|
114
113
|
|
115
|
-
if
|
116
|
-
puts "You must provide a fasta file with --fasta to use
|
114
|
+
if opt[:descriptions] && !opt[:fasta]
|
115
|
+
puts "You must provide a fasta file with --fasta to use descriptions!!"
|
117
116
|
opts.educate && exit
|
118
117
|
end
|
119
118
|
|
@@ -125,7 +124,7 @@ putsv "using: #{peptide_centric_db_file} as peptide centric db"
|
|
125
124
|
|
126
125
|
(samplename_to_filename, condition_to_samplenames, samplename_to_condition) = Mspire::Quant::Cmdline.args_to_hashes(ARGV)
|
127
126
|
|
128
|
-
raise ArgumentError, "must have 2 conditions for
|
127
|
+
raise ArgumentError, "must have 2 conditions for qprot to work!" if opt[:qprot] && condition_to_samplenames.size != 2
|
129
128
|
|
130
129
|
samplenames = samplename_to_filename.keys
|
131
130
|
|
@@ -134,22 +133,20 @@ class Mspire::Ident::PeptideHit
|
|
134
133
|
attr_accessor :protein_groups
|
135
134
|
end
|
136
135
|
|
137
|
-
class Mspire::Ident::Protein
|
138
|
-
attr_accessor :length
|
139
|
-
end
|
136
|
+
#class Mspire::Ident::Protein
|
137
|
+
# attr_accessor :length
|
138
|
+
#end
|
140
139
|
|
141
140
|
|
142
141
|
fdr_cutoff = opt[:fdr_percent] / 100
|
143
142
|
|
144
|
-
if opt[:
|
145
|
-
putsv "reading
|
143
|
+
if opt[:descriptions]
|
144
|
+
putsv "reading descriptions from #{opt[:fasta]}"
|
146
145
|
#Mspire::Fasta.protein_lengths_and_descriptions(opt[:fasta])
|
147
|
-
id_to_length = {}
|
148
146
|
id_to_desc = {}
|
149
147
|
Mspire::Fasta.foreach(opt[:fasta]) do |entry|
|
150
148
|
#acc = Mspire::Fasta.uniprot_id(entry.header)
|
151
149
|
acc = entry.accession
|
152
|
-
id_to_length[acc] = entry.length
|
153
150
|
id_to_desc[acc] = entry.definition[/^\S+\s(.*)/,1]
|
154
151
|
end
|
155
152
|
end
|
@@ -170,7 +167,6 @@ Mspire::Ident::Peptide::Db::IO.open(peptide_centric_db_file) do |peptide_to_prot
|
|
170
167
|
# update each peptide with its protein hits
|
171
168
|
protein_hits = peptide_to_proteins[hit.aaseq].map do |id|
|
172
169
|
protein = all_protein_hits[id]
|
173
|
-
protein.length = id_to_length[id] if id_to_length
|
174
170
|
protein.description = id_to_desc[id] if id_to_desc
|
175
171
|
protein
|
176
172
|
end
|
@@ -218,48 +214,43 @@ end
|
|
218
214
|
# each cell holds a SpectralCounts object, which hash 3 types of count data
|
219
215
|
counts_table = Ruport::Data::Table.new(:data => counts_data, :column_names => samplenames)
|
220
216
|
|
217
|
+
counts_table.add_columns( [:name, :ids, :description, :qprot_protname] )
|
218
|
+
counts_table.data.zip(protein_groups) do |row, pg|
|
219
|
+
best_id = pg.first # pg.sort_by {|prot| [prot.id, prot.length] }.first
|
220
|
+
row.name = best_id.description.andand.match(/ GN=([^\s]+) ?/).andand[1] || best_id.id
|
221
|
+
row.ids = pg.map(&:id).join(',')
|
222
|
+
row.description = best_id.description
|
223
|
+
row.qprot_protname = pg.map(&:id).join(":")
|
224
|
+
end
|
225
|
+
|
221
226
|
# return a list of ProteinGroupComparisons
|
222
|
-
if opt[:
|
227
|
+
if opt[:qprot]
|
223
228
|
|
224
|
-
|
225
|
-
|
226
|
-
|
229
|
+
if opt[:qprot_remove_sparse_rows]
|
230
|
+
newrows = counts_table.data.select do |row|
|
231
|
+
row.to_a[0,samplenames.size].select {|v| v > 0 }.size >= 2
|
232
|
+
end
|
233
|
+
counts_table = Ruport::Data::Table.new(:data => newrows, :column_names => counts_table.column_names)
|
227
234
|
end
|
228
|
-
|
229
|
-
|
230
|
-
|
235
|
+
|
236
|
+
# prepare data for qprot
|
237
|
+
condition_to_count_array = counts_table.column_names.select {|name| name.is_a?(String) }.map do |name|
|
238
|
+
[samplename_to_condition[name], counts_table.column(name)]
|
231
239
|
end
|
232
240
|
|
233
|
-
|
241
|
+
qprot_results = Mspire::Quant::Qspec.new(counts_table.column(:qprot_protname), condition_to_count_array).run(opt[:qprot_normalize], :keep => opt[:qprot_keep_files])
|
234
242
|
|
235
|
-
cols_to_add = [:
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
else
|
240
|
-
v
|
241
|
-
end
|
242
|
-
end
|
243
|
-
counts_table.add_columns to_add_as_headers
|
244
|
-
counts_table.data.zip(qspec_results) do |row, qspec_result|
|
243
|
+
cols_to_add = [:log_fold_change, :fdr, :fdr_up, :fdr_down]
|
244
|
+
|
245
|
+
counts_table.add_columns cols_to_add
|
246
|
+
counts_table.data.zip(qprot_results) do |row, qprot_result|
|
245
247
|
cols_to_add.each do |cat|
|
246
|
-
|
247
|
-
row[:decibans] = 10 * Math.log10(qspec_result[cat])
|
248
|
-
else
|
249
|
-
row[cat] = qspec_result[cat]
|
250
|
-
end
|
248
|
+
row[cat] = qprot_result[cat]
|
251
249
|
end
|
252
250
|
end
|
253
251
|
end
|
254
252
|
|
255
|
-
counts_table.
|
256
|
-
counts_table.data.zip(protein_groups) do |row, pg|
|
257
|
-
best_id = pg.sort_by {|prot| [prot.id, prot.length] }.first
|
258
|
-
row.name = best_id.description.andand.match(/ GN=([^\s]+) ?/).andand[1] || best_id.id
|
259
|
-
row.ids = pg.map(&:id).join(',')
|
260
|
-
row.description = best_id.description
|
261
|
-
end
|
262
|
-
|
253
|
+
counts_table.remove_column(:qprot_protname)
|
263
254
|
|
264
255
|
if opt[:peptides]
|
265
256
|
hits_table.each do |record|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: mspire
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.10.7.
|
4
|
+
version: 0.10.7.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- John T. Prince
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2014-
|
12
|
+
date: 2014-05-05 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|