ms-quant 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +6 -6
- data/VERSION +1 -1
- data/bin/peptide_hit_qvalues_to_spectral_counts_table.rb +151 -29
- data/lib/ms/quant/qspec.rb +91 -0
- metadata +4 -3
data/Rakefile
CHANGED
@@ -29,12 +29,12 @@ Rake::TestTask.new(:spec) do |spec|
|
|
29
29
|
spec.verbose = true
|
30
30
|
end
|
31
31
|
|
32
|
-
require 'rcov/rcovtask'
|
33
|
-
Rcov::RcovTask.new do |spec|
|
34
|
-
spec.libs << 'spec'
|
35
|
-
spec.pattern = 'spec/**/*_spec.rb'
|
36
|
-
spec.verbose = true
|
37
|
-
end
|
32
|
+
#require 'rcov/rcovtask'
|
33
|
+
#Rcov::RcovTask.new do |spec|
|
34
|
+
# spec.libs << 'spec'
|
35
|
+
# spec.pattern = 'spec/**/*_spec.rb'
|
36
|
+
# spec.verbose = true
|
37
|
+
#end
|
38
38
|
|
39
39
|
task :default => :spec
|
40
40
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.
|
1
|
+
0.0.3
|
@@ -4,37 +4,108 @@ require 'ms/ident/peptide_hit/qvalue'
|
|
4
4
|
require 'ms/ident/protein_hit'
|
5
5
|
require 'ms/ident/peptide/db'
|
6
6
|
require 'ms/quant/spectral_counts'
|
7
|
+
require 'ms/quant/qspec'
|
8
|
+
|
9
|
+
require 'yaml'
|
10
|
+
require 'tempfile'
|
7
11
|
|
8
12
|
require 'trollop'
|
9
13
|
|
14
|
+
# inverse from Tilo Sloboda (now in facets)
|
15
|
+
|
16
|
+
class Hash
|
17
|
+
def inverse
|
18
|
+
i = Hash.new
|
19
|
+
self.each_pair do |k,v|
|
20
|
+
if (Array === v) ; v.each{ |x| i[x] = ( i.has_key?(x) ? [k,i[x]].flatten : k ) }
|
21
|
+
else ; i[v] = ( i.has_key?(v) ? [k,i[v]].flatten : k ) end
|
22
|
+
end ; i
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
|
10
27
|
def putsv(*args)
|
11
28
|
if $VERBOSE
|
12
29
|
puts(*args) ; $stdout.flush
|
13
30
|
end
|
14
31
|
end
|
15
32
|
|
33
|
+
def basename(file)
|
34
|
+
base = file.chomp(File.extname(file))
|
35
|
+
base=base.chomp(File.extname(base)) if File.extname(base) == '.phq'
|
36
|
+
base
|
37
|
+
end
|
38
|
+
|
39
|
+
|
40
|
+
outfile = "spectral_counts.tsv"
|
41
|
+
delimiter = "\t"
|
42
|
+
|
16
43
|
opts = Trollop::Parser.new do
|
17
|
-
banner %Q{usage: #{File.basename(__FILE__)} peptide_centric_db.yml,
|
44
|
+
banner %Q{usage: #{File.basename(__FILE__)} <fasta>.peptide_centric_db.yml group1=f1.psq,f2.psq group2=f3.psq,f4.psq
|
45
|
+
or (each file a group): #{File.basename(__FILE__)} <fasta>.peptide_centric_db.yml file1.psq file2.psq ...
|
46
|
+
|
47
|
+
writes to #{outfile}
|
48
|
+
group names can be arbitrarily defined
|
49
|
+
psq is really .psq.tsv file
|
18
50
|
}
|
19
|
-
opt :names, "array of names for the table (otherwise filenames)", :type => String
|
20
51
|
opt :fdr_percent, "%FDR as cutoff", :default => 1.0
|
21
|
-
opt :
|
52
|
+
opt :qspec, "return qspec results (executes qspec or qspecgp). Requires :fasta. Only 2 groups currently allowed", :default => false
|
53
|
+
opt :descriptions, "include descriptions of proteins, requires :fasta", :default => false
|
54
|
+
opt :fasta, "the fasta file. Required for :qspec and :descriptions", :type => String
|
55
|
+
opt :outfile, "the to which file data are written", :default => outfile
|
56
|
+
opt :verbose, "speak up", :default => false
|
57
|
+
opt :count_type, "type of spectral counts (<spectral|aaseqcharge|aaseq>)", :default => 'spectral'
|
58
|
+
opt :qspec_normalize, "normalize spectral counts per run", :default => false
|
59
|
+
opt :write_subset, "(dev use only) write subset db", :default => false
|
22
60
|
end
|
23
61
|
|
24
62
|
opt = opts.parse(ARGV)
|
63
|
+
opt[:count_type] = opt[:count_type].to_sym
|
64
|
+
|
65
|
+
$VERBOSE = opt.delete(:verbose)
|
25
66
|
|
26
67
|
if ARGV.size < 2
|
27
68
|
opts.educate && exit
|
28
69
|
end
|
29
70
|
|
71
|
+
if (opt[:qspec] || opt[:descriptions]) && !opt[:fasta]
|
72
|
+
puts "You must provide a fasta file with --fasta to use qspec or descriptions!!"
|
73
|
+
opts.educate && exit
|
74
|
+
end
|
75
|
+
|
30
76
|
peptide_centric_db_file = ARGV.shift
|
77
|
+
raise ArgumentError, "need .yml file for peptide centric db" unless File.extname(peptide_centric_db_file) == '.yml'
|
78
|
+
putsv "using: #{peptide_centric_db_file} as peptide centric db"
|
79
|
+
|
80
|
+
# groupname => files
|
81
|
+
condition_to_samplenames = {}
|
82
|
+
samplename_to_filename = {}
|
83
|
+
ARGV.each do |arg|
|
84
|
+
(condition, files) =
|
85
|
+
if arg.include?('=')
|
86
|
+
(condition, filestring) = arg.split('=')
|
87
|
+
[condition, filestring.split(',')]
|
88
|
+
else
|
89
|
+
[basename(arg), [arg]]
|
90
|
+
end
|
91
|
+
reptag = ARGV.size
|
92
|
+
sample_to_file_pairs = files.each_with_index.map {|file,i| ["#{condition}-rep#{i+1}", file] }
|
93
|
+
sample_to_file_pairs.each {|name,file| samplename_to_filename[name] = file }
|
94
|
+
condition_to_samplenames[condition] = sample_to_file_pairs.map(&:first)
|
95
|
+
end
|
31
96
|
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
97
|
+
|
98
|
+
if $VERBOSE
|
99
|
+
puts "** condition: sample_names"
|
100
|
+
puts condition_to_samplenames.to_yaml
|
101
|
+
puts "** samplename: filename"
|
102
|
+
puts samplename_to_filename.to_yaml
|
36
103
|
end
|
37
104
|
|
105
|
+
raise ArgumentError, "must have 2 conditions for qspec!" if opt[:qspec] && condition_to_samplenames.size != 2
|
106
|
+
|
107
|
+
samplenames = samplename_to_filename.keys
|
108
|
+
|
38
109
|
class Ms::Ident::PeptideHit
|
39
110
|
attr_accessor :experiment_name
|
40
111
|
end
|
@@ -42,11 +113,9 @@ fdr_cutoff = opt[:fdr_percent] / 100
|
|
42
113
|
|
43
114
|
start=Time.now
|
44
115
|
|
45
|
-
|
46
|
-
|
47
|
-
ar_of_peptide_hit_ars = Ms::Ident::Peptide::Db::IO.open(peptide_centric_db_file) do |peptide_to_proteins|
|
116
|
+
ar_of_pephit_ars = Ms::Ident::Peptide::Db::IO.open(peptide_centric_db_file) do |peptide_to_proteins|
|
48
117
|
putsv "#{Time.now-start} seconds to read #{peptide_centric_db_file}"
|
49
|
-
|
118
|
+
samplename_to_filename.map do |sample, file|
|
50
119
|
peptide_hits = Ms::Ident::PeptideHit::Qvalue.from_file(file)
|
51
120
|
putsv "#{file}: #{peptide_hits.size} hits"
|
52
121
|
peptide_hits.select! do |hit|
|
@@ -54,7 +123,7 @@ ar_of_peptide_hit_ars = Ms::Ident::Peptide::Db::IO.open(peptide_centric_db_file)
|
|
54
123
|
# update each peptide with its protein hits
|
55
124
|
prot_ids = peptide_to_proteins[hit.aaseq]
|
56
125
|
if prot_ids
|
57
|
-
hit.experiment_name =
|
126
|
+
hit.experiment_name = sample
|
58
127
|
hit.proteins = prot_ids
|
59
128
|
else ; false end
|
60
129
|
else
|
@@ -65,31 +134,27 @@ ar_of_peptide_hit_ars = Ms::Ident::Peptide::Db::IO.open(peptide_centric_db_file)
|
|
65
134
|
end
|
66
135
|
end
|
67
136
|
|
68
|
-
if opt[:write_subset]
|
137
|
+
if opt[:write_subset]
|
69
138
|
aaseqs_to_prots = {}
|
70
|
-
|
71
|
-
|
72
|
-
aaseqs_to_prots[pephit.aaseq] = pephit.proteins
|
73
|
-
end
|
139
|
+
ar_of_pephit_ars.flatten(1).each do |pephit|
|
140
|
+
aaseqs_to_prots[pephit.aaseq] = pephit.proteins
|
74
141
|
end
|
75
142
|
outfile = "peptidecentric_subset.yml"
|
76
143
|
puts "writing #{outfile} with #{aaseqs_to_prots.size} aaseq->protids"
|
77
144
|
File.open(outfile,'w') do |out|
|
78
145
|
aaseqs_to_prots.each do |k,v|
|
79
|
-
out.puts(%Q{#{k}: #{v.
|
146
|
+
out.puts(%Q{#{k}: #{v.join("\t") }})
|
80
147
|
end
|
81
148
|
end
|
82
149
|
end
|
83
150
|
|
84
|
-
$VERBOSE = true
|
85
151
|
if $VERBOSE
|
86
|
-
|
87
|
-
|
152
|
+
samplenames.zip(ar_of_pephit_ars) do |samplename, pep_ar|
|
153
|
+
putsv "#{samplename}: #{pep_ar.size}"
|
88
154
|
end
|
89
155
|
end
|
90
156
|
|
91
|
-
all_peptide_hits =
|
92
|
-
|
157
|
+
all_peptide_hits = ar_of_pephit_ars.flatten(1)
|
93
158
|
|
94
159
|
# because peptide_hit#proteins yields id strings (which hash properly),
|
95
160
|
# each protein group is an array of
|
@@ -102,7 +167,7 @@ end
|
|
102
167
|
|
103
168
|
# partition them all out by filename
|
104
169
|
|
105
|
-
|
170
|
+
counts_parallel_to_names_with_counts_per_group = samplenames.map do |name|
|
106
171
|
pep_hit_to_prot_groups = Hash.new {|h,k| h[k] = [] }
|
107
172
|
groups_of_pephits = protein_groups.map do |prot_group|
|
108
173
|
pep_hits = prot_group.peptide_hits.select {|hit| hit.experiment_name == name }
|
@@ -115,12 +180,69 @@ ar_of_count_data = opt[:names].map do |name|
|
|
115
180
|
#end
|
116
181
|
end
|
117
182
|
|
118
|
-
|
119
|
-
|
183
|
+
if opt[:qspec] || opt[:descriptions]
|
184
|
+
putsv "reading lengths and descriptions from #{opt[:fasta]}"
|
185
|
+
(id_to_length, id_to_desc) = Ms::Fasta.protein_lengths_and_descriptions(opt[:fasta])
|
186
|
+
end
|
187
|
+
|
188
|
+
samplename_to_condition = condition_to_samplenames.inverse
|
189
|
+
|
190
|
+
### OUTPUT TABLE
|
191
|
+
header_cats = samplenames.map.to_a
|
192
|
+
|
193
|
+
ar_of_rows = counts_parallel_to_names_with_counts_per_group.map do |counts_per_group|
|
194
|
+
counts_per_group.map(&opt[:count_type])
|
195
|
+
end.transpose
|
120
196
|
|
121
|
-
|
122
|
-
|
123
|
-
|
197
|
+
if opt[:qspec]
|
198
|
+
all_conditions = samplenames.map {|sn| samplename_to_condition[sn] }
|
199
|
+
condition_to_count_array = all_conditions.zip(counts_parallel_to_names_with_counts_per_group).map do |condition, counts_par_groups|
|
200
|
+
[condition, counts_par_groups.map(&opt[:count_type])]
|
201
|
+
end
|
202
|
+
|
203
|
+
name_length_pairs = protein_groups.map do |pg|
|
204
|
+
# prefer swissprot (sp) proteins over TrEMBL (tr) and shorter protein
|
205
|
+
# lengths over longer lengths
|
206
|
+
best_guess_protein_id = pg.sort_by {|prot_id| [prot_id, -id_to_length[prot_id]] }.first
|
207
|
+
length = id_to_length[best_guess_protein_id]
|
208
|
+
[pg.join(":"), length]
|
209
|
+
end
|
210
|
+
|
211
|
+
putsv "qspec to normalize counts: #{opt[:qspec_normalize]}"
|
212
|
+
qspec_results = Ms::Quant::Qspec.new(name_length_pairs, condition_to_count_array).run(opt[:qspec_normalize])
|
213
|
+
|
214
|
+
to_add = [:fdr, :bayes_factor, :fold_change]
|
215
|
+
header_cats.push(*to_add)
|
216
|
+
qspec_results.zip(ar_of_rows) do |zipped|
|
217
|
+
(result, row) = zipped
|
218
|
+
row.push(*to_add.map {|v| result.send(v) })
|
219
|
+
end
|
124
220
|
end
|
125
221
|
|
222
|
+
header_cats.push( *%w(BestID AllIDs) )
|
223
|
+
header_cats.push( 'Description' ) if opt[:descriptions]
|
224
|
+
|
225
|
+
protein_groups.zip(ar_of_rows) do |zipped|
|
226
|
+
(pg, row) = zipped
|
227
|
+
# swiss-prot and then the shortest
|
228
|
+
best_protid = pg.sort_by {|prot_id| [prot_id, -id_to_length[prot_id]] }.first
|
229
|
+
(gene_id, desc) =
|
230
|
+
if opt[:descriptions]
|
231
|
+
desc = id_to_desc[best_protid]
|
232
|
+
gene_id = (md=desc.match(/ GN=(\w+) ?/)) ? md[1] : best_protid
|
233
|
+
[gene_id, desc]
|
234
|
+
else
|
235
|
+
[best_protid, nil]
|
236
|
+
end
|
237
|
+
row << gene_id << pg.join(',')
|
238
|
+
row.push(desc) if desc
|
239
|
+
end
|
240
|
+
|
241
|
+
### SORT???
|
242
|
+
|
243
|
+
File.open(opt[:outfile],'w') do |out|
|
244
|
+
out.puts header_cats.join(delimiter)
|
245
|
+
ar_of_rows.each {|row| out.puts row.join(delimiter) }
|
246
|
+
putsv "wrote: #{opt[:outfile]}"
|
247
|
+
end
|
126
248
|
|
@@ -0,0 +1,91 @@
|
|
1
|
+
module Ms ; end
|
2
|
+
module Ms::Quant ; end
|
3
|
+
|
4
|
+
class Ms::Quant::Qspec
|
5
|
+
|
6
|
+
NBURNIN = 50 # need to check
|
7
|
+
NITER = 2000 # check
|
8
|
+
INIT_HEADER = %w(protid protLen)
|
9
|
+
DELIMITER = "\t"
|
10
|
+
|
11
|
+
# takes an ordered list of conditions ['cond1', 'cond1', 'cond2', 'cond2'] and
|
12
|
+
# returns an array of ints [0,0,1,1] (the int increments each time the condition changes)
|
13
|
+
def self.conditions_to_ints(conditions)
|
14
|
+
i = 0
|
15
|
+
current_condition = conditions.first
|
16
|
+
conditions.map do |cond|
|
17
|
+
if current_condition == cond ; i
|
18
|
+
else
|
19
|
+
i += 1
|
20
|
+
current_condition = cond
|
21
|
+
i
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
# returns an array of Results structs, one per data row of the results file
|
27
|
+
# works with V2 of QSpec
|
28
|
+
def self.results_array(resultsfile)
|
29
|
+
rows = IO.readlines(resultsfile).map {|line| line.chomp.split("\t") }
|
30
|
+
headers = rows.shift
|
31
|
+
rows.map do |row|
|
32
|
+
data = [row[0]]
|
33
|
+
data.push( *row[1,2].map(&:to_i) )
|
34
|
+
data.push( *row[3,4].map(&:to_f) )
|
35
|
+
data.push( row[7] )
|
36
|
+
Results.new(*data)
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
# returns the right executable based on the array of conditions
|
41
|
+
def self.executable(conditions)
|
42
|
+
biggest_size = conditions.group_by {|v| v }.values.map(&:size).max
|
43
|
+
(biggest_size >= 3) ? 'qspecgp' : 'qspec'
|
44
|
+
end
|
45
|
+
|
46
|
+
# protname_length_pairs is an array of doublets: [protname, length]
|
47
|
+
# condition_to_count_array is an array of doublets: [condition, array_of_counts]
|
48
|
+
def initialize(protname_length_pairs, condition_to_count_array)
|
49
|
+
@protname_length_pairs = protname_length_pairs
|
50
|
+
@condition_to_count_array = condition_to_count_array
|
51
|
+
end
|
52
|
+
|
53
|
+
def conditions
|
54
|
+
@condition_to_count_array.map(&:first)
|
55
|
+
end
|
56
|
+
|
57
|
+
# writes a qspec formatted file to filename
|
58
|
+
def write(filename)
|
59
|
+
ints = Ms::Quant::Qspec.conditions_to_ints(conditions)
|
60
|
+
header_cats = INIT_HEADER + ints
|
61
|
+
rows = @protname_length_pairs.map {|pair| pair.map.to_a }
|
62
|
+
@condition_to_count_array.each do |cond,counts|
|
63
|
+
rows.zip(counts) {|row,cnt| row << cnt }
|
64
|
+
end
|
65
|
+
File.open(filename,'w') do |out|
|
66
|
+
out.puts header_cats.join(DELIMITER)
|
67
|
+
rows.each {|row| out.puts row.join(DELIMITER) }
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
def run(normalize=true, opts={})
|
72
|
+
puts "normalize: #{normalize}" if $VERBOSE
|
73
|
+
tfile = Tempfile.new("qspec")
|
74
|
+
write(tfile.path)
|
75
|
+
qspec_exe = self.class.executable(conditions)
|
76
|
+
cmd = [qspec_exe, tfile.path, NBURNIN, NITER, (normalize ? 1 : 0)].join(' ')
|
77
|
+
puts "running #{cmd}" if $VERBOSE
|
78
|
+
reply = `#{cmd}`
|
79
|
+
puts reply if $VERBOSE
|
80
|
+
results = self.class.results_array(tfile.path + Results::EXT)
|
81
|
+
tfile.unlink
|
82
|
+
results
|
83
|
+
end
|
84
|
+
|
85
|
+
# for version 2 of QSpec
|
86
|
+
Results = Struct.new(:protid, :set0, :set1, :bayes_factor, :fold_change, :rb_stat, :fdr, :flag)
|
87
|
+
class Results
|
88
|
+
EXT = '_qspec'
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 0
|
8
|
-
-
|
9
|
-
version: 0.0.
|
8
|
+
- 3
|
9
|
+
version: 0.0.3
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- John T. Prince
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-03-
|
17
|
+
date: 2011-03-31 00:00:00 -06:00
|
18
18
|
default_executable: peptide_hit_qvalues_to_spectral_counts_table.rb
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -90,6 +90,7 @@ files:
|
|
90
90
|
- VERSION
|
91
91
|
- bin/peptide_hit_qvalues_to_spectral_counts_table.rb
|
92
92
|
- lib/ms-quant.rb
|
93
|
+
- lib/ms/quant/qspec.rb
|
93
94
|
- lib/ms/quant/spectral_counts.rb
|
94
95
|
- spec/ms/quant/spectral_counts_spec.rb
|
95
96
|
- spec/spec_helper.rb
|