ms-quant 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +6 -6
- data/VERSION +1 -1
- data/bin/peptide_hit_qvalues_to_spectral_counts_table.rb +151 -29
- data/lib/ms/quant/qspec.rb +91 -0
- metadata +4 -3
data/Rakefile
CHANGED
@@ -29,12 +29,12 @@ Rake::TestTask.new(:spec) do |spec|
|
|
29
29
|
spec.verbose = true
|
30
30
|
end
|
31
31
|
|
32
|
-
require 'rcov/rcovtask'
|
33
|
-
Rcov::RcovTask.new do |spec|
|
34
|
-
spec.libs << 'spec'
|
35
|
-
spec.pattern = 'spec/**/*_spec.rb'
|
36
|
-
spec.verbose = true
|
37
|
-
end
|
32
|
+
#require 'rcov/rcovtask'
|
33
|
+
#Rcov::RcovTask.new do |spec|
|
34
|
+
# spec.libs << 'spec'
|
35
|
+
# spec.pattern = 'spec/**/*_spec.rb'
|
36
|
+
# spec.verbose = true
|
37
|
+
#end
|
38
38
|
|
39
39
|
task :default => :spec
|
40
40
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.2
|
1
|
+
0.0.3
|
@@ -4,37 +4,108 @@ require 'ms/ident/peptide_hit/qvalue'
|
|
4
4
|
require 'ms/ident/protein_hit'
|
5
5
|
require 'ms/ident/peptide/db'
|
6
6
|
require 'ms/quant/spectral_counts'
|
7
|
+
require 'ms/quant/qspec'
|
8
|
+
|
9
|
+
require 'yaml'
|
10
|
+
require 'tempfile'
|
7
11
|
|
8
12
|
require 'trollop'
|
9
13
|
|
14
|
+
# inverse from Tilo Sloboda (now in facets)
|
15
|
+
|
16
|
+
class Hash
  # Builds the inverted hash: every value of the receiver becomes a key that
  # points back at the key it came from. (Adapted from Tilo Sloboda's
  # Hash#inverse, now part of facets.)
  #
  # * An Array value contributes one entry per element.
  # * When several keys share a value, the entry becomes a flattened Array of
  #   those keys, most recently visited first.
  #
  # Returns a new Hash; the receiver is left untouched.
  def inverse
    each_pair.each_with_object({}) do |(key, value), inverted|
      targets = value.is_a?(Array) ? value : [value]
      targets.each do |target|
        inverted[target] =
          if inverted.has_key?(target)
            [key, inverted[target]].flatten
          else
            key
          end
      end
    end
  end
end
|
25
|
+
|
26
|
+
|
10
27
|
# Verbose-only puts: prints +args+ and flushes $stdout immediately, but only
# when $VERBOSE is truthy. Does nothing (and returns nil) otherwise.
def putsv(*args)
  return unless $VERBOSE
  puts(*args)
  $stdout.flush
end
|
15
32
|
|
33
|
+
# Derives a default group name from an input file name.
#
# Strips the final extension and then, if what remains ends in a
# peptide-hit-qvalue marker ('.phq' or '.psq'), strips that as well, so that
# "run1.psq.tsv" and "run1.phq.tsv" both become "run1". Any other inner
# extension is kept ("a.b.tsv" => "a.b"). Directory components are not
# removed.
#
# file - String file name (may include a directory part).
#
# Returns a new String.
def basename(file)
  base = file.chomp(File.extname(file))
  inner_ext = File.extname(base)
  # The usage banner documents .psq.tsv inputs; .phq is still accepted so
  # older file names keep producing the same group names.
  %w(.phq .psq).include?(inner_ext) ? base.chomp(inner_ext) : base
end
|
38
|
+
|
39
|
+
|
40
|
+
outfile = "spectral_counts.tsv"
|
41
|
+
delimiter = "\t"
|
42
|
+
|
16
43
|
opts = Trollop::Parser.new do
|
17
|
-
banner %Q{usage: #{File.basename(__FILE__)} peptide_centric_db.yml,
|
44
|
+
banner %Q{usage: #{File.basename(__FILE__)} <fasta>.peptide_centric_db.yml group1=f1.psq,f2.psq group2=f3.psq,f4.psq
|
45
|
+
or (each file a group): #{File.basename(__FILE__)} <fasta>.peptide_centric_db.yml file1.psq file2.psq ...
|
46
|
+
|
47
|
+
writes to #{outfile}
|
48
|
+
group names can be arbitrarily defined
|
49
|
+
psq is really .psq.tsv file
|
18
50
|
}
|
19
|
-
opt :names, "array of names for the table (otherwise filenames)", :type => String
|
20
51
|
opt :fdr_percent, "%FDR as cutoff", :default => 1.0
|
21
|
-
opt :
|
52
|
+
opt :qspec, "return qspec results (executes qspec or qspecgp). Requires :fasta. Only 2 groups currently allowed", :default => false
|
53
|
+
opt :descriptions, "include descriptions of proteins, requires :fasta", :default => false
|
54
|
+
opt :fasta, "the fasta file. Required for :qspec and :descriptions", :type => String
|
55
|
+
opt :outfile, "the to which file data are written", :default => outfile
|
56
|
+
opt :verbose, "speak up", :default => false
|
57
|
+
opt :count_type, "type of spectral counts (<spectral|aaseqcharge|aaseq>)", :default => 'spectral'
|
58
|
+
opt :qspec_normalize, "normalize spectral counts per run", :default => false
|
59
|
+
opt :write_subset, "(dev use only) write subset db", :default => false
|
22
60
|
end
|
23
61
|
|
24
62
|
opt = opts.parse(ARGV)
|
63
|
+
opt[:count_type] = opt[:count_type].to_sym
|
64
|
+
|
65
|
+
$VERBOSE = opt.delete(:verbose)
|
25
66
|
|
26
67
|
if ARGV.size < 2
|
27
68
|
opts.educate && exit
|
28
69
|
end
|
29
70
|
|
71
|
+
if (opt[:qspec] || opt[:descriptions]) && !opt[:fasta]
|
72
|
+
puts "You must provide a fasta file with --fasta to use qspec or descriptions!!"
|
73
|
+
opts.educate && exit
|
74
|
+
end
|
75
|
+
|
30
76
|
peptide_centric_db_file = ARGV.shift
|
77
|
+
raise ArgumentError, "need .yml file for peptide centric db" unless File.extname(peptide_centric_db_file) == '.yml'
|
78
|
+
putsv "using: #{peptide_centric_db_file} as peptide centric db"
|
79
|
+
|
80
|
+
# groupname => files
|
81
|
+
condition_to_samplenames = {}
|
82
|
+
samplename_to_filename = {}
|
83
|
+
ARGV.each do |arg|
|
84
|
+
(condition, files) =
|
85
|
+
if arg.include?('=')
|
86
|
+
(condition, filestring) = arg.split('=')
|
87
|
+
[condition, filestring.split(',')]
|
88
|
+
else
|
89
|
+
[basename(arg), [arg]]
|
90
|
+
end
|
91
|
+
reptag = ARGV.size
|
92
|
+
sample_to_file_pairs = files.each_with_index.map {|file,i| ["#{condition}-rep#{i+1}", file] }
|
93
|
+
sample_to_file_pairs.each {|name,file| samplename_to_filename[name] = file }
|
94
|
+
condition_to_samplenames[condition] = sample_to_file_pairs.map(&:first)
|
95
|
+
end
|
31
96
|
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
97
|
+
|
98
|
+
if $VERBOSE
|
99
|
+
puts "** condition: sample_names"
|
100
|
+
puts condition_to_samplenames.to_yaml
|
101
|
+
puts "** samplename: filename"
|
102
|
+
puts samplename_to_filename.to_yaml
|
36
103
|
end
|
37
104
|
|
105
|
+
raise ArgumentError, "must have 2 conditions for qspec!" if opt[:qspec] && condition_to_samplenames.size != 2
|
106
|
+
|
107
|
+
samplenames = samplename_to_filename.keys
|
108
|
+
|
38
109
|
class Ms::Ident::PeptideHit
|
39
110
|
attr_accessor :experiment_name
|
40
111
|
end
|
@@ -42,11 +113,9 @@ fdr_cutoff = opt[:fdr_percent] / 100
|
|
42
113
|
|
43
114
|
start=Time.now
|
44
115
|
|
45
|
-
|
46
|
-
|
47
|
-
ar_of_peptide_hit_ars = Ms::Ident::Peptide::Db::IO.open(peptide_centric_db_file) do |peptide_to_proteins|
|
116
|
+
ar_of_pephit_ars = Ms::Ident::Peptide::Db::IO.open(peptide_centric_db_file) do |peptide_to_proteins|
|
48
117
|
putsv "#{Time.now-start} seconds to read #{peptide_centric_db_file}"
|
49
|
-
|
118
|
+
samplename_to_filename.map do |sample, file|
|
50
119
|
peptide_hits = Ms::Ident::PeptideHit::Qvalue.from_file(file)
|
51
120
|
putsv "#{file}: #{peptide_hits.size} hits"
|
52
121
|
peptide_hits.select! do |hit|
|
@@ -54,7 +123,7 @@ ar_of_peptide_hit_ars = Ms::Ident::Peptide::Db::IO.open(peptide_centric_db_file)
|
|
54
123
|
# update each peptide with its protein hits
|
55
124
|
prot_ids = peptide_to_proteins[hit.aaseq]
|
56
125
|
if prot_ids
|
57
|
-
hit.experiment_name =
|
126
|
+
hit.experiment_name = sample
|
58
127
|
hit.proteins = prot_ids
|
59
128
|
else ; false end
|
60
129
|
else
|
@@ -65,31 +134,27 @@ ar_of_peptide_hit_ars = Ms::Ident::Peptide::Db::IO.open(peptide_centric_db_file)
|
|
65
134
|
end
|
66
135
|
end
|
67
136
|
|
68
|
-
if opt[:write_subset]
|
137
|
+
if opt[:write_subset]
|
69
138
|
aaseqs_to_prots = {}
|
70
|
-
|
71
|
-
|
72
|
-
aaseqs_to_prots[pephit.aaseq] = pephit.proteins
|
73
|
-
end
|
139
|
+
ar_of_pephit_ars.flatten(1).each do |pephit|
|
140
|
+
aaseqs_to_prots[pephit.aaseq] = pephit.proteins
|
74
141
|
end
|
75
142
|
outfile = "peptidecentric_subset.yml"
|
76
143
|
puts "writing #{outfile} with #{aaseqs_to_prots.size} aaseq->protids"
|
77
144
|
File.open(outfile,'w') do |out|
|
78
145
|
aaseqs_to_prots.each do |k,v|
|
79
|
-
out.puts(%Q{#{k}: #{v.
|
146
|
+
out.puts(%Q{#{k}: #{v.join("\t") }})
|
80
147
|
end
|
81
148
|
end
|
82
149
|
end
|
83
150
|
|
84
|
-
$VERBOSE = true
|
85
151
|
if $VERBOSE
|
86
|
-
|
87
|
-
|
152
|
+
samplenames.zip(ar_of_pephit_ars) do |samplename, pep_ar|
|
153
|
+
putsv "#{samplename}: #{pep_ar.size}"
|
88
154
|
end
|
89
155
|
end
|
90
156
|
|
91
|
-
all_peptide_hits =
|
92
|
-
|
157
|
+
all_peptide_hits = ar_of_pephit_ars.flatten(1)
|
93
158
|
|
94
159
|
# because peptide_hit#proteins yields id strings (which hash properly),
|
95
160
|
# each protein group is an array of
|
@@ -102,7 +167,7 @@ end
|
|
102
167
|
|
103
168
|
# partition them all out by filename
|
104
169
|
|
105
|
-
|
170
|
+
counts_parallel_to_names_with_counts_per_group = samplenames.map do |name|
|
106
171
|
pep_hit_to_prot_groups = Hash.new {|h,k| h[k] = [] }
|
107
172
|
groups_of_pephits = protein_groups.map do |prot_group|
|
108
173
|
pep_hits = prot_group.peptide_hits.select {|hit| hit.experiment_name == name }
|
@@ -115,12 +180,69 @@ ar_of_count_data = opt[:names].map do |name|
|
|
115
180
|
#end
|
116
181
|
end
|
117
182
|
|
118
|
-
|
119
|
-
|
183
|
+
if opt[:qspec] || opt[:descriptions]
|
184
|
+
putsv "reading lengths and descriptions from #{opt[:fasta]}"
|
185
|
+
(id_to_length, id_to_desc) = Ms::Fasta.protein_lengths_and_descriptions(opt[:fasta])
|
186
|
+
end
|
187
|
+
|
188
|
+
samplename_to_condition = condition_to_samplenames.inverse
|
189
|
+
|
190
|
+
### OUTPUT TABLE
|
191
|
+
header_cats = samplenames.map.to_a
|
192
|
+
|
193
|
+
ar_of_rows = counts_parallel_to_names_with_counts_per_group.map do |counts_per_group|
|
194
|
+
counts_per_group.map(&opt[:count_type])
|
195
|
+
end.transpose
|
120
196
|
|
121
|
-
|
122
|
-
|
123
|
-
|
197
|
+
if opt[:qspec]
|
198
|
+
all_conditions = samplenames.map {|sn| samplename_to_condition[sn] }
|
199
|
+
condition_to_count_array = all_conditions.zip(counts_parallel_to_names_with_counts_per_group).map do |condition, counts_par_groups|
|
200
|
+
[condition, counts_par_groups.map(&opt[:count_type])]
|
201
|
+
end
|
202
|
+
|
203
|
+
name_length_pairs = protein_groups.map do |pg|
|
204
|
+
# prefer swissprot (sp) proteins over tremble (tr) and shorter protein
|
205
|
+
# lengths over longer lengths
|
206
|
+
best_guess_protein_id = pg.sort_by {|prot_id| [prot_id, -id_to_length[prot_id]] }.first
|
207
|
+
length = id_to_length[best_guess_protein_id]
|
208
|
+
[pg.join(":"), length]
|
209
|
+
end
|
210
|
+
|
211
|
+
putsv "qspec to normalize counts: #{opt[:qspec_normalize]}"
|
212
|
+
qspec_results = Ms::Quant::Qspec.new(name_length_pairs, condition_to_count_array).run(opt[:qspec_normalize])
|
213
|
+
|
214
|
+
to_add = [:fdr, :bayes_factor, :fold_change]
|
215
|
+
header_cats.push(*to_add)
|
216
|
+
qspec_results.zip(ar_of_rows) do |zipped|
|
217
|
+
(result, row) = zipped
|
218
|
+
row.push(*to_add.map {|v| result.send(v) })
|
219
|
+
end
|
124
220
|
end
|
125
221
|
|
222
|
+
header_cats.push( *%w(BestID AllIDs) )
|
223
|
+
header_cats.push( 'Description' ) if opt[:descriptions]
|
224
|
+
|
225
|
+
protein_groups.zip(ar_of_rows) do |zipped|
|
226
|
+
(pg, row) = zipped
|
227
|
+
# swiss-prot and then the shortest
|
228
|
+
best_protid = pg.sort_by {|prot_id| [prot_id, -id_to_length[prot_id]] }.first
|
229
|
+
(gene_id, desc) =
|
230
|
+
if opt[:descriptions]
|
231
|
+
desc = id_to_desc[best_protid]
|
232
|
+
gene_id = (md=desc.match(/ GN=(\w+) ?/)) ? md[1] : best_protid
|
233
|
+
[gene_id, desc]
|
234
|
+
else
|
235
|
+
[best_protid, nil]
|
236
|
+
end
|
237
|
+
row << gene_id << pg.join(',')
|
238
|
+
row.push(desc) if desc
|
239
|
+
end
|
240
|
+
|
241
|
+
### SORT???
|
242
|
+
|
243
|
+
File.open(opt[:outfile],'w') do |out|
|
244
|
+
out.puts header_cats.join(delimiter)
|
245
|
+
ar_of_rows.each {|row| out.puts row.join(delimiter) }
|
246
|
+
putsv "wrote: #{opt[:outfile]}"
|
247
|
+
end
|
126
248
|
|
@@ -0,0 +1,91 @@
|
|
1
|
+
require 'tempfile'

module Ms ; end
module Ms::Quant ; end

# Thin wrapper around the external QSpec command line programs ('qspec' and
# 'qspecgp'): writes a tab-delimited input table, shells out to the program,
# and parses the results file it leaves next to the input.
class Ms::Quant::Qspec

  NBURNIN = 50 # need to check
  NITER = 2000 # check
  # leading column names of the input table; one column per sample follows
  INIT_HEADER = %w(protid protLen).freeze
  DELIMITER = "\t"

  # One row of a QSpec (version 2) results file.
  Results = Struct.new(:protid, :set0, :set1, :bayes_factor, :fold_change, :rb_stat, :fdr, :flag)
  class Results
    # suffix appended to the input file name to get the results file name
    EXT = '_qspec'
  end

  # Takes an ordered list of conditions, e.g. ['cond1', 'cond1', 'cond2',
  # 'cond2'], and returns a parallel array of ints, e.g. [0, 0, 1, 1].
  #
  # The int is bumped every time the condition differs from the previous
  # entry, so samples of one condition must be adjacent to share an int.
  # An empty list gives an empty array.
  def self.conditions_to_ints(conditions)
    index = 0
    previous = conditions.first
    conditions.map do |cond|
      unless cond == previous
        index += 1
        previous = cond
      end
      index
    end
  end

  # Parses a QSpec (version 2) results file into an array of Results structs,
  # one per data row. The first line is treated as a header and dropped;
  # blank lines are ignored.
  #
  # Columns are read positionally: 0 => protid (String), 1-2 => set0/set1
  # (Integer), 3-6 => bayes_factor, fold_change, rb_stat, fdr (Float),
  # 7 => flag (String).
  def self.results_array(resultsfile)
    rows = File.readlines(resultsfile).map {|line| line.chomp.split("\t") }
    rows.shift # header line
    rows.reject(&:empty?).map do |row|
      data = [row[0]]
      data.push( *row[1..2].map(&:to_i) )
      data.push( *row[3..6].map(&:to_f) )
      data.push( row[7] )
      Results.new(*data)
    end
  end

  # Returns the name of the executable to use for the given array of
  # conditions: 'qspecgp' when any condition occurs 3 or more times,
  # otherwise 'qspec' (also for an empty list).
  def self.executable(conditions)
    biggest_size = conditions.group_by {|v| v }.values.map(&:size).max || 0
    (biggest_size >= 3) ? 'qspecgp' : 'qspec'
  end

  # protname_length_pairs is an array of doublets: [protname, length]
  # condition_to_count_array is an array doublets: [condition, array_of_counts]
  # (one doublet per sample; each array_of_counts is parallel to
  # protname_length_pairs)
  def initialize(protname_length_pairs, condition_to_count_array)
    @protname_length_pairs = protname_length_pairs
    @condition_to_count_array = condition_to_count_array
  end

  # Returns the condition of every sample, in sample order.
  def conditions
    @condition_to_count_array.map(&:first)
  end

  # Writes a qspec formatted file to filename: a header of INIT_HEADER plus
  # one condition int per sample, then one row per protein holding its name,
  # its length and its count in each sample. The pairs given to #new are
  # copied, not modified.
  def write(filename)
    ints = Ms::Quant::Qspec.conditions_to_ints(conditions)
    header_cats = INIT_HEADER + ints
    rows = @protname_length_pairs.map {|pair| pair.to_a.dup }
    @condition_to_count_array.each do |_cond, counts|
      rows.zip(counts) {|row, cnt| row << cnt }
    end
    File.open(filename,'w') do |out|
      out.puts header_cats.join(DELIMITER)
      rows.each {|row| out.puts row.join(DELIMITER) }
    end
  end

  # Writes the data to a temporary file, runs the matching QSpec executable
  # on it and returns the parsed results (see results_array).
  #
  # normalize - passed to the program as 1 (truthy) or 0 (falsy).
  # opts      - currently unused; kept so existing callers keep working.
  #
  # The temporary input file and the results file are removed before
  # returning, including when the program or the parsing fails. Raises
  # whatever the shell-out or results_array raises (e.g. Errno::ENOENT when
  # the executable or the results file is missing).
  def run(normalize=true, opts={})
    puts "normalize: #{normalize}" if $VERBOSE
    tfile = Tempfile.new("qspec")
    results_path = tfile.path + Results::EXT
    begin
      # the file is only ever used by path from here on, so release the handle
      tfile.close
      write(tfile.path)
      qspec_exe = self.class.executable(conditions)
      cmd = [qspec_exe, tfile.path, NBURNIN, NITER, (normalize ? 1 : 0)].join(' ')
      puts "running #{cmd}" if $VERBOSE
      reply = `#{cmd}`
      puts reply if $VERBOSE
      self.class.results_array(results_path)
    ensure
      tfile.unlink
      File.delete(results_path) if File.exist?(results_path)
    end
  end
end
|
91
|
+
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 0
|
8
|
-
- 2
|
9
|
-
version: 0.0.2
|
8
|
+
- 3
|
9
|
+
version: 0.0.3
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- John T. Prince
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-03-
|
17
|
+
date: 2011-03-31 00:00:00 -06:00
|
18
18
|
default_executable: peptide_hit_qvalues_to_spectral_counts_table.rb
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -90,6 +90,7 @@ files:
|
|
90
90
|
- VERSION
|
91
91
|
- bin/peptide_hit_qvalues_to_spectral_counts_table.rb
|
92
92
|
- lib/ms-quant.rb
|
93
|
+
- lib/ms/quant/qspec.rb
|
93
94
|
- lib/ms/quant/spectral_counts.rb
|
94
95
|
- spec/ms/quant/spectral_counts_spec.rb
|
95
96
|
- spec/spec_helper.rb
|