mspire 0.8.5 → 0.8.6

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,275 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'andand'
4
+ require 'set'
5
+ require 'ruport'
6
+
7
+ require 'mspire/ident/peptide_hit/qvalue'
8
+ require 'mspire/ident/peptide_hit'
9
+ require 'mspire/ident/protein_group'
10
+ require 'mspire/ident/protein'
11
+ require 'mspire/ident/peptide/db/io'
12
+ require 'mspire/quant/spectral_counts'
13
+ require 'mspire/quant/protein_group_comparison'
14
+ require 'mspire/quant/qspec/protein_group_comparison'
15
+ require 'mspire/quant/qspec'
16
+ require 'mspire/quant/cmdline'
17
+ require 'mspire/fasta'
18
+
19
+
20
+ require 'yaml'
21
+ require 'tempfile'
22
+
23
+ require 'trollop'
24
+
25
# Verbose-only puts: writes +args+ with Kernel#puts and flushes $stdout
# right away, but only when the global $VERBOSE flag is truthy.
# Does nothing (and returns nil) otherwise.
def putsv(*args)
  return unless $VERBOSE
  puts(*args)
  $stdout.flush
end
30
+
31
# Strips the final extension from +file+; if what remains ends in '.phq',
# that extension is stripped as well (so "x.phq.tsv" becomes "x").
# Any directory part of +file+ is left untouched.
def basename(file)
  stem = file.chomp(File.extname(file))
  inner_ext = File.extname(stem)
  inner_ext == '.phq' ? stem.chomp(inner_ext) : stem
end
36
+
37
# Script-local additions to Ruport's table class.
class Ruport::Data::Table
  # Adds a column called +colname+ (+opts+ are handed to add_column) and then
  # assigns the values of +array_of_data+ to the rows, pairing them by position.
  #
  # returns self
  def add_column_with_data(colname, array_of_data, opts={})
    add_column(colname, opts)
    data.zip(array_of_data).each do |record, value|
      record[colname] = value
    end
    self
  end

  # Writes the table to +file+ as tab-delimited text: the column names on one
  # line, then one line per row.
  #
  # acceptable opts:
  #
  #     :header => an array of lines written before the column names
  #     :footer => an array of lines written after the last row
  #
  # Header and footer lines are each commented out with a leading "# ".
  def to_tsv(file, opt={})
    tab = "\t"
    File.open(file, 'w') do |out|
      if opt[:header]
        opt[:header].each {|line| out.puts "# #{line}" }
      end
      out.puts column_names.join(tab)
      data.each {|row| out.puts row.to_a.join(tab) }
      opt[:footer].each {|line| out.puts "# #{line}" } if opt[:footer]
    end
  end

end
63
+
64
# (dev use) Writes one line per distinct peptide sequence to +outfile+ in the
# form "<aaseq>: <id>\t<id>...", listing the ids of the proteins attached to
# the peptide hit.
#
# +sample_to_pephits+ is a collection of [sample, peptide_hits] pairs; only the
# hits are used. When the same aaseq occurs more than once, the last hit seen
# determines the ids written.
def write_subset(sample_to_pephits, outfile="peptidecentric_subset.yml")
  ids_by_aaseq = sample_to_pephits.flat_map(&:last).each_with_object({}) do |hit, acc|
    acc[hit.aaseq] = hit.proteins.map(&:id)
  end
  File.open(outfile, 'w') do |io|
    ids_by_aaseq.each do |aaseq, ids|
      io.puts(%Q{#{aaseq}: #{ids.join("\t") }})
    end
  end
end
75
+
76
+
77
# Default output paths. +outfile+ is replaced by the --outfile value once the
# command line has been parsed (see below).
outfile = "spectral_counts.tsv"
pephits_outfile = "spectral_counts.pephits.tsv"
delimiter = "\t"

# Command-line definition. The banner interpolates the *default* outfile.
opts = Trollop::Parser.new do
  banner %Q{usage: #{File.basename(__FILE__)} <fasta>.peptide_centric_db.yml group1=f1.phq.tsv,f2.phq.tsv group2=f3.phq.tsv,f4.phq.tsv
or (each file a group): #{File.basename(__FILE__)} <fasta>.peptide_centric_db.yml file1.phq.tsv file2.phq.tsv ...

writes to #{outfile}
group names can be arbitrarily defined
}
  opt :fdr_percent, "%FDR as cutoff", :default => 1.0
  opt :qspec, "return qspec results (executes qspec or qspecgp). Requires :fasta. Only 2 groups currently allowed", :default => false
  opt :descriptions, "include descriptions of proteins, requires :fasta", :default => false
  opt :fasta, "the fasta file. Required for :qspec and :descriptions", :type => String
  opt :outfile, "the file to which data are written", :default => outfile
  opt :peptides, "also write peptide hits (to: #{pephits_outfile})", :default => false
  opt :verbose, "speak up", :default => false
  opt :count_type, "type of spectral counts (<spectral|aaseqcharge|aaseq>)", :default => 'spectral'
  opt :qspec_decibans, "report bayesfactor in decibans"
  opt :qspec_normalize, "normalize spectral counts per run", :default => false
  opt :qspec_keep_files, "keep a copy of the files submitted and returned from Qspec", :default => false
  opt :write_subset, "(dev use only) write subset db", :default => false
end

# Parse through Trollop's standard handler so that --help and malformed
# options print a message and exit instead of surfacing as raw exceptions.
opt = Trollop::with_standard_exception_handling(opts) { opts.parse(ARGV) }

# The count type is later invoked as a method name, so only accept the
# documented values before converting it to a symbol.
valid_count_types = %w(spectral aaseqcharge aaseq)
unless valid_count_types.include?(opt[:count_type])
  raise ArgumentError, "count_type must be one of: #{valid_count_types.join(', ')}"
end
opt[:count_type] = opt[:count_type].to_sym

# Honor --outfile (previously parsed but never applied).
outfile = opt[:outfile]

$VERBOSE = opt.delete(:verbose)

# Need at least the peptide centric db plus one sample file.
if ARGV.size < 2
  opts.educate
  exit
end

if (opt[:qspec] || opt[:descriptions]) && !opt[:fasta]
  puts "You must provide a fasta file with --fasta to use qspec or descriptions!!"
  opts.educate
  exit
end

# First positional argument is the peptide centric db; the rest describe samples.
peptide_centric_db_file = ARGV.shift
raise ArgumentError, "need .yml file for peptide centric db" unless File.extname(peptide_centric_db_file) == '.yml'
putsv "using: #{peptide_centric_db_file} as peptide centric db"

# groupname => files

(samplename_to_filename, condition_to_samplenames, samplename_to_condition) = Mspire::Quant::Cmdline.args_to_hashes(ARGV)

raise ArgumentError, "must have 2 conditions for qspec!" if opt[:qspec] && condition_to_samplenames.size != 2

samplenames = samplename_to_filename.keys
128
# Script-local attributes on peptide hits: this script stores the sample name
# in +experiment_name+ and reads +protein_groups+ when weighting counts.
class Mspire::Ident::PeptideHit
  attr_accessor :experiment_name, :protein_groups
end

# Script-local attribute on proteins: this script assigns +length+ from the
# fasta-derived lookup when one was read.
class Mspire::Ident::Protein
  attr_accessor(:length)
end
136
+
137
+
138
# --fdr-percent is given in percent; hits are filtered against this fraction.
fdr_cutoff = opt[:fdr_percent] / 100

# Accession => length and accession => description lookups. Both stay nil
# unless --qspec or --descriptions was given; later code tests them for
# truthiness before use.
id_to_length = nil
id_to_desc = nil
if opt[:qspec] || opt[:descriptions]
  putsv "reading lengths and descriptions from #{opt[:fasta]}"
  #Mspire::Fasta.protein_lengths_and_descriptions(opt[:fasta])
  id_to_length = {}
  id_to_desc = {}
  Mspire::Fasta.foreach(opt[:fasta]) do |entry|
    accession = entry.accession
    # description = everything after the first whitespace-delimited token
    id_to_desc[accession] = entry.definition[/^\S+\s(.*)/,1]
    id_to_length[accession] = entry.length
  end
end
151
+
152
# Read each sample's peptide hits, keeping only those whose q-value is at or
# below the cutoff. Result: array of [sample, hits] pairs.
samplename_to_peptidehits = samplename_to_filename.map do |sample, file|
  [sample, Mspire::Ident::PeptideHit::Qvalue.from_file(file).select {|hit| hit.qvalue <= fdr_cutoff }]
end

# update each peptide hit with protein hits and sample name:
# (one shared Protein object per id, created on first lookup)
all_protein_hits = Hash.new {|h,id| h[id] = Mspire::Ident::Protein.new(id) }

Mspire::Ident::Peptide::Db::IO.open(peptide_centric_db_file) do |peptide_to_proteins|
  # Reassign so that the filtered hit lists are what the rest of the script
  # sees; previously the result of this map was discarded, so hits absent from
  # the database stayed in the lists without proteins or a sample name.
  samplename_to_peptidehits = samplename_to_peptidehits.map do |sample, peptide_hits|
    # removes pephits that aren't in the database, (usually ones with aa 'X'
    # in them )
    normal_pephits = peptide_hits.select {|hit| peptide_to_proteins[hit.aaseq] }
    normal_pephits.each do |hit|
      # update each peptide with its protein hits
      protein_hits = peptide_to_proteins[hit.aaseq].map do |id|
        protein = all_protein_hits[id]
        # the lookups are nil when no fasta information was read
        protein.length = id_to_length[id] if id_to_length
        protein.description = id_to_desc[id] if id_to_desc
        protein
      end
      hit.experiment_name = sample
      hit.proteins = protein_hits
    end
    [sample, normal_pephits]
  end
end
178
+
179
# (dev use) dump the aaseq => protein ids subset when requested
if opt[:write_subset]
  write_subset(samplename_to_peptidehits)
end

# report the number of peptide hits per sample
if $VERBOSE
  samplename_to_peptidehits.each do |samplename, hits|
    putsv "#{samplename}: #{hits.size}"
  end
end

# every peptide hit from every sample in one flat list
all_peptide_hits = samplename_to_peptidehits.flat_map(&:last)

# this constricts everything down to a minimal set of protein groups that
# explain the entire set of peptide hits.
# Passing true ensures that each pephit is linked to the array of protein
# groups it is associated with.
update_pephits = true
protein_groups = Mspire::Ident::ProteinGroup.peptide_hits_to_protein_groups(all_peptide_hits, update_pephits)
189
+
190
# sample name => column, where each column holds, per protein group, the
# peptide hits of that group that came from the sample
hits_table_hash = samplenames.each_with_object({}) do |name, columns|
  columns[name] = protein_groups.map do |prot_group|
    prot_group.peptide_hits.select {|hit| hit.experiment_name == name }
  end
end

# The cells are groups of peptide hits. Each column is a sample and each row
# is a protein group (protein group order is implicit).
#
# (implied) sample1   sample2   sample3   ...
# (group1)  [hit,hit] [hit...]  [hit...]  ...
# (group2)  [hit,hit] [hit...]  [hit...]  ...
# ...       ...       ...       ...       ...
hits_table = Ruport::Data::Table.new(
  :data => hits_table_hash.values.transpose,
  :column_names => hits_table_hash.keys
)

# spectral counts of type opt[:count_type]
# The block hands SpectralCounts.counts the value 1.0 / (number of protein
# groups the hit is linked to) for each peptide hit; the requested count type
# is then read off the returned object.
counts_data = hits_table.data.map do |row|
  row.map do |pephits|
    spectral_counts = Mspire::Quant::SpectralCounts.counts(pephits) {|pephit| 1.0 / pephit.protein_groups.size }
    spectral_counts.send(opt[:count_type])
  end
end

# each cell holds the count of the selected type (a SpectralCounts object has
# 3 types of count data)
counts_table = Ruport::Data::Table.new(:data => counts_data, :column_names => samplenames)
215
+
216
+ # return a list of ProteinGroupComparisons
217
# Optionally run Qspec and append its results (bayes factor or decibans,
# fold change, fdr) as extra columns on counts_table.
if opt[:qspec]

  # prepare data for qspec: one [condition, counts column] pair per sample
  condition_to_count_array = counts_table.column_names.map do |name|
    [samplename_to_condition[name], counts_table.column(name)]
  end

  # average length of the proteins in the group, rounded to the nearest
  # integer. The division is done in floating point; integer division would
  # truncate and make the round a no-op.
  name_length_pairs = protein_groups.map do |pg|
    mean_length = (pg.map(&:length).reduce(:+).to_f / pg.size).round
    [pg.map(&:id).join(":"), mean_length]
  end

  qspec_results = Mspire::Quant::Qspec.new(name_length_pairs, condition_to_count_array).run(opt[:qspec_normalize], :keep => opt[:qspec_keep_files])

  cols_to_add = [:bayes_factor, :fold_change, :fdr]

  # with --qspec-decibans the bayes factor column is reported as :decibans
  use_decibans = lambda {|col| opt[:qspec_decibans] && col == :bayes_factor }

  to_add_as_headers = cols_to_add.map {|col| use_decibans.call(col) ? :decibans : col }
  counts_table.add_columns to_add_as_headers

  # qspec results are paired with the table rows by position
  counts_table.data.zip(qspec_results) do |row, qspec_result|
    cols_to_add.each do |cat|
      if use_decibans.call(cat)
        row[:decibans] = 10 * Math.log10(qspec_result[cat])
      else
        row[cat] = qspec_result[cat]
      end
    end
  end
end
249
+
250
# Label every row (one per protein group) with a display name, the ids of all
# proteins in the group, and a description.
counts_table.add_columns( [:name, :ids, :description] )
counts_table.data.zip(protein_groups).each do |row, group|
  # representative protein: the one that sorts first by [id, length]
  representative = group.min_by {|prot| [prot.id, prot.length] }
  description = representative.description
  # prefer the gene name from a " GN=<name>" tag in the description and
  # fall back to the protein id
  gene_name = description.andand.match(/ GN=([^\s]+) ?/).andand[1]
  row.name = gene_name || representative.id
  row.ids = group.map(&:id).join(',')
  row.description = description
end
257
+
258
+
259
# Optionally write the peptide-hit table. Each cell's list of hits is first
# replaced by a string of the form
#   "<charge>_<aaseq>:<hit id>,<hit id>; <charge>_<aaseq>:..."
# with one entry per distinct (aaseq, charge) pair.
if opt[:peptides]
  hits_table.each do |record|
    record.each_with_index do |cell_hits, col_index|
      by_seq_and_charge = cell_hits.group_by {|hit| [hit.aaseq, hit.charge] }
      entries = by_seq_and_charge.map do |key, members|
        label = key.reverse.join("_")
        hit_ids = members.map(&:id).join(',')
        [label, hit_ids].join(":")
      end
      record[col_index] = entries.join('; ')
    end
  end
  # reuse the names from the counts table as the first column
  hits_table.add_column_with_data(:name, counts_table.column(:name), :position=>0)
  hits_table.to_tsv(pephits_outfile, :footer => ["parallel to #{outfile}"])
end

# The counts table is always written; the sample mapping and the options used
# are recorded as commented footer lines.
intro = [
  "samples: #{samplename_to_filename}",
  "options: #{opt}"
]
counts_table.to_tsv(outfile, :footer => intro)
@@ -49,6 +49,17 @@ describe 'creating a peptide centric database' do
49
49
  #File.unlink(@output_file)
50
50
  end
51
51
 
52
# Runs the creator with --trie and checks that the three trie files
# (.trie, .tail, .da) exist next to the expected base path, then reads the
# trie back. The lookup result is only printed; the expectation on its value
# is still commented out.
it 'can use a trie' do
  Mspire::Ident::Peptide::Db::Creator.cmdline([@fasta_file, '--trie'])
  triefile = TESTFILES + '/mspire/ident/peptide/db/uni_11_sp_tr.msd_clvg2.min_aaseq4'
  %w(.trie .tail .da).each {|ext| File.exist?(triefile + ext).should be_true }
  loaded_trie = Trie.read(triefile)
  p loaded_trie.get('MADGSGWQPPRPCEAYR')
  #loaded_trie.get('MADGSGWQPPRPCEAYR').should == ["D3DX18"]
end
62
+
52
63
  it 'lists approved enzymes and exits' do
53
64
  output = capture_stdout do
54
65
  begin