mspire 0.8.5 → 0.8.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,275 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'andand'
4
+ require 'set'
5
+ require 'ruport'
6
+
7
+ require 'mspire/ident/peptide_hit/qvalue'
8
+ require 'mspire/ident/peptide_hit'
9
+ require 'mspire/ident/protein_group'
10
+ require 'mspire/ident/protein'
11
+ require 'mspire/ident/peptide/db/io'
12
+ require 'mspire/quant/spectral_counts'
13
+ require 'mspire/quant/protein_group_comparison'
14
+ require 'mspire/quant/qspec/protein_group_comparison'
15
+ require 'mspire/quant/qspec'
16
+ require 'mspire/quant/cmdline'
17
+ require 'mspire/fasta'
18
+
19
+
20
+ require 'yaml'
21
+ require 'tempfile'
22
+
23
+ require 'trollop'
24
+
25
# Prints +args+ (exactly as Kernel#puts would) and flushes $stdout, but only
# when $VERBOSE is truthy. Returns nil when nothing is printed.
def putsv(*args)
  return unless $VERBOSE

  puts(*args)
  $stdout.flush
end
30
+
31
# Strips the final extension from +file+; when what remains ends in '.phq'
# that extension is stripped as well ("run.phq.tsv" => "run").
# Any directory part of the path is left untouched.
def basename(file)
  stem = file.chomp(File.extname(file))
  return stem unless File.extname(stem) == '.phq'

  stem.chomp(File.extname(stem))
end
36
+
37
# Script-local conveniences added to Ruport's table class.
class Ruport::Data::Table
  # Adds a column called +colname+ (+opts+ is passed straight through to
  # add_column) and then fills it by pairing the table's rows with the
  # values of +array_of_data+ in order.
  #
  # returns self
  def add_column_with_data(colname, array_of_data, opts={})
    add_column(colname, opts)
    data.zip(array_of_data).each { |row, value| row[colname] = value }
    self
  end

  # Writes the table to +file+ as tab-separated text: the column names on
  # one line followed by one line per row.
  #
  # acceptable opts:
  #
  #     :header => an array of lines written before the table
  #     :footer => an array of lines written after the table
  #
  # (every header/footer line is commented out with a leading "# ")
  def to_tsv(file, opt={})
    tab = "\t"
    File.open(file, 'w') do |out|
      # writes each line as a comment; does nothing when lines is nil/false
      write_comments = lambda do |lines|
        lines.each { |line| out.puts "# #{line}" } if lines
      end

      write_comments.call(opt[:header])
      out.puts column_names.join(tab)
      data.each { |row| out.puts row.to_a.join(tab) }
      write_comments.call(opt[:footer])
    end
  end
end
63
+
64
# Writes one "<aaseq>: <tab separated protein ids>" line to +outfile+ for
# every distinct amino acid sequence found among the peptide hits.
#
# sample_to_pephits:: an enumerable of [sample, peptide_hits] pairs; only
#                     the hits (the last member of each pair) are used.
#                     When a sequence occurs more than once the proteins of
#                     the last hit seen are the ones written.
#
# returns the aaseq => protein ids hash that was written
def write_subset(sample_to_pephits, outfile="peptidecentric_subset.yml")
  every_hit = sample_to_pephits.map(&:last).flatten(1)
  aaseqs_to_prots = every_hit.each_with_object({}) do |pephit, mapping|
    mapping[pephit.aaseq] = pephit.proteins.map(&:id)
  end

  File.open(outfile, 'w') do |out|
    aaseqs_to_prots.each do |aaseq, protein_ids|
      out.puts("#{aaseq}: #{protein_ids.join("\t")}")
    end
  end
end
75
+
76
+
77
# Default output locations. The main table can be redirected with --outfile;
# the peptide hits table always goes to pephits_outfile.
outfile = "spectral_counts.tsv"
pephits_outfile = "spectral_counts.pephits.tsv"

# The only values accepted for --count-type. The chosen value is later sent
# as a message to the spectral counts object, so it must never be arbitrary
# user input.
count_types = [:spectral, :aaseqcharge, :aaseq]

opts = Trollop::Parser.new do
  banner %Q{usage: #{File.basename(__FILE__)} <fasta>.peptide_centric_db.yml group1=f1.phq.tsv,f2.phq.tsv group2=f3.phq.tsv,f4.phq.tsv
or (each file a group): #{File.basename(__FILE__)} <fasta>.peptide_centric_db.yml file1.phq.tsv file2.phq.tsv ...

writes to #{outfile}
group names can be arbitrarily defined
}
  opt :fdr_percent, "%FDR as cutoff", :default => 1.0
  opt :qspec, "return qspec results (executes qspec or qspecgp). Requires :fasta. Only 2 groups currently allowed", :default => false
  opt :descriptions, "include descriptions of proteins, requires :fasta", :default => false
  opt :fasta, "the fasta file. Required for :qspec and :descriptions", :type => String
  opt :outfile, "the file to which data are written", :default => outfile
  opt :peptides, "also write peptide hits (to: #{pephits_outfile})", :default => false
  opt :verbose, "speak up", :default => false
  opt :count_type, "type of spectral counts (<spectral|aaseqcharge|aaseq>)", :default => 'spectral'
  opt :qspec_decibans, "report bayesfactor in decibans"
  opt :qspec_normalize, "normalize spectral counts per run", :default => false
  opt :qspec_keep_files, "keep a copy of the files submitted and returned from Qspec", :default => false
  opt :write_subset, "(dev use only) write subset db", :default => false
end

# Parser#parse signals --help and malformed options by raising; the standard
# handler turns those into the usual help/error output and a clean exit
# instead of a stack trace. After parsing, ARGV holds only the leftover
# (non-option) arguments.
opt = Trollop.with_standard_exception_handling(opts) { opts.parse(ARGV) }

opt[:count_type] = opt[:count_type].to_sym
unless count_types.include?(opt[:count_type])
  raise ArgumentError, "--count-type must be one of: #{count_types.join(', ')}"
end

$VERBOSE = opt.delete(:verbose)

# need at least the peptide centric db and one file of peptide hits
if ARGV.size < 2
  opts.educate
  exit
end

if (opt[:qspec] || opt[:descriptions]) && !opt[:fasta]
  puts "You must provide a fasta file with --fasta to use qspec or descriptions!!"
  opts.educate
  exit
end

peptide_centric_db_file = ARGV.shift
raise ArgumentError, "need .yml file for peptide centric db" unless File.extname(peptide_centric_db_file) == '.yml'
putsv "using: #{peptide_centric_db_file} as peptide centric db"

# the remaining arguments name the sample files, optionally grouped into
# conditions (group=file1,file2)
samplename_to_filename, condition_to_samplenames, samplename_to_condition = Mspire::Quant::Cmdline.args_to_hashes(ARGV)

raise ArgumentError, "must have 2 conditions for qspec!" if opt[:qspec] && condition_to_samplenames.size != 2

samplenames = samplename_to_filename.keys
127
+
128
# Extra per-hit bookkeeping used only by this script.
class Mspire::Ident::PeptideHit
  # experiment_name: the sample the hit was read from
  # protein_groups:  the protein groups the hit is linked to
  attr_accessor :experiment_name, :protein_groups
end

# Extra per-protein bookkeeping used only by this script.
class Mspire::Ident::Protein
  # sequence length as read from the fasta file (nil when no fasta was read)
  attr_accessor :length
end
136
+
137
+
138
# the cutoff is given as a percentage on the command line; compare as a fraction
fdr_cutoff = opt[:fdr_percent] / 100

# Both lookup tables stay nil unless a fasta file is actually needed; code
# further down checks for nil before consulting them.
id_to_length = nil
id_to_desc = nil

if opt[:qspec] || opt[:descriptions]
  putsv "reading lengths and descriptions from #{opt[:fasta]}"
  id_to_length = {}
  id_to_desc = {}
  Mspire::Fasta.foreach(opt[:fasta]) do |entry|
    accession = entry.accession
    id_to_length[accession] = entry.length
    # the description is everything after the first whitespace-delimited
    # token of the definition line
    id_to_desc[accession] = entry.definition[/^\S+\s(.*)/, 1]
  end
end
151
+
152
# [sample, peptide hits] pairs, keeping only the hits whose q-value is at or
# below the FDR cutoff
samplename_to_peptidehits = samplename_to_filename.map do |sample, file|
  passing_hits = Mspire::Ident::PeptideHit::Qvalue.from_file(file).select { |hit| hit.qvalue <= fdr_cutoff }
  [sample, passing_hits]
end

# update each peptide hit with protein hits and sample name.
# One shared Protein object per id, created on first use.
all_protein_hits = Hash.new { |h, id| h[id] = Mspire::Ident::Protein.new(id) }

Mspire::Ident::Peptide::Db::IO.open(peptide_centric_db_file) do |peptide_to_proteins|
  # map! (not map): the filtered hits must replace the unfiltered ones,
  # otherwise hits without any protein or sample name would still reach
  # the protein grouping below.
  samplename_to_peptidehits.map! do |sample, peptide_hits|
    # removes pephits that aren't in the database, (usually ones with aa 'X'
    # in them )
    normal_pephits = peptide_hits.select { |hit| peptide_to_proteins[hit.aaseq] }
    normal_pephits.each do |hit|
      # update each peptide with its protein hits
      protein_hits = peptide_to_proteins[hit.aaseq].map do |id|
        protein = all_protein_hits[id]
        # lengths/descriptions are only available when a fasta file was read
        protein.length = id_to_length[id] if id_to_length
        protein.description = id_to_desc[id] if id_to_desc
        protein
      end
      hit.experiment_name = sample
      hit.proteins = protein_hits
    end
    [sample, normal_pephits]
  end
end
178
+
179
write_subset(samplename_to_peptidehits) if opt[:write_subset]

if $VERBOSE
  samplename_to_peptidehits.each { |samplename, hits| putsv "#{samplename}: #{hits.size}" }
end

all_peptide_hits = samplename_to_peptidehits.flat_map(&:last)

# this constricts everything down to a minimal set of protein groups that
# explain the entire set of peptide hits.
update_pephits = true # ensures that each pephit is linked to the array of protein groups it is associated with
protein_groups = Mspire::Ident::ProteinGroup.peptide_hits_to_protein_groups(all_peptide_hits, update_pephits)

# sample name => one array of peptide hits per protein group (in
# protein_groups order), holding only the hits that came from that sample
hits_table_hash = samplenames.each_with_object({}) do |name, columns|
  columns[name] = protein_groups.map do |prot_group|
    prot_group.peptide_hits.select { |hit| hit.experiment_name == name }
  end
end

# The columns are the samples; each row corresponds to one protein group
# (protein group order is implicit) and each cell holds that group's
# peptide hits for that sample.
#
#  (implied)   sample1     sample2     sample3    ...
#  (group1)    [hit,hit]   [hit...]    [hit...]   ...
#  (group2)    [hit,hit]   [hit...]    [hit...]   ...
#  ...         ...         ...         ...        ...
hits_table = Ruport::Data::Table.new(
  :data => hits_table_hash.values.transpose,
  :column_names => hits_table_hash.keys
)

# spectral counts of type opt[:count_type]; every hit is weighted by the
# inverse of the number of protein groups it belongs to
counts_data = hits_table.data.map do |row|
  row.map do |pephits|
    counts = Mspire::Quant::SpectralCounts.counts(pephits) { |pephit| 1.0 / pephit.protein_groups.size }
    counts.send(opt[:count_type])
  end
end

# same layout as hits_table, but each cell holds the selected count rather
# than the hits themselves
counts_table = Ruport::Data::Table.new(:data => counts_data, :column_names => samplenames)
215
+
216
# run qspec on the counts and append its statistics as extra columns
if opt[:qspec]

  # prepare data for qspec: one [condition, column of counts] pair per sample
  condition_to_count_array = counts_table.column_names.map do |name|
    [samplename_to_condition[name], counts_table.column(name)]
  end

  # each protein group is named by its joined ids and paired with the
  # average length of the proteins in the group. The sum is converted to a
  # float first so the mean is really rounded (integer division would
  # silently truncate it before round ever ran).
  name_length_pairs = protein_groups.map do |pg|
    mean_length = (pg.map(&:length).reduce(:+).to_f / pg.size).round
    [pg.map(&:id).join(":"), mean_length]
  end

  qspec_results = Mspire::Quant::Qspec.new(name_length_pairs, condition_to_count_array).run(opt[:qspec_normalize], :keep => opt[:qspec_keep_files])

  cols_to_add = [:bayes_factor, :fold_change, :fdr]
  use_decibans = opt[:qspec_decibans]

  # with --qspec-decibans the bayes factor column is replaced by :decibans
  to_add_as_headers = cols_to_add.map do |col|
    use_decibans && col == :bayes_factor ? :decibans : col
  end
  counts_table.add_columns to_add_as_headers

  # qspec results are paired with the table rows by position
  counts_table.data.zip(qspec_results) do |row, qspec_result|
    cols_to_add.each do |cat|
      if cat == :bayes_factor && use_decibans
        row[:decibans] = 10 * Math.log10(qspec_result[cat])
      else
        row[cat] = qspec_result[cat]
      end
    end
  end
end
249
+
250
# label every row with a short name, all protein ids and a description
counts_table.add_columns([:name, :ids, :description])
counts_table.data.zip(protein_groups).each do |row, pg|
  # the protein that sorts first by [id, length] represents the group
  representative = pg.sort_by { |prot| [prot.id, prot.length] }.first
  description = representative.description

  # prefer the token following " GN=" in the description; fall back to the
  # protein id when there is no description or no such token
  gene_name = description.andand.match(/ GN=([^\s]+) ?/).andand[1]

  row.name = gene_name || representative.id
  row.ids = pg.map(&:id).join(',')
  row.description = description
end
257
+
258
+
259
# optionally write the peptide hits behind every count, one row per protein
# group, parallel to the main table
if opt[:peptides]
  hits_table.each do |record|
    record.each_with_index do |hits, i|
      # collapse the hits of a cell into "<charge>_<aaseq>:<id,id,...>"
      # entries (one per sequence/charge combination) joined by '; '
      by_ion = hits.group_by { |hit| [hit.aaseq, hit.charge] }
      entries = by_ion.map do |key, ion_hits|
        [key.reverse.join("_"), ion_hits.map(&:id).join(',')].join(":")
      end
      record[i] = entries.join('; ')
    end
  end
  hits_table.add_column_with_data(:name, counts_table.column(:name), :position => 0)
  hits_table.to_tsv(pephits_outfile, :footer => ["parallel to #{opt[:outfile]}"])
end

# The main table goes to the file chosen with --outfile (which defaults to
# the standard name), with the samples and options recorded as a footer.
footer = ["samples: #{samplename_to_filename}", "options: #{opt}"]
counts_table.to_tsv(opt[:outfile], :footer => footer)
@@ -49,6 +49,17 @@ describe 'creating a peptide centric database' do
49
49
  #File.unlink(@output_file)
50
50
  end
51
51
 
52
it 'can use a trie' do
  # build the database in trie form from the test fasta file
  Mspire::Ident::Peptide::Db::Creator.cmdline([@fasta_file, '--trie'])
  trie_base = TESTFILES + '/mspire/ident/peptide/db/uni_11_sp_tr.msd_clvg2.min_aaseq4'

  # all three trie files must have been written
  ['.trie', '.tail', '.da'].each do |suffix|
    File.exist?(trie_base + suffix).should be_true
  end

  trie = Trie.read(trie_base)
  # NOTE(review): the lookup is only printed; the assertion below is still disabled
  p trie.get('MADGSGWQPPRPCEAYR')
  #trie.get('MADGSGWQPPRPCEAYR').should == ["D3DX18"]
end
62
+
52
63
  it 'lists approved enzymes and exits' do
53
64
  output = capture_stdout do
54
65
  begin