ms-quant 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile CHANGED
@@ -29,12 +29,12 @@ Rake::TestTask.new(:spec) do |spec|
29
29
  spec.verbose = true
30
30
  end
31
31
 
32
- require 'rcov/rcovtask'
33
- Rcov::RcovTask.new do |spec|
34
- spec.libs << 'spec'
35
- spec.pattern = 'spec/**/*_spec.rb'
36
- spec.verbose = true
37
- end
32
+ #require 'rcov/rcovtask'
33
+ #Rcov::RcovTask.new do |spec|
34
+ # spec.libs << 'spec'
35
+ # spec.pattern = 'spec/**/*_spec.rb'
36
+ # spec.verbose = true
37
+ #end
38
38
 
39
39
  task :default => :spec
40
40
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.2
1
+ 0.0.3
@@ -4,37 +4,108 @@ require 'ms/ident/peptide_hit/qvalue'
4
4
  require 'ms/ident/protein_hit'
5
5
  require 'ms/ident/peptide/db'
6
6
  require 'ms/quant/spectral_counts'
7
+ require 'ms/quant/qspec'
8
+
9
+ require 'yaml'
10
+ require 'tempfile'
7
11
 
8
12
  require 'trollop'
9
13
 
14
+ # inverse from Tilo Sloboda (now in facets)
15
+
16
class Hash
  # Returns a new Hash mapping each value of the receiver back to its key.
  #
  # A value that is an Array is expanded: each of its elements becomes a key
  # of the result.  When several keys of the receiver lead to the same
  # inverted key, the result holds a flattened Array of those keys with the
  # most recently visited key first.
  def inverse
    each_with_object({}) do |(key, value), inverted|
      targets = value.is_a?(Array) ? value : [value]
      targets.each do |target|
        inverted[target] =
          if inverted.key?(target)
            [key, inverted[target]].flatten
          else
            key
          end
      end
    end
  end
end
25
+
26
+
10
27
  def putsv(*args)
11
28
  if $VERBOSE
12
29
  puts(*args) ; $stdout.flush
13
30
  end
14
31
  end
15
32
 
33
# Derives a default group/condition name from a file name.
#
# Removes the final extension and then, if what remains ends in '.phq' or
# '.psq', removes that inner extension as well (so both 'f1.phq.tsv' and
# 'f1.psq.tsv' give 'f1').  The usage banner documents the inputs as
# '.psq' / '.psq.tsv' files, so that spelling is handled alongside '.phq'.
# Any leading directory part of +file+ is kept.
#
# file - the file name (String)
#
# Returns the shortened name (String).
def basename(file)
  base = file.chomp(File.extname(file))
  inner_ext = File.extname(base)
  %w(.phq .psq).include?(inner_ext) ? base.chomp(inner_ext) : base
end
38
+
39
+
40
+ outfile = "spectral_counts.tsv"
41
+ delimiter = "\t"
42
+
16
43
  opts = Trollop::Parser.new do
17
- banner %Q{usage: #{File.basename(__FILE__)} peptide_centric_db.yml, file1.psq ...
44
+ banner %Q{usage: #{File.basename(__FILE__)} <fasta>.peptide_centric_db.yml group1=f1.psq,f2.psq group2=f3.psq,f4.psq
45
+ or (each file a group): #{File.basename(__FILE__)} <fasta>.peptide_centric_db.yml file1.psq file2.psq ...
46
+
47
+ writes to #{outfile}
48
+ group names can be arbitrarily defined
49
+ psq is really .psq.tsv file
18
50
  }
19
- opt :names, "array of names for the table (otherwise filenames)", :type => String
20
51
  opt :fdr_percent, "%FDR as cutoff", :default => 1.0
21
- opt :write_subset, "(development) write subset db", :default => false
52
+ opt :qspec, "return qspec results (executes qspec or qspecgp). Requires :fasta. Only 2 groups currently allowed", :default => false
53
+ opt :descriptions, "include descriptions of proteins, requires :fasta", :default => false
54
+ opt :fasta, "the fasta file. Required for :qspec and :descriptions", :type => String
55
+ opt :outfile, "the to which file data are written", :default => outfile
56
+ opt :verbose, "speak up", :default => false
57
+ opt :count_type, "type of spectral counts (<spectral|aaseqcharge|aaseq>)", :default => 'spectral'
58
+ opt :qspec_normalize, "normalize spectral counts per run", :default => false
59
+ opt :write_subset, "(dev use only) write subset db", :default => false
22
60
  end
23
61
 
24
62
  opt = opts.parse(ARGV)
63
+ opt[:count_type] = opt[:count_type].to_sym
64
+
65
+ $VERBOSE = opt.delete(:verbose)
25
66
 
26
67
  if ARGV.size < 2
27
68
  opts.educate && exit
28
69
  end
29
70
 
71
+ if (opt[:qspec] || opt[:descriptions]) && !opt[:fasta]
72
+ puts "You must provide a fasta file with --fasta to use qspec or descriptions!!"
73
+ opts.educate && exit
74
+ end
75
+
30
76
  peptide_centric_db_file = ARGV.shift
77
+ raise ArgumentError, "need .yml file for peptide centric db" unless File.extname(peptide_centric_db_file) == '.yml'
78
+ putsv "using: #{peptide_centric_db_file} as peptide centric db"
79
+
80
+ # groupname => files
81
+ condition_to_samplenames = {}
82
+ samplename_to_filename = {}
83
+ ARGV.each do |arg|
84
+ (condition, files) =
85
+ if arg.include?('=')
86
+ (condition, filestring) = arg.split('=')
87
+ [condition, filestring.split(',')]
88
+ else
89
+ [basename(arg), [arg]]
90
+ end
91
+ reptag = ARGV.size
92
+ sample_to_file_pairs = files.each_with_index.map {|file,i| ["#{condition}-rep#{i+1}", file] }
93
+ sample_to_file_pairs.each {|name,file| samplename_to_filename[name] = file }
94
+ condition_to_samplenames[condition] = sample_to_file_pairs.map(&:first)
95
+ end
31
96
 
32
- opt[:names] ||= ARGV.map do |file|
33
- base = file.chomp(File.extname(file))
34
- base=base.chomp(File.extname(base)) if File.extname(base) == '.phq'
35
- base
97
+
98
+ if $VERBOSE
99
+ puts "** condition: sample_names"
100
+ puts condition_to_samplenames.to_yaml
101
+ puts "** samplename: filename"
102
+ puts samplename_to_filename.to_yaml
36
103
  end
37
104
 
105
+ raise ArgumentError, "must have 2 conditions for qspec!" if opt[:qspec] && condition_to_samplenames.size != 2
106
+
107
+ samplenames = samplename_to_filename.keys
108
+
38
109
  class Ms::Ident::PeptideHit
39
110
  attr_accessor :experiment_name
40
111
  end
@@ -42,11 +113,9 @@ fdr_cutoff = opt[:fdr_percent] / 100
42
113
 
43
114
  start=Time.now
44
115
 
45
- $VERBOSE = true
46
-
47
- ar_of_peptide_hit_ars = Ms::Ident::Peptide::Db::IO.open(peptide_centric_db_file) do |peptide_to_proteins|
116
+ ar_of_pephit_ars = Ms::Ident::Peptide::Db::IO.open(peptide_centric_db_file) do |peptide_to_proteins|
48
117
  putsv "#{Time.now-start} seconds to read #{peptide_centric_db_file}"
49
- ARGV.zip(opt[:names]).map do |file,exp|
118
+ samplename_to_filename.map do |sample, file|
50
119
  peptide_hits = Ms::Ident::PeptideHit::Qvalue.from_file(file)
51
120
  putsv "#{file}: #{peptide_hits.size} hits"
52
121
  peptide_hits.select! do |hit|
@@ -54,7 +123,7 @@ ar_of_peptide_hit_ars = Ms::Ident::Peptide::Db::IO.open(peptide_centric_db_file)
54
123
  # update each peptide with its protein hits
55
124
  prot_ids = peptide_to_proteins[hit.aaseq]
56
125
  if prot_ids
57
- hit.experiment_name = exp
126
+ hit.experiment_name = sample
58
127
  hit.proteins = prot_ids
59
128
  else ; false end
60
129
  else
@@ -65,31 +134,27 @@ ar_of_peptide_hit_ars = Ms::Ident::Peptide::Db::IO.open(peptide_centric_db_file)
65
134
  end
66
135
  end
67
136
 
68
- if opt[:write_subset]
137
+ if opt[:write_subset]
69
138
  aaseqs_to_prots = {}
70
- ar_of_peptide_hit_ars.each do |pephits|
71
- pephits.each do |pephit|
72
- aaseqs_to_prots[pephit.aaseq] = pephit.proteins
73
- end
139
+ ar_of_pephit_ars.flatten(1).each do |pephit|
140
+ aaseqs_to_prots[pephit.aaseq] = pephit.proteins
74
141
  end
75
142
  outfile = "peptidecentric_subset.yml"
76
143
  puts "writing #{outfile} with #{aaseqs_to_prots.size} aaseq->protids"
77
144
  File.open(outfile,'w') do |out|
78
145
  aaseqs_to_prots.each do |k,v|
79
- out.puts(%Q{#{k}: #{v.map(&:id).join("\t") }})
146
+ out.puts(%Q{#{k}: #{v.join("\t") }})
80
147
  end
81
148
  end
82
149
  end
83
150
 
84
- $VERBOSE = true
85
151
  if $VERBOSE
86
- opt[:names].zip(ar_of_peptide_hit_ars) do |name, pep_ar|
87
- puts "#{name}: #{pep_ar.size}"
152
+ samplenames.zip(ar_of_pephit_ars) do |samplename, pep_ar|
153
+ putsv "#{samplename}: #{pep_ar.size}"
88
154
  end
89
155
  end
90
156
 
91
- all_peptide_hits = ar_of_peptide_hit_ars.flatten(1)
92
-
157
+ all_peptide_hits = ar_of_pephit_ars.flatten(1)
93
158
 
94
159
  # because peptide_hit#proteins yields id strings (which hash properly),
95
160
  # each protein group is an array of
@@ -102,7 +167,7 @@ end
102
167
 
103
168
  # partition them all out by filename
104
169
 
105
- ar_of_count_data = opt[:names].map do |name|
170
+ counts_parallel_to_names_with_counts_per_group = samplenames.map do |name|
106
171
  pep_hit_to_prot_groups = Hash.new {|h,k| h[k] = [] }
107
172
  groups_of_pephits = protein_groups.map do |prot_group|
108
173
  pep_hits = prot_group.peptide_hits.select {|hit| hit.experiment_name == name }
@@ -115,12 +180,69 @@ ar_of_count_data = opt[:names].map do |name|
115
180
  #end
116
181
  end
117
182
 
118
- # protein_groups
119
- # [ ar_of_counts_for_exp1, ar_of_counts_for_exp2, ar_of_counts_for_exp3 ]
183
+ if opt[:qspec] || opt[:descriptions]
184
+ putsv "reading lengths and descriptions from #{opt[:fasta]}"
185
+ (id_to_length, id_to_desc) = Ms::Fasta.protein_lengths_and_descriptions(opt[:fasta])
186
+ end
187
+
188
+ samplename_to_condition = condition_to_samplenames.inverse
189
+
190
+ ### OUTPUT TABLE
191
+ header_cats = samplenames.map.to_a
192
+
193
+ ar_of_rows = counts_parallel_to_names_with_counts_per_group.map do |counts_per_group|
194
+ counts_per_group.map(&opt[:count_type])
195
+ end.transpose
120
196
 
121
- protein_groups.zip(*ar_of_count_data) do |row|
122
- pg = row.shift
123
- puts (row.map(&:to_a).flatten + pg.to_a).join("\t")
197
+ if opt[:qspec]
198
+ all_conditions = samplenames.map {|sn| samplename_to_condition[sn] }
199
+ condition_to_count_array = all_conditions.zip(counts_parallel_to_names_with_counts_per_group).map do |condition, counts_par_groups|
200
+ [condition, counts_par_groups.map(&opt[:count_type])]
201
+ end
202
+
203
+ name_length_pairs = protein_groups.map do |pg|
204
+ # prefer Swiss-Prot (sp) proteins over TrEMBL (tr) and shorter protein
205
+ # lengths over longer lengths
206
+ best_guess_protein_id = pg.sort_by {|prot_id| [prot_id, -id_to_length[prot_id]] }.first
207
+ length = id_to_length[best_guess_protein_id]
208
+ [pg.join(":"), length]
209
+ end
210
+
211
+ putsv "qspec to normalize counts: #{opt[:qspec_normalize]}"
212
+ qspec_results = Ms::Quant::Qspec.new(name_length_pairs, condition_to_count_array).run(opt[:qspec_normalize])
213
+
214
+ to_add = [:fdr, :bayes_factor, :fold_change]
215
+ header_cats.push(*to_add)
216
+ qspec_results.zip(ar_of_rows) do |zipped|
217
+ (result, row) = zipped
218
+ row.push(*to_add.map {|v| result.send(v) })
219
+ end
124
220
  end
125
221
 
222
+ header_cats.push( *%w(BestID AllIDs) )
223
+ header_cats.push( 'Description' ) if opt[:descriptions]
224
+
225
+ protein_groups.zip(ar_of_rows) do |zipped|
226
+ (pg, row) = zipped
227
+ # swiss-prot and then the shortest
228
+ best_protid = pg.sort_by {|prot_id| [prot_id, -id_to_length[prot_id]] }.first
229
+ (gene_id, desc) =
230
+ if opt[:descriptions]
231
+ desc = id_to_desc[best_protid]
232
+ gene_id = (md=desc.match(/ GN=(\w+) ?/)) ? md[1] : best_protid
233
+ [gene_id, desc]
234
+ else
235
+ [best_protid, nil]
236
+ end
237
+ row << gene_id << pg.join(',')
238
+ row.push(desc) if desc
239
+ end
240
+
241
+ ### SORT???
242
+
243
+ File.open(opt[:outfile],'w') do |out|
244
+ out.puts header_cats.join(delimiter)
245
+ ar_of_rows.each {|row| out.puts row.join(delimiter) }
246
+ putsv "wrote: #{opt[:outfile]}"
247
+ end
126
248
 
@@ -0,0 +1,91 @@
1
require 'tempfile'

module Ms ; end
module Ms::Quant ; end

# Thin wrapper around the external 'qspec' / 'qspecgp' command-line programs:
# writes a tab-delimited input table, runs the executable on it and parses
# the "<input>_qspec" results file into Results structs.
class Ms::Quant::Qspec

  # Passed as the 2nd and 3rd numeric command-line arguments.
  # NOTE(review): presumably burn-in and iteration counts -- confirm against
  # the QSpec documentation.
  NBURNIN = 50 # need to check
  NITER = 2000 # check
  # The two leading column names of the input table.
  INIT_HEADER = %w(protid protLen)
  DELIMITER = "\t"

  # Takes a list of conditions ['cond1', 'cond1', 'cond2', 'cond2'] and
  # returns an array of ints [0, 0, 1, 1].
  #
  # Each distinct condition gets the next integer in order of first
  # appearance, so a condition that shows up again later (non-adjacent) is
  # given the same integer it had before.  This matches .executable, which
  # also groups conditions by value rather than by position.
  def self.conditions_to_ints(conditions)
    ids = {}
    conditions.map {|cond| ids[cond] ||= ids.size }
  end

  # Returns an array of Results structs, one per data row of +resultsfile+
  # (the first line is treated as a header and discarded; blank lines are
  # skipped).  Works with V2 of QSpec.
  #
  # Column 0 is kept as a String, columns 1-2 are converted with to_i,
  # columns 3-6 with to_f, and column 7 is kept as a String.
  def self.results_array(resultsfile)
    lines = File.readlines(resultsfile).map(&:chomp)
    lines.shift # header row
    lines.reject {|line| line.strip.empty? }.map do |line|
      row = line.split(DELIMITER)
      Results.new(row[0], *row[1,2].map(&:to_i), *row[3,4].map(&:to_f), row[7])
    end
  end

  # Returns the name of the executable to use based on the array of
  # conditions: 'qspecgp' if any condition occurs 3 or more times, otherwise
  # 'qspec' (including for an empty list).
  def self.executable(conditions)
    biggest_size = conditions.group_by {|v| v }.values.map(&:size).max || 0
    (biggest_size >= 3) ? 'qspecgp' : 'qspec'
  end

  # protname_length_pairs is an array of doublets: [protname, length]
  # condition_to_count_array is an array doublets: [condition, array_of_counts]
  def initialize(protname_length_pairs, condition_to_count_array)
    @protname_length_pairs = protname_length_pairs
    @condition_to_count_array = condition_to_count_array
  end

  # The condition of each count column, in column order.
  def conditions
    @condition_to_count_array.map(&:first)
  end

  # Writes a qspec formatted file to filename: a header row (INIT_HEADER
  # followed by the integer label of each count column) and then one row per
  # protein holding its name, its length and one count per column.
  def write(filename)
    header_cats = INIT_HEADER + self.class.conditions_to_ints(conditions)
    # copy each pair so the caller's arrays are not modified by the appends
    rows = @protname_length_pairs.map {|pair| pair.to_a.dup }
    @condition_to_count_array.each do |_cond, counts|
      rows.zip(counts) {|row, cnt| row << cnt }
    end
    File.open(filename, 'w') do |out|
      out.puts header_cats.join(DELIMITER)
      rows.each {|row| out.puts row.join(DELIMITER) }
    end
  end

  # Writes the data to a temporary file, runs the executable chosen by
  # .executable on it and returns the parsed results (array of Results).
  #
  # normalize - passed to the executable as 1 (truthy) or 0 (falsy)
  # opts      - currently unused; kept for interface compatibility
  #
  # The temporary input file and the results file written next to it are
  # removed before returning, even if running or parsing raises.
  def run(normalize=true, opts={})
    puts "normalize: #{normalize}" if $VERBOSE
    tfile = Tempfile.new("qspec")
    results_path = tfile.path + Results::EXT
    begin
      tfile.close # only the path is needed; #write reopens it by name
      write(tfile.path)
      qspec_exe = self.class.executable(conditions)
      argv = [qspec_exe, tfile.path, NBURNIN, NITER, (normalize ? 1 : 0)].map(&:to_s)
      puts "running #{argv.join(' ')}" if $VERBOSE
      # argument-array form: no shell involved, so a temp path containing
      # spaces or shell metacharacters is passed through intact
      reply = IO.popen(argv, &:read)
      puts reply if $VERBOSE
      self.class.results_array(results_path)
    ensure
      File.delete(results_path) if File.exist?(results_path)
      tfile.close!
    end
  end

  # for version 2 of QSpec
  Results = Struct.new(:protid, :set0, :set1, :bayes_factor, :fold_change, :rb_stat, :fdr, :flag)
  class Results
    # suffix appended to the input file's path to get the results file's path
    EXT = '_qspec'
  end
end
91
+
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 0
8
- - 2
9
- version: 0.0.2
8
+ - 3
9
+ version: 0.0.3
10
10
  platform: ruby
11
11
  authors:
12
12
  - John T. Prince
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-03-30 00:00:00 -06:00
17
+ date: 2011-03-31 00:00:00 -06:00
18
18
  default_executable: peptide_hit_qvalues_to_spectral_counts_table.rb
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -90,6 +90,7 @@ files:
90
90
  - VERSION
91
91
  - bin/peptide_hit_qvalues_to_spectral_counts_table.rb
92
92
  - lib/ms-quant.rb
93
+ - lib/ms/quant/qspec.rb
93
94
  - lib/ms/quant/spectral_counts.rb
94
95
  - spec/ms/quant/spectral_counts_spec.rb
95
96
  - spec/spec_helper.rb