ms-quant 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile CHANGED
@@ -29,12 +29,12 @@ Rake::TestTask.new(:spec) do |spec|
29
29
  spec.verbose = true
30
30
  end
31
31
 
32
- require 'rcov/rcovtask'
33
- Rcov::RcovTask.new do |spec|
34
- spec.libs << 'spec'
35
- spec.pattern = 'spec/**/*_spec.rb'
36
- spec.verbose = true
37
- end
32
+ #require 'rcov/rcovtask'
33
+ #Rcov::RcovTask.new do |spec|
34
+ # spec.libs << 'spec'
35
+ # spec.pattern = 'spec/**/*_spec.rb'
36
+ # spec.verbose = true
37
+ #end
38
38
 
39
39
  task :default => :spec
40
40
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.2
1
+ 0.0.3
@@ -4,37 +4,108 @@ require 'ms/ident/peptide_hit/qvalue'
4
4
  require 'ms/ident/protein_hit'
5
5
  require 'ms/ident/peptide/db'
6
6
  require 'ms/quant/spectral_counts'
7
+ require 'ms/quant/qspec'
8
+
9
+ require 'yaml'
10
+ require 'tempfile'
7
11
 
8
12
  require 'trollop'
9
13
 
14
+ # inverse from Tilo Sloboda (now in facets)
15
+
16
+ class Hash
17
+ def inverse
18
+ i = Hash.new
19
+ self.each_pair do |k,v|
20
+ if (Array === v) ; v.each{ |x| i[x] = ( i.has_key?(x) ? [k,i[x]].flatten : k ) }
21
+ else ; i[v] = ( i.has_key?(v) ? [k,i[v]].flatten : k ) end
22
+ end ; i
23
+ end
24
+ end
25
+
26
+
10
27
  def putsv(*args)
11
28
  if $VERBOSE
12
29
  puts(*args) ; $stdout.flush
13
30
  end
14
31
  end
15
32
 
33
+ def basename(file)
34
+ base = file.chomp(File.extname(file))
35
+ base=base.chomp(File.extname(base)) if File.extname(base) == '.phq'
36
+ base
37
+ end
38
+
39
+
40
+ outfile = "spectral_counts.tsv"
41
+ delimiter = "\t"
42
+
16
43
  opts = Trollop::Parser.new do
17
- banner %Q{usage: #{File.basename(__FILE__)} peptide_centric_db.yml, file1.psq ...
44
+ banner %Q{usage: #{File.basename(__FILE__)} <fasta>.peptide_centric_db.yml group1=f1.psq,f2.psq group2=f3.psq,f4.psq
45
+ or (each file a group): #{File.basename(__FILE__)} <fasta>.peptide_centric_db.yml file1.psq file2.psq ...
46
+
47
+ writes to #{outfile}
48
+ group names can be arbitrarily defined
49
+ psq is really .psq.tsv file
18
50
  }
19
- opt :names, "array of names for the table (otherwise filenames)", :type => String
20
51
  opt :fdr_percent, "%FDR as cutoff", :default => 1.0
21
- opt :write_subset, "(development) write subset db", :default => false
52
+ opt :qspec, "return qspec results (executes qspec or qspecgp). Requires :fasta. Only 2 groups currently allowed", :default => false
53
+ opt :descriptions, "include descriptions of proteins, requires :fasta", :default => false
54
+ opt :fasta, "the fasta file. Required for :qspec and :descriptions", :type => String
55
+ opt :outfile, "the to which file data are written", :default => outfile
56
+ opt :verbose, "speak up", :default => false
57
+ opt :count_type, "type of spectral counts (<spectral|aaseqcharge|aaseq>)", :default => 'spectral'
58
+ opt :qspec_normalize, "normalize spectral counts per run", :default => false
59
+ opt :write_subset, "(dev use only) write subset db", :default => false
22
60
  end
23
61
 
24
62
  opt = opts.parse(ARGV)
63
+ opt[:count_type] = opt[:count_type].to_sym
64
+
65
+ $VERBOSE = opt.delete(:verbose)
25
66
 
26
67
  if ARGV.size < 2
27
68
  opts.educate && exit
28
69
  end
29
70
 
71
+ if (opt[:qspec] || opt[:descriptions]) && !opt[:fasta]
72
+ puts "You must provide a fasta file with --fasta to use qspec or descriptions!!"
73
+ opts.educate && exit
74
+ end
75
+
30
76
  peptide_centric_db_file = ARGV.shift
77
+ raise ArgumentError, "need .yml file for peptide centric db" unless File.extname(peptide_centric_db_file) == '.yml'
78
+ putsv "using: #{peptide_centric_db_file} as peptide centric db"
79
+
80
+ # groupname => files
81
+ condition_to_samplenames = {}
82
+ samplename_to_filename = {}
83
+ ARGV.each do |arg|
84
+ (condition, files) =
85
+ if arg.include?('=')
86
+ (condition, filestring) = arg.split('=')
87
+ [condition, filestring.split(',')]
88
+ else
89
+ [basename(arg), [arg]]
90
+ end
91
+ reptag = ARGV.size
92
+ sample_to_file_pairs = files.each_with_index.map {|file,i| ["#{condition}-rep#{i+1}", file] }
93
+ sample_to_file_pairs.each {|name,file| samplename_to_filename[name] = file }
94
+ condition_to_samplenames[condition] = sample_to_file_pairs.map(&:first)
95
+ end
31
96
 
32
- opt[:names] ||= ARGV.map do |file|
33
- base = file.chomp(File.extname(file))
34
- base=base.chomp(File.extname(base)) if File.extname(base) == '.phq'
35
- base
97
+
98
+ if $VERBOSE
99
+ puts "** condition: sample_names"
100
+ puts condition_to_samplenames.to_yaml
101
+ puts "** samplename: filename"
102
+ puts samplename_to_filename.to_yaml
36
103
  end
37
104
 
105
+ raise ArgumentError, "must have 2 conditions for qspec!" if opt[:qspec] && condition_to_samplenames.size != 2
106
+
107
+ samplenames = samplename_to_filename.keys
108
+
38
109
  class Ms::Ident::PeptideHit
39
110
  attr_accessor :experiment_name
40
111
  end
@@ -42,11 +113,9 @@ fdr_cutoff = opt[:fdr_percent] / 100
42
113
 
43
114
  start=Time.now
44
115
 
45
- $VERBOSE = true
46
-
47
- ar_of_peptide_hit_ars = Ms::Ident::Peptide::Db::IO.open(peptide_centric_db_file) do |peptide_to_proteins|
116
+ ar_of_pephit_ars = Ms::Ident::Peptide::Db::IO.open(peptide_centric_db_file) do |peptide_to_proteins|
48
117
  putsv "#{Time.now-start} seconds to read #{peptide_centric_db_file}"
49
- ARGV.zip(opt[:names]).map do |file,exp|
118
+ samplename_to_filename.map do |sample, file|
50
119
  peptide_hits = Ms::Ident::PeptideHit::Qvalue.from_file(file)
51
120
  putsv "#{file}: #{peptide_hits.size} hits"
52
121
  peptide_hits.select! do |hit|
@@ -54,7 +123,7 @@ ar_of_peptide_hit_ars = Ms::Ident::Peptide::Db::IO.open(peptide_centric_db_file)
54
123
  # update each peptide with its protein hits
55
124
  prot_ids = peptide_to_proteins[hit.aaseq]
56
125
  if prot_ids
57
- hit.experiment_name = exp
126
+ hit.experiment_name = sample
58
127
  hit.proteins = prot_ids
59
128
  else ; false end
60
129
  else
@@ -65,31 +134,27 @@ ar_of_peptide_hit_ars = Ms::Ident::Peptide::Db::IO.open(peptide_centric_db_file)
65
134
  end
66
135
  end
67
136
 
68
- if opt[:write_subset]
137
+ if opt[:write_subset]
69
138
  aaseqs_to_prots = {}
70
- ar_of_peptide_hit_ars.each do |pephits|
71
- pephits.each do |pephit|
72
- aaseqs_to_prots[pephit.aaseq] = pephit.proteins
73
- end
139
+ ar_of_pephit_ars.flatten(1).each do |pephit|
140
+ aaseqs_to_prots[pephit.aaseq] = pephit.proteins
74
141
  end
75
142
  outfile = "peptidecentric_subset.yml"
76
143
  puts "writing #{outfile} with #{aaseqs_to_prots.size} aaseq->protids"
77
144
  File.open(outfile,'w') do |out|
78
145
  aaseqs_to_prots.each do |k,v|
79
- out.puts(%Q{#{k}: #{v.map(&:id).join("\t") }})
146
+ out.puts(%Q{#{k}: #{v.join("\t") }})
80
147
  end
81
148
  end
82
149
  end
83
150
 
84
- $VERBOSE = true
85
151
  if $VERBOSE
86
- opt[:names].zip(ar_of_peptide_hit_ars) do |name, pep_ar|
87
- puts "#{name}: #{pep_ar.size}"
152
+ samplenames.zip(ar_of_pephit_ars) do |samplename, pep_ar|
153
+ putsv "#{samplename}: #{pep_ar.size}"
88
154
  end
89
155
  end
90
156
 
91
- all_peptide_hits = ar_of_peptide_hit_ars.flatten(1)
92
-
157
+ all_peptide_hits = ar_of_pephit_ars.flatten(1)
93
158
 
94
159
  # because peptide_hit#proteins yields id strings (which hash properly),
95
160
  # each protein group is an array of
@@ -102,7 +167,7 @@ end
102
167
 
103
168
  # partition them all out by filename
104
169
 
105
- ar_of_count_data = opt[:names].map do |name|
170
+ counts_parallel_to_names_with_counts_per_group = samplenames.map do |name|
106
171
  pep_hit_to_prot_groups = Hash.new {|h,k| h[k] = [] }
107
172
  groups_of_pephits = protein_groups.map do |prot_group|
108
173
  pep_hits = prot_group.peptide_hits.select {|hit| hit.experiment_name == name }
@@ -115,12 +180,69 @@ ar_of_count_data = opt[:names].map do |name|
115
180
  #end
116
181
  end
117
182
 
118
- # protein_groups
119
- # [ ar_of_counts_for_exp1, ar_of_counts_for_exp2, ar_of_counts_for_exp3 ]
183
+ if opt[:qspec] || opt[:descriptions]
184
+ putsv "reading lengths and descriptions from #{opt[:fasta]}"
185
+ (id_to_length, id_to_desc) = Ms::Fasta.protein_lengths_and_descriptions(opt[:fasta])
186
+ end
187
+
188
+ samplename_to_condition = condition_to_samplenames.inverse
189
+
190
+ ### OUTPUT TABLE
191
+ header_cats = samplenames.map.to_a
192
+
193
+ ar_of_rows = counts_parallel_to_names_with_counts_per_group.map do |counts_per_group|
194
+ counts_per_group.map(&opt[:count_type])
195
+ end.transpose
120
196
 
121
- protein_groups.zip(*ar_of_count_data) do |row|
122
- pg = row.shift
123
- puts (row.map(&:to_a).flatten + pg.to_a).join("\t")
197
+ if opt[:qspec]
198
+ all_conditions = samplenames.map {|sn| samplename_to_condition[sn] }
199
+ condition_to_count_array = all_conditions.zip(counts_parallel_to_names_with_counts_per_group).map do |condition, counts_par_groups|
200
+ [condition, counts_par_groups.map(&opt[:count_type])]
201
+ end
202
+
203
+ name_length_pairs = protein_groups.map do |pg|
204
+ # prefer swissprot (sp) proteins over TrEMBL (tr) and shorter protein
205
+ # lengths over longer lengths
206
+ best_guess_protein_id = pg.sort_by {|prot_id| [prot_id, -id_to_length[prot_id]] }.first
207
+ length = id_to_length[best_guess_protein_id]
208
+ [pg.join(":"), length]
209
+ end
210
+
211
+ putsv "qspec to normalize counts: #{opt[:qspec_normalize]}"
212
+ qspec_results = Ms::Quant::Qspec.new(name_length_pairs, condition_to_count_array).run(opt[:qspec_normalize])
213
+
214
+ to_add = [:fdr, :bayes_factor, :fold_change]
215
+ header_cats.push(*to_add)
216
+ qspec_results.zip(ar_of_rows) do |zipped|
217
+ (result, row) = zipped
218
+ row.push(*to_add.map {|v| result.send(v) })
219
+ end
124
220
  end
125
221
 
222
+ header_cats.push( *%w(BestID AllIDs) )
223
+ header_cats.push( 'Description' ) if opt[:descriptions]
224
+
225
+ protein_groups.zip(ar_of_rows) do |zipped|
226
+ (pg, row) = zipped
227
+ # swiss-prot and then the shortest
228
+ best_protid = pg.sort_by {|prot_id| [prot_id, -id_to_length[prot_id]] }.first
229
+ (gene_id, desc) =
230
+ if opt[:descriptions]
231
+ desc = id_to_desc[best_protid]
232
+ gene_id = (md=desc.match(/ GN=(\w+) ?/)) ? md[1] : best_protid
233
+ [gene_id, desc]
234
+ else
235
+ [best_protid, nil]
236
+ end
237
+ row << gene_id << pg.join(',')
238
+ row.push(desc) if desc
239
+ end
240
+
241
+ ### SORT???
242
+
243
+ File.open(opt[:outfile],'w') do |out|
244
+ out.puts header_cats.join(delimiter)
245
+ ar_of_rows.each {|row| out.puts row.join(delimiter) }
246
+ putsv "wrote: #{opt[:outfile]}"
247
+ end
126
248
 
@@ -0,0 +1,91 @@
1
+ module Ms ; end
2
+ module Ms::Quant ; end
3
+
4
+ class Ms::Quant::Qspec
5
+
6
+ NBURNIN = 50 # need to check
7
+ NITER = 2000 # check
8
+ INIT_HEADER = %w(protid protLen)
9
+ DELIMITER = "\t"
10
+
11
+ # takes an ordered list of conditions ['cond1', 'cond1', 'cond2', 'cond2'] and
12
+ # returns a parallel array of ints, e.g. [0,0,1,1] (incremented each time the condition changes)
13
+ def self.conditions_to_ints(conditions)
14
+ i = 0
15
+ current_condition = conditions.first
16
+ conditions.map do |cond|
17
+ if current_condition == cond ; i
18
+ else
19
+ i += 1
20
+ current_condition = cond
21
+ i
22
+ end
23
+ end
24
+ end
25
+
26
+ # returns an array of Results structs, one for each data row of the results file
27
+ # works with V2 of QSpec
28
+ def self.results_array(resultsfile)
29
+ rows = IO.readlines(resultsfile).map {|line| line.chomp.split("\t") }
30
+ headers = rows.shift
31
+ rows.map do |row|
32
+ data = [row[0]]
33
+ data.push( *row[1,2].map(&:to_i) )
34
+ data.push( *row[3,4].map(&:to_f) )
35
+ data.push( row[7] )
36
+ Results.new(*data)
37
+ end
38
+ end
39
+
40
+ # returns the right executable based on the array of conditions
41
+ def self.executable(conditions)
42
+ biggest_size = conditions.group_by {|v| v }.values.map(&:size).max
43
+ (biggest_size >= 3) ? 'qspecgp' : 'qspec'
44
+ end
45
+
46
+ # protname_length_pairs is an array of doublets: [protname, length]
47
+ # condition_to_count_array is an array of doublets: [condition, array_of_counts]
48
+ def initialize(protname_length_pairs, condition_to_count_array)
49
+ @protname_length_pairs = protname_length_pairs
50
+ @condition_to_count_array = condition_to_count_array
51
+ end
52
+
53
+ def conditions
54
+ @condition_to_count_array.map(&:first)
55
+ end
56
+
57
+ # writes a qspec formatted file to filename
58
+ def write(filename)
59
+ ints = Ms::Quant::Qspec.conditions_to_ints(conditions)
60
+ header_cats = INIT_HEADER + ints
61
+ rows = @protname_length_pairs.map {|pair| pair.map.to_a }
62
+ @condition_to_count_array.each do |cond,counts|
63
+ rows.zip(counts) {|row,cnt| row << cnt }
64
+ end
65
+ File.open(filename,'w') do |out|
66
+ out.puts header_cats.join(DELIMITER)
67
+ rows.each {|row| out.puts row.join(DELIMITER) }
68
+ end
69
+ end
70
+
71
+ def run(normalize=true, opts={})
72
+ puts "normalize: #{normalize}" if $VERBOSE
73
+ tfile = Tempfile.new("qspec")
74
+ write(tfile.path)
75
+ qspec_exe = self.class.executable(conditions)
76
+ cmd = [qspec_exe, tfile.path, NBURNIN, NITER, (normalize ? 1 : 0)].join(' ')
77
+ puts "running #{cmd}" if $VERBOSE
78
+ reply = `#{cmd}`
79
+ puts reply if $VERBOSE
80
+ results = self.class.results_array(tfile.path + Results::EXT)
81
+ tfile.unlink
82
+ results
83
+ end
84
+
85
+ # for version 2 of QSpec
86
+ Results = Struct.new(:protid, :set0, :set1, :bayes_factor, :fold_change, :rb_stat, :fdr, :flag)
87
+ class Results
88
+ EXT = '_qspec'
89
+ end
90
+ end
91
+
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 0
8
- - 2
9
- version: 0.0.2
8
+ - 3
9
+ version: 0.0.3
10
10
  platform: ruby
11
11
  authors:
12
12
  - John T. Prince
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-03-30 00:00:00 -06:00
17
+ date: 2011-03-31 00:00:00 -06:00
18
18
  default_executable: peptide_hit_qvalues_to_spectral_counts_table.rb
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -90,6 +90,7 @@ files:
90
90
  - VERSION
91
91
  - bin/peptide_hit_qvalues_to_spectral_counts_table.rb
92
92
  - lib/ms-quant.rb
93
+ - lib/ms/quant/qspec.rb
93
94
  - lib/ms/quant/spectral_counts.rb
94
95
  - spec/ms/quant/spectral_counts_spec.rb
95
96
  - spec/spec_helper.rb