cwords 0.1.11 → 0.1.12

Sign up to get free protection for your applications and to get access to all the features.
@@ -6,9 +6,12 @@
6
6
  ###
7
7
 
8
8
  srcdir = File.dirname(__FILE__)
9
- $LOAD_PATH << srcdir + '/../lib/'
9
+ basedir = srcdir + "/../"
10
+ libdir = basedir + '/lib/'
11
+ $LOAD_PATH << libdir
10
12
 
11
13
  require 'wordRS-lib.rb'
14
+ require 'rubygems'
12
15
  require 'progressbar'
13
16
  require 'optparse'
14
17
  require 'pp'
@@ -324,6 +327,9 @@ end
324
327
  ### Main
325
328
  ###
326
329
 
330
+ puts ">> Parameters"
331
+ options.each{|k,v| puts sprintf("%-20s: %s",k,v) if !v.nil?}
332
+
327
333
  aw = Hash.new()
328
334
  if options[:fdr]
329
335
  IO.readlines(options[:wordfile]).select{|x| x.split(' ')[5].to_f < options[:fdr]}[0,options[:top]+1][1..-1].each_with_index do |wline,idx|
@@ -387,7 +393,7 @@ end
387
393
  #pp clusters.select{|x| x.size > 2}
388
394
  #pp clusters
389
395
  wa = aw.invert
390
- resc = clusters.select{|x| x.size >= 3}
396
+ resc = clusters.select{|x| x.size >= 3} #2
391
397
  resc.each{|cl| print_cluster(cl,wa);puts "\n"}
392
398
 
393
399
  puts "Found #{resc.size} word clusters."
@@ -10,6 +10,7 @@ require 'rubygems'
10
10
  require 'progressbar'
11
11
  require 'optparse'
12
12
  require 'java'
13
+ require 'pp'
13
14
  require libdir + 'ushuffle.jar'
14
15
  java_import 'UShuffle'
15
16
 
@@ -82,7 +83,7 @@ end
82
83
  # mandatory parameters
83
84
  [:rankfile].each{|p| show_help("option '#{p}' mandatory") if options[p].nil?}
84
85
  show_help("db or seqfile required") if !(options[:db] or options[:seqfile])
85
- show_help("scoring scheme must be one of: obs,bin,pval") if !(['obs','bin','pval'].include?(options[:scoring_scheme]))
86
+ show_help("scoring scheme must be one of: obs,bin,pval") if !(['obs','bin','pval','pval2'].include?(options[:scoring_scheme]))
86
87
 
87
88
  testing = options[:testing]
88
89
 
@@ -100,6 +101,7 @@ sequences = nil
100
101
  nwords = options[:wordsize].map{|x| 4**x}.to_statarray.sum
101
102
  bg=options[:bg] # TODO, make option
102
103
  threads=options[:threads]
104
+ nperms=options[:permutations]
103
105
 
104
106
  ###
105
107
  ### Main program
@@ -115,19 +117,27 @@ IO.readlines(annofile).each{|l| word_annotation[l.split("\t")[0]] = l.split("\t"
115
117
  # read optional sequences
116
118
  if options[:seqfile]
117
119
  puts "\n>> reading sequences ..."
118
- sequences = Hash.new
119
- IO.readlines(options[:seqfile],">")[1..-1].each do |entry|
120
- ls = entry.split("\n").map{|x| x.chomp}
121
- # hash ensures sequence ids unique
122
- sequences[ls[0]] = ls[1..-2].join('').downcase.gsub('u','t') # last field is ">"
123
- end
120
+ sequences = Hash.new # id => seq
121
+ IO.readlines(options[:seqfile],">")[1..-1].each do |s|
122
+ ff = s.split("\n").map{|x| x.chomp}
123
+ id = ff.shift
124
+ seq = ff[0..-2].join('').downcase.gsub('u','t')
125
+ seq += ff[-1] if ff[-1] != '>' # check if last field is ">"
126
+ # next if not nucleotide sequence, i.e. "unavailable"
127
+ next if (seq.split('').uniq - ['a','c','g','t']).size > 0
128
+ next if seq.size < 50 # lower bound
129
+ # hash ensures sequence ids are unique
130
+ sequences[id] = seq
131
+ end
124
132
  seqshuffles = options[:seqshuffles]
125
133
  end
126
134
 
127
135
  # initialize word id hash, word sequence => word id (0..nwords-1)
128
136
  wids = Hash.new
129
- i = 0
130
- options[:wordsize].each{|ws| ['a','g','c','t'].rep_perm(ws) {|seqa| wids[seqa.join('')]=i ; i+=1 }}
137
+ begin
138
+ wi = 0
139
+ options[:wordsize].each{|ws| ['a','g','c','t'].rep_perm(ws) {|seqa| wids[seqa.join('')]=wi ; wi+=1 }}
140
+ end
131
141
 
132
142
  ###
133
143
  ### ID mapping
@@ -277,22 +287,26 @@ analyze.each do |set,nm|
277
287
  perms = []
278
288
  report = []
279
289
  pfdrz = []
280
-
290
+
291
+ report = Array.new(nwords)
292
+ pfdrz = Array.new(nwords*nperms)
293
+
281
294
  franks = Hash.new # tid => index in set
282
295
  set.each_with_index{|x,i| franks[x[0]] = i}
283
296
 
284
297
  puts "permuting #{options[:permutations]} times ...\n"
285
- options[:permutations].times{|i| perms << (0..set.size-1).to_a.shuffle}
298
+ nperms.times{|i| perms << (0..set.size-1).to_a.shuffle}
286
299
 
287
300
  pbar = ProgressBar.new("progress",nwords)
288
- wids.to_a.sort_by{|x| x[1]}.threach(threads) do |word,wid|
301
+
302
+ wids.to_a.threach(threads) do |word,wid|
289
303
  pbar.inc
290
304
  next if options[:onlyanno] and not word_annotation.key?(word) #only process annotated words
291
305
  next if options[:plot_words] and !options[:plot_words].include?(word)
292
306
 
293
307
  plotfile = File.new(rankfilename + ".#{word}.#{nm}.csv","w") if options[:plot_words]
294
-
295
- score = Array.new(ngenes) # scores ordered by fold change
308
+
309
+ score = Array.new(ngenes,0) # scores ordered by fold change
296
310
 
297
311
  if sequences
298
312
  score = set.map{|x| wordscores[allorder[x[0]]][wid]}
@@ -303,16 +317,20 @@ analyze.each do |set,nm|
303
317
  when "bin" then wordcounts.each{|id,obs,gte_obs,exp| score[franks[id]] = obs.to_i == 0 ? -1 : 1}
304
318
  when "obs" then wordcounts.each{|id,obs,gte_obs,exp| score[franks[id]] = obs.to_f}
305
319
  when "pval" then wordcounts.each{|id,obs,gte_obs,exp| score[franks[id]] = -Math.log((gte_obs.to_f+1)/(seqshuffles+1.0))}
320
+ when "pval2" then wordcounts.each{|id,obs,gte_obs,exp| score[franks[id]] = gte_obs.to_f}
306
321
  end
307
322
  end
308
323
 
324
+ # center scores
309
325
  smean = score.to_statarray.mean
326
+ score.map!{|x| x-smean}
327
+
310
328
  maxrs = 0
311
329
  leading_edge = 0
312
330
  rs = 0 #running sum
313
331
  rsa = [0]
314
332
  score.each_with_index do |x,i|
315
- rs += (x-smean)
333
+ rs += x
316
334
  rsa << rs
317
335
  if rs.abs > maxrs.abs
318
336
  maxrs = rs
@@ -333,7 +351,7 @@ analyze.each do |set,nm|
333
351
  prsa = [0]
334
352
  pmaxrs = 0
335
353
  psa.each do |i|
336
- prs += score[i]-smean
354
+ prs += score[i]
337
355
  prsa << prs
338
356
  pmaxrs = prs if prs.abs > pmaxrs.abs
339
357
  end
@@ -348,14 +366,12 @@ analyze.each do |set,nm|
348
366
  #Because the word zscore distr. can be quite different,
349
367
  # we compute the deviation from the mean of the absolute dist.
350
368
  # The permuted maxRS should be normally distr. (sum of random numbers)
351
- pfdrz += pmaxrs_pos.map{|x| (x-pmean)/pstd}
352
-
353
- #pvalue and fdr statistic for word is also computed based on abs. dist.
354
- pval = (pmaxrs_pos.select{|x| x>=maxrs}.size+1.0)/(pmaxrs_pos.size+1)
369
+ poffset = wid*nperms
370
+ pmaxrs_pos.map{|x| (x-pmean)/pstd}.each_with_index{|v,j| pfdrz[poffset+j] = v}
371
+
355
372
  zsc = (maxrs-pmean)/pstd
356
-
357
373
  plotfile.close if options[:plot_words]
358
- report << [wid,zsc,pval,nil,leading_edge]
374
+ report[wid] = [wid,zsc,nil,leading_edge]
359
375
 
360
376
  end # wordsize
361
377
  pbar.finish
@@ -363,31 +379,31 @@ analyze.each do |set,nm|
363
379
  ###
364
380
  ### FDR
365
381
  ###
366
-
367
- puts "fdr calculation ..."
368
- fdrrank = pfdrz.map{|x| [x,nil]} # [zscore,word_report_index]
369
- report.each_with_index{|x,idx| fdrrank << [x[1],idx]}
370
- fdrrank = fdrrank.sort_by{|x| x[0]}.reverse # sort high zscore to low zscore
371
- nfp = pfdrz.size.to_f
372
- ntp = report.size.to_f
373
- word_fdrrank = Hash.new()
374
- ifp = 0
375
- itp = 0
376
- fdrrank.each do |zsc,idx|
377
- if idx.nil?
378
- ifp += 1
379
- else
380
- itp += 1
381
- fpr = ifp/nfp
382
- tpr = itp/ntp
383
- report[idx][3] = fpr/tpr
382
+
383
+ begin
384
+ puts "\n>> Estimating FDR ..."
385
+ report.compact! # remove nil entries
386
+ pfdrz.compact!
387
+ fdrrank = pfdrz.map{|x| [x,nil]} # [zscore,word_report_index]
388
+ report.each_with_index{|x,idx| fdrrank << [x[1],idx]}
389
+ fdrrank = fdrrank.sort_by{|x| x[0]}.reverse # sort high zscore to low zscore
390
+ nfp, ntp = pfdrz.size.to_f, report.size.to_f
391
+ ifp, itp = 0, 0
392
+ fdrrank.each do |zsc,idx|
393
+ if idx.nil?
394
+ ifp += 1
395
+ else
396
+ itp += 1
397
+ fpr, tpr = ifp/nfp, itp/ntp
398
+ report[idx][2] = fpr/tpr
399
+ end
384
400
  end
401
+
402
+ cutoff_fdr = [0.001,0.005,0.01,0.05,0.1,0.15,0.2,0.25,0.5]
403
+ puts ""
404
+ puts (["FDR <="] + cutoff_fdr.map{|x| x.to_s(3)} + ["total"]).join("\t")
405
+ puts (["count"] + cutoff_fdr.map{|x| report.select{|y| y[2] <= x}.size} + [report.size]).join("\t")
385
406
  end
386
-
387
- cutoff_fdr = [0.001,0.005,0.01,0.05,0.1,0.15,0.2,0.25,0.5]
388
- puts ""
389
- puts (["fdr <="] + cutoff_fdr.map{|x| x.to_s(3)} + ["total"]).join("\t")
390
- puts (["count"] + cutoff_fdr.map{|x| report.select{|y| y[3] <= x}.size} + [report.size]).join("\t")
391
407
 
392
408
  ###
393
409
  ### Output summarization
@@ -396,19 +412,19 @@ analyze.each do |set,nm|
396
412
  wids2 = wids.invert
397
413
  report = report.sort_by{|x| x[1]}.reverse
398
414
  puts "\nTop #{output_top} words"
399
- puts ['rank','word','z-score','p-value','fdr','ledge','annotation'].map{|x| sprintf("%-10s",x)}.join('')
415
+ puts ['rank','word','z-score','fdr','ledge','annotation'].map{|x| sprintf("%-10s",x)}.join('')
400
416
  report[0,output_top].each_with_index do |r,i|
401
417
  wd = wids2[r[0]]
402
- s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].to_e(2),r[4].to_s,word_annotation[wd]]
418
+ s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].to_s,word_annotation[wd]]
403
419
  puts s.map{|x| sprintf("%-10s",x)}.join('')
404
420
  end
405
421
 
406
422
  if options[:report_words]
407
423
  puts "......"
408
424
  report.each_with_index do |r,i|
409
- if options[:report_words].include?(r[0]) # and i > output_top
425
+ if options[:report_words].include?(r[0])
410
426
  wd = wids2[r[0]]
411
- s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].to_e(2),r[4].to_s,word_annotation[wd]]
427
+ s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].to_s,word_annotation[wd]]
412
428
  puts s.map{|x| sprintf("%-10s",x)}.join('')
413
429
  end
414
430
  end
@@ -417,11 +433,11 @@ analyze.each do |set,nm|
417
433
  if options[:dump]
418
434
  fname = rankfilename + ".#{nm}." + options[:dump].to_s
419
435
  of = File.new(fname,"w")
420
- of.puts ['rank','word','z-score','p-value','fdr','ledge','GS size','annotation'].map{|x| sprintf("%-10s",x)}.join('')
436
+ of.puts ['rank','word','z-score','fdr','ledge','annotation'].map{|x| sprintf("%-10s",x)}.join('')
421
437
  puts "dumping top #{options[:dump]} words in file: #{fname}"
422
438
  report[0..options[:dump]-1].each_with_index do |r,i|
423
439
  wd = wids2[r[0]]
424
- s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].to_e(2),r[4].to_s,word_annotation[wd]]
440
+ s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].to_s,word_annotation[wd]]
425
441
  of.puts s.map{|x| sprintf("%-10s",x)}.join('')
426
442
  end
427
443
  end
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 1
8
- - 11
9
- version: 0.1.11
8
+ - 12
9
+ version: 0.1.12
10
10
  platform: ruby
11
11
  authors:
12
12
  - Anders Jacobsen
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2010-04-09 00:00:00 +02:00
17
+ date: 2010-09-17 00:00:00 +02:00
18
18
  default_executable:
19
19
  - bin/cwords
20
20
  dependencies: