cwords 0.1.11 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,9 +6,12 @@
  ###

  srcdir = File.dirname(__FILE__)
- $LOAD_PATH << srcdir + '/../lib/'
+ basedir = srcdir + "/../"
+ libdir = basedir + '/lib/'
+ $LOAD_PATH << libdir

  require 'wordRS-lib.rb'
+ require 'rubygems'
  require 'progressbar'
  require 'optparse'
  require 'pp'
@@ -324,6 +327,9 @@ end
  ### Main
  ###

+ puts ">> Parameters"
+ options.each{|k,v| puts sprintf("%-20s: %s",k,v) if !v.nil?}
+
  aw = Hash.new()
  if options[:fdr]
  IO.readlines(options[:wordfile]).select{|x| x.split(' ')[5].to_f < options[:fdr]}[0,options[:top]+1][1..-1].each_with_index do |wline,idx|
@@ -387,7 +393,7 @@ end
  #pp clusters.select{|x| x.size > 2}
  #pp clusters
  wa = aw.invert
- resc = clusters.select{|x| x.size >= 3}
+ resc = clusters.select{|x| x.size >= 3} #2
  resc.each{|cl| print_cluster(cl,wa);puts "\n"}

  puts "Found #{resc.size} word clusters."
@@ -10,6 +10,7 @@ require 'rubygems'
  require 'progressbar'
  require 'optparse'
  require 'java'
+ require 'pp'
  require libdir + 'ushuffle.jar'
  java_import 'UShuffle'

@@ -82,7 +83,7 @@ end
  # mandatory parameters
  [:rankfile].each{|p| show_help("option '#{p}' mandatory") if options[p].nil?}
  show_help("db or seqfile required") if !(options[:db] or options[:seqfile])
- show_help("scoring scheme must be one of: obs,bin,pval") if !(['obs','bin','pval'].include?(options[:scoring_scheme]))
+ show_help("scoring scheme must be one of: obs,bin,pval") if !(['obs','bin','pval','pval2'].include?(options[:scoring_scheme]))

  testing = options[:testing]

@@ -100,6 +101,7 @@ sequences = nil
  nwords = options[:wordsize].map{|x| 4**x}.to_statarray.sum
  bg=options[:bg] # TODO, make option
  threads=options[:threads]
+ nperms=options[:permutations]

  ###
  ### Main program
@@ -115,19 +117,27 @@ IO.readlines(annofile).each{|l| word_annotation[l.split("\t")[0]] = l.split("\t"
  # read optional sequences
  if options[:seqfile]
  puts "\n>> reading sequences ..."
- sequences = Hash.new
- IO.readlines(options[:seqfile],">")[1..-1].each do |entry|
- ls = entry.split("\n").map{|x| x.chomp}
- # hash ensures sequence ids unique
- sequences[ls[0]] = ls[1..-2].join('').downcase.gsub('u','t') # last field is ">"
- end
+ sequences = Hash.new # id => seq
+ IO.readlines(options[:seqfile],">")[1..-1].each do |s|
+ ff = s.split("\n").map{|x| x.chomp}
+ id = ff.shift
+ seq = ff[0..-2].join('').downcase.gsub('u','t')
+ seq += ff[-1] if ff[-1] != '>' # check if last field is ">"
+ # next if not nucleotide sequence, i.e. "unavailable"
+ next if (seq.split('').uniq - ['a','c','g','t']).size > 0
+ next if seq.size < 50 # lower bound
+ # hash ensures sequence ids are unique
+ sequences[id] = seq
+ end
  seqshuffles = options[:seqshuffles]
  end

  # initialize word id hash, word sequence => word id (0..nwords-1)
  wids = Hash.new
- i = 0
- options[:wordsize].each{|ws| ['a','g','c','t'].rep_perm(ws) {|seqa| wids[seqa.join('')]=i ; i+=1 }}
+ begin
+ wi = 0
+ options[:wordsize].each{|ws| ['a','g','c','t'].rep_perm(ws) {|seqa| wids[seqa.join('')]=wi ; wi+=1 }}
+ end

  ###
  ### ID mapping
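Note on the rewritten sequence-reading block above: it splits the FASTA file on '>' record separators, strips the trailing separator from each record, lowercases and RNA-to-DNA converts the sequence, and now also skips records containing non-nucleotide characters or shorter than 50 nt. The following standalone sketch shows the same idiom; it is illustrative only and not part of the gem, and the input file name and the 50 nt lower bound are assumptions for the example.

    # Illustrative sketch of the FASTA-reading idiom (not part of the gem).
    sequences = Hash.new # id => seq

    IO.readlines("seqs.fa", ">")[1..-1].each do |record|
      fields = record.split("\n").map { |x| x.chomp }
      id  = fields.shift                        # header line (without the leading '>')
      seq = fields[0..-2].join('').downcase.gsub('u', 't')
      seq += fields[-1] if fields[-1] != '>'    # last field is the '>' separator, except for the final record
      next if (seq.split('').uniq - ['a', 'c', 'g', 't']).size > 0  # skip non-nucleotide entries
      next if seq.size < 50                     # skip very short sequences (assumed bound)
      sequences[id] = seq                       # hash keeps ids unique
    end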
@@ -277,22 +287,26 @@ analyze.each do |set,nm|
  perms = []
  report = []
  pfdrz = []
-
+
+ report = Array.new(nwords)
+ pfdrz = Array.new(nwords*nperms)
+
  franks = Hash.new # tid => index in set
  set.each_with_index{|x,i| franks[x[0]] = i}

  puts "permuting #{options[:permutations]} times ...\n"
- options[:permutations].times{|i| perms << (0..set.size-1).to_a.shuffle}
+ nperms.times{|i| perms << (0..set.size-1).to_a.shuffle}

  pbar = ProgressBar.new("progress",nwords)
- wids.to_a.sort_by{|x| x[1]}.threach(threads) do |word,wid|
+
+ wids.to_a.threach(threads) do |word,wid|
  pbar.inc
  next if options[:onlyanno] and not word_annotation.key?(word) #only process annotated words
  next if options[:plot_words] and !options[:plot_words].include?(word)

  plotfile = File.new(rankfilename + ".#{word}.#{nm}.csv","w") if options[:plot_words]
-
- score = Array.new(ngenes) # scores ordered by fold change
+
+ score = Array.new(ngenes,0) # scores ordered by fold change

  if sequences
  score = set.map{|x| wordscores[allorder[x[0]]][wid]}
@@ -303,16 +317,20 @@ analyze.each do |set,nm|
  when "bin" then wordcounts.each{|id,obs,gte_obs,exp| score[franks[id]] = obs.to_i == 0 ? -1 : 1}
  when "obs" then wordcounts.each{|id,obs,gte_obs,exp| score[franks[id]] = obs.to_f}
  when "pval" then wordcounts.each{|id,obs,gte_obs,exp| score[franks[id]] = -Math.log((gte_obs.to_f+1)/(seqshuffles+1.0))}
+ when "pval2" then wordcounts.each{|id,obs,gte_obs,exp| score[franks[id]] = gte_obs.to_f}
  end
  end

+ # center scores
  smean = score.to_statarray.mean
+ score.map!{|x| x-smean}
+
  maxrs = 0
  leading_edge = 0
  rs = 0 #running sum
  rsa = [0]
  score.each_with_index do |x,i|
- rs += (x-smean)
+ rs += x
  rsa << rs
  if rs.abs > maxrs.abs
  maxrs = rs
@@ -333,7 +351,7 @@ analyze.each do |set,nm|
  prsa = [0]
  pmaxrs = 0
  psa.each do |i|
- prs += score[i]-smean
+ prs += score[i]
  prsa << prs
  pmaxrs = prs if prs.abs > pmaxrs.abs
  end
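Note on the two hunks above: the word scores are now mean-centered once, and both the observed running sum and the permuted running sums add the centered scores directly. The statistic kept per word is the running sum of maximal absolute value (maxRS), which is later z-scored against the same statistic computed on permuted orderings. A minimal sketch of that running-sum calculation on made-up scores follows; it is illustrative only and not part of the gem, and the leading-edge bookkeeping shown is an assumption for the example.

    # Illustrative sketch of the running-sum (maxRS) statistic (not part of the gem).
    scores = [0.0, 3.0, 1.0, 0.0, 0.0, 2.0, 0.0, 0.0]   # made-up, ordered by rank

    mean = scores.inject(0.0) { |s, x| s + x } / scores.size
    centered = scores.map { |x| x - mean }               # center once

    rs, maxrs, leading_edge = 0.0, 0.0, 0
    centered.each_with_index do |x, i|
      rs += x
      if rs.abs > maxrs.abs
        maxrs = rs
        leading_edge = i + 1   # assumed bookkeeping: items up to the extreme
      end
    end

    puts "maxRS = #{maxrs}, leading edge = #{leading_edge}"
    # A permutation null is obtained by repeating the same loop over
    # centered.values_at(*(0...centered.size).to_a.shuffle)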
@@ -348,14 +366,12 @@ analyze.each do |set,nm|
  #Because the word zscore distr. can be quite different,
  # we compute the deviation from the mean of the absolute dist.
  # The permuted maxRS should be normally distr. (sum of random numbers)
- pfdrz += pmaxrs_pos.map{|x| (x-pmean)/pstd}
-
- #pvalue and fdr statistic for word is also computed based on abs. dist.
- pval = (pmaxrs_pos.select{|x| x>=maxrs}.size+1.0)/(pmaxrs_pos.size+1)
+ poffset = wid*nperms
+ pmaxrs_pos.map{|x| (x-pmean)/pstd}.each_with_index{|v,j| pfdrz[poffset+j] = v}
+
  zsc = (maxrs-pmean)/pstd
-
  plotfile.close if options[:plot_words]
- report << [wid,zsc,pval,nil,leading_edge]
+ report[wid] = [wid,zsc,nil,leading_edge]

  end # wordsize
  pbar.finish
@@ -363,31 +379,31 @@ analyze.each do |set,nm|
  ###
  ### FDR
  ###
-
- puts "fdr calculation ..."
- fdrrank = pfdrz.map{|x| [x,nil]} # [zscore,word_report_index]
- report.each_with_index{|x,idx| fdrrank << [x[1],idx]}
- fdrrank = fdrrank.sort_by{|x| x[0]}.reverse # sort high zscore to low zscore
- nfp = pfdrz.size.to_f
- ntp = report.size.to_f
- word_fdrrank = Hash.new()
- ifp = 0
- itp = 0
- fdrrank.each do |zsc,idx|
- if idx.nil?
- ifp += 1
- else
- itp += 1
- fpr = ifp/nfp
- tpr = itp/ntp
- report[idx][3] = fpr/tpr
+
+ begin
+ puts "\n>> Estimating FDR ..."
+ report.compact! # remove nil entries
+ pfdrz.compact!
+ fdrrank = pfdrz.map{|x| [x,nil]} # [zscore,word_report_index]
+ report.each_with_index{|x,idx| fdrrank << [x[1],idx]}
+ fdrrank = fdrrank.sort_by{|x| x[0]}.reverse # sort high zscore to low zscore
+ nfp, ntp = pfdrz.size.to_f, report.size.to_f
+ ifp, itp = 0, 0
+ fdrrank.each do |zsc,idx|
+ if idx.nil?
+ ifp += 1
+ else
+ itp += 1
+ fpr, tpr = ifp/nfp, itp/ntp
+ report[idx][2] = fpr/tpr
+ end
  end
+
+ cutoff_fdr = [0.001,0.005,0.01,0.05,0.1,0.15,0.2,0.25,0.5]
+ puts ""
+ puts (["FDR <="] + cutoff_fdr.map{|x| x.to_s(3)} + ["total"]).join("\t")
+ puts (["count"] + cutoff_fdr.map{|x| report.select{|y| y[2] <= x}.size} + [report.size]).join("\t")
  end
-
- cutoff_fdr = [0.001,0.005,0.01,0.05,0.1,0.15,0.2,0.25,0.5]
- puts ""
- puts (["fdr <="] + cutoff_fdr.map{|x| x.to_s(3)} + ["total"]).join("\t")
- puts (["count"] + cutoff_fdr.map{|x| report.select{|y| y[3] <= x}.size} + [report.size]).join("\t")

  ###
  ### Output summarization
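Note on the FDR hunk above: the reworked code pools the permuted (null) z-scores with the observed per-word z-scores, walks down the combined ranking from high to low, and estimates each word's FDR as the false-positive rate divided by the true-positive rate at that word's rank. A minimal sketch of the same calculation on made-up numbers follows; it is illustrative only and not part of the gem.

    # Illustrative sketch of the permutation-based FDR estimate (not part of the gem).
    observed = [3.1, 2.4, 0.7, -0.2]             # made-up z-scores, one per word
    permuted = [1.9, 0.8, 0.5, -0.1, -1.2, 2.2]  # made-up z-scores from permutations

    # Tag each value: nil = permuted (null), index = observed word.
    ranked = permuted.map { |z| [z, nil] }
    observed.each_with_index { |z, idx| ranked << [z, idx] }
    ranked = ranked.sort_by { |z, _| z }.reverse # high z-score first

    nfp, ntp = permuted.size.to_f, observed.size.to_f
    ifp, itp = 0, 0
    fdr = Array.new(observed.size)
    ranked.each do |z, idx|
      if idx.nil?
        ifp += 1                                 # a null value outranks the remaining words
      else
        itp += 1
        fdr[idx] = (ifp / nfp) / (itp / ntp)     # FDR ~ FPR / TPR at this word's rank
      end
    end

    observed.each_with_index { |z, i| puts "z=#{z}  FDR=#{'%.3f' % fdr[i]}" }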
@@ -396,19 +412,19 @@ analyze.each do |set,nm|
  wids2 = wids.invert
  report = report.sort_by{|x| x[1]}.reverse
  puts "\nTop #{output_top} words"
- puts ['rank','word','z-score','p-value','fdr','ledge','annotation'].map{|x| sprintf("%-10s",x)}.join('')
+ puts ['rank','word','z-score','fdr','ledge','annotation'].map{|x| sprintf("%-10s",x)}.join('')
  report[0,output_top].each_with_index do |r,i|
  wd = wids2[r[0]]
- s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].to_e(2),r[4].to_s,word_annotation[wd]]
+ s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].to_s,word_annotation[wd]]
  puts s.map{|x| sprintf("%-10s",x)}.join('')
  end

  if options[:report_words]
  puts "......"
  report.each_with_index do |r,i|
- if options[:report_words].include?(r[0]) # and i > output_top
+ if options[:report_words].include?(r[0])
  wd = wids2[r[0]]
- s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].to_e(2),r[4].to_s,word_annotation[wd]]
+ s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].to_s,word_annotation[wd]]
  puts s.map{|x| sprintf("%-10s",x)}.join('')
  end
  end
@@ -417,11 +433,11 @@ analyze.each do |set,nm|
  if options[:dump]
  fname = rankfilename + ".#{nm}." + options[:dump].to_s
  of = File.new(fname,"w")
- of.puts ['rank','word','z-score','p-value','fdr','ledge','GS size','annotation'].map{|x| sprintf("%-10s",x)}.join('')
+ of.puts ['rank','word','z-score','fdr','ledge','annotation'].map{|x| sprintf("%-10s",x)}.join('')
  puts "dumping top #{options[:dump]} words in file: #{fname}"
  report[0..options[:dump]-1].each_with_index do |r,i|
  wd = wids2[r[0]]
- s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].to_e(2),r[4].to_s,word_annotation[wd]]
+ s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].to_s,word_annotation[wd]]
  of.puts s.map{|x| sprintf("%-10s",x)}.join('')
  end
  end
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
  segments:
  - 0
  - 1
- - 11
- version: 0.1.11
+ - 12
+ version: 0.1.12
  platform: ruby
  authors:
  - Anders Jacobsen
@@ -14,7 +14,7 @@ autorequire:
  bindir: bin
  cert_chain: []

- date: 2010-04-09 00:00:00 +02:00
+ date: 2010-09-17 00:00:00 +02:00
  default_executable:
  - bin/cwords
  dependencies: