cwords 0.1.11 → 0.1.12
Sign up to get free protection for your applications and to get access to all the features.
- data/scripts/cluster_words.rb +8 -2
- data/scripts/cwords.rb +67 -51
- metadata +3 -3
data/scripts/cluster_words.rb
CHANGED
@@ -6,9 +6,12 @@
|
|
6
6
|
###
|
7
7
|
|
8
8
|
srcdir = File.dirname(__FILE__)
|
9
|
-
|
9
|
+
basedir = srcdir + "/../"
|
10
|
+
libdir = basedir + '/lib/'
|
11
|
+
$LOAD_PATH << libdir
|
10
12
|
|
11
13
|
require 'wordRS-lib.rb'
|
14
|
+
require 'rubygems'
|
12
15
|
require 'progressbar'
|
13
16
|
require 'optparse'
|
14
17
|
require 'pp'
|
@@ -324,6 +327,9 @@ end
|
|
324
327
|
### Main
|
325
328
|
###
|
326
329
|
|
330
|
+
puts ">> Parameters"
|
331
|
+
options.each{|k,v| puts sprintf("%-20s: %s",k,v) if !v.nil?}
|
332
|
+
|
327
333
|
aw = Hash.new()
|
328
334
|
if options[:fdr]
|
329
335
|
IO.readlines(options[:wordfile]).select{|x| x.split(' ')[5].to_f < options[:fdr]}[0,options[:top]+1][1..-1].each_with_index do |wline,idx|
|
@@ -387,7 +393,7 @@ end
|
|
387
393
|
#pp clusters.select{|x| x.size > 2}
|
388
394
|
#pp clusters
|
389
395
|
wa = aw.invert
|
390
|
-
resc = clusters.select{|x| x.size >= 3}
|
396
|
+
resc = clusters.select{|x| x.size >= 3} #2
|
391
397
|
resc.each{|cl| print_cluster(cl,wa);puts "\n"}
|
392
398
|
|
393
399
|
puts "Found #{resc.size} word clusters."
|
data/scripts/cwords.rb
CHANGED
@@ -10,6 +10,7 @@ require 'rubygems'
|
|
10
10
|
require 'progressbar'
|
11
11
|
require 'optparse'
|
12
12
|
require 'java'
|
13
|
+
require 'pp'
|
13
14
|
require libdir + 'ushuffle.jar'
|
14
15
|
java_import 'UShuffle'
|
15
16
|
|
@@ -82,7 +83,7 @@ end
|
|
82
83
|
# mandatory parameters
|
83
84
|
[:rankfile].each{|p| show_help("option '#{p}' mandatory") if options[p].nil?}
|
84
85
|
show_help("db or seqfile required") if !(options[:db] or options[:seqfile])
|
85
|
-
show_help("scoring scheme must be one of: obs,bin,pval") if !(['obs','bin','pval'].include?(options[:scoring_scheme]))
|
86
|
+
show_help("scoring scheme must be one of: obs,bin,pval") if !(['obs','bin','pval','pval2'].include?(options[:scoring_scheme]))
|
86
87
|
|
87
88
|
testing = options[:testing]
|
88
89
|
|
@@ -100,6 +101,7 @@ sequences = nil
|
|
100
101
|
nwords = options[:wordsize].map{|x| 4**x}.to_statarray.sum
|
101
102
|
bg=options[:bg] # TODO, make option
|
102
103
|
threads=options[:threads]
|
104
|
+
nperms=options[:permutations]
|
103
105
|
|
104
106
|
###
|
105
107
|
### Main program
|
@@ -115,19 +117,27 @@ IO.readlines(annofile).each{|l| word_annotation[l.split("\t")[0]] = l.split("\t"
|
|
115
117
|
# read optional sequences
|
116
118
|
if options[:seqfile]
|
117
119
|
puts "\n>> reading sequences ..."
|
118
|
-
sequences = Hash.new
|
119
|
-
IO.readlines(options[:seqfile],">")[1..-1].each do
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
120
|
+
sequences = Hash.new # id => seq
|
121
|
+
IO.readlines(options[:seqfile],">")[1..-1].each do |s|
|
122
|
+
ff = s.split("\n").map{|x| x.chomp}
|
123
|
+
id = ff.shift
|
124
|
+
seq = ff[0..-2].join('').downcase.gsub('u','t')
|
125
|
+
seq += ff[-1] if ff[-1] != '>' # check if last field is ">"
|
126
|
+
# next if not nucleotide sequence, i.e. "unavailable"
|
127
|
+
next if (seq.split('').uniq - ['a','c','g','t']).size > 0
|
128
|
+
next if seq.size < 50 # lower bound
|
129
|
+
# hash ensures sequence ids are unique
|
130
|
+
sequences[id] = seq
|
131
|
+
end
|
124
132
|
seqshuffles = options[:seqshuffles]
|
125
133
|
end
|
126
134
|
|
127
135
|
# initialize word id hash, word sequence => word id (0..nwords-1)
|
128
136
|
wids = Hash.new
|
129
|
-
|
130
|
-
|
137
|
+
begin
|
138
|
+
wi = 0
|
139
|
+
options[:wordsize].each{|ws| ['a','g','c','t'].rep_perm(ws) {|seqa| wids[seqa.join('')]=wi ; wi+=1 }}
|
140
|
+
end
|
131
141
|
|
132
142
|
###
|
133
143
|
### ID mapping
|
@@ -277,22 +287,26 @@ analyze.each do |set,nm|
|
|
277
287
|
perms = []
|
278
288
|
report = []
|
279
289
|
pfdrz = []
|
280
|
-
|
290
|
+
|
291
|
+
report = Array.new(nwords)
|
292
|
+
pfdrz = Array.new(nwords*nperms)
|
293
|
+
|
281
294
|
franks = Hash.new # tid => index in set
|
282
295
|
set.each_with_index{|x,i| franks[x[0]] = i}
|
283
296
|
|
284
297
|
puts "permuting #{options[:permutations]} times ...\n"
|
285
|
-
|
298
|
+
nperms.times{|i| perms << (0..set.size-1).to_a.shuffle}
|
286
299
|
|
287
300
|
pbar = ProgressBar.new("progress",nwords)
|
288
|
-
|
301
|
+
|
302
|
+
wids.to_a.threach(threads) do |word,wid|
|
289
303
|
pbar.inc
|
290
304
|
next if options[:onlyanno] and not word_annotation.key?(word) #only process annotated words
|
291
305
|
next if options[:plot_words] and !options[:plot_words].include?(word)
|
292
306
|
|
293
307
|
plotfile = File.new(rankfilename + ".#{word}.#{nm}.csv","w") if options[:plot_words]
|
294
|
-
|
295
|
-
score = Array.new(ngenes) # scores ordered by fold change
|
308
|
+
|
309
|
+
score = Array.new(ngenes,0) # scores ordered by fold change
|
296
310
|
|
297
311
|
if sequences
|
298
312
|
score = set.map{|x| wordscores[allorder[x[0]]][wid]}
|
@@ -303,16 +317,20 @@ analyze.each do |set,nm|
|
|
303
317
|
when "bin" then wordcounts.each{|id,obs,gte_obs,exp| score[franks[id]] = obs.to_i == 0 ? -1 : 1}
|
304
318
|
when "obs" then wordcounts.each{|id,obs,gte_obs,exp| score[franks[id]] = obs.to_f}
|
305
319
|
when "pval" then wordcounts.each{|id,obs,gte_obs,exp| score[franks[id]] = -Math.log((gte_obs.to_f+1)/(seqshuffles+1.0))}
|
320
|
+
when "pval2" then wordcounts.each{|id,obs,gte_obs,exp| score[franks[id]] = gte_obs.to_f}
|
306
321
|
end
|
307
322
|
end
|
308
323
|
|
324
|
+
# center scores
|
309
325
|
smean = score.to_statarray.mean
|
326
|
+
score.map!{|x| x-smean}
|
327
|
+
|
310
328
|
maxrs = 0
|
311
329
|
leading_edge = 0
|
312
330
|
rs = 0 #running sum
|
313
331
|
rsa = [0]
|
314
332
|
score.each_with_index do |x,i|
|
315
|
-
rs +=
|
333
|
+
rs += x
|
316
334
|
rsa << rs
|
317
335
|
if rs.abs > maxrs.abs
|
318
336
|
maxrs = rs
|
@@ -333,7 +351,7 @@ analyze.each do |set,nm|
|
|
333
351
|
prsa = [0]
|
334
352
|
pmaxrs = 0
|
335
353
|
psa.each do |i|
|
336
|
-
prs += score[i]
|
354
|
+
prs += score[i]
|
337
355
|
prsa << prs
|
338
356
|
pmaxrs = prs if prs.abs > pmaxrs.abs
|
339
357
|
end
|
@@ -348,14 +366,12 @@ analyze.each do |set,nm|
|
|
348
366
|
#Because the word zscore distr. can be quite different,
|
349
367
|
# we compute the deviation from the mean of the absolute dist.
|
350
368
|
# The permuted maxRS should be normally distr. (sum of random numbers)
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
pval = (pmaxrs_pos.select{|x| x>=maxrs}.size+1.0)/(pmaxrs_pos.size+1)
|
369
|
+
poffset = wid*nperms
|
370
|
+
pmaxrs_pos.map{|x| (x-pmean)/pstd}.each_with_index{|v,j| pfdrz[poffset+j] = v}
|
371
|
+
|
355
372
|
zsc = (maxrs-pmean)/pstd
|
356
|
-
|
357
373
|
plotfile.close if options[:plot_words]
|
358
|
-
report
|
374
|
+
report[wid] = [wid,zsc,nil,leading_edge]
|
359
375
|
|
360
376
|
end # wordsize
|
361
377
|
pbar.finish
|
@@ -363,31 +379,31 @@ analyze.each do |set,nm|
|
|
363
379
|
###
|
364
380
|
### FDR
|
365
381
|
###
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
382
|
+
|
383
|
+
begin
|
384
|
+
puts "\n>> Estimating FDR ..."
|
385
|
+
report.compact! # remove nil entries
|
386
|
+
pfdrz.compact!
|
387
|
+
fdrrank = pfdrz.map{|x| [x,nil]} # [zscore,word_report_index]
|
388
|
+
report.each_with_index{|x,idx| fdrrank << [x[1],idx]}
|
389
|
+
fdrrank = fdrrank.sort_by{|x| x[0]}.reverse # sort high zscore to low zscore
|
390
|
+
nfp, ntp = pfdrz.size.to_f, report.size.to_f
|
391
|
+
ifp, itp = 0, 0
|
392
|
+
fdrrank.each do |zsc,idx|
|
393
|
+
if idx.nil?
|
394
|
+
ifp += 1
|
395
|
+
else
|
396
|
+
itp += 1
|
397
|
+
fpr, tpr = ifp/nfp, itp/ntp
|
398
|
+
report[idx][2] = fpr/tpr
|
399
|
+
end
|
384
400
|
end
|
401
|
+
|
402
|
+
cutoff_fdr = [0.001,0.005,0.01,0.05,0.1,0.15,0.2,0.25,0.5]
|
403
|
+
puts ""
|
404
|
+
puts (["FDR <="] + cutoff_fdr.map{|x| x.to_s(3)} + ["total"]).join("\t")
|
405
|
+
puts (["count"] + cutoff_fdr.map{|x| report.select{|y| y[2] <= x}.size} + [report.size]).join("\t")
|
385
406
|
end
|
386
|
-
|
387
|
-
cutoff_fdr = [0.001,0.005,0.01,0.05,0.1,0.15,0.2,0.25,0.5]
|
388
|
-
puts ""
|
389
|
-
puts (["fdr <="] + cutoff_fdr.map{|x| x.to_s(3)} + ["total"]).join("\t")
|
390
|
-
puts (["count"] + cutoff_fdr.map{|x| report.select{|y| y[3] <= x}.size} + [report.size]).join("\t")
|
391
407
|
|
392
408
|
###
|
393
409
|
### Output summarization
|
@@ -396,19 +412,19 @@ analyze.each do |set,nm|
|
|
396
412
|
wids2 = wids.invert
|
397
413
|
report = report.sort_by{|x| x[1]}.reverse
|
398
414
|
puts "\nTop #{output_top} words"
|
399
|
-
puts ['rank','word','z-score','
|
415
|
+
puts ['rank','word','z-score','fdr','ledge','annotation'].map{|x| sprintf("%-10s",x)}.join('')
|
400
416
|
report[0,output_top].each_with_index do |r,i|
|
401
417
|
wd = wids2[r[0]]
|
402
|
-
s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].
|
418
|
+
s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].to_s,word_annotation[wd]]
|
403
419
|
puts s.map{|x| sprintf("%-10s",x)}.join('')
|
404
420
|
end
|
405
421
|
|
406
422
|
if options[:report_words]
|
407
423
|
puts "......"
|
408
424
|
report.each_with_index do |r,i|
|
409
|
-
if options[:report_words].include?(r[0])
|
425
|
+
if options[:report_words].include?(r[0])
|
410
426
|
wd = wids2[r[0]]
|
411
|
-
s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].
|
427
|
+
s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].to_s,word_annotation[wd]]
|
412
428
|
puts s.map{|x| sprintf("%-10s",x)}.join('')
|
413
429
|
end
|
414
430
|
end
|
@@ -417,11 +433,11 @@ analyze.each do |set,nm|
|
|
417
433
|
if options[:dump]
|
418
434
|
fname = rankfilename + ".#{nm}." + options[:dump].to_s
|
419
435
|
of = File.new(fname,"w")
|
420
|
-
of.puts ['rank','word','z-score','
|
436
|
+
of.puts ['rank','word','z-score','fdr','ledge','annotation'].map{|x| sprintf("%-10s",x)}.join('')
|
421
437
|
puts "dumping top #{options[:dump]} words in file: #{fname}"
|
422
438
|
report[0..options[:dump]-1].each_with_index do |r,i|
|
423
439
|
wd = wids2[r[0]]
|
424
|
-
s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].
|
440
|
+
s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].to_s,word_annotation[wd]]
|
425
441
|
of.puts s.map{|x| sprintf("%-10s",x)}.join('')
|
426
442
|
end
|
427
443
|
end
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
- 11
|
9
|
-
version: 0.1.11
|
8
|
+
- 12
|
9
|
+
version: 0.1.12
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Anders Jacobsen
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-
|
17
|
+
date: 2010-09-17 00:00:00 +02:00
|
18
18
|
default_executable:
|
19
19
|
- bin/cwords
|
20
20
|
dependencies:
|