cwords 0.1.11 → 0.1.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/scripts/cluster_words.rb +8 -2
- data/scripts/cwords.rb +67 -51
- metadata +3 -3
data/scripts/cluster_words.rb
CHANGED
@@ -6,9 +6,12 @@
|
|
6
6
|
###
|
7
7
|
|
8
8
|
srcdir = File.dirname(__FILE__)
|
9
|
-
|
9
|
+
basedir = srcdir + "/../"
|
10
|
+
libdir = basedir + '/lib/'
|
11
|
+
$LOAD_PATH << libdir
|
10
12
|
|
11
13
|
require 'wordRS-lib.rb'
|
14
|
+
require 'rubygems'
|
12
15
|
require 'progressbar'
|
13
16
|
require 'optparse'
|
14
17
|
require 'pp'
|
@@ -324,6 +327,9 @@ end
|
|
324
327
|
### Main
|
325
328
|
###
|
326
329
|
|
330
|
+
puts ">> Parameters"
|
331
|
+
options.each{|k,v| puts sprintf("%-20s: %s",k,v) if !v.nil?}
|
332
|
+
|
327
333
|
aw = Hash.new()
|
328
334
|
if options[:fdr]
|
329
335
|
IO.readlines(options[:wordfile]).select{|x| x.split(' ')[5].to_f < options[:fdr]}[0,options[:top]+1][1..-1].each_with_index do |wline,idx|
|
@@ -387,7 +393,7 @@ end
|
|
387
393
|
#pp clusters.select{|x| x.size > 2}
|
388
394
|
#pp clusters
|
389
395
|
wa = aw.invert
|
390
|
-
resc = clusters.select{|x| x.size >= 3}
|
396
|
+
resc = clusters.select{|x| x.size >= 3} #2
|
391
397
|
resc.each{|cl| print_cluster(cl,wa);puts "\n"}
|
392
398
|
|
393
399
|
puts "Found #{resc.size} word clusters."
|
data/scripts/cwords.rb
CHANGED
@@ -10,6 +10,7 @@ require 'rubygems'
|
|
10
10
|
require 'progressbar'
|
11
11
|
require 'optparse'
|
12
12
|
require 'java'
|
13
|
+
require 'pp'
|
13
14
|
require libdir + 'ushuffle.jar'
|
14
15
|
java_import 'UShuffle'
|
15
16
|
|
@@ -82,7 +83,7 @@ end
|
|
82
83
|
# mandatory parameters
|
83
84
|
[:rankfile].each{|p| show_help("option '#{p}' mandatory") if options[p].nil?}
|
84
85
|
show_help("db or seqfile required") if !(options[:db] or options[:seqfile])
|
85
|
-
show_help("scoring scheme must be one of: obs,bin,pval") if !(['obs','bin','pval'].include?(options[:scoring_scheme]))
|
86
|
+
show_help("scoring scheme must be one of: obs,bin,pval") if !(['obs','bin','pval','pval2'].include?(options[:scoring_scheme]))
|
86
87
|
|
87
88
|
testing = options[:testing]
|
88
89
|
|
@@ -100,6 +101,7 @@ sequences = nil
|
|
100
101
|
nwords = options[:wordsize].map{|x| 4**x}.to_statarray.sum
|
101
102
|
bg=options[:bg] # TODO, make option
|
102
103
|
threads=options[:threads]
|
104
|
+
nperms=options[:permutations]
|
103
105
|
|
104
106
|
###
|
105
107
|
### Main program
|
@@ -115,19 +117,27 @@ IO.readlines(annofile).each{|l| word_annotation[l.split("\t")[0]] = l.split("\t"
|
|
115
117
|
# read optional sequences
|
116
118
|
if options[:seqfile]
|
117
119
|
puts "\n>> reading sequences ..."
|
118
|
-
sequences = Hash.new
|
119
|
-
IO.readlines(options[:seqfile],">")[1..-1].each do
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
120
|
+
sequences = Hash.new # id => seq
|
121
|
+
IO.readlines(options[:seqfile],">")[1..-1].each do |s|
|
122
|
+
ff = s.split("\n").map{|x| x.chomp}
|
123
|
+
id = ff.shift
|
124
|
+
seq = ff[0..-2].join('').downcase.gsub('u','t')
|
125
|
+
seq += ff[-1] if ff[-1] != '>' # check if last field is ">"
|
126
|
+
# next if not nucleotide sequence, i.e. "unavailable"
|
127
|
+
next if (seq.split('').uniq - ['a','c','g','t']).size > 0
|
128
|
+
next if seq.size < 50 # lower bound
|
129
|
+
# hash ensures sequence ids are unique
|
130
|
+
sequences[id] = seq
|
131
|
+
end
|
124
132
|
seqshuffles = options[:seqshuffles]
|
125
133
|
end
|
126
134
|
|
127
135
|
# initialize word id hash, word sequence => word id (0..nwords-1)
|
128
136
|
wids = Hash.new
|
129
|
-
|
130
|
-
|
137
|
+
begin
|
138
|
+
wi = 0
|
139
|
+
options[:wordsize].each{|ws| ['a','g','c','t'].rep_perm(ws) {|seqa| wids[seqa.join('')]=wi ; wi+=1 }}
|
140
|
+
end
|
131
141
|
|
132
142
|
###
|
133
143
|
### ID mapping
|
@@ -277,22 +287,26 @@ analyze.each do |set,nm|
|
|
277
287
|
perms = []
|
278
288
|
report = []
|
279
289
|
pfdrz = []
|
280
|
-
|
290
|
+
|
291
|
+
report = Array.new(nwords)
|
292
|
+
pfdrz = Array.new(nwords*nperms)
|
293
|
+
|
281
294
|
franks = Hash.new # tid => index in set
|
282
295
|
set.each_with_index{|x,i| franks[x[0]] = i}
|
283
296
|
|
284
297
|
puts "permuting #{options[:permutations]} times ...\n"
|
285
|
-
|
298
|
+
nperms.times{|i| perms << (0..set.size-1).to_a.shuffle}
|
286
299
|
|
287
300
|
pbar = ProgressBar.new("progress",nwords)
|
288
|
-
|
301
|
+
|
302
|
+
wids.to_a.threach(threads) do |word,wid|
|
289
303
|
pbar.inc
|
290
304
|
next if options[:onlyanno] and not word_annotation.key?(word) #only process annotated words
|
291
305
|
next if options[:plot_words] and !options[:plot_words].include?(word)
|
292
306
|
|
293
307
|
plotfile = File.new(rankfilename + ".#{word}.#{nm}.csv","w") if options[:plot_words]
|
294
|
-
|
295
|
-
score = Array.new(ngenes) # scores ordered by fold change
|
308
|
+
|
309
|
+
score = Array.new(ngenes,0) # scores ordered by fold change
|
296
310
|
|
297
311
|
if sequences
|
298
312
|
score = set.map{|x| wordscores[allorder[x[0]]][wid]}
|
@@ -303,16 +317,20 @@ analyze.each do |set,nm|
|
|
303
317
|
when "bin" then wordcounts.each{|id,obs,gte_obs,exp| score[franks[id]] = obs.to_i == 0 ? -1 : 1}
|
304
318
|
when "obs" then wordcounts.each{|id,obs,gte_obs,exp| score[franks[id]] = obs.to_f}
|
305
319
|
when "pval" then wordcounts.each{|id,obs,gte_obs,exp| score[franks[id]] = -Math.log((gte_obs.to_f+1)/(seqshuffles+1.0))}
|
320
|
+
when "pval2" then wordcounts.each{|id,obs,gte_obs,exp| score[franks[id]] = gte_obs.to_f}
|
306
321
|
end
|
307
322
|
end
|
308
323
|
|
324
|
+
# center scores
|
309
325
|
smean = score.to_statarray.mean
|
326
|
+
score.map!{|x| x-smean}
|
327
|
+
|
310
328
|
maxrs = 0
|
311
329
|
leading_edge = 0
|
312
330
|
rs = 0 #running sum
|
313
331
|
rsa = [0]
|
314
332
|
score.each_with_index do |x,i|
|
315
|
-
rs +=
|
333
|
+
rs += x
|
316
334
|
rsa << rs
|
317
335
|
if rs.abs > maxrs.abs
|
318
336
|
maxrs = rs
|
@@ -333,7 +351,7 @@ analyze.each do |set,nm|
|
|
333
351
|
prsa = [0]
|
334
352
|
pmaxrs = 0
|
335
353
|
psa.each do |i|
|
336
|
-
prs += score[i]
|
354
|
+
prs += score[i]
|
337
355
|
prsa << prs
|
338
356
|
pmaxrs = prs if prs.abs > pmaxrs.abs
|
339
357
|
end
|
@@ -348,14 +366,12 @@ analyze.each do |set,nm|
|
|
348
366
|
#Because the word zscore distr. can be quite different,
|
349
367
|
# we compute the deviation from the mean of the absolute dist.
|
350
368
|
# The permuted maxRS should be normally distr. (sum of random numbers)
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
pval = (pmaxrs_pos.select{|x| x>=maxrs}.size+1.0)/(pmaxrs_pos.size+1)
|
369
|
+
poffset = wid*nperms
|
370
|
+
pmaxrs_pos.map{|x| (x-pmean)/pstd}.each_with_index{|v,j| pfdrz[poffset+j] = v}
|
371
|
+
|
355
372
|
zsc = (maxrs-pmean)/pstd
|
356
|
-
|
357
373
|
plotfile.close if options[:plot_words]
|
358
|
-
report
|
374
|
+
report[wid] = [wid,zsc,nil,leading_edge]
|
359
375
|
|
360
376
|
end # wordsize
|
361
377
|
pbar.finish
|
@@ -363,31 +379,31 @@ analyze.each do |set,nm|
|
|
363
379
|
###
|
364
380
|
### FDR
|
365
381
|
###
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
382
|
+
|
383
|
+
begin
|
384
|
+
puts "\n>> Estimating FDR ..."
|
385
|
+
report.compact! # remove nil entries
|
386
|
+
pfdrz.compact!
|
387
|
+
fdrrank = pfdrz.map{|x| [x,nil]} # [zscore,word_report_index]
|
388
|
+
report.each_with_index{|x,idx| fdrrank << [x[1],idx]}
|
389
|
+
fdrrank = fdrrank.sort_by{|x| x[0]}.reverse # sort high zscore to low zscore
|
390
|
+
nfp, ntp = pfdrz.size.to_f, report.size.to_f
|
391
|
+
ifp, itp = 0, 0
|
392
|
+
fdrrank.each do |zsc,idx|
|
393
|
+
if idx.nil?
|
394
|
+
ifp += 1
|
395
|
+
else
|
396
|
+
itp += 1
|
397
|
+
fpr, tpr = ifp/nfp, itp/ntp
|
398
|
+
report[idx][2] = fpr/tpr
|
399
|
+
end
|
384
400
|
end
|
401
|
+
|
402
|
+
cutoff_fdr = [0.001,0.005,0.01,0.05,0.1,0.15,0.2,0.25,0.5]
|
403
|
+
puts ""
|
404
|
+
puts (["FDR <="] + cutoff_fdr.map{|x| x.to_s(3)} + ["total"]).join("\t")
|
405
|
+
puts (["count"] + cutoff_fdr.map{|x| report.select{|y| y[2] <= x}.size} + [report.size]).join("\t")
|
385
406
|
end
|
386
|
-
|
387
|
-
cutoff_fdr = [0.001,0.005,0.01,0.05,0.1,0.15,0.2,0.25,0.5]
|
388
|
-
puts ""
|
389
|
-
puts (["fdr <="] + cutoff_fdr.map{|x| x.to_s(3)} + ["total"]).join("\t")
|
390
|
-
puts (["count"] + cutoff_fdr.map{|x| report.select{|y| y[3] <= x}.size} + [report.size]).join("\t")
|
391
407
|
|
392
408
|
###
|
393
409
|
### Output summarization
|
@@ -396,19 +412,19 @@ analyze.each do |set,nm|
|
|
396
412
|
wids2 = wids.invert
|
397
413
|
report = report.sort_by{|x| x[1]}.reverse
|
398
414
|
puts "\nTop #{output_top} words"
|
399
|
-
puts ['rank','word','z-score','
|
415
|
+
puts ['rank','word','z-score','fdr','ledge','annotation'].map{|x| sprintf("%-10s",x)}.join('')
|
400
416
|
report[0,output_top].each_with_index do |r,i|
|
401
417
|
wd = wids2[r[0]]
|
402
|
-
s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].
|
418
|
+
s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].to_s,word_annotation[wd]]
|
403
419
|
puts s.map{|x| sprintf("%-10s",x)}.join('')
|
404
420
|
end
|
405
421
|
|
406
422
|
if options[:report_words]
|
407
423
|
puts "......"
|
408
424
|
report.each_with_index do |r,i|
|
409
|
-
if options[:report_words].include?(r[0])
|
425
|
+
if options[:report_words].include?(r[0])
|
410
426
|
wd = wids2[r[0]]
|
411
|
-
s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].
|
427
|
+
s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].to_s,word_annotation[wd]]
|
412
428
|
puts s.map{|x| sprintf("%-10s",x)}.join('')
|
413
429
|
end
|
414
430
|
end
|
@@ -417,11 +433,11 @@ analyze.each do |set,nm|
|
|
417
433
|
if options[:dump]
|
418
434
|
fname = rankfilename + ".#{nm}." + options[:dump].to_s
|
419
435
|
of = File.new(fname,"w")
|
420
|
-
of.puts ['rank','word','z-score','
|
436
|
+
of.puts ['rank','word','z-score','fdr','ledge','annotation'].map{|x| sprintf("%-10s",x)}.join('')
|
421
437
|
puts "dumping top #{options[:dump]} words in file: #{fname}"
|
422
438
|
report[0..options[:dump]-1].each_with_index do |r,i|
|
423
439
|
wd = wids2[r[0]]
|
424
|
-
s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].
|
440
|
+
s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].to_s,word_annotation[wd]]
|
425
441
|
of.puts s.map{|x| sprintf("%-10s",x)}.join('')
|
426
442
|
end
|
427
443
|
end
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
- 11
|
9
|
-
version: 0.1.11
|
8
|
+
- 12
|
9
|
+
version: 0.1.12
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Anders Jacobsen
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-
|
17
|
+
date: 2010-09-17 00:00:00 +02:00
|
18
18
|
default_executable:
|
19
19
|
- bin/cwords
|
20
20
|
dependencies:
|