cwords 0.1.2-jruby → 0.1.3-jruby
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/cwords +10 -2
- data/bin/cwords_mkdb +9 -2
- metadata +3 -5
- data/bin/cwords2 +0 -429
data/bin/cwords
CHANGED
@@ -1,3 +1,11 @@
|
|
1
|
-
#!/bin/
|
1
|
+
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
|
3
|
+
scriptdir = File.dirname(__FILE__) + "/../scripts/"
|
4
|
+
|
5
|
+
mems = ARGV.join(" ").match(/M=(\w+)/)
|
6
|
+
mem = mems ? mems[1] : '4096m'
|
7
|
+
argv = ARGV.select{|x| not x=~ /^M=\w+$/}.join(' ')
|
8
|
+
puts "Starting cwords with max heap size " + mem + " ...\n"
|
9
|
+
|
10
|
+
cmd = "jruby --server --fast -J-Xmx#{mem} " + scriptdir + "cwords.rb " + argv
|
11
|
+
exec cmd
|
data/bin/cwords_mkdb
CHANGED
@@ -1,3 +1,10 @@
|
|
1
|
-
#!/bin/
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
scriptdir = File.dirname(__FILE__) + "/../scripts/"
|
2
3
|
|
3
|
-
|
4
|
+
mems = ARGV.join(" ").match(/M=(\w+)/)
|
5
|
+
mem = mems ? mems[1] : '4096m'
|
6
|
+
argv = ARGV.select{|x| not x=~ /^M=\w+$/}.join(' ')
|
7
|
+
puts "Starting cwords with max heap size " + mem + " ..."
|
8
|
+
|
9
|
+
cmd = "jruby --server --fast -J-Xmx#{mem} " + scriptdir + "cwords_mkdb.rb " + argv
|
10
|
+
exec cmd
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
- 2
|
9
|
-
version: 0.1.2
|
8
|
+
- 3
|
9
|
+
version: 0.1.3
|
10
10
|
platform: jruby
|
11
11
|
authors:
|
12
12
|
- Anders Jacobsen
|
@@ -46,12 +46,11 @@ dependencies:
|
|
46
46
|
version: 0.2.0
|
47
47
|
type: :runtime
|
48
48
|
version_requirements: *id002
|
49
|
-
description: Word correlation analysis
|
49
|
+
description: Word correlation analysis in ranked nucleotide sequences (bioinformatics)
|
50
50
|
email: andersmbj@gmail.com
|
51
51
|
executables:
|
52
52
|
- cwords
|
53
53
|
- cwords_mkdb
|
54
|
-
- cwords2
|
55
54
|
extensions: []
|
56
55
|
|
57
56
|
extra_rdoc_files: []
|
@@ -59,7 +58,6 @@ extra_rdoc_files: []
|
|
59
58
|
files:
|
60
59
|
- README
|
61
60
|
- LICENSE
|
62
|
-
- bin/cwords2
|
63
61
|
- bin/cwords
|
64
62
|
- bin/cwords_mkdb
|
65
63
|
- lib/ushuffle.jar
|
data/bin/cwords2
DELETED
@@ -1,429 +0,0 @@
|
|
1
|
-
#!/usr/bin/env jruby --server --fast -J-Xmx4096m
|
2
|
-
|
3
|
-
### Requires jruby, www.jruby.org
|
4
|
-
|
5
|
-
###
|
6
|
-
### Running sum analysis for 5 different measures of word enrichment in a sequence:
|
7
|
-
### obs : use the observed word count
|
8
|
-
### bin : use presence/absence of word
|
9
|
-
### pval : use the p-value of the expected occurrences being >= the observed occurence
|
10
|
-
|
11
|
-
srcdir = File.dirname(__FILE__)
|
12
|
-
basedir = srcdir + "/../"
|
13
|
-
libdir = basedir + '/lib/'
|
14
|
-
$LOAD_PATH << libdir
|
15
|
-
|
16
|
-
require 'wordRS-lib.rb'
|
17
|
-
require 'rubygems'
|
18
|
-
require 'progressbar'
|
19
|
-
require 'optparse'
|
20
|
-
require 'peach'
|
21
|
-
require 'java'
|
22
|
-
require libdir + 'ushuffle.jar'
|
23
|
-
java_import 'UShuffle'
|
24
|
-
|
25
|
-
#default options
|
26
|
-
options = Hash.new
|
27
|
-
options[:wordsize] = [7]
|
28
|
-
options[:split_words]=nil
|
29
|
-
options[:dbdir] = basedir + "db/"
|
30
|
-
options[:scoring_scheme] = 'pval'
|
31
|
-
options[:permutations]=50
|
32
|
-
options[:seqshuffles]=100
|
33
|
-
options[:rankfile]=nil
|
34
|
-
options[:seqfile]=nil
|
35
|
-
options[:report_words]=nil
|
36
|
-
options[:plot_words]=nil
|
37
|
-
options[:onlyanno]=nil
|
38
|
-
options[:dump]=nil
|
39
|
-
options[:testing]=nil
|
40
|
-
options[:rank_all]=nil
|
41
|
-
options[:rank_inverse]=nil
|
42
|
-
options[:rank_split_median]=nil
|
43
|
-
options[:rank_abs]=nil
|
44
|
-
options[:bg]=1 #mononucleotide shuffling
|
45
|
-
options[:threads]=1
|
46
|
-
|
47
|
-
$coptions = OptionParser.new do |opts|
|
48
|
-
# analysis settings
|
49
|
-
opts.on("-c", "--scoring_scheme ARG", "scoring scheme") {|o| options[:scoring_scheme] = o}
|
50
|
-
opts.on("-p", "--permutations ARG", "number of list permutations") {|o| options[:permutations] = o.to_i}
|
51
|
-
opts.on("-q", "--shuffles ARG", "number of sequence shuffles for sequence bias correction") {|o| options[:seqshuffles] = o.to_i}
|
52
|
-
opts.on("-w", "--wordsize ARG", "wordsize") { |o| options[:wordsize] = o.split(",").map{|x| x.to_i}}
|
53
|
-
opts.on("-b", "--bg ARG", "background nucleotide model") {|o| options[:bg] = o.to_i}
|
54
|
-
opts.on("-t", "--threads ARG", "use multiple threads to parallelize computations") {|o| options[:threads] = o.to_i}
|
55
|
-
opts.on( "--split_words WORDS", "split sequence set based on occurrences of WORDS") {|o| options[:split_words] = o.split(",")}
|
56
|
-
opts.on( "--onlyanno", "only process annotated (i.e. mirbase) words") {|o| options[:onlyanno] = true}
|
57
|
-
|
58
|
-
# rank control
|
59
|
-
opts.on("-x", "--rank_all", "do not split positive and neg. values") {|o| options[:rank_all] = true}
|
60
|
-
opts.on("-m", "--rank_split_median", "split ranked list at median") {|o| options[:rank_split_median] = true}
|
61
|
-
opts.on("-i", "--rank_inverse", "inverse all ranked lists") {|o| options[:rank_inverse] = true}
|
62
|
-
opts.on("-a", "--rank_abs", "rank by absolute value") {|o| options[:rank_abs] = true}
|
63
|
-
|
64
|
-
# files and directories
|
65
|
-
opts.on("-r", "--rankfile ARG", "rank file") {|o| options[:rankfile] = o}
|
66
|
-
opts.on("-s", "--seqfile ARG", "sequence file") {|o| options[:seqfile] = o}
|
67
|
-
opts.on("-d", "--db ARG", "word database") { |o| options[:db] = o}
|
68
|
-
|
69
|
-
# output control
|
70
|
-
opts.on("-u", "--dump ARG", "dump top words") { |o| options[:dump] = o.to_i}
|
71
|
-
opts.on( "--report_words ARG", "report on words (comma separated)") {|o| options[:report_words] = o.split(',')}
|
72
|
-
opts.on( "--plot_words ARG", "only make plot files for words (comma separated)") {|o| options[:plot_words] = o.split(',')}
|
73
|
-
opts.on( "--testing", "testing mode") {|o| options[:testing] = true}
|
74
|
-
end
|
75
|
-
|
76
|
-
def show_help(msg="", code=0, io=STDOUT)
|
77
|
-
io.puts "#{msg}\n#{$coptions}"
|
78
|
-
exit(code)
|
79
|
-
end
|
80
|
-
|
81
|
-
$coptions.parse!(ARGV)
|
82
|
-
# mandatory parameters
|
83
|
-
[:rankfile].each{|p| show_help("option '#{p}' mandatory") if options[p].nil?}
|
84
|
-
show_help("db or seqfile required") if !(options[:db] or options[:seqfile])
|
85
|
-
show_help("scoring scheme must be one of: obs,bin,pval") if !(['obs','bin','pval'].include?(options[:scoring_scheme]))
|
86
|
-
|
87
|
-
testing = options[:testing]
|
88
|
-
|
89
|
-
# get filename without directory
|
90
|
-
rankfilename = File.basename(options[:rankfile])
|
91
|
-
|
92
|
-
# hard-coded
|
93
|
-
output_top = 10
|
94
|
-
|
95
|
-
prankdir = basedir + "/db/" + options[:db] + "/" if options[:db]
|
96
|
-
annofile = basedir + "/resources/" + "word_annotation.tsv" #annotation
|
97
|
-
tidfile = basedir + "/resources/" + "genemap.tsv"
|
98
|
-
seqshuffles = 5000 # currently hardcoded for database
|
99
|
-
sequences = nil
|
100
|
-
nwords = options[:wordsize].map{|x| 4**x}.to_statarray.sum
|
101
|
-
bg=options[:bg] # TODO, make option
|
102
|
-
threads=options[:threads]
|
103
|
-
|
104
|
-
###
|
105
|
-
### Main program
|
106
|
-
###
|
107
|
-
|
108
|
-
puts ">> Parameters"
|
109
|
-
options.each{|k,v| puts sprintf("%-20s: %s",k,v) if !v.nil?}
|
110
|
-
|
111
|
-
# read in mirbase seed family
|
112
|
-
word_annotation = Hash.new("") # seq => family
|
113
|
-
IO.readlines(annofile).each{|l| word_annotation[l.split("\t")[0]] = l.split("\t")[1]}
|
114
|
-
|
115
|
-
# read optional sequences
|
116
|
-
if options[:seqfile]
|
117
|
-
puts ">> reading sequences ..."
|
118
|
-
sequences = Hash.new
|
119
|
-
IO.readlines(options[:seqfile],">")[1..-1].each do |entry|
|
120
|
-
ls = entry.split("\n").map{|x| x.chomp}
|
121
|
-
# hash ensures sequence ids unique
|
122
|
-
sequences[ls[0]] = ls[1..-2].join('').downcase.gsub('u','t') # last field is ">"
|
123
|
-
end
|
124
|
-
seqshuffles = options[:seqshuffles]
|
125
|
-
end
|
126
|
-
|
127
|
-
# initialize word id hash, word sequence => word id (0..nwords-1)
|
128
|
-
wids = Hash.new
|
129
|
-
i = 0
|
130
|
-
options[:wordsize].each{|ws| ['a','g','c','t'].rep_perm(ws) {|seqa| wids[seqa.join('')]=i ; i+=1 }}
|
131
|
-
|
132
|
-
###
|
133
|
-
### ID mapping
|
134
|
-
###
|
135
|
-
|
136
|
-
# pre-computed word database:
|
137
|
-
# map ids given in rankfile to internal ids
|
138
|
-
# remove rankfile entries with no match to internal id
|
139
|
-
# sequence file:
|
140
|
-
# take intersection of rank and sequence IDs
|
141
|
-
|
142
|
-
puts ">> Mapping and filtering IDs ..."
|
143
|
-
|
144
|
-
all = []
|
145
|
-
begin
|
146
|
-
idmap = Hash.new
|
147
|
-
internal_ids = nil
|
148
|
-
|
149
|
-
if sequences
|
150
|
-
internal_ids = sequences
|
151
|
-
else
|
152
|
-
IO.readlines(tidfile).each do |l|
|
153
|
-
tid = l.split(" ")[0]
|
154
|
-
l.split(" ")[1].split(",").each{|extid| idmap[extid] = tid}
|
155
|
-
end
|
156
|
-
internal_ids = idmap.invert # allowed internal ids
|
157
|
-
end
|
158
|
-
|
159
|
-
allh = Hash.new {|h,k| h[k] = []}
|
160
|
-
filtered = 0
|
161
|
-
|
162
|
-
IO.readlines(options[:rankfile]).each do |l|
|
163
|
-
l = l.split("\t")
|
164
|
-
#test if internal id or mapable external id
|
165
|
-
tid = (internal_ids.key?(l[0]) ? l[0] : idmap[l[0]])
|
166
|
-
tid.nil? ? filtered += 1 : allh[tid] << l[1].to_f
|
167
|
-
end
|
168
|
-
|
169
|
-
# filter unknown sequences
|
170
|
-
sequences.keys.each{|id| sequences.delete(id) if !allh.key?(id)} if sequences
|
171
|
-
|
172
|
-
# we currently mean-collapse ids, we could allow mean/min/max collapsing ...
|
173
|
-
all = allh.to_a.map{|tid,values| [tid,values.to_statarray.mean]}
|
174
|
-
|
175
|
-
puts "removed #{filtered} invalid transcript ids" if filtered > 0
|
176
|
-
end
|
177
|
-
|
178
|
-
allorder = Hash.new # tid => index in all
|
179
|
-
all.each_with_index{|x,i| allorder[x[0]] = i}
|
180
|
-
|
181
|
-
###
|
182
|
-
### Word enumeration (optional)
|
183
|
-
###
|
184
|
-
|
185
|
-
wordscores = []
|
186
|
-
if sequences
|
187
|
-
puts ">> Enumerating words in sequences"
|
188
|
-
wordscores = Array.new(all.size) {Array.new(wids.size,0)} # {Java::short[wids.size].new}
|
189
|
-
pbar = ProgressBar.new("progress",sequences.size)
|
190
|
-
all.peach(threads) do |seqid,val|
|
191
|
-
us = UShuffle.new
|
192
|
-
seq=sequences[seqid]
|
193
|
-
seqidx=allorder[seqid]
|
194
|
-
pbar.inc
|
195
|
-
seqsize = seq.size
|
196
|
-
observed = Array.new(wids.size,0)
|
197
|
-
options[:wordsize].each{|ws| (0..seqsize-ws).each{|i| wid = wids[seq[i, ws]]; observed[wid] += 1 if not wid.nil?}}
|
198
|
-
|
199
|
-
case options[:scoring_scheme]
|
200
|
-
when "bin" then wordscores[seqidx] = observed.map{|x| x > 0 ? 1 : -1}
|
201
|
-
when "obs" then wordscores[seqidx] = observed
|
202
|
-
else
|
203
|
-
# pval, compute distribution of expected word occurrences
|
204
|
-
us.init_shuffle(seq,bg)
|
205
|
-
seqshuffles.times do |si|
|
206
|
-
seqsh = us.shuffle
|
207
|
-
expected = Array.new(wids.size,0)
|
208
|
-
options[:wordsize].each{|ws| (0..seqsize-ws).each{|i| wid = wids[seqsh[i, ws]]; expected[wid] += 1 if !wid.nil?}}
|
209
|
-
observed.each_with_index{|x,widx| wordscores[seqidx][widx] =+ 1 if expected[widx]>=x}
|
210
|
-
end
|
211
|
-
end
|
212
|
-
end
|
213
|
-
pbar.finish
|
214
|
-
end
|
215
|
-
|
216
|
-
###
|
217
|
-
### Generate list ranking
|
218
|
-
###
|
219
|
-
|
220
|
-
analyze = []
|
221
|
-
if options[:rank_split_median]
|
222
|
-
# we should perhaps use an :inverse option,
|
223
|
-
# reversing the two pos and neg lists
|
224
|
-
med = all.map{|x| x[1]}.to_statarray.median
|
225
|
-
pos_set = all.select{|x| x[1] > med}.sort{|a,b| b[1] <=> a[1]}
|
226
|
-
neg_set = all.select{|x| x[1] <= med}.sort{|a,b| a[1] <=> b[1]}
|
227
|
-
analyze = [[pos_set,'med_positive'],[neg_set,'med_negative']]
|
228
|
-
elsif options[:rank_all] # do not split positive and negative range
|
229
|
-
pos_set = all.sort{|a,b| b[1] <=> a[1]}
|
230
|
-
neg_set = all.sort{|a,b| a[1] <=> b[1]}
|
231
|
-
analyze = [[pos_set,'all_positive'],[neg_set,'all_negative']]
|
232
|
-
elsif options[:rank_abs] # rank by absolute values
|
233
|
-
pos_set = all.map{|x| [x[0],x[1].abs]}.sort{|a,b| b[1] <=> a[1]}
|
234
|
-
neg_set = pos_set.reverse
|
235
|
-
analyze = [[pos_set,'abs_positive'],[neg_set,'abs_negative']]
|
236
|
-
else
|
237
|
-
pos_set = all.select{|x| x[1] > 0}.sort{|a,b| b[1] <=> a[1]}
|
238
|
-
neg_set = all.select{|x| x[1] < 0}.sort{|a,b| a[1] <=> b[1]}
|
239
|
-
analyze = [[pos_set,'positive'],[neg_set,'negative']]
|
240
|
-
end
|
241
|
-
|
242
|
-
# inverse lists
|
243
|
-
analyze.map!{|set,nm| [set.reverse,nm+".inv"]} if options[:rank_inverse]
|
244
|
-
|
245
|
-
# split sequence set when --split option is given
|
246
|
-
if options[:split_words]
|
247
|
-
seqs_with_words = Hash.new
|
248
|
-
|
249
|
-
options[:split_words].each do |split_word|
|
250
|
-
begin
|
251
|
-
IO.readlines(prankdir + split_word.downcase + ".rnk").each do |x|
|
252
|
-
l = x.split("\t")
|
253
|
-
seqs_with_words[l[0]] = 1 if l[1].to_i > 0
|
254
|
-
end
|
255
|
-
rescue
|
256
|
-
warn "could not split sequences on word #{split_word}: " + $!
|
257
|
-
end
|
258
|
-
end
|
259
|
-
|
260
|
-
analyze_split = []
|
261
|
-
analyze.each do |set,nm|
|
262
|
-
analyze_split += set.partition{|x| seqs_with_words.key?(x[0])}.zip([nm+".split+"+options[:split_words].join(","),nm+".split-"+options[:split_words].join(",")])
|
263
|
-
end
|
264
|
-
analyze = analyze_split
|
265
|
-
end
|
266
|
-
|
267
|
-
###
|
268
|
-
### Correlation analysis
|
269
|
-
###
|
270
|
-
|
271
|
-
puts ">> Analyzing sequence sets: " + analyze.map{|x| x[1]}.join(", ")
|
272
|
-
|
273
|
-
analyze.each do |set,nm|
|
274
|
-
ngenes = set.size
|
275
|
-
puts "\n>> Analyzing #{nm} set ...\nnumber of genes: #{ngenes}"
|
276
|
-
next if ngenes == 0
|
277
|
-
perms = []
|
278
|
-
report = []
|
279
|
-
pfdrz = []
|
280
|
-
|
281
|
-
franks = Hash.new # tid => index in set
|
282
|
-
set.each_with_index{|x,i| franks[x[0]] = i}
|
283
|
-
|
284
|
-
puts "permuting #{options[:permutations]} times ...\n"
|
285
|
-
options[:permutations].times{|i| perms << (0..set.size-1).to_a.shuffle}
|
286
|
-
|
287
|
-
pbar = ProgressBar.new("progress",nwords)
|
288
|
-
wids.to_a.sort_by{|x| x[1]}.peach(threads) do |word,wid|
|
289
|
-
pbar.inc
|
290
|
-
next if options[:onlyanno] and not word_annotation.key?(word) #only process annotated words
|
291
|
-
next if options[:plot_words] and !options[:plot_words].include?(word)
|
292
|
-
|
293
|
-
plotfile = File.new(rankfilename + ".#{word}.#{nm}.csv","w") if options[:plot_words]
|
294
|
-
|
295
|
-
score = Array.new(ngenes) # scores ordered by fold change
|
296
|
-
|
297
|
-
if sequences
|
298
|
-
score = set.map{|x| wordscores[allorder[x[0]]][wid]}
|
299
|
-
score.map!{|x| -Math.log((x+1.0)/(seqshuffles+1))} if options[:scoring_scheme] == 'pval'
|
300
|
-
else # use precomputed word database
|
301
|
-
wordcounts = IO.readlines(prankdir + word + ".rnk").map{|x| x.split("\t")}.select{|x| franks.key?(x[0])}
|
302
|
-
case options[:scoring_scheme]
|
303
|
-
when "bin" then wordcounts.each{|id,obs,gte_obs,exp| score[franks[id]] = obs.to_i == 0 ? -1 : 1}
|
304
|
-
when "obs" then wordcounts.each{|id,obs,gte_obs,exp| score[franks[id]] = obs.to_f}
|
305
|
-
when "pval" then wordcounts.each{|id,obs,gte_obs,exp| score[franks[id]] = -Math.log((gte_obs.to_f+1)/(seqshuffles+1.0))}
|
306
|
-
end
|
307
|
-
end
|
308
|
-
|
309
|
-
smean = score.to_statarray.mean
|
310
|
-
maxrs = 0
|
311
|
-
leading_edge = 0
|
312
|
-
rs = 0 #running sum
|
313
|
-
rsa = [0]
|
314
|
-
score.each_with_index do |x,i|
|
315
|
-
rs += (x-smean)
|
316
|
-
rsa << rs
|
317
|
-
if rs.abs > maxrs.abs
|
318
|
-
maxrs = rs
|
319
|
-
leading_edge = i+1
|
320
|
-
end
|
321
|
-
end
|
322
|
-
|
323
|
-
plotfile.puts(([word+".score"] + [0] + score.map{|x| x.to_e(2)}).join(",")) if options[:plot_words]
|
324
|
-
plotfile.puts(([word+".rs"] + rsa).join(",")) if options[:plot_words]
|
325
|
-
|
326
|
-
# we are only interested in pos. maxrs scores,
|
327
|
-
# because we currently analyze up/down regulated seperately
|
328
|
-
next if maxrs <= 0
|
329
|
-
|
330
|
-
pmaxrs_pos = StatArray.new
|
331
|
-
perms.each_with_index do |psa,pidx|
|
332
|
-
prs = 0
|
333
|
-
prsa = [0]
|
334
|
-
pmaxrs = 0
|
335
|
-
psa.each do |i|
|
336
|
-
prs += score[i]-smean
|
337
|
-
prsa << prs
|
338
|
-
pmaxrs = prs if prs.abs > pmaxrs.abs
|
339
|
-
end
|
340
|
-
# the permuted scores are approx. symmetric around 0
|
341
|
-
pmaxrs_pos << pmaxrs.abs
|
342
|
-
plotfile.puts(([word+".rs."+pidx.to_s] + prsa).join(",")) if options[:plot_words]
|
343
|
-
end
|
344
|
-
|
345
|
-
pmean = pmaxrs_pos.mean
|
346
|
-
pstd = pmaxrs_pos.stddev
|
347
|
-
|
348
|
-
#Because the word zscore distr. can be quite different,
|
349
|
-
# we compute the deviation from the mean of the absolute dist.
|
350
|
-
# The permuted maxRS should be normally distr. (sum of random numbers)
|
351
|
-
pfdrz += pmaxrs_pos.map{|x| (x-pmean)/pstd}
|
352
|
-
|
353
|
-
#pvalue and fdr statistic for word is also computed based on abs. dist.
|
354
|
-
pval = (pmaxrs_pos.select{|x| x>=maxrs}.size+1.0)/(pmaxrs_pos.size+1)
|
355
|
-
zsc = (maxrs-pmean)/pstd
|
356
|
-
|
357
|
-
plotfile.close if options[:plot_words]
|
358
|
-
report << [wid,zsc,pval,nil,leading_edge]
|
359
|
-
|
360
|
-
end # wordsize
|
361
|
-
pbar.finish
|
362
|
-
|
363
|
-
###
|
364
|
-
### FDR
|
365
|
-
###
|
366
|
-
|
367
|
-
puts "fdr calculation ..."
|
368
|
-
fdrrank = pfdrz.map{|x| [x,nil]} # [zscore,word_report_index]
|
369
|
-
report.each_with_index{|x,idx| fdrrank << [x[1],idx]}
|
370
|
-
fdrrank = fdrrank.sort_by{|x| x[0]}.reverse # sort high zscore to low zscore
|
371
|
-
nfp = pfdrz.size.to_f
|
372
|
-
ntp = report.size.to_f
|
373
|
-
word_fdrrank = Hash.new()
|
374
|
-
ifp = 0
|
375
|
-
itp = 0
|
376
|
-
fdrrank.each do |zsc,idx|
|
377
|
-
if idx.nil?
|
378
|
-
ifp += 1
|
379
|
-
else
|
380
|
-
itp += 1
|
381
|
-
fpr = ifp/nfp
|
382
|
-
tpr = itp/ntp
|
383
|
-
report[idx][3] = fpr/tpr
|
384
|
-
end
|
385
|
-
end
|
386
|
-
|
387
|
-
cutoff_fdr = [0.001,0.005,0.01,0.05,0.1,0.15,0.2,0.25,0.5]
|
388
|
-
puts ""
|
389
|
-
puts (["fdr <="] + cutoff_fdr.map{|x| x.to_s(3)} + ["total"]).join("\t")
|
390
|
-
puts (["count"] + cutoff_fdr.map{|x| report.select{|y| y[3] <= x}.size} + [report.size]).join("\t")
|
391
|
-
|
392
|
-
###
|
393
|
-
### Output summarization
|
394
|
-
###
|
395
|
-
|
396
|
-
wids2 = wids.invert
|
397
|
-
report = report.sort_by{|x| x[1]}.reverse
|
398
|
-
puts "\nTop #{output_top} words"
|
399
|
-
puts ['rank','word','z-score','p-value','fdr','ledge','annotation'].map{|x| sprintf("%-10s",x)}.join('')
|
400
|
-
report[0,output_top].each_with_index do |r,i|
|
401
|
-
wd = wids2[r[0]]
|
402
|
-
s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].to_e(2),r[4].to_s,word_annotation[wd]]
|
403
|
-
puts s.map{|x| sprintf("%-10s",x)}.join('')
|
404
|
-
end
|
405
|
-
|
406
|
-
if options[:report_words]
|
407
|
-
puts "......"
|
408
|
-
report.each_with_index do |r,i|
|
409
|
-
if options[:report_words].include?(r[0]) # and i > output_top
|
410
|
-
wd = wids2[r[0]]
|
411
|
-
s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].to_e(2),r[4].to_s,word_annotation[wd]]
|
412
|
-
puts s.map{|x| sprintf("%-10s",x)}.join('')
|
413
|
-
end
|
414
|
-
end
|
415
|
-
end
|
416
|
-
|
417
|
-
if options[:dump]
|
418
|
-
fname = rankfilename + ".#{nm}." + options[:dump].to_s
|
419
|
-
of = File.new(fname,"w")
|
420
|
-
of.puts ['rank','word','z-score','p-value','fdr','ledge','GS size','annotation'].map{|x| sprintf("%-10s",x)}.join('')
|
421
|
-
puts "dumping top #{options[:dump]} words in file: #{fname}"
|
422
|
-
report[0..options[:dump]-1].each_with_index do |r,i|
|
423
|
-
wd = wids2[r[0]]
|
424
|
-
s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].to_e(2),r[4].to_s,word_annotation[wd]]
|
425
|
-
of.puts s.map{|x| sprintf("%-10s",x)}.join('')
|
426
|
-
end
|
427
|
-
end
|
428
|
-
|
429
|
-
end
|