cwords 0.1.2-jruby → 0.1.3-jruby

Sign up to get free protection for your applications and to get access to all the features.
Files changed (4)
  1. data/bin/cwords +10 -2
  2. data/bin/cwords_mkdb +9 -2
  3. metadata +3 -5
  4. data/bin/cwords2 +0 -429
data/bin/cwords CHANGED
@@ -1,3 +1,11 @@
1
- #!/bin/bash
1
+ #!/usr/bin/env ruby
2
2
 
3
- jruby --server --fast -J-Xmx4096m `dirname $0`/../scripts/cwords.rb $*
3
+ scriptdir = File.dirname(__FILE__) + "/../scripts/"
4
+
5
+ mems = ARGV.join(" ").match(/M=(\w+)/)
6
+ mem = mems ? mems[1] : '4096m'
7
+ argv = ARGV.select{|x| not x=~ /^M=\w+$/}.join(' ')
8
+ puts "Starting cwords with max heap size " + mem + " ...\n"
9
+
10
+ cmd = "jruby --server --fast -J-Xmx#{mem} " + scriptdir + "cwords.rb " + argv
11
+ exec cmd
data/bin/cwords_mkdb CHANGED
@@ -1,3 +1,10 @@
1
- #!/bin/bash
1
+ #!/usr/bin/env ruby
2
+ scriptdir = File.dirname(__FILE__) + "/../scripts/"
2
3
 
3
- jruby --server --fast -J-Xmx4096m `dirname $0`/../scripts/cwords_mkdb.rb $*
4
+ mems = ARGV.join(" ").match(/M=(\w+)/)
5
+ mem = mems ? mems[1] : '4096m'
6
+ argv = ARGV.select{|x| not x=~ /^M=\w+$/}.join(' ')
7
+ puts "Starting cwords with max heap size " + mem + " ..."
8
+
9
+ cmd = "jruby --server --fast -J-Xmx#{mem} " + scriptdir + "cwords_mkdb.rb " + argv
10
+ exec cmd
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 1
8
- - 2
9
- version: 0.1.2
8
+ - 3
9
+ version: 0.1.3
10
10
  platform: jruby
11
11
  authors:
12
12
  - Anders Jacobsen
@@ -46,12 +46,11 @@ dependencies:
46
46
  version: 0.2.0
47
47
  type: :runtime
48
48
  version_requirements: *id002
49
- description: Word correlation analysis for bioinformatics
49
+ description: Word correlation analysis in ranked nucleotide sequences (bioinformatics)
50
50
  email: andersmbj@gmail.com
51
51
  executables:
52
52
  - cwords
53
53
  - cwords_mkdb
54
- - cwords2
55
54
  extensions: []
56
55
 
57
56
  extra_rdoc_files: []
@@ -59,7 +58,6 @@ extra_rdoc_files: []
59
58
  files:
60
59
  - README
61
60
  - LICENSE
62
- - bin/cwords2
63
61
  - bin/cwords
64
62
  - bin/cwords_mkdb
65
63
  - lib/ushuffle.jar
data/bin/cwords2 DELETED
@@ -1,429 +0,0 @@
1
- #!/usr/bin/env jruby --server --fast -J-Xmx4096m
2
-
3
- ### Requires jruby, www.jruby.org
4
-
5
- ###
6
- ### Running sum analysis for 3 different measures of word enrichment in a sequence:
7
- ### obs : use the observed word count
8
- ### bin : use presence/absence of word
9
- ### pval : use the p-value of the expected occurrences being >= the observed occurrence
10
-
11
- srcdir = File.dirname(__FILE__)
12
- basedir = srcdir + "/../"
13
- libdir = basedir + '/lib/'
14
- $LOAD_PATH << libdir
15
-
16
- require 'wordRS-lib.rb'
17
- require 'rubygems'
18
- require 'progressbar'
19
- require 'optparse'
20
- require 'peach'
21
- require 'java'
22
- require libdir + 'ushuffle.jar'
23
- java_import 'UShuffle'
24
-
25
- #default options
26
- options = Hash.new
27
- options[:wordsize] = [7]
28
- options[:split_words]=nil
29
- options[:dbdir] = basedir + "db/"
30
- options[:scoring_scheme] = 'pval'
31
- options[:permutations]=50
32
- options[:seqshuffles]=100
33
- options[:rankfile]=nil
34
- options[:seqfile]=nil
35
- options[:report_words]=nil
36
- options[:plot_words]=nil
37
- options[:onlyanno]=nil
38
- options[:dump]=nil
39
- options[:testing]=nil
40
- options[:rank_all]=nil
41
- options[:rank_inverse]=nil
42
- options[:rank_split_median]=nil
43
- options[:rank_abs]=nil
44
- options[:bg]=1 #mononucleotide shuffling
45
- options[:threads]=1
46
-
47
- $coptions = OptionParser.new do |opts|
48
- # analysis settings
49
- opts.on("-c", "--scoring_scheme ARG", "scoring scheme") {|o| options[:scoring_scheme] = o}
50
- opts.on("-p", "--permutations ARG", "number of list permutations") {|o| options[:permutations] = o.to_i}
51
- opts.on("-q", "--shuffles ARG", "number of sequence shuffles for sequence bias correction") {|o| options[:seqshuffles] = o.to_i}
52
- opts.on("-w", "--wordsize ARG", "wordsize") { |o| options[:wordsize] = o.split(",").map{|x| x.to_i}}
53
- opts.on("-b", "--bg ARG", "background nucleotide model") {|o| options[:bg] = o.to_i}
54
- opts.on("-t", "--threads ARG", "use multiple threads to parallelize computations") {|o| options[:threads] = o.to_i}
55
- opts.on( "--split_words WORDS", "split sequence set based on occurrences of WORDS") {|o| options[:split_words] = o.split(",")}
56
- opts.on( "--onlyanno", "only process annotated (i.e. mirbase) words") {|o| options[:onlyanno] = true}
57
-
58
- # rank control
59
- opts.on("-x", "--rank_all", "do not split positive and neg. values") {|o| options[:rank_all] = true}
60
- opts.on("-m", "--rank_split_median", "split ranked list at median") {|o| options[:rank_split_median] = true}
61
- opts.on("-i", "--rank_inverse", "inverse all ranked lists") {|o| options[:rank_inverse] = true}
62
- opts.on("-a", "--rank_abs", "rank by absolute value") {|o| options[:rank_abs] = true}
63
-
64
- # files and directories
65
- opts.on("-r", "--rankfile ARG", "rank file") {|o| options[:rankfile] = o}
66
- opts.on("-s", "--seqfile ARG", "sequence file") {|o| options[:seqfile] = o}
67
- opts.on("-d", "--db ARG", "word database") { |o| options[:db] = o}
68
-
69
- # output control
70
- opts.on("-u", "--dump ARG", "dump top words") { |o| options[:dump] = o.to_i}
71
- opts.on( "--report_words ARG", "report on words (comma separated)") {|o| options[:report_words] = o.split(',')}
72
- opts.on( "--plot_words ARG", "only make plot files for words (comma separated)") {|o| options[:plot_words] = o.split(',')}
73
- opts.on( "--testing", "testing mode") {|o| options[:testing] = true}
74
- end
75
-
76
- def show_help(msg="", code=0, io=STDOUT)
77
- io.puts "#{msg}\n#{$coptions}"
78
- exit(code)
79
- end
80
-
81
- $coptions.parse!(ARGV)
82
- # mandatory parameters
83
- [:rankfile].each{|p| show_help("option '#{p}' mandatory") if options[p].nil?}
84
- show_help("db or seqfile required") if !(options[:db] or options[:seqfile])
85
- show_help("scoring scheme must be one of: obs,bin,pval") if !(['obs','bin','pval'].include?(options[:scoring_scheme]))
86
-
87
- testing = options[:testing]
88
-
89
- # get filename without directory
90
- rankfilename = File.basename(options[:rankfile])
91
-
92
- # hard-coded
93
- output_top = 10
94
-
95
- prankdir = basedir + "/db/" + options[:db] + "/" if options[:db]
96
- annofile = basedir + "/resources/" + "word_annotation.tsv" #annotation
97
- tidfile = basedir + "/resources/" + "genemap.tsv"
98
- seqshuffles = 5000 # currently hardcoded for database
99
- sequences = nil
100
- nwords = options[:wordsize].map{|x| 4**x}.to_statarray.sum
101
- bg=options[:bg] # TODO, make option
102
- threads=options[:threads]
103
-
104
- ###
105
- ### Main program
106
- ###
107
-
108
- puts ">> Parameters"
109
- options.each{|k,v| puts sprintf("%-20s: %s",k,v) if !v.nil?}
110
-
111
- # read in mirbase seed family
112
- word_annotation = Hash.new("") # seq => family
113
- IO.readlines(annofile).each{|l| word_annotation[l.split("\t")[0]] = l.split("\t")[1]}
114
-
115
- # read optional sequences
116
- if options[:seqfile]
117
- puts ">> reading sequences ..."
118
- sequences = Hash.new
119
- IO.readlines(options[:seqfile],">")[1..-1].each do |entry|
120
- ls = entry.split("\n").map{|x| x.chomp}
121
- # hash ensures sequence ids unique
122
- sequences[ls[0]] = ls[1..-2].join('').downcase.gsub('u','t') # last field is ">"
123
- end
124
- seqshuffles = options[:seqshuffles]
125
- end
126
-
127
- # initialize word id hash, word sequence => word id (0..nwords-1)
128
- wids = Hash.new
129
- i = 0
130
- options[:wordsize].each{|ws| ['a','g','c','t'].rep_perm(ws) {|seqa| wids[seqa.join('')]=i ; i+=1 }}
131
-
132
- ###
133
- ### ID mapping
134
- ###
135
-
136
- # pre-computed word database:
137
- # map ids given in rankfile to internal ids
138
- # remove rankfile entries with no match to internal id
139
- # sequence file:
140
- # take intersection of rank and sequence IDs
141
-
142
- puts ">> Mapping and filtering IDs ..."
143
-
144
- all = []
145
- begin
146
- idmap = Hash.new
147
- internal_ids = nil
148
-
149
- if sequences
150
- internal_ids = sequences
151
- else
152
- IO.readlines(tidfile).each do |l|
153
- tid = l.split(" ")[0]
154
- l.split(" ")[1].split(",").each{|extid| idmap[extid] = tid}
155
- end
156
- internal_ids = idmap.invert # allowed internal ids
157
- end
158
-
159
- allh = Hash.new {|h,k| h[k] = []}
160
- filtered = 0
161
-
162
- IO.readlines(options[:rankfile]).each do |l|
163
- l = l.split("\t")
164
- #test if internal id or mapable external id
165
- tid = (internal_ids.key?(l[0]) ? l[0] : idmap[l[0]])
166
- tid.nil? ? filtered += 1 : allh[tid] << l[1].to_f
167
- end
168
-
169
- # filter unknown sequences
170
- sequences.keys.each{|id| sequences.delete(id) if !allh.key?(id)} if sequences
171
-
172
- # we currently mean-collapse ids, we could allow mean/min/max collapsing ...
173
- all = allh.to_a.map{|tid,values| [tid,values.to_statarray.mean]}
174
-
175
- puts "removed #{filtered} invalid transcript ids" if filtered > 0
176
- end
177
-
178
- allorder = Hash.new # tid => index in all
179
- all.each_with_index{|x,i| allorder[x[0]] = i}
180
-
181
- ###
182
- ### Word enumeration (optional)
183
- ###
184
-
185
- wordscores = []
186
- if sequences
187
- puts ">> Enumerating words in sequences"
188
- wordscores = Array.new(all.size) {Array.new(wids.size,0)} # {Java::short[wids.size].new}
189
- pbar = ProgressBar.new("progress",sequences.size)
190
- all.peach(threads) do |seqid,val|
191
- us = UShuffle.new
192
- seq=sequences[seqid]
193
- seqidx=allorder[seqid]
194
- pbar.inc
195
- seqsize = seq.size
196
- observed = Array.new(wids.size,0)
197
- options[:wordsize].each{|ws| (0..seqsize-ws).each{|i| wid = wids[seq[i, ws]]; observed[wid] += 1 if not wid.nil?}}
198
-
199
- case options[:scoring_scheme]
200
- when "bin" then wordscores[seqidx] = observed.map{|x| x > 0 ? 1 : -1}
201
- when "obs" then wordscores[seqidx] = observed
202
- else
203
- # pval, compute distribution of expected word occurrences
204
- us.init_shuffle(seq,bg)
205
- seqshuffles.times do |si|
206
- seqsh = us.shuffle
207
- expected = Array.new(wids.size,0)
208
- options[:wordsize].each{|ws| (0..seqsize-ws).each{|i| wid = wids[seqsh[i, ws]]; expected[wid] += 1 if !wid.nil?}}
209
- observed.each_with_index{|x,widx| wordscores[seqidx][widx] =+ 1 if expected[widx]>=x}
210
- end
211
- end
212
- end
213
- pbar.finish
214
- end
215
-
216
- ###
217
- ### Generate list ranking
218
- ###
219
-
220
- analyze = []
221
- if options[:rank_split_median]
222
- # we should perhaps use an :inverse option,
223
- # reversing the two pos and neg lists
224
- med = all.map{|x| x[1]}.to_statarray.median
225
- pos_set = all.select{|x| x[1] > med}.sort{|a,b| b[1] <=> a[1]}
226
- neg_set = all.select{|x| x[1] <= med}.sort{|a,b| a[1] <=> b[1]}
227
- analyze = [[pos_set,'med_positive'],[neg_set,'med_negative']]
228
- elsif options[:rank_all] # do not split positive and negative range
229
- pos_set = all.sort{|a,b| b[1] <=> a[1]}
230
- neg_set = all.sort{|a,b| a[1] <=> b[1]}
231
- analyze = [[pos_set,'all_positive'],[neg_set,'all_negative']]
232
- elsif options[:rank_abs] # rank by absolute values
233
- pos_set = all.map{|x| [x[0],x[1].abs]}.sort{|a,b| b[1] <=> a[1]}
234
- neg_set = pos_set.reverse
235
- analyze = [[pos_set,'abs_positive'],[neg_set,'abs_negative']]
236
- else
237
- pos_set = all.select{|x| x[1] > 0}.sort{|a,b| b[1] <=> a[1]}
238
- neg_set = all.select{|x| x[1] < 0}.sort{|a,b| a[1] <=> b[1]}
239
- analyze = [[pos_set,'positive'],[neg_set,'negative']]
240
- end
241
-
242
- # inverse lists
243
- analyze.map!{|set,nm| [set.reverse,nm+".inv"]} if options[:rank_inverse]
244
-
245
- # split sequence set when --split option is given
246
- if options[:split_words]
247
- seqs_with_words = Hash.new
248
-
249
- options[:split_words].each do |split_word|
250
- begin
251
- IO.readlines(prankdir + split_word.downcase + ".rnk").each do |x|
252
- l = x.split("\t")
253
- seqs_with_words[l[0]] = 1 if l[1].to_i > 0
254
- end
255
- rescue
256
- warn "could not split sequences on word #{split_word}: " + $!
257
- end
258
- end
259
-
260
- analyze_split = []
261
- analyze.each do |set,nm|
262
- analyze_split += set.partition{|x| seqs_with_words.key?(x[0])}.zip([nm+".split+"+options[:split_words].join(","),nm+".split-"+options[:split_words].join(",")])
263
- end
264
- analyze = analyze_split
265
- end
266
-
267
- ###
268
- ### Correlation analysis
269
- ###
270
-
271
- puts ">> Analyzing sequence sets: " + analyze.map{|x| x[1]}.join(", ")
272
-
273
- analyze.each do |set,nm|
274
- ngenes = set.size
275
- puts "\n>> Analyzing #{nm} set ...\nnumber of genes: #{ngenes}"
276
- next if ngenes == 0
277
- perms = []
278
- report = []
279
- pfdrz = []
280
-
281
- franks = Hash.new # tid => index in set
282
- set.each_with_index{|x,i| franks[x[0]] = i}
283
-
284
- puts "permuting #{options[:permutations]} times ...\n"
285
- options[:permutations].times{|i| perms << (0..set.size-1).to_a.shuffle}
286
-
287
- pbar = ProgressBar.new("progress",nwords)
288
- wids.to_a.sort_by{|x| x[1]}.peach(threads) do |word,wid|
289
- pbar.inc
290
- next if options[:onlyanno] and not word_annotation.key?(word) #only process annotated words
291
- next if options[:plot_words] and !options[:plot_words].include?(word)
292
-
293
- plotfile = File.new(rankfilename + ".#{word}.#{nm}.csv","w") if options[:plot_words]
294
-
295
- score = Array.new(ngenes) # scores ordered by fold change
296
-
297
- if sequences
298
- score = set.map{|x| wordscores[allorder[x[0]]][wid]}
299
- score.map!{|x| -Math.log((x+1.0)/(seqshuffles+1))} if options[:scoring_scheme] == 'pval'
300
- else # use precomputed word database
301
- wordcounts = IO.readlines(prankdir + word + ".rnk").map{|x| x.split("\t")}.select{|x| franks.key?(x[0])}
302
- case options[:scoring_scheme]
303
- when "bin" then wordcounts.each{|id,obs,gte_obs,exp| score[franks[id]] = obs.to_i == 0 ? -1 : 1}
304
- when "obs" then wordcounts.each{|id,obs,gte_obs,exp| score[franks[id]] = obs.to_f}
305
- when "pval" then wordcounts.each{|id,obs,gte_obs,exp| score[franks[id]] = -Math.log((gte_obs.to_f+1)/(seqshuffles+1.0))}
306
- end
307
- end
308
-
309
- smean = score.to_statarray.mean
310
- maxrs = 0
311
- leading_edge = 0
312
- rs = 0 #running sum
313
- rsa = [0]
314
- score.each_with_index do |x,i|
315
- rs += (x-smean)
316
- rsa << rs
317
- if rs.abs > maxrs.abs
318
- maxrs = rs
319
- leading_edge = i+1
320
- end
321
- end
322
-
323
- plotfile.puts(([word+".score"] + [0] + score.map{|x| x.to_e(2)}).join(",")) if options[:plot_words]
324
- plotfile.puts(([word+".rs"] + rsa).join(",")) if options[:plot_words]
325
-
326
- # we are only interested in pos. maxrs scores,
327
- # because we currently analyze up/down regulated seperately
328
- next if maxrs <= 0
329
-
330
- pmaxrs_pos = StatArray.new
331
- perms.each_with_index do |psa,pidx|
332
- prs = 0
333
- prsa = [0]
334
- pmaxrs = 0
335
- psa.each do |i|
336
- prs += score[i]-smean
337
- prsa << prs
338
- pmaxrs = prs if prs.abs > pmaxrs.abs
339
- end
340
- # the permuted scores are approx. symmetric around 0
341
- pmaxrs_pos << pmaxrs.abs
342
- plotfile.puts(([word+".rs."+pidx.to_s] + prsa).join(",")) if options[:plot_words]
343
- end
344
-
345
- pmean = pmaxrs_pos.mean
346
- pstd = pmaxrs_pos.stddev
347
-
348
- #Because the word zscore distr. can be quite different,
349
- # we compute the deviation from the mean of the absolute dist.
350
- # The permuted maxRS should be normally distr. (sum of random numbers)
351
- pfdrz += pmaxrs_pos.map{|x| (x-pmean)/pstd}
352
-
353
- #pvalue and fdr statistic for word is also computed based on abs. dist.
354
- pval = (pmaxrs_pos.select{|x| x>=maxrs}.size+1.0)/(pmaxrs_pos.size+1)
355
- zsc = (maxrs-pmean)/pstd
356
-
357
- plotfile.close if options[:plot_words]
358
- report << [wid,zsc,pval,nil,leading_edge]
359
-
360
- end # wordsize
361
- pbar.finish
362
-
363
- ###
364
- ### FDR
365
- ###
366
-
367
- puts "fdr calculation ..."
368
- fdrrank = pfdrz.map{|x| [x,nil]} # [zscore,word_report_index]
369
- report.each_with_index{|x,idx| fdrrank << [x[1],idx]}
370
- fdrrank = fdrrank.sort_by{|x| x[0]}.reverse # sort high zscore to low zscore
371
- nfp = pfdrz.size.to_f
372
- ntp = report.size.to_f
373
- word_fdrrank = Hash.new()
374
- ifp = 0
375
- itp = 0
376
- fdrrank.each do |zsc,idx|
377
- if idx.nil?
378
- ifp += 1
379
- else
380
- itp += 1
381
- fpr = ifp/nfp
382
- tpr = itp/ntp
383
- report[idx][3] = fpr/tpr
384
- end
385
- end
386
-
387
- cutoff_fdr = [0.001,0.005,0.01,0.05,0.1,0.15,0.2,0.25,0.5]
388
- puts ""
389
- puts (["fdr <="] + cutoff_fdr.map{|x| x.to_s(3)} + ["total"]).join("\t")
390
- puts (["count"] + cutoff_fdr.map{|x| report.select{|y| y[3] <= x}.size} + [report.size]).join("\t")
391
-
392
- ###
393
- ### Output summarization
394
- ###
395
-
396
- wids2 = wids.invert
397
- report = report.sort_by{|x| x[1]}.reverse
398
- puts "\nTop #{output_top} words"
399
- puts ['rank','word','z-score','p-value','fdr','ledge','annotation'].map{|x| sprintf("%-10s",x)}.join('')
400
- report[0,output_top].each_with_index do |r,i|
401
- wd = wids2[r[0]]
402
- s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].to_e(2),r[4].to_s,word_annotation[wd]]
403
- puts s.map{|x| sprintf("%-10s",x)}.join('')
404
- end
405
-
406
- if options[:report_words]
407
- puts "......"
408
- report.each_with_index do |r,i|
409
- if options[:report_words].include?(r[0]) # and i > output_top
410
- wd = wids2[r[0]]
411
- s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].to_e(2),r[4].to_s,word_annotation[wd]]
412
- puts s.map{|x| sprintf("%-10s",x)}.join('')
413
- end
414
- end
415
- end
416
-
417
- if options[:dump]
418
- fname = rankfilename + ".#{nm}." + options[:dump].to_s
419
- of = File.new(fname,"w")
420
- of.puts ['rank','word','z-score','p-value','fdr','ledge','GS size','annotation'].map{|x| sprintf("%-10s",x)}.join('')
421
- puts "dumping top #{options[:dump]} words in file: #{fname}"
422
- report[0..options[:dump]-1].each_with_index do |r,i|
423
- wd = wids2[r[0]]
424
- s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].to_e(2),r[4].to_s,word_annotation[wd]]
425
- of.puts s.map{|x| sprintf("%-10s",x)}.join('')
426
- end
427
- end
428
-
429
- end