cwords 0.1.2-jruby → 0.1.3-jruby

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4) hide show
  1. data/bin/cwords +10 -2
  2. data/bin/cwords_mkdb +9 -2
  3. metadata +3 -5
  4. data/bin/cwords2 +0 -429
data/bin/cwords CHANGED
@@ -1,3 +1,11 @@
1
- #!/bin/bash
1
+ #!/usr/bin/env ruby
2
2
 
3
- jruby --server --fast -J-Xmx4096m `dirname $0`/../scripts/cwords.rb $*
3
+ scriptdir = File.dirname(__FILE__) + "/../scripts/"
4
+
5
+ mems = ARGV.join(" ").match(/M=(\w+)/)
6
+ mem = mems ? mems[1] : '4096m'
7
+ argv = ARGV.select{|x| not x=~ /^M=\w+$/}.join(' ')
8
+ puts "Starting cwords with max heap size " + mem + " ...\n"
9
+
10
+ cmd = "jruby --server --fast -J-Xmx#{mem} " + scriptdir + "cwords.rb " + argv
11
+ exec cmd
data/bin/cwords_mkdb CHANGED
@@ -1,3 +1,10 @@
1
- #!/bin/bash
1
+ #!/usr/bin/env ruby
2
+ scriptdir = File.dirname(__FILE__) + "/../scripts/"
2
3
 
3
- jruby --server --fast -J-Xmx4096m `dirname $0`/../scripts/cwords_mkdb.rb $*
4
+ mems = ARGV.join(" ").match(/M=(\w+)/)
5
+ mem = mems ? mems[1] : '4096m'
6
+ argv = ARGV.select{|x| not x=~ /^M=\w+$/}.join(' ')
7
+ puts "Starting cwords with max heap size " + mem + " ..."
8
+
9
+ cmd = "jruby --server --fast -J-Xmx#{mem} " + scriptdir + "cwords_mkdb.rb " + argv
10
+ exec cmd
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 1
8
- - 2
9
- version: 0.1.2
8
+ - 3
9
+ version: 0.1.3
10
10
  platform: jruby
11
11
  authors:
12
12
  - Anders Jacobsen
@@ -46,12 +46,11 @@ dependencies:
46
46
  version: 0.2.0
47
47
  type: :runtime
48
48
  version_requirements: *id002
49
- description: Word correlation analysis for bioinformatics
49
+ description: Word correlation analysis in ranked nucleotide sequences (bioinformatics)
50
50
  email: andersmbj@gmail.com
51
51
  executables:
52
52
  - cwords
53
53
  - cwords_mkdb
54
- - cwords2
55
54
  extensions: []
56
55
 
57
56
  extra_rdoc_files: []
@@ -59,7 +58,6 @@ extra_rdoc_files: []
59
58
  files:
60
59
  - README
61
60
  - LICENSE
62
- - bin/cwords2
63
61
  - bin/cwords
64
62
  - bin/cwords_mkdb
65
63
  - lib/ushuffle.jar
@@ -1,429 +0,0 @@
1
- #!/usr/bin/env jruby --server --fast -J-Xmx4096m
2
-
3
- ### Requires jruby, www.jruby.org
4
-
5
- ###
6
- ### Running sum analysis for 5 different measures of word enrichment in a sequence:
7
- ### obs : use the observed word count
8
- ### bin : use presence/absence of word
9
- ### pval : use the p-value of the expected occurrences being >= the observed occurrence
10
-
11
- srcdir = File.dirname(__FILE__)
12
- basedir = srcdir + "/../"
13
- libdir = basedir + '/lib/'
14
- $LOAD_PATH << libdir
15
-
16
- require 'wordRS-lib.rb'
17
- require 'rubygems'
18
- require 'progressbar'
19
- require 'optparse'
20
- require 'peach'
21
- require 'java'
22
- require libdir + 'ushuffle.jar'
23
- java_import 'UShuffle'
24
-
25
- #default options
26
- options = Hash.new
27
- options[:wordsize] = [7]
28
- options[:split_words]=nil
29
- options[:dbdir] = basedir + "db/"
30
- options[:scoring_scheme] = 'pval'
31
- options[:permutations]=50
32
- options[:seqshuffles]=100
33
- options[:rankfile]=nil
34
- options[:seqfile]=nil
35
- options[:report_words]=nil
36
- options[:plot_words]=nil
37
- options[:onlyanno]=nil
38
- options[:dump]=nil
39
- options[:testing]=nil
40
- options[:rank_all]=nil
41
- options[:rank_inverse]=nil
42
- options[:rank_split_median]=nil
43
- options[:rank_abs]=nil
44
- options[:bg]=1 #mononucleotide shuffling
45
- options[:threads]=1
46
-
47
- $coptions = OptionParser.new do |opts|
48
- # analysis settings
49
- opts.on("-c", "--scoring_scheme ARG", "scoring scheme") {|o| options[:scoring_scheme] = o}
50
- opts.on("-p", "--permutations ARG", "number of list permutations") {|o| options[:permutations] = o.to_i}
51
- opts.on("-q", "--shuffles ARG", "number of sequence shuffles for sequence bias correction") {|o| options[:seqshuffles] = o.to_i}
52
- opts.on("-w", "--wordsize ARG", "wordsize") { |o| options[:wordsize] = o.split(",").map{|x| x.to_i}}
53
- opts.on("-b", "--bg ARG", "background nucleotide model") {|o| options[:bg] = o.to_i}
54
- opts.on("-t", "--threads ARG", "use multiple threads to parallelize computations") {|o| options[:threads] = o.to_i}
55
- opts.on( "--split_words WORDS", "split sequence set based on occurrences of WORDS") {|o| options[:split_words] = o.split(",")}
56
- opts.on( "--onlyanno", "only process annotated (i.e. mirbase) words") {|o| options[:onlyanno] = true}
57
-
58
- # rank control
59
- opts.on("-x", "--rank_all", "do not split positive and neg. values") {|o| options[:rank_all] = true}
60
- opts.on("-m", "--rank_split_median", "split ranked list at median") {|o| options[:rank_split_median] = true}
61
- opts.on("-i", "--rank_inverse", "inverse all ranked lists") {|o| options[:rank_inverse] = true}
62
- opts.on("-a", "--rank_abs", "rank by absolute value") {|o| options[:rank_abs] = true}
63
-
64
- # files and directories
65
- opts.on("-r", "--rankfile ARG", "rank file") {|o| options[:rankfile] = o}
66
- opts.on("-s", "--seqfile ARG", "sequence file") {|o| options[:seqfile] = o}
67
- opts.on("-d", "--db ARG", "word database") { |o| options[:db] = o}
68
-
69
- # output control
70
- opts.on("-u", "--dump ARG", "dump top words") { |o| options[:dump] = o.to_i}
71
- opts.on( "--report_words ARG", "report on words (comma separated)") {|o| options[:report_words] = o.split(',')}
72
- opts.on( "--plot_words ARG", "only make plot files for words (comma separated)") {|o| options[:plot_words] = o.split(',')}
73
- opts.on( "--testing", "testing mode") {|o| options[:testing] = true}
74
- end
75
-
76
- def show_help(msg="", code=0, io=STDOUT)
77
- io.puts "#{msg}\n#{$coptions}"
78
- exit(code)
79
- end
80
-
81
- $coptions.parse!(ARGV)
82
- # mandatory parameters
83
- [:rankfile].each{|p| show_help("option '#{p}' mandatory") if options[p].nil?}
84
- show_help("db or seqfile required") if !(options[:db] or options[:seqfile])
85
- show_help("scoring scheme must be one of: obs,bin,pval") if !(['obs','bin','pval'].include?(options[:scoring_scheme]))
86
-
87
- testing = options[:testing]
88
-
89
- # get filename without directory
90
- rankfilename = File.basename(options[:rankfile])
91
-
92
- # hard-coded
93
- output_top = 10
94
-
95
- prankdir = basedir + "/db/" + options[:db] + "/" if options[:db]
96
- annofile = basedir + "/resources/" + "word_annotation.tsv" #annotation
97
- tidfile = basedir + "/resources/" + "genemap.tsv"
98
- seqshuffles = 5000 # currently hardcoded for database
99
- sequences = nil
100
- nwords = options[:wordsize].map{|x| 4**x}.to_statarray.sum
101
- bg=options[:bg] # TODO, make option
102
- threads=options[:threads]
103
-
104
- ###
105
- ### Main program
106
- ###
107
-
108
- puts ">> Parameters"
109
- options.each{|k,v| puts sprintf("%-20s: %s",k,v) if !v.nil?}
110
-
111
- # read in mirbase seed family
112
- word_annotation = Hash.new("") # seq => family
113
- IO.readlines(annofile).each{|l| word_annotation[l.split("\t")[0]] = l.split("\t")[1]}
114
-
115
- # read optional sequences
116
- if options[:seqfile]
117
- puts ">> reading sequences ..."
118
- sequences = Hash.new
119
- IO.readlines(options[:seqfile],">")[1..-1].each do |entry|
120
- ls = entry.split("\n").map{|x| x.chomp}
121
- # hash ensures sequence ids unique
122
- sequences[ls[0]] = ls[1..-2].join('').downcase.gsub('u','t') # last field is ">"
123
- end
124
- seqshuffles = options[:seqshuffles]
125
- end
126
-
127
- # initialize word id hash, word sequence => word id (0..nwords-1)
128
- wids = Hash.new
129
- i = 0
130
- options[:wordsize].each{|ws| ['a','g','c','t'].rep_perm(ws) {|seqa| wids[seqa.join('')]=i ; i+=1 }}
131
-
132
- ###
133
- ### ID mapping
134
- ###
135
-
136
- # pre-computed word database:
137
- # map ids given in rankfile to internal ids
138
- # remove rankfile entries with no match to internal id
139
- # sequence file:
140
- # take intersection of rank and sequence IDs
141
-
142
- puts ">> Mapping and filtering IDs ..."
143
-
144
- all = []
145
- begin
146
- idmap = Hash.new
147
- internal_ids = nil
148
-
149
- if sequences
150
- internal_ids = sequences
151
- else
152
- IO.readlines(tidfile).each do |l|
153
- tid = l.split(" ")[0]
154
- l.split(" ")[1].split(",").each{|extid| idmap[extid] = tid}
155
- end
156
- internal_ids = idmap.invert # allowed internal ids
157
- end
158
-
159
- allh = Hash.new {|h,k| h[k] = []}
160
- filtered = 0
161
-
162
- IO.readlines(options[:rankfile]).each do |l|
163
- l = l.split("\t")
164
- #test if internal id or mapable external id
165
- tid = (internal_ids.key?(l[0]) ? l[0] : idmap[l[0]])
166
- tid.nil? ? filtered += 1 : allh[tid] << l[1].to_f
167
- end
168
-
169
- # filter unknown sequences
170
- sequences.keys.each{|id| sequences.delete(id) if !allh.key?(id)} if sequences
171
-
172
- # we currently mean-collapse ids, we could allow mean/min/max collapsing ...
173
- all = allh.to_a.map{|tid,values| [tid,values.to_statarray.mean]}
174
-
175
- puts "removed #{filtered} invalid transcript ids" if filtered > 0
176
- end
177
-
178
- allorder = Hash.new # tid => index in all
179
- all.each_with_index{|x,i| allorder[x[0]] = i}
180
-
181
- ###
182
- ### Word enumeration (optional)
183
- ###
184
-
185
- wordscores = []
186
- if sequences
187
- puts ">> Enumerating words in sequences"
188
- wordscores = Array.new(all.size) {Array.new(wids.size,0)} # {Java::short[wids.size].new}
189
- pbar = ProgressBar.new("progress",sequences.size)
190
- all.peach(threads) do |seqid,val|
191
- us = UShuffle.new
192
- seq=sequences[seqid]
193
- seqidx=allorder[seqid]
194
- pbar.inc
195
- seqsize = seq.size
196
- observed = Array.new(wids.size,0)
197
- options[:wordsize].each{|ws| (0..seqsize-ws).each{|i| wid = wids[seq[i, ws]]; observed[wid] += 1 if not wid.nil?}}
198
-
199
- case options[:scoring_scheme]
200
- when "bin" then wordscores[seqidx] = observed.map{|x| x > 0 ? 1 : -1}
201
- when "obs" then wordscores[seqidx] = observed
202
- else
203
- # pval, compute distribution of expected word occurrences
204
- us.init_shuffle(seq,bg)
205
- seqshuffles.times do |si|
206
- seqsh = us.shuffle
207
- expected = Array.new(wids.size,0)
208
- options[:wordsize].each{|ws| (0..seqsize-ws).each{|i| wid = wids[seqsh[i, ws]]; expected[wid] += 1 if !wid.nil?}}
209
- observed.each_with_index{|x,widx| wordscores[seqidx][widx] =+ 1 if expected[widx]>=x}
210
- end
211
- end
212
- end
213
- pbar.finish
214
- end
215
-
216
- ###
217
- ### Generate list ranking
218
- ###
219
-
220
- analyze = []
221
- if options[:rank_split_median]
222
- # we should perhaps use an :inverse option,
223
- # reversing the two pos and neg lists
224
- med = all.map{|x| x[1]}.to_statarray.median
225
- pos_set = all.select{|x| x[1] > med}.sort{|a,b| b[1] <=> a[1]}
226
- neg_set = all.select{|x| x[1] <= med}.sort{|a,b| a[1] <=> b[1]}
227
- analyze = [[pos_set,'med_positive'],[neg_set,'med_negative']]
228
- elsif options[:rank_all] # do not split positive and negative range
229
- pos_set = all.sort{|a,b| b[1] <=> a[1]}
230
- neg_set = all.sort{|a,b| a[1] <=> b[1]}
231
- analyze = [[pos_set,'all_positive'],[neg_set,'all_negative']]
232
- elsif options[:rank_abs] # rank by absolute values
233
- pos_set = all.map{|x| [x[0],x[1].abs]}.sort{|a,b| b[1] <=> a[1]}
234
- neg_set = pos_set.reverse
235
- analyze = [[pos_set,'abs_positive'],[neg_set,'abs_negative']]
236
- else
237
- pos_set = all.select{|x| x[1] > 0}.sort{|a,b| b[1] <=> a[1]}
238
- neg_set = all.select{|x| x[1] < 0}.sort{|a,b| a[1] <=> b[1]}
239
- analyze = [[pos_set,'positive'],[neg_set,'negative']]
240
- end
241
-
242
- # inverse lists
243
- analyze.map!{|set,nm| [set.reverse,nm+".inv"]} if options[:rank_inverse]
244
-
245
- # split sequence set when --split option is given
246
- if options[:split_words]
247
- seqs_with_words = Hash.new
248
-
249
- options[:split_words].each do |split_word|
250
- begin
251
- IO.readlines(prankdir + split_word.downcase + ".rnk").each do |x|
252
- l = x.split("\t")
253
- seqs_with_words[l[0]] = 1 if l[1].to_i > 0
254
- end
255
- rescue
256
- warn "could not split sequences on word #{split_word}: " + $!
257
- end
258
- end
259
-
260
- analyze_split = []
261
- analyze.each do |set,nm|
262
- analyze_split += set.partition{|x| seqs_with_words.key?(x[0])}.zip([nm+".split+"+options[:split_words].join(","),nm+".split-"+options[:split_words].join(",")])
263
- end
264
- analyze = analyze_split
265
- end
266
-
267
- ###
268
- ### Correlation analysis
269
- ###
270
-
271
- puts ">> Analyzing sequence sets: " + analyze.map{|x| x[1]}.join(", ")
272
-
273
- analyze.each do |set,nm|
274
- ngenes = set.size
275
- puts "\n>> Analyzing #{nm} set ...\nnumber of genes: #{ngenes}"
276
- next if ngenes == 0
277
- perms = []
278
- report = []
279
- pfdrz = []
280
-
281
- franks = Hash.new # tid => index in set
282
- set.each_with_index{|x,i| franks[x[0]] = i}
283
-
284
- puts "permuting #{options[:permutations]} times ...\n"
285
- options[:permutations].times{|i| perms << (0..set.size-1).to_a.shuffle}
286
-
287
- pbar = ProgressBar.new("progress",nwords)
288
- wids.to_a.sort_by{|x| x[1]}.peach(threads) do |word,wid|
289
- pbar.inc
290
- next if options[:onlyanno] and not word_annotation.key?(word) #only process annotated words
291
- next if options[:plot_words] and !options[:plot_words].include?(word)
292
-
293
- plotfile = File.new(rankfilename + ".#{word}.#{nm}.csv","w") if options[:plot_words]
294
-
295
- score = Array.new(ngenes) # scores ordered by fold change
296
-
297
- if sequences
298
- score = set.map{|x| wordscores[allorder[x[0]]][wid]}
299
- score.map!{|x| -Math.log((x+1.0)/(seqshuffles+1))} if options[:scoring_scheme] == 'pval'
300
- else # use precomputed word database
301
- wordcounts = IO.readlines(prankdir + word + ".rnk").map{|x| x.split("\t")}.select{|x| franks.key?(x[0])}
302
- case options[:scoring_scheme]
303
- when "bin" then wordcounts.each{|id,obs,gte_obs,exp| score[franks[id]] = obs.to_i == 0 ? -1 : 1}
304
- when "obs" then wordcounts.each{|id,obs,gte_obs,exp| score[franks[id]] = obs.to_f}
305
- when "pval" then wordcounts.each{|id,obs,gte_obs,exp| score[franks[id]] = -Math.log((gte_obs.to_f+1)/(seqshuffles+1.0))}
306
- end
307
- end
308
-
309
- smean = score.to_statarray.mean
310
- maxrs = 0
311
- leading_edge = 0
312
- rs = 0 #running sum
313
- rsa = [0]
314
- score.each_with_index do |x,i|
315
- rs += (x-smean)
316
- rsa << rs
317
- if rs.abs > maxrs.abs
318
- maxrs = rs
319
- leading_edge = i+1
320
- end
321
- end
322
-
323
- plotfile.puts(([word+".score"] + [0] + score.map{|x| x.to_e(2)}).join(",")) if options[:plot_words]
324
- plotfile.puts(([word+".rs"] + rsa).join(",")) if options[:plot_words]
325
-
326
- # we are only interested in pos. maxrs scores,
327
- # because we currently analyze up/down regulated separately
328
- next if maxrs <= 0
329
-
330
- pmaxrs_pos = StatArray.new
331
- perms.each_with_index do |psa,pidx|
332
- prs = 0
333
- prsa = [0]
334
- pmaxrs = 0
335
- psa.each do |i|
336
- prs += score[i]-smean
337
- prsa << prs
338
- pmaxrs = prs if prs.abs > pmaxrs.abs
339
- end
340
- # the permuted scores are approx. symmetric around 0
341
- pmaxrs_pos << pmaxrs.abs
342
- plotfile.puts(([word+".rs."+pidx.to_s] + prsa).join(",")) if options[:plot_words]
343
- end
344
-
345
- pmean = pmaxrs_pos.mean
346
- pstd = pmaxrs_pos.stddev
347
-
348
- #Because the word zscore distr. can be quite different,
349
- # we compute the deviation from the mean of the absolute dist.
350
- # The permuted maxRS should be normally distr. (sum of random numbers)
351
- pfdrz += pmaxrs_pos.map{|x| (x-pmean)/pstd}
352
-
353
- #pvalue and fdr statistic for word is also computed based on abs. dist.
354
- pval = (pmaxrs_pos.select{|x| x>=maxrs}.size+1.0)/(pmaxrs_pos.size+1)
355
- zsc = (maxrs-pmean)/pstd
356
-
357
- plotfile.close if options[:plot_words]
358
- report << [wid,zsc,pval,nil,leading_edge]
359
-
360
- end # wordsize
361
- pbar.finish
362
-
363
- ###
364
- ### FDR
365
- ###
366
-
367
- puts "fdr calculation ..."
368
- fdrrank = pfdrz.map{|x| [x,nil]} # [zscore,word_report_index]
369
- report.each_with_index{|x,idx| fdrrank << [x[1],idx]}
370
- fdrrank = fdrrank.sort_by{|x| x[0]}.reverse # sort high zscore to low zscore
371
- nfp = pfdrz.size.to_f
372
- ntp = report.size.to_f
373
- word_fdrrank = Hash.new()
374
- ifp = 0
375
- itp = 0
376
- fdrrank.each do |zsc,idx|
377
- if idx.nil?
378
- ifp += 1
379
- else
380
- itp += 1
381
- fpr = ifp/nfp
382
- tpr = itp/ntp
383
- report[idx][3] = fpr/tpr
384
- end
385
- end
386
-
387
- cutoff_fdr = [0.001,0.005,0.01,0.05,0.1,0.15,0.2,0.25,0.5]
388
- puts ""
389
- puts (["fdr <="] + cutoff_fdr.map{|x| x.to_s(3)} + ["total"]).join("\t")
390
- puts (["count"] + cutoff_fdr.map{|x| report.select{|y| y[3] <= x}.size} + [report.size]).join("\t")
391
-
392
- ###
393
- ### Output summarization
394
- ###
395
-
396
- wids2 = wids.invert
397
- report = report.sort_by{|x| x[1]}.reverse
398
- puts "\nTop #{output_top} words"
399
- puts ['rank','word','z-score','p-value','fdr','ledge','annotation'].map{|x| sprintf("%-10s",x)}.join('')
400
- report[0,output_top].each_with_index do |r,i|
401
- wd = wids2[r[0]]
402
- s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].to_e(2),r[4].to_s,word_annotation[wd]]
403
- puts s.map{|x| sprintf("%-10s",x)}.join('')
404
- end
405
-
406
- if options[:report_words]
407
- puts "......"
408
- report.each_with_index do |r,i|
409
- if options[:report_words].include?(r[0]) # and i > output_top
410
- wd = wids2[r[0]]
411
- s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].to_e(2),r[4].to_s,word_annotation[wd]]
412
- puts s.map{|x| sprintf("%-10s",x)}.join('')
413
- end
414
- end
415
- end
416
-
417
- if options[:dump]
418
- fname = rankfilename + ".#{nm}." + options[:dump].to_s
419
- of = File.new(fname,"w")
420
- of.puts ['rank','word','z-score','p-value','fdr','ledge','GS size','annotation'].map{|x| sprintf("%-10s",x)}.join('')
421
- puts "dumping top #{options[:dump]} words in file: #{fname}"
422
- report[0..options[:dump]-1].each_with_index do |r,i|
423
- wd = wids2[r[0]]
424
- s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].to_e(2),r[4].to_s,word_annotation[wd]]
425
- of.puts s.map{|x| sprintf("%-10s",x)}.join('')
426
- end
427
- end
428
-
429
- end