cwords 0.1.1-jruby → 0.1.2-jruby

Files changed (2)
  1. data/bin/cwords2 +429 -0
  2. metadata +6 -4
@@ -0,0 +1,429 @@
+ #!/usr/bin/env jruby --server --fast -J-Xmx4096m
+
+ ### Requires jruby, www.jruby.org
+
+ ###
+ ### Running sum analysis for 3 different measures of word enrichment in a sequence:
+ ### obs : use the observed word count
+ ### bin : use presence/absence of word
+ ### pval : use the p-value of the expected occurrences being >= the observed occurrence
+
+ srcdir = File.dirname(__FILE__)
+ basedir = srcdir + "/../"
+ libdir = basedir + '/lib/'
+ $LOAD_PATH << libdir
+
+ require 'wordRS-lib.rb'
+ require 'rubygems'
+ require 'progressbar'
+ require 'optparse'
+ require 'peach'
+ require 'java'
+ require libdir + 'ushuffle.jar'
+ java_import 'UShuffle'
+
+ #default options
+ options = Hash.new
+ options[:wordsize] = [7]
+ options[:split_words]=nil
+ options[:dbdir] = basedir + "db/"
+ options[:scoring_scheme] = 'pval'
+ options[:permutations]=50
+ options[:seqshuffles]=100
+ options[:rankfile]=nil
+ options[:seqfile]=nil
+ options[:report_words]=nil
+ options[:plot_words]=nil
+ options[:onlyanno]=nil
+ options[:dump]=nil
+ options[:testing]=nil
+ options[:rank_all]=nil
+ options[:rank_inverse]=nil
+ options[:rank_split_median]=nil
+ options[:rank_abs]=nil
+ options[:bg]=1 #mononucleotide shuffling
+ options[:threads]=1
+
+ $coptions = OptionParser.new do |opts|
+   # analysis settings
+   opts.on("-c", "--scoring_scheme ARG", "scoring scheme") {|o| options[:scoring_scheme] = o}
+   opts.on("-p", "--permutations ARG", "number of list permutations") {|o| options[:permutations] = o.to_i}
+   opts.on("-q", "--shuffles ARG", "number of sequence shuffles for sequence bias correction") {|o| options[:seqshuffles] = o.to_i}
+   opts.on("-w", "--wordsize ARG", "wordsize") { |o| options[:wordsize] = o.split(",").map{|x| x.to_i}}
+   opts.on("-b", "--bg ARG", "background nucleotide model") {|o| options[:bg] = o.to_i}
+   opts.on("-t", "--threads ARG", "use multiple threads to parallelize computations") {|o| options[:threads] = o.to_i}
+   opts.on( "--split_words WORDS", "split sequence set based on occurrences of WORDS") {|o| options[:split_words] = o.split(",")}
+   opts.on( "--onlyanno", "only process annotated (i.e. mirbase) words") {|o| options[:onlyanno] = true}
+
+   # rank control
+   opts.on("-x", "--rank_all", "do not split positive and neg. values") {|o| options[:rank_all] = true}
+   opts.on("-m", "--rank_split_median", "split ranked list at median") {|o| options[:rank_split_median] = true}
+   opts.on("-i", "--rank_inverse", "inverse all ranked lists") {|o| options[:rank_inverse] = true}
+   opts.on("-a", "--rank_abs", "rank by absolute value") {|o| options[:rank_abs] = true}
+
+   # files and directories
+   opts.on("-r", "--rankfile ARG", "rank file") {|o| options[:rankfile] = o}
+   opts.on("-s", "--seqfile ARG", "sequence file") {|o| options[:seqfile] = o}
+   opts.on("-d", "--db ARG", "word database") { |o| options[:db] = o}
+
+   # output control
+   opts.on("-u", "--dump ARG", "dump top words") { |o| options[:dump] = o.to_i}
+   opts.on( "--report_words ARG", "report on words (comma separated)") {|o| options[:report_words] = o.split(',')}
+   opts.on( "--plot_words ARG", "only make plot files for words (comma separated)") {|o| options[:plot_words] = o.split(',')}
+   opts.on( "--testing", "testing mode") {|o| options[:testing] = true}
+ end
+
+ def show_help(msg="", code=0, io=STDOUT)
+   io.puts "#{msg}\n#{$coptions}"
+   exit(code)
+ end
+
+ $coptions.parse!(ARGV)
+ # mandatory parameters
+ [:rankfile].each{|p| show_help("option '#{p}' mandatory") if options[p].nil?}
+ show_help("db or seqfile required") if !(options[:db] or options[:seqfile])
+ show_help("scoring scheme must be one of: obs,bin,pval") if !(['obs','bin','pval'].include?(options[:scoring_scheme]))
+
+ testing = options[:testing]
+
+ # get filename without directory
+ rankfilename = File.basename(options[:rankfile])
+
+ # hard-coded
+ output_top = 10
+
+ prankdir = basedir + "/db/" + options[:db] + "/" if options[:db]
+ annofile = basedir + "/resources/" + "word_annotation.tsv" #annotation
+ tidfile = basedir + "/resources/" + "genemap.tsv"
+ seqshuffles = 5000 # currently hardcoded for database
+ sequences = nil
+ nwords = options[:wordsize].map{|x| 4**x}.to_statarray.sum
+ bg=options[:bg] # TODO, make option
+ threads=options[:threads]
+
+ ###
+ ### Main program
+ ###
+
+ puts ">> Parameters"
+ options.each{|k,v| puts sprintf("%-20s: %s",k,v) if !v.nil?}
+
+ # read in mirbase seed family
+ word_annotation = Hash.new("") # seq => family
+ IO.readlines(annofile).each{|l| word_annotation[l.split("\t")[0]] = l.split("\t")[1]}
+
+ # read optional sequences
+ if options[:seqfile]
+   puts ">> reading sequences ..."
+   sequences = Hash.new
+   IO.readlines(options[:seqfile],">")[1..-1].each do |entry|
+     ls = entry.split("\n").map{|x| x.chomp}
+     # hash ensures sequence ids unique
+     sequences[ls[0]] = ls[1..-2].join('').downcase.gsub('u','t') # last field is ">"
+   end
+   seqshuffles = options[:seqshuffles]
+ end
+
+ # initialize word id hash, word sequence => word id (0..nwords-1)
+ wids = Hash.new
+ i = 0
+ options[:wordsize].each{|ws| ['a','g','c','t'].rep_perm(ws) {|seqa| wids[seqa.join('')]=i ; i+=1 }}
+
+ ###
+ ### ID mapping
+ ###
+
+ # pre-computed word database:
+ #   map ids given in rankfile to internal ids
+ #   remove rankfile entries with no match to internal id
+ # sequence file:
+ #   take intersection of rank and sequence IDs
+
+ puts ">> Mapping and filtering IDs ..."
+
+ all = []
+ begin
+   idmap = Hash.new
+   internal_ids = nil
+
+   if sequences
+     internal_ids = sequences
+   else
+     IO.readlines(tidfile).each do |l|
+       tid = l.split(" ")[0]
+       l.split(" ")[1].split(",").each{|extid| idmap[extid] = tid}
+     end
+     internal_ids = idmap.invert # allowed internal ids
+   end
+
+   allh = Hash.new {|h,k| h[k] = []}
+   filtered = 0
+
+   IO.readlines(options[:rankfile]).each do |l|
+     l = l.split("\t")
+     # test if internal id or mappable external id
+     tid = (internal_ids.key?(l[0]) ? l[0] : idmap[l[0]])
+     tid.nil? ? filtered += 1 : allh[tid] << l[1].to_f
+   end
+
+   # filter unknown sequences
+   sequences.keys.each{|id| sequences.delete(id) if !allh.key?(id)} if sequences
+
+   # we currently mean-collapse ids; we could allow mean/min/max collapsing ...
+   all = allh.to_a.map{|tid,values| [tid,values.to_statarray.mean]}
+
+   puts "removed #{filtered} invalid transcript ids" if filtered > 0
+ end
+
+ allorder = Hash.new # tid => index in all
+ all.each_with_index{|x,i| allorder[x[0]] = i}
+
+ ###
+ ### Word enumeration (optional)
+ ###
+
+ wordscores = []
+ if sequences
+   puts ">> Enumerating words in sequences"
+   wordscores = Array.new(all.size) {Array.new(wids.size,0)} # {Java::short[wids.size].new}
+   pbar = ProgressBar.new("progress",sequences.size)
+   all.peach(threads) do |seqid,val|
+     us = UShuffle.new
+     seq=sequences[seqid]
+     seqidx=allorder[seqid]
+     pbar.inc
+     seqsize = seq.size
+     observed = Array.new(wids.size,0)
+     options[:wordsize].each{|ws| (0..seqsize-ws).each{|i| wid = wids[seq[i, ws]]; observed[wid] += 1 if not wid.nil?}}
+
+     case options[:scoring_scheme]
+     when "bin" then wordscores[seqidx] = observed.map{|x| x > 0 ? 1 : -1}
+     when "obs" then wordscores[seqidx] = observed
+     else
+       # pval, compute distribution of expected word occurrences
+       us.init_shuffle(seq,bg)
+       seqshuffles.times do |si|
+         seqsh = us.shuffle
+         expected = Array.new(wids.size,0)
+         options[:wordsize].each{|ws| (0..seqsize-ws).each{|i| wid = wids[seqsh[i, ws]]; expected[wid] += 1 if !wid.nil?}}
+         observed.each_with_index{|x,widx| wordscores[seqidx][widx] += 1 if expected[widx]>=x}
+       end
+     end
+   end
+   pbar.finish
+ end
+
+ ###
+ ### Generate list ranking
+ ###
+
+ analyze = []
+ if options[:rank_split_median]
+   # we should perhaps use an :inverse option,
+   # reversing the two pos and neg lists
+   med = all.map{|x| x[1]}.to_statarray.median
+   pos_set = all.select{|x| x[1] > med}.sort{|a,b| b[1] <=> a[1]}
+   neg_set = all.select{|x| x[1] <= med}.sort{|a,b| a[1] <=> b[1]}
+   analyze = [[pos_set,'med_positive'],[neg_set,'med_negative']]
+ elsif options[:rank_all] # do not split positive and negative range
+   pos_set = all.sort{|a,b| b[1] <=> a[1]}
+   neg_set = all.sort{|a,b| a[1] <=> b[1]}
+   analyze = [[pos_set,'all_positive'],[neg_set,'all_negative']]
+ elsif options[:rank_abs] # rank by absolute values
+   pos_set = all.map{|x| [x[0],x[1].abs]}.sort{|a,b| b[1] <=> a[1]}
+   neg_set = pos_set.reverse
+   analyze = [[pos_set,'abs_positive'],[neg_set,'abs_negative']]
+ else
+   pos_set = all.select{|x| x[1] > 0}.sort{|a,b| b[1] <=> a[1]}
+   neg_set = all.select{|x| x[1] < 0}.sort{|a,b| a[1] <=> b[1]}
+   analyze = [[pos_set,'positive'],[neg_set,'negative']]
+ end
+
+ # inverse lists
+ analyze.map!{|set,nm| [set.reverse,nm+".inv"]} if options[:rank_inverse]
+
+ # split sequence set when --split_words option is given
+ if options[:split_words]
+   seqs_with_words = Hash.new
+
+   options[:split_words].each do |split_word|
+     begin
+       IO.readlines(prankdir + split_word.downcase + ".rnk").each do |x|
+         l = x.split("\t")
+         seqs_with_words[l[0]] = 1 if l[1].to_i > 0
+       end
+     rescue
+       warn "could not split sequences on word #{split_word}: #{$!}"
+     end
+   end
+
+   analyze_split = []
+   analyze.each do |set,nm|
+     analyze_split += set.partition{|x| seqs_with_words.key?(x[0])}.zip([nm+".split+"+options[:split_words].join(","),nm+".split-"+options[:split_words].join(",")])
+   end
+   analyze = analyze_split
+ end
+
+ ###
+ ### Correlation analysis
+ ###
+
+ puts ">> Analyzing sequence sets: " + analyze.map{|x| x[1]}.join(", ")
+
+ analyze.each do |set,nm|
+   ngenes = set.size
+   puts "\n>> Analyzing #{nm} set ...\nnumber of genes: #{ngenes}"
+   next if ngenes == 0
+   perms = []
+   report = []
+   pfdrz = []
+
+   franks = Hash.new # tid => index in set
+   set.each_with_index{|x,i| franks[x[0]] = i}
+
+   puts "permuting #{options[:permutations]} times ...\n"
+   options[:permutations].times{|i| perms << (0..set.size-1).to_a.shuffle}
+
+   pbar = ProgressBar.new("progress",nwords)
+   wids.to_a.sort_by{|x| x[1]}.peach(threads) do |word,wid|
+     pbar.inc
+     next if options[:onlyanno] and not word_annotation.key?(word) #only process annotated words
+     next if options[:plot_words] and !options[:plot_words].include?(word)
+
+     plotfile = File.new(rankfilename + ".#{word}.#{nm}.csv","w") if options[:plot_words]
+
+     score = Array.new(ngenes) # scores ordered by fold change
+
+     if sequences
+       score = set.map{|x| wordscores[allorder[x[0]]][wid]}
+       score.map!{|x| -Math.log((x+1.0)/(seqshuffles+1))} if options[:scoring_scheme] == 'pval'
+     else # use precomputed word database
+       wordcounts = IO.readlines(prankdir + word + ".rnk").map{|x| x.split("\t")}.select{|x| franks.key?(x[0])}
+       case options[:scoring_scheme]
+       when "bin" then wordcounts.each{|id,obs,gte_obs,exp| score[franks[id]] = obs.to_i == 0 ? -1 : 1}
+       when "obs" then wordcounts.each{|id,obs,gte_obs,exp| score[franks[id]] = obs.to_f}
+       when "pval" then wordcounts.each{|id,obs,gte_obs,exp| score[franks[id]] = -Math.log((gte_obs.to_f+1)/(seqshuffles+1.0))}
+       end
+     end
+
+     smean = score.to_statarray.mean
+     maxrs = 0
+     leading_edge = 0
+     rs = 0 #running sum
+     rsa = [0]
+     score.each_with_index do |x,i|
+       rs += (x-smean)
+       rsa << rs
+       if rs.abs > maxrs.abs
+         maxrs = rs
+         leading_edge = i+1
+       end
+     end
+
+     plotfile.puts(([word+".score"] + [0] + score.map{|x| x.to_e(2)}).join(",")) if options[:plot_words]
+     plotfile.puts(([word+".rs"] + rsa).join(",")) if options[:plot_words]
+
+     # we are only interested in pos. maxrs scores,
+     # because we currently analyze up/down regulated separately
+     next if maxrs <= 0
+
+     pmaxrs_pos = StatArray.new
+     perms.each_with_index do |psa,pidx|
+       prs = 0
+       prsa = [0]
+       pmaxrs = 0
+       psa.each do |i|
+         prs += score[i]-smean
+         prsa << prs
+         pmaxrs = prs if prs.abs > pmaxrs.abs
+       end
+       # the permuted scores are approx. symmetric around 0
+       pmaxrs_pos << pmaxrs.abs
+       plotfile.puts(([word+".rs."+pidx.to_s] + prsa).join(",")) if options[:plot_words]
+     end
+
+     pmean = pmaxrs_pos.mean
+     pstd = pmaxrs_pos.stddev
+
+     # Because the word zscore distr. can be quite different,
+     # we compute the deviation from the mean of the absolute dist.
+     # The permuted maxRS should be normally distr. (sum of random numbers)
+     pfdrz += pmaxrs_pos.map{|x| (x-pmean)/pstd}
+
+     # p-value and fdr statistic for word are also computed based on abs. dist.
+     pval = (pmaxrs_pos.select{|x| x>=maxrs}.size+1.0)/(pmaxrs_pos.size+1)
+     zsc = (maxrs-pmean)/pstd
+
+     plotfile.close if options[:plot_words]
+     report << [wid,zsc,pval,nil,leading_edge]
+
+   end # wordsize
+   pbar.finish
+
+   ###
+   ### FDR
+   ###
+
+   puts "fdr calculation ..."
+   fdrrank = pfdrz.map{|x| [x,nil]} # [zscore,word_report_index]
+   report.each_with_index{|x,idx| fdrrank << [x[1],idx]}
+   fdrrank = fdrrank.sort_by{|x| x[0]}.reverse # sort high zscore to low zscore
+   nfp = pfdrz.size.to_f
+   ntp = report.size.to_f
+   word_fdrrank = Hash.new()
+   ifp = 0
+   itp = 0
+   fdrrank.each do |zsc,idx|
+     if idx.nil?
+       ifp += 1
+     else
+       itp += 1
+       fpr = ifp/nfp
+       tpr = itp/ntp
+       report[idx][3] = fpr/tpr
+     end
+   end
+
+   cutoff_fdr = [0.001,0.005,0.01,0.05,0.1,0.15,0.2,0.25,0.5]
+   puts ""
+   puts (["fdr <="] + cutoff_fdr.map{|x| x.to_s(3)} + ["total"]).join("\t")
+   puts (["count"] + cutoff_fdr.map{|x| report.select{|y| y[3] <= x}.size} + [report.size]).join("\t")
+
+   ###
+   ### Output summarization
+   ###
+
+   wids2 = wids.invert
+   report = report.sort_by{|x| x[1]}.reverse
+   puts "\nTop #{output_top} words"
+   puts ['rank','word','z-score','p-value','fdr','ledge','annotation'].map{|x| sprintf("%-10s",x)}.join('')
+   report[0,output_top].each_with_index do |r,i|
+     wd = wids2[r[0]]
+     s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].to_e(2),r[4].to_s,word_annotation[wd]]
+     puts s.map{|x| sprintf("%-10s",x)}.join('')
+   end
+
+   if options[:report_words]
+     puts "......"
+     report.each_with_index do |r,i|
+       if options[:report_words].include?(wids2[r[0]]) # and i > output_top
+         wd = wids2[r[0]]
+         s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].to_e(2),r[4].to_s,word_annotation[wd]]
+         puts s.map{|x| sprintf("%-10s",x)}.join('')
+       end
+     end
+   end
+
+   if options[:dump]
+     fname = rankfilename + ".#{nm}." + options[:dump].to_s
+     of = File.new(fname,"w")
+     of.puts ['rank','word','z-score','p-value','fdr','ledge','GS size','annotation'].map{|x| sprintf("%-10s",x)}.join('')
+     puts "dumping top #{options[:dump]} words in file: #{fname}"
+     report[0..options[:dump]-1].each_with_index do |r,i|
+       wd = wids2[r[0]]
+       s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].to_e(2),r[4].to_s,word_annotation[wd]]
+       of.puts s.map{|x| sprintf("%-10s",x)}.join('')
+     end
+   end
+
+ end
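
The script above ranks genes by the values in the rank file, scores each word per gene (with the default 'pval' scheme the score is -log((gte_obs+1)/(seqshuffles+1))), and walks a running sum of the mean-centered scores down the ranked list; the maximal running sum (maxRS) is then compared against maxRS values from permuted gene orders to yield a z-score, an empirical p-value and an FDR estimate. Below is a minimal standalone sketch of that maxRS permutation test, assuming per-gene word scores already in ranked-list order; the max_running_sum helper and the example score values are illustrative, not code taken from the gem.

# Illustrative sketch only (not part of the cwords gem): maxRS permutation test
# for a single word, given per-gene word scores in ranked-list order.
def max_running_sum(scores)
  mean = scores.inject(0.0) { |s, x| s + x } / scores.size
  rs = 0.0
  maxrs = 0.0
  scores.each do |x|
    rs += x - mean                    # mean-centered running sum
    maxrs = rs if rs.abs > maxrs.abs  # track the extreme deviation
  end
  maxrs
end

scores = [3.2, 2.9, 1.7, 0.1, 0.0, 0.4, 0.0, 0.2]  # hypothetical word scores
observed = max_running_sum(scores)

# Null distribution: |maxRS| of the same scores under shuffled gene order.
null = Array.new(1000) { max_running_sum(scores.shuffle).abs }
pmean = null.inject(0.0) { |s, x| s + x } / null.size
pstd  = Math.sqrt(null.inject(0.0) { |s, x| s + (x - pmean)**2 } / (null.size - 1))
zscore = (observed - pmean) / pstd
# cwords2 only reports words with a positive maxRS; compare against |observed|.
pvalue = (null.count { |x| x >= observed.abs } + 1.0) / (null.size + 1)
puts "maxRS=%.2f z=%.2f p=%.4f" % [observed, zscore, pvalue]
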
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
  segments:
  - 0
  - 1
- - 1
- version: 0.1.1
+ - 2
+ version: 0.1.2
  platform: jruby
  authors:
  - Anders Jacobsen
@@ -46,11 +46,12 @@ dependencies:
  version: 0.2.0
  type: :runtime
  version_requirements: *id002
- description:
+ description: Word correlation analysis for bioinformatics
  email: andersmbj@gmail.com
  executables:
  - cwords
  - cwords_mkdb
+ - cwords2
  extensions: []

  extra_rdoc_files: []
@@ -58,6 +59,7 @@ extra_rdoc_files: []
  files:
  - README
  - LICENSE
+ - bin/cwords2
  - bin/cwords
  - bin/cwords_mkdb
  - lib/ushuffle.jar
@@ -95,6 +97,6 @@ rubyforge_project:
  rubygems_version: 1.3.6
  signing_key:
  specification_version: 3
- summary: Word correlation analysis (bioinformatics)
+ summary: Word correlation analysis
  test_files: []