cwords 0.1.3-jruby → 0.1.4-jruby
Sign up to get free protection for your applications and to get access to all the features.
- data/scripts/cwords.rb +431 -0
- data/scripts/cwords_mkdb.rb +84 -0
- metadata +4 -2
data/scripts/cwords.rb
ADDED
@@ -0,0 +1,431 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
|
3
|
+
### Requires jruby, www.jruby.org
|
4
|
+
|
5
|
+
###
|
6
|
+
### Running sum analysis for 5 different measures of word enrichment in a sequence:
|
7
|
+
### obs : use the observed word count
|
8
|
+
### bin : use presence/absence of word
|
9
|
+
### pval : use the p-value of the expected occurrences being >= the observed occurence
|
10
|
+
|
11
|
+
srcdir = File.dirname(__FILE__)
|
12
|
+
basedir = srcdir + "/../"
|
13
|
+
libdir = basedir + '/lib/'
|
14
|
+
$LOAD_PATH << libdir
|
15
|
+
|
16
|
+
require 'wordRS-lib.rb'
|
17
|
+
require 'rubygems'
|
18
|
+
require 'progressbar'
|
19
|
+
require 'optparse'
|
20
|
+
require 'peach'
|
21
|
+
require 'java'
|
22
|
+
require libdir + 'ushuffle.jar'
|
23
|
+
java_import 'UShuffle'
|
24
|
+
|
25
|
+
#default options
|
26
|
+
options = Hash.new
|
27
|
+
options[:wordsize] = [7]
|
28
|
+
options[:split_words]=nil
|
29
|
+
options[:dbdir] = basedir + "db/"
|
30
|
+
options[:scoring_scheme] = 'pval'
|
31
|
+
options[:permutations]=50
|
32
|
+
options[:seqshuffles]=100
|
33
|
+
options[:rankfile]=nil
|
34
|
+
options[:seqfile]=nil
|
35
|
+
options[:report_words]=nil
|
36
|
+
options[:plot_words]=nil
|
37
|
+
options[:onlyanno]=nil
|
38
|
+
options[:dump]=nil
|
39
|
+
options[:testing]=nil
|
40
|
+
options[:rank_all]=nil
|
41
|
+
options[:rank_inverse]=nil
|
42
|
+
options[:rank_split_median]=nil
|
43
|
+
options[:rank_abs]=nil
|
44
|
+
options[:bg]=1 #mononucleotide shuffling
|
45
|
+
options[:threads]=1
|
46
|
+
|
47
|
+
$coptions = OptionParser.new do |opts|
|
48
|
+
opts.banner = "Usage: cwords [options]"
|
49
|
+
|
50
|
+
# analysis settings
|
51
|
+
opts.on("-c", "--scoring_scheme ARG", "scoring scheme") {|o| options[:scoring_scheme] = o}
|
52
|
+
opts.on("-p", "--permutations ARG", "number of list permutations") {|o| options[:permutations] = o.to_i}
|
53
|
+
opts.on("-q", "--shuffles ARG", "number of sequence shuffles for sequence bias correction") {|o| options[:seqshuffles] = o.to_i}
|
54
|
+
opts.on("-w", "--wordsize ARG", "wordsize") { |o| options[:wordsize] = o.split(",").map{|x| x.to_i}}
|
55
|
+
opts.on("-b", "--bg ARG", "background nucleotide model") {|o| options[:bg] = o.to_i}
|
56
|
+
opts.on("-t", "--threads ARG", "use multiple threads to parallelize computations") {|o| options[:threads] = o.to_i}
|
57
|
+
opts.on( "--split_words WORDS", "split sequence set based on occurrences of WORDS") {|o| options[:split_words] = o.split(",")}
|
58
|
+
opts.on( "--onlyanno", "only process annotated (i.e. mirbase) words") {|o| options[:onlyanno] = true}
|
59
|
+
|
60
|
+
# rank control
|
61
|
+
opts.on("-x", "--rank_all", "do not split positive and neg. values") {|o| options[:rank_all] = true}
|
62
|
+
opts.on("-m", "--rank_split_median", "split ranked list at median") {|o| options[:rank_split_median] = true}
|
63
|
+
opts.on("-i", "--rank_inverse", "inverse all ranked lists") {|o| options[:rank_inverse] = true}
|
64
|
+
opts.on("-a", "--rank_abs", "rank by absolute value") {|o| options[:rank_abs] = true}
|
65
|
+
|
66
|
+
# files and directories
|
67
|
+
opts.on("-r", "--rankfile ARG", "rank file") {|o| options[:rankfile] = o}
|
68
|
+
opts.on("-s", "--seqfile ARG", "sequence file") {|o| options[:seqfile] = o}
|
69
|
+
opts.on("-d", "--db ARG", "word database") { |o| options[:db] = o}
|
70
|
+
|
71
|
+
# output control
|
72
|
+
opts.on("-u", "--dump ARG", "dump top words") { |o| options[:dump] = o.to_i}
|
73
|
+
opts.on( "--report_words ARG", "report on words (comma separated)") {|o| options[:report_words] = o.split(',')}
|
74
|
+
opts.on( "--plot_words ARG", "only make plot files for words (comma separated)") {|o| options[:plot_words] = o.split(',')}
|
75
|
+
opts.on( "--testing", "testing mode") {|o| options[:testing] = true}
|
76
|
+
end
|
77
|
+
|
78
|
+
# Print +msg+ followed by the global option summary to +io+,
# then terminate the process with exit status +code+.
def show_help(msg="", code=0, io=STDOUT)
  banner = "#{msg}\n#{$coptions}"
  io.puts(banner)
  exit(code)
end
|
82
|
+
|
83
|
+
$coptions.parse!(ARGV)
|
84
|
+
# mandatory parameters
|
85
|
+
[:rankfile].each{|p| show_help("option '#{p}' mandatory") if options[p].nil?}
|
86
|
+
show_help("db or seqfile required") if !(options[:db] or options[:seqfile])
|
87
|
+
show_help("scoring scheme must be one of: obs,bin,pval") if !(['obs','bin','pval'].include?(options[:scoring_scheme]))
|
88
|
+
|
89
|
+
testing = options[:testing]
|
90
|
+
|
91
|
+
# get filename without directory
|
92
|
+
rankfilename = File.basename(options[:rankfile])
|
93
|
+
|
94
|
+
# hard-coded
|
95
|
+
output_top = 10
|
96
|
+
|
97
|
+
prankdir = basedir + "/db/" + options[:db] + "/" if options[:db]
|
98
|
+
annofile = basedir + "/resources/" + "word_annotation.tsv" #annotation
|
99
|
+
tidfile = basedir + "/resources/" + "genemap.tsv"
|
100
|
+
seqshuffles = 5000 # currently hardcoded for database
|
101
|
+
sequences = nil
|
102
|
+
nwords = options[:wordsize].map{|x| 4**x}.to_statarray.sum
|
103
|
+
bg=options[:bg] # TODO, make option
|
104
|
+
threads=options[:threads]
|
105
|
+
|
106
|
+
###
|
107
|
+
### Main program
|
108
|
+
###
|
109
|
+
|
110
|
+
puts ">> Parameters"
|
111
|
+
options.each{|k,v| puts sprintf("%-20s: %s",k,v) if !v.nil?}
|
112
|
+
|
113
|
+
# read in mirbase seed family
|
114
|
+
word_annotation = Hash.new("") # seq => family
|
115
|
+
IO.readlines(annofile).each{|l| word_annotation[l.split("\t")[0]] = l.split("\t")[1]}
|
116
|
+
|
117
|
+
# read optional sequences
|
118
|
+
if options[:seqfile]
|
119
|
+
puts ">> reading sequences ..."
|
120
|
+
sequences = Hash.new
|
121
|
+
IO.readlines(options[:seqfile],">")[1..-1].each do |entry|
|
122
|
+
ls = entry.split("\n").map{|x| x.chomp}
|
123
|
+
# hash ensures sequence ids unique
|
124
|
+
sequences[ls[0]] = ls[1..-2].join('').downcase.gsub('u','t') # last field is ">"
|
125
|
+
end
|
126
|
+
seqshuffles = options[:seqshuffles]
|
127
|
+
end
|
128
|
+
|
129
|
+
# initialize word id hash, word sequence => word id (0..nwords-1)
|
130
|
+
wids = Hash.new
|
131
|
+
i = 0
|
132
|
+
options[:wordsize].each{|ws| ['a','g','c','t'].rep_perm(ws) {|seqa| wids[seqa.join('')]=i ; i+=1 }}
|
133
|
+
|
134
|
+
###
|
135
|
+
### ID mapping
|
136
|
+
###
|
137
|
+
|
138
|
+
# pre-computed word database:
|
139
|
+
# map ids given in rankfile to internal ids
|
140
|
+
# remove rankfile entries with no match to internal id
|
141
|
+
# sequence file:
|
142
|
+
# take intersection of rank and sequence IDs
|
143
|
+
|
144
|
+
puts ">> Mapping and filtering IDs ..."
|
145
|
+
|
146
|
+
all = []
|
147
|
+
begin
|
148
|
+
idmap = Hash.new
|
149
|
+
internal_ids = nil
|
150
|
+
|
151
|
+
if sequences
|
152
|
+
internal_ids = sequences
|
153
|
+
else
|
154
|
+
IO.readlines(tidfile).each do |l|
|
155
|
+
tid = l.split(" ")[0]
|
156
|
+
l.split(" ")[1].split(",").each{|extid| idmap[extid] = tid}
|
157
|
+
end
|
158
|
+
internal_ids = idmap.invert # allowed internal ids
|
159
|
+
end
|
160
|
+
|
161
|
+
allh = Hash.new {|h,k| h[k] = []}
|
162
|
+
filtered = 0
|
163
|
+
|
164
|
+
IO.readlines(options[:rankfile]).each do |l|
|
165
|
+
l = l.split("\t")
|
166
|
+
#test if internal id or mapable external id
|
167
|
+
tid = (internal_ids.key?(l[0]) ? l[0] : idmap[l[0]])
|
168
|
+
tid.nil? ? filtered += 1 : allh[tid] << l[1].to_f
|
169
|
+
end
|
170
|
+
|
171
|
+
# filter unknown sequences
|
172
|
+
sequences.keys.each{|id| sequences.delete(id) if !allh.key?(id)} if sequences
|
173
|
+
|
174
|
+
# we currently mean-collapse ids, we could allow mean/min/max collapsing ...
|
175
|
+
all = allh.to_a.map{|tid,values| [tid,values.to_statarray.mean]}
|
176
|
+
|
177
|
+
puts "removed #{filtered} invalid transcript ids" if filtered > 0
|
178
|
+
end
|
179
|
+
|
180
|
+
allorder = Hash.new # tid => index in all
|
181
|
+
all.each_with_index{|x,i| allorder[x[0]] = i}
|
182
|
+
|
183
|
+
###
|
184
|
+
### Word enumeration (optional)
|
185
|
+
###
|
186
|
+
|
187
|
+
wordscores = []
|
188
|
+
if sequences
|
189
|
+
puts ">> Enumerating words in sequences"
|
190
|
+
wordscores = Array.new(all.size) {Array.new(wids.size,0)} # {Java::short[wids.size].new}
|
191
|
+
pbar = ProgressBar.new("progress",sequences.size)
|
192
|
+
all.peach(threads) do |seqid,val|
|
193
|
+
us = UShuffle.new
|
194
|
+
seq=sequences[seqid]
|
195
|
+
seqidx=allorder[seqid]
|
196
|
+
pbar.inc
|
197
|
+
seqsize = seq.size
|
198
|
+
observed = Array.new(wids.size,0)
|
199
|
+
options[:wordsize].each{|ws| (0..seqsize-ws).each{|i| wid = wids[seq[i, ws]]; observed[wid] += 1 if not wid.nil?}}
|
200
|
+
|
201
|
+
# Score this sequence for every word, according to the chosen scheme:
#   bin  : +1 if the word occurs at least once, -1 otherwise
#   obs  : the raw observed word count
#   pval : number of shuffled sequences whose word count >= the observed
#          count (later turned into -log((x+1)/(seqshuffles+1)))
case options[:scoring_scheme]
when "bin" then wordscores[seqidx] = observed.map{|x| x > 0 ? 1 : -1}
when "obs" then wordscores[seqidx] = observed
else
  # pval, compute distribution of expected word occurrences
  us.init_shuffle(seq,bg)
  seqshuffles.times do |si|
    seqsh = us.shuffle
    expected = Array.new(wids.size,0)
    options[:wordsize].each{|ws| (0..seqsize-ws).each{|i| wid = wids[seqsh[i, ws]]; expected[wid] += 1 if !wid.nil?}}
    # BUG FIX: the original used '=+ 1' (i.e. assignment of +1), which pinned
    # the counter at 1 instead of accumulating across the seqshuffles
    # iterations; the pval transform downstream expects a cumulative count.
    observed.each_with_index{|x,widx| wordscores[seqidx][widx] += 1 if expected[widx]>=x}
  end
end
|
214
|
+
end
|
215
|
+
pbar.finish
|
216
|
+
end
|
217
|
+
|
218
|
+
###
|
219
|
+
### Generate list ranking
|
220
|
+
###
|
221
|
+
|
222
|
+
analyze = []
|
223
|
+
if options[:rank_split_median]
|
224
|
+
# we should perhaps use an :inverse option,
|
225
|
+
# reversing the two pos and neg lists
|
226
|
+
med = all.map{|x| x[1]}.to_statarray.median
|
227
|
+
pos_set = all.select{|x| x[1] > med}.sort{|a,b| b[1] <=> a[1]}
|
228
|
+
neg_set = all.select{|x| x[1] <= med}.sort{|a,b| a[1] <=> b[1]}
|
229
|
+
analyze = [[pos_set,'med_positive'],[neg_set,'med_negative']]
|
230
|
+
elsif options[:rank_all] # do not split positive and negative range
|
231
|
+
pos_set = all.sort{|a,b| b[1] <=> a[1]}
|
232
|
+
neg_set = all.sort{|a,b| a[1] <=> b[1]}
|
233
|
+
analyze = [[pos_set,'all_positive'],[neg_set,'all_negative']]
|
234
|
+
elsif options[:rank_abs] # rank by absolute values
|
235
|
+
pos_set = all.map{|x| [x[0],x[1].abs]}.sort{|a,b| b[1] <=> a[1]}
|
236
|
+
neg_set = pos_set.reverse
|
237
|
+
analyze = [[pos_set,'abs_positive'],[neg_set,'abs_negative']]
|
238
|
+
else
|
239
|
+
pos_set = all.select{|x| x[1] > 0}.sort{|a,b| b[1] <=> a[1]}
|
240
|
+
neg_set = all.select{|x| x[1] < 0}.sort{|a,b| a[1] <=> b[1]}
|
241
|
+
analyze = [[pos_set,'positive'],[neg_set,'negative']]
|
242
|
+
end
|
243
|
+
|
244
|
+
# inverse lists
|
245
|
+
analyze.map!{|set,nm| [set.reverse,nm+".inv"]} if options[:rank_inverse]
|
246
|
+
|
247
|
+
# split sequence set when --split option is given
|
248
|
+
if options[:split_words]
|
249
|
+
seqs_with_words = Hash.new
|
250
|
+
|
251
|
+
options[:split_words].each do |split_word|
|
252
|
+
begin
|
253
|
+
IO.readlines(prankdir + split_word.downcase + ".rnk").each do |x|
|
254
|
+
l = x.split("\t")
|
255
|
+
seqs_with_words[l[0]] = 1 if l[1].to_i > 0
|
256
|
+
end
|
257
|
+
rescue
|
258
|
+
warn "could not split sequences on word #{split_word}: " + $!
|
259
|
+
end
|
260
|
+
end
|
261
|
+
|
262
|
+
analyze_split = []
|
263
|
+
analyze.each do |set,nm|
|
264
|
+
analyze_split += set.partition{|x| seqs_with_words.key?(x[0])}.zip([nm+".split+"+options[:split_words].join(","),nm+".split-"+options[:split_words].join(",")])
|
265
|
+
end
|
266
|
+
analyze = analyze_split
|
267
|
+
end
|
268
|
+
|
269
|
+
###
|
270
|
+
### Correlation analysis
|
271
|
+
###
|
272
|
+
|
273
|
+
puts ">> Analyzing sequence sets: " + analyze.map{|x| x[1]}.join(", ")
|
274
|
+
|
275
|
+
analyze.each do |set,nm|
|
276
|
+
ngenes = set.size
|
277
|
+
puts "\n>> Analyzing #{nm} set ...\nnumber of genes: #{ngenes}"
|
278
|
+
next if ngenes == 0
|
279
|
+
perms = []
|
280
|
+
report = []
|
281
|
+
pfdrz = []
|
282
|
+
|
283
|
+
franks = Hash.new # tid => index in set
|
284
|
+
set.each_with_index{|x,i| franks[x[0]] = i}
|
285
|
+
|
286
|
+
puts "permuting #{options[:permutations]} times ...\n"
|
287
|
+
options[:permutations].times{|i| perms << (0..set.size-1).to_a.shuffle}
|
288
|
+
|
289
|
+
pbar = ProgressBar.new("progress",nwords)
|
290
|
+
wids.to_a.sort_by{|x| x[1]}.peach(threads) do |word,wid|
|
291
|
+
pbar.inc
|
292
|
+
next if options[:onlyanno] and not word_annotation.key?(word) #only process annotated words
|
293
|
+
next if options[:plot_words] and !options[:plot_words].include?(word)
|
294
|
+
|
295
|
+
plotfile = File.new(rankfilename + ".#{word}.#{nm}.csv","w") if options[:plot_words]
|
296
|
+
|
297
|
+
score = Array.new(ngenes) # scores ordered by fold change
|
298
|
+
|
299
|
+
if sequences
|
300
|
+
score = set.map{|x| wordscores[allorder[x[0]]][wid]}
|
301
|
+
score.map!{|x| -Math.log((x+1.0)/(seqshuffles+1))} if options[:scoring_scheme] == 'pval'
|
302
|
+
else # use precomputed word database
|
303
|
+
wordcounts = IO.readlines(prankdir + word + ".rnk").map{|x| x.split("\t")}.select{|x| franks.key?(x[0])}
|
304
|
+
case options[:scoring_scheme]
|
305
|
+
when "bin" then wordcounts.each{|id,obs,gte_obs,exp| score[franks[id]] = obs.to_i == 0 ? -1 : 1}
|
306
|
+
when "obs" then wordcounts.each{|id,obs,gte_obs,exp| score[franks[id]] = obs.to_f}
|
307
|
+
when "pval" then wordcounts.each{|id,obs,gte_obs,exp| score[franks[id]] = -Math.log((gte_obs.to_f+1)/(seqshuffles+1.0))}
|
308
|
+
end
|
309
|
+
end
|
310
|
+
|
311
|
+
smean = score.to_statarray.mean
|
312
|
+
maxrs = 0
|
313
|
+
leading_edge = 0
|
314
|
+
rs = 0 #running sum
|
315
|
+
rsa = [0]
|
316
|
+
score.each_with_index do |x,i|
|
317
|
+
rs += (x-smean)
|
318
|
+
rsa << rs
|
319
|
+
if rs.abs > maxrs.abs
|
320
|
+
maxrs = rs
|
321
|
+
leading_edge = i+1
|
322
|
+
end
|
323
|
+
end
|
324
|
+
|
325
|
+
plotfile.puts(([word+".score"] + [0] + score.map{|x| x.to_e(2)}).join(",")) if options[:plot_words]
|
326
|
+
plotfile.puts(([word+".rs"] + rsa).join(",")) if options[:plot_words]
|
327
|
+
|
328
|
+
# we are only interested in pos. maxrs scores,
|
329
|
+
# because we currently analyze up/down regulated seperately
|
330
|
+
next if maxrs <= 0
|
331
|
+
|
332
|
+
pmaxrs_pos = StatArray.new
|
333
|
+
perms.each_with_index do |psa,pidx|
|
334
|
+
prs = 0
|
335
|
+
prsa = [0]
|
336
|
+
pmaxrs = 0
|
337
|
+
psa.each do |i|
|
338
|
+
prs += score[i]-smean
|
339
|
+
prsa << prs
|
340
|
+
pmaxrs = prs if prs.abs > pmaxrs.abs
|
341
|
+
end
|
342
|
+
# the permuted scores are approx. symmetric around 0
|
343
|
+
pmaxrs_pos << pmaxrs.abs
|
344
|
+
plotfile.puts(([word+".rs."+pidx.to_s] + prsa).join(",")) if options[:plot_words]
|
345
|
+
end
|
346
|
+
|
347
|
+
pmean = pmaxrs_pos.mean
|
348
|
+
pstd = pmaxrs_pos.stddev
|
349
|
+
|
350
|
+
#Because the word zscore distr. can be quite different,
|
351
|
+
# we compute the deviation from the mean of the absolute dist.
|
352
|
+
# The permuted maxRS should be normally distr. (sum of random numbers)
|
353
|
+
pfdrz += pmaxrs_pos.map{|x| (x-pmean)/pstd}
|
354
|
+
|
355
|
+
#pvalue and fdr statistic for word is also computed based on abs. dist.
|
356
|
+
pval = (pmaxrs_pos.select{|x| x>=maxrs}.size+1.0)/(pmaxrs_pos.size+1)
|
357
|
+
zsc = (maxrs-pmean)/pstd
|
358
|
+
|
359
|
+
plotfile.close if options[:plot_words]
|
360
|
+
report << [wid,zsc,pval,nil,leading_edge]
|
361
|
+
|
362
|
+
end # wordsize
|
363
|
+
pbar.finish
|
364
|
+
|
365
|
+
###
|
366
|
+
### FDR
|
367
|
+
###
|
368
|
+
|
369
|
+
puts "fdr calculation ..."
|
370
|
+
fdrrank = pfdrz.map{|x| [x,nil]} # [zscore,word_report_index]
|
371
|
+
report.each_with_index{|x,idx| fdrrank << [x[1],idx]}
|
372
|
+
fdrrank = fdrrank.sort_by{|x| x[0]}.reverse # sort high zscore to low zscore
|
373
|
+
nfp = pfdrz.size.to_f
|
374
|
+
ntp = report.size.to_f
|
375
|
+
word_fdrrank = Hash.new()
|
376
|
+
ifp = 0
|
377
|
+
itp = 0
|
378
|
+
fdrrank.each do |zsc,idx|
|
379
|
+
if idx.nil?
|
380
|
+
ifp += 1
|
381
|
+
else
|
382
|
+
itp += 1
|
383
|
+
fpr = ifp/nfp
|
384
|
+
tpr = itp/ntp
|
385
|
+
report[idx][3] = fpr/tpr
|
386
|
+
end
|
387
|
+
end
|
388
|
+
|
389
|
+
cutoff_fdr = [0.001,0.005,0.01,0.05,0.1,0.15,0.2,0.25,0.5]
|
390
|
+
puts ""
|
391
|
+
puts (["fdr <="] + cutoff_fdr.map{|x| x.to_s(3)} + ["total"]).join("\t")
|
392
|
+
puts (["count"] + cutoff_fdr.map{|x| report.select{|y| y[3] <= x}.size} + [report.size]).join("\t")
|
393
|
+
|
394
|
+
###
|
395
|
+
### Output summarization
|
396
|
+
###
|
397
|
+
|
398
|
+
wids2 = wids.invert
|
399
|
+
report = report.sort_by{|x| x[1]}.reverse
|
400
|
+
puts "\nTop #{output_top} words"
|
401
|
+
puts ['rank','word','z-score','p-value','fdr','ledge','annotation'].map{|x| sprintf("%-10s",x)}.join('')
|
402
|
+
report[0,output_top].each_with_index do |r,i|
|
403
|
+
wd = wids2[r[0]]
|
404
|
+
s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].to_e(2),r[4].to_s,word_annotation[wd]]
|
405
|
+
puts s.map{|x| sprintf("%-10s",x)}.join('')
|
406
|
+
end
|
407
|
+
|
408
|
+
if options[:report_words]
|
409
|
+
puts "......"
|
410
|
+
report.each_with_index do |r,i|
|
411
|
+
if options[:report_words].include?(r[0]) # and i > output_top
|
412
|
+
wd = wids2[r[0]]
|
413
|
+
s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].to_e(2),r[4].to_s,word_annotation[wd]]
|
414
|
+
puts s.map{|x| sprintf("%-10s",x)}.join('')
|
415
|
+
end
|
416
|
+
end
|
417
|
+
end
|
418
|
+
|
419
|
+
if options[:dump]
|
420
|
+
fname = rankfilename + ".#{nm}." + options[:dump].to_s
|
421
|
+
of = File.new(fname,"w")
|
422
|
+
of.puts ['rank','word','z-score','p-value','fdr','ledge','GS size','annotation'].map{|x| sprintf("%-10s",x)}.join('')
|
423
|
+
puts "dumping top #{options[:dump]} words in file: #{fname}"
|
424
|
+
report[0..options[:dump]-1].each_with_index do |r,i|
|
425
|
+
wd = wids2[r[0]]
|
426
|
+
s = [i+1,wd,r[1].to_s(2),r[2].to_e(2),r[3].to_e(2),r[4].to_s,word_annotation[wd]]
|
427
|
+
of.puts s.map{|x| sprintf("%-10s",x)}.join('')
|
428
|
+
end
|
429
|
+
end
|
430
|
+
|
431
|
+
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
# Resolve the library directory relative to this script and make it loadable.
srcdir = File.dirname(__FILE__)
# BUG FIX: the original read 'srcdir + "../"' (no separator), yielding a
# broken path like ".../scripts../lib/"; the sibling cwords.rb script
# correctly uses 'srcdir + "/../"'.
basedir = srcdir + "/../"
libdir = basedir + 'lib/'
$LOAD_PATH << libdir
|
7
|
+
|
8
|
+
require 'wordRS-lib.rb'
|
9
|
+
require 'progressbar'
|
10
|
+
require 'optparse'
|
11
|
+
require 'fileutils'
|
12
|
+
|
13
|
+
tdir = basedir + '/tmp/'
|
14
|
+
FileUtils.mkdir_p tdir # create dir if it does not exist
|
15
|
+
|
16
|
+
###
|
17
|
+
### Main
|
18
|
+
###
|
19
|
+
|
20
|
+
#default options
|
21
|
+
options = Hash.new
|
22
|
+
options[:wordsize] = [7]
|
23
|
+
options[:seqfile] = nil
|
24
|
+
options[:partitions] = 1
|
25
|
+
options[:stats] = ['p'] # p=p
|
26
|
+
options[:ruby]='jruby --fast -J-Xmx1024m'
|
27
|
+
options[:shuffles]=5000
|
28
|
+
options[:bg]=1 #mononucleotide shuffling
|
29
|
+
|
30
|
+
$coptions = OptionParser.new do |opts|
|
31
|
+
opts.on("-w", "--wordsize ARG", "wordsize") { |o| options[:wordsize] = o.split(",").map{|x| x.to_i}}
|
32
|
+
opts.on("-s", "--seqfile ARG", "sequence file") {|o| options[:seqfile] = o}
|
33
|
+
opts.on("-p", "--partitions ARG", "number of sequence partitions") {|o| options[:partitions] = o.to_i}
|
34
|
+
opts.on("-a", "--stats ARG", "sequence file") {|o| options[:stats] = o.split('')}
|
35
|
+
opts.on("-u", "--shuffle ARG", "number of shuffles") {|o| options[:shuffles] = o.to_i}
|
36
|
+
opts.on("--ruby ARG", "ruby interpreter") {|o| options[:ruby] = o}
|
37
|
+
opts.on("-b", "--bg ARG", "background nucleotide model") {|o| options[:bg] = o.to_i}
|
38
|
+
end
|
39
|
+
|
40
|
+
# Emit +msg+ and the option parser's usage text on +io+, then exit
# with status +code+.
def show_help(msg="", code=0, io=STDOUT)
  io.puts(format("%s\n%s", msg, $coptions))
  exit(code)
end
|
44
|
+
|
45
|
+
$coptions.parse!(ARGV)

# mandatory parameters
[:seqfile].each{ |p| show_help("option '#{p}' mandatory") if options[p].nil?}

# BUG FIX: the original called exit("seqfile must have fasta-format") —
# Kernel#exit accepts only true/false/Integer, so a String raises TypeError.
# abort prints the message to STDERR and exits non-zero, which is the intent.
# Also anchor the extension check properly: the old /.fa$/ left the dot
# unescaped (matching any character) and used a line anchor instead of
# end-of-string.
abort("seqfile must have fasta-format") if !options[:seqfile].match(/\.fa\z/)
dbname = File.basename(options[:seqfile],'.fa')
dbdir = basedir + "/db/" + dbname + "_bg#{options[:bg]}"
FileUtils.mkdir_p dbdir # create dir if it does not exist
|
53
|
+
|
54
|
+
n=options[:partitions]
|
55
|
+
|
56
|
+
# word id's
|
57
|
+
@seqs = IO.readlines(options[:seqfile],"\n>")
|
58
|
+
puts "#{@seqs.size} sequences"
|
59
|
+
|
60
|
+
puts "purging database ..."
|
61
|
+
options[:wordsize].each do |wordsize|
|
62
|
+
['a','g','c','t'].rep_perm(wordsize) {|seqa| wf = "#{dbdir}/#{seqa.join('')}.rnk"; File.delete(wf) if File.exist?(wf)}
|
63
|
+
end
|
64
|
+
|
65
|
+
puts "starting #{n} processes ..."
|
66
|
+
|
67
|
+
cmd = "#{options[:ruby]} #{basedir}/scripts/wordsrus_mkdb.rb"
|
68
|
+
cmd += " -w #{options[:wordsize].join(',')} -s #{options[:seqfile]} -a #{options[:stats].join(",")} -u #{options[:shuffles]} --bg #{options[:bg]}"
|
69
|
+
|
70
|
+
stamp = Time.now.to_i
|
71
|
+
|
72
|
+
partsize = @seqs.size/n
|
73
|
+
cmds = []
|
74
|
+
(n-1).times do |i|
|
75
|
+
cmds << cmd + " -p #{(i)*(partsize)+1}-#{(i+1)*(partsize)} &> #{tdir}#{dbname}_b#{options[:bg]}_#{i+1}_#{stamp}.dbout"
|
76
|
+
end
|
77
|
+
cmds << cmd + " -p #{partsize*(n-1)+1}-#{[n*(partsize),@seqs.size].max} &> #{tdir}#{dbname}_b#{options[:bg]}_#{n}_#{stamp}.dbout"
|
78
|
+
cmds.each do |c|
|
79
|
+
p c
|
80
|
+
exec c if fork.nil?
|
81
|
+
end
|
82
|
+
|
83
|
+
puts "Jobs started."
|
84
|
+
puts "Monitor with : tail #{tdir}#{dbname}_*#{stamp}.dbout"
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
-
|
9
|
-
version: 0.1.
|
8
|
+
- 4
|
9
|
+
version: 0.1.4
|
10
10
|
platform: jruby
|
11
11
|
authors:
|
12
12
|
- Anders Jacobsen
|
@@ -64,6 +64,8 @@ files:
|
|
64
64
|
- lib/wordRS-lib.rb
|
65
65
|
- resources/genemap.tsv
|
66
66
|
- resources/word_annotation.tsv
|
67
|
+
- scripts/cwords.rb
|
68
|
+
- scripts/cwords_mkdb.rb
|
67
69
|
- scripts/cluster_words.rb
|
68
70
|
- scripts/complementary_words.rb
|
69
71
|
has_rdoc: true
|