cwords 0.1-jruby

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,394 @@
1
+ #!/usr/bin/ruby
2
+
3
+ ###
4
+ ### Given a miRS output word list file and a sequence,
5
+ ### the words are aligned to input sequence (RNA complementary alignment, differentiating between mismatches and bulges)
6
+ ###
7
+
8
+ srcdir = File.dirname(__FILE__)
9
+ $LOAD_PATH << srcdir + '/../lib/'
10
+
11
+ require 'wordRS-lib.rb'
12
+ require 'progressbar'
13
+ require 'optparse'
14
+ require 'pp'
15
+
16
# Default options; each value can be overridden on the command line.
options = {
  :wordfile    => nil,  # word rank file (mandatory)
  :sep         => " ",  # column separator used when splitting word file lines
  :overlap     => 3,    # overlap required when attaching the remaining words
  :seedoverlap => nil,  # overlap required among seed words
  :testing     => nil,
  :fdr         => nil,  # optional cutoff applied to a column of the word file
  :seed        => 20,   # number of top words used to seed clusters
  :top         => nil,  # number of words to read from the word file
  :keep_lc     => nil,  # filter low complexity words unless set
  # we could estimate significance of cluster size based on shuffles ...
  :shuffles    => 100
}

$coptions = OptionParser.new do |opts|
  opts.on("-w", "--wordfile ARG", "word rank file") { |v| options[:wordfile] = v }
  opts.on("-s", "--sep ARG", "separator") { |v| options[:sep] = v }
  # --keep_lc takes no argument, so OptionParser yields `true` here.
  # `true` has no #to_i; store the flag as-is instead of converting it.
  opts.on("-k", "--keep_lc", "keep low complexity words") { |v| options[:keep_lc] = v }
  opts.on("-o", "--overlap ARG", "overlap") { |v| options[:overlap] = v.to_i }
  opts.on("-v", "--seedoverlap ARG", "seed overlap") { |v| options[:seedoverlap] = v.to_i }
  opts.on("-m", "--matchscore ARG", "scoring scheme") { |v| options[:matchscore] = v.to_f }
  opts.on("-t", "--top ARG", "limit words to scan") { |v| options[:top] = v.to_i }
  opts.on("-i", "--seed ARG", "seed number clusters") { |v| options[:seed] = v.to_i }
  opts.on("-f", "--fdr ARG", "limit words to scan") { |v| options[:fdr] = v.to_f }
  opts.on("-u", "--shuffles ARG", "background estimate based on shuffle sequences") { |v| options[:shuffles] = v.to_i }
end
42
+
43
# Prints +msg+ followed by the option summary held in $coptions to +io+,
# then terminates the process with exit status +code+.
def show_help(msg="", code=0, io=STDOUT)
  text = [msg, $coptions.to_s].join("\n")
  io.puts(text)
  exit(code)
end
47
+
48
# Parse the command line; recognised switches are removed from ARGV.
$coptions.parse!(ARGV)

# mandatory parameters
[:wordfile].each { |p| show_help("option '#{p}' mandatory") if options[p].nil? }

# Fallbacks: seed overlap defaults to the general overlap, and the number of
# words to read defaults to the number of seed words.
options[:seedoverlap] = options[:overlap] unless options[:seedoverlap]
options[:top] = options[:seed] unless options[:top]

# read in mirbase seed family
annofile = "#{srcdir}/../resources/word_annotation.tsv" #annotation
@word_annotation = Hash.new("") # seq => family
File.foreach(annofile) do |line|
  word, family = line.split("\t")
  next if word.nil?
  # A line without a second column used to store nil, which breaks the
  # later `@word_annotation[w].chomp`; fall back to an empty annotation.
  @word_annotation[word] = family || ""
end
59
+
60
+ ###
61
+ ### Sub
62
+ ###
63
+
64
# Alignment helpers added to String for word clustering.
class String

  # Edit distance between this string and +b+, computed with a full
  # dynamic-programming matrix (substitution, insertion, deletion cost 1).
  # Returns 0 when +b+ is nil or equal to self.
  def editdist(b)
    a = self
    return 0 if !b || a == b
    return (a.length - b.length).abs if a.length == 0 || b.length == 0
    m = [[0]]
    1.upto(a.length) { |i| m[i] = [i] }
    1.upto(b.length) { |j| m[0][j] = j }
    1.upto(a.length) do |i|
      1.upto(b.length) do |j|
        subst = (a[i-1] == b[j-1]) ? 0 : 1
        m[i][j] = [m[i-1][j-1] + subst, m[i-1][j] + 1, m[i][j-1] + 1].min
      end
    end
    m[a.length][b.length]
  end

  # Local alignment score of identical characters (match 2, mismatch -1,
  # gap -1, floor 0). The result is the best value found in the last row or
  # last column of the matrix. Returns 0 when +b+ is nil or equal to self.
  def local_align_score(b)
    a = self
    return 0 if !b || a == b
    return (a.length - b.length).abs if a.length == 0 || b.length == 0
    match = 2
    miss = -1
    gap = -1
    m = [[0]]
    1.upto(a.length) { |i| m[i] = [0] }
    1.upto(b.length) { |j| m[0][j] = 0 }
    1.upto(a.length) do |i|
      1.upto(b.length) do |j|
        diag = m[i-1][j-1] + (a[i-1] == b[j-1] ? match : miss)
        m[i][j] = [diag, m[i-1][j] + gap, m[i][j-1] + gap, 0].max
      end
    end
    (m.last + m.map { |row| row.last }).max
  end

  # gapped align of complementary nucleotides, with wobbles
  #
  # Pairs at/ta/gc/cg score as matches, gt/tg as wobbles, everything else as
  # a mismatch. A gap costs gap_open for its first position and gap_ext for
  # each position directly continuing a gap in the same sequence. The result
  # is the best value in the last row or last column of the score matrix.
  def local_align_gap_score(b)
    a = self
    return 0 if !b || a == b
    return (a.length - b.length).abs if a.length == 0 || b.length == 0
    a = a.split('')
    b = b.split('')

    # score cutoff 8.5
    # at least 5 matches, max 1 go or mismatch
    # wobbles and neutral
    # allowed
    # 6m + 1go/mm : 11.0
    # 5m + 2wo : 10
    # 5m + 1go/mm + 1wo : 9
    # 5m + 1go + 1ge : 8.5
    # not allowed
    # 4m + 3wo : 8 # no
    # 5m + 2mm/go : 8

    match = 2.0
    miss = -1.0
    wobble = 0.0
    gap_open = -1.0
    gap_ext = -0.5

    score = Hash.new(miss)
    ['at','ta','gc','cg'].each { |x| score[x] = match }
    ['gt','tg'].each { |x| score[x] = wobble }

    m = [[0]]
    # g1/g2 - gap penalty matrices for the two sequences
    g1 = [[gap_open]]
    1.upto(a.length) { |i| m[i] = [0]; g1[i] = [gap_open] }
    1.upto(b.length) { |j| m[0][j] = 0; g1[0][j] = gap_open }
    # Copy every row: a plain `clone` shares the row arrays, so writes to g2
    # would overwrite g1 and gap extension in seq1 would never be applied.
    g2 = g1.map { |row| row.dup }

    1.upto(a.length) do |i|
      1.upto(b.length) do |j|
        scores = [ m[i-1][j-1] + score[a[i-1] + b[j-1]], # match/mismatch
                   m[i-1][j] + g1[i-1][j],               # gap in seq1
                   m[i][j-1] + g2[i][j-1],               # gap in seq2
                   0 ]                                   # start new local alignment

        m[i][j] = scores.max
        origin = scores.index(m[i][j])
        g1[i][j] = (origin == 1) ? gap_ext : gap_open # gap in seq1
        g2[i][j] = (origin == 2) ? gap_ext : gap_open # gap in seq2
      end
    end
    (m.last + m.map { |row| row.last }).max
  end

  # Offsets at which +s2+ overlaps this string by at least +require_ovl+
  # characters (5 when +require_ovl+ is nil). An offset is the start of +s2+
  # relative to the start of self and is negative when +s2+ starts first.
  # Returns [0] for identical strings and [] when there is no such overlap.
  def overlap(s2,require_ovl)
    s1 = self.split('')
    s2 = s2.split('')
    ovl = require_ovl || 5
    return [] if (s1.size < ovl or s2.size < ovl)
    return [0] if s1 == s2

    # first, check if one word is contained in the other ...
    (s2.size-s1.size).times { |i| return [-i] if s2[i,s1.size] == s1 }
    (s1.size-s2.size).times { |i| return [i] if s1[i,s2.size] == s2 }

    # then check for overlap, longest first
    (ovl..[s1.size,s2.size].min).to_a.reverse.each do |len|
      ovls = []
      ovls << -(s2.size-len) if s1[0,len] == s2[s2.size-len..-1]
      ovls << s1.size-len if s2[0,len] == s1[s1.size-len..-1]
      return ovls if !ovls.empty?
    end

    return [] # no overlap
  end

  # Edit-distance based placement of +b+ relative to this string.
  # Returns nil when +b+ is nil or either string is empty, [] when the edit
  # distance exceeds +edt+, otherwise a one-element array with an offset
  # derived from the backtrack matrix.
  def editaln(b,edt)
    a = self
    return nil if !b
    return nil if a.length == 0 || b.length == 0
    m = [[0]]
    bt = [[0]] # backtrack: 0 = diagonal, 1 = from above, 2 = from left
    1.upto(a.length) { |i| m[i] = [i]; bt[i] = [0] }
    1.upto(b.length) { |j| m[0][j] = j; bt[0][j] = 0 }
    1.upto(a.length) do |i|
      1.upto(b.length) do |j|
        best = [ [m[i-1][j-1] + (a[i-1] == b[j-1] ? 0 : 1), 0],
                 [m[i-1][j] + 1, 1],
                 [m[i][j-1] + 1, 2] ].min
        m[i][j], bt[i][j] = best
      end
    end
    editdist = m[a.length][b.length]
    return [] if editdist > edt
    #find alignment
    if bt[a.length][b.length] == 0
      # last pos aligns
      return [a.length-b.length]
    elsif bt[a.length][b.length] == 1
      # aligns from above, find first non-1
      bt.map { |row| row.last }.reverse.each_with_index do |bts,idx|
        return [a.length-idx-b.length] if bts != 1
      end
    else
      # aligns from left, find first non-2
      bt[a.length].reverse.each_with_index do |bts,idx|
        return [a.length+idx-b.length] if bts != 2
      end
    end

    # no branch above returned: dump the matrices for inspection
    puts "error"
    pp m
    pp bt

  end

end
232
+
233
+
234
+
235
# Aligns word +s+ against every member of +cluster+ (word => positions).
# Returns the [s, position, overlap_length] entries with the greatest overlap
# length, or [] when +s+ is already a member or nothing overlaps.
def align_to_cluster(cluster,s,ovl)
  return [] if cluster.key?(s) # s already exist in cluster

  hits = []
  cluster.to_a.each do |member, positions|
    offsets = member.overlap(s, ovl)
    next if offsets.empty?

    offsets.each do |off|
      positions.each do |pos|
        # compute overlap length
        span = if off < 0
                 [s.size + off, member.size].min
               else
                 [s.size, member.size - off].min
               end
        # we require the overlap should contain at least two
        # different nucleotides
        probe = (off < 0) ? member : s
        next if probe[0, span].split('').uniq.size == 1
        # adjust offset relative to first cluster word
        hits << [s, pos + off, span]
      end
    end
  end

  # return alignments with greatest overlap
  hits = hits.compact.uniq
  longest = hits.map { |hit| hit.last }.max
  hits.select { |hit| hit.last == longest }
end
268
+
269
+ # find cluster(s) with highest overlap
270
# Aligns +s+ to every cluster and returns [alignments, cluster] pairs for the
# cluster(s) whose best alignment has the greatest overlap length.
def align_to_clusters(clusters,s,ovl)
  candidates = []
  clusters.each do |cl|
    hits = align_to_cluster(cl, s, ovl)
    candidates.push([hits, cl]) unless hits.empty?
  end

  # select alignments with greatest overlap
  top = candidates.map { |hits, _cl| hits.first.last }.max
  candidates.select { |hits, _cl| hits.first.last == top }
end
286
+
287
# Applies alignment results: for every [alignments, cluster] pair, appends the
# aligned position (second element) to the cluster entry of the aligned word
# (first element). Returns +alignments+.
def add_to_clusters(alignments)
  alignments.each do |hits, target|
    hits.each do |hit|
      word = hit[0]
      position = hit[1]
      target[word] << position
    end
  end
end
294
+
295
# Prints one line per (word, position) of +cluster+, ordered by position and
# indented relative to the left-most position. Each line shows the rank from
# +ranks+, the word (t/T written as u/U, consensus nucleotides upper-cased)
# and its annotation from @word_annotation.
def print_cluster(cluster,ranks)
  placed = []
  cluster.to_a.each do |word, offsets|
    offsets.each { |off| placed << [word, off] }
  end
  placed = placed.sort_by { |pair| pair[1] }
  leftmost = placed.first[1]

  # preprocess clusters for consensus alignment:
  # column position => nucleotide => count
  cons = Hash.new { |h, k| h[k] = Hash.new(0.0) }
  placed.each do |word, off|
    word.split('').each_with_index do |nt, idx|
      cons[off + idx][nt] += 1
    end
  end

  # Upper-cases a nucleotide when it makes up at least half of its column
  # and the column holds more than one word.
  def cons_word(consh,word,i)
    out = ""
    word.split('').each_with_index do |nt, idx|
      column = consh[i + idx]
      total = column.values.to_statarray.sum
      frac = column[nt] / total
      out += (frac >= 0.5 and total > 1) ? nt.upcase : nt
    end
    return out
  end

  placed.each do |word, off|
    rank_col = ranks[word].to_s.rjust(5)
    word_col = (" " * (off - leftmost) + cons_word(cons, word, off).tr('tT','uU')).ljust(20)
    anno_col = @word_annotation[word].chomp.ljust(20)
    puts rank_col + ": " + word_col + anno_col
  end
end
322
+
323
+ ###
324
+ ### Main
325
+ ###
326
+
327
# rank => word, read from the word file (the first line is a header)
aw = Hash.new()
wlines = IO.readlines(options[:wordfile])
wlines = wlines.select { |x| x.split(' ')[5].to_f < options[:fdr] } if options[:fdr]
# `drop(1)` skips the header and, unlike `[1..-1]`, yields [] for an empty file
wlines[0, options[:top] + 1].drop(1).each_with_index do |wline, idx|
  aw[idx + 1] = wline.split(options[:sep])[1]
end

awords = aw.to_a.sort.map { |x| x[1] }
awords_seed = awords[0, options[:seed]]
# the range slice is nil when fewer than :seed words were read
awords_rest = awords[options[:seed]..-1] || []

if !options[:keep_lc]
  awords_seed = awords_seed.select { |x| x.split('').uniq.size > 1 }
  awords_rest = awords_rest.select { |x| x.split('').uniq.size > 1 }
  puts "removed #{aw.size-awords_seed.size-awords_rest.size} low complexity words"
end

# seedialize clusters: one cluster per seed word, seed at position 0
clusters = []
awords_seed.each do |x|
  h = Hash.new { |j, k| j[k] = Array.new }
  h[x] = [0]
  clusters << h
end

puts "step 1, seedializing clusters from #{options[:seed]} words"
pbar = ProgressBar.new("running",options[:seed])
# One round per seed word, matching the progress bar total.
# (`options[:seed].size` is the byte width of the integer, not the seed count.)
options[:seed].times do
  awords_seed.each do |word|
    add_to_clusters(align_to_clusters(clusters,word,options[:seedoverlap]))
  end
  pbar.inc
end
pbar.finish

# remove duplicate clusters
# {'word' => [pos in alignment,array, usually one element]}
# Iterate over a snapshot so deleting from `clusters` cannot skip elements;
# clusters that were already removed do not remove others.
clusters.dup.each do |x|
  next unless clusters.any? { |c| c.equal?(x) }
  clusters.delete_if { |y| !y.equal?(x) and x.key?(y.to_a.first.first) }
end

# merge clusters ...

puts "step 2, processing #{awords.size} words"

# map extra words to cluster(s) with greatest overlap
# dont extend extra words: all alignments are computed before any is applied
if options[:top] > options[:seed]
  add = []
  awords_rest.each do |word|
    add << align_to_clusters(clusters,word,options[:overlap])
  end
  add.each { |a| add_to_clusters(a) }
end

wa = aw.invert
resc = clusters.select { |x| x.size >= 3 }
resc.each { |cl| print_cluster(cl,wa); puts "\n" }

puts "Found #{resc.size} word clusters."
394
+
@@ -0,0 +1,205 @@
1
+ #!/usr/bin/ruby
2
+
3
+ ###
4
+ ### Given a miRS output word list file and a sequence,
5
+ ### the words are aligned to input sequence (RNA complementary alignment, differentiating between mismatches and bulges)
6
+ ###
7
+
8
+
9
+ srcdir = File.dirname(__FILE__)
10
+ $LOAD_PATH << srcdir + '/../lib/'
11
+
12
+ require 'wordRS-lib.rb'
13
+ require 'progressbar'
14
+ require 'optparse'
15
+
16
# Default options; each value can be overridden on the command line.
options = {
  :wordfile   => nil,
  :seq        => nil,
  :matchscore => 7,
  :testing    => nil,
  :fdr        => nil,
  :top        => 250,
  :shuffles   => 100
}

# Switch table: short flag, long flag, description, option key and the
# conversion applied to the raw argument (nil keeps the string as given).
$coptions = OptionParser.new do |opts|
  [
    ["-w", "--wordfile ARG",   "word rank file",      :wordfile,   nil],
    ["-s", "--seq ARG",        "sequence",            :seq,        nil],
    ["-m", "--matchscore ARG", "scoring scheme",      :matchscore, :to_f],
    ["-t", "--top ARG",        "limit words to scan", :top,        :to_i],
    ["-f", "--fdr ARG",        "limit words to scan", :fdr,        :to_f],
    ["-u", "--shuffles ARG",   "background estimate based on shuffle sequences", :shuffles, :to_i]
  ].each do |short, long, desc, key, conv|
    opts.on(short, long, desc) { |raw| options[key] = conv ? raw.send(conv) : raw }
  end
end
34
+
35
# Prints +msg+ followed by the option summary held in $coptions to +io+,
# then terminates the process with exit status +code+.
def show_help(msg="", code=0, io=STDOUT)
  text = [msg, $coptions.to_s].join("\n")
  io.puts(text)
  exit(code)
end
39
+
40
# Parse the command line; recognised switches are removed from ARGV.
$coptions.parse!(ARGV)
# mandatory parameters: show the help text for the first one that is missing
missing = [:wordfile, :seq].select { |key| options[key].nil? }
missing.each { |key| show_help("option '#{key}' mandatory") }
seq = options[:seq]
44
+
45
+ ###
46
+ ### Sub
47
+ ###
48
+
49
# Aligns +s1+ against +s2+ by nucleotide complementarity and returns
# [score, alignment] for the best-scoring cell in the last row or last column
# of the score matrix.
#
# The alignment string has the form "<start in s1>,<start in s2>:<symbols>"
# (1-based starts) with one symbol per step:
#   * match   ! mismatch   w wobble   - gap in s2   _ gap in s1
#
# Always returns a two-element array, because callers use `.first` and
# destructure the result; nil or empty input yields [0, ""]. Equal strings
# are aligned like any other pair instead of being short-circuited to 0.
def local_align(s1,s2)
  return [0, ""] if !s1 || !s2 || s1.length == 0 || s2.length == 0
  a = s1.split('')
  b = s2.split('')

  # a, word, short seq, target sequence : gaps (bulge in miR) expensive, but also because we have additional match options
  # b, miRNA, longer seq,
  # we require full alignment without penalty for overhangs

  # score cutoff 8.5
  # at least 5 matches, max 1 go or mismatch
  # wobbles and neutral
  # gm : gap mir, gt = gap target
  # allowed
  # 6m + 1gmo/mm : 11.0
  # 5m + 2wo : 10
  # 5m + 1gmo/mm + 1wo : 9
  # 5m + 1gmo + 1ge : 8.5
  # not allowed
  # 4m + 3wo : 8 # no
  # 5m + 2mm/go : 8

  match = 2.0
  miss = -1.0
  wobble = 0.0
  g1_open = -1.0 #
  g2_open = -3.0 # 7m + 1g2 = 14-3 = 11 = 6m + 1g1 = 12 - 1
  gap_ext = -0.5

  score = Hash.new(miss)
  ['at','ta','gc','cg'].each { |x| score[x] = match }
  ['gt','tg'].each { |x| score[x] = wobble }

  m = [[0]]         # score matrix
  bm = [["1,1:"]]   # backtrack matrix: alignment string leading to each cell
  # g1/g2 - gap penalty matrices for the two seqs
  g1 = [[g1_open]]
  g2 = [[g2_open]]
  1.upto(a.length) { |i| m[i] = [0]; g1[i] = [g1_open]; g2[i] = [g2_open]; bm[i] = ["#{i+1},1:"] }
  1.upto(b.length) { |j| m[0][j] = 0; g1[0][j] = g1_open; g2[0][j] = g2_open; bm[0][j] = "1,#{j+1}:" }

  1.upto(a.length) do |i|
    1.upto(b.length) do |j|
      pair = score[a[i-1] + b[j-1]]
      scores = [ m[i-1][j-1] + pair,       # match/mismatch
                 m[i-1][j] + g1[i-1][j],   # gap in seq1
                 m[i][j-1] + g2[i][j-1] ]  # gap in seq2

      m[i][j] = scores.max
      case scores.index(m[i][j])
      when 0
        g1[i][j] = g1_open
        g2[i][j] = g2_open
        case pair
        when match then bm[i][j] = bm[i-1][j-1] + "*"
        when miss then bm[i][j] = bm[i-1][j-1] + "!"
        when wobble then bm[i][j] = bm[i-1][j-1] + "w"
        end
      when 1
        # continuing downwards is an extension of this gap
        g1[i][j] = gap_ext
        g2[i][j] = g2_open
        bm[i][j] = bm[i-1][j] + "-"
      when 2
        # continuing rightwards is an extension of this gap
        g2[i][j] = gap_ext
        g1[i][j] = g1_open
        bm[i][j] = bm[i][j-1] + "_"
      end
    end
  end

  # best cell along the bottom row and right-most column, with its alignment
  border_scores = m.last + m.map { |row| row.last }
  border_alns = bm.last + bm.map { |row| row.last }
  border_scores.zip(border_alns).sort_by { |x| x[0] }.last
end
125
+
126
# Tallies the symbols of one alignment string ("<word start>,<miR start>:<symbols>")
# into +mphash+ (symbol => per-position counters), indexed by miR position.
def update_mirpairing(mphash,aln)
  coords, symbols = aln.split(":")
  _word_start, mir_start = coords.split(",")
  cursor = mir_start.to_i
  symbols.split('').each do |sym|
    mphash[sym][cursor - 1] += 1
    cursor += 1 unless sym == "-" #next mirpos, unless gap in miR
  end
  nil
end
137
+
138
+ ###
139
+ ### Main
140
+ ###
141
+
142
# this is the string (the miRNA target sequence) we want to match
seqr = seq.downcase.reverse
seqc = seq.downcase.tr("agctu","tcgaa") # control - miR*
# NOTE(review): String#shuffle is not core Ruby; presumably provided by wordRS-lib - confirm
shuffled_seqs = (1..options[:shuffles]).to_a.map { |i| seqr.shuffle }
matches_shuffles = Array.new(options[:shuffles],0)
random_seqs = (1..options[:shuffles]).to_a.map { |i| ("agct".split('')*100).shuffle[0,seq.size].join('') }
matches_random = Array.new(options[:shuffles],0)

ofname = "#{options[:wordfile]}.#{seq}"

matches_seq = 0 # matches to the original string

aln_flank = 0 # how should we add flank info ??
# per-symbol counters, one slot per sequence position
mirpairing = Hash.new()
mirpairing['*'] = Array.new(seq.size+aln_flank,0) # match
mirpairing['!'] = Array.new(seq.size+aln_flank,0) # mismatch
mirpairing['w'] = Array.new(seq.size+aln_flank,0) # wobble
mirpairing['-'] = Array.new(seq.size+aln_flank,0) # gap in miR, bulge in target - hard to visualize
mirpairing['_'] = Array.new(seq.size+aln_flank,0) # gap in target, bulge in miR

# The block form closes the output file even when an exception is raised
# while the words are being processed.
File.open(ofname,"w") do |of|
  of.puts ">revcompl_sequence\n#{seqc.reverse}"

  awords = []
  if options[:fdr]
    awords = IO.readlines(options[:wordfile]).select { |x| x.split(' ')[5].to_f < options[:fdr] }[0,options[:top]+1]
  else
    awords = IO.readlines(options[:wordfile])[0,options[:top]+1]
  end
  awords.shift # remove header

  puts "Processing #{awords.size} words"
  pbar = ProgressBar.new("running",awords.size)
  awords.each_with_index do |wline,idx|
    rank = idx+1
    word = wline.split(" ")[1]
    # inner index is named differently so it cannot clobber the outer idx
    shuffled_seqs.each_with_index { |s,k| matches_shuffles[k] += 1 if local_align(word,s).first >= options[:matchscore] }
    random_seqs.each_with_index { |s,k| matches_random[k] += 1 if local_align(word,s).first >= options[:matchscore] }
    sc,aln = local_align(word,seqr)
    if sc >= options[:matchscore]
      matches_seq += 1
      update_mirpairing(mirpairing,aln)
      of.puts ">#{rank}[#{sc.to_s(1)}=#{aln}]\n#{word}"
    end
    pbar.inc
  end
  pbar.finish
end

shuffled_mean = matches_shuffles.to_statarray.mean
shuffled_stdd = matches_shuffles.to_statarray.stddev
random_mean = matches_random.to_statarray.mean
random_stdd = matches_random.to_statarray.stddev

puts "words reverse-complementary to sequence: #{matches_seq}"
puts "words reverse-complementary to shuffled : #{shuffled_mean.to_s(1)}+-#{shuffled_stdd.to_s(1)}"
puts "words reverse-complementary to random : #{random_mean.to_s(1)}+-#{random_stdd.to_s(1)}"
puts "similar words in file: #{ofname}"

# pairing summary: one row per alignment symbol, one column per position
puts((["3'"] + seqc.reverse.split('')).map { |x| x.center(4) }.join(''))
mirpairing.each do |alnchar,mirpos|
  alnstr = alnchar.center(4)
  alnstr += mirpos.map { |x| x.to_s.center(4) }.join('')
  puts alnstr
end