cwords 0.1-jruby

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,394 @@
1
+ #!/usr/bin/ruby
2
+
3
+ ###
4
+ ### Given a miRS output word list file and a sequence,
5
+ ### the words are aligned to input sequence (RNA complementary alignment, differentiating between mismatches and bulges)
6
+ ###
7
+
8
+ srcdir = File.dirname(__FILE__)
9
+ $LOAD_PATH << srcdir + '/../lib/'
10
+
11
+ require 'wordRS-lib.rb'
12
+ require 'progressbar'
13
+ require 'optparse'
14
+ require 'pp'
15
+
16
# Default option values; each of them can be overridden on the command line.
options = Hash.new
options[:wordfile]=nil
options[:sep]=" "
options[:overlap]=3
options[:seedoverlap]=nil
options[:testing]=nil
options[:fdr]=nil
options[:seed]=20
options[:top]=nil
options[:keep_lc] = nil # filter low complexity words
# we could estimate significance of cluster size based on shuffles ...
options[:shuffles]=100

# Command line parser. It is stored in a global so that show_help can print
# the option summary.
$coptions = OptionParser.new do |opts|
  opts.on("-w", "--wordfile ARG", "word rank file") {|o| options[:wordfile] = o}
  opts.on("-s", "--sep ARG", "separator") {|o| options[:sep] = o}
  # "--keep_lc" is declared without an argument, so OptionParser yields `true`.
  # `true` has no #to_i; store the flag itself instead of converting it.
  opts.on("-k", "--keep_lc", "keep low complexity words") {|o| options[:keep_lc] = o}
  opts.on("-o", "--overlap ARG", "overlap") {|o| options[:overlap] = o.to_i}
  opts.on("-v", "--seedoverlap ARG", "seed overlap") {|o| options[:seedoverlap] = o.to_i}
  opts.on("-m", "--matchscore ARG", "scoring scheme") {|o| options[:matchscore] = o.to_f}
  opts.on("-t", "--top ARG", "limit words to scan") { |o| options[:top] = o.to_i}
  opts.on("-i", "--seed ARG", "seed number clusters") { |o| options[:seed] = o.to_i}
  opts.on("-f", "--fdr ARG", "limit words to scan") { |o| options[:fdr] = o.to_f}
  opts.on("-u", "--shuffles ARG", "background estimate based on shuffle sequences") { |o| options[:shuffles] = o.to_i}
end
42
+
43
# Writes +msg+, a newline and the textual form of the global option parser
# ($coptions) to +io+, then terminates the process with exit status +code+.
def show_help(msg="", code=0, io=STDOUT)
  usage_text = format("%s\n%s", msg, $coptions)
  io.puts(usage_text)
  exit(code)
end
47
+
48
# Parse the command line (parse! consumes the recognised switches from ARGV).
$coptions.parse!(ARGV)

# mandatory parameters: print the usage text and exit when one is missing
[:wordfile].each do |param|
  next unless options[param].nil?
  show_help("option '#{param}' mandatory")
end

# options that were not given fall back to a related option
options[:seedoverlap] ||= options[:overlap]
options[:top] ||= options[:seed]

# read in mirbase seed family
annofile = "#{srcdir}/../resources/word_annotation.tsv" #annotation
@word_annotation = Hash.new("") # seq => family
IO.readlines(annofile).each do |line|
  # first tab-separated column is the key, second column the stored value
  fields = line.split("\t")
  @word_annotation[fields[0]] = fields[1]
end
59
+
60
+ ###
61
+ ### Sub
62
+ ###
63
+
64
# Extensions to String used for comparing and aligning short words.
class String

  # Edit (Levenshtein) distance between the receiver and +b+: insertions,
  # deletions and substitutions each cost 1.
  # Returns 0 for equal strings (or a nil argument); when one string is empty
  # the result is the length of the other.
  def editdist(b)
    a = self
    return 0 if !a || !b || a == b
    return (a.length - b.length).abs if a.length == 0 || b.length == 0
    # m[i][j] = distance between the first i chars of a and the first j of b
    m = [[0]]
    1.upto(a.length) { |i| m[i] = [i] }
    1.upto(b.length) { |j| m[0][j] = j }
    1.upto(a.length) do |i|
      1.upto(b.length) do |j|
        m[i][j] =
          [ m[i-1][j-1] + (a[i-1] == b[j-1] ? 0 : 1),
            m[i-1][j] + 1,
            m[i][j-1] + 1 ].min
      end
    end
    m[a.length][b.length]
  end

  # Local alignment score of the receiver against +b+ with identical
  # characters scoring +2 and mismatches/gaps scoring -1; cell scores never
  # drop below 0. The result is the best score found in the last row or the
  # last column of the score matrix.
  # NOTE(review): equal strings return 0 and an empty operand returns the
  # length difference; these shortcuts look copied from editdist - confirm
  # they are intended for a similarity score.
  def local_align_score(b)
    a = self
    return 0 if !a || !b || a == b
    return (a.length - b.length).abs if a.length == 0 || b.length == 0
    match = 2
    miss = -1
    gap = -1
    m = [[0]]
    1.upto(a.length) { |i| m[i] = [0] }
    1.upto(b.length) { |j| m[0][j] = 0 }
    1.upto(a.length) do |i|
      1.upto(b.length) do |j|
        m[i][j] =
          [ m[i-1][j-1] + (a[i-1] == b[j-1] ? match : miss),
            m[i-1][j] + gap,
            m[i][j-1] + gap,0 ].max
      end
    end
    (m.last+m.map{|x| x.last}).max
  end

  # gapped align of complementary nucleotides, with wobbles
  #
  # Character pairs at/ta/gc/cg score as matches, gt/tg as wobbles and every
  # other pair as a mismatch. Opening a gap costs gap_open, continuing a gap
  # in the same sequence costs gap_ext. Returns the best score found in the
  # last row or last column of the score matrix.
  # NOTE(review): the same equal-string / empty-string shortcuts as in
  # editdist are applied here - confirm they are intended.
  def local_align_gap_score(b)
    a=self
    return 0 if !a || !b || a == b
    return (a.length - b.length).abs if a.length == 0 || b.length == 0
    a = a.split('')
    b = b.split('')

    # score cutoff 8.5
    # at least 5 matches, max 1 go or mismatch
    # wobbles and neutral
    # allowed
    # 6m + 1go/mm : 11.0
    # 5m + 2wo : 10
    # 5m + 1go/mm + 1wo : 9
    # 5m + 1go + 1ge : 8.5
    # not allowed
    # 4m + 3wo : 8 # no
    # 5m + 2mm/go : 8

    match = 2.0
    miss = -1.0
    wobble = 0.0
    gap_open = -1.0
    gap_ext = -0.5

    score = Hash.new(miss)
    ['at','ta','gc','cg'].each {|x| score[x] = match}
    ['gt','tg'].each {|x| score[x] = wobble}

    m = [[0]]
    # g1/g2 hold, per cell, the cost of the next gap step in seq1/seq2:
    # gap_ext when the cell itself was reached through such a gap, else gap_open
    g1 = [[gap_open]]
    1.upto(a.length) { |i| m[i] = [0]; g1[i] = [gap_open]}
    1.upto(b.length) { |j| m[0][j] = 0; g1[0][j] = gap_open}
    # Copy every row: Array#clone is shallow, so a plain clone would make g1
    # and g2 share their rows and each write to one would overwrite the other.
    g2 = g1.map { |row| row.dup }

    1.upto(a.length) do |i|
      1.upto(b.length) do |j|
        scores = [ m[i-1][j-1] + score[a[i-1] + b[j-1]], #match/mismatch
                   m[i-1][j] + g1[i-1][j], # gap in seq1
                   m[i][j-1] + g2[i][j-1], # gap in seq2
                   0] # start new local alignment

        best = scores.max
        origin = scores.index(best) # first option that reaches the best score
        m[i][j] = best
        g1[i][j] = (origin == 1) ? gap_ext : gap_open # gap in seq1
        g2[i][j] = (origin == 2) ? gap_ext : gap_open # gap in seq2
      end
    end
    (m.last+m.map{|x| x.last}).max
  end

  # Finds exact overlaps between the receiver (s1) and +s2+.
  # Returns an array with the position(s) of s2 relative to s1 (negative when
  # s2 starts before s1): [0] for identical words, a single offset when one
  # word contains the other, otherwise the offsets of the longest
  # prefix/suffix overlap of at least +require_ovl+ characters (5 when nil).
  # Returns [] when no such overlap exists or a word is shorter than the
  # required overlap.
  def overlap(s2,require_ovl)
    s1 = self
    s1=s1.split('')
    s2=s2.split('')
    ovl = require_ovl || 5
    return [] if (s1.size < ovl or s2.size < ovl)
    return [0] if s1 == s2

    # first, check if one word is contained in the other ...
    # (a containment that ends exactly at the end of the longer word is
    # picked up by the overlap scan below)
    (s2.size-s1.size).times{|i| return [-i] if s2[i,s1.size] == s1}
    (s1.size-s2.size).times{|i| return [i] if s1[i,s2.size] == s2}

    # then check for overlap, longest first
    (ovl..[s1.size,s2.size].min).to_a.reverse.each do |len|
      ovls = []
      ovls << -(s2.size-len) if s1[0,len] == s2[s2.size-len..-1]
      ovls << s1.size-len if s2[0,len] == s1[s1.size-len..-1]
      return ovls if !ovls.empty?
    end

    return [] # no overlap
  end

  # Edit-distance based placement of +b+ relative to the receiver.
  # Returns nil when either string is nil or empty, [] when the edit distance
  # exceeds +edt+, otherwise a one-element array with an offset derived from
  # the backtrack matrix.
  def editaln(b,edt)
    a = self
    return nil if !a || !b
    return nil if a.length == 0 || b.length == 0
    m = [[0]]
    bt = [[0]] # backtrack: 0 = diagonal, 1 = from above, 2 = from left
    1.upto(a.length) { |i| m[i] = [i]; bt[i] = [0]; }
    1.upto(b.length) { |j| m[0][j] = j; bt[0][j] = 0;}
    1.upto(a.length) do |i|
      1.upto(b.length) do |j|
        scores =
          [ [m[i-1][j-1] + (a[i-1] == b[j-1] ? 0 : 1),0],
            [m[i-1][j] + 1,1],
            [m[i][j-1] + 1,2] ]

        # pairs compare element-wise: lowest cost first, then lowest direction
        best = scores.min
        m[i][j] = best.first
        bt[i][j] = best.last
      end
    end
    editdist = m[a.length][b.length]
    return [] if editdist > edt
    #find alignment
    if bt[a.length][b.length] == 0
      # last pos aligns
      return [a.length-b.length]
    elsif bt[a.length][b.length] == 1
      # aligns from above, find first non-1
      bt.map{|x| x.last}.reverse.each_with_index do |bts,idx|
        return [a.length-idx-b.length] if bts != 1
      end
    else
      # aligns from left, find first non-2
      bt[a.length].reverse.each_with_index do |bts,idx|
        return [a.length+idx-b.length] if bts != 2
      end
    end

    # only reached when no backtrack entry ended the scans above
    puts "error"
    pp m
    pp bt

  end

end
232
+
233
+
234
+
235
# Aligns word +s+ to the members of +cluster+ (a Hash of word => positions).
# Returns an array of [s, position, overlap length] entries, restricted to
# those with the greatest overlap length; [] when +s+ is already a cluster
# member or no acceptable overlap exists.
def align_to_cluster(cluster,s,ovl)
  return [] if cluster.key?(s) # s already exist in cluster

  hits = []
  cluster.to_a.each do |member, member_positions|
    member.overlap(s, ovl).each do |offset|
      member_positions.each do |member_pos|
        # length of the overlapping stretch
        olength = if offset < 0
                    [s.size + offset, member.size].min
                  else
                    [s.size, member.size - offset].min
                  end
        # we require the overlap should contain at least two
        # different nucleotides
        overlapping = offset < 0 ? member[0, olength] : s[0, olength]
        next if overlapping.split('').uniq.size == 1
        # position of s relative to the first cluster word
        hits << [s, member_pos + offset, olength]
      end
    end
  end

  # return alignments with greatest overlap
  hits = hits.compact.uniq
  longest = hits.map { |hit| hit.last }.max
  hits.select { |hit| hit.last == longest }
end
268
+
269
+ # find cluster(s) with highest overlap
270
# Aligns +s+ against every cluster and keeps only the cluster(s) whose
# alignment has the greatest overlap length.
# Returns an array of [alignments, cluster] pairs ([] when nothing aligns).
def align_to_clusters(clusters,s,ovl)
  candidates = clusters.map { |cl| [align_to_cluster(cl, s, ovl), cl] }
  candidates = candidates.reject { |pair| pair.first.empty? }

  # overlap length is the last field of an alignment entry
  overlap_of = lambda { |pair| pair.first.first.last }

  # select alignments with greatest overlap
  greatest = candidates.map { |pair| overlap_of.call(pair) }.max
  candidates.select { |pair| overlap_of.call(pair) == greatest }
end
286
+
287
# Records aligned words in their clusters.
# +alignments+ is an array of [entries, cluster] pairs; for every entry the
# second field (position) is appended to cluster[first field (word)].
# Returns +alignments+.
def add_to_clusters(alignments)
  alignments.each do |entries, cluster|
    entries.each do |word, position, _length|
      cluster[word] << position
    end
  end
end
294
+
295
# Helper for print_cluster: returns +word+ (placed at alignment position +i+)
# with a nucleotide upcased when it accounts for at least half of the counts
# in its column of +consh+ and that column holds more than one count.
# +consh+ maps alignment position => { nucleotide => count }.
def cons_word(consh,word,i)
  cw = ""
  word.split('').each_with_index do |nt,idx|
    column = consh[i+idx]
    total = column.values.to_statarray.sum # computed once per column
    frac = column[nt]/total
    cw += (frac >= 0.5 and total > 1) ? nt.upcase : nt
  end
  return cw
end

# Prints the words of +cluster+ (Hash of word => alignment positions) one per
# line, ordered by position: the value looked up in +ranks+, the word indented
# by its offset from the left-most word (t/T shown as u/U, consensus
# nucleotides upcased) and the entry from @word_annotation.
# An empty cluster prints nothing.
def print_cluster(cluster,ranks)
  cl = []
  cluster.to_a.each{|a,b| b.each{|c| cl << [a,c]}}
  return cl if cl.empty? # nothing to print; avoids calling [] on nil below
  cl = cl.sort_by{|x| x[1]}
  imin = cl.first[1]

  # preprocess clusters for consensus alignment:
  # count the nucleotides seen at every alignment position
  cons = Hash.new {|h,k| h[k] = Hash.new(0.0)}
  cl.each do |w,i|
    w.split('').each_with_index do |nt,idx|
      cons[i+idx][nt] +=1
    end
  end

  cl.each do |w,i|
    puts ranks[w].to_s.rjust(5)+": "+(" "*(i-imin)+cons_word(cons,w,i).tr('tT','uU')).ljust(20)+@word_annotation[w].chomp.ljust(20)
  end
end
322
+
323
+ ###
324
+ ### Main
325
+ ###
326
+
327
# Read the word list: rank (1-based line index after the header) => word,
# where the word is the second field of a line split on options[:sep].
aw = Hash.new()
wlines = IO.readlines(options[:wordfile])
# optional filter on the 6th whitespace-separated column
wlines = wlines.select{|x| x.split(' ')[5].to_f < options[:fdr]} if options[:fdr]
# keep header + top words, then drop the header; the slice is nil for an
# empty list, hence the `|| []`
(wlines[0,options[:top]+1][1..-1] || []).each_with_index do |wline,idx|
  aw[idx+1] = wline.split(options[:sep])[1]
end

awords = aw.to_a.sort.map{|x| x[1]}
awords_seed = awords[0,options[:seed]]
# a range starting past the end of the array yields nil (fewer words than
# seeds were read); treat that as "no extra words"
awords_rest = awords[options[:seed]..-1] || []

if !options[:keep_lc]
  # low complexity = words made of a single repeated character
  awords_seed = awords_seed.select{|x| x.split('').uniq.size > 1}
  awords_rest = awords_rest.select{|x| x.split('').uniq.size > 1}
  puts "removed #{aw.size-awords_seed.size-awords_rest.size} low complexity words"
end

# seedialize clusters: one cluster per seed word, placed at position 0
# {'word' => [pos in alignment,array, usually one element]}
clusters = []
awords_seed.each{|x| h = Hash.new{|j,k| j[k]=Array.new}; h[x]=[0]; clusters << h}

puts "step 1, seedializing clusters from #{options[:seed]} words"
pbar = ProgressBar.new("running",options[:seed])
# NOTE(review): Integer#size is the byte size of the number, not its value,
# so this loop does not run options[:seed] times although the progress bar
# is sized for that. Left unchanged because it affects the clustering
# result - confirm the intended number of rounds.
options[:seed].size.times do
  awords_seed.each do |word|
    add_to_clusters(align_to_clusters(clusters,word,options[:seedoverlap]))
  end
  pbar.inc
end
pbar.finish

# remove duplicate clusters: drop a cluster when another cluster contains
# its first word
clusters.each{|x| clusters.delete_if{|y| x.object_id != y.object_id and x.key?(y.to_a.first.first)}}

puts "step 2, processing #{awords.size} words"

# map extra words to cluster(s) with greatest overlap
# dont extend extra words: all alignments are computed first and only then
# added to the clusters
if options[:top] > options[:seed]
  add = []
  awords_rest.each do |word|
    add << align_to_clusters(clusters,word,options[:overlap])
  end
  add.each{|a| add_to_clusters(a)}
end

wa = aw.invert # word => rank
# report clusters with at least three words
resc = clusters.select{|x| x.size >= 3}
resc.each{|cl| print_cluster(cl,wa);puts "\n"}

puts "Found #{resc.size} word clusters."
394
+
@@ -0,0 +1,205 @@
1
+ #!/usr/bin/ruby
2
+
3
+ ###
4
+ ### Given a miRS output word list file and a sequence,
5
+ ### the words are aligned to input sequence (RNA complementary alignment, differentiating between mismatches and bulges)
6
+ ###
7
+
8
+
9
+ srcdir = File.dirname(__FILE__)
10
+ $LOAD_PATH << srcdir + '/../lib/'
11
+
12
+ require 'wordRS-lib.rb'
13
+ require 'progressbar'
14
+ require 'optparse'
15
+
16
# Default option values; command line switches overwrite them below.
options = {
  :wordfile => nil,
  :seq => nil,
  :matchscore => 7,
  :testing => nil,
  :fdr => nil,
  :top => 250,
  :shuffles => 100
}

# Command line parser, kept in a global so show_help can print its summary.
$coptions = OptionParser.new do |parser|
  parser.on("-w", "--wordfile ARG", "word rank file") do |value|
    options[:wordfile] = value
  end
  parser.on("-s", "--seq ARG", "sequence") do |value|
    options[:seq] = value
  end
  parser.on("-m", "--matchscore ARG", "scoring scheme") do |value|
    options[:matchscore] = value.to_f
  end
  parser.on("-t", "--top ARG", "limit words to scan") do |value|
    options[:top] = value.to_i
  end
  parser.on("-f", "--fdr ARG", "limit words to scan") do |value|
    options[:fdr] = value.to_f
  end
  parser.on("-u", "--shuffles ARG", "background estimate based on shuffle sequences") do |value|
    options[:shuffles] = value.to_i
  end
end
34
+
35
# Prints +msg+ and, on the following line, the textual form of the global
# option parser ($coptions) to +io+; then exits with status +code+.
def show_help(msg="", code=0, io=STDOUT)
  text = String.new
  text << msg.to_s << "\n" << $coptions.to_s
  io.puts(text)
  exit(code)
end
39
+
40
# Parse the command line (parse! consumes the recognised switches from ARGV).
$coptions.parse!(ARGV)

# mandatory parameters: print the usage text and exit when one is missing
[:wordfile, :seq].each do |param|
  next unless options[param].nil?
  show_help("option '#{param}' mandatory")
end

seq = options[:seq]
44
+
45
+ ###
46
+ ### Sub
47
+ ###
48
+
49
# Aligns +s1+ against +s2+ scoring complementary nucleotide pairs
# (at/ta/gc/cg = match, gt/tg = wobble, anything else = mismatch) with
# separate gap-open costs for the two sequences.
#
# Returns [score, alignment] where alignment is a string
# "<start in s1>,<start in s2>:<ops>" (1-based starts) and ops consists of
#   * match, ! mismatch, w wobble, - gap step in s2, _ gap step in s1.
# The pair is taken from the last row / last column of the score matrix, so
# unaligned overhangs are not penalised.
#
# Degenerate inputs (nil, empty or identical strings) return the same
# [score, alignment] shape with an empty ops part, so callers can always use
# `.first` or destructure the result.
# NOTE(review): the scores of the degenerate cases (0 for identical strings,
# the length difference for an empty string) are kept as they were; they look
# like edit-distance shortcuts - confirm they are intended for this score.
def local_align(s1,s2)
  return [0, "1,1:"] if !s1 || !s2 || s1 == s2
  return [(s1.length - s2.length).abs, "1,1:"] if s1.length == 0 || s2.length == 0
  a = s1.split('')
  b = s2.split('')

  # a, word, short seq, target sequence : gaps (bulge in miR) expensive, but also because we have additional match options
  # b, miRNA, longer seq,
  # we require full alignment without penalty for overhangs

  # score cutoff 8.5
  # at least 5 matches, max 1 go or mismatch
  # wobbles and neutral
  # gm : gap mir, gt = gap target
  # allowed
  # 6m + 1gmo/mm : 11.0
  # 5m + 2wo : 10
  # 5m + 1gmo/mm + 1wo : 9
  # 5m + 1gmo + 1ge : 8.5
  # not allowed
  # 4m + 3wo : 8 # no
  # 5m + 2mm/go : 8

  match = 2.0
  miss = -1.0
  wobble = 0.0
  g1_open = -1.0 #
  g2_open = -3.0 # 7m + 1g2 = 14-3 = 11 = 6m + 1g1 = 12 - 1
  gap_ext = -0.5

  score = Hash.new(miss)
  ['at','ta','gc','cg'].each {|x| score[x] = match}
  ['gt','tg'].each {|x| score[x] = wobble}

  m = [[0]] # score matrix
  bm = [["1,1:"]] # backtrack matrix: alignment string leading to each cell
  #g1/2 - gap extension matrix for two seqs
  g1 = [[g1_open]]
  g2 = [[g2_open]]
  1.upto(a.length) { |i| m[i] = [0]; g1[i] = [g1_open]; g2[i] = [g2_open]; bm[i]=["#{i+1},1:"]}
  1.upto(b.length) { |j| m[0][j] = 0; g1[0][j] = g1_open; g2[0][j] = g2_open; bm[0][j]="1,#{j+1}:"}

  1.upto(a.length) do |i|
    1.upto(b.length) do |j|
      pair_score = score[a[i-1] + b[j-1]]
      scores = [ m[i-1][j-1] + pair_score, #match/mismatch
                 m[i-1][j] + g1[i-1][j], # gap in seq1
                 m[i][j-1] + g2[i][j-1]] # gap in seq2

      m[i][j] = scores.max
      # the first option reaching the best score decides the backtrack step
      case scores.index(m[i][j])
      when 0
        g1[i][j] = g1_open
        g2[i][j] = g2_open
        case pair_score
        when match then bm[i][j] = bm[i-1][j-1] + "*"
        when miss then bm[i][j] = bm[i-1][j-1] + "!"
        when wobble then bm[i][j] = bm[i-1][j-1] + "w"
        end
      when 1
        g1[i][j] = gap_ext
        g2[i][j] = g2_open
        bm[i][j] = bm[i-1][j] + "-"
      when 2
        g2[i][j] = gap_ext
        g1[i][j] = g1_open
        bm[i][j] = bm[i][j-1] + "_"
      end

    end
  end

  # best [score, alignment] pair among the cells of the last row and column
  (m.last+m.map{|x| x.last}).zip(bm.last+bm.map{|x| x.last}).sort_by{|x| x[0]}.last
end
125
+
126
# Adds one alignment to the per-position pairing counts.
# +aln+ has the form "<word start>,<miR start>:<ops>"; for every character of
# ops the counter mphash[char][position - 1] is incremented, where position
# starts at <miR start> and advances after every character except "-".
# An alignment with an empty ops part leaves +mphash+ untouched.
# Returns nil.
def update_mirpairing(mphash,aln)
  pos,alnstr = aln.split(":")
  _start_word,start_mir = pos.to_s.split(",")
  mirpos = start_mir.to_i
  # alnstr is nil when nothing follows the colon (e.g. "1,1:")
  alnstr.to_s.split('').each do |alnchar|
    mphash[alnchar][mirpos-1] += 1
    mirpos += 1 if alnchar != "-" #next mirpos, unless gap in miR
  end
  nil
end
137
+
138
+ ###
139
+ ### Main
140
+ ###
141
+
142
+ # this is the string (the miRNA target sequence) we want to match
143
# The words are aligned against the reversed sequence; shuffled and random
# sequences serve as background.
seqr = seq.downcase.reverse
# a->t, g->c, c->g, t->a, u->a
seqc = seq.downcase.tr("agctu","tcgaa") # control - miR*
shuffled_seqs = (1..options[:shuffles]).to_a.map{|i| seqr.shuffle}
matches_shuffles = Array.new(options[:shuffles],0)
random_seqs = (1..options[:shuffles]).to_a.map{|i| ("agct".split('')*100).shuffle[0,seq.size].join('')}
matches_random = Array.new(options[:shuffles],0)

matches_seq = 0 # matches to the original string

aln_flank = 0 # how should we add flank info ??
# per alignment character: one counter for every sequence position
mirpairing = Hash.new()
mirpairing['*'] = Array.new(seq.size+aln_flank,0) # match
mirpairing['!'] = Array.new(seq.size+aln_flank,0) # mismatch
mirpairing['w'] = Array.new(seq.size+aln_flank,0) # wobble
mirpairing['-'] = Array.new(seq.size+aln_flank,0) # gap in miR, bulge in target - hard to visualize
mirpairing['_'] = Array.new(seq.size+aln_flank,0) # gap in target, bulge in miR

ofname = "#{options[:wordfile]}.#{seq}"
of = File.new(ofname,"w")
begin
  of.puts ">revcompl_sequence\n#{seqc.reverse}"

  awords = []
  if options[:fdr]
    awords = IO.readlines(options[:wordfile]).select{|x| x.split(' ')[5].to_f < options[:fdr]}[0,options[:top]+1]
  else
    awords = IO.readlines(options[:wordfile])[0,options[:top]+1]
  end
  awords.shift # remove header

  puts "Processing #{awords.size} words"
  pbar = ProgressBar.new("running",awords.size)
  awords.each_with_index do |wline,idx|
    rank = idx+1
    word = wline.split(" ")[1]
    # background: count words reaching the score cutoff on each control sequence
    shuffled_seqs.each_with_index{|s,k| matches_shuffles[k] += 1 if local_align(word,s).first >= options[:matchscore]}
    random_seqs.each_with_index{|s,k| matches_random[k] += 1 if local_align(word,s).first >= options[:matchscore]}
    sc,aln = local_align(word,seqr)
    if sc >= options[:matchscore]
      matches_seq += 1
      update_mirpairing(mirpairing,aln)
      of.puts ">#{rank}[#{sc.to_s(1)}=#{aln}]\n#{word}"
    end
    pbar.inc
  end
  pbar.finish
ensure
  # close the output file even when reading or aligning raises
  of.close
end

shuffled_mean = matches_shuffles.to_statarray.mean
shuffled_stdd = matches_shuffles.to_statarray.stddev
random_mean = matches_random.to_statarray.mean
random_stdd = matches_random.to_statarray.stddev

puts "words reverse-complementary to sequence: #{matches_seq}"
puts "words reverse-complementary to shuffled : #{shuffled_mean.to_s(1)}+-#{shuffled_stdd.to_s(1)}"
puts "words reverse-complementary to random : #{random_mean.to_s(1)}+-#{random_stdd.to_s(1)}"
puts "similar words in file: #{ofname}"

# pairing table: header row with the sequence, then one row per alignment character
puts((["3'"] + seqc.reverse.split('')).map{|x| x.center(4)}.join(''))
mirpairing.each do |alnchar,mirpos|
  alnstr = alnchar.center(4)
  alnstr += mirpos.map{|x| x.to_s.center(4)}.join('')
  puts alnstr
end