cwords 0.1-jruby
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +28 -0
- data/README +101 -0
- data/bin/cwords +3 -0
- data/bin/cwords_mkdb +3 -0
- data/lib/ushuffle.jar +0 -0
- data/lib/wordRS-lib.rb +169 -0
- data/resources/genemap.tsv +18956 -0
- data/resources/word_annotation.tsv +1801 -0
- data/scripts/cluster_words.rb +394 -0
- data/scripts/complementary_words.rb +205 -0
- metadata +97 -0
@@ -0,0 +1,394 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
###
|
4
|
+
### Given a miRS output word list file and a sequence,
|
5
|
+
### the words are aligned to input sequence (RNA complementary alignment, differentiating between mismatches and bulges)
|
6
|
+
###
|
7
|
+
|
8
|
+
srcdir = File.dirname(__FILE__)
|
9
|
+
$LOAD_PATH << srcdir + '/../lib/'
|
10
|
+
|
11
|
+
require 'wordRS-lib.rb'
|
12
|
+
require 'progressbar'
|
13
|
+
require 'optparse'
|
14
|
+
require 'pp'
|
15
|
+
|
16
|
+
# Default options for the word clustering script.
options = Hash.new
options[:wordfile]    = nil  # word rank file (mandatory)
options[:sep]         = " "  # field separator of the word rank file
options[:overlap]     = 3    # minimum overlap when adding the remaining words
options[:seedoverlap] = nil  # minimum overlap between seed words (set later if nil)
options[:testing]     = nil
options[:fdr]         = nil  # optional FDR cutoff applied to the word list
options[:seed]        = 20   # number of top words used to seed clusters
options[:top]         = nil  # number of words to read (set later if nil)
options[:keep_lc]     = nil  # filter low complexity words unless set
# we could estimate significance of cluster size based on shuffles ...
options[:shuffles]    = 100

# Command line parser; kept in a global so that show_help can print the usage.
$coptions = OptionParser.new do |opts|
  opts.on("-w", "--wordfile ARG", "word rank file") { |o| options[:wordfile] = o }
  opts.on("-s", "--sep ARG", "separator") { |o| options[:sep] = o }
  # --keep_lc is a plain switch: OptionParser yields `true`, which has no #to_i,
  # so the flag is stored as a boolean.
  opts.on("-k", "--keep_lc", "keep low complexity words") { |_o| options[:keep_lc] = true }
  opts.on("-o", "--overlap ARG", "overlap") { |o| options[:overlap] = o.to_i }
  opts.on("-v", "--seedoverlap ARG", "seed overlap") { |o| options[:seedoverlap] = o.to_i }
  opts.on("-m", "--matchscore ARG", "scoring scheme") { |o| options[:matchscore] = o.to_f }
  opts.on("-t", "--top ARG", "limit words to scan") { |o| options[:top] = o.to_i }
  opts.on("-i", "--seed ARG", "seed number clusters") { |o| options[:seed] = o.to_i }
  opts.on("-f", "--fdr ARG", "FDR cutoff for words to scan") { |o| options[:fdr] = o.to_f }
  opts.on("-u", "--shuffles ARG", "background estimate based on shuffle sequences") { |o| options[:shuffles] = o.to_i }
end
|
42
|
+
|
43
|
+
# Print an optional message followed by the usage text, then terminate.
#
# msg  - text shown above the usage; skipped when empty so that a plain
#        help request does not start with a blank line
# code - process exit status
# io   - stream the text is written to
def show_help(msg="", code=0, io=STDOUT)
  if msg.to_s.empty?
    io.puts $coptions.to_s
  else
    io.puts "#{msg}\n#{$coptions}"
  end
  exit(code)
end
|
47
|
+
|
48
|
+
$coptions.parse!(ARGV)

# mandatory parameters: a missing one is an error, so report it on STDERR
# and leave with a non-zero exit status
[:wordfile].each do |p|
  show_help("option '#{p}' mandatory", 1, STDERR) if options[p].nil?
end

# the seed overlap defaults to the general overlap, the number of words
# to read defaults to the number of seed words
options[:seedoverlap] ||= options[:overlap]
options[:top] ||= options[:seed]

# read in mirbase seed family
annofile = srcdir + "/../resources/" + "word_annotation.tsv" #annotation
@word_annotation = Hash.new("") # seq => family
IO.readlines(annofile).each do |l|
  word, family = l.split("\t")
  # skip lines without a second column; storing nil would defeat the ""
  # default and break the later .chomp on the annotation
  @word_annotation[word] = family unless family.nil?
end
|
59
|
+
|
60
|
+
###
|
61
|
+
### Sub
|
62
|
+
###
|
63
|
+
|
64
|
+
# Alignment helpers added to String, used to compare and cluster short
# nucleotide words.
class String

  # Levenshtein edit distance between the receiver and +b+.
  # Returns 0 when +b+ is nil/false or equal to the receiver.
  def editdist(b)
    a = self
    return 0 if !b || a == b
    return (a.length - b.length).abs if a.length == 0 || b.length == 0

    # m[i][j] = distance between the first i chars of a and the first j of b
    m = Array.new(a.length + 1) { |i| [i] }
    1.upto(b.length) { |j| m[0][j] = j }
    1.upto(a.length) do |i|
      1.upto(b.length) do |j|
        m[i][j] = [ m[i-1][j-1] + (a[i-1] == b[j-1] ? 0 : 1),
                    m[i-1][j] + 1,
                    m[i][j-1] + 1 ].min
      end
    end
    m[a.length][b.length]
  end

  # Ungapped-affine alignment score of identical characters
  # (match 2, mismatch -1, gap -1, never below 0). The result is the best
  # value found in the last row or last column of the score matrix.
  # Returns 0 when +b+ is nil/false or either string is empty; identical
  # strings are scored like any other pair.
  def local_align_score(b)
    a = self
    return 0 if !b || a.length == 0 || b.length == 0

    match = 2
    miss = -1
    gap = -1
    m = Array.new(a.length + 1) { [0] }
    1.upto(b.length) { |j| m[0][j] = 0 }
    1.upto(a.length) do |i|
      1.upto(b.length) do |j|
        m[i][j] = [ m[i-1][j-1] + (a[i-1] == b[j-1] ? match : miss),
                    m[i-1][j] + gap,
                    m[i][j-1] + gap,
                    0 ].max
      end
    end
    (m.last + m.map { |row| row.last }).max
  end

  # gapped align of complementary nucleotides, with wobbles
  #
  # Scores a/t and g/c pairs as matches, g/t pairs as wobbles and everything
  # else as mismatches; gaps have separate open and extension penalties.
  # Returns 0 when +b+ is nil/false or either string is empty.
  def local_align_gap_score(b)
    return 0 if !b || self.length == 0 || b.length == 0
    a = self.split('')
    b = b.split('')

    # score cutoff 8.5
    # at least 5 matches, max 1 gap open (go) or mismatch (mm)
    # wobbles (wo) are neutral
    # allowed
    # 6m + 1go/mm : 11.0
    # 5m + 2wo : 10
    # 5m + 1go/mm + 1wo : 9
    # 5m + 1go + 1ge : 8.5
    # not allowed
    # 4m + 3wo : 8
    # 5m + 2mm/go : 8

    match = 2.0
    miss = -1.0
    wobble = 0.0
    gap_open = -1.0
    gap_ext = -0.5

    score = Hash.new(miss)
    ['at','ta','gc','cg'].each { |x| score[x] = match }
    ['gt','tg'].each { |x| score[x] = wobble }

    m = Array.new(a.length + 1) { [0] }
    # g1/g2 - penalty for the next gap step in seq1/seq2 at every cell
    g1 = Array.new(a.length + 1) { [gap_open] }
    1.upto(b.length) { |j| m[0][j] = 0; g1[0][j] = gap_open }
    # copy row by row: a plain clone would share the row arrays, so every
    # write to g2 would also overwrite g1
    g2 = g1.map { |row| row.dup }

    1.upto(a.length) do |i|
      1.upto(b.length) do |j|
        scores = [ m[i-1][j-1] + score[a[i-1] + b[j-1]], # match/mismatch
                   m[i-1][j] + g1[i-1][j],               # gap in seq1
                   m[i][j-1] + g2[i][j-1],               # gap in seq2
                   0 ]                                   # start new local alignment

        best = scores.max
        via = scores.index(best)
        m[i][j] = best
        # continuing a gap in the same sequence costs only the extension
        g1[i][j] = (via == 1) ? gap_ext : gap_open
        g2[i][j] = (via == 2) ? gap_ext : gap_open
      end
    end
    (m.last + m.map { |row| row.last }).max
  end

  # returns [rel. pos of s2 in s1] for every placement with the largest
  # overlap of at least +require_ovl+ characters (5 when nil), or [] when
  # there is none
  def overlap(s2,require_ovl)
    s1 = self
    ovl = require_ovl || 5
    return [] if s1.size < ovl || s2.size < ovl
    return [0] if s1 == s2

    # first, check if one word is contained in the other ...
    (s2.size - s1.size).times { |i| return [-i] if s2[i, s1.size] == s1 }
    (s1.size - s2.size).times { |i| return [i] if s1[i, s2.size] == s2 }

    # then check for overlap at the word ends, longest first
    [s1.size, s2.size].min.downto(ovl) do |len|
      ovls = []
      ovls << -(s2.size - len) if s1[0, len] == s2[s2.size - len..-1]
      ovls << s1.size - len if s2[0, len] == s1[s1.size - len..-1]
      return ovls if !ovls.empty?
    end

    [] # no overlap
  end

  # Edit-distance based placement of +b+ relative to the receiver.
  # Returns nil when +b+ is nil/false or either string is empty, [] when the
  # edit distance exceeds +edt+, otherwise a one-element array with the
  # offset derived from the backtrack matrix.
  def editaln(b,edt)
    a = self
    return nil if !b
    return nil if a.length == 0 || b.length == 0

    m = Array.new(a.length + 1) { |i| [i] }
    bt = Array.new(a.length + 1) { [0] } # backtrack: 0 diagonal, 1 above, 2 left
    1.upto(b.length) { |j| m[0][j] = j; bt[0][j] = 0 }
    1.upto(a.length) do |i|
      1.upto(b.length) do |j|
        # pairs compare by cost first, then by move code
        cost, move = [ [m[i-1][j-1] + (a[i-1] == b[j-1] ? 0 : 1), 0],
                       [m[i-1][j] + 1, 1],
                       [m[i][j-1] + 1, 2] ].min
        m[i][j] = cost
        bt[i][j] = move
      end
    end
    return [] if m[a.length][b.length] > edt

    # find alignment
    case bt[a.length][b.length]
    when 0
      # last pos aligns
      return [a.length - b.length]
    when 1
      # aligns from above, find first non-1
      bt.map { |row| row.last }.reverse.each_with_index do |bts, idx|
        return [a.length - idx - b.length] if bts != 1
      end
    else
      # aligns from left, find first non-2
      bt[a.length].reverse.each_with_index do |bts, idx|
        return [a.length + idx - b.length] if bts != 2
      end
    end

    # only reached if no backtrack entry terminated the search above
    puts "error"
    pp m
    pp bt
  end

end
|
232
|
+
|
233
|
+
|
234
|
+
|
235
|
+
# Align word +s+ to every word of one cluster.
#
# cluster - hash {word => [positions of the word in the cluster alignment]}
# s       - word to place
# ovl     - minimum overlap handed to String#overlap
#
# Returns the [s, position, overlap length] entries with the greatest
# overlap, or [] when +s+ is already in the cluster or nothing overlaps.
def align_to_cluster(cluster,s,ovl)
  return [] if cluster.key?(s) # s already exist in cluster

  align = []
  cluster.to_a.each do |w, wis|
    w.overlap(s, ovl).each do |si|
      # overlap length and overlapping region depend only on the relative
      # position si, not on where w sits in the cluster
      if si < 0
        olength = [s.size + si, w.size].min
        region = w[0, olength]
      else
        olength = [s.size, w.size - si].min
        region = s[0, olength]
      end
      # we require the overlap should contain at least two
      # different nucleotides
      next if region.split('').uniq.size == 1
      # adjust si relative to first cluster word
      wis.each { |wi| align << [s, wi + si, olength] }
    end
  end

  # return alignments with greatest overlap
  align = align.uniq
  maxovl = align.map { |x| x.last }.max
  align.select { |x| x.last == maxovl }
end
|
268
|
+
|
269
|
+
# find cluster(s) with highest overlap
|
270
|
+
# Align word +s+ against every cluster and keep only the cluster(s) whose
# alignment has the greatest overlap.
#
# Returns an array of [alignments, cluster] pairs; empty when no cluster
# overlaps +s+.
def align_to_clusters(clusters,s,ovl)
  candidates = clusters.map { |cl| [align_to_cluster(cl, s, ovl), cl] }
  hits = candidates.reject { |aln, _cl| aln.empty? }

  # every entry of one alignment list carries the same overlap length,
  # so the first entry is representative
  best = hits.map { |aln, _cl| aln.first.last }.max
  hits.select { |aln, _cl| aln.first.last == best }
end
|
286
|
+
|
287
|
+
# Record aligned words in their clusters.
#
# alignments - array of [alignment entries, cluster] pairs; each entry starts
#              with the word followed by its position. The position is
#              appended to the cluster's list for that word, so the cluster
#              hash must supply an array for unknown words.
def add_to_clusters(alignments)
  alignments.each do |entries, target|
    entries.each do |entry|
      word = entry[0]
      position = entry[1]
      target[word] << position
    end
  end
end
|
294
|
+
|
295
|
+
# Print one cluster as a stacked alignment, one word per line:
# rank, the word indented by its position (t shown as u), and its annotation.
# A nucleotide is printed in upper case when it accounts for at least half of
# its alignment column and the column holds more than one nucleotide.
#
# cluster - hash {word => [positions in the alignment]}
# ranks   - hash {word => rank}
#
# Reads @word_annotation for the annotation column. Nothing is printed for
# an empty cluster.
def print_cluster(cluster,ranks)
  cl = []
  cluster.to_a.each { |word, offsets| offsets.each { |offset| cl << [word, offset] } }
  return cl if cl.empty?
  cl = cl.sort_by { |x| x[1] }
  imin = cl.first[1]

  # preprocess clusters for consensus alignment: column => {nucleotide => count}
  cons = Hash.new { |h, k| h[k] = Hash.new(0.0) }
  cl.each do |w, i|
    w.split('').each_with_index { |nt, idx| cons[i + idx][nt] += 1 }
  end

  # column totals, computed once per column on first use
  totals = Hash.new { |h, col| h[col] = cons[col].values.to_statarray.sum }

  # a local lambda instead of a nested def, which would (re)define a
  # global method on every call
  cons_word = lambda do |word, i|
    cw = ""
    word.split('').each_with_index do |nt, idx|
      col = i + idx
      frac = cons[col][nt] / totals[col]
      cw += (frac >= 0.5 && totals[col] > 1) ? nt.upcase : nt
    end
    cw
  end

  cl.each do |w, i|
    puts ranks[w].to_s.rjust(5)+": "+(" "*(i-imin)+cons_word.call(w,i).tr('tT','uU')).ljust(20)+@word_annotation[w].chomp.ljust(20)
  end
end
|
322
|
+
|
323
|
+
###
|
324
|
+
### Main
|
325
|
+
###
|
326
|
+
|
327
|
+
# Read the ranked word list: rank => word. The first line is the header;
# at most options[:top] words are kept, optionally pre-filtered by FDR.
wlines = IO.readlines(options[:wordfile])
wlines = wlines.select { |x| x.split(' ')[5].to_f < options[:fdr] } if options[:fdr]
aw = Hash.new()
# drop(1) removes the header and yields [] (not nil) for an empty file
wlines[0, options[:top] + 1].drop(1).each_with_index do |wline, idx|
  aw[idx + 1] = wline.split(options[:sep])[1]
end

awords = aw.to_a.sort.map { |x| x[1] }
awords_seed = awords[0, options[:seed]]
# fewer words than seeds would make the slice nil
awords_rest = awords[options[:seed]..-1] || []

if !options[:keep_lc]
  awords_seed = awords_seed.select { |x| x.split('').uniq.size > 1 }
  awords_rest = awords_rest.select { |x| x.split('').uniq.size > 1 }
  puts "removed #{aw.size-awords_seed.size-awords_rest.size} low complexity words"
end

# seedialize clusters: one cluster per seed word, the seed at position 0
clusters = []
awords_seed.each { |x| h = Hash.new { |j, k| j[k] = Array.new }; h[x] = [0]; clusters << h }

puts "step 1, seedializing clusters from #{options[:seed]} words"
pbar = ProgressBar.new("running",options[:seed])
# one round per seed word (Integer#size would be the byte width of the
# number, not the seed count); stop early once a round adds nothing,
# because later rounds would then not change anything either
options[:seed].times do
  grew = false
  awords_seed.each do |word|
    alns = align_to_clusters(clusters, word, options[:seedoverlap])
    grew = true if !alns.empty?
    add_to_clusters(alns)
  end
  pbar.inc
  break if !grew
end
pbar.finish

# remove duplicate clusters
# {'word' => [pos in alignment,array, usually one element]}
# A cluster is dropped when its first word is contained in another surviving
# cluster. Work on a copy: deleting from the array being iterated skips
# elements.
survivors = clusters.dup
clusters.each do |x|
  next if survivors.none? { |c| c.equal?(x) }
  survivors.reject! { |y| !y.equal?(x) && x.key?(y.to_a.first.first) }
end
clusters = survivors

puts "step 2, processing #{awords.size} words"

# map extra words to cluster(s) with greatest overlap
# dont extend extra words: all alignments are computed before any is added
if options[:top] > options[:seed]
  add = awords_rest.map { |word| align_to_clusters(clusters, word, options[:overlap]) }
  add.each { |a| add_to_clusters(a) }
end

wa = aw.invert
resc = clusters.select { |x| x.size >= 3 }
resc.each { |cl| print_cluster(cl, wa); puts "\n" }

puts "Found #{resc.size} word clusters."
|
394
|
+
|
@@ -0,0 +1,205 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
###
|
4
|
+
### Given a miRS output word list file and a sequence,
|
5
|
+
### the words are aligned to input sequence (RNA complementary alignment, differentiating between mismatches and bulges)
|
6
|
+
###
|
7
|
+
|
8
|
+
|
9
|
+
srcdir = File.dirname(__FILE__)
|
10
|
+
$LOAD_PATH << srcdir + '/../lib/'
|
11
|
+
|
12
|
+
require 'wordRS-lib.rb'
|
13
|
+
require 'progressbar'
|
14
|
+
require 'optparse'
|
15
|
+
|
16
|
+
# Default options for the complementary word alignment script.
options = Hash.new
options[:wordfile]   = nil  # word rank file (mandatory)
options[:seq]        = nil  # sequence the words are aligned to (mandatory)
options[:matchscore] = 7    # minimum alignment score counted as a match
options[:testing]    = nil
options[:fdr]        = nil  # optional FDR cutoff applied to the word list
options[:top]        = 250  # number of words to read
options[:shuffles]   = 100  # number of shuffled/random background sequences

# Command line parser; kept in a global so that show_help can print the usage.
$coptions = OptionParser.new do |opts|
  opts.on("-w", "--wordfile ARG", "word rank file") { |o| options[:wordfile] = o }
  opts.on("-s", "--seq ARG", "sequence") { |o| options[:seq] = o }
  opts.on("-m", "--matchscore ARG", "scoring scheme") { |o| options[:matchscore] = o.to_f }
  opts.on("-t", "--top ARG", "limit words to scan") { |o| options[:top] = o.to_i }
  opts.on("-f", "--fdr ARG", "FDR cutoff for words to scan") { |o| options[:fdr] = o.to_f }
  opts.on("-u", "--shuffles ARG", "background estimate based on shuffle sequences") { |o| options[:shuffles] = o.to_i }
end
|
34
|
+
|
35
|
+
# Print an optional message followed by the usage text, then terminate.
#
# msg  - text shown above the usage; skipped when empty so that a plain
#        help request does not start with a blank line
# code - process exit status
# io   - stream the text is written to
def show_help(msg="", code=0, io=STDOUT)
  if msg.to_s.empty?
    io.puts $coptions.to_s
  else
    io.puts "#{msg}\n#{$coptions}"
  end
  exit(code)
end
|
39
|
+
|
40
|
+
$coptions.parse!(ARGV)
# mandatory parameters: a missing one is an error, so report it on STDERR
# and leave with a non-zero exit status
[:wordfile, :seq].each do |p|
  show_help("option '#{p}' mandatory", 1, STDERR) if options[p].nil?
end
seq = options[:seq]
|
44
|
+
|
45
|
+
###
|
46
|
+
### Sub
|
47
|
+
###
|
48
|
+
|
49
|
+
# Complementary alignment of word +s1+ against sequence +s2+.
#
# Returns [score, alignment]. The alignment string is "I,J:" (1-based start
# in s1 and s2) followed by one character per step:
#   * match, ! mismatch, w wobble, - gap from the cell above, _ gap from the left.
# Degenerate input (nil or empty string) yields [0, "1,1:"], so callers can
# always destructure the result or call .first on it.
def local_align(s1,s2)
  return [0, "1,1:"] if !s1 || !s2 || s1.length == 0 || s2.length == 0
  a = s1.split('')
  b = s2.split('')

  # a, word, short seq, target sequence : gaps (bulge in miR) expensive, but also because we have additional match options
  # b, miRNA, longer seq,
  # we require full alignment without penalty for overhangs

  # score cutoff 8.5
  # at least 5 matches, max 1 gap open or mismatch
  # wobbles are neutral
  # gm : gap mir, gt = gap target
  # allowed
  # 6m + 1gmo/mm : 11.0
  # 5m + 2wo : 10
  # 5m + 1gmo/mm + 1wo : 9
  # 5m + 1gmo + 1ge : 8.5
  # not allowed
  # 4m + 3wo : 8
  # 5m + 2mm/go : 8

  match = 2.0
  miss = -1.0
  wobble = 0.0
  g1_open = -1.0 #
  g2_open = -3.0 # 7m + 1g2 = 14-3 = 11 = 6m + 1g1 = 12 - 1
  gap_ext = -0.5

  score = Hash.new(miss)
  ['at','ta','gc','cg'].each { |x| score[x] = match }
  ['gt','tg'].each { |x| score[x] = wobble }
  symbol = { match => "*", miss => "!", wobble => "w" }

  m = [[0]] # score matrix
  bm = [["1,1:"]] # backtrack matrix
  # g1/2 - penalty of the next gap step for the two seqs at every cell
  g1 = [[g1_open]]
  g2 = [[g2_open]]
  1.upto(a.length) { |i| m[i] = [0]; g1[i] = [g1_open]; g2[i] = [g2_open]; bm[i] = ["#{i+1},1:"] }
  1.upto(b.length) { |j| m[0][j] = 0; g1[0][j] = g1_open; g2[0][j] = g2_open; bm[0][j] = "1,#{j+1}:" }

  1.upto(a.length) do |i|
    1.upto(b.length) do |j|
      pair = score[a[i-1] + b[j-1]]
      scores = [ m[i-1][j-1] + pair,     # match/mismatch
                 m[i-1][j] + g1[i-1][j], # gap in seq1
                 m[i][j-1] + g2[i][j-1] ] # gap in seq2

      m[i][j] = scores.max
      case scores.index(m[i][j])
      when 0
        g1[i][j] = g1_open
        g2[i][j] = g2_open
        bm[i][j] = bm[i-1][j-1] + symbol[pair]
      when 1
        g1[i][j] = gap_ext
        g2[i][j] = g2_open
        bm[i][j] = bm[i-1][j] + "-"
      when 2
        g2[i][j] = gap_ext
        g1[i][j] = g1_open
        bm[i][j] = bm[i][j-1] + "_"
      end
    end
  end

  # best-scoring cell of the last row / last column together with its path
  (m.last + m.map { |x| x.last }).zip(bm.last + bm.map { |x| x.last }).sort_by { |x| x[0] }.last
end
|
125
|
+
|
126
|
+
# Add one alignment to the per-position pairing counts.
#
# mphash - hash {alignment character => array of counts per position}
# aln    - alignment string "I,J:chars" as produced by local_align; J is the
#          1-based start position in the second sequence
#
# Each character is counted at the current position; the position advances
# after every character except "-". An alignment without any characters
# after the colon is ignored.
def update_mirpairing(mphash,aln)
  pos, alnstr = aln.split(":")
  return nil if alnstr.nil?

  mirpos = pos.split(",")[1].to_i
  alnstr.split('').each do |alnchar|
    mphash[alnchar][mirpos-1] += 1
    mirpos += 1 if alnchar != "-" #next mirpos, unless gap in miR
  end
  nil
end
|
137
|
+
|
138
|
+
###
|
139
|
+
### Main
|
140
|
+
###
|
141
|
+
|
142
|
+
# this is the string (the miRNA target sequence) we want to match
|
143
|
+
seqr = seq.downcase.reverse
seqc = seq.downcase.tr("agctu","tcgaa") # control - miR*
# background sets: shuffles of the reversed sequence and random sequences
shuffled_seqs = (1..options[:shuffles]).map { seqr.shuffle }
matches_shuffles = Array.new(options[:shuffles],0)
random_seqs = (1..options[:shuffles]).map { ("agct".split('')*100).shuffle[0,seq.size].join('') }
matches_random = Array.new(options[:shuffles],0)

ofname = "#{options[:wordfile]}.#{seq}"

matches_seq = 0 # matches to the original string

aln_flank = 0 # how should we add flank info ??
# per-position counts for every alignment character
mirpairing = Hash.new()
mirpairing['*'] = Array.new(seq.size+aln_flank,0) # match
mirpairing['!'] = Array.new(seq.size+aln_flank,0) # mismatch
mirpairing['w'] = Array.new(seq.size+aln_flank,0) # wobble
mirpairing['-'] = Array.new(seq.size+aln_flank,0) # gap in miR, bulge in target - hard to visualize
mirpairing['_'] = Array.new(seq.size+aln_flank,0) # gap in target, bulge in miR

awords = IO.readlines(options[:wordfile])
awords = awords.select { |x| x.split(' ')[5].to_f < options[:fdr] } if options[:fdr]
awords = awords[0,options[:top]+1]
awords.shift # remove header

puts "Processing #{awords.size} words"
pbar = ProgressBar.new("running",awords.size)
# block form closes the output file even if the scan aborts
File.open(ofname,"w") do |of|
  of.puts ">revcompl_sequence\n#{seqc.reverse}"

  awords.each_with_index do |wline,idx|
    rank = idx+1
    word = wline.split(" ")[1]
    # Array() keeps .first and the destructuring below valid even if
    # local_align hands back a bare number
    shuffled_seqs.each_with_index do |s,sidx|
      matches_shuffles[sidx] += 1 if Array(local_align(word,s)).first >= options[:matchscore]
    end
    random_seqs.each_with_index do |s,ridx|
      matches_random[ridx] += 1 if Array(local_align(word,s)).first >= options[:matchscore]
    end
    sc,aln = Array(local_align(word,seqr))
    if sc >= options[:matchscore]
      matches_seq += 1
      update_mirpairing(mirpairing,aln)
      of.puts ">#{rank}[#{sc.to_s(1)}=#{aln}]\n#{word}"
    end
    pbar.inc
  end
end
pbar.finish

shuffled_mean = matches_shuffles.to_statarray.mean
shuffled_stdd = matches_shuffles.to_statarray.stddev
random_mean = matches_random.to_statarray.mean
random_stdd = matches_random.to_statarray.stddev

puts "words reverse-complementary to sequence: #{matches_seq}"
puts "words reverse-complementary to shuffled : #{shuffled_mean.to_s(1)}+-#{shuffled_stdd.to_s(1)}"
puts "words reverse-complementary to random : #{random_mean.to_s(1)}+-#{random_stdd.to_s(1)}"
puts "similar words in file: #{ofname}"

# pairing profile: one row per alignment character, one column per position
puts((["3'"] + seqc.reverse.split('')).map { |x| x.center(4) }.join(''))
mirpairing.each do |alnchar,mirpos|
  alnstr = alnchar.center(4)
  alnstr += mirpos.map { |x| x.to_s.center(4) }.join('')
  puts alnstr
end
|