rgfa 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/gfadiff.rb +420 -0
- data/bin/rgfa-findcrisprs.rb +208 -0
- data/bin/rgfa-mergelinear.rb +14 -0
- data/bin/rgfa-simdebruijn.rb +86 -0
- data/lib/rgfa.rb +376 -0
- data/lib/rgfa/byte_array.rb +74 -0
- data/lib/rgfa/cigar.rb +157 -0
- data/lib/rgfa/connectivity.rb +131 -0
- data/lib/rgfa/containments.rb +97 -0
- data/lib/rgfa/error.rb +3 -0
- data/lib/rgfa/field_array.rb +87 -0
- data/lib/rgfa/field_parser.rb +109 -0
- data/lib/rgfa/field_validator.rb +241 -0
- data/lib/rgfa/field_writer.rb +108 -0
- data/lib/rgfa/headers.rb +76 -0
- data/lib/rgfa/line.rb +721 -0
- data/lib/rgfa/line/containment.rb +87 -0
- data/lib/rgfa/line/header.rb +92 -0
- data/lib/rgfa/line/link.rb +379 -0
- data/lib/rgfa/line/path.rb +106 -0
- data/lib/rgfa/line/segment.rb +209 -0
- data/lib/rgfa/linear_paths.rb +285 -0
- data/lib/rgfa/lines.rb +155 -0
- data/lib/rgfa/links.rb +242 -0
- data/lib/rgfa/logger.rb +192 -0
- data/lib/rgfa/multiplication.rb +156 -0
- data/lib/rgfa/numeric_array.rb +196 -0
- data/lib/rgfa/paths.rb +98 -0
- data/lib/rgfa/rgl.rb +194 -0
- data/lib/rgfa/segment_ends_path.rb +9 -0
- data/lib/rgfa/segment_info.rb +162 -0
- data/lib/rgfa/segments.rb +99 -0
- data/lib/rgfa/sequence.rb +65 -0
- data/lib/rgfatools.rb +102 -0
- data/lib/rgfatools/artifacts.rb +29 -0
- data/lib/rgfatools/copy_number.rb +126 -0
- data/lib/rgfatools/invertible_segments.rb +104 -0
- data/lib/rgfatools/linear_paths.rb +140 -0
- data/lib/rgfatools/multiplication.rb +194 -0
- data/lib/rgfatools/p_bubbles.rb +66 -0
- data/lib/rgfatools/superfluous_links.rb +64 -0
- metadata +97 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 2f8b7edb6df7ada4a6f53db674a3b306e00e2861
|
4
|
+
data.tar.gz: 87d224980a807f6b8e98917b845d48a12ec11e4b
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: c38a9ef751a5220dd59991a4b0788d8860d66ead6880c03de53ab047970cb3ad4719b23c4dd1b7b1d262dfe29cc84b82e105d10e938e83231ad3969efdf86edd
|
7
|
+
data.tar.gz: b87e1dcd4f7ddccb77bdb03edad2a52ae23e8b5c9f22629f4aa57206c75306b7e7f813e5154c76245de14113d532e983795957721bd0ecce9d772a7e95ee6259
|
data/bin/gfadiff.rb
ADDED
@@ -0,0 +1,420 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "rgfa"
|
4
|
+
|
5
|
+
# Record-type switches given on the command line. Each recognised switch is
# removed from ARGV; when none was given, all record types are compared.
all_rt = %W[-h -s -l -c -p]
rt = all_rt.map { |rtopt| ARGV.delete(rtopt) }.compact
rt = all_rt if rt.empty?

# Output-mode switches, likewise removed from ARGV (nil when absent).
out_identical = ARGV.delete("-i")

out_script = ARGV.delete("-script")

# What is left must be exactly the two file names; otherwise print the
# usage text on standard error and exit with status 1.
unless ARGV.size == 2
  [
    "Compare two RGFA files",
    "",
    "Usage: #$0 [-h] [-s] [-l] [-c] [-p] [-i] [-script] <gfa1> <gfa2>",
    "",
    "If a combination of -h,-s,-l,-c and/or -p is specified, then",
    "only record of the specified type [h=headers, s=segments, ",
    "l=links, c=containments, p=paths] are compared. ",
    "(default: -h -s -l -c -p)",
    "",
    "Other options:",
    " -i: output msg if identical",
    " -script: create ruby script to transform gfa1 in gfa2",
  ].each { |usage_line| STDERR.puts usage_line }
  exit 1
end
|
32
|
+
|
33
|
+
# In -script mode, print the preamble of the generated Ruby script: a
# comment header naming both inputs, the require line, and the statement
# loading the first graph (which the rest of the script then edits).
if out_script
  puts [
    "#!/usr/bin/env ruby",
    "",
    "#",
    "# This script was automatically generated using gfadiff.rb",
    "#",
    "# Purpose: edit gfa1 into gfa2",
    "#",
    "# gfa1: #{ARGV[0]}",
    "# gfa2: #{ARGV[1]}",
    "#",
    "",
    "require \"rgfa\"",
    "",
    # String#inspect yields a correctly escaped Ruby string literal, so a
    # file name containing quotes, backslashes or "#{" cannot break (or
    # inject code into) the generated script. For ordinary file names the
    # output is the same as before.
    "gfa = RGFA.from_file(#{ARGV[0].inspect})",
    "",
  ]
end
|
50
|
+
|
51
|
+
# Read the two input files, in order, into separate RGFA objects.
# Validations are switched off on each object before its file is read.
gfa1, gfa2 = [ARGV[0], ARGV[1]].map do |filename|
  graph = RGFA.new
  graph.turn_off_validations
  graph.read_file(filename)
  graph
end
|
57
|
+
|
58
|
+
# Header comparison (option -h).
#
# Plain mode prints one tab-separated line per differing value, starting
# with "<" for values coming from gfa1 and ">" for values coming from gfa2,
# followed by a bracketed tag describing the kind of difference.
# In -script mode, Ruby statements editing the header of gfa1 into the
# header of gfa2 are printed instead.
if rt.include?("-h")
  h1 = gfa1.header
  h2 = gfa2.header
  if h1 == h2
    if out_identical
      puts "# Header values are identical"
    elsif out_script
      puts "# Headers"
      puts "# ... are identical"
      puts
    end
  else
    if out_script
      puts "# Headers"
      puts
    end

    # Optional fields present only in the header of gfa1.
    (h1.optional_fieldnames - h2.optional_fieldnames).each do |k|
      if out_script
        puts "gfa.header.delete_field(#{k.inspect})"
        next
      end
      v = h1.get(k)
      if v.kind_of?(RGFA::FieldArray)
        # multi-valued field: one output line per element
        t = v.datatype
        v.each do |elem|
          puts "<\t[headers/exclusive/multivalue/#{k}]\t" +
               "#{elem.to_gfa_field(datatype: t)}"
        end
      else
        # Marker is "<" like every other gfa1-side line (it used to be a
        # stray "M", which does not match the "<"/">" convention).
        puts "<\t[headers/exclusive]\t#{k.inspect}\t" +
             "#{h1.field_to_s(k, optfield: true)}"
      end
    end

    # Optional fields present only in the header of gfa2.
    (h2.optional_fieldnames - h1.optional_fieldnames).each do |k|
      v = h2.get(k)
      if out_script
        t = h2.get_datatype(k)
        puts "gfa.header.set_datatype(#{k.inspect}, #{t.inspect})"
        if v.kind_of?(RGFA::FieldArray)
          t = v.datatype
          v.each do |elem|
            puts "gfa.header.add(#{k.inspect}, #{elem.inspect}, " +
                 "#{t.inspect})"
          end
        else
          puts "gfa.header.#{k}=#{v.inspect}"
        end
      elsif v.kind_of?(RGFA::FieldArray)
        t = v.datatype
        v.each do |elem|
          puts ">\t[headers/exclusive/multivalue/#{k}]\t" +
               "#{elem.to_gfa_field(datatype: t)}"
        end
      else
        puts ">\t[headers/exclusive]\t#{k.inspect}\t" +
             "#{h2.field_to_s(k, optfield: true)}"
      end
    end

    # Optional fields present in both headers: compare datatype and values.
    # Single values are wrapped in a one-element array and multi-valued
    # fields are sorted, so both cases are handled uniformly.
    (h1.optional_fieldnames & h2.optional_fieldnames).each do |k|
      v1 = h1.get(k)
      v2 = h2.get(k)
      multi1 = v1.kind_of?(RGFA::FieldArray)
      multi2 = v2.kind_of?(RGFA::FieldArray)
      v1a = multi1 ? v1.sort : [v1]
      v2a = multi2 ? v2.sort : [v2]
      t1 = multi1 ? v1.datatype : h1.get_datatype(k)
      t2 = multi2 ? v2.datatype : h2.get_datatype(k)
      m1 = multi1 ? "multivalue/" : ""
      m2 = multi2 ? "multivalue/" : ""
      if out_script
        if t1 != t2 || v1a != v2a
          # replace the field: drop it, then add every gfa2 value again
          puts "gfa.header.delete(#{k.inspect})"
          v2a.each do |elem|
            field = elem.to_gfa_field(datatype: t2)
            puts "gfa.header.add(#{k.inspect}, #{field.inspect}, " +
                 "#{t2.inspect})"
          end
        end
      elsif t1 != t2
        # different datatypes: report all values of both sides
        v1a.each do |elem|
          puts "<\t[headers/typediff/#{m1}#{k}]\t" +
               "#{elem.to_gfa_field(datatype: t1)}"
        end
        v2a.each do |elem|
          puts ">\t[headers/typediff/#{m2}#{k}]\t" +
               "#{elem.to_gfa_field(datatype: t2)}"
        end
      else
        # same datatype: report only the values missing on the other side
        (v1a - v2a).each do |elem|
          puts "<\t[headers/valuediff/#{m1}#{k}]\t" +
               "#{elem.to_gfa_field(datatype: t1)}"
        end
        (v2a - v1a).each do |elem|
          puts ">\t[headers/valuediff/#{m2}#{k}]\t" +
               "#{elem.to_gfa_field(datatype: t2)}"
        end
      end
    end
    puts if out_script
  end
end
|
163
|
+
|
164
|
+
# Compare the named records of one type (segments or paths) of two graphs
# and print the differences on standard output.
#
# Records are matched by name. Records found in only one graph are reported
# as a whole; for records found in both, required fields and optional
# fields are compared one by one.
#
# @param gfa1 [RGFA] first graph; must respond to "<rt>_names" and "<rt>"
# @param gfa2 [RGFA] second graph, same interface
# @param rt [String] record type, "segment" or "path" (used to build the
#   method names sent to the graphs and the tags in the output)
# @param out_script [Object] if truthy, print Ruby statements editing gfa1
#   into gfa2 instead of "<" / ">" diff lines
# @param out_identical [Object] if truthy (and out_script is not), print a
#   message when no difference is found
# @return [nil]
def diff_segments_or_paths(gfa1,gfa2,rt,out_script,out_identical)
  rts = rt + "s"
  rtsU = rts[0].upcase + rts[1..-1]
  s1names = gfa1.send("#{rt}_names").sort
  s2names = gfa2.send("#{rt}_names").sort
  difffound = false
  if out_script
    puts "# #{rtsU}"
    puts
  end

  # Records existing only in gfa1.
  (s1names - s2names).each do |sn|
    difffound = true
    segstr = gfa1.send(rt,sn).to_s
    if out_script
      puts "gfa.rm(#{sn.inspect})"
    else
      puts "<\t[#{rts}/exclusive]\t#{segstr}"
    end
  end

  # Records existing only in gfa2.
  (s2names - s1names).each do |sn|
    difffound = true
    segstr = gfa2.send(rt,sn).to_s
    if out_script
      puts "gfa << #{segstr.inspect}"
    else
      puts ">\t[#{rts}/exclusive]\t#{segstr}"
    end
  end

  # Records existing in both graphs: compare them field by field.
  (s1names & s2names).each do |sn|
    s1 = gfa1.send(rt,sn)
    s2 = gfa2.send(rt,sn)
    # expression addressing this record in the generated script
    target = "gfa.#{rt}(#{sn.inspect})"

    s1.required_fieldnames.each do |fn|
      v1 = s1.field_to_s(fn)
      v2 = s2.field_to_s(fn)
      next if v1 == v2
      difffound = true
      if out_script
        puts "#{target}.#{fn}=#{v2.inspect}"
      else
        puts "<\t[#{rts}/reqfields/valuediff/#{sn}]\t#{v1}"
        puts ">\t[#{rts}/reqfields/valuediff/#{sn}]\t#{v2}"
      end
    end

    s1f = s1.optional_fieldnames
    s2f = s2.optional_fieldnames

    # optional fields only in the gfa1 record
    (s1f - s2f).each do |fn|
      difffound = true
      if out_script
        puts "#{target}.delete_field(#{fn.inspect})"
      else
        v = s1.field_to_s(fn, optfield: true)
        puts "<\t[#{rts}/optfields/exclusive/#{sn}]\t#{v}"
      end
    end

    # optional fields only in the gfa2 record
    (s2f - s1f).each do |fn|
      difffound = true
      if out_script
        v = s2.get(fn)
        t = s2.get_datatype(fn)
        # t.inspect, not t: the datatype must appear as a literal in the
        # generated script (e.g. :Z); a bare Z would be read as a constant.
        puts "#{target}.set_datatype(#{fn.inspect},#{t.inspect})"
        puts "#{target}.#{fn}=#{v.inspect}"
      else
        v = s2.field_to_s(fn, optfield: true)
        puts ">\t[#{rts}/optfields/exclusive/#{sn}]\t#{v}"
      end
    end

    # optional fields in both records
    (s1f & s2f).each do |fn|
      v1 = s1.field_to_s(fn, optfield: true)
      v2 = s2.field_to_s(fn, optfield: true)
      next if v1 == v2
      difffound = true
      if out_script
        v = s2.get(fn)
        t = s2.get_datatype(fn)
        puts "#{target}.set_datatype(#{fn.inspect},#{t.inspect})"
        puts "#{target}.#{fn}=#{v.inspect}"
      else
        puts "<\t[#{rts}/optfields/valuediff/#{sn}]\t#{v1}"
        puts ">\t[#{rts}/optfields/valuediff/#{sn}]\t#{v2}"
      end
    end
  end

  if !difffound
    if out_script
      puts "# ... are identical"
    elsif out_identical
      puts "# #{rtsU} are identical"
    end
  end
  puts if out_script
end
|
257
|
+
|
258
|
+
# Segments (option -s) are compared by the shared segments/paths routine.
diff_segments_or_paths(gfa1, gfa2, "segment", out_script, out_identical) if rt.include?("-s")
|
261
|
+
|
262
|
+
# Link comparison (option -l).
#
# Links are collected per segment end (:B and :E). Links of segments that
# exist in only one graph are reported under "links/exclusive_segments";
# for segments present in both graphs, the links found on one side only are
# reported under "links/different".
# TODO: diff of single optfields
if rt.include?("-l")
  s1names = gfa1.segment_names.sort
  s2names = gfa2.segment_names.sort
  only1 = s1names - s2names
  only2 = s2names - s1names
  # a segment existing on one side only counts as a difference, even when
  # it has no links
  difffound = !(only1.empty? && only2.empty?)
  if out_script
    puts "# Links"
    puts
  end

  # Links of segments found only in gfa1. Nothing is printed for them in
  # script mode.
  difflinks1 = []
  only1.each do |sn|
    [:B, :E].each { |et| difflinks1.concat(gfa1.links_of([sn, et])) }
  end
  unless out_script
    difflinks1.uniq.each do |l|
      puts "<\t[links/exclusive_segments]\t#{l.to_s}"
    end
  end

  # Links of segments found only in gfa2.
  difflinks2 = []
  only2.each do |sn|
    [:B, :E].each { |et| difflinks2.concat(gfa2.links_of([sn, et])) }
  end
  difflinks2.uniq.each do |l|
    if out_script
      puts "gfa << #{l.to_s.inspect}"
    else
      puts ">\t[links/exclusive_segments]\t#{l.to_s}"
    end
  end

  # Segments found in both graphs: per end, the links missing on the other
  # side.
  difflinks1b = []
  difflinks2b = []
  (s1names & s2names).each do |sn|
    [:B, :E].each do |et|
      l1 = gfa1.links_of([sn, et])
      l2 = gfa2.links_of([sn, et])
      missing_in_2 = l1 - l2
      missing_in_1 = l2 - l1
      unless missing_in_2.empty?
        difffound = true
        difflinks1b.concat(missing_in_2)
      end
      unless missing_in_1.empty?
        difffound = true
        difflinks2b.concat(missing_in_1)
      end
    end
  end

  # Links already handled with the exclusive segments are left out here.
  (difflinks1b - difflinks1).uniq.each do |l|
    if out_script
      puts "gfa.rm(gfa.link_from_to(#{l.from.to_sym.inspect}, " +
           "#{l.from_orient.inspect}, " +
           "#{l.to.to_sym.inspect}, " +
           "#{l.to_orient.inspect}, " +
           "#{l.overlap.to_s.inspect}.to_cigar))"
    else
      puts "<\t[links/different]\t#{l.to_s}"
    end
  end
  (difflinks2b - difflinks2).uniq.each do |l|
    if out_script
      puts "gfa << #{l.to_s.inspect}"
    else
      puts ">\t[links/different]\t#{l.to_s}"
    end
  end

  unless difffound
    if out_script
      puts "# ... are identical"
    elsif out_identical
      puts "# Links are identical"
    end
  end
  puts if out_script
end
|
338
|
+
|
339
|
+
# Containment comparison (option -c).
#
# For every segment, the containments it takes part in (contained_in plus
# containing) are collected. Containments of segments that exist in only
# one graph are reported under "containments/exclusive_segments"; for
# segments present in both graphs, containments found on one side only are
# reported under "containments/different".
# TODO: this code is similar to -l; make generic and merge
if rt.include?("-c")
  difffound = false
  s1names = gfa1.segment_names.sort
  s2names = gfa2.segment_names.sort
  # Section heading of the generated script, as printed by the header,
  # segment, link and path sections (it was missing here, so the
  # "# ... are identical" line appeared without a subject).
  if out_script
    puts "# Containments"
    puts
  end

  # Containments of segments found only in gfa1. Nothing is printed for
  # them in script mode.
  cexcl1 = []
  (s1names - s2names).each do |sn|
    difffound = true
    cexcl1 += gfa1.contained_in(sn)
    cexcl1 += gfa1.containing(sn)
  end
  cexcl1.uniq.each do |c|
    if !out_script
      # tag spelling fixed (was "contaiments")
      puts "<\t[containments/exclusive_segments]\t#{c.to_s}"
    end
  end

  # Containments of segments found only in gfa2.
  cexcl2 = []
  (s2names - s1names).each do |sn|
    difffound = true
    cexcl2 += gfa2.contained_in(sn)
    cexcl2 += gfa2.containing(sn)
  end
  cexcl2.uniq.each do |c|
    if out_script
      puts "gfa << #{c.to_s.inspect}"
    else
      puts ">\t[containments/exclusive_segments]\t#{c.to_s}"
    end
  end

  # Segments found in both graphs: containments missing on the other side.
  cdiff1 = []
  cdiff2 = []
  (s1names & s2names).each do |sn|
    c1 = gfa1.contained_in(sn)
    c2 = gfa2.contained_in(sn)
    c1 += gfa1.containing(sn)
    c2 += gfa2.containing(sn)
    d1 = c1 - c2
    d2 = c2 - c1
    if !d1.empty?
      difffound = true
      cdiff1 += d1
    end
    if !d2.empty?
      difffound = true
      cdiff2 += d2
    end
  end

  # Containments already handled with the exclusive segments are left out.
  (cdiff1-cexcl1).uniq.each do |l|
    if out_script
      # TODO: handle multiple containments for a segments pair
      puts "gfa.rm(gfa.containment(#{l.from.to_sym.inspect}, "+
           "#{l.to.to_sym.inspect}))"
    else
      puts "<\t[containments/different]\t#{l.to_s}"
    end
  end
  (cdiff2-cexcl2).uniq.each do |l|
    if out_script
      puts "gfa << #{l.to_s.inspect}"
    else
      puts ">\t[containments/different]\t#{l.to_s}"
    end
  end

  if !difffound
    if out_script
      puts "# ... are identical"
    elsif out_identical
      puts "# Containments are identical"
    end
  end
  puts if out_script
end
|
411
|
+
|
412
|
+
# Paths (option -p) are compared by the shared segments/paths routine.
diff_segments_or_paths(gfa1, gfa2, "path", out_script, out_identical) if rt.include?("-p")

# Last lines of the generated script: print the edited graph.
if out_script
  puts ["", "# Output graph", "puts gfa"]
end
|
data/bin/rgfa-findcrisprs.rb
ADDED
@@ -0,0 +1,208 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "rgfatools"
|
4
|
+
|
5
|
+
# crisprs have a structure ARU1RU..RUnRB where |U|~|R| in [24..50]
# (presumably R = repeat, U1..Un = spacers, A/B = flanking sequences;
# 24 and 50 are the default minlen/maxlen of RGFA#find_crisprs)

# Script-wide switches read by RGFA#find_crisprs:
#   $debugmode   - when true, per-segment information is printed first
#   $spacersonly - when true, only the spacer sequences are printed
$debugmode, $spacersonly = false, false
|
9
|
+
|
10
|
+
# Reopens RGFA to add a search for CRISPR-like signatures in the graph.
class RGFA

  # Search the graph for CRISPR-like signatures and print a report for each
  # one found.
  #
  # Every segment first gets a copy number (cn): half of its coverage,
  # rounded. A segment is then examined as candidate repeat if its length
  # is within minlen..maxlen and its cn is at least minrepeats. From both
  # of its ends the graph is explored (see #search_circle) for paths coming
  # back to the segment ("circles") and paths which do not ("linear").
  # The candidate is reported only if the numbers of circles at the two
  # ends differ by at most 1, exactly one linear path was found at each
  # end, and at least minrepeats circles were found from the E end.
  #
  # Output goes to standard output: the upcased, sorted circle sequences
  # if $spacersonly is set, a multi-line report otherwise.
  #
  # @param minrepeats [Integer] minimum copy number of the repeat segment
  #   and minimum number of circles
  # @param minlen [Integer] minimum repeat length; also the minimum
  #   distance for a path to count as a circle
  # @param maxlen [Integer] maximum repeat length; paths are explored up to
  #   a distance of 2*maxlen + repeat length
  # @return [Array] the segment names (result of the iteration)
  def find_crisprs(minrepeats=3,minlen=24,maxlen=50)
    segment_names.each do |sn|
      s = segment(sn)
      s.cn = (s.coverage(unit_length: @default[:unit_length],
                         count_tag: @default[:count_tag])/2).round
    end
    output_segment_infos if $debugmode
    # Remaining visits per segment, kept separately for the searches from
    # the B and from the E end; carried over between reported candidates.
    maxvisits_global = {:B => {}, :E => {}}
    segment_names.each do |sn|
      s = segment(sn)
      next if s.length < minlen || s.length > maxlen
      next if s.cn < minrepeats
      circles = {}
      linear = {}
      maxvisits = {}
      [:B, :E].each do |rt|
        maxvisits[rt] = maxvisits_global[rt].dup
        maxvisits[rt][sn] ||= s.cn
        circles[rt] = []
        linear[rt] = []
        segment_end = [s, rt].to_segment_end
        links_of(segment_end).each do |l|
          search_circle(segment_end.invert_end_type,
                        segment_end,
                        l,
                        maxvisits[rt],0,
                        minlen,
                        maxlen*2+s.length,
                        [segment_end],
                        circles[rt],
                        linear[rt])
        end
        # If the candidate still has visits left, paths whose segments all
        # have visits left are counted again, as many times as the scarcest
        # of their segments allows.
        if maxvisits[rt][sn.to_sym] > 0
          multi = {:l => [], :c => []}
          [[linear[rt],:l], [circles[rt],:c]].each do |paths, pt|
            paths.each do |c|
              min_mv = s.cn
              # the last element of a circle is left out of the count
              upto = (pt == :l ? -1 : -2)
              c[0..upto].each do |csn, _et|
                mv = maxvisits[rt][csn.to_sym]
                min_mv = mv if mv < min_mv
              end
              if min_mv > 0
                min_mv.times { multi[pt] << c.dup }
                c[0..upto].each do |csn, _et|
                  maxvisits[rt][csn.to_sym] -= min_mv
                end
              end
            end
          end
          circles[rt] += multi[:c]
          linear[rt] += multi[:l]
        end
      end
      next if (circles[:E].size - circles[:B].size).abs > 1
      next if linear[:E].size != linear[:B].size
      next if linear[:E].size != 1
      merged_circles = circles[:E].map {|c| merge_crisprs_path(c,s,:E)}
      before = merge_crisprs_path(linear[:B].first,s,:B)
      after = merge_crisprs_path(linear[:E].first,s,:E)
      next if merged_circles.size < minrepeats
      # the candidate is accepted: its visits are now used up for good
      maxvisits_global = maxvisits
      # A circle much longer than the repeat may hide a further, inexact
      # repeat instance.
      possible_instances =
        merged_circles.count {|seq| seq.length > s.length + minlen}
      instances = merged_circles.size + 1
      if $spacersonly
        puts merged_circles.sort.map(&:upcase)
      else
        puts "CRISP signature found in segment #{s.name}"
        puts
        # at most the last 50 bases; a shorter sequence is printed in full
        # (before[-50..-1] would be nil for it)
        puts " Before: sequence = ...#{crisprs_sequence_tail(before, 50)}"
        puts
        if possible_instances > 0
          instances = "#{instances}..#{instances+possible_instances}"
        end
        puts " Repeat: instances = #{instances}; "+
             "length = #{s.length};\t"+
             "sequence = #{s.sequence}"
        puts
        puts " Spacers:"
        merged_circles.each_with_index do |seq, i|
          inexact = seq.length > s.length + minlen
          str = inexact ?
            "=#{s.length}+2*#{(seq.length.to_f - s.length)/2}" : ""
          puts " (#{i+1}#{inexact ? "*" : ""})\t"+
               "length = #{seq.length}#{str};\tsequence = #{seq}"
        end
        if possible_instances > 0
          puts
          puts " * = possibly containing inexact repeat instance"
        end
        puts
        puts "After: sequence = #{after[0..49]}..."
      end
    end
  end

  private

  # Debug output: one tab-separated line per segment with its name, copy
  # number, summed copy numbers of the neighbours at the B and at the E
  # end, number of links at the B and at the E end, KC value and length.
  def output_segment_infos
    segment_names.each do |sn|
      s = segment(sn)
      puts "#{s.name}\t#{s.cn}\t"+
        "#{neighbours([s.name,:B]).map{|nb|segment(nb.segment).cn}.inject(:+)}\t"+
        "#{neighbours([s.name,:E]).map{|nb|segment(nb.segment).cn}.inject(:+)}\t"+
        "#{links_of([s.name,:B]).size}\t"+
        "#{links_of([s.name,:E]).size}\t"+
        "#{s.KC}\t#{s.length}"
    end
  end

  # Last maxlen characters of a sequence, or the whole sequence if it is
  # not longer than maxlen.
  def crisprs_sequence_tail(sequence, maxlen)
    sequence.length > maxlen ? sequence[-maxlen..-1] : sequence
  end

  # Sequence spelled by a path of segment ends, without a stretch as long
  # as the repeat at its beginning and at its end.
  #
  # @param segpath [Array] path, as collected by #search_circle
  # @param repeat [Object] the repeat segment (only its sequence length is
  #   used)
  # @param repeat_end [Symbol] end of the repeat the path was started from;
  #   for :B the result is passed through #rc (presumably the reverse
  #   complement)
  # @return [String] the trimmed sequence
  def merge_crisprs_path(segpath, repeat, repeat_end)
    merged = create_merged_segment(segpath, merged_name: :short,
                                   disable_tracking: true)[0]
    repeat_length = repeat.sequence.length
    sequence = merged.sequence[repeat_length..-(1+repeat_length)]
    sequence = sequence.rc if repeat_end == :B
    sequence
  end

  # Recursive exploration of the graph, following link l from segment end
  # from.
  #
  # The path ends as a circle (appended to circles) when the goal segment
  # end is reached after a distance of at least mindist. It ends as a
  # linear path (appended to linear) when the distance exceeds maxdist or
  # no further link exists. Whenever a path is recorded, the remaining
  # visits of its segments are decremented in maxvisits; segments without
  # remaining visits, or already in the path, are not entered.
  #
  # @param goal [Object] segment end closing a circle
  # @param from [Object] segment end the link is followed from
  # @param l [Object] link to follow
  # @param maxvisits [Hash] remaining visits by segment name (modified)
  # @param dist [Integer] distance covered so far
  # @param mindist [Integer] minimum distance for a circle
  # @param maxdist [Integer] distance after which a path is cut as linear
  # @param path [Array] segment ends visited so far (not modified)
  # @param circles [Array] collects the circular paths (modified)
  # @param linear [Array] collects the linear paths (modified)
  # @return [nil]
  def search_circle(goal, from, l, maxvisits, dist, mindist,
                    maxdist, path, circles, linear)
    dest = l.other_end(from)
    dest.segment = segment(dest.segment)
    maxvisits[dest.name] ||= dest.segment.cn
    se = dest.invert_end_type
    new_path = path.dup
    new_path << se
    if dest == goal
      return if dist < mindist
      # the closing element is not counted as a visit
      new_path[0..-2].each {|x| maxvisits[x.name] -= 1}
      circles << new_path
      return
    end
    return if maxvisits[dest.name] == 0
    return if path.any? {|x| x.name == dest.name}
    dist += dest.segment.length - l.overlap.first.len
    if dist > maxdist
      new_path.each {|x| maxvisits[x.name] -= 1}
      linear << new_path
      return
    end
    ls = links_of(se)
    if ls.size == 0
      new_path.each {|x| maxvisits[x.name] -= 1}
      linear << new_path
      return
    end
    ls.each do |next_l|
      next_dest = segment(next_l.other_end(se).segment)
      maxvisits[next_dest.name] ||= next_dest.cn
      next if maxvisits[next_dest.name] == 0
      search_circle(goal,se,next_l,maxvisits,dist,mindist,maxdist,new_path,
                    circles,linear)
    end
    return
  end

end
|
198
|
+
|
199
|
+
# Command line entry point: the graph file name is the only argument.
if ARGV.empty?
  STDERR.puts "Usage: #{$0} <gfa>"
  exit 1
end

gfa = RGFA.from_file(ARGV[0])
# coverage is computed from the KC tag
gfa.set_default_count_tag(:KC)
# If the header has no ks value, one is derived from the length of the
# first segment; the count unit length is then set to ks - 1.
gfa.header.ks = gfa.segments[0].length + 1 unless gfa.header.ks
gfa.set_count_unit_length(gfa.header.ks - 1)
gfa.find_crisprs
|
208
|
+
|