rgfa 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. checksums.yaml +7 -0
  2. data/bin/gfadiff.rb +420 -0
  3. data/bin/rgfa-findcrisprs.rb +208 -0
  4. data/bin/rgfa-mergelinear.rb +14 -0
  5. data/bin/rgfa-simdebruijn.rb +86 -0
  6. data/lib/rgfa.rb +376 -0
  7. data/lib/rgfa/byte_array.rb +74 -0
  8. data/lib/rgfa/cigar.rb +157 -0
  9. data/lib/rgfa/connectivity.rb +131 -0
  10. data/lib/rgfa/containments.rb +97 -0
  11. data/lib/rgfa/error.rb +3 -0
  12. data/lib/rgfa/field_array.rb +87 -0
  13. data/lib/rgfa/field_parser.rb +109 -0
  14. data/lib/rgfa/field_validator.rb +241 -0
  15. data/lib/rgfa/field_writer.rb +108 -0
  16. data/lib/rgfa/headers.rb +76 -0
  17. data/lib/rgfa/line.rb +721 -0
  18. data/lib/rgfa/line/containment.rb +87 -0
  19. data/lib/rgfa/line/header.rb +92 -0
  20. data/lib/rgfa/line/link.rb +379 -0
  21. data/lib/rgfa/line/path.rb +106 -0
  22. data/lib/rgfa/line/segment.rb +209 -0
  23. data/lib/rgfa/linear_paths.rb +285 -0
  24. data/lib/rgfa/lines.rb +155 -0
  25. data/lib/rgfa/links.rb +242 -0
  26. data/lib/rgfa/logger.rb +192 -0
  27. data/lib/rgfa/multiplication.rb +156 -0
  28. data/lib/rgfa/numeric_array.rb +196 -0
  29. data/lib/rgfa/paths.rb +98 -0
  30. data/lib/rgfa/rgl.rb +194 -0
  31. data/lib/rgfa/segment_ends_path.rb +9 -0
  32. data/lib/rgfa/segment_info.rb +162 -0
  33. data/lib/rgfa/segments.rb +99 -0
  34. data/lib/rgfa/sequence.rb +65 -0
  35. data/lib/rgfatools.rb +102 -0
  36. data/lib/rgfatools/artifacts.rb +29 -0
  37. data/lib/rgfatools/copy_number.rb +126 -0
  38. data/lib/rgfatools/invertible_segments.rb +104 -0
  39. data/lib/rgfatools/linear_paths.rb +140 -0
  40. data/lib/rgfatools/multiplication.rb +194 -0
  41. data/lib/rgfatools/p_bubbles.rb +66 -0
  42. data/lib/rgfatools/superfluous_links.rb +64 -0
  43. metadata +97 -0
@@ -0,0 +1,87 @@
1
# Containment line (record type C) of a RGFA file.
#
# A containment states that the +to+ segment is contained in the +from+
# segment, starting at position +pos+ of the container.
class RGFA::Line::Containment < RGFA::Line

  RECORD_TYPE = :C
  REQFIELDS = [
    :from, :from_orient,
    :to, :to_orient,
    :pos,
    :overlap,
  ]
  PREDEFINED_OPTFIELDS = [:MQ, :NM]
  DATATYPE = {
    from: :lbl,
    from_orient: :orn,
    to: :lbl,
    to_orient: :orn,
    pos: :pos,
    overlap: :cig,
    MQ: :i,
    NM: :i,
  }

  define_field_methods!

  # Container segment together with its orientation.
  # @return [RGFA::OrientedSegment] the oriented segment built from the
  #   from/from_orient fields
  def oriented_from
    [from, from_orient].to_oriented_segment
  end

  # Contained segment together with its orientation.
  # @return [RGFA::OrientedSegment] the oriented segment built from the
  #   to/to_orient fields
  def oriented_to
    [to, to_orient].to_oriented_segment
  end

  # Name of the from segment; works both when the from field contains
  # a segment name (Symbol) and when it contains a segment
  # (RGFA::Line::Segment).
  # @return [Symbol]
  def from_name
    from.to_sym
  end

  # Name of the to segment; works both when the to field contains
  # a segment name (Symbol) and when it contains a segment
  # (RGFA::Line::Segment).
  # @return [Symbol]
  def to_name
    to.to_sym
  end

  # Right position of the contained sequence in the container.
  #
  # It is computed as +pos+ plus the lengths of all M and D operations
  # of the overlap; other operations are not counted.
  #
  # @return [Integer,nil] the right position; nil if the overlap is
  #   unspecified (empty)
  def rpos
    return nil if overlap.empty?
    rightmost = pos
    overlap.each do |operation|
      rightmost += operation.len if [:M, :D].include?(operation.code)
    end
    rightmost
  end

  # Is the containment normal?
  #
  # <b> Definition of normal containment </b>
  #
  # Every containment has an equivalent reverse containment.
  # As an example, take a containment of B (length: 8) in A (length: 100)
  # at position 9 of A, with the cigar 1M1I2M3D4M (i.e. rpos = 19).
  #
  #    A+ B+ 1M1I2M3D4M 9 == A- B- 4M3D2M1I1M 81
  #    A+ B- 1M1I2M3D4M 9 == A- B+ 4M3D2M1I1M 81
  #    A- B+ 1M1I2M3D4M 9 == A+ B- 4M3D2M1I1M 81
  #    A- B- 1M1I2M3D4M 9 == A+ B+ 4M3D2M1I1M 81
  #
  # The position in the reverse containment is the length of A minus
  # the right position of B before reversing (here: 100 - 19 = 81).
  #
  # A != B is required, since A == B makes no sense for containments.
  # Therefore a containment can always be expressed using a positive
  # from orientation, and normality is simply defined as: the from
  # orientation is +.
  #
  # @return [Boolean]
  #
  def normal?
    from_orient == :+
  end

end
@@ -0,0 +1,92 @@
1
# Header line (record type H) of a RGFA file.
#
# Examples of how to set the header data are given in {RGFA::Headers}.
#
# @see RGFA::Line
class RGFA::Line::Header < RGFA::Line

  RECORD_TYPE = :H
  REQFIELDS = []
  PREDEFINED_OPTFIELDS = [:VN]
  DATATYPE = { VN: :Z }

  define_field_methods!

  # Add a value to a header field (multi-value compatible).
  #
  # - If the field does not exist yet, it is set to +value+.
  # - If the field contains a {RGFA::FieldArray}, +value+ is appended to it.
  # - Otherwise the field content is replaced by a new field array, which
  #   contains the previous value followed by +value+.
  #
  # @param fieldname [Symbol]
  # @param value [Object]
  # @param datatype [RGFA::Line::OPTFIELD_DATATYPE, nil] the datatype to use;
  #   by default the datatype is determined from the value or from the
  #   previous values present in the field
  # @return [self]
  def add(fieldname, value, datatype = nil)
    tag = fieldname.to_sym
    current = get(tag)
    if current.nil?
      # first value of this tag: store it as a plain field
      set_datatype(tag, datatype) if datatype
      set(tag, value)
    else
      unless current.kind_of?(RGFA::FieldArray)
        # second value of this tag: wrap the existing value into a field
        # array, which remembers the datatype of its elements
        current = RGFA::FieldArray.new(get_datatype(tag), [current])
        set_datatype(tag, :J)
        set(tag, current)
      end
      current.push_with_validation(value, datatype, tag)
    end
    self
  end

  # Array of optional tags data.
  #
  # The optional fields are returned as [fieldname, datatype, value] arrays.
  # A field containing a FieldArray is split into multiple entries with
  # the same fieldname, one for each element of the field array.
  # @return [Array<(Symbol, Symbol, Object)>]
  # @api private
  def tags
    optional_fieldnames.each_with_object([]) do |tag, triples|
      content = get(tag)
      if content.kind_of?(RGFA::FieldArray)
        content.each do |element|
          triples << [tag, content.datatype, element]
        end
      else
        triples << [tag, get_datatype(tag), content]
      end
    end
  end

  # Split the header line into single-tag lines.
  #
  # A tag containing a FieldArray is split into multiple lines
  # with the same fieldname.
  # @return [Array<RGFA::Line::Header>]
  # @api private
  def split
    tags.map do |tagname, datatype, value|
      RGFA::Line::Header.new([], validate: @validate).tap do |single|
        single.set_datatype(tagname, datatype)
        single.set(tagname, value)
      end
    end
  end

  # Merge an additional {RGFA::Line::Header} line into this header line.
  #
  # Each optional field of +gfa_line+ is added to this line using {#add}.
  # @param gfa_line [RGFA::Line::Header] the header line to merge
  # @return [self]
  # @api private
  def merge(gfa_line)
    gfa_line.optional_fieldnames.each do |tag|
      add(tag, gfa_line.get(tag), gfa_line.get_datatype(tag))
    end
    self
  end

end
@@ -0,0 +1,379 @@
1
# A link connects two segments, or a segment to itself.
#
class RGFA::Line::Link < RGFA::Line

  RECORD_TYPE = :L
  REQFIELDS = [:from, :from_orient, :to, :to_orient, :overlap]
  PREDEFINED_OPTFIELDS = [:MQ, :NM, :RC, :FC, :KC]
  DATATYPE = {
    :from => :lbl,
    :from_orient => :orn,
    :to => :lbl,
    :to_orient => :orn,
    :overlap => :cig,
    :MQ => :i,
    :NM => :i,
    :RC => :i,
    :FC => :i,
    :KC => :i,
  }

  define_field_methods!

  # The other segment of a link
  # @param segment [RGFA::Line::Segment, Symbol] segment name or instance
  # @raise [RGFA::LineMissingError]
  #   if segment is not involved in the link
  # @return [Symbol, RGFA::Line::Segment] the content of the field (from or
  #   to) which refers to the other segment of the link, i.e. a segment
  #   name or a segment instance, depending on what the field contains;
  #   if the link is circular, the content of the to field
  def other(segment)
    segment_name =
      (segment.kind_of?(RGFA::Line::Segment) ? segment.name : segment.to_sym)
    if segment_name == from.to_sym
      to
    elsif segment_name == to.to_sym
      from
    else
      raise RGFA::LineMissingError,
        "Link #{self} does not involve segment #{segment_name}"
    end
  end

  # @return [Boolean] are the from and to segments equal?
  def circular?
    from.to_sym == to.to_sym
  end

  # @return [Boolean] are the from and to segment ends equal, i.e. does
  #   the link connect a segment end to itself?
  def circular_same_end?
    from_end == to_end
  end

  # @return [RGFA::OrientedSegment] the oriented segment represented by the
  #   from/from_orient fields
  def oriented_from
    [from, from_orient].to_oriented_segment
  end

  # @return [RGFA::OrientedSegment] the oriented segment represented by the
  #   to/to_orient fields
  def oriented_to
    [to, to_orient].to_oriented_segment
  end

  # @return [RGFA::SegmentEnd] the segment end represented by the
  #   from/from_orient fields (E if the orientation is +, B otherwise)
  def from_end
    [from, from_orient == :+ ? :E : :B].to_segment_end
  end

  # @return [RGFA::SegmentEnd] the segment end represented by the
  #   to/to_orient fields (B if the orientation is +, E otherwise)
  def to_end
    [to, to_orient == :+ ? :B : :E].to_segment_end
  end

  # Signature of the segment ends, for debugging
  # @api private
  def segment_ends_s
    [from_end.to_s, to_end.to_s].join("---")
  end

  # @param segment_end [RGFA::SegmentEnd] one of the two segment ends
  #   of the link
  # @return [RGFA::SegmentEnd] the other segment end
  #
  # @raise [ArgumentError] if segment_end is not a valid segment end
  #   representation
  # @raise [RuntimeError] if segment_end is not a segment end of the link
  def other_end(segment_end)
    segment_end = segment_end.to_segment_end
    return to_end if from_end == segment_end
    return from_end if to_end == segment_end
    raise "Segment end '#{segment_end.inspect}' not found\n"+
          "(from=#{from_end.inspect};to=#{to_end.inspect})"
  end

  # The from segment name, in both cases where from is a segment name (Symbol)
  # or a segment (RGFA::Line::Segment)
  # @return [Symbol]
  def from_name
    from.to_sym
  end

  # The to segment name, in both cases where to is a segment name (Symbol)
  # or a segment (RGFA::Line::Segment)
  # @return [Symbol]
  def to_name
    to.to_sym
  end

  # Returns true if the link is normal, false otherwise
  #
  # == Definition of normal link
  #
  # Each link has an equivalent reverse link. Consider a link of A to B
  # with a overlap 1M1I2M:
  #
  #   from+ to to+ (1M1I2M) == to- to from- (2M1D1M)
  #   from- to to- (1M1I2M) == to+ to from+ (2M1D1M)
  #   from+ to to- (1M1I2M) == to+ to from- (2M1D1M)
  #   from- to to+ (1M1I2M) == to- to from+ (2M1D1M)
  #
  # Consider also the special case, where from == to and the overlap is not
  # specified, or equal to its reverse:
  #
  #   from+ to from+ (*) == from- to from- (*) # left has a +; right has no +
  #   from- to from- (*) == from+ to from+ (*) # left has no +; right has a +
  #   from+ to from- (*) == from+ to from- (*) # left == right
  #   from- to from+ (*) == from- to from+ (*) # left == right
  #
  # Thus we define a link as normal if:
  # - from < to (lexicographical comparison of segments)
  # - from == to and overlap.to_s < reverse_overlap.to_s
  # - from == to, overlap == reverse_overlap and at least one orientation is +
  #
  # @return [Boolean]
  #
  def normal?
    return true if from_name < to_name
    return false if from_name > to_name
    # same segment on both sides: decide by the string form of the overlaps
    overlap_s = overlap.to_s
    reverse_overlap_s = reverse_overlap.to_s
    return true if overlap_s < reverse_overlap_s
    return false if overlap_s > reverse_overlap_s
    # same overlap in both directions: decide by the orientations
    [from_orient, to_orient].include?(:+)
  end

  # Returns the unchanged link if the link is normal,
  # otherwise reverses the link and returns it.
  #
  # @note The path references are not corrected by this method; therefore
  #   the method shall be used before the link is embedded in a graph.
  #
  # @return [RGFA::Line::Link] self
  def normalize!
    reverse! unless normal?
    # always return self, also if the link was already normal
    # (a bare modifier-if would evaluate to nil in that case)
    self
  end

  # Creates a link with both strands of the sequences inverted.
  # The CIGAR operations (order/type) are inverted as well.
  # Optional fields are left unchanged.
  #
  # @note The path references are not copied to the reverse link.
  #
  # @note This method shall be overridden if custom optional fields
  #   are defined, which have a ``reverse'' operation which determines
  #   their value in the equivalent but reverse link.
  #
  # @return [RGFA::Line::Link] the inverted link.
  def reverse
    l = self.clone
    l.from = to
    l.from_orient = (to_orient == :+ ? :- : :+)
    l.to = from
    l.to_orient = (from_orient == :+ ? :- : :+)
    l.overlap = reverse_overlap
    l
  end

  # Reverses the link inplace, i.e. sets:
  #   from = to
  #   from_orient = other_orient(to_orient)
  #   to = from
  #   to_orient = other_orient(from_orient)
  #   overlap = reverse_overlap.
  #
  # The optional fields are left unchanged.
  #
  # @note The path references are not reversed by this method; therefore
  #   the method shall be used before the link is embedded in a graph.
  #
  # @note This method shall be overridden if custom optional fields
  #   are defined, which have a ``reverse'' operation which determines
  #   their value in the equivalent but reverse link.
  #
  # @return [RGFA::Line::Link] self
  def reverse!
    # swap the segments
    tmp = self.from
    self.from = self.to
    self.to = tmp
    # swap and invert the orientations
    tmp = self.from_orient
    self.from_orient = (self.to_orient == :+) ? :- : :+
    self.to_orient = (tmp == :+) ? :- : :+
    self.overlap = self.reverse_overlap
    self
  end

  # Paths for which the link is required.
  #
  # The return value is an empty array
  # if the link is not embedded in a graph.
  #
  # Otherwise, an array of tuples path/boolean is returned.
  # The boolean value tells
  # if the link is used in direct (true) or reverse direction (false)
  # in the path.
  # @return [Array<Array<(RGFA::Line::Path, Boolean)>>]
  def paths
    @paths ||= []
  end

  # Compute the overlap when the strand of both sequences is inverted.
  #
  # @return [RGFA::CIGAR]
  def reverse_overlap
    self.overlap.reverse
  end

  #
  # Compares two links and determine their equivalence.
  # Thereby, optional fields are not considered.
  #
  # @note Inverting the strand of both links and reversing
  #   the CIGAR operations (order/type), one obtains a
  #   reverse but equivalent link.
  #
  # @param other [RGFA::Line::Link] a link
  # @return [Boolean] are self and other equivalent?
  # @see #==
  # @see #same?
  # @see #reverse?
  def eql?(other)
    same?(other) || reverse?(other)
  end

  # Compares the optional fields of two links.
  #
  # The two links must have the same optional fieldnames and, for each
  # fieldname, the same value.
  #
  # @note This method shall be overridden if custom optional fields
  #   are defined, which have a ``reverse'' operation which determines
  #   their value in the equivalent but reverse link.
  #
  # @param other [RGFA::Line::Link] a link
  # @return [Boolean] are the optional fields of self and other equal?
  # @see #==
  def eql_optional?(other)
    return false if optional_fieldnames.sort != other.optional_fieldnames.sort
    # all? is required here: each would return the (always truthy) array of
    # fieldnames and thus ignore the results of the value comparisons
    optional_fieldnames.all? {|fn| get(fn) == other.get(fn)}
  end

  # Compares two links and determine their equivalence.
  # Optional fields must have the same content.
  #
  # @note Inverting the strand of both links and reversing
  #   the CIGAR operations (order/type), one obtains an equivalent
  #   link.
  #
  # @param other [RGFA::Line::Link] a link
  # @return [Boolean] are self and other equivalent?
  # @see #eql?
  # @see #eql_optional?
  #def ==(other)
  #  eql?(other) and eql_optional?(other)
  #end

  # Compares two links and determine their equivalence.
  # Thereby, optional fields are not considered.
  #
  # @param other [RGFA::Line::Link] a link
  # @return [Boolean] are self and other equivalent?
  # @see #eql?
  # @see #reverse?
  # @see #==
  def same?(other)
    from_end == other.from_end &&
      to_end == other.to_end &&
      overlap == other.overlap
  end

  # Compares the reverse of the link to another link
  # and determine their equivalence.
  # Thereby, optional fields are not considered.
  #
  # @param other [RGFA::Line::Link] the other link
  # @return [Boolean] are the reverse of self and other equivalent?
  # @see #eql?
  # @see #same?
  # @see #==
  def reverse?(other)
    from_end == other.to_end &&
      to_end == other.from_end &&
      overlap == other.reverse_overlap
  end

  # Computes an hash for including a link in an Hash tables,
  # so that the hash of a link and its reverse is the same.
  # Thereby, optional fields are not considered.
  #
  # The sum is symmetric in the two segment ends and in the two overlaps
  # (direct and reverse); both overlaps are hashed through their string
  # representation, as the two terms exchange their roles in the reverse
  # link and thus must be computed in the same way.
  # @return [Integer]
  # @see #eql?
  def hash
    from_end.hash + to_end.hash +
      overlap.to_s.hash + reverse_overlap.to_s.hash
  end

  # Compares a link and optionally the reverse link,
  # with two oriented_segments and optionally an overlap.
  # @param [RGFA::OrientedSegment] other_oriented_from
  # @param [RGFA::OrientedSegment] other_oriented_to
  # @param equivalent [Boolean] shall the reverse link also be considered?
  # @param [RGFA::CIGAR] other_overlap compared only if not empty
  # @return [Boolean] does the link or, if +equivalent+,
  #   the reverse link go from the first
  #   oriented segment to the second with an overlap equal to the provided one
  #   (if not empty)?
  def compatible?(other_oriented_from, other_oriented_to, other_overlap = [],
                  equivalent = true)
    other_overlap = other_overlap.to_cigar
    if compatible_direct?(other_oriented_from, other_oriented_to,
                          other_overlap)
      return true
    end
    return false unless equivalent
    compatible_reverse?(other_oriented_from, other_oriented_to,
                        other_overlap)
  end

  # Compares a link with two oriented segments and optionally an overlap.
  # @param [RGFA::OrientedSegment] other_oriented_from
  # @param [RGFA::OrientedSegment] other_oriented_to
  # @param [RGFA::CIGAR] other_overlap compared only if not empty
  # @return [Boolean] does the link go from the first
  #   oriented segment to the second with an overlap equal to the provided one
  #   (if not empty)?
  def compatible_direct?(other_oriented_from, other_oriented_to,
                         other_overlap = [])
    (oriented_from == other_oriented_from &&
     oriented_to == other_oriented_to) &&
      (overlap.empty? || other_overlap.empty? || (overlap == other_overlap))
  end

  # Compares the reverse link with two oriented segments and optionally an
  # overlap.
  #
  # The overlap of the reverse link is {#reverse_overlap}; this is
  # the value, which is compared to the provided overlap.
  # @param [RGFA::OrientedSegment] other_oriented_from
  # @param [RGFA::OrientedSegment] other_oriented_to
  # @param [RGFA::CIGAR] other_overlap compared only if not empty
  # @return [Boolean] does the reverse link go from the first
  #   oriented segment to the second with an overlap equal to the provided one
  #   (if not empty)?
  def compatible_reverse?(other_oriented_from, other_oriented_to,
                          other_overlap = [])
    (oriented_to == other_oriented_from.invert_orient &&
     oriented_from == other_oriented_to.invert_orient) &&
      (overlap.empty? || other_overlap.empty? ||
       (reverse_overlap == other_overlap))
  end

end