rgfa 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. checksums.yaml +7 -0
  2. data/bin/gfadiff.rb +420 -0
  3. data/bin/rgfa-findcrisprs.rb +208 -0
  4. data/bin/rgfa-mergelinear.rb +14 -0
  5. data/bin/rgfa-simdebruijn.rb +86 -0
  6. data/lib/rgfa.rb +376 -0
  7. data/lib/rgfa/byte_array.rb +74 -0
  8. data/lib/rgfa/cigar.rb +157 -0
  9. data/lib/rgfa/connectivity.rb +131 -0
  10. data/lib/rgfa/containments.rb +97 -0
  11. data/lib/rgfa/error.rb +3 -0
  12. data/lib/rgfa/field_array.rb +87 -0
  13. data/lib/rgfa/field_parser.rb +109 -0
  14. data/lib/rgfa/field_validator.rb +241 -0
  15. data/lib/rgfa/field_writer.rb +108 -0
  16. data/lib/rgfa/headers.rb +76 -0
  17. data/lib/rgfa/line.rb +721 -0
  18. data/lib/rgfa/line/containment.rb +87 -0
  19. data/lib/rgfa/line/header.rb +92 -0
  20. data/lib/rgfa/line/link.rb +379 -0
  21. data/lib/rgfa/line/path.rb +106 -0
  22. data/lib/rgfa/line/segment.rb +209 -0
  23. data/lib/rgfa/linear_paths.rb +285 -0
  24. data/lib/rgfa/lines.rb +155 -0
  25. data/lib/rgfa/links.rb +242 -0
  26. data/lib/rgfa/logger.rb +192 -0
  27. data/lib/rgfa/multiplication.rb +156 -0
  28. data/lib/rgfa/numeric_array.rb +196 -0
  29. data/lib/rgfa/paths.rb +98 -0
  30. data/lib/rgfa/rgl.rb +194 -0
  31. data/lib/rgfa/segment_ends_path.rb +9 -0
  32. data/lib/rgfa/segment_info.rb +162 -0
  33. data/lib/rgfa/segments.rb +99 -0
  34. data/lib/rgfa/sequence.rb +65 -0
  35. data/lib/rgfatools.rb +102 -0
  36. data/lib/rgfatools/artifacts.rb +29 -0
  37. data/lib/rgfatools/copy_number.rb +126 -0
  38. data/lib/rgfatools/invertible_segments.rb +104 -0
  39. data/lib/rgfatools/linear_paths.rb +140 -0
  40. data/lib/rgfatools/multiplication.rb +194 -0
  41. data/lib/rgfatools/p_bubbles.rb +66 -0
  42. data/lib/rgfatools/superfluous_links.rb +64 -0
  43. metadata +97 -0
@@ -0,0 +1,74 @@
1
+ require_relative "error.rb"
2
+
3
#
# Array of integers in the range 0..255 (bytes);
# representation of the data contained in an H field
#
class RGFA::ByteArray < Array

  # Validates the byte array content
  # @raise [RGFA::ByteArray::ValueError] if any value is not an
  #   integer in the range 0..255
  # @return [void]
  def validate!
    each do |x|
      unless x.kind_of?(Integer) and (0..255).include?(x)
        raise RGFA::ByteArray::ValueError,
          "Value incompatible with byte array: #{x.inspect}\n"+
          "in array: #{self.inspect}"
      end
    end
    # Object#trust is part of the taint mechanism, which does not exist
    # anymore in recent Ruby versions; call it only where it is available,
    # so that validation does not fail with a NoMethodError elsewhere.
    trust if respond_to?(:trust)
    return nil
  end

  # Returns self
  # @return [RGFA::ByteArray] self
  def to_byte_array
    self
  end

  # GFA datatype H representation of the byte array:
  # each element is written as two uppercase hexadecimal digits
  # @raise [RGFA::ByteArray::ValueError] if the
  #   array is not a valid byte array
  # @return [String]
  def to_s
    validate!
    map {|elem| "%02X" % elem}.join
  end

end
44
+
45
# Error raised when an element of a byte array is not an integer
# in the range 0..255
class RGFA::ByteArray::ValueError < RGFA::Error
end
47
+
48
# Error raised when a string cannot be interpreted as the
# representation of a byte array
class RGFA::ByteArray::FormatError < RGFA::Error
end
50
+
51
# Extension of Array: conversion to RGFA::ByteArray
class Array
  # Build a RGFA::ByteArray containing the elements of the array.
  # The content is not validated by this method.
  # @return [RGFA::ByteArray] the byte array
  def to_byte_array
    return RGFA::ByteArray.new(self)
  end
end
59
+
60
# Method to parse the string representation of a RGFA::ByteArray
class String
  # Convert a GFA string representation of a byte array to a byte array.
  #
  # The whole string must consist of pairs of hexadecimal digits
  # (lowercase digits are accepted as well); signs, whitespace, newlines
  # or any other character are rejected instead of being silently
  # interpreted by the integer parser.
  #
  # @return [RGFA::ByteArray] the byte array
  # @raise [RGFA::ByteArray::FormatError] if the string is empty, has an
  #   odd size or contains characters other than hexadecimal digits
  def to_byte_array
    # \A and \z anchor the whole string (^ and $ would match per line)
    if self !~ /\A(?:[0-9A-Fa-f]{2})+\z/
      raise RGFA::ByteArray::FormatError,
        "Invalid byte array string #{self}; "+
        "each element must be represented by two letters [0-9A-F]"
    end
    RGFA::ByteArray.new(scan(/../).map {|pair| Integer(pair, 16)})
  end
end
@@ -0,0 +1,157 @@
1
+ require_relative "error.rb"
2
+
3
# Array of {RGFA::CIGAR::Operation CIGAR operations}.
# Represents the contents of a CIGAR string.
class RGFA::CIGAR < Array

  # Compute the CIGAR for the segments in reverse direction.
  #
  # The order of the operations is reversed and insertions (I) and
  # deletions (D) are exchanged. The receiver and its operations are
  # not modified: the result contains copies of the operations.
  #
  # @example Reversing a CIGAR
  #
  #   RGFA::CIGAR.from_string("2M1D3M").reverse.to_s
  #   # => "3M1I2M"
  #
  #   # S1 + S2 + 2M1D3M
  #   #
  #   # S1+  ACGACTGTGA
  #   # S2+      CT-TGACGG
  #   #
  #   # S2-  CCGTCA-AG
  #   # S1-     TCACAGTCGT
  #   #
  #   # S2 - S1 - 3M1I2M
  #
  # @return [RGFA::CIGAR] (empty if CIGAR string is *)
  def reverse
    reversed = super.map do |op|
      # work on a copy, as the elements of the reversed array are the
      # same objects contained in the receiver
      copy = op.dup
      case copy.code
      when :I then copy.code = :D
      when :D then copy.code = :I
      end
      copy
    end
    # map returns a plain Array: wrap it, so that e.g. to_s
    # outputs a CIGAR string
    RGFA::CIGAR.new(reversed)
  end

  # Parse a CIGAR string into an array of CIGAR operations.
  #
  # Each operation is represented by a {RGFA::CIGAR::Operation},
  # i.e. a tuple of operation length and operation
  # symbol (one of MIDNSHPX=).
  #
  # @param str [String] a CIGAR string or the placeholder "*"
  # @return [RGFA::CIGAR] (empty if string is *)
  # @raise [RGFA::CIGAR::ValueError] if the string is not a valid CIGAR string
  def self.from_string(str)
    a = RGFA::CIGAR.new
    if str != "*"
      # \A and \z anchor the whole string; ^ and $ would accept a
      # multi-line string in which only a single line is a valid CIGAR
      if str !~ /\A([0-9]+[MIDNSHPX=])+\z/
        raise RGFA::CIGAR::ValueError,
          "Invalid CIGAR string: #{str.inspect}"
      end
      str.scan(/[0-9]+[MIDNSHPX=]/).each do |op|
        len = op[0..-2].to_i
        code = op[-1..-1].to_sym
        a << RGFA::CIGAR::Operation.new(len, code)
      end
    end
    return a
  end

  # String representation of the CIGAR
  # @return [String] CIGAR string ("*" if the array is empty)
  def to_s
    empty? ? "*" : map(&:to_s).join
  end

  # Validate the instance: each element is converted to a CIGAR
  # operation (using +to_cigar_operation+) and validated.
  # @raise if any component of the CIGAR array is invalid.
  # @return [void]
  def validate!
    each do |op|
      op.to_cigar_operation.validate!
    end
    return nil
  end

  # @return [RGFA::CIGAR] self
  def to_cigar
    self
  end

  # Create a copy, in which each element is a clone of the
  # corresponding element of the receiver
  # @return [RGFA::CIGAR]
  def clone
    RGFA::CIGAR.new(map {|x| x.clone})
  end

end
88
+
89
# Error raised when the content of a CIGAR string or of a
# CIGAR operation is invalid
class RGFA::CIGAR::ValueError < RGFA::Error
end
91
+
92
# An operation in a CIGAR string
class RGFA::CIGAR::Operation
  attr_accessor :len
  attr_accessor :code

  # CIGAR operation code
  CODE = [:M, :I, :D, :N, :S, :H, :P, :X, :"="].freeze

  # @param len [Integer] length of the operation
  # @param code [RGFA::CIGAR::Operation::CODE] code of the operation
  def initialize(len, code)
    @len = len
    @code = code
  end

  # The string representation of the operation
  # @return [String]
  def to_s
    "#{len}#{code}"
  end

  # Compare two operations; an object which does not provide both
  # +len+ and +code+ is never equal to an operation.
  # @return [Boolean]
  def ==(other)
    return false unless other.respond_to?(:len) and other.respond_to?(:code)
    other.len == len and other.code == code
  end

  # Validate the operation
  # @return [void]
  # @raise [RGFA::CIGAR::ValueError] if the code is invalid or the length is not
  #   an integer larger than zero
  def validate!
    begin
      valid_len = Integer(len) > 0
    rescue ArgumentError, TypeError
      # Integer() fails for values which cannot be interpreted as integer;
      # report them using the documented exception class
      valid_len = false
    end
    unless valid_len and RGFA::CIGAR::Operation::CODE.include?(code)
      raise RGFA::CIGAR::ValueError,
        "Invalid CIGAR operation: "+
        "len=#{len.inspect}, code=#{code.inspect}"
    end
    return nil
  end

  # @return [RGFA::CIGAR::Operation] self
  def to_cigar_operation
    self
  end
end
135
+
136
# Extension of Array: conversion to CIGAR and CIGAR operation
class Array
  # Build a {RGFA::CIGAR} instance containing the elements of the array.
  # @return [RGFA::CIGAR]
  def to_cigar
    return RGFA::CIGAR.new(self)
  end

  # Build a {RGFA::CIGAR::Operation} instance from the array;
  # the first element is interpreted as operation length (converted
  # using Integer()), the second as operation code (converted to symbol).
  # @return [RGFA::CIGAR::Operation]
  def to_cigar_operation
    operation_class = RGFA::CIGAR::Operation
    operation_class.new(Integer(first), at(1).to_sym)
  end
end
148
+
149
# Extension of String: parsing of CIGAR strings
class String
  # Interpret the string as CIGAR string; the parsing is delegated
  # to {RGFA::CIGAR.from_string}.
  # @return [RGFA::CIGAR] CIGAR operations (empty if string is "*")
  # @raise [RGFA::CIGAR::ValueError] if the string is not a valid CIGAR string
  def to_cigar
    return RGFA::CIGAR.from_string(self)
  end
end
157
+
@@ -0,0 +1,131 @@
1
#
# Methods which analyse the connectivity of the graph.
#
module RGFA::Connectivity

  require "set"

  # Computes the connectivity of a segment from its number of links.
  #
  # @param segment [String|RGFA::Line::Segment] segment name or instance
  #
  # @return [Array<conn_symbol,conn_symbol>]
  #  conn. symbols respectively of the :B and :E ends of +segment+.
  #
  # <b>Connectivity symbol:</b> (+conn_symbol+)
  # - Let _n_ be the number of links to an end (+:B+ or +:E+) of a segment.
  #   Then the connectivity symbol is +:M+ if <i>n > 1</i>, otherwise _n_.
  #
  def connectivity(segment)
    connectivity_symbols(links_of([segment, :B]).size,
                         links_of([segment, :E]).size)
  end

  # Does the removal of the link alone divide a component
  # of the graph into two?
  #
  # A circular link is never a cut link. If the opposite end of one of the
  # two linked segments has no links, the link is a cut link. Otherwise the
  # segments reachable from each of the two ends of the link (without
  # passing through the two linked segments) are collected and compared.
  #
  # @return [Boolean]
  # @param link [RGFA::Line::Link] a link
  def cut_link?(link)
    return false if link.circular?
    return true if links_of(link.from_end.invert_end_type).size == 0
    return true if links_of(link.to_end.invert_end_type).size == 0
    c = {}
    [:from, :to].each do |et|
      c[et] = Set.new
      visited = Set.new
      segend = link.send(:"#{et}_end")
      # both segments of the link are excluded from the traversal
      visited << segend.name
      visited << link.other_end(segend).name
      traverse_component(segend, c[et], visited)
    end
    return c[:from] != c[:to]
  end

  # Does the removal of the segment and its links divide a
  # component of the graph into two?
  #
  # A segment whose connectivity is [0,0], [0,1] or [1,0] is never a cut
  # segment. Otherwise, for each distinct neighbouring segment end, the set
  # of segments reachable without passing through the segment is collected;
  # the segment is a cut segment if these sets are not all equal.
  #
  # @param segment [String, RGFA::Line::Segment] a segment name or instance
  # @return [Boolean]
  def cut_segment?(segment)
    segment_name = segment.kind_of?(RGFA::Line) ? segment.name : segment
    cn = connectivity(segment_name)
    return false if [[0,0],[0,1],[1,0]].include?(cn)
    start_points = []
    [:B, :E].each do |et|
      start_points += links_of([segment_name, et]).map do |l|
        l.other_end([segment_name, et]).invert_end_type
      end
    end
    cc = []
    start_points.uniq.each do |start_point|
      cc << Set.new
      visited = Set.new
      visited << segment_name
      traverse_component(start_point, cc.last, visited)
    end
    return cc.any?{|c|c != cc[0]}
  end

  # Find the connected component of the graph in which a segment is included
  # @return [Array<String>]
  #   array of segment names
  # @param segment [String, RGFA::Line::Segment] a segment name or instance
  # @param visited [Set<String>] a set of segments to ignore during graph
  #   traversal; all segments in the found component will be added to it
  def segment_connected_component(segment, visited = Set.new)
    segment_name = segment.kind_of?(RGFA::Line) ? segment.name : segment
    visited << segment_name
    c = [segment_name]
    traverse_component([segment_name, :B], c, visited)
    traverse_component([segment_name, :E], c, visited)
    return c
  end

  # Find the connected components of the graph
  # @return [Array<Array<String>>]
  #   array of components, each an array of segment names
  def connected_components
    components = []
    visited = Set.new
    segment_names.each do |sn|
      next if visited.include?(sn)
      components << segment_connected_component(sn, visited)
    end
    return components
  end

  # Split connected components of the graph into single-component RGFAs;
  # each of them is a clone of the graph, from which the segments
  # of the other components are removed.
  # @return [Array<RGFA>]
  def split_connected_components
    retval = []
    ccs = connected_components
    ccs.each do |cc|
      gfa2 = self.clone
      gfa2.rm(gfa2.segment_names - cc)
      retval << gfa2
    end
    return retval
  end

  private

  # Depth-first traversal of the graph, starting from a segment end.
  #
  # The name of each segment which is reached and is not yet contained in
  # +visited+ is added to both +visited+ and +c+.
  #
  # The traversal uses an explicit stack instead of recursion, so that
  # the depth of the call stack does not grow with the size of the
  # component. The segments are visited in the order of a recursive
  # traversal, which for each new segment first explores its :B end,
  # then its :E end, and then continues with the next link.
  #
  # @param segment_end [Array, RGFA::SegmentEnd] the starting segment end
  # @param c [Array, Set] collection to which the segment names are added
  # @param visited [Set] names of the segments to ignore
  # @return [void]
  def traverse_component(segment_end, c, visited)
    # each frame is [segment end, its links (computed lazily), next index]
    stack = [[segment_end, nil, 0]]
    until stack.empty?
      frame = stack.last
      frame[1] ||= links_of(frame[0]).to_a
      if frame[2] >= frame[1].size
        stack.pop
        next
      end
      l = frame[1][frame[2]]
      frame[2] += 1
      sn = l.other_end(frame[0]).name
      next if visited.include?(sn)
      visited << sn
      c << sn
      # the :B frame is pushed last, so that it is processed first
      stack << [[sn, :E], nil, 0]
      stack << [[sn, :B], nil, 0]
    end
    return nil
  end

  # Connectivity symbols for a pair of numbers of links
  def connectivity_symbols(n,m)
    [connectivity_symbol(n), connectivity_symbol(m)]
  end

  # Connectivity symbol for a number of links:
  # +:M+ if larger than one, otherwise the number itself
  def connectivity_symbol(n)
    n > 1 ? :M : n
  end

end
@@ -0,0 +1,97 @@
1
+ require_relative "error"
2
+
3
#
# Methods for the RGFA class, which allow to handle containments in the graph.
#
module RGFA::Containments

  # Add a containment line to the graph.
  #
  # The line is added to the list of containments and to the containments
  # of its +from+ and +to+ segments; the +from+ and +to+ fields of the line
  # are set to the segment instances. If a segment does not exist yet,
  # a virtual segment is created, unless +@segments_first_order+ is set.
  #
  # @param gfa_line [#to_rgfa_line] the containment line
  # @raise [RGFA::LineMissingError] if +@segments_first_order+ is set
  #   and a segment of the containment does not exist; in this case
  #   the graph is not modified
  def add_containment(gfa_line)
    gfa_line = gfa_line.to_rgfa_line(validate: @validate)
    if @segments_first_order
      # check both segments before changing anything, so that a failure
      # does not leave a partially registered containment in the graph
      [:from, :to].each do |dir|
        segment_name = gfa_line.send(dir)
        if !@segments.has_key?(segment_name)
          raise RGFA::LineMissingError,
            "Containment refers to a missing segment: #{segment_name}"
        end
      end
    end
    @containments << gfa_line
    [:from, :to].each do |dir|
      segment_name = gfa_line.send(dir)
      orient = gfa_line.send(:"#{dir}_orient")
      if !@segments.has_key?(segment_name)
        @segments[segment_name] =
          RGFA::Line::Segment.new({:name => segment_name},
                                  virtual: true)
      end
      s = @segments[segment_name]
      s.containments[dir][orient] << gfa_line
      gfa_line.send(:"#{dir}=", s)
    end
  end
  protected :add_containment

  # Delete a containment
  #
  # @param c [RGFA::Line::Containment] containment instance
  # @return [RGFA] self
  def delete_containment(c)
    @containments.delete(c)
    segment(c.from).containments[:from][c.from_orient].delete(c)
    segment(c.to).containments[:to][c.to_orient].delete(c)
    return self
  end

  # All containments in the graph
  # @return [Array<RGFA::Line::Containment>]
  def containments
    @containments
  end

  # Find containment lines whose +from+ segment is the segment +s+
  # @!macro segment_or_name
  # @return [Array<RGFA::Line::Containment>]
  def contained_in(s)
    s = segment!(s)
    s.containments[:from][:+] + s.containments[:from][:-]
  end

  # Find containment lines whose +to+ segment is the segment +s+
  # @return [Array<RGFA::Line::Containment>]
  # @!macro segment_or_name
  def containing(s)
    s = segment!(s)
    s.containments[:to][:+] + s.containments[:to][:-]
  end

  # Searches all containments of +contained+ in +container+.
  # Returns a possibly empty array of containments.
  #
  # @return [Array<RGFA::Line::Containment>]
  # @!macro [new] container_contained
  #   @param container [RGFA::Line::Segment, Symbol] a segment instance or name
  #   @param contained [RGFA::Line::Segment, Symbol] a segment instance or name
  #
  def containments_between(container, contained)
    contained_in(container).select {|l| l.to.to_sym == contained.to_sym }
  end

  # Searches a containment of +contained+ in +container+.
  # Returns the first containment found or nil if none found.
  #
  # @return [RGFA::Line::Containment, nil]
  # @!macro container_contained
  def containment(container, contained)
    contained_in(container).find {|l| l.to.to_sym == contained.to_sym }
  end

  # Searches a containment of +contained+ in +container+.
  # Raises an exception if no such containment was found.
  #
  # @return [RGFA::Line::Containment]
  # @raise [RGFA::LineMissingError] if no such containment found
  # @!macro container_contained
  def containment!(container, contained)
    c = containment(container, contained)
    raise RGFA::LineMissingError, "No containment was found" if c.nil?
    c
  end

end