rgfa 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/gfadiff.rb +420 -0
- data/bin/rgfa-findcrisprs.rb +208 -0
- data/bin/rgfa-mergelinear.rb +14 -0
- data/bin/rgfa-simdebruijn.rb +86 -0
- data/lib/rgfa.rb +376 -0
- data/lib/rgfa/byte_array.rb +74 -0
- data/lib/rgfa/cigar.rb +157 -0
- data/lib/rgfa/connectivity.rb +131 -0
- data/lib/rgfa/containments.rb +97 -0
- data/lib/rgfa/error.rb +3 -0
- data/lib/rgfa/field_array.rb +87 -0
- data/lib/rgfa/field_parser.rb +109 -0
- data/lib/rgfa/field_validator.rb +241 -0
- data/lib/rgfa/field_writer.rb +108 -0
- data/lib/rgfa/headers.rb +76 -0
- data/lib/rgfa/line.rb +721 -0
- data/lib/rgfa/line/containment.rb +87 -0
- data/lib/rgfa/line/header.rb +92 -0
- data/lib/rgfa/line/link.rb +379 -0
- data/lib/rgfa/line/path.rb +106 -0
- data/lib/rgfa/line/segment.rb +209 -0
- data/lib/rgfa/linear_paths.rb +285 -0
- data/lib/rgfa/lines.rb +155 -0
- data/lib/rgfa/links.rb +242 -0
- data/lib/rgfa/logger.rb +192 -0
- data/lib/rgfa/multiplication.rb +156 -0
- data/lib/rgfa/numeric_array.rb +196 -0
- data/lib/rgfa/paths.rb +98 -0
- data/lib/rgfa/rgl.rb +194 -0
- data/lib/rgfa/segment_ends_path.rb +9 -0
- data/lib/rgfa/segment_info.rb +162 -0
- data/lib/rgfa/segments.rb +99 -0
- data/lib/rgfa/sequence.rb +65 -0
- data/lib/rgfatools.rb +102 -0
- data/lib/rgfatools/artifacts.rb +29 -0
- data/lib/rgfatools/copy_number.rb +126 -0
- data/lib/rgfatools/invertible_segments.rb +104 -0
- data/lib/rgfatools/linear_paths.rb +140 -0
- data/lib/rgfatools/multiplication.rb +194 -0
- data/lib/rgfatools/p_bubbles.rb +66 -0
- data/lib/rgfatools/superfluous_links.rb +64 -0
- metadata +97 -0
@@ -0,0 +1,74 @@
|
|
1
|
+
require_relative "error.rb"
|
2
|
+
|
3
|
+
#
|
4
|
+
# Array of integers in the range 0..255;
|
5
|
+
# representation of the data contained in an H field
|
6
|
+
#
|
7
|
+
class RGFA::ByteArray < Array

  # Validates the byte array content.
  #
  # @raise [RGFA::ByteArray::ValueError] if any value is not an
  #   Integer in the range 0..255
  # @return [void]
  def validate!
    each do |x|
      unless x.kind_of?(Integer) && (0..255).include?(x)
        raise RGFA::ByteArray::ValueError,
          "Value incompatible with byte array: #{x.inspect}\n"+
          "in array: #{self.inspect}"
      end
    end
    # Object#trust is part of the old taint mechanism and was removed in
    # Ruby 3.2; calling it unconditionally raised NoMethodError there for
    # every valid byte array. Call it only where it still exists.
    trust if respond_to?(:trust)
    return nil
  end

  # Returns self
  # @return [RGFA::ByteArray] self
  def to_byte_array
    self
  end

  # GFA datatype H representation of the byte array:
  # each value is written as two uppercase hexadecimal digits.
  # @raise [RGFA::ByteArray::ValueError] if the
  #   array is not a valid byte array
  # @return [String]
  def to_s
    validate!
    map { |elem| format("%02X", elem) }.join
  end

end
|
44
|
+
|
45
|
+
# Exception raised if any value is not an integer in the range 0..255
|
46
|
+
class RGFA::ByteArray::ValueError < RGFA::Error
end
|
47
|
+
|
48
|
+
# Exception raised if string is not a valid representation of byte array
|
49
|
+
class RGFA::ByteArray::FormatError < RGFA::Error
end
|
50
|
+
|
51
|
+
# Method to create a RGFA::ByteArray from an Array
|
52
|
+
class Array
  # Wrap the content of this array in a RGFA::ByteArray.
  # The elements are copied as they are; no validation is done here.
  # @return [RGFA::ByteArray] a byte array with the same elements
  def to_byte_array
    RGFA::ByteArray.new(self)
  end
end
|
59
|
+
|
60
|
+
# Method to parse the string representation of a RGFA::ByteArray
|
61
|
+
class String
  # Convert a GFA string representation of a byte array to a byte array.
  #
  # The string must consist of one or more pairs of hexadecimal digits.
  # Lowercase digits are accepted, as before.
  #
  # @return [RGFA::ByteArray] the byte array
  # @raise [RGFA::ByteArray::FormatError] if the string is empty, has an
  #   odd length, or contains anything other than hexadecimal digits
  def to_byte_array
    # Validate the whole string up front: Integer(x, 16) alone would
    # accept signed or space-padded pairs such as "-1", "+F" or " 1",
    # and would raise ArgumentError instead of the documented
    # FormatError for non-hex characters.
    if self !~ /\A(?:[0-9A-Fa-f]{2})+\z/
      raise RGFA::ByteArray::FormatError,
        "Invalid byte array string #{self}; "+
        "each element must be represented by two letters [0-9A-F]"
    end
    scan(/../).map { |pair| Integer(pair, 16) }.to_byte_array
  end
end
|
data/lib/rgfa/cigar.rb
ADDED
@@ -0,0 +1,157 @@
|
|
1
|
+
require_relative "error.rb"
|
2
|
+
|
3
|
+
# Array of {RGFA::CIGAR::Operation CIGAR operations}.
|
4
|
+
# Represents the contents of a CIGAR string.
|
5
|
+
class RGFA::CIGAR < Array

  # Compute the CIGAR for the segments in reverse direction.
  #
  # The order of the operations is reversed and insertions (I) and
  # deletions (D) are exchanged. The receiver and its operations are
  # left untouched: the result contains copies of the operations.
  #
  # @example Reversing a CIGAR
  #
  #   RGFA::CIGAR.from_string("2M1D3M").reverse.to_s
  #   # => "3M1I2M"
  #
  #   # S1 + S2 + 2M1D3M
  #   #
  #   # S1+  ACGACTGTGA
  #   # S2+      CT-TGACGG
  #   #
  #   # S2-  CCGTCA-AG
  #   # S1-     TCACAGTCGT
  #   #
  #   # S2 - S1 - 3M1I2M
  #
  # @return [RGFA::CIGAR] (empty if CIGAR string is *)
  def reverse
    reversed = RGFA::CIGAR.new
    reverse_each do |op|
      # Work on a copy: changing op.code in place would also flip the
      # operation inside the receiver, which shares the same objects.
      copy = op.dup
      if op.code == :I
        copy.code = :D
      elsif op.code == :D
        copy.code = :I
      end
      reversed << copy
    end
    reversed
  end

  # Parse a CIGAR string into an array of CIGAR operations.
  #
  # Each operation is represented by a {RGFA::CIGAR::Operation},
  # i.e. a tuple of operation length and operation
  # symbol (one of MIDNSHPX=).
  #
  # @param str [String] a CIGAR string, or "*" for an unspecified CIGAR
  # @return [RGFA::CIGAR] (empty if string is *)
  # @raise [RGFA::CIGAR::ValueError] if the string is not a valid CIGAR string
  def self.from_string(str)
    a = RGFA::CIGAR.new
    if str != "*"
      # \A and \z anchor the whole string; ^ and $ only anchor a line, so
      # a valid first line followed by arbitrary text would be accepted.
      if str !~ /\A([0-9]+[MIDNSHPX=])+\z/
        raise RGFA::CIGAR::ValueError,
          "Invalid CIGAR string: #{str.inspect}"
      end
      str.scan(/[0-9]+[MIDNSHPX=]/).each do |op|
        len = op[0..-2].to_i
        code = op[-1..-1].to_sym
        a << RGFA::CIGAR::Operation.new(len, code)
      end
    end
    return a
  end

  # String representation of the CIGAR
  # @return [String] CIGAR string ("*" if there are no operations)
  def to_s
    empty? ? "*" : map(&:to_s).join
  end

  # Validate the instance
  # @raise if any component of the CIGAR array is invalid.
  # @return [void]
  def validate!
    # each (not any?) so that every operation is checked regardless of
    # what the element's validate! happens to return
    each { |op| op.to_cigar_operation.validate! }
    nil
  end

  # @return [RGFA::CIGAR] self
  def to_cigar
    self
  end

  # Create a copy; the operations are cloned as well.
  # @return [RGFA::CIGAR]
  def clone
    RGFA::CIGAR.new(map { |x| x.clone })
  end

end
|
88
|
+
|
89
|
+
# Exception raised by invalid CIGAR string content
|
90
|
+
class RGFA::CIGAR::ValueError < RGFA::Error
end
|
91
|
+
|
92
|
+
# An operation in a CIGAR string
|
93
|
+
class RGFA::CIGAR::Operation
  attr_accessor :len
  attr_accessor :code

  # CIGAR operation code
  CODE = [:M, :I, :D, :N, :S, :H, :P, :X, :"="].freeze

  # @param len [Integer] length of the operation
  # @param code [RGFA::CIGAR::Operation::CODE] code of the operation
  def initialize(len, code)
    @len = len
    @code = code
  end

  # The string representation of the operation
  # (length immediately followed by the code)
  # @return [String]
  def to_s
    "#{len}#{code}"
  end

  # Compare two operations
  # @param other [Object] the object to compare with
  # @return [Boolean] true if +other+ has the same length and code;
  #   false otherwise, also for objects which are not operations
  def ==(other)
    # objects without len/code (nil, strings, ...) are simply not equal
    return false unless other.respond_to?(:len) && other.respond_to?(:code)
    other.len == len && other.code == code
  end

  # Validate the operation
  # @return [void]
  # @raise [RGFA::CIGAR::ValueError] if the code is invalid or the length is not
  #   an integer larger than zero
  def validate!
    begin
      length = Integer(len)
    rescue ArgumentError, TypeError, RangeError
      # Integer() signals unconvertible input with these errors;
      # report them as the documented validation error instead
      raise RGFA::CIGAR::ValueError,
        "Invalid CIGAR operation length: #{len.inspect}"
    end
    if length <= 0 || !RGFA::CIGAR::Operation::CODE.include?(code)
      raise RGFA::CIGAR::ValueError
    end
    nil
  end

  # @return [RGFA::CIGAR::Operation] self
  def to_cigar_operation
    self
  end
end
|
135
|
+
|
136
|
+
class Array
  # Build a {RGFA::CIGAR} holding the elements of this array.
  # @return [RGFA::CIGAR]
  def to_cigar
    RGFA::CIGAR.new(self)
  end

  # Build a {RGFA::CIGAR::Operation} from a two-element array:
  # the first element is converted to the operation length with
  # Integer(), the second to the operation code with to_sym.
  # @return [RGFA::CIGAR::Operation]
  def to_cigar_operation
    RGFA::CIGAR::Operation.new(Integer(at(0)), at(1).to_sym)
  end
end
|
148
|
+
|
149
|
+
class String
  # Interpret the string as a CIGAR string; the parsing itself
  # is delegated to {RGFA::CIGAR.from_string}.
  # @return [RGFA::CIGAR] CIGAR operations (empty if string is "*")
  # @raise [RGFA::CIGAR::ValueError] if the string is not a valid CIGAR string
  def to_cigar
    cigar_string = self
    RGFA::CIGAR.from_string(cigar_string)
  end
end
|
157
|
+
|
@@ -0,0 +1,131 @@
|
|
1
|
+
#
|
2
|
+
# Methods which analyse the connectivity of the graph.
|
3
|
+
#
|
4
|
+
module RGFA::Connectivity

  require "set"

  # Connectivity of a segment, derived from the number of links
  # at each of its two ends.
  #
  # @param segment [String|RGFA::Line::Segment] segment name or instance
  #
  # @return [Array<conn_symbol,conn_symbol>]
  #  conn. symbols respectively of the :B and :E ends of +segment+.
  #
  # <b>Connectivity symbol:</b> (+conn_symbol+)
  # - Let _n_ be the number of links to an end (+:B+ or +:E+) of a segment.
  #   Then the connectivity symbol is +:M+ if <i>n > 1</i>, otherwise _n_.
  #
  def connectivity(segment)
    n_begin = links_of([segment, :B]).size
    n_end = links_of([segment, :E]).size
    connectivity_symbols(n_begin, n_end)
  end

  # Tells whether removing the link alone splits a component
  # of the graph into two.
  # @return [Boolean]
  # @param link [RGFA::Line::Link] a link
  def cut_link?(link)
    return false if link.circular?
    # a segment with no links at its opposite end hangs on this link only
    return true if links_of(link.from_end.invert_end_type).size.zero?
    return true if links_of(link.to_end.invert_end_type).size.zero?
    # collect what is reached from each side of the link, never
    # walking into the two segments joined by the link itself
    from_reached, to_reached = [:from, :to].map do |side|
      reached = Set.new
      seen = Set.new
      start = link.send(:"#{side}_end")
      seen << start.name
      seen << link.other_end(start).name
      traverse_component(start, reached, seen)
      reached
    end
    return from_reached != to_reached
  end

  # Tells whether removing the segment together with its links splits
  # a component of the graph into two.
  # @param segment [String, RGFA::Line::Segment] a segment name or instance
  # @return [Boolean]
  def cut_segment?(segment)
    name = segment.kind_of?(RGFA::Line) ? segment.name : segment
    return false if [[0,0],[0,1],[1,0]].include?(connectivity(name))
    # traversal starts at the far side of every neighbouring segment
    starts = []
    [:B, :E].each do |end_type|
      neighbours = links_of([name, end_type]).map do |l|
        l.other_end([name, end_type]).invert_end_type
      end
      starts.concat(neighbours)
    end
    # one set of reached segment names per start point; the removed
    # segment is marked as already seen so that it is never crossed
    reached_sets = []
    starts.uniq.each do |start|
      reached = Set.new
      reached_sets << reached
      seen = Set.new
      seen << name
      traverse_component(start, reached, seen)
    end
    return reached_sets.any? { |reached| reached != reached_sets[0] }
  end

  # Connected component of the graph to which a segment belongs
  # @return [Array<String>]
  #   array of segment names
  # @param segment [String, RGFA::Line::Segment] a segment name or instance
  # @param visited [Set<String>] a set of segments to ignore during graph
  #   traversal; all segments in the found component will be added to it
  def segment_connected_component(segment, visited = Set.new)
    name = segment.kind_of?(RGFA::Line) ? segment.name : segment
    visited << name
    component = [name]
    [:B, :E].each do |end_type|
      traverse_component([name, end_type], component, visited)
    end
    return component
  end

  # Connected components of the graph
  # @return [Array<Array<String>>]
  #   array of components, each an array of segment names
  def connected_components
    seen = Set.new
    segment_names.each_with_object([]) do |sn, found|
      # segments reached by an earlier component are already in +seen+
      found << segment_connected_component(sn, seen) unless seen.include?(sn)
    end
  end

  # Split the graph into one RGFA per connected component
  # @return [Array<RGFA>]
  def split_connected_components
    connected_components.map do |cc|
      # copy the whole graph, then drop everything outside the component
      part = self.clone
      part.rm(part.segment_names - cc)
      part
    end
  end

  private

  # Recursive traversal: follows the links of +segment_end+ and, for each
  # segment name not yet in +visited+, records it in +visited+ and +c+
  # and continues from both ends of that segment.
  def traverse_component(segment_end, c, visited)
    links_of(segment_end).each do |l|
      sn = l.other_end(segment_end).name
      unless visited.include?(sn)
        visited << sn
        c << sn
        [:B, :E].each { |end_type| traverse_component([sn, end_type], c, visited) }
      end
    end
  end

  # Pair of connectivity symbols for the two link counts
  def connectivity_symbols(n,m)
    [n, m].map { |count| connectivity_symbol(count) }
  end

  # +:M+ for more than one link, otherwise the count itself
  def connectivity_symbol(n)
    if n > 1
      :M
    else
      n
    end
  end

end
|
@@ -0,0 +1,97 @@
|
|
1
|
+
require_relative "error"
|
2
|
+
|
3
|
+
#
|
4
|
+
# Methods for the RGFA class, which allow to handle containments in the graph.
|
5
|
+
#
|
6
|
+
module RGFA::Containments

  # Add a containment line to the graph.
  #
  # The line is registered in the containments list and in the
  # containments of its +from+ and +to+ segments; the segment names in
  # the line are replaced by the segment instances. A segment which is
  # not yet known is created as a virtual segment, unless
  # +@segments_first_order+ is set, in which case an error is raised.
  #
  # @param gfa_line [Object] anything responding to +to_rgfa_line+
  # @raise [RGFA::LineMissingError] if +@segments_first_order+ is set and
  #   the +from+ or +to+ segment is not in the graph
  def add_containment(gfa_line)
    gfa_line = gfa_line.to_rgfa_line(validate: @validate)
    # Check both segments before touching any state; raising halfway
    # would leave the line in @containments (and possibly registered
    # on the +from+ segment) although the addition failed.
    if @segments_first_order
      [:from, :to].each do |dir|
        unless @segments.has_key?(gfa_line.send(dir))
          raise RGFA::LineMissingError
        end
      end
    end
    @containments << gfa_line
    [:from, :to].each do |dir|
      segment_name = gfa_line.send(dir)
      orient = gfa_line.send(:"#{dir}_orient")
      if !@segments.has_key?(segment_name)
        @segments[segment_name] =
          RGFA::Line::Segment.new({:name => segment_name},
                                  virtual: true)
      end
      s = @segments[segment_name]
      s.containments[dir][orient] << gfa_line
      gfa_line.send(:"#{dir}=", s)
    end
  end
  protected :add_containment

  # Delete a containment
  #
  # The containment is removed from the containments list and from the
  # containments of its +from+ and +to+ segments.
  #
  # @param c [RGFA::Line::Containment] containment instance
  # @return [RGFA] self
  def delete_containment(c)
    @containments.delete(c)
    segment(c.from).containments[:from][c.from_orient].delete(c)
    segment(c.to).containments[:to][c.to_orient].delete(c)
    # return self as documented, not the result of the last delete
    self
  end

  # All containments in the graph
  # @return [Array<RGFA::Line::Containment>]
  def containments
    @containments
  end

  # Find containment lines whose +from+ segment is +s+
  # (in both orientations)
  # @!macro segment_or_name
  # @return [Array<RGFA::Line::Containment>]
  def contained_in(s)
    s = segment!(s)
    s.containments[:from][:+] + s.containments[:from][:-]
  end

  # Find containment lines whose +to+ segment is +s+
  # (in both orientations)
  # @return [Array<RGFA::Line::Containment>]
  # @!macro segment_or_name
  def containing(s)
    s = segment!(s)
    s.containments[:to][:+] + s.containments[:to][:-]
  end

  # Searches all containments of +contained+ in +container+.
  # Returns a possibly empty array of containments.
  #
  # @return [Array<RGFA::Line::Containment>]
  # @!macro [new] container_contained
  #   @param container [RGFA::Line::Segment, Symbol] a segment instance or name
  #   @param contained [RGFA::Line::Segment, Symbol] a segment instance or name
  #
  def containments_between(container, contained)
    contained_in(container).select {|l| l.to.to_sym == contained.to_sym }
  end

  # Searches a containment of +contained+ in +container+.
  # Returns the first containment found or nil if none found.
  #
  # @return [RGFA::Line::Containment, nil]
  # @!macro container_contained
  def containment(container, contained)
    contained_in(container).find {|l| l.to.to_sym == contained.to_sym }
  end

  # Searches a containment of +contained+ in +container+.
  # Raises an exception if no such containment was found.
  #
  # @return [RGFA::Line::Containment]
  # @raise [RGFA::LineMissingError] if no such containment found
  # @!macro container_contained
  def containment!(container, contained)
    c = containment(container, contained)
    raise RGFA::LineMissingError, "No containment was found" if c.nil?
    c
  end

end
|