rgfa 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/gfadiff.rb +420 -0
- data/bin/rgfa-findcrisprs.rb +208 -0
- data/bin/rgfa-mergelinear.rb +14 -0
- data/bin/rgfa-simdebruijn.rb +86 -0
- data/lib/rgfa.rb +376 -0
- data/lib/rgfa/byte_array.rb +74 -0
- data/lib/rgfa/cigar.rb +157 -0
- data/lib/rgfa/connectivity.rb +131 -0
- data/lib/rgfa/containments.rb +97 -0
- data/lib/rgfa/error.rb +3 -0
- data/lib/rgfa/field_array.rb +87 -0
- data/lib/rgfa/field_parser.rb +109 -0
- data/lib/rgfa/field_validator.rb +241 -0
- data/lib/rgfa/field_writer.rb +108 -0
- data/lib/rgfa/headers.rb +76 -0
- data/lib/rgfa/line.rb +721 -0
- data/lib/rgfa/line/containment.rb +87 -0
- data/lib/rgfa/line/header.rb +92 -0
- data/lib/rgfa/line/link.rb +379 -0
- data/lib/rgfa/line/path.rb +106 -0
- data/lib/rgfa/line/segment.rb +209 -0
- data/lib/rgfa/linear_paths.rb +285 -0
- data/lib/rgfa/lines.rb +155 -0
- data/lib/rgfa/links.rb +242 -0
- data/lib/rgfa/logger.rb +192 -0
- data/lib/rgfa/multiplication.rb +156 -0
- data/lib/rgfa/numeric_array.rb +196 -0
- data/lib/rgfa/paths.rb +98 -0
- data/lib/rgfa/rgl.rb +194 -0
- data/lib/rgfa/segment_ends_path.rb +9 -0
- data/lib/rgfa/segment_info.rb +162 -0
- data/lib/rgfa/segments.rb +99 -0
- data/lib/rgfa/sequence.rb +65 -0
- data/lib/rgfatools.rb +102 -0
- data/lib/rgfatools/artifacts.rb +29 -0
- data/lib/rgfatools/copy_number.rb +126 -0
- data/lib/rgfatools/invertible_segments.rb +104 -0
- data/lib/rgfatools/linear_paths.rb +140 -0
- data/lib/rgfatools/multiplication.rb +194 -0
- data/lib/rgfatools/p_bubbles.rb +66 -0
- data/lib/rgfatools/superfluous_links.rb +64 -0
- metadata +97 -0
@@ -0,0 +1,156 @@
|
|
1
|
+
require_relative "error.rb"
|
2
|
+
|
3
|
+
#
|
4
|
+
# Method for the RGFA class, which allow to split a segment into
|
5
|
+
# multiple copies.
|
6
|
+
#
|
7
|
+
module RGFA::Multiplication
|
8
|
+
|
9
|
+
# Multiply a segment: delete it, leave it unchanged, or split it into
# +factor+ copies, depending on the value of +factor+.
#
# == Automatic computation of the copy names
#
# - The automatic computation can be overridden by passing an array of
#   copy names as +copy_names+.
# - Otherwise the name of the original segment is inspected: if it ends
#   with a lower case letter (for +:lowcase+), an upper case letter
#   (for +:upcase+), a digit (for +:number+), or the string +"_copy"+
#   followed by optional digits (for +:copy+), it is considered to be a
#   copy name already and is used as starting point.
# - If not, a starting suffix is appended, so that the first candidate is
#   the original name followed by +b+ (+:lowcase+), +B+ (+:upcase+),
#   +2+ (+:number+) or +_copy+ (+:copy+).
# - String#next is then applied to the candidate until a name is found which
#   is not yet used by a segment, a path or a previous copy.
# - For +:copy+ the first copy carries no digit, the following ones do,
#   i.e. the suffixes are +_copy+, +_copy2+, +_copy3+, etc.
#
# @param segment [String, RGFA::Line::Segment] segment name or instance
# @param factor [Integer] multiplication factor; if 0, delete the segment;
#   if 1, do nothing; if > 1, total number of segments after the operation
#   (the original plus +factor+ - 1 copies)
# @param copy_names [:lowcase, :upcase, :number, :copy, Array<String>]
#   <i>(Defaults to: +:lowcase+)</i>
#   array of names for the copies of the segment, or a symbol selecting
#   the scheme used to derive the names from the original segment name
# @param conserve_components [Boolean] <i>(Defaults to: +true+)</i>
#   if the segment is to be deleted, delete it only if
#   {#cut_segment?}(segment) is +false+
#
# @return [RGFA] self
def multiply(segment, factor, copy_names: :lowcase,
             conserve_components: true)
  segment_name = segment
  segment_name = segment.name if segment.kind_of?(RGFA::Line)
  return self if factor == 1
  if factor < 2
    # deletion requested
    if cut_segment?(segment_name) and conserve_components
      return self
    end
    return delete_segment(segment_name)
  end
  original = segment!(segment_name)
  # counts are split before cloning, so that each clone inherits its share
  divide_segment_and_connection_counts(original, factor)
  names = compute_copy_names(copy_names, segment_name, factor)
  names.each do |copy_name|
    clone_segment_and_connections(original, copy_name)
  end
  self
end
|
57
|
+
|
58
|
+
private
|
59
|
+
|
60
|
+
def compute_copy_names(copy_names, segment_name, factor)
|
61
|
+
return nil if factor < 2
|
62
|
+
accepted = [:lowcase, :upcase, :number, :copy]
|
63
|
+
if copy_names.kind_of?(Array)
|
64
|
+
return copy_names
|
65
|
+
elsif !accepted.include?(copy_names)
|
66
|
+
raise ArgumentError,
|
67
|
+
"copy_names shall be an array of names or one of: "+
|
68
|
+
accepted.inspect
|
69
|
+
end
|
70
|
+
retval = []
|
71
|
+
next_name = segment_name.to_s
|
72
|
+
case copy_names
|
73
|
+
when :lowcase
|
74
|
+
if next_name =~ /^.*[a-z]$/
|
75
|
+
next_name = next_name.next
|
76
|
+
else
|
77
|
+
next_name += "b"
|
78
|
+
end
|
79
|
+
when :upcase
|
80
|
+
if next_name =~ /^.*[A-Z]$/
|
81
|
+
next_name = next_name.next
|
82
|
+
else
|
83
|
+
next_name += "B"
|
84
|
+
end
|
85
|
+
when :number
|
86
|
+
if next_name =~ /^.*[0-9]$/
|
87
|
+
next_name = next_name.next
|
88
|
+
else
|
89
|
+
next_name += "2"
|
90
|
+
end
|
91
|
+
when :copy
|
92
|
+
if next_name =~ /^.*_copy(\d*)$/
|
93
|
+
next_name += "1" if $1 == ""
|
94
|
+
next_name = next_name.next
|
95
|
+
copy_names = :number
|
96
|
+
else
|
97
|
+
next_name += "_copy"
|
98
|
+
end
|
99
|
+
end
|
100
|
+
while retval.size < (factor-1)
|
101
|
+
while retval.include?(next_name) or
|
102
|
+
@segments.has_key?(next_name.to_sym) or
|
103
|
+
@paths.has_key?(next_name.to_sym)
|
104
|
+
if copy_names == :copy
|
105
|
+
next_name += "1"
|
106
|
+
copy_names = :number
|
107
|
+
end
|
108
|
+
next_name = next_name.next
|
109
|
+
end
|
110
|
+
retval << next_name
|
111
|
+
end
|
112
|
+
return retval
|
113
|
+
end
|
114
|
+
|
115
|
+
# Divide the count tags (KC, RC, FC) of a line by +factor+.
#
# Only the tags present among the optional fields of the line are changed;
# the quotient is computed as float and then truncated to an integer.
#
# @param gfa_line [RGFA::Line] the line whose counts shall be divided
# @param factor [Numeric] the divisor
def divide_counts(gfa_line, factor)
  [:KC, :RC, :FC].each do |count_tag|
    next unless gfa_line.optional_fieldnames.include?(count_tag)
    quotient = gfa_line.get(count_tag).to_f / factor
    gfa_line.set(count_tag, quotient.to_i)
  end
end
|
123
|
+
|
124
|
+
# Divide by +factor+ the counts of a segment and of all the links and
# containments reachable through its +links+ and +containments+ tables.
#
# @param segment [RGFA::Line::Segment] the segment
# @param factor [Numeric] the divisor
def divide_segment_and_connection_counts(segment, factor)
  divide_counts(segment, factor)
  record_types = [:links, :containments]
  record_types.product([:from, :to], [:+, :-]).each do |rt, dir, o|
    segment.send(rt)[dir][o].each do |l|
      # circular link counts shall be divided only once
      next if dir == :to && l.from == l.to
      divide_counts(l, factor)
    end
  end
end
|
138
|
+
|
139
|
+
# Add to the graph a clone of +segment+ named +clone_name+, together with
# a clone of each line found in the +links+ and +containments+ tables of
# the original segment; in each cloned line, the field (+from+ or +to+)
# under which the line was listed is set to +clone_name+.
#
# @param segment [RGFA::Line::Segment] the segment to clone
# @param clone_name [String] name of the new segment
def clone_segment_and_connections(segment, clone_name)
  segment_copy = segment.clone
  segment_copy.name = clone_name
  self << segment_copy
  record_types = [:links, :containments]
  record_types.product([:from, :to], [:+, :-]).each do |rt, dir, o|
    segment.send(rt)[dir][o].each do |connection|
      connection_copy = connection.clone
      connection_copy.set(dir, clone_name)
      self << connection_copy
    end
  end
end
|
155
|
+
|
156
|
+
end
|
@@ -0,0 +1,196 @@
|
|
1
|
+
require_relative "error"
|
2
|
+
|
3
|
+
#
|
4
|
+
# A numeric array representable using the data type B of the GFA specification
|
5
|
+
#
|
6
|
+
class RGFA::NumericArray < Array
|
7
|
+
|
8
|
+
# Subtype codes of the signed integers, ordered by increasing size
SIGNED_INT_SUBTYPE = ["c", "s", "i"]

# Subtype codes of the unsigned integers, ordered by increasing size
# (upper case version of the signed codes)
UNSIGNED_INT_SUBTYPE = SIGNED_INT_SUBTYPE.map(&:upcase)

# All integer subtype codes (unsigned first, then signed)
INT_SUBTYPE = UNSIGNED_INT_SUBTYPE + SIGNED_INT_SUBTYPE

# Subtype codes of the floats
FLOAT_SUBTYPE = ["f"]

# All subtype codes
SUBTYPE = INT_SUBTYPE + FLOAT_SUBTYPE

# Number of bits of the integer subtypes, keyed by the lower case code
SUBTYPE_BITS = {"c" => 8, "s" => 16, "i" => 32}

# Range of the values representable by each integer subtype
SUBTYPE_RANGE = INT_SUBTYPE.each_with_object({}) do |subtype, ranges|
  bits = SUBTYPE_BITS[subtype.downcase]
  if subtype == subtype.upcase
    # unsigned: all bits are available for the magnitude
    ranges[subtype] = (0..((2**bits)-1))
  else
    # signed: one bit is used for the sign
    ranges[subtype] = ((-(2**(bits-1)))..((2**(bits-1))-1))
  end
end
|
39
|
+
|
40
|
+
# Validate the numeric array, by computing its subtype
# (see {#compute_subtype}).
#
# @raise [RGFA::NumericArray::ValueError] if the array is not valid
# @return [String] the subtype code returned by {#compute_subtype}
def validate!
  subtype = compute_subtype
  subtype
end
|
46
|
+
|
47
|
+
# Computes the subtype of the array from its content.
#
# - If every element is a Float, the subtype is "f" (this is also the
#   result for an empty array, as the test is then trivially true).
# - If every element is an Integer, the smallest subtype able to contain
#   the minimum and the maximum element is selected (unsigned if no element
#   is negative, signed otherwise).
# - In any other case an exception is raised.
#
# @raise [RGFA::NumericArray::ValueError] if the array is not a valid numeric
#   array
# @return [RGFA::NumericArray::SUBTYPE]
def compute_subtype
  return "f" if all? {|e| e.kind_of?(Float)}
  unless all? {|e| e.kind_of?(Integer)}
    raise RGFA::NumericArray::ValueError,
      "NumericArray does not contain homogenous numeric values\n"+
      "Content: #{inspect}"
  end
  RGFA::NumericArray.integer_type(min..max)
end
|
77
|
+
|
78
|
+
# Computes the subtype for integers in a given range.
#
# If the minimum of the range is non-negative, the smallest unsigned
# subtype containing the whole range is selected, otherwise the smallest
# signed subtype containing it. The bounds of the subtype ranges are
# inclusive, e.g. 0..255 fits in the subtype "C".
#
# @param range [Range] the integer range
#
# @raise [RGFA::NumericArray::ValueError] if the integer range is outside
#   all subtype ranges
#
# @return [RGFA::NumericArray::INT_SUBTYPE] subtype code
def self.integer_type(range)
  lowest = range.min
  highest = range.max
  candidates = if lowest < 0
                 RGFA::NumericArray::SIGNED_INT_SUBTYPE
               else
                 RGFA::NumericArray::UNSIGNED_INT_SUBTYPE
               end
  # candidates are ordered from the smallest to the largest subtype,
  # thus the first which fits is the smallest possible one
  candidates.each do |st|
    st_range = RGFA::NumericArray::SUBTYPE_RANGE[st]
    return st if st_range.include?(lowest) && st_range.include?(highest)
  end
  # this is a class method: there is no array content to report here,
  # thus the offending range is included in the message
  raise RGFA::NumericArray::ValueError,
    "NumericArray: values are outside of all integer subtype ranges\n"+
    "Range: #{range.inspect}"
end
|
106
|
+
|
107
|
+
# Return the numeric array itself, optionally validating it.
# @param validate [Boolean] <i>(default: +false+)</i>
#   if +true+, {#validate!} is called before returning
# @raise [RGFA::NumericArray::ValueError] if validate is set and
#   the array is not a valid numeric array
# @return [RGFA::NumericArray] self
def to_numeric_array(validate: false)
  return self unless validate
  validate!
  self
end
|
118
|
+
|
119
|
+
# GFA datatype B representation of the numeric array: the computed subtype
# code, followed by a comma and the comma-separated elements.
# @raise [RGFA::NumericArray::ValueError] if the array
#   is not a valid numeric array
# @return [String]
def to_s
  compute_subtype + "," + join(",")
end
|
127
|
+
|
128
|
+
end
|
129
|
+
|
130
|
+
# Error raised when the content of a numeric array is not valid, e.g. a value
# is not compatible with the selected subtype
class RGFA::NumericArray::ValueError < RGFA::Error
end

# Error raised when an invalid subtype code is found
class RGFA::NumericArray::TypeError < RGFA::Error
end
|
136
|
+
|
137
|
+
#
|
138
|
+
# Method to create a numeric array from an array
|
139
|
+
#
|
140
|
+
class Array
  # Build a RGFA::NumericArray with the same content as the array.
  # @param validate [Boolean] <i>(default: +true+)</i>
  #   if +true+, the new numeric array is validated before it is returned
  # @raise [RGFA::NumericArray::ValueError] if validate is set and
  #   the content is not a valid numeric array
  # @return [RGFA::NumericArray] the numeric array
  def to_numeric_array(validate: true)
    RGFA::NumericArray.new(self).tap do |numeric|
      numeric.validate! if validate
    end
  end
end
|
154
|
+
|
155
|
+
#
|
156
|
+
# Method to create a numeric array from a string
|
157
|
+
#
|
158
|
+
class String
  # Create a numeric array from its GFA datatype B string representation,
  # i.e. a subtype code followed by comma-separated values
  # (e.g. <tt>"C,1,2,3"</tt> or <tt>"f,1.5,2.0"</tt>).
  #
  # @param validate [Boolean] <i>(default: +true+)</i>
  #   if +true+, validate the range of the numeric values, according
  #   to the array subtype
  # @raise [RGFA::NumericArray::ValueError] if a value cannot be parsed as
  #   a number of the given subtype, or if validate is set and
  #   any value is not compatible with the subtype
  # @raise [RGFA::NumericArray::TypeError] if the subtype code is invalid
  #   (this is checked regardless of +validate+)
  # @return [RGFA::NumericArray] the numeric array
  def to_numeric_array(validate: true)
    subtype, *elems = split(",")
    # the subtype must be checked for all codes, not only for floats,
    # otherwise unknown codes would be handled as integers without range
    unless RGFA::NumericArray::SUBTYPE.include?(subtype)
      raise RGFA::NumericArray::TypeError, "Subtype #{subtype} unknown"
    end
    # only integer subtypes have a range; for "f" this is nil
    range = RGFA::NumericArray::SUBTYPE_RANGE[subtype]
    values = elems.map do |e|
      begin
        value = range ? Integer(e) : Float(e)
      rescue ArgumentError => err
        raise RGFA::NumericArray::ValueError, err.message
      end
      if validate and range and !range.include?(value)
        raise RGFA::NumericArray::ValueError,
          "NumericArray: "+
          "value is outside of subtype #{subtype} range\n"+
          "Value: #{value}\n"+
          "Range: #{range.inspect}\n"+
          "Content: #{inspect}"
      end
      value
    end
    RGFA::NumericArray.new(values)
  end
end
|
data/lib/rgfa/paths.rb
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
require_relative "error"
|
2
|
+
|
3
|
+
#
|
4
|
+
# Methods for the RGFA class, which allow to handle paths in the graph.
|
5
|
+
#
|
6
|
+
module RGFA::Paths
|
7
|
+
|
8
|
+
# Add a path line to the graph.
#
# The line is registered under its name, unless a segment or a path with
# the same name exists. For each link required by the path, the path is
# connected to the link found by +link_from_to+; if there is none, a
# virtual link is created and added to the graph, unless
# +@segments_first_order+ is set, in which case an exception is raised.
# Finally the segment names of the path are replaced by the results of
# +segment+ and the path is added to the +paths+ of each of them.
#
# @param gfa_line [#to_rgfa_line] the path line
# @raise [RGFA::DuplicatedLabelError] if the name is already in use
# @raise [RGFA::LineMissingError] if a required link is missing and
#   +@segments_first_order+ is set
def add_path(gfa_line)
  gfa_line = gfa_line.to_rgfa_line(validate: @validate)
  name = gfa_line.path_name
  if @segments.has_key?(name)
    raise RGFA::DuplicatedLabelError,
      "Error when adding line: #{gfa_line}\n"+
      "a segment already exists with the name: #{name}\n"+
      "Segment: #{@segments[name]}"
  end
  if @paths.has_key?(name)
    raise RGFA::DuplicatedLabelError,
      "Error when adding line: #{gfa_line}\n"+
      "a path already exists with the name: #{name}\n"+
      "Path: #{@paths[name]}"
  end
  @paths[name] = gfa_line
  gfa_line.required_links.each do |from, to, cigar|
    link = nil
    if segment(from.segment) && segment(to.segment)
      link = link_from_to(from, to, cigar)
    end
    if link.nil?
      # no such link in the graph: use a virtual one as placeholder
      link = RGFA::Line::Link.new({:from => from.segment,
                                   :from_orient => from.orient,
                                   :to => to.segment,
                                   :to_orient => to.orient,
                                   :overlap => cigar},
                                  virtual: true)
      if @segments_first_order
        raise RGFA::LineMissingError, "Path: #{gfa_line}\n"+
          "requires a non-existing link:\n"+
          "#{link}"
      end
      add_link(link)
    end
    direct = link.compatible_direct?(from, to, cigar)
    gfa_line.links << [link, direct]
    link.paths << [gfa_line, direct]
  end
  gfa_line.segment_names.each do |oriented|
    # oriented[0] is the segment name, oriented[1] the key of its paths table
    oriented[0] = segment(oriented[0])
    oriented[0].paths[oriented[1]] << gfa_line
  end
end
protected :add_path
|
53
|
+
|
54
|
+
# Delete a path from the RGFA graph; the path is also removed from the
# +paths+ lists of its segments and of its links.
# @return [RGFA] self
# @param pt [String, RGFA::Line::Path] path name or instance
# @raise [RGFA::LineMissingError] if no such path exists in the RGFA instance
def delete_path(pt)
  pt = path!(pt)
  pt.segment_names.each do |sn, o|
    segment!(sn).paths[o].delete(pt)
  end
  pt.links.each do |l, dir|
    l.paths.delete([pt, dir])
  end
  @paths.delete(pt.path_name)
  self
end
|
64
|
+
|
65
|
+
# All path lines of the graph, as a new array
# @return [Array<RGFA::Line::Path>]
def paths
  @paths.each_value.to_a
end
|
70
|
+
|
71
|
+
# @!macro [new] path
#   Searches the path with name equal to +pt+.
#   If +pt+ is a line instance, it is returned as it is, without searching.
#   @param pt [String, RGFA::Line::Path] a path or path name
#   @return [RGFA::Line::Path] if a path is found
# @return [nil] if no such path exists in the RGFA instance
#
def path(pt)
  pt.kind_of?(RGFA::Line) ? pt : @paths[pt.to_sym]
end
|
81
|
+
|
82
|
+
# @!macro path
# @raise [RGFA::LineMissingError] if no such path exists in the RGFA instance
def path!(pt)
  # the result is stored in a separate variable, so that the requested
  # name is still available for the error message
  found = path(pt)
  raise RGFA::LineMissingError, "No path has name #{pt}" if found.nil?
  found
end
|
89
|
+
|
90
|
+
# @return [Array<RGFA::Line::Path>] the result of +all_paths+ for the
#   specified segment, i.e. the paths whose +segment_names+ include it.
# @!macro [new] segment_or_name
#   @param s [RGFA::Line::Segment, Symbol] a segment instance or name
def paths_with(s)
  target = segment!(s)
  target.all_paths
end
|
97
|
+
|
98
|
+
end
|