rgfa 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. checksums.yaml +7 -0
  2. data/bin/gfadiff.rb +420 -0
  3. data/bin/rgfa-findcrisprs.rb +208 -0
  4. data/bin/rgfa-mergelinear.rb +14 -0
  5. data/bin/rgfa-simdebruijn.rb +86 -0
  6. data/lib/rgfa.rb +376 -0
  7. data/lib/rgfa/byte_array.rb +74 -0
  8. data/lib/rgfa/cigar.rb +157 -0
  9. data/lib/rgfa/connectivity.rb +131 -0
  10. data/lib/rgfa/containments.rb +97 -0
  11. data/lib/rgfa/error.rb +3 -0
  12. data/lib/rgfa/field_array.rb +87 -0
  13. data/lib/rgfa/field_parser.rb +109 -0
  14. data/lib/rgfa/field_validator.rb +241 -0
  15. data/lib/rgfa/field_writer.rb +108 -0
  16. data/lib/rgfa/headers.rb +76 -0
  17. data/lib/rgfa/line.rb +721 -0
  18. data/lib/rgfa/line/containment.rb +87 -0
  19. data/lib/rgfa/line/header.rb +92 -0
  20. data/lib/rgfa/line/link.rb +379 -0
  21. data/lib/rgfa/line/path.rb +106 -0
  22. data/lib/rgfa/line/segment.rb +209 -0
  23. data/lib/rgfa/linear_paths.rb +285 -0
  24. data/lib/rgfa/lines.rb +155 -0
  25. data/lib/rgfa/links.rb +242 -0
  26. data/lib/rgfa/logger.rb +192 -0
  27. data/lib/rgfa/multiplication.rb +156 -0
  28. data/lib/rgfa/numeric_array.rb +196 -0
  29. data/lib/rgfa/paths.rb +98 -0
  30. data/lib/rgfa/rgl.rb +194 -0
  31. data/lib/rgfa/segment_ends_path.rb +9 -0
  32. data/lib/rgfa/segment_info.rb +162 -0
  33. data/lib/rgfa/segments.rb +99 -0
  34. data/lib/rgfa/sequence.rb +65 -0
  35. data/lib/rgfatools.rb +102 -0
  36. data/lib/rgfatools/artifacts.rb +29 -0
  37. data/lib/rgfatools/copy_number.rb +126 -0
  38. data/lib/rgfatools/invertible_segments.rb +104 -0
  39. data/lib/rgfatools/linear_paths.rb +140 -0
  40. data/lib/rgfatools/multiplication.rb +194 -0
  41. data/lib/rgfatools/p_bubbles.rb +66 -0
  42. data/lib/rgfatools/superfluous_links.rb +64 -0
  43. metadata +97 -0
@@ -0,0 +1,156 @@
1
+ require_relative "error.rb"
2
+
3
+ #
4
+ # Method for the RGFA class, which allow to split a segment into
5
+ # multiple copies.
6
+ #
7
+ module RGFA::Multiplication
8
+
9
# Create multiple copies of a segment.
#
# == Automatic computation of the copy names
#
# - Can be overridden, by providing an array of copy names.
# - First, it is checked if the name of the original segment ends with a
#   relevant string, i.e. a lower case letter (for +:lowcase+), an upper
#   case letter (for +:upcase+), a digit (for +:number+), or the string
#   +"_copy"+ plus one or more optional digits (for +:copy+).
# - If so, it is assumed, it was already a copy, and it is not
#   altered.
# - If not, then +a+ (for +:lowcase+), +A+ (for +:upcase+), +1+ (for
#   +:number+), +_copy+ (for +:copy+) is appended to the string.
# - Then, in all cases, next (*) is called on the string, until a valid,
#   non-existent name is found for each of the segment copies.
# - (*) = except for +:copy+, where for the first copy no digit is
#   present, but for the following is, i.e. the suffixes of the copies
#   will be +_copy+, +_copy2+, +_copy3+, etc.
#
# @param segment [String, RGFA::Line::Segment] segment name or instance
# @param [Integer] factor multiplication factor; if 1, do nothing;
#   if > 1, total number of copies (the original included);
#   any value below 1 (normally 0) deletes the segment
# @param [:lowcase, :upcase, :number, :copy, Array<String>] copy_names
#   <i>(Defaults to: +:lowcase+)</i>
#   Array of names for the copies of the segment,
#   or a symbol, which defines a system to compute the names from the name of
#   the original segment. See "automatic computation of the copy names".
# @param [Boolean] conserve_components <i>(Defaults to: +true+)</i>
#   If the segment is to be deleted, delete it only if
#   {#cut_segment?}(segment) is +false+.
#
# @raise [ArgumentError] if +copy_names+ is neither an Array nor one of the
#   supported symbols; the graph is left untouched in this case
#
# @return [RGFA] self
def multiply(segment, factor, copy_names: :lowcase,
             conserve_components: true)
  segment_name = segment.kind_of?(RGFA::Line) ? segment.name : segment
  if factor < 2
    return self if factor == 1
    return self if cut_segment?(segment_name) and conserve_components
    return delete_segment(segment_name)
  end
  s = segment!(segment_name)
  # Resolve (and thereby validate) the names of the copies before any count
  # is modified, so that an invalid +copy_names+ argument cannot leave the
  # graph with divided counts but no copies.
  copy_names = compute_copy_names(copy_names, segment_name, factor)
  divide_segment_and_connection_counts(s, factor)
  copy_names.each {|cn| clone_segment_and_connections(s, cn)}
  return self
end
57
+
58
+ private
59
+
60
# Determine the names of the copies created by a segment multiplication.
#
# An Array is returned unchanged. Otherwise +copy_names+ selects a naming
# system and +factor-1+ names are generated, skipping every name already
# used by a segment or a path of the graph.
#
# @param copy_names [:lowcase, :upcase, :number, :copy, Array<String>]
# @param segment_name [String, Symbol] name of the segment to multiply
# @param factor [Integer] multiplication factor
# @raise [ArgumentError] if +copy_names+ is not an Array nor a known symbol
# @return [Array<String>, nil] the names; +nil+ if factor is smaller than 2
def compute_copy_names(copy_names, segment_name, factor)
  return nil if factor < 2
  accepted = [:lowcase, :upcase, :number, :copy]
  return copy_names if copy_names.kind_of?(Array)
  unless accepted.include?(copy_names)
    raise ArgumentError,
      "copy_names shall be an array of names or one of: "+
      accepted.inspect
  end
  mode = copy_names
  candidate = segment_name.to_s
  if mode == :copy
    copy_suffix = /^.*_copy(\d*)$/.match(candidate)
    if copy_suffix
      # "_copy" without digits counts as copy number 1
      candidate += "1" if copy_suffix[1] == ""
      candidate = candidate.next
      mode = :number
    else
      candidate += "_copy"
    end
  else
    # pattern of a name which is already a copy, and the suffix which is
    # appended otherwise
    already_copy, first_suffix = {
      :lowcase => [/^.*[a-z]$/, "b"],
      :upcase  => [/^.*[A-Z]$/, "B"],
      :number  => [/^.*[0-9]$/, "2"],
    }[mode]
    if candidate =~ already_copy
      candidate = candidate.next
    else
      candidate += first_suffix
    end
  end
  names = []
  until names.size >= (factor-1)
    while names.include?(candidate) or
          @segments.has_key?(candidate.to_sym) or
          @paths.has_key?(candidate.to_sym)
      if mode == :copy
        # the plain "_copy" name is taken: switch to numbered copies
        candidate += "1"
        mode = :number
      end
      candidate = candidate.next
    end
    names << candidate
  end
  names
end
114
+
115
# Divide the count tags (KC, RC, FC) of a line by +factor+.
#
# Only the tags listed among the optional fields of the line are touched;
# the quotient is truncated to an integer before it is stored.
#
# @param gfa_line [RGFA::Line] line whose counts are divided
# @param factor [Numeric] the divisor
def divide_counts(gfa_line, factor)
  %i[KC RC FC].each do |tag|
    next unless gfa_line.optional_fieldnames.include?(tag)
    quotient = gfa_line.get(tag).to_f / factor
    gfa_line.set(tag, quotient.to_i)
  end
end
123
+
124
# Divide by +factor+ the counts of a segment and of all links and
# containments which are connected to it.
#
# @param segment [RGFA::Line::Segment] the segment
# @param factor [Numeric] the divisor
def divide_segment_and_connection_counts(segment, factor)
  divide_counts(segment, factor)
  [:links, :containments].each do |record_type|
    [:from, :to].product([:+, :-]) do |end_type, orient|
      segment.send(record_type)[end_type][orient].each do |connection|
        # a circular connection is listed under both :from and :to;
        # its counts shall be divided only once
        next if end_type == :to and connection.from == connection.to
        divide_counts(connection, factor)
      end
    end
  end
end
138
+
139
# Add to the graph a copy of a segment, named +clone_name+, together with
# a copy of each link and containment of the original segment; in each
# copied connection the end which referred to the original segment is set
# to +clone_name+.
#
# @param segment [RGFA::Line::Segment] the segment to copy
# @param clone_name [String, Symbol] name of the copy
def clone_segment_and_connections(segment, clone_name)
  duplicate = segment.clone
  duplicate.name = clone_name
  self << duplicate
  [:links, :containments].each do |record_type|
    [:from, :to].product([:+, :-]) do |end_type, orient|
      segment.send(record_type)[end_type][orient].each do |connection|
        rewired = connection.clone
        rewired.set(end_type, clone_name)
        self << rewired
      end
    end
  end
end
155
+
156
+ end
@@ -0,0 +1,196 @@
1
+ require_relative "error"
2
+
3
+ #
4
+ # A numeric array representable using the data type B of the GFA specification
5
+ #
6
+ class RGFA::NumericArray < Array
7
+
8
# Subtypes for signed integers, from the smallest to the largest
SIGNED_INT_SUBTYPE = %w[c s i].freeze

# Subtypes for unsigned integers, from the smallest to the largest
UNSIGNED_INT_SUBTYPE = SIGNED_INT_SUBTYPE.map {|st| st.upcase}.freeze

# Subtypes for integers
INT_SUBTYPE = (UNSIGNED_INT_SUBTYPE + SIGNED_INT_SUBTYPE).freeze

# Subtypes for floats
FLOAT_SUBTYPE = ["f"].freeze

# Subtypes
SUBTYPE = (INT_SUBTYPE + FLOAT_SUBTYPE).freeze

# Number of bits of the integer subtypes, keyed by the (lower case) code of
# the signed subtype; the unsigned subtype with the same letter has the
# same number of bits
SUBTYPE_BITS = {"c" => 8, "s" => 16, "i" => 32}.freeze

# Range of the values of each integer subtype
SUBTYPE_RANGE = Hash[
  INT_SUBTYPE.map do |subtype|
    bits = SUBTYPE_BITS[subtype.downcase]
    if subtype == subtype.upcase
      # unsigned: 0 .. 2^bits - 1
      [subtype, 0..((2**bits)-1)]
    else
      # signed (two's complement): -2^(bits-1) .. 2^(bits-1) - 1
      [subtype, (-(2**(bits-1)))..((2**(bits-1))-1)]
    end
  end
].freeze
39
+
40
# Validate the numeric array, by computing its subtype.
#
# @raise [RGFA::NumericArray::ValueError] if the array is not valid
# @return [String] the computed subtype
def validate!
  return compute_subtype
end
46
+
47
# Computes the subtype of the array from its content.
#
# If all elements are float (which includes the case of an empty array),
# the computed subtype is "f".
# If all elements are integer, the subtype is the one which
# {RGFA::NumericArray.integer_type} selects for the range from the
# smallest to the largest element.
# In all other cases an exception is raised.
#
# @raise [RGFA::NumericArray::ValueError] if the array is not a valid numeric
#   array
# @return [RGFA::NumericArray::SUBTYPE]
def compute_subtype
  return "f" if all? {|value| value.kind_of?(Float)}
  each do |value|
    next if value.kind_of?(Integer)
    raise RGFA::NumericArray::ValueError,
      "NumericArray does not contain homogenous numeric values\n"+
      "Content: #{inspect}"
  end
  # all elements are integers, and there is at least one of them
  RGFA::NumericArray.integer_type(min..max)
end
77
+
78
# Computes the subtype for integers in a given range.
#
# If all elements are non-negative, the smallest unsigned subtype which
# contains the whole range is selected, otherwise the smallest signed
# subtype. The bounds of the subtype ranges are inclusive, e.g. 255 is
# still a valid value for the subtype "C".
#
# @param range [Range] the integer range
#
# @raise [RGFA::NumericArray::ValueError] if the integer range is outside
#   all subtype ranges
#
# @return [RGFA::NumericArray::INT_SUBTYPE] subtype code
def self.integer_type(range)
  lowest = range.min
  highest = range.max
  candidates = if lowest < 0
                 RGFA::NumericArray::SIGNED_INT_SUBTYPE
               else
                 RGFA::NumericArray::UNSIGNED_INT_SUBTYPE
               end
  # candidates are sorted from the smallest to the largest subtype
  candidates.each do |st|
    st_range = RGFA::NumericArray::SUBTYPE_RANGE[st]
    return st if st_range.include?(lowest) and st_range.include?(highest)
  end
  raise RGFA::NumericArray::ValueError,
    "NumericArray: values are outside of all integer subtype ranges\n"+
    "Range: #{range.inspect}"
end
106
+
107
# Return self, as the instance is already a numeric array.
# @param validate [Boolean] <i>(default: +false+)</i>
#   if +true+, the array is validated (see {#validate!}) before it is
#   returned
# @raise [RGFA::NumericArray::ValueError] if validate is set and
#   the array is not a valid numeric array
# @return [RGFA::NumericArray]
def to_numeric_array(validate: false)
  return self unless validate
  validate!
  self
end
118
+
119
# GFA datatype B representation of the numeric array: the code of the
# computed subtype, a comma, and the comma-separated elements
# @raise [RGFA::NumericArray::ValueError] if the array
#   is not a valid numeric array
# @return [String]
def to_s
  [compute_subtype, join(",")].join(",")
end
127
+
128
+ end
129
+
130
# Exception raised if a value in a numeric array is not compatible
# with the selected subtype
class RGFA::NumericArray::ValueError < RGFA::Error
end
133
+
134
# Exception raised if an invalid subtype code is found
class RGFA::NumericArray::TypeError < RGFA::Error
end
136
+
137
#
# Method to create a numeric array from an array
#
class Array
  # Create a numeric array with the same content as the Array instance
  # @param validate [Boolean] <i>(default: +true+)</i>
  #   if +true+, the numeric array is validated before it is returned
  # @raise [RGFA::NumericArray::ValueError] if validate is set and
  #   the content is not valid for a numeric array
  # @return [RGFA::NumericArray] the numeric array
  def to_numeric_array(validate: true)
    RGFA::NumericArray.new(self).tap do |numeric_array|
      numeric_array.validate! if validate
    end
  end
end
154
+
155
#
# Method to create a numeric array from a string
#
class String
  # Parse the GFA datatype B representation of a numeric array, i.e.
  # a subtype code followed by comma-separated numeric values.
  #
  # @param validate [Boolean] <i>(default: +true+)</i>
  #   if +true+, validate the range of the integer values, according
  #   to the array subtype
  # @raise [RGFA::NumericArray::ValueError] if an element cannot be
  #   parsed as a number, or if validate is set and
  #   any value is not compatible with the subtype
  # @raise [RGFA::NumericArray::TypeError] if the subtype code is invalid
  #   (regardless of the value of +validate+)
  # @return [Array<Integer>, Array<Float>] the parsed values,
  #   as a plain Array
  def to_numeric_array(validate: true)
    elems = split(",")
    subtype = elems.shift
    # The subtype code must be checked before anything else: an unknown
    # code has no range, and must not be handled as an integer subtype.
    unless RGFA::NumericArray::SUBTYPE.include?(subtype)
      raise RGFA::NumericArray::TypeError, "Subtype #{subtype} unknown"
    end
    integer = (subtype != "f")
    range = RGFA::NumericArray::SUBTYPE_RANGE[subtype] if integer
    elems.map do |e|
      begin
        if integer
          e = Integer(e)
          if validate and not range.include?(e)
            raise "NumericArray: "+
                  "value is outside of subtype #{subtype} range\n"+
                  "Value: #{e}\n"+
                  "Range: #{range.inspect}\n"+
                  "Content: #{inspect}"
          end
          e
        else
          Float(e)
        end
      rescue => err
        # parsing and range errors are both reported as value errors
        raise RGFA::NumericArray::ValueError, err.message
      end
    end
  end
end
@@ -0,0 +1,98 @@
1
+ require_relative "error"
2
+
3
+ #
4
+ # Methods for the RGFA class, which allow to handle paths in the graph.
5
+ #
6
+ module RGFA::Paths
7
+
8
# Add a path line to the graph.
#
# The path is registered under its name; for each link required by the
# path, the link is looked up in the graph and, if it is not found, a
# virtual link is added; finally, the path is attached to its links and
# segments.
#
# @param gfa_line [RGFA::Line::Path, String] the path line
# @raise [RGFA::DuplicatedLabelError] if a segment or a path with the same
#   name already exists
# @raise [RGFA::LineMissingError] if a required link does not exist, and
#   +@segments_first_order+ is set
def add_path(gfa_line)
  gfa_line = gfa_line.to_rgfa_line(validate: @validate)
  name = gfa_line.path_name
  if @segments.has_key?(name)
    raise RGFA::DuplicatedLabelError,
      "Error when adding line: #{gfa_line}\n"+
      "a segment already exists with the name: #{name}\n"+
      "Segment: #{@segments[name]}"
  end
  if @paths.has_key?(name)
    raise RGFA::DuplicatedLabelError,
      "Error when adding line: #{gfa_line}\n"+
      "a path already exists with the name: #{name}\n"+
      "Path: #{@paths[name]}"
  end
  @paths[name] = gfa_line
  gfa_line.required_links.each do |from, to, cigar|
    link = nil
    # a link is searched only if both segments are known
    if segment(from.segment) and segment(to.segment)
      link = link_from_to(from, to, cigar)
    end
    if link.nil?
      # no such link in the graph: a virtual link stands in for it
      link = RGFA::Line::Link.new({:from => from.segment,
                                   :from_orient => from.orient,
                                   :to => to.segment,
                                   :to_orient => to.orient,
                                   :overlap => cigar},
                                  virtual: true)
      if @segments_first_order
        raise RGFA::LineMissingError, "Path: #{gfa_line}\n"+
          "requires a non-existing link:\n"+
          "#{link}"
      end
      add_link(link)
    end
    direct = link.compatible_direct?(from, to, cigar)
    gfa_line.links << [link, direct]
    link.paths << [gfa_line, direct]
  end
  gfa_line.segment_names.each do |oriented|
    # replace the segment name by the segment line itself
    oriented[0] = segment(oriented[0])
    oriented[0].paths[oriented[1]] << gfa_line
  end
end
52
+ protected :add_path
53
+
54
# Delete a path from the RGFA graph; the references to the path in its
# segments and links are removed as well.
# @param pt [String, RGFA::Line::Path] path name or instance
# @raise [RGFA::LineMissingError] if no such path exists
# @return [RGFA] self
def delete_path(pt)
  target = path!(pt)
  target.segment_names.each do |segment_name, orient|
    segment!(segment_name).paths[orient].delete(target)
  end
  target.links.each do |link, direct|
    link.paths.delete([target, direct])
  end
  @paths.delete(target.path_name)
  self
end
64
+
65
# All path lines of the graph
# @return [Array<RGFA::Line::Path>] a new array with the path lines
def paths
  @paths.each_value.to_a
end
70
+
71
# @!macro [new] path
#   Searches the path with name equal to +pt+.
#   If +pt+ is a line instance, it is returned as it is.
#   @param pt [String, RGFA::Line::Path] a path or path name
#   @return [RGFA::Line::Path] if a path is found
# @return [nil] if no such path exists in the RGFA instance
#
def path(pt)
  pt.kind_of?(RGFA::Line) ? pt : @paths[pt.to_sym]
end
81
+
82
# @!macro path
# @raise [RGFA::LineMissingError] if no such path exists in the RGFA instance
def path!(pt)
  # the result is kept in a separate variable, so that the requested name
  # is still available for the error message
  found = path(pt)
  raise RGFA::LineMissingError, "No path has name #{pt}" if found.nil?
  found
end
89
+
90
# @return [Array<RGFA::Line::Path>] paths whose +segment_names+ include the
#   specified segment.
# @!macro [new] segment_or_name
#   @param s [RGFA::Line::Segment, Symbol] a segment instance or name
def paths_with(s)
  owner = segment!(s)
  owner.all_paths
end
97
+
98
+ end