rgfa 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. checksums.yaml +7 -0
  2. data/bin/gfadiff.rb +420 -0
  3. data/bin/rgfa-findcrisprs.rb +208 -0
  4. data/bin/rgfa-mergelinear.rb +14 -0
  5. data/bin/rgfa-simdebruijn.rb +86 -0
  6. data/lib/rgfa.rb +376 -0
  7. data/lib/rgfa/byte_array.rb +74 -0
  8. data/lib/rgfa/cigar.rb +157 -0
  9. data/lib/rgfa/connectivity.rb +131 -0
  10. data/lib/rgfa/containments.rb +97 -0
  11. data/lib/rgfa/error.rb +3 -0
  12. data/lib/rgfa/field_array.rb +87 -0
  13. data/lib/rgfa/field_parser.rb +109 -0
  14. data/lib/rgfa/field_validator.rb +241 -0
  15. data/lib/rgfa/field_writer.rb +108 -0
  16. data/lib/rgfa/headers.rb +76 -0
  17. data/lib/rgfa/line.rb +721 -0
  18. data/lib/rgfa/line/containment.rb +87 -0
  19. data/lib/rgfa/line/header.rb +92 -0
  20. data/lib/rgfa/line/link.rb +379 -0
  21. data/lib/rgfa/line/path.rb +106 -0
  22. data/lib/rgfa/line/segment.rb +209 -0
  23. data/lib/rgfa/linear_paths.rb +285 -0
  24. data/lib/rgfa/lines.rb +155 -0
  25. data/lib/rgfa/links.rb +242 -0
  26. data/lib/rgfa/logger.rb +192 -0
  27. data/lib/rgfa/multiplication.rb +156 -0
  28. data/lib/rgfa/numeric_array.rb +196 -0
  29. data/lib/rgfa/paths.rb +98 -0
  30. data/lib/rgfa/rgl.rb +194 -0
  31. data/lib/rgfa/segment_ends_path.rb +9 -0
  32. data/lib/rgfa/segment_info.rb +162 -0
  33. data/lib/rgfa/segments.rb +99 -0
  34. data/lib/rgfa/sequence.rb +65 -0
  35. data/lib/rgfatools.rb +102 -0
  36. data/lib/rgfatools/artifacts.rb +29 -0
  37. data/lib/rgfatools/copy_number.rb +126 -0
  38. data/lib/rgfatools/invertible_segments.rb +104 -0
  39. data/lib/rgfatools/linear_paths.rb +140 -0
  40. data/lib/rgfatools/multiplication.rb +194 -0
  41. data/lib/rgfatools/p_bubbles.rb +66 -0
  42. data/lib/rgfatools/superfluous_links.rb +64 -0
  43. metadata +97 -0
@@ -0,0 +1,3 @@
1
# Base class of the library-specific errors; rescuing RGFA::Error
# catches every error class derived from it.
class RGFA::Error < StandardError
end
3
+
@@ -0,0 +1,87 @@
1
# Array representing multiple values of the same tag in different header lines.
#
# All elements share one GFA datatype, which is stored next to the values
# and written out as part of the serialized field (see #to_gfa_field).
class RGFA::FieldArray < Array
  # @return [RGFA::Line::OPTFIELD_DATATYPE] the datatype shared by the elements
  attr_reader :datatype

  # @param datatype [RGFA::Line::OPTFIELD_DATATYPE] the datatype to use
  # @param data [Array] the initial content of the field array
  def initialize(datatype, data = [])
    @datatype = datatype
    super(data)
  end

  # Run a datatype-specific validation on each element of the array
  # @param datatype [RGFA::Line::OPTFIELD_DATATYPE] (ignored, the datatype
  #   given on creation is always used)
  # @param fieldname [#to_s] the field name to use for error messages
  # @return [void]
  def validate_gfa_field!(datatype, fieldname=nil)
    # A block is required here: a bare +each+ returns an Enumerator, and
    # validating the Enumerator itself would not check any element.
    each { |element| element.validate_gfa_field!(@datatype, fieldname) }
  end

  # Default datatype, in this case :J
  # @api private
  def default_gfa_datatype
    :J
  end

  # Representation of the field array as JSON array, with
  # two additional values: the datatype and a zero byte as "signature".
  # The receiver is left untouched, so the method can be called repeatedly.
  # @param datatype [RGFA::Line::OPTFIELD_DATATYPE] (ignored, J is always used)
  # @return [String] the JSON representation
  # @api private
  def to_gfa_field(datatype: nil)
    # Serialize a copy: appending the signature to self would leave the
    # datatype and the zero byte among the values after the first call.
    (to_a + [@datatype, "\0"]).to_json
  end

  # Add a value to the array and validate
  # @raise [RGFA::FieldArray::TypeMismatchError] if the type
  #   of the new value does not correspond to the type of
  #   existing values
  # @param value [Object] the value to add
  # @param type [RGFA::Line::OPTFIELD_DATATYPE, nil] the datatype to use;
  #   if not +nil+, it will be checked that the specified datatype is the
  #   same as for previous elements of the field array;
  #   if +nil+, the value will be validated, according to the datatype
  #   specified on field array creation
  # @param fieldname [Symbol] the field name to use for error messages
  # @return [RGFA::FieldArray] self
  def push_with_validation(value, type, fieldname=nil)
    if type.nil?
      value.validate_gfa_field!(@datatype, fieldname)
    elsif type != @datatype
      raise RGFA::FieldArray::TypeMismatchError,
        "Datatype mismatch error for field #{fieldname}:\n"+
        "value: #{value}\n"+
        "existing datatype: #{@datatype};\n"+
        "new datatype: #{type}"
    end
    self << value
  end
end
58
+
59
# Generic error associated with field arrays
class RGFA::FieldArray::Error < RGFA::Error
end

# Error raised when trying to add elements with a wrong datatype
class RGFA::FieldArray::TypeMismatchError < RGFA::Error
end
64
+
65
+ class Array
66
class Array
  # Is this possibly a {RGFA::FieldArray} instance?
  #
  # (i.e. are the two last elements a datatype symbol
  # and a zero byte?)
  #
  # Arrays which are too short, or whose second-last element cannot be
  # converted to a Symbol (e.g. a number), are simply not field arrays.
  # @return [Boolean]
  def rgfa_field_array?
    tag = self[-2]
    # respond_to? guards against nil (array too short) and against
    # elements such as Integers, which have no #to_sym
    self[-1] == "\0" &&
      tag.respond_to?(:to_sym) &&
      RGFA::Line::OPTFIELD_DATATYPE.include?(tag.to_sym)
  end

  # Create a {RGFA::FieldArray} from an array
  #
  # If the array carries the field array signature (see #rgfa_field_array?),
  # the datatype stored in the array is used and the two signature elements
  # are dropped; otherwise the +datatype+ argument is required.
  # @param datatype [RGFA::Line::OPTFIELD_DATATYPE, nil] the datatype to use
  # @raise [RGFA::FieldArray::Error] if the array has no signature and
  #   no datatype is specified
  # @return [RGFA::FieldArray]
  def to_rgfa_field_array(datatype=nil)
    if rgfa_field_array?
      RGFA::FieldArray.new(self[-2].to_sym, self[0..-3])
    elsif datatype.nil?
      raise RGFA::FieldArray::Error, "no datatype specified"
    else
      RGFA::FieldArray.new(datatype, self)
    end
  end
end
@@ -0,0 +1,109 @@
1
+ require "json"
2
+ require_relative "byte_array"
3
+ require_relative "numeric_array"
4
+ require_relative "cigar"
5
+ require_relative "error"
6
+ require_relative "field_array"
7
+
8
#
# Methods to parse the string representations of the GFA fields
# @api private
#
module RGFA::FieldParser

  # Parse a string representation of a GFA field value
  # @raise [RGFA::FieldParser::FormatError] if string validation fails or
  #   a +:pos+ value is negative
  # @raise [RGFA::FieldParser::UnknownDatatypeError] if the datatype is
  #   not one of those handled below
  # @note +:i+, +:pos+ and +:f+ are converted using Kernel#Integer and
  #   Kernel#Float and +:J+ using JSON.parse; errors raised by these
  #   conversions are not wrapped
  # @param datatype [RGFA::Line::FIELD_DATATYPE]
  # @param validate_strings [Boolean] validate the content of the
  #   datatypes which are returned as String or Symbol
  # @param fieldname [#to_s] the field name to use for error messages
  # @param frozen [Boolean] freeze the returned value (where applicable)
  # @return [Object] the parsed value
  def parse_gfa_field(datatype: nil,
                      validate_strings: true,
                      fieldname: nil,
                      frozen: false)
    case datatype
    when :A, :Z, :seq
      # fieldname is a positional parameter of validate_gfa_field!
      validate_gfa_field!(datatype, fieldname) if validate_strings
      self.freeze if frozen
      return self
    when :lbl, :orn
      validate_gfa_field!(datatype, fieldname) if validate_strings
      return to_sym.freeze
    when :i
      return Integer(self)
    when :pos
      value = Integer(self)
      if value < 0
        raise RGFA::FieldParser::FormatError,
          "Invalid content for field #{fieldname}\n"+
          "Content: #{self.inspect}\n"+
          "Datatype: #{datatype}"
      end
      return value
    when :f
      return Float(self)
    when :H
      value = to_byte_array
      value.freeze if frozen
      return value
    when :B
      value = to_numeric_array
      value.freeze if frozen
      return value
    when :J
      value = JSON.parse(self)
      # RGFA convention for array of fields
      if value.kind_of?(Array) and value.rgfa_field_array?
        value = value.to_rgfa_field_array
      end
      # no need to freeze, as any Hash or Array will be valid
      return value
    when :cig
      value = to_cigar
      value.freeze if frozen
      return value
    when :cgs
      value = split(",").map do |c|
        c = c.to_cigar
        c.freeze if frozen
        c
      end
      value.freeze if frozen
      return value
    when :lbs
      value = split(",").map do |l|
        # the last character of each element is the orientation
        o = l[-1].to_sym
        l = l[0..-2]
        if validate_strings
          l.validate_gfa_field!(:lbl, "#{fieldname} "+
                                "(entire field content: #{self})")
        end
        os = [l.to_sym, o].to_oriented_segment
        os.freeze if frozen
        os
      end
      value.freeze if frozen
      return value
    else
      raise RGFA::FieldParser::UnknownDatatypeError,
        "Datatype unknown: #{datatype.inspect}"
    end
  end

  # Parses an optional field in the form tagname:datatype:value
  # and parses the value according to the datatype
  # @raise [RGFA::FieldParser::FormatError] if the string does not represent
  #   an optional field
  # @return [Array(Symbol, RGFA::Line::FIELD_DATATYPE, String)]
  #   the parsed content of the field
  def parse_gfa_optfield
    # \A and \z anchor the whole string; ^ and $ would accept any string
    # in which a single line looks like an optional field
    if self =~ /\A([A-Za-z][A-Za-z0-9]):([AifZJHB]):(.+)\z/
      return $1.to_sym, $2.to_sym, $3
    else
      raise RGFA::FieldParser::FormatError,
        "Expected optional field, found: #{self.inspect}"
    end
  end
end
100
+
101
# Error raised if the field content has an invalid format
class RGFA::FieldParser::FormatError < RGFA::Error
end

# Error raised if an unknown datatype symbol is used
class RGFA::FieldParser::UnknownDatatypeError < RGFA::Error
end
106
+
107
# Make the GFA field parsing methods available on every String
String.send(:include, RGFA::FieldParser)
@@ -0,0 +1,241 @@
1
+ require_relative "field_parser"
2
+ require_relative "line"
3
+
4
#
# Methods to validate the string representations of the GFA fields data
# @api private
#
module RGFA::FieldValidator

  # Validation regular expressions, derived from the GFA specification.
  #
  # All expressions are anchored with \A and \z, so that the entire string
  # must match (^ and $ only anchor single lines of the string).
  DATASTRING_VALIDATION_REGEXP = {
    :A => /\A[!-~]\z/, # Printable character
    :i => /\A[-+]?[0-9]+\z/, # Signed integer
    :f => /\A[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?\z/,
                           # Single-precision floating number
    :Z => /\A[ !-~]+\z/, # Printable string, including space
    :J => /\A[ !-~]+\z/, # JSON, excluding new-line and tab characters
    :H => /\A[0-9A-F]+\z/, # Byte array in the Hex format
    :B => /\A[cCsSiIf](,[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)+\z/,
                           # Integer or numeric array
    :lbl => /\A[!-)+-<>-~][!-~]*\z/, # segment/path label
    :orn => /\A[+-]\z/, # segment orientation (exactly one + or -)
    :lbs => /\A[!-)+-<>-~][!-~]*[+-](,[!-)+-<>-~][!-~]*[+-])+\z/,
                           # multiple labels with orientations, comma-sep
    :seq => /\A(\*|[A-Za-z=.]+)\z/, # nucleotide sequence
    :pos => /\A[0-9]+\z/, # non-negative integer (at least one digit)
    :cig => /\A(\*|(([0-9]+[MIDNSHPX=])+))\z/, # CIGAR string
    :cgs => /\A(\*|(([0-9]+[MIDNSHPX=])+))(,(\*|(([0-9]+[MIDNSHPX=])+)))*\z/,
                           # multiple CIGARs, comma-sep
  }

  # Validates the string according to the provided datatype
  # @param datatype [RGFA::Line::FIELD_DATATYPE]
  # @param fieldname [#to_s] Fieldname to use in the error msg
  #   (<i>defaults to: "Value"</i>)
  # @raise [RGFA::FieldParser::UnknownDatatypeError] if no validation
  #   regexp exists for the datatype
  # @raise [RGFA::FieldParser::FormatError] if the string does not match
  #   the regexp for the provided datatype
  # @return [void]
  # @api private
  def validate_gfa_field!(datatype, fieldname=nil)
    regexp = DATASTRING_VALIDATION_REGEXP[datatype]
    if regexp.nil?
      raise RGFA::FieldParser::UnknownDatatypeError,
        "Datatype unknown: #{datatype.inspect}"
    end
    return if regexp =~ self
    fieldname ||= "Value"
    raise RGFA::FieldParser::FormatError,
      "Wrong format for field #{fieldname}: \n"+
      "Content: #{self.inspect}\n"+
      "Datatype: #{datatype}\n"+
      "Expected regex: #{regexp}\n"
  end

end
53
+
54
# Make the regexp-based GFA field validation available on every String
String.send(:include, RGFA::FieldValidator)
57
+
58
class Object
  # @!macro [new] validate_gfa_field
  #   Validates the object according to the provided datatype
  #   @param datatype [RGFA::Line::FIELD_DATATYPE]
  #   @param fieldname [#to_s] Fieldname to use in the error msg
  #   @raise [RGFA::FieldParser::FormatError] if the object type or content
  #     is not compatible to the provided datatype
  #   @return [void]
  #   @api private
  #
  # This fallback always raises: it is reached only by objects of classes
  # which do not define a validation method of their own.
  def validate_gfa_field!(datatype, fieldname=nil)
    details = [
      "Wrong type (#{self.class}) for field #{fieldname}",
      "Content: #{inspect}",
      "Datatype: #{datatype}",
    ]
    raise RGFA::FieldParser::FormatError, details.join("\n")
  end
end
74
+
75
class Symbol
  # @!macro validate_gfa_field
  #
  # Symbols are accepted for the datatypes :lbl, :orn and :Z only; their
  # content is then checked by the validation of the String representation.
  def validate_gfa_field!(datatype, fieldname=nil)
    if datatype != :lbl and datatype != :orn and datatype != :Z
      raise RGFA::FieldParser::FormatError,
          "Wrong type (#{self.class}) for field #{fieldname}\n"+
          "Content: #{self.inspect}\n"+
          "Datatype: #{datatype}"
    end
    # fieldname is handed over, so that a format error names the field
    self.to_s.validate_gfa_field!(datatype, fieldname)
  end
end
87
+
88
class Hash
  # @!macro validate_gfa_field
  #
  # A Hash is accepted for the :J datatype only; its content is not checked.
  def validate_gfa_field!(datatype, fieldname=nil)
    return if datatype == :J
    raise RGFA::FieldParser::FormatError,
      "Wrong type (#{self.class}) for field #{fieldname}\n" \
      "Content: #{inspect}\n" \
      "Datatype: #{datatype}"
  end
end
99
+
100
class Array
  # @!macro validate_gfa_field
  #
  # Arrays are accepted without further checks for :J and :Z; for :lbs,
  # :cig, :cgs, :B and :H they are converted to the corresponding
  # representation, which is then validated. Any error raised by the
  # conversion or validation is reported as a FormatError.
  #
  # Note: for :lbs the elements are converted in place (+map!+).
  def validate_gfa_field!(datatype, fieldname=nil)
    return if [:J, :Z].include?(datatype)
    unless [:lbs, :cig, :cgs, :B, :H].include?(datatype)
      raise RGFA::FieldParser::FormatError,
        "Wrong type (#{self.class}) for field #{fieldname}\n" \
        "Content: #{inspect}\n" \
        "Datatype: #{datatype}"
    end
    begin
      case datatype
      when :lbs then map!(&:to_oriented_segment).each(&:validate!)
      when :cig then to_cigar.validate!
      when :cgs then map(&:to_cigar).each(&:validate!)
      when :B   then to_numeric_array.validate!
      when :H   then to_byte_array.validate!
      end
    rescue => failure
      raise RGFA::FieldParser::FormatError,
        "Invalid content for field #{fieldname}\n" \
        "Content: #{inspect}\n" \
        "Datatype: #{datatype}\n" \
        "Error: #{failure}"
    end
    nil
  end
end
138
+
139
class RGFA::ByteArray
  # @!macro validate_gfa_field
  #
  # Byte arrays are accepted for the :H datatype only; the content is
  # checked using #validate!, whose errors are reported as FormatError.
  def validate_gfa_field!(datatype, fieldname=nil)
    unless datatype == :H
      raise RGFA::FieldParser::FormatError,
        "Wrong type (#{self.class}) for field #{fieldname}\n" \
        "Content: #{inspect}\n" \
        "Datatype: #{datatype}"
    end
    begin
      validate!
    rescue => failure
      raise RGFA::FieldParser::FormatError,
        "Invalid content for field #{fieldname}\n" \
        "Content: #{inspect}\n" \
        "Datatype: #{datatype}\n" \
        "Error: #{failure}"
    end
  end
end
159
+
160
class RGFA::CIGAR
  # @!macro validate_gfa_field
  #
  # CIGARs are accepted for the :cig datatype only; the content is
  # checked using #validate!, whose errors are reported as FormatError.
  def validate_gfa_field!(datatype, fieldname=nil)
    unless datatype == :cig
      raise RGFA::FieldParser::FormatError,
        "Wrong type (#{self.class}) for field #{fieldname}\n" \
        "Content: #{inspect}\n" \
        "Datatype: #{datatype}"
    end
    begin
      validate!
    rescue => failure
      raise RGFA::FieldParser::FormatError,
        "Invalid content for field #{fieldname}\n" \
        "Content: #{inspect}\n" \
        "Datatype: #{datatype}\n" \
        "Error: #{failure}"
    end
  end
end
180
+
181
class RGFA::NumericArray
  # @!macro validate_gfa_field
  #
  # Numeric arrays are accepted for the :B datatype only; the content is
  # checked using #validate!, whose errors are reported as FormatError.
  def validate_gfa_field!(datatype, fieldname=nil)
    unless datatype == :B
      raise RGFA::FieldParser::FormatError,
        "Wrong type (#{self.class}) for field #{fieldname}\n" \
        "Content: #{inspect}\n" \
        "Datatype: #{datatype}"
    end
    begin
      validate!
    rescue => failure
      raise RGFA::FieldParser::FormatError,
        "Invalid content for field #{fieldname}\n" \
        "Content: #{inspect}\n" \
        "Datatype: #{datatype}\n" \
        "Error: #{failure}"
    end
  end
end
201
+
202
class Float
  # @!macro validate_gfa_field
  #
  # A Float is accepted for the datatypes :f and :Z.
  def validate_gfa_field!(datatype, fieldname=nil)
    return if datatype == :f || datatype == :Z
    raise RGFA::FieldParser::FormatError,
      "Wrong type (#{self.class}) for field #{fieldname}\n" \
      "Content: #{inspect}\n" \
      "Datatype: #{datatype}"
  end
end
213
+
214
# Integer is reopened instead of Fixnum: the Fixnum constant does not exist
# anymore in current Ruby versions (there, +class Fixnum+ would define a new,
# unrelated class), while Integer is available in all versions.
class Integer
  # @!macro validate_gfa_field
  #
  # An Integer is accepted for the datatypes :i, :f and :Z and,
  # if it is not negative, for :pos.
  def validate_gfa_field!(datatype, fieldname=nil)
    if (datatype == :pos and self < 0)
      raise RGFA::FieldParser::FormatError,
          "Invalid content for field #{fieldname}\n"+
          "Content: #{self.inspect}\n"+
          "Datatype: #{datatype}"
    elsif ![:i, :f, :Z, :pos].include?(datatype)
      raise RGFA::FieldParser::FormatError,
          "Wrong type (#{self.class}) for field #{fieldname}\n"+
          "Content: #{self.inspect}\n"+
          "Datatype: #{datatype}"
    end
  end
end
230
+
231
class RGFA::Line::Segment
  # @!macro validate_gfa_field
  #
  # A segment line is accepted in place of a segment label (:lbl) only.
  def validate_gfa_field!(datatype, fieldname=nil)
    return if datatype == :lbl
    raise RGFA::FieldParser::FormatError,
      "Wrong type (#{self.class}) for field #{fieldname}\n" \
      "Content: <RGFA::Segment:#{to_s}>\n" \
      "Datatype: #{datatype}"
  end
end
@@ -0,0 +1,108 @@
1
+ require "json"
2
+ require_relative "byte_array"
3
+ require_relative "numeric_array"
4
+ require_relative "line"
5
+
6
#
# Methods to convert ruby objects to the GFA string representations
# @api private
#
# This module implements the default conversion and is included in Object.
# Classes needing a different behaviour may overwrite:
# - {#default_gfa_datatype}: the symbol of the optional field GFA datatype
#   which is used if none is specified (See RGFA::Line::FIELD_DATATYPE);
#   the default is :Z
# - {#to_gfa_field}: the GFA string representation, eventually depending
#   on the specified datatype; no validation is done; the default is #to_s
#
module RGFA::FieldWriter

  # @!macro [new] to_gfa_field
  #   Representation of the data for GFA fields; this method
  #   does not (in general) validate the string. The method
  #   can be overwritten for a given class, and may take
  #   the {#default_gfa_datatype} into consideration.
  #   @return [String]
  #   @api private
  def to_gfa_field(datatype: nil)
    to_s
  end

  # Representation of the data as an optional field
  # (in the form tagname:datatype:value)
  # @param fieldname [Symbol] the tag name
  # @param datatype [RGFA::Line::OPTFIELD_DATATYPE] (<i>defaults to: the value
  #   returned by {#default_gfa_datatype}</i>)
  # @return [String]
  # @api private
  def to_gfa_optfield(fieldname, datatype: default_gfa_datatype)
    content = to_gfa_field(datatype: datatype)
    "#{fieldname}:#{datatype}:#{content}"
  end

  # @!macro [new] gfa_datatype
  #   Optional field GFA datatype to use, if none is provided
  #   @return [RGFA::Line::FIELD_DATATYPE]
  #   @api private
  def default_gfa_datatype
    :Z
  end
end
45
+
46
# Make the default GFA field conversion methods available on every object
Object.send(:include, RGFA::FieldWriter)
49
+
50
# Integer is reopened instead of Fixnum: the Fixnum constant does not exist
# anymore in current Ruby versions (there, +class Fixnum+ would define a new,
# unrelated class), while Integer is available in all versions.
class Integer
  # @!macro gfa_datatype
  #
  # For integers this is :i
  def default_gfa_datatype; :i; end
end
54
+
55
class Float
  # @!macro gfa_datatype
  #
  # For floats this is :f
  def default_gfa_datatype
    :f
  end
end
59
+
60
class Hash
  # @!macro to_gfa_field
  #
  # For a Hash this is its JSON representation, whatever the datatype.
  def to_gfa_field(datatype: nil)
    to_json
  end

  # @!macro gfa_datatype
  #
  # For a Hash this is :J
  def default_gfa_datatype
    :J
  end
end
67
+
68
class Array
  # @!macro to_gfa_field
  #
  # The array is converted according to the datatype; for datatypes without
  # a specific conversion, the string representations of the elements are
  # joined by commas.
  def to_gfa_field(datatype: default_gfa_datatype)
    case datatype
    when :B   then to_numeric_array.to_s
    when :J   then to_json
    when :cig then to_cigar.to_s
    when :cgs then map { |element| element.to_cigar.to_s }.join(",")
    when :lbs then map { |element| element.to_oriented_segment.to_s }.join(",")
    when :H   then to_byte_array.to_s
    else           map(&:to_s).join(",")
    end
  end

  # @!macro gfa_datatype
  #
  # For an Array this is :B, if all elements are Integer or all elements
  # are Float, and :J otherwise.
  def default_gfa_datatype
    only_integers = all? { |element| element.kind_of?(Integer) }
    if only_integers || all? { |element| element.kind_of?(Float) }
      :B
    else
      :J
    end
  end
end
94
+
95
class RGFA::ByteArray
  # @!macro gfa_datatype
  #
  # For byte arrays this is :H
  def default_gfa_datatype
    :H
  end
end
99
+
100
class RGFA::NumericArray
  # @!macro gfa_datatype
  #
  # For numeric arrays this is :B
  def default_gfa_datatype
    :B
  end
end
104
+
105
class RGFA::Line::Segment
  # @!macro to_gfa_field
  #
  # For a segment line this is the string form of the result of #to_sym.
  def to_gfa_field(datatype: nil)
    to_sym.to_s
  end
end