rgfa 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. checksums.yaml +7 -0
  2. data/bin/gfadiff.rb +420 -0
  3. data/bin/rgfa-findcrisprs.rb +208 -0
  4. data/bin/rgfa-mergelinear.rb +14 -0
  5. data/bin/rgfa-simdebruijn.rb +86 -0
  6. data/lib/rgfa.rb +376 -0
  7. data/lib/rgfa/byte_array.rb +74 -0
  8. data/lib/rgfa/cigar.rb +157 -0
  9. data/lib/rgfa/connectivity.rb +131 -0
  10. data/lib/rgfa/containments.rb +97 -0
  11. data/lib/rgfa/error.rb +3 -0
  12. data/lib/rgfa/field_array.rb +87 -0
  13. data/lib/rgfa/field_parser.rb +109 -0
  14. data/lib/rgfa/field_validator.rb +241 -0
  15. data/lib/rgfa/field_writer.rb +108 -0
  16. data/lib/rgfa/headers.rb +76 -0
  17. data/lib/rgfa/line.rb +721 -0
  18. data/lib/rgfa/line/containment.rb +87 -0
  19. data/lib/rgfa/line/header.rb +92 -0
  20. data/lib/rgfa/line/link.rb +379 -0
  21. data/lib/rgfa/line/path.rb +106 -0
  22. data/lib/rgfa/line/segment.rb +209 -0
  23. data/lib/rgfa/linear_paths.rb +285 -0
  24. data/lib/rgfa/lines.rb +155 -0
  25. data/lib/rgfa/links.rb +242 -0
  26. data/lib/rgfa/logger.rb +192 -0
  27. data/lib/rgfa/multiplication.rb +156 -0
  28. data/lib/rgfa/numeric_array.rb +196 -0
  29. data/lib/rgfa/paths.rb +98 -0
  30. data/lib/rgfa/rgl.rb +194 -0
  31. data/lib/rgfa/segment_ends_path.rb +9 -0
  32. data/lib/rgfa/segment_info.rb +162 -0
  33. data/lib/rgfa/segments.rb +99 -0
  34. data/lib/rgfa/sequence.rb +65 -0
  35. data/lib/rgfatools.rb +102 -0
  36. data/lib/rgfatools/artifacts.rb +29 -0
  37. data/lib/rgfatools/copy_number.rb +126 -0
  38. data/lib/rgfatools/invertible_segments.rb +104 -0
  39. data/lib/rgfatools/linear_paths.rb +140 -0
  40. data/lib/rgfatools/multiplication.rb +194 -0
  41. data/lib/rgfatools/p_bubbles.rb +66 -0
  42. data/lib/rgfatools/superfluous_links.rb +64 -0
  43. metadata +97 -0
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+ require "rgfatools"
3
+
4
# Command-line entry point: expects exactly one argument, the name of the
# GFA file to process; otherwise a usage message is printed on the standard
# error and the script exits with status 1.
unless ARGV.size == 1
  STDERR.puts "Usage: #$0 <gfa>"
  exit 1
end

input_path = ARGV.first

# Validations are turned off before the file is parsed; the graph resulting
# from merging the linear paths is printed on the standard output.
graph = RGFA.new
graph.enable_progress_logging(part: 0.01)
graph.turn_off_validations
graph.read_file(input_path)
graph.merge_linear_paths(disable_tracking: true, merged_name: :short)
puts graph
@@ -0,0 +1,86 @@
1
+ #!/usr/bin/env ruby
2
+ require "rgfatools"
3
+ require "set"
4
+
5
# Reads the sequences contained in a FASTA-like file.
#
# Every line starting with ">" opens a new sequence (the header text itself
# is not stored); every other line is appended, without its line terminator,
# to the sequence opened last. Blank lines before the first header are
# skipped.
#
# @param filename [String] name of the file to read
# @param logger [#progress_init, #progress_log, #progress_end]
#   receives one +progress_log(:read_file)+ call per line of the file
# @raise [ArgumentError] if sequence data is found before the first header
# @return [Array<String>] the sequences, in the order found in the file
def read_sequences(filename, logger)
  # The lines are counted in Ruby instead of running `wc -l` in a shell:
  # interpolating the filename into a shell command fails for names which
  # contain spaces and executes any shell metacharacters they contain.
  linecount = File.foreach(filename).count
  logger.progress_init(:read_file, "lines", linecount,
                       "Parse sequences from file with #{linecount} lines")
  sequences = []
  # File.foreach closes the file also if the block raises an exception.
  File.foreach(filename) do |line|
    if line[0] == ">"
      sequences << ""
    elsif sequences.empty?
      # No sequence has been opened yet: there is nothing to append to.
      unless line.strip.empty?
        raise ArgumentError,
              "#{filename}: sequence data found before the first '>' header"
      end
    else
      sequences.last << line.chomp
    end
    logger.progress_log(:read_file)
  end
  logger.progress_end(:read_file)
  sequences
end
23
+
24
# Command-line entry point: computes a graph from the k-mers of the
# sequences of a FASTA file and prints it, in GFA format, on the standard
# output (header line, then segment lines, then link lines).
if ARGV.size != 2
  STDERR.puts "Usage: #$0 <k> <genome.fas>"
  exit 1
end

k = Integer(ARGV[0])
# The segments are (k-1)-mers and the links overlap by k-2 positions:
# a smaller k would give empty segments and a negative overlap length.
if k < 2
  STDERR.puts "Error: k must be an integer >= 2 (got #{k})"
  exit 1
end

logger = RGFA::Logger.new()
logger.enable_progress(part: 0.1)
sequences = read_sequences(ARGV[1], logger)
logger.log("Sequence lengths (nt): #{sequences.map(&:size)}")
# key: (k-1)-mer (as Symbol); value: [segment number, occurrences count]
segments = {}
# link lines; a Set is used, so that each distinct link is output once
links = Set.new
# A sequence shorter than k contains no k-mer, thus it counts as 0; the
# start value 0 gives an Integer also if there are no sequences at all.
kmercount = sequences.inject(0) {|sum, seq| sum + [seq.length-k+1, 0].max}
logger.progress_init(:generate_graph, "kmers", kmercount,
                     "Create graph from #{kmercount} kmers")
i=1 # number which will be assigned to the next new segment
sequences.each do |seq|
  0.upto(seq.length-k) do |pos|
    kmer = seq[pos..(pos+k-1)].downcase
    prefix = kmer[0..k-2]
    suffix = kmer[1..k-1]
    link = "L"
    [prefix, suffix].each do |km1mer|
      orient = "+"
      # NOTE(review): String#rc is not defined in this script; presumably
      # the reverse complement provided by the RGFA library -- confirm.
      km1mer_rc = km1mer.rc
      # the smaller of the two strings is used as segment sequence
      if km1mer > km1mer_rc
        km1mer = km1mer_rc
        orient = "-"
      end
      s = segments[km1mer.to_sym]
      if s.nil?
        s = [i,0]
        segments[km1mer.to_sym] = s
        i+=1
      end
      s[1] += 1
      link << "\t#{s[0]}\t#{orient}"
    end
    link << "\t#{k-2}M"
    links << link
    # i is the number of the next segment, thus i-1 segments exist
    logger.progress_log(:generate_graph, segments_added: i-1,
                        links_added: links.size)
  end
end
logger.progress_end(:generate_graph)
segmentscount = i-1
linkscount = links.size
puts "H\tks:i:#{k}"
logger.progress_init(:write_segments, "segments", segmentscount,
                     "Output #{segmentscount} segments")
segments.each do |km1mer, data|
  puts "S\t#{data[0]}\t#{km1mer}\tKC:i:#{data[1]}"
  logger.progress_log(:write_segments)
end
logger.progress_end(:write_segments)
logger.progress_init(:write_links, "links", linkscount,
                     "Output #{linkscount} links")
links.each do |link|
  puts link
  logger.progress_log(:write_links)
end
logger.progress_end(:write_links)
@@ -0,0 +1,376 @@
1
+ # (c) 2016, Giorgio Gonnella, ZBH, Uni-Hamburg <gonnella@zbh.uni-hamburg.de>
2
+
3
+ # Main class of the RGFA library.
4
+ #
5
+ # RGFA provides a representation of a GFA graph.
6
+ # It supports creating a graph from scratch, input and output from/to file
7
+ # or strings, as well as several operations on the graph.
8
+ # The examples below show how to create a RGFA object from scratch or
9
+ # from a GFA file, write the RGFA to file, output the string representation or
10
+ # a statistics report, and control the validation level.
11
+ #
12
+ # == Interacting with the graph
13
+ #
14
+ # - {RGFA::Lines}: module with methods for finding, editing, iterating over,
15
+ # removing lines belonging to a RGFA instance. Specialized modules exist
16
+ # for each kind of line:
17
+ # - {RGFA::Headers}: accessing and creating header information is done
18
+ # using a single header line object ({#header RGFA#header})
19
+ # - {RGFA::Segments}
20
+ # - {RGFA::Links}
21
+ # - {RGFA::Containments}
22
+ # - {RGFA::Paths}
23
+ #
24
+ # - {RGFA::Line}: most interactions with the GFA involve interacting with
25
+ # its records, i.e. instances of a subclass of this class. Subclasses:
26
+ # - {RGFA::Line::Header}
27
+ # - {RGFA::Line::Segment}
28
+ # - {RGFA::Line::Link}
29
+ # - {RGFA::Line::Containment}
30
+ # - {RGFA::Line::Path}
31
+ #
32
+ # - Further modules contain methods useful for interacting with the graph
33
+ # - {RGFA::Connectivity} analysis of the connectivity of the graph
34
+ # - {RGFA::LinearPaths} finding and merging of linear paths
35
+ # - {RGFA::Multiplication} separation of the implicit instances of a repeat
36
+ #
37
+ # - Additional functionality is provided by {RGFATools}
38
+ #
39
+ # @example Creating an empty RGFA object
40
+ # gfa = RGFA.new
41
+ #
42
+ # @example Parsing and writing GFA format
43
+ # gfa = RGFA.from_file(filename) # parse GFA file
44
+ # gfa.to_file(filename) # write to GFA file
45
+ # puts gfa # show GFA representation of RGFA object
46
+ #
47
+ # @example Basic statistics report
48
+ # puts gfa.info # print report
49
+ # puts gfa.info(short = true) # compact format, in one line
50
+ #
51
+ # @example Validation
52
+ # gfa = RGFA.from_file(filename, validate: 1) # default level is 2
53
+ # gfa.validate = 3 # change validation level
54
+ # gfa.turn_off_validations # equivalent to gfa.validate = 0
55
+ # gfa.validate! # run post-validations (e.g. check segment names in links)
56
+ #
57
+ class RGFA
58
+ end
59
+
60
+ require_relative "./rgfa/byte_array.rb"
61
+ require_relative "./rgfa/cigar.rb"
62
+ require_relative "./rgfa/connectivity.rb"
63
+ require_relative "./rgfa/containments.rb"
64
+ require_relative "./rgfa/field_array.rb"
65
+ require_relative "./rgfa/field_parser.rb"
66
+ require_relative "./rgfa/field_validator.rb"
67
+ require_relative "./rgfa/field_writer.rb"
68
+ require_relative "./rgfa/multiplication.rb"
69
+ require_relative "./rgfa/headers.rb"
70
+ require_relative "./rgfa/line.rb"
71
+ require_relative "./rgfa/linear_paths.rb"
72
+ require_relative "./rgfa/lines.rb"
73
+ require_relative "./rgfa/links.rb"
74
+ require_relative "./rgfa/logger.rb"
75
+ require_relative "./rgfa/numeric_array.rb"
76
+ require_relative "./rgfa/rgl.rb"
77
+ require_relative "./rgfa/segment_ends_path.rb"
78
+ require_relative "./rgfa/segment_info.rb"
79
+ require_relative "./rgfa/segments.rb"
80
+ require_relative "./rgfa/paths.rb"
81
+ require_relative "./rgfa/sequence.rb"
82
+
83
+ class RGFA
84
+
85
+ include RGFA::Lines
86
+ include RGFA::Headers
87
+ include RGFA::Segments
88
+ include RGFA::Links
89
+ include RGFA::Containments
90
+ include RGFA::Paths
91
+ include RGFA::LinearPaths
92
+ include RGFA::Connectivity
93
+ include RGFA::Multiplication
94
+ include RGFA::LoggerSupport
95
+ include RGFA::RGL
96
+
97
+ attr_accessor :validate
98
+
99
# Creates an instance which contains no segments, links, containments
# or paths.
#
# @!macro validate
#   @param validate [Integer] (<i>defaults to: +2+</i>)
#     the validation level; see "Validation level" under
#     {RGFA::Line#initialize}.
def initialize(validate: 2)
  @validate = validate
  # must follow the assignment of @validate, which init_headers reads
  init_headers
  # collections of lines
  @segments, @paths = {}, {}
  @links, @containments = [], []
  @default = {count_tag: :RC, unit_length: 1}
  # flags; all are off in a new instance
  @segments_first_order = false
  @progress = false
  @extensions_enabled = false
end
115
+
116
# Turns on the "segments first" ordering requirement: links, containments
# and paths which refer to a segment must then be added after that segment.
# If this method is not called, no particular ordering is required.
#
# @return [void]
def require_segments_first_order
  @segments_first_order = true
end
124
+
125
# Disables the validations, by setting the validation level to 0.
# The levels are described in "Validation level" under
# {RGFA::Line#initialize}.
#
# @return [void]
def turn_off_validations
  @validate = 0
end
131
+
132
# Names of the segments of the graph, i.e. the keys of the segments
# collection, leaving out +nil+.
#
# @return [Array<Symbol>]
def segment_names
  @segments.each_key.reject(&:nil?)
end
137
+
138
# Names of the path lines of the graph, i.e. the keys of the paths
# collection, leaving out +nil+.
#
# @return [Array<Symbol>]
def path_names
  @paths.each_key.reject(&:nil?)
end
143
+
144
# Post-validation of the RGFA: runs the check of the references to segments
# first, then the check of the links required by paths.
#
# @return [void]
# @raise if one of the checks fails
def validate!
  validate_segment_references!
  validate_path_links!
  nil
end
152
+
153
# String representation of the RGFA, conforming to the current
# specifications: the string form of each line yielded by +each_line+,
# each one followed by a newline.
#
# @return [String]
def to_s
  text = ""
  each_line { |record| text << record.to_s << "\n" }
  text
end
161
+
162
# Conversion to RGFA; as the receiver already is one, no conversion is
# needed and the receiver itself is returned.
#
# @return [self]
def to_rgfa
  self
end
167
+
168
# Create a copy of the RGFA instance.
#
# The copy is obtained by parsing the string representation of the
# instance, with the validations turned off. Thereafter, it is assigned the
# validation level of the original; progress logging and the segments-first
# ordering requirement are enabled in the copy, if they are enabled in the
# original.
#
# @return [RGFA]
def clone
  to_s.to_rgfa(validate: 0).tap do |copy|
    copy.validate = @validate
    copy.enable_progress_logging if @progress
    copy.require_segments_first_order if @segments_first_order
  end
end
177
+
178
# Populates a RGFA instance reading from file with specified +filename+.
#
# Each line of the file is added to the instance, without its line
# terminator. If progress logging is enabled, the lines are counted first
# and the progress is logged after each line. After reading, the
# post-validations are run, if the validation level is at least 1.
#
# @param [String] filename
# @raise if file cannot be opened for reading
# @return [self]
def read_file(filename)
  if @progress
    # The lines are counted in Ruby instead of running `wc -l` in a shell:
    # interpolating the filename into a shell command fails for names which
    # contain spaces and executes any shell metacharacters they contain.
    linecount = File.foreach(filename).count
    progress_log_init(:read_file, "lines", linecount,
                      "Parse file with #{linecount} lines")
  end
  File.foreach(filename) do |line|
    self << line.chomp
    progress_log(:read_file) if @progress
  end
  progress_log_end(:read_file) if @progress
  validate! if @validate >= 1
  self
end
196
+
197
# Creates a RGFA instance with the given validation level and populates it
# by parsing the file with specified +filename+.
#
# @param [String] filename
# @raise if file cannot be opened for reading
# @!macro validate
# @return [RGFA]
def self.from_file(filename, validate: 2)
  RGFA.new(validate: validate).tap { |gfa| gfa.read_file(filename) }
end
207
+
208
# Writes each line of the RGFA to the file with specified +filename+;
# if the file already exists, its previous content is replaced.
#
# @param [String] filename
# @raise if file cannot be opened for writing
# @return [void]
def to_file(filename)
  File.open(filename, "w") do |out|
    each_line { |record| out.puts(record) }
  end
end
216
+
217
# Output basic statistics about the graph's sequence and topology
# information.
#
# @param [boolean] short compact output as a single text line
#
# Compact output has the following keys:
# - +ns+: number of segments
# - +nl+: number of links
# - +cc+: number of connected components
# - +de+: number of dead ends
# - +tl+: total length of segment sequences
# - +50+: N50 segment sequence length
#
# Normal output contains the same information, plus some additional
# values: the length of the largest component, as well as the shortest and
# largest and 1st/2nd/3rd quartiles of segment sequence length.
#
# @return [String, Array<String>] sequence and topology information
#   collected from the graph: a single tab-separated line, if +short+ is
#   true; otherwise an array containing one "label: value" string for each
#   statistic (+puts+ prints it one statistic per line).
#
def info(short = false)
  q, n50, tlen = lenstats
  nde = n_dead_ends()
  cc = connected_components()
  if short
    return "ns=#{segments.size}\t"+
           "nl=#{links.size}\t"+
           "cc=#{cc.size}\t"+
           "de=#{nde}\t"+
           "tl=#{tlen}\t"+
           "50=#{n50}"
  end
  pde = "%.2f%%" % ((nde.to_f*100) / (segments.size*2))
  # Total sequence length of each component; the array returned by
  # connected_components is not modified. As the components are in no
  # particular order of length, the largest is selected using max.
  cc_lengths = cc.map {|c| c.map {|sn| segment!(sn).length!}.inject(0, :+)}
  [
    "Segment count: #{segments.size}",
    "Links count: #{links.size}",
    "Total length (bp): #{tlen}",
    "Dead ends: #{nde}",
    "Percentage dead ends: #{pde}",
    "Connected components: #{cc.size}",
    "Largest component (bp): #{cc_lengths.max}",
    "N50 (bp): #{n50}",
    "Shortest segment (bp): #{q[0]}",
    "Lower quartile segment (bp): #{q[1]}",
    "Median segment (bp): #{q[2]}",
    "Upper quartile segment (bp): #{q[3]}",
    "Longest segment (bp): #{q[4]}"
  ]
end
267
+
268
# Number of dead ends of the graph.
#
# A dead end is here a segment end (+:E+ or +:B+ of a segment) for which
# +links_of+ returns an empty collection, i.e. an end without connections.
#
# @return [Integer] number of dead ends in the graph
#
def n_dead_ends
  dead_ends = 0
  segments.each do |segment|
    [:E, :B].each do |end_type|
      dead_ends += 1 if links_of([segment.name, end_type]).empty?
    end
  end
  dead_ends
end
280
+
281
# Compare two RGFA instances, by comparing their segments, links,
# containments, headers and paths (in this order; the comparison stops at
# the first difference).
#
# @return [Boolean] are the lines of the two instances equivalent?
def ==(other)
  segments == other.segments &&
    links == other.links &&
    containments == other.containments &&
    headers == other.headers &&
    paths == other.paths
end
290
+
291
+ private
292
+
293
# Statistics about the sequence lengths of the segments.
#
# @return [Array] three values:
#   - an array of five lengths: shortest, lower quartile, median, upper
#     quartile and longest (all +nil+, if there are no segments)
#   - the N50, i.e. the largest length such that the segments which are at
#     least that long contain at least half of the total length (+nil+, if
#     there are no segments)
#   - the total length (0, if there are no segments)
def lenstats
  sln = segments.map(&:length!).sort
  n = sln.size
  tlen = sln.inject(0, :+)
  n50 = nil
  sum = 0
  sln.reverse_each do |l|
    sum += l
    # The doubled sum is compared to the total, as the integer division
    # tlen/2 rounds down, which makes the N50 too large for odd totals.
    if sum * 2 >= tlen
      n50 = l
      break
    end
  end
  # The index is not allowed to become negative: for less than four
  # segments, a negative index would select the longest segment.
  at_fraction = lambda {|num, den| sln[[((n * num) / den) - 1, 0].max]}
  q = [sln[0], at_fraction.call(1, 4), at_fraction.call(1, 2),
       at_fraction.call(3, 4), sln[-1]]
  return q, n50, tlen
end
309
+
310
# Checks that L, C and P refer to existing S, i.e. that no segment of the
# segments collection is virtual.
#
# @return [void]
# @raise [RGFA::LineMissingError] for the first virtual segment found; the
#   message lists the lines which refer to it
def validate_segment_references!
  @segments.each_value do |segment|
    next unless segment.virtual?
    raise RGFA::LineMissingError, "Segment #{segment.name} does not exist\n"+
      "References to #{segment.name} were found in the following lines:\n"+
      segment.all_references.map(&:to_s).join("\n")
  end
  nil
end
323
+
324
# Checks that P are supported by links, i.e. that none of the links of the
# paths is virtual.
#
# @return [void]
# @raise [RGFA::LineMissingError] for the first virtual link found; the
#   message lists the paths which require it
def validate_path_links!
  @paths.each_value do |path|
    path.links.each do |link, _direction|
      next unless link.virtual?
      raise RGFA::LineMissingError, "Link: #{link.to_s}\n"+
        "does not exist, but is required by the paths:\n"+
        link.paths.map {|requiring_path, _dir| requiring_path.to_s}.join("\n")
    end
  end
  nil
end
339
+
340
# Assigns to the headers a new header line, created from an empty list of
# fields, using the validation level of the instance.
def init_headers
  @headers = RGFA::Line::Header.new([], validate: @validate)
end
343
+
344
+ end
345
+
346
# Ruby core String class, with additional methods.
class String

  # Creates a +RGFA+ instance from the content of the +String+: the string
  # is split at the newlines and each of the resulting lines is added to the
  # gfa. Thereafter the post-validations are run, if the validation level
  # is at least 1.
  # @return [RGFA]
  # @!macro validate
  def to_rgfa(validate: 2)
    RGFA.new(validate: validate).tap do |gfa|
      split("\n").each { |line| gfa << line }
      gfa.validate! if validate >= 1
    end
  end

end
361
+
362
# Ruby core Array class, with additional methods.
class Array

  # Creates a +RGFA+ instance from an +Array+ of strings or RGFA::Line
  # instances: each element is added to the gfa, in the order of the array.
  # Thereafter the post-validations are run, if the validation level is at
  # least 1.
  # @return [RGFA]
  # @!macro validate
  def to_rgfa(validate: 2)
    RGFA.new(validate: validate).tap do |gfa|
      each { |line| gfa << line }
      gfa.validate! if validate >= 1
    end
  end

end