gfa 0.3.1 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0e8e61ff97b34654b7a660b011826ad5549f66933a91a09658facf58c3fd56b1
4
- data.tar.gz: 8f85f07955e71cd38a9dfa28011c70433473ad6a1137ed3e6e217d63eeba20a8
3
+ metadata.gz: 9eaaaf8bcac372e7d6b63d96aee94493faa51d78f5cc2b7edb525c3fbd7efaa2
4
+ data.tar.gz: 950e9ffa9cd07cdea8ef5cc80b6c162bf009ff3bef5ab757dcda51c443925043
5
5
  SHA512:
6
- metadata.gz: 5b9f8fd92cd30d9e4e5c0263e938169141749c7be43011b574f937c9645608d8e72189bfe9072d7321e3acdf7e8288cecf42c90abe3ceef2bf001780bdb3e472
7
- data.tar.gz: ee33c0b9c0dc9adb2df96d95b792b060b248e2f991ed5a0e4b4c136d66f04b9be6cf5ca58462bce046c6cfd9e7fed6c4c6949cb5e8923eb71e3d5f53f0da2703
6
+ metadata.gz: 92e53b9d6c09b1814f3feda86b67dcea6192bd935e3c04dcca7b6ee9c0e7eed919c711941985313c2918f02772f3598883f037638e6e1b7cc9cf8897fec80caf
7
+ data.tar.gz: 4c14ce2bd91582b7b81bc6123443bd5b0cffe8c83550fb85dc48c8da030d35dafaf393506d537c5df7550b25b87ab10003a938c407bd6210983e5f8841552317
data/README.md CHANGED
@@ -7,28 +7,35 @@
7
7
 
8
8
  This implementation follows the specifications of [GFA-spec][].
9
9
 
10
+ To load the library:
11
+
12
+ ```ruby
13
+ require 'gfa'
14
+ ```
10
15
 
11
16
  ## Parsing GFA
12
17
 
13
18
  To parse a file in GFA format:
14
19
 
15
20
  ```ruby
16
- require 'gfa'
17
-
18
21
  my_gfa = GFA.load('assembly.gfa')
19
22
  ```
20
23
 
21
- To load GFA strings line-by-line:
24
+ For large GFA files, you can also parse them in parallel:
22
25
 
23
26
  ```ruby
24
- require 'gfa'
27
+ my_gfa = GFA.load_parallel('large-graph.gfa', 4)
28
+ ```
29
+
30
+ To load GFA strings line-by-line:
25
31
 
32
+ ```ruby
26
33
  my_gfa = GFA.new
27
- fh = File.open('assembly.gfa', 'r')
28
- fh.each do |ln|
29
- my_gfa << ln
34
+ File.open('assembly.gfa', 'r') do |fh|
35
+ fh.each do |ln|
36
+ my_gfa << ln
37
+ end
30
38
  end
31
- fh.close
32
39
  ```
33
40
 
34
41
 
@@ -58,7 +65,6 @@ Any `GFA` object can be exported as an [`RGL`][rgl] graph using the methods
58
65
  [tiny.gfa](https://github.com/lmrodriguezr/gfa/raw/master/data/tiny.gfa):
59
66
 
60
67
  ```ruby
61
- require 'gfa'
62
68
  require 'rgl/dot'
63
69
 
64
70
  my_gfa = GFA.load('data/tiny.gfa')
@@ -91,8 +97,6 @@ Or add the following line to your Gemfile:
91
97
  gem 'gfa'
92
98
  ```
93
99
 
94
- and run `bundle install` from your shell.
95
-
96
100
 
97
101
  # Author
98
102
 
@@ -103,6 +107,6 @@ and run `bundle install` from your shell.
103
107
 
104
108
  [Artistic License 2.0](LICENSE).
105
109
 
106
- [GFA-spec]: https://github.com/pmelsted/GFA-spec
110
+ [GFA-spec]: https://github.com/GFA-spec/GFA-spec
107
111
  [lrr]: https://rodriguez-r.com/
108
112
  [rgl]: https://github.com/monora/rgl
data/bin/gfa-add-gaf ADDED
@@ -0,0 +1,70 @@
1
#!/usr/bin/env ruby

# @package MiGA
# @license Artistic-2.0

# Adds every alignment of a GAF file to a GFA graph as a path record and,
# optionally, subsets the graph to the neighbourhood of the aligned segments.

$LOAD_PATH.push File.expand_path('../../lib', __FILE__)
$LOAD_PATH.push File.expand_path('../../lib', File.realpath(__FILE__))

require 'gfa'

# Alignments carrying an `id` tag below this value are not recorded
MIN_IDENTITY = 0.95

# <pref> sits between <degree> and <threads>, as advertised in the usage line
input_gfa, input_gaf, output, degree, pref, threads = ARGV

unless degree
  $stderr.puts <<~HELP
    gfa-add-gaf <input-gfa> <input-gaf> <output> <degree> [<pref> [<threads>]]

    <input-gfa>   Input GFA file to read
    <input-gaf>   Input GAF file to read
    <output>      Output GFA file to write
    <degree>      Maximum degree of separation between the segment set in the GAF
                  and any other included segments. If 0, only segments are
                  included. If 1, only the target segments, records linking to
                  them, and segments linked by those records. Any integer > 1
                  includes additional expansion rounds for those linked segments.
                  Use -1 to include the complete original GFA without subsetting.
    <pref>        A prefix to name all recorded paths
                  By default: Based on the GAF file name
    <threads>     If passed, parallelize process with these many threads
  HELP
  exit(1)
end

$stderr.puts "Loading GFA: #{input_gfa}"
gfa = GFA.load_parallel(input_gfa, (threads || 1).to_i)

$stderr.puts "Loading GAF: #{input_gaf}"
$stderr.puts "- Minimum identity: #{MIN_IDENTITY}"
pref ||= File.basename(input_gaf, '.gaf').gsub(/[^!-)+-<>-~]/, '_')

# Hash used as an insertion-ordered set of the segment names seen in the GAF
segments = {}
File.open(input_gaf, 'r') do |fh|
  fh.each do |ln|
    next if ln.strip.empty?

    row = ln.chomp.split("\t")
    # GAF has 12 mandatory columns; optional tags start at column 13
    raise "Malformed GAF line #{fh.lineno}: #{input_gaf}" if row.size < 12

    opt = row[12..].to_h do |tag|
      key, value = tag.split(':', 2)
      [key, GFA::Field[value]]
    end
    next if opt['id'] && opt['id'].value < MIN_IDENTITY

    # Column 6 is the path: segment names prefixed by > (forward) or < (reverse)
    seg_names = row[5].scan(/[><]?[^><]+/).map do |seg|
      orient = seg.start_with?('<') ? '-' : '+'
      name = seg.sub(/^[><]/, '')
      segments[name] = true
      "#{name}#{orient}"
    end
    gfa << GFA::Record::Path.new(
      "#{pref}_#{fh.lineno}", seg_names.join(','), opt['cg']&.value || '*'
    )
  end
end
$stderr.puts "- Found #{segments.size} linked segments"

degree = degree.to_i
if degree >= 0
  $stderr.puts 'Subsetting graph'
  gfa = gfa.subgraph(segments.keys, degree: degree)
end

$stderr.puts "Saving GFA: #{output}"
gfa.save(output)
70
+
@@ -0,0 +1,50 @@
1
#!/usr/bin/env ruby

# @package MiGA
# @license Artistic-2.0

# Splits a GFA file into one GFA file per independent module of the graph.

$LOAD_PATH.push File.expand_path('../../lib', __FILE__)
$LOAD_PATH.push File.expand_path('../../lib', File.realpath(__FILE__))

require 'gfa'

input, base, min_len, threads = ARGV

unless base
  $stderr.puts <<~HELP
    Split a GFA into multiple independent GFA files that have no links between
    them by greedily identifying all individual modules in the graph

    gfa-greedy-modules <input> <base> [<min_len> [<threads>]]

    <input>     Input GFA file to read
    <base>      Prefix of the output GFA files to write
    <min_len>   Minimum length (in bp) to report a module
                By default: 0 (all modules are reported)
    <threads>   If passed, parallelize process with these many threads
  HELP
  exit(1)
end

$stderr.puts "Loading GFA: #{input}"
gfa = GFA.load_parallel(input, (threads || 1).to_i)

$stderr.puts 'Splitting graph into modules'
modules = gfa.split_modules

min_len = min_len.to_i
if min_len > 0
  $stderr.puts 'Filtering out small modules'
  # <min_len> is a minimum, so a module of exactly that length is still reported
  modules.select! { |mod| mod.total_length >= min_len }
end

if modules.empty?
  $stderr.puts "No modules found"
else
  $stderr.puts "Saving #{modules.size} GFA files: #{base}.*"
  # Zero-pad every index to the width of the largest one so files sort in order
  width = (modules.size - 1).to_s.length
  modules.each_with_index do |mod, k|
    mod.save(format("%s.%0#{width}d.gfa", base, k))
  end
end
50
+
data/bin/gfa-subgraph ADDED
@@ -0,0 +1,41 @@
1
#!/usr/bin/env ruby

# @package MiGA
# @license Artistic-2.0

# Extracts the neighbourhood of a set of segments from a GFA file and saves it
# as a new GFA file.

$LOAD_PATH.push File.expand_path('../../lib', __FILE__)
$LOAD_PATH.push File.expand_path('../../lib', File.realpath(__FILE__))

require 'gfa'

input, output, degree, segments, threads = ARGV

unless segments
  $stderr.puts <<~HELP
    Select a set of segments and include only elements of the GFA linked to
    those segments (directly or indirectly)

    gfa-subgraph <input> <output> <degree> <segments> [<threads>]

    <input>      Input GFA file to read
    <output>     Output GFA file to write
    <degree>     Maximum degree of separation between the segment set and any
                 other included segments. If 0, only segments are included.
                 If 1, only the target segments, records linking to them, and
                 segments linked by those records. Any integer > 1 includes
                 additional expansion rounds for those linked segments.
    <segments>   Comma-delimited list of segment names
    <threads>    If passed, parallelize process with these many threads
  HELP
  exit(1)
end

$stderr.puts "Loading GFA: #{input}"
gfa = GFA.load_parallel(input, (threads || 1).to_i)

$stderr.puts 'Subsetting graph'
gfa = gfa.subgraph(segments.split(','), degree: degree.to_i)

$stderr.puts "Saving GFA: #{output}"
gfa.save(output)
41
+
data/lib/gfa/common.rb CHANGED
@@ -14,8 +14,8 @@ class GFA
14
14
  attr :gfa_version, :records, :opts
15
15
 
16
16
  GFA::Record.TYPES.each do |r_type|
17
- plural = "#{r_type.downcase}s"
18
17
  singular = "#{r_type.downcase}"
18
+ plural = "#{singular}s"
19
19
 
20
20
  define_method(plural) { records[r_type] }
21
21
  define_method(singular) { |k| records[r_type][k] }
@@ -24,7 +24,7 @@ class GFA
24
24
 
25
25
  def initialize(opts = {})
26
26
  @records = {}
27
- @opts = { index: true, comments: false }.merge(opts)
27
+ @opts = { index: true, index_id: false, comments: false }.merge(opts)
28
28
  GFA::Record.TYPES.each do |t|
29
29
  @records[t] = GFA::RecordSet.name_class(t).new(self)
30
30
  end
@@ -38,5 +38,33 @@ class GFA
38
38
  records == gfa.records
39
39
  end
40
40
 
41
- alias == eql?
41
+ def ==(gfa)
42
+ eql?(gfa)
43
+ end
44
+
45
+ def size
46
+ records.values.map(&:size).inject(0, :+)
47
+ end
48
+
49
+ def merge!(gfa)
50
+ raise "Unsupported object: #{gfa}" unless gfa.is_a? GFA
51
+
52
+ GFA::Record.TYPES.each do |t|
53
+ @records[t].merge!(gfa.records[t])
54
+ end
55
+ end
56
+
57
+ def indexed?
58
+ records.values.all?(&:indexed?)
59
+ end
60
+
61
+ def rebuild_index!
62
+ @records.each_value(&:rebuild_index!)
63
+ end
64
+
65
+ ##
66
+ # Computes the sum of all individual segment lengths
67
+ def total_length
68
+ segments.total_length
69
+ end
42
70
  end
data/lib/gfa/generator.rb CHANGED
@@ -8,9 +8,9 @@ class GFA
8
8
  end
9
9
 
10
10
  def each_line(&blk)
11
- set_version_header('1.1') if gfa_version.nil?
11
+ set_version_header('1.2') if gfa_version.nil?
12
12
  GFA::Record.TYPES.each do |r_type|
13
- records[r_type].each do |record|
13
+ records[r_type].set.each do |record|
14
14
  blk[record.to_s]
15
15
  end
16
16
  end
@@ -23,7 +23,7 @@ class GFA
23
23
  end
24
24
 
25
25
  def unset_version
26
- @records[:Header].delete_if { |o| !o.fields[:VN].nil? }
26
+ headers.set.delete_if { |o| !o.fields[:VN].nil? }
27
27
  @gfa_version = nil
28
28
  end
29
29
 
data/lib/gfa/graph.rb CHANGED
@@ -21,9 +21,145 @@ class GFA
21
21
  def adjacency_graph(opts = {})
22
22
  implicit_graph(opts).to_adjacency
23
23
  end
24
-
24
+
25
+ ##
26
+ # Extracts the subset of records associated to +segments+, which is an Array
27
+ # with values of any class in: Integer (segment index),
28
+ # String or GFA::Field::String (segment names), or GFA::Record::Segment.
29
+ #
30
+ # +degree+ indicates the maximum degree of separation between the original
31
+ # segment set and any additional segments. Use 0 to include only the segments
32
+ # in the set. Use 1 to include those, the records linking to them, and the
33
+ # additional segments linked by those records. Use any integer greater than 1
34
+ # to prompt additional rounds of greedy graph expansion.
35
+ #
36
+ # If +headers+, it includes all the original headers. Otherwise it only
37
+ # includes the version header (might be inferred).
38
+ #
39
+ # All comments are ignored even if originally parsed. Walks are currently
40
+ # ignored too. If the current GFA object doesn't have an index, it builds one
41
+ # and forces +index: true+. The output object inherits all options.
42
def subgraph(segments, degree: 1, headers: true)
  # The expansion below looks segments up by name, so make sure this object
  # is indexed before starting (the new object inherits the same options)
  unless opts[:index]
    opts[:index] = true
    rebuild_index!
  end
  gfa = GFA.new(opts)

  # Resolve every requested item (index, name, or record) to a
  # GFA::Record::Segment, failing early on anything that cannot be found
  segments =
    segments.map do |i|
      next i if i.is_a?(GFA::Record::Segment)

      segment(i) || raise("Cannot find segment: #{i}")
    end

  # Headers: either all the original ones or only the version header
  if headers
    self.headers.set.each { |record| gfa << record }
  else
    gfa << GFA::Record::Header.new("VN:Z:#{gfa_version}")
  end

  # Original segments
  segments.each { |seg| gfa << seg }

  # Expand graph. linking_records adds the newly reached segments to
  # gfa.segments in place, so the second pass must receive that same
  # GFA::RecordSet::SegmentSet (not the local Array, which has no #set) to
  # pick up the remaining edges whose ends are all inside the expanded set
  linking, edges = linking_records(gfa.segments, degree: degree)
  linking += internally_linking_records(gfa.segments, edges)
  linking.each { |record| gfa << record }

  gfa
end
73
+
74
+ ##
75
+ # Finds all the records linking to any segments in +segments+, a
76
+ # GFA::RecordSet::SegmentSet object, and expands to links with up to
77
+ # +degree+ degrees of separation
78
+ #
79
+ # It only evaluates the edges given in the +edges+ Array of GFA::Record
80
+ # values. If +edges+ is +nil+, it uses the full set of edges in the gfa.
81
+ # Edge GFA::Record objects can be of type Link, Containment, Jump, or Path
82
+ #
83
+ # If +_ignore+ is passed, it ignores this number of segments at the beginning
84
+ # of the +segments+ set (assumes they have already been evaluated). This is
85
+ # only used for internal heuristics
86
+ #
87
+ # Returns an Array with two elements:
88
+ # 0. An array of GFA::Record objects with all the identified linking records
89
+ # 1. An array of GFA::Record objects with all edges that were not identified
90
+ #
91
+ # IMPORTANT NOTE 1: The object +segments+ will be modified to include all
92
+ # linked segments. If you don't want this behaviour, please make sure to pass
93
+ # a duplicate of the object instead.
94
+ #
95
+ # IMPORTANT NOTE 2: The list of linking records may not comprehensively
96
+ # include all records linking the identified expanded segment set. To ensure
97
+ # a consistent set is identified, use:
98
+ # linking, edges = gfa.linking_records(segments)
99
+ # linking += gfa.internally_linking_records(segments, edges)
100
+ #
101
+ def linking_records(segments, degree: 1, edges: nil, _ignore: 0)
102
+ unless segments.is_a? GFA::RecordSet::SegmentSet
103
+ raise "Unrecognised class: #{segments.class}"
104
+ end
105
+
106
+ # Gather edges to evaluate
107
+ edges ||= all_edges
108
+ return [[], edges] if degree <= 0
109
+
110
+ # Links, Containments, Jumps (from, to) and Paths (segment_names)
111
+ linking = []
112
+ eval_set = _ignore == 0 ? segments.set : segments.set[_ignore..]
113
+ edges.delete_if do |edge|
114
+ if eval_set.any? { |segment| edge.include? segment }
115
+ linking << edge
116
+ true # Remove from the edge set to speed up future recursions
117
+ else
118
+ false # Keep it, possibly linking future recursions
119
+ end
120
+ end
121
+
122
+ # Recurse and return
123
+ if degree >= 1
124
+ pre = segments.size
125
+
126
+ # Add additional linked segments
127
+ linking.each do |record|
128
+ record.segments(self).each do |other_seg|
129
+ segments << other_seg unless segments[other_seg.name]
130
+ end
131
+ end
132
+
133
+ # Recurse only if new segments were discovered
134
+ if segments.size > pre
135
+ $stderr.puts "- Recursion [#{degree}]: " \
136
+ "#{pre} -> #{segments.size}\t(#{edges.size})"
137
+ linking +=
138
+ linking_records(
139
+ segments,
140
+ degree: degree - 1, edges: edges, _ignore: pre
141
+ )[0]
142
+ end
143
+ end
144
+ [linking, edges]
145
+ end
146
+
147
+ def internally_linking_records(segments, edges)
148
+ $stderr.puts '- Gathering internally linking records'
149
+ segments = Hash[segments.set.map { |i| [i.name.value, true]}]
150
+ edges.select { |record| record.segment_names_a.all? { |s| segments[s] } }
151
+ end
152
+
153
+ ##
154
+ # Returns an array of GFA::Record objects including all possible edges
155
+ # from the GFA. I.e., all links, jumps, containments, and paths.
156
def all_edges
  # flat_map builds a fresh Array, so callers may prune the result without
  # touching the underlying record sets
  %i[Link Jump Containment Path].flat_map { |t| records[t].set }
end
160
+
25
161
  private
26
-
162
+
27
163
  def segment_names_with_orient
28
164
  segments.flat_map do |s|
29
165
  %w[+ -].map { |orient| GFA::GraphVertex.idx(s, orient) }
@@ -0,0 +1,96 @@
1
+
2
class GFA
  ##
  # Find all independent modules by greedily crawling the linking entries for
  # each segment, and returns an Array of GFA objects containing each individual
  # module. If +recalculate+ is false, it trusts the current calculated
  # matrix unless none exists
  def split_modules(recalculate = true)
    recalculate_matrix if recalculate || @matrix.nil?

    index_of = matrix_segment_index
    # pending[i] is true while segment i has not been assigned to a module
    pending = Array.new(@matrix_segment_names.size, true)
    modules = []
    pending.each_index do |i|
      next unless pending[i]

      mod = matrix_find_module(i)
      pending[i] = false
      mod.segments.set.each do |seg|
        k = index_of[seg.name.value]
        pending[k] = false if k
      end
      modules << mod
    end
    modules
  end

  ##
  # Finds the entire module containing the segment with index +segment_index+
  # in the matrix (requires calling +recalculate_matrix+ first!). Returns the
  # module as a new GFA
  def matrix_find_module(segment_index)
    segs = [segment_index]              # Discovery order, also the work queue
    in_module = { segment_index => true }
    edges = []

    # Breadth-first crawl: every segment is expanded exactly once, and newly
    # discovered segments are appended to the queue
    cursor = 0
    while cursor < segs.size
      seg = segs[cursor]
      cursor += 1
      @matrix.size.times do |k|
        next if seg == k

        # Only the lower triangle is stored: row = larger index
        v = @matrix[[seg, k].max][[seg, k].min]
        next if v.empty?

        edges.concat(v)
        next if in_module[k]

        in_module[k] = true
        segs << k
      end
    end

    # Save as GFA and return
    o = GFA.new
    segs.each { |k| o << segments[k] }
    edges.uniq.each { |k| o << @matrix_edges[k] }
    o
  end

  ##
  # Calculates a matrix where all links between segments are represented by the
  # following variables:
  #
  # +@matrix_segment_names+ includes the names of all segments (as String) with
  # the order indicating the segment index in the matrix
  #
  # +@matrix+ is an Array of Arrays of Arrays, where the first index indicates
  # the row index segment, the second index indicates the column index segment,
  # and the third index indicates each of the links between those two. Note that
  # matrix only stores the lower triangle, so the column index must be strictly
  # less than the row index. For example, +@matrix[3][1]+ returns an Array of
  # all index links between the segment with index 3 and the segment with
  # index 1:
  # ```
  # [
  #   [            ], # Row 0 is always empty
  #   [[]          ], # Row 1 stores connections to column 0
  #   [[], []      ], # Row 2 stores connections to columns 0 and 1
  #   [[], [], []  ], # Row 3 stores connections to columns 0, 1, and 2
  #   ...             # &c
  # ]
  # ```
  #
  # +@matrix_edges+ is an Array of GFA::Record objects representing all edges in
  # the GFA. The order indicates the index used by the values of +@matrix+
  def recalculate_matrix
    @matrix_segment_names = segments.set.map { |seg| seg.name.value }
    @matrix = Array.new(@matrix_segment_names.size) { |i| Array.new(i) { [] } }
    @matrix_edges = all_edges
    index_of = matrix_segment_index
    @matrix_edges.each_with_index do |edge, k|
      indices = edge.segments(self).map { |seg| index_of[seg.name.value] }
      # Register the edge for every pair of distinct segments it touches
      indices.each do |a|
        indices.each do |b|
          break if a == b

          @matrix[[a, b].max][[a, b].min] << k
        end
      end
    end
  end

  private

  ##
  # Hash mapping each segment name to its (first) index in
  # +@matrix_segment_names+, used for constant-time lookups
  def matrix_segment_index
    @matrix_segment_names.each_with_index.with_object({}) do |(name, i), idx|
      idx[name] ||= i
    end
  end
end
data/lib/gfa/parser.rb CHANGED
@@ -6,16 +6,74 @@ class GFA
6
6
  MAX_VERSION = '1.2'
7
7
 
8
8
  ##
9
- # Load a GFA object from a +gfa+ file with options +opts+:
10
- # - index: If the records should be indexed as loaded (default: true)
9
+ # Load a GFA object from a gfa +file+ with options +opts+:
10
+ # - index: If the records should be indexed as loaded (default: true)
11
+ # - index_id: If the records should also be indexed by ID (default: false)
11
12
  # - comments: If the comment records should be saved (default: false)
13
+ # - line_range: Two-integer array indicating the first and last lines to read
14
+ # (default: nil, read the entire file)
12
15
  def self.load(file, opts = {})
13
16
  gfa = GFA.new(opts)
14
- fh = File.open(file, 'r')
15
- fh.each { |ln| gfa << ln }
17
+ read_records(file, opts) do |record|
18
+ gfa << record
19
+ end
16
20
  gfa
17
- ensure
18
- fh&.close
21
+ end
22
+
23
def self.read_records(file, opts = {})
  # Reads +file+ line by line and yields one parsed record per line.
  # Supported +opts+:
  # - line_range: [first, last] zero-based inclusive line numbers to parse
  #   (default: nil, parse every line)
  # - comments: if falsy, lines starting with '#' are skipped
  first, last = opts[:line_range]
  File.open(file, 'r') do |fh|
    fh.each_with_index do |ln, lno|
      if first
        next if lno < first
        # Nothing past the range is wanted, so stop reading the file here
        break if lno > last
      end
      next if !opts[:comments] && ln[0] == '#'

      yield(GFA::Record[ln])
    end
  end
end
36
+
37
+ ##
38
+ # Load a GFA object from a gfa +file+ in parallel using +thr+ threads,
39
+ # and the same +opts+ supported by +load+. Defaults to the +load+ method
40
+ # instead if +thr <= 1+.
41
def self.load_parallel(file, thr, opts = {})
  return self.load(file, opts) if thr <= 1

  # Count lines to split the file into one contiguous block per child
  lno = 0
  File.open(file, 'r') { |fh| fh.each { lno += 1 } }
  thr = lno if thr > lno
  # With zero or one line there is nothing to distribute; bailing out here
  # also avoids dividing by zero workers on an empty file
  return self.load(file, opts) if thr <= 1

  blk = (lno.to_f / thr).ceil

  # Launch children processes, each one parsing its own line range and
  # sending the parsed records back through a dedicated pipe
  io = []
  pid = []
  thr.times do |i|
    io[i] = IO.pipe
    pid << fork do
      io[i][0].close
      o = opts.merge(line_range: [i * blk, (i + 1) * blk - 1])
      records = []
      read_records(file, o) { |record| records << record }
      Marshal.dump(records, io[i][1])
      exit!(0)
    end
    io[i][1].close
  end

  # Collect and merge results in block order, so records keep the file order
  gfa = GFA.new(opts)
  io.each_with_index do |pipe, k|
    begin
      # Drain the pipe before waiting, otherwise a child with a large
      # payload would block on a full pipe forever
      result = pipe[0].read
      _, status = Process.wait2(pid[k])
      raise "Child process failed: #{k}" if result.empty? || !status.success?

      # The payload was produced by our own child, not by external input
      Marshal.load(result).each { |record| gfa << record }
    ensure
      pipe[0].close
    end
  end

  gfa
end
20
78
 
21
79
  def self.supported_version?(v)
@@ -24,7 +82,7 @@ class GFA
24
82
 
25
83
  # Instance-level
26
84
  def <<(obj)
27
- obj = parse_line(obj) unless obj.is_a? GFA::Record
85
+ obj = GFA::Record[obj] unless obj.is_a? GFA::Record
28
86
  return if obj.nil? || obj.empty?
29
87
  @records[obj.type] << obj
30
88
 
@@ -41,14 +99,4 @@ class GFA
41
99
 
42
100
  @gfa_version = v
43
101
  end
44
-
45
- private
46
-
47
- def parse_line(string)
48
- string = string.chomp
49
- return nil if string =~ /^\s*$/
50
- return nil if !opts[:comments] && string[0] == '#'
51
-
52
- GFA::Record[string]
53
- end
54
102
  end
@@ -1,3 +1,5 @@
1
+ require 'gfa/record/has_from_to'
2
+
1
3
  class GFA::Record::Containment < GFA::Record
2
4
  CODE = :C
3
5
  REQ_FIELDS = %i[from from_orient to to_orient pos overlap]
@@ -12,6 +14,8 @@ class GFA::Record::Containment < GFA::Record
12
14
  end
13
15
  OPT_FIELDS.each_key { |i| define_method(i) { fields[i] } }
14
16
 
17
+ include GFA::Record::HasFromTo
18
+
15
19
  alias container from
16
20
  alias container_orient from_orient
17
21
  alias contained to
@@ -0,0 +1,47 @@
1
module GFA::Record::HasFromTo
  ##
  # Is +segment+ (optionally restricted to orientation +orient+) the origin
  # of this record?
  def from?(segment, orient = nil)
    links_from_to?(segment, orient, true)
  end

  ##
  # Is +segment+ (optionally restricted to orientation +orient+) the
  # destination of this record?
  def to?(segment, orient = nil)
    links_from_to?(segment, orient, false)
  end

  ##
  # Extracts all linked segments from +gfa+ (which *must* be indexed)
  def segments(gfa)
    raise "Unindexed GFA" unless gfa.indexed?

    segment_names_a.map { |name| gfa.segments[name] }
  end

  ##
  # Include a GFA::Record::Segment +segment+?
  def include?(segment)
    [from, to].any? { |endpoint| segment.name == endpoint }
  end

  ##
  # Array of strings with the names of the segments linked by the
  # record
  def segment_names_a
    [from, to].map(&:value)
  end

  private

  # Compares +segment+ (and +orient+, unless nil) against the origin fields
  # of the record when +from+ is true, or the destination fields otherwise
  def links_from_to?(segment, orient, from)
    name = segment_name(segment)
    wanted = orient.is_a?(GFA::Field) ? orient.value : orient
    # Fields 2 and 3 hold the origin and its orientation; 4 and 5 hold the
    # destination and its orientation
    offset = from ? 2 : 4
    return false unless name == fields[offset].value

    wanted.nil? || wanted == fields[offset + 1].value
  end

  # Reduces a segment record or a field to its plain value; anything else is
  # returned untouched
  def segment_name(segment)
    case segment
    when GFA::Record::Segment then segment.name.value
    when GFA::Field then segment.value
    else segment
    end
  end
end
@@ -1,3 +1,5 @@
1
+ require 'gfa/record/has_from_to'
2
+
1
3
  class GFA::Record::Jump < GFA::Record
2
4
  CODE = :J
3
5
  REQ_FIELDS = %i[from from_orient to to_orient distance]
@@ -10,6 +12,8 @@ class GFA::Record::Jump < GFA::Record
10
12
  end
11
13
  OPT_FIELDS.each_key { |i| define_method(i) { fields[i] } }
12
14
 
15
+ include GFA::Record::HasFromTo
16
+
13
17
  def initialize(from, from_orient, to, to_orient, distance, *opt_fields)
14
18
  @fields = {}
15
19
  add_field(2, :Z, from, /[!-)+-<>-~][!-~]*/)
@@ -19,27 +23,4 @@ class GFA::Record::Jump < GFA::Record
19
23
  add_field(6, :Z, distance, /\*|[-+]?[0-9]+/)
20
24
  opt_fields.each { |f| add_opt_field(f, OPT_FIELDS) }
21
25
  end
22
-
23
- def from?(segment, orient = nil)
24
- links_from_to?(segment, orient, true)
25
- end
26
-
27
- def to?(segment, orient = nil)
28
- links_from_to?(segment, orient, false)
29
- end
30
-
31
- private
32
-
33
- def links_from_to?(segment, orient, from)
34
- segment = segment_name(segment)
35
- orient = orient.value if orient.is_a? GFA::Field
36
- base_k = from ? 2 : 4
37
- segment==fields[base_k].value &&
38
- (orient.nil? || orient==fields[base_k + 1].value)
39
- end
40
-
41
- def segment_name(segment)
42
- segment.is_a?(GFA::Record::Segment) ? segment.name.value :
43
- segment.is_a?(GFA::Field) ? segment.value : segment
44
- end
45
26
  end
@@ -1,3 +1,5 @@
1
+ require 'gfa/record/has_from_to'
2
+
1
3
  class GFA::Record::Link < GFA::Record
2
4
  CODE = :L
3
5
  REQ_FIELDS = %i[from from_orient to to_orient overlap]
@@ -15,6 +17,8 @@ class GFA::Record::Link < GFA::Record
15
17
  end
16
18
  OPT_FIELDS.each_key { |i| define_method(i) { fields[i] } }
17
19
 
20
+ include GFA::Record::HasFromTo
21
+
18
22
  def initialize(from, from_orient, to, to_orient, overlap, *opt_fields)
19
23
  @fields = {}
20
24
  add_field(2, :Z, from, /[!-)+-<>-~][!-~]*/)
@@ -24,27 +28,4 @@ class GFA::Record::Link < GFA::Record
24
28
  add_field(6, :Z, overlap, /\*|([0-9]+[MIDNSHPX=])+/)
25
29
  opt_fields.each { |f| add_opt_field(f, OPT_FIELDS) }
26
30
  end
27
-
28
- def from?(segment, orient = nil)
29
- links_from_to?(segment, orient, true)
30
- end
31
-
32
- def to?(segment, orient = nil)
33
- links_from_to?(segment, orient, false)
34
- end
35
-
36
- private
37
-
38
- def links_from_to?(segment, orient, from)
39
- segment = segment_name(segment)
40
- orient = orient.value if orient.is_a? GFA::Field
41
- base_k = from ? 2 : 4
42
- segment==fields[base_k].value &&
43
- (orient.nil? || orient==fields[base_k + 1].value)
44
- end
45
-
46
- def segment_name(segment)
47
- segment.is_a?(GFA::Record::Segment) ? segment.name.value :
48
- segment.is_a?(GFA::Field) ? segment.value : segment
49
- end
50
31
  end
@@ -1,19 +1,45 @@
1
1
  class GFA::Record::Path < GFA::Record
2
2
  CODE = :P
3
- REQ_FIELDS = %i[path_name segment_name overlaps]
3
+ REQ_FIELDS = %i[path_name segment_names overlaps]
4
4
  OPT_FIELDS = {}
5
5
 
6
6
  REQ_FIELDS.each_index do |i|
7
7
  define_method(REQ_FIELDS[i]) { fields[i + 2] }
8
8
  end
9
9
 
10
+ alias segment_name segment_names
10
11
  alias cigar overlaps
11
12
 
12
- def initialize(path_name, segment_name, overlaps, *opt_fields)
13
+ def initialize(path_name, segment_names, overlaps, *opt_fields)
13
14
  @fields = {}
14
- add_field(2, :Z, path_name, /[!-)+-<>-~][!-~]*/)
15
- add_field(3, :Z, segment_name, /[!-)+-<>-~][!-~]*/)
16
- add_field(4, :Z, overlaps, /\*|([0-9]+[MIDNSHPX=]|[-+]?[0-9]+J|.)+/)
15
+ add_field(2, :Z, path_name, /[!-)+-<>-~][!-~]*/)
16
+ add_field(3, :Z, segment_names, /[!-)+-<>-~][!-~]*/)
17
+ add_field(4, :Z, overlaps, /\*|([0-9]+[MIDNSHPX=]|[-+]?[0-9]+J|.)+/)
17
18
  opt_fields.each { |f| add_opt_field(f, OPT_FIELDS) }
18
19
  end
20
+
21
+ ##
22
+ # Array of segment names (without orientations) as strings
23
+ def segment_names_a
24
+ segment_names.value.split(/[,;]/).map { |i| i.gsub(/[+-]$/, '') }
25
+ end
26
+
27
+ ##
28
+ # Extracts all linked segments from +gfa+ (which *must* be indexed)
29
+ def segments(gfa)
30
+ raise "Unindexed GFA" unless gfa.indexed?
31
+ segment_names_a.map do |name|
32
+ gfa.segments[name]
33
+ end
34
+ end
35
+
36
+ ##
37
+ # Includes a GFA::Record::Segment +segment+?
38
+ def include?(segment)
39
+ # unless segment.is_a? GFA::Record::Segment
40
+ # raise "Unrecognized class: #{segment.class}"
41
+ # end
42
+
43
+ segment_names_a.any? { |name| segment.name == name }
44
+ end
19
45
  end
@@ -8,8 +8,11 @@ class GFA::Record::Segment < GFA::Record
8
8
  KC: :i, # k-mer count
9
9
  SH: :H, # SHA-256 checksum of the sequence
10
10
  UR: :Z, # URI or local file-system path of the sequence
11
- # Non-cannonical
12
- DP: :f # (From SAM)
11
+ # Non-canonical but uppercase (thus, reserved)
12
+ DP: :f, # SAM
13
+ SN: :Z, # rGFA: Name of stable sequence from which the segment is derived
14
+ SO: :i, # rGFA: Offset on the stable sequence
15
+ SR: :i # rGFA: Rank. 0 if on a linear reference genome; >0 otherwise
13
16
  }
14
17
 
15
18
  REQ_FIELDS.each_index do |i|
@@ -23,4 +26,10 @@ class GFA::Record::Segment < GFA::Record
23
26
  add_field(3, :Z, sequence, /\*|[A-Za-z=.]+/)
24
27
  opt_fields.each { |f| add_opt_field(f, OPT_FIELDS) }
25
28
  end
29
+
30
+ ##
31
+ # Returns the length of the sequence represented in this segment
32
+ def length
33
+ sequence.value.length
34
+ end
26
35
  end
data/lib/gfa/record.rb CHANGED
@@ -30,6 +30,8 @@ class GFA::Record
30
30
  end
31
31
 
32
32
  def self.[](string)
33
+ return nil if string.nil? || string =~ /^\s*$/
34
+
33
35
  split = string[0] == '#' ? ['', 2] : ["\t", 0]
34
36
  code, *values = string.chomp.split(*split)
35
37
  code_class(code).new(*values)
@@ -67,6 +69,10 @@ class GFA::Record
67
69
  o.join("\t")
68
70
  end
69
71
 
72
+ def dup
73
+ self.class[to_s]
74
+ end
75
+
70
76
  def hash
71
77
  { code => fields }.hash
72
78
  end
@@ -1,4 +1,10 @@
1
1
  class GFA::RecordSet::SegmentSet < GFA::RecordSet
2
2
  CODE = :S
3
3
  INDEX_FIELD = 2 # Name: Segment name
4
+
5
+ ##
6
+ # Computes the sum of all individual segment lengths
7
+ def total_length
8
+ set.map(&:length).reduce(0, :+)
9
+ end
4
10
  end
@@ -23,12 +23,12 @@ class GFA::RecordSet
23
23
 
24
24
  # Instance-level
25
25
 
26
- attr_reader :set, :gfa
26
+ attr_reader :set, :index, :gfa
27
27
 
28
- def initialize(gfa)
28
+ def initialize(gfa = nil)
29
29
  @set = []
30
30
  @index = {}
31
- @gfa = gfa
31
+ @gfa = gfa || GFA.new
32
32
  end
33
33
 
34
34
  def [](k)
@@ -69,25 +69,37 @@ class GFA::RecordSet
69
69
  raise "Wrong type of record: #{v.type}" if v.type != type
70
70
 
71
71
  @set << v
72
- index(v)
72
+ index!(v)
73
+ end
74
+
75
+ def indexed?
76
+ (empty? || !index_field) ? gfa.opts[:index] : !index.empty?
77
+ end
78
+
79
+ def rebuild_index!
80
+ @index = {}
81
+ set.each { |v| index!(v) }
73
82
  end
74
83
 
75
84
  def index_id(v)
76
85
  v[index_field]&.value
77
86
  end
78
87
 
79
- def index(v)
88
+ def index!(v)
80
89
  save_index(index_id(v), v) if index_field
81
90
 
82
91
  # Whenever present, index also by ID
83
- save_index(v[:ID].value, v) if v[:ID] && v[:ID].value =~ index_id(v)
92
+ if gfa.opts[:index_id] && v[:ID] && v[:ID].value =~ index_id(v)
93
+ save_index(v[:ID].value, v)
94
+ end
84
95
  end
85
96
 
86
97
  def save_index(k, v)
87
98
  return unless gfa.opts[:index] && k
88
99
 
89
100
  if @index[k]
90
- warn "#{type} already registered with field #{index_field}: #{k}"
101
+ f = index_field.is_a?(Integer) ? '' : "#{index_field}: "
102
+ raise "#{type} already registered: #{f}#{k}"
91
103
  end
92
104
  @index[k] = v
93
105
  end
@@ -96,4 +108,14 @@ class GFA::RecordSet
96
108
  k = k.value if k.is_a? GFA::Field
97
109
  @index[k]
98
110
  end
111
+
112
+ def merge!(record_set)
113
+ raise "Not a record set" unless record_set.is_a?(GFA::RecordSet)
114
+ if record_set.type != type
115
+ raise "Wrong type of record set: #{record_set.type}"
116
+ end
117
+
118
+ record_set.set.each { |i| @set << i }
119
+ record_set.index.each { |k, v| save_index(k, v) }
120
+ end
99
121
  end
data/lib/gfa/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  class GFA
2
- VERSION = '0.3.1'
2
+ VERSION = '0.5.0'
3
3
  VERSION_ARRAY = VERSION.split(/\./).map { |x| x.to_i } # :nodoc:
4
4
  VERSION_MAJOR = VERSION_ARRAY[0] # :nodoc:
5
5
  VERSION_MINOR = VERSION_ARRAY[1] # :nodoc:
data/lib/gfa.rb CHANGED
@@ -2,3 +2,4 @@ require 'gfa/common'
2
2
  require 'gfa/parser'
3
3
  require 'gfa/generator'
4
4
  require 'gfa/graph'
5
+ require 'gfa/modules'
data/test/parser_test.rb CHANGED
@@ -49,8 +49,10 @@ class ParserTest < Test::Unit::TestCase
49
49
  assert(sample.path('first').is_a?(GFA::Record))
50
50
  assert(sample.paths['first'].is_a?(GFA::Record))
51
51
  assert_equal('first', sample.path('first')[2]&.value)
52
+ assert(sample.indexed?)
52
53
  sample = GFA.load(path, index: false)
53
54
  assert_nil(sample.path('first'))
55
+ assert(!sample.indexed?)
54
56
  end
55
57
 
56
58
  def test_version_suppport
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gfa
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-02-23 00:00:00.000000000 Z
11
+ date: 2023-03-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rgl
@@ -63,6 +63,9 @@ files:
63
63
  - LICENSE
64
64
  - README.md
65
65
  - Rakefile
66
+ - bin/gfa-add-gaf
67
+ - bin/gfa-greedy-modules
68
+ - bin/gfa-subgraph
66
69
  - lib/gfa.rb
67
70
  - lib/gfa/common.rb
68
71
  - lib/gfa/field.rb
@@ -75,10 +78,12 @@ files:
75
78
  - lib/gfa/field/string.rb
76
79
  - lib/gfa/generator.rb
77
80
  - lib/gfa/graph.rb
81
+ - lib/gfa/modules.rb
78
82
  - lib/gfa/parser.rb
79
83
  - lib/gfa/record.rb
80
84
  - lib/gfa/record/comment.rb
81
85
  - lib/gfa/record/containment.rb
86
+ - lib/gfa/record/has_from_to.rb
82
87
  - lib/gfa/record/header.rb
83
88
  - lib/gfa/record/jump.rb
84
89
  - lib/gfa/record/link.rb