gfa 0.3.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0e8e61ff97b34654b7a660b011826ad5549f66933a91a09658facf58c3fd56b1
4
- data.tar.gz: 8f85f07955e71cd38a9dfa28011c70433473ad6a1137ed3e6e217d63eeba20a8
3
+ metadata.gz: 97e6400338884b4ceb1161778c26c5d6de6ee71616be1b5caae6aa0691d88395
4
+ data.tar.gz: 2fe8103598246724d98e3ceeecce92b5564f45bf029bf034978277cde59b4caa
5
5
  SHA512:
6
- metadata.gz: 5b9f8fd92cd30d9e4e5c0263e938169141749c7be43011b574f937c9645608d8e72189bfe9072d7321e3acdf7e8288cecf42c90abe3ceef2bf001780bdb3e472
7
- data.tar.gz: ee33c0b9c0dc9adb2df96d95b792b060b248e2f991ed5a0e4b4c136d66f04b9be6cf5ca58462bce046c6cfd9e7fed6c4c6949cb5e8923eb71e3d5f53f0da2703
6
+ metadata.gz: 3beac70ac4c3d4e46bd01399351fbc5e5ffcdaac5bd2a653b7f17c8f29df5c13e48a11575c42fa0ec78cba62485528148687614e9373ac6cc5dd773f97cd67a6
7
+ data.tar.gz: c488a4b26604ffc95228d5aa454399b4da8facecb9a90c2579be3c36446637bc60b89578c5a5443047ac020c4c9fb9b980fd3943fdb56e7993ebfcc7c4e60876
data/README.md CHANGED
@@ -7,28 +7,35 @@
7
7
 
8
8
  This implementation follows the specifications of [GFA-spec][].
9
9
 
10
+ To load the library:
11
+
12
+ ```ruby
13
+ require 'gfa'
14
+ ```
10
15
 
11
16
  ## Parsing GFA
12
17
 
13
18
  To parse a file in GFA format:
14
19
 
15
20
  ```ruby
16
- require 'gfa'
17
-
18
21
  my_gfa = GFA.load('assembly.gfa')
19
22
  ```
20
23
 
21
- To load GFA strings line-by-line:
24
+ For large GFA files, you can also parse them in parallel:
22
25
 
23
26
  ```ruby
24
- require 'gfa'
27
+ my_gfa = GFA.load_parallel('large-graph.gfa', 4)
28
+ ```
29
+
30
+ To load GFA strings line-by-line:
25
31
 
32
+ ```ruby
26
33
  my_gfa = GFA.new
27
- fh = File.open('assembly.gfa', 'r')
28
- fh.each do |ln|
29
- my_gfa << ln
34
+ File.open('assembly.gfa', 'r') do |fh|
35
+ fh.each do |ln|
36
+ my_gfa << ln
37
+ end
30
38
  end
31
- fh.close
32
39
  ```
33
40
 
34
41
 
@@ -58,7 +65,6 @@ Any `GFA` object can be exported as an [`RGL`][rgl] graph using the methods
58
65
  [tiny.gfa](https://github.com/lmrodriguezr/gfa/raw/master/data/tiny.gfa):
59
66
 
60
67
  ```ruby
61
- require 'gfa'
62
68
  require 'rgl/dot'
63
69
 
64
70
  my_gfa = GFA.load('data/tiny.gfa')
@@ -91,8 +97,6 @@ Or add the following line to your Gemfile:
91
97
  gem 'gfa'
92
98
  ```
93
99
 
94
- and run `bundle install` from your shell.
95
-
96
100
 
97
101
  # Author
98
102
 
@@ -103,6 +107,6 @@ and run `bundle install` from your shell.
103
107
 
104
108
  [Artistic License 2.0](LICENSE).
105
109
 
106
- [GFA-spec]: https://github.com/pmelsted/GFA-spec
110
+ [GFA-spec]: https://github.com/GFA-spec/GFA-spec
107
111
  [lrr]: https://rodriguez-r.com/
108
112
  [rgl]: https://github.com/monora/rgl
data/bin/gfa-add-gaf ADDED
@@ -0,0 +1,70 @@
1
#!/usr/bin/env ruby

# @package MiGA
# @license Artistic-2.0

# Registers every alignment of a GAF file as a path record in a GFA graph
# and, optionally, subsets the graph around the aligned segments.

$LOAD_PATH.push File.expand_path('../../lib', __FILE__)
$LOAD_PATH.push File.expand_path('../../lib', File.realpath(__FILE__))

require 'gfa'

# Alignments carrying an `id` tag below this value are skipped
MIN_IDENTITY = 0.95

# Number of mandatory columns in a GAF row; optional tags come after them
GAF_REQ_COLUMNS = 12

# <pref> is the 5th and <threads> the 6th positional argument, as documented
# in the usage message below
input_gfa, input_gaf, output, degree, pref, threads = ARGV

unless degree
  $stderr.puts <<~HELP
    gfa-add-gaf <input-gfa> <input-gaf> <output> <degree> [<pref> [<threads>]]

    <input-gfa>  Input GFA file to read
    <input-gaf>  Input GAF file to read
    <output>     Output GFA file to write
    <degree>     Maximum degree of separation between the segment set in the GAF
                 and any other included segments. If 0, only segments are
                 included. If 1, only the target segments, records linking to
                 them, and segments linked by those records. Any integer > 1
                 includes additional expansion rounds for those linked segments.
                 Use -1 to include the complete original GFA without subsetting.
    <pref>       A prefix to name all recorded paths
                 By default: Based on the GAF file name
    <threads>    If passed, parallelize process with these many threads
  HELP
  exit(1)
end

$stderr.puts "Loading GFA: #{input_gfa}"
gfa = GFA.load_parallel(input_gfa, (threads || 1).to_i)

$stderr.puts "Loading GAF: #{input_gaf}"
$stderr.puts "- Minimum identity: #{MIN_IDENTITY}"
pref ||= File.basename(input_gaf, '.gaf').gsub(/[^!-)+-<>-~]/, '_')
segments = [] # Segment names in order of first appearance
seen = {}     # Constant-time membership test for +segments+
File.open(input_gaf, 'r') do |fh|
  fh.each do |ln|
    row = ln.chomp.split("\t")
    next if row.empty? # Blank line

    if row.size < GAF_REQ_COLUMNS
      raise "Malformed GAF line #{fh.lineno}: expected at least " \
            "#{GAF_REQ_COLUMNS} columns, found #{row.size}"
    end

    # Optional tags (TAG:TYPE:VALUE) follow the mandatory columns
    opt = row[GAF_REQ_COLUMNS..].map { |i| i.split(':', 2) }.to_h
    opt = opt.transform_values { |v| GFA::Field[v] }
    next if opt['id'] && opt['id'].value < MIN_IDENTITY

    # Column 6 is the path: a chain of segments prefixed by > (forward) or
    # < (reverse), translated here to the GFA +/- orientation suffix
    seg_names =
      row[5].scan(/[><]?[^><]+/).map do |seg|
        seg_orient = seg.start_with?('<') ? '-' : '+'
        seg_name = seg.sub(/^[><]/, '')
        unless seen[seg_name]
          seen[seg_name] = true
          segments << seg_name
        end
        "#{seg_name}#{seg_orient}"
      end
    gfa << GFA::Record::Path.new(
      "#{pref}_#{fh.lineno}", seg_names.join(','), opt['cg']&.value || '*'
    )
  end
end
$stderr.puts "- Found #{segments.size} linked segments"

# A negative degree keeps the complete graph
degree = degree.to_i
if degree >= 0
  $stderr.puts 'Subsetting graph'
  gfa = gfa.subgraph(segments, degree: degree)
end

$stderr.puts "Saving GFA: #{output}"
gfa.save(output)
70
+
data/bin/gfa-subgraph ADDED
@@ -0,0 +1,41 @@
1
#!/usr/bin/env ruby

# @package MiGA
# @license Artistic-2.0

# Extracts from a GFA file the subgraph surrounding a given set of segments.

$LOAD_PATH.push File.expand_path('../../lib', __FILE__)
$LOAD_PATH.push File.expand_path('../../lib', File.realpath(__FILE__))

require 'gfa'

input, output, degree, segments, threads = ARGV

# The first four arguments are mandatory; print usage if any is missing
unless segments
  $stderr.puts <<~HELP
    Select a set of segments and include only elements of the GFA linked to
    those segments (directly or indirectly)

    gfa-subgraph <input> <output> <degree> <segments> [<threads>]

    <input>     Input GFA file to read
    <output>    Output GFA file to write
    <degree>    Maximum degree of separation between the segment set and any
                other included segments. If 0, only segments are included.
                If 1, only the target segments, records linking to them, and
                segments linked by those records. Any integer > 1 includes
                additional expansion rounds for those linked segments.
    <segments>  Comma-delimited list of segment names
    <threads>   If passed, parallelize process with these many threads
  HELP
  exit(1)
end

$stderr.puts "Loading GFA: #{input}"
gfa = GFA.load_parallel(input, (threads || 1).to_i)

$stderr.puts 'Subsetting graph'
gfa = gfa.subgraph(segments.split(','), degree: degree.to_i)

$stderr.puts "Saving GFA: #{output}"
gfa.save(output)
41
+
data/lib/gfa/common.rb CHANGED
@@ -14,8 +14,8 @@ class GFA
14
14
  attr :gfa_version, :records, :opts
15
15
 
16
16
  GFA::Record.TYPES.each do |r_type|
17
- plural = "#{r_type.downcase}s"
18
17
  singular = "#{r_type.downcase}"
18
+ plural = "#{singular}s"
19
19
 
20
20
  define_method(plural) { records[r_type] }
21
21
  define_method(singular) { |k| records[r_type][k] }
@@ -24,7 +24,7 @@ class GFA
24
24
 
25
25
  def initialize(opts = {})
26
26
  @records = {}
27
- @opts = { index: true, comments: false }.merge(opts)
27
+ @opts = { index: true, index_id: false, comments: false }.merge(opts)
28
28
  GFA::Record.TYPES.each do |t|
29
29
  @records[t] = GFA::RecordSet.name_class(t).new(self)
30
30
  end
@@ -38,5 +38,27 @@ class GFA
38
38
  records == gfa.records
39
39
  end
40
40
 
41
- alias == eql?
41
##
# Equality operator: delegates to #eql? so both comparisons always agree
def ==(gfa)
  eql? gfa
end
44
+
45
##
# Total number of records, added up across all record sets
def size
  records.each_value.sum(&:size)
end
48
+
49
##
# Merges in place the records of another GFA object +gfa+, one record
# type at a time. Raises a RuntimeError if +gfa+ is not a GFA object
def merge!(gfa)
  raise "Unsupported object: #{gfa}" unless gfa.is_a? GFA

  GFA::Record.TYPES.each { |t| @records[t].merge!(gfa.records[t]) }
end
56
+
57
##
# Is every record set of this GFA indexed?
def indexed?
  records.each_value.all?(&:indexed?)
end
60
+
61
##
# Asks every record set to rebuild its index
def rebuild_index!
  @records.each_value { |record_set| record_set.rebuild_index! }
end
42
64
  end
data/lib/gfa/generator.rb CHANGED
@@ -8,9 +8,9 @@ class GFA
8
8
  end
9
9
 
10
10
  def each_line(&blk)
11
- set_version_header('1.1') if gfa_version.nil?
11
+ set_version_header('1.2') if gfa_version.nil?
12
12
  GFA::Record.TYPES.each do |r_type|
13
- records[r_type].each do |record|
13
+ records[r_type].set.each do |record|
14
14
  blk[record.to_s]
15
15
  end
16
16
  end
@@ -23,7 +23,7 @@ class GFA
23
23
  end
24
24
 
25
25
  def unset_version
26
- @records[:Header].delete_if { |o| !o.fields[:VN].nil? }
26
+ headers.set.delete_if { |o| !o.fields[:VN].nil? }
27
27
  @gfa_version = nil
28
28
  end
29
29
 
data/lib/gfa/graph.rb CHANGED
@@ -21,9 +21,145 @@ class GFA
21
21
  def adjacency_graph(opts = {})
22
22
  implicit_graph(opts).to_adjacency
23
23
  end
24
-
24
+
25
##
# Extracts the subset of records associated to +segments+, which is an Array
# with values of any class in: Integer (segment index),
# String or GFA::Field::String (segment names), or GFA::Record::Segment.
#
# +degree+ indicates the maximum degree of separation between the original
# segment set and any additional segments. Use 0 to include only the segments
# in the set. Use 1 to include those, the records linking to them, and the
# additional segments linked by those records. Use any integer greater than 1
# to prompt additional rounds of greedy graph expansion.
#
# If +headers+, it includes all the original headers. Otherwise it only
# includes the version header (might be inferred).
#
# All comments are ignored even if originally parsed. Walks are currently
# ignored too. If the current GFA object doesn't have an index, it builds one
# and forces +index: true+. The output object inherits all options.
#
# Returns a new GFA object. Raises a RuntimeError if any entry of +segments+
# cannot be resolved to a segment of this GFA.
def subgraph(segments, degree: 1, headers: true)
  # Prepare objects
  unless opts[:index]
    opts[:index] = true
    rebuild_index!
  end
  gfa = GFA.new(opts)
  segments =
    segments.map do |i|
      next i if i.is_a?(GFA::Record::Segment)

      segment(i) || raise("Cannot find segment: #{i}")
    end

  # Headers (+self+ is required: the keyword argument shadows the method)
  if headers
    self.headers.set.each { |record| gfa << record }
  else
    gfa << GFA::Record::Header.new("VN:Z:#{gfa_version}")
  end

  # Original segments
  segments.each { |segment| gfa << segment }

  # Expand graph. +linking_records+ grows +gfa.segments+ in place, so the
  # second pass must receive that same segment set (not the input Array) to
  # pick up the remaining edges whose ends are all inside the expanded set
  linking, edges = linking_records(gfa.segments, degree: degree)
  linking += internally_linking_records(gfa.segments, edges)
  linking.each { |record| gfa << record }

  # Return
  gfa
end
73
+
74
##
# Collects the records that link to any segment in +segments+ (a
# GFA::RecordSet::SegmentSet), expanding the search through newly reached
# segments for up to +degree+ degrees of separation.
#
# Only the records in the +edges+ Array are evaluated; when +edges+ is +nil+
# the full edge set of the GFA is used (see #all_edges). Edge records can be
# of type Link, Containment, Jump, or Path.
#
# +_ignore+ is the number of leading entries of +segments+ to skip because
# they were already evaluated. It is only meant for the internal recursion.
#
# Returns an Array with two elements:
# 0. An Array of GFA::Record objects with all the identified linking records
# 1. An Array of GFA::Record objects with all edges that were not identified
#
# IMPORTANT NOTE 1: Both +segments+ and +edges+ are modified in place:
# linked segments are appended to +segments+ and identified records are
# removed from +edges+. Pass duplicates if this is not desired.
#
# IMPORTANT NOTE 2: The list of linking records may not comprehensively
# include all records linking the identified expanded segment set. To ensure
# a consistent set is identified, use:
#   linking, edges = gfa.linking_records(segments)
#   linking += gfa.internally_linking_records(segments, edges)
#
# Raises a RuntimeError if +segments+ is not a GFA::RecordSet::SegmentSet.
def linking_records(segments, degree: 1, edges: nil, _ignore: 0)
  unless segments.is_a? GFA::RecordSet::SegmentSet
    raise "Unrecognised class: #{segments.class}"
  end

  # Gather edges to evaluate
  edges ||= all_edges
  return [[], edges] if degree <= 0

  # Move every edge touching a not-yet-evaluated segment from +edges+ into
  # +linking+, which also shrinks the search space of later rounds
  pending = _ignore == 0 ? segments.set : segments.set[_ignore..]
  linking = []
  edges.reject! do |record|
    hit = pending.any? { |segment| record.include? segment }
    linking << record if hit
    hit
  end
  return [linking, edges] unless degree >= 1

  # Register the segments at the other end of the linking records
  known = segments.size
  linking.each do |record|
    record.segments(self).each do |other_seg|
      segments << other_seg unless segments[other_seg.name]
    end
  end

  # Another round is only useful if new segments were discovered
  if segments.size > known
    $stderr.puts "- Recursion [#{degree}]: " \
                 "#{known} -> #{segments.size}\t(#{edges.size})"
    deeper, = linking_records(
      segments, degree: degree - 1, edges: edges, _ignore: known
    )
    linking += deeper
  end
  [linking, edges]
end
146
+
147
##
# Selects from +edges+ the records whose linked segment names are all
# present in +segments+ (an object exposing the segment records as +set+)
def internally_linking_records(segments, edges)
  $stderr.puts '- Gathering internally linking records'
  # Hash of known names for constant-time lookups
  known = segments.set.each_with_object({}) do |seg, names|
    names[seg.name.value] = true
  end
  edges.select do |record|
    record.segment_names_a.all? { |name| known[name] }
  end
end
152
+
153
##
# Returns a new Array of GFA::Record objects including all possible edges
# from the GFA. I.e., all links, jumps, containments, and paths (in that
# order).
def all_edges
  %i[Link Jump Containment Path].flat_map { |t| records[t].set }
end
160
+
25
161
  private
26
-
162
+
27
163
  def segment_names_with_orient
28
164
  segments.flat_map do |s|
29
165
  %w[+ -].map { |orient| GFA::GraphVertex.idx(s, orient) }
data/lib/gfa/parser.rb CHANGED
@@ -6,16 +6,74 @@ class GFA
6
6
  MAX_VERSION = '1.2'
7
7
 
8
8
  ##
9
- # Load a GFA object from a +gfa+ file with options +opts+:
10
- # - index: If the records should be indexed as loaded (default: true)
9
+ # Load a GFA object from a gfa +file+ with options +opts+:
10
+ # - index: If the records should be indexed as loaded (default: true)
11
+ # - index_id: If the records should also be indexed by ID (default: false)
11
12
  # - comments: If the comment records should be saved (default: false)
13
+ # - line_range: Two-integer array indicating the first and last lines to read
14
+ # (default: nil, read the entire file)
12
15
  def self.load(file, opts = {})
13
16
  gfa = GFA.new(opts)
14
- fh = File.open(file, 'r')
15
- fh.each { |ln| gfa << ln }
17
+ read_records(file, opts) do |record|
18
+ gfa << record
19
+ end
16
20
  gfa
17
- ensure
18
- fh&.close
21
+ end
22
+
23
##
# Reads +file+ line by line and yields the result of GFA::Record[] for each
# line read. Supported +opts+:
# - comments: Unless true, lines starting with '#' are skipped
# - line_range: Two-integer Array with the first and last (0-based,
#   inclusive) line numbers to read. By default the entire file is read
#
# The return value is unspecified.
def self.read_records(file, opts = {})
  first, last = opts[:line_range]
  File.open(file, 'r') do |fh|
    fh.each_with_index do |ln, lno|
      if first
        next if lno < first
        # Nothing else to read past the range; avoids scanning the rest of
        # the file in every parallel worker
        break if lno > last
      end
      next if !opts[:comments] && ln.start_with?('#')

      yield(GFA::Record[ln])
    end
  end
end
36
+
37
##
# Load a GFA object from a gfa +file+ in parallel using +thr+ child
# processes, and the same +opts+ supported by +load+. Defaults to the +load+
# method instead if +thr <= 1+ or if the file has fewer than two lines.
#
# Raises a RuntimeError if any child process returns no data.
def self.load_parallel(file, thr, opts = {})
  return self.load(file, opts) if thr <= 1

  # Prepare data: never use more workers than lines. An empty file would
  # otherwise yield zero workers and a 0/0 block size
  lno = File.foreach(file).count
  thr = [thr, lno].min
  return self.load(file, opts) if thr <= 1

  blk = (lno.to_f / thr).ceil

  # Launch children processes, each parsing its own block of lines and
  # sending the records back through a dedicated pipe
  pipes = []
  pids = []
  thr.times do |i|
    pipes[i] = IO.pipe
    pids << fork do
      pipes[i][0].close
      o = opts.merge(line_range: [i * blk, (i + 1) * blk - 1])
      records = []
      read_records(file, o) { |record| records << record }
      Marshal.dump(records, pipes[i][1])
      exit!(0) # Skip at_exit handlers inherited from the parent
    end
    pipes[i][1].close
  end

  # Collect and merge results, in file order
  gfa = GFA.new(opts)
  pipes.each_with_index do |pipe, k|
    # Read before waiting so a child blocked on a full pipe can finish
    result = pipe[0].read
    pipe[0].close
    Process.wait(pids[k])
    raise "Child process failed: #{k}" if result.empty?

    # The payload was produced by our own child process (trusted data)
    Marshal.load(result).each { |record| gfa << record }
  end

  gfa
end
20
78
 
21
79
  def self.supported_version?(v)
@@ -24,7 +82,7 @@ class GFA
24
82
 
25
83
  # Instance-level
26
84
  def <<(obj)
27
- obj = parse_line(obj) unless obj.is_a? GFA::Record
85
+ obj = GFA::Record[obj] unless obj.is_a? GFA::Record
28
86
  return if obj.nil? || obj.empty?
29
87
  @records[obj.type] << obj
30
88
 
@@ -41,14 +99,4 @@ class GFA
41
99
 
42
100
  @gfa_version = v
43
101
  end
44
-
45
- private
46
-
47
- def parse_line(string)
48
- string = string.chomp
49
- return nil if string =~ /^\s*$/
50
- return nil if !opts[:comments] && string[0] == '#'
51
-
52
- GFA::Record[string]
53
- end
54
102
  end
@@ -1,3 +1,5 @@
1
+ require 'gfa/record/has_from_to'
2
+
1
3
  class GFA::Record::Containment < GFA::Record
2
4
  CODE = :C
3
5
  REQ_FIELDS = %i[from from_orient to to_orient pos overlap]
@@ -12,6 +14,8 @@ class GFA::Record::Containment < GFA::Record
12
14
  end
13
15
  OPT_FIELDS.each_key { |i| define_method(i) { fields[i] } }
14
16
 
17
+ include GFA::Record::HasFromTo
18
+
15
19
  alias container from
16
20
  alias container_orient from_orient
17
21
  alias contained to
@@ -0,0 +1,47 @@
1
##
# Behaviour shared by records that connect a +from+ segment to a +to+
# segment. Including classes must provide +from+, +to+, and +fields+, with
# the segment names stored in fields 2 (from) and 4 (to), each one followed
# by its orientation (fields 3 and 5).
module GFA::Record::HasFromTo
  ##
  # Does the record start at +segment+ (optionally with +orient+)?
  def from?(segment, orient = nil)
    links_from_to?(segment, orient, true)
  end

  ##
  # Does the record end at +segment+ (optionally with +orient+)?
  def to?(segment, orient = nil)
    links_from_to?(segment, orient, false)
  end

  ##
  # Extracts the two linked segments from +gfa+, which *must* be indexed.
  # Raises a RuntimeError otherwise
  def segments(gfa)
    raise "Unindexed GFA" unless gfa.indexed?

    [from, to].map { |name| gfa.segments[name.value] }
  end

  ##
  # Is +segment+ either end of the record? The class of +segment+ is not
  # checked: it only needs to respond to +name+
  def include?(segment)
    return true if segment.name == from

    segment.name == to
  end

  ##
  # Array of strings with the names of the two segments linked by the record
  def segment_names_a
    [from, to].map(&:value)
  end

  private

  ##
  # Tests +segment+ (and +orient+, unless +nil+) against the from-end of the
  # record if +from+ is true, or against the to-end otherwise
  def links_from_to?(segment, orient, from)
    name_idx = from ? 2 : 4
    return false unless segment_name(segment) == fields[name_idx].value

    orient = orient.value if orient.is_a? GFA::Field
    orient.nil? || orient == fields[name_idx + 1].value
  end

  ##
  # Reduces a GFA::Record::Segment or a GFA::Field to its plain value; any
  # other object is returned untouched
  def segment_name(segment)
    case segment
    when GFA::Record::Segment then segment.name.value
    when GFA::Field then segment.value
    else segment
    end
  end
end
@@ -1,3 +1,5 @@
1
+ require 'gfa/record/has_from_to'
2
+
1
3
  class GFA::Record::Jump < GFA::Record
2
4
  CODE = :J
3
5
  REQ_FIELDS = %i[from from_orient to to_orient distance]
@@ -10,6 +12,8 @@ class GFA::Record::Jump < GFA::Record
10
12
  end
11
13
  OPT_FIELDS.each_key { |i| define_method(i) { fields[i] } }
12
14
 
15
+ include GFA::Record::HasFromTo
16
+
13
17
  def initialize(from, from_orient, to, to_orient, distance, *opt_fields)
14
18
  @fields = {}
15
19
  add_field(2, :Z, from, /[!-)+-<>-~][!-~]*/)
@@ -19,27 +23,4 @@ class GFA::Record::Jump < GFA::Record
19
23
  add_field(6, :Z, distance, /\*|[-+]?[0-9]+/)
20
24
  opt_fields.each { |f| add_opt_field(f, OPT_FIELDS) }
21
25
  end
22
-
23
- def from?(segment, orient = nil)
24
- links_from_to?(segment, orient, true)
25
- end
26
-
27
- def to?(segment, orient = nil)
28
- links_from_to?(segment, orient, false)
29
- end
30
-
31
- private
32
-
33
- def links_from_to?(segment, orient, from)
34
- segment = segment_name(segment)
35
- orient = orient.value if orient.is_a? GFA::Field
36
- base_k = from ? 2 : 4
37
- segment==fields[base_k].value &&
38
- (orient.nil? || orient==fields[base_k + 1].value)
39
- end
40
-
41
- def segment_name(segment)
42
- segment.is_a?(GFA::Record::Segment) ? segment.name.value :
43
- segment.is_a?(GFA::Field) ? segment.value : segment
44
- end
45
26
  end
@@ -1,3 +1,5 @@
1
+ require 'gfa/record/has_from_to'
2
+
1
3
  class GFA::Record::Link < GFA::Record
2
4
  CODE = :L
3
5
  REQ_FIELDS = %i[from from_orient to to_orient overlap]
@@ -15,6 +17,8 @@ class GFA::Record::Link < GFA::Record
15
17
  end
16
18
  OPT_FIELDS.each_key { |i| define_method(i) { fields[i] } }
17
19
 
20
+ include GFA::Record::HasFromTo
21
+
18
22
  def initialize(from, from_orient, to, to_orient, overlap, *opt_fields)
19
23
  @fields = {}
20
24
  add_field(2, :Z, from, /[!-)+-<>-~][!-~]*/)
@@ -24,27 +28,4 @@ class GFA::Record::Link < GFA::Record
24
28
  add_field(6, :Z, overlap, /\*|([0-9]+[MIDNSHPX=])+/)
25
29
  opt_fields.each { |f| add_opt_field(f, OPT_FIELDS) }
26
30
  end
27
-
28
- def from?(segment, orient = nil)
29
- links_from_to?(segment, orient, true)
30
- end
31
-
32
- def to?(segment, orient = nil)
33
- links_from_to?(segment, orient, false)
34
- end
35
-
36
- private
37
-
38
- def links_from_to?(segment, orient, from)
39
- segment = segment_name(segment)
40
- orient = orient.value if orient.is_a? GFA::Field
41
- base_k = from ? 2 : 4
42
- segment==fields[base_k].value &&
43
- (orient.nil? || orient==fields[base_k + 1].value)
44
- end
45
-
46
- def segment_name(segment)
47
- segment.is_a?(GFA::Record::Segment) ? segment.name.value :
48
- segment.is_a?(GFA::Field) ? segment.value : segment
49
- end
50
31
  end
@@ -1,19 +1,45 @@
1
1
  class GFA::Record::Path < GFA::Record
2
2
  CODE = :P
3
- REQ_FIELDS = %i[path_name segment_name overlaps]
3
+ REQ_FIELDS = %i[path_name segment_names overlaps]
4
4
  OPT_FIELDS = {}
5
5
 
6
6
  REQ_FIELDS.each_index do |i|
7
7
  define_method(REQ_FIELDS[i]) { fields[i + 2] }
8
8
  end
9
9
 
10
+ alias segment_name segment_names
10
11
  alias cigar overlaps
11
12
 
12
- def initialize(path_name, segment_name, overlaps, *opt_fields)
13
+ def initialize(path_name, segment_names, overlaps, *opt_fields)
13
14
  @fields = {}
14
- add_field(2, :Z, path_name, /[!-)+-<>-~][!-~]*/)
15
- add_field(3, :Z, segment_name, /[!-)+-<>-~][!-~]*/)
16
- add_field(4, :Z, overlaps, /\*|([0-9]+[MIDNSHPX=]|[-+]?[0-9]+J|.)+/)
15
+ add_field(2, :Z, path_name, /[!-)+-<>-~][!-~]*/)
16
+ add_field(3, :Z, segment_names, /[!-)+-<>-~][!-~]*/)
17
+ add_field(4, :Z, overlaps, /\*|([0-9]+[MIDNSHPX=]|[-+]?[0-9]+J|.)+/)
17
18
  opt_fields.each { |f| add_opt_field(f, OPT_FIELDS) }
18
19
  end
20
+
21
##
# Names of the segments in the path as an Array of strings, in path order
# and with the trailing orientation sign (+ or -) removed
def segment_names_a
  oriented_names = segment_names.value.split(/[,;]/)
  oriented_names.map { |oriented| oriented.gsub(/[+-]$/, '') }
end
26
+
27
##
# Looks up in +gfa+ every segment named in the path, in path order.
# +gfa+ *must* be indexed, otherwise a RuntimeError is raised
def segments(gfa)
  raise "Unindexed GFA" unless gfa.indexed?

  segment_names_a.map { |name| gfa.segments[name] }
end
35
+
36
##
# Does the path go through +segment+? The class of +segment+ is not
# checked: it only needs to respond to +name+, which is compared with +==+
# against each plain segment name of the path
def include?(segment)
  segment_names_a.each do |name|
    return true if segment.name == name
  end
  false
end
19
45
  end
@@ -8,8 +8,11 @@ class GFA::Record::Segment < GFA::Record
8
8
  KC: :i, # k-mer count
9
9
  SH: :H, # SHA-256 checksum of the sequence
10
10
  UR: :Z, # URI or local file-system path of the sequence
11
- # Non-cannonical
12
- DP: :f # (From SAM)
11
+ # Non-canonical but uppercase (thus, reserved)
12
+ DP: :f, # SAM
13
+ SN: :Z, # rGFA: Name of stable sequence from which the segment is derived
14
+ SO: :i, # rGFA: Offset on the stable sequence
15
+ SR: :i # rGFA: Rank. 0 if on a linear reference genome; >0 otherwise
13
16
  }
14
17
 
15
18
  REQ_FIELDS.each_index do |i|
data/lib/gfa/record.rb CHANGED
@@ -30,6 +30,8 @@ class GFA::Record
30
30
  end
31
31
 
32
32
  def self.[](string)
33
+ return nil if string.nil? || string =~ /^\s*$/
34
+
33
35
  split = string[0] == '#' ? ['', 2] : ["\t", 0]
34
36
  code, *values = string.chomp.split(*split)
35
37
  code_class(code).new(*values)
@@ -67,6 +69,10 @@ class GFA::Record
67
69
  o.join("\t")
68
70
  end
69
71
 
72
##
# Builds an independent copy of the record by parsing its own string form
def dup
  serialized = to_s
  self.class[serialized]
end
75
+
70
76
  def hash
71
77
  { code => fields }.hash
72
78
  end
@@ -23,12 +23,12 @@ class GFA::RecordSet
23
23
 
24
24
  # Instance-level
25
25
 
26
- attr_reader :set, :gfa
26
+ attr_reader :set, :index, :gfa
27
27
 
28
- def initialize(gfa)
28
+ def initialize(gfa = nil)
29
29
  @set = []
30
30
  @index = {}
31
- @gfa = gfa
31
+ @gfa = gfa || GFA.new
32
32
  end
33
33
 
34
34
  def [](k)
@@ -69,25 +69,37 @@ class GFA::RecordSet
69
69
  raise "Wrong type of record: #{v.type}" if v.type != type
70
70
 
71
71
  @set << v
72
- index(v)
72
+ index!(v)
73
+ end
74
+
75
##
# Is the record set indexed? A set that is empty or has no index field
# cannot tell from its contents, so it reports the +index+ option of its
# GFA instead
def indexed?
  return gfa.opts[:index] if empty? || !index_field

  !index.empty?
end
78
+
79
##
# Discards the current index and indexes again every record in the set
def rebuild_index!
  @index = {}
  set.each(&method(:index!))
end
74
83
 
75
84
  def index_id(v)
76
85
  v[index_field]&.value
77
86
  end
78
87
 
79
- def index(v)
88
+ def index!(v)
80
89
  save_index(index_id(v), v) if index_field
81
90
 
82
91
  # Whenever present, index also by ID
83
- save_index(v[:ID].value, v) if v[:ID] && v[:ID].value =~ index_id(v)
92
+ if gfa.opts[:index_id] && v[:ID] && v[:ID].value =~ index_id(v)
93
+ save_index(v[:ID].value, v)
94
+ end
84
95
  end
85
96
 
86
97
  def save_index(k, v)
87
98
  return unless gfa.opts[:index] && k
88
99
 
89
100
  if @index[k]
90
- warn "#{type} already registered with field #{index_field}: #{k}"
101
+ f = index_field.is_a?(Integer) ? '' : "#{index_field}: "
102
+ raise "#{type} already registered: #{f}#{k}"
91
103
  end
92
104
  @index[k] = v
93
105
  end
@@ -96,4 +108,14 @@ class GFA::RecordSet
96
108
  k = k.value if k.is_a? GFA::Field
97
109
  @index[k]
98
110
  end
111
+
112
##
# Appends in place the records of +record_set+ and registers its index
# entries through #save_index. Raises a RuntimeError unless +record_set+
# is a GFA::RecordSet of the same record type as this set
def merge!(record_set)
  raise "Not a record set" unless record_set.is_a?(GFA::RecordSet)

  other_type = record_set.type
  raise "Wrong type of record set: #{other_type}" if other_type != type

  record_set.set.each { |record| @set << record }
  record_set.index.each { |key, record| save_index(key, record) }
end
99
121
  end
data/lib/gfa/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  class GFA
2
- VERSION = '0.3.1'
2
+ VERSION = '0.4.0'
3
3
  VERSION_ARRAY = VERSION.split(/\./).map { |x| x.to_i } # :nodoc:
4
4
  VERSION_MAJOR = VERSION_ARRAY[0] # :nodoc:
5
5
  VERSION_MINOR = VERSION_ARRAY[1] # :nodoc:
data/test/parser_test.rb CHANGED
@@ -49,8 +49,10 @@ class ParserTest < Test::Unit::TestCase
49
49
  assert(sample.path('first').is_a?(GFA::Record))
50
50
  assert(sample.paths['first'].is_a?(GFA::Record))
51
51
  assert_equal('first', sample.path('first')[2]&.value)
52
+ assert(sample.indexed?)
52
53
  sample = GFA.load(path, index: false)
53
54
  assert_nil(sample.path('first'))
55
+ assert(!sample.indexed?)
54
56
  end
55
57
 
56
58
  def test_version_suppport
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gfa
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-02-23 00:00:00.000000000 Z
11
+ date: 2023-03-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rgl
@@ -63,6 +63,8 @@ files:
63
63
  - LICENSE
64
64
  - README.md
65
65
  - Rakefile
66
+ - bin/gfa-add-gaf
67
+ - bin/gfa-subgraph
66
68
  - lib/gfa.rb
67
69
  - lib/gfa/common.rb
68
70
  - lib/gfa/field.rb
@@ -79,6 +81,7 @@ files:
79
81
  - lib/gfa/record.rb
80
82
  - lib/gfa/record/comment.rb
81
83
  - lib/gfa/record/containment.rb
84
+ - lib/gfa/record/has_from_to.rb
82
85
  - lib/gfa/record/header.rb
83
86
  - lib/gfa/record/jump.rb
84
87
  - lib/gfa/record/link.rb