gfa 0.3.1 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +16 -12
- data/bin/gfa-add-gaf +70 -0
- data/bin/gfa-subgraph +41 -0
- data/lib/gfa/common.rb +25 -3
- data/lib/gfa/generator.rb +3 -3
- data/lib/gfa/graph.rb +138 -2
- data/lib/gfa/parser.rb +65 -17
- data/lib/gfa/record/containment.rb +4 -0
- data/lib/gfa/record/has_from_to.rb +47 -0
- data/lib/gfa/record/jump.rb +4 -23
- data/lib/gfa/record/link.rb +4 -23
- data/lib/gfa/record/path.rb +31 -5
- data/lib/gfa/record/segment.rb +5 -2
- data/lib/gfa/record.rb +6 -0
- data/lib/gfa/record_set.rb +29 -7
- data/lib/gfa/version.rb +1 -1
- data/test/parser_test.rb +2 -0
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 97e6400338884b4ceb1161778c26c5d6de6ee71616be1b5caae6aa0691d88395
|
4
|
+
data.tar.gz: 2fe8103598246724d98e3ceeecce92b5564f45bf029bf034978277cde59b4caa
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3beac70ac4c3d4e46bd01399351fbc5e5ffcdaac5bd2a653b7f17c8f29df5c13e48a11575c42fa0ec78cba62485528148687614e9373ac6cc5dd773f97cd67a6
|
7
|
+
data.tar.gz: c488a4b26604ffc95228d5aa454399b4da8facecb9a90c2579be3c36446637bc60b89578c5a5443047ac020c4c9fb9b980fd3943fdb56e7993ebfcc7c4e60876
|
data/README.md
CHANGED
@@ -7,28 +7,35 @@
|
|
7
7
|
|
8
8
|
This implementation follows the specifications of [GFA-spec][].
|
9
9
|
|
10
|
+
To load the library:
|
11
|
+
|
12
|
+
```ruby
|
13
|
+
require 'gfa'
|
14
|
+
```
|
10
15
|
|
11
16
|
## Parsing GFA
|
12
17
|
|
13
18
|
To parse a file in GFA format:
|
14
19
|
|
15
20
|
```ruby
|
16
|
-
require 'gfa'
|
17
|
-
|
18
21
|
my_gfa = GFA.load('assembly.gfa')
|
19
22
|
```
|
20
23
|
|
21
|
-
|
24
|
+
For large GFA files, you can also parse them in parallel:
|
22
25
|
|
23
26
|
```ruby
|
24
|
-
|
27
|
+
my_gfa = GFA.load_parallel('large-graph.gfa', 4)
|
28
|
+
```
|
29
|
+
|
30
|
+
To load GFA strings line-by-line:
|
25
31
|
|
32
|
+
```ruby
|
26
33
|
my_gfa = GFA.new
|
27
|
-
|
28
|
-
fh.each do |ln|
|
29
|
-
|
34
|
+
File.open('assembly.gfa', 'r') do |fh|
|
35
|
+
fh.each do |ln|
|
36
|
+
my_gfa << ln
|
37
|
+
end
|
30
38
|
end
|
31
|
-
fh.close
|
32
39
|
```
|
33
40
|
|
34
41
|
|
@@ -58,7 +65,6 @@ Any `GFA` object can be exported as an [`RGL`][rgl] graph using the methods
|
|
58
65
|
[tiny.gfa](https://github.com/lmrodriguezr/gfa/raw/master/data/tiny.gfa):
|
59
66
|
|
60
67
|
```ruby
|
61
|
-
require 'gfa'
|
62
68
|
require 'rgl/dot'
|
63
69
|
|
64
70
|
my_gfa = GFA.load('data/tiny.gfa')
|
@@ -91,8 +97,6 @@ Or add the following line to your Gemfile:
|
|
91
97
|
gem 'gfa'
|
92
98
|
```
|
93
99
|
|
94
|
-
and run `bundle install` from your shell.
|
95
|
-
|
96
100
|
|
97
101
|
# Author
|
98
102
|
|
@@ -103,6 +107,6 @@ and run `bundle install` from your shell.
|
|
103
107
|
|
104
108
|
[Artistic License 2.0](LICENSE).
|
105
109
|
|
106
|
-
[GFA-spec]: https://github.com/
|
110
|
+
[GFA-spec]: https://github.com/GFA-spec/GFA-spec
|
107
111
|
[lrr]: https://rodriguez-r.com/
|
108
112
|
[rgl]: https://github.com/monora/rgl
|
data/bin/gfa-add-gaf
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# @package MiGA
|
4
|
+
# @license Artistic-2.0
|
5
|
+
|
6
|
+
$LOAD_PATH.push File.expand_path('../../lib', __FILE__)
|
7
|
+
$LOAD_PATH.push File.expand_path('../../lib', File.realpath(__FILE__))
|
8
|
+
|
9
|
+
require 'gfa'
|
10
|
+
|
11
|
+
input_gfa, input_gaf, output, degree, threads = ARGV
|
12
|
+
|
13
|
+
unless degree
|
14
|
+
$stderr.puts <<~HELP
|
15
|
+
gfa-add-gaf <input-gfa> <input-gaf> <output> <degree> [<pref> [<threads>]]
|
16
|
+
|
17
|
+
<input-gfa> Input GFA file to read
|
18
|
+
<input-gaf> Input GAF file to read
|
19
|
+
<output> Output GFA file to write
|
20
|
+
<degree> Maximum degree of separation between the segment set in the GAF
|
21
|
+
and any other included segments. If 0, only segments are
|
22
|
+
included. If 1, only the target segments, records linking to
|
23
|
+
them, and segments linked by those records. Any integer > 1
|
24
|
+
includes additional expansion rounds for those linked segments.
|
25
|
+
Use -1 to include the complete original GAF without subsetting.
|
26
|
+
<pref> A prefix to name all recorded paths
|
27
|
+
By default: Based on the GAF file name
|
28
|
+
<threads> If passed, parallelize process with these many threads
|
29
|
+
HELP
|
30
|
+
exit(1)
|
31
|
+
end
|
32
|
+
|
33
|
+
$stderr.puts "Loading GFA: #{input_gfa}"
|
34
|
+
gfa = GFA.load_parallel(input_gfa, (threads || 1).to_i)
|
35
|
+
|
36
|
+
$stderr.puts "Loading GAF: #{input_gaf}"
|
37
|
+
$stderr.puts "- Minimum identity: #{0.95}"
|
38
|
+
pref ||= File.basename(input_gaf, '.gaf').gsub(/[^!-)+-<>-~]/, '_')
|
39
|
+
segments = []
|
40
|
+
File.open(input_gaf, 'r') do |fh|
|
41
|
+
fh.each do |ln|
|
42
|
+
row = ln.chomp.split("\t")
|
43
|
+
opt = Hash[row[12..].map { |i| i.split(':', 2) }]
|
44
|
+
opt.each { |k, v| opt[k] = GFA::Field[v] }
|
45
|
+
next if opt['id'] && opt['id'].value < 0.95
|
46
|
+
|
47
|
+
gaf_path = row[5]
|
48
|
+
seg_names = []
|
49
|
+
gaf_path.scan(/[><]?[^><]+/).each do |seg|
|
50
|
+
seg_orient = seg.match?(/^</) ? '-' : '+'
|
51
|
+
seg_name = seg.sub(/^[><]/, '')
|
52
|
+
seg_names << "#{seg_name}#{seg_orient}"
|
53
|
+
segments << seg_name unless segments.include?(seg_name)
|
54
|
+
end
|
55
|
+
gfa << GFA::Record::Path.new(
|
56
|
+
"#{pref}_#{$.}", seg_names.join(','), opt['cg']&.value || '*'
|
57
|
+
)
|
58
|
+
end
|
59
|
+
end
|
60
|
+
$stderr.puts "- Found #{segments.size} linked segments"
|
61
|
+
|
62
|
+
degree = degree.to_i
|
63
|
+
if degree >= 0
|
64
|
+
$stderr.puts 'Subsetting graph'
|
65
|
+
gfa = gfa.subgraph(segments, degree: degree)
|
66
|
+
end
|
67
|
+
|
68
|
+
$stderr.puts "Saving GFA: #{output}"
|
69
|
+
gfa.save(output)
|
70
|
+
|
data/bin/gfa-subgraph
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# @package MiGA
|
4
|
+
# @license Artistic-2.0
|
5
|
+
|
6
|
+
$LOAD_PATH.push File.expand_path('../../lib', __FILE__)
|
7
|
+
$LOAD_PATH.push File.expand_path('../../lib', File.realpath(__FILE__))
|
8
|
+
|
9
|
+
require 'gfa'
|
10
|
+
|
11
|
+
input, output, degree, segments, threads = ARGV
|
12
|
+
|
13
|
+
unless segments
|
14
|
+
$stderr.puts <<~HELP
|
15
|
+
Select a set of segments and include only elements of the GFA linked to
|
16
|
+
those segments (directly or indirectly)
|
17
|
+
|
18
|
+
gfa-subgraph <input> <output> <degree> <segments> [<threads>]
|
19
|
+
|
20
|
+
<input> Input GFA file to read
|
21
|
+
<output> Output GFA file to write
|
22
|
+
<degree> Maximum degree of separation between the segment set and any
|
23
|
+
other included segments. If 0, only segments are included.
|
24
|
+
If 1, only the target segments, records linking to them, and
|
25
|
+
segments linked by those records. Any integer > 1 includes
|
26
|
+
additional expansion rounds for those linked segments.
|
27
|
+
<segments> Comma-delimited list of segment segments
|
28
|
+
<threads> If passed, parallelize process with these many threads
|
29
|
+
HELP
|
30
|
+
exit(1)
|
31
|
+
end
|
32
|
+
|
33
|
+
$stderr.puts "Loading GFA: #{input}"
|
34
|
+
gfa = GFA.load_parallel(input, (threads || 1).to_i)
|
35
|
+
|
36
|
+
$stderr.puts 'Subsetting graph'
|
37
|
+
gfa = gfa.subgraph(segments.split(','), degree: degree.to_i)
|
38
|
+
|
39
|
+
$stderr.puts "Saving GFA: #{output}"
|
40
|
+
gfa.save(output)
|
41
|
+
|
data/lib/gfa/common.rb
CHANGED
@@ -14,8 +14,8 @@ class GFA
|
|
14
14
|
attr :gfa_version, :records, :opts
|
15
15
|
|
16
16
|
GFA::Record.TYPES.each do |r_type|
|
17
|
-
plural = "#{r_type.downcase}s"
|
18
17
|
singular = "#{r_type.downcase}"
|
18
|
+
plural = "#{singular}s"
|
19
19
|
|
20
20
|
define_method(plural) { records[r_type] }
|
21
21
|
define_method(singular) { |k| records[r_type][k] }
|
@@ -24,7 +24,7 @@ class GFA
|
|
24
24
|
|
25
25
|
def initialize(opts = {})
|
26
26
|
@records = {}
|
27
|
-
@opts = { index: true, comments: false }.merge(opts)
|
27
|
+
@opts = { index: true, index_id: false, comments: false }.merge(opts)
|
28
28
|
GFA::Record.TYPES.each do |t|
|
29
29
|
@records[t] = GFA::RecordSet.name_class(t).new(self)
|
30
30
|
end
|
@@ -38,5 +38,27 @@ class GFA
|
|
38
38
|
records == gfa.records
|
39
39
|
end
|
40
40
|
|
41
|
-
|
41
|
+
def ==(gfa)
|
42
|
+
eql?(gfa)
|
43
|
+
end
|
44
|
+
|
45
|
+
def size
|
46
|
+
records.values.map(&:size).inject(0, :+)
|
47
|
+
end
|
48
|
+
|
49
|
+
def merge!(gfa)
|
50
|
+
raise "Unsupported object: #{gfa}" unless gfa.is_a? GFA
|
51
|
+
|
52
|
+
GFA::Record.TYPES.each do |t|
|
53
|
+
@records[t].merge!(gfa.records[t])
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
def indexed?
|
58
|
+
records.values.all?(&:indexed?)
|
59
|
+
end
|
60
|
+
|
61
|
+
def rebuild_index!
|
62
|
+
@records.each_value(&:rebuild_index!)
|
63
|
+
end
|
42
64
|
end
|
data/lib/gfa/generator.rb
CHANGED
@@ -8,9 +8,9 @@ class GFA
|
|
8
8
|
end
|
9
9
|
|
10
10
|
def each_line(&blk)
|
11
|
-
set_version_header('1.
|
11
|
+
set_version_header('1.2') if gfa_version.nil?
|
12
12
|
GFA::Record.TYPES.each do |r_type|
|
13
|
-
records[r_type].each do |record|
|
13
|
+
records[r_type].set.each do |record|
|
14
14
|
blk[record.to_s]
|
15
15
|
end
|
16
16
|
end
|
@@ -23,7 +23,7 @@ class GFA
|
|
23
23
|
end
|
24
24
|
|
25
25
|
def unset_version
|
26
|
-
|
26
|
+
headers.set.delete_if { |o| !o.fields[:VN].nil? }
|
27
27
|
@gfa_version = nil
|
28
28
|
end
|
29
29
|
|
data/lib/gfa/graph.rb
CHANGED
@@ -21,9 +21,145 @@ class GFA
|
|
21
21
|
def adjacency_graph(opts = {})
|
22
22
|
implicit_graph(opts).to_adjacency
|
23
23
|
end
|
24
|
-
|
24
|
+
|
25
|
+
##
|
26
|
+
# Extracts the subset of records associated to +segments+, which is an Array
|
27
|
+
# with values of any class in: Integer (segment index),
|
28
|
+
# String or GFA::Field::String (segment names), or GFA::Record::Segment.
|
29
|
+
#
|
30
|
+
# +degree+ indicates the maximum degree of separation between the original
|
31
|
+
# segment set and any additional segments. Use 0 to include only the segments
|
32
|
+
# in the set. Use 1 to include those, the records linking to them, and the
|
33
|
+
# additional segments linked by those records. Use any integer greater than 1
|
34
|
+
# to prompt additional rounds of greedy graph expansion.
|
35
|
+
#
|
36
|
+
# If +headers+, it includes all the original headers. Otherwise it only
|
37
|
+
# includes the version header (might be inferred).
|
38
|
+
#
|
39
|
+
# All comments are ignored even if originally parsed. Walks are currently
|
40
|
+
# ignored too. If the current GFA object doesn't have an index, it builds one
|
41
|
+
# and forces +index: true+. The output object inherits all options.
|
42
|
+
def subgraph(segments, degree: 1, headers: true)
|
43
|
+
# Prepare objects
|
44
|
+
unless opts[:index]
|
45
|
+
opts[:index] = true
|
46
|
+
rebuild_index!
|
47
|
+
end
|
48
|
+
gfa = GFA.new(opts)
|
49
|
+
segments =
|
50
|
+
segments.map do |i|
|
51
|
+
i.is_a?(GFA::Record::Segment) ? i :
|
52
|
+
segment(i) or raise "Cannot find segment: #{i}"
|
53
|
+
end
|
54
|
+
|
55
|
+
# Headers
|
56
|
+
if headers
|
57
|
+
self.headers.set.each { |record| gfa << record }
|
58
|
+
else
|
59
|
+
gfa << GFA::Record::Header.new("VN:Z:#{gfa_version}")
|
60
|
+
end
|
61
|
+
|
62
|
+
# Original segments
|
63
|
+
segments.each { |segment| gfa << segment }
|
64
|
+
|
65
|
+
# Expand graph
|
66
|
+
linking, edges = linking_records(gfa.segments, degree: degree)
|
67
|
+
linking += internally_linking_records(segments, edges)
|
68
|
+
linking.each { |record| gfa << record }
|
69
|
+
|
70
|
+
# Return
|
71
|
+
gfa
|
72
|
+
end
|
73
|
+
|
74
|
+
##
|
75
|
+
# Finds all the records linking to any segments in +segments+, a
|
76
|
+
# GFA::RecordSet::SegmentSet object, and expands to links with up to
|
77
|
+
# +degree+ degrees of separation
|
78
|
+
#
|
79
|
+
# It only evaluates the edges given in the +edges+ Array of GFA::Record
|
80
|
+
# values. If +edges+ is +nil+, it uses the full set of edges in the gfa.
|
81
|
+
# Edge GFA::Record objects can be of type Link, Containment, Jump, or Path
|
82
|
+
#
|
83
|
+
# If +_ignore+ is passed, it ignores this number of segments at the beginning
|
84
|
+
# of the +segments+ set (assumes they have already been evaluated). This is
|
85
|
+
# only used for internal heuristics
|
86
|
+
#
|
87
|
+
# Returns an Array with two elements:
|
88
|
+
# 0. An array of GFA::Record objects with all the identified linking records
|
89
|
+
# 1. An array of GFA::Record objects with all edges that were not identified
|
90
|
+
#
|
91
|
+
# IMPORTANT NOTE 1: The object +segments+ will be modified to include all
|
92
|
+
# linked segments. If you don't want this behaviour, please make sure to pass
|
93
|
+
# a duplicate of the object instead.
|
94
|
+
#
|
95
|
+
# IMPORTANT NOTE 2: The list of linking records may not comprehensively
|
96
|
+
# include all records linking the identified expanded segment set. To ensure
|
97
|
+
# a consistent set is identified, use:
|
98
|
+
# linking, edges = gfa.linking_records(segments)
|
99
|
+
# linking += gfa.internally_linking_records(segments, edges)
|
100
|
+
#
|
101
|
+
def linking_records(segments, degree: 1, edges: nil, _ignore: 0)
|
102
|
+
unless segments.is_a? GFA::RecordSet::SegmentSet
|
103
|
+
raise "Unrecognised class: #{segments.class}"
|
104
|
+
end
|
105
|
+
|
106
|
+
# Gather edges to evaluate
|
107
|
+
edges ||= all_edges
|
108
|
+
return [[], edges] if degree <= 0
|
109
|
+
|
110
|
+
# Links, Containments, Jumps (from, to) and Paths (segment_names)
|
111
|
+
linking = []
|
112
|
+
eval_set = _ignore == 0 ? segments.set : segments.set[_ignore..]
|
113
|
+
edges.delete_if do |record|
|
114
|
+
if eval_set.any? { |segment| record.include? segment }
|
115
|
+
linking << record
|
116
|
+
true # Remove from the edge set to speed up future recursions
|
117
|
+
else
|
118
|
+
false # Keep it, possibly linking future recursions
|
119
|
+
end
|
120
|
+
end
|
121
|
+
|
122
|
+
# Recurse and return
|
123
|
+
if degree >= 1
|
124
|
+
pre = segments.size
|
125
|
+
|
126
|
+
# Add additional linked segments
|
127
|
+
linking.each do |record|
|
128
|
+
record.segments(self).each do |other_seg|
|
129
|
+
segments << other_seg unless segments[other_seg.name]
|
130
|
+
end
|
131
|
+
end
|
132
|
+
|
133
|
+
# Recurse only if new segments were discovered
|
134
|
+
if segments.size > pre
|
135
|
+
$stderr.puts "- Recursion [#{degree}]: " \
|
136
|
+
"#{pre} -> #{segments.size}\t(#{edges.size})"
|
137
|
+
linking +=
|
138
|
+
linking_records(
|
139
|
+
segments,
|
140
|
+
degree: degree - 1, edges: edges, _ignore: pre
|
141
|
+
)[0]
|
142
|
+
end
|
143
|
+
end
|
144
|
+
[linking, edges]
|
145
|
+
end
|
146
|
+
|
147
|
+
def internally_linking_records(segments, edges)
|
148
|
+
$stderr.puts '- Gathering internally linking records'
|
149
|
+
segments = Hash[segments.set.map { |i| [i.name.value, true]}]
|
150
|
+
edges.select { |record| record.segment_names_a.all? { |s| segments[s] } }
|
151
|
+
end
|
152
|
+
|
153
|
+
##
|
154
|
+
# Returns an array of GFA::Record objects including all possible edges
|
155
|
+
# from the GFA. I.e., all links, jumps, containments, and paths.
|
156
|
+
def all_edges
|
157
|
+
edge_t = %i[Link Jump Containment Path]
|
158
|
+
edges = edge_t.flat_map { |t| records[t].set } if edges.nil?
|
159
|
+
end
|
160
|
+
|
25
161
|
private
|
26
|
-
|
162
|
+
|
27
163
|
def segment_names_with_orient
|
28
164
|
segments.flat_map do |s|
|
29
165
|
%w[+ -].map { |orient| GFA::GraphVertex.idx(s, orient) }
|
data/lib/gfa/parser.rb
CHANGED
@@ -6,16 +6,74 @@ class GFA
|
|
6
6
|
MAX_VERSION = '1.2'
|
7
7
|
|
8
8
|
##
|
9
|
-
# Load a GFA object from a
|
10
|
-
# - index: If the
|
9
|
+
# Load a GFA object from a gfa +file+ with options +opts+:
|
10
|
+
# - index: If the records should be indexed as loaded (default: true)
|
11
|
+
# - index_id: If the records should also be indexed by ID (default: false)
|
11
12
|
# - comments: If the comment records should be saved (default: false)
|
13
|
+
# - line_range: Two-integer array indicating the first and last lines to read
|
14
|
+
# (default: nil, read the entire file)
|
12
15
|
def self.load(file, opts = {})
|
13
16
|
gfa = GFA.new(opts)
|
14
|
-
|
15
|
-
|
17
|
+
read_records(file, opts) do |record|
|
18
|
+
gfa << record
|
19
|
+
end
|
16
20
|
gfa
|
17
|
-
|
18
|
-
|
21
|
+
end
|
22
|
+
|
23
|
+
def self.read_records(file, opts = {})
|
24
|
+
rng = opts[:line_range]
|
25
|
+
File.open(file, 'r') do |fh|
|
26
|
+
lno = -1
|
27
|
+
fh.each do |ln|
|
28
|
+
lno += 1
|
29
|
+
next if !rng.nil? && (lno < rng[0] || lno > rng[1])
|
30
|
+
next if !opts[:comments] && ln[0] == '#'
|
31
|
+
|
32
|
+
yield(GFA::Record[ln])
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
##
|
38
|
+
# Load a GFA object from a gfa +file+ in parallel using +thr+ threads,
|
39
|
+
# and the same +opts+ supported by +load+. Defaults to the +load+ method
|
40
|
+
# instead if +thr <= 1+.
|
41
|
+
def self.load_parallel(file, thr, opts = {})
|
42
|
+
return self.load(file, opts) if thr <= 1
|
43
|
+
|
44
|
+
# Prepare data
|
45
|
+
lno = 0
|
46
|
+
File.open(file, 'r') { |fh| fh.each { lno += 1 } }
|
47
|
+
thr = lno if thr > lno
|
48
|
+
blk = (lno.to_f / thr).ceil
|
49
|
+
|
50
|
+
# Launch children processes
|
51
|
+
io = []
|
52
|
+
pid = []
|
53
|
+
thr.times do |i|
|
54
|
+
io[i] = IO.pipe
|
55
|
+
pid << fork do
|
56
|
+
io[i][0].close
|
57
|
+
o = opts.merge(line_range: [i * blk, (i + 1) * blk - 1])
|
58
|
+
records = []
|
59
|
+
read_records(file, o) { |record| records << record }
|
60
|
+
Marshal.dump(records, io[i][1])
|
61
|
+
exit!(0)
|
62
|
+
end
|
63
|
+
io[i][1].close
|
64
|
+
end
|
65
|
+
|
66
|
+
# Collect and merge results
|
67
|
+
gfa = GFA.new(opts)
|
68
|
+
io.each_with_index do |pipe, k|
|
69
|
+
result = pipe[0].read
|
70
|
+
Process.wait(pid[k])
|
71
|
+
raise "Child process failed: #{k}" if result.empty?
|
72
|
+
Marshal.load(result).each { |record| gfa << record }
|
73
|
+
pipe[0].close
|
74
|
+
end
|
75
|
+
|
76
|
+
return gfa
|
19
77
|
end
|
20
78
|
|
21
79
|
def self.supported_version?(v)
|
@@ -24,7 +82,7 @@ class GFA
|
|
24
82
|
|
25
83
|
# Instance-level
|
26
84
|
def <<(obj)
|
27
|
-
obj =
|
85
|
+
obj = GFA::Record[obj] unless obj.is_a? GFA::Record
|
28
86
|
return if obj.nil? || obj.empty?
|
29
87
|
@records[obj.type] << obj
|
30
88
|
|
@@ -41,14 +99,4 @@ class GFA
|
|
41
99
|
|
42
100
|
@gfa_version = v
|
43
101
|
end
|
44
|
-
|
45
|
-
private
|
46
|
-
|
47
|
-
def parse_line(string)
|
48
|
-
string = string.chomp
|
49
|
-
return nil if string =~ /^\s*$/
|
50
|
-
return nil if !opts[:comments] && string[0] == '#'
|
51
|
-
|
52
|
-
GFA::Record[string]
|
53
|
-
end
|
54
102
|
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'gfa/record/has_from_to'
|
2
|
+
|
1
3
|
class GFA::Record::Containment < GFA::Record
|
2
4
|
CODE = :C
|
3
5
|
REQ_FIELDS = %i[from from_orient to to_orient pos overlap]
|
@@ -12,6 +14,8 @@ class GFA::Record::Containment < GFA::Record
|
|
12
14
|
end
|
13
15
|
OPT_FIELDS.each_key { |i| define_method(i) { fields[i] } }
|
14
16
|
|
17
|
+
include GFA::Record::HasFromTo
|
18
|
+
|
15
19
|
alias container from
|
16
20
|
alias container_orient from_orient
|
17
21
|
alias contained to
|
@@ -0,0 +1,47 @@
|
|
1
|
+
module GFA::Record::HasFromTo
|
2
|
+
def from?(segment, orient = nil)
|
3
|
+
links_from_to?(segment, orient, true)
|
4
|
+
end
|
5
|
+
|
6
|
+
def to?(segment, orient = nil)
|
7
|
+
links_from_to?(segment, orient, false)
|
8
|
+
end
|
9
|
+
|
10
|
+
##
|
11
|
+
# Extracts all linked segments from +gfa+ (which *must* be indexed)
|
12
|
+
def segments(gfa)
|
13
|
+
raise "Unindexed GFA" unless gfa.indexed?
|
14
|
+
[gfa.segments[from.value], gfa.segments[to.value]]
|
15
|
+
end
|
16
|
+
|
17
|
+
##
|
18
|
+
# Includes a GFA::Record::Segment +segment+?
|
19
|
+
def include?(segment)
|
20
|
+
# unless segment.is_a? GFA::Record::Segment
|
21
|
+
# raise "Unrecognized class: #{segment.class}"
|
22
|
+
# end
|
23
|
+
segment.name == from || segment.name == to
|
24
|
+
end
|
25
|
+
|
26
|
+
##
|
27
|
+
# Array of strings with the names of the segments linked by the
|
28
|
+
# record
|
29
|
+
def segment_names_a
|
30
|
+
[from.value, to.value]
|
31
|
+
end
|
32
|
+
|
33
|
+
private
|
34
|
+
|
35
|
+
def links_from_to?(segment, orient, from)
|
36
|
+
segment = segment_name(segment)
|
37
|
+
orient = orient.value if orient.is_a? GFA::Field
|
38
|
+
base_k = from ? 2 : 4
|
39
|
+
segment == fields[base_k].value &&
|
40
|
+
(orient.nil? || orient == fields[base_k + 1].value)
|
41
|
+
end
|
42
|
+
|
43
|
+
def segment_name(segment)
|
44
|
+
segment.is_a?(GFA::Record::Segment) ? segment.name.value :
|
45
|
+
segment.is_a?(GFA::Field) ? segment.value : segment
|
46
|
+
end
|
47
|
+
end
|
data/lib/gfa/record/jump.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'gfa/record/has_from_to'
|
2
|
+
|
1
3
|
class GFA::Record::Jump < GFA::Record
|
2
4
|
CODE = :J
|
3
5
|
REQ_FIELDS = %i[from from_orient to to_orient distance]
|
@@ -10,6 +12,8 @@ class GFA::Record::Jump < GFA::Record
|
|
10
12
|
end
|
11
13
|
OPT_FIELDS.each_key { |i| define_method(i) { fields[i] } }
|
12
14
|
|
15
|
+
include GFA::Record::HasFromTo
|
16
|
+
|
13
17
|
def initialize(from, from_orient, to, to_orient, distance, *opt_fields)
|
14
18
|
@fields = {}
|
15
19
|
add_field(2, :Z, from, /[!-)+-<>-~][!-~]*/)
|
@@ -19,27 +23,4 @@ class GFA::Record::Jump < GFA::Record
|
|
19
23
|
add_field(6, :Z, distance, /\*|[-+]?[0-9]+/)
|
20
24
|
opt_fields.each { |f| add_opt_field(f, OPT_FIELDS) }
|
21
25
|
end
|
22
|
-
|
23
|
-
def from?(segment, orient = nil)
|
24
|
-
links_from_to?(segment, orient, true)
|
25
|
-
end
|
26
|
-
|
27
|
-
def to?(segment, orient = nil)
|
28
|
-
links_from_to?(segment, orient, false)
|
29
|
-
end
|
30
|
-
|
31
|
-
private
|
32
|
-
|
33
|
-
def links_from_to?(segment, orient, from)
|
34
|
-
segment = segment_name(segment)
|
35
|
-
orient = orient.value if orient.is_a? GFA::Field
|
36
|
-
base_k = from ? 2 : 4
|
37
|
-
segment==fields[base_k].value &&
|
38
|
-
(orient.nil? || orient==fields[base_k + 1].value)
|
39
|
-
end
|
40
|
-
|
41
|
-
def segment_name(segment)
|
42
|
-
segment.is_a?(GFA::Record::Segment) ? segment.name.value :
|
43
|
-
segment.is_a?(GFA::Field) ? segment.value : segment
|
44
|
-
end
|
45
26
|
end
|
data/lib/gfa/record/link.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'gfa/record/has_from_to'
|
2
|
+
|
1
3
|
class GFA::Record::Link < GFA::Record
|
2
4
|
CODE = :L
|
3
5
|
REQ_FIELDS = %i[from from_orient to to_orient overlap]
|
@@ -15,6 +17,8 @@ class GFA::Record::Link < GFA::Record
|
|
15
17
|
end
|
16
18
|
OPT_FIELDS.each_key { |i| define_method(i) { fields[i] } }
|
17
19
|
|
20
|
+
include GFA::Record::HasFromTo
|
21
|
+
|
18
22
|
def initialize(from, from_orient, to, to_orient, overlap, *opt_fields)
|
19
23
|
@fields = {}
|
20
24
|
add_field(2, :Z, from, /[!-)+-<>-~][!-~]*/)
|
@@ -24,27 +28,4 @@ class GFA::Record::Link < GFA::Record
|
|
24
28
|
add_field(6, :Z, overlap, /\*|([0-9]+[MIDNSHPX=])+/)
|
25
29
|
opt_fields.each { |f| add_opt_field(f, OPT_FIELDS) }
|
26
30
|
end
|
27
|
-
|
28
|
-
def from?(segment, orient = nil)
|
29
|
-
links_from_to?(segment, orient, true)
|
30
|
-
end
|
31
|
-
|
32
|
-
def to?(segment, orient = nil)
|
33
|
-
links_from_to?(segment, orient, false)
|
34
|
-
end
|
35
|
-
|
36
|
-
private
|
37
|
-
|
38
|
-
def links_from_to?(segment, orient, from)
|
39
|
-
segment = segment_name(segment)
|
40
|
-
orient = orient.value if orient.is_a? GFA::Field
|
41
|
-
base_k = from ? 2 : 4
|
42
|
-
segment==fields[base_k].value &&
|
43
|
-
(orient.nil? || orient==fields[base_k + 1].value)
|
44
|
-
end
|
45
|
-
|
46
|
-
def segment_name(segment)
|
47
|
-
segment.is_a?(GFA::Record::Segment) ? segment.name.value :
|
48
|
-
segment.is_a?(GFA::Field) ? segment.value : segment
|
49
|
-
end
|
50
31
|
end
|
data/lib/gfa/record/path.rb
CHANGED
@@ -1,19 +1,45 @@
|
|
1
1
|
class GFA::Record::Path < GFA::Record
|
2
2
|
CODE = :P
|
3
|
-
REQ_FIELDS = %i[path_name
|
3
|
+
REQ_FIELDS = %i[path_name segment_names overlaps]
|
4
4
|
OPT_FIELDS = {}
|
5
5
|
|
6
6
|
REQ_FIELDS.each_index do |i|
|
7
7
|
define_method(REQ_FIELDS[i]) { fields[i + 2] }
|
8
8
|
end
|
9
9
|
|
10
|
+
alias segment_name segment_names
|
10
11
|
alias cigar overlaps
|
11
12
|
|
12
|
-
def initialize(path_name,
|
13
|
+
def initialize(path_name, segment_names, overlaps, *opt_fields)
|
13
14
|
@fields = {}
|
14
|
-
add_field(2, :Z, path_name,
|
15
|
-
add_field(3, :Z,
|
16
|
-
add_field(4, :Z, overlaps,
|
15
|
+
add_field(2, :Z, path_name, /[!-)+-<>-~][!-~]*/)
|
16
|
+
add_field(3, :Z, segment_names, /[!-)+-<>-~][!-~]*/)
|
17
|
+
add_field(4, :Z, overlaps, /\*|([0-9]+[MIDNSHPX=]|[-+]?[0-9]+J|.)+/)
|
17
18
|
opt_fields.each { |f| add_opt_field(f, OPT_FIELDS) }
|
18
19
|
end
|
20
|
+
|
21
|
+
##
|
22
|
+
# Array of segment names (without orientations) as strings
|
23
|
+
def segment_names_a
|
24
|
+
segment_names.value.split(/[,;]/).map { |i| i.gsub(/[+-]$/, '') }
|
25
|
+
end
|
26
|
+
|
27
|
+
##
|
28
|
+
# Extracts all linked segments from +gfa+ (which *must* be indexed)
|
29
|
+
def segments(gfa)
|
30
|
+
raise "Unindexed GFA" unless gfa.indexed?
|
31
|
+
segment_names_a.map do |name|
|
32
|
+
gfa.segments[name]
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
##
|
37
|
+
# Includes a GFA::Record::Segment +segment+?
|
38
|
+
def include?(segment)
|
39
|
+
# unless segment.is_a? GFA::Record::Segment
|
40
|
+
# raise "Unrecognized class: #{segment.class}"
|
41
|
+
# end
|
42
|
+
|
43
|
+
segment_names_a.any? { |name| segment.name == name }
|
44
|
+
end
|
19
45
|
end
|
data/lib/gfa/record/segment.rb
CHANGED
@@ -8,8 +8,11 @@ class GFA::Record::Segment < GFA::Record
|
|
8
8
|
KC: :i, # k-mer count
|
9
9
|
SH: :H, # SHA-256 checksum of the sequence
|
10
10
|
UR: :Z, # URI or local file-system path of the sequence
|
11
|
-
# Non-cannonical
|
12
|
-
DP: :f
|
11
|
+
# Non-canonical but uppercase (thus, reserved)
|
12
|
+
DP: :f, # SAM
|
13
|
+
SN: :Z, # rGFA: Name of stable sequence from which the segment is derived
|
14
|
+
SO: :i, # rGFA: Offset on the stable sequence
|
15
|
+
SR: :i # rGFA: Rank. 0 if on a linear reference genome; >0 otherwise
|
13
16
|
}
|
14
17
|
|
15
18
|
REQ_FIELDS.each_index do |i|
|
data/lib/gfa/record.rb
CHANGED
@@ -30,6 +30,8 @@ class GFA::Record
|
|
30
30
|
end
|
31
31
|
|
32
32
|
def self.[](string)
|
33
|
+
return nil if string.nil? || string =~ /^\s*$/
|
34
|
+
|
33
35
|
split = string[0] == '#' ? ['', 2] : ["\t", 0]
|
34
36
|
code, *values = string.chomp.split(*split)
|
35
37
|
code_class(code).new(*values)
|
@@ -67,6 +69,10 @@ class GFA::Record
|
|
67
69
|
o.join("\t")
|
68
70
|
end
|
69
71
|
|
72
|
+
def dup
|
73
|
+
self.class[to_s]
|
74
|
+
end
|
75
|
+
|
70
76
|
def hash
|
71
77
|
{ code => fields }.hash
|
72
78
|
end
|
data/lib/gfa/record_set.rb
CHANGED
@@ -23,12 +23,12 @@ class GFA::RecordSet
|
|
23
23
|
|
24
24
|
# Instance-level
|
25
25
|
|
26
|
-
attr_reader :set, :gfa
|
26
|
+
attr_reader :set, :index, :gfa
|
27
27
|
|
28
|
-
def initialize(gfa)
|
28
|
+
def initialize(gfa = nil)
|
29
29
|
@set = []
|
30
30
|
@index = {}
|
31
|
-
@gfa = gfa
|
31
|
+
@gfa = gfa || GFA.new
|
32
32
|
end
|
33
33
|
|
34
34
|
def [](k)
|
@@ -69,25 +69,37 @@ class GFA::RecordSet
|
|
69
69
|
raise "Wrong type of record: #{v.type}" if v.type != type
|
70
70
|
|
71
71
|
@set << v
|
72
|
-
index(v)
|
72
|
+
index!(v)
|
73
|
+
end
|
74
|
+
|
75
|
+
def indexed?
|
76
|
+
(empty? || !index_field) ? gfa.opts[:index] : !index.empty?
|
77
|
+
end
|
78
|
+
|
79
|
+
def rebuild_index!
|
80
|
+
@index = {}
|
81
|
+
set.each { |v| index!(v) }
|
73
82
|
end
|
74
83
|
|
75
84
|
def index_id(v)
|
76
85
|
v[index_field]&.value
|
77
86
|
end
|
78
87
|
|
79
|
-
def index(v)
|
88
|
+
def index!(v)
|
80
89
|
save_index(index_id(v), v) if index_field
|
81
90
|
|
82
91
|
# Whenever present, index also by ID
|
83
|
-
|
92
|
+
if gfa.opts[:index_id] && v[:ID] && v[:ID].value =~ index_id(v)
|
93
|
+
save_index(v[:ID].value, v)
|
94
|
+
end
|
84
95
|
end
|
85
96
|
|
86
97
|
def save_index(k, v)
|
87
98
|
return unless gfa.opts[:index] && k
|
88
99
|
|
89
100
|
if @index[k]
|
90
|
-
|
101
|
+
f = index_field.is_a?(Integer) ? '' : "#{index_field}: "
|
102
|
+
raise "#{type} already registered: #{f}#{k}"
|
91
103
|
end
|
92
104
|
@index[k] = v
|
93
105
|
end
|
@@ -96,4 +108,14 @@ class GFA::RecordSet
|
|
96
108
|
k = k.value if k.is_a? GFA::Field
|
97
109
|
@index[k]
|
98
110
|
end
|
111
|
+
|
112
|
+
def merge!(record_set)
|
113
|
+
raise "Not a record set" unless record_set.is_a?(GFA::RecordSet)
|
114
|
+
if record_set.type != type
|
115
|
+
raise "Wrong type of record set: #{record_set.type}"
|
116
|
+
end
|
117
|
+
|
118
|
+
record_set.set.each { |i| @set << i }
|
119
|
+
record_set.index.each { |k, v| save_index(k, v) }
|
120
|
+
end
|
99
121
|
end
|
data/lib/gfa/version.rb
CHANGED
data/test/parser_test.rb
CHANGED
@@ -49,8 +49,10 @@ class ParserTest < Test::Unit::TestCase
|
|
49
49
|
assert(sample.path('first').is_a?(GFA::Record))
|
50
50
|
assert(sample.paths['first'].is_a?(GFA::Record))
|
51
51
|
assert_equal('first', sample.path('first')[2]&.value)
|
52
|
+
assert(sample.indexed?)
|
52
53
|
sample = GFA.load(path, index: false)
|
53
54
|
assert_nil(sample.path('first'))
|
55
|
+
assert(!sample.indexed?)
|
54
56
|
end
|
55
57
|
|
56
58
|
def test_version_suppport
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gfa
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Luis M. Rodriguez-R
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-03-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rgl
|
@@ -63,6 +63,8 @@ files:
|
|
63
63
|
- LICENSE
|
64
64
|
- README.md
|
65
65
|
- Rakefile
|
66
|
+
- bin/gfa-add-gaf
|
67
|
+
- bin/gfa-subgraph
|
66
68
|
- lib/gfa.rb
|
67
69
|
- lib/gfa/common.rb
|
68
70
|
- lib/gfa/field.rb
|
@@ -79,6 +81,7 @@ files:
|
|
79
81
|
- lib/gfa/record.rb
|
80
82
|
- lib/gfa/record/comment.rb
|
81
83
|
- lib/gfa/record/containment.rb
|
84
|
+
- lib/gfa/record/has_from_to.rb
|
82
85
|
- lib/gfa/record/header.rb
|
83
86
|
- lib/gfa/record/jump.rb
|
84
87
|
- lib/gfa/record/link.rb
|