gfa 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: cde4a3e432409c7a2967832dcebd502ddc54b1f6cb25856b6d9d21ce53f67b32
4
- data.tar.gz: 91213d63365dd3608c28e30115cbfc8621e78bbe34936832bcee2ac7e6f460fb
3
+ metadata.gz: 97e6400338884b4ceb1161778c26c5d6de6ee71616be1b5caae6aa0691d88395
4
+ data.tar.gz: 2fe8103598246724d98e3ceeecce92b5564f45bf029bf034978277cde59b4caa
5
5
  SHA512:
6
- metadata.gz: 321634c28ec8927bd38286a84a02783b9f915dcbabb7941233583dda7f41b66e952ff9611c9158bd7baca09d7d3d6c254a036f1c9f2169e5e24e6e964d292e71
7
- data.tar.gz: 3698d16ab5953ffd70bf2c102d154bc1f61e5a13a752bc317a756df13762c3668c1bf6e8144821e53e395783f36c45185249c7fc206694be19145200310c3f48
6
+ metadata.gz: 3beac70ac4c3d4e46bd01399351fbc5e5ffcdaac5bd2a653b7f17c8f29df5c13e48a11575c42fa0ec78cba62485528148687614e9373ac6cc5dd773f97cd67a6
7
+ data.tar.gz: c488a4b26604ffc95228d5aa454399b4da8facecb9a90c2579be3c36446637bc60b89578c5a5443047ac020c4c9fb9b980fd3943fdb56e7993ebfcc7c4e60876
data/README.md CHANGED
@@ -7,28 +7,35 @@
7
7
 
8
8
  This implementation follows the specifications of [GFA-spec][].
9
9
 
10
+ To load the library:
11
+
12
+ ```ruby
13
+ require 'gfa'
14
+ ```
10
15
 
11
16
  ## Parsing GFA
12
17
 
13
18
  To parse a file in GFA format:
14
19
 
15
20
  ```ruby
16
- require 'gfa'
17
-
18
21
  my_gfa = GFA.load('assembly.gfa')
19
22
  ```
20
23
 
21
- To load GFA strings line-by-line:
24
+ For large GFA files, you can also parse them in parallel:
22
25
 
23
26
  ```ruby
24
- require 'gfa'
27
+ my_gfa = GFA.load_parallel('large-graph.gfa', 4)
28
+ ```
29
+
30
+ To load GFA strings line-by-line:
25
31
 
32
+ ```ruby
26
33
  my_gfa = GFA.new
27
- fh = File.open('assembly.gfa', 'r')
28
- fh.each do |ln|
29
- my_gfa << ln
34
+ File.open('assembly.gfa', 'r') do |fh|
35
+ fh.each do |ln|
36
+ my_gfa << ln
37
+ end
30
38
  end
31
- fh.close
32
39
  ```
33
40
 
34
41
 
@@ -58,12 +65,11 @@ Any `GFA` object can be exported as an [`RGL`][rgl] graph using the methods
58
65
  [tiny.gfa](https://github.com/lmrodriguezr/gfa/raw/master/data/tiny.gfa):
59
66
 
60
67
  ```ruby
61
- require "gfa"
62
- require "rgl/dot"
68
+ require 'rgl/dot'
63
69
 
64
- my_gfa = GFA.load("data/tiny.gfa")
70
+ my_gfa = GFA.load('data/tiny.gfa')
65
71
  dg = my_gfa.implicit_graph
66
- dg.write_to_graphic_file("jpg")
72
+ dg.write_to_graphic_file('jpg')
67
73
  ```
68
74
 
69
75
  ![tiny_dg](https://github.com/lmrodriguezr/gfa/raw/master/data/tiny.jpg)
@@ -72,8 +78,8 @@ If you don't care about orientation, you can also build an undirected graph
72
78
  without orientation:
73
79
 
74
80
  ```ruby
75
- ug = my_gfa.implicit_graph(orient:false)
76
- ug.write_to_graphic_file("jpg")
81
+ ug = my_gfa.implicit_graph(orient: false)
82
+ ug.write_to_graphic_file('jpg')
77
83
  ```
78
84
 
79
85
  ![tiny_ug](https://github.com/lmrodriguezr/gfa/raw/master/data/tiny_undirected.jpg)
@@ -88,11 +94,9 @@ gem install gfa
88
94
  Or add the following line to your Gemfile:
89
95
 
90
96
  ```ruby
91
- gem "gfa"
97
+ gem 'gfa'
92
98
  ```
93
99
 
94
- and run `bundle install` from your shell.
95
-
96
100
 
97
101
  # Author
98
102
 
@@ -103,6 +107,6 @@ and run `bundle install` from your shell.
103
107
 
104
108
  [Artistic License 2.0](LICENSE).
105
109
 
106
- [GFA-spec]: https://github.com/pmelsted/GFA-spec
110
+ [GFA-spec]: https://github.com/GFA-spec/GFA-spec
107
111
  [lrr]: https://rodriguez-r.com/
108
112
  [rgl]: https://github.com/monora/rgl
data/bin/gfa-add-gaf ADDED
@@ -0,0 +1,70 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # @package MiGA
4
+ # @license Artistic-2.0
5
+
6
+ $LOAD_PATH.push File.expand_path('../../lib', __FILE__)
7
+ $LOAD_PATH.push File.expand_path('../../lib', File.realpath(__FILE__))
8
+
9
+ require 'gfa'
10
+
11
+ input_gfa, input_gaf, output, degree, threads = ARGV
12
+
13
+ unless degree
14
+ $stderr.puts <<~HELP
15
+ gfa-add-gaf <input-gfa> <input-gaf> <output> <degree> [<pref> [<threads>]]
16
+
17
+ <input-gfa> Input GFA file to read
18
+ <input-gaf> Input GAF file to read
19
+ <output> Output GFA file to write
20
+ <degree> Maximum degree of separation between the segment set in the GAF
21
+ and any other included segments. If 0, only segments are
22
+ included. If 1, only the target segments, records linking to
23
+ them, and segments linked by those records. Any integer > 1
24
+ includes additional expansion rounds for those linked segments.
25
+ Use -1 to include the complete original GFA without subsetting.
26
+ <pref> A prefix to name all recorded paths
27
+ By default: Based on the GAF file name
28
+ <threads> If passed, parallelize process with these many threads
29
+ HELP
30
+ exit(1)
31
+ end
32
+
33
+ $stderr.puts "Loading GFA: #{input_gfa}"
34
+ gfa = GFA.load_parallel(input_gfa, (threads || 1).to_i)
35
+
36
+ $stderr.puts "Loading GAF: #{input_gaf}"
37
+ $stderr.puts "- Minimum identity: #{0.95}"
38
+ pref ||= File.basename(input_gaf, '.gaf').gsub(/[^!-)+-<>-~]/, '_')
39
+ segments = []
40
+ File.open(input_gaf, 'r') do |fh|
41
+ fh.each do |ln|
42
+ row = ln.chomp.split("\t")
43
+ opt = Hash[row[12..].map { |i| i.split(':', 2) }]
44
+ opt.each { |k, v| opt[k] = GFA::Field[v] }
45
+ next if opt['id'] && opt['id'].value < 0.95
46
+
47
+ gaf_path = row[5]
48
+ seg_names = []
49
+ gaf_path.scan(/[><]?[^><]+/).each do |seg|
50
+ seg_orient = seg.match?(/^</) ? '-' : '+'
51
+ seg_name = seg.sub(/^[><]/, '')
52
+ seg_names << "#{seg_name}#{seg_orient}"
53
+ segments << seg_name unless segments.include?(seg_name)
54
+ end
55
+ gfa << GFA::Record::Path.new(
56
+ "#{pref}_#{$.}", seg_names.join(','), opt['cg']&.value || '*'
57
+ )
58
+ end
59
+ end
60
+ $stderr.puts "- Found #{segments.size} linked segments"
61
+
62
+ degree = degree.to_i
63
+ if degree >= 0
64
+ $stderr.puts 'Subsetting graph'
65
+ gfa = gfa.subgraph(segments, degree: degree)
66
+ end
67
+
68
+ $stderr.puts "Saving GFA: #{output}"
69
+ gfa.save(output)
70
+
data/bin/gfa-subgraph ADDED
@@ -0,0 +1,41 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # @package MiGA
4
+ # @license Artistic-2.0
5
+
6
+ $LOAD_PATH.push File.expand_path('../../lib', __FILE__)
7
+ $LOAD_PATH.push File.expand_path('../../lib', File.realpath(__FILE__))
8
+
9
+ require 'gfa'
10
+
11
+ input, output, degree, segments, threads = ARGV
12
+
13
+ unless segments
14
+ $stderr.puts <<~HELP
15
+ Select a set of segments and include only elements of the GFA linked to
16
+ those segments (directly or indirectly)
17
+
18
+ gfa-subgraph <input> <output> <degree> <segments> [<threads>]
19
+
20
+ <input> Input GFA file to read
21
+ <output> Output GFA file to write
22
+ <degree> Maximum degree of separation between the segment set and any
23
+ other included segments. If 0, only segments are included.
24
+ If 1, only the target segments, records linking to them, and
25
+ segments linked by those records. Any integer > 1 includes
26
+ additional expansion rounds for those linked segments.
27
+ <segments> Comma-delimited list of segment names
28
+ <threads> If passed, parallelize process with these many threads
29
+ HELP
30
+ exit(1)
31
+ end
32
+
33
+ $stderr.puts "Loading GFA: #{input}"
34
+ gfa = GFA.load_parallel(input, (threads || 1).to_i)
35
+
36
+ $stderr.puts 'Subsetting graph'
37
+ gfa = gfa.subgraph(segments.split(','), degree: degree.to_i)
38
+
39
+ $stderr.puts "Saving GFA: #{output}"
40
+ gfa.save(output)
41
+
data/lib/gfa/common.rb CHANGED
@@ -1,30 +1,33 @@
1
1
  require 'gfa/version'
2
- require 'gfa/record'
2
+ require 'gfa/record_set'
3
3
  require 'gfa/field'
4
4
 
5
5
  class GFA
6
6
  # Class-level
7
7
  def self.assert_format(value, regex, message)
8
- unless value =~ regex
9
- raise "#{message}: #{value}."
8
+ unless value =~ /^(?:#{regex})$/
9
+ raise "#{message}: #{value}"
10
10
  end
11
11
  end
12
12
 
13
13
  # Instance-level
14
- attr :gfa_version, :records
14
+ attr :gfa_version, :records, :opts
15
15
 
16
16
  GFA::Record.TYPES.each do |r_type|
17
- plural = "#{r_type.downcase}s"
18
17
  singular = "#{r_type.downcase}"
18
+ plural = "#{singular}s"
19
19
 
20
20
  define_method(plural) { records[r_type] }
21
21
  define_method(singular) { |k| records[r_type][k] }
22
22
  define_method("add_#{singular}") { |v| @records[r_type] << v }
23
23
  end
24
24
 
25
- def initialize
25
+ def initialize(opts = {})
26
26
  @records = {}
27
- GFA::Record.TYPES.each { |t| @records[t] = [] }
27
+ @opts = { index: true, index_id: false, comments: false }.merge(opts)
28
+ GFA::Record.TYPES.each do |t|
29
+ @records[t] = GFA::RecordSet.name_class(t).new(self)
30
+ end
28
31
  end
29
32
 
30
33
  def empty?
@@ -35,5 +38,27 @@ class GFA
35
38
  records == gfa.records
36
39
  end
37
40
 
38
- alias == eql?
41
+ def ==(gfa)
42
+ eql?(gfa)
43
+ end
44
+
45
+ def size
46
+ records.values.map(&:size).inject(0, :+)
47
+ end
48
+
49
+ def merge!(gfa)
50
+ raise "Unsupported object: #{gfa}" unless gfa.is_a? GFA
51
+
52
+ GFA::Record.TYPES.each do |t|
53
+ @records[t].merge!(gfa.records[t])
54
+ end
55
+ end
56
+
57
+ def indexed?
58
+ records.values.all?(&:indexed?)
59
+ end
60
+
61
+ def rebuild_index!
62
+ @records.each_value(&:rebuild_index!)
63
+ end
39
64
  end
@@ -1,6 +1,7 @@
1
1
  class GFA::Field::Char < GFA::Field
2
2
  CODE = :A
3
- REGEX = /^[!-~]$/
3
+ REGEX = /[!-~]/
4
+ NATIVE_FUN = :to_s
4
5
 
5
6
  def initialize(f)
6
7
  GFA.assert_format(f, regex, "Bad #{type}")
@@ -1,9 +1,26 @@
1
1
  class GFA::Field::Float < GFA::Field
2
2
  CODE = :f
3
- REGEX = /^[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?$/
3
+ REGEX = /[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?/
4
+ NATIVE_FUN = :to_f
5
+
6
+ def to_f
7
+ value
8
+ end
9
+
10
+ def to_i
11
+ value.to_i
12
+ end
4
13
 
5
14
  def initialize(f)
6
15
  GFA.assert_format(f, regex, "Bad #{type}")
7
16
  @value = f.to_f
8
17
  end
18
+
19
+ def equivalent?(field)
20
+ if field.is_a?(GFA::Field::NumArray)
21
+ return field.size == 1 && field.first.to_f == value
22
+ end
23
+
24
+ super
25
+ end
9
26
  end
data/lib/gfa/field/hex.rb CHANGED
@@ -1,9 +1,26 @@
1
1
  class GFA::Field::Hex < GFA::Field
2
2
  CODE = :H
3
- REGEX = /^[0-9A-F]+$/
3
+ REGEX = /[0-9A-F]+/
4
+ NATIVE_FUN = :to_i
4
5
 
5
6
  def initialize(f)
6
7
  GFA.assert_format(f, regex, "Bad #{type}")
7
8
  @value = f
8
9
  end
10
+
11
+ def to_i
12
+ value.to_i(16)
13
+ end
14
+
15
+ def to_f
16
+ to_i.to_f
17
+ end
18
+
19
+ def equivalent?(field)
20
+ if field.is_a? GFA::Field::NumArray
21
+ return field.size == 1 && field.first.to_i == value
22
+ end
23
+
24
+ super
25
+ end
9
26
  end
@@ -1,9 +1,18 @@
1
1
  class GFA::Field::Json < GFA::Field
2
2
  CODE = :J
3
- REGEX = /^[ !-~]+$/
3
+ REGEX = /[ !-~]+/
4
+ NATIVE_FUN = :to_s
4
5
 
5
6
  def initialize(f)
6
7
  GFA.assert_format(f, regex, "Bad #{type}")
7
8
  @value = f
8
9
  end
10
+
11
+ def equivalent?(field)
12
+ # TODO
13
+ # We should parse the contents when comparing two GFA::Field::Json to
14
+ # evaluate equivalencies such as 'J:{ "a" : 1 }' ~ 'J:{"a":1}' (spaces)
15
+ # or 'J:{"a":1,"b":2}' ~ 'J:{"b":2,"a":1}' (element order)
16
+ super
17
+ end
9
18
  end
@@ -1,17 +1,30 @@
1
1
  class GFA::Field::NumArray < GFA::Field
2
2
  CODE = :B
3
- REGEX = /^[cCsSiIf](,[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)+$/
3
+ REGEX = /[cCsSiIf](,[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)+/
4
+ NATIVE_FUN = :to_a
4
5
 
5
6
  def initialize(f)
6
7
  GFA.assert_format(f, regex, "Bad #{type}")
7
8
  @value = f
8
9
  end
9
10
 
10
- def modifier ; value[0] ; end
11
+ def modifier
12
+ value[0]
13
+ end
11
14
 
12
- def array ; value[2..-1].split(/,/) ; end
15
+ def modifier_fun
16
+ modifier == 'f' ? :to_f : :to_i
17
+ end
13
18
 
14
- alias as_a array
19
+ def array
20
+ @array ||= value[2..-1].split(',').map(&modifier_fun)
21
+ end
22
+
23
+ alias to_a array
24
+
25
+ %i[empty? size count length first last].each do |i|
26
+ define_method(i) { array.send(i) }
27
+ end
15
28
 
16
29
  def number_type
17
30
  {
@@ -21,4 +34,16 @@ class GFA::Field::NumArray < GFA::Field
21
34
  f: 'float'
22
35
  }[modifier.to_sym]
23
36
  end
37
+
38
+ def equivalent?(field)
39
+ return true if eql?(field)
40
+
41
+ if field.respond_to?(:to_a)
42
+ field.to_a.map(&modifier_fun) == array
43
+ elsif size == 1 && field.respond_to?(modifier_fun)
44
+ field.send(modifier_fun) == first
45
+ else
46
+ false
47
+ end
48
+ end
24
49
  end
@@ -1,9 +1,22 @@
1
1
  class GFA::Field::SigInt < GFA::Field
2
2
  CODE = :i
3
- REGEX = /^[-+]?[0-9]+$/
3
+ REGEX = /[-+]?[0-9]+/
4
+ NATIVE_FUN = :to_i
4
5
 
5
6
  def initialize(f)
6
7
  GFA.assert_format(f, regex, "Bad #{type}")
7
8
  @value = f.to_i
8
9
  end
10
+
11
+ def to_i
12
+ value
13
+ end
14
+
15
+ def equivalent?(field)
16
+ if field.is_a?(GFA::Field::NumArray)
17
+ return field.size == 1 && field.first.to_i == value
18
+ end
19
+
20
+ super
21
+ end
9
22
  end
@@ -1,6 +1,15 @@
1
1
  class GFA::Field::String < GFA::Field
2
2
  CODE = :Z
3
- REGEX = /^[ !-~]+$/
3
+ REGEX = /[ !-~]+/
4
+ NATIVE_FUN = :to_s
5
+
6
+ def to_f
7
+ value.to_f
8
+ end
9
+
10
+ def to_i(base = 10)
11
+ value.to_i(base)
12
+ end
4
13
 
5
14
  def initialize(f)
6
15
  GFA.assert_format(f, regex, "Bad #{type}")
data/lib/gfa/field.rb CHANGED
@@ -12,7 +12,7 @@ class GFA::Field
12
12
  TYPES = CODES.values
13
13
  TYPES.each { |t| require "gfa/field/#{t.downcase}" }
14
14
 
15
- [:CODES, :TYPES].each do |x|
15
+ %i[CODES TYPES].each do |x|
16
16
  define_singleton_method(x) { const_get(x) }
17
17
  end
18
18
 
@@ -25,23 +25,95 @@ class GFA::Field
25
25
  def self.name_class(name)
26
26
  const_get(name)
27
27
  end
28
-
28
+
29
+ def self.[](string)
30
+ code, value = string.split(':', 2)
31
+ code_class(code).new(value)
32
+ end
33
+
29
34
  # Instance-level
30
35
 
31
36
  attr :value
32
37
 
33
- def type ; CODES[code] ; end
34
-
35
- def code ; self.class::CODE ; end
36
-
37
- def regex ; self.class::REGEX ; end
38
-
39
- def to_s(with_type=true)
38
+ def type
39
+ CODES[code]
40
+ end
41
+
42
+ def code
43
+ self.class::CODE
44
+ end
45
+
46
+ def regex
47
+ self.class::REGEX
48
+ end
49
+
50
+ def native_fun
51
+ self.class::NATIVE_FUN
52
+ end
53
+
54
+ def to_native
55
+ native_fun == :to_s ? to_s(false) : send(native_fun)
56
+ end
57
+
58
+ def to_s(with_type = true)
40
59
  "#{"#{code}:" if with_type}#{value}"
41
60
  end
42
-
61
+
43
62
  def hash
44
63
  value.hash
45
64
  end
46
65
 
66
+ ##
67
+ # Evaluate equivalency of contents. All the following fields are distinct but
68
+ # contain the same information, and are therefore considered equivalent:
69
+ # Z:123, i:123, f:123.0, B:i,123, H:7B
70
+ #
71
+ # Note that the information content is determined by the class of the first
72
+ # operator. For example:
73
+ # - 'i:123' ~ 'f:123.4' is true because values are compared as integers
74
+ # - 'f:123.4' ~ 'i:123' is false because values are compared as floats
75
+ def equivalent?(field)
76
+ return true if eql?(field) # Might be faster, so testing this first
77
+
78
+ if field.respond_to?(native_fun)
79
+ if field.is_a?(GFA::Field) && native_fun == :to_s
80
+ field.to_s(false) == to_native
81
+ else
82
+ field.send(native_fun) == to_native
83
+ end
84
+ else
85
+ field == value
86
+ end
87
+ end
88
+
89
+ ##
90
+ # Non-equivalent to +field+, same as +!equivalent?+
91
+ def !~(field)
92
+ !self.~(field)
93
+ end
94
+
95
+ ##
96
+ # Same as +equivalent?+
97
+ def ~(field)
98
+ equivalent?(field)
99
+ end
100
+
101
+ ##
102
+ # Evaluate equality. Note that fields with equivalent values evaluate as
103
+ # different. For example, the following fields have equivalent information,
104
+ # but they all evaluate as different: Z:123, i:123, f:123.0, B:i,123, H:7B.
105
+ # To test equivalency of contents instead, use +equivalent?+
106
+ def eql?(field)
107
+ if field.is_a?(GFA::Field)
108
+ type == field.type && value == field.value
109
+ else
110
+ field.is_a?(value.class) && value == field
111
+ end
112
+ end
113
+
114
+ ##
115
+ # Same as +eql?+
116
+ def ==(field)
117
+ eql?(field)
118
+ end
47
119
  end
data/lib/gfa/generator.rb CHANGED
@@ -8,9 +8,9 @@ class GFA
8
8
  end
9
9
 
10
10
  def each_line(&blk)
11
- set_version_header('1.1') if gfa_version.nil?
11
+ set_version_header('1.2') if gfa_version.nil?
12
12
  GFA::Record.TYPES.each do |r_type|
13
- records[r_type].each do |record|
13
+ records[r_type].set.each do |record|
14
14
  blk[record.to_s]
15
15
  end
16
16
  end
@@ -23,7 +23,7 @@ class GFA
23
23
  end
24
24
 
25
25
  def unset_version
26
- @records[:Header].delete_if { |o| !o.fields[:VN].nil? }
26
+ headers.set.delete_if { |o| !o.fields[:VN].nil? }
27
27
  @gfa_version = nil
28
28
  end
29
29