gfa 0.2.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: cde4a3e432409c7a2967832dcebd502ddc54b1f6cb25856b6d9d21ce53f67b32
4
- data.tar.gz: 91213d63365dd3608c28e30115cbfc8621e78bbe34936832bcee2ac7e6f460fb
3
+ metadata.gz: 97e6400338884b4ceb1161778c26c5d6de6ee71616be1b5caae6aa0691d88395
4
+ data.tar.gz: 2fe8103598246724d98e3ceeecce92b5564f45bf029bf034978277cde59b4caa
5
5
  SHA512:
6
- metadata.gz: 321634c28ec8927bd38286a84a02783b9f915dcbabb7941233583dda7f41b66e952ff9611c9158bd7baca09d7d3d6c254a036f1c9f2169e5e24e6e964d292e71
7
- data.tar.gz: 3698d16ab5953ffd70bf2c102d154bc1f61e5a13a752bc317a756df13762c3668c1bf6e8144821e53e395783f36c45185249c7fc206694be19145200310c3f48
6
+ metadata.gz: 3beac70ac4c3d4e46bd01399351fbc5e5ffcdaac5bd2a653b7f17c8f29df5c13e48a11575c42fa0ec78cba62485528148687614e9373ac6cc5dd773f97cd67a6
7
+ data.tar.gz: c488a4b26604ffc95228d5aa454399b4da8facecb9a90c2579be3c36446637bc60b89578c5a5443047ac020c4c9fb9b980fd3943fdb56e7993ebfcc7c4e60876
data/README.md CHANGED
@@ -7,28 +7,35 @@
7
7
 
8
8
  This implementation follows the specifications of [GFA-spec][].
9
9
 
10
+ To load the library:
11
+
12
+ ```ruby
13
+ require 'gfa'
14
+ ```
10
15
 
11
16
  ## Parsing GFA
12
17
 
13
18
  To parse a file in GFA format:
14
19
 
15
20
  ```ruby
16
- require 'gfa'
17
-
18
21
  my_gfa = GFA.load('assembly.gfa')
19
22
  ```
20
23
 
21
- To load GFA strings line-by-line:
24
+ For large GFA files, you can also parse them in parallel:
22
25
 
23
26
  ```ruby
24
- require 'gfa'
27
+ my_gfa = GFA.load_parallel('large-graph.gfa', 4)
28
+ ```
29
+
30
+ To load GFA strings line-by-line:
25
31
 
32
+ ```ruby
26
33
  my_gfa = GFA.new
27
- fh = File.open('assembly.gfa', 'r')
28
- fh.each do |ln|
29
- my_gfa << ln
34
+ File.open('assembly.gfa', 'r') do |fh|
35
+ fh.each do |ln|
36
+ my_gfa << ln
37
+ end
30
38
  end
31
- fh.close
32
39
  ```
33
40
 
34
41
 
@@ -58,12 +65,11 @@ Any `GFA` object can be exported as an [`RGL`][rgl] graph using the methods
58
65
  [tiny.gfa](https://github.com/lmrodriguezr/gfa/raw/master/data/tiny.gfa):
59
66
 
60
67
  ```ruby
61
- require "gfa"
62
- require "rgl/dot"
68
+ require 'rgl/dot'
63
69
 
64
- my_gfa = GFA.load("data/tiny.gfa")
70
+ my_gfa = GFA.load('data/tiny.gfa')
65
71
  dg = my_gfa.implicit_graph
66
- dg.write_to_graphic_file("jpg")
72
+ dg.write_to_graphic_file('jpg')
67
73
  ```
68
74
 
69
75
  ![tiny_dg](https://github.com/lmrodriguezr/gfa/raw/master/data/tiny.jpg)
@@ -72,8 +78,8 @@ If you don't care about orientation, you can also build an undirected graph
72
78
  without orientation:
73
79
 
74
80
  ```ruby
75
- ug = my_gfa.implicit_graph(orient:false)
76
- ug.write_to_graphic_file("jpg")
81
+ ug = my_gfa.implicit_graph(orient: false)
82
+ ug.write_to_graphic_file('jpg')
77
83
  ```
78
84
 
79
85
  ![tiny_ug](https://github.com/lmrodriguezr/gfa/raw/master/data/tiny_undirected.jpg)
@@ -88,11 +94,9 @@ gem install gfa
88
94
  Or add the following line to your Gemfile:
89
95
 
90
96
  ```ruby
91
- gem "gfa"
97
+ gem 'gfa'
92
98
  ```
93
99
 
94
- and run `bundle install` from your shell.
95
-
96
100
 
97
101
  # Author
98
102
 
@@ -103,6 +107,6 @@ and run `bundle install` from your shell.
103
107
 
104
108
  [Artistic License 2.0](LICENSE).
105
109
 
106
- [GFA-spec]: https://github.com/pmelsted/GFA-spec
110
+ [GFA-spec]: https://github.com/GFA-spec/GFA-spec
107
111
  [lrr]: https://rodriguez-r.com/
108
112
  [rgl]: https://github.com/monora/rgl
data/bin/gfa-add-gaf ADDED
@@ -0,0 +1,70 @@
1
#!/usr/bin/env ruby

# @package MiGA
# @license Artistic-2.0

# Adds every alignment of a GAF file to a GFA graph as a path record and,
# optionally, subsets the graph around the segments touched by those paths.

$LOAD_PATH.push File.expand_path('../../lib', __FILE__)
$LOAD_PATH.push File.expand_path('../../lib', File.realpath(__FILE__))

require 'gfa'

# <pref> sits between <degree> and <threads>, exactly as the usage documents
input_gfa, input_gaf, output, degree, pref, threads = ARGV

unless degree
  $stderr.puts <<~HELP
    gfa-add-gaf <input-gfa> <input-gaf> <output> <degree> [<pref> [<threads>]]

    <input-gfa>  Input GFA file to read
    <input-gaf>  Input GAF file to read
    <output>     Output GFA file to write
    <degree>     Maximum degree of separation between the segment set in the GAF
                 and any other included segments. If 0, only segments are
                 included. If 1, only the target segments, records linking to
                 them, and segments linked by those records. Any integer > 1
                 includes additional expansion rounds for those linked segments.
                 Use -1 to include the complete original GFA without subsetting.
    <pref>       A prefix to name all recorded paths
                 By default: Based on the GAF file name
    <threads>    If passed, parallelize process with these many threads
  HELP
  exit(1)
end

# Fail before any file is loaded: String#to_i would silently read a
# non-numeric degree as 0
begin
  degree = Integer(degree, 10)
rescue ArgumentError
  abort "Invalid degree (expected an integer): #{degree}"
end

# Alignments carrying an 'id' tag below this value are ignored
min_identity = 0.95

$stderr.puts "Loading GFA: #{input_gfa}"
gfa = GFA.load_parallel(input_gfa, (threads || 1).to_i)

$stderr.puts "Loading GAF: #{input_gaf}"
$stderr.puts "- Minimum identity: #{min_identity}"
pref ||= File.basename(input_gaf, '.gaf').gsub(/[^!-)+-<>-~]/, '_')

# Hash used as an insertion-ordered set of segment names, so membership is
# checked in constant time instead of scanning an Array for every segment
segments = {}
File.open(input_gaf, 'r') do |fh|
  fh.each do |ln|
    next if ln.strip.empty? # Tolerate blank lines

    row = ln.chomp.split("\t")

    # Optional tags start at column 13; row[12..] is nil when there are none
    opt = Array(row[12..]).to_h do |tag|
      key, val = tag.split(':', 2)
      [key, GFA::Field[val]]
    end
    next if opt['id'] && opt['id'].value < min_identity

    # Column 6 holds the path, e.g. '>s1<s2': '<' marks reverse orientation
    seg_names = row[5].scan(/[><]?[^><]+/).map do |seg|
      seg_name = seg.sub(/\A[><]/, '')
      segments[seg_name] = true
      "#{seg_name}#{seg.start_with?('<') ? '-' : '+'}"
    end

    # Paths are named after the GAF line number they come from
    gfa << GFA::Record::Path.new(
      "#{pref}_#{fh.lineno}", seg_names.join(','), opt['cg']&.value || '*'
    )
  end
end
$stderr.puts "- Found #{segments.size} linked segments"

# A negative degree keeps the complete graph
if degree >= 0
  $stderr.puts 'Subsetting graph'
  gfa = gfa.subgraph(segments.keys, degree: degree)
end

$stderr.puts "Saving GFA: #{output}"
gfa.save(output)
70
+
data/bin/gfa-subgraph ADDED
@@ -0,0 +1,41 @@
1
#!/usr/bin/env ruby

# @package MiGA
# @license Artistic-2.0

# Extracts from a GFA file the subgraph surrounding a given set of segments.

$LOAD_PATH.push File.expand_path('../../lib', __FILE__)
$LOAD_PATH.push File.expand_path('../../lib', File.realpath(__FILE__))

require 'gfa'

input, output, degree, segments, threads = ARGV

unless segments
  $stderr.puts <<~HELP
    Select a set of segments and include only elements of the GFA linked to
    those segments (directly or indirectly)

    gfa-subgraph <input> <output> <degree> <segments> [<threads>]

    <input>     Input GFA file to read
    <output>    Output GFA file to write
    <degree>    Maximum degree of separation between the segment set and any
                other included segments. If 0, only segments are included.
                If 1, only the target segments, records linking to them, and
                segments linked by those records. Any integer > 1 includes
                additional expansion rounds for those linked segments.
    <segments>  Comma-delimited list of segment names
    <threads>   If passed, parallelize process with these many threads
  HELP
  exit(1)
end

# Fail before the graph is loaded: String#to_i would silently read a
# non-numeric degree as 0
begin
  degree = Integer(degree, 10)
rescue ArgumentError
  abort "Invalid degree (expected an integer): #{degree}"
end

$stderr.puts "Loading GFA: #{input}"
gfa = GFA.load_parallel(input, (threads || 1).to_i)

$stderr.puts 'Subsetting graph'
gfa = gfa.subgraph(segments.split(','), degree: degree)

$stderr.puts "Saving GFA: #{output}"
gfa.save(output)
41
+
data/lib/gfa/common.rb CHANGED
@@ -1,30 +1,33 @@
1
1
  require 'gfa/version'
2
- require 'gfa/record'
2
+ require 'gfa/record_set'
3
3
  require 'gfa/field'
4
4
 
5
5
  class GFA
6
6
  # Class-level
7
7
  def self.assert_format(value, regex, message)
8
- unless value =~ regex
9
- raise "#{message}: #{value}."
8
+ unless value =~ /^(?:#{regex})$/
9
+ raise "#{message}: #{value}"
10
10
  end
11
11
  end
12
12
 
13
13
  # Instance-level
14
- attr :gfa_version, :records
14
+ attr :gfa_version, :records, :opts
15
15
 
16
16
  GFA::Record.TYPES.each do |r_type|
17
- plural = "#{r_type.downcase}s"
18
17
  singular = "#{r_type.downcase}"
18
+ plural = "#{singular}s"
19
19
 
20
20
  define_method(plural) { records[r_type] }
21
21
  define_method(singular) { |k| records[r_type][k] }
22
22
  define_method("add_#{singular}") { |v| @records[r_type] << v }
23
23
  end
24
24
 
25
- def initialize
25
+ def initialize(opts = {})
26
26
  @records = {}
27
- GFA::Record.TYPES.each { |t| @records[t] = [] }
27
+ @opts = { index: true, index_id: false, comments: false }.merge(opts)
28
+ GFA::Record.TYPES.each do |t|
29
+ @records[t] = GFA::RecordSet.name_class(t).new(self)
30
+ end
28
31
  end
29
32
 
30
33
  def empty?
@@ -35,5 +38,27 @@ class GFA
35
38
  records == gfa.records
36
39
  end
37
40
 
38
- alias == eql?
41
+ def ==(gfa)
42
+ eql?(gfa)
43
+ end
44
+
45
+ def size
46
+ records.values.map(&:size).inject(0, :+)
47
+ end
48
+
49
+ def merge!(gfa)
50
+ raise "Unsupported object: #{gfa}" unless gfa.is_a? GFA
51
+
52
+ GFA::Record.TYPES.each do |t|
53
+ @records[t].merge!(gfa.records[t])
54
+ end
55
+ end
56
+
57
+ def indexed?
58
+ records.values.all?(&:indexed?)
59
+ end
60
+
61
+ def rebuild_index!
62
+ @records.each_value(&:rebuild_index!)
63
+ end
39
64
  end
@@ -1,6 +1,7 @@
1
1
  class GFA::Field::Char < GFA::Field
2
2
  CODE = :A
3
- REGEX = /^[!-~]$/
3
+ REGEX = /[!-~]/
4
+ NATIVE_FUN = :to_s
4
5
 
5
6
  def initialize(f)
6
7
  GFA.assert_format(f, regex, "Bad #{type}")
@@ -1,9 +1,26 @@
1
1
  class GFA::Field::Float < GFA::Field
2
2
  CODE = :f
3
- REGEX = /^[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?$/
3
+ REGEX = /[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?/
4
+ NATIVE_FUN = :to_f
5
+
6
+ def to_f
7
+ value
8
+ end
9
+
10
+ def to_i
11
+ value.to_i
12
+ end
4
13
 
5
14
  def initialize(f)
6
15
  GFA.assert_format(f, regex, "Bad #{type}")
7
16
  @value = f.to_f
8
17
  end
18
+
19
+ def equivalent?(field)
20
+ if field.is_a?(GFA::Field::NumArray)
21
+ return field.size == 1 && field.first.to_f == value
22
+ end
23
+
24
+ super
25
+ end
9
26
  end
data/lib/gfa/field/hex.rb CHANGED
@@ -1,9 +1,26 @@
1
1
  class GFA::Field::Hex < GFA::Field
2
2
  CODE = :H
3
- REGEX = /^[0-9A-F]+$/
3
+ REGEX = /[0-9A-F]+/
4
+ NATIVE_FUN = :to_i
4
5
 
5
6
  def initialize(f)
6
7
  GFA.assert_format(f, regex, "Bad #{type}")
7
8
  @value = f
8
9
  end
10
+
11
+ def to_i
12
+ value.to_i(16)
13
+ end
14
+
15
+ def to_f
16
+ to_i.to_f
17
+ end
18
+
19
+ def equivalent?(field)
20
+ if field.is_a? GFA::Field::NumArray
21
+ return field.size == 1 && field.first.to_i == value
22
+ end
23
+
24
+ super
25
+ end
9
26
  end
@@ -1,9 +1,18 @@
1
1
  class GFA::Field::Json < GFA::Field
2
2
  CODE = :J
3
- REGEX = /^[ !-~]+$/
3
+ REGEX = /[ !-~]+/
4
+ NATIVE_FUN = :to_s
4
5
 
5
6
  def initialize(f)
6
7
  GFA.assert_format(f, regex, "Bad #{type}")
7
8
  @value = f
8
9
  end
10
+
11
+ def equivalent?(field)
12
+ # TODO
13
+ # We should parse the contents when comparing two GFA::Field::Json to
14
+ # evaluate equivalencies such as 'J:{ "a" : 1 }' ~ 'J:{"a":1}' (spaces)
15
+ # or 'J:{"a":1,"b":2}' ~ 'J:{"b":2,"a":1}' (element order)
16
+ super
17
+ end
9
18
  end
@@ -1,17 +1,30 @@
1
1
  class GFA::Field::NumArray < GFA::Field
2
2
  CODE = :B
3
- REGEX = /^[cCsSiIf](,[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)+$/
3
+ REGEX = /[cCsSiIf](,[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)+/
4
+ NATIVE_FUN = :to_a
4
5
 
5
6
  def initialize(f)
6
7
  GFA.assert_format(f, regex, "Bad #{type}")
7
8
  @value = f
8
9
  end
9
10
 
10
- def modifier ; value[0] ; end
11
+ def modifier
12
+ value[0]
13
+ end
11
14
 
12
- def array ; value[2..-1].split(/,/) ; end
15
+ def modifier_fun
16
+ modifier == 'f' ? :to_f : :to_i
17
+ end
13
18
 
14
- alias as_a array
19
+ def array
20
+ @array ||= value[2..-1].split(',').map(&modifier_fun)
21
+ end
22
+
23
+ alias to_a array
24
+
25
+ %i[empty? size count length first last].each do |i|
26
+ define_method(i) { array.send(i) }
27
+ end
15
28
 
16
29
  def number_type
17
30
  {
@@ -21,4 +34,16 @@ class GFA::Field::NumArray < GFA::Field
21
34
  f: 'float'
22
35
  }[modifier.to_sym]
23
36
  end
37
+
38
+ def equivalent?(field)
39
+ return true if eql?(field)
40
+
41
+ if field.respond_to?(:to_a)
42
+ field.to_a.map(&modifier_fun) == array
43
+ elsif size == 1 && field.respond_to?(modifier_fun)
44
+ field.send(modifier_fun) == first
45
+ else
46
+ false
47
+ end
48
+ end
24
49
  end
@@ -1,9 +1,22 @@
1
1
  class GFA::Field::SigInt < GFA::Field
2
2
  CODE = :i
3
- REGEX = /^[-+]?[0-9]+$/
3
+ REGEX = /[-+]?[0-9]+/
4
+ NATIVE_FUN = :to_i
4
5
 
5
6
  def initialize(f)
6
7
  GFA.assert_format(f, regex, "Bad #{type}")
7
8
  @value = f.to_i
8
9
  end
10
+
11
+ def to_i
12
+ value
13
+ end
14
+
15
+ def equivalent?(field)
16
+ if field.is_a?(GFA::Field::NumArray)
17
+ return field.size == 1 && field.first.to_i == value
18
+ end
19
+
20
+ super
21
+ end
9
22
  end
@@ -1,6 +1,15 @@
1
1
  class GFA::Field::String < GFA::Field
2
2
  CODE = :Z
3
- REGEX = /^[ !-~]+$/
3
+ REGEX = /[ !-~]+/
4
+ NATIVE_FUN = :to_s
5
+
6
+ def to_f
7
+ value.to_f
8
+ end
9
+
10
+ def to_i(base = 10)
11
+ value.to_i(base)
12
+ end
4
13
 
5
14
  def initialize(f)
6
15
  GFA.assert_format(f, regex, "Bad #{type}")
data/lib/gfa/field.rb CHANGED
@@ -12,7 +12,7 @@ class GFA::Field
12
12
  TYPES = CODES.values
13
13
  TYPES.each { |t| require "gfa/field/#{t.downcase}" }
14
14
 
15
- [:CODES, :TYPES].each do |x|
15
+ %i[CODES TYPES].each do |x|
16
16
  define_singleton_method(x) { const_get(x) }
17
17
  end
18
18
 
@@ -25,23 +25,95 @@ class GFA::Field
25
25
  def self.name_class(name)
26
26
  const_get(name)
27
27
  end
28
-
28
+
29
+ def self.[](string)
30
+ code, value = string.split(':', 2)
31
+ code_class(code).new(value)
32
+ end
33
+
29
34
  # Instance-level
30
35
 
31
36
  attr :value
32
37
 
33
- def type ; CODES[code] ; end
34
-
35
- def code ; self.class::CODE ; end
36
-
37
- def regex ; self.class::REGEX ; end
38
-
39
- def to_s(with_type=true)
38
+ def type
39
+ CODES[code]
40
+ end
41
+
42
+ def code
43
+ self.class::CODE
44
+ end
45
+
46
+ def regex
47
+ self.class::REGEX
48
+ end
49
+
50
+ def native_fun
51
+ self.class::NATIVE_FUN
52
+ end
53
+
54
+ def to_native
55
+ native_fun == :to_s ? to_s(false) : send(native_fun)
56
+ end
57
+
58
+ def to_s(with_type = true)
40
59
  "#{"#{code}:" if with_type}#{value}"
41
60
  end
42
-
61
+
43
62
  def hash
44
63
  value.hash
45
64
  end
46
65
 
66
+ ##
67
+ # Evaluate equivalency of contents. All the following fields are distinct but
68
+ # contain the same information, and are therefore considered equivalent:
69
+ # Z:123, i:123, f:123.0, B:i,123, H:7B
70
+ #
71
+ # Note that the information content is determined by the class of the first
72
+ # operator. For example:
73
+ # - 'i:123' ~ 'f:123.4' is true because values are compared as integers
74
+ # - 'f:123.4' ~ 'i:123' is false because values are compared as floats
75
+ def equivalent?(field)
76
+ return true if eql?(field) # Might be faster, so testing this first
77
+
78
+ if field.respond_to?(native_fun)
79
+ if field.is_a?(GFA::Field) && native_fun == :to_s
80
+ field.to_s(false) == to_native
81
+ else
82
+ field.send(native_fun) == to_native
83
+ end
84
+ else
85
+ field == value
86
+ end
87
+ end
88
+
89
+ ##
90
+ # Non-equivalent to +field+, same as +!equivalent?+
91
+ def !~(field)
92
+ !self.~(field)
93
+ end
94
+
95
+ ##
96
+ # Same as +equivalent?+
97
+ def ~(field)
98
+ equivalent?(field)
99
+ end
100
+
101
+ ##
102
+ # Evaluate equality. Note that fields with equivalent values evaluate as
103
+ # different. For example, the following fields have equivalent information,
104
+ # but they all evaluate as different: Z:123, i:123, f:123.0, B:i,123, H:7B.
105
+ # To test equivalency of contents instead, use +equivalent?+
106
+ def eql?(field)
107
+ if field.is_a?(GFA::Field)
108
+ type == field.type && value == field.value
109
+ else
110
+ field.is_a?(value.class) && value == field
111
+ end
112
+ end
113
+
114
+ ##
115
+ # Same as +eql?+
116
+ def ==(field)
117
+ eql?(field)
118
+ end
47
119
  end
data/lib/gfa/generator.rb CHANGED
@@ -8,9 +8,9 @@ class GFA
8
8
  end
9
9
 
10
10
  def each_line(&blk)
11
- set_version_header('1.1') if gfa_version.nil?
11
+ set_version_header('1.2') if gfa_version.nil?
12
12
  GFA::Record.TYPES.each do |r_type|
13
- records[r_type].each do |record|
13
+ records[r_type].set.each do |record|
14
14
  blk[record.to_s]
15
15
  end
16
16
  end
@@ -23,7 +23,7 @@ class GFA
23
23
  end
24
24
 
25
25
  def unset_version
26
- @records[:Header].delete_if { |o| !o.fields[:VN].nil? }
26
+ headers.set.delete_if { |o| !o.fields[:VN].nil? }
27
27
  @gfa_version = nil
28
28
  end
29
29