gfa 0.4.0 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 97e6400338884b4ceb1161778c26c5d6de6ee71616be1b5caae6aa0691d88395
4
- data.tar.gz: 2fe8103598246724d98e3ceeecce92b5564f45bf029bf034978277cde59b4caa
3
+ metadata.gz: 9eaaaf8bcac372e7d6b63d96aee94493faa51d78f5cc2b7edb525c3fbd7efaa2
4
+ data.tar.gz: 950e9ffa9cd07cdea8ef5cc80b6c162bf009ff3bef5ab757dcda51c443925043
5
5
  SHA512:
6
- metadata.gz: 3beac70ac4c3d4e46bd01399351fbc5e5ffcdaac5bd2a653b7f17c8f29df5c13e48a11575c42fa0ec78cba62485528148687614e9373ac6cc5dd773f97cd67a6
7
- data.tar.gz: c488a4b26604ffc95228d5aa454399b4da8facecb9a90c2579be3c36446637bc60b89578c5a5443047ac020c4c9fb9b980fd3943fdb56e7993ebfcc7c4e60876
6
+ metadata.gz: 92e53b9d6c09b1814f3feda86b67dcea6192bd935e3c04dcca7b6ee9c0e7eed919c711941985313c2918f02772f3598883f037638e6e1b7cc9cf8897fec80caf
7
+ data.tar.gz: 4c14ce2bd91582b7b81bc6123443bd5b0cffe8c83550fb85dc48c8da030d35dafaf393506d537c5df7550b25b87ab10003a938c407bd6210983e5f8841552317
@@ -0,0 +1,50 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # @package MiGA
4
+ # @license Artistic-2.0
5
+
6
+ $LOAD_PATH.push File.expand_path('../../lib', __FILE__)
7
+ $LOAD_PATH.push File.expand_path('../../lib', File.realpath(__FILE__))
8
+
9
+ require 'gfa'
10
+
11
+ input, base, min_len, threads = ARGV
12
+
13
+ unless base
14
+ $stderr.puts <<~HELP
15
+ Split a GFA into multiple independent GFA files that have no links between
16
+ them by greedily identifying all individual modules in the graph
17
+
18
+ gfa-greedy-modules <input> <base> [<min_len> [<threads>]]
19
+
20
+ <input> Input GFA file to read
21
+ <base> Prefix of the output GFA files to write
22
+ <min_len> Minimum length (in bp) to report a module
23
+ By default: 0 (all modules are reported)
24
+ <threads> If passed, parallelize process with these many threads
25
+ HELP
26
+ exit(1)
27
+ end
28
+
29
+ $stderr.puts "Loading GFA: #{input}"
30
+ gfa = GFA.load_parallel(input, (threads || 1).to_i)
31
+
32
+ $stderr.puts 'Splitting graph into modules'
33
+ gfas = gfa.split_modules
34
+
35
+ min_len = min_len.to_i
36
+ if min_len > 0
37
+ $stderr.puts 'Filtering out small modules'
38
+ gfas.select! { |gfa| gfa.total_length > min_len }
39
+ end
40
+
41
+ if gfas.empty?
42
+ $stderr.puts "No modules found"
43
+ else
44
+ $stderr.puts "Saving #{gfas.size} GFA files: #{base}.*"
45
+ int_len = Math.log10(gfas.size).ceil
46
+ gfas.each_with_index do |gfa, k|
47
+ gfa.save("%s.%0#{int_len}i.gfa" % [base, k])
48
+ end
49
+ end
50
+
data/lib/gfa/common.rb CHANGED
@@ -61,4 +61,10 @@ class GFA
61
61
  def rebuild_index!
62
62
  @records.each_value(&:rebuild_index!)
63
63
  end
64
+
65
+ ##
66
+ # Computes the sum of all individual segment lengths
67
+ def total_length
68
+ segments.total_length
69
+ end
64
70
  end
data/lib/gfa/graph.rb CHANGED
@@ -110,9 +110,9 @@ class GFA
110
110
  # Links, Containments, Jumps (from, to) and Paths (segment_names)
111
111
  linking = []
112
112
  eval_set = _ignore == 0 ? segments.set : segments.set[_ignore..]
113
- edges.delete_if do |record|
114
- if eval_set.any? { |segment| record.include? segment }
115
- linking << record
113
+ edges.delete_if do |edge|
114
+ if eval_set.any? { |segment| edge.include? segment }
115
+ linking << edge
116
116
  true # Remove from the edge set to speed up future recursions
117
117
  else
118
118
  false # Keep it, possibly linking future recursions
@@ -0,0 +1,96 @@
1
+
2
+ class GFA
3
+ ##
4
+ # Finds all independent modules by greedily crawling the linking entries for
5
+ # each segment, and returns an Array of GFA objects containing each individual
6
+ # module. If +recalculate+ is false, it trusts the current calculated
7
+ # matrix unless none exists
8
+ def split_modules(recalculate = true)
9
+ recalculate_matrix if recalculate || @matrix.nil?
10
+ missing_segments = (0 .. @matrix_segment_names.size - 1).to_a
11
+ modules = []
12
+ until missing_segments.empty?
13
+ mod = matrix_find_module(missing_segments[0])
14
+ mod.segments.set.map(&:name).map(&:value).each do |name|
15
+ missing_segments.delete(@matrix_segment_names.index(name))
16
+ end
17
+ modules << mod
18
+ end
19
+ modules
20
+ end
21
+
22
+ ##
23
+ # Finds the entire module containing the segment with index +segment_index+
24
+ # in the matrix (requires calling +recalculate_matrix+ first!). Returns the
25
+ # module as a new GFA
26
+ def matrix_find_module(segment_index)
27
+ # Initialize
28
+ segs = [segment_index]
29
+ edges = []
30
+ new_segs = true
31
+
32
+ # Iterate until no new segments are found
33
+ while new_segs
34
+ new_segs = false
35
+ segs.each do |seg|
36
+ @matrix.size.times do |k|
37
+ next if seg == k
38
+ v = @matrix[[seg, k].max][[seg, k].min]
39
+ next if v.empty?
40
+ edges += v
41
+ unless segs.include?(k)
42
+ new_segs = true
43
+ segs << k
44
+ end
45
+ end
46
+ end
47
+ end
48
+
49
+ # Save as GFA and return
50
+ o = GFA.new
51
+ segs.each { |k| o << segments[k] }
52
+ edges.uniq.each { |k| o << @matrix_edges[k] }
53
+ o
54
+ end
55
+
56
+ ##
57
+ # Calculates a matrix where all links between segments are represented by the
58
+ # following variables:
59
+ #
60
+ # +@matrix_segment_names+ includes the names of all segments (as String) with
61
+ # the order indicating the segment index in the matrix
62
+ #
63
+ # +@matrix+ is an Array of Arrays of Arrays, where the first index indicates
64
+ # the row index segment, the second index indicates the column index segment,
65
+ # and the third index indicates each of the links between those two. Note that
66
+ # matrix only stores the lower triangle, so the row index must be strictly greater
67
+ # than the column index. For example, +@matrix[3][1]+ returns an Array of all
68
+ # edge indices linking the segment with index 3 and the segment with index 1:
69
+ # ```
70
+ # [
71
+ # [ ], # Row 0 is always empty
72
+ # [[] ], # Row 1 stores connections to column 0
73
+ # [[], [] ], # Row 2 stores connections to columns 0 and 1
74
+ # [[], [], [] ], # Row 3 stores connections to columns 0, 1, and 2
75
+ # ... # &c
76
+ # ]
77
+ # ```
78
+ #
79
+ # +@matrix_edges+ is an Array of GFA::Record objects representing all edges in
80
+ # the GFA. The order indicates the index used by the values of +@matrix+
81
+ def recalculate_matrix
82
+ @matrix_segment_names = segments.set.map(&:name).map(&:value)
83
+ @matrix = @matrix_segment_names.size.times.map { |i| Array.new(i) { [] } }
84
+ @matrix_edges = all_edges
85
+ @matrix_edges.each_with_index do |edge, k|
86
+ names = edge.segments(self).map(&:name).map(&:value)
87
+ indices = names.map { |i| @matrix_segment_names.index(i) }
88
+ indices.each do |a|
89
+ indices.each do |b|
90
+ break if a == b
91
+ @matrix[[a, b].max][[a, b].min] << k
92
+ end
93
+ end
94
+ end
95
+ end
96
+ end
@@ -26,4 +26,10 @@ class GFA::Record::Segment < GFA::Record
26
26
  add_field(3, :Z, sequence, /\*|[A-Za-z=.]+/)
27
27
  opt_fields.each { |f| add_opt_field(f, OPT_FIELDS) }
28
28
  end
29
+
30
+ ##
31
+ # Returns the length of the sequence represented in this segment
32
+ def length
33
+ sequence.value.length
34
+ end
29
35
  end
@@ -1,4 +1,10 @@
1
1
  class GFA::RecordSet::SegmentSet < GFA::RecordSet
2
2
  CODE = :S
3
3
  INDEX_FIELD = 2 # Name: Segment name
4
+
5
+ ##
6
+ # Computes the sum of all individual segment lengths
7
+ def total_length
8
+ set.map(&:length).reduce(0, :+)
9
+ end
4
10
  end
data/lib/gfa/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  class GFA
2
- VERSION = '0.4.0'
2
+ VERSION = '0.5.0'
3
3
  VERSION_ARRAY = VERSION.split(/\./).map { |x| x.to_i } # :nodoc:
4
4
  VERSION_MAJOR = VERSION_ARRAY[0] # :nodoc:
5
5
  VERSION_MINOR = VERSION_ARRAY[1] # :nodoc:
data/lib/gfa.rb CHANGED
@@ -2,3 +2,4 @@ require 'gfa/common'
2
2
  require 'gfa/parser'
3
3
  require 'gfa/generator'
4
4
  require 'gfa/graph'
5
+ require 'gfa/modules'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gfa
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-03-15 00:00:00.000000000 Z
11
+ date: 2023-03-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rgl
@@ -64,6 +64,7 @@ files:
64
64
  - README.md
65
65
  - Rakefile
66
66
  - bin/gfa-add-gaf
67
+ - bin/gfa-greedy-modules
67
68
  - bin/gfa-subgraph
68
69
  - lib/gfa.rb
69
70
  - lib/gfa/common.rb
@@ -77,6 +78,7 @@ files:
77
78
  - lib/gfa/field/string.rb
78
79
  - lib/gfa/generator.rb
79
80
  - lib/gfa/graph.rb
81
+ - lib/gfa/modules.rb
80
82
  - lib/gfa/parser.rb
81
83
  - lib/gfa/record.rb
82
84
  - lib/gfa/record/comment.rb