gfa 0.4.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 97e6400338884b4ceb1161778c26c5d6de6ee71616be1b5caae6aa0691d88395
4
- data.tar.gz: 2fe8103598246724d98e3ceeecce92b5564f45bf029bf034978277cde59b4caa
3
+ metadata.gz: dc98fa1d3479be3fee554dab64876d6cd42795aec08a6b60be6ccb148c57c679
4
+ data.tar.gz: 28ef77ca42b374e58c983f474d44801060dadb2df912230e431cf40ee8f1386c
5
5
  SHA512:
6
- metadata.gz: 3beac70ac4c3d4e46bd01399351fbc5e5ffcdaac5bd2a653b7f17c8f29df5c13e48a11575c42fa0ec78cba62485528148687614e9373ac6cc5dd773f97cd67a6
7
- data.tar.gz: c488a4b26604ffc95228d5aa454399b4da8facecb9a90c2579be3c36446637bc60b89578c5a5443047ac020c4c9fb9b980fd3943fdb56e7993ebfcc7c4e60876
6
+ metadata.gz: 1a7226dbe5813cc9e7b0cde0e756b0b41395d45e690fd3df0e6ad266f51a209d55d72fdaeaab5aaebd3fec7ce033b54bb04510903cc3eadef230bd30485f037e
7
+ data.tar.gz: 32f081a9cf35219f05cb21654011a32a1645f13cbf1d7dda320ebef0b7d893523943cd71f8c9f8a310f1021b6adfaa99b1f863d498be1bbb77bf0a5e04779efd
@@ -0,0 +1,50 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # @package MiGA
4
+ # @license Artistic-2.0
5
+
6
+ $LOAD_PATH.push File.expand_path('../../lib', __FILE__)
7
+ $LOAD_PATH.push File.expand_path('../../lib', File.realpath(__FILE__))
8
+
9
+ require 'gfa'
10
+
11
+ input, base, min_len, threads = ARGV
12
+
13
+ unless base
14
+ $stderr.puts <<~HELP
15
+ Split a GFA into multiple independent GFA files that have no links between
16
+ them by greedily identifying all individual modules in the graph
17
+
18
+ gfa-greedy-modules <input> <base> [<min_len> [<threads>]]
19
+
20
+ <input> Input GFA file to read
21
+ <base> Prefix of the output GFA files to write
22
+ <min_len> Minimum length (in bp) to report a module
23
+ By default: 0 (all modules are reported)
24
+ <threads> If passed, parallelize process with these many threads
25
+ HELP
26
+ exit(1)
27
+ end
28
+
29
+ $stderr.puts "Loading GFA: #{input}"
30
+ gfa = GFA.load_parallel(input, (threads || 1).to_i)
31
+
32
+ $stderr.puts 'Splitting graph into modules'
33
+ gfas = gfa.split_modules
34
+
35
+ min_len = min_len.to_i
36
+ if min_len > 0
37
+ $stderr.puts 'Filtering out small modules'
38
+ gfas.select! { |gfa| gfa.total_length > min_len }
39
+ end
40
+
41
+ if gfas.empty?
42
+ $stderr.puts "No modules found"
43
+ else
44
+ $stderr.puts "Saving #{gfas.size} GFA files: #{base}.*"
45
+ int_len = Math.log10(gfas.size).ceil
46
+ gfas.each_with_index do |gfa, k|
47
+ gfa.save("%s.%0#{int_len}i.gfa" % [base, k])
48
+ end
49
+ end
50
+
@@ -0,0 +1,37 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # @package MiGA
4
+ # @license Artistic-2.0
5
+
6
+ $LOAD_PATH.push File.expand_path('../../lib', __FILE__)
7
+ $LOAD_PATH.push File.expand_path('../../lib', File.realpath(__FILE__))
8
+
9
+ require 'gfa'
10
+
11
+ input, threads = ARGV
12
+
13
+ unless input
14
+ $stderr.puts <<~HELP
15
+ Calculate the average sequencing depth of all segments in the GFA
16
+ weighted by the segment lengths
17
+
18
+ gfa-mean-depth <input> [<threads>]
19
+
20
+ <input> Input GFA file to read
21
+ <threads> If passed, parallelize process with these many threads
22
+ HELP
23
+ exit(1)
24
+ end
25
+
26
+ $stderr.puts "Loading GFA: #{input}"
27
+ gfa = GFA.load_parallel(input, (threads || 1).to_i)
28
+
29
+ $stderr.puts 'Calculating average depth'
30
+ n = gfa.total_length
31
+ avg =
32
+ gfa.segments.set.map do |segment|
33
+ raise "Some segments are missing depth data" unless segment.DP
34
+ segment.DP.value * segment.length / n
35
+ end.inject(:+)
36
+ puts avg
37
+
data/bin/gfa-merge ADDED
@@ -0,0 +1,34 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # @package MiGA
4
+ # @license Artistic-2.0
5
+
6
+ $LOAD_PATH.push File.expand_path('../../lib', __FILE__)
7
+ $LOAD_PATH.push File.expand_path('../../lib', File.realpath(__FILE__))
8
+
9
+ require 'gfa'
10
+
11
+ output = ARGV.shift
12
+ input = ARGV
13
+
14
+ if input.empty?
15
+ $stderr.puts <<~HELP
16
+ Combine several GFAs into a single GFA. Requires uniqueness of element names
17
+
18
+ gfa-merge <output> <input...>
19
+
20
+ <output> Output GFA file to be created
21
+ <input...> List of input GFA files to read
22
+ HELP
23
+ exit(1)
24
+ end
25
+
26
+ gfa = GFA.new
27
+ input.each do |i|
28
+ $stderr.puts "Merging GFA: #{i}"
29
+ gfa.merge! GFA.load(i)
30
+ end
31
+
32
+ $stderr.puts "Saving GFA: #{output}"
33
+ gfa.save(output)
34
+
@@ -0,0 +1,29 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # @package MiGA
4
+ # @license Artistic-2.0
5
+
6
+ $LOAD_PATH.push File.expand_path('../../lib', __FILE__)
7
+ $LOAD_PATH.push File.expand_path('../../lib', File.realpath(__FILE__))
8
+
9
+ require 'gfa'
10
+
11
+ input, output, threads = ARGV
12
+
13
+ unless output
14
+ $stderr.puts <<~HELP
15
+ Extract the sequences of the paths from a GFA to FastA file
16
+
17
+ gfa-paths-to-fasta <input> <output> [<threads>]
18
+
19
+ <input> Input GFA file to read
20
+ <output> Output FastA file to be created
21
+ <threads> If passed, parallelize process with these many threads
22
+ HELP
23
+ exit(1)
24
+ end
25
+
26
+ $stderr.puts "Loading GFA: #{input}"
27
+ gfa = GFA.load_parallel(input, (threads || 1).to_i)
28
+
29
+
data/lib/gfa/common.rb CHANGED
@@ -61,4 +61,23 @@ class GFA
61
61
  def rebuild_index!
62
62
  @records.each_value(&:rebuild_index!)
63
63
  end
64
+
65
+ ##
66
+ # Computes the sum of all individual segment lengths
67
+ def total_length
68
+ segments.total_length
69
+ end
70
+
71
+ ##
72
+ # Adds the entries of +gfa+ to itself
73
+ def merge!(gfa)
74
+ records.each { |k, v| v.merge!(gfa.records[k]) }
75
+ self
76
+ end
77
+
78
+ ##
79
+ # Creates a new GFA based on itself and appends all entries in +gfa+
80
+ def merge(gfa)
81
+ GFA.new(opts).merge!(self).merge!(gfa)
82
+ end
64
83
  end
data/lib/gfa/graph.rb CHANGED
@@ -110,9 +110,9 @@ class GFA
110
110
  # Links, Containments, Jumps (from, to) and Paths (segment_names)
111
111
  linking = []
112
112
  eval_set = _ignore == 0 ? segments.set : segments.set[_ignore..]
113
- edges.delete_if do |record|
114
- if eval_set.any? { |segment| record.include? segment }
115
- linking << record
113
+ edges.delete_if do |edge|
114
+ if eval_set.any? { |segment| edge.include? segment }
115
+ linking << edge
116
116
  true # Remove from the edge set to speed up future recursions
117
117
  else
118
118
  false # Keep it, possibly linking future recursions
@@ -0,0 +1,96 @@
1
+
2
+ class GFA
3
+ ##
4
+ # Finds all independent modules by greedily crawling the linking entries for
5
+ # each segment, and returns an Array of GFA objects containing each individual
6
+ # module. If +recalculate+ is false, it trusts the current calculated
7
+ # matrix unless none exists
8
+ def split_modules(recalculate = true)
9
+ recalculate_matrix if recalculate || @matrix.nil?
10
+ missing_segments = (0 .. @matrix_segment_names.size - 1).to_a
11
+ modules = []
12
+ until missing_segments.empty?
13
+ mod = matrix_find_module(missing_segments[0])
14
+ mod.segments.set.map(&:name).map(&:value).each do |name|
15
+ missing_segments.delete(@matrix_segment_names.index(name))
16
+ end
17
+ modules << mod
18
+ end
19
+ modules
20
+ end
21
+
22
+ ##
23
+ # Finds the entire module containing the segment with index +segment_index+
24
+ # in the matrix (requires calling +recalculate_matrix+ first!). Returns the
25
+ # module as a new GFA
26
+ def matrix_find_module(segment_index)
27
+ # Initialize
28
+ segs = [segment_index]
29
+ edges = []
30
+ new_segs = true
31
+
32
+ # Iterate until no new segments are found
33
+ while new_segs
34
+ new_segs = false
35
+ segs.each do |seg|
36
+ @matrix.size.times do |k|
37
+ next if seg == k
38
+ v = @matrix[[seg, k].max][[seg, k].min]
39
+ next if v.empty?
40
+ edges += v
41
+ unless segs.include?(k)
42
+ new_segs = true
43
+ segs << k
44
+ end
45
+ end
46
+ end
47
+ end
48
+
49
+ # Save as GFA and return
50
+ o = GFA.new
51
+ segs.each { |k| o << segments[k] }
52
+ edges.uniq.each { |k| o << @matrix_edges[k] }
53
+ o
54
+ end
55
+
56
+ ##
57
+ # Calculates a matrix where all links between segments are represented by the
58
+ # following variables:
59
+ #
60
+ # +@matrix_segment_names+ includes the names of all segments (as String) with
61
+ # the order indicating the segment index in the matrix
62
+ #
63
+ # +@matrix+ is an Array of Arrays of Arrays, where the first index indicates
64
+ # the row index segment, the second index indicates the column index segment,
65
+ # and the third index indicates each of the links between those two. Note that
66
+ # matrix only stores the lower triangle, so the row index must be strictly greater
67
+ # than the column index. For example, +@matrix[3][1]+ returns an Array of all
68
+ # edge indices linking the segment with index 3 and the segment with index 1:
69
+ # ```
70
+ # [
71
+ # [ ], # Row 0 is always empty
72
+ # [[] ], # Row 1 stores connections to column 0
73
+ # [[], [] ], # Row 2 stores connections to columns 0 and 1
74
+ # [[], [], [] ], # Row 3 stores connections to columns 0, 1, and 2
75
+ # ... # &c
76
+ # ]
77
+ # ```
78
+ #
79
+ # +@matrix_edges+ is an Array of GFA::Record objects representing all edges in
80
+ # the GFA. The order indicates the index used by the values of +@matrix+
81
+ def recalculate_matrix
82
+ @matrix_segment_names = segments.set.map(&:name).map(&:value)
83
+ @matrix = @matrix_segment_names.size.times.map { |i| Array.new(i) { [] } }
84
+ @matrix_edges = all_edges
85
+ @matrix_edges.each_with_index do |edge, k|
86
+ names = edge.segments(self).map(&:name).map(&:value)
87
+ indices = names.map { |i| @matrix_segment_names.index(i) }
88
+ indices.each do |a|
89
+ indices.each do |b|
90
+ break if a == b
91
+ @matrix[[a, b].max][[a, b].min] << k
92
+ end
93
+ end
94
+ end
95
+ end
96
+ end
@@ -26,4 +26,10 @@ class GFA::Record::Segment < GFA::Record
26
26
  add_field(3, :Z, sequence, /\*|[A-Za-z=.]+/)
27
27
  opt_fields.each { |f| add_opt_field(f, OPT_FIELDS) }
28
28
  end
29
+
30
+ ##
31
+ # Returns the length of the sequence represented in this segment
32
+ def length
33
+ sequence.value.length
34
+ end
29
35
  end
@@ -1,4 +1,10 @@
1
1
  class GFA::RecordSet::SegmentSet < GFA::RecordSet
2
2
  CODE = :S
3
3
  INDEX_FIELD = 2 # Name: Segment name
4
+
5
+ ##
6
+ # Computes the sum of all individual segment lengths
7
+ def total_length
8
+ set.map(&:length).reduce(0, :+)
9
+ end
4
10
  end
data/lib/gfa/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  class GFA
2
- VERSION = '0.4.0'
2
+ VERSION = '0.6.0'
3
3
  VERSION_ARRAY = VERSION.split(/\./).map { |x| x.to_i } # :nodoc:
4
4
  VERSION_MAJOR = VERSION_ARRAY[0] # :nodoc:
5
5
  VERSION_MINOR = VERSION_ARRAY[1] # :nodoc:
data/lib/gfa.rb CHANGED
@@ -2,3 +2,4 @@ require 'gfa/common'
2
2
  require 'gfa/parser'
3
3
  require 'gfa/generator'
4
4
  require 'gfa/graph'
5
+ require 'gfa/modules'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gfa
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-03-15 00:00:00.000000000 Z
11
+ date: 2023-10-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rgl
@@ -64,6 +64,10 @@ files:
64
64
  - README.md
65
65
  - Rakefile
66
66
  - bin/gfa-add-gaf
67
+ - bin/gfa-greedy-modules
68
+ - bin/gfa-mean-depth
69
+ - bin/gfa-merge
70
+ - bin/gfa-paths-to-fasta
67
71
  - bin/gfa-subgraph
68
72
  - lib/gfa.rb
69
73
  - lib/gfa/common.rb
@@ -77,6 +81,7 @@ files:
77
81
  - lib/gfa/field/string.rb
78
82
  - lib/gfa/generator.rb
79
83
  - lib/gfa/graph.rb
84
+ - lib/gfa/modules.rb
80
85
  - lib/gfa/parser.rb
81
86
  - lib/gfa/record.rb
82
87
  - lib/gfa/record/comment.rb