gfa 0.4.0 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 97e6400338884b4ceb1161778c26c5d6de6ee71616be1b5caae6aa0691d88395
4
- data.tar.gz: 2fe8103598246724d98e3ceeecce92b5564f45bf029bf034978277cde59b4caa
3
+ metadata.gz: dc98fa1d3479be3fee554dab64876d6cd42795aec08a6b60be6ccb148c57c679
4
+ data.tar.gz: 28ef77ca42b374e58c983f474d44801060dadb2df912230e431cf40ee8f1386c
5
5
  SHA512:
6
- metadata.gz: 3beac70ac4c3d4e46bd01399351fbc5e5ffcdaac5bd2a653b7f17c8f29df5c13e48a11575c42fa0ec78cba62485528148687614e9373ac6cc5dd773f97cd67a6
7
- data.tar.gz: c488a4b26604ffc95228d5aa454399b4da8facecb9a90c2579be3c36446637bc60b89578c5a5443047ac020c4c9fb9b980fd3943fdb56e7993ebfcc7c4e60876
6
+ metadata.gz: 1a7226dbe5813cc9e7b0cde0e756b0b41395d45e690fd3df0e6ad266f51a209d55d72fdaeaab5aaebd3fec7ce033b54bb04510903cc3eadef230bd30485f037e
7
+ data.tar.gz: 32f081a9cf35219f05cb21654011a32a1645f13cbf1d7dda320ebef0b7d893523943cd71f8c9f8a310f1021b6adfaa99b1f863d498be1bbb77bf0a5e04779efd
@@ -0,0 +1,50 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # @package MiGA
4
+ # @license Artistic-2.0
5
+
6
+ $LOAD_PATH.push File.expand_path('../../lib', __FILE__)
7
+ $LOAD_PATH.push File.expand_path('../../lib', File.realpath(__FILE__))
8
+
9
+ require 'gfa'
10
+
11
# Positional arguments: <input> <base> [<min_len> [<threads>]]
input, base, min_len, threads = ARGV

# Both <input> and <base> are mandatory: print the usage and quit otherwise
unless base
  $stderr.puts <<~HELP
    Split a GFA into multiple independent GFA files that have no links between
    them by greedily identifying all individual modules in the graph

    gfa-greedy-modules <input> <base> [<min_len> [<threads>]]

    <input> Input GFA file to read
    <base> Prefix of the output GFA files to write
    <min_len> Minimum length (in bp) to report a module
    By default: 0 (all modules are reported)
    <threads> If passed, parallelize process with these many threads
  HELP
  exit(1)
end

$stderr.puts "Loading GFA: #{input}"
gfa = GFA.load_parallel(input, (threads || 1).to_i)

$stderr.puts 'Splitting graph into modules'
gfas = gfa.split_modules

# A missing <min_len> becomes 0 (nil.to_i), which disables the filter
min_len = min_len.to_i
if min_len > 0
  $stderr.puts 'Filtering out small modules'
  # <min_len> is documented as a minimum, so a module of exactly that length
  # is reported. The block parameter is not called +gfa+ to avoid shadowing
  # the loaded graph
  gfas.select! { |mod| mod.total_length >= min_len }
end

if gfas.empty?
  $stderr.puts "No modules found"
else
  $stderr.puts "Saving #{gfas.size} GFA files: #{base}.*"
  # Width used to zero-pad the module index in the output file names
  int_len = Math.log10(gfas.size).ceil
  gfas.each_with_index do |mod, k|
    mod.save("%s.%0#{int_len}i.gfa" % [base, k])
  end
end
50
+
@@ -0,0 +1,37 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # @package MiGA
4
+ # @license Artistic-2.0
5
+
6
+ $LOAD_PATH.push File.expand_path('../../lib', __FILE__)
7
+ $LOAD_PATH.push File.expand_path('../../lib', File.realpath(__FILE__))
8
+
9
+ require 'gfa'
10
+
11
# Positional arguments: <input> [<threads>]
input, threads = ARGV

# <input> is mandatory: print the usage and quit otherwise
unless input
  $stderr.puts <<~HELP
    Calculate the average sequencing depth of all segments in the GFA
    weighted by the segment lengths

    gfa-mean-depth <input> [<threads>]

    <input> Input GFA file to read
    <threads> If passed, parallelize process with these many threads
  HELP
  exit(1)
end

$stderr.puts "Loading GFA: #{input}"
gfa = GFA.load_parallel(input, (threads || 1).to_i)

$stderr.puts 'Calculating average depth'
n = gfa.total_length
# Without any sequence there is nothing to average (and nothing to divide by)
abort 'Cannot calculate depth: the GFA contains no segment sequence' if n.zero?

# Accumulate depth x length first and divide only once at the end, so the
# result is not truncated segment by segment if depth values are integers
weighted_depth =
  gfa.segments.set.sum do |segment|
    raise "Some segments are missing depth data" unless segment.DP
    segment.DP.value * segment.length
  end
puts weighted_depth.to_f / n
37
+
data/bin/gfa-merge ADDED
@@ -0,0 +1,34 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # @package MiGA
4
+ # @license Artistic-2.0
5
+
6
+ $LOAD_PATH.push File.expand_path('../../lib', __FILE__)
7
+ $LOAD_PATH.push File.expand_path('../../lib', File.realpath(__FILE__))
8
+
9
+ require 'gfa'
10
+
11
# The first argument is the output path, all remaining ones are input paths
output, *inputs = ARGV

# At least one input is required: print the usage and quit otherwise
if inputs.empty?
  $stderr.puts <<~HELP
    Combine several GFAs into a single GFA. Requires uniqueness of element names

    gfa-merge <output> <input...>

    <output> Output GFA file to be created
    <input...> List of input GFA files to read
  HELP
  exit(1)
end

# Start from an empty graph and fold every input file into it, in order
merged = GFA.new
inputs.each do |path|
  $stderr.puts "Merging GFA: #{path}"
  merged.merge!(GFA.load(path))
end

$stderr.puts "Saving GFA: #{output}"
merged.save(output)
34
+
@@ -0,0 +1,29 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # @package MiGA
4
+ # @license Artistic-2.0
5
+
6
+ $LOAD_PATH.push File.expand_path('../../lib', __FILE__)
7
+ $LOAD_PATH.push File.expand_path('../../lib', File.realpath(__FILE__))
8
+
9
+ require 'gfa'
10
+
11
# Positional arguments: <input> <output> [<threads>]
input, output, threads = ARGV

# Both <input> and <output> are mandatory: print the usage and quit otherwise
unless output
  $stderr.puts <<~HELP
    Extract the sequences of the paths from a GFA to FastA file

    gfa-paths-to-fasta <input> <output> [<threads>]

    <input> Input GFA file to read
    <output> Output FastA file to be created
    <threads> If passed, parallelize process with these many threads
  HELP
  exit(1)
end

$stderr.puts "Loading GFA: #{input}"
gfa = GFA.load_parallel(input, (threads || 1).to_i)

# NOTE(review): the script ends here: the graph is loaded but nothing is ever
# written to <output>. The extraction of path sequences appears to be missing
# and still needs to be implemented.
28
+
29
+
data/lib/gfa/common.rb CHANGED
@@ -61,4 +61,23 @@ class GFA
61
61
  def rebuild_index!
62
62
  @records.each_value(&:rebuild_index!)
63
63
  end
64
+
65
+ ##
66
+ # Computes the sum of all individual segment lengths
67
# Total length of the graph, as reported by its segment set
def total_length
  segment_set = segments
  segment_set.total_length
end
70
+
71
+ ##
72
+ # Adds the entries of +gfa+ to itself
73
# Merges in place each record set of +gfa+ into the record set stored under
# the same key in this object. Returns +self+ so that calls can be chained
def merge!(gfa)
  records.each_pair do |code, record_set|
    incoming = gfa.records[code]
    record_set.merge!(incoming)
  end
  self
end
77
+
78
+ ##
79
+ # Creates a new GFA based on itself and appends all entries in +gfa+
80
# Non-destructive merge: builds a new GFA with the same +opts+ as this one
# and merges into it first this object and then +gfa+
def merge(gfa)
  [self, gfa].reduce(GFA.new(opts)) { |combined, source| combined.merge!(source) }
end
64
83
  end
data/lib/gfa/graph.rb CHANGED
@@ -110,9 +110,9 @@ class GFA
110
110
  # Links, Containments, Jumps (from, to) and Paths (segment_names)
111
111
  linking = []
112
112
  eval_set = _ignore == 0 ? segments.set : segments.set[_ignore..]
113
- edges.delete_if do |record|
114
- if eval_set.any? { |segment| record.include? segment }
115
- linking << record
113
+ edges.delete_if do |edge|
114
+ if eval_set.any? { |segment| edge.include? segment }
115
+ linking << edge
116
116
  true # Remove from the edge set to speed up future recursions
117
117
  else
118
118
  false # Keep it, possibly linking future recursions
@@ -0,0 +1,96 @@
1
+
2
class GFA
  ##
  # Find all independent modules by greedily crawling the linking entries for
  # each segment, and returns an Array of GFA objects containing each individual
  # module. If +recalculate+ is false, it trusts the current calculated
  # matrix unless none exists
  #
  # Modules are returned in the order of their lowest segment index
  def split_modules(recalculate = true)
    recalculate_matrix if recalculate || @matrix.nil?

    # Track visited segments by index (not by name), so every segment is
    # assigned to exactly one module and the loop always terminates
    assigned = Array.new(@matrix_segment_names.size, false)
    modules = []
    assigned.each_index do |seed|
      next if assigned[seed]

      segs, edges = matrix_module_indices(seed)
      segs.each { |k| assigned[k] = true }
      modules << matrix_build_module(segs, edges)
    end
    modules
  end

  ##
  # Finds the entire module containing the segment with index +segment_index+
  # in the matrix (requires calling +recalculate_matrix+ first!). Returns the
  # module as a new GFA
  def matrix_find_module(segment_index)
    segs, edges = matrix_module_indices(segment_index)
    matrix_build_module(segs, edges)
  end

  ##
  # Calculates a matrix where all links between segments are represented by the
  # following variables:
  #
  # +@matrix_segment_names+ includes the names of all segments (as String) with
  # the order indicating the segment index in the matrix
  #
  # +@matrix+ is an Array of Arrays of Arrays, where the first index indicates
  # the row index segment, the second index indicates the column index segment,
  # and the third index indicates each of the edges between those two. Note that
  # matrix only stores the lower triangle including the diagonal, so the column
  # index must be less than or equal to the row index. The diagonal holds the
  # edges that involve a single segment (e.g., a link from a segment to itself),
  # so they are not lost when splitting. For example, +@matrix[3][1]+ returns an
  # Array of all index links between the segment with index 3 and the segment
  # with index 1:
  # ```
  # [
  #   [[]            ], # Row 0 stores only the self-edges of segment 0
  #   [[], []        ], # Row 1 stores connections to column 0 and to itself
  #   [[], [], []    ], # Row 2 stores connections to columns 0, 1, and itself
  #   [[], [], [], []], # Row 3 stores connections to columns 0 - 2 and itself
  #   ...               # &c
  # ]
  # ```
  #
  # +@matrix_edges+ is an Array of GFA::Record objects representing all edges in
  # the GFA. The order indicates the index used by the values of +@matrix+
  def recalculate_matrix
    @matrix_segment_names = segments.set.map { |segment| segment.name.value }

    # Name => index lookup (first occurrence wins, as with Array#index)
    index_of = {}
    @matrix_segment_names.each_with_index { |name, i| index_of[name] ||= i }

    @matrix =
      Array.new(@matrix_segment_names.size) { |i| Array.new(i + 1) { [] } }
    @matrix_edges = all_edges
    @matrix_edges.each_with_index do |edge, k|
      indices =
        edge.segments(self).map { |segment| index_of[segment.name.value] }.uniq
      if indices.size == 1
        # Edge touching a single segment: register it in the diagonal
        @matrix[indices[0]][indices[0]] << k
      else
        # Register the edge once for every pair of segments it connects
        indices.combination(2) do |pair|
          lo, hi = pair.minmax
          @matrix[hi][lo] << k
        end
      end
    end
  end

  private

  ##
  # Crawls the matrix starting at +segment_index+ and returns an Array with two
  # Arrays of indices: the segments of the module (in discovery order) and the
  # edges between them (without duplicates, in discovery order)
  def matrix_module_indices(segment_index)
    seen = { segment_index => true }
    segs = [segment_index]
    queue = [segment_index]
    edges = []
    until queue.empty?
      seg = queue.shift
      @matrix.size.times do |k|
        lo, hi = [seg, k].minmax
        v = @matrix[hi][lo]
        next if v.empty?

        edges.concat(v)
        next if seen[k]

        seen[k] = true
        segs << k
        queue << k
      end
    end
    [segs, edges.uniq]
  end

  ##
  # Builds a new GFA with the segments with indices +segs+ and the edges with
  # indices +edges+ (as stored in +@matrix_edges+)
  def matrix_build_module(segs, edges)
    o = GFA.new
    segs.each { |k| o << segments[k] }
    edges.each { |k| o << @matrix_edges[k] }
    o
  end
end
@@ -26,4 +26,10 @@ class GFA::Record::Segment < GFA::Record
26
26
  add_field(3, :Z, sequence, /\*|[A-Za-z=.]+/)
27
27
  opt_fields.each { |f| add_opt_field(f, OPT_FIELDS) }
28
28
  end
29
+
30
+ ##
31
+ # Returns the length of the sequence represented in this segment
32
# Number of characters in the sequence string stored by this segment
#
# NOTE(review): a placeholder sequence (+*+, accepted by the field pattern
# of this record) is counted as length 1 - confirm whether that is intended
def length
  seq = sequence.value
  seq.length
end
29
35
  end
@@ -1,4 +1,10 @@
1
1
  class GFA::RecordSet::SegmentSet < GFA::RecordSet
2
2
  CODE = :S
3
3
  INDEX_FIELD = 2 # Name: Segment name
4
+
5
+ ##
6
+ # Computes the sum of all individual segment lengths
7
# Adds up the lengths of all the segments in the set. Returns 0 if the set
# is empty
def total_length
  set.sum(&:length)
end
4
10
  end
data/lib/gfa/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  class GFA
2
- VERSION = '0.4.0'
2
+ VERSION = '0.6.0'
3
3
  VERSION_ARRAY = VERSION.split(/\./).map { |x| x.to_i } # :nodoc:
4
4
  VERSION_MAJOR = VERSION_ARRAY[0] # :nodoc:
5
5
  VERSION_MINOR = VERSION_ARRAY[1] # :nodoc:
data/lib/gfa.rb CHANGED
@@ -2,3 +2,4 @@ require 'gfa/common'
2
2
  require 'gfa/parser'
3
3
  require 'gfa/generator'
4
4
  require 'gfa/graph'
5
+ require 'gfa/modules'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gfa
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-03-15 00:00:00.000000000 Z
11
+ date: 2023-10-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rgl
@@ -64,6 +64,10 @@ files:
64
64
  - README.md
65
65
  - Rakefile
66
66
  - bin/gfa-add-gaf
67
+ - bin/gfa-greedy-modules
68
+ - bin/gfa-mean-depth
69
+ - bin/gfa-merge
70
+ - bin/gfa-paths-to-fasta
67
71
  - bin/gfa-subgraph
68
72
  - lib/gfa.rb
69
73
  - lib/gfa/common.rb
@@ -77,6 +81,7 @@ files:
77
81
  - lib/gfa/field/string.rb
78
82
  - lib/gfa/generator.rb
79
83
  - lib/gfa/graph.rb
84
+ - lib/gfa/modules.rb
80
85
  - lib/gfa/parser.rb
81
86
  - lib/gfa/record.rb
82
87
  - lib/gfa/record/comment.rb