gfa 0.4.0 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/gfa-greedy-modules +50 -0
- data/bin/gfa-mean-depth +37 -0
- data/bin/gfa-merge +34 -0
- data/bin/gfa-paths-to-fasta +29 -0
- data/lib/gfa/common.rb +19 -0
- data/lib/gfa/graph.rb +3 -3
- data/lib/gfa/modules.rb +96 -0
- data/lib/gfa/record/segment.rb +6 -0
- data/lib/gfa/record_set/segment_set.rb +6 -0
- data/lib/gfa/version.rb +1 -1
- data/lib/gfa.rb +1 -0
- metadata +7 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: dc98fa1d3479be3fee554dab64876d6cd42795aec08a6b60be6ccb148c57c679
|
4
|
+
data.tar.gz: 28ef77ca42b374e58c983f474d44801060dadb2df912230e431cf40ee8f1386c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1a7226dbe5813cc9e7b0cde0e756b0b41395d45e690fd3df0e6ad266f51a209d55d72fdaeaab5aaebd3fec7ce033b54bb04510903cc3eadef230bd30485f037e
|
7
|
+
data.tar.gz: 32f081a9cf35219f05cb21654011a32a1645f13cbf1d7dda320ebef0b7d893523943cd71f8c9f8a310f1021b6adfaa99b1f863d498be1bbb77bf0a5e04779efd
|
@@ -0,0 +1,50 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# @package MiGA
|
4
|
+
# @license Artistic-2.0
|
5
|
+
|
6
|
+
$LOAD_PATH.push File.expand_path('../../lib', __FILE__)
|
7
|
+
$LOAD_PATH.push File.expand_path('../../lib', File.realpath(__FILE__))
|
8
|
+
|
9
|
+
require 'gfa'
|
10
|
+
|
11
|
+
input, base, min_len, threads = ARGV
|
12
|
+
|
13
|
+
unless base
|
14
|
+
$stderr.puts <<~HELP
|
15
|
+
Split a GFA into multiple independent GFA files that have no links between
|
16
|
+
them by greedily identifying all individual modules in the graph
|
17
|
+
|
18
|
+
gfa-greedy-modules <input> <base> [<min_len> [<threads>]]
|
19
|
+
|
20
|
+
<input> Input GFA file to read
|
21
|
+
<base> Prefix of the output GFA files to write
|
22
|
+
<min_len> Minimum length (in bp) to report a module
|
23
|
+
By default: 0 (all modules are reported)
|
24
|
+
<threads> If passed, parallelize process with these many threads
|
25
|
+
HELP
|
26
|
+
exit(1)
|
27
|
+
end
|
28
|
+
|
29
|
+
$stderr.puts "Loading GFA: #{input}"
|
30
|
+
gfa = GFA.load_parallel(input, (threads || 1).to_i)
|
31
|
+
|
32
|
+
$stderr.puts 'Splitting graph into modules'
|
33
|
+
gfas = gfa.split_modules
|
34
|
+
|
35
|
+
min_len = min_len.to_i
|
36
|
+
if min_len > 0
|
37
|
+
$stderr.puts 'Filtering out small modules'
|
38
|
+
gfas.select! { |gfa| gfa.total_length > min_len }
|
39
|
+
end
|
40
|
+
|
41
|
+
if gfas.empty?
|
42
|
+
$stderr.puts "No modules found"
|
43
|
+
else
|
44
|
+
$stderr.puts "Saving #{gfas.size} GFA files: #{base}.*"
|
45
|
+
int_len = Math.log10(gfas.size).ceil
|
46
|
+
gfas.each_with_index do |gfa, k|
|
47
|
+
gfa.save("%s.%0#{int_len}i.gfa" % [base, k])
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
data/bin/gfa-mean-depth
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# @package MiGA
|
4
|
+
# @license Artistic-2.0
|
5
|
+
|
6
|
+
$LOAD_PATH.push File.expand_path('../../lib', __FILE__)
|
7
|
+
$LOAD_PATH.push File.expand_path('../../lib', File.realpath(__FILE__))
|
8
|
+
|
9
|
+
require 'gfa'
|
10
|
+
|
11
|
+
input, threads = ARGV
|
12
|
+
|
13
|
+
unless input
|
14
|
+
$stderr.puts <<~HELP
|
15
|
+
Calculate the average sequencing depth of all segments in the GFA
|
16
|
+
weighted by the segment lengths
|
17
|
+
|
18
|
+
gfa-mean-depth <input> [<threads>]
|
19
|
+
|
20
|
+
<input> Input GFA file to read
|
21
|
+
<threads> If passed, parallelize process with these many threads
|
22
|
+
HELP
|
23
|
+
exit(1)
|
24
|
+
end
|
25
|
+
|
26
|
+
$stderr.puts "Loading GFA: #{input}"
|
27
|
+
gfa = GFA.load_parallel(input, (threads || 1).to_i)
|
28
|
+
|
29
|
+
$stderr.puts 'Calculating average depth'
|
30
|
+
n = gfa.total_length
|
31
|
+
avg =
|
32
|
+
gfa.segments.set.map do |segment|
|
33
|
+
raise "Some segments are missing depth data" unless segment.DP
|
34
|
+
segment.DP.value * segment.length / n
|
35
|
+
end.inject(:+)
|
36
|
+
puts avg
|
37
|
+
|
data/bin/gfa-merge
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# @package MiGA
|
4
|
+
# @license Artistic-2.0
|
5
|
+
|
6
|
+
$LOAD_PATH.push File.expand_path('../../lib', __FILE__)
|
7
|
+
$LOAD_PATH.push File.expand_path('../../lib', File.realpath(__FILE__))
|
8
|
+
|
9
|
+
require 'gfa'
|
10
|
+
|
11
|
+
output = ARGV.shift
|
12
|
+
input = ARGV
|
13
|
+
|
14
|
+
if input.empty?
|
15
|
+
$stderr.puts <<~HELP
|
16
|
+
Combine several GFAs into a single GFA. Requires uniqueness of element names
|
17
|
+
|
18
|
+
gfa-merge <output> <input...>
|
19
|
+
|
20
|
+
<output> Output GFA file to be created
|
21
|
+
<input...> List of input GFA files to read
|
22
|
+
HELP
|
23
|
+
exit(1)
|
24
|
+
end
|
25
|
+
|
26
|
+
gfa = GFA.new
|
27
|
+
input.each do |i|
|
28
|
+
$stderr.puts "Merging GFA: #{i}"
|
29
|
+
gfa.merge! GFA.load(i)
|
30
|
+
end
|
31
|
+
|
32
|
+
$stderr.puts "Saving GFA: #{output}"
|
33
|
+
gfa.save(output)
|
34
|
+
|
@@ -0,0 +1,29 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# @package MiGA
|
4
|
+
# @license Artistic-2.0
|
5
|
+
|
6
|
+
$LOAD_PATH.push File.expand_path('../../lib', __FILE__)
|
7
|
+
$LOAD_PATH.push File.expand_path('../../lib', File.realpath(__FILE__))
|
8
|
+
|
9
|
+
require 'gfa'
|
10
|
+
|
11
|
+
input, output, threads = ARGV
|
12
|
+
|
13
|
+
unless output
|
14
|
+
$stderr.puts <<~HELP
|
15
|
+
Extract the sequences of the paths from a GFA to FastA file
|
16
|
+
|
17
|
+
gfa-paths-to-fasta <input> <output> [<threads>]
|
18
|
+
|
19
|
+
<input> Input GFA file to read
|
20
|
+
<output> Output FastA file to be created
|
21
|
+
<threads> If passed, parallelize process with these many threads
|
22
|
+
HELP
|
23
|
+
exit(1)
|
24
|
+
end
|
25
|
+
|
26
|
+
$stderr.puts "Loading GFA: #{input}"
|
27
|
+
gfa = GFA.load_parallel(input, (threads || 1).to_i)
|
28
|
+
|
29
|
+
|
data/lib/gfa/common.rb
CHANGED
@@ -61,4 +61,23 @@ class GFA
|
|
61
61
|
def rebuild_index!
|
62
62
|
@records.each_value(&:rebuild_index!)
|
63
63
|
end
|
64
|
+
|
65
|
+
##
|
66
|
+
# Computes the sum of all individual segment lengths
|
67
|
+
def total_length
|
68
|
+
segments.total_length
|
69
|
+
end
|
70
|
+
|
71
|
+
##
|
72
|
+
# Adds the entries of +gfa+ to itself
|
73
|
+
def merge!(gfa)
|
74
|
+
records.each { |k, v| v.merge!(gfa.records[k]) }
|
75
|
+
self
|
76
|
+
end
|
77
|
+
|
78
|
+
##
|
79
|
+
# Creates a new GFA based on itself and appends all entries in +gfa+
|
80
|
+
def merge(gfa)
|
81
|
+
GFA.new(opts).merge!(self).merge!(gfa)
|
82
|
+
end
|
64
83
|
end
|
data/lib/gfa/graph.rb
CHANGED
@@ -110,9 +110,9 @@ class GFA
|
|
110
110
|
# Links, Containments, Jumps (from, to) and Paths (segment_names)
|
111
111
|
linking = []
|
112
112
|
eval_set = _ignore == 0 ? segments.set : segments.set[_ignore..]
|
113
|
-
edges.delete_if do |
|
114
|
-
if eval_set.any? { |segment|
|
115
|
-
linking <<
|
113
|
+
edges.delete_if do |edge|
|
114
|
+
if eval_set.any? { |segment| edge.include? segment }
|
115
|
+
linking << edge
|
116
116
|
true # Remove from the edge set to speed up future recursions
|
117
117
|
else
|
118
118
|
false # Keep it, possibly linking future recursions
|
data/lib/gfa/modules.rb
ADDED
@@ -0,0 +1,96 @@
|
|
1
|
+
|
2
|
+
class GFA
|
3
|
+
##
|
4
|
+
# Find all independent modules by greedily crawling the linking entries for
|
5
|
+
# each segment, and returns an Array of GFA objects containing each individual
|
6
|
+
# module. If +recalculate+ is false, it trusts the current calculated
|
7
|
+
# matrix unless none exists
|
8
|
+
def split_modules(recalculate = true)
|
9
|
+
recalculate_matrix if recalculate || @matrix.nil?
|
10
|
+
missing_segments = (0 .. @matrix_segment_names.size - 1).to_a
|
11
|
+
modules = []
|
12
|
+
until missing_segments.empty?
|
13
|
+
mod = matrix_find_module(missing_segments[0])
|
14
|
+
mod.segments.set.map(&:name).map(&:value).each do |name|
|
15
|
+
missing_segments.delete(@matrix_segment_names.index(name))
|
16
|
+
end
|
17
|
+
modules << mod
|
18
|
+
end
|
19
|
+
modules
|
20
|
+
end
|
21
|
+
|
22
|
+
##
|
23
|
+
# Finds the entire module containing the segment with index +segment_index+
|
24
|
+
# in the matrix (requires calling +recalculate_matrix+ first!). Returns the
|
25
|
+
# module as a new GFA
|
26
|
+
def matrix_find_module(segment_index)
|
27
|
+
# Initialize
|
28
|
+
segs = [segment_index]
|
29
|
+
edges = []
|
30
|
+
new_segs = true
|
31
|
+
|
32
|
+
# Iterate until no new segments are found
|
33
|
+
while new_segs
|
34
|
+
new_segs = false
|
35
|
+
segs.each do |seg|
|
36
|
+
@matrix.size.times do |k|
|
37
|
+
next if seg == k
|
38
|
+
v = @matrix[[seg, k].max][[seg, k].min]
|
39
|
+
next if v.empty?
|
40
|
+
edges += v
|
41
|
+
unless segs.include?(k)
|
42
|
+
new_segs = true
|
43
|
+
segs << k
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
# Save as GFA and return
|
50
|
+
o = GFA.new
|
51
|
+
segs.each { |k| o << segments[k] }
|
52
|
+
edges.uniq.each { |k| o << @matrix_edges[k] }
|
53
|
+
o
|
54
|
+
end
|
55
|
+
|
56
|
+
##
|
57
|
+
# Calculates a matrix where all links between segments are represented by the
|
58
|
+
# following variables:
|
59
|
+
#
|
60
|
+
# +@matrix_segment_names+ includes the names of all segments (as String) with
|
61
|
+
# the order indicating the segment index in the matrix
|
62
|
+
#
|
63
|
+
# +@matrix+ is an Array of Arrays of Arrays, where the first index indicates
|
64
|
+
# the row index segment, the second index indicates the column index segment,
|
65
|
+
# and the third index indicates each of the links between those two. Note that
|
66
|
+
# matrix only stores the lower triangle, so the row index must be strictly greater
|
67
|
+
# than the column index. For example, +@matrix[3][1]+ returns an Array of all
|
68
|
+
# index links between the segment with index 3 and the segment with index 1:
|
69
|
+
# ```
|
70
|
+
# [
|
71
|
+
# [ ], # Row 0 is always empty
|
72
|
+
# [[] ], # Row 1 stores connections to column 0
|
73
|
+
# [[], [] ], # Row 2 stores connections to columns 0 and 1
|
74
|
+
# [[], [], [] ], # Row 3 stores connections to columns 0, 1, and 2
|
75
|
+
# ... # &c
|
76
|
+
# ]
|
77
|
+
# ```
|
78
|
+
#
|
79
|
+
# +@matrix_edges+ is an Array of GFA::Record objects representing all edges in
|
80
|
+
# the GFA. The order indicates the index used by the values of +@matrix+
|
81
|
+
def recalculate_matrix
|
82
|
+
@matrix_segment_names = segments.set.map(&:name).map(&:value)
|
83
|
+
@matrix = @matrix_segment_names.size.times.map { |i| Array.new(i) { [] } }
|
84
|
+
@matrix_edges = all_edges
|
85
|
+
@matrix_edges.each_with_index do |edge, k|
|
86
|
+
names = edge.segments(self).map(&:name).map(&:value)
|
87
|
+
indices = names.map { |i| @matrix_segment_names.index(i) }
|
88
|
+
indices.each do |a|
|
89
|
+
indices.each do |b|
|
90
|
+
break if a == b
|
91
|
+
@matrix[[a, b].max][[a, b].min] << k
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
96
|
+
end
|
data/lib/gfa/record/segment.rb
CHANGED
@@ -26,4 +26,10 @@ class GFA::Record::Segment < GFA::Record
|
|
26
26
|
add_field(3, :Z, sequence, /\*|[A-Za-z=.]+/)
|
27
27
|
opt_fields.each { |f| add_opt_field(f, OPT_FIELDS) }
|
28
28
|
end
|
29
|
+
|
30
|
+
##
|
31
|
+
# Returns the length of the sequence represented in this segment
|
32
|
+
def length
|
33
|
+
sequence.value.length
|
34
|
+
end
|
29
35
|
end
|
data/lib/gfa/version.rb
CHANGED
data/lib/gfa.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gfa
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.0
|
4
|
+
version: 0.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Luis M. Rodriguez-R
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-10-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rgl
|
@@ -64,6 +64,10 @@ files:
|
|
64
64
|
- README.md
|
65
65
|
- Rakefile
|
66
66
|
- bin/gfa-add-gaf
|
67
|
+
- bin/gfa-greedy-modules
|
68
|
+
- bin/gfa-mean-depth
|
69
|
+
- bin/gfa-merge
|
70
|
+
- bin/gfa-paths-to-fasta
|
67
71
|
- bin/gfa-subgraph
|
68
72
|
- lib/gfa.rb
|
69
73
|
- lib/gfa/common.rb
|
@@ -77,6 +81,7 @@ files:
|
|
77
81
|
- lib/gfa/field/string.rb
|
78
82
|
- lib/gfa/generator.rb
|
79
83
|
- lib/gfa/graph.rb
|
84
|
+
- lib/gfa/modules.rb
|
80
85
|
- lib/gfa/parser.rb
|
81
86
|
- lib/gfa/record.rb
|
82
87
|
- lib/gfa/record/comment.rb
|