gfa 0.4.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/gfa-greedy-modules +50 -0
- data/bin/gfa-mean-depth +37 -0
- data/bin/gfa-merge +34 -0
- data/bin/gfa-paths-to-fasta +29 -0
- data/lib/gfa/common.rb +19 -0
- data/lib/gfa/graph.rb +3 -3
- data/lib/gfa/modules.rb +96 -0
- data/lib/gfa/record/segment.rb +6 -0
- data/lib/gfa/record_set/segment_set.rb +6 -0
- data/lib/gfa/version.rb +1 -1
- data/lib/gfa.rb +1 -0
- metadata +7 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: dc98fa1d3479be3fee554dab64876d6cd42795aec08a6b60be6ccb148c57c679
|
4
|
+
data.tar.gz: 28ef77ca42b374e58c983f474d44801060dadb2df912230e431cf40ee8f1386c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1a7226dbe5813cc9e7b0cde0e756b0b41395d45e690fd3df0e6ad266f51a209d55d72fdaeaab5aaebd3fec7ce033b54bb04510903cc3eadef230bd30485f037e
|
7
|
+
data.tar.gz: 32f081a9cf35219f05cb21654011a32a1645f13cbf1d7dda320ebef0b7d893523943cd71f8c9f8a310f1021b6adfaa99b1f863d498be1bbb77bf0a5e04779efd
|
data/bin/gfa-greedy-modules
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# @package MiGA
|
4
|
+
# @license Artistic-2.0
|
5
|
+
|
6
|
+
$LOAD_PATH.push File.expand_path('../../lib', __FILE__)
|
7
|
+
$LOAD_PATH.push File.expand_path('../../lib', File.realpath(__FILE__))
|
8
|
+
|
9
|
+
require 'gfa'
|
10
|
+
|
11
|
+
input, base, min_len, threads = ARGV
|
12
|
+
|
13
|
+
unless base
|
14
|
+
$stderr.puts <<~HELP
|
15
|
+
Split a GFA into multiple independent GFA files that have no links between
|
16
|
+
them by greedily identifying all individual modules in the graph
|
17
|
+
|
18
|
+
gfa-greedy-modules <input> <base> [<min_len> [<threads>]]
|
19
|
+
|
20
|
+
<input> Input GFA file to read
|
21
|
+
<base> Prefix of the output GFA files to write
|
22
|
+
<min_len> Minimum length (in bp) to report a module
|
23
|
+
By default: 0 (all modules are reported)
|
24
|
+
<threads> If passed, parallelize process with these many threads
|
25
|
+
HELP
|
26
|
+
exit(1)
|
27
|
+
end
|
28
|
+
|
29
|
+
$stderr.puts "Loading GFA: #{input}"
|
30
|
+
gfa = GFA.load_parallel(input, (threads || 1).to_i)
|
31
|
+
|
32
|
+
$stderr.puts 'Splitting graph into modules'
|
33
|
+
gfas = gfa.split_modules
|
34
|
+
|
35
|
+
min_len = min_len.to_i
|
36
|
+
if min_len > 0
|
37
|
+
$stderr.puts 'Filtering out small modules'
|
38
|
+
gfas.select! { |gfa| gfa.total_length > min_len }
|
39
|
+
end
|
40
|
+
|
41
|
+
if gfas.empty?
|
42
|
+
$stderr.puts "No modules found"
|
43
|
+
else
|
44
|
+
$stderr.puts "Saving #{gfas.size} GFA files: #{base}.*"
|
45
|
+
int_len = Math.log10(gfas.size).ceil
|
46
|
+
gfas.each_with_index do |gfa, k|
|
47
|
+
gfa.save("%s.%0#{int_len}i.gfa" % [base, k])
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
data/bin/gfa-mean-depth
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# @package MiGA
|
4
|
+
# @license Artistic-2.0
|
5
|
+
|
6
|
+
$LOAD_PATH.push File.expand_path('../../lib', __FILE__)
|
7
|
+
$LOAD_PATH.push File.expand_path('../../lib', File.realpath(__FILE__))
|
8
|
+
|
9
|
+
require 'gfa'
|
10
|
+
|
11
|
+
input, threads = ARGV
|
12
|
+
|
13
|
+
unless input
|
14
|
+
$stderr.puts <<~HELP
|
15
|
+
Calculate the average sequencing depth of all segments in the GFA
|
16
|
+
weighted by the segment lengths
|
17
|
+
|
18
|
+
gfa-mean-depth <input> [<threads>]
|
19
|
+
|
20
|
+
<input> Input GFA file to read
|
21
|
+
<threads> If passed, parallelize process with these many threads
|
22
|
+
HELP
|
23
|
+
exit(1)
|
24
|
+
end
|
25
|
+
|
26
|
+
$stderr.puts "Loading GFA: #{input}"
|
27
|
+
gfa = GFA.load_parallel(input, (threads || 1).to_i)
|
28
|
+
|
29
|
+
$stderr.puts 'Calculating average depth'
|
30
|
+
n = gfa.total_length
|
31
|
+
avg =
|
32
|
+
gfa.segments.set.map do |segment|
|
33
|
+
raise "Some segments are missing depth data" unless segment.DP
|
34
|
+
segment.DP.value * segment.length / n
|
35
|
+
end.inject(:+)
|
36
|
+
puts avg
|
37
|
+
|
data/bin/gfa-merge
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# @package MiGA
|
4
|
+
# @license Artistic-2.0
|
5
|
+
|
6
|
+
$LOAD_PATH.push File.expand_path('../../lib', __FILE__)
|
7
|
+
$LOAD_PATH.push File.expand_path('../../lib', File.realpath(__FILE__))
|
8
|
+
|
9
|
+
require 'gfa'
|
10
|
+
|
11
|
+
output = ARGV.shift
|
12
|
+
input = ARGV
|
13
|
+
|
14
|
+
if input.empty?
|
15
|
+
$stderr.puts <<~HELP
|
16
|
+
Combine several GFAs into a single GFA. Requires uniqueness of element names
|
17
|
+
|
18
|
+
gfa-merge <output> <input...>
|
19
|
+
|
20
|
+
<output> Output GFA file to be created
|
21
|
+
<input...> List of input GFA files to read
|
22
|
+
HELP
|
23
|
+
exit(1)
|
24
|
+
end
|
25
|
+
|
26
|
+
gfa = GFA.new
|
27
|
+
input.each do |i|
|
28
|
+
$stderr.puts "Merging GFA: #{i}"
|
29
|
+
gfa.merge! GFA.load(i)
|
30
|
+
end
|
31
|
+
|
32
|
+
$stderr.puts "Saving GFA: #{output}"
|
33
|
+
gfa.save(output)
|
34
|
+
|
data/bin/gfa-paths-to-fasta
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# @package MiGA
|
4
|
+
# @license Artistic-2.0
|
5
|
+
|
6
|
+
$LOAD_PATH.push File.expand_path('../../lib', __FILE__)
|
7
|
+
$LOAD_PATH.push File.expand_path('../../lib', File.realpath(__FILE__))
|
8
|
+
|
9
|
+
require 'gfa'
|
10
|
+
|
11
|
+
input, output, threads = ARGV
|
12
|
+
|
13
|
+
unless output
|
14
|
+
$stderr.puts <<~HELP
|
15
|
+
Extract the sequences of the paths from a GFA to FastA file
|
16
|
+
|
17
|
+
gfa-merge <input> <output> [<threads>]
|
18
|
+
|
19
|
+
<input> Input GFA file to read
|
20
|
+
<output> Output FastA file to be created
|
21
|
+
<threads> If passed, parallelize process with these many threads
|
22
|
+
HELP
|
23
|
+
exit(1)
|
24
|
+
end
|
25
|
+
|
26
|
+
$stderr.puts "Loading GFA: #{input}"
|
27
|
+
gfa = GFA.load_parallel(input, (threads || 1).to_i)
|
28
|
+
|
29
|
+
|
data/lib/gfa/common.rb
CHANGED
@@ -61,4 +61,23 @@ class GFA
|
|
61
61
|
def rebuild_index!
|
62
62
|
@records.each_value(&:rebuild_index!)
|
63
63
|
end
|
64
|
+
|
65
|
+
##
|
66
|
+
# Computes the sum of all individual segment lengths
|
67
|
+
def total_length
|
68
|
+
segments.total_length
|
69
|
+
end
|
70
|
+
|
71
|
+
##
|
72
|
+
# Adds the entries of +gfa+ to itself
|
73
|
+
def merge!(gfa)
|
74
|
+
records.each { |k, v| v.merge!(gfa.records[k]) }
|
75
|
+
self
|
76
|
+
end
|
77
|
+
|
78
|
+
##
|
79
|
+
# Creates a new GFA based on itself and appends all entries in +gfa+
|
80
|
+
def merge(gfa)
|
81
|
+
GFA.new(opts).merge!(self).merge!(gfa)
|
82
|
+
end
|
64
83
|
end
|
data/lib/gfa/graph.rb
CHANGED
@@ -110,9 +110,9 @@ class GFA
|
|
110
110
|
# Links, Containments, Jumps (from, to) and Paths (segment_names)
|
111
111
|
linking = []
|
112
112
|
eval_set = _ignore == 0 ? segments.set : segments.set[_ignore..]
|
113
|
-
edges.delete_if do |
|
114
|
-
if eval_set.any? { |segment|
|
115
|
-
linking <<
|
113
|
+
edges.delete_if do |edge|
|
114
|
+
if eval_set.any? { |segment| edge.include? segment }
|
115
|
+
linking << edge
|
116
116
|
true # Remove from the edge set to speed up future recursions
|
117
117
|
else
|
118
118
|
false # Keep it, possibly linking future recursions
|
data/lib/gfa/modules.rb
ADDED
@@ -0,0 +1,96 @@
|
|
1
|
+
|
2
|
+
class GFA
|
3
|
+
##
|
4
|
+
# Finds all independent modules by greedily crawling the linking entries for
|
5
|
+
# each segment, and returns an Array of GFA objects containing each individual
|
6
|
+
# module. If +recalculate+ is false, it trusts the current calculated
|
7
|
+
# matrix unless none exists
|
8
|
+
def split_modules(recalculate = true)
|
9
|
+
recalculate_matrix if recalculate || @matrix.nil?
|
10
|
+
missing_segments = (0 .. @matrix_segment_names.size - 1).to_a
|
11
|
+
modules = []
|
12
|
+
until missing_segments.empty?
|
13
|
+
mod = matrix_find_module(missing_segments[0])
|
14
|
+
mod.segments.set.map(&:name).map(&:value).each do |name|
|
15
|
+
missing_segments.delete(@matrix_segment_names.index(name))
|
16
|
+
end
|
17
|
+
modules << mod
|
18
|
+
end
|
19
|
+
modules
|
20
|
+
end
|
21
|
+
|
22
|
+
##
|
23
|
+
# Finds the entire module containing the segment with index +segment_index+
|
24
|
+
# in the matrix (requires calling +recalculate_matrix+ first!). Returns the
|
25
|
+
# module as a new GFA
|
26
|
+
def matrix_find_module(segment_index)
|
27
|
+
# Initialize
|
28
|
+
segs = [segment_index]
|
29
|
+
edges = []
|
30
|
+
new_segs = true
|
31
|
+
|
32
|
+
# Iterate until no new segments are found
|
33
|
+
while new_segs
|
34
|
+
new_segs = false
|
35
|
+
segs.each do |seg|
|
36
|
+
@matrix.size.times do |k|
|
37
|
+
next if seg == k
|
38
|
+
v = @matrix[[seg, k].max][[seg, k].min]
|
39
|
+
next if v.empty?
|
40
|
+
edges += v
|
41
|
+
unless segs.include?(k)
|
42
|
+
new_segs = true
|
43
|
+
segs << k
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
# Save as GFA and return
|
50
|
+
o = GFA.new
|
51
|
+
segs.each { |k| o << segments[k] }
|
52
|
+
edges.uniq.each { |k| o << @matrix_edges[k] }
|
53
|
+
o
|
54
|
+
end
|
55
|
+
|
56
|
+
##
|
57
|
+
# Calculates a matrix where all links between segments are represented by the
|
58
|
+
# following variables:
|
59
|
+
#
|
60
|
+
# +@matrix_segment_names+ includes the names of all segments (as String) with
|
61
|
+
# the order indicating the segment index in the matrix
|
62
|
+
#
|
63
|
+
# +@matrix+ is an Array of Arrays of Arrays, where the first index indicates
|
64
|
+
# the row index segment, the second index indicates the column index segment,
|
65
|
+
# and the third index indicates each of the links between those two. Note that
|
66
|
+
# matrix only stores the lower triangle, so the row index must be strictly greater
|
67
|
+
# than the column index. For example, +@matrix[3][1]+ returns an Array of all
|
68
|
+
# index links between the segment with index 3 and the segment with index 1:
|
69
|
+
# ```
|
70
|
+
# [
|
71
|
+
# [ ], # Row 0 is always empty
|
72
|
+
# [[] ], # Row 1 stores connections to column 0
|
73
|
+
# [[], [] ], # Row 2 stores connections to columns 0 and 1
|
74
|
+
# [[], [], [] ], # Row 3 stores connections to columns 0, 1, and 2
|
75
|
+
# ... # &c
|
76
|
+
# ]
|
77
|
+
# ```
|
78
|
+
#
|
79
|
+
# +@matrix_edges+ is an Array of GFA::Record objects representing all edges in
|
80
|
+
# the GFA. The order indicates the index used by the values of +@matrix+
|
81
|
+
def recalculate_matrix
|
82
|
+
@matrix_segment_names = segments.set.map(&:name).map(&:value)
|
83
|
+
@matrix = @matrix_segment_names.size.times.map { |i| Array.new(i) { [] } }
|
84
|
+
@matrix_edges = all_edges
|
85
|
+
@matrix_edges.each_with_index do |edge, k|
|
86
|
+
names = edge.segments(self).map(&:name).map(&:value)
|
87
|
+
indices = names.map { |i| @matrix_segment_names.index(i) }
|
88
|
+
indices.each do |a|
|
89
|
+
indices.each do |b|
|
90
|
+
break if a == b
|
91
|
+
@matrix[[a, b].max][[a, b].min] << k
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
96
|
+
end
|
data/lib/gfa/record/segment.rb
CHANGED
@@ -26,4 +26,10 @@ class GFA::Record::Segment < GFA::Record
|
|
26
26
|
add_field(3, :Z, sequence, /\*|[A-Za-z=.]+/)
|
27
27
|
opt_fields.each { |f| add_opt_field(f, OPT_FIELDS) }
|
28
28
|
end
|
29
|
+
|
30
|
+
##
|
31
|
+
# Returns the length of the sequence represented in this segment
|
32
|
+
def length
|
33
|
+
sequence.value.length
|
34
|
+
end
|
29
35
|
end
|
data/lib/gfa/version.rb
CHANGED
data/lib/gfa.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gfa
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.0
|
4
|
+
version: 0.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Luis M. Rodriguez-R
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-10-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rgl
|
@@ -64,6 +64,10 @@ files:
|
|
64
64
|
- README.md
|
65
65
|
- Rakefile
|
66
66
|
- bin/gfa-add-gaf
|
67
|
+
- bin/gfa-greedy-modules
|
68
|
+
- bin/gfa-mean-depth
|
69
|
+
- bin/gfa-merge
|
70
|
+
- bin/gfa-paths-to-fasta
|
67
71
|
- bin/gfa-subgraph
|
68
72
|
- lib/gfa.rb
|
69
73
|
- lib/gfa/common.rb
|
@@ -77,6 +81,7 @@ files:
|
|
77
81
|
- lib/gfa/field/string.rb
|
78
82
|
- lib/gfa/generator.rb
|
79
83
|
- lib/gfa/graph.rb
|
84
|
+
- lib/gfa/modules.rb
|
80
85
|
- lib/gfa/parser.rb
|
81
86
|
- lib/gfa/record.rb
|
82
87
|
- lib/gfa/record/comment.rb
|