gfa 0.4.0 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/gfa-greedy-modules +50 -0
- data/lib/gfa/common.rb +6 -0
- data/lib/gfa/graph.rb +3 -3
- data/lib/gfa/modules.rb +96 -0
- data/lib/gfa/record/segment.rb +6 -0
- data/lib/gfa/record_set/segment_set.rb +6 -0
- data/lib/gfa/version.rb +1 -1
- data/lib/gfa.rb +1 -0
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9eaaaf8bcac372e7d6b63d96aee94493faa51d78f5cc2b7edb525c3fbd7efaa2
|
4
|
+
data.tar.gz: 950e9ffa9cd07cdea8ef5cc80b6c162bf009ff3bef5ab757dcda51c443925043
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 92e53b9d6c09b1814f3feda86b67dcea6192bd935e3c04dcca7b6ee9c0e7eed919c711941985313c2918f02772f3598883f037638e6e1b7cc9cf8897fec80caf
|
7
|
+
data.tar.gz: 4c14ce2bd91582b7b81bc6123443bd5b0cffe8c83550fb85dc48c8da030d35dafaf393506d537c5df7550b25b87ab10003a938c407bd6210983e5f8841552317
|
@@ -0,0 +1,50 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# @package MiGA
|
4
|
+
# @license Artistic-2.0
|
5
|
+
|
6
|
+
$LOAD_PATH.push File.expand_path('../../lib', __FILE__)
|
7
|
+
$LOAD_PATH.push File.expand_path('../../lib', File.realpath(__FILE__))
|
8
|
+
|
9
|
+
require 'gfa'
|
10
|
+
|
11
|
+
input, base, min_len, threads = ARGV
|
12
|
+
|
13
|
+
unless base
|
14
|
+
$stderr.puts <<~HELP
|
15
|
+
Split a GFA into multiple independent GFA files that have no links between
|
16
|
+
them by greedily identifying all individual modules in the graph
|
17
|
+
|
18
|
+
gfa-greedy-modules <input> <base> [<min_len> [<threads>]]
|
19
|
+
|
20
|
+
<input> Input GFA file to read
|
21
|
+
<base> Prefix of the output GFA files to write
|
22
|
+
<min_len> Minimum length (in bp) to report a module
|
23
|
+
By default: 0 (all modules are reported)
|
24
|
+
<threads> If passed, parallelize process with these many threads
|
25
|
+
HELP
|
26
|
+
exit(1)
|
27
|
+
end
|
28
|
+
|
29
|
+
$stderr.puts "Loading GFA: #{input}"
|
30
|
+
gfa = GFA.load_parallel(input, (threads || 1).to_i)
|
31
|
+
|
32
|
+
$stderr.puts 'Splitting graph into modules'
|
33
|
+
gfas = gfa.split_modules
|
34
|
+
|
35
|
+
min_len = min_len.to_i
|
36
|
+
if min_len > 0
|
37
|
+
$stderr.puts 'Filtering out small modules'
|
38
|
+
gfas.select! { |gfa| gfa.total_length > min_len }
|
39
|
+
end
|
40
|
+
|
41
|
+
if gfas.empty?
|
42
|
+
$stderr.puts "No modules found"
|
43
|
+
else
|
44
|
+
$stderr.puts "Saving #{gfas.size} GFA files: #{base}.*"
|
45
|
+
int_len = Math.log10(gfas.size).ceil
|
46
|
+
gfas.each_with_index do |gfa, k|
|
47
|
+
gfa.save("%s.%0#{int_len}i.gfa" % [base, k])
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
data/lib/gfa/common.rb
CHANGED
data/lib/gfa/graph.rb
CHANGED
@@ -110,9 +110,9 @@ class GFA
|
|
110
110
|
# Links, Containments, Jumps (from, to) and Paths (segment_names)
|
111
111
|
linking = []
|
112
112
|
eval_set = _ignore == 0 ? segments.set : segments.set[_ignore..]
|
113
|
-
edges.delete_if do |
|
114
|
-
if eval_set.any? { |segment|
|
115
|
-
linking <<
|
113
|
+
edges.delete_if do |edge|
|
114
|
+
if eval_set.any? { |segment| edge.include? segment }
|
115
|
+
linking << edge
|
116
116
|
true # Remove from the edge set to speed up future recursions
|
117
117
|
else
|
118
118
|
false # Keep it, possibly linking future recursions
|
data/lib/gfa/modules.rb
ADDED
@@ -0,0 +1,96 @@
|
|
1
|
+
|
2
|
+
class GFA
|
3
|
+
##
|
4
|
+
# Find all independent modules by greedily crawling the linking entries for
|
5
|
+
# each segment, and returns an Array of GFA objects containing each individual
|
6
|
+
# module. If +recalculate+ is false, it trusts the current calculated
|
7
|
+
# matrix unless none exists
|
8
|
+
def split_modules(recalculate = true)
|
9
|
+
recalculate_matrix if recalculate || @matrix.nil?
|
10
|
+
missing_segments = (0 .. @matrix_segment_names.size - 1).to_a
|
11
|
+
modules = []
|
12
|
+
until missing_segments.empty?
|
13
|
+
mod = matrix_find_module(missing_segments[0])
|
14
|
+
mod.segments.set.map(&:name).map(&:value).each do |name|
|
15
|
+
missing_segments.delete(@matrix_segment_names.index(name))
|
16
|
+
end
|
17
|
+
modules << mod
|
18
|
+
end
|
19
|
+
modules
|
20
|
+
end
|
21
|
+
|
22
|
+
##
|
23
|
+
# Finds the entire module containing the segment with index +segment_index+
|
24
|
+
# in the matrix (requires calling +recalculate_matrix+ first!). Returns the
|
25
|
+
# module as a new GFA
|
26
|
+
def matrix_find_module(segment_index)
|
27
|
+
# Initialize
|
28
|
+
segs = [segment_index]
|
29
|
+
edges = []
|
30
|
+
new_segs = true
|
31
|
+
|
32
|
+
# Iterate until no new segments are found
|
33
|
+
while new_segs
|
34
|
+
new_segs = false
|
35
|
+
segs.each do |seg|
|
36
|
+
@matrix.size.times do |k|
|
37
|
+
next if seg == k
|
38
|
+
v = @matrix[[seg, k].max][[seg, k].min]
|
39
|
+
next if v.empty?
|
40
|
+
edges += v
|
41
|
+
unless segs.include?(k)
|
42
|
+
new_segs = true
|
43
|
+
segs << k
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
# Save as GFA and return
|
50
|
+
o = GFA.new
|
51
|
+
segs.each { |k| o << segments[k] }
|
52
|
+
edges.uniq.each { |k| o << @matrix_edges[k] }
|
53
|
+
o
|
54
|
+
end
|
55
|
+
|
56
|
+
##
|
57
|
+
# Calculates a matrix where all links between segments are represented by the
|
58
|
+
# following variables:
|
59
|
+
#
|
60
|
+
# +@matrix_segment_names+ includes the names of all segments (as String) with
|
61
|
+
# the order indicating the segment index in the matrix
|
62
|
+
#
|
63
|
+
# +@matrix+ is an Array of Arrays of Arrays, where the first index indicates
|
64
|
+
# the row index segment, the second index indicates the column index segment,
|
65
|
+
# and the third index indicates each of the links between those two. Note that
|
66
|
+
# matrix only stores the lower triangle, so the column index must be strictly less
|
67
|
+
# than the row index. For example, +@matrix[3][1]+ returns an Array of all
|
68
|
+
# index links between the segment with index 3 and the segment with index 1:
|
69
|
+
# ```
|
70
|
+
# [
|
71
|
+
# [ ], # Row 0 is always empty
|
72
|
+
# [[] ], # Row 1 stores connections to column 0
|
73
|
+
# [[], [] ], # Row 2 stores connections to columns 0 and 1
|
74
|
+
# [[], [], [] ], # Row 3 stores connections to columns 0, 1, and 2
|
75
|
+
# ... # &c
|
76
|
+
# ]
|
77
|
+
# ```
|
78
|
+
#
|
79
|
+
# +@matrix_edges+ is an Array of GFA::Record objects representing all edges in
|
80
|
+
# the GFA. The order indicates the index used by the values of +@matrix+
|
81
|
+
def recalculate_matrix
|
82
|
+
@matrix_segment_names = segments.set.map(&:name).map(&:value)
|
83
|
+
@matrix = @matrix_segment_names.size.times.map { |i| Array.new(i) { [] } }
|
84
|
+
@matrix_edges = all_edges
|
85
|
+
@matrix_edges.each_with_index do |edge, k|
|
86
|
+
names = edge.segments(self).map(&:name).map(&:value)
|
87
|
+
indices = names.map { |i| @matrix_segment_names.index(i) }
|
88
|
+
indices.each do |a|
|
89
|
+
indices.each do |b|
|
90
|
+
break if a == b
|
91
|
+
@matrix[[a, b].max][[a, b].min] << k
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
96
|
+
end
|
data/lib/gfa/record/segment.rb
CHANGED
@@ -26,4 +26,10 @@ class GFA::Record::Segment < GFA::Record
|
|
26
26
|
add_field(3, :Z, sequence, /\*|[A-Za-z=.]+/)
|
27
27
|
opt_fields.each { |f| add_opt_field(f, OPT_FIELDS) }
|
28
28
|
end
|
29
|
+
|
30
|
+
##
|
31
|
+
# Returns the length of the sequence represented in this segment
|
32
|
+
def length
|
33
|
+
sequence.value.length
|
34
|
+
end
|
29
35
|
end
|
data/lib/gfa/version.rb
CHANGED
data/lib/gfa.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gfa
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.0
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Luis M. Rodriguez-R
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-03-
|
11
|
+
date: 2023-03-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rgl
|
@@ -64,6 +64,7 @@ files:
|
|
64
64
|
- README.md
|
65
65
|
- Rakefile
|
66
66
|
- bin/gfa-add-gaf
|
67
|
+
- bin/gfa-greedy-modules
|
67
68
|
- bin/gfa-subgraph
|
68
69
|
- lib/gfa.rb
|
69
70
|
- lib/gfa/common.rb
|
@@ -77,6 +78,7 @@ files:
|
|
77
78
|
- lib/gfa/field/string.rb
|
78
79
|
- lib/gfa/generator.rb
|
79
80
|
- lib/gfa/graph.rb
|
81
|
+
- lib/gfa/modules.rb
|
80
82
|
- lib/gfa/parser.rb
|
81
83
|
- lib/gfa/record.rb
|
82
84
|
- lib/gfa/record/comment.rb
|