rgfa 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/gfadiff.rb +420 -0
- data/bin/rgfa-findcrisprs.rb +208 -0
- data/bin/rgfa-mergelinear.rb +14 -0
- data/bin/rgfa-simdebruijn.rb +86 -0
- data/lib/rgfa.rb +376 -0
- data/lib/rgfa/byte_array.rb +74 -0
- data/lib/rgfa/cigar.rb +157 -0
- data/lib/rgfa/connectivity.rb +131 -0
- data/lib/rgfa/containments.rb +97 -0
- data/lib/rgfa/error.rb +3 -0
- data/lib/rgfa/field_array.rb +87 -0
- data/lib/rgfa/field_parser.rb +109 -0
- data/lib/rgfa/field_validator.rb +241 -0
- data/lib/rgfa/field_writer.rb +108 -0
- data/lib/rgfa/headers.rb +76 -0
- data/lib/rgfa/line.rb +721 -0
- data/lib/rgfa/line/containment.rb +87 -0
- data/lib/rgfa/line/header.rb +92 -0
- data/lib/rgfa/line/link.rb +379 -0
- data/lib/rgfa/line/path.rb +106 -0
- data/lib/rgfa/line/segment.rb +209 -0
- data/lib/rgfa/linear_paths.rb +285 -0
- data/lib/rgfa/lines.rb +155 -0
- data/lib/rgfa/links.rb +242 -0
- data/lib/rgfa/logger.rb +192 -0
- data/lib/rgfa/multiplication.rb +156 -0
- data/lib/rgfa/numeric_array.rb +196 -0
- data/lib/rgfa/paths.rb +98 -0
- data/lib/rgfa/rgl.rb +194 -0
- data/lib/rgfa/segment_ends_path.rb +9 -0
- data/lib/rgfa/segment_info.rb +162 -0
- data/lib/rgfa/segments.rb +99 -0
- data/lib/rgfa/sequence.rb +65 -0
- data/lib/rgfatools.rb +102 -0
- data/lib/rgfatools/artifacts.rb +29 -0
- data/lib/rgfatools/copy_number.rb +126 -0
- data/lib/rgfatools/invertible_segments.rb +104 -0
- data/lib/rgfatools/linear_paths.rb +140 -0
- data/lib/rgfatools/multiplication.rb +194 -0
- data/lib/rgfatools/p_bubbles.rb +66 -0
- data/lib/rgfatools/superfluous_links.rb +64 -0
- metadata +97 -0
@@ -0,0 +1,194 @@
|
|
1
|
+
#
|
2
|
+
# Methods which edit the graph components without traversal
|
3
|
+
#
|
4
|
+
module RGFATools::Multiplication
|
5
|
+
|
6
|
+
# Allowed values for the links_distribution_policy option
|
7
|
+
LINKS_DISTRIBUTION_POLICY = [:off, :auto, :equal, :E, :B].freeze
|
8
|
+
|
9
|
+
# @overload multiply(segment, factor, copy_names: :lowcase, distribute: :auto, conserve_components: true, origin_tag: :or)
|
10
|
+
# Create multiple copies of a segment.
|
11
|
+
#
|
12
|
+
# Complements the multiply method of gfatools with additional functionality.
|
13
|
+
# These extensions are used only after #enable_extensions is called on the
|
14
|
+
# RGFA object. After that, you may still call the original method
|
15
|
+
# using #multiply_without_rgfatools.
|
16
|
+
#
|
17
|
+
# For more information on the additional functionality, see
|
18
|
+
# #multiply_extended.
|
19
|
+
#
|
20
|
+
# @return [RGFA] self
|
21
|
+
def multiply_with_rgfatools(segment, factor,
                            copy_names: :lowcase,
                            distribute: :auto,
                            conserve_components: true,
                            origin_tag: :or)
  # With the extensions switched on, hand every option over to the
  # extended implementation.
  if @extensions_enabled
    return multiply_extended(segment, factor,
                             copy_names: copy_names,
                             distribute: distribute,
                             conserve_components: conserve_components,
                             origin_tag: origin_tag)
  end
  # Otherwise fall back to the original method; +distribute+ and
  # +origin_tag+ are not passed on in this case.
  multiply_without_rgfatools(segment, factor,
                             copy_names: copy_names,
                             conserve_components: conserve_components)
end
|
38
|
+
|
39
|
+
# Create multiple copies of a segment.
|
40
|
+
#
|
41
|
+
# Complements the multiply method of gfatools with additional functionality.
|
42
|
+
# To always run the additional functionality when multiply is called,
|
43
|
+
# use RGFA#enable_extensions.
|
44
|
+
#
|
45
|
+
# @!macro [new] copynames_text
|
46
|
+
#
|
47
|
+
# <b>Automatic computation of the copy names:</b>
|
48
|
+
#
|
49
|
+
# - First, it is checked if the name of the original segment ends with a
|
50
|
+
# relevant
|
51
|
+
# string, i.e. a lower case letter (for +:lowcase+), an upper case letter
|
52
|
+
# (for +:upcase+), a digit (for +:number+), or the string +"_copy"+
|
53
|
+
# plus one or more optional digits (for +:copy+).
|
54
|
+
# - If so, it is assumed, it was already a copy, and it is not
|
55
|
+
# altered.
|
56
|
+
# - If not, then +a+ (for +:lowcase+), +A+ (for +:upcase+), +1+ (for
|
57
|
+
# +:number+), +_copy+ (for +:copy+) is appended to the string.
|
58
|
+
# - Then, in all
|
59
|
+
# cases, next (*) is called on the string, until a valid, non-existent
|
60
|
+
# name is found for each of the segment copies
|
61
|
+
# - (*) = except for +:copy+, where
|
62
|
+
# for the first copy no digit is present, but for the following is,
|
63
|
+
# i.e. the segment names will be +:copy+, +:copy2+, +:copy3+, etc.
|
64
|
+
# - Can be overridden, by providing an array of copy names.
|
65
|
+
#
|
66
|
+
# @!macro [new] ldp_text
|
67
|
+
#
|
68
|
+
# <b>Links distribution policy</b>
|
69
|
+
#
|
70
|
+
# Depending on the value of the option +distribute+, an end
|
71
|
+
# is eventually selected for distribution of the links.
|
72
|
+
#
|
73
|
+
# - +:off+: no distribution performed
|
74
|
+
# - +:E+: links of the E end are distributed
|
75
|
+
# - +:B+: links of the B end are distributed
|
76
|
+
# - +:equal+: select an end for which the number of links is equal to
|
77
|
+
# +factor+, if any; if both, then the E end is selected
|
78
|
+
# - +:auto+: automatically select E or B, trying to maximize the number of
|
79
|
+
# links which can be deleted
|
80
|
+
#
|
81
|
+
# @param [Integer] factor multiplication factor; if 0, delete the segment;
|
82
|
+
# if 1, do nothing; if > 1, number of copies to create
|
83
|
+
# @!macro [new] segment_param
|
84
|
+
# @param segment [String, RGFA::Line::Segment] segment name or instance
|
85
|
+
# @param [:lowcase, :upcase, :number, :copy, Array<String>] copy_names
|
86
|
+
# <i>(Defaults to: +:lowcase+)</i>
|
87
|
+
# Array of names for the copies of the segment,
|
88
|
+
# or a symbol, which defines a system to compute the names from the name of
|
89
|
+
# the original segment. See "Automatic computation of the copy names".
|
90
|
+
# @!macro [new] conserve_components
|
91
|
+
# @param [Boolean] conserve_components <i>(Defaults to: +true+)</i>
|
92
|
+
# If factor == 0 (i.e. deletion), delete segment only if
|
93
|
+
# #cut_segment?(segment) is +false+ (see RGFA API).
|
94
|
+
# @!macro [new] ldp_param
|
95
|
+
# @param distribute
|
96
|
+
# [RGFATools::Multiplication::LINKS_DISTRIBUTION_POLICY]
|
97
|
+
# <i>(Defaults to: +:auto+)</i>
|
98
|
+
# Determines if and for which end of the segment, links are distributed
|
99
|
+
# among the copies. See "Links distribution policy".
|
100
|
+
# @!macro [new] origin_tag
|
101
|
+
# @param origin_tag [Symbol] <i>(Defaults to: +:or+)</i>
|
102
|
+
# Name of the custom tag to use for storing origin information.
|
103
|
+
#
|
104
|
+
# @return [RGFA] self
|
105
|
+
def multiply_extended(segment, factor,
                      copy_names: :lowcase,
                      distribute: :auto,
                      conserve_components: true,
                      origin_tag: :or)
  seg, seg_name = segment_and_segment_name(segment)
  # Store the segment name under the origin tag, unless the tag already
  # holds a value.
  seg.set(origin_tag, seg_name) unless seg.get(origin_tag)
  names = compute_copy_names(copy_names, seg_name, factor)
  multiply_without_rgfatools(seg_name, factor,
                             copy_names: names,
                             conserve_components: conserve_components)
  # Once the copies exist, the links of the selected end are distributed
  # among the segment and its copies.
  distribute_links(distribute, seg_name, names, factor)
  self
end
|
119
|
+
|
120
|
+
private
|
121
|
+
|
122
|
+
Redefined = [:multiply].freeze
|
123
|
+
|
124
|
+
# Translates a links distribution policy into the end type whose links
# shall be distributed.
#
# @param links_distribution_policy [Symbol] one of
#   RGFATools::Multiplication::LINKS_DISTRIBUTION_POLICY
# @param segment_name name of the segment being multiplied
# @param factor multiplication factor
# @raise [RuntimeError] if the policy is not one of the accepted values
# @return [:B, :E, nil] the selected end type; nil if no distribution
#   shall be performed
def select_distribute_end(links_distribution_policy, segment_name, factor)
  accepted = RGFATools::Multiplication::LINKS_DISTRIBUTION_POLICY
  unless accepted.include?(links_distribution_policy)
    raise "Unknown links distribution policy #{links_distribution_policy}, "+
      "accepted values are: "+
      accepted.inspect
  end
  case links_distribution_policy
  when :off
    nil
  when :B, :E
    # an explicitly requested end is returned as it is
    links_distribution_policy
  else
    # :auto and :equal decide on the number of links of the two ends
    esize, bsize = [:E, :B].map {|et| links_of([segment_name, et]).size}
    auto_select_distribute_end(factor, bsize, esize,
                               links_distribution_policy == :equal)
  end
end
|
140
|
+
|
141
|
+
# (keep separate for testing)
|
142
|
+
# Selects the end type for the links distribution from the number of links
# of the two ends of a segment.
#
# @param factor multiplication factor
# @param bsize number of links of the B end
# @param esize number of links of the E end
# @param equal_only if true, an end is selected only if its number of
#   links equals +factor+
# @return [:B, :E, nil] the selected end type, or nil if none is selected
def auto_select_distribute_end(factor, bsize, esize, equal_only)
  # An end with exactly +factor+ links is selected; E is checked first.
  return :E if esize == factor
  return :B if bsize == factor
  return nil if equal_only
  # Ends with less than two links are not selected.
  if esize < 2
    return nil if bsize < 2
    return :B
  end
  return :E if bsize < 2
  if esize < factor
    return :E if bsize <= esize
    (bsize < factor) ? :B : :E
  elsif bsize < factor
    :B
  else
    # here both ends have more links than +factor+
    (bsize <= esize) ? :B : :E
  end
end
|
162
|
+
|
163
|
+
# Distributes the links of one end of a multiplied segment among the
# segment and its copies, deleting from each of them the links which are
# not assigned to it.
#
# The end is chosen by #select_distribute_end. Each link is identified by
# a signature (the other end of the link, joined into a string). The i-th
# name in +[segment_name] + copy_names+ keeps the links whose signature is
# in the slice +i..i+diff+ of the signatures list, where +diff+ is the
# number of links exceeding +factor+ (or 0).
#
# @param links_distribution_policy [Symbol] see #select_distribute_end
# @param segment_name name of the multiplied segment
# @param copy_names [Array] names of the copies of the segment
# @param factor multiplication factor; nothing is done if lower than 2
# @return [Array, nil] nil if no distribution is performed, otherwise the
#   list of the processed segment names
def distribute_links(links_distribution_policy, segment_name,
                     copy_names, factor)
  return if factor < 2
  end_type = select_distribute_end(links_distribution_policy,
                                   segment_name, factor)
  return nil if end_type.nil?
  et_links = links_of([segment_name, end_type])
  diff = [et_links.size - factor, 0].max
  links_signatures = et_links.map do |l|
    l.other_end([segment_name, end_type]).join
  end
  ([segment_name]+copy_names).each_with_index do |sn, i|
    # the slice depends only on the position of the name, not on the link
    to_save = links_signatures[i..i+diff].to_a
    # Iterate over a snapshot: links are deleted inside the loop, which
    # would skip elements if links_of returned a live collection.
    links_of([sn, end_type]).dup.each do |l|
      l_sig = l.other_end([sn, end_type]).join
      delete_link(l) unless to_save.include?(l_sig)
    end
  end
end
|
182
|
+
|
183
|
+
# Returns both the segment and its name, given either of them.
#
# @param segment_or_segment_name an RGFA::Line instance, or a segment name
#   (anything responding to +to_sym+)
# @return [Array] two elements: the segment and the segment name
def segment_and_segment_name(segment_or_segment_name)
  if segment_or_segment_name.kind_of?(RGFA::Line)
    # a line instance is returned as it is, together with its name
    return segment_or_segment_name, segment_or_segment_name.name
  end
  # otherwise the argument is a name: the segment is looked up by it
  name = segment_or_segment_name.to_sym
  [segment(name), name]
end
|
193
|
+
|
194
|
+
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
#
|
2
|
+
# Methods for the RGFA class, which involve a traversal of the graph following
|
3
|
+
# links
|
4
|
+
#
|
5
|
+
module RGFATools::PBubbles
|
6
|
+
|
7
|
+
require "set"
|
8
|
+
|
9
|
+
# Removes all p-bubbles in the graph
|
10
|
+
# @return [RGFA] self
|
11
|
+
def remove_p_bubbles
  seen = Set.new
  segment_names.each do |sn|
    next if seen.include?(sn)
    # only segments with connectivity [1,1] are examined
    next unless connectivity(sn) == [1,1]
    b_side = neighbours([sn, :B])[0]
    e_side = neighbours([sn, :E])[0]
    from_b = neighbours(b_side).sort
    from_e = neighbours(e_side).sort
    # none of the segments reached from the B side is examined again
    from_b.each {|se| seen << se[0].name}
    # both sides must reach the same segments, at opposite ends
    next unless from_b == from_e.map {|se| se.invert_end_type}
    remove_proven_p_bubble(b_side, e_side, from_b)
  end
  self
end
|
28
|
+
|
29
|
+
# Removes a p-bubble between segment_end1 and segment_end2
|
30
|
+
# @param [RGFA::SegmentEnd] segment_end1 a segment end
|
31
|
+
# @param [RGFA::SegmentEnd] segment_end2 another segment end
|
32
|
+
# @!macro [new] count_tag
|
33
|
+
# @param count_tag [Symbol] <i>(defaults to: +:RC+ or the value set by
|
34
|
+
# {#set_default_count_tag})</i> the count tag to use for coverage
|
35
|
+
# computation
|
36
|
+
# @!macro [new] unit_length
|
37
|
+
# @param unit_length [Integer] <i>(defaults to: 1 or the value set by
|
38
|
+
# {#set_count_unit_length})</i> the unit length to use for coverage
|
39
|
+
# computation
|
40
|
+
# @return [RGFA] self
|
41
|
+
#
|
42
|
+
# @raise [RuntimeError] if the neighbours of the two segment ends are not
#   the opposite ends of the same segments, or if any of these segments
#   has a connectivity different from [1,1]
def remove_p_bubble(segment_end1, segment_end2,
                    count_tag: @default[:count_tag],
                    unit_length: @default[:unit_length])
  n1 = neighbours(segment_end1).sort
  n2 = neighbours(segment_end2).sort
  # both ends must reach the same segments, at opposite ends
  if n1 != n2.map {|se| se.invert_end_type}
    raise "No p-bubble between #{segment_end1.inspect} and "+
      "#{segment_end2.inspect}: the two segment ends are not connected "+
      "to the opposite ends of the same segments"
  end
  # each alternative must be connected to nothing but the two segment ends
  if n1.any? {|se| connectivity(se[0]) != [1,1]}
    raise "No p-bubble between #{segment_end1.inspect} and "+
      "#{segment_end2.inspect}: not all the segments between them "+
      "have connectivity [1,1]"
  end
  remove_proven_p_bubble(segment_end1, segment_end2, n1,
                         count_tag: count_tag,
                         unit_length: unit_length)
  self
end
|
54
|
+
|
55
|
+
private
|
56
|
+
|
57
|
+
# Deletes all alternative segments of a p-bubble, except the one with the
# highest coverage (the first one, if several share the highest value).
#
# The list passed as +alternatives+ is not modified. Nothing is done if it
# is empty.
#
# @param segment_end1 first segment end of the bubble (not used here)
# @param segment_end2 second segment end of the bubble (not used here)
# @param alternatives [Array] segment ends of the alternative segments
# @param count_tag passed to the coverage method of the segments
# @param unit_length passed to the coverage method of the segments
# @return [Array] the segment ends whose segments were deleted
def remove_proven_p_bubble(segment_end1, segment_end2, alternatives,
                           count_tag: @default[:count_tag],
                           unit_length: @default[:unit_length])
  # without alternatives there is no maximum coverage to look for
  return [] if alternatives.empty?
  coverages = alternatives.map do |se|
    segment!(se[0]).coverage(count_tag: count_tag, unit_length: unit_length)
  end
  # work on a copy, so that the list of the caller is left untouched
  to_delete = alternatives.dup
  to_delete.delete_at(coverages.index(coverages.max))
  to_delete.each {|se| delete_segment(se[0])}
end
|
65
|
+
|
66
|
+
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
#
|
2
|
+
# Methods which edit the graph components without traversal
|
3
|
+
#
|
4
|
+
module RGFATools::SuperfluousLinks
|
5
|
+
|
6
|
+
# Remove superfluous links in the presence of mandatory links
|
7
|
+
# for a single segment
|
8
|
+
# @return [RGFA] self
|
9
|
+
# @!macro segment_param
|
10
|
+
# @!macro [new] conserve_components_links
|
11
|
+
# @param [Boolean] conserve_components <i>(Defaults to: +true+)</i>
|
12
|
+
# delete links only if #cut_link?(link) is +false+ (see RGFA API).
|
13
|
+
def enforce_segment_mandatory_links(segment, conserve_components: true)
  sn = segment_and_segment_name(segment)[1]
  # segment ends and their links, by end type
  se = {}
  l = {}
  [:B, :E].each do |et|
    se[et] = [sn, et]
    l[et] = links_of(se[et])
  end
  cs = connectivity_symbols(l[:B].size, l[:E].size)
  if cs == [1, 1]
    # a single link on each end
    oe = {}
    [:B, :E].each {|et| oe[et] = l[et][0].other_end(se[et])}
    # nothing is deleted if both links lead to the same segment end;
    # self is returned in this case too, as documented
    return self if oe[:B] == oe[:E]
    [:B, :E].each {|et| delete_other_links(oe[et], se[et],
                             conserve_components: conserve_components)}
  else
    # at most one end has a single link
    i = cs.index(1)
    # no end has a single link: nothing to do, self is still returned
    return self if i.nil?
    et = [:B, :E][i]
    oe = l[et][0].other_end(se[et])
    delete_other_links(oe, se[et], conserve_components: conserve_components)
  end
  self
end
|
37
|
+
|
38
|
+
# Remove superfluous links in the presence of mandatory links
|
39
|
+
# in the entire graph
|
40
|
+
# @!macro conserve_components_links
|
41
|
+
# @return [RGFA] self
|
42
|
+
def enforce_all_mandatory_links(conserve_components: true)
  # apply the single-segment method to each segment of the graph
  segment_names.each do |name|
    enforce_segment_mandatory_links(name,
                                    conserve_components: conserve_components)
  end
  self
end
|
47
|
+
|
48
|
+
# Remove links of segment to itself
|
49
|
+
# @!macro segment_param
|
50
|
+
# @return [RGFA] self
|
51
|
+
def remove_self_link(segment)
  # accept either a line instance or a segment name
  name = segment
  name = segment.name if segment.kind_of?(RGFA::Line)
  # the segment is both the source and the target of the removed links
  unconnect_segments(name, name)
  self
end
|
56
|
+
|
57
|
+
# Remove all links of segments to themselves
|
58
|
+
# @return [RGFA] self
|
59
|
+
def remove_self_links
  # apply the single-segment method to each segment of the graph
  segment_names.each do |name|
    remove_self_link(name)
  end
  self
end
|
63
|
+
|
64
|
+
end
|
metadata
ADDED
@@ -0,0 +1,97 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: rgfa
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.2.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Giorgio Gonnella
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2016-09-21 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: |2
|
14
|
+
The Graphical Fragment Assembly (GFA) is a proposed format which allow
|
15
|
+
to describe the product of sequence assembly.
|
16
|
+
This gem implements the proposed specifications for the GFA format
|
17
|
+
described under https://github.com/pmelsted/GFA-spec/blob/master/GFA-spec.md
|
18
|
+
as close as possible.
|
19
|
+
The library allows to create an RGFA object from a file in the GFA format
|
20
|
+
or from scratch, to enumerate the graph elements (segments, links,
|
21
|
+
containments, paths and header lines), to traverse the graph (by
|
22
|
+
traversing all links outgoing from or incoming to a segment), to search for
|
23
|
+
elements (e.g. which links connect two segments) and to manipulate the
|
24
|
+
graph (e.g. to eliminate a link or a segment or to duplicate a segment
|
25
|
+
distributing the read counts evenly on the copies).
|
26
|
+
email: gonnella@zbh.uni-hamburg.de
|
27
|
+
executables: []
|
28
|
+
extensions: []
|
29
|
+
extra_rdoc_files: []
|
30
|
+
files:
|
31
|
+
- lib/rgfa.rb
|
32
|
+
- lib/rgfa/byte_array.rb
|
33
|
+
- lib/rgfa/cigar.rb
|
34
|
+
- lib/rgfa/connectivity.rb
|
35
|
+
- lib/rgfa/containments.rb
|
36
|
+
- lib/rgfa/error.rb
|
37
|
+
- lib/rgfa/field_array.rb
|
38
|
+
- lib/rgfa/field_writer.rb
|
39
|
+
- lib/rgfa/field_parser.rb
|
40
|
+
- lib/rgfa/field_validator.rb
|
41
|
+
- lib/rgfa/headers.rb
|
42
|
+
- lib/rgfa/line/containment.rb
|
43
|
+
- lib/rgfa/line/header.rb
|
44
|
+
- lib/rgfa/line/link.rb
|
45
|
+
- lib/rgfa/line/path.rb
|
46
|
+
- lib/rgfa/line/segment.rb
|
47
|
+
- lib/rgfa/line.rb
|
48
|
+
- lib/rgfa/linear_paths.rb
|
49
|
+
- lib/rgfa/lines.rb
|
50
|
+
- lib/rgfa/links.rb
|
51
|
+
- lib/rgfa/logger.rb
|
52
|
+
- lib/rgfa/multiplication.rb
|
53
|
+
- lib/rgfa/numeric_array.rb
|
54
|
+
- lib/rgfa/paths.rb
|
55
|
+
- lib/rgfa/rgl.rb
|
56
|
+
- lib/rgfa/segment_ends_path.rb
|
57
|
+
- lib/rgfa/segment_info.rb
|
58
|
+
- lib/rgfa/segments.rb
|
59
|
+
- lib/rgfa/sequence.rb
|
60
|
+
- lib/rgfatools.rb
|
61
|
+
- lib/rgfatools/artifacts.rb
|
62
|
+
- lib/rgfatools/copy_number.rb
|
63
|
+
- lib/rgfatools/invertible_segments.rb
|
64
|
+
- lib/rgfatools/multiplication.rb
|
65
|
+
- lib/rgfatools/superfluous_links.rb
|
66
|
+
- lib/rgfatools/linear_paths.rb
|
67
|
+
- lib/rgfatools/p_bubbles.rb
|
68
|
+
- bin/gfadiff.rb
|
69
|
+
- bin/rgfa-mergelinear.rb
|
70
|
+
- bin/rgfa-simdebruijn.rb
|
71
|
+
- bin/rgfa-findcrisprs.rb
|
72
|
+
homepage: http://github.com/ggonnella/rgfa
|
73
|
+
licenses:
|
74
|
+
- CC-BY-SA
|
75
|
+
metadata: {}
|
76
|
+
post_install_message:
|
77
|
+
rdoc_options: []
|
78
|
+
require_paths:
|
79
|
+
- lib
|
80
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
81
|
+
requirements:
|
82
|
+
- - ">="
|
83
|
+
- !ruby/object:Gem::Version
|
84
|
+
version: '2.0'
|
85
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
requirements: []
|
91
|
+
rubyforge_project:
|
92
|
+
rubygems_version: 2.0.3
|
93
|
+
signing_key:
|
94
|
+
specification_version: 4
|
95
|
+
summary: Parse, edit and write GFA-format graphs in Ruby
|
96
|
+
test_files: []
|
97
|
+
has_rdoc:
|