rgfa 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. checksums.yaml +7 -0
  2. data/bin/gfadiff.rb +420 -0
  3. data/bin/rgfa-findcrisprs.rb +208 -0
  4. data/bin/rgfa-mergelinear.rb +14 -0
  5. data/bin/rgfa-simdebruijn.rb +86 -0
  6. data/lib/rgfa.rb +376 -0
  7. data/lib/rgfa/byte_array.rb +74 -0
  8. data/lib/rgfa/cigar.rb +157 -0
  9. data/lib/rgfa/connectivity.rb +131 -0
  10. data/lib/rgfa/containments.rb +97 -0
  11. data/lib/rgfa/error.rb +3 -0
  12. data/lib/rgfa/field_array.rb +87 -0
  13. data/lib/rgfa/field_parser.rb +109 -0
  14. data/lib/rgfa/field_validator.rb +241 -0
  15. data/lib/rgfa/field_writer.rb +108 -0
  16. data/lib/rgfa/headers.rb +76 -0
  17. data/lib/rgfa/line.rb +721 -0
  18. data/lib/rgfa/line/containment.rb +87 -0
  19. data/lib/rgfa/line/header.rb +92 -0
  20. data/lib/rgfa/line/link.rb +379 -0
  21. data/lib/rgfa/line/path.rb +106 -0
  22. data/lib/rgfa/line/segment.rb +209 -0
  23. data/lib/rgfa/linear_paths.rb +285 -0
  24. data/lib/rgfa/lines.rb +155 -0
  25. data/lib/rgfa/links.rb +242 -0
  26. data/lib/rgfa/logger.rb +192 -0
  27. data/lib/rgfa/multiplication.rb +156 -0
  28. data/lib/rgfa/numeric_array.rb +196 -0
  29. data/lib/rgfa/paths.rb +98 -0
  30. data/lib/rgfa/rgl.rb +194 -0
  31. data/lib/rgfa/segment_ends_path.rb +9 -0
  32. data/lib/rgfa/segment_info.rb +162 -0
  33. data/lib/rgfa/segments.rb +99 -0
  34. data/lib/rgfa/sequence.rb +65 -0
  35. data/lib/rgfatools.rb +102 -0
  36. data/lib/rgfatools/artifacts.rb +29 -0
  37. data/lib/rgfatools/copy_number.rb +126 -0
  38. data/lib/rgfatools/invertible_segments.rb +104 -0
  39. data/lib/rgfatools/linear_paths.rb +140 -0
  40. data/lib/rgfatools/multiplication.rb +194 -0
  41. data/lib/rgfatools/p_bubbles.rb +66 -0
  42. data/lib/rgfatools/superfluous_links.rb +64 -0
  43. metadata +97 -0
@@ -0,0 +1,194 @@
1
+ #
2
+ # Methods which edit the graph components without traversal
3
+ #
4
+ module RGFATools::Multiplication
5
+
6
+ # Allowed values for the links_distribution_policy option
7
+ LINKS_DISTRIBUTION_POLICY = [:off, :auto, :equal, :E, :B]
8
+
9
+ # @overload multiply(segment, factor, copy_names: :lowcase, distribute: :auto, conserve_components: true, origin_tag: :or)
10
+ # Create multiple copies of a segment.
11
+ #
12
+ # Complements the multiply method of gfatools with additional functionality.
13
+ # These extensions are used only after #enable_extensions is called on the
14
+ # RGFA object. After that, you may still call the original method
15
+ # using #multiply_without_rgfatools.
16
+ #
17
+ # For more information on the additional functionality, see
18
+ # #multiply_extended.
19
+ #
20
+ # @return [RGFA] self
21
def multiply_with_rgfatools(segment, factor,
                            copy_names: :lowcase,
                            distribute: :auto,
                            conserve_components: true,
                            origin_tag: :or)
  # With the extensions switched on, hand everything over to the extended
  # implementation, which also receives the distribute and origin_tag options.
  if @extensions_enabled
    return multiply_extended(segment, factor,
                             copy_names: copy_names,
                             distribute: distribute,
                             conserve_components: conserve_components,
                             origin_tag: origin_tag)
  end
  # Extensions off: call the original method; the distribute and origin_tag
  # options are not forwarded in this case.
  multiply_without_rgfatools(segment, factor,
                             copy_names: copy_names,
                             conserve_components: conserve_components)
end
38
+
39
+ # Create multiple copies of a segment.
40
+ #
41
+ # Complements the multiply method of gfatools with additional functionality.
42
+ # To always run the additional functionality when multiply is called,
43
+ # use RGFA#enable_extensions.
44
+ #
45
+ # @!macro [new] copynames_text
46
+ #
47
+ # <b>Automatic computation of the copy names:</b>
48
+ #
49
+ # - First, it is checked if the name of the original segment ends with a
50
+ # relevant
51
+ # string, i.e. a lower case letter (for +:lowcase+), an upper case letter
52
+ # (for +:upcase+), a digit (for +:number+), or the string +"_copy"+
53
+ # plus one or more optional digits (for +:copy+).
54
+ # - If so, it is assumed, it was already a copy, and it is not
55
+ # altered.
56
+ # - If not, then +a+ (for +:lowcase+), +A+ (for +:upcase+), +1+ (for
57
+ # +:number+), +_copy+ (for +:copy+) is appended to the string.
58
+ # - Then, in all
59
+ # cases, next (*) is called on the string, until a valid, non-existent
60
+ # name is found for each of the segment copies
61
+ # - (*) = except for +:copy+, where
62
+ # for the first copy no digit is present, but for the following is,
63
+ # i.e. the segment names will end with +_copy+, +_copy2+, +_copy3+, etc.
64
+ # - Can be overridden, by providing an array of copy names.
65
+ #
66
+ # @!macro [new] ldp_text
67
+ #
68
+ # <b>Links distribution policy</b>
69
+ #
70
+ # Depending on the value of the option +distribute+, an end
71
+ # is eventually selected for distribution of the links.
72
+ #
73
+ # - +:off+: no distribution performed
74
+ # - +:E+: links of the E end are distributed
75
+ # - +:B+: links of the B end are distributed
76
+ # - +:equal+: select an end for which the number of links is equal to
77
+ # +factor+, if any; if both, then the E end is selected
78
+ # - +:auto+: automatically select E or B, trying to maximize the number of
79
+ # links which can be deleted
80
+ #
81
+ # @param [Integer] factor multiplication factor; if 0, delete the segment;
82
+ # if 1, do nothing; if > 1, number of copies to create
83
+ # @!macro [new] segment_param
84
+ # @param segment [String, RGFA::Line::Segment] segment name or instance
85
+ # @param [:lowcase, :upcase, :number, :copy, Array<String>] copy_names
86
+ # <i>(Defaults to: +:lowcase+)</i>
87
+ # Array of names for the copies of the segment,
88
+ # or a symbol, which defines a system to compute the names from the name of
89
+ # the original segment. See "Automatic computation of the copy names".
90
+ # @!macro [new] conserve_components
91
+ # @param [Boolean] conserve_components <i>(Defaults to: +true+)</i>
92
+ # If factor == 0 (i.e. deletion), delete segment only if
93
+ # #cut_segment?(segment) is +false+ (see RGFA API).
94
+ # @!macro [new] ldp_param
95
+ # @param distribute
96
+ # [RGFATools::Multiplication::LINKS_DISTRIBUTION_POLICY]
97
+ # <i>(Defaults to: +:auto+)</i>
98
+ # Determines if and for which end of the segment, links are distributed
99
+ # among the copies. See "Links distribution policy".
100
+ # @!macro [new] origin_tag
101
+ # @param origin_tag [Symbol] <i>(Defaults to: +:or+)</i>
102
+ # Name of the custom tag to use for storing origin information.
103
+ #
104
+ # @return [RGFA] self
105
def multiply_extended(segment, factor,
                      copy_names: :lowcase,
                      distribute: :auto,
                      conserve_components: true,
                      origin_tag: :or)
  seg, seg_name = segment_and_segment_name(segment)
  # Record the segment's own name as its origin, unless an origin is
  # already stored under the given tag.
  seg.set(origin_tag, seg_name) unless seg.get(origin_tag)
  # Resolve the names of the copies, then let the original method do the
  # actual multiplication using those names.
  names = compute_copy_names(copy_names, seg_name, factor)
  multiply_without_rgfatools(seg_name, factor,
                             copy_names: names,
                             conserve_components: conserve_components)
  # Finally spread the links of one end over the segment and its copies,
  # according to the requested policy.
  distribute_links(distribute, seg_name, names, factor)
  self
end
119
+
120
+ private
121
+
122
+ Redefined = [:multiply]
123
+
124
# Determines which end (:B or :E) of the segment shall have its links
# distributed, given the links distribution policy; nil means that no
# distribution takes place. Raises a RuntimeError for an unknown policy.
def select_distribute_end(links_distribution_policy, segment_name, factor)
  accepted = RGFATools::Multiplication::LINKS_DISTRIBUTION_POLICY
  unless accepted.include?(links_distribution_policy)
    raise "Unknown links distribution policy #{links_distribution_policy}, " \
          "accepted values are: #{accepted.inspect}"
  end
  case links_distribution_policy
  when :off
    nil
  when :B, :E
    # An explicitly requested end is used as it is.
    links_distribution_policy
  else
    # :auto or :equal: decide from the number of links on each end.
    e_count = links_of([segment_name, :E]).size
    b_count = links_of([segment_name, :B]).size
    auto_select_distribute_end(factor, b_count, e_count,
                               links_distribution_policy == :equal)
  end
end
140
+
141
+ # (keep separate for testing)
142
# Chooses the end whose links are distributed, from the number of links on
# the B end (bsize) and on the E end (esize) and the multiplication factor.
# Returns :E, :B, or nil if no end is selected. If equal_only is set, an
# end is selected only if its number of links equals the factor.
def auto_select_distribute_end(factor, bsize, esize, equal_only)
  # An end with exactly +factor+ links wins; E has precedence over B.
  return :E if esize == factor
  return :B if bsize == factor
  return nil if equal_only
  # Ends with fewer than two links are not candidates.
  return (bsize < 2 ? nil : :B) if esize < 2
  return :E if bsize < 2
  if esize < factor
    return :E if bsize <= esize
    return bsize < factor ? :B : :E
  end
  return :B if bsize < factor
  # Both ends have more links than the factor.
  bsize <= esize ? :B : :E
end
162
+
163
# Distributes the links of one end of a multiplied segment among the
# segment itself and its copies, deleting the links which are not assigned
# to each of them.
#
# The end is chosen by #select_distribute_end; nothing is done if the factor
# is lower than 2 or if no end is selected (nil is returned in both cases).
#
# The i-th element of +[segment_name] + copy_names+ keeps the links whose
# other end is one of the signatures at positions i..i+diff, where diff is
# the number of links in excess of the factor (0 if there is none).
def distribute_links(links_distribution_policy, segment_name,
                     copy_names, factor)
  return if factor < 2
  end_type = select_distribute_end(links_distribution_policy,
                                   segment_name, factor)
  return nil if end_type.nil?
  et_links = links_of([segment_name, end_type])
  diff = [et_links.size - factor, 0].max
  # A link is identified by the segment end it reaches on the other side.
  links_signatures = et_links.map do |l|
    l.other_end([segment_name, end_type]).join
  end
  ([segment_name] + copy_names).each_with_index do |sn, i|
    # The signatures to keep depend only on the copy index, so compute them
    # once per copy; the slice is nil (hence []) beyond the last signature.
    to_save = links_signatures[i..i + diff].to_a
    # Iterate over a snapshot: links are deleted inside the loop, and
    # whether links_of returns a live collection is not visible from here.
    links_of([sn, end_type]).dup.each do |l|
      l_sig = l.other_end([sn, end_type]).join
      delete_link(l) unless to_save.include?(l_sig)
    end
  end
end
182
+
183
# Given either a segment line or a segment name, returns both: the segment
# (looked up via #segment if a name was given) and its name (converted with
# #to_sym if a name was given).
def segment_and_segment_name(segment_or_segment_name)
  unless segment_or_segment_name.kind_of?(RGFA::Line)
    name = segment_or_segment_name.to_sym
    return segment(name), name
  end
  return segment_or_segment_name, segment_or_segment_name.name
end
193
+
194
+ end
@@ -0,0 +1,66 @@
1
+ #
2
+ # Methods for the RGFA class, which involve a traversal of the graph following
3
+ # links
4
+ #
5
+ module RGFATools::PBubbles
6
+
7
+ require "set"
8
+
9
+ # Removes all p-bubbles in the graph
10
+ # @return [RGFA] self
11
def remove_p_bubbles
  visited = Set.new
  segment_names.each do |sn|
    next if visited.include?(sn)
    # Only a segment with connectivity [1,1] can be a branch of a p-bubble.
    next unless connectivity(sn) == [1, 1]
    s1 = neighbours([sn, :B])[0]
    s2 = neighbours([sn, :E])[0]
    n1 = neighbours(s1).sort
    n2 = neighbours(s2).sort
    # All segments reached from s1 are handled together with sn.
    n1.each { |se| visited << se[0].name }
    # The same segments must be reached from s1 and s2, by opposite ends.
    next unless n1 == n2.map { |se| se.invert_end_type }
    # Apply the same precondition as #remove_p_bubble: every alternative
    # must have connectivity [1,1]; otherwise deleting it would also remove
    # links to segments outside of the bubble.
    next unless n1.all? { |se| connectivity(se[0]) == [1, 1] }
    remove_proven_p_bubble(s1, s2, n1)
  end
  self
end
28
+
29
+ # Removes a p-bubble between segment_end1 and segment_end2
30
+ # @param [RGFA::SegmentEnd] segment_end1 a segment end
31
+ # @param [RGFA::SegmentEnd] segment_end2 another segment end
32
+ # @!macro [new] count_tag
33
+ # @param count_tag [Symbol] <i>(defaults to: +:RC+ or the value set by
34
+ # {#set_default_count_tag})</i> the count tag to use for coverage
35
+ # computation
36
+ # @!macro [new] unit_length
37
+ # @param unit_length [Integer] <i>(defaults to: 1 or the value set by
38
+ # {#set_count_unit_length})</i> the unit length to use for coverage
39
+ # computation
40
+ # @return [RGFA] self
41
+ #
42
def remove_p_bubble(segment_end1, segment_end2,
                    count_tag: @default[:count_tag],
                    unit_length: @default[:unit_length])
  n1 = neighbours(segment_end1).sort
  n2 = neighbours(segment_end2).sort
  # Both segment ends must reach the same segments, by opposite ends.
  # A RuntimeError with an explicit message is raised, as a bare raise would
  # report "unhandled exception" (or re-raise a currently handled error).
  if n1 != n2.map { |se| se.invert_end_type }
    raise "No p-bubble between #{segment_end1.inspect} and " \
          "#{segment_end2.inspect}: the segment ends have different " \
          "neighbours"
  end
  # Each alternative must be connected to the two segment ends only.
  if n1.any? { |se| connectivity(se[0]) != [1, 1] }
    raise "No p-bubble between #{segment_end1.inspect} and " \
          "#{segment_end2.inspect}: the connectivity of an alternative " \
          "is not [1, 1]"
  end
  remove_proven_p_bubble(segment_end1, segment_end2, n1,
                         count_tag: count_tag,
                         unit_length: unit_length)
  self
end
54
+
55
+ private
56
+
57
# Resolves a p-bubble by deleting all alternatives except the one with the
# highest coverage (the first one, if several share the highest coverage).
#
# The two segment ends are accepted for symmetry with #remove_p_bubble but
# are not used here. The coverage is computed by calling #coverage on the
# segment of each alternative, passing count_tag and unit_length.
# The +alternatives+ array passed by the caller is not modified; nothing is
# done if it is empty.
def remove_proven_p_bubble(segment_end1, segment_end2, alternatives,
                           count_tag: @default[:count_tag],
                           unit_length: @default[:unit_length])
  return alternatives if alternatives.empty?
  coverages = alternatives.map do |se|
    segment!(se[0]).coverage(count_tag: count_tag, unit_length: unit_length)
  end
  # Array#index returns the first position of the maximum.
  keep = coverages.index(coverages.max)
  alternatives.each_with_index do |se, i|
    delete_segment(se[0]) unless i == keep
  end
end
65
+
66
+ end
@@ -0,0 +1,64 @@
1
+ #
2
+ # Methods which edit the graph components without traversal
3
+ #
4
+ module RGFATools::SuperfluousLinks
5
+
6
+ # Remove superfluous links in the presence of mandatory links
7
+ # for a single segment
8
+ # @return [RGFA] self
9
+ # @!macro segment_param
10
+ # @!macro [new] conserve_components_links
11
+ # @param [Boolean] conserve_components <i>(Defaults to: +true+)</i>
12
+ # delete links only if #cut_link?(link) is +false+ (see RGFA API).
13
def enforce_segment_mandatory_links(segment, conserve_components: true)
  sn = segment_and_segment_name(segment)[1]
  se = {}
  l = {}
  # Collect the segment end and its links, for both ends of the segment.
  [:B, :E].each do |et|
    se[et] = [sn, et]
    l[et] = links_of(se[et])
  end
  cs = connectivity_symbols(l[:B].size, l[:E].size)
  if cs == [1, 1]
    # Both ends have a single link.
    oe = {}
    [:B, :E].each { |et| oe[et] = l[et][0].other_end(se[et]) }
    # Nothing to do if both links reach the same segment end; self is
    # returned (as documented) also on this early exit.
    return self if oe[:B] == oe[:E]
    [:B, :E].each do |et|
      delete_other_links(oe[et], se[et],
                         conserve_components: conserve_components)
    end
  else
    # At most one end has a single link; the first such end is handled.
    i = cs.index(1)
    return self if i.nil?
    et = [:B, :E][i]
    oe = l[et][0].other_end(se[et])
    delete_other_links(oe, se[et], conserve_components: conserve_components)
  end
  self
end
37
+
38
+ # Remove superfluous links in the presence of mandatory links
39
+ # in the entire graph
40
+ # @!macro conserve_components_links
41
+ # @return [RGFA] self
42
def enforce_all_mandatory_links(conserve_components: true)
  # Apply the single-segment method to each segment name in turn.
  segment_names.each do |name|
    enforce_segment_mandatory_links(name,
                                    conserve_components: conserve_components)
  end
  self
end
47
+
48
+ # Remove links of segment to itself
49
+ # @!macro segment_param
50
+ # @return [RGFA] self
51
def remove_self_link(segment)
  # Accept either a line instance (its name is used) or a segment name.
  name = segment
  name = segment.name if segment.kind_of?(RGFA::Line)
  # Unconnecting the segment from itself removes its self links.
  unconnect_segments(name, name)
  self
end
56
+
57
+ # Remove all links of segments to themselves
58
+ # @return [RGFA] self
59
def remove_self_links
  # Process every segment of the graph, one name at a time.
  segment_names.each do |name|
    remove_self_link(name)
  end
  self
end
63
+
64
+ end
metadata ADDED
@@ -0,0 +1,97 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rgfa
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.2.1
5
+ platform: ruby
6
+ authors:
7
+ - Giorgio Gonnella
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-09-21 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: |2
14
+ The Graphical Fragment Assembly (GFA) is a proposed format which allow
15
+ to describe the product of sequence assembly.
16
+ This gem implements the proposed specifications for the GFA format
17
+ described under https://github.com/pmelsted/GFA-spec/blob/master/GFA-spec.md
18
+ as close as possible.
19
+ The library allows to create an RGFA object from a file in the GFA format
20
+ or from scratch, to enumerate the graph elements (segments, links,
21
+ containments, paths and header lines), to traverse the graph (by
22
+ traversing all links outgoing from or incoming to a segment), to search for
23
+ elements (e.g. which links connect two segments) and to manipulate the
24
+ graph (e.g. to eliminate a link or a segment or to duplicate a segment
25
+ distributing the read counts evenly on the copies).
26
+ email: gonnella@zbh.uni-hamburg.de
27
+ executables: []
28
+ extensions: []
29
+ extra_rdoc_files: []
30
+ files:
31
+ - lib/rgfa.rb
32
+ - lib/rgfa/byte_array.rb
33
+ - lib/rgfa/cigar.rb
34
+ - lib/rgfa/connectivity.rb
35
+ - lib/rgfa/containments.rb
36
+ - lib/rgfa/error.rb
37
+ - lib/rgfa/field_array.rb
38
+ - lib/rgfa/field_writer.rb
39
+ - lib/rgfa/field_parser.rb
40
+ - lib/rgfa/field_validator.rb
41
+ - lib/rgfa/headers.rb
42
+ - lib/rgfa/line/containment.rb
43
+ - lib/rgfa/line/header.rb
44
+ - lib/rgfa/line/link.rb
45
+ - lib/rgfa/line/path.rb
46
+ - lib/rgfa/line/segment.rb
47
+ - lib/rgfa/line.rb
48
+ - lib/rgfa/linear_paths.rb
49
+ - lib/rgfa/lines.rb
50
+ - lib/rgfa/links.rb
51
+ - lib/rgfa/logger.rb
52
+ - lib/rgfa/multiplication.rb
53
+ - lib/rgfa/numeric_array.rb
54
+ - lib/rgfa/paths.rb
55
+ - lib/rgfa/rgl.rb
56
+ - lib/rgfa/segment_ends_path.rb
57
+ - lib/rgfa/segment_info.rb
58
+ - lib/rgfa/segments.rb
59
+ - lib/rgfa/sequence.rb
60
+ - lib/rgfatools.rb
61
+ - lib/rgfatools/artifacts.rb
62
+ - lib/rgfatools/copy_number.rb
63
+ - lib/rgfatools/invertible_segments.rb
64
+ - lib/rgfatools/multiplication.rb
65
+ - lib/rgfatools/superfluous_links.rb
66
+ - lib/rgfatools/linear_paths.rb
67
+ - lib/rgfatools/p_bubbles.rb
68
+ - bin/gfadiff.rb
69
+ - bin/rgfa-mergelinear.rb
70
+ - bin/rgfa-simdebruijn.rb
71
+ - bin/rgfa-findcrisprs.rb
72
+ homepage: http://github.com/ggonnella/rgfa
73
+ licenses:
74
+ - CC-BY-SA
75
+ metadata: {}
76
+ post_install_message:
77
+ rdoc_options: []
78
+ require_paths:
79
+ - lib
80
+ required_ruby_version: !ruby/object:Gem::Requirement
81
+ requirements:
82
+ - - ">="
83
+ - !ruby/object:Gem::Version
84
+ version: '2.0'
85
+ required_rubygems_version: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ requirements: []
91
+ rubyforge_project:
92
+ rubygems_version: 2.0.3
93
+ signing_key:
94
+ specification_version: 4
95
+ summary: Parse, edit and write GFA-format graphs in Ruby
96
+ test_files: []
97
+ has_rdoc: