rgfa 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/gfadiff.rb +420 -0
- data/bin/rgfa-findcrisprs.rb +208 -0
- data/bin/rgfa-mergelinear.rb +14 -0
- data/bin/rgfa-simdebruijn.rb +86 -0
- data/lib/rgfa.rb +376 -0
- data/lib/rgfa/byte_array.rb +74 -0
- data/lib/rgfa/cigar.rb +157 -0
- data/lib/rgfa/connectivity.rb +131 -0
- data/lib/rgfa/containments.rb +97 -0
- data/lib/rgfa/error.rb +3 -0
- data/lib/rgfa/field_array.rb +87 -0
- data/lib/rgfa/field_parser.rb +109 -0
- data/lib/rgfa/field_validator.rb +241 -0
- data/lib/rgfa/field_writer.rb +108 -0
- data/lib/rgfa/headers.rb +76 -0
- data/lib/rgfa/line.rb +721 -0
- data/lib/rgfa/line/containment.rb +87 -0
- data/lib/rgfa/line/header.rb +92 -0
- data/lib/rgfa/line/link.rb +379 -0
- data/lib/rgfa/line/path.rb +106 -0
- data/lib/rgfa/line/segment.rb +209 -0
- data/lib/rgfa/linear_paths.rb +285 -0
- data/lib/rgfa/lines.rb +155 -0
- data/lib/rgfa/links.rb +242 -0
- data/lib/rgfa/logger.rb +192 -0
- data/lib/rgfa/multiplication.rb +156 -0
- data/lib/rgfa/numeric_array.rb +196 -0
- data/lib/rgfa/paths.rb +98 -0
- data/lib/rgfa/rgl.rb +194 -0
- data/lib/rgfa/segment_ends_path.rb +9 -0
- data/lib/rgfa/segment_info.rb +162 -0
- data/lib/rgfa/segments.rb +99 -0
- data/lib/rgfa/sequence.rb +65 -0
- data/lib/rgfatools.rb +102 -0
- data/lib/rgfatools/artifacts.rb +29 -0
- data/lib/rgfatools/copy_number.rb +126 -0
- data/lib/rgfatools/invertible_segments.rb +104 -0
- data/lib/rgfatools/linear_paths.rb +140 -0
- data/lib/rgfatools/multiplication.rb +194 -0
- data/lib/rgfatools/p_bubbles.rb +66 -0
- data/lib/rgfatools/superfluous_links.rb +64 -0
- metadata +97 -0
@@ -0,0 +1,106 @@
|
|
1
|
+
# A path line of a RGFA file
|
2
|
+
class RGFA::Line::Path < RGFA::Line
|
3
|
+
|
4
|
+
RECORD_TYPE = :P
|
5
|
+
REQFIELDS = [:path_name, :segment_names, :cigars]
|
6
|
+
PREDEFINED_OPTFIELDS = []
|
7
|
+
DATATYPE = {
|
8
|
+
:path_name => :lbl,
|
9
|
+
:segment_names => :lbs,
|
10
|
+
:cigars => :cgs,
|
11
|
+
}
|
12
|
+
|
13
|
+
define_field_methods!
|
14
|
+
|
15
|
+
# @note The field names are derived from the RGFA specification at:
|
16
|
+
# https://github.com/pmelsted/RGFA-spec/blob/master/RGFA-spec.md#path-line
|
17
|
+
# and were made all downcase with _ separating words;
|
18
|
+
# the cigar and segment_name regexps and name were changed to better
|
19
|
+
# implement what is written in the comments of the specification
|
20
|
+
# (i.e. name pluralized and regexp changed to a comma-separated list
|
21
|
+
# for segment_name of segment names and orientations and for cigar of
|
22
|
+
# CIGAR strings);
|
23
|
+
|
24
|
+
# Name of the path, converted to a Symbol.
#
# The value is read from the +path_name+ required field (the first entry
# of REQFIELDS); the record type declares no field called +name+.
#
# @return [Symbol] name of the path as symbol
def to_sym
  path_name.to_sym
end
|
28
|
+
|
29
|
+
# Is the path circular? In this case the number of CIGARs must be
# equal to the number of segments.
#
# A single "*" CIGAR (see #undef_cigars?) is the compact notation for a
# linear path, so it never makes the path circular, even when the path
# consists of exactly one segment. This matches the order of the checks in
# validate_lists_size!, where the single-"*" case is classified as linear
# before the circular case is considered.
#
# @return [Boolean]
def circular?
  return false if undef_cigars?
  cigars.size == segment_names.size
end
|
35
|
+
|
36
|
+
# Is the path linear? A path is reported as linear exactly when
# #circular? reports it as not circular.
#
# @return [Boolean]
def linear?
  return false if circular?
  true
end
|
42
|
+
|
43
|
+
# Do the CIGARs consist of a single, empty element (i.e. a single "*")?
# This is a compact representation of a linear path where all CIGARs
# are "*".
#
# @return [Boolean]
def undef_cigars?
  list = cigars
  return false unless list.size == 1
  list[0].empty?
end
|
49
|
+
|
50
|
+
# The links to which the path refers. The list is created lazily, so it
# is an empty array until something is stored in it (e.g. for a line
# which is not embedded in a graph);
# the boolean is true if the equivalent reverse link is used.
#
# @return [Array<RGFA::Line::Link, Boolean>]
def links
  @links = [] unless @links
  @links
end
|
58
|
+
|
59
|
+
# Computes the list of links which are required to support the path:
# one entry for each pair of consecutive segments, plus one entry
# from the last segment back to the first one if the path is circular.
#
# @return [Array<[RGFA::OrientedSegment, RGFA::OrientedSegment, RGFA::Cigar]>]
#   an array, which elements are 3-tuples (from oriented segment,
#   to oriented segment, cigar); the cigar is an empty array for every
#   entry when #undef_cigars? is true
def required_links
  star_cigars = undef_cigars?
  names = segment_names
  count = names.size
  (0...count).each_with_object([]) do |i, tuples|
    successor = i + 1
    if successor == count
      # past the last segment: wrap around only for circular paths
      break tuples unless circular?
      successor = 0
    end
    tuples << [names[i], names[successor], star_cigars ? [] : cigars[i]]
  end
end
|
77
|
+
|
78
|
+
private
|
79
|
+
|
80
|
+
# Checks that the number of CIGARs is compatible with the number of
# oriented segments of the path.
#
# Accepted combinations:
# 1. linear path: one CIGAR less than the number of segments;
# 2. linear path in compact notation: a single "*" (empty) CIGAR;
# 3. circular path: as many CIGARs as segments.
#
# @return [true] for every accepted combination
# @raise [RGFA::Line::Path::ListLengthsError] for any other combination
def validate_lists_size!
  n_cigars = cigars.size
  n_segments = segment_names.size
  linear = (n_cigars == n_segments - 1)
  # the first element is only inspected when it exists and is needed
  compact_linear = !linear && n_cigars == 1 && cigars[0].empty?
  circular = (n_cigars == n_segments)
  return true if linear || compact_linear || circular
  raise RGFA::Line::Path::ListLengthsError,
    "Path has #{n_segments} oriented segments, "+
    "but #{n_cigars} CIGARs"
end
|
97
|
+
|
98
|
+
# Record-type specific validation of a path line: delegates to the
# check of the segment/CIGAR list sizes and passes its result through.
def validate_record_type_specific_info!
  outcome = validate_lists_size!
  outcome
end
|
101
|
+
|
102
|
+
|
103
|
+
end
|
104
|
+
|
105
|
+
# Error raised if number of segments and cigars are not consistent
|
106
|
+
class RGFA::Line::Path::ListLengthsError < RGFA::Error; end
|
@@ -0,0 +1,209 @@
|
|
1
|
+
# A segment line of a RGFA file
|
2
|
+
class RGFA::Line::Segment < RGFA::Line
|
3
|
+
|
4
|
+
RECORD_TYPE = :S
|
5
|
+
REQFIELDS = [:name, :sequence]
|
6
|
+
PREDEFINED_OPTFIELDS = [:LN, :RC, :FC, :KC, :SH, :UR]
|
7
|
+
DATATYPE = {
|
8
|
+
:name => :lbl,
|
9
|
+
:sequence => :seq,
|
10
|
+
:LN => :i,
|
11
|
+
:RC => :i,
|
12
|
+
:FC => :i,
|
13
|
+
:KC => :i,
|
14
|
+
:SH => :H,
|
15
|
+
:UR => :Z
|
16
|
+
}
|
17
|
+
|
18
|
+
define_field_methods!
|
19
|
+
|
20
|
+
attr_writer :links, :containments, :paths
|
21
|
+
|
22
|
+
# References to the links in which the segment is involved.
#
# @!macro references_table
# The references are in four arrays which are
# accessed from a nested hash table. The first key is
# the direction (from or to), the second is the orientation
# (+ or -).
#
# The table is created (with four empty arrays) the first time it
# is requested.
#
# @example
# segment.links[:from][:+]
#
# @return [Hash{RGFA::Line::DIRECTION => Hash{RGFA::Line::ORIENTATION => Array<RGFA::Line::Link>}}]
def links
  return @links if @links
  @links = {
    :from => {:+ => [], :- => []},
    :to => {:+ => [], :- => []},
  }
end
|
39
|
+
|
40
|
+
# References to the containments in which the segment is involved.
# @!macro references_table
#
# The table is created (with four empty arrays) the first time it
# is requested.
#
# @example
# segment.containments[:from][:+]
#
# @return [Hash{RGFA::Line::DIRECTION => Hash{RGFA::Line::ORIENTATION => Array<RGFA::Line::Containment>}}]
def containments
  return @containments if @containments
  @containments = {
    :from => {:+ => [], :- => []},
    :to => {:+ => [], :- => []},
  }
end
|
52
|
+
|
53
|
+
# References to the paths in which the segment is involved.
#
# The references are in two arrays which are
# accessed from a hash table. The key is the orientation
# (+ or -). The table is created (with two empty arrays) the first
# time it is requested.
#
# @example
# segment.paths[:+]
#
# @return [Hash{RGFA::Line::ORIENTATION => Array<RGFA::Line::Path>}]
def paths
  return @paths if @paths
  @paths = {:+ => [], :- => []}
end
|
67
|
+
|
68
|
+
# All containments where a segment is involved, in the order
# from/+, from/-, to/+, to/-.
# @!macro this_is_a_copy
# @note the list shall be considered read-only, as this
# is a copy of the original arrays of references, concatenated
# to each other.
def all_containments
  table = containments
  [table[:from][:+], table[:from][:-],
   table[:to][:+], table[:to][:-]].inject(:+)
end
|
77
|
+
|
78
|
+
# All links where the segment is involved, in the order
# from/+, from/-, to/+, to/-.
# @!macro this_is_a_copy
def all_links
  table = links
  [table[:from][:+], table[:from][:-],
   table[:to][:+], table[:to][:-]].inject(:+)
end
|
84
|
+
|
85
|
+
# All links and containments where the segment is involved
# (links first, then containments).
# @!macro this_is_a_copy
def all_connections
  combined = all_links.dup
  combined.concat(all_containments)
end
|
90
|
+
|
91
|
+
# All paths where the segment is involved (those stored under +
# first, then those stored under -).
# @!macro this_is_a_copy
def all_paths
  paths.values_at(:+, :-).inject(:+)
end
|
97
|
+
|
98
|
+
# All paths, links and containments where the segment is involved
# (connections first, then paths).
# @!macro this_is_a_copy
def all_references
  combined = all_connections.dup
  combined.concat(all_paths)
end
|
103
|
+
|
104
|
+
# Checks that the LN tag, when present, agrees with the length of the
# sequence. Nothing is checked if the sequence is "*" or if the segment
# has no LN tag.
#
# @return [nil] if there is nothing to check or the lengths agree
# @raise [RGFA::Line::Segment::InconsistentLengthError]
#   if sequence length and LN tag are not consistent.
def validate_length!
  return if sequence == "*"
  return unless optional_fieldnames.include?(:LN)
  return if self.LN == sequence.length
  raise RGFA::Line::Segment::InconsistentLengthError,
    "Length in LN tag (#{self.LN}) "+
    "is different from length of sequence field (#{sequence.length})"
end
|
115
|
+
|
116
|
+
# @!macro [new] length
# @return [Integer] value of LN tag, if segment has LN tag
# @return [Integer] sequence length if no LN and sequence not "*"
# @return [nil] if sequence is "*"
# @see #length!
def length
  tag_value = self.LN
  return tag_value if tag_value
  # without an LN value the length is only known for real sequences
  sequence.length unless sequence == "*"
end
|
130
|
+
|
131
|
+
# Same as #length, but raises instead of returning nil.
# @!macro length
# @!macro [new] length_needed
# @raise [RGFA::Line::Segment::UndefinedLengthError] if not an LN tag and
# the sequence is "*"
# @see #length
def length!
  known = self.length()
  return known unless known.nil?
  raise RGFA::Line::Segment::UndefinedLengthError,
    "No length information available"
end
|
142
|
+
|
143
|
+
# @!macro [new] coverage
# The coverage computed from a count_tag.
# If unit_length is provided then: count/(length-unit_length+1),
# otherwise: count/length.
# The latter is a good approximation if length >>> unit_length.
# @param [Symbol] count_tag <i>(defaults to +:RC+)</i>
# integer tag storing the count, usually :KC, :RC or :FC
# @param [Integer] unit_length the (average) length of a read (for
# :RC), fragment (for :FC) or k-mer (for :KC)
# @return [Float] coverage, if count_tag and length are defined
# (the count is converted with +to_f+ before dividing)
# @return [nil] otherwise
# @see #coverage!
def coverage(count_tag: :RC, unit_length: 1)
  return nil unless optional_fieldnames.include?(count_tag)
  return nil unless self.length
  count = self.get(count_tag).to_f
  count / (self.length - unit_length + 1)
end
|
162
|
+
|
163
|
+
# Same as #coverage, but raises instead of returning nil.
# @see #coverage
# @!macro coverage
# @raise [RGFA::Line::TagMissingError] if segment does not have count_tag
# @!macro length_needed
def coverage!(count_tag: :RC, unit_length: 1)
  value = coverage(count_tag: count_tag, unit_length: unit_length)
  return value unless value.nil?
  # a missing length is reported first (length! raises in that case) ...
  self.length!
  # ... otherwise the count tag must be the missing piece
  raise RGFA::Line::TagMissingError,
    "Tag #{count_tag} undefined for segment #{name}"
end
|
177
|
+
|
178
|
+
# String representation of the segment, as computed by the superclass.
#
# With +without_sequence+ the sequence field is temporarily replaced
# by "*" while the superclass computes the string. The original sequence
# is restored in an +ensure+ clause, so the segment is left unchanged even
# if computing the string raises.
#
# @return string representation of the segment
# @param [Boolean] without_sequence if +true+, output "*" instead of sequence
def to_s(without_sequence: false)
  return super() unless without_sequence
  saved = self.sequence
  self.sequence = "*"
  begin
    super()
  ensure
    self.sequence = saved
  end
end
|
191
|
+
|
192
|
+
# Name of the segment, converted with +to_sym+.
# @return [Symbol] name of the segment as symbol
def to_sym
  label = name
  label.to_sym
end
|
196
|
+
|
197
|
+
private
|
198
|
+
|
199
|
+
# Record-type specific validation of a segment line: delegates to the
# LN/sequence length consistency check and passes its result through.
def validate_record_type_specific_info!
  outcome = validate_length!
  outcome
end
|
202
|
+
|
203
|
+
end
|
204
|
+
|
205
|
+
# Error raised if length of segment cannot be computed
|
206
|
+
class RGFA::Line::Segment::UndefinedLengthError < RGFA::Error; end
|
207
|
+
|
208
|
+
# Error raised if length of segment and LN are not consistent
|
209
|
+
class RGFA::Line::Segment::InconsistentLengthError < RGFA::Error; end
|
@@ -0,0 +1,285 @@
|
|
1
|
+
require_relative "segment_ends_path"
|
2
|
+
|
3
|
+
#
|
4
|
+
# Methods for the RGFA class, which allow finding and merging linear paths.
|
5
|
+
#
|
6
|
+
module RGFA::LinearPaths
|
7
|
+
|
8
|
+
require "set"
|
9
|
+
|
10
|
+
#
|
11
|
+
# Find a path without branches.
#
# The path is searched starting from +s+, separately from each of its
# two ends (:B, then :E) whose connectivity is 1, and the two partial
# results are joined.
# Any segment used in the returned path will be added to +exclude+
# (+s+ itself is added after conversion with +to_sym+).
#
# @param s [String|RGFA::Line::Segment] a segment name or instance
# @param exclude [Set<Symbol>] a set of segment names to exclude from the
#   path; it is modified in place
# @return [Array<RGFA::SegmentEnd>] the path
# @return [nil] if fewer than two segment ends were collected
#
def linear_path(s, exclude = Set.new)
  s = s.to_sym
  end_connectivity = connectivity(s)
  segpath = RGFA::SegmentEndsPath.new
  [:B, :E].each_with_index do |end_type, idx|
    next unless end_connectivity[idx] == 1
    exclude << s
    # drop the trailing element before appending the next partial path,
    # which again contains the start segment
    segpath.pop
    start = RGFA::SegmentEnd.new([s, end_type])
    segpath += traverse_linear_path(start, exclude)
  end
  segpath.size < 2 ? nil : segpath
end
|
34
|
+
|
35
|
+
# Find all unbranched paths in the graph.
#
# Each segment name is tried as a starting point with #linear_path;
# segments already used in a previously found path are skipped.
# Progress logging (initialization, one step per segment, and
# termination) is done only if +@progress+ is set.
#
# @return [Array<Array<RGFA::SegmentEnd>>]
def linear_paths
  exclude = Set.new
  segnames = segment_names
  if @progress
    progress_log_init(:linear_paths, "segments", segnames.size,
      "Detect linear paths (#{segnames.size} segments)")
  end
  found = []
  segnames.each do |sn|
    progress_log(:linear_paths) if @progress
    next if exclude.include?(sn)
    path = linear_path(sn, exclude)
    found << path unless path.nil?
  end
  # closed under the same condition under which the log was opened
  progress_log_end(:linear_paths) if @progress
  found
end
|
52
|
+
|
53
|
+
# Merge a linear path, i.e. a path of segments without extra-branches
# @!macro [new] merge_lim
# Limitations: all containments and paths involving merged segments are
# deleted.
#
# Paths with fewer than two elements are left untouched. Note that the
# elements of +segpath+ are converted in place with +to_segment_end+.
#
# @param segpath [Array<RGFA::SegmentEnd>] a linear path, such as that
# retrieved by {#linear_path}
# @!macro [new] merge_options
# @param options [Hash] optional keyword arguments
# @option options [String, :short, nil] :merged_name (nil)
# if nil, the merged_name is automatically computed; if :short,
# a name is computed starting with "merged1" and calling next until
# an available name is found; if String, the name to use
# @option options [Boolean] :cut_counts (false)
# if true, total count in merged segment m, composed of segments
# s of set S is multiplied by the factor Sum(|s in S|)/|m|
#
# @raise [ArgumentError] if an internal element of the path does not
#   have connectivity [1,1]
# @return [RGFA] self
# @see #merge_linear_paths
def merge_linear_path(segpath, **options)
  # nothing to merge; still return self, as documented, to allow chaining
  return self if segpath.size < 2
  segpath.map!(&:to_segment_end)
  if segpath[1..-2].any? { |sn, _et| connectivity(sn) != [1,1] }
    raise ArgumentError, "The specified path is not linear"
  end
  merged, first_reversed, last_reversed =
    create_merged_segment(segpath, options)
  self << merged
  # re-attach the links of the two outer ends of the path to the new segment
  link_merged(merged.name, segpath.first.to_segment_end.invert_end_type,
              first_reversed)
  link_merged(merged.name, segpath.last, last_reversed)
  segpath.each do |sn_et|
    delete_segment(sn_et.segment)
    progress_log(:merge_linear_paths, 0.05) if @progress
  end
  self
end
|
90
|
+
|
91
|
+
# Merge all linear paths in the graph, i.e.
# paths of segments without extra-branches
# @!macro merge_lim
# @!macro merge_options
#
# The paths are those returned by #linear_paths; each of them is passed
# to #merge_linear_path together with the given options. Progress logging
# (initialization and termination) is done only if +@progress+ is set.
#
# @return [RGFA] self
def merge_linear_paths(**options)
  paths = linear_paths
  # flattening yields two entries for each path element, hence the halving
  psize = paths.flatten.size / 2
  if @progress
    progress_log_init(:merge_linear_paths, "segments", psize,
      "Merge #{paths.size} linear paths (#{psize} segments)")
  end
  paths.each { |path| merge_linear_path(path, **options) }
  # closed under the same condition under which the log was opened
  progress_log_end(:merge_linear_paths) if @progress
  self
end
|
108
|
+
|
109
|
+
private
|
110
|
+
|
111
|
+
# Traverse the links, starting from the given segment end, and collect
# the segment ends of an unbranched path.
#
# At each step the links of the current segment end (+after+) and of
# its opposite end (+before+) are looked up. The current element is
# added, and the traversal continues, if the connectivity is [1,1] or if
# nothing was collected yet; it is added as last element if only the
# +before+ side has connectivity 1; otherwise the traversal stops
# without adding it. The traversal also stops when the next segment is
# already in +exclude+, so that circular paths are avoided.
#
# *Arguments*:
# - +segment_end+ -> segment end from which the traversal starts
# - +exclude+ -> Set of names of already visited segments
#
# *Side Effects*:
# - The name of any element added to the returned list is also added
#   to +exclude+
#
# *Returns*:
# - The collected segment ends; the first collected element is
#   +segment_end+. The list is reversed if the end type of +segment_end+
#   is :B.
def traverse_linear_path(segment_end, exclude)
  collected = RGFA::SegmentEndsPath.new
  current = segment_end
  loop do
    after = links_of(current)
    before = links_of(current.to_segment_end.invert_end_type)
    cs = connectivity_symbols(before.size, after.size)
    extendable = (cs == [1,1] || collected.empty?)
    # neither internal/first nor a valid last element: stop here
    break unless extendable || cs[0] == 1
    collected << current
    exclude << current.name
    break unless extendable
    # move to the opposite end of the segment on the other side of the link
    current = after.first.other_end(current).invert_end_type
    break if exclude.include?(current.name)
  end
  segment_end.end_type == :B ? collected.reverse : collected
end
|
154
|
+
|
155
|
+
# Sums the :KC, :RC and :FC counts of the segments of a path.
#
# For each of the three tags, the values of all segments which have the
# tag among their optional fields are added up; the total is multiplied
# by +multfactor+ and truncated with +to_i+. Tags which no segment of
# the path has are not included in the result.
#
# @param segpath list whose elements destructure into a segment name
#   followed by an end type
# @param multfactor [Numeric] factor applied to each total
# @return [Hash{Symbol => Integer}] totals by count tag
def sum_of_counts(segpath, multfactor = 1)
  segs = segpath.map { |sn, _et| segment!(sn) }
  [:KC, :RC, :FC].each_with_object({}) do |count_tag, totals|
    tagged = segs.select { |seg| seg.optional_fieldnames.include?(count_tag) }
    next if tagged.empty?
    total = tagged.inject(0) { |sum, seg| sum + seg.get(count_tag) }
    totals[count_tag] = (total * multfactor).to_i
  end
end
|
171
|
+
|
172
|
+
# Computes the name for the reverse of a (possibly composite) segment name.
#
# The name is split at +separator+; in each part a trailing "^" is removed
# if present and appended otherwise; the order of the parts is inverted.
# A "(" at the beginning of a part becomes a ")" at its end and a ")" at
# the end of a part becomes a "(" at its beginning, so that parentheses
# remain balanced after the inversion.
#
# @param name [#to_s] the name to reverse
# @param separator [String] the string separating the parts of the name
# @return [String] the reversed name
def reverse_segment_name(name, separator)
  flipped = name.to_s.split(separator).map do |part|
    opened = part.start_with?("(")
    core = opened ? part[1..-1] : part
    closed = core.end_with?(")")
    core = core[0..-2] if closed
    core = core.end_with?("^") ? core[0..-2] : "#{core}^"
    "#{'(' if closed}#{core}#{')' if opened}"
  end
  flipped.reverse.join(separator)
end
|
184
|
+
|
185
|
+
# Mirrors a list of positions: each position +pos+ becomes
# <tt>lastpos - pos + 1</tt> and the order of the list is inverted.
#
# @param pos_array [Array<Integer>, nil] the positions
# @param lastpos [Integer, nil] the last position
# @return [Array<Integer>] the mirrored positions
# @return [nil] if any of the two arguments is nil
def reverse_pos_array(pos_array, lastpos)
  return nil if pos_array.nil? || lastpos.nil?
  pos_array.reverse.map { |pos| lastpos - pos + 1 }
end
|
189
|
+
|
190
|
+
# Adds the information of a segment of a path to the segment +merged+,
# which is being built from the path.
#
# If +init+ is true, sequence, name and LN of +merged+ are initialized
# from +segment+ (the name is taken from options[:merged_name] if set).
# Otherwise:
# - the sequence of +segment+, without its first +cut+ characters, is
#   appended to the sequence of +merged+; the merged sequence is "*" if
#   the sequence of +segment+ is "*" or if the merged sequence already
#   was "*" (a placeholder cannot be extended: appending to it would
#   produce a string such as "*ACGT");
# - if no options[:merged_name] is given, the name of +segment+ is
#   appended to the name of +merged+, separated by "_";
# - if +merged+ has an LN value, it is increased by the LN of +segment+
#   minus +cut+, or unset if +segment+ has no LN value.
#
# @param merged the segment under construction (modified in place)
# @param segment the segment to add
# @param reversed [Boolean] if true, the +rc+ of the sequence of
#   +segment+ is used
# @param cut [Integer] number of leading characters of the (oriented)
#   sequence to skip
# @param init [Boolean] true for the first segment of the path
# @param options [Hash] merge options; only :merged_name is read here
def add_segment_to_merged(merged, segment, reversed, cut, init, options)
  s = (reversed ? segment.sequence.rc[cut..-1] : segment.sequence[cut..-1])
  if init
    merged.sequence = s
    merged.name = (options[:merged_name].nil? ?
                   segment.name : options[:merged_name])
    merged.LN = segment.LN
  else
    if segment.sequence == "*" || merged.sequence == "*"
      merged.sequence = "*"
    else
      merged.sequence += s
    end
    if options[:merged_name].nil?
      merged.name = "#{merged.name}_#{segment.name}"
    end
    if merged.LN
      if segment.LN
        merged.LN += (segment.LN - cut)
      else
        merged.LN = nil
      end
    end
  end
end
|
209
|
+
|
210
|
+
# Creates the segment which results from merging the segments of a path.
#
# The new segment is a clone of the first segment of the path, whose
# sequence, name and LN are then computed by add_segment_to_merged, one
# path element at a time. Between two consecutive elements the link is
# looked up and the length of its overlap is cut from the second one.
# The count tags (:KC, :RC, :FC) are set from sum_of_counts, or unset if
# the merged segment has no LN value.
#
# If options[:merged_name] is :short, it is replaced (in +options+) by the
# first name in the sequence "merged1", "merged1".next, ... which is not
# used by any segment or path. Existing names are compared as strings, so
# that the check works for names stored as symbols as well as for names
# stored as strings.
#
# @param segpath the path to merge; presumably an array of
#   RGFA::SegmentEnd, as prepared by merge_linear_path
# @param options [Hash] merge options (:merged_name, :cut_counts)
# @raise [ArgumentError] if the overlap of a link contains operations
#   other than M or =
# @raise [RGFA::Line::Segment::InconsistentLengthError] if +@validate+
#   is set and the computed LN differs from the computed sequence length
# @return [Array] the merged segment, whether the first segment is
#   reversed and whether the last segment is reversed (nil for a path
#   with a single element)
def create_merged_segment(segpath, options)
  merged = segment!(segpath.first.first).clone
  total_cut = 0
  a = segpath.first
  first_reversed = (a.end_type == :B)
  last_reversed = nil
  if options[:merged_name] == :short
    forbidden = (segment_names + path_names).map(&:to_s)
    candidate = "merged1"
    candidate = candidate.next while forbidden.include?(candidate)
    options[:merged_name] = candidate
  end
  add_segment_to_merged(merged, segment(a.segment), first_reversed, 0, true,
                        options)
  progress_log(:merge_linear_paths, 0.95) if @progress
  (segpath.size-1).times do |i|
    b = segpath[i+1].to_segment_end.invert_end_type
    l = link!(a, b)
    if l.overlap == []
      cut = 0
    elsif l.overlap.all? { |op| [:M, :"="].include?(op.code) }
      cut = l.overlap.map(&:len).inject(:+)
    else
      raise ArgumentError,
        "Merging is only allowed if all operations are M/="
    end
    total_cut += cut
    last_reversed = (b[1] == :E)
    add_segment_to_merged(merged, segment(b.segment), last_reversed, cut,
                          false, options)
    a = b.to_segment_end.invert_end_type
    progress_log(:merge_linear_paths, 0.95) if @progress
  end
  if merged.sequence != "*"
    if merged.LN.nil?
      merged.LN = merged.sequence.length
    elsif @validate and merged.LN != merged.sequence.length
      raise RGFA::Line::Segment::InconsistentLengthError,
        "Computed sequence length #{merged.sequence.length} "+
        "and computed LN #{merged.LN} differ"
    end
  end
  if merged.LN.nil?
    [:KC, :RC, :FC].each { |count_tag| merged.set(count_tag, nil) }
  else
    # with :cut_counts the counts are scaled down by the fraction of
    # sequence which was kept after cutting the overlaps
    factor = options[:cut_counts] ?
      merged.LN.to_f / (total_cut+merged.LN) : 1
    sum_of_counts(segpath, factor).each do |count_tag, count|
      merged.set(count_tag, count)
    end
  end
  return merged, first_reversed, last_reversed
end
|
266
|
+
|
267
|
+
# Attaches the links of a segment end to a merged segment.
#
# For each link of +segment_end+ a clone is created, in which the side
# of the link referring to the segment of +segment_end+ (the +to+ side
# if its +to+ equals the first element of +segment_end+, the +from+ side
# otherwise) is replaced by +merged_name+; the orientation of that side
# is inverted if +reversed+ is true. The clone is then added to self.
#
# @param merged_name name of the merged segment
# @param segment_end segment end whose links shall be copied
# @param reversed [Boolean] whether the orientation shall be inverted
def link_merged(merged_name, segment_end, reversed)
  links_of(segment_end).each do |original|
    copy = original.clone
    if copy.to == segment_end.first
      copy.to = merged_name
      copy.to_orient = RGFA::OrientedSegment.invert(copy.to_orient) if reversed
    else
      copy.from = merged_name
      copy.from_orient = RGFA::OrientedSegment.invert(copy.from_orient) if reversed
    end
    self << copy
  end
end
|
284
|
+
|
285
|
+
end
|