trace_visualization 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (38) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +17 -0
  3. data/Gemfile +6 -0
  4. data/LICENSE +339 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +4 -0
  7. data/Rakefile +1 -0
  8. data/lib/trace_visualization/bwt.rb +32 -0
  9. data/lib/trace_visualization/bwt.rbold +32 -0
  10. data/lib/trace_visualization/data/irepetition.rb +49 -0
  11. data/lib/trace_visualization/data/repetition.rb +101 -0
  12. data/lib/trace_visualization/generators.rb +53 -0
  13. data/lib/trace_visualization/longest_common_prefix.rb +34 -0
  14. data/lib/trace_visualization/mapping.rb +120 -0
  15. data/lib/trace_visualization/reorder.rb +25 -0
  16. data/lib/trace_visualization/repetitions.rb +66 -0
  17. data/lib/trace_visualization/repetitions_concatenation.rb +134 -0
  18. data/lib/trace_visualization/repetitions_context.rb +18 -0
  19. data/lib/trace_visualization/repetitions_incrementation.rb +81 -0
  20. data/lib/trace_visualization/repetitions_psy.rb +83 -0
  21. data/lib/trace_visualization/suffix_array.rb +203 -0
  22. data/lib/trace_visualization/utils.rb +47 -0
  23. data/lib/trace_visualization/version.rb +3 -0
  24. data/lib/trace_visualization/visualization/console_color_print.rb +32 -0
  25. data/lib/trace_visualization.rb +10 -0
  26. data/spec/bwt_spec.rb +47 -0
  27. data/spec/generators_spec.rb +30 -0
  28. data/spec/longest_common_prefix_spec.rb +29 -0
  29. data/spec/mapping_spec.rb +67 -0
  30. data/spec/reorder_spec.rb +42 -0
  31. data/spec/repetitions_concatenation_spec.rb +58 -0
  32. data/spec/repetitions_incrementation_spec.rb +88 -0
  33. data/spec/repetitions_psy_spec.rb +39 -0
  34. data/spec/repetitions_spec.rb +18 -0
  35. data/spec/spec_helper.rb +19 -0
  36. data/spec/suffix_array_spec.rb +68 -0
  37. data/trace_visualization.gemspec +35 -0
  38. metadata +204 -0
@@ -0,0 +1,34 @@
1
module TraceVisualization
  module LongestCommonPrefix

    # Computes longest-common-prefix lengths for a suffix array, following
    # "Linear-Time Longest-Common-Prefix Computation in Suffix Arrays and Its
    # Applications" by Toru Kasai et al.
    #
    # The method signature and the names +rank+, +height+, +pos+ and +h+ are
    # kept as they appear in the article.
    #
    # a   - indexable sequence (String or Array) the suffix array was built for
    # pos - suffix array: pos[i] is the start of the i-th smallest suffix
    # n   - number of suffixes to process
    #
    # Returns an Array of n integers where height[r] is the length of the
    # common prefix of the suffixes ranked r - 1 and r (height[0] is 0).
    def self.effective(a, pos, n)
      rank = Array.new(n, 0)
      height = Array.new(n, 0)

      # rank is the inverse permutation of pos
      (0...n).each { |idx| rank[pos[idx]] = idx }

      h = 0
      (0...n).each do |i|
        next unless rank[i] > 0

        j = pos[rank[i] - 1]

        # h carries over from the previous suffix, so only new characters are
        # compared here
        h += 1 while a[i + h] == a[j + h]

        height[rank[i]] = h
        h -= 1 if h > 0
      end

      height
    end

  end
end
@@ -0,0 +1,120 @@
1
+ module TraceVisualization
2
+ module Mapping
3
+ require 'time'
4
+ require 'ipaddr'
5
+
6
+ require 'trace_visualization/reorder'
7
+
8
# Token types mapped to the list of regular expressions that recognise them.
# Every expression exposes the recognised text through the named group
# +value+.
PATTERNS = {
  # Three or more digits enclosed in square brackets, e.g. "[1234]"
  'id' => [/(?<value>\[\d{3,}\])/],

  # Dotted quad whose four parts are each in the range 0..255
  'ip' => [/(?<value>(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))/],

  # Bracketed timestamp of the form "[dd Mon yyyy hh:mm:ss]"
  'time' => [/(?<value>\[\d{2} [a-zA-Z]{3} \d{4} \d{2}\:\d{2}\:\d{2}\])/]
}
23
+
24
# One element of a parsed sequence: either a recognised token ("id", "ip",
# "time") or a single character ("char"). Items are ordered by +ord+, which
# is assigned externally after construction.
class Item
  include Comparable

  attr_reader :value # integer value for comparison
  attr_reader :src   # source value
  attr_reader :type  # source type

  attr_accessor :ord # re-order value

  # src  - source text of the item
  # type - one of "id", "ip", "time" or "char"
  #
  # Raises ArgumentError for any other type.
  def initialize(src, type)
    @src = src
    @type = type

    @value =
      case type
      when "id"
        # strip the surrounding brackets before converting
        @src[1...-1].to_i
      when "ip"
        IPAddr.new(src).to_i
      when "time"
        # strip the surrounding brackets before parsing
        Time.parse(@src[1...-1]).to_i
      when "char"
        src.getbyte(0)
      else
        raise ArgumentError, "unknown type"
      end
  end

  # Length of the source text.
  def length
    @src.length
  end

  # Compares items by +ord+. Returns nil when +other+ has no +ord+, as the
  # Comparable protocol expects for incomparable operands.
  def <=>(other)
    return nil unless other.respond_to?(:ord)

    @ord <=> other.ord
  end

  # Implicit string conversion: the original source text.
  def to_str
    @src
  end
end
63
+
64
# Splits +str+ into a sequence of Items: every occurrence of a PATTERNS
# expression becomes one item of the matching type, every other character
# becomes a "char" item. The sequence is passed to
# TraceVisualization::Reorder.process before it is returned.
#
# Returns an Array of Item.
def self.parse(str)
  map = {}
  ppos = []
  item_by_pos = {}

  PATTERNS.each do |type, patterns|
    patterns.each do |pattern|
      match(str, type, pattern, map, ppos, item_by_pos)
    end
  end

  ppos.sort!

  result = []
  i = 0
  j = 0

  while i < str.size
    # Skip matches that start inside an item that was already consumed (or
    # that duplicate its position). Without this the cursor into ppos would
    # stall and every later match would be emitted character by character.
    j += 1 while j < ppos.size && ppos[j] < i

    if i == ppos[j]
      item = item_by_pos[ppos[j]]
      result << item
      i += item.length
      j += 1
    else
      result << Item.new(str[i], "char")
      i += 1
    end
  end

  TraceVisualization::Reorder.process(result)

  result
end
96
+
97
# Records every occurrence of +pattern+ in +str+.
#
# str       - text to scan
# type      - item type passed to Item.new for new values
# pattern   - Regexp with a named group +value+
# map       - Hash of matched text => Item; one Item is shared by all
#             occurrences of the same text
# ppos      - Array that receives the start position of every occurrence
# itemByPos - Hash that receives start position => Item
#
# Returns nil; results are delivered through map, ppos and itemByPos.
def self.match(str, type, pattern, map, ppos, itemByPos)
  pos = 0

  while (m = pattern.match(str, pos))
    value = m[:value]
    pos = m.begin(0)

    if value.empty?
      # An empty match carries no item; step over it so the scan terminates.
      pos += 1
      next
    end

    ppos << pos

    map[value] ||= Item.new(value, type)
    itemByPos[pos] = map[value]

    pos += value.size
  end
end
114
+
115
# Rebuilds the source text by concatenating +to_str+ of every element of
# +array+. Appends to a single buffer instead of allocating a new string
# per element.
#
# Returns a String (empty for an empty array).
def self.restore(array)
  array.each_with_object("".dup) { |item, text| text << item.to_str }
end
118
+
119
+ end
120
+ end
@@ -0,0 +1,25 @@
1
module TraceVisualization
  module Reorder

    # Assigns the +ord+ field of every element of +data+: elements are ranked
    # by ascending +value+, equal values share a rank, and ranks start at 1.
    # This reduces the distance between the min and max values, i.e. the size
    # of the alphabet.
    #
    # data - collection of objects responding to +value+ and +ord=+
    #
    # Returns the elements sorted by +value+; +data+ itself is not reordered.
    def self.process(data)
      ordered = data.sort { |lhs, rhs| lhs.value <=> rhs.value }

      rank = 0
      last_value = nil

      ordered.each do |entry|
        unless entry.value == last_value
          last_value = entry.value
          rank += 1
        end

        entry.ord = rank
      end
    end
  end
end
@@ -0,0 +1,66 @@
1
module TraceVisualization
  module Repetitions

    # The naive approach to finding repetitions in the string.
    # Time complexity (as stated by the original author): O(n^2)
    #
    # str - String to analyse
    # l   - length of the repeated substrings to report
    #
    # Returns a sorted Array of position lists. Each list holds the 1-based
    # start positions of one substring of length +l+ that occurs more than
    # once.
    def self.naive(str, l)
      result = Array.new(l + 1) { [] }
      result[0] << Array.new(str.length) { |idx| idx + 1 }

      (1..l).each do |len|
        result[len - 1].each do |item|
          char_counter_inc(str, item, len - 1).each do |positions|
            result[len] << positions if positions && positions.size > 1
          end
        end
      end

      result[l].sort
    end

    # The naive approach to finding repetitions in the string, for all
    # lengths at once.
    # Time complexity (as stated by the original author): O(n^2)
    #
    # Level idx of the intermediate table holds the position lists of
    # repeated substrings of length idx. A list that can be extended by one
    # character without losing any occurrence is moved to the next level.
    #
    # Returns the levels from +p_min+ up to the longest repetition found, or
    # [] when +p_min+ exceeds the number of levels.
    def self.naive_all(str, p_min)
      result = [[Array.new(str.length) { |idx| idx + 1 }]]

      idx = 1
      loop do
        result << []

        # Iterate over a snapshot: entries are removed from the previous
        # level inside the loop, and deleting from the array being iterated
        # makes +each+ skip the following entry.
        result[idx - 1].dup.each do |item|
          char_counter_inc(str, item, idx - 1).each do |positions|
            next unless positions && positions.size > 1

            result[idx] << positions
            result[idx - 1].delete_if { |candidate| candidate == positions }
          end
        end

        break if result.last.empty?

        idx += 1
      end

      # the last level is always empty and is not reported
      result[p_min...-1] || []
    end

    # Groups the 1-based start positions +pos+ by the character found
    # +offset+ characters after each start.
    #
    # Returns an Array indexed by character code (+String#ord+); entries
    # without positions are nil. Positions whose character lies past the end
    # of +str+ are dropped.
    def self.char_counter_inc(str, pos, offset)
      counter = Array.new(256)

      pos.each do |start|
        index = start + offset - 1
        next unless index < str.length

        code = str[index].ord
        (counter[code] ||= []) << start
      end

      counter
    end
  end
end
@@ -0,0 +1,134 @@
1
+ require 'trace_visualization'
2
+
3
module TraceVisualization
  module RepetitionsConcatenation

    # Builds new repetitions by joining pairs of existing repetitions that
    # occur on the same line with a total gap of +k+ between them.
    #
    # rs      - Array of repetitions; new repetitions are appended to it
    # k       - gap budget; the gap of a pair is k - left.k - right.k
    # context - object exposing +rs_by_line+, +lines_pos+ and +str+
    # options - :positions_min_size (default 3): minimum number of positions;
    #           :counter: optional collection that receives [k, useful_cnt]
    #
    # A pair is evaluated once, on the line where its co-occurrence count
    # reaches :positions_min_size.
    #
    # Returns +rs+.
    def self.process(rs, k, context, options = {})
      opts = {
        :positions_min_size => 3
      }.merge options
      min_size = opts[:positions_min_size]

      result = []
      useful_cnt = 0
      pairs_cnt = Hash.new(0)

      context.rs_by_line.each do |item|
        (0...item.size).each do |i|
          (i + 1...item.size).each do |j|
            left, right = item[i][0], item[j][0]
            delta = k - left.k - right.k

            next unless concat_condition(left, right, delta, min_size)

            key = (left.id << 32) + right.id
            pairs_cnt[key] += 1
            next if pairs_cnt[key] != min_size

            lps, rps = process_common_positions(left, right, delta, context)

            if lps.size >= min_size
              result << create_repetition(left, right, delta, lps, rps)
            end

            useful_cnt += 1
          end
        end
      end

      options[:counter] << [k, useful_cnt] if options[:counter]

      puts "Total: #{rs.size ** 2} #{useful_cnt} #{result.size}"

      # the context is required here; calling with +result+ alone raised
      # ArgumentError
      process_new_repetitions(result, context)

      rs.concat(result)
    end

    # Registers the repetitions +rs+ in the per-line index of +context+ by
    # delegating to TraceVisualization::Utils.rs_by_line.
    def self.process_new_repetitions(rs, context)
      TraceVisualization::Utils.rs_by_line(rs, context.lines_pos, context.rs_by_line)
    end

    # Same as +process+, but evaluates every ordered pair of +rs+ instead of
    # only the pairs that share a line, and does not update the per-line
    # index of +context+.
    #
    # Returns +rs+ with the new repetitions appended.
    def self.process_full_search(rs, k, context, options = {})
      opts = {
        :positions_min_size => 3
      }.merge options
      min_size = opts[:positions_min_size]

      result = []
      useful_cnt = 0

      rs.each do |left|
        rs.each do |right|
          delta = k - left.k - right.k
          next unless concat_condition(left, right, delta, min_size)

          lps, rps = process_common_positions(left, right, delta, context)

          if lps.size >= min_size
            result << create_repetition(left, right, delta, lps, rps)
          end

          useful_cnt += 1
        end
      end

      puts "Total: #{rs.size ** 2} #{useful_cnt} #{result.size}"

      rs.concat(result)
    end

    # Removes +r+ from the per-line index of +context+ for every line listed
    # in +r.lines+.
    def self.delete_repetition(r, context)
      r.lines.each do |line|
        context.rs_by_line[line].delete_if { |item| item[0] == r }
      end
    end

    # A pair may be joined when the gap is not negative, the two repetitions
    # differ, and each has at least +positions_min_size+ positions.
    def self.concat_condition(left, right, delta, positions_min_size)
      delta >= 0 && left.id != right.id &&
        left.positions_size >= positions_min_size &&
        right.positions_size >= positions_min_size
    end

    # Finds the occurrences where +right+ starts exactly +delta+ characters
    # after the end of +left+ and the text between them contains no match of
    # TraceVisualization::FORBIDDEN_CHARS.
    #
    # The position arrays of +left+ and +right+ are only read, never
    # modified.
    #
    # Returns [left_positions, right_positions] of the accepted occurrences.
    def self.process_common_positions(left, right, delta, context)
      shift = left.length + delta

      # where +right+ would have to start for each occurrence of +left+
      expected_right = left.left_positions.map { |pos| pos + shift }

      cpr = expected_right & right.left_positions
      cpl = cpr.map { |pos| pos - shift }

      accepted = cpl.zip(cpr).reject do |lpos, rpos|
        gap = context.str[lpos + left.length...rpos]
        gap.scan(TraceVisualization::FORBIDDEN_CHARS).size != 0
      end

      [accepted.map(&:first), accepted.map(&:last)]
    end

    # Creates the joined repetition (same class as +left+) covering +left+,
    # the gap +delta+ and +right+.
    def self.create_repetition(left, right, delta, lps, rps)
      r = left.class.new(left.length + right.length + delta, lps, rps)

      r.k = left.k + right.k + delta
      r.pcount = left.pcount + right.pcount
      r.left = left
      r.right = right
      r.strict_ids = left.strict_ids + right.strict_ids

      r
    end
  end
end
@@ -0,0 +1,18 @@
1
+ require 'trace_visualization/utils'
2
+
3
module TraceVisualization
  module Repetitions
    # Bundles a source string with its line positions and a per-line index
    # of repetitions.
    class Context
      attr_accessor :str, :lines_pos, :rs_by_line

      # str - source string
      # rs  - repetitions handed to TraceVisualization::Utils.rs_by_line to
      #       fill the per-line index
      def initialize(str, rs)
        utils = TraceVisualization::Utils

        self.str = str
        self.lines_pos = utils.lines_pos(str)

        # one independent bucket per entry of lines_pos
        self.rs_by_line = Array.new(lines_pos.size) { [] }

        utils.rs_by_line(rs, lines_pos, rs_by_line)
      end

    end
  end
end
@@ -0,0 +1,81 @@
1
+ require 'trace_visualization/utils'
2
+
3
module TraceVisualization
  module RepetitionsIncrementation

    #--------------------------------------------------------------------------
    # For every repetition with k == 0 (the "core"), looks at the repetitions
    # that are exactly +k+ longer and collects the positions they share with
    # the core, either at the same start (right incrementation) or +k+ before
    # it (left incrementation). A shared set is taken into account only when
    # it has more than two positions; a new repetition is created when more
    # than one position was collected and its position hash is not yet in
    # +hashes+.
    #
    # str         - not used by this method
    # repetitions - Array of repetitions; new ones are appended to it
    # hashes      - collection of known position hashes; updated in place
    # k           - length difference between core and extended repetition
    #
    # Returns +repetitions+.
    def self.incrementation(str, repetitions, hashes, k)
      result = []

      repetitions.each do |core|
        next unless core.k == 0

        # Positions for Right Incrementation
        positions_ri = []

        # Positions for Left Incrementation
        positions_li = []

        repetitions.each do |r|
          next if r == core || core.length + k != r.length

          clps = core.left_positions
          rlps = r.left_positions

          shared_ri = clps & rlps
          shared_li = clps.map { |p| p - k } & rlps

          positions_ri |= shared_ri if shared_ri.size > 2
          positions_li |= shared_li if shared_li.size > 2
        end

        candidates = [
          [positions_ri, core.length + k, "right"],
          [positions_li, k, "left"]
        ]

        candidates.each do |positions, shift, type|
          next unless positions.size > 1

          left_positions = positions.sort
          right_positions = left_positions.map { |pos| pos + shift }

          hash = TraceVisualization::Utils.rhash(left_positions, right_positions)
          next if hashes.include?(hash)

          hashes << hash
          result << create_repetition(core, k, left_positions, right_positions, type)
        end
      end

      repetitions.concat(result)
    end

    #--------------------------------------------------------------------------
    # Builds the extended repetition (same class as +core+) of length
    # core.length + k. The side named by +type+ ("left" or anything else,
    # treated as right) is filled with a zero-length placeholder and the
    # other side is +core+.
    def self.create_repetition(core, k, left_positions, right_positions, type)
      repetition = core.class.new(core.length + k, left_positions, right_positions)
      repetition.k = k

      fake = fake_repetition(core.class, left_positions, right_positions, type)

      if type == "left"
        repetition.left = fake
        repetition.right = core
      else
        repetition.left = core
        repetition.right = fake
      end

      repetition
    end

    #---------------------------------------------------------------------------
    # Zero-length placeholder located at the left positions for type "left"
    # and at the right positions otherwise.
    def self.fake_repetition(cls, left_positions, right_positions, type)
      positions = type == "left" ? left_positions : right_positions
      cls.new(0, positions)
    end
  end # module RepetitionsIncrementation
end # module TraceVisualization
@@ -0,0 +1,83 @@
1
+ require 'trace_visualization/bwt'
2
+ require 'trace_visualization/suffix_array'
3
+ require 'trace_visualization/longest_common_prefix'
4
+ require 'trace_visualization/data/repetition'
5
+
6
module TraceVisualization
  module Repetitions

    # Finds repeats in +str+ with the PSY1 algorithm.
    #
    # str           - input sequence
    # p_min         - minimum length of a reported repeat
    # decode_result - when true (default) the result is converted to
    #                 TraceVisualization::Data::Repetition objects, otherwise
    #                 the raw output of +psy1_original+ is returned
    def self.psy1(str, p_min, decode_result = true)
      sa = TraceVisualization::SuffixArray.effective(str)
      lcp = TraceVisualization::LongestCommonPrefix.effective(str, sa, str.size)
      bwt = TraceVisualization::BurrowsWheelerTransform.bwt(str, sa, str.length)

      # honour the caller's p_min (a fixed value of 3 used to be passed here)
      result = psy1_original(lcp, bwt, p_min, str.length)
      result = decode_psy1_result(result, sa) if decode_result

      result
    end

    ##
    # PSY1 computes all the complete nonextendible repeats in 'str' of length
    # p >= p_min. Complexity: \Theta(n)
    #
    # Article: Fast Optimal Algorithms for Computing All the Repeats in a String
    # by Simon J. Puglisi, William F. Smyth, Munina Yusufu
    #
    # _LCP  - longest-common-prefix array
    # _BWT  - Burrows-Wheeler transform, indexed like _LCP
    # p_min - minimum repeat length
    # n     - number of entries to process
    #
    # Returns an Array of Hashes {:lcp, :i, :j}: a repeat of length :lcp
    # spanning the suffix-array interval :i..:j.
    def self.psy1_original(_LCP, _BWT, p_min, n)
      result = []

      lcp = -1
      lb = 0
      bwt1 = _BWT[0]

      # stack of open intervals; the bottom entry (lcp -1) is never popped
      _LB = []
      _LB.push(:lcp => lcp, :lb => lb, :bwt => bwt1)

      (0...n).each do |j|
        lb = j

        lcp = j + 1 < n ? _LCP[j + 1] : -1
        bwt2 = j + 1 < n ? _BWT[j + 1] : TraceVisualization::TERMINATION_CHAR

        bwt = le_letter(bwt1, bwt2)
        bwt1 = bwt2

        # close every interval whose lcp is larger than the current one
        while _LB.last[:lcp] > lcp
          prev = _LB.pop

          # reported only when the preceding letters of the interval differ
          # (marked by TERMINATION_CHAR) and the repeat is long enough
          if prev[:bwt] == TraceVisualization::TERMINATION_CHAR && prev[:lcp] >= p_min
            result.push(:lcp => prev[:lcp], :i => prev[:lb], :j => j)
          end

          lb = prev[:lb]
          _LB.last[:bwt] = le_letter(prev[:bwt], _LB.last[:bwt])
          bwt = le_letter(prev[:bwt], bwt)
        end

        if _LB.last[:lcp] == lcp
          _LB.last[:bwt] = le_letter(_LB.last[:bwt], bwt)
        else
          _LB.push(:lcp => lcp, :lb => lb, :bwt => bwt)
        end
      end

      result
    end

    # Combines two preceding-letter markers: the common letter when both are
    # the same ordinary letter, TERMINATION_CHAR otherwise.
    def self.le_letter(l1, l2)
      (l1 == TraceVisualization::TERMINATION_CHAR || l1 != l2) ? TraceVisualization::TERMINATION_CHAR : l1
    end

    # Converts the raw intervals of +psy1_original+ into
    # TraceVisualization::Data::Repetition objects, each built from the
    # repeat length and the sorted text positions sa[i..j].
    def self.decode_psy1_result(result, sa)
      result.map do |item|
        positions = (item[:i]..item[:j]).map { |idx| sa[idx] }
        TraceVisualization::Data::Repetition.new(item[:lcp], positions.sort)
      end
    end

  end # module Repetitions
end # module TraceVisualization