trace_visualization 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +17 -0
  3. data/Gemfile +6 -0
  4. data/LICENSE +339 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +4 -0
  7. data/Rakefile +1 -0
  8. data/lib/trace_visualization/bwt.rb +32 -0
  9. data/lib/trace_visualization/bwt.rbold +32 -0
  10. data/lib/trace_visualization/data/irepetition.rb +49 -0
  11. data/lib/trace_visualization/data/repetition.rb +101 -0
  12. data/lib/trace_visualization/generators.rb +53 -0
  13. data/lib/trace_visualization/longest_common_prefix.rb +34 -0
  14. data/lib/trace_visualization/mapping.rb +120 -0
  15. data/lib/trace_visualization/reorder.rb +25 -0
  16. data/lib/trace_visualization/repetitions.rb +66 -0
  17. data/lib/trace_visualization/repetitions_concatenation.rb +134 -0
  18. data/lib/trace_visualization/repetitions_context.rb +18 -0
  19. data/lib/trace_visualization/repetitions_incrementation.rb +81 -0
  20. data/lib/trace_visualization/repetitions_psy.rb +83 -0
  21. data/lib/trace_visualization/suffix_array.rb +203 -0
  22. data/lib/trace_visualization/utils.rb +47 -0
  23. data/lib/trace_visualization/version.rb +3 -0
  24. data/lib/trace_visualization/visualization/console_color_print.rb +32 -0
  25. data/lib/trace_visualization.rb +10 -0
  26. data/spec/bwt_spec.rb +47 -0
  27. data/spec/generators_spec.rb +30 -0
  28. data/spec/longest_common_prefix_spec.rb +29 -0
  29. data/spec/mapping_spec.rb +67 -0
  30. data/spec/reorder_spec.rb +42 -0
  31. data/spec/repetitions_concatenation_spec.rb +58 -0
  32. data/spec/repetitions_incrementation_spec.rb +88 -0
  33. data/spec/repetitions_psy_spec.rb +39 -0
  34. data/spec/repetitions_spec.rb +18 -0
  35. data/spec/spec_helper.rb +19 -0
  36. data/spec/suffix_array_spec.rb +68 -0
  37. data/trace_visualization.gemspec +35 -0
  38. metadata +204 -0
@@ -0,0 +1,34 @@
1
module TraceVisualization
  module LongestCommonPrefix

    # Computes longest-common-prefix information for a suffix array, following
    # the linear-time algorithm from "Linear-Time Longest-Common-Prefix
    # Computation in Suffix Arrays and Its Applications" by Toru Kasai et al.
    #
    # The method signature and variable names are kept as in the cited paper.
    #
    # a   - indexable sequence (anything answering a[idx]) the suffix array
    #       was built from.
    # pos - suffix array: pos[r] is the start index of the suffix with rank r.
    # n   - number of elements of a (and of pos) to consider.
    #
    # Returns an Array of n Integers where entry r (for r > 0) is the length of
    # the common prefix of the suffixes starting at pos[r - 1] and pos[r];
    # entry 0 is always 0.
    def self.effective(a, pos, n)
      rank = Array.new(n, 0)
      height = Array.new(n, 0)

      # rank is the inverse permutation of pos.
      n.times { |i| rank[pos[i]] = i }

      h = 0
      n.times do |i|
        next unless rank[i] > 0

        j = pos[rank[i] - 1]

        # The bounds are checked explicitly instead of relying on an
        # out-of-range read returning nil: element types whose == cannot be
        # applied to nil would otherwise raise at the end of the sequence.
        h += 1 while i + h < n && j + h < n && a[i + h] == a[j + h]

        height[rank[i]] = h
        h -= 1 if h > 0
      end

      height
    end

  end
end
@@ -0,0 +1,120 @@
1
+ module TraceVisualization
2
+ module Mapping
3
+ require 'time'
4
+ require 'ipaddr'
5
+
6
+ require 'trace_visualization/reorder'
7
+
8
# Recognised token types mapped to the regular expressions that detect them.
# Every expression exposes the matched text through the named group `value`.
# The table and its lists are frozen so that the shared constant cannot be
# modified by accident at runtime.
PATTERNS = {

  "id" => [
    /(?<value>\[\d{3,}\])/
  ].freeze,

  "ip" => [
    /(?<value>(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))/
  ].freeze,

  "time" => [
    /(?<value>\[\d{2} [a-zA-Z]{3} \d{4} \d{2}\:\d{2}\:\d{2}\])/
  ].freeze

}.freeze
23
+
24
# A single token of a parsed string: either a recognised pattern match
# ("id", "ip", "time") or a single character ("char").
#
# Items are ordered by their +ord+ field, which is not set by the
# constructor and has to be assigned before items are compared.
class Item
  include Comparable

  attr_reader :value # integer value for comparison
  attr_reader :src   # source value
  attr_reader :type  # source type

  attr_accessor :ord # re-order value

  # src  - source text of the token.
  # type - one of "id", "ip", "time" or "char".
  #
  # Raises ArgumentError when type is not one of the supported types.
  def initialize(src, type)
    @src = src
    @type = type

    @value =
      case type
      when "id"
        # Drop the surrounding brackets before converting.
        @src[1 ... -1].to_i
      when "ip"
        IPAddr.new(src).to_i
      when "time"
        # Drop the surrounding brackets before parsing.
        Time.parse(@src[1 ... -1]).to_i
      when "char"
        src.getbyte(0)
      else
        # ArgumentError is still an Exception, so existing rescuers keep
        # working, while a plain `rescue` can now handle it as well.
        raise ArgumentError, "unknown type"
      end
  end

  # Length of the source text.
  def length
    @src.length
  end

  # Compares by +ord+. Returns nil (not comparable) when the other operand
  # does not expose +ord+, so that e.g. `item == nil` is false instead of
  # raising NoMethodError.
  def <=>(anOther)
    return nil unless anOther.respond_to?(:ord)

    @ord <=> anOther.ord
  end

  # Implicit string conversion: the source text.
  def to_str
    @src
  end
end
63
+
64
# Splits +str+ into a sequence of Items: every match of a pattern from
# PATTERNS becomes one Item of the corresponding type, every other character
# becomes a "char" Item. Equal matched texts share one Item instance. The
# resulting items are passed to TraceVisualization::Reorder.process before
# being returned.
#
# When two matches overlap (or start at the same position) the one that is
# reached first while scanning left to right wins and the overlapped match
# is ignored.
#
# Returns the Array of Items in source order.
def self.parse(str)
  map = {}
  ppos = []
  itemByPos = {}

  PATTERNS.each do |type, patterns|
    patterns.each do |pattern|
      match(str, type, pattern, map, ppos, itemByPos)
    end
  end

  ppos.sort!
  ppos.uniq!

  result = []
  i, j = 0, 0

  while i < str.size
    # Drop match positions that were already covered by a previously
    # consumed item; otherwise the cursor would be stuck on them and all
    # later matches would be emitted character by character.
    j += 1 while j < ppos.size && ppos[j] < i

    if i == ppos[j]
      item = itemByPos[ppos[j]]
      result << item
      i += item.length
      j += 1
    else
      result << Item.new(str[i], "char")
      i += 1
    end
  end

  TraceVisualization::Reorder.process(result)

  result
end
96
+
97
# Scans +str+ for every occurrence of +pattern+ and records it.
#
# str       - text to scan.
# type      - type name handed to Item.new for new matches.
# pattern   - Regexp exposing the matched text via the named group `value`.
# map       - Hash of matched text => Item; extended in place so that equal
#             texts share one Item.
# ppos      - Array collecting the start position of every match (appended
#             in place).
# itemByPos - Hash of start position => Item (filled in place).
#
# Returns nil; the results are delivered through map, ppos and itemByPos.
def self.match(str, type, pattern, map, ppos, itemByPos)
  pos = 0

  while (m = pattern.match(str, pos))
    value = m[:value]
    pos = m.begin(0)
    ppos << pos

    map[value] ||= Item.new(value, type)
    itemByPos[pos] = map[value]

    # Always advance by at least one character so that a pattern able to
    # match the empty string cannot loop forever on the same position.
    pos += [value.size, 1].max
  end
end
114
+
115
# Rebuilds a string by concatenating the to_str value of every element of
# +array+, in order. Appends into one buffer instead of allocating a new
# string per element.
def self.restore(array)
  array.each_with_object("".dup) { |c, res| res << c.to_str }
end
118
+
119
+ end
120
+ end
@@ -0,0 +1,25 @@
1
module TraceVisualization
  module Reorder

    # Assigns new values (the +ord+ field) in order to reduce the distance
    # between the min and max values. This is necessary to reduce the size of
    # the alphabet.
    #
    # Items are ranked by +value+: the smallest value gets ord 1, the next
    # distinct value 2, and so on; items with equal values share one ord.
    #
    # data - collection of objects responding to +value+ and +ord=+.
    #
    # Returns the items sorted by value; +data+ itself is not reordered.
    def self.process(data)
      idx = 0
      prev = nil

      data.sort_by(&:value).each do |item|
        if prev != item.value
          prev = item.value
          idx += 1
        end

        item.ord = idx
      end
    end
  end
end
@@ -0,0 +1,66 @@
1
module TraceVisualization
  module Repetitions

    # The naive approach to finding repetitions in the string.
    # Time complexity: O(n^2)
    #
    # str - string to analyse.
    # l   - length of the repetitions to look for.
    #
    # Returns a sorted Array of position lists (positions are 1-based); each
    # list holds the start positions of one substring of length l that occurs
    # more than once.
    def self.naive(str, l)
      result = Array.new(l + 1) { [] }
      result[0] << Array.new(str.length) { |idx| idx + 1 }

      (1 .. l).each do |i|
        result[i - 1].each do |item|
          char_counter_inc(str, item, i - 1).each do |positions|
            result[i] << positions if positions && positions.size > 1
          end
        end
      end

      result[l].sort
    end

    # The naive approach to finding repetitions in the string.
    # Time complexity: O(n^2)
    #
    # Builds the repetitions level by level (level k holds substrings of
    # length k). A position list is removed from its level when all of its
    # occurrences are continued by the same character, i.e. when the longer
    # substring has exactly the same occurrences.
    #
    # str   - string to analyse.
    # p_min - first level (substring length) to include in the result.
    #
    # Returns an Array of levels starting with level p_min; every level is an
    # Array of position lists (1-based positions). Returns [] when there is no
    # such level.
    def self.naive_all(str, p_min)
      result = [[Array.new(str.length) { |idx| idx + 1 }]]

      idx = 1
      loop do
        result << []
        extended = []

        result[idx - 1].each do |item|
          char_counter_inc(str, item, idx - 1).each do |positions|
            next unless positions && positions.size > 1

            result[idx] << positions
            extended << positions if positions == item
          end
        end

        # The removal happens only after the level has been walked
        # completely: deleting from the array while iterating over it makes
        # the iteration skip the element that follows each deleted one.
        result[idx - 1] -= extended

        break if result.last.empty?

        idx += 1
      end

      result[p_min ... -1] || []
    end

    # Groups the given start positions by the character found +offset+
    # characters after the start.
    #
    # str    - string to analyse.
    # pos    - Array of 1-based start positions.
    # offset - distance from the start position to the inspected character.
    #
    # Returns an Array indexed by character code (String#ord); an entry is
    # the list of start positions followed by that character, or nil when
    # there is none. Positions whose inspected character lies beyond the end
    # of the string are ignored.
    def self.char_counter_inc(str, pos, offset)
      counter = Array.new(256)

      pos.each do |start|
        idx = start + offset - 1
        next unless idx < str.length

        code = str[idx].ord
        (counter[code] ||= []) << start
      end

      counter
    end
  end
end
@@ -0,0 +1,134 @@
1
+ require 'trace_visualization'
2
+
3
module TraceVisualization
  module RepetitionsConcatenation

    # Concatenates pairs of repetitions that occur on the same line into new,
    # longer repetitions.
    #
    # A pair (left, right) is examined once, at the moment it has been seen
    # together on exactly :positions_min_size lines of context.rs_by_line.
    #
    # rs      - Array of repetitions; the new repetitions are appended to it.
    # k       - value the k fields of both parts plus the gap between them
    #           must not exceed.
    # context - object providing str, lines_pos and rs_by_line.
    # options - Hash; :positions_min_size (default 3) is the minimal number of
    #           positions, :counter (optional) receives [k, useful_cnt].
    #
    # Returns rs.
    def self.process(rs, k, context, options = {})
      opts = {
        :positions_min_size => 3
      }.merge options
      min_size = opts[:positions_min_size]

      result = []
      useful_cnt = 0
      pairs_cnt = {}

      context.rs_by_line.each do |item|
        (0 ... item.size).each do |i|
          (i + 1 ... item.size).each do |j|
            left, right = item[i][0], item[j][0]
            delta = k - left.k - right.k

            next unless concat_condition(left, right, delta, min_size)

            key = (left.id << 32) + right.id
            val = (pairs_cnt[key] || 0) + 1
            pairs_cnt[key] = val
            next if val != min_size

            lps, rps = process_common_positions(left, right, delta, context)

            if lps.size >= min_size
              result << create_repetition(left, right, delta, lps, rps)
            end

            useful_cnt += 1
          end
        end
      end

      options[:counter] << [k, useful_cnt] if options[:counter]

      puts "Total: #{rs.size ** 2} #{useful_cnt} #{result.size}"

      # The context is required here: process_new_repetitions takes two
      # arguments and raises ArgumentError when called with only one.
      process_new_repetitions(result, context)

      rs.concat(result)
    end

    # Registers the repetitions rs in the per-line index of the context.
    def self.process_new_repetitions(rs, context)
      TraceVisualization::Utils.rs_by_line(rs, context.lines_pos, context.rs_by_line)
    end

    # Same as process, but examines every ordered pair of repetitions from rs
    # instead of only those sharing a line. The per-line index of the context
    # is not updated.
    #
    # Returns rs with the new repetitions appended.
    def self.process_full_search(rs, k, context, options = {})
      opts = {
        :positions_min_size => 3
      }.merge options
      min_size = opts[:positions_min_size]

      result = []
      useful_cnt = 0

      rs.each do |left|
        rs.each do |right|
          delta = k - left.k - right.k
          next unless concat_condition(left, right, delta, min_size)

          lps, rps = process_common_positions(left, right, delta, context)

          if lps.size >= min_size
            result << create_repetition(left, right, delta, lps, rps)
          end

          useful_cnt += 1
        end
      end

      puts "Total: #{rs.size ** 2} #{useful_cnt} #{result.size}"

      rs.concat(result)
    end

    # Removes the repetition r from the per-line index of the context.
    def self.delete_repetition(r, context)
      r.lines.each do |line|
        context.rs_by_line[line].delete_if { |item| item[0] == r }
      end
    end

    # True when left and right are different repetitions, the gap delta is
    # not negative and both have at least positions_min_size positions.
    def self.concat_condition(left, right, delta, positions_min_size)
      delta >= 0 && left.id != right.id &&
        left.positions_size >= positions_min_size &&
        right.positions_size >= positions_min_size
    end

    # Finds the positions where an occurrence of left is followed, after a gap
    # of delta, by an occurrence of right, and drops those whose gap text
    # contains TraceVisualization::FORBIDDEN_CHARS.
    #
    # The position arrays of left and right are only read, never modified.
    #
    # Returns [left_positions, right_positions] of the surviving pairs.
    def self.process_common_positions(left, right, delta, context)
      shift = left.length + delta

      cpr = left.left_positions.map { |pos| pos + shift } & right.left_positions
      pairs = cpr.map { |rpos| [rpos - shift, rpos] }

      allowed = pairs.select do |lpos, rpos|
        gap = context.str[lpos + left.length ... rpos]
        gap.scan(TraceVisualization::FORBIDDEN_CHARS).empty?
      end

      [allowed.map(&:first), allowed.map(&:last)]
    end

    # Builds the repetition made of left, a gap of delta and right. The new
    # object has the class of left.
    def self.create_repetition(left, right, delta, lps, rps)
      r = left.class.new(left.length + right.length + delta, lps, rps)

      r.k = left.k + right.k + delta
      r.pcount = left.pcount + right.pcount
      r.left = left
      r.right = right
      r.strict_ids = left.strict_ids + right.strict_ids

      r
    end
  end
end
@@ -0,0 +1,18 @@
1
+ require 'trace_visualization/utils'
2
+
3
module TraceVisualization
  module Repetitions
    # Bundles a source string with the data derived from it by
    # TraceVisualization::Utils: the result of Utils.lines_pos and a per-line
    # list that Utils.rs_by_line fills from the given repetitions.
    class Context
      attr_accessor :str
      attr_accessor :lines_pos
      attr_accessor :rs_by_line

      # str - source string.
      # rs  - repetitions handed to Utils.rs_by_line together with the
      #       per-line lists (one empty list per entry of lines_pos).
      def initialize(str, rs)
        utils = TraceVisualization::Utils

        @str = str
        @lines_pos = utils.lines_pos(str)

        line_count = @lines_pos.size
        @rs_by_line = Array.new(line_count) { [] }

        utils.rs_by_line(rs, @lines_pos, @rs_by_line)
      end

    end
  end
end
@@ -0,0 +1,81 @@
1
+ require 'trace_visualization/utils'
2
+
3
module TraceVisualization
  module RepetitionsIncrementation

    #--------------------------------------------------------------------------
    # Extends every "core" repetition (one whose k is 0) by k positions to the
    # right and to the left, using the longer repetitions of length
    # core.length + k that start at the same positions (right) or k positions
    # earlier (left).
    #
    # str         - source string (not used by this method).
    # repetitions - Array of repetitions; new repetitions are appended to it.
    # hashes      - collection of Utils.rhash values of already known
    #               repetitions; extended in place and used to skip duplicates.
    # k           - size of the increment.
    #
    # Returns repetitions.
    def self.incrementation(str, repetitions, hashes, k)
      result = []

      repetitions.each do |core|
        next if core.k != 0

        # Values that do not depend on the inner loop are computed once.
        clps = core.left_positions
        shifted_clps = clps.map { |p| p - k }
        target_length = core.length + k

        # Positions for Right Incrementation
        positions_ri = []

        # Positions for Left Incrementation
        positions_li = []

        repetitions.each do |r|
          next if r == core || target_length != r.length

          rlps = r.left_positions

          common_positions_ri = clps & rlps
          common_positions_li = shifted_clps & rlps

          positions_ri |= common_positions_ri if common_positions_ri.size > 2
          positions_li |= common_positions_li if common_positions_li.size > 2
        end

        if positions_ri.size > 1
          left_positions = positions_ri.sort
          right_positions = left_positions.map { |pos| pos + core.length + k }
          add_increment(result, hashes, core, k, left_positions, right_positions, "right")
        end

        if positions_li.size > 1
          left_positions = positions_li.sort
          right_positions = left_positions.map { |pos| pos + k }
          add_increment(result, hashes, core, k, left_positions, right_positions, "left")
        end
      end

      repetitions.concat(result)
    end

    #--------------------------------------------------------------------------
    # Builds the incremented repetition: core plus a zero-length "fake" part
    # placed on the side given by type ("left" or anything else for right).
    def self.create_repetition(core, k, left_positions, right_positions, type)
      repetition = core.class.new(core.length + k, left_positions, right_positions)
      repetition.k = k

      fake = fake_repetition(core.class, left_positions, right_positions, type)

      if type == "left"
        repetition.left, repetition.right = fake, core
      else
        repetition.left, repetition.right = core, fake
      end

      repetition
    end

    #---------------------------------------------------------------------------
    # Builds a zero-length repetition of class cls located at left_positions
    # for type "left" and at right_positions otherwise.
    def self.fake_repetition(cls, left_positions, right_positions, type)
      cls.new(0, type == "left" ? left_positions : right_positions)
    end

    #---------------------------------------------------------------------------
    # Appends the incremented repetition to result unless a repetition with the
    # same Utils.rhash value has been registered in hashes before.
    def self.add_increment(result, hashes, core, k, left_positions, right_positions, type)
      hash = TraceVisualization::Utils.rhash(left_positions, right_positions)
      return if hashes.include?(hash)

      hashes << hash
      result << create_repetition(core, k, left_positions, right_positions, type)
    end
    private_class_method :add_increment
  end # module RepetitionsIncrementation
end # module TraceVisualization
@@ -0,0 +1,83 @@
1
+ require 'trace_visualization/bwt'
2
+ require 'trace_visualization/suffix_array'
3
+ require 'trace_visualization/longest_common_prefix'
4
+ require 'trace_visualization/data/repetition'
5
+
6
module TraceVisualization
  module Repetitions
    # Finds repeats in str with the PSY1 algorithm (see psy1_original), using
    # the suffix array, longest-common-prefix and Burrows-Wheeler helpers of
    # this library.
    #
    # str           - sequence to analyse.
    # p_min         - minimal length of the reported repeats.
    # decode_result - when true (default) the raw result is converted into
    #                 TraceVisualization::Data::Repetition objects, otherwise
    #                 the raw hashes of psy1_original are returned.
    def self.psy1(str, p_min, decode_result = true)
      sa = TraceVisualization::SuffixArray.effective(str)
      lcp = TraceVisualization::LongestCommonPrefix.effective(str, sa, str.size)
      bwt = TraceVisualization::BurrowsWheelerTransform.bwt(str, sa, str.length)

      # The caller's p_min is forwarded; a fixed threshold would silently
      # ignore the argument.
      result = psy1_original(lcp, bwt, p_min, str.length)
      result = decode_psy1_result(result, sa) if decode_result

      result
    end

    ##
    # PSY1 computes all the complete nonextendible repeats in 'str' of length
    # p >= p_min. Complexity: \Theta(n)
    #
    # Article: Fast Optimal Algorithms for Computing All the Repeats in a String
    # by Simon J. Puglisi, William F. Smyth, Munina Yusufu
    #
    # _LCP  - longest-common-prefix array.
    # _BWT  - Burrows-Wheeler transform, indexed like _LCP.
    # p_min - minimal length of the reported repeats.
    # n     - number of entries to process.
    #
    # Returns an Array of Hashes { :lcp, :i, :j }: :lcp is the length of the
    # repeat, :i .. :j the range of suffix array indexes of its occurrences.
    def self.psy1_original(_LCP, _BWT, p_min, n)
      result = []

      lcp = -1
      lb = 0
      bwt1 = _BWT[0]

      # Stack of open intervals; the bottom entry has lcp -1 and is never
      # popped because no processed lcp value is smaller than -1.
      _LB = [{ :lcp => lcp, :lb => lb, :bwt => bwt1 }]

      n.times do |j|
        lb = j

        lcp = j + 1 < n ? _LCP[j + 1] : -1
        bwt2 = j + 1 < n ? _BWT[j + 1] : TraceVisualization::TERMINATION_CHAR

        bwt = le_letter(bwt1, bwt2)
        bwt1 = bwt2

        while _LB.last[:lcp] > lcp
          prev = _LB.pop

          if prev[:bwt] == TraceVisualization::TERMINATION_CHAR && prev[:lcp] >= p_min
            result.push(:lcp => prev[:lcp], :i => prev[:lb], :j => j)
          end

          lb = prev[:lb]
          _LB.last[:bwt] = le_letter(prev[:bwt], _LB.last[:bwt])
          bwt = le_letter(prev[:bwt], bwt)
        end

        if _LB.last[:lcp] == lcp
          _LB.last[:bwt] = le_letter(_LB.last[:bwt], bwt)
        else
          _LB.push(:lcp => lcp, :lb => lb, :bwt => bwt)
        end
      end

      result
    end

    # Combines two letters: returns l1 when both are the same letter and l1 is
    # not the termination character, otherwise the termination character.
    def self.le_letter(l1, l2)
      (l1 == TraceVisualization::TERMINATION_CHAR || l1 != l2) ? TraceVisualization::TERMINATION_CHAR : l1
    end

    # Converts the raw result of psy1_original into
    # TraceVisualization::Data::Repetition objects: the suffix array indexes
    # :i .. :j are translated through sa and sorted.
    def self.decode_psy1_result(result, sa)
      result.map do |item|
        positions = (item[:i] .. item[:j]).map { |idx| sa[idx] }
        TraceVisualization::Data::Repetition.new(item[:lcp], positions.sort)
      end
    end

  end # module Repetitions
end # module TraceVisualization