dawg 0.0.2 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 243a3162eee48baf49a16fc03b7950b55b6583fb
4
- data.tar.gz: 21e7e03637c9418a297dc218b37449989e0da18d
3
+ metadata.gz: 44e60a8bd509c089b4b3d8d251491d798a5fac8b
4
+ data.tar.gz: 69e7885dfcbaadda4167d55abc7c678227a9c395
5
5
  SHA512:
6
- metadata.gz: 283153800b269b13e87b169b0972221612eaa9c4e76b9ebb783985ce7b62dad3a4bcde2c17a33364a653daec504376c828816774d37d13765b7a647df193d653
7
- data.tar.gz: e5ef8892af3655be6eff55abbbac3701b2e51aa0dd67b8a3b3063ca5ea6f7bdc5b61f0bac865d173d8769c861022cad04ec4f00cbd6f17cd0e12768134752fe2
6
+ metadata.gz: ecd9e5bf52c4211e139933f64bf769444d3370f41e74163b3868cfa6959afec72798cf86e300da850a63b4eed4eec88e3507592ee744fc145c98836bea47c8fa
7
+ data.tar.gz: c7d5bec04d7d9567f1cb0db9002172b4fa3a8879a5116ea8bb5fc29492a144faef3f3a6d2d36232c3759e59ef03022a4af041b34ce82280b651e08a577362ab4
@@ -1,167 +1,30 @@
1
- class DawgNode
2
- @@next_id = 0
3
- attr_accessor :edges,:final,:id
4
- def initialize
5
- @id = @@next_id
6
- @@next_id += 1
7
- @final = false
8
- @edges = {}
9
- end
10
-
11
- def to_s
12
- arr = []
13
- if @final
14
- arr<<"1"
15
- else
16
- arr<<"0"
17
- end
1
+ require 'stringio'
2
+ require_relative 'dawg/serialization'
3
+ require_relative 'dawg/finder'
4
+ require_relative 'dawg/word'
5
+ require_relative 'dawg/node/node'
6
+ require_relative 'dawg/node/memory_node'
7
+ require_relative 'dawg/dawg/dawg'
8
+ require_relative 'dawg/dawg/memory_dawg'
18
9
 
19
- for (label, node) in @edges
20
- arr << label
21
- arr << node.id.to_s
22
- end
10
+ module Dawg
11
+ NODE_START = 8
12
+ NODE_SIZE = 21
13
+ EDGE_SIZE = 16
14
+ extend self
23
15
 
24
- arr.join("_")
16
+ def new
17
+ Dawg.new
25
18
  end
26
19
 
27
- def hash
28
- to_s.hash
29
- end
30
-
31
- def eql?(other)
32
- to_s == other.to_s
33
- end
34
- def inspect
35
- "to_s"
36
- end
37
- end
38
-
39
- class Dawg
40
- def initialize
41
- @previousWord = ""
42
- @root = DawgNode.new
43
-
44
- # Here is a list of nodes that have not been checked for duplication.
45
- @uncheckedNodes = []
46
-
47
- # Here is a list of unique nodes that have been checked for
48
- # duplication.
49
- @minimizedNodes = {}
50
- end
51
-
52
- def save(filename)
53
- data = Marshal.dump(self)
54
- File.open(filename, 'w') { |file| file.write(data) }
55
- end
56
-
57
- def self.load(filename)
58
- dawg = Marshal.load( File.open(filename).read )
59
- end
60
-
61
- def insert( word )
62
- if word < @previousWord
63
- raise "Error: Words must be inserted in alphabetical order."
64
- end
65
-
66
- # find common prefix between word and previous word
67
- commonPrefix = 0
68
- for i in 0..[word.length-1, @previousWord.length-1].min
69
- break if word[i] != @previousWord[i]
70
- commonPrefix += 1
71
- end
72
-
73
- # Check the uncheckedNodes for redundant nodes, proceeding from last
74
- # one down to the common prefix size. Then truncate the list at that
75
- # point.
76
- _minimize( commonPrefix )
77
-
78
- # add the suffix, starting from the correct node mid-way through the
79
- # graph
80
- if @uncheckedNodes.length == 0
81
- node = @root
82
- else
83
- node = @uncheckedNodes[-1][2]
84
- end
85
-
86
- for letter in word.split("")[commonPrefix..-1]
87
- nextNode = DawgNode.new
88
- node.edges[letter] = nextNode
89
- @uncheckedNodes<< [node, letter, nextNode]
90
- node = nextNode
20
+ def load(filename, type = :small)
21
+ return case type
22
+ when :small
23
+ MemoryDawg.load(filename)
24
+ when :fast
25
+ Dawg.load(filename)
91
26
  end
92
-
93
- node.final = true
94
- @previousWord = word
95
- end
96
- def finish
97
- # minimize all uncheckedNodes
98
- _minimize( 0 )
99
- end
100
-
101
- def _minimize(downTo)
102
- # proceed from the leaf up to a certain point
103
- for i in (@uncheckedNodes.length - 1).downto(downTo)
104
- parent, letter, child = @uncheckedNodes[i]
105
- if @minimizedNodes.has_key? child
106
- # replace the child with the previously encountered one
107
- parent.edges[letter] = @minimizedNodes[child]
108
- else
109
- # add the state to the minimized nodes.
110
- @minimizedNodes[child] = child
111
- end
112
- @uncheckedNodes.pop
113
- end
114
- end
115
-
116
- def lookup(word)
117
- node = @root
118
- for letter in word.split("")
119
- return false if !node.edges.has_key? letter
120
- node = node.edges[letter]
121
- end
122
- node.final
123
- end
124
-
125
- def find_similar(word)
126
- node = @root
127
- for letter in word.split("")
128
- return [] if !node.edges.has_key? letter
129
- node = node.edges[letter]
130
- end
131
- results = get_recuirsively_all(node)
132
-
133
- return [word].product(results).map(&:join)
27
+ dawg
134
28
  end
135
-
136
- def get_recuirsively_all(node)
137
- suffixes = []
138
29
 
139
- node.edges.each do |key,value|
140
- results = get_recuirsively_all(value)
141
-
142
- # result.flatten! if result.length==2
143
- results.each do |result|
144
- suffixes << [[key] + [result]].flatten.join
145
- end
146
-
147
- suffixes << key if results.empty?
148
-
149
-
150
- end
151
- return suffixes
152
- end
153
- def nodeCount
154
- @minimizedNodes.length
155
- end
156
-
157
- def edgeCount
158
- count = 0
159
- for key,node in @minimizedNodes
160
- count += node.edges.length
161
- end
162
- count
163
- end
164
- def inspect
165
- "Dawg"
166
- end
167
- end
30
+ end
@@ -0,0 +1,143 @@
1
+ module Dawg
2
+ class Dawg
3
+ extend Serialization
4
+ include Serialization
5
+ include Finder
6
+
7
+ attr_accessor :minimized_nodes, :root
8
+
9
+ def initialize
10
+ @previous_word = ''
11
+ @root = Node.new
12
+ @unchecked_nodes = []
13
+ @minimized_nodes = {}
14
+ set_the_node(@root)
15
+ end
16
+
17
+ def insert(word)
18
+ if word < @previous_word #TODO there's should be the way to make adding without this
19
+ raise 'Error: Words must be inserted in alphabetical order.'
20
+ end
21
+
22
+ # find common prefix between word and previous word
23
+ common_prefix = 0
24
+ (0..[word.length-1, @previous_word.length-1].min).each do |i|
25
+ break if word[i] != @previous_word[i]
26
+ common_prefix += 1
27
+ end
28
+
29
+ # Check the uncheckedNodes for redundant nodes, proceeding from last
30
+ # one down to the common prefix size. Then truncate the list at that
31
+ # point.
32
+ minimize(common_prefix)
33
+
34
+ # add the suffix, starting from the correct node mid-way through the
35
+ # graph
36
+ if @unchecked_nodes.length == 0
37
+ node = @root
38
+ else
39
+ node = @unchecked_nodes[-1][2]
40
+ end
41
+
42
+ word.split('')[common_prefix..-1].each do |letter|
43
+ next_node = Node.new
44
+ node.edges[letter] = next_node
45
+ @unchecked_nodes << [node, letter, next_node]
46
+ node = next_node
47
+ end
48
+
49
+ node.final = true
50
+ @previous_word = word
51
+ end
52
+
53
+ def finish
54
+ minimize 0
55
+ @minimized_nodes[@root.hash] = @root
56
+ end
57
+
58
+ def minimize(down_to)
59
+ (@unchecked_nodes.length - 1).downto(down_to) do |i|
60
+ parent, letter, child = @unchecked_nodes[i]
61
+ if @minimized_nodes.has_key? child.hash
62
+ parent.edges[letter] = @minimized_nodes[child.hash]
63
+ else
64
+ child.index = @minimized_nodes.size
65
+ @minimized_nodes[child.hash] = child
66
+ end
67
+ @unchecked_nodes.pop
68
+ end
69
+ end
70
+
71
+ def node_count
72
+ @minimized_nodes.length
73
+ end
74
+
75
+ def edge_count
76
+ count = 0
77
+ @minimized_nodes.each do |hash, node|
78
+ count += node.edges.length
79
+ end
80
+ count
81
+ end
82
+
83
+ def inspect
84
+ 'Dawg'
85
+ end
86
+
87
+
88
+ def save(filename)
89
+ dawg = self
90
+ File.open(filename,'w') do |f|
91
+ write_int(dawg.node_count, f) # overall nodes count
92
+ write_int(dawg.edge_count, f) # overall edge count
93
+ edges_pos = 0
94
+ dawg.minimized_nodes.each do |hash, node|
95
+ write_int(edges_pos, f)
96
+ write_int(node.edges.keys.length, f)
97
+ write_int(node.id, f)
98
+ write_bool(node.final, f)
99
+ write_bigint(hash, f)
100
+ edges_pos += EDGE_SIZE * node.edges.keys.length # position of node's edges in a file
101
+ end
102
+ dawg.minimized_nodes.each do |hash, node|
103
+ node.edges.each do |letter, n|
104
+ write_bigint(n.hash, f)
105
+ write_char(letter, f)
106
+ write_int(n.index,f)
107
+ end
108
+ end
109
+ end
110
+ end
111
+
112
+ def self.load(filename)
113
+ dawg = Dawg.new
114
+ File.open(filename) do |f|
115
+ minimized_nodes_count = load_int(f)
116
+ overall_edges_count = load_int(f)
117
+ minimized_nodes_count.times do
118
+ edges_pos = load_int(f)
119
+ edge_count = load_int(f)
120
+ id = load_int(f)
121
+ final = load_bool(f)
122
+ hash = load_bigint(f)
123
+ node = Node.new(id: id, final: final, edge_count: edge_count)
124
+ dawg.minimized_nodes[hash] = node
125
+ end
126
+
127
+ dawg.minimized_nodes.each do |hash, node|
128
+ node.edge_count.times do
129
+ hash2 = load_bigint(f)
130
+ letter = load_char(f)
131
+ node_index = load_int(f)
132
+ node.edges[letter] = dawg.minimized_nodes[hash2]
133
+ end
134
+ end
135
+ root_key = dawg.minimized_nodes.keys.last
136
+ dawg.minimized_nodes[root_key].edges.each do |letter, node|
137
+ dawg.root.edges[letter] = node
138
+ end
139
+ end
140
+ dawg
141
+ end
142
+ end
143
+ end
@@ -0,0 +1,69 @@
1
+ module Dawg
2
+ class MemoryDawg
3
+ include Serialization
4
+ include Finder
5
+ attr_accessor :slice, :node_count, :edge_count
6
+
7
+ def initialize(slice)
8
+ @slice = slice
9
+ @node_count = get_node_count
10
+ @edge_count = get_edge_count
11
+ set_the_node(root)
12
+ end
13
+
14
+
15
+
16
+ def root
17
+ @root = get_node_by_index(@node_count - 1)
18
+ end
19
+
20
+ def make_io(start, size)
21
+ @slice.pos = start
22
+ StringIO.new(@slice.read(size))
23
+ end
24
+
25
+ def get_node_count
26
+ load_int(make_io(0,4))
27
+ end
28
+
29
+ def get_edge_count
30
+ load_int(make_io(4,4))
31
+ end
32
+
33
+ def get_node_by_index(index)
34
+ pos = NODE_START + (NODE_SIZE * index)
35
+ io = make_io(pos, NODE_SIZE)
36
+ edges_pos = load_int(io)
37
+ edge_count = load_int(io)
38
+ id = load_int(io)
39
+ final = load_bool(io)
40
+ hash = load_bigint(io)
41
+ MemoryNode.new(self, index, edge_count, final, hash, edges_pos)
42
+ end
43
+
44
+ def each_edge(index, &block)
45
+ pos = NODE_START + (NODE_SIZE * index)
46
+ io = make_io(pos, 8)
47
+ edges_pos = load_int(io)
48
+ edge_count = load_int(io)
49
+ edge_start = NODE_START + NODE_SIZE * @node_count
50
+ position = edge_start + edges_pos
51
+ io = make_io(position, edge_count * EDGE_SIZE)
52
+
53
+ edge_count.times do
54
+ hash = load_bigint(io)
55
+ char = load_char(io)
56
+ node_index = load_int(io)
57
+ yield char, node_index
58
+ end
59
+ end
60
+
61
+ def self.load(filename)
62
+ File.open(filename) do |f|
63
+ slice = StringIO.new(f.read)
64
+ return MemoryDawg.new(slice)
65
+ end
66
+ end
67
+ end
68
+
69
+ end
@@ -0,0 +1,52 @@
1
+ module Dawg
2
+ module Finder
3
+ def set_the_node(node)
4
+ @the_node = node
5
+ end
6
+
7
+ def lookup(word)
8
+ node = @the_node
9
+ word.each_char do |letter|
10
+ next_node = node[letter]
11
+ if next_node != nil
12
+ node = next_node
13
+ next
14
+ else
15
+ return ['']
16
+ end
17
+ end
18
+ node.final
19
+ end
20
+
21
+ # get all words with given prefix
22
+ def query(word)
23
+ node = @the_node
24
+ results = []
25
+ word.split("").each do |letter|
26
+ next_node = node[letter]
27
+ if next_node != nil
28
+ node = next_node
29
+ next
30
+ else
31
+ return ['']
32
+ end
33
+ end
34
+ results << Word.new(word, node.final)
35
+ results += get_childs(node).map{|s| Word.new(word) + s}
36
+ results.select{|r| r.final}.map{|r| r}
37
+ end
38
+
39
+ def get_childs(node)
40
+ results = []
41
+ node.each_edge do |letter|
42
+ next_node = node[letter]
43
+ if next_node != nil
44
+ results += get_childs(next_node).map{|s| Word.new(letter) + s}
45
+ results << Word.new(letter, next_node.final)
46
+ end
47
+ end
48
+ results
49
+ end
50
+ end
51
+
52
+ end
@@ -0,0 +1,31 @@
1
+ module Dawg
2
+ class MemoryNode
3
+ attr_accessor :index, :io, :edge_count, :final, :hash, :edges_pos
4
+ def initialize(io, index , edge_count , final, hash, edges_pos )
5
+ @io = io
6
+ @index = index
7
+ @edge_count = edge_count
8
+ @final = final
9
+ @hash = hash
10
+ @edges_pos = edges_pos
11
+ end
12
+
13
+ def [](letter)
14
+ @io.each_edge @index do |char, node_index|
15
+ if letter == char
16
+ return @io.get_node_by_index(node_index)
17
+ end
18
+ end
19
+
20
+ nil
21
+ end
22
+
23
+ def each_edge(&block)
24
+ @io.each_edge @index do |char, node_index|
25
+ yield char
26
+ end
27
+
28
+ end
29
+ end
30
+
31
+ end
@@ -0,0 +1,52 @@
1
+ module Dawg
2
+ class Node
3
+ @@next_id = 0
4
+ @id = 0
5
+ attr_accessor :edges, :id, :edge_count, :index, :final
6
+
7
+ def initialize(id: @@next_id, final: false, edge_count: 0, index: -1)
8
+ @id = id
9
+ @@next_id += 1
10
+ @final = final
11
+ @edge_count = edge_count
12
+ @index = index
13
+ @edges = {}
14
+ end
15
+
16
+ def to_s
17
+ arr = []
18
+ if @final
19
+ arr<<'1'
20
+ else
21
+ arr<<'0'
22
+ end
23
+
24
+ @edges.each do |label,node|
25
+ arr << label.to_s
26
+ arr << node.id.to_s
27
+ end
28
+
29
+ arr.join('_')
30
+
31
+ end
32
+
33
+ def hash
34
+ to_s.hash
35
+ end
36
+
37
+ def ==(other)
38
+ to_s == other.to_s
39
+ end
40
+
41
+ def [](letter)
42
+ @edges[letter]
43
+ end
44
+
45
+ def each_edge(&block)
46
+ @edges.each do |letter, node|
47
+ yield letter
48
+ end
49
+ end
50
+ end
51
+
52
+ end
@@ -0,0 +1,37 @@
1
+ module Dawg
2
+ module Serialization
3
+ def load_int(io)
4
+ io.read(4).unpack('l')[0]
5
+ end
6
+
7
+ def write_int(int, io) #32bit signed integer
8
+ io << [int].pack('l')
9
+ end
10
+
11
+ def load_bigint(io) #64bit signed integer
12
+ io.read(8).unpack('q')[0]
13
+ end
14
+
15
+ def write_bigint(int, io)
16
+ io << [int].pack('q')
17
+ end
18
+
19
+ def write_bool(var, io)
20
+ bool = var ? 1 : 0
21
+ io << [bool].pack('c')
22
+ end
23
+
24
+ def load_bool(io)
25
+ bool = io.read(1).unpack('c')[0]
26
+ bool == 1 ? true : false
27
+ end
28
+
29
+ def write_char(char, io)
30
+ io << [char].pack('Z4')
31
+ end
32
+
33
+ def load_char(io)
34
+ io.read(4).unpack('Z4')[0].force_encoding('utf-8')
35
+ end
36
+ end
37
+ end
@@ -0,0 +1,22 @@
1
+ module Dawg
2
+ class Word
3
+ attr_accessor :final, :word
4
+
5
+ def initialize( word = '', final = false)
6
+ @word = word
7
+ @final = final
8
+ end
9
+
10
+ def +(other)
11
+ Word.new(@word + other.word, other.final)
12
+ end
13
+
14
+ def to_s
15
+ @word
16
+ end
17
+
18
+ def inspect
19
+ @word
20
+ end
21
+ end
22
+ end
metadata CHANGED
@@ -1,15 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dawg
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Maksatbek Manurov
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-05-29 00:00:00.000000000 Z
12
- dependencies: []
11
+ date: 2017-01-30 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bindata
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2.3'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '2.3'
13
27
  description: Basic deterministic acyclic finite state automaton in ruby
14
28
  email:
15
29
  - maksat.mansurov@gmail.com
@@ -18,6 +32,13 @@ extensions: []
18
32
  extra_rdoc_files: []
19
33
  files:
20
34
  - lib/dawg.rb
35
+ - lib/dawg/dawg/dawg.rb
36
+ - lib/dawg/dawg/memory_dawg.rb
37
+ - lib/dawg/finder.rb
38
+ - lib/dawg/node/memory_node.rb
39
+ - lib/dawg/node/node.rb
40
+ - lib/dawg/serialization.rb
41
+ - lib/dawg/word.rb
21
42
  homepage: https://github.com/baltavay/dawg
22
43
  licenses:
23
44
  - MIT
@@ -38,7 +59,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
38
59
  version: '0'
39
60
  requirements: []
40
61
  rubyforge_project:
41
- rubygems_version: 2.2.2
62
+ rubygems_version: 2.4.5.1
42
63
  signing_key:
43
64
  specification_version: 4
44
65
  summary: Deterministic acyclic finite state automaton