dawg 0.0.2 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 243a3162eee48baf49a16fc03b7950b55b6583fb
4
- data.tar.gz: 21e7e03637c9418a297dc218b37449989e0da18d
3
+ metadata.gz: 44e60a8bd509c089b4b3d8d251491d798a5fac8b
4
+ data.tar.gz: 69e7885dfcbaadda4167d55abc7c678227a9c395
5
5
  SHA512:
6
- metadata.gz: 283153800b269b13e87b169b0972221612eaa9c4e76b9ebb783985ce7b62dad3a4bcde2c17a33364a653daec504376c828816774d37d13765b7a647df193d653
7
- data.tar.gz: e5ef8892af3655be6eff55abbbac3701b2e51aa0dd67b8a3b3063ca5ea6f7bdc5b61f0bac865d173d8769c861022cad04ec4f00cbd6f17cd0e12768134752fe2
6
+ metadata.gz: ecd9e5bf52c4211e139933f64bf769444d3370f41e74163b3868cfa6959afec72798cf86e300da850a63b4eed4eec88e3507592ee744fc145c98836bea47c8fa
7
+ data.tar.gz: c7d5bec04d7d9567f1cb0db9002172b4fa3a8879a5116ea8bb5fc29492a144faef3f3a6d2d36232c3759e59ef03022a4af041b34ce82280b651e08a577362ab4
@@ -1,167 +1,30 @@
1
- class DawgNode
2
- @@next_id = 0
3
- attr_accessor :edges,:final,:id
4
- def initialize
5
- @id = @@next_id
6
- @@next_id += 1
7
- @final = false
8
- @edges = {}
9
- end
10
-
11
- def to_s
12
- arr = []
13
- if @final
14
- arr<<"1"
15
- else
16
- arr<<"0"
17
- end
1
+ require 'stringio'
2
+ require_relative 'dawg/serialization'
3
+ require_relative 'dawg/finder'
4
+ require_relative 'dawg/word'
5
+ require_relative 'dawg/node/node'
6
+ require_relative 'dawg/node/memory_node'
7
+ require_relative 'dawg/dawg/dawg'
8
+ require_relative 'dawg/dawg/memory_dawg'
18
9
 
19
- for (label, node) in @edges
20
- arr << label
21
- arr << node.id.to_s
22
- end
10
# Top-level namespace of the gem plus two convenience entry points
# (`Dawg.new` and `Dawg.load`).
module Dawg
  # Layout of the serialized file, in bytes.
  # Two 4-byte counters (node count, edge count) precede the node table.
  NODE_START = 8
  # One node record: three 4-byte ints, a 1-byte flag and an 8-byte hash.
  NODE_SIZE = 21
  # One edge record: an 8-byte hash, a 4-byte label and a 4-byte index.
  EDGE_SIZE = 16
  extend self

  # Builds an empty, writable graph.
  #
  # Inside this module the constant `Dawg` resolves to the class defined in
  # lib/dawg/dawg/dawg.rb (loaded by the require_relative lines at the top of
  # the file), not to the module itself.
  #
  # @return [Dawg::Dawg]
  def new
    Dawg.new
  end

  # Loads a graph previously written with `save`.
  #
  # @param filename [String] path of the serialized graph
  # @param type [Symbol] :small keeps the raw bytes and decodes nodes on
  #   demand (MemoryDawg); :fast rebuilds every node object up front (Dawg)
  # @return [Dawg::MemoryDawg, Dawg::Dawg]
  # @raise [ArgumentError] when +type+ is neither :small nor :fast
  #   (the previous implementation silently returned nil in that case)
  def load(filename, type = :small)
    case type
    when :small
      MemoryDawg.load(filename)
    when :fast
      Dawg.load(filename)
    else
      raise ArgumentError, "unknown dawg type #{type.inspect} (expected :small or :fast)"
    end
  end
end
@@ -0,0 +1,143 @@
1
module Dawg
  # Writable, fully in-memory graph. Words are added with #insert in sorted
  # order, #finish folds the remaining nodes, and #save / .load persist the
  # result in the fixed-width binary layout described by NODE_START,
  # NODE_SIZE and EDGE_SIZE.
  class Dawg
    extend Serialization
    include Serialization
    include Finder

    attr_accessor :minimized_nodes, :root

    def initialize
      @previous_word = ''
      @root = Node.new
      # [parent, letter, child] triples not yet checked for duplication.
      @unchecked_nodes = []
      # Unique nodes, keyed by the value of Node#hash at registration time.
      # Insertion order doubles as the node's index in the saved file.
      @minimized_nodes = {}
      set_the_node(@root)
    end

    # Adds a word to the graph.
    #
    # @param word [String]
    # @raise [RuntimeError] when +word+ sorts before the previously inserted
    #   word
    def insert(word)
      # TODO: find a way to support insertion without requiring sorted input.
      if word < @previous_word
        raise 'Error: Words must be inserted in alphabetical order.'
      end

      # Length of the prefix shared with the previously inserted word.
      limit = [word.length, @previous_word.length].min
      common_prefix = 0
      common_prefix += 1 while common_prefix < limit && word[common_prefix] == @previous_word[common_prefix]

      # Check the unchecked nodes for redundant nodes, proceeding from the
      # last one down to the common prefix size, truncating the list there.
      minimize(common_prefix)

      # Add the suffix, starting from the correct node mid-way through the
      # graph.
      node = @unchecked_nodes.empty? ? @root : @unchecked_nodes.last[2]

      word.chars.drop(common_prefix).each do |letter|
        next_node = Node.new
        node.edges[letter] = next_node
        @unchecked_nodes << [node, letter, next_node]
        node = next_node
      end

      node.final = true
      @previous_word = word
    end

    # Folds every pending node and registers the root as the last entry of
    # the node table (both loaders locate the root by that position).
    def finish
      minimize 0
      @minimized_nodes[@root.hash] = @root
    end

    # Walks the pending list from the deepest node up to +down_to+, replacing
    # each child that duplicates an already registered node and registering
    # the others.
    #
    # @param down_to [Integer] number of pending entries to keep
    def minimize(down_to)
      (@unchecked_nodes.length - 1).downto(down_to) do |i|
        parent, letter, child = @unchecked_nodes[i]
        key = child.hash
        if @minimized_nodes.key?(key)
          parent.edges[letter] = @minimized_nodes[key]
        else
          child.index = @minimized_nodes.size
          @minimized_nodes[key] = child
        end
        @unchecked_nodes.pop
      end
    end

    # @return [Integer] number of registered nodes
    def node_count
      @minimized_nodes.length
    end

    # @return [Integer] number of edges leaving registered nodes
    def edge_count
      @minimized_nodes.values.inject(0) { |total, node| total + node.edges.length }
    end

    def inspect
      'Dawg'
    end

    # Writes the graph: two counters, then one fixed-size record per node,
    # then every node's edges in the same node order.
    #
    # The file is opened in binary mode so the packed bytes are written
    # verbatim on every platform.
    #
    # @param filename [String]
    def save(filename)
      # Node#hash is derived from String#hash, whose value differs between
      # Ruby processes. For a graph that was loaded from disk the table keys
      # still hold the hashes stored in the file, so edge targets are written
      # with the key they are registered under instead of a recomputed hash.
      key_of = {}.compare_by_identity
      @minimized_nodes.each { |key, node| key_of[node] = key }

      File.open(filename, 'wb') do |f|
        write_int(node_count, f) # overall node count
        write_int(edge_count, f) # overall edge count
        edges_pos = 0
        @minimized_nodes.each do |key, node|
          write_int(edges_pos, f)
          write_int(node.edges.length, f)
          write_int(node.id, f)
          write_bool(node.final, f)
          write_bigint(key, f)
          # byte offset of the next node's edges inside the edge section
          edges_pos += EDGE_SIZE * node.edges.length
        end
        @minimized_nodes.each_value do |node|
          node.edges.each do |letter, target|
            # Targets that were never registered fall back to their own hash,
            # which is what was written for every target before.
            write_bigint(key_of.fetch(target) { target.hash }, f)
            write_char(letter, f)
            write_int(target.index, f)
          end
        end
      end
    end

    # Rebuilds a graph from a file written by #save.
    #
    # @param filename [String]
    # @return [Dawg::Dawg]
    def self.load(filename)
      dawg = Dawg.new
      File.open(filename, 'rb') do |f|
        node_total = load_int(f)
        load_int(f) # overall edge count; every node record carries its own
        node_total.times do |index|
          load_int(f) # edge offset, not needed when reading sequentially
          edge_count = load_int(f)
          id = load_int(f)
          final = load_bool(f)
          key = load_bigint(f)
          # Records are stored in index order, so the position restores the
          # index; without it a re-saved graph would write -1 for every edge.
          dawg.minimized_nodes[key] =
            Node.new(id: id, final: final, edge_count: edge_count, index: index)
        end

        dawg.minimized_nodes.each_value do |node|
          node.edge_count.times do
            target_key = load_bigint(f)
            letter = load_char(f)
            load_int(f) # target index; targets are resolved by key here
            node.edges[letter] = dawg.minimized_nodes[target_key]
          end
        end

        # #finish registers the root last; copy its edges onto the fresh root
        # the finder already points at. An empty node table has no root.
        stored_root = dawg.minimized_nodes.values.last
        if stored_root
          stored_root.edges.each { |letter, node| dawg.root.edges[letter] = node }
        end
      end
      dawg
    end
  end
end
@@ -0,0 +1,69 @@
1
module Dawg
  # Read-only graph that keeps the serialized bytes in a seekable IO and
  # decodes node and edge records on demand instead of building Node objects.
  class MemoryDawg
    include Serialization
    include Finder
    attr_accessor :slice, :node_count, :edge_count

    # @param slice [#pos=, #read] seekable IO holding a file written by
    #   Dawg#save
    def initialize(slice)
      @slice = slice
      @node_count = get_node_count
      @edge_count = get_edge_count
      set_the_node(root)
    end

    # The root is the last record of the node table.
    #
    # @return [Dawg::MemoryNode]
    def root
      @root = get_node_by_index(@node_count - 1)
    end

    # Copies +size+ bytes starting at byte offset +start+ into a fresh
    # StringIO so the load_* helpers can consume them sequentially.
    def make_io(start, size)
      @slice.pos = start
      StringIO.new(@slice.read(size))
    end

    # @return [Integer] node counter stored in bytes 0-3
    def get_node_count
      load_int(make_io(0, 4))
    end

    # @return [Integer] edge counter stored in bytes 4-7
    def get_edge_count
      load_int(make_io(4, 4))
    end

    # Decodes the node record at +index+.
    #
    # @param index [Integer] position in the node table
    # @return [Dawg::MemoryNode]
    def get_node_by_index(index)
      record = make_io(NODE_START + (NODE_SIZE * index), NODE_SIZE)
      edges_pos = load_int(record)
      edge_count = load_int(record)
      load_int(record) # stored node id, not kept by MemoryNode
      final = load_bool(record)
      hash = load_bigint(record)
      MemoryNode.new(self, index, edge_count, final, hash, edges_pos)
    end

    # Yields the label and target node index of every edge leaving the node
    # at +index+.
    #
    # @yieldparam char [String] edge label
    # @yieldparam node_index [Integer] index of the target node
    def each_edge(index)
      # Only the first two ints of the node record are needed here.
      header = make_io(NODE_START + (NODE_SIZE * index), 8)
      edges_pos = load_int(header)
      edge_count = load_int(header)

      # The edge section starts right after the node table.
      edge_start = NODE_START + NODE_SIZE * @node_count
      edges = make_io(edge_start + edges_pos, edge_count * EDGE_SIZE)

      edge_count.times do
        load_bigint(edges) # target hash, unused when navigating by index
        char = load_char(edges)
        node_index = load_int(edges)
        yield char, node_index
      end
    end

    # Reads the whole file into memory and wraps it.
    #
    # The file is read in binary mode so the packed bytes are not altered by
    # text-mode newline or encoding handling.
    #
    # @param filename [String]
    # @return [Dawg::MemoryDawg]
    def self.load(filename)
      MemoryDawg.new(StringIO.new(File.binread(filename)))
    end
  end
end
@@ -0,0 +1,52 @@
1
module Dawg
  # Query methods shared by the graph classes. The including object supplies
  # the start node through #set_the_node; nodes must respond to #[] (follow
  # an edge by letter, nil when absent), #each_edge (yield each outgoing
  # letter) and #final.
  module Finder
    # Remembers the node every search starts from.
    def set_the_node(node)
      @the_node = node
    end

    # Tells whether +word+ is stored in the graph.
    #
    # A word whose path does not exist yields false. (It used to yield [''],
    # which is truthy and therefore read as "found" in a condition.)
    #
    # @param word [String]
    # @return [Boolean] the final flag of the node reached by +word+, or
    #   false when the path does not exist
    def lookup(word)
      node = find_node(word)
      node ? node.final : false
    end

    # Collects all stored words that start with the given prefix.
    #
    # @param word [String] the prefix
    # @return [Array<Word>] the prefix itself when it is a stored word,
    #   followed by its stored extensions; empty when the prefix is not in
    #   the graph (previously a one-element array holding an empty String)
    def query(word)
      node = find_node(word)
      return [] unless node

      results = [Word.new(word, node.final)]
      results.concat(get_childs(node).map { |suffix| Word.new(word) + suffix })
      results.select(&:final)
    end

    # Enumerates every suffix reachable from +node+, final or not; callers
    # filter on Word#final.
    #
    # @return [Array<Word>]
    def get_childs(node)
      results = []
      node.each_edge do |letter|
        next_node = node[letter]
        next if next_node.nil?

        results.concat(get_childs(next_node).map { |suffix| Word.new(letter) + suffix })
        results << Word.new(letter, next_node.final)
      end
      results
    end

    private

    # Follows +word+ letter by letter from the start node.
    #
    # @return the node reached, or nil when some letter has no edge
    def find_node(word)
      node = @the_node
      word.each_char do |letter|
        node = node[letter]
        return nil if node.nil?
      end
      node
    end
  end
end
@@ -0,0 +1,31 @@
1
module Dawg
  # Lightweight handle on one node record. It stores the decoded header
  # fields and asks its owner (+io+) for edges whenever they are needed.
  class MemoryNode
    attr_accessor :index, :io, :edge_count, :final, :hash, :edges_pos

    # @param io owner object answering #each_edge(index) and
    #   #get_node_by_index(index)
    # @param index position of this node in the owner's node table
    # @param edge_count number of outgoing edges
    # @param final whether a word ends at this node
    # @param hash hash value stored with the node record
    # @param edges_pos offset of this node's edges, as stored in the record
    def initialize(io, index, edge_count, final, hash, edges_pos)
      @io, @index, @edge_count = io, index, edge_count
      @final, @hash, @edges_pos = final, hash, edges_pos
    end

    # Follows the edge labelled +letter+.
    #
    # @return the node the owner returns for the matching edge, or nil when
    #   no edge carries that label
    def [](letter)
      @io.each_edge(@index) do |char, node_index|
        return @io.get_node_by_index(node_index) if letter == char
      end

      nil
    end

    # Yields the label of every outgoing edge.
    def each_edge(&block)
      @io.each_edge(@index) { |char, _node_index| yield char }
    end
  end
end
@@ -0,0 +1,52 @@
1
module Dawg
  # Mutable graph node used while building a graph and when a file is loaded
  # eagerly. Two nodes are interchangeable when they have the same final flag
  # and the same labelled edges to the same target ids; #to_s encodes exactly
  # that, and #hash / #== / #eql? are all derived from it.
  class Node
    # Counter handing out default ids; advanced on every construction, also
    # when an explicit id is supplied.
    @@next_id = 0

    attr_accessor :edges, :id, :edge_count, :index, :final

    # @param id [Integer] node id (defaults to the next counter value)
    # @param final [Boolean] whether a word ends at this node
    # @param edge_count [Integer] number of edges announced by a stored
    #   record; it is not kept in sync with #edges
    # @param index [Integer] position in the node table, -1 while unassigned
    def initialize(id: @@next_id, final: false, edge_count: 0, index: -1)
      @id = id
      @@next_id += 1
      @final = final
      @edge_count = edge_count
      @index = index
      @edges = {}
    end

    # Signature of the node: the final flag followed by every edge label and
    # the id of its target, joined with "_".
    #
    # @return [String]
    def to_s
      parts = [@final ? '1' : '0']
      @edges.each { |label, node| parts << label.to_s << node.id.to_s }
      parts.join('_')
    end

    def hash
      to_s.hash
    end

    def ==(other)
      to_s == other.to_s
    end

    # Hash lookups test candidates with #eql?; keeping it in line with #hash
    # and #== makes equal nodes usable as interchangeable Hash keys.
    alias eql? ==

    # @return [Node, nil] target of the edge labelled +letter+
    def [](letter)
      @edges[letter]
    end

    # Yields the label of every outgoing edge.
    def each_edge
      @edges.each_key { |letter| yield letter }
    end
  end
end
@@ -0,0 +1,37 @@
1
module Dawg
  # Fixed-width binary read/write helpers for the on-disk format.
  #
  # Integers are encoded little-endian explicitly. The bytes are identical to
  # what the native-endian directives produce on little-endian hosts, and a
  # file no longer depends on the byte order of the machine that wrote it.
  #
  # Every load_* helper raises EOFError when the stream ends before the value
  # is complete.
  module Serialization
    # Reads a 32-bit signed integer.
    def load_int(io)
      read_exact(io, 4).unpack('l<')[0]
    end

    # Appends +int+ as a 32-bit signed integer.
    def write_int(int, io)
      io << [int].pack('l<')
    end

    # Reads a 64-bit signed integer.
    def load_bigint(io)
      read_exact(io, 8).unpack('q<')[0]
    end

    # Appends +int+ as a 64-bit signed integer.
    def write_bigint(int, io)
      io << [int].pack('q<')
    end

    # Appends one byte: 1 when +var+ is truthy, 0 otherwise.
    def write_bool(var, io)
      io << [var ? 1 : 0].pack('c')
    end

    # Reads one byte and reports whether it equals 1.
    def load_bool(io)
      read_exact(io, 1).unpack('c')[0] == 1
    end

    # Appends +char+ in a 4-byte, NUL-padded slot.
    def write_char(char, io)
      io << [char].pack('Z4')
    end

    # Reads a 4-byte slot, drops the NUL padding and tags the result UTF-8.
    def load_char(io)
      read_exact(io, 4).unpack('Z4')[0].force_encoding('utf-8')
    end

    private

    # Reads exactly +size+ bytes from +io+.
    #
    # @raise [EOFError] when fewer than +size+ bytes are available
    def read_exact(io, size)
      bytes = io.read(size)
      if bytes.nil? || bytes.bytesize < size
        raise EOFError, "expected #{size} byte(s), got #{bytes ? bytes.bytesize : 0}"
      end
      bytes
    end
  end
end
@@ -0,0 +1,22 @@
1
module Dawg
  # A piece of text gathered while walking the graph, paired with a flag
  # telling whether a stored word ends at that point.
  class Word
    attr_accessor :final, :word

    # @param word [String] the text
    # @param final [Boolean] whether the text is a complete stored word
    def initialize(word = '', final = false)
      @word, @final = word, final
    end

    # Appends +other+ to this word. The result takes its final flag from
    # +other+, the right-hand operand.
    #
    # @param other [Word]
    # @return [Word]
    def +(other)
      combined = @word + other.word
      Word.new(combined, other.final)
    end

    # @return [String] the text itself
    def to_s
      @word
    end

    # Same text as #to_s, without quoting.
    def inspect
      to_s
    end
  end
end
metadata CHANGED
@@ -1,15 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dawg
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Maksatbek Manurov
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-05-29 00:00:00.000000000 Z
12
- dependencies: []
11
+ date: 2017-01-30 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bindata
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2.3'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '2.3'
13
27
  description: Basic deterministic acyclic finite state automaton in ruby
14
28
  email:
15
29
  - maksat.mansurov@gmail.com
@@ -18,6 +32,13 @@ extensions: []
18
32
  extra_rdoc_files: []
19
33
  files:
20
34
  - lib/dawg.rb
35
+ - lib/dawg/dawg/dawg.rb
36
+ - lib/dawg/dawg/memory_dawg.rb
37
+ - lib/dawg/finder.rb
38
+ - lib/dawg/node/memory_node.rb
39
+ - lib/dawg/node/node.rb
40
+ - lib/dawg/serialization.rb
41
+ - lib/dawg/word.rb
21
42
  homepage: https://github.com/baltavay/dawg
22
43
  licenses:
23
44
  - MIT
@@ -38,7 +59,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
38
59
  version: '0'
39
60
  requirements: []
40
61
  rubyforge_project:
41
- rubygems_version: 2.2.2
62
+ rubygems_version: 2.4.5.1
42
63
  signing_key:
43
64
  specification_version: 4
44
65
  summary: Deterministic acyclic finite state automaton