trie_suggest 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/trie_suggest.rb +198 -0
  3. metadata +44 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 50a705f68e74677a9c91f94f33696803114dcf26
4
+ data.tar.gz: fa9009a7e0b62d97e45f0f04d4c298c2f45f358e
5
+ SHA512:
6
+ metadata.gz: c0fb07e00a6a682dd7111dde817b0ec6b445ce442694c3c8ac14ce6c5802af2d34f9b0f7cd2a60864f90b62388050c8f042cc1efcaee22f4ac785c43f6dd9de4
7
+ data.tar.gz: f9de0480f7922c6b5c345b4cc49045807f6dbfb14ca7e40336d1d3a0643539a69a0a7807d10861537e4982b40d3ce2e2097f5dfc4fd5e7bf6d677171283a9881
@@ -0,0 +1,198 @@
1
+ # Trie tree class
2
+ class TrieTree
3
+ require 'natto'
4
+ require 'romaji'
5
+ require 'active_support'
6
+ require 'active_support/core_ext'
7
+ # char type https://github.com/buruzaemon/natto/wiki/Node-Parsing-char_type#appendix-d-node-parsing-and-char_type
8
+ DEFAULT = 0
9
+ SPACE = 1
10
+ KANJI = 2
11
+ SYMBOL = 3
12
+ NUMERIC = 4
13
+ ALPHA = 5
14
+ HIRAGANA = 6
15
+ KATAKANA = 7
16
+ KANJINUMERIC = 8 # 漢字数字
17
+ GREEK = 9 # ギリシャ文字
18
+ CYRILLIC = 10 # キリル文字
19
+
20
# Creates an empty trie: a root node with an empty value, an empty
# romaji => [keyword, score] lookup table, and a MeCab parser.
# The natto gem must be installed: https://github.com/buruzaemon/natto/
def initialize
  @romaji_dictionary = {}
  @root = TrieNode.new('')
  @nm = Natto::MeCab.new
end
26
+
27
+ # Trie tree node
28
# A single node of the trie.
#
#   val     - the value handed to the constructor
#   score   - numeric score, starts at 0
#   next    - Hash of child nodes, starts empty
#   is_word - accessor only; the constructor never assigns it (nil until set)
class TrieNode
  attr_accessor :val, :score, :next, :is_word

  # @param val [Object] value stored on the node
  def initialize(val)
    @val, @score, @next = val, 0, {}
  end
end
36
+
37
# Registers +keyword+ in the trie under its romaji reading.
#
# The keyword is normalized and converted to romaji; the dictionary keeps,
# per romaji reading, the keyword with the highest score seen so far. Every
# node on the path "<romaji>+" ('+' marks the end of a word) has +score+
# added to its accumulated score, so adding the same word twice adds up.
#
# @param keyword [String] word to register
# @param score [Numeric] weight of the word; values below 10 are ignored
# @return [true, nil] true when the word was inserted, nil when it was
#   rejected (blank keyword, low score, or empty after normalization)
def add_word(keyword, score)
  return if keyword.blank?
  return if score < 10

  keyword = normalize(keyword)
  return if keyword.to_s.empty?

  romaji = to_romaji(keyword)
  best = @romaji_dictionary[romaji]
  @romaji_dictionary[romaji] = [keyword, score] if best.nil? || best[1] < score

  path = "#{romaji}+"
  node = @root
  path.each_char.with_index(1) do |char, length|
    # Each node stores the whole prefix leading to it, not just one character.
    child = (node.next[char.ord] ||= TrieNode.new(path[0, length]))
    child.score += score
    node = child
  end
  true
end
57
+
58
# Returns completions for +keyword+, best score first.
#
# The keyword is normalized and converted to romaji, the trie is walked down
# to the node for that prefix, and the subtree below it is explored
# best-first: the highest-scoring pending node is always expanded next. A
# node whose value ends in '+' marks a complete word and is looked up in the
# romaji dictionary.
#
# All children of an expanded node are queued. Queuing only the single best
# child loses every sibling completion (with "abc" and "abd" stored, the
# prefix "a" would only ever produce one of them).
#
# @param keyword [String] prefix typed by the user
# @param limit [Integer] maximum number of suggestions (default 10)
# @return [Array<Array>] list of [keyword, score] pairs; empty when the
#   keyword is blank, is the terminator '+', normalizes to nothing, or
#   matches no prefix in the trie
def suggest(keyword, limit = 10)
  return [] if keyword.blank? || keyword == '+'
  return [] if limit <= 0

  romaji = to_romaji(normalize(keyword))
  # Same rule as add_word: a keyword that normalizes to nothing is rejected
  # instead of being treated as "everything below the root".
  return [] if romaji.empty?

  node = @root
  romaji.each_char do |char|
    node = node.next[char.ord]
    return [] if node.nil?
  end

  results = []
  frontier = node.next.values.sort_by { |child| -child.score }
  until frontier.empty?
    candidate = frontier.shift
    if candidate.val.end_with?('+')
      entry = @romaji_dictionary[candidate.val.chop]
      next if entry.nil?

      results << [entry[0], candidate.score]
      return results if results.size >= limit
    else
      next if candidate.next.empty?

      frontier.concat(candidate.next.values)
      frontier.sort_by! { |pending| -pending.score }
    end
  end
  results
end
89
+
90
# Looks up the stored keyword whose romaji reading equals that of +keyword+.
#
# The keyword is normalized and converted to romaji, then the trie is walked
# one character at a time. The reading counts as a stored word only when the
# node reached has a '+' (end-of-word) child.
#
# @param keyword [String] word to look up
# @return [String] the keyword registered for that reading, or '' when the
#   input is blank, is the terminator '+', or no complete word matches
def match(keyword)
  return '' if keyword.blank? || keyword == '+'

  romaji = to_romaji(normalize(keyword))
  node = @root
  romaji.each_char do |char|
    node = node.next[char.ord]
    return '' if node.nil?
  end
  return '' unless node.next.key?('+'.ord)

  # node.val is the full prefix that leads to this node, i.e. the reading.
  entry = @romaji_dictionary[node.val]
  entry ? entry[0] : ''
end
104
+
105
# Finds stored words within +max_cost+ edits of +keyword+.
#
# Each branch below the root is searched recursively via search_recursive,
# which appends its hits to one shared result list. That list is returned
# directly; returning the value of the per-branch iteration instead would
# yield the same list repeated once per root child.
#
# NOTE(review): unlike suggest and match, the keyword is compared as given
# (no normalize / to_romaji), while the trie is keyed by romaji. Confirm
# whether callers are expected to pass romaji here.
#
# @param keyword [String] possibly misspelled word
# @param max_cost [Integer] maximum edit distance to accept (default 3)
# @return [Array<Array>] entries appended by search_recursive, as
#   [keyword, score, distance]; empty for blank input or the terminator '+'
def spellcheck(keyword, max_cost = 3)
  return [] if keyword.blank? || keyword == '+'

  target = "#{keyword}+" # stored paths end with the '+' terminator too
  first_row = (0..target.size).to_a
  res = []
  @root.next.each_value do |child|
    search_recursive(child, child.val[-1], target, first_row, res, max_cost)
  end
  res
end
118
+
119
# Computes the Levenshtein (edit) distance between two sequences.
#
# Uses the two-row dynamic-programming form: +previous_row+ holds the
# distances between the first (row - 1) elements of word2 and every prefix
# of word1. Insertions, deletions and substitutions each cost 1.
#
# The loops stop at the last real element of each word; iterating one index
# further compares nil with nil and only works by accident.
#
# @param word1 [#size, #[]] first sequence (normally a String)
# @param word2 [#size, #[]] second sequence (normally a String)
# @return [Integer] minimum number of single-element edits
def levenshtein(word1, word2)
  previous_row = (0..word1.size).to_a
  (1..word2.size).each do |row|
    current_row = [row]
    (1..word1.size).each do |column|
      substitution = word1[column - 1] == word2[row - 1] ? 0 : 1
      current_row << [
        current_row[column - 1] + 1,            # insertion
        previous_row[column] + 1,               # deletion
        previous_row[column - 1] + substitution # replacement or match
      ].min
    end
    previous_row = current_row
  end
  previous_row.last
end
144
+
145
+ private
146
+
147
# Depth-first step of the spell-check search.
#
# Extends the edit-distance table by one row for +letter+ (the last
# character of +node+'s prefix), records a hit when +node+ is an end-of-word
# node within range, and descends into the children while any cell of the
# row is still within +max_cost+.
#
# @param node [TrieNode] node being visited
# @param letter [String] last character of node.val
# @param keyword [String] search term, already suffixed with '+'
# @param previous_row [Array<Integer>] distance row of the parent node
# @param res [Array] accumulator; hits are appended as
#   [keyword, score, distance]
# @param max_cost [Integer] maximum accepted edit distance
# @return [Array] the same +res+ object
def search_recursive(node, letter, keyword, previous_row, res, max_cost)
  current_row = [previous_row[0] + 1]
  (1..keyword.size).each do |column|
    substitution = keyword[column - 1] == letter ? 0 : 1
    current_row << [
      current_row[column - 1] + 1,            # insertion
      previous_row[column] + 1,               # deletion
      previous_row[column - 1] + substitution # replacement or match
    ].min
  end

  distance = current_row.last
  # A distance of 0 is an exact match and is deliberately not reported.
  if letter == '+' && distance != 0 && distance <= max_cost
    entry = @romaji_dictionary[node.val.chop]
    res << [entry[0], node.score, distance] if entry
  end

  # Descend only while some prefix alignment is still within budget.
  if current_row.min <= max_cost
    node.next.each_value do |child|
      search_recursive(child, child.val[-1], keyword, current_row, res, max_cost)
    end
  end
  res
end
172
+
173
# Canonicalizes a keyword before it is stored or looked up.
#
# Steps: replace invalid byte sequences, apply Unicode normalization, fold
# full-width digits and Latin letters to ASCII, downcase, then strip
# punctuation except '-' and '’' (a trailing '’' or '’”' is stripped too).
#
# The argument is never modified: the non-destructive scrub /
# unicode_normalize are used so the caller's string is left untouched and
# frozen strings are accepted.
#
# @param string [String, nil] raw keyword
# @return [String] normalized keyword; '' when the input is blank or longer
#   than 30 characters
def normalize(string)
  return '' if string.blank?
  # The string length determines the depth of the trie and therefore also
  # affects lookup speed; change this limit with care.
  return '' if string.size > 30

  normalized = string.scrub.unicode_normalize
  # Full-width ０-９ ａ-ｚ Ａ-Ｚ (U+FF10.., U+FF41.., U+FF21..) -> ASCII.
  normalized = normalized.tr("\uFF10-\uFF19\uFF41-\uFF5A\uFF21-\uFF3A", '0-9a-zA-Z').downcase
  normalized.gsub(/[^-’[^\p{P}]]|’$|’”$/, '')
end
182
+
183
# Converts a normalized keyword to its romaji reading.
#
# Spaces are removed first, both the ASCII space and the ideographic
# (full-width) space U+3000. Text containing Han characters is first turned
# into its reading via to_yomi; the result is then passed to
# Romaji.kana2romaji.
#
# @param string [String] normalized keyword
# @return [String] whatever Romaji.kana2romaji returns for the reading
def to_romaji(string)
  string = string.delete(" \u3000")
  string = to_yomi(string) if string =~ /\p{Han}/
  Romaji.kana2romaji(string)
end
188
+
189
# Replaces Japanese tokens of +string+ with the ninth field of their MeCab
# feature string (presumably the reading/pronunciation — depends on the
# MeCab dictionary in use; confirm).
#
# Tokens whose char_type is KANJI, HIRAGANA, KATAKANA or KANJINUMERIC use
# that feature field; all other tokens keep their surface form. When the
# feature string has no ninth field the surface is used as well, so the
# token is not silently dropped from the result.
#
# @param string [String] text to analyse
# @return [String] concatenation of the per-token readings / surfaces
def to_yomi(string)
  with_reading = [KANJI, HIRAGANA, KATAKANA, KANJINUMERIC]
  @nm.enum_parse(string).map do |node|
    next node.surface unless with_reading.include?(node.char_type)

    node.feature.split(',')[8] || node.surface
  end.join
end
198
+ end
metadata ADDED
@@ -0,0 +1,44 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: trie_suggest
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - cheung
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2018-05-01 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: a simple program for suggest or spellcheck keyword use trie tree
14
+ email: CheungYishin@gmail.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/trie_suggest.rb
20
+ homepage: https://github.com/cheungYX/suggest
21
+ licenses:
22
+ - MIT
23
+ metadata: {}
24
+ post_install_message:
25
+ rdoc_options: []
26
+ require_paths:
27
+ - lib
28
+ required_ruby_version: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ required_rubygems_version: !ruby/object:Gem::Requirement
34
+ requirements:
35
+ - - ">="
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ requirements: []
39
+ rubyforge_project:
40
+ rubygems_version: 2.5.2
41
+ signing_key:
42
+ specification_version: 4
43
+ summary: trie tree for suggest
44
+ test_files: []