trie_suggest 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/trie_suggest.rb +198 -0
- metadata +44 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 50a705f68e74677a9c91f94f33696803114dcf26
|
4
|
+
data.tar.gz: fa9009a7e0b62d97e45f0f04d4c298c2f45f358e
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: c0fb07e00a6a682dd7111dde817b0ec6b445ce442694c3c8ac14ce6c5802af2d34f9b0f7cd2a60864f90b62388050c8f042cc1efcaee22f4ac785c43f6dd9de4
|
7
|
+
data.tar.gz: f9de0480f7922c6b5c345b4cc49045807f6dbfb14ca7e40336d1d3a0643539a69a0a7807d10861537e4982b40d3ce2e2097f5dfc4fd5e7bf6d677171283a9881
|
data/lib/trie_suggest.rb
ADDED
@@ -0,0 +1,198 @@
|
|
1
|
+
# Trie tree class
|
2
|
+
class TrieTree
|
3
|
+
require 'natto'
|
4
|
+
require 'romaji'
|
5
|
+
require 'active_support'
|
6
|
+
require 'active_support/core_ext'
|
7
|
+
# MeCab char_type codes compared against node.char_type in to_yomi; reference table: https://github.com/buruzaemon/natto/wiki/Node-Parsing-char_type#appendix-d-node-parsing-and-char_type
|
8
|
+
DEFAULT = 0
|
9
|
+
SPACE = 1
|
10
|
+
KANJI = 2
|
11
|
+
SYMBOL = 3
|
12
|
+
NUMERIC = 4
|
13
|
+
ALPHA = 5
|
14
|
+
HIRAGANA = 6
|
15
|
+
KATAKANA = 7
|
16
|
+
KANJINUMERIC = 8 # kanji numerals
|
17
|
+
GREEK = 9 # Greek letters
|
18
|
+
CYRILLIC = 10 # Cyrillic letters
|
19
|
+
|
20
|
+
# Builds an empty suggestion index.
#
# Prepares three pieces of state:
# * @nm                - the MeCab tagger used by to_yomi; this needs the
#                        natto gem (https://github.com/buruzaemon/natto/)
# * @romaji_dictionary - romaji key => [keyword, score] lookup table
# * @root              - root trie node, representing the empty prefix
def initialize
  @root, @romaji_dictionary = TrieNode.new(''), {}
  @nm = Natto::MeCab.new
end
|
26
|
+
|
27
|
+
# A single node of the trie.
#
# Attributes:
# * val     - the prefix spelled by the path from the root to this node
# * score   - accumulated score; add_word adds every inserted word's score
#             to each node along that word's path
# * next    - child nodes, keyed by the code point (Integer) of the next
#             character
# * is_word - free-form flag; nothing in this file sets or reads it, so it
#             simply starts out as false
class TrieNode
  attr_accessor :val, :score, :next, :is_word

  # @param val [String] prefix represented by this node
  def initialize(val)
    @val = val
    @score = 0
    @next = {}
    # Initialise explicitly so the accessor returns a boolean, not nil.
    @is_word = false
  end
end
|
36
|
+
|
37
|
+
# Inserts a keyword into the index.
#
# The keyword is normalized and converted to romaji. The romaji form, with a
# '+' terminator appended, is the path stored in the trie; every node on the
# path has +score+ added to its accumulated score. @romaji_dictionary keeps,
# per romaji key, the [keyword, score] pair with the highest score seen.
#
# @param keyword [String] word to index; blank keywords are ignored
# @param score [Numeric] weight of the word
# @param min_score [Numeric] words scoring below this are ignored
#   (defaults to 10)
# @return [nil]
def add_word(keyword, score, min_score = 10)
  return if keyword.blank?
  return if score < min_score

  keyword = normalize(keyword)
  return if keyword.to_s.empty?

  romaji = to_romaji(keyword)
  best = @romaji_dictionary[romaji]
  @romaji_dictionary[romaji] = [keyword, score] if best.nil? || best[1] < score

  # '+' marks the end of a word, so a terminal node's val is "<romaji>+".
  path = "#{romaji}+"
  node = @root
  path.each_char.with_index(1) do |char, length|
    child = (node.next[char.ord] ||= TrieNode.new(path[0, length]))
    child.score += score
    node = child
  end
  nil
end
|
57
|
+
|
58
|
+
# Returns completions for a keyword prefix, best score first.
#
# The keyword is normalized and converted to romaji, the trie is walked
# along that romaji prefix, and the subtree below it is then explored
# greedily: candidates are kept in a queue ordered by descending score,
# and each non-terminal candidate contributes only its single
# highest-scoring child. The queue is capped at +limit+ entries.
#
# NOTE(review): because only the best child of each node is followed, the
# result holds at most one completion per branch directly below the
# prefix - confirm that this is the intended behaviour.
#
# @param keyword [String] prefix typed by the user
# @param limit [Integer] maximum number of suggestions (positive;
#   defaults to 10)
# @return [Array<Array>] [surface, score] pairs; empty when the keyword is
#   blank, normalizes to nothing, or matches no prefix in the trie
def suggest(keyword, limit = 10)
  return [] if keyword.blank? || keyword == '+'

  res = []
  romaji = to_romaji(normalize(keyword))
  # An empty prefix would start the traversal at the root and return
  # suggestions unrelated to the input.
  return res if romaji.empty?

  # Descend to the node for the prefix: one step per character.
  node = @root
  romaji.each_char do |char|
    node = node.next[char.ord]
    return res if node.nil?
  end

  by_score_desc = ->(nodes) { nodes.sort_by { |n| -n.score } }
  queue = by_score_desc.call(node.next.values)
  until queue.empty?
    candidate = queue.shift
    if candidate.val.end_with?('+')
      # Terminal node: its val minus the '+' marker is the dictionary key.
      entry = @romaji_dictionary[candidate.val.chop]
      next if entry.nil?

      res << [entry[0], candidate.score]
      return res if res.size >= limit
    else
      next if candidate.next.empty?

      queue << candidate.next.values.max_by(&:score)
      queue = by_score_desc.call(queue).first(limit)
    end
  end
  res
end
|
89
|
+
|
90
|
+
# Looks up the stored surface form for a complete keyword.
#
# The keyword is normalized and converted to romaji; the trie is walked
# along that romaji string, and the word counts as present only when the
# node reached has a '+' (end-of-word) child.
#
# @param keyword [String] word to look up
# @return [String] the surface form kept in @romaji_dictionary, or '' when
#   the keyword is blank or is not a stored word
def match(keyword)
  return '' if keyword.blank? || keyword == '+'

  romaji = to_romaji(normalize(keyword))

  node = @root
  romaji.each_char do |char|
    node = node.next[char.ord]
    return '' if node.nil?
  end
  return '' if node.next['+'.ord].nil?

  # A '+' child can also exist because a longer stored key contains a
  # literal '+', in which case there is no dictionary entry for this prefix.
  entry = @romaji_dictionary[node.val]
  entry.nil? ? '' : entry[0]
end
|
104
|
+
|
105
|
+
# Collects stored words within +max_cost+ edits of the keyword.
#
# Every branch below the root is searched with search_recursive, which
# carries one Levenshtein row per trie level. The keyword gets the same
# '+' end-of-word marker that trie paths carry. Unlike suggest and match,
# the keyword is not passed through normalize / to_romaji here, so it is
# compared as given against the romaji keys stored in the trie.
#
# @param keyword [String] possibly misspelled word
# @param max_cost [Integer] maximum edit distance accepted (defaults to 3)
# @return [Array] [] for a blank keyword; otherwise one element per child
#   of the root, each being the SAME shared result array of
#   [surface, score, cost] entries.
#   NOTE(review): returning the shared array once would be more natural,
#   but the shape is kept as-is for existing callers.
def spellcheck(keyword, max_cost = 3)
  return [] if keyword.blank? || keyword == '+'

  target = "#{keyword}+"
  # Row for the empty trie prefix: distance to each keyword prefix length.
  first_row = (0..target.size).to_a
  res = []

  @root.next.each_value.map do |child|
    search_recursive(child, child.val[-1], target, first_row, res, max_cost)
  end
end
|
118
|
+
|
119
|
+
# Computes the Levenshtein (edit) distance between two words.
#
# Uses the row-by-row dynamic programme: each row holds the distances
# between one prefix of word2 and every prefix of word1, so only the
# previous row has to be kept.
#
# @param word1 [String] first word (anything responding to size and [])
# @param word2 [String] second word
# @return [Integer] minimum number of single-element insertions, deletions
#   and substitutions that turn word1 into word2
def levenshtein(word1, word2)
  # Distance from the empty prefix of word2 to each prefix of word1.
  row = (0..word1.size).to_a

  (1..word2.size).each do |r|
    previous_row = row
    row = [r]

    (1..word1.size).each do |c|
      substitution = previous_row[c - 1] + (word1[c - 1] == word2[r - 1] ? 0 : 1)
      row << [row[c - 1] + 1, previous_row[c] + 1, substitution].min
    end
  end

  row.last
end
|
144
|
+
|
145
|
+
private
|
146
|
+
|
147
|
+
# Depth-first helper for spellcheck: extends the Levenshtein table by one
# row for +node+ and recurses into its children.
#
# @param node [TrieNode] node being visited
# @param letter [String] last character of node.val, i.e. the character
#   this trie level contributes
# @param keyword [String] target word, already carrying the '+' terminator
# @param previous_row [Array<Integer>] distance row of the parent node
# @param res [Array] accumulator; matches are appended as
#   [surface, node score, distance]
# @param max_cost [Integer] maximum edit distance accepted
# @return [Array] the same +res+ array
def search_recursive(node, letter, keyword, previous_row, res, max_cost)
  current_row = [previous_row[0] + 1]
  (1..keyword.size).each do |column|
    substitution = previous_row[column - 1] + (keyword[column - 1] == letter ? 0 : 1)
    current_row << [current_row[column - 1] + 1, previous_row[column] + 1, substitution].min
  end

  # A '+' node closes a stored word. Distance 0 is skipped because an exact
  # match is not a spelling correction.
  distance = current_row.last
  if letter == '+' && distance != 0 && distance <= max_cost
    entry = @romaji_dictionary[node.val.chop]
    res << [entry[0], node.score, distance] unless entry.nil?
  end

  # Prune: descend only while some cell of this row is still within budget.
  if current_row.min <= max_cost
    node.next.each_value do |child|
      search_recursive(child, child.val[-1], keyword, current_row, res, max_cost)
    end
  end
  res
end
|
172
|
+
|
173
|
+
# Canonicalises a keyword before it is indexed or looked up.
#
# Steps: reject blank or over-long input, replace invalid byte sequences,
# apply Unicode normalization, fold full-width alphanumerics to ASCII,
# lowercase, and strip punctuation.
#
# The argument is never modified, so frozen strings are accepted.
#
# @param string [String] raw keyword
# @return [String] normalized keyword; '' when the input is blank or longer
#   than 30 characters
def normalize(string)
  return '' if string.blank?
  # The string length determines the depth of the trie and therefore also
  # affects traversal speed; change this limit with care.
  return '' if string.size > 30

  cleaned = string.scrub.unicode_normalize
  # Fold full-width digits and letters to their ASCII forms, then lowercase.
  cleaned = cleaned.tr('０-９ａ-ｚＡ-Ｚ', '0-9a-zA-Z').downcase
  # Removes punctuation other than '-' and '’', plus a '’' (or '’”') at
  # the end of a line.
  cleaned.gsub(/[^-’[^\p{P}]]|’$|’”$/, '')
end
|
182
|
+
|
183
|
+
# Converts a normalized keyword to the romaji key used in the trie.
#
# ASCII spaces are dropped first. Text containing Han characters is run
# through to_yomi before being handed to Romaji.kana2romaji; any other
# text goes to Romaji.kana2romaji directly.
#
# @param string [String] normalized keyword
# @return the value returned by Romaji.kana2romaji
def to_romaji(string)
  compact = string.delete(' ')
  kana = compact =~ /\p{Han}/ ? to_yomi(compact) : compact
  Romaji.kana2romaji(kana)
end
|
188
|
+
|
189
|
+
# Replaces Japanese tokens in +string+ with a field taken from the MeCab
# feature string, leaving every other token as written.
#
# Tokens whose char_type is KANJI, HIRAGANA, KATAKANA or KANJINUMERIC are
# replaced by the 9th comma-separated feature field; all other tokens keep
# their surface form.
#
# NOTE(review): index 8 is presumably the pronunciation field of an
# IPADIC-style dictionary - confirm against the dictionary in use.
#
# @param string [String] text containing Japanese characters
# @return [String] the concatenated per-token results
def to_yomi(string)
  japanese_types = [KANJI, HIRAGANA, KATAKANA, KANJINUMERIC]

  @nm.enum_parse(string).map do |node|
    next node.surface unless japanese_types.include?(node.char_type)

    # When the feature string is too short to contain the field, keep the
    # surface form; a nil here would make join silently drop the token.
    node.feature.split(',')[8] || node.surface
  end.join
end
|
198
|
+
end
|
metadata
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: trie_suggest
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- cheung
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2018-05-01 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: a simple program for suggest or spellcheck keyword use trie tree
|
14
|
+
email: CheungYishin@gmail.com
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- lib/trie_suggest.rb
|
20
|
+
homepage: https://github.com/cheungYX/suggest
|
21
|
+
licenses:
|
22
|
+
- MIT
|
23
|
+
metadata: {}
|
24
|
+
post_install_message:
|
25
|
+
rdoc_options: []
|
26
|
+
require_paths:
|
27
|
+
- lib
|
28
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
29
|
+
requirements:
|
30
|
+
- - ">="
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
34
|
+
requirements:
|
35
|
+
- - ">="
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
requirements: []
|
39
|
+
rubyforge_project:
|
40
|
+
rubygems_version: 2.5.2
|
41
|
+
signing_key:
|
42
|
+
specification_version: 4
|
43
|
+
summary: trie tree for suggest
|
44
|
+
test_files: []
|