trie_suggest 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/trie_suggest.rb +198 -0
- metadata +44 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 50a705f68e74677a9c91f94f33696803114dcf26
|
4
|
+
data.tar.gz: fa9009a7e0b62d97e45f0f04d4c298c2f45f358e
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: c0fb07e00a6a682dd7111dde817b0ec6b445ce442694c3c8ac14ce6c5802af2d34f9b0f7cd2a60864f90b62388050c8f042cc1efcaee22f4ac785c43f6dd9de4
|
7
|
+
data.tar.gz: f9de0480f7922c6b5c345b4cc49045807f6dbfb14ca7e40336d1d3a0643539a69a0a7807d10861537e4982b40d3ce2e2097f5dfc4fd5e7bf6d677171283a9881
|
data/lib/trie_suggest.rb
ADDED
@@ -0,0 +1,198 @@
|
|
1
|
+
# Trie tree class
|
2
|
+
class TrieTree
|
3
|
+
require 'natto'
|
4
|
+
require 'romaji'
|
5
|
+
require 'active_support'
|
6
|
+
require 'active_support/core_ext'
|
7
|
+
# MeCab char_type codes as exposed by natto nodes; see https://github.com/buruzaemon/natto/wiki/Node-Parsing-char_type#appendix-d-node-parsing-and-char_type
|
8
|
+
DEFAULT = 0
|
9
|
+
SPACE = 1
|
10
|
+
KANJI = 2
|
11
|
+
SYMBOL = 3
|
12
|
+
NUMERIC = 4
|
13
|
+
ALPHA = 5
|
14
|
+
HIRAGANA = 6
|
15
|
+
KATAKANA = 7
|
16
|
+
KANJINUMERIC = 8 # kanji numerals
|
17
|
+
GREEK = 9 # Greek letters
|
18
|
+
CYRILLIC = 10 # Cyrillic letters
|
19
|
+
|
20
|
+
# Builds an empty trie together with the lookup table and the MeCab
# tagger the other methods rely on.
def initialize
  # MeCab binding from the natto gem, which has to be installed
  # separately: https://github.com/buruzaemon/natto/
  @nm = Natto::MeCab.new
  # romaji spelling => [normalized keyword, score]
  @romaji_dictionary = {}
  # The root stands for the empty prefix.
  @root = TrieNode.new('')
end
|
26
|
+
|
27
|
+
# One node of the trie.
class TrieNode
  # Text stored on the node (add_word stores the romaji prefix that leads here).
  attr_accessor :val
  # Numeric weight; starts at 0 and is increased by add_word.
  attr_accessor :score
  # Child nodes, keyed by the code point of the next character.
  attr_accessor :next
  # Accessor only; never assigned by the constructor.
  attr_accessor :is_word

  # @param val [String] text to store on the node
  def initialize(val)
    @next = {}
    @score = 0
    @val = val
  end
end
|
36
|
+
|
37
|
+
# Registers +keyword+ in the trie with the given +score+.
#
# Nothing is added when the keyword is blank, when the score is below 10,
# or when normalization leaves an empty string. The keyword is stored under
# its romaji spelling followed by a '+' terminator, and +score+ is added to
# every node along that path.
#
# @param keyword [String] word to index
# @param score [Numeric] weight of the word
def add_word(keyword, score)
  return if keyword.blank? || score < 10

  keyword = normalize(keyword)
  return if keyword.to_s.empty?

  romaji = to_romaji(keyword)

  # For each romaji spelling keep only the highest-scoring surface form.
  known = @romaji_dictionary[romaji]
  @romaji_dictionary[romaji] = [keyword, score] if known.nil? || known[1] < score

  # '+' marks the end of a complete word inside the trie.
  path = romaji + '+'
  node = @root
  (0..path.size - 1).each do |index|
    key = path[index].ord
    node.next[key] ||= TrieNode.new(path[0..index])
    node = node.next[key]
    node.score += score
  end
end
|
57
|
+
|
58
|
+
# Returns completions for the prefix +keyword+.
#
# The keyword is normalized and converted to romaji, the trie is walked down
# to the node for that prefix, and the subtree is then explored by
# descending score. At most ten candidates are kept in the frontier and at
# most ten results are returned.
#
# @param keyword [String] prefix typed by the user
# @return [Array<Array>] [surface form, score] pairs in the order found;
#   empty when the keyword is blank, is '+', normalizes to an empty string,
#   or matches no prefix in the trie
def suggest(keyword)
  return [] if keyword.blank? || keyword == '+'

  keyword = normalize(keyword)
  # normalize returns '' for over-long or punctuation-only input. Without
  # this guard the walk below would stay on the root and return top-scored
  # words unrelated to the input (add_word applies the same guard).
  return [] if keyword.to_s.empty?

  romaji = to_romaji(keyword)
  return [] if romaji.empty?

  res = []

  # Walk down to the prefix node: one hash lookup per romaji character.
  current_node = @root
  romaji.each_char do |letter|
    current_node = current_node.next[letter.ord]
    return res if current_node.nil?
  end

  # Orders [key, node] pairs from highest to lowest score.
  by_score = ->(pairs) { pairs.sort_by { |_, child| child.score }.reverse }

  next_level = by_score.call(current_node.next)
  until next_level.empty?
    _, node = next_level.shift
    if node.val.end_with?('+')
      # Terminator node: the value without '+' is a complete romaji word.
      entry = @romaji_dictionary[node.val.chop]
      next if entry.nil?

      res << [entry[0], node.score]
      return res if res.size > 9
    else
      next if node.next.empty?

      # Only the best child of an inner node is followed.
      next_level << by_score.call(node.next).first
      next_level = by_score.call(next_level).first(10)
    end
  end
  res
end
|
89
|
+
|
90
|
+
# Looks up the stored surface form for an exact keyword.
#
# The keyword is normalized and converted to romaji; it matches only when
# the trie contains that full romaji path followed by the '+' terminator.
#
# @param keyword [String] word to look up
# @return [String] the keyword recorded for that romaji spelling, or '' when
#   the input is blank, is '+', or is not a complete word in the trie
def match(keyword)
  return '' if keyword.blank? || keyword == '+'

  romaji = to_romaji(normalize(keyword))

  node = @root
  romaji.each_char do |letter|
    node = node.next[letter.ord]
    return '' if node.nil?
  end

  # A word ends here only if the node has a '+' child.
  if node.next['+'.ord].nil?
    ''
  else
    @romaji_dictionary[node.val][0]
  end
end
|
104
|
+
|
105
|
+
# Finds indexed words whose romaji spelling is within +max_cost+ edits of
# +keyword+.
#
# The keyword is compared as given (it is neither normalized nor converted
# to romaji here), so it presumably should already be romaji - confirm with
# callers. Exact matches (distance 0) are excluded by search_recursive.
#
# @param keyword [String] possibly misspelled word
# @param max_cost [Integer] largest edit distance to accept
# @return [Array<Array>] [surface form, score, distance] triples in trie
#   traversal order; empty when the keyword is blank or is '+'
def spellcheck(keyword, max_cost = 3)
  return [] if keyword.blank? || keyword == '+'

  # Words are stored with a trailing '+' terminator, so compare against that.
  target = keyword + '+'

  # First row of the edit-distance table: distance from the empty prefix.
  first_row = (0..target.size).to_a

  res = []
  # Every branch appends into the same accumulator. Return it once rather
  # than once per root child (the previous map returned it N times).
  @root.next.each_value do |child|
    search_recursive(child, child.val[-1], target, first_row, res, max_cost)
  end
  res
end
|
118
|
+
|
119
|
+
# Computes the Levenshtein (edit) distance between two sequences.
#
# Only +size+ and integer indexing are used, so strings and arrays both work.
# Two rows of the table are kept at a time.
#
# @param word1 [String, Array] first sequence
# @param word2 [String, Array] second sequence
# @return [Integer] minimum number of single-element insertions, deletions
#   and substitutions turning one sequence into the other
def levenshtein(word1, word2)
  # Row 0: distance from the empty prefix of word2 to each prefix of word1.
  previous_row = (0..word1.size).to_a

  (1..word2.size).each do |row|
    current_row = [row]

    (1..word1.size).each do |column|
      insert_cost = current_row[column - 1] + 1
      delete_cost = previous_row[column] + 1
      same = word1[column - 1] == word2[row - 1]
      replace_cost = previous_row[column - 1] + (same ? 0 : 1)
      current_row << [insert_cost, delete_cost, replace_cost].min
    end

    previous_row = current_row
  end

  previous_row.last
end
|
144
|
+
|
145
|
+
private
|
146
|
+
|
147
|
+
# Extends the edit-distance table by one trie node and recurses into its
# children, collecting near matches into +res+.
#
# @param node [TrieNode] node being visited
# @param letter [String] last character of the node's value
# @param keyword [String] search term, including the '+' terminator
# @param previous_row [Array<Integer>] table row computed for the parent
# @param res [Array] accumulator receiving [surface, score, distance] triples
# @param max_cost [Integer] largest edit distance to accept
# @return [Array] the same +res+ accumulator
def search_recursive(node, letter, keyword, previous_row, res, max_cost)
  current_row = [previous_row[0] + 1]

  (1..keyword.size).each do |column|
    substitution = previous_row[column - 1]
    substitution += 1 unless keyword[column - 1] == letter
    candidates = [current_row[column - 1] + 1, previous_row[column] + 1, substitution]
    current_row << candidates.min
  end

  # A terminator node within budget is a hit; distance 0 (exact match) is skipped.
  distance = current_row[-1]
  if distance != 0 && distance <= max_cost && letter == '+'
    entry = @romaji_dictionary[node.val.chop]
    res << [entry[0], node.score, distance] unless entry.nil?
  end

  # Descend only while some cell of the row is still within budget.
  if current_row.min <= max_cost
    node.next.each_value do |child|
      search_recursive(child, child.val[-1], keyword, current_row, res, max_cost)
    end
  end

  res
end
|
172
|
+
|
173
|
+
# Returns a cleaned-up copy of +string+ for indexing and lookup.
#
# Invalid bytes are scrubbed, the text is Unicode-normalized, fullwidth
# digits and Latin letters are converted to ASCII, everything is downcased,
# and punctuation is stripped (the character class exempts '-' and '’',
# although a '’' at the end of a line is removed as well).
#
# The argument itself is left untouched, so frozen strings are accepted.
#
# @param string [String, nil] raw keyword
# @return [String] normalized keyword; '' for blank input or input longer
#   than 30 characters
def normalize(string)
  return '' if string.blank?
  # The string length determines the depth of the trie and therefore the
  # speed of search traversal; change this limit with care.
  return '' if string.size > 30

  # Non-bang variants: do not mutate the caller's string.
  cleaned = string.scrub.unicode_normalize
  cleaned = cleaned.tr('0-9a-zA-Z', '0-9a-zA-Z').downcase
  cleaned.gsub(/[^-’[^\p{P}]]|’$|’”$/, '')
end
|
182
|
+
|
183
|
+
# Converts +string+ to romaji.
#
# ASCII spaces are removed first. Text containing Han characters goes
# through to_yomi before being handed to Romaji.kana2romaji.
#
# @param string [String] normalized keyword
# @return the value returned by Romaji.kana2romaji
def to_romaji(string)
  without_spaces = string.delete(' ')
  kana = without_spaces =~ /\p{Han}/ ? to_yomi(without_spaces) : without_spaces
  Romaji.kana2romaji(kana)
end
|
188
|
+
|
189
|
+
# Replaces Japanese tokens in +string+ with field 8 of their MeCab feature
# string and concatenates the result.
#
# Tokens whose char_type is kanji, hiragana, katakana or kanji-numeric
# contribute the ninth comma-separated feature field (presumably the
# pronunciation column of the installed dictionary - confirm); a token with
# fewer fields contributes nothing. All other tokens keep their surface text.
#
# @param string [String] text to analyze
# @return [String] concatenated per-token output
def to_yomi(string)
  japanese_types = [KANJI, HIRAGANA, KATAKANA, KANJINUMERIC]

  pieces = @nm.enum_parse(string).map do |node|
    next node.surface unless japanese_types.include?(node.char_type)

    node.feature.split(',')[8]
  end
  pieces.join
end
|
198
|
+
end
|
metadata
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: trie_suggest
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- cheung
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2018-05-01 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: a simple program for suggest or spellcheck keyword use trie tree
|
14
|
+
email: CheungYishin@gmail.com
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- lib/trie_suggest.rb
|
20
|
+
homepage: https://github.com/cheungYX/suggest
|
21
|
+
licenses:
|
22
|
+
- MIT
|
23
|
+
metadata: {}
|
24
|
+
post_install_message:
|
25
|
+
rdoc_options: []
|
26
|
+
require_paths:
|
27
|
+
- lib
|
28
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
29
|
+
requirements:
|
30
|
+
- - ">="
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
34
|
+
requirements:
|
35
|
+
- - ">="
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
requirements: []
|
39
|
+
rubyforge_project:
|
40
|
+
rubygems_version: 2.5.2
|
41
|
+
signing_key:
|
42
|
+
specification_version: 4
|
43
|
+
summary: trie tree for suggest
|
44
|
+
test_files: []
|