trie_suggest 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/trie_suggest.rb +198 -0
  3. metadata +44 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 50a705f68e74677a9c91f94f33696803114dcf26
4
+ data.tar.gz: fa9009a7e0b62d97e45f0f04d4c298c2f45f358e
5
+ SHA512:
6
+ metadata.gz: c0fb07e00a6a682dd7111dde817b0ec6b445ce442694c3c8ac14ce6c5802af2d34f9b0f7cd2a60864f90b62388050c8f042cc1efcaee22f4ac785c43f6dd9de4
7
+ data.tar.gz: f9de0480f7922c6b5c345b4cc49045807f6dbfb14ca7e40336d1d3a0643539a69a0a7807d10861537e4982b40d3ce2e2097f5dfc4fd5e7bf6d677171283a9881
@@ -0,0 +1,198 @@
1
+ # Trie tree class
2
+ class TrieTree
3
+ require 'natto'
4
+ require 'romaji'
5
+ require 'active_support'
6
+ require 'active_support/core_ext'
7
+ # char type https://github.com/buruzaemon/natto/wiki/Node-Parsing-char_type#appendix-d-node-parsing-and-char_type
8
+ DEFAULT = 0
9
+ SPACE = 1
10
+ KANJI = 2
11
+ SYMBOL = 3
12
+ NUMERIC = 4
13
+ ALPHA = 5
14
+ HIRAGANA = 6
15
+ KATAKANA = 7
16
+ KANJINUMERIC = 8 # kanji numerals
17
+ GREEK = 9 # Greek letters
18
+ CYRILLIC = 10 # Cyrillic letters
19
+
20
# Sets up an empty tree: a root node with an empty prefix, an empty
# romaji => [keyword, score] dictionary, and a MeCab parser handle.
def initialize
  # The natto gem must be installed: https://github.com/buruzaemon/natto/
  @nm = Natto::MeCab.new
  @romaji_dictionary = {}
  @root = TrieNode.new('')
end
26
+
27
+ # Trie tree node
28
# A single node of the trie.
#
# val     - the value handed to the constructor (the tree stores the prefix
#           spelled out by the path from the root to this node)
# score   - numeric weight, starts at 0
# next    - Hash of child nodes, starts empty (one Hash per node)
# is_word - word-boundary flag; starts as false so it never reads as an
#           uninitialised instance variable
class TrieNode
  attr_accessor :val, :score, :next, :is_word

  def initialize(val)
    @val = val
    @score = 0
    @next = {}
    @is_word = false
  end
end
36
+
37
# Indexes +keyword+ in the trie under its romaji spelling.
#
# keyword - the surface form to index; blank keywords are ignored
# score   - numeric weight; words scoring below 10 are ignored
#
# The romaji dictionary keeps, per romaji key, the surface form with the
# highest score seen so far. Every node on the path (romaji followed by a
# '+' terminator) has +score+ added to its own score, and the terminator
# node is flagged through +is_word+.
#
# Returns true when the word was indexed, nil when it was skipped.
def add_word(keyword, score)
  return if keyword.blank?
  return if score < 10

  keyword = normalize(keyword)
  return if keyword.to_s.empty?

  romaji = to_romaji(keyword)
  # An empty key would hang a bare '+' terminator directly off the root.
  return if romaji.to_s.empty?

  best = @romaji_dictionary[romaji]
  @romaji_dictionary[romaji] = [keyword, score] if best.nil? || best[1] < score

  # '+' terminates the word, so "ab" and "abc" end on different nodes.
  path = "#{romaji}+"
  node = @root
  path.each_char.with_index(1) do |char, length|
    node = (node.next[char.ord] ||= TrieNode.new(path[0, length]))
    node.score += score
  end
  node.is_word = true
  true
end
57
+
58
# Suggests indexed words whose romaji starts with the romaji of +keyword+.
#
# keyword - prefix typed by the user; blank input or a lone '+' yields []
# limit   - maximum number of suggestions, also used as the width of the
#           search frontier (default 10, the previously hard-coded value)
#
# Returns an Array of [surface_form, node_score] pairs in the order they are
# found by a best-first walk (highest node score first).
def suggest(keyword, limit = 10)
  return [] if keyword.blank? || keyword == '+'
  return [] if limit < 1

  romaji = to_romaji(normalize(keyword))
  # normalize returns '' for over-long input; without this guard the walk
  # below would stay on the root and suggest from the whole tree.
  return [] if romaji.to_s.empty?

  # Walk down to the node for the prefix: O(romaji.length).
  node = @root
  romaji.each_char do |char|
    node = node.next[char.ord]
    return [] if node.nil?
  end

  by_score_desc = ->(nodes) { nodes.sort_by { |n| -n.score } }
  results = []
  frontier = by_score_desc.call(node.next.values)
  until frontier.empty?
    candidate = frontier.shift
    # A prefix ending in '+' is a word terminator when the dictionary knows
    # the word; it may also just be a literal '+' inside a longer word, so
    # its children are still explored below.
    if candidate.val.end_with?('+')
      entry = @romaji_dictionary[candidate.val.chop]
      results << [entry[0], candidate.score] if entry
      return results if results.size >= limit
    end
    next if candidate.next.empty?

    # Expand every child (not only the best one) so sibling completions are
    # not lost, then keep the frontier bounded to the best +limit+ nodes.
    frontier = by_score_desc.call(frontier.concat(candidate.next.values)).first(limit)
  end
  results
end
89
+
90
# Looks up the indexed word whose romaji equals the romaji of +keyword+.
#
# keyword - text to look up; blank input or a lone '+' yields ''
#
# Returns the stored surface form, or '' when there is no exact match.
def match(keyword)
  return '' if keyword.blank? || keyword == '+'

  romaji = to_romaji(normalize(keyword))
  node = @root
  romaji.each_char do |char|
    node = node.next[char.ord]
    return '' if node.nil?
  end
  # A word ends here only if the '+' terminator hangs off this node.
  return '' if node.next['+'.ord].nil?

  # The '+' child can also belong to a longer word containing a literal '+'
  # (e.g. "c+x"), in which case the dictionary has no entry for this prefix.
  entry = @romaji_dictionary[node.val]
  entry ? entry[0] : ''
end
104
+
105
# Finds indexed words within +max_cost+ edits of +keyword+.
#
# keyword  - possibly misspelled text; blank input or a lone '+' yields []
# max_cost - maximum Levenshtein distance to accept (default 3)
#
# The keyword is normalised and converted to romaji first, because that is
# the form the trie is keyed by (suggest and match do the same).
#
# Returns an Array holding, once per child of the root, the same result list
# of [surface_form, node_score, distance] triples.
# NOTE(review): the per-root-child nesting is kept for backward
# compatibility; callers presumably want the inner list only - confirm
# before flattening.
def spellcheck(keyword, max_cost = 3)
  return [] if keyword.blank? || keyword == '+'

  romaji = to_romaji(normalize(keyword))
  return [] if romaji.to_s.empty?

  # The '+' terminator lets the search line the keyword up with whole words.
  target = "#{romaji}+"
  # Row 0 of the edit-distance table: cost of building each prefix of the
  # target from the empty string.
  first_row = (0..target.size).to_a
  res = []
  # Search every branch below the root; each call appends to +res+.
  @root.next.map do |_idx, child|
    search_recursive(child, child.val[-1], target, first_row, res, max_cost)
  end
end
118
+
119
# Computes the Levenshtein (edit) distance between two sequences.
#
# word1, word2 - objects responding to +size+ and integer +[]+ (Strings)
#
# Uses the two-row dynamic-programming formulation: only the previous row of
# the table is kept while the next one is filled in.
#
# Returns the minimum number of single-element insertions, deletions and
# substitutions needed to turn one word into the other.
def levenshtein(word1, word2)
  # Row 0: distance from the empty prefix of word2 to each prefix of word1.
  previous_row = (0..word1.size).to_a

  (1..word2.size).each do |row|
    current_row = [row]
    (1..word1.size).each do |column|
      substitution = word1[column - 1] == word2[row - 1] ? 0 : 1
      current_row << [
        current_row[column - 1] + 1,             # insertion
        previous_row[column] + 1,                # deletion
        previous_row[column - 1] + substitution  # substitution or match
      ].min
    end
    previous_row = current_row
  end
  previous_row.last
end
144
+
145
+ private
146
+
147
# Depth-first step of the trie spell-check search.
#
# node         - trie node being visited
# letter       - the character this node adds to its parent's prefix
# keyword      - target string being matched (ends with the '+' terminator)
# previous_row - edit-distance row computed for the parent node
# res          - Array that matches are appended to (mutated in place)
# max_cost     - maximum edit distance to accept
#
# Builds this node's edit-distance row from the parent's row, records the
# word when this node is a '+' terminator within +max_cost+ (exact matches,
# distance 0, are skipped), and descends while any cell of the row is still
# within +max_cost+.
#
# Returns +res+.
def search_recursive(node, letter, keyword, previous_row, res, max_cost)
  current_row = [previous_row[0] + 1]
  (1..keyword.size).each do |column|
    substitution = keyword[column - 1] == letter ? 0 : 1
    current_row << [
      current_row[column - 1] + 1,             # insertion
      previous_row[column] + 1,                # deletion
      previous_row[column - 1] + substitution  # substitution or match
    ].min
  end

  distance = current_row.last
  if letter == '+' && distance != 0 && distance <= max_cost
    entry = @romaji_dictionary[node.val.chop]
    res << [entry[0], node.score, distance] if entry
  end

  # Prune: once every cell exceeds max_cost no descendant can match.
  if current_row.min <= max_cost
    node.next.each_value do |child|
      search_recursive(child, child.val[-1], keyword, current_row, res, max_cost)
    end
  end
  res
end
172
+
173
# Normalises a keyword before it is indexed or looked up.
#
# string - raw keyword; it is not modified (frozen strings are accepted)
#
# Steps: replace invalid byte sequences, apply Unicode normalisation, map
# full-width digits and Latin letters to their ASCII forms, downcase, then
# delete the characters matched by the pattern below.
#
# Returns the normalised String, or '' for blank or over-long input.
def normalize(string)
  return '' if string.blank?
  # The string length sets the depth of the trie and therefore also affects
  # traversal speed; change this limit with care.
  return '' if string.size > 30

  # Non-bang variants: the bang forms rewrote the caller's string in place
  # and raised FrozenError on frozen input.
  cleaned = string.scrub.unicode_normalize
  cleaned = cleaned.tr('0-9a-zA-Z', '0-9a-zA-Z').downcase
  cleaned.gsub(/[^-’[^\p{P}]]|’$|’”$/, '')
end
182
+
183
# Converts a normalised keyword to the romaji key used by the trie.
#
# string - normalised keyword
#
# Spaces are removed, text containing Han characters is first replaced by
# its reading via to_yomi, and the result is passed to Romaji.kana2romaji.
#
# Returns whatever Romaji.kana2romaji returns for the prepared string.
def to_romaji(string)
  # NOTE(review): the delete set holds two space characters; presumably an
  # ASCII space and a full-width space - confirm against the upstream file.
  string = string.delete('  ')
  # match? is enough here: only a yes/no answer is needed, not match data.
  string = to_yomi(string) if string.match?(/\p{Han}/)
  Romaji.kana2romaji(string)
end
188
+
189
# Replaces Japanese tokens of +string+ with the reading from the parser.
#
# string - text to convert
#
# Each node yielded by the parser whose char_type is kanji, hiragana,
# katakana or kanji-numeric is replaced by field 8 of its comma-separated
# feature string; every other node keeps its surface form.
# NOTE(review): field 8 is presumably the pronunciation column of the
# dictionary in use - confirm against the installed MeCab dictionary.
#
# Returns the concatenated String.
def to_yomi(string)
  japanese_types = [KANJI, HIRAGANA, KATAKANA, KANJINUMERIC]
  @nm.enum_parse(string).map do |node|
    next node.surface unless japanese_types.include?(node.char_type)

    reading = node.feature.split(',')[8]
    # When the field is absent (nil), empty or '*', keep the surface form
    # instead of silently dropping the token from the result.
    reading.nil? || reading.empty? || reading == '*' ? node.surface : reading
  end.join
end
198
+ end
metadata ADDED
@@ -0,0 +1,44 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: trie_suggest
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - cheung
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2018-05-01 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: a simple program for suggest or spellcheck keyword use trie tree
14
+ email: CheungYishin@gmail.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/trie_suggest.rb
20
+ homepage: https://github.com/cheungYX/suggest
21
+ licenses:
22
+ - MIT
23
+ metadata: {}
24
+ post_install_message:
25
+ rdoc_options: []
26
+ require_paths:
27
+ - lib
28
+ required_ruby_version: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ required_rubygems_version: !ruby/object:Gem::Requirement
34
+ requirements:
35
+ - - ">="
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ requirements: []
39
+ rubyforge_project:
40
+ rubygems_version: 2.5.2
41
+ signing_key:
42
+ specification_version: 4
43
+ summary: trie tree for suggest
44
+ test_files: []