suika 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/Gemfile.lock +36 -0
- data/README.md +7 -7
- data/lib/suika/char_def.rb +13 -11
- data/lib/suika/lattice.rb +5 -5
- data/lib/suika/tagger.rb +22 -26
- data/lib/suika/version.rb +1 -1
- data/suika.gemspec +1 -1
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 108d59930a3a546de3a603d4c7de5851249264a4eddd76c8a350407b2d0b353a
|
4
|
+
data.tar.gz: d879db93ee8593466c415ba9064b50221ee2355fa0caedcd301fe07de9ab4880
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: aceff27ac13d1c7b03ae286e2a24c02f5fe75d987e5ea91879777e66adf8155b8d92d185399e2fe31bf6dd7e6448fb175d3ed0b864c55e2df2ff30b6dcaad5a3
|
7
|
+
data.tar.gz: a92d9161f73a761c9621fa720725dbebdfa74da1d8b892ab3ecab8aa9ee835d98bf42341818219b0fdf1dd3a164a1206f306f0e8dcad9030b7137776ed5142e0
|
data/CHANGELOG.md
CHANGED
data/Gemfile.lock
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
suika (0.1.2)
|
5
|
+
rambling-trie (~> 2.1)
|
6
|
+
|
7
|
+
GEM
|
8
|
+
remote: https://rubygems.org/
|
9
|
+
specs:
|
10
|
+
diff-lcs (1.4.4)
|
11
|
+
rake (12.3.3)
|
12
|
+
rambling-trie (2.1.1)
|
13
|
+
rspec (3.9.0)
|
14
|
+
rspec-core (~> 3.9.0)
|
15
|
+
rspec-expectations (~> 3.9.0)
|
16
|
+
rspec-mocks (~> 3.9.0)
|
17
|
+
rspec-core (3.9.2)
|
18
|
+
rspec-support (~> 3.9.3)
|
19
|
+
rspec-expectations (3.9.2)
|
20
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
21
|
+
rspec-support (~> 3.9.0)
|
22
|
+
rspec-mocks (3.9.1)
|
23
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
24
|
+
rspec-support (~> 3.9.0)
|
25
|
+
rspec-support (3.9.3)
|
26
|
+
|
27
|
+
PLATFORMS
|
28
|
+
ruby
|
29
|
+
|
30
|
+
DEPENDENCIES
|
31
|
+
rake (~> 12.0)
|
32
|
+
rspec (~> 3.0)
|
33
|
+
suika!
|
34
|
+
|
35
|
+
BUNDLED WITH
|
36
|
+
2.1.2
|
data/README.md
CHANGED
@@ -30,13 +30,13 @@ require 'suika'
|
|
30
30
|
tagger = Suika::Tagger.new
|
31
31
|
tagger.parse('すもももももももものうち').each { |token| puts token }
|
32
32
|
|
33
|
-
# すもも
|
34
|
-
# も
|
35
|
-
# もも
|
36
|
-
# も
|
37
|
-
# もも
|
38
|
-
# の
|
39
|
-
# うち
|
33
|
+
# すもも 名詞,一般,*,*,*,*,すもも,スモモ,スモモ
|
34
|
+
# も 助詞,係助詞,*,*,*,*,も,モ,モ
|
35
|
+
# もも 名詞,一般,*,*,*,*,もも,モモ,モモ
|
36
|
+
# も 助詞,係助詞,*,*,*,*,も,モ,モ
|
37
|
+
# もも 名詞,一般,*,*,*,*,もも,モモ,モモ
|
38
|
+
# の 助詞,連体化,*,*,*,*,の,ノ,ノ
|
39
|
+
# うち 名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ
|
40
40
|
```
|
41
41
|
|
42
42
|
Since the Tagger class loads the binary dictionary at initialization, it is recommended to reuse the instance.
|
data/lib/suika/char_def.rb
CHANGED
@@ -16,39 +16,41 @@ module Suika
|
|
16
16
|
CHAR_CATEGORY[char_type(ch)]
|
17
17
|
end
|
18
18
|
|
19
|
+
MAX_GROUPING_SIZE = 24
|
20
|
+
|
19
21
|
CHAR_CATEGORY = {
|
20
22
|
'DEFAULT' => {
|
21
|
-
invoke:
|
23
|
+
invoke: false, group: true, length: 0
|
22
24
|
},
|
23
25
|
'SPACE' => {
|
24
|
-
invoke:
|
26
|
+
invoke: false, group: true, length: 0
|
25
27
|
},
|
26
28
|
'KANJI' => {
|
27
|
-
invoke:
|
29
|
+
invoke: false, group: false, length: 2
|
28
30
|
},
|
29
31
|
'SYMBOL' => {
|
30
|
-
invoke:
|
32
|
+
invoke: true, group: true, length: 0
|
31
33
|
},
|
32
34
|
'NUMERIC' => {
|
33
|
-
invoke:
|
35
|
+
invoke: true, group: true, length: 0
|
34
36
|
},
|
35
37
|
'ALPHA' => {
|
36
|
-
invoke:
|
38
|
+
invoke: true, group: true, length: 0
|
37
39
|
},
|
38
40
|
'HIRAGANA' => {
|
39
|
-
invoke:
|
41
|
+
invoke: false, group: true, length: 2
|
40
42
|
},
|
41
43
|
'KATAKANA' => {
|
42
|
-
invoke:
|
44
|
+
invoke: true, group: true, length: 2
|
43
45
|
},
|
44
46
|
'KANJINUMERIC' => {
|
45
|
-
invoke:
|
47
|
+
invoke: true, group: true, length: 0
|
46
48
|
},
|
47
49
|
'GREEK' => {
|
48
|
-
invoke:
|
50
|
+
invoke: true, group: true, length: 0
|
49
51
|
},
|
50
52
|
'CYRILLIC' => {
|
51
|
-
invoke:
|
53
|
+
invoke: true, group: true, length: 0
|
52
54
|
}
|
53
55
|
}.freeze
|
54
56
|
|
data/lib/suika/lattice.rb
CHANGED
@@ -4,7 +4,7 @@ module Suika
|
|
4
4
|
# @!visibility private
|
5
5
|
class Lattice
|
6
6
|
# @!visibility private
|
7
|
-
Node = Struct.new(:surface, :min_cost, :min_prev, :left_id, :right_id, :cost, :attrs, keyword_init: true)
|
7
|
+
Node = Struct.new(:surface, :unknown, :min_cost, :min_prev, :left_id, :right_id, :cost, :attrs, keyword_init: true)
|
8
8
|
|
9
9
|
attr_reader :begin_nodes, :end_nodes, :length
|
10
10
|
|
@@ -13,15 +13,15 @@ module Suika
|
|
13
13
|
@length = length
|
14
14
|
@begin_nodes = Array.new(length + 1) { [] }
|
15
15
|
@end_nodes = Array.new(length + 1) { [] }
|
16
|
-
bos = Node.new(surface: 'BOS', left_id: 0, right_id: 0, cost: 0, attrs: [])
|
16
|
+
bos = Node.new(surface: 'BOS', unknown: false, left_id: 0, right_id: 0, cost: 0, attrs: [])
|
17
17
|
@end_nodes[0].append(bos)
|
18
|
-
eos = Node.new(surface: 'EOS', left_id: 0, right_id: 0, cost: 0, attrs: [])
|
18
|
+
eos = Node.new(surface: 'EOS', unknown: false, left_id: 0, right_id: 0, cost: 0, attrs: [])
|
19
19
|
@begin_nodes[length].append(eos)
|
20
20
|
end
|
21
21
|
|
22
22
|
# @!visibility private
|
23
|
-
def insert(begin_id, end_id, surface, left_id, right_id, cost, attrs)
|
24
|
-
node = Node.new(surface: surface, left_id: left_id, right_id: right_id, cost: cost, attrs: attrs)
|
23
|
+
def insert(begin_id, end_id, surface, unknown, left_id, right_id, cost, attrs)
|
24
|
+
node = Node.new(surface: surface, unknown: unknown, left_id: left_id, right_id: right_id, cost: cost, attrs: attrs)
|
25
25
|
@begin_nodes[begin_id].append(node)
|
26
26
|
@end_nodes[end_id].append(node)
|
27
27
|
end
|
data/lib/suika/tagger.rb
CHANGED
@@ -12,13 +12,13 @@ module Suika
|
|
12
12
|
# tagger = Suika::Tagger.new
|
13
13
|
# tagger.parse('すもももももももものうち').each { |token| puts token }
|
14
14
|
#
|
15
|
-
# # すもも
|
16
|
-
# # も
|
17
|
-
# # もも
|
18
|
-
# # も
|
19
|
-
# # もも
|
20
|
-
# # の
|
21
|
-
# # うち
|
15
|
+
# # すもも 名詞,一般,*,*,*,*,すもも,スモモ,スモモ
|
16
|
+
# # も 助詞,係助詞,*,*,*,*,も,モ,モ
|
17
|
+
# # もも 名詞,一般,*,*,*,*,もも,モモ,モモ
|
18
|
+
# # も 助詞,係助詞,*,*,*,*,も,モ,モ
|
19
|
+
# # もも 名詞,一般,*,*,*,*,もも,モモ,モモ
|
20
|
+
# # の 助詞,連体化,*,*,*,*,の,ノ,ノ
|
21
|
+
# # うち 名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ
|
22
22
|
#
|
23
23
|
class Tagger
|
24
24
|
# Create a new tagger by loading the built-in binary dictionary.
|
@@ -41,41 +41,36 @@ module Suika
|
|
41
41
|
while start < terminal
|
42
42
|
word = sentence[start]
|
43
43
|
pos = start
|
44
|
-
|
44
|
+
matched = false
|
45
45
|
while @trie.match?(word) && pos < terminal
|
46
46
|
if @dictionary.key?(word)
|
47
|
+
matched = true
|
47
48
|
@dictionary[word].each do |el|
|
48
|
-
lattice.insert(start, start + word.length,
|
49
|
-
|
50
|
-
el[3..-1])
|
49
|
+
lattice.insert(start, start + word.length, word, false,
|
50
|
+
el[0].to_i, el[1].to_i, el[2].to_i, el[3..-1])
|
51
51
|
end
|
52
|
-
is_unknown = false
|
53
52
|
end
|
54
53
|
pos += 1
|
55
54
|
word = sentence[start..pos]
|
56
55
|
end
|
57
56
|
|
58
|
-
unless is_unknown
|
59
|
-
start += 1
|
60
|
-
next
|
61
|
-
end
|
62
|
-
|
63
57
|
word = sentence[start]
|
64
|
-
char_type = CharDef.char_type(sentence[start])
|
65
58
|
char_cate = CharDef.char_category(sentence[start])
|
66
|
-
|
67
|
-
|
59
|
+
unless !char_cate[:invoke] && matched
|
60
|
+
char_length = char_cate[:group] ? CharDef::MAX_GROUPING_SIZE : char_cate[:length]
|
61
|
+
unk_terminal = [start + char_length, terminal].min
|
68
62
|
pos = start + 1
|
63
|
+
char_type = CharDef.char_type(sentence[start])
|
69
64
|
while pos < unk_terminal && char_type == CharDef.char_type(sentence[pos])
|
70
65
|
word << sentence[pos]
|
71
66
|
pos += 1
|
72
67
|
end
|
68
|
+
@unknown_dictionary[char_type].each do |el|
|
69
|
+
lattice.insert(start, start + word.length, word, true,
|
70
|
+
el[0].to_i, el[1].to_i, el[2].to_i, el[3..-1])
|
71
|
+
end
|
73
72
|
end
|
74
|
-
|
75
|
-
lattice.insert(start, start + word.length,
|
76
|
-
word, el[0].to_i, el[1].to_i, el[2].to_i,
|
77
|
-
el[3..-1])
|
78
|
-
end
|
73
|
+
|
79
74
|
start += 1
|
80
75
|
end
|
81
76
|
|
@@ -111,9 +106,10 @@ module Suika
|
|
111
106
|
prev_node = eos.min_prev
|
112
107
|
res = []
|
113
108
|
until prev_node.nil?
|
114
|
-
res.append("#{prev_node.surface}\t#{prev_node.attrs.join(',
|
109
|
+
res.append("#{prev_node.surface}\t#{prev_node.attrs.join(',')}") if prev_node.surface != 'BOS' && prev_node.surface != 'EOS'
|
115
110
|
prev_node = prev_node.min_prev
|
116
111
|
end
|
112
|
+
|
117
113
|
res.reverse
|
118
114
|
end
|
119
115
|
end
|
data/lib/suika/version.rb
CHANGED
data/suika.gemspec
CHANGED
@@ -16,7 +16,7 @@ Gem::Specification.new do |spec|
|
|
16
16
|
|
17
17
|
spec.metadata['homepage_uri'] = spec.homepage
|
18
18
|
spec.metadata['source_code_uri'] = spec.homepage
|
19
|
-
spec.metadata['changelog_uri'] = 'https://github.com/yoshoku/
|
19
|
+
spec.metadata['changelog_uri'] = 'https://github.com/yoshoku/suika/blob/master/CHANGELOG.md'
|
20
20
|
spec.metadata['documentation_uri'] = 'https://rubydoc.info/gems/suika'
|
21
21
|
|
22
22
|
# Specify which files should be added to the gem when it is released.
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: suika
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yoshoku
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-07-
|
11
|
+
date: 2020-07-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rambling-trie
|
@@ -38,6 +38,7 @@ files:
|
|
38
38
|
- CHANGELOG.md
|
39
39
|
- CODE_OF_CONDUCT.md
|
40
40
|
- Gemfile
|
41
|
+
- Gemfile.lock
|
41
42
|
- LICENSE.txt
|
42
43
|
- NOTICE.txt
|
43
44
|
- README.md
|
@@ -57,7 +58,7 @@ licenses:
|
|
57
58
|
metadata:
|
58
59
|
homepage_uri: https://github.com/yoshoku/suika
|
59
60
|
source_code_uri: https://github.com/yoshoku/suika
|
60
|
-
changelog_uri: https://github.com/yoshoku/
|
61
|
+
changelog_uri: https://github.com/yoshoku/suika/blob/master/CHANGELOG.md
|
61
62
|
documentation_uri: https://rubydoc.info/gems/suika
|
62
63
|
post_install_message:
|
63
64
|
rdoc_options: []
|