suika 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/Gemfile.lock +36 -0
- data/README.md +7 -7
- data/lib/suika/char_def.rb +13 -11
- data/lib/suika/lattice.rb +5 -5
- data/lib/suika/tagger.rb +22 -26
- data/lib/suika/version.rb +1 -1
- data/suika.gemspec +1 -1
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 108d59930a3a546de3a603d4c7de5851249264a4eddd76c8a350407b2d0b353a
|
4
|
+
data.tar.gz: d879db93ee8593466c415ba9064b50221ee2355fa0caedcd301fe07de9ab4880
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: aceff27ac13d1c7b03ae286e2a24c02f5fe75d987e5ea91879777e66adf8155b8d92d185399e2fe31bf6dd7e6448fb175d3ed0b864c55e2df2ff30b6dcaad5a3
|
7
|
+
data.tar.gz: a92d9161f73a761c9621fa720725dbebdfa74da1d8b892ab3ecab8aa9ee835d98bf42341818219b0fdf1dd3a164a1206f306f0e8dcad9030b7137776ed5142e0
|
data/CHANGELOG.md
CHANGED
data/Gemfile.lock
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
suika (0.1.2)
|
5
|
+
rambling-trie (~> 2.1)
|
6
|
+
|
7
|
+
GEM
|
8
|
+
remote: https://rubygems.org/
|
9
|
+
specs:
|
10
|
+
diff-lcs (1.4.4)
|
11
|
+
rake (12.3.3)
|
12
|
+
rambling-trie (2.1.1)
|
13
|
+
rspec (3.9.0)
|
14
|
+
rspec-core (~> 3.9.0)
|
15
|
+
rspec-expectations (~> 3.9.0)
|
16
|
+
rspec-mocks (~> 3.9.0)
|
17
|
+
rspec-core (3.9.2)
|
18
|
+
rspec-support (~> 3.9.3)
|
19
|
+
rspec-expectations (3.9.2)
|
20
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
21
|
+
rspec-support (~> 3.9.0)
|
22
|
+
rspec-mocks (3.9.1)
|
23
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
24
|
+
rspec-support (~> 3.9.0)
|
25
|
+
rspec-support (3.9.3)
|
26
|
+
|
27
|
+
PLATFORMS
|
28
|
+
ruby
|
29
|
+
|
30
|
+
DEPENDENCIES
|
31
|
+
rake (~> 12.0)
|
32
|
+
rspec (~> 3.0)
|
33
|
+
suika!
|
34
|
+
|
35
|
+
BUNDLED WITH
|
36
|
+
2.1.2
|
data/README.md
CHANGED
@@ -30,13 +30,13 @@ require 'suika'
|
|
30
30
|
tagger = Suika::Tagger.new
|
31
31
|
tagger.parse('すもももももももものうち').each { |token| puts token }
|
32
32
|
|
33
|
-
# すもも
|
34
|
-
# も
|
35
|
-
# もも
|
36
|
-
# も
|
37
|
-
# もも
|
38
|
-
# の
|
39
|
-
# うち
|
33
|
+
# すもも 名詞,一般,*,*,*,*,すもも,スモモ,スモモ
|
34
|
+
# も 助詞,係助詞,*,*,*,*,も,モ,モ
|
35
|
+
# もも 名詞,一般,*,*,*,*,もも,モモ,モモ
|
36
|
+
# も 助詞,係助詞,*,*,*,*,も,モ,モ
|
37
|
+
# もも 名詞,一般,*,*,*,*,もも,モモ,モモ
|
38
|
+
# の 助詞,連体化,*,*,*,*,の,ノ,ノ
|
39
|
+
# うち 名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ
|
40
40
|
```
|
41
41
|
|
42
42
|
Since the Tagger class loads the binary dictionary at initialization, it is recommended to reuse the instance.
|
data/lib/suika/char_def.rb
CHANGED
@@ -16,39 +16,41 @@ module Suika
|
|
16
16
|
CHAR_CATEGORY[char_type(ch)]
|
17
17
|
end
|
18
18
|
|
19
|
+
MAX_GROUPING_SIZE = 24
|
20
|
+
|
19
21
|
CHAR_CATEGORY = {
|
20
22
|
'DEFAULT' => {
|
21
|
-
invoke:
|
23
|
+
invoke: false, group: true, length: 0
|
22
24
|
},
|
23
25
|
'SPACE' => {
|
24
|
-
invoke:
|
26
|
+
invoke: false, group: true, length: 0
|
25
27
|
},
|
26
28
|
'KANJI' => {
|
27
|
-
invoke:
|
29
|
+
invoke: false, group: false, length: 2
|
28
30
|
},
|
29
31
|
'SYMBOL' => {
|
30
|
-
invoke:
|
32
|
+
invoke: true, group: true, length: 0
|
31
33
|
},
|
32
34
|
'NUMERIC' => {
|
33
|
-
invoke:
|
35
|
+
invoke: true, group: true, length: 0
|
34
36
|
},
|
35
37
|
'ALPHA' => {
|
36
|
-
invoke:
|
38
|
+
invoke: true, group: true, length: 0
|
37
39
|
},
|
38
40
|
'HIRAGANA' => {
|
39
|
-
invoke:
|
41
|
+
invoke: false, group: true, length: 2
|
40
42
|
},
|
41
43
|
'KATAKANA' => {
|
42
|
-
invoke:
|
44
|
+
invoke: true, group: true, length: 2
|
43
45
|
},
|
44
46
|
'KANJINUMERIC' => {
|
45
|
-
invoke:
|
47
|
+
invoke: true, group: true, length: 0
|
46
48
|
},
|
47
49
|
'GREEK' => {
|
48
|
-
invoke:
|
50
|
+
invoke: true, group: true, length: 0
|
49
51
|
},
|
50
52
|
'CYRILLIC' => {
|
51
|
-
invoke:
|
53
|
+
invoke: true, group: true, length: 0
|
52
54
|
}
|
53
55
|
}.freeze
|
54
56
|
|
data/lib/suika/lattice.rb
CHANGED
@@ -4,7 +4,7 @@ module Suika
|
|
4
4
|
# @!visibility private
|
5
5
|
class Lattice
|
6
6
|
# @!visibility private
|
7
|
-
Node = Struct.new(:surface, :min_cost, :min_prev, :left_id, :right_id, :cost, :attrs, keyword_init: true)
|
7
|
+
Node = Struct.new(:surface, :unknown, :min_cost, :min_prev, :left_id, :right_id, :cost, :attrs, keyword_init: true)
|
8
8
|
|
9
9
|
attr_reader :begin_nodes, :end_nodes, :length
|
10
10
|
|
@@ -13,15 +13,15 @@ module Suika
|
|
13
13
|
@length = length
|
14
14
|
@begin_nodes = Array.new(length + 1) { [] }
|
15
15
|
@end_nodes = Array.new(length + 1) { [] }
|
16
|
-
bos = Node.new(surface: 'BOS', left_id: 0, right_id: 0, cost: 0, attrs: [])
|
16
|
+
bos = Node.new(surface: 'BOS', unknown: false, left_id: 0, right_id: 0, cost: 0, attrs: [])
|
17
17
|
@end_nodes[0].append(bos)
|
18
|
-
eos = Node.new(surface: 'EOS', left_id: 0, right_id: 0, cost: 0, attrs: [])
|
18
|
+
eos = Node.new(surface: 'EOS', unknown: false, left_id: 0, right_id: 0, cost: 0, attrs: [])
|
19
19
|
@begin_nodes[length].append(eos)
|
20
20
|
end
|
21
21
|
|
22
22
|
# @!visibility private
|
23
|
-
def insert(begin_id, end_id, surface, left_id, right_id, cost, attrs)
|
24
|
-
node = Node.new(surface: surface, left_id: left_id, right_id: right_id, cost: cost, attrs: attrs)
|
23
|
+
def insert(begin_id, end_id, surface, unknown, left_id, right_id, cost, attrs)
|
24
|
+
node = Node.new(surface: surface, unknown: unknown, left_id: left_id, right_id: right_id, cost: cost, attrs: attrs)
|
25
25
|
@begin_nodes[begin_id].append(node)
|
26
26
|
@end_nodes[end_id].append(node)
|
27
27
|
end
|
data/lib/suika/tagger.rb
CHANGED
@@ -12,13 +12,13 @@ module Suika
|
|
12
12
|
# tagger = Suika::Tagger.new
|
13
13
|
# tagger.parse('すもももももももものうち').each { |token| puts token }
|
14
14
|
#
|
15
|
-
# # すもも
|
16
|
-
# # も
|
17
|
-
# # もも
|
18
|
-
# # も
|
19
|
-
# # もも
|
20
|
-
# # の
|
21
|
-
# # うち
|
15
|
+
# # すもも 名詞,一般,*,*,*,*,すもも,スモモ,スモモ
|
16
|
+
# # も 助詞,係助詞,*,*,*,*,も,モ,モ
|
17
|
+
# # もも 名詞,一般,*,*,*,*,もも,モモ,モモ
|
18
|
+
# # も 助詞,係助詞,*,*,*,*,も,モ,モ
|
19
|
+
# # もも 名詞,一般,*,*,*,*,もも,モモ,モモ
|
20
|
+
# # の 助詞,連体化,*,*,*,*,の,ノ,ノ
|
21
|
+
# # うち 名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ
|
22
22
|
#
|
23
23
|
class Tagger
|
24
24
|
# Create a new tagger by loading the built-in binary dictionary.
|
@@ -41,41 +41,36 @@ module Suika
|
|
41
41
|
while start < terminal
|
42
42
|
word = sentence[start]
|
43
43
|
pos = start
|
44
|
-
|
44
|
+
matched = false
|
45
45
|
while @trie.match?(word) && pos < terminal
|
46
46
|
if @dictionary.key?(word)
|
47
|
+
matched = true
|
47
48
|
@dictionary[word].each do |el|
|
48
|
-
lattice.insert(start, start + word.length,
|
49
|
-
|
50
|
-
el[3..-1])
|
49
|
+
lattice.insert(start, start + word.length, word, false,
|
50
|
+
el[0].to_i, el[1].to_i, el[2].to_i, el[3..-1])
|
51
51
|
end
|
52
|
-
is_unknown = false
|
53
52
|
end
|
54
53
|
pos += 1
|
55
54
|
word = sentence[start..pos]
|
56
55
|
end
|
57
56
|
|
58
|
-
unless is_unknown
|
59
|
-
start += 1
|
60
|
-
next
|
61
|
-
end
|
62
|
-
|
63
57
|
word = sentence[start]
|
64
|
-
char_type = CharDef.char_type(sentence[start])
|
65
58
|
char_cate = CharDef.char_category(sentence[start])
|
66
|
-
|
67
|
-
|
59
|
+
unless !char_cate[:invoke] && matched
|
60
|
+
char_length = char_cate[:group] ? CharDef::MAX_GROUPING_SIZE : char_cate[:length]
|
61
|
+
unk_terminal = [start + char_length, terminal].min
|
68
62
|
pos = start + 1
|
63
|
+
char_type = CharDef.char_type(sentence[start])
|
69
64
|
while pos < unk_terminal && char_type == CharDef.char_type(sentence[pos])
|
70
65
|
word << sentence[pos]
|
71
66
|
pos += 1
|
72
67
|
end
|
68
|
+
@unknown_dictionary[char_type].each do |el|
|
69
|
+
lattice.insert(start, start + word.length, word, true,
|
70
|
+
el[0].to_i, el[1].to_i, el[2].to_i, el[3..-1])
|
71
|
+
end
|
73
72
|
end
|
74
|
-
|
75
|
-
lattice.insert(start, start + word.length,
|
76
|
-
word, el[0].to_i, el[1].to_i, el[2].to_i,
|
77
|
-
el[3..-1])
|
78
|
-
end
|
73
|
+
|
79
74
|
start += 1
|
80
75
|
end
|
81
76
|
|
@@ -111,9 +106,10 @@ module Suika
|
|
111
106
|
prev_node = eos.min_prev
|
112
107
|
res = []
|
113
108
|
until prev_node.nil?
|
114
|
-
res.append("#{prev_node.surface}\t#{prev_node.attrs.join(',
|
109
|
+
res.append("#{prev_node.surface}\t#{prev_node.attrs.join(',')}") if prev_node.surface != 'BOS' && prev_node.surface != 'EOS'
|
115
110
|
prev_node = prev_node.min_prev
|
116
111
|
end
|
112
|
+
|
117
113
|
res.reverse
|
118
114
|
end
|
119
115
|
end
|
data/lib/suika/version.rb
CHANGED
data/suika.gemspec
CHANGED
@@ -16,7 +16,7 @@ Gem::Specification.new do |spec|
|
|
16
16
|
|
17
17
|
spec.metadata['homepage_uri'] = spec.homepage
|
18
18
|
spec.metadata['source_code_uri'] = spec.homepage
|
19
|
-
spec.metadata['changelog_uri'] = 'https://github.com/yoshoku/
|
19
|
+
spec.metadata['changelog_uri'] = 'https://github.com/yoshoku/suika/blob/master/CHANGELOG.md'
|
20
20
|
spec.metadata['documentation_uri'] = 'https://rubydoc.info/gems/suika'
|
21
21
|
|
22
22
|
# Specify which files should be added to the gem when it is released.
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: suika
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yoshoku
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-07-
|
11
|
+
date: 2020-07-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rambling-trie
|
@@ -38,6 +38,7 @@ files:
|
|
38
38
|
- CHANGELOG.md
|
39
39
|
- CODE_OF_CONDUCT.md
|
40
40
|
- Gemfile
|
41
|
+
- Gemfile.lock
|
41
42
|
- LICENSE.txt
|
42
43
|
- NOTICE.txt
|
43
44
|
- README.md
|
@@ -57,7 +58,7 @@ licenses:
|
|
57
58
|
metadata:
|
58
59
|
homepage_uri: https://github.com/yoshoku/suika
|
59
60
|
source_code_uri: https://github.com/yoshoku/suika
|
60
|
-
changelog_uri: https://github.com/yoshoku/
|
61
|
+
changelog_uri: https://github.com/yoshoku/suika/blob/master/CHANGELOG.md
|
61
62
|
documentation_uri: https://rubydoc.info/gems/suika
|
62
63
|
post_install_message:
|
63
64
|
rdoc_options: []
|