suika 0.1.4 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +2 -1
- data/.travis.yml +8 -2
- data/CHANGELOG.md +6 -0
- data/Gemfile.lock +3 -3
- data/README.md +20 -0
- data/dict/{ipadic.gz → sysdic.gz} +0 -0
- data/lib/suika.rb +1 -0
- data/lib/suika/lattice.rb +4 -6
- data/lib/suika/node.rb +21 -0
- data/lib/suika/tagger.rb +43 -25
- data/lib/suika/version.rb +1 -1
- data/suika.gemspec +1 -2
- metadata +10 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bfb2e78e15c648ee309868bdfd3f386a66b1cff633cb546880132cdb9b8f3806
|
4
|
+
data.tar.gz: d398f4de11a4af80b7c62c4e468fa2e3f9393bbb2211e0f2e317a08ed05c73b5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6fea777a4229725a174aa0955bcf5f775cc6e319fa73cf6f204385b3a47d84997b74b8798f78269f442aa11dcf151b1319c079c315dfea103dcd7a148cf0b5c5
|
7
|
+
data.tar.gz: 17d2fd7f248c965b6d585542c4ec4cb90e87663f96b0522f06f1c3e94c55a18e59f08e4ab1e7724f9dd48a114c2c0f35a0499100fb8850a89d2822de87927988
|
data/.rubocop.yml
CHANGED
data/.travis.yml
CHANGED
data/CHANGELOG.md
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
suika (0.1.4)
|
5
|
-
|
4
|
+
suika (0.2.0)
|
5
|
+
dartsclone (>= 0.2.0)
|
6
6
|
|
7
7
|
GEM
|
8
8
|
remote: https://rubygems.org/
|
9
9
|
specs:
|
10
|
+
dartsclone (0.2.0)
|
10
11
|
diff-lcs (1.4.4)
|
11
12
|
rake (12.3.3)
|
12
|
-
rambling-trie (2.1.1)
|
13
13
|
rspec (3.9.0)
|
14
14
|
rspec-core (~> 3.9.0)
|
15
15
|
rspec-expectations (~> 3.9.0)
|
data/README.md
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
# Suika
|
2
2
|
|
3
|
+
[Build Status](https://travis-ci.org/yoshoku/suika)
|
3
4
|
[Gem Version](https://badge.fury.io/rb/suika)
|
4
5
|
[BSD 3-Clause License](https://github.com/yoshoku/suika/blob/master/LICENSE.txt)
|
5
6
|
[Documentation](https://rubydoc.info/gems/suika)
|
@@ -51,6 +52,25 @@ sentences.each do |sentence|
|
|
51
52
|
end
|
52
53
|
```
|
53
54
|
|
55
|
+
## Test
|
56
|
+
Suika was able to parse all sentences in the [Livedoor news corpus](https://www.rondhuit.com/download.html#ldcc)
|
57
|
+
without any error.
|
58
|
+
|
59
|
+
```ruby
|
60
|
+
require 'suika'
|
61
|
+
|
62
|
+
tagger = Suika::Tagger.new
|
63
|
+
|
64
|
+
Dir.glob('ldcc-20140209/text/*/*.txt').each do |filename|
|
65
|
+
File.foreach(filename) do |sentence|
|
66
|
+
sentence.strip!
|
67
|
+
puts tagger.parse(sentence) unless sentence.empty?
|
68
|
+
end
|
69
|
+
end
|
70
|
+
```
|
71
|
+
|
72
|
+

|
73
|
+
|
54
74
|
## Contributing
|
55
75
|
|
56
76
|
Bug reports and pull requests are welcome on GitHub at https://github.com/yoshoku/suika.
|
Binary file
|
data/lib/suika.rb
CHANGED
data/lib/suika/lattice.rb
CHANGED
@@ -4,8 +4,6 @@ module Suika
|
|
4
4
|
# @!visibility private
|
5
5
|
class Lattice
|
6
6
|
# @!visibility private
|
7
|
-
Node = Struct.new(:surface, :unknown, :min_cost, :min_prev, :left_id, :right_id, :cost, :attrs, keyword_init: true)
|
8
|
-
|
9
7
|
attr_reader :begin_nodes, :end_nodes, :length
|
10
8
|
|
11
9
|
# @!visibility private
|
@@ -14,16 +12,16 @@ module Suika
|
|
14
12
|
@begin_nodes = Array.new(length + 1) { [] }
|
15
13
|
@end_nodes = Array.new(length + 1) { [] }
|
16
14
|
bos = Node.new(surface: 'BOS', unknown: false, left_id: 0, right_id: 0, cost: 0, attrs: [])
|
17
|
-
@end_nodes[0].
|
15
|
+
@end_nodes[0].push(bos)
|
18
16
|
eos = Node.new(surface: 'EOS', unknown: false, left_id: 0, right_id: 0, cost: 0, attrs: [])
|
19
|
-
@begin_nodes[length].
|
17
|
+
@begin_nodes[length].push(eos)
|
20
18
|
end
|
21
19
|
|
22
20
|
# @!visibility private
|
23
21
|
def insert(begin_id, end_id, surface, unknown, left_id, right_id, cost, attrs)
|
24
22
|
node = Node.new(surface: surface, unknown: unknown, left_id: left_id, right_id: right_id, cost: cost, attrs: attrs)
|
25
|
-
@begin_nodes[begin_id].
|
26
|
-
@end_nodes[end_id].
|
23
|
+
@begin_nodes[begin_id].push(node)
|
24
|
+
@end_nodes[end_id].push(node)
|
27
25
|
end
|
28
26
|
end
|
29
27
|
end
|
data/lib/suika/node.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Suika
|
4
|
+
# @!visibility private
|
5
|
+
class Node
|
6
|
+
# @!visibility private
|
7
|
+
attr_accessor :surface, :unknown, :min_cost, :min_prev, :left_id, :right_id, :cost, :attrs
|
8
|
+
|
9
|
+
# @!visibility private
|
10
|
+
def initialize(surface: '', unknown: false, min_cost: 0, min_prev: nil, left_id: 0, right_id: 0, cost: 0, attrs: [])
|
11
|
+
@surface = surface
|
12
|
+
@unknown = unknown
|
13
|
+
@min_cost = min_cost
|
14
|
+
@min_prev = min_prev
|
15
|
+
@left_id = left_id
|
16
|
+
@right_id = right_id
|
17
|
+
@cost = cost
|
18
|
+
@attrs = attrs
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
data/lib/suika/tagger.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require '
|
3
|
+
require 'dartsclone'
|
4
|
+
require 'rubygems/package'
|
4
5
|
require 'zlib'
|
5
6
|
|
6
7
|
module Suika
|
@@ -23,11 +24,11 @@ module Suika
|
|
23
24
|
class Tagger
|
24
25
|
# Create a new tagger by loading the built-in binary dictionary.
|
25
26
|
def initialize
|
26
|
-
|
27
|
-
|
28
|
-
@
|
29
|
-
@
|
30
|
-
@
|
27
|
+
raise IOError, 'SHA1 digest of dictionary file does not match.' unless DICTIONARY_KEY == Digest::SHA1.file(DICTIONARY_PATH).to_s
|
28
|
+
|
29
|
+
@sysdic = Marshal.load(Zlib::GzipReader.open(DICTIONARY_PATH, &:read))
|
30
|
+
@trie = DartsClone::DoubleArray.new
|
31
|
+
@trie.set_array(@sysdic[:trie])
|
31
32
|
end
|
32
33
|
|
33
34
|
# Parse the given sentence.
|
@@ -39,39 +40,40 @@ module Suika
|
|
39
40
|
terminal = sentence.length
|
40
41
|
|
41
42
|
while start < terminal
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
43
|
+
step = terminal - start
|
44
|
+
|
45
|
+
query = sentence[start..-1]
|
46
|
+
result = trie.common_prefix_search(query)
|
47
|
+
unless result.empty?
|
48
|
+
words, indices = result
|
49
|
+
words.each_with_index do |word, i|
|
50
|
+
features[indices[i]].each do |el|
|
49
51
|
lattice.insert(start, start + word.length, word, false,
|
50
52
|
el[0].to_i, el[1].to_i, el[2].to_i, el[3..-1])
|
51
53
|
end
|
52
54
|
end
|
53
|
-
|
54
|
-
word = sentence[start..pos]
|
55
|
+
step = words.map(&:size).min
|
55
56
|
end
|
56
57
|
|
57
58
|
word = sentence[start]
|
58
59
|
char_cate = CharDef.char_category(sentence[start])
|
59
|
-
|
60
|
+
char_type = CharDef.char_type(sentence[start])
|
61
|
+
if char_cate[:invoke]
|
60
62
|
char_length = char_cate[:group] ? CharDef::MAX_GROUPING_SIZE : char_cate[:length]
|
61
63
|
unk_terminal = [start + char_length, terminal].min
|
62
64
|
pos = start + 1
|
63
|
-
char_type = CharDef.char_type(sentence[start])
|
64
65
|
while pos < unk_terminal && char_type == CharDef.char_type(sentence[pos])
|
65
66
|
word << sentence[pos]
|
66
67
|
pos += 1
|
67
68
|
end
|
68
|
-
@unknown_dictionary[char_type].each do |el|
|
69
|
-
lattice.insert(start, start + word.length, word, true,
|
70
|
-
el[0].to_i, el[1].to_i, el[2].to_i, el[3..-1])
|
71
|
-
end
|
72
69
|
end
|
70
|
+
unknowns[char_type].each do |el|
|
71
|
+
lattice.insert(start, start + word.length, word, true,
|
72
|
+
el[0].to_i, el[1].to_i, el[2].to_i, el[3..-1])
|
73
|
+
end
|
74
|
+
step = [step, word.length].min
|
73
75
|
|
74
|
-
start +=
|
76
|
+
start += step
|
75
77
|
end
|
76
78
|
|
77
79
|
viterbi(lattice)
|
@@ -79,9 +81,25 @@ module Suika
|
|
79
81
|
|
80
82
|
private
|
81
83
|
|
84
|
+
DICTIONARY_PATH = "#{__dir__}/../../dict/sysdic.gz"
|
85
|
+
DICTIONARY_KEY = '562e53853b8a5b9f4857536b0748847a0878ebf0'
|
82
86
|
INT_MAX = 2**(([42].pack('i').size * 16) - 2) - 1
|
83
87
|
|
84
|
-
private_constant :INT_MAX
|
88
|
+
private_constant :DICTIONARY_PATH, :DICTIONARY_KEY, :INT_MAX
|
89
|
+
|
90
|
+
attr_reader :trie
|
91
|
+
|
92
|
+
def features
|
93
|
+
@sysdic[:dictionary]
|
94
|
+
end
|
95
|
+
|
96
|
+
def unknowns
|
97
|
+
@sysdic[:unknown_dictionary]
|
98
|
+
end
|
99
|
+
|
100
|
+
def costmat
|
101
|
+
@sysdic[:cost_matrix]
|
102
|
+
end
|
85
103
|
|
86
104
|
def viterbi(lattice)
|
87
105
|
bos = lattice.end_nodes[0].first
|
@@ -93,7 +111,7 @@ module Suika
|
|
93
111
|
rnode.min_cost = INT_MAX
|
94
112
|
rnode.min_prev = nil
|
95
113
|
lattice.end_nodes[n].each do |lnode|
|
96
|
-
cost = lnode.min_cost +
|
114
|
+
cost = lnode.min_cost + costmat[lnode.right_id][rnode.left_id] + rnode.cost
|
97
115
|
if cost < rnode.min_cost
|
98
116
|
rnode.min_cost = cost
|
99
117
|
rnode.min_prev = lnode
|
@@ -106,7 +124,7 @@ module Suika
|
|
106
124
|
prev_node = eos.min_prev
|
107
125
|
res = []
|
108
126
|
until prev_node.nil?
|
109
|
-
res.
|
127
|
+
res.push("#{prev_node.surface}\t#{prev_node.attrs.join(',')}") if prev_node.surface != 'BOS' && prev_node.surface != 'EOS'
|
110
128
|
prev_node = prev_node.min_prev
|
111
129
|
end
|
112
130
|
|
data/lib/suika/version.rb
CHANGED
data/suika.gemspec
CHANGED
@@ -12,7 +12,6 @@ Gem::Specification.new do |spec|
|
|
12
12
|
spec.description = 'Suika is a Japanese morphological analyzer written in pure Ruby.'
|
13
13
|
spec.homepage = 'https://github.com/yoshoku/suika'
|
14
14
|
spec.license = 'BSD-3-Clause'
|
15
|
-
spec.required_ruby_version = Gem::Requirement.new('>= 2.3.0')
|
16
15
|
|
17
16
|
spec.metadata['homepage_uri'] = spec.homepage
|
18
17
|
spec.metadata['source_code_uri'] = spec.homepage
|
@@ -28,5 +27,5 @@ Gem::Specification.new do |spec|
|
|
28
27
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
29
28
|
spec.require_paths = ['lib']
|
30
29
|
|
31
|
-
spec.add_runtime_dependency '
|
30
|
+
spec.add_runtime_dependency 'dartsclone', '>= 0.2.0'
|
32
31
|
end
|
metadata
CHANGED
@@ -1,29 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: suika
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.4
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yoshoku
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-10-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name: rambling-trie
|
14
|
+
name: dartsclone
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - "
|
17
|
+
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
19
|
+
version: 0.2.0
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - "
|
24
|
+
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
26
|
+
version: 0.2.0
|
27
27
|
description: Suika is a Japanese morphological analyzer written in pure Ruby.
|
28
28
|
email:
|
29
29
|
- yoshoku@outlook.com
|
@@ -45,10 +45,11 @@ files:
|
|
45
45
|
- Rakefile
|
46
46
|
- bin/console
|
47
47
|
- bin/setup
|
48
|
-
- dict/
|
48
|
+
- dict/sysdic.gz
|
49
49
|
- lib/suika.rb
|
50
50
|
- lib/suika/char_def.rb
|
51
51
|
- lib/suika/lattice.rb
|
52
|
+
- lib/suika/node.rb
|
52
53
|
- lib/suika/tagger.rb
|
53
54
|
- lib/suika/version.rb
|
54
55
|
- suika.gemspec
|
@@ -68,7 +69,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
68
69
|
requirements:
|
69
70
|
- - ">="
|
70
71
|
- !ruby/object:Gem::Version
|
71
|
-
version:
|
72
|
+
version: '0'
|
72
73
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
73
74
|
requirements:
|
74
75
|
- - ">="
|