suika 0.1.4 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +2 -1
- data/.travis.yml +8 -2
- data/CHANGELOG.md +6 -0
- data/Gemfile.lock +3 -3
- data/README.md +20 -0
- data/dict/{ipadic.gz → sysdic.gz} +0 -0
- data/lib/suika.rb +1 -0
- data/lib/suika/lattice.rb +4 -6
- data/lib/suika/node.rb +21 -0
- data/lib/suika/tagger.rb +43 -25
- data/lib/suika/version.rb +1 -1
- data/suika.gemspec +1 -2
- metadata +10 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bfb2e78e15c648ee309868bdfd3f386a66b1cff633cb546880132cdb9b8f3806
|
4
|
+
data.tar.gz: d398f4de11a4af80b7c62c4e468fa2e3f9393bbb2211e0f2e317a08ed05c73b5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6fea777a4229725a174aa0955bcf5f775cc6e319fa73cf6f204385b3a47d84997b74b8798f78269f442aa11dcf151b1319c079c315dfea103dcd7a148cf0b5c5
|
7
|
+
data.tar.gz: 17d2fd7f248c965b6d585542c4ec4cb90e87663f96b0522f06f1c3e94c55a18e59f08e4ab1e7724f9dd48a114c2c0f35a0499100fb8850a89d2822de87927988
|
data/.rubocop.yml
CHANGED
data/.travis.yml
CHANGED
data/CHANGELOG.md
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
suika (0.1.4)
|
5
|
-
|
4
|
+
suika (0.2.0)
|
5
|
+
dartsclone (>= 0.2.0)
|
6
6
|
|
7
7
|
GEM
|
8
8
|
remote: https://rubygems.org/
|
9
9
|
specs:
|
10
|
+
dartsclone (0.2.0)
|
10
11
|
diff-lcs (1.4.4)
|
11
12
|
rake (12.3.3)
|
12
|
-
rambling-trie (2.1.1)
|
13
13
|
rspec (3.9.0)
|
14
14
|
rspec-core (~> 3.9.0)
|
15
15
|
rspec-expectations (~> 3.9.0)
|
data/README.md
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
# Suika
|
2
2
|
|
3
|
+
[![Build Status](https://travis-ci.org/yoshoku/suika.svg?branch=master)](https://travis-ci.org/yoshoku/suika)
|
3
4
|
[![Gem Version](https://badge.fury.io/rb/suika.svg)](https://badge.fury.io/rb/suika)
|
4
5
|
[![BSD 3-Clause License](https://img.shields.io/badge/License-BSD%203--Clause-orange.svg)](https://github.com/yoshoku/suika/blob/master/LICENSE.txt)
|
5
6
|
[![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://rubydoc.info/gems/suika)
|
@@ -51,6 +52,25 @@ sentences.each do |sentence|
|
|
51
52
|
end
|
52
53
|
```
|
53
54
|
|
55
|
+
## Test
|
56
|
+
Suika was able to parse all sentences in the [Livedoor news corpus](https://www.rondhuit.com/download.html#ldcc)
|
57
|
+
without any error.
|
58
|
+
|
59
|
+
```ruby
|
60
|
+
require 'suika'
|
61
|
+
|
62
|
+
tagger = Suika::Tagger.new
|
63
|
+
|
64
|
+
Dir.glob('ldcc-20140209/text/*/*.txt').each do |filename|
|
65
|
+
File.foreach(filename) do |sentence|
|
66
|
+
sentence.strip!
|
67
|
+
puts tagger.parse(sentence) unless sentence.empty?
|
68
|
+
end
|
69
|
+
end
|
70
|
+
```
|
71
|
+
|
72
|
+
![suika_test](https://user-images.githubusercontent.com/5562409/90264778-8f593f80-de8c-11ea-81f1-20831e3c8b12.gif)
|
73
|
+
|
54
74
|
## Contributing
|
55
75
|
|
56
76
|
Bug reports and pull requests are welcome on GitHub at https://github.com/yoshoku/suika.
|
Binary file
|
data/lib/suika.rb
CHANGED
data/lib/suika/lattice.rb
CHANGED
@@ -4,8 +4,6 @@ module Suika
|
|
4
4
|
# @!visibility private
|
5
5
|
class Lattice
|
6
6
|
# @!visibility private
|
7
|
-
Node = Struct.new(:surface, :unknown, :min_cost, :min_prev, :left_id, :right_id, :cost, :attrs, keyword_init: true)
|
8
|
-
|
9
7
|
attr_reader :begin_nodes, :end_nodes, :length
|
10
8
|
|
11
9
|
# @!visibility private
|
@@ -14,16 +12,16 @@ module Suika
|
|
14
12
|
@begin_nodes = Array.new(length + 1) { [] }
|
15
13
|
@end_nodes = Array.new(length + 1) { [] }
|
16
14
|
bos = Node.new(surface: 'BOS', unknown: false, left_id: 0, right_id: 0, cost: 0, attrs: [])
|
17
|
-
@end_nodes[0].
|
15
|
+
@end_nodes[0].push(bos)
|
18
16
|
eos = Node.new(surface: 'EOS', unknown: false, left_id: 0, right_id: 0, cost: 0, attrs: [])
|
19
|
-
@begin_nodes[length].
|
17
|
+
@begin_nodes[length].push(eos)
|
20
18
|
end
|
21
19
|
|
22
20
|
# @!visibility private
|
23
21
|
def insert(begin_id, end_id, surface, unknown, left_id, right_id, cost, attrs)
|
24
22
|
node = Node.new(surface: surface, unknown: unknown, left_id: left_id, right_id: right_id, cost: cost, attrs: attrs)
|
25
|
-
@begin_nodes[begin_id].
|
26
|
-
@end_nodes[end_id].
|
23
|
+
@begin_nodes[begin_id].push(node)
|
24
|
+
@end_nodes[end_id].push(node)
|
27
25
|
end
|
28
26
|
end
|
29
27
|
end
|
data/lib/suika/node.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Suika
|
4
|
+
# @!visibility private
|
5
|
+
class Node
|
6
|
+
# @!visibility private
|
7
|
+
attr_accessor :surface, :unknown, :min_cost, :min_prev, :left_id, :right_id, :cost, :attrs
|
8
|
+
|
9
|
+
# @!visibility private
|
10
|
+
def initialize(surface: '', unknown: false, min_cost: 0, min_prev: nil, left_id: 0, right_id: 0, cost: 0, attrs: [])
|
11
|
+
@surface = surface
|
12
|
+
@unknown = unknown
|
13
|
+
@min_cost = min_cost
|
14
|
+
@min_prev = min_prev
|
15
|
+
@left_id = left_id
|
16
|
+
@right_id = right_id
|
17
|
+
@cost = cost
|
18
|
+
@attrs = attrs
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
data/lib/suika/tagger.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require '
|
3
|
+
require 'dartsclone'
|
4
|
+
require 'rubygems/package'
|
4
5
|
require 'zlib'
|
5
6
|
|
6
7
|
module Suika
|
@@ -23,11 +24,11 @@ module Suika
|
|
23
24
|
class Tagger
|
24
25
|
# Create a new tagger by loading the built-in binary dictionary.
|
25
26
|
def initialize
|
26
|
-
|
27
|
-
|
28
|
-
@
|
29
|
-
@
|
30
|
-
@
|
27
|
+
raise IOError, 'SHA1 digest of dictionary file does not match.' unless DICTIONARY_KEY == Digest::SHA1.file(DICTIONARY_PATH).to_s
|
28
|
+
|
29
|
+
@sysdic = Marshal.load(Zlib::GzipReader.open(DICTIONARY_PATH, &:read))
|
30
|
+
@trie = DartsClone::DoubleArray.new
|
31
|
+
@trie.set_array(@sysdic[:trie])
|
31
32
|
end
|
32
33
|
|
33
34
|
# Parse the given sentence.
|
@@ -39,39 +40,40 @@ module Suika
|
|
39
40
|
terminal = sentence.length
|
40
41
|
|
41
42
|
while start < terminal
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
43
|
+
step = terminal - start
|
44
|
+
|
45
|
+
query = sentence[start..-1]
|
46
|
+
result = trie.common_prefix_search(query)
|
47
|
+
unless result.empty?
|
48
|
+
words, indices = result
|
49
|
+
words.each_with_index do |word, i|
|
50
|
+
features[indices[i]].each do |el|
|
49
51
|
lattice.insert(start, start + word.length, word, false,
|
50
52
|
el[0].to_i, el[1].to_i, el[2].to_i, el[3..-1])
|
51
53
|
end
|
52
54
|
end
|
53
|
-
|
54
|
-
word = sentence[start..pos]
|
55
|
+
step = words.map(&:size).min
|
55
56
|
end
|
56
57
|
|
57
58
|
word = sentence[start]
|
58
59
|
char_cate = CharDef.char_category(sentence[start])
|
59
|
-
|
60
|
+
char_type = CharDef.char_type(sentence[start])
|
61
|
+
if char_cate[:invoke]
|
60
62
|
char_length = char_cate[:group] ? CharDef::MAX_GROUPING_SIZE : char_cate[:length]
|
61
63
|
unk_terminal = [start + char_length, terminal].min
|
62
64
|
pos = start + 1
|
63
|
-
char_type = CharDef.char_type(sentence[start])
|
64
65
|
while pos < unk_terminal && char_type == CharDef.char_type(sentence[pos])
|
65
66
|
word << sentence[pos]
|
66
67
|
pos += 1
|
67
68
|
end
|
68
|
-
@unknown_dictionary[char_type].each do |el|
|
69
|
-
lattice.insert(start, start + word.length, word, true,
|
70
|
-
el[0].to_i, el[1].to_i, el[2].to_i, el[3..-1])
|
71
|
-
end
|
72
69
|
end
|
70
|
+
unknowns[char_type].each do |el|
|
71
|
+
lattice.insert(start, start + word.length, word, true,
|
72
|
+
el[0].to_i, el[1].to_i, el[2].to_i, el[3..-1])
|
73
|
+
end
|
74
|
+
step = [step, word.length].min
|
73
75
|
|
74
|
-
start +=
|
76
|
+
start += step
|
75
77
|
end
|
76
78
|
|
77
79
|
viterbi(lattice)
|
@@ -79,9 +81,25 @@ module Suika
|
|
79
81
|
|
80
82
|
private
|
81
83
|
|
84
|
+
DICTIONARY_PATH = "#{__dir__}/../../dict/sysdic.gz"
|
85
|
+
DICTIONARY_KEY = '562e53853b8a5b9f4857536b0748847a0878ebf0'
|
82
86
|
INT_MAX = 2**(([42].pack('i').size * 16) - 2) - 1
|
83
87
|
|
84
|
-
private_constant :INT_MAX
|
88
|
+
private_constant :DICTIONARY_PATH, :DICTIONARY_KEY, :INT_MAX
|
89
|
+
|
90
|
+
attr_reader :trie
|
91
|
+
|
92
|
+
def features
|
93
|
+
@sysdic[:dictionary]
|
94
|
+
end
|
95
|
+
|
96
|
+
def unknowns
|
97
|
+
@sysdic[:unknown_dictionary]
|
98
|
+
end
|
99
|
+
|
100
|
+
def costmat
|
101
|
+
@sysdic[:cost_matrix]
|
102
|
+
end
|
85
103
|
|
86
104
|
def viterbi(lattice)
|
87
105
|
bos = lattice.end_nodes[0].first
|
@@ -93,7 +111,7 @@ module Suika
|
|
93
111
|
rnode.min_cost = INT_MAX
|
94
112
|
rnode.min_prev = nil
|
95
113
|
lattice.end_nodes[n].each do |lnode|
|
96
|
-
cost = lnode.min_cost +
|
114
|
+
cost = lnode.min_cost + costmat[lnode.right_id][rnode.left_id] + rnode.cost
|
97
115
|
if cost < rnode.min_cost
|
98
116
|
rnode.min_cost = cost
|
99
117
|
rnode.min_prev = lnode
|
@@ -106,7 +124,7 @@ module Suika
|
|
106
124
|
prev_node = eos.min_prev
|
107
125
|
res = []
|
108
126
|
until prev_node.nil?
|
109
|
-
res.
|
127
|
+
res.push("#{prev_node.surface}\t#{prev_node.attrs.join(',')}") if prev_node.surface != 'BOS' && prev_node.surface != 'EOS'
|
110
128
|
prev_node = prev_node.min_prev
|
111
129
|
end
|
112
130
|
|
data/lib/suika/version.rb
CHANGED
data/suika.gemspec
CHANGED
@@ -12,7 +12,6 @@ Gem::Specification.new do |spec|
|
|
12
12
|
spec.description = 'Suika is a Japanese morphological analyzer written in pure Ruby.'
|
13
13
|
spec.homepage = 'https://github.com/yoshoku/suika'
|
14
14
|
spec.license = 'BSD-3-Clause'
|
15
|
-
spec.required_ruby_version = Gem::Requirement.new('>= 2.3.0')
|
16
15
|
|
17
16
|
spec.metadata['homepage_uri'] = spec.homepage
|
18
17
|
spec.metadata['source_code_uri'] = spec.homepage
|
@@ -28,5 +27,5 @@ Gem::Specification.new do |spec|
|
|
28
27
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
29
28
|
spec.require_paths = ['lib']
|
30
29
|
|
31
|
-
spec.add_runtime_dependency '
|
30
|
+
spec.add_runtime_dependency 'dartsclone', '>= 0.2.0'
|
32
31
|
end
|
metadata
CHANGED
@@ -1,29 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: suika
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.4
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yoshoku
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-10-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name: rambling-trie
|
14
|
+
name: dartsclone
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - "
|
17
|
+
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
19
|
+
version: 0.2.0
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - "
|
24
|
+
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
26
|
+
version: 0.2.0
|
27
27
|
description: Suika is a Japanese morphological analyzer written in pure Ruby.
|
28
28
|
email:
|
29
29
|
- yoshoku@outlook.com
|
@@ -45,10 +45,11 @@ files:
|
|
45
45
|
- Rakefile
|
46
46
|
- bin/console
|
47
47
|
- bin/setup
|
48
|
-
- dict/
|
48
|
+
- dict/sysdic.gz
|
49
49
|
- lib/suika.rb
|
50
50
|
- lib/suika/char_def.rb
|
51
51
|
- lib/suika/lattice.rb
|
52
|
+
- lib/suika/node.rb
|
52
53
|
- lib/suika/tagger.rb
|
53
54
|
- lib/suika/version.rb
|
54
55
|
- suika.gemspec
|
@@ -68,7 +69,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
68
69
|
requirements:
|
69
70
|
- - ">="
|
70
71
|
- !ruby/object:Gem::Version
|
71
|
-
version:
|
72
|
+
version: '0'
|
72
73
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
73
74
|
requirements:
|
74
75
|
- - ">="
|