suika 0.1.2 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.coveralls.yml +1 -0
- data/.github/workflows/build.yml +21 -0
- data/.github/workflows/coverage.yml +26 -0
- data/.rubocop.yml +2 -1
- data/CHANGELOG.md +43 -3
- data/Gemfile +5 -1
- data/LICENSE.txt +1 -1
- data/README.md +33 -12
- data/Rakefile +75 -2
- data/Steepfile +20 -0
- data/dict/{ipadic.gz → sysdic.gz} +0 -0
- data/lib/suika.rb +1 -0
- data/lib/suika/char_def.rb +18 -14
- data/lib/suika/lattice.rb +8 -10
- data/lib/suika/node.rb +21 -0
- data/lib/suika/tagger.rb +67 -47
- data/lib/suika/version.rb +1 -1
- data/sig/suika.rbs +3 -0
- data/sig/suika/char_def.rbs +25 -0
- data/sig/suika/lattice.rbs +11 -0
- data/sig/suika/node.rbs +18 -0
- data/sig/suika/tagger.rbs +24 -0
- data/suika.gemspec +2 -3
- metadata +24 -15
- data/.travis.yml +0 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 148e229070959a89197febf9bf9eabfdbe941f3c1cda66b75f87afb9371436c1
|
4
|
+
data.tar.gz: c6d4fa8c654144ad39e19ff23d63161ea193bd8760eb654635d36666bed6f2dd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: dbe9535910050678c51c3ff2d95118959d3b4cdeb4fffb2fb405ad5871258f527d1fa3f36df59706b3cf3d8c6265f19bd5844d778273fc369fc763eec293cf89
|
7
|
+
data.tar.gz: 063ed20722d52ac97b4993093a60ad866fd744c8e74361b3934c1f96082523da510f8d7adcbb46e238aa03ba7dbf53539180215aa5475488294a77d92ea8633e
|
data/.coveralls.yml
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
service_name: github-ci
|
data/.github/workflows/build.yml
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
name: build
|
2
|
+
|
3
|
+
on: [push, pull_request]
|
4
|
+
|
5
|
+
jobs:
|
6
|
+
build:
|
7
|
+
runs-on: ubuntu-latest
|
8
|
+
strategy:
|
9
|
+
matrix:
|
10
|
+
ruby: [ '2.6', '2.7', '3.0' ]
|
11
|
+
steps:
|
12
|
+
- uses: actions/checkout@v2
|
13
|
+
- name: Set up Ruby ${{ matrix.ruby }}
|
14
|
+
uses: actions/setup-ruby@v1
|
15
|
+
with:
|
16
|
+
ruby-version: ${{ matrix.ruby }}
|
17
|
+
- name: Build and test with Rake
|
18
|
+
run: |
|
19
|
+
gem install --no-document bundler
|
20
|
+
bundle install --jobs 4 --retry 3
|
21
|
+
bundle exec rake
|
data/.github/workflows/coverage.yml
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
name: coverage
|
2
|
+
|
3
|
+
on:
|
4
|
+
push:
|
5
|
+
branches: [ main ]
|
6
|
+
pull_request:
|
7
|
+
branches: [ main ]
|
8
|
+
|
9
|
+
jobs:
|
10
|
+
coverage:
|
11
|
+
runs-on: ubuntu-latest
|
12
|
+
steps:
|
13
|
+
- uses: actions/checkout@v2
|
14
|
+
- name: Set up Ruby 2.7
|
15
|
+
uses: actions/setup-ruby@v1
|
16
|
+
with:
|
17
|
+
ruby-version: '2.7'
|
18
|
+
- name: Build and test with Rake
|
19
|
+
run: |
|
20
|
+
gem install --no-document bundler
|
21
|
+
bundle install --jobs 4 --retry 3
|
22
|
+
bundle exec rake
|
23
|
+
- name: Coveralls GitHub Action
|
24
|
+
uses: coverallsapp/github-action@v1.1.2
|
25
|
+
with:
|
26
|
+
github-token: ${{ secrets.GITHUB_TOKEN }}
|
data/.rubocop.yml
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,8 +1,48 @@
|
|
1
|
-
|
1
|
+
## 0.3.1
|
2
|
+
- Fix Tagger's inspect method so that it does not expand instance variables when an object is created on irb or pry.
|
3
|
+
|
4
|
+
## 0.3.0
|
5
|
+
|
6
|
+
### Changes
|
7
|
+
- Add type declaration files.
|
8
|
+
- Refactor to avoid assigning null to variables.
|
9
|
+
- Fix some configuration files.
|
10
|
+
|
11
|
+
|
12
|
+
## 0.2.0
|
13
|
+
|
14
|
+
### Breaking Change
|
15
|
+
- Change to use dartsclone for trie library.
|
16
|
+
|
17
|
+
|
18
|
+
## 0.1.4
|
19
|
+
|
20
|
+
### Bug Fixes
|
21
|
+
- Fix CharDef.char_type to return 'DEFAULT' when an unknown character code is given.
|
22
|
+
|
23
|
+
### Features
|
24
|
+
- Add the character code of the square era name Reiwa (U+32FF).
|
25
|
+
|
26
|
+
## 0.1.3
|
27
|
+
|
28
|
+
### Bug Fixes
|
29
|
+
- Fix unknown word processing.
|
30
|
+
|
31
|
+
### Changes
|
32
|
+
- Remove redundant spaces from output.
|
33
|
+
|
34
|
+
|
35
|
+
## 0.1.2
|
36
|
+
|
37
|
+
### Bug Fixes
|
2
38
|
- Fix local variable typo in Tagger.parse.
|
3
39
|
|
4
|
-
|
40
|
+
|
41
|
+
## 0.1.1
|
42
|
+
|
43
|
+
### Bug Fixes
|
5
44
|
- Fix specification of class in CharDef.char_type.
|
6
45
|
|
7
|
-
|
46
|
+
|
47
|
+
## 0.1.0
|
8
48
|
- First release.
|
data/Gemfile
CHANGED
@@ -5,5 +5,9 @@ source 'https://rubygems.org'
|
|
5
5
|
# Specify your gem's dependencies in suika.gemspec
|
6
6
|
gemspec
|
7
7
|
|
8
|
-
gem 'rake', '~>
|
8
|
+
gem 'rake', '~> 13.0'
|
9
9
|
gem 'rspec', '~> 3.0'
|
10
|
+
gem 'simplecov', '~> 0.21'
|
11
|
+
gem 'simplecov-lcov', '~> 0.8'
|
12
|
+
gem 'rbs', '~> 1.2'
|
13
|
+
gem 'steep', '~> 0.44'
|
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -1,7 +1,9 @@
|
|
1
1
|
# Suika
|
2
2
|
|
3
|
+
[![Build Status](https://github.com/yoshoku/suika/workflows/build/badge.svg)](https://github.com/yoshoku/suika/actions?query=workflow%3Abuild)
|
4
|
+
[![Coverage Status](https://coveralls.io/repos/github/yoshoku/suika/badge.svg?branch=main)](https://coveralls.io/github/yoshoku/suika?branch=main)
|
3
5
|
[![Gem Version](https://badge.fury.io/rb/suika.svg)](https://badge.fury.io/rb/suika)
|
4
|
-
[![BSD 3-Clause License](https://img.shields.io/badge/License-BSD%203--Clause-orange.svg)](https://github.com/yoshoku/suika/blob/
|
6
|
+
[![BSD 3-Clause License](https://img.shields.io/badge/License-BSD%203--Clause-orange.svg)](https://github.com/yoshoku/suika/blob/main/LICENSE.txt)
|
5
7
|
[![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://rubydoc.info/gems/suika)
|
6
8
|
|
7
9
|
Suika 🍉 is a Japanese morphological analyzer written in pure Ruby.
|
@@ -30,13 +32,13 @@ require 'suika'
|
|
30
32
|
tagger = Suika::Tagger.new
|
31
33
|
tagger.parse('すもももももももものうち').each { |token| puts token }
|
32
34
|
|
33
|
-
# すもも
|
34
|
-
# も
|
35
|
-
# もも
|
36
|
-
# も
|
37
|
-
# もも
|
38
|
-
# の
|
39
|
-
# うち
|
35
|
+
# すもも 名詞,一般,*,*,*,*,すもも,スモモ,スモモ
|
36
|
+
# も 助詞,係助詞,*,*,*,*,も,モ,モ
|
37
|
+
# もも 名詞,一般,*,*,*,*,もも,モモ,モモ
|
38
|
+
# も 助詞,係助詞,*,*,*,*,も,モ,モ
|
39
|
+
# もも 名詞,一般,*,*,*,*,もも,モモ,モモ
|
40
|
+
# の 助詞,連体化,*,*,*,*,の,ノ,ノ
|
41
|
+
# うち 名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ
|
40
42
|
```
|
41
43
|
|
42
44
|
Since the Tagger class loads the binary dictionary at initialization, it is recommended to reuse the instance.
|
@@ -51,17 +53,36 @@ sentences.each do |sentence|
|
|
51
53
|
end
|
52
54
|
```
|
53
55
|
|
56
|
+
## Test
|
57
|
+
Suika was able to parse all sentences in the [Livedoor news corpus](https://www.rondhuit.com/download.html#ldcc)
|
58
|
+
without any error.
|
59
|
+
|
60
|
+
```ruby
|
61
|
+
require 'suika'
|
62
|
+
|
63
|
+
tagger = Suika::Tagger.new
|
64
|
+
|
65
|
+
Dir.glob('ldcc-20140209/text/*/*.txt').each do |filename|
|
66
|
+
File.foreach(filename) do |sentence|
|
67
|
+
sentence.strip!
|
68
|
+
puts tagger.parse(sentence) unless sentence.empty?
|
69
|
+
end
|
70
|
+
end
|
71
|
+
```
|
72
|
+
|
73
|
+
![suika_test](https://user-images.githubusercontent.com/5562409/90264778-8f593f80-de8c-11ea-81f1-20831e3c8b12.gif)
|
74
|
+
|
54
75
|
## Contributing
|
55
76
|
|
56
77
|
Bug reports and pull requests are welcome on GitHub at https://github.com/yoshoku/suika.
|
57
|
-
This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/yoshoku/suika/blob/
|
78
|
+
This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/yoshoku/suika/blob/main/CODE_OF_CONDUCT.md).
|
58
79
|
|
59
80
|
## License
|
60
81
|
|
61
82
|
The gem is available as open source under the terms of the [BSD-3-Clause License](https://opensource.org/licenses/BSD-3-Clause).
|
62
83
|
In addition, the gem includes binary data generated from mecab-ipadic.
|
63
|
-
The details of the license can be found in [LICENSE.txt](https://github.com/yoshoku/suika/blob/
|
64
|
-
and [NOTICE.txt](https://github.com/yoshoku/suika/blob/
|
84
|
+
The details of the license can be found in [LICENSE.txt](https://github.com/yoshoku/suika/blob/main/LICENSE.txt)
|
85
|
+
and [NOTICE.txt](https://github.com/yoshoku/suika/blob/main/NOTICE.txt).
|
65
86
|
|
66
87
|
## Respect
|
67
88
|
|
@@ -74,4 +95,4 @@ Janome, a morphological analyzer written in scripting language, gives me the cou
|
|
74
95
|
|
75
96
|
## Code of Conduct
|
76
97
|
|
77
|
-
Everyone interacting in the Suika project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/yoshoku/suika/blob/
|
98
|
+
Everyone interacting in the Suika project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/yoshoku/suika/blob/main/CODE_OF_CONDUCT.md).
|
data/Rakefile
CHANGED
@@ -1,6 +1,79 @@
|
|
1
|
-
require
|
2
|
-
require
|
1
|
+
require 'bundler/gem_tasks'
|
2
|
+
require 'rspec/core/rake_task'
|
3
|
+
|
4
|
+
require 'csv'
|
5
|
+
require 'dartsclone'
|
6
|
+
require 'nkf'
|
7
|
+
require 'rubygems/package'
|
8
|
+
require 'zlib'
|
3
9
|
|
4
10
|
RSpec::Core::RakeTask.new(:spec)
|
5
11
|
|
6
12
|
task :default => :spec
|
13
|
+
|
14
|
+
desc 'Build suika system dictionary'
|
15
|
+
task :dictionary do
|
16
|
+
base_dir = "#{__dir__}/dict/mecab-ipadic-2.7.0-20070801"
|
17
|
+
unless File.directory?(base_dir)
|
18
|
+
puts "Download mecab-ipadic file and expand that under dict directory: #{__dir__}/dict/mecab-ipadic-2.7.0-20070801"
|
19
|
+
puts
|
20
|
+
puts 'Example:'
|
21
|
+
puts 'wget -O dict/mecab-ipadic.tgz https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM'
|
22
|
+
puts 'cd dict'
|
23
|
+
puts 'tar xzf mecab-ipadic.tgz'
|
24
|
+
puts 'cd ../'
|
25
|
+
next # exit
|
26
|
+
end
|
27
|
+
|
28
|
+
File.open("#{__dir__}/dict/mecab-ipadic-2.7.0-20070801/Reiwa.csv", 'w') do |f|
|
29
|
+
f.puts('令和,1288,1288,5904,名詞,固有名詞,一般,*,*,*,令和,レイワ,レイワ')
|
30
|
+
end
|
31
|
+
|
32
|
+
unknowns = {}
|
33
|
+
File.open("#{base_dir}/unk.def") do |f|
|
34
|
+
f.each_line do |line|
|
35
|
+
row = NKF.nkf('-w', line.chomp).split(',')
|
36
|
+
unknowns[row[0]] ||= []
|
37
|
+
unknowns[row[0]] << [row[1].to_i, row[2].to_i, row[3].to_i, *row[4..-1]]
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
dict = {}
|
42
|
+
Dir.glob("#{base_dir}/*.csv").each do |filename|
|
43
|
+
File.open(filename) do |f|
|
44
|
+
f.each_line do |line|
|
45
|
+
row = NKF.nkf('-w', line.chomp).split(',')
|
46
|
+
dict[row[0]] ||= []
|
47
|
+
dict[row[0]] << [row[1].to_i, row[2].to_i, row[3].to_i, *row[4..-1]]
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
da = DartsClone::DoubleArray.new
|
53
|
+
words = dict.keys.sort
|
54
|
+
da.build(words)
|
55
|
+
features = words.map { |w| dict[w] }
|
56
|
+
|
57
|
+
concosts = nil
|
58
|
+
File.open("#{base_dir}/matrix.def") do |f|
|
59
|
+
n_entries = f.readline.chomp.split.map(&:to_i).first
|
60
|
+
concosts = Array.new(n_entries) { Array.new(n_entries) }
|
61
|
+
f.each_line do |line|
|
62
|
+
row, col, cost = line.chomp.split.map(&:to_i)
|
63
|
+
concosts[row][col] = cost
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
ipadic = {
|
68
|
+
trie: da.get_array,
|
69
|
+
features: features,
|
70
|
+
unknowns: unknowns,
|
71
|
+
concosts: concosts
|
72
|
+
}
|
73
|
+
|
74
|
+
Zlib::GzipWriter.open("#{__dir__}/dict/sysdic.gz", Zlib::BEST_SPEED) { |f| f.write(Marshal.dump(ipadic)) }
|
75
|
+
|
76
|
+
puts 'The system dictionary has been successfully built:'
|
77
|
+
puts "#{__dir__}/dict/sysdic.gz"
|
78
|
+
puts Digest::SHA1.file("#{__dir__}/dict/sysdic.gz").to_s
|
79
|
+
end
|
data/Steepfile
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
target :lib do
|
2
|
+
signature "sig"
|
3
|
+
#
|
4
|
+
check "lib" # Directory name
|
5
|
+
# check "Gemfile" # File name
|
6
|
+
# check "app/models/**/*.rb" # Glob
|
7
|
+
# # ignore "lib/templates/*.rb"
|
8
|
+
#
|
9
|
+
# # library "pathname", "set" # Standard libraries
|
10
|
+
library "dartsclone" # Gems
|
11
|
+
end
|
12
|
+
|
13
|
+
# target :spec do
|
14
|
+
# signature "sig", "sig-private"
|
15
|
+
#
|
16
|
+
# check "spec"
|
17
|
+
#
|
18
|
+
# # library "pathname", "set" # Standard libraries
|
19
|
+
# # library "rspec"
|
20
|
+
# end
|
Binary file
|
data/lib/suika.rb
CHANGED
data/lib/suika/char_def.rb
CHANGED
@@ -5,10 +5,11 @@ module Suika
|
|
5
5
|
class CharDef
|
6
6
|
# @!visibility private
|
7
7
|
def self.char_type(ch)
|
8
|
-
|
9
|
-
CHAR_TYPES.find do |
|
10
|
-
Object.const_get("::Suika::CharDef::#{
|
8
|
+
ch_code = ch.unpack1('U*')
|
9
|
+
ch_type = CHAR_TYPES.find do |ct|
|
10
|
+
Object.const_get("::Suika::CharDef::#{ct}").any? { |r| r.include?(ch_code) }
|
11
11
|
end
|
12
|
+
ch_type || 'DEFAULT'
|
12
13
|
end
|
13
14
|
|
14
15
|
# @!visibility private
|
@@ -16,39 +17,41 @@ module Suika
|
|
16
17
|
CHAR_CATEGORY[char_type(ch)]
|
17
18
|
end
|
18
19
|
|
20
|
+
MAX_GROUPING_SIZE = 24
|
21
|
+
|
19
22
|
CHAR_CATEGORY = {
|
20
23
|
'DEFAULT' => {
|
21
|
-
invoke:
|
24
|
+
invoke: false, group: true, length: 0
|
22
25
|
},
|
23
26
|
'SPACE' => {
|
24
|
-
invoke:
|
27
|
+
invoke: false, group: true, length: 0
|
25
28
|
},
|
26
29
|
'KANJI' => {
|
27
|
-
invoke:
|
30
|
+
invoke: false, group: false, length: 2
|
28
31
|
},
|
29
32
|
'SYMBOL' => {
|
30
|
-
invoke:
|
33
|
+
invoke: true, group: true, length: 0
|
31
34
|
},
|
32
35
|
'NUMERIC' => {
|
33
|
-
invoke:
|
36
|
+
invoke: true, group: true, length: 0
|
34
37
|
},
|
35
38
|
'ALPHA' => {
|
36
|
-
invoke:
|
39
|
+
invoke: true, group: true, length: 0
|
37
40
|
},
|
38
41
|
'HIRAGANA' => {
|
39
|
-
invoke:
|
42
|
+
invoke: false, group: true, length: 2
|
40
43
|
},
|
41
44
|
'KATAKANA' => {
|
42
|
-
invoke:
|
45
|
+
invoke: true, group: true, length: 2
|
43
46
|
},
|
44
47
|
'KANJINUMERIC' => {
|
45
|
-
invoke:
|
48
|
+
invoke: true, group: true, length: 0
|
46
49
|
},
|
47
50
|
'GREEK' => {
|
48
|
-
invoke:
|
51
|
+
invoke: true, group: true, length: 0
|
49
52
|
},
|
50
53
|
'CYRILLIC' => {
|
51
|
-
invoke:
|
54
|
+
invoke: true, group: true, length: 0
|
52
55
|
}
|
53
56
|
}.freeze
|
54
57
|
|
@@ -117,6 +120,7 @@ module Suika
|
|
117
120
|
0x2B00..0x2BFF, # Miscellaneous Symbols and Arrows
|
118
121
|
0x2A00..0x2AFF, # Supplemental Mathematical Operators
|
119
122
|
0x3300..0x33FF,
|
123
|
+
0x32FF..0x32FF, # Square era name REIWA
|
120
124
|
0x3200..0x32FE, # Enclosed CJK Letters and Months
|
121
125
|
0x3000..0x303F, # CJK Symbol and Punctuation
|
122
126
|
0xFE30..0xFE4F, # CJK Compatibility Forms
|
data/lib/suika/lattice.rb
CHANGED
@@ -4,8 +4,6 @@ module Suika
|
|
4
4
|
# @!visibility private
|
5
5
|
class Lattice
|
6
6
|
# @!visibility private
|
7
|
-
Node = Struct.new(:surface, :min_cost, :min_prev, :left_id, :right_id, :cost, :attrs, keyword_init: true)
|
8
|
-
|
9
7
|
attr_reader :begin_nodes, :end_nodes, :length
|
10
8
|
|
11
9
|
# @!visibility private
|
@@ -13,17 +11,17 @@ module Suika
|
|
13
11
|
@length = length
|
14
12
|
@begin_nodes = Array.new(length + 1) { [] }
|
15
13
|
@end_nodes = Array.new(length + 1) { [] }
|
16
|
-
bos = Node.new(surface: 'BOS', left_id: 0, right_id: 0, cost: 0, attrs: [])
|
17
|
-
@end_nodes[0].
|
18
|
-
eos = Node.new(surface: 'EOS', left_id: 0, right_id: 0, cost: 0, attrs: [])
|
19
|
-
@begin_nodes[length].
|
14
|
+
bos = Node.new(surface: 'BOS', unknown: false, left_id: 0, right_id: 0, cost: 0, attrs: [])
|
15
|
+
@end_nodes[0].push(bos)
|
16
|
+
eos = Node.new(surface: 'EOS', unknown: false, left_id: 0, right_id: 0, cost: 0, attrs: [])
|
17
|
+
@begin_nodes[length].push(eos)
|
20
18
|
end
|
21
19
|
|
22
20
|
# @!visibility private
|
23
|
-
def insert(begin_id, end_id, surface, left_id, right_id, cost, attrs)
|
24
|
-
node = Node.new(surface: surface, left_id: left_id, right_id: right_id, cost: cost, attrs: attrs)
|
25
|
-
@begin_nodes[begin_id].
|
26
|
-
@end_nodes[end_id].
|
21
|
+
def insert(begin_id, end_id, surface, unknown, left_id, right_id, cost, attrs)
|
22
|
+
node = Node.new(surface: surface, unknown: unknown, left_id: left_id, right_id: right_id, cost: cost, attrs: attrs)
|
23
|
+
@begin_nodes[begin_id].push(node)
|
24
|
+
@end_nodes[end_id].push(node)
|
27
25
|
end
|
28
26
|
end
|
29
27
|
end
|
data/lib/suika/node.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Suika
|
4
|
+
# @!visibility private
|
5
|
+
class Node
|
6
|
+
# @!visibility private
|
7
|
+
attr_accessor :surface, :unknown, :min_cost, :min_prev, :left_id, :right_id, :cost, :attrs
|
8
|
+
|
9
|
+
# @!visibility private
|
10
|
+
def initialize(surface: '', unknown: false, min_cost: 0, min_prev: nil, left_id: 0, right_id: 0, cost: 0, attrs: [])
|
11
|
+
@surface = surface
|
12
|
+
@unknown = unknown
|
13
|
+
@min_cost = min_cost
|
14
|
+
@min_prev = min_prev
|
15
|
+
@left_id = left_id
|
16
|
+
@right_id = right_id
|
17
|
+
@cost = cost
|
18
|
+
@attrs = attrs
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
data/lib/suika/tagger.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require '
|
3
|
+
require 'dartsclone'
|
4
|
+
require 'rubygems/package'
|
4
5
|
require 'zlib'
|
5
6
|
|
6
7
|
module Suika
|
@@ -12,22 +13,22 @@ module Suika
|
|
12
13
|
# tagger = Suika::Tagger.new
|
13
14
|
# tagger.parse('すもももももももものうち').each { |token| puts token }
|
14
15
|
#
|
15
|
-
# # すもも
|
16
|
-
# # も
|
17
|
-
# # もも
|
18
|
-
# # も
|
19
|
-
# # もも
|
20
|
-
# # の
|
21
|
-
# # うち
|
16
|
+
# # すもも 名詞,一般,*,*,*,*,すもも,スモモ,スモモ
|
17
|
+
# # も 助詞,係助詞,*,*,*,*,も,モ,モ
|
18
|
+
# # もも 名詞,一般,*,*,*,*,もも,モモ,モモ
|
19
|
+
# # も 助詞,係助詞,*,*,*,*,も,モ,モ
|
20
|
+
# # もも 名詞,一般,*,*,*,*,もも,モモ,モモ
|
21
|
+
# # の 助詞,連体化,*,*,*,*,の,ノ,ノ
|
22
|
+
# # うち 名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ
|
22
23
|
#
|
23
24
|
class Tagger
|
24
25
|
# Create a new tagger by loading the built-in binary dictionary.
|
25
26
|
def initialize
|
26
|
-
|
27
|
-
|
28
|
-
@
|
29
|
-
@
|
30
|
-
@
|
27
|
+
raise IOError, 'SHA1 digest of dictionary file does not match.' unless DICTIONARY_KEY == Digest::SHA1.file(DICTIONARY_PATH).to_s
|
28
|
+
|
29
|
+
@sysdic = Marshal.load(Zlib::GzipReader.open(DICTIONARY_PATH, &:read))
|
30
|
+
@trie = DartsClone::DoubleArray.new
|
31
|
+
@trie.set_array(@sysdic[:trie])
|
31
32
|
end
|
32
33
|
|
33
34
|
# Parse the given sentence.
|
@@ -39,57 +40,75 @@ module Suika
|
|
39
40
|
terminal = sentence.length
|
40
41
|
|
41
42
|
while start < terminal
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
43
|
+
step = terminal - start
|
44
|
+
|
45
|
+
query = sentence[start..-1] || ''
|
46
|
+
result = trie.common_prefix_search(query)
|
47
|
+
unless result.empty?
|
48
|
+
words, indices = result
|
49
|
+
unless words.empty?
|
50
|
+
step = INT_MAX
|
51
|
+
words.each_with_index do |word, i|
|
52
|
+
features[indices[i]].each do |el|
|
53
|
+
lattice.insert(start, start + word.length, word, false, el[0].to_i, el[1].to_i, el[2].to_i, el[3..-1])
|
54
|
+
end
|
55
|
+
step = word.length if word.length < step
|
51
56
|
end
|
52
|
-
is_unknown = false
|
53
57
|
end
|
54
|
-
pos += 1
|
55
|
-
word = sentence[start..pos]
|
56
|
-
end
|
57
|
-
|
58
|
-
unless is_unknown
|
59
|
-
start += 1
|
60
|
-
next
|
61
58
|
end
|
62
59
|
|
63
|
-
word = sentence[start]
|
64
|
-
|
65
|
-
|
66
|
-
if char_cate[:
|
67
|
-
unk_terminal = char_cate[:
|
60
|
+
word = sentence[start] || ''
|
61
|
+
char_cate = CharDef.char_category(sentence[start] || '')
|
62
|
+
char_type = CharDef.char_type(sentence[start] || '')
|
63
|
+
if char_cate[:invoke]
|
64
|
+
unk_terminal = start + (char_cate[:group] ? CharDef::MAX_GROUPING_SIZE : char_cate[:length])
|
65
|
+
unk_terminal = terminal if terminal < unk_terminal
|
68
66
|
pos = start + 1
|
69
|
-
while pos < unk_terminal && char_type == CharDef.char_type(sentence[pos])
|
70
|
-
word << sentence[pos]
|
67
|
+
while pos < unk_terminal && char_type == CharDef.char_type(sentence[pos] || '')
|
68
|
+
word << (sentence[pos] || '')
|
71
69
|
pos += 1
|
72
70
|
end
|
73
71
|
end
|
74
|
-
|
75
|
-
lattice.insert(start, start + word.length,
|
76
|
-
|
77
|
-
el[3..-1])
|
72
|
+
unknowns[char_type].each do |el|
|
73
|
+
lattice.insert(start, start + word.length, word, true,
|
74
|
+
el[0].to_i, el[1].to_i, el[2].to_i, el[3..-1])
|
78
75
|
end
|
79
|
-
|
76
|
+
step = word.length if word.length < step
|
77
|
+
|
78
|
+
start += step
|
80
79
|
end
|
81
80
|
|
82
81
|
viterbi(lattice)
|
83
82
|
end
|
84
83
|
|
84
|
+
def inspect
|
85
|
+
to_s
|
86
|
+
end
|
87
|
+
|
85
88
|
private
|
86
89
|
|
90
|
+
DICTIONARY_PATH = "#{__dir__}/../../dict/sysdic.gz"
|
91
|
+
DICTIONARY_KEY = 'eb921bf5e67f5733188527b21adbf9dabdda0c7a'
|
87
92
|
INT_MAX = 2**(([42].pack('i').size * 16) - 2) - 1
|
88
93
|
|
89
|
-
private_constant :INT_MAX
|
94
|
+
private_constant :DICTIONARY_PATH, :DICTIONARY_KEY, :INT_MAX
|
95
|
+
|
96
|
+
attr_reader :trie
|
97
|
+
|
98
|
+
def features
|
99
|
+
@sysdic[:features]
|
100
|
+
end
|
101
|
+
|
102
|
+
def unknowns
|
103
|
+
@sysdic[:unknowns]
|
104
|
+
end
|
105
|
+
|
106
|
+
def connect_cost(r_id, l_id)
|
107
|
+
@sysdic[:concosts][r_id][l_id]
|
108
|
+
end
|
90
109
|
|
91
110
|
def viterbi(lattice)
|
92
|
-
bos = lattice.end_nodes[0]
|
111
|
+
bos = lattice.end_nodes[0][0]
|
93
112
|
bos.min_cost = 0
|
94
113
|
bos.min_prev = nil
|
95
114
|
|
@@ -98,7 +117,7 @@ module Suika
|
|
98
117
|
rnode.min_cost = INT_MAX
|
99
118
|
rnode.min_prev = nil
|
100
119
|
lattice.end_nodes[n].each do |lnode|
|
101
|
-
cost = lnode.min_cost +
|
120
|
+
cost = lnode.min_cost + connect_cost(lnode.right_id, rnode.left_id) + rnode.cost
|
102
121
|
if cost < rnode.min_cost
|
103
122
|
rnode.min_cost = cost
|
104
123
|
rnode.min_prev = lnode
|
@@ -107,13 +126,14 @@ module Suika
|
|
107
126
|
end
|
108
127
|
end
|
109
128
|
|
110
|
-
eos = lattice.begin_nodes[-1]
|
129
|
+
eos = lattice.begin_nodes[-1][0]
|
111
130
|
prev_node = eos.min_prev
|
112
131
|
res = []
|
113
132
|
until prev_node.nil?
|
114
|
-
res.
|
133
|
+
res.push("#{prev_node.surface}\t#{prev_node.attrs.join(',')}") if prev_node.surface != 'BOS' && prev_node.surface != 'EOS'
|
115
134
|
prev_node = prev_node.min_prev
|
116
135
|
end
|
136
|
+
|
117
137
|
res.reverse
|
118
138
|
end
|
119
139
|
end
|
data/lib/suika/version.rb
CHANGED
data/sig/suika.rbs
ADDED
data/sig/suika/char_def.rbs
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
module Suika
|
2
|
+
class CharDef
|
3
|
+
def self.char_type: (String ch) -> String
|
4
|
+
def self.char_category: (String ch) -> { invoke: bool, group: bool, length: Integer }
|
5
|
+
|
6
|
+
MAX_GROUPING_SIZE: Integer
|
7
|
+
|
8
|
+
private
|
9
|
+
|
10
|
+
#CHAR_CATEGORY: Hash[String, { invoke: bool, group: bool, length: Integer }]
|
11
|
+
CHAR_CATEGORY: Hash[String, untyped]
|
12
|
+
CHAR_TYPES: Array[String]
|
13
|
+
|
14
|
+
SPACE: Array[Range[Integer]]
|
15
|
+
NUMERIC: Array[Range[Integer]]
|
16
|
+
SYMBOL: Array[Range[Integer]]
|
17
|
+
ALPHA: Array[Range[Integer]]
|
18
|
+
CYRILLIC: Array[Range[Integer]]
|
19
|
+
GREEK: Array[Range[Integer]]
|
20
|
+
HIRAGANA: Array[Range[Integer]]
|
21
|
+
KATAKANA: Array[Range[Integer]]
|
22
|
+
KANJI: Array[Range[Integer]]
|
23
|
+
KANJINUMERIC: Array[Range[Integer]]
|
24
|
+
end
|
25
|
+
end
|
data/sig/suika/lattice.rbs
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
module Suika
|
2
|
+
class Lattice
|
3
|
+
attr_reader begin_nodes: Array[Array[::Suika::Node]]
|
4
|
+
attr_reader end_nodes: Array[Array[::Suika::Node]]
|
5
|
+
attr_reader length: Integer
|
6
|
+
|
7
|
+
def initialize: (Integer length) -> void
|
8
|
+
def insert: (Integer begin_id, Integer end_id, String surface, bool unknown,
|
9
|
+
Integer left_id, Integer right_id, Integer cost, Array[String] attrs) -> void
|
10
|
+
end
|
11
|
+
end
|
data/sig/suika/node.rbs
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
module Suika
|
2
|
+
class Node
|
3
|
+
attr_accessor surface: String
|
4
|
+
attr_accessor unknown: bool
|
5
|
+
attr_accessor min_cost: Integer
|
6
|
+
# attr_accessor min_prev: ::Suika::Node?
|
7
|
+
attr_accessor min_prev: untyped
|
8
|
+
attr_accessor left_id: Integer
|
9
|
+
attr_accessor right_id: Integer
|
10
|
+
attr_accessor cost: Integer
|
11
|
+
attr_accessor attrs: Array[String]
|
12
|
+
|
13
|
+
def initialize: (?surface: String surface, ?unknown: bool unknown,
|
14
|
+
?min_cost: Integer min_cost, ?min_prev: ::Suika::Node? min_prev,
|
15
|
+
?left_id: ::Integer left_id, ?right_id: ::Integer right_id,
|
16
|
+
?cost: ::Integer cost, ?attrs: Array[String] attrs) -> void
|
17
|
+
end
|
18
|
+
end
|
data/sig/suika/tagger.rbs
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
module Suika
|
2
|
+
class Tagger
|
3
|
+
def initialize: () -> void
|
4
|
+
def parse: (String sentence) -> Array[String]
|
5
|
+
def inspect: () -> String
|
6
|
+
|
7
|
+
private
|
8
|
+
|
9
|
+
DICTIONARY_PATH: String
|
10
|
+
DICTIONARY_KEY: String
|
11
|
+
INT_MAX: untyped
|
12
|
+
|
13
|
+
attr_reader trie: ::DartsClone::DoubleArray
|
14
|
+
|
15
|
+
# type feature = [Integer, Integer, Integer, String, String, String, String, String, String, String]
|
16
|
+
|
17
|
+
# def features: () -> Array[Array[feature]]
|
18
|
+
def features: () -> Array[Array[untyped]]
|
19
|
+
# def unknowns: () -> Hash[String, Array[feature]]
|
20
|
+
def unknowns: () -> Hash[String, Array[untyped]]
|
21
|
+
def connect_cost: (Integer r_id, Integer l_id) -> Integer
|
22
|
+
def viterbi: (::Suika::Lattice lattice) -> Array[String]
|
23
|
+
end
|
24
|
+
end
|
data/suika.gemspec
CHANGED
@@ -12,11 +12,10 @@ Gem::Specification.new do |spec|
|
|
12
12
|
spec.description = 'Suika is a Japanese morphological analyzer written in pure Ruby.'
|
13
13
|
spec.homepage = 'https://github.com/yoshoku/suika'
|
14
14
|
spec.license = 'BSD-3-Clause'
|
15
|
-
spec.required_ruby_version = Gem::Requirement.new('>= 2.3.0')
|
16
15
|
|
17
16
|
spec.metadata['homepage_uri'] = spec.homepage
|
18
17
|
spec.metadata['source_code_uri'] = spec.homepage
|
19
|
-
spec.metadata['changelog_uri'] = 'https://github.com/yoshoku/
|
18
|
+
spec.metadata['changelog_uri'] = 'https://github.com/yoshoku/suika/blob/main/CHANGELOG.md'
|
20
19
|
spec.metadata['documentation_uri'] = 'https://rubydoc.info/gems/suika'
|
21
20
|
|
22
21
|
# Specify which files should be added to the gem when it is released.
|
@@ -28,5 +27,5 @@ Gem::Specification.new do |spec|
|
|
28
27
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
29
28
|
spec.require_paths = ['lib']
|
30
29
|
|
31
|
-
spec.add_runtime_dependency '
|
30
|
+
spec.add_runtime_dependency 'dartsclone', '>= 0.2.0'
|
32
31
|
end
|
metadata
CHANGED
@@ -1,29 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: suika
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1
|
4
|
+
version: 0.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yoshoku
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-07-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
14
|
+
name: dartsclone
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - "
|
17
|
+
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
19
|
+
version: 0.2.0
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - "
|
24
|
+
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
26
|
+
version: 0.2.0
|
27
27
|
description: Suika is a Japanese morphological analyzer written in pure Ruby.
|
28
28
|
email:
|
29
29
|
- yoshoku@outlook.com
|
@@ -31,10 +31,12 @@ executables: []
|
|
31
31
|
extensions: []
|
32
32
|
extra_rdoc_files: []
|
33
33
|
files:
|
34
|
+
- ".coveralls.yml"
|
35
|
+
- ".github/workflows/build.yml"
|
36
|
+
- ".github/workflows/coverage.yml"
|
34
37
|
- ".gitignore"
|
35
38
|
- ".rspec"
|
36
39
|
- ".rubocop.yml"
|
37
|
-
- ".travis.yml"
|
38
40
|
- CHANGELOG.md
|
39
41
|
- CODE_OF_CONDUCT.md
|
40
42
|
- Gemfile
|
@@ -42,14 +44,21 @@ files:
|
|
42
44
|
- NOTICE.txt
|
43
45
|
- README.md
|
44
46
|
- Rakefile
|
47
|
+
- Steepfile
|
45
48
|
- bin/console
|
46
49
|
- bin/setup
|
47
|
-
- dict/
|
50
|
+
- dict/sysdic.gz
|
48
51
|
- lib/suika.rb
|
49
52
|
- lib/suika/char_def.rb
|
50
53
|
- lib/suika/lattice.rb
|
54
|
+
- lib/suika/node.rb
|
51
55
|
- lib/suika/tagger.rb
|
52
56
|
- lib/suika/version.rb
|
57
|
+
- sig/suika.rbs
|
58
|
+
- sig/suika/char_def.rbs
|
59
|
+
- sig/suika/lattice.rbs
|
60
|
+
- sig/suika/node.rbs
|
61
|
+
- sig/suika/tagger.rbs
|
53
62
|
- suika.gemspec
|
54
63
|
homepage: https://github.com/yoshoku/suika
|
55
64
|
licenses:
|
@@ -57,9 +66,9 @@ licenses:
|
|
57
66
|
metadata:
|
58
67
|
homepage_uri: https://github.com/yoshoku/suika
|
59
68
|
source_code_uri: https://github.com/yoshoku/suika
|
60
|
-
changelog_uri: https://github.com/yoshoku/
|
69
|
+
changelog_uri: https://github.com/yoshoku/suika/blob/main/CHANGELOG.md
|
61
70
|
documentation_uri: https://rubydoc.info/gems/suika
|
62
|
-
post_install_message:
|
71
|
+
post_install_message:
|
63
72
|
rdoc_options: []
|
64
73
|
require_paths:
|
65
74
|
- lib
|
@@ -67,15 +76,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
67
76
|
requirements:
|
68
77
|
- - ">="
|
69
78
|
- !ruby/object:Gem::Version
|
70
|
-
version:
|
79
|
+
version: '0'
|
71
80
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
72
81
|
requirements:
|
73
82
|
- - ">="
|
74
83
|
- !ruby/object:Gem::Version
|
75
84
|
version: '0'
|
76
85
|
requirements: []
|
77
|
-
rubygems_version: 3.
|
78
|
-
signing_key:
|
86
|
+
rubygems_version: 3.2.21
|
87
|
+
signing_key:
|
79
88
|
specification_version: 4
|
80
89
|
summary: Suika is a Japanese morphological analyzer written in pure Ruby.
|
81
90
|
test_files: []
|