suika 0.1.2 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.coveralls.yml +1 -0
- data/.github/workflows/build.yml +21 -0
- data/.github/workflows/coverage.yml +26 -0
- data/.rubocop.yml +2 -1
- data/CHANGELOG.md +43 -3
- data/Gemfile +5 -1
- data/LICENSE.txt +1 -1
- data/README.md +33 -12
- data/Rakefile +75 -2
- data/Steepfile +20 -0
- data/dict/{ipadic.gz → sysdic.gz} +0 -0
- data/lib/suika.rb +1 -0
- data/lib/suika/char_def.rb +18 -14
- data/lib/suika/lattice.rb +8 -10
- data/lib/suika/node.rb +21 -0
- data/lib/suika/tagger.rb +67 -47
- data/lib/suika/version.rb +1 -1
- data/sig/suika.rbs +3 -0
- data/sig/suika/char_def.rbs +25 -0
- data/sig/suika/lattice.rbs +11 -0
- data/sig/suika/node.rbs +18 -0
- data/sig/suika/tagger.rbs +24 -0
- data/suika.gemspec +2 -3
- metadata +24 -15
- data/.travis.yml +0 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 148e229070959a89197febf9bf9eabfdbe941f3c1cda66b75f87afb9371436c1
|
4
|
+
data.tar.gz: c6d4fa8c654144ad39e19ff23d63161ea193bd8760eb654635d36666bed6f2dd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: dbe9535910050678c51c3ff2d95118959d3b4cdeb4fffb2fb405ad5871258f527d1fa3f36df59706b3cf3d8c6265f19bd5844d778273fc369fc763eec293cf89
|
7
|
+
data.tar.gz: 063ed20722d52ac97b4993093a60ad866fd744c8e74361b3934c1f96082523da510f8d7adcbb46e238aa03ba7dbf53539180215aa5475488294a77d92ea8633e
|
data/.coveralls.yml
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
service_name: github-ci
|
@@ -0,0 +1,21 @@
|
|
1
|
+
name: build
|
2
|
+
|
3
|
+
on: [push, pull_request]
|
4
|
+
|
5
|
+
jobs:
|
6
|
+
build:
|
7
|
+
runs-on: ubuntu-latest
|
8
|
+
strategy:
|
9
|
+
matrix:
|
10
|
+
ruby: [ '2.6', '2.7', '3.0' ]
|
11
|
+
steps:
|
12
|
+
- uses: actions/checkout@v2
|
13
|
+
- name: Set up Ruby ${{ matrix.ruby }}
|
14
|
+
uses: actions/setup-ruby@v1
|
15
|
+
with:
|
16
|
+
ruby-version: ${{ matrix.ruby }}
|
17
|
+
- name: Build and test with Rake
|
18
|
+
run: |
|
19
|
+
gem install --no-document bundler
|
20
|
+
bundle install --jobs 4 --retry 3
|
21
|
+
bundle exec rake
|
@@ -0,0 +1,26 @@
|
|
1
|
+
name: coverage
|
2
|
+
|
3
|
+
on:
|
4
|
+
push:
|
5
|
+
branches: [ main ]
|
6
|
+
pull_request:
|
7
|
+
branches: [ main ]
|
8
|
+
|
9
|
+
jobs:
|
10
|
+
coverage:
|
11
|
+
runs-on: ubuntu-latest
|
12
|
+
steps:
|
13
|
+
- uses: actions/checkout@v2
|
14
|
+
- name: Set up Ruby 2.7
|
15
|
+
uses: actions/setup-ruby@v1
|
16
|
+
with:
|
17
|
+
ruby-version: '2.7'
|
18
|
+
- name: Build and test with Rake
|
19
|
+
run: |
|
20
|
+
gem install --no-document bundler
|
21
|
+
bundle install --jobs 4 --retry 3
|
22
|
+
bundle exec rake
|
23
|
+
- name: Coveralls GitHub Action
|
24
|
+
uses: coverallsapp/github-action@v1.1.2
|
25
|
+
with:
|
26
|
+
github-token: ${{ secrets.GITHUB_TOKEN }}
|
data/.rubocop.yml
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,8 +1,48 @@
|
|
1
|
-
|
1
|
+
## 0.3.1
|
2
|
+
- Fix Tagger's inspect method not to expand instance variables for object creation on irb and pry.
|
3
|
+
|
4
|
+
## 0.3.0
|
5
|
+
|
6
|
+
### Changes
|
7
|
+
- Add type declaration files.
|
8
|
+
- Refactor to avoid assigning null to variables.
|
9
|
+
- Fix some configuration files.
|
10
|
+
|
11
|
+
|
12
|
+
## 0.2.0
|
13
|
+
|
14
|
+
### Breaking Change
|
15
|
+
- Change to use dartsclone for trie library.
|
16
|
+
|
17
|
+
|
18
|
+
## 0.1.4
|
19
|
+
|
20
|
+
### Bug Fixes
|
21
|
+
- Fix CharDef.char_type to return 'DEFAULT' when unknown character code is given.
|
22
|
+
|
23
|
+
### Features
|
24
|
+
- Add character code of square era name Reiwa.
|
25
|
+
|
26
|
+
## 0.1.3
|
27
|
+
|
28
|
+
### Bug Fixes
|
29
|
+
- Fix unknown word processing.
|
30
|
+
|
31
|
+
### Changes
|
32
|
+
- Remove redundant spaces from output.
|
33
|
+
|
34
|
+
|
35
|
+
## 0.1.2
|
36
|
+
|
37
|
+
### Bug Fixes
|
2
38
|
- Fix local variable typo in Tagger.parse.
|
3
39
|
|
4
|
-
|
40
|
+
|
41
|
+
## 0.1.1
|
42
|
+
|
43
|
+
### Bug Fixes
|
5
44
|
- Fix specification of class in CharDef.char_type.
|
6
45
|
|
7
|
-
|
46
|
+
|
47
|
+
## 0.1.0
|
8
48
|
- First release.
|
data/Gemfile
CHANGED
@@ -5,5 +5,9 @@ source 'https://rubygems.org'
|
|
5
5
|
# Specify your gem's dependencies in suika.gemspec
|
6
6
|
gemspec
|
7
7
|
|
8
|
-
gem 'rake', '~>
|
8
|
+
gem 'rake', '~> 13.0'
|
9
9
|
gem 'rspec', '~> 3.0'
|
10
|
+
gem 'simplecov', '~> 0.21'
|
11
|
+
gem 'simplecov-lcov', '~> 0.8'
|
12
|
+
gem 'rbs', '~> 1.2'
|
13
|
+
gem 'steep', '~> 0.44'
|
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -1,7 +1,9 @@
|
|
1
1
|
# Suika
|
2
2
|
|
3
|
+
[](https://github.com/yoshoku/suika/actions?query=workflow%3Abuild)
|
4
|
+
[](https://coveralls.io/github/yoshoku/suika?branch=main)
|
3
5
|
[](https://badge.fury.io/rb/suika)
|
4
|
-
[](https://github.com/yoshoku/suika/blob/
|
6
|
+
[](https://github.com/yoshoku/suika/blob/main/LICENSE.txt)
|
5
7
|
[](https://rubydoc.info/gems/suika)
|
6
8
|
|
7
9
|
Suika 🍉 is a Japanese morphological analyzer written in pure Ruby.
|
@@ -30,13 +32,13 @@ require 'suika'
|
|
30
32
|
tagger = Suika::Tagger.new
|
31
33
|
tagger.parse('すもももももももものうち').each { |token| puts token }
|
32
34
|
|
33
|
-
# すもも
|
34
|
-
# も
|
35
|
-
# もも
|
36
|
-
# も
|
37
|
-
# もも
|
38
|
-
# の
|
39
|
-
# うち
|
35
|
+
# すもも 名詞,一般,*,*,*,*,すもも,スモモ,スモモ
|
36
|
+
# も 助詞,係助詞,*,*,*,*,も,モ,モ
|
37
|
+
# もも 名詞,一般,*,*,*,*,もも,モモ,モモ
|
38
|
+
# も 助詞,係助詞,*,*,*,*,も,モ,モ
|
39
|
+
# もも 名詞,一般,*,*,*,*,もも,モモ,モモ
|
40
|
+
# の 助詞,連体化,*,*,*,*,の,ノ,ノ
|
41
|
+
# うち 名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ
|
40
42
|
```
|
41
43
|
|
42
44
|
Since the Tagger class loads the binary dictionary at initialization, it is recommended to reuse the instance.
|
@@ -51,17 +53,36 @@ sentences.each do |sentence|
|
|
51
53
|
end
|
52
54
|
```
|
53
55
|
|
56
|
+
## Test
|
57
|
+
Suika was able to parse all sentences in the [Livedoor news corpus](https://www.rondhuit.com/download.html#ldcc)
|
58
|
+
without any error.
|
59
|
+
|
60
|
+
```ruby
|
61
|
+
require 'suika'
|
62
|
+
|
63
|
+
tagger = Suika::Tagger.new
|
64
|
+
|
65
|
+
Dir.glob('ldcc-20140209/text/*/*.txt').each do |filename|
|
66
|
+
File.foreach(filename) do |sentence|
|
67
|
+
sentence.strip!
|
68
|
+
puts tagger.parse(sentence) unless sentence.empty?
|
69
|
+
end
|
70
|
+
end
|
71
|
+
```
|
72
|
+
|
73
|
+

|
74
|
+
|
54
75
|
## Contributing
|
55
76
|
|
56
77
|
Bug reports and pull requests are welcome on GitHub at https://github.com/yoshoku/suika.
|
57
|
-
This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/yoshoku/suika/blob/
|
78
|
+
This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/yoshoku/suika/blob/main/CODE_OF_CONDUCT.md).
|
58
79
|
|
59
80
|
## License
|
60
81
|
|
61
82
|
The gem is available as open source under the terms of the [BSD-3-Clause License](https://opensource.org/licenses/BSD-3-Clause).
|
62
83
|
In addition, the gem includes binary data generated from mecab-ipadic.
|
63
|
-
The details of the license can be found in [LICENSE.txt](https://github.com/yoshoku/suika/blob/
|
64
|
-
and [NOTICE.txt](https://github.com/yoshoku/suika/blob/
|
84
|
+
The details of the license can be found in [LICENSE.txt](https://github.com/yoshoku/suika/blob/main/LICENSE.txt)
|
85
|
+
and [NOTICE.txt](https://github.com/yoshoku/suika/blob/main/NOTICE.txt).
|
65
86
|
|
66
87
|
## Respect
|
67
88
|
|
@@ -74,4 +95,4 @@ Janome, a morphological analyzer written in scripting language, gives me the cou
|
|
74
95
|
|
75
96
|
## Code of Conduct
|
76
97
|
|
77
|
-
Everyone interacting in the Suika project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/yoshoku/suika/blob/
|
98
|
+
Everyone interacting in the Suika project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/yoshoku/suika/blob/main/CODE_OF_CONDUCT.md).
|
data/Rakefile
CHANGED
@@ -1,6 +1,79 @@
|
|
1
|
-
require
|
2
|
-
require
|
1
|
+
require 'bundler/gem_tasks'
|
2
|
+
require 'rspec/core/rake_task'
|
3
|
+
|
4
|
+
require 'csv'
|
5
|
+
require 'dartsclone'
|
6
|
+
require 'nkf'
|
7
|
+
require 'rubygems/package'
|
8
|
+
require 'zlib'
|
3
9
|
|
4
10
|
RSpec::Core::RakeTask.new(:spec)
|
5
11
|
|
6
12
|
task :default => :spec
|
13
|
+
|
14
|
+
desc 'Build suika system dictionary'
|
15
|
+
task :dictionary do
|
16
|
+
base_dir = "#{__dir__}/dict/mecab-ipadic-2.7.0-20070801"
|
17
|
+
unless File.directory?(base_dir)
|
18
|
+
puts "Download mecab-ipadic file and expand that under dict directory: #{__dir__}/dict/mecab-ipadic-2.7.0-20070801"
|
19
|
+
puts
|
20
|
+
puts 'Example:'
|
21
|
+
puts 'wget -O dict/mecab-ipadic.tgz https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM'
|
22
|
+
puts 'cd dict'
|
23
|
+
puts 'tar xzf mecab-ipadic.tgz'
|
24
|
+
puts 'cd ../'
|
25
|
+
next # exit
|
26
|
+
end
|
27
|
+
|
28
|
+
File.open("#{__dir__}/dict/mecab-ipadic-2.7.0-20070801/Reiwa.csv", 'w') do |f|
|
29
|
+
f.puts('令和,1288,1288,5904,名詞,固有名詞,一般,*,*,*,令和,レイワ,レイワ')
|
30
|
+
end
|
31
|
+
|
32
|
+
unknowns = {}
|
33
|
+
File.open("#{base_dir}/unk.def") do |f|
|
34
|
+
f.each_line do |line|
|
35
|
+
row = NKF.nkf('-w', line.chomp).split(',')
|
36
|
+
unknowns[row[0]] ||= []
|
37
|
+
unknowns[row[0]] << [row[1].to_i, row[2].to_i, row[3].to_i, *row[4..-1]]
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
dict = {}
|
42
|
+
Dir.glob("#{base_dir}/*.csv").each do |filename|
|
43
|
+
File.open(filename) do |f|
|
44
|
+
f.each_line do |line|
|
45
|
+
row = NKF.nkf('-w', line.chomp).split(',')
|
46
|
+
dict[row[0]] ||= []
|
47
|
+
dict[row[0]] << [row[1].to_i, row[2].to_i, row[3].to_i, *row[4..-1]]
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
da = DartsClone::DoubleArray.new
|
53
|
+
words = dict.keys.sort
|
54
|
+
da.build(words)
|
55
|
+
features = words.map { |w| dict[w] }
|
56
|
+
|
57
|
+
concosts = nil
|
58
|
+
File.open("#{base_dir}/matrix.def") do |f|
|
59
|
+
n_entries = f.readline.chomp.split.map(&:to_i).first
|
60
|
+
concosts = Array.new(n_entries) { Array.new(n_entries) }
|
61
|
+
f.each_line do |line|
|
62
|
+
row, col, cost = line.chomp.split.map(&:to_i)
|
63
|
+
concosts[row][col] = cost
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
ipadic = {
|
68
|
+
trie: da.get_array,
|
69
|
+
features: features,
|
70
|
+
unknowns: unknowns,
|
71
|
+
concosts: concosts
|
72
|
+
}
|
73
|
+
|
74
|
+
Zlib::GzipWriter.open("#{__dir__}/dict/sysdic.gz", Zlib::BEST_SPEED) { |f| f.write(Marshal.dump(ipadic)) }
|
75
|
+
|
76
|
+
puts 'The system dictionary has been successfully built:'
|
77
|
+
puts "#{__dir__}/dict/sysdic.gz"
|
78
|
+
puts Digest::SHA1.file("#{__dir__}/dict/sysdic.gz").to_s
|
79
|
+
end
|
data/Steepfile
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
target :lib do
|
2
|
+
signature "sig"
|
3
|
+
#
|
4
|
+
check "lib" # Directory name
|
5
|
+
# check "Gemfile" # File name
|
6
|
+
# check "app/models/**/*.rb" # Glob
|
7
|
+
# # ignore "lib/templates/*.rb"
|
8
|
+
#
|
9
|
+
# # library "pathname", "set" # Standard libraries
|
10
|
+
library "dartsclone" # Gems
|
11
|
+
end
|
12
|
+
|
13
|
+
# target :spec do
|
14
|
+
# signature "sig", "sig-private"
|
15
|
+
#
|
16
|
+
# check "spec"
|
17
|
+
#
|
18
|
+
# # library "pathname", "set" # Standard libraries
|
19
|
+
# # library "rspec"
|
20
|
+
# end
|
Binary file
|
data/lib/suika.rb
CHANGED
data/lib/suika/char_def.rb
CHANGED
@@ -5,10 +5,11 @@ module Suika
|
|
5
5
|
class CharDef
|
6
6
|
# @!visibility private
|
7
7
|
def self.char_type(ch)
|
8
|
-
|
9
|
-
CHAR_TYPES.find do |
|
10
|
-
Object.const_get("::Suika::CharDef::#{
|
8
|
+
ch_code = ch.unpack1('U*')
|
9
|
+
ch_type = CHAR_TYPES.find do |ct|
|
10
|
+
Object.const_get("::Suika::CharDef::#{ct}").any? { |r| r.include?(ch_code) }
|
11
11
|
end
|
12
|
+
ch_type || 'DEFAULT'
|
12
13
|
end
|
13
14
|
|
14
15
|
# @!visibility private
|
@@ -16,39 +17,41 @@ module Suika
|
|
16
17
|
CHAR_CATEGORY[char_type(ch)]
|
17
18
|
end
|
18
19
|
|
20
|
+
MAX_GROUPING_SIZE = 24
|
21
|
+
|
19
22
|
CHAR_CATEGORY = {
|
20
23
|
'DEFAULT' => {
|
21
|
-
invoke:
|
24
|
+
invoke: false, group: true, length: 0
|
22
25
|
},
|
23
26
|
'SPACE' => {
|
24
|
-
invoke:
|
27
|
+
invoke: false, group: true, length: 0
|
25
28
|
},
|
26
29
|
'KANJI' => {
|
27
|
-
invoke:
|
30
|
+
invoke: false, group: false, length: 2
|
28
31
|
},
|
29
32
|
'SYMBOL' => {
|
30
|
-
invoke:
|
33
|
+
invoke: true, group: true, length: 0
|
31
34
|
},
|
32
35
|
'NUMERIC' => {
|
33
|
-
invoke:
|
36
|
+
invoke: true, group: true, length: 0
|
34
37
|
},
|
35
38
|
'ALPHA' => {
|
36
|
-
invoke:
|
39
|
+
invoke: true, group: true, length: 0
|
37
40
|
},
|
38
41
|
'HIRAGANA' => {
|
39
|
-
invoke:
|
42
|
+
invoke: false, group: true, length: 2
|
40
43
|
},
|
41
44
|
'KATAKANA' => {
|
42
|
-
invoke:
|
45
|
+
invoke: true, group: true, length: 2
|
43
46
|
},
|
44
47
|
'KANJINUMERIC' => {
|
45
|
-
invoke:
|
48
|
+
invoke: true, group: true, length: 0
|
46
49
|
},
|
47
50
|
'GREEK' => {
|
48
|
-
invoke:
|
51
|
+
invoke: true, group: true, length: 0
|
49
52
|
},
|
50
53
|
'CYRILLIC' => {
|
51
|
-
invoke:
|
54
|
+
invoke: true, group: true, length: 0
|
52
55
|
}
|
53
56
|
}.freeze
|
54
57
|
|
@@ -117,6 +120,7 @@ module Suika
|
|
117
120
|
0x2B00..0x2BFF, # Miscellaneous Symbols and Arrows
|
118
121
|
0x2A00..0x2AFF, # Supplemental Mathematical Operators
|
119
122
|
0x3300..0x33FF,
|
123
|
+
0x32FF..0x32FF, # Square era name REIWA
|
120
124
|
0x3200..0x32FE, # ENclosed CJK Letters and Months
|
121
125
|
0x3000..0x303F, # CJK Symbol and Punctuation
|
122
126
|
0xFE30..0xFE4F, # CJK Compatibility Forms
|
data/lib/suika/lattice.rb
CHANGED
@@ -4,8 +4,6 @@ module Suika
|
|
4
4
|
# @!visibility private
|
5
5
|
class Lattice
|
6
6
|
# @!visibility private
|
7
|
-
Node = Struct.new(:surface, :min_cost, :min_prev, :left_id, :right_id, :cost, :attrs, keyword_init: true)
|
8
|
-
|
9
7
|
attr_reader :begin_nodes, :end_nodes, :length
|
10
8
|
|
11
9
|
# @!visibility private
|
@@ -13,17 +11,17 @@ module Suika
|
|
13
11
|
@length = length
|
14
12
|
@begin_nodes = Array.new(length + 1) { [] }
|
15
13
|
@end_nodes = Array.new(length + 1) { [] }
|
16
|
-
bos = Node.new(surface: 'BOS', left_id: 0, right_id: 0, cost: 0, attrs: [])
|
17
|
-
@end_nodes[0].
|
18
|
-
eos = Node.new(surface: 'EOS', left_id: 0, right_id: 0, cost: 0, attrs: [])
|
19
|
-
@begin_nodes[length].
|
14
|
+
bos = Node.new(surface: 'BOS', unknown: false, left_id: 0, right_id: 0, cost: 0, attrs: [])
|
15
|
+
@end_nodes[0].push(bos)
|
16
|
+
eos = Node.new(surface: 'EOS', unknown: false, left_id: 0, right_id: 0, cost: 0, attrs: [])
|
17
|
+
@begin_nodes[length].push(eos)
|
20
18
|
end
|
21
19
|
|
22
20
|
# @!visibility private
|
23
|
-
def insert(begin_id, end_id, surface, left_id, right_id, cost, attrs)
|
24
|
-
node = Node.new(surface: surface, left_id: left_id, right_id: right_id, cost: cost, attrs: attrs)
|
25
|
-
@begin_nodes[begin_id].
|
26
|
-
@end_nodes[end_id].
|
21
|
+
def insert(begin_id, end_id, surface, unknown, left_id, right_id, cost, attrs)
|
22
|
+
node = Node.new(surface: surface, unknown: unknown, left_id: left_id, right_id: right_id, cost: cost, attrs: attrs)
|
23
|
+
@begin_nodes[begin_id].push(node)
|
24
|
+
@end_nodes[end_id].push(node)
|
27
25
|
end
|
28
26
|
end
|
29
27
|
end
|
data/lib/suika/node.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Suika
|
4
|
+
# @!visibility private
|
5
|
+
class Node
|
6
|
+
# @!visibility private
|
7
|
+
attr_accessor :surface, :unknown, :min_cost, :min_prev, :left_id, :right_id, :cost, :attrs
|
8
|
+
|
9
|
+
# @!visibility private
|
10
|
+
def initialize(surface: '', unknown: false, min_cost: 0, min_prev: nil, left_id: 0, right_id: 0, cost: 0, attrs: [])
|
11
|
+
@surface = surface
|
12
|
+
@unknown = unknown
|
13
|
+
@min_cost = min_cost
|
14
|
+
@min_prev = min_prev
|
15
|
+
@left_id = left_id
|
16
|
+
@right_id = right_id
|
17
|
+
@cost = cost
|
18
|
+
@attrs = attrs
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
data/lib/suika/tagger.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require '
|
3
|
+
require 'dartsclone'
|
4
|
+
require 'rubygems/package'
|
4
5
|
require 'zlib'
|
5
6
|
|
6
7
|
module Suika
|
@@ -12,22 +13,22 @@ module Suika
|
|
12
13
|
# tagger = Suika::Tagger.new
|
13
14
|
# tagger.parse('すもももももももものうち').each { |token| puts token }
|
14
15
|
#
|
15
|
-
# # すもも
|
16
|
-
# # も
|
17
|
-
# # もも
|
18
|
-
# # も
|
19
|
-
# # もも
|
20
|
-
# # の
|
21
|
-
# # うち
|
16
|
+
# # すもも 名詞,一般,*,*,*,*,すもも,スモモ,スモモ
|
17
|
+
# # も 助詞,係助詞,*,*,*,*,も,モ,モ
|
18
|
+
# # もも 名詞,一般,*,*,*,*,もも,モモ,モモ
|
19
|
+
# # も 助詞,係助詞,*,*,*,*,も,モ,モ
|
20
|
+
# # もも 名詞,一般,*,*,*,*,もも,モモ,モモ
|
21
|
+
# # の 助詞,連体化,*,*,*,*,の,ノ,ノ
|
22
|
+
# # うち 名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ
|
22
23
|
#
|
23
24
|
class Tagger
|
24
25
|
# Create a new tagger by loading the built-in binary dictionary.
|
25
26
|
def initialize
|
26
|
-
|
27
|
-
|
28
|
-
@
|
29
|
-
@
|
30
|
-
@
|
27
|
+
raise IOError, 'SHA1 digest of dictionary file does not match.' unless DICTIONARY_KEY == Digest::SHA1.file(DICTIONARY_PATH).to_s
|
28
|
+
|
29
|
+
@sysdic = Marshal.load(Zlib::GzipReader.open(DICTIONARY_PATH, &:read))
|
30
|
+
@trie = DartsClone::DoubleArray.new
|
31
|
+
@trie.set_array(@sysdic[:trie])
|
31
32
|
end
|
32
33
|
|
33
34
|
# Parse the given sentence.
|
@@ -39,57 +40,75 @@ module Suika
|
|
39
40
|
terminal = sentence.length
|
40
41
|
|
41
42
|
while start < terminal
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
43
|
+
step = terminal - start
|
44
|
+
|
45
|
+
query = sentence[start..-1] || ''
|
46
|
+
result = trie.common_prefix_search(query)
|
47
|
+
unless result.empty?
|
48
|
+
words, indices = result
|
49
|
+
unless words.empty?
|
50
|
+
step = INT_MAX
|
51
|
+
words.each_with_index do |word, i|
|
52
|
+
features[indices[i]].each do |el|
|
53
|
+
lattice.insert(start, start + word.length, word, false, el[0].to_i, el[1].to_i, el[2].to_i, el[3..-1])
|
54
|
+
end
|
55
|
+
step = word.length if word.length < step
|
51
56
|
end
|
52
|
-
is_unknown = false
|
53
57
|
end
|
54
|
-
pos += 1
|
55
|
-
word = sentence[start..pos]
|
56
|
-
end
|
57
|
-
|
58
|
-
unless is_unknown
|
59
|
-
start += 1
|
60
|
-
next
|
61
58
|
end
|
62
59
|
|
63
|
-
word = sentence[start]
|
64
|
-
|
65
|
-
|
66
|
-
if char_cate[:
|
67
|
-
unk_terminal = char_cate[:
|
60
|
+
word = sentence[start] || ''
|
61
|
+
char_cate = CharDef.char_category(sentence[start] || '')
|
62
|
+
char_type = CharDef.char_type(sentence[start] || '')
|
63
|
+
if char_cate[:invoke]
|
64
|
+
unk_terminal = start + (char_cate[:group] ? CharDef::MAX_GROUPING_SIZE : char_cate[:length])
|
65
|
+
unk_terminal = terminal if terminal < unk_terminal
|
68
66
|
pos = start + 1
|
69
|
-
while pos < unk_terminal && char_type == CharDef.char_type(sentence[pos])
|
70
|
-
word << sentence[pos]
|
67
|
+
while pos < unk_terminal && char_type == CharDef.char_type(sentence[pos] || '')
|
68
|
+
word << (sentence[pos] || '')
|
71
69
|
pos += 1
|
72
70
|
end
|
73
71
|
end
|
74
|
-
|
75
|
-
lattice.insert(start, start + word.length,
|
76
|
-
|
77
|
-
el[3..-1])
|
72
|
+
unknowns[char_type].each do |el|
|
73
|
+
lattice.insert(start, start + word.length, word, true,
|
74
|
+
el[0].to_i, el[1].to_i, el[2].to_i, el[3..-1])
|
78
75
|
end
|
79
|
-
|
76
|
+
step = word.length if word.length < step
|
77
|
+
|
78
|
+
start += step
|
80
79
|
end
|
81
80
|
|
82
81
|
viterbi(lattice)
|
83
82
|
end
|
84
83
|
|
84
|
+
def inspect
|
85
|
+
to_s
|
86
|
+
end
|
87
|
+
|
85
88
|
private
|
86
89
|
|
90
|
+
DICTIONARY_PATH = "#{__dir__}/../../dict/sysdic.gz"
|
91
|
+
DICTIONARY_KEY = 'eb921bf5e67f5733188527b21adbf9dabdda0c7a'
|
87
92
|
INT_MAX = 2**(([42].pack('i').size * 16) - 2) - 1
|
88
93
|
|
89
|
-
private_constant :INT_MAX
|
94
|
+
private_constant :DICTIONARY_PATH, :DICTIONARY_KEY, :INT_MAX
|
95
|
+
|
96
|
+
attr_reader :trie
|
97
|
+
|
98
|
+
def features
|
99
|
+
@sysdic[:features]
|
100
|
+
end
|
101
|
+
|
102
|
+
def unknowns
|
103
|
+
@sysdic[:unknowns]
|
104
|
+
end
|
105
|
+
|
106
|
+
def connect_cost(r_id, l_id)
|
107
|
+
@sysdic[:concosts][r_id][l_id]
|
108
|
+
end
|
90
109
|
|
91
110
|
def viterbi(lattice)
|
92
|
-
bos = lattice.end_nodes[0]
|
111
|
+
bos = lattice.end_nodes[0][0]
|
93
112
|
bos.min_cost = 0
|
94
113
|
bos.min_prev = nil
|
95
114
|
|
@@ -98,7 +117,7 @@ module Suika
|
|
98
117
|
rnode.min_cost = INT_MAX
|
99
118
|
rnode.min_prev = nil
|
100
119
|
lattice.end_nodes[n].each do |lnode|
|
101
|
-
cost = lnode.min_cost +
|
120
|
+
cost = lnode.min_cost + connect_cost(lnode.right_id, rnode.left_id) + rnode.cost
|
102
121
|
if cost < rnode.min_cost
|
103
122
|
rnode.min_cost = cost
|
104
123
|
rnode.min_prev = lnode
|
@@ -107,13 +126,14 @@ module Suika
|
|
107
126
|
end
|
108
127
|
end
|
109
128
|
|
110
|
-
eos = lattice.begin_nodes[-1]
|
129
|
+
eos = lattice.begin_nodes[-1][0]
|
111
130
|
prev_node = eos.min_prev
|
112
131
|
res = []
|
113
132
|
until prev_node.nil?
|
114
|
-
res.
|
133
|
+
res.push("#{prev_node.surface}\t#{prev_node.attrs.join(',')}") if prev_node.surface != 'BOS' && prev_node.surface != 'EOS'
|
115
134
|
prev_node = prev_node.min_prev
|
116
135
|
end
|
136
|
+
|
117
137
|
res.reverse
|
118
138
|
end
|
119
139
|
end
|
data/lib/suika/version.rb
CHANGED
data/sig/suika.rbs
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
module Suika
|
2
|
+
class CharDef
|
3
|
+
def self.char_type: (String ch) -> String
|
4
|
+
def self.char_category: (String ch) -> { invoke: bool, group: bool, length: Integer }
|
5
|
+
|
6
|
+
MAX_GROUPING_SIZE: Integer
|
7
|
+
|
8
|
+
private
|
9
|
+
|
10
|
+
#CHAR_CATEGORY: Hash[String, { invoke: bool, group: bool, length: Integer }]
|
11
|
+
CHAR_CATEGORY: Hash[String, untyped]
|
12
|
+
CHAR_TYPES: Array[String]
|
13
|
+
|
14
|
+
SPACE: Array[Range[Integer]]
|
15
|
+
NUMERIC: Array[Range[Integer]]
|
16
|
+
SYMBOL: Array[Range[Integer]]
|
17
|
+
ALPHA: Array[Range[Integer]]
|
18
|
+
CYRILLIC: Array[Range[Integer]]
|
19
|
+
GREEK: Array[Range[Integer]]
|
20
|
+
HIRAGANA: Array[Range[Integer]]
|
21
|
+
KATAKANA: Array[Range[Integer]]
|
22
|
+
KANJI: Array[Range[Integer]]
|
23
|
+
KANJINUMERIC: Array[Range[Integer]]
|
24
|
+
end
|
25
|
+
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
module Suika
|
2
|
+
class Lattice
|
3
|
+
attr_reader begin_nodes: Array[Array[::Suika::Node]]
|
4
|
+
attr_reader end_nodes: Array[Array[::Suika::Node]]
|
5
|
+
attr_reader length: Integer
|
6
|
+
|
7
|
+
def initialize: (Integer length) -> void
|
8
|
+
def insert: (Integer begin_id, Integer end_id, String surface, bool unknown,
|
9
|
+
Integer left_id, Integer right_id, Integer cost, Array[String] attrs) -> void
|
10
|
+
end
|
11
|
+
end
|
data/sig/suika/node.rbs
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
module Suika
|
2
|
+
class Node
|
3
|
+
attr_accessor surface: String
|
4
|
+
attr_accessor unknown: bool
|
5
|
+
attr_accessor min_cost: Integer
|
6
|
+
# attr_accessor min_prev: ::Suika::Node?
|
7
|
+
attr_accessor min_prev: untyped
|
8
|
+
attr_accessor left_id: Integer
|
9
|
+
attr_accessor right_id: Integer
|
10
|
+
attr_accessor cost: Integer
|
11
|
+
attr_accessor attrs: Array[String]
|
12
|
+
|
13
|
+
def initialize: (?surface: String surface, ?unknown: bool unknown,
|
14
|
+
?min_cost: Integer min_cost, ?min_prev: ::Suika::Node? min_prev,
|
15
|
+
?left_id: ::Integer left_id, ?right_id: ::Integer right_id,
|
16
|
+
?cost: ::Integer cost, ?attrs: Array[String] attrs) -> void
|
17
|
+
end
|
18
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module Suika
|
2
|
+
class Tagger
|
3
|
+
def initialize: () -> void
|
4
|
+
def parse: (String sentence) -> Array[String]
|
5
|
+
def inspect: () -> String
|
6
|
+
|
7
|
+
private
|
8
|
+
|
9
|
+
DICTIONARY_PATH: String
|
10
|
+
DICTIONARY_KEY: String
|
11
|
+
INT_MAX: untyped
|
12
|
+
|
13
|
+
attr_reader trie: ::DartsClone::DoubleArray
|
14
|
+
|
15
|
+
# type feature = [Integer, Integer, Integer, String, String, String, String, String, String, String]
|
16
|
+
|
17
|
+
# def features: () -> Array[Array[feature]]
|
18
|
+
def features: () -> Array[Array[untyped]]
|
19
|
+
# def unknowns: () -> Hash[String, Array[feature]]
|
20
|
+
def unknowns: () -> Hash[String, Array[untyped]]
|
21
|
+
def connect_cost: (Integer r_id, Integer l_id) -> Integer
|
22
|
+
def viterbi: (::Suika::Lattice lattice) -> Array[String]
|
23
|
+
end
|
24
|
+
end
|
data/suika.gemspec
CHANGED
@@ -12,11 +12,10 @@ Gem::Specification.new do |spec|
|
|
12
12
|
spec.description = 'Suika is a Japanese morphological analyzer written in pure Ruby.'
|
13
13
|
spec.homepage = 'https://github.com/yoshoku/suika'
|
14
14
|
spec.license = 'BSD-3-Clause'
|
15
|
-
spec.required_ruby_version = Gem::Requirement.new('>= 2.3.0')
|
16
15
|
|
17
16
|
spec.metadata['homepage_uri'] = spec.homepage
|
18
17
|
spec.metadata['source_code_uri'] = spec.homepage
|
19
|
-
spec.metadata['changelog_uri'] = 'https://github.com/yoshoku/
|
18
|
+
spec.metadata['changelog_uri'] = 'https://github.com/yoshoku/suika/blob/main/CHANGELOG.md'
|
20
19
|
spec.metadata['documentation_uri'] = 'https://rubydoc.info/gems/suika'
|
21
20
|
|
22
21
|
# Specify which files should be added to the gem when it is released.
|
@@ -28,5 +27,5 @@ Gem::Specification.new do |spec|
|
|
28
27
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
29
28
|
spec.require_paths = ['lib']
|
30
29
|
|
31
|
-
spec.add_runtime_dependency '
|
30
|
+
spec.add_runtime_dependency 'dartsclone', '>= 0.2.0'
|
32
31
|
end
|
metadata
CHANGED
@@ -1,29 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: suika
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1
|
4
|
+
version: 0.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yoshoku
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-07-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
14
|
+
name: dartsclone
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - "
|
17
|
+
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
19
|
+
version: 0.2.0
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - "
|
24
|
+
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
26
|
+
version: 0.2.0
|
27
27
|
description: Suika is a Japanese morphological analyzer written in pure Ruby.
|
28
28
|
email:
|
29
29
|
- yoshoku@outlook.com
|
@@ -31,10 +31,12 @@ executables: []
|
|
31
31
|
extensions: []
|
32
32
|
extra_rdoc_files: []
|
33
33
|
files:
|
34
|
+
- ".coveralls.yml"
|
35
|
+
- ".github/workflows/build.yml"
|
36
|
+
- ".github/workflows/coverage.yml"
|
34
37
|
- ".gitignore"
|
35
38
|
- ".rspec"
|
36
39
|
- ".rubocop.yml"
|
37
|
-
- ".travis.yml"
|
38
40
|
- CHANGELOG.md
|
39
41
|
- CODE_OF_CONDUCT.md
|
40
42
|
- Gemfile
|
@@ -42,14 +44,21 @@ files:
|
|
42
44
|
- NOTICE.txt
|
43
45
|
- README.md
|
44
46
|
- Rakefile
|
47
|
+
- Steepfile
|
45
48
|
- bin/console
|
46
49
|
- bin/setup
|
47
|
-
- dict/
|
50
|
+
- dict/sysdic.gz
|
48
51
|
- lib/suika.rb
|
49
52
|
- lib/suika/char_def.rb
|
50
53
|
- lib/suika/lattice.rb
|
54
|
+
- lib/suika/node.rb
|
51
55
|
- lib/suika/tagger.rb
|
52
56
|
- lib/suika/version.rb
|
57
|
+
- sig/suika.rbs
|
58
|
+
- sig/suika/char_def.rbs
|
59
|
+
- sig/suika/lattice.rbs
|
60
|
+
- sig/suika/node.rbs
|
61
|
+
- sig/suika/tagger.rbs
|
53
62
|
- suika.gemspec
|
54
63
|
homepage: https://github.com/yoshoku/suika
|
55
64
|
licenses:
|
@@ -57,9 +66,9 @@ licenses:
|
|
57
66
|
metadata:
|
58
67
|
homepage_uri: https://github.com/yoshoku/suika
|
59
68
|
source_code_uri: https://github.com/yoshoku/suika
|
60
|
-
changelog_uri: https://github.com/yoshoku/
|
69
|
+
changelog_uri: https://github.com/yoshoku/suika/blob/main/CHANGELOG.md
|
61
70
|
documentation_uri: https://rubydoc.info/gems/suika
|
62
|
-
post_install_message:
|
71
|
+
post_install_message:
|
63
72
|
rdoc_options: []
|
64
73
|
require_paths:
|
65
74
|
- lib
|
@@ -67,15 +76,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
67
76
|
requirements:
|
68
77
|
- - ">="
|
69
78
|
- !ruby/object:Gem::Version
|
70
|
-
version:
|
79
|
+
version: '0'
|
71
80
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
72
81
|
requirements:
|
73
82
|
- - ">="
|
74
83
|
- !ruby/object:Gem::Version
|
75
84
|
version: '0'
|
76
85
|
requirements: []
|
77
|
-
rubygems_version: 3.
|
78
|
-
signing_key:
|
86
|
+
rubygems_version: 3.2.21
|
87
|
+
signing_key:
|
79
88
|
specification_version: 4
|
80
89
|
summary: Suika is a Japanese morphological analyzer written in pure Ruby.
|
81
90
|
test_files: []
|