suika 0.1.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 24d1a6401851f6ca9fa7b4ea11d1966110b33c736499e5ad4facffda746d135d
4
- data.tar.gz: 46763eabe75de50858ecfd98e92adee305d837033c996c8da2faeac5d56c694f
3
+ metadata.gz: f2aaecfc53b051cfbd06052bf22ac14614e459d42789670df1a7a31f601d533f
4
+ data.tar.gz: 39d4ac2b9fc0f4f164f2e3408b3dfcfe478f922d889334243c572a48ddf781de
5
5
  SHA512:
6
- metadata.gz: '01690ace19b17fb68a368fc1d7a572a079c1d19876204ed85a239efbf6a5a2f4adcd5bab5f9d4f6c7c83dc54ff63e0c5a3c99819f88fed81065a96925bc27c5f'
7
- data.tar.gz: 35b773639f6379a788f9dd3178e857752090420e406783e1abc27b276aa0c89f17b393a9c5ff60dc273f762b26c55f5155f5baa2a3723faaa1d8026d27b71c64
6
+ metadata.gz: dfdf6c08812109d7f6c6a52b64b8d8f7e6febd858bae23ccead4b3e598c76acd24a9c56cd893d958199b6545e37a4e01d6058c2acf39241a472260fda6accd90
7
+ data.tar.gz: 495016afc1854269d20b35e27bbb65cdffc307a9a94ab0149b8dcee884ac8560077ce0367a65f70b11bbdf0d27175134a2f66d67cc9bccc8f91a45353f2f1cf8
data/.coveralls.yml ADDED
@@ -0,0 +1 @@
1
+ service_name: github-ci
@@ -0,0 +1,21 @@
1
+ name: build
2
+
3
+ on: [push, pull_request]
4
+
5
+ jobs:
6
+ build:
7
+ runs-on: ubuntu-latest
8
+ strategy:
9
+ matrix:
10
+ ruby: [ '2.6', '2.7', '3.0' ]
11
+ steps:
12
+ - uses: actions/checkout@v2
13
+ - name: Set up Ruby ${{ matrix.ruby }}
14
+ uses: actions/setup-ruby@v1
15
+ with:
16
+ ruby-version: ${{ matrix.ruby }}
17
+ - name: Build and test with Rake
18
+ run: |
19
+ gem install --no-document bundler
20
+ bundle install --jobs 4 --retry 3
21
+ bundle exec rake
@@ -0,0 +1,26 @@
1
+ name: coverage
2
+
3
+ on:
4
+ push:
5
+ branches: [ main ]
6
+ pull_request:
7
+ branches: [ main ]
8
+
9
+ jobs:
10
+ coverage:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v2
14
+ - name: Set up Ruby 2.7
15
+ uses: actions/setup-ruby@v1
16
+ with:
17
+ ruby-version: '2.7'
18
+ - name: Build and test with Rake
19
+ run: |
20
+ gem install --no-document bundler
21
+ bundle install --jobs 4 --retry 3
22
+ bundle exec rake
23
+ - name: Coveralls GitHub Action
24
+ uses: coverallsapp/github-action@v1.1.2
25
+ with:
26
+ github-token: ${{ secrets.GITHUB_TOKEN }}
data/.rubocop.yml CHANGED
@@ -3,7 +3,8 @@ require:
3
3
  - rubocop-rspec
4
4
 
5
5
  AllCops:
6
- TargetRubyVersion: 2.4
6
+ NewCops: enable
7
+ TargetRubyVersion: 2.5
7
8
  DisplayCopNames: true
8
9
  DisplayStyleGuide: true
9
10
  Exclude:
data/CHANGELOG.md CHANGED
@@ -1,5 +1,45 @@
1
- # 0.1.1
1
+ ## 0.3.0
2
+
3
+ ### Changes
4
+ - Add type declaration files.
5
+ - Refactor to avoid assigning null to variables.
6
+ - Fix some configuration files.
7
+
8
+
9
+ ## 0.2.0
10
+
11
+ ### Breaking Change
12
+ - Change to use dartsclone for trie library.
13
+
14
+
15
+ ## 0.1.4
16
+
17
+ ### Bug Fixes
18
+ - Fix CharDef.char_type to return 'DEFAULT' when unknown character code is given.
19
+
20
+ ### Features
21
+ - Add character code of square era name Reiwa.
22
+
23
+ ## 0.1.3
24
+
25
+ ### Bug Fixes
26
+ - Fix unknown word processing.
27
+
28
+ ### Changes
29
+ - Remove redundant spaces from output.
30
+
31
+
32
+ ## 0.1.2
33
+
34
+ ### Bug Fixes
35
+ - Fix local variable typo in Tagger.parse.
36
+
37
+
38
+ ## 0.1.1
39
+
40
+ ### Bug Fixes
2
41
  - Fix specification of class in CharDef.char_type.
3
42
 
4
- # 0.1.0
43
+
44
+ ## 0.1.0
5
45
  - First release.
data/Gemfile CHANGED
@@ -5,5 +5,9 @@ source 'https://rubygems.org'
5
5
  # Specify your gem's dependencies in suika.gemspec
6
6
  gemspec
7
7
 
8
- gem 'rake', '~> 12.0'
8
+ gem 'rake', '~> 13.0'
9
9
  gem 'rspec', '~> 3.0'
10
+ gem 'simplecov', '~> 0.21'
11
+ gem 'simplecov-lcov', '~> 0.8'
12
+ gem 'rbs', '~> 1.2'
13
+ gem 'steep', '~> 0.44'
data/LICENSE.txt CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2020 Atsushi Tatsuma
1
+ Copyright (c) 2020-2021 Atsushi Tatsuma
2
2
  All rights reserved.
3
3
 
4
4
  Redistribution and use in source and binary forms, with or without
data/README.md CHANGED
@@ -1,7 +1,9 @@
1
1
  # Suika
2
2
 
3
+ [![Build Status](https://github.com/yoshoku/suika/workflows/build/badge.svg)](https://github.com/yoshoku/suika/actions?query=workflow%3Abuild)
4
+ [![Coverage Status](https://coveralls.io/repos/github/yoshoku/suika/badge.svg?branch=main)](https://coveralls.io/github/yoshoku/suika?branch=main)
3
5
  [![Gem Version](https://badge.fury.io/rb/suika.svg)](https://badge.fury.io/rb/suika)
4
- [![BSD 3-Clause License](https://img.shields.io/badge/License-BSD%203--Clause-orange.svg)](https://github.com/yoshoku/suika/blob/master/LICENSE.txt)
6
+ [![BSD 3-Clause License](https://img.shields.io/badge/License-BSD%203--Clause-orange.svg)](https://github.com/yoshoku/suika/blob/main/LICENSE.txt)
5
7
  [![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://rubydoc.info/gems/suika)
6
8
 
7
9
  Suika 🍉 is a Japanese morphological analyzer written in pure Ruby.
@@ -30,13 +32,13 @@ require 'suika'
30
32
  tagger = Suika::Tagger.new
31
33
  tagger.parse('すもももももももものうち').each { |token| puts token }
32
34
 
33
- # すもも 名詞, 一般, *, *, *, *, すもも, スモモ, スモモ
34
- # も 助詞, 係助詞, *, *, *, *, も, モ, モ
35
- # もも 名詞, 一般, *, *, *, *, もも, モモ, モモ
36
- # も 助詞, 係助詞, *, *, *, *, も, モ, モ
37
- # もも 名詞, 一般, *, *, *, *, もも, モモ, モモ
38
- # の 助詞, 連体化, *, *, *, *, の, ノ, ノ
39
- # うち 名詞, 非自立, 副詞可能, *, *, *, うち, ウチ, ウチ
35
+ # すもも 名詞,一般,*,*,*,*,すもも,スモモ,スモモ
36
+ # も 助詞,係助詞,*,*,*,*,も,モ,モ
37
+ # もも 名詞,一般,*,*,*,*,もも,モモ,モモ
38
+ # も 助詞,係助詞,*,*,*,*,も,モ,モ
39
+ # もも 名詞,一般,*,*,*,*,もも,モモ,モモ
40
+ # の 助詞,連体化,*,*,*,*,の,ノ,ノ
41
+ # うち 名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ
40
42
  ```
41
43
 
42
44
  Since the Tagger class loads the binary dictionary at initialization, it is recommended to reuse the instance.
@@ -51,17 +53,36 @@ sentences.each do |sentence|
51
53
  end
52
54
  ```
53
55
 
56
+ ## Test
57
+ Suika was able to parse all sentences in the [Livedoor news corpus](https://www.rondhuit.com/download.html#ldcc)
58
+ without any error.
59
+
60
+ ```ruby
61
+ require 'suika'
62
+
63
+ tagger = Suika::Tagger.new
64
+
65
+ Dir.glob('ldcc-20140209/text/*/*.txt').each do |filename|
66
+ File.foreach(filename) do |sentence|
67
+ sentence.strip!
68
+ puts tagger.parse(sentence) unless sentence.empty?
69
+ end
70
+ end
71
+ ```
72
+
73
+ ![suika_test](https://user-images.githubusercontent.com/5562409/90264778-8f593f80-de8c-11ea-81f1-20831e3c8b12.gif)
74
+
54
75
  ## Contributing
55
76
 
56
77
  Bug reports and pull requests are welcome on GitHub at https://github.com/yoshoku/suika.
57
- This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/yoshoku/suika/blob/master/CODE_OF_CONDUCT.md).
78
+ This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/yoshoku/suika/blob/main/CODE_OF_CONDUCT.md).
58
79
 
59
80
  ## License
60
81
 
61
82
  The gem is available as open source under the terms of the [BSD-3-Clause License](https://opensource.org/licenses/BSD-3-Clause).
62
83
  In addition, the gem includes binary data generated from mecab-ipadic.
63
- The details of the license can be found in [LICENSE.txt](https://github.com/yoshoku/suika/blob/master/LICENSE.txt)
64
- and [NOTICE.txt](https://github.com/yoshoku/suika/blob/master/NOTICE.txt).
84
+ The details of the license can be found in [LICENSE.txt](https://github.com/yoshoku/suika/blob/main/LICENSE.txt)
85
+ and [NOTICE.txt](https://github.com/yoshoku/suika/blob/main/NOTICE.txt).
65
86
 
66
87
  ## Respect
67
88
 
@@ -74,4 +95,4 @@ Janome, a morphological analyzer written in scripting language, gives me the cou
74
95
 
75
96
  ## Code of Conduct
76
97
 
77
- Everyone interacting in the Suika project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/yoshoku/suika/blob/master/CODE_OF_CONDUCT.md).
98
+ Everyone interacting in the Suika project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/yoshoku/suika/blob/main/CODE_OF_CONDUCT.md).
data/Rakefile CHANGED
@@ -1,6 +1,79 @@
1
- require "bundler/gem_tasks"
2
- require "rspec/core/rake_task"
1
+ require 'bundler/gem_tasks'
2
+ require 'rspec/core/rake_task'
3
+
4
+ require 'csv'
5
+ require 'dartsclone'
6
+ require 'nkf'
7
+ require 'rubygems/package'
8
+ require 'zlib'
3
9
 
4
10
  RSpec::Core::RakeTask.new(:spec)
5
11
 
6
12
  task :default => :spec
13
+
14
+ desc 'Build suika system dictionary'
15
+ task :dictionary do
16
+ base_dir = "#{__dir__}/dict/mecab-ipadic-2.7.0-20070801"
17
+ unless File.directory?(base_dir)
18
+ puts "Download mecab-ipadic file and expand that under dict directory: #{__dir__}/dict/mecab-ipadic-2.7.0-20070801"
19
+ puts
20
+ puts 'Example:'
21
+ puts 'wget -O dict/mecab-ipadic.tgz https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM'
22
+ puts 'cd dict'
23
+ puts 'tar xzf mecab-ipadic.tgz'
24
+ puts 'cd ../'
25
+ next # exit
26
+ end
27
+
28
+ File.open("#{__dir__}/dict/mecab-ipadic-2.7.0-20070801/Reiwa.csv", 'w') do |f|
29
+ f.puts('令和,1288,1288,5904,名詞,固有名詞,一般,*,*,*,令和,レイワ,レイワ')
30
+ end
31
+
32
+ unknowns = {}
33
+ File.open("#{base_dir}/unk.def") do |f|
34
+ f.each_line do |line|
35
+ row = NKF.nkf('-w', line.chomp).split(',')
36
+ unknowns[row[0]] ||= []
37
+ unknowns[row[0]] << [row[1].to_i, row[2].to_i, row[3].to_i, *row[4..-1]]
38
+ end
39
+ end
40
+
41
+ dict = {}
42
+ Dir.glob("#{base_dir}/*.csv").each do |filename|
43
+ File.open(filename) do |f|
44
+ f.each_line do |line|
45
+ row = NKF.nkf('-w', line.chomp).split(',')
46
+ dict[row[0]] ||= []
47
+ dict[row[0]] << [row[1].to_i, row[2].to_i, row[3].to_i, *row[4..-1]]
48
+ end
49
+ end
50
+ end
51
+
52
+ da = DartsClone::DoubleArray.new
53
+ words = dict.keys.sort
54
+ da.build(words)
55
+ features = words.map { |w| dict[w] }
56
+
57
+ concosts = nil
58
+ File.open("#{base_dir}/matrix.def") do |f|
59
+ n_entries = f.readline.chomp.split.map(&:to_i).first
60
+ concosts = Array.new(n_entries) { Array.new(n_entries) }
61
+ f.each_line do |line|
62
+ row, col, cost = line.chomp.split.map(&:to_i)
63
+ concosts[row][col] = cost
64
+ end
65
+ end
66
+
67
+ ipadic = {
68
+ trie: da.get_array,
69
+ features: features,
70
+ unknowns: unknowns,
71
+ concosts: concosts
72
+ }
73
+
74
+ Zlib::GzipWriter.open("#{__dir__}/dict/sysdic.gz", Zlib::BEST_SPEED) { |f| f.write(Marshal.dump(ipadic)) }
75
+
76
+ puts 'The system dictionary has been successfully built:'
77
+ puts "#{__dir__}/dict/sysdic.gz"
78
+ puts Digest::SHA1.file("#{__dir__}/dict/sysdic.gz").to_s
79
+ end
data/Steepfile ADDED
@@ -0,0 +1,20 @@
1
+ target :lib do
2
+ signature "sig"
3
+ #
4
+ check "lib" # Directory name
5
+ # check "Gemfile" # File name
6
+ # check "app/models/**/*.rb" # Glob
7
+ # # ignore "lib/templates/*.rb"
8
+ #
9
+ # # library "pathname", "set" # Standard libraries
10
+ library "dartsclone" # Gems
11
+ end
12
+
13
+ # target :spec do
14
+ # signature "sig", "sig-private"
15
+ #
16
+ # check "spec"
17
+ #
18
+ # # library "pathname", "set" # Standard libraries
19
+ # # library "rspec"
20
+ # end
Binary file
data/lib/suika.rb CHANGED
@@ -2,5 +2,6 @@
2
2
 
3
3
  require 'suika/version'
4
4
  require 'suika/char_def'
5
+ require 'suika/node'
5
6
  require 'suika/lattice'
6
7
  require 'suika/tagger'
@@ -5,10 +5,11 @@ module Suika
5
5
  class CharDef
6
6
  # @!visibility private
7
7
  def self.char_type(ch)
8
- code = ch.unpack1('U*')
9
- CHAR_TYPES.find do |ctype|
10
- Object.const_get("::Suika::CharDef::#{ctype}").any? { |r| r.include?(code) }
8
+ ch_code = ch.unpack1('U*')
9
+ ch_type = CHAR_TYPES.find do |ct|
10
+ Object.const_get("::Suika::CharDef::#{ct}").any? { |r| r.include?(ch_code) }
11
11
  end
12
+ ch_type || 'DEFAULT'
12
13
  end
13
14
 
14
15
  # @!visibility private
@@ -16,39 +17,41 @@ module Suika
16
17
  CHAR_CATEGORY[char_type(ch)]
17
18
  end
18
19
 
20
+ MAX_GROUPING_SIZE = 24
21
+
19
22
  CHAR_CATEGORY = {
20
23
  'DEFAULT' => {
21
- invoke: 0, group: 1, length: 0
24
+ invoke: false, group: true, length: 0
22
25
  },
23
26
  'SPACE' => {
24
- invoke: 0, group: 1, length: 0
27
+ invoke: false, group: true, length: 0
25
28
  },
26
29
  'KANJI' => {
27
- invoke: 0, group: 0, length: 2
30
+ invoke: false, group: false, length: 2
28
31
  },
29
32
  'SYMBOL' => {
30
- invoke: 1, group: 1, length: 0
33
+ invoke: true, group: true, length: 0
31
34
  },
32
35
  'NUMERIC' => {
33
- invoke: 1, group: 1, length: 0
36
+ invoke: true, group: true, length: 0
34
37
  },
35
38
  'ALPHA' => {
36
- invoke: 1, group: 1, length: 0
39
+ invoke: true, group: true, length: 0
37
40
  },
38
41
  'HIRAGANA' => {
39
- invoke: 0, group: 1, length: 2
42
+ invoke: false, group: true, length: 2
40
43
  },
41
44
  'KATAKANA' => {
42
- invoke: 1, group: 1, length: 2
45
+ invoke: true, group: true, length: 2
43
46
  },
44
47
  'KANJINUMERIC' => {
45
- invoke: 1, group: 1, length: 0
48
+ invoke: true, group: true, length: 0
46
49
  },
47
50
  'GREEK' => {
48
- invoke: 1, group: 1, length: 0
51
+ invoke: true, group: true, length: 0
49
52
  },
50
53
  'CYRILLIC' => {
51
- invoke: 1, group: 1, length: 0
54
+ invoke: true, group: true, length: 0
52
55
  }
53
56
  }.freeze
54
57
 
@@ -117,6 +120,7 @@ module Suika
117
120
  0x2B00..0x2BFF, # Miscellaneous Symbols and Arrows
118
121
  0x2A00..0x2AFF, # Supplemental Mathematical Operators
119
122
  0x3300..0x33FF,
123
+ 0x32FF..0x32FF, # Square era name REIWA
120
124
  0x3200..0x32FE, # ENclosed CJK Letters and Months
121
125
  0x3000..0x303F, # CJK Symbol and Punctuation
122
126
  0xFE30..0xFE4F, # CJK Compatibility Forms
@@ -171,8 +175,6 @@ module Suika
171
175
  0xF900..0xFA2D,
172
176
  0xFA30..0xFA6A
173
177
  ].freeze
174
-
175
- # rubocop:disable Style/AsciiComments
176
178
  # KANJI-NUMERIC (一 二 三 四 五 六 七 八 九 十 百 千 万 億 兆)
177
179
  # 0x4E00 KANJINUMERIC KANJI
178
180
  KANJINUMERIC = [
@@ -192,7 +194,6 @@ module Suika
192
194
  0x5104..0x5104,
193
195
  0x5146..0x5146
194
196
  ].freeze
195
- # rubocop:enable Style/AsciiComments
196
197
 
197
198
  private_constant :CHAR_CATEGORY, :CHAR_TYPES
198
199
 
data/lib/suika/lattice.rb CHANGED
@@ -4,8 +4,6 @@ module Suika
4
4
  # @!visibility private
5
5
  class Lattice
6
6
  # @!visibility private
7
- Node = Struct.new(:surface, :min_cost, :min_prev, :left_id, :right_id, :cost, :attrs, keyword_init: true)
8
-
9
7
  attr_reader :begin_nodes, :end_nodes, :length
10
8
 
11
9
  # @!visibility private
@@ -13,17 +11,17 @@ module Suika
13
11
  @length = length
14
12
  @begin_nodes = Array.new(length + 1) { [] }
15
13
  @end_nodes = Array.new(length + 1) { [] }
16
- bos = Node.new(surface: 'BOS', left_id: 0, right_id: 0, cost: 0, attrs: [])
17
- @end_nodes[0].append(bos)
18
- eos = Node.new(surface: 'EOS', left_id: 0, right_id: 0, cost: 0, attrs: [])
19
- @begin_nodes[length].append(eos)
14
+ bos = Node.new(surface: 'BOS', unknown: false, left_id: 0, right_id: 0, cost: 0, attrs: [])
15
+ @end_nodes[0].push(bos)
16
+ eos = Node.new(surface: 'EOS', unknown: false, left_id: 0, right_id: 0, cost: 0, attrs: [])
17
+ @begin_nodes[length].push(eos)
20
18
  end
21
19
 
22
20
  # @!visibility private
23
- def insert(begin_id, end_id, surface, left_id, right_id, cost, attrs)
24
- node = Node.new(surface: surface, left_id: left_id, right_id: right_id, cost: cost, attrs: attrs)
25
- @begin_nodes[begin_id].append(node)
26
- @end_nodes[end_id].append(node)
21
+ def insert(begin_id, end_id, surface, unknown, left_id, right_id, cost, attrs)
22
+ node = Node.new(surface: surface, unknown: unknown, left_id: left_id, right_id: right_id, cost: cost, attrs: attrs)
23
+ @begin_nodes[begin_id].push(node)
24
+ @end_nodes[end_id].push(node)
27
25
  end
28
26
  end
29
27
  end
data/lib/suika/node.rb ADDED
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Suika
4
+ # @!visibility private
5
+ class Node
6
+ # @!visibility private
7
+ attr_accessor :surface, :unknown, :min_cost, :min_prev, :left_id, :right_id, :cost, :attrs
8
+
9
+ # @!visibility private
10
+ def initialize(surface: '', unknown: false, min_cost: 0, min_prev: nil, left_id: 0, right_id: 0, cost: 0, attrs: [])
11
+ @surface = surface
12
+ @unknown = unknown
13
+ @min_cost = min_cost
14
+ @min_prev = min_prev
15
+ @left_id = left_id
16
+ @right_id = right_id
17
+ @cost = cost
18
+ @attrs = attrs
19
+ end
20
+ end
21
+ end
data/lib/suika/tagger.rb CHANGED
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'rambling-trie'
3
+ require 'dartsclone'
4
+ require 'rubygems/package'
4
5
  require 'zlib'
5
6
 
6
7
  module Suika
@@ -12,22 +13,22 @@ module Suika
12
13
  # tagger = Suika::Tagger.new
13
14
  # tagger.parse('すもももももももものうち').each { |token| puts token }
14
15
  #
15
- # # すもも 名詞, 一般, *, *, *, *, すもも, スモモ, スモモ
16
- # # も 助詞, 係助詞, *, *, *, *, も, モ, モ
17
- # # もも 名詞, 一般, *, *, *, *, もも, モモ, モモ
18
- # # も 助詞, 係助詞, *, *, *, *, も, モ, モ
19
- # # もも 名詞, 一般, *, *, *, *, もも, モモ, モモ
20
- # # の 助詞, 連体化, *, *, *, *, の, ノ, ノ
21
- # # うち 名詞, 非自立, 副詞可能, *, *, *, うち, ウチ, ウチ
16
+ # # すもも 名詞,一般,*,*,*,*,すもも,スモモ,スモモ
17
+ # # も 助詞,係助詞,*,*,*,*,も,モ,モ
18
+ # # もも 名詞,一般,*,*,*,*,もも,モモ,モモ
19
+ # # も 助詞,係助詞,*,*,*,*,も,モ,モ
20
+ # # もも 名詞,一般,*,*,*,*,もも,モモ,モモ
21
+ # # の 助詞,連体化,*,*,*,*,の,ノ,ノ
22
+ # # うち 名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ
22
23
  #
23
24
  class Tagger
24
25
  # Create a new tagger by loading the built-in binary dictionary.
25
26
  def initialize
26
- ipadic = Marshal.load(Zlib::GzipReader.open(__dir__ + '/../../dict/ipadic.gz', &:read))
27
- @trie = ipadic[:trie]
28
- @dictionary = ipadic[:dictionary]
29
- @unknown_dictionary = ipadic[:unknown_dictionary]
30
- @cost_mat = ipadic[:cost_matrix]
27
+ raise IOError, 'SHA1 digest of dictionary file does not match.' unless DICTIONARY_KEY == Digest::SHA1.file(DICTIONARY_PATH).to_s
28
+
29
+ @sysdic = Marshal.load(Zlib::GzipReader.open(DICTIONARY_PATH, &:read))
30
+ @trie = DartsClone::DoubleArray.new
31
+ @trie.set_array(@sysdic[:trie])
31
32
  end
32
33
 
33
34
  # Parse the given sentence.
@@ -39,44 +40,42 @@ module Suika
39
40
  terminal = sentence.length
40
41
 
41
42
  while start < terminal
42
- word = sentence[start]
43
- pos = start
44
- is_unknown = true
45
- while @trie.match?(word) && pos < terminal
46
- if @dictionary.key?(word)
47
- @dictionary[word].each do |el|
48
- lattice.insert(start, start + word.length,
49
- word, el[0].to_i, el[1].to_i, el[2].to_i,
50
- el[3..-1])
43
+ step = terminal - start
44
+
45
+ query = sentence[start..-1] || ''
46
+ result = trie.common_prefix_search(query)
47
+ unless result.empty?
48
+ words, indices = result
49
+ unless words.empty?
50
+ step = INT_MAX
51
+ words.each_with_index do |word, i|
52
+ features[indices[i]].each do |el|
53
+ lattice.insert(start, start + word.length, word, false, el[0].to_i, el[1].to_i, el[2].to_i, el[3..-1])
54
+ end
55
+ step = word.length if word.length < step
51
56
  end
52
- is_unknown = false
53
57
  end
54
- pos += 1
55
- word = sentence[start..pos]
56
- end
57
-
58
- unless is_unknown
59
- start += 1
60
- next
61
58
  end
62
59
 
63
- word = sentence[start]
64
- char_type = CharDef.char_type(sentence[start])
65
- char_cate = CharDef.char_category(sentence[start])
66
- if char_cate[:group] == 1
67
- unk_terminal = char_cate[:length].zero? ? terminal : start + char_cate[:length]
60
+ word = sentence[start] || ''
61
+ char_cate = CharDef.char_category(sentence[start] || '')
62
+ char_type = CharDef.char_type(sentence[start] || '')
63
+ if char_cate[:invoke]
64
+ unk_terminal = start + (char_cate[:group] ? CharDef::MAX_GROUPING_SIZE : char_cate[:length])
65
+ unk_terminal = terminal if terminal < unk_terminal
68
66
  pos = start + 1
69
- while pos < unk_terminal && char_type == CharDef.char_type(text[t])
70
- word << text[t]
67
+ while pos < unk_terminal && char_type == CharDef.char_type(sentence[pos] || '')
68
+ word << (sentence[pos] || '')
71
69
  pos += 1
72
70
  end
73
71
  end
74
- @unknown_dictionary[char_type].each do |el|
75
- lattice.insert(start, start + word.length,
76
- word, el[0].to_i, el[1].to_i, el[2].to_i,
77
- el[3..-1])
72
+ unknowns[char_type].each do |el|
73
+ lattice.insert(start, start + word.length, word, true,
74
+ el[0].to_i, el[1].to_i, el[2].to_i, el[3..-1])
78
75
  end
79
- start += 1
76
+ step = word.length if word.length < step
77
+
78
+ start += step
80
79
  end
81
80
 
82
81
  viterbi(lattice)
@@ -84,12 +83,28 @@ module Suika
84
83
 
85
84
  private
86
85
 
86
+ DICTIONARY_PATH = "#{__dir__}/../../dict/sysdic.gz"
87
+ DICTIONARY_KEY = 'eb921bf5e67f5733188527b21adbf9dabdda0c7a'
87
88
  INT_MAX = 2**(([42].pack('i').size * 16) - 2) - 1
88
89
 
89
- private_constant :INT_MAX
90
+ private_constant :DICTIONARY_PATH, :DICTIONARY_KEY, :INT_MAX
91
+
92
+ attr_reader :trie
93
+
94
+ def features
95
+ @sysdic[:features]
96
+ end
97
+
98
+ def unknowns
99
+ @sysdic[:unknowns]
100
+ end
101
+
102
+ def connect_cost(r_id, l_id)
103
+ @sysdic[:concosts][r_id][l_id]
104
+ end
90
105
 
91
106
  def viterbi(lattice)
92
- bos = lattice.end_nodes[0].first
107
+ bos = lattice.end_nodes[0][0]
93
108
  bos.min_cost = 0
94
109
  bos.min_prev = nil
95
110
 
@@ -98,7 +113,7 @@ module Suika
98
113
  rnode.min_cost = INT_MAX
99
114
  rnode.min_prev = nil
100
115
  lattice.end_nodes[n].each do |lnode|
101
- cost = lnode.min_cost + @cost_mat[lnode.right_id][rnode.left_id] + rnode.cost
116
+ cost = lnode.min_cost + connect_cost(lnode.right_id, rnode.left_id) + rnode.cost
102
117
  if cost < rnode.min_cost
103
118
  rnode.min_cost = cost
104
119
  rnode.min_prev = lnode
@@ -107,13 +122,14 @@ module Suika
107
122
  end
108
123
  end
109
124
 
110
- eos = lattice.begin_nodes[-1].first
125
+ eos = lattice.begin_nodes[-1][0]
111
126
  prev_node = eos.min_prev
112
127
  res = []
113
128
  until prev_node.nil?
114
- res.append("#{prev_node.surface}\t#{prev_node.attrs.join(', ')}") if prev_node.surface != 'BOS' && prev_node.surface != 'EOS'
129
+ res.push("#{prev_node.surface}\t#{prev_node.attrs.join(',')}") if prev_node.surface != 'BOS' && prev_node.surface != 'EOS'
115
130
  prev_node = prev_node.min_prev
116
131
  end
132
+
117
133
  res.reverse
118
134
  end
119
135
  end
data/lib/suika/version.rb CHANGED
@@ -3,5 +3,5 @@
3
3
  # Suika is a Japanese morphological analyzer written in pure Ruby.
4
4
  module Suika
5
5
  # The version of Suika you are using.
6
- VERSION = '0.1.1'
6
+ VERSION = '0.3.0'
7
7
  end
data/sig/suika.rbs ADDED
@@ -0,0 +1,3 @@
1
+ module Suika
2
+ VERSION: String
3
+ end
@@ -0,0 +1,25 @@
1
+ module Suika
2
+ class CharDef
3
+ def self.char_type: (String ch) -> String
4
+ def self.char_category: (String ch) -> { invoke: bool, group: bool, length: Integer }
5
+
6
+ MAX_GROUPING_SIZE: Integer
7
+
8
+ private
9
+
10
+ #CHAR_CATEGORY: Hash[String, { invoke: bool, group: bool, length: Integer }]
11
+ CHAR_CATEGORY: Hash[String, untyped]
12
+ CHAR_TYPES: Array[String]
13
+
14
+ SPACE: Array[Range[Integer]]
15
+ NUMERIC: Array[Range[Integer]]
16
+ SYMBOL: Array[Range[Integer]]
17
+ ALPHA: Array[Range[Integer]]
18
+ CYRILLIC: Array[Range[Integer]]
19
+ GREEK: Array[Range[Integer]]
20
+ HIRAGANA: Array[Range[Integer]]
21
+ KATAKANA: Array[Range[Integer]]
22
+ KANJI: Array[Range[Integer]]
23
+ KANJINUMERIC: Array[Range[Integer]]
24
+ end
25
+ end
@@ -0,0 +1,11 @@
1
+ module Suika
2
+ class Lattice
3
+ attr_reader begin_nodes: Array[Array[::Suika::Node]]
4
+ attr_reader end_nodes: Array[Array[::Suika::Node]]
5
+ attr_reader length: Integer
6
+
7
+ def initialize: (Integer length) -> void
8
+ def insert: (Integer begin_id, Integer end_id, String surface, bool unknown,
9
+ Integer left_id, Integer right_id, Integer cost, Array[String] attrs) -> void
10
+ end
11
+ end
@@ -0,0 +1,18 @@
1
+ module Suika
2
+ class Node
3
+ attr_accessor surface: String
4
+ attr_accessor unknown: bool
5
+ attr_accessor min_cost: Integer
6
+ # attr_accessor min_prev: ::Suika::Node?
7
+ attr_accessor min_prev: untyped
8
+ attr_accessor left_id: Integer
9
+ attr_accessor right_id: Integer
10
+ attr_accessor cost: Integer
11
+ attr_accessor attrs: Array[String]
12
+
13
+ def initialize: (?surface: String surface, ?unknown: bool unknown,
14
+ ?min_cost: Integer min_cost, ?min_prev: ::Suika::Node? min_prev,
15
+ ?left_id: ::Integer left_id, ?right_id: ::Integer right_id,
16
+ ?cost: ::Integer cost, ?attrs: Array[String] attrs) -> void
17
+ end
18
+ end
@@ -0,0 +1,23 @@
1
+ module Suika
2
+ class Tagger
3
+ def initialize: () -> void
4
+ def parse: (String sentence) -> Array[String]
5
+
6
+ private
7
+
8
+ DICTIONARY_PATH: String
9
+ DICTIONARY_KEY: String
10
+ INT_MAX: untyped
11
+
12
+ attr_reader trie: ::DartsClone::DoubleArray
13
+
14
+ # type feature = [Integer, Integer, Integer, String, String, String, String, String, String, String]
15
+
16
+ # def features: () -> Array[Array[feature]]
17
+ def features: () -> Array[Array[untyped]]
18
+ # def unknowns: () -> Hash[String, Array[feature]]
19
+ def unknowns: () -> Hash[String, Array[untyped]]
20
+ def connect_cost: (Integer r_id, Integer l_id) -> Integer
21
+ def viterbi: (::Suika::Lattice lattice) -> Array[String]
22
+ end
23
+ end
data/suika.gemspec CHANGED
@@ -12,11 +12,10 @@ Gem::Specification.new do |spec|
12
12
  spec.description = 'Suika is a Japanese morphological analyzer written in pure Ruby.'
13
13
  spec.homepage = 'https://github.com/yoshoku/suika'
14
14
  spec.license = 'BSD-3-Clause'
15
- spec.required_ruby_version = Gem::Requirement.new('>= 2.3.0')
16
15
 
17
16
  spec.metadata['homepage_uri'] = spec.homepage
18
17
  spec.metadata['source_code_uri'] = spec.homepage
19
- spec.metadata['changelog_uri'] = 'https://github.com/yoshoku/magro/blob/master/CHANGELOG.md'
18
+ spec.metadata['changelog_uri'] = 'https://github.com/yoshoku/suika/blob/main/CHANGELOG.md'
20
19
  spec.metadata['documentation_uri'] = 'https://rubydoc.info/gems/suika'
21
20
 
22
21
  # Specify which files should be added to the gem when it is released.
@@ -28,5 +27,5 @@ Gem::Specification.new do |spec|
28
27
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
29
28
  spec.require_paths = ['lib']
30
29
 
31
- spec.add_runtime_dependency 'rambling-trie', '~> 2.1'
30
+ spec.add_runtime_dependency 'dartsclone', '>= 0.2.0'
32
31
  end
metadata CHANGED
@@ -1,29 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: suika
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshoku
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-07-05 00:00:00.000000000 Z
11
+ date: 2021-06-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: rambling-trie
14
+ name: dartsclone
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - "~>"
17
+ - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: '2.1'
19
+ version: 0.2.0
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - "~>"
24
+ - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: '2.1'
26
+ version: 0.2.0
27
27
  description: Suika is a Japanese morphological analyzer written in pure Ruby.
28
28
  email:
29
29
  - yoshoku@outlook.com
@@ -31,10 +31,12 @@ executables: []
31
31
  extensions: []
32
32
  extra_rdoc_files: []
33
33
  files:
34
+ - ".coveralls.yml"
35
+ - ".github/workflows/build.yml"
36
+ - ".github/workflows/coverage.yml"
34
37
  - ".gitignore"
35
38
  - ".rspec"
36
39
  - ".rubocop.yml"
37
- - ".travis.yml"
38
40
  - CHANGELOG.md
39
41
  - CODE_OF_CONDUCT.md
40
42
  - Gemfile
@@ -42,14 +44,21 @@ files:
42
44
  - NOTICE.txt
43
45
  - README.md
44
46
  - Rakefile
47
+ - Steepfile
45
48
  - bin/console
46
49
  - bin/setup
47
- - dict/ipadic.gz
50
+ - dict/sysdic.gz
48
51
  - lib/suika.rb
49
52
  - lib/suika/char_def.rb
50
53
  - lib/suika/lattice.rb
54
+ - lib/suika/node.rb
51
55
  - lib/suika/tagger.rb
52
56
  - lib/suika/version.rb
57
+ - sig/suika.rbs
58
+ - sig/suika/char_def.rbs
59
+ - sig/suika/lattice.rbs
60
+ - sig/suika/node.rbs
61
+ - sig/suika/tagger.rbs
53
62
  - suika.gemspec
54
63
  homepage: https://github.com/yoshoku/suika
55
64
  licenses:
@@ -57,9 +66,9 @@ licenses:
57
66
  metadata:
58
67
  homepage_uri: https://github.com/yoshoku/suika
59
68
  source_code_uri: https://github.com/yoshoku/suika
60
- changelog_uri: https://github.com/yoshoku/magro/blob/master/CHANGELOG.md
69
+ changelog_uri: https://github.com/yoshoku/suika/blob/main/CHANGELOG.md
61
70
  documentation_uri: https://rubydoc.info/gems/suika
62
- post_install_message:
71
+ post_install_message:
63
72
  rdoc_options: []
64
73
  require_paths:
65
74
  - lib
@@ -67,15 +76,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
67
76
  requirements:
68
77
  - - ">="
69
78
  - !ruby/object:Gem::Version
70
- version: 2.3.0
79
+ version: '0'
71
80
  required_rubygems_version: !ruby/object:Gem::Requirement
72
81
  requirements:
73
82
  - - ">="
74
83
  - !ruby/object:Gem::Version
75
84
  version: '0'
76
85
  requirements: []
77
- rubygems_version: 3.1.2
78
- signing_key:
86
+ rubygems_version: 3.1.6
87
+ signing_key:
79
88
  specification_version: 4
80
89
  summary: Suika is a Japanese morphological analyzer written in pure Ruby.
81
90
  test_files: []
data/.travis.yml DELETED
@@ -1,6 +0,0 @@
1
- ---
2
- language: ruby
3
- cache: bundler
4
- rvm:
5
- - 2.7.0
6
- before_install: gem install bundler -v 2.1.2