suika 0.1.1 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 24d1a6401851f6ca9fa7b4ea11d1966110b33c736499e5ad4facffda746d135d
4
- data.tar.gz: 46763eabe75de50858ecfd98e92adee305d837033c996c8da2faeac5d56c694f
3
+ metadata.gz: f2aaecfc53b051cfbd06052bf22ac14614e459d42789670df1a7a31f601d533f
4
+ data.tar.gz: 39d4ac2b9fc0f4f164f2e3408b3dfcfe478f922d889334243c572a48ddf781de
5
5
  SHA512:
6
- metadata.gz: '01690ace19b17fb68a368fc1d7a572a079c1d19876204ed85a239efbf6a5a2f4adcd5bab5f9d4f6c7c83dc54ff63e0c5a3c99819f88fed81065a96925bc27c5f'
7
- data.tar.gz: 35b773639f6379a788f9dd3178e857752090420e406783e1abc27b276aa0c89f17b393a9c5ff60dc273f762b26c55f5155f5baa2a3723faaa1d8026d27b71c64
6
+ metadata.gz: dfdf6c08812109d7f6c6a52b64b8d8f7e6febd858bae23ccead4b3e598c76acd24a9c56cd893d958199b6545e37a4e01d6058c2acf39241a472260fda6accd90
7
+ data.tar.gz: 495016afc1854269d20b35e27bbb65cdffc307a9a94ab0149b8dcee884ac8560077ce0367a65f70b11bbdf0d27175134a2f66d67cc9bccc8f91a45353f2f1cf8
data/.coveralls.yml ADDED
@@ -0,0 +1 @@
1
+ service_name: github-ci
@@ -0,0 +1,21 @@
1
+ name: build
2
+
3
+ on: [push, pull_request]
4
+
5
+ jobs:
6
+ build:
7
+ runs-on: ubuntu-latest
8
+ strategy:
9
+ matrix:
10
+ ruby: [ '2.6', '2.7', '3.0' ]
11
+ steps:
12
+ - uses: actions/checkout@v2
13
+ - name: Set up Ruby ${{ matrix.ruby }}
14
+ uses: actions/setup-ruby@v1
15
+ with:
16
+ ruby-version: ${{ matrix.ruby }}
17
+ - name: Build and test with Rake
18
+ run: |
19
+ gem install --no-document bundler
20
+ bundle install --jobs 4 --retry 3
21
+ bundle exec rake
@@ -0,0 +1,26 @@
1
+ name: coverage
2
+
3
+ on:
4
+ push:
5
+ branches: [ main ]
6
+ pull_request:
7
+ branches: [ main ]
8
+
9
+ jobs:
10
+ coverage:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v2
14
+ - name: Set up Ruby 2.7
15
+ uses: actions/setup-ruby@v1
16
+ with:
17
+ ruby-version: '2.7'
18
+ - name: Build and test with Rake
19
+ run: |
20
+ gem install --no-document bundler
21
+ bundle install --jobs 4 --retry 3
22
+ bundle exec rake
23
+ - name: Coveralls GitHub Action
24
+ uses: coverallsapp/github-action@v1.1.2
25
+ with:
26
+ github-token: ${{ secrets.GITHUB_TOKEN }}
data/.rubocop.yml CHANGED
@@ -3,7 +3,8 @@ require:
3
3
  - rubocop-rspec
4
4
 
5
5
  AllCops:
6
- TargetRubyVersion: 2.4
6
+ NewCops: enable
7
+ TargetRubyVersion: 2.5
7
8
  DisplayCopNames: true
8
9
  DisplayStyleGuide: true
9
10
  Exclude:
data/CHANGELOG.md CHANGED
@@ -1,5 +1,45 @@
1
- # 0.1.1
1
+ ## 0.3.0
2
+
3
+ ### Changes
4
+ - Add type declaration files.
5
+ - Refactor to avoid assigning null to variables.
6
+ - Fix some configuration files.
7
+
8
+
9
+ ## 0.2.0
10
+
11
+ ### Breaking Change
12
+ - Replace rambling-trie with dartsclone as the trie library.
13
+
14
+
15
+ ## 0.1.4
16
+
17
+ ### Bug Fixes
18
+ - Fix CharDef.char_type to return 'DEFAULT' when an unknown character code is given.
19
+
20
+ ### Features
21
+ - Add the character code of the square era name Reiwa (U+32FF).
22
+
23
+ ## 0.1.3
24
+
25
+ ### Bug Fixes
26
+ - Fix unknown word processing.
27
+
28
+ ### Changes
29
+ - Remove redundant spaces from output.
30
+
31
+
32
+ ## 0.1.2
33
+
34
+ ### Bug Fixes
35
+ - Fix local variable typo in Tagger.parse.
36
+
37
+
38
+ ## 0.1.1
39
+
40
+ ### Bug Fixes
2
41
  - Fix specification of class in CharDef.char_type.
3
42
 
4
- # 0.1.0
43
+
44
+ ## 0.1.0
5
45
  - First release.
data/Gemfile CHANGED
@@ -5,5 +5,9 @@ source 'https://rubygems.org'
5
5
  # Specify your gem's dependencies in suika.gemspec
6
6
  gemspec
7
7
 
8
- gem 'rake', '~> 12.0'
8
+ gem 'rake', '~> 13.0'
9
9
  gem 'rspec', '~> 3.0'
10
+ gem 'simplecov', '~> 0.21'
11
+ gem 'simplecov-lcov', '~> 0.8'
12
+ gem 'rbs', '~> 1.2'
13
+ gem 'steep', '~> 0.44'
data/LICENSE.txt CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2020 Atsushi Tatsuma
1
+ Copyright (c) 2020-2021 Atsushi Tatsuma
2
2
  All rights reserved.
3
3
 
4
4
  Redistribution and use in source and binary forms, with or without
data/README.md CHANGED
@@ -1,7 +1,9 @@
1
1
  # Suika
2
2
 
3
+ [![Build Status](https://github.com/yoshoku/suika/workflows/build/badge.svg)](https://github.com/yoshoku/suika/actions?query=workflow%3Abuild)
4
+ [![Coverage Status](https://coveralls.io/repos/github/yoshoku/suika/badge.svg?branch=main)](https://coveralls.io/github/yoshoku/suika?branch=main)
3
5
  [![Gem Version](https://badge.fury.io/rb/suika.svg)](https://badge.fury.io/rb/suika)
4
- [![BSD 3-Clause License](https://img.shields.io/badge/License-BSD%203--Clause-orange.svg)](https://github.com/yoshoku/suika/blob/master/LICENSE.txt)
6
+ [![BSD 3-Clause License](https://img.shields.io/badge/License-BSD%203--Clause-orange.svg)](https://github.com/yoshoku/suika/blob/main/LICENSE.txt)
5
7
  [![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://rubydoc.info/gems/suika)
6
8
 
7
9
  Suika 🍉 is a Japanese morphological analyzer written in pure Ruby.
@@ -30,13 +32,13 @@ require 'suika'
30
32
  tagger = Suika::Tagger.new
31
33
  tagger.parse('すもももももももものうち').each { |token| puts token }
32
34
 
33
- # すもも 名詞, 一般, *, *, *, *, すもも, スモモ, スモモ
34
- # も 助詞, 係助詞, *, *, *, *, も, モ, モ
35
- # もも 名詞, 一般, *, *, *, *, もも, モモ, モモ
36
- # も 助詞, 係助詞, *, *, *, *, も, モ, モ
37
- # もも 名詞, 一般, *, *, *, *, もも, モモ, モモ
38
- # の 助詞, 連体化, *, *, *, *, の, ノ, ノ
39
- # うち 名詞, 非自立, 副詞可能, *, *, *, うち, ウチ, ウチ
35
+ # すもも 名詞,一般,*,*,*,*,すもも,スモモ,スモモ
36
+ # も 助詞,係助詞,*,*,*,*,も,モ,モ
37
+ # もも 名詞,一般,*,*,*,*,もも,モモ,モモ
38
+ # も 助詞,係助詞,*,*,*,*,も,モ,モ
39
+ # もも 名詞,一般,*,*,*,*,もも,モモ,モモ
40
+ # の 助詞,連体化,*,*,*,*,の,ノ,ノ
41
+ # うち 名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ
40
42
  ```
41
43
 
42
44
  Since the Tagger class loads the binary dictionary at initialization, it is recommended to reuse the instance.
@@ -51,17 +53,36 @@ sentences.each do |sentence|
51
53
  end
52
54
  ```
53
55
 
56
+ ## Test
57
+ Suika was able to parse all sentences in the [Livedoor news corpus](https://www.rondhuit.com/download.html#ldcc)
58
+ without any error.
59
+
60
+ ```ruby
61
+ require 'suika'
62
+
63
+ tagger = Suika::Tagger.new
64
+
65
+ Dir.glob('ldcc-20140209/text/*/*.txt').each do |filename|
66
+ File.foreach(filename) do |sentence|
67
+ sentence.strip!
68
+ puts tagger.parse(sentence) unless sentence.empty?
69
+ end
70
+ end
71
+ ```
72
+
73
+ ![suika_test](https://user-images.githubusercontent.com/5562409/90264778-8f593f80-de8c-11ea-81f1-20831e3c8b12.gif)
74
+
54
75
  ## Contributing
55
76
 
56
77
  Bug reports and pull requests are welcome on GitHub at https://github.com/yoshoku/suika.
57
- This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/yoshoku/suika/blob/master/CODE_OF_CONDUCT.md).
78
+ This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/yoshoku/suika/blob/main/CODE_OF_CONDUCT.md).
58
79
 
59
80
  ## License
60
81
 
61
82
  The gem is available as open source under the terms of the [BSD-3-Clause License](https://opensource.org/licenses/BSD-3-Clause).
62
83
  In addition, the gem includes binary data generated from mecab-ipadic.
63
- The details of the license can be found in [LICENSE.txt](https://github.com/yoshoku/suika/blob/master/LICENSE.txt)
64
- and [NOTICE.txt](https://github.com/yoshoku/suika/blob/master/NOTICE.txt).
84
+ The details of the license can be found in [LICENSE.txt](https://github.com/yoshoku/suika/blob/main/LICENSE.txt)
85
+ and [NOTICE.txt](https://github.com/yoshoku/suika/blob/main/NOTICE.txt).
65
86
 
66
87
  ## Respect
67
88
 
@@ -74,4 +95,4 @@ Janome, a morphological analyzer written in scripting language, gives me the cou
74
95
 
75
96
  ## Code of Conduct
76
97
 
77
- Everyone interacting in the Suika project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/yoshoku/suika/blob/master/CODE_OF_CONDUCT.md).
98
+ Everyone interacting in the Suika project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/yoshoku/suika/blob/main/CODE_OF_CONDUCT.md).
data/Rakefile CHANGED
@@ -1,6 +1,79 @@
1
- require "bundler/gem_tasks"
2
- require "rspec/core/rake_task"
1
+ require 'bundler/gem_tasks'
2
+ require 'rspec/core/rake_task'
3
+
4
+ require 'csv'
5
+ require 'dartsclone'
6
+ require 'nkf'
7
+ require 'rubygems/package'
8
+ require 'zlib'
3
9
 
4
10
  RSpec::Core::RakeTask.new(:spec)
5
11
 
6
12
  task :default => :spec
13
+
14
+ desc 'Build suika system dictionary'
15
+ task :dictionary do
16
+ base_dir = "#{__dir__}/dict/mecab-ipadic-2.7.0-20070801"
17
+ unless File.directory?(base_dir)
18
+ puts "Download mecab-ipadic file and expand that under dict directory: #{__dir__}/dict/mecab-ipadic-2.7.0-20070801"
19
+ puts
20
+ puts 'Example:'
21
+ puts 'wget -O dict/mecab-ipadic.tgz https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM'
22
+ puts 'cd dict'
23
+ puts 'tar xzf mecab-ipadic.tgz'
24
+ puts 'cd ../'
25
+ next # exit
26
+ end
27
+
28
+ File.open("#{__dir__}/dict/mecab-ipadic-2.7.0-20070801/Reiwa.csv", 'w') do |f|
29
+ f.puts('令和,1288,1288,5904,名詞,固有名詞,一般,*,*,*,令和,レイワ,レイワ')
30
+ end
31
+
32
+ unknowns = {}
33
+ File.open("#{base_dir}/unk.def") do |f|
34
+ f.each_line do |line|
35
+ row = NKF.nkf('-w', line.chomp).split(',')
36
+ unknowns[row[0]] ||= []
37
+ unknowns[row[0]] << [row[1].to_i, row[2].to_i, row[3].to_i, *row[4..-1]]
38
+ end
39
+ end
40
+
41
+ dict = {}
42
+ Dir.glob("#{base_dir}/*.csv").each do |filename|
43
+ File.open(filename) do |f|
44
+ f.each_line do |line|
45
+ row = NKF.nkf('-w', line.chomp).split(',')
46
+ dict[row[0]] ||= []
47
+ dict[row[0]] << [row[1].to_i, row[2].to_i, row[3].to_i, *row[4..-1]]
48
+ end
49
+ end
50
+ end
51
+
52
+ da = DartsClone::DoubleArray.new
53
+ words = dict.keys.sort
54
+ da.build(words)
55
+ features = words.map { |w| dict[w] }
56
+
57
+ concosts = nil
58
+ File.open("#{base_dir}/matrix.def") do |f|
59
+ n_entries = f.readline.chomp.split.map(&:to_i).first
60
+ concosts = Array.new(n_entries) { Array.new(n_entries) }
61
+ f.each_line do |line|
62
+ row, col, cost = line.chomp.split.map(&:to_i)
63
+ concosts[row][col] = cost
64
+ end
65
+ end
66
+
67
+ ipadic = {
68
+ trie: da.get_array,
69
+ features: features,
70
+ unknowns: unknowns,
71
+ concosts: concosts
72
+ }
73
+
74
+ Zlib::GzipWriter.open("#{__dir__}/dict/sysdic.gz", Zlib::BEST_SPEED) { |f| f.write(Marshal.dump(ipadic)) }
75
+
76
+ puts 'The system dictionary has been successfully built:'
77
+ puts "#{__dir__}/dict/sysdic.gz"
78
+ puts Digest::SHA1.file("#{__dir__}/dict/sysdic.gz").to_s
79
+ end
data/Steepfile ADDED
@@ -0,0 +1,20 @@
1
+ target :lib do
2
+ signature "sig"
3
+ #
4
+ check "lib" # Directory name
5
+ # check "Gemfile" # File name
6
+ # check "app/models/**/*.rb" # Glob
7
+ # # ignore "lib/templates/*.rb"
8
+ #
9
+ # # library "pathname", "set" # Standard libraries
10
+ library "dartsclone" # Gems
11
+ end
12
+
13
+ # target :spec do
14
+ # signature "sig", "sig-private"
15
+ #
16
+ # check "spec"
17
+ #
18
+ # # library "pathname", "set" # Standard libraries
19
+ # # library "rspec"
20
+ # end
Binary file
data/lib/suika.rb CHANGED
@@ -2,5 +2,6 @@
2
2
 
3
3
  require 'suika/version'
4
4
  require 'suika/char_def'
5
+ require 'suika/node'
5
6
  require 'suika/lattice'
6
7
  require 'suika/tagger'
@@ -5,10 +5,11 @@ module Suika
5
5
  class CharDef
6
6
  # @!visibility private
7
7
  def self.char_type(ch)
8
- code = ch.unpack1('U*')
9
- CHAR_TYPES.find do |ctype|
10
- Object.const_get("::Suika::CharDef::#{ctype}").any? { |r| r.include?(code) }
8
+ ch_code = ch.unpack1('U*')
9
+ ch_type = CHAR_TYPES.find do |ct|
10
+ Object.const_get("::Suika::CharDef::#{ct}").any? { |r| r.include?(ch_code) }
11
11
  end
12
+ ch_type || 'DEFAULT'
12
13
  end
13
14
 
14
15
  # @!visibility private
@@ -16,39 +17,41 @@ module Suika
16
17
  CHAR_CATEGORY[char_type(ch)]
17
18
  end
18
19
 
20
+ MAX_GROUPING_SIZE = 24
21
+
19
22
  CHAR_CATEGORY = {
20
23
  'DEFAULT' => {
21
- invoke: 0, group: 1, length: 0
24
+ invoke: false, group: true, length: 0
22
25
  },
23
26
  'SPACE' => {
24
- invoke: 0, group: 1, length: 0
27
+ invoke: false, group: true, length: 0
25
28
  },
26
29
  'KANJI' => {
27
- invoke: 0, group: 0, length: 2
30
+ invoke: false, group: false, length: 2
28
31
  },
29
32
  'SYMBOL' => {
30
- invoke: 1, group: 1, length: 0
33
+ invoke: true, group: true, length: 0
31
34
  },
32
35
  'NUMERIC' => {
33
- invoke: 1, group: 1, length: 0
36
+ invoke: true, group: true, length: 0
34
37
  },
35
38
  'ALPHA' => {
36
- invoke: 1, group: 1, length: 0
39
+ invoke: true, group: true, length: 0
37
40
  },
38
41
  'HIRAGANA' => {
39
- invoke: 0, group: 1, length: 2
42
+ invoke: false, group: true, length: 2
40
43
  },
41
44
  'KATAKANA' => {
42
- invoke: 1, group: 1, length: 2
45
+ invoke: true, group: true, length: 2
43
46
  },
44
47
  'KANJINUMERIC' => {
45
- invoke: 1, group: 1, length: 0
48
+ invoke: true, group: true, length: 0
46
49
  },
47
50
  'GREEK' => {
48
- invoke: 1, group: 1, length: 0
51
+ invoke: true, group: true, length: 0
49
52
  },
50
53
  'CYRILLIC' => {
51
- invoke: 1, group: 1, length: 0
54
+ invoke: true, group: true, length: 0
52
55
  }
53
56
  }.freeze
54
57
 
@@ -117,6 +120,7 @@ module Suika
117
120
  0x2B00..0x2BFF, # Miscellaneous Symbols and Arrows
118
121
  0x2A00..0x2AFF, # Supplemental Mathematical Operators
119
122
  0x3300..0x33FF,
123
+ 0x32FF..0x32FF, # Square era name REIWA
120
124
  0x3200..0x32FE, # Enclosed CJK Letters and Months
121
125
  0x3000..0x303F, # CJK Symbol and Punctuation
122
126
  0xFE30..0xFE4F, # CJK Compatibility Forms
@@ -171,8 +175,6 @@ module Suika
171
175
  0xF900..0xFA2D,
172
176
  0xFA30..0xFA6A
173
177
  ].freeze
174
-
175
- # rubocop:disable Style/AsciiComments
176
178
  # KANJI-NUMERIC (一 二 三 四 五 六 七 八 九 十 百 千 万 億 兆)
177
179
  # 0x4E00 KANJINUMERIC KANJI
178
180
  KANJINUMERIC = [
@@ -192,7 +194,6 @@ module Suika
192
194
  0x5104..0x5104,
193
195
  0x5146..0x5146
194
196
  ].freeze
195
- # rubocop:enable Style/AsciiComments
196
197
 
197
198
  private_constant :CHAR_CATEGORY, :CHAR_TYPES
198
199
 
data/lib/suika/lattice.rb CHANGED
@@ -4,8 +4,6 @@ module Suika
4
4
  # @!visibility private
5
5
  class Lattice
6
6
  # @!visibility private
7
- Node = Struct.new(:surface, :min_cost, :min_prev, :left_id, :right_id, :cost, :attrs, keyword_init: true)
8
-
9
7
  attr_reader :begin_nodes, :end_nodes, :length
10
8
 
11
9
  # @!visibility private
@@ -13,17 +11,17 @@ module Suika
13
11
  @length = length
14
12
  @begin_nodes = Array.new(length + 1) { [] }
15
13
  @end_nodes = Array.new(length + 1) { [] }
16
- bos = Node.new(surface: 'BOS', left_id: 0, right_id: 0, cost: 0, attrs: [])
17
- @end_nodes[0].append(bos)
18
- eos = Node.new(surface: 'EOS', left_id: 0, right_id: 0, cost: 0, attrs: [])
19
- @begin_nodes[length].append(eos)
14
+ bos = Node.new(surface: 'BOS', unknown: false, left_id: 0, right_id: 0, cost: 0, attrs: [])
15
+ @end_nodes[0].push(bos)
16
+ eos = Node.new(surface: 'EOS', unknown: false, left_id: 0, right_id: 0, cost: 0, attrs: [])
17
+ @begin_nodes[length].push(eos)
20
18
  end
21
19
 
22
20
  # @!visibility private
23
- def insert(begin_id, end_id, surface, left_id, right_id, cost, attrs)
24
- node = Node.new(surface: surface, left_id: left_id, right_id: right_id, cost: cost, attrs: attrs)
25
- @begin_nodes[begin_id].append(node)
26
- @end_nodes[end_id].append(node)
21
+ def insert(begin_id, end_id, surface, unknown, left_id, right_id, cost, attrs)
22
+ node = Node.new(surface: surface, unknown: unknown, left_id: left_id, right_id: right_id, cost: cost, attrs: attrs)
23
+ @begin_nodes[begin_id].push(node)
24
+ @end_nodes[end_id].push(node)
27
25
  end
28
26
  end
29
27
  end
data/lib/suika/node.rb ADDED
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Suika
4
+ # @!visibility private
5
+ class Node
6
+ # @!visibility private
7
+ attr_accessor :surface, :unknown, :min_cost, :min_prev, :left_id, :right_id, :cost, :attrs
8
+
9
+ # @!visibility private
10
+ def initialize(surface: '', unknown: false, min_cost: 0, min_prev: nil, left_id: 0, right_id: 0, cost: 0, attrs: [])
11
+ @surface = surface
12
+ @unknown = unknown
13
+ @min_cost = min_cost
14
+ @min_prev = min_prev
15
+ @left_id = left_id
16
+ @right_id = right_id
17
+ @cost = cost
18
+ @attrs = attrs
19
+ end
20
+ end
21
+ end
data/lib/suika/tagger.rb CHANGED
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'rambling-trie'
3
+ require 'dartsclone'
4
+ require 'rubygems/package'
4
5
  require 'zlib'
5
6
 
6
7
  module Suika
@@ -12,22 +13,22 @@ module Suika
12
13
  # tagger = Suika::Tagger.new
13
14
  # tagger.parse('すもももももももものうち').each { |token| puts token }
14
15
  #
15
- # # すもも 名詞, 一般, *, *, *, *, すもも, スモモ, スモモ
16
- # # も 助詞, 係助詞, *, *, *, *, も, モ, モ
17
- # # もも 名詞, 一般, *, *, *, *, もも, モモ, モモ
18
- # # も 助詞, 係助詞, *, *, *, *, も, モ, モ
19
- # # もも 名詞, 一般, *, *, *, *, もも, モモ, モモ
20
- # # の 助詞, 連体化, *, *, *, *, の, ノ, ノ
21
- # # うち 名詞, 非自立, 副詞可能, *, *, *, うち, ウチ, ウチ
16
+ # # すもも 名詞,一般,*,*,*,*,すもも,スモモ,スモモ
17
+ # # も 助詞,係助詞,*,*,*,*,も,モ,モ
18
+ # # もも 名詞,一般,*,*,*,*,もも,モモ,モモ
19
+ # # も 助詞,係助詞,*,*,*,*,も,モ,モ
20
+ # # もも 名詞,一般,*,*,*,*,もも,モモ,モモ
21
+ # # の 助詞,連体化,*,*,*,*,の,ノ,ノ
22
+ # # うち 名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ
22
23
  #
23
24
  class Tagger
24
25
  # Create a new tagger by loading the built-in binary dictionary.
25
26
  def initialize
26
- ipadic = Marshal.load(Zlib::GzipReader.open(__dir__ + '/../../dict/ipadic.gz', &:read))
27
- @trie = ipadic[:trie]
28
- @dictionary = ipadic[:dictionary]
29
- @unknown_dictionary = ipadic[:unknown_dictionary]
30
- @cost_mat = ipadic[:cost_matrix]
27
+ raise IOError, 'SHA1 digest of dictionary file does not match.' unless DICTIONARY_KEY == Digest::SHA1.file(DICTIONARY_PATH).to_s
28
+
29
+ @sysdic = Marshal.load(Zlib::GzipReader.open(DICTIONARY_PATH, &:read))
30
+ @trie = DartsClone::DoubleArray.new
31
+ @trie.set_array(@sysdic[:trie])
31
32
  end
32
33
 
33
34
  # Parse the given sentence.
@@ -39,44 +40,42 @@ module Suika
39
40
  terminal = sentence.length
40
41
 
41
42
  while start < terminal
42
- word = sentence[start]
43
- pos = start
44
- is_unknown = true
45
- while @trie.match?(word) && pos < terminal
46
- if @dictionary.key?(word)
47
- @dictionary[word].each do |el|
48
- lattice.insert(start, start + word.length,
49
- word, el[0].to_i, el[1].to_i, el[2].to_i,
50
- el[3..-1])
43
+ step = terminal - start
44
+
45
+ query = sentence[start..-1] || ''
46
+ result = trie.common_prefix_search(query)
47
+ unless result.empty?
48
+ words, indices = result
49
+ unless words.empty?
50
+ step = INT_MAX
51
+ words.each_with_index do |word, i|
52
+ features[indices[i]].each do |el|
53
+ lattice.insert(start, start + word.length, word, false, el[0].to_i, el[1].to_i, el[2].to_i, el[3..-1])
54
+ end
55
+ step = word.length if word.length < step
51
56
  end
52
- is_unknown = false
53
57
  end
54
- pos += 1
55
- word = sentence[start..pos]
56
- end
57
-
58
- unless is_unknown
59
- start += 1
60
- next
61
58
  end
62
59
 
63
- word = sentence[start]
64
- char_type = CharDef.char_type(sentence[start])
65
- char_cate = CharDef.char_category(sentence[start])
66
- if char_cate[:group] == 1
67
- unk_terminal = char_cate[:length].zero? ? terminal : start + char_cate[:length]
60
+ word = sentence[start] || ''
61
+ char_cate = CharDef.char_category(sentence[start] || '')
62
+ char_type = CharDef.char_type(sentence[start] || '')
63
+ if char_cate[:invoke]
64
+ unk_terminal = start + (char_cate[:group] ? CharDef::MAX_GROUPING_SIZE : char_cate[:length])
65
+ unk_terminal = terminal if terminal < unk_terminal
68
66
  pos = start + 1
69
- while pos < unk_terminal && char_type == CharDef.char_type(text[t])
70
- word << text[t]
67
+ while pos < unk_terminal && char_type == CharDef.char_type(sentence[pos] || '')
68
+ word << (sentence[pos] || '')
71
69
  pos += 1
72
70
  end
73
71
  end
74
- @unknown_dictionary[char_type].each do |el|
75
- lattice.insert(start, start + word.length,
76
- word, el[0].to_i, el[1].to_i, el[2].to_i,
77
- el[3..-1])
72
+ unknowns[char_type].each do |el|
73
+ lattice.insert(start, start + word.length, word, true,
74
+ el[0].to_i, el[1].to_i, el[2].to_i, el[3..-1])
78
75
  end
79
- start += 1
76
+ step = word.length if word.length < step
77
+
78
+ start += step
80
79
  end
81
80
 
82
81
  viterbi(lattice)
@@ -84,12 +83,28 @@ module Suika
84
83
 
85
84
  private
86
85
 
86
+ DICTIONARY_PATH = "#{__dir__}/../../dict/sysdic.gz"
87
+ DICTIONARY_KEY = 'eb921bf5e67f5733188527b21adbf9dabdda0c7a'
87
88
  INT_MAX = 2**(([42].pack('i').size * 16) - 2) - 1
88
89
 
89
- private_constant :INT_MAX
90
+ private_constant :DICTIONARY_PATH, :DICTIONARY_KEY, :INT_MAX
91
+
92
+ attr_reader :trie
93
+
94
+ def features
95
+ @sysdic[:features]
96
+ end
97
+
98
+ def unknowns
99
+ @sysdic[:unknowns]
100
+ end
101
+
102
+ def connect_cost(r_id, l_id)
103
+ @sysdic[:concosts][r_id][l_id]
104
+ end
90
105
 
91
106
  def viterbi(lattice)
92
- bos = lattice.end_nodes[0].first
107
+ bos = lattice.end_nodes[0][0]
93
108
  bos.min_cost = 0
94
109
  bos.min_prev = nil
95
110
 
@@ -98,7 +113,7 @@ module Suika
98
113
  rnode.min_cost = INT_MAX
99
114
  rnode.min_prev = nil
100
115
  lattice.end_nodes[n].each do |lnode|
101
- cost = lnode.min_cost + @cost_mat[lnode.right_id][rnode.left_id] + rnode.cost
116
+ cost = lnode.min_cost + connect_cost(lnode.right_id, rnode.left_id) + rnode.cost
102
117
  if cost < rnode.min_cost
103
118
  rnode.min_cost = cost
104
119
  rnode.min_prev = lnode
@@ -107,13 +122,14 @@ module Suika
107
122
  end
108
123
  end
109
124
 
110
- eos = lattice.begin_nodes[-1].first
125
+ eos = lattice.begin_nodes[-1][0]
111
126
  prev_node = eos.min_prev
112
127
  res = []
113
128
  until prev_node.nil?
114
- res.append("#{prev_node.surface}\t#{prev_node.attrs.join(', ')}") if prev_node.surface != 'BOS' && prev_node.surface != 'EOS'
129
+ res.push("#{prev_node.surface}\t#{prev_node.attrs.join(',')}") if prev_node.surface != 'BOS' && prev_node.surface != 'EOS'
115
130
  prev_node = prev_node.min_prev
116
131
  end
132
+
117
133
  res.reverse
118
134
  end
119
135
  end
data/lib/suika/version.rb CHANGED
@@ -3,5 +3,5 @@
3
3
  # Suika is a Japanese morphological analyzer written in pure Ruby.
4
4
  module Suika
5
5
  # The version of Suika you are using.
6
- VERSION = '0.1.1'
6
+ VERSION = '0.3.0'
7
7
  end
data/sig/suika.rbs ADDED
@@ -0,0 +1,3 @@
1
+ module Suika
2
+ VERSION: String
3
+ end
@@ -0,0 +1,25 @@
1
+ module Suika
2
+ class CharDef
3
+ def self.char_type: (String ch) -> String
4
+ def self.char_category: (String ch) -> { invoke: bool, group: bool, length: Integer }
5
+
6
+ MAX_GROUPING_SIZE: Integer
7
+
8
+ private
9
+
10
+ #CHAR_CATEGORY: Hash[String, { invoke: bool, group: bool, length: Integer }]
11
+ CHAR_CATEGORY: Hash[String, untyped]
12
+ CHAR_TYPES: Array[String]
13
+
14
+ SPACE: Array[Range[Integer]]
15
+ NUMERIC: Array[Range[Integer]]
16
+ SYMBOL: Array[Range[Integer]]
17
+ ALPHA: Array[Range[Integer]]
18
+ CYRILLIC: Array[Range[Integer]]
19
+ GREEK: Array[Range[Integer]]
20
+ HIRAGANA: Array[Range[Integer]]
21
+ KATAKANA: Array[Range[Integer]]
22
+ KANJI: Array[Range[Integer]]
23
+ KANJINUMERIC: Array[Range[Integer]]
24
+ end
25
+ end
@@ -0,0 +1,11 @@
1
+ module Suika
2
+ class Lattice
3
+ attr_reader begin_nodes: Array[Array[::Suika::Node]]
4
+ attr_reader end_nodes: Array[Array[::Suika::Node]]
5
+ attr_reader length: Integer
6
+
7
+ def initialize: (Integer length) -> void
8
+ def insert: (Integer begin_id, Integer end_id, String surface, bool unknown,
9
+ Integer left_id, Integer right_id, Integer cost, Array[String] attrs) -> void
10
+ end
11
+ end
@@ -0,0 +1,18 @@
1
+ module Suika
2
+ class Node
3
+ attr_accessor surface: String
4
+ attr_accessor unknown: bool
5
+ attr_accessor min_cost: Integer
6
+ # attr_accessor min_prev: ::Suika::Node?
7
+ attr_accessor min_prev: untyped
8
+ attr_accessor left_id: Integer
9
+ attr_accessor right_id: Integer
10
+ attr_accessor cost: Integer
11
+ attr_accessor attrs: Array[String]
12
+
13
+ def initialize: (?surface: String surface, ?unknown: bool unknown,
14
+ ?min_cost: Integer min_cost, ?min_prev: ::Suika::Node? min_prev,
15
+ ?left_id: ::Integer left_id, ?right_id: ::Integer right_id,
16
+ ?cost: ::Integer cost, ?attrs: Array[String] attrs) -> void
17
+ end
18
+ end
@@ -0,0 +1,23 @@
1
+ module Suika
2
+ class Tagger
3
+ def initialize: () -> void
4
+ def parse: (String sentence) -> Array[String]
5
+
6
+ private
7
+
8
+ DICTIONARY_PATH: String
9
+ DICTIONARY_KEY: String
10
+ INT_MAX: untyped
11
+
12
+ attr_reader trie: ::DartsClone::DoubleArray
13
+
14
+ # type feature = [Integer, Integer, Integer, String, String, String, String, String, String, String]
15
+
16
+ # def features: () -> Array[Array[feature]]
17
+ def features: () -> Array[Array[untyped]]
18
+ # def unknowns: () -> Hash[String, Array[feature]]
19
+ def unknowns: () -> Hash[String, Array[untyped]]
20
+ def connect_cost: (Integer r_id, Integer l_id) -> Integer
21
+ def viterbi: (::Suika::Lattice lattice) -> Array[String]
22
+ end
23
+ end
data/suika.gemspec CHANGED
@@ -12,11 +12,10 @@ Gem::Specification.new do |spec|
12
12
  spec.description = 'Suika is a Japanese morphological analyzer written in pure Ruby.'
13
13
  spec.homepage = 'https://github.com/yoshoku/suika'
14
14
  spec.license = 'BSD-3-Clause'
15
- spec.required_ruby_version = Gem::Requirement.new('>= 2.3.0')
16
15
 
17
16
  spec.metadata['homepage_uri'] = spec.homepage
18
17
  spec.metadata['source_code_uri'] = spec.homepage
19
- spec.metadata['changelog_uri'] = 'https://github.com/yoshoku/magro/blob/master/CHANGELOG.md'
18
+ spec.metadata['changelog_uri'] = 'https://github.com/yoshoku/suika/blob/main/CHANGELOG.md'
20
19
  spec.metadata['documentation_uri'] = 'https://rubydoc.info/gems/suika'
21
20
 
22
21
  # Specify which files should be added to the gem when it is released.
@@ -28,5 +27,5 @@ Gem::Specification.new do |spec|
28
27
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
29
28
  spec.require_paths = ['lib']
30
29
 
31
- spec.add_runtime_dependency 'rambling-trie', '~> 2.1'
30
+ spec.add_runtime_dependency 'dartsclone', '>= 0.2.0'
32
31
  end
metadata CHANGED
@@ -1,29 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: suika
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshoku
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-07-05 00:00:00.000000000 Z
11
+ date: 2021-06-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: rambling-trie
14
+ name: dartsclone
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - "~>"
17
+ - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: '2.1'
19
+ version: 0.2.0
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - "~>"
24
+ - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: '2.1'
26
+ version: 0.2.0
27
27
  description: Suika is a Japanese morphological analyzer written in pure Ruby.
28
28
  email:
29
29
  - yoshoku@outlook.com
@@ -31,10 +31,12 @@ executables: []
31
31
  extensions: []
32
32
  extra_rdoc_files: []
33
33
  files:
34
+ - ".coveralls.yml"
35
+ - ".github/workflows/build.yml"
36
+ - ".github/workflows/coverage.yml"
34
37
  - ".gitignore"
35
38
  - ".rspec"
36
39
  - ".rubocop.yml"
37
- - ".travis.yml"
38
40
  - CHANGELOG.md
39
41
  - CODE_OF_CONDUCT.md
40
42
  - Gemfile
@@ -42,14 +44,21 @@ files:
42
44
  - NOTICE.txt
43
45
  - README.md
44
46
  - Rakefile
47
+ - Steepfile
45
48
  - bin/console
46
49
  - bin/setup
47
- - dict/ipadic.gz
50
+ - dict/sysdic.gz
48
51
  - lib/suika.rb
49
52
  - lib/suika/char_def.rb
50
53
  - lib/suika/lattice.rb
54
+ - lib/suika/node.rb
51
55
  - lib/suika/tagger.rb
52
56
  - lib/suika/version.rb
57
+ - sig/suika.rbs
58
+ - sig/suika/char_def.rbs
59
+ - sig/suika/lattice.rbs
60
+ - sig/suika/node.rbs
61
+ - sig/suika/tagger.rbs
53
62
  - suika.gemspec
54
63
  homepage: https://github.com/yoshoku/suika
55
64
  licenses:
@@ -57,9 +66,9 @@ licenses:
57
66
  metadata:
58
67
  homepage_uri: https://github.com/yoshoku/suika
59
68
  source_code_uri: https://github.com/yoshoku/suika
60
- changelog_uri: https://github.com/yoshoku/magro/blob/master/CHANGELOG.md
69
+ changelog_uri: https://github.com/yoshoku/suika/blob/main/CHANGELOG.md
61
70
  documentation_uri: https://rubydoc.info/gems/suika
62
- post_install_message:
71
+ post_install_message:
63
72
  rdoc_options: []
64
73
  require_paths:
65
74
  - lib
@@ -67,15 +76,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
67
76
  requirements:
68
77
  - - ">="
69
78
  - !ruby/object:Gem::Version
70
- version: 2.3.0
79
+ version: '0'
71
80
  required_rubygems_version: !ruby/object:Gem::Requirement
72
81
  requirements:
73
82
  - - ">="
74
83
  - !ruby/object:Gem::Version
75
84
  version: '0'
76
85
  requirements: []
77
- rubygems_version: 3.1.2
78
- signing_key:
86
+ rubygems_version: 3.1.6
87
+ signing_key:
79
88
  specification_version: 4
80
89
  summary: Suika is a Japanese morphological analyzer written in pure Ruby.
81
90
  test_files: []
data/.travis.yml DELETED
@@ -1,6 +0,0 @@
1
- ---
2
- language: ruby
3
- cache: bundler
4
- rvm:
5
- - 2.7.0
6
- before_install: gem install bundler -v 2.1.2