suika 0.1.2 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1fb292a3b36ec1dde8f93da624092876220ec33fa49ff635cbaff93fa9337137
4
- data.tar.gz: 0e2c0cc53c6f25099dfa455f85d618491819dd98cc33047a6aebd4fb5858d13a
3
+ metadata.gz: 148e229070959a89197febf9bf9eabfdbe941f3c1cda66b75f87afb9371436c1
4
+ data.tar.gz: c6d4fa8c654144ad39e19ff23d63161ea193bd8760eb654635d36666bed6f2dd
5
5
  SHA512:
6
- metadata.gz: 1f54af2d9955f7c562df25d0d724c0b8f1460959b4d35d835c9412a7897e329a95016a7bc151c7125a8b4be9788a18dc7265cc4b4355ad7c1a5b1c828af0f622
7
- data.tar.gz: a7b1a9a484b51d03ec92a09a2e4ee1a24e1cfcbaafe27f8cd7cbc0b7b02989c0960ebc1799974816adf6399e2e21fa02e06cad1fb8a584db4238e08f93baf93b
6
+ metadata.gz: dbe9535910050678c51c3ff2d95118959d3b4cdeb4fffb2fb405ad5871258f527d1fa3f36df59706b3cf3d8c6265f19bd5844d778273fc369fc763eec293cf89
7
+ data.tar.gz: 063ed20722d52ac97b4993093a60ad866fd744c8e74361b3934c1f96082523da510f8d7adcbb46e238aa03ba7dbf53539180215aa5475488294a77d92ea8633e
data/.coveralls.yml ADDED
@@ -0,0 +1 @@
1
+ service_name: github-ci
@@ -0,0 +1,21 @@
1
+ name: build
2
+
3
+ on: [push, pull_request]
4
+
5
+ jobs:
6
+ build:
7
+ runs-on: ubuntu-latest
8
+ strategy:
9
+ matrix:
10
+ ruby: [ '2.6', '2.7', '3.0' ]
11
+ steps:
12
+ - uses: actions/checkout@v2
13
+ - name: Set up Ruby ${{ matrix.ruby }}
14
+ uses: actions/setup-ruby@v1
15
+ with:
16
+ ruby-version: ${{ matrix.ruby }}
17
+ - name: Build and test with Rake
18
+ run: |
19
+ gem install --no-document bundler
20
+ bundle install --jobs 4 --retry 3
21
+ bundle exec rake
@@ -0,0 +1,26 @@
1
+ name: coverage
2
+
3
+ on:
4
+ push:
5
+ branches: [ main ]
6
+ pull_request:
7
+ branches: [ main ]
8
+
9
+ jobs:
10
+ coverage:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v2
14
+ - name: Set up Ruby 2.7
15
+ uses: actions/setup-ruby@v1
16
+ with:
17
+ ruby-version: '2.7'
18
+ - name: Build and test with Rake
19
+ run: |
20
+ gem install --no-document bundler
21
+ bundle install --jobs 4 --retry 3
22
+ bundle exec rake
23
+ - name: Coveralls GitHub Action
24
+ uses: coverallsapp/github-action@v1.1.2
25
+ with:
26
+ github-token: ${{ secrets.GITHUB_TOKEN }}
data/.rubocop.yml CHANGED
@@ -3,7 +3,8 @@ require:
3
3
  - rubocop-rspec
4
4
 
5
5
  AllCops:
6
- TargetRubyVersion: 2.4
6
+ NewCops: enable
7
+ TargetRubyVersion: 2.5
7
8
  DisplayCopNames: true
8
9
  DisplayStyleGuide: true
9
10
  Exclude:
data/CHANGELOG.md CHANGED
@@ -1,8 +1,48 @@
1
- # 0.1.2
1
+ ## 0.3.1
2
+ - Fix Tagger's inspect method not to expand instance variables for object creation on irb and pry.
3
+
4
+ ## 0.3.0
5
+
6
+ ### Changes
7
+ - Add type declaration files.
8
+ - Refactor to avoid assigning null to variables.
9
+ - Fix some configuration files.
10
+
11
+
12
+ ## 0.2.0
13
+
14
+ ### Breaking Change
15
+ - Change to use dartsclone for trie library.
16
+
17
+
18
+ ## 0.1.4
19
+
20
+ ### Bug Fixes
21
+ - Fix CharDef.char_type to return 'DEFAULT' when unknown character code is given.
22
+
23
+ ### Features
24
+ - Add character code of square era name Reiwa.
25
+
26
+ ## 0.1.3
27
+
28
+ ### Bug Fixes
29
+ - Fix unknown word processing.
30
+
31
+ ### Changes
32
+ - Remove redundant spaces from output.
33
+
34
+
35
+ ## 0.1.2
36
+
37
+ ### Bug Fixes
2
38
  - Fix local variable typo in Tagger.parse.
3
39
 
4
- # 0.1.1
40
+
41
+ ## 0.1.1
42
+
43
+ ### Bug Fixes
5
44
  - Fix specification of class in CharDef.char_type.
6
45
 
7
- # 0.1.0
46
+
47
+ ## 0.1.0
8
48
  - First release.
data/Gemfile CHANGED
@@ -5,5 +5,9 @@ source 'https://rubygems.org'
5
5
  # Specify your gem's dependencies in suika.gemspec
6
6
  gemspec
7
7
 
8
- gem 'rake', '~> 12.0'
8
+ gem 'rake', '~> 13.0'
9
9
  gem 'rspec', '~> 3.0'
10
+ gem 'simplecov', '~> 0.21'
11
+ gem 'simplecov-lcov', '~> 0.8'
12
+ gem 'rbs', '~> 1.2'
13
+ gem 'steep', '~> 0.44'
data/LICENSE.txt CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2020 Atsushi Tatsuma
1
+ Copyright (c) 2020-2021 Atsushi Tatsuma
2
2
  All rights reserved.
3
3
 
4
4
  Redistribution and use in source and binary forms, with or without
data/README.md CHANGED
@@ -1,7 +1,9 @@
1
1
  # Suika
2
2
 
3
+ [![Build Status](https://github.com/yoshoku/suika/workflows/build/badge.svg)](https://github.com/yoshoku/suika/actions?query=workflow%3Abuild)
4
+ [![Coverage Status](https://coveralls.io/repos/github/yoshoku/suika/badge.svg?branch=main)](https://coveralls.io/github/yoshoku/suika?branch=main)
3
5
  [![Gem Version](https://badge.fury.io/rb/suika.svg)](https://badge.fury.io/rb/suika)
4
- [![BSD 3-Clause License](https://img.shields.io/badge/License-BSD%203--Clause-orange.svg)](https://github.com/yoshoku/suika/blob/master/LICENSE.txt)
6
+ [![BSD 3-Clause License](https://img.shields.io/badge/License-BSD%203--Clause-orange.svg)](https://github.com/yoshoku/suika/blob/main/LICENSE.txt)
5
7
  [![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://rubydoc.info/gems/suika)
6
8
 
7
9
  Suika 🍉 is a Japanese morphological analyzer written in pure Ruby.
@@ -30,13 +32,13 @@ require 'suika'
30
32
  tagger = Suika::Tagger.new
31
33
  tagger.parse('すもももももももものうち').each { |token| puts token }
32
34
 
33
- # すもも 名詞, 一般, *, *, *, *, すもも, スモモ, スモモ
34
- # も 助詞, 係助詞, *, *, *, *, も, モ, モ
35
- # もも 名詞, 一般, *, *, *, *, もも, モモ, モモ
36
- # も 助詞, 係助詞, *, *, *, *, も, モ, モ
37
- # もも 名詞, 一般, *, *, *, *, もも, モモ, モモ
38
- # の 助詞, 連体化, *, *, *, *, の, ノ, ノ
39
- # うち 名詞, 非自立, 副詞可能, *, *, *, うち, ウチ, ウチ
35
+ # すもも 名詞,一般,*,*,*,*,すもも,スモモ,スモモ
36
+ # も 助詞,係助詞,*,*,*,*,も,モ,モ
37
+ # もも 名詞,一般,*,*,*,*,もも,モモ,モモ
38
+ # も 助詞,係助詞,*,*,*,*,も,モ,モ
39
+ # もも 名詞,一般,*,*,*,*,もも,モモ,モモ
40
+ # の 助詞,連体化,*,*,*,*,の,ノ,ノ
41
+ # うち 名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ
40
42
  ```
41
43
 
42
44
  Since the Tagger class loads the binary dictionary at initialization, it is recommended to reuse the instance.
@@ -51,17 +53,36 @@ sentences.each do |sentence|
51
53
  end
52
54
  ```
53
55
 
56
+ ## Test
57
+ Suika was able to parse all sentences in the [Livedoor news corpus](https://www.rondhuit.com/download.html#ldcc)
58
+ without any error.
59
+
60
+ ```ruby
61
+ require 'suika'
62
+
63
+ tagger = Suika::Tagger.new
64
+
65
+ Dir.glob('ldcc-20140209/text/*/*.txt').each do |filename|
66
+ File.foreach(filename) do |sentence|
67
+ sentence.strip!
68
+ puts tagger.parse(sentence) unless sentence.empty?
69
+ end
70
+ end
71
+ ```
72
+
73
+ ![suika_test](https://user-images.githubusercontent.com/5562409/90264778-8f593f80-de8c-11ea-81f1-20831e3c8b12.gif)
74
+
54
75
  ## Contributing
55
76
 
56
77
  Bug reports and pull requests are welcome on GitHub at https://github.com/yoshoku/suika.
57
- This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/yoshoku/suika/blob/master/CODE_OF_CONDUCT.md).
78
+ This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/yoshoku/suika/blob/main/CODE_OF_CONDUCT.md).
58
79
 
59
80
  ## License
60
81
 
61
82
  The gem is available as open source under the terms of the [BSD-3-Clause License](https://opensource.org/licenses/BSD-3-Clause).
62
83
  In addition, the gem includes binary data generated from mecab-ipadic.
63
- The details of the license can be found in [LICENSE.txt](https://github.com/yoshoku/suika/blob/master/LICENSE.txt)
64
- and [NOTICE.txt](https://github.com/yoshoku/suika/blob/master/NOTICE.txt).
84
+ The details of the license can be found in [LICENSE.txt](https://github.com/yoshoku/suika/blob/main/LICENSE.txt)
85
+ and [NOTICE.txt](https://github.com/yoshoku/suika/blob/main/NOTICE.txt).
65
86
 
66
87
  ## Respect
67
88
 
@@ -74,4 +95,4 @@ Janome, a morphological analyzer written in scripting language, gives me the cou
74
95
 
75
96
  ## Code of Conduct
76
97
 
77
- Everyone interacting in the Suika project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/yoshoku/suika/blob/master/CODE_OF_CONDUCT.md).
98
+ Everyone interacting in the Suika project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/yoshoku/suika/blob/main/CODE_OF_CONDUCT.md).
data/Rakefile CHANGED
@@ -1,6 +1,79 @@
1
- require "bundler/gem_tasks"
2
- require "rspec/core/rake_task"
1
+ require 'bundler/gem_tasks'
2
+ require 'rspec/core/rake_task'
3
+
4
+ require 'csv'
5
+ require 'dartsclone'
6
+ require 'nkf'
7
+ require 'rubygems/package'
8
+ require 'zlib'
3
9
 
4
10
  RSpec::Core::RakeTask.new(:spec)
5
11
 
6
12
  task :default => :spec
13
+
14
+ desc 'Build suika system dictionary'
15
+ task :dictionary do
16
+ base_dir = "#{__dir__}/dict/mecab-ipadic-2.7.0-20070801"
17
+ unless File.directory?(base_dir)
18
+ puts "Download mecab-ipadic file and expand that under dict directory: #{__dir__}/dict/mecab-ipadic-2.7.0-20070801"
19
+ puts
20
+ puts 'Example:'
21
+ puts 'wget -O dict/mecab-ipadic.tgz https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM'
22
+ puts 'cd dict'
23
+ puts 'tar xzf mecab-ipadic.tgz'
24
+ puts 'cd ../'
25
+ next # exit
26
+ end
27
+
28
+ File.open("#{__dir__}/dict/mecab-ipadic-2.7.0-20070801/Reiwa.csv", 'w') do |f|
29
+ f.puts('令和,1288,1288,5904,名詞,固有名詞,一般,*,*,*,令和,レイワ,レイワ')
30
+ end
31
+
32
+ unknowns = {}
33
+ File.open("#{base_dir}/unk.def") do |f|
34
+ f.each_line do |line|
35
+ row = NKF.nkf('-w', line.chomp).split(',')
36
+ unknowns[row[0]] ||= []
37
+ unknowns[row[0]] << [row[1].to_i, row[2].to_i, row[3].to_i, *row[4..-1]]
38
+ end
39
+ end
40
+
41
+ dict = {}
42
+ Dir.glob("#{base_dir}/*.csv").each do |filename|
43
+ File.open(filename) do |f|
44
+ f.each_line do |line|
45
+ row = NKF.nkf('-w', line.chomp).split(',')
46
+ dict[row[0]] ||= []
47
+ dict[row[0]] << [row[1].to_i, row[2].to_i, row[3].to_i, *row[4..-1]]
48
+ end
49
+ end
50
+ end
51
+
52
+ da = DartsClone::DoubleArray.new
53
+ words = dict.keys.sort
54
+ da.build(words)
55
+ features = words.map { |w| dict[w] }
56
+
57
+ concosts = nil
58
+ File.open("#{base_dir}/matrix.def") do |f|
59
+ n_entries = f.readline.chomp.split.map(&:to_i).first
60
+ concosts = Array.new(n_entries) { Array.new(n_entries) }
61
+ f.each_line do |line|
62
+ row, col, cost = line.chomp.split.map(&:to_i)
63
+ concosts[row][col] = cost
64
+ end
65
+ end
66
+
67
+ ipadic = {
68
+ trie: da.get_array,
69
+ features: features,
70
+ unknowns: unknowns,
71
+ concosts: concosts
72
+ }
73
+
74
+ Zlib::GzipWriter.open("#{__dir__}/dict/sysdic.gz", Zlib::BEST_SPEED) { |f| f.write(Marshal.dump(ipadic)) }
75
+
76
+ puts 'The system dictionary has been successfully built:'
77
+ puts "#{__dir__}/dict/sysdic.gz"
78
+ puts Digest::SHA1.file("#{__dir__}/dict/sysdic.gz").to_s
79
+ end
data/Steepfile ADDED
@@ -0,0 +1,20 @@
1
+ target :lib do
2
+ signature "sig"
3
+ #
4
+ check "lib" # Directory name
5
+ # check "Gemfile" # File name
6
+ # check "app/models/**/*.rb" # Glob
7
+ # # ignore "lib/templates/*.rb"
8
+ #
9
+ # # library "pathname", "set" # Standard libraries
10
+ library "dartsclone" # Gems
11
+ end
12
+
13
+ # target :spec do
14
+ # signature "sig", "sig-private"
15
+ #
16
+ # check "spec"
17
+ #
18
+ # # library "pathname", "set" # Standard libraries
19
+ # # library "rspec"
20
+ # end
Binary file
data/lib/suika.rb CHANGED
@@ -2,5 +2,6 @@
2
2
 
3
3
  require 'suika/version'
4
4
  require 'suika/char_def'
5
+ require 'suika/node'
5
6
  require 'suika/lattice'
6
7
  require 'suika/tagger'
@@ -5,10 +5,11 @@ module Suika
5
5
  class CharDef
6
6
  # @!visibility private
7
7
  def self.char_type(ch)
8
- code = ch.unpack1('U*')
9
- CHAR_TYPES.find do |ctype|
10
- Object.const_get("::Suika::CharDef::#{ctype}").any? { |r| r.include?(code) }
8
+ ch_code = ch.unpack1('U*')
9
+ ch_type = CHAR_TYPES.find do |ct|
10
+ Object.const_get("::Suika::CharDef::#{ct}").any? { |r| r.include?(ch_code) }
11
11
  end
12
+ ch_type || 'DEFAULT'
12
13
  end
13
14
 
14
15
  # @!visibility private
@@ -16,39 +17,41 @@ module Suika
16
17
  CHAR_CATEGORY[char_type(ch)]
17
18
  end
18
19
 
20
+ MAX_GROUPING_SIZE = 24
21
+
19
22
  CHAR_CATEGORY = {
20
23
  'DEFAULT' => {
21
- invoke: 0, group: 1, length: 0
24
+ invoke: false, group: true, length: 0
22
25
  },
23
26
  'SPACE' => {
24
- invoke: 0, group: 1, length: 0
27
+ invoke: false, group: true, length: 0
25
28
  },
26
29
  'KANJI' => {
27
- invoke: 0, group: 0, length: 2
30
+ invoke: false, group: false, length: 2
28
31
  },
29
32
  'SYMBOL' => {
30
- invoke: 1, group: 1, length: 0
33
+ invoke: true, group: true, length: 0
31
34
  },
32
35
  'NUMERIC' => {
33
- invoke: 1, group: 1, length: 0
36
+ invoke: true, group: true, length: 0
34
37
  },
35
38
  'ALPHA' => {
36
- invoke: 1, group: 1, length: 0
39
+ invoke: true, group: true, length: 0
37
40
  },
38
41
  'HIRAGANA' => {
39
- invoke: 0, group: 1, length: 2
42
+ invoke: false, group: true, length: 2
40
43
  },
41
44
  'KATAKANA' => {
42
- invoke: 1, group: 1, length: 2
45
+ invoke: true, group: true, length: 2
43
46
  },
44
47
  'KANJINUMERIC' => {
45
- invoke: 1, group: 1, length: 0
48
+ invoke: true, group: true, length: 0
46
49
  },
47
50
  'GREEK' => {
48
- invoke: 1, group: 1, length: 0
51
+ invoke: true, group: true, length: 0
49
52
  },
50
53
  'CYRILLIC' => {
51
- invoke: 1, group: 1, length: 0
54
+ invoke: true, group: true, length: 0
52
55
  }
53
56
  }.freeze
54
57
 
@@ -117,6 +120,7 @@ module Suika
117
120
  0x2B00..0x2BFF, # Miscellaneous Symbols and Arrows
118
121
  0x2A00..0x2AFF, # Supplemental Mathematical Operators
119
122
  0x3300..0x33FF,
123
+ 0x32FF..0x32FF, # Square era name REIWA
120
124
  0x3200..0x32FE, # ENclosed CJK Letters and Months
121
125
  0x3000..0x303F, # CJK Symbol and Punctuation
122
126
  0xFE30..0xFE4F, # CJK Compatibility Forms
data/lib/suika/lattice.rb CHANGED
@@ -4,8 +4,6 @@ module Suika
4
4
  # @!visibility private
5
5
  class Lattice
6
6
  # @!visibility private
7
- Node = Struct.new(:surface, :min_cost, :min_prev, :left_id, :right_id, :cost, :attrs, keyword_init: true)
8
-
9
7
  attr_reader :begin_nodes, :end_nodes, :length
10
8
 
11
9
  # @!visibility private
@@ -13,17 +11,17 @@ module Suika
13
11
  @length = length
14
12
  @begin_nodes = Array.new(length + 1) { [] }
15
13
  @end_nodes = Array.new(length + 1) { [] }
16
- bos = Node.new(surface: 'BOS', left_id: 0, right_id: 0, cost: 0, attrs: [])
17
- @end_nodes[0].append(bos)
18
- eos = Node.new(surface: 'EOS', left_id: 0, right_id: 0, cost: 0, attrs: [])
19
- @begin_nodes[length].append(eos)
14
+ bos = Node.new(surface: 'BOS', unknown: false, left_id: 0, right_id: 0, cost: 0, attrs: [])
15
+ @end_nodes[0].push(bos)
16
+ eos = Node.new(surface: 'EOS', unknown: false, left_id: 0, right_id: 0, cost: 0, attrs: [])
17
+ @begin_nodes[length].push(eos)
20
18
  end
21
19
 
22
20
  # @!visibility private
23
- def insert(begin_id, end_id, surface, left_id, right_id, cost, attrs)
24
- node = Node.new(surface: surface, left_id: left_id, right_id: right_id, cost: cost, attrs: attrs)
25
- @begin_nodes[begin_id].append(node)
26
- @end_nodes[end_id].append(node)
21
+ def insert(begin_id, end_id, surface, unknown, left_id, right_id, cost, attrs)
22
+ node = Node.new(surface: surface, unknown: unknown, left_id: left_id, right_id: right_id, cost: cost, attrs: attrs)
23
+ @begin_nodes[begin_id].push(node)
24
+ @end_nodes[end_id].push(node)
27
25
  end
28
26
  end
29
27
  end
data/lib/suika/node.rb ADDED
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Suika
4
+ # @!visibility private
5
+ class Node
6
+ # @!visibility private
7
+ attr_accessor :surface, :unknown, :min_cost, :min_prev, :left_id, :right_id, :cost, :attrs
8
+
9
+ # @!visibility private
10
+ def initialize(surface: '', unknown: false, min_cost: 0, min_prev: nil, left_id: 0, right_id: 0, cost: 0, attrs: [])
11
+ @surface = surface
12
+ @unknown = unknown
13
+ @min_cost = min_cost
14
+ @min_prev = min_prev
15
+ @left_id = left_id
16
+ @right_id = right_id
17
+ @cost = cost
18
+ @attrs = attrs
19
+ end
20
+ end
21
+ end
data/lib/suika/tagger.rb CHANGED
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'rambling-trie'
3
+ require 'dartsclone'
4
+ require 'rubygems/package'
4
5
  require 'zlib'
5
6
 
6
7
  module Suika
@@ -12,22 +13,22 @@ module Suika
12
13
  # tagger = Suika::Tagger.new
13
14
  # tagger.parse('すもももももももものうち').each { |token| puts token }
14
15
  #
15
- # # すもも 名詞, 一般, *, *, *, *, すもも, スモモ, スモモ
16
- # # も 助詞, 係助詞, *, *, *, *, も, モ, モ
17
- # # もも 名詞, 一般, *, *, *, *, もも, モモ, モモ
18
- # # も 助詞, 係助詞, *, *, *, *, も, モ, モ
19
- # # もも 名詞, 一般, *, *, *, *, もも, モモ, モモ
20
- # # の 助詞, 連体化, *, *, *, *, の, ノ, ノ
21
- # # うち 名詞, 非自立, 副詞可能, *, *, *, うち, ウチ, ウチ
16
+ # # すもも 名詞,一般,*,*,*,*,すもも,スモモ,スモモ
17
+ # # も 助詞,係助詞,*,*,*,*,も,モ,モ
18
+ # # もも 名詞,一般,*,*,*,*,もも,モモ,モモ
19
+ # # も 助詞,係助詞,*,*,*,*,も,モ,モ
20
+ # # もも 名詞,一般,*,*,*,*,もも,モモ,モモ
21
+ # # の 助詞,連体化,*,*,*,*,の,ノ,ノ
22
+ # # うち 名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ
22
23
  #
23
24
  class Tagger
24
25
  # Create a new tagger by loading the built-in binary dictionary.
25
26
  def initialize
26
- ipadic = Marshal.load(Zlib::GzipReader.open(__dir__ + '/../../dict/ipadic.gz', &:read))
27
- @trie = ipadic[:trie]
28
- @dictionary = ipadic[:dictionary]
29
- @unknown_dictionary = ipadic[:unknown_dictionary]
30
- @cost_mat = ipadic[:cost_matrix]
27
+ raise IOError, 'SHA1 digest of dictionary file does not match.' unless DICTIONARY_KEY == Digest::SHA1.file(DICTIONARY_PATH).to_s
28
+
29
+ @sysdic = Marshal.load(Zlib::GzipReader.open(DICTIONARY_PATH, &:read))
30
+ @trie = DartsClone::DoubleArray.new
31
+ @trie.set_array(@sysdic[:trie])
31
32
  end
32
33
 
33
34
  # Parse the given sentence.
@@ -39,57 +40,75 @@ module Suika
39
40
  terminal = sentence.length
40
41
 
41
42
  while start < terminal
42
- word = sentence[start]
43
- pos = start
44
- is_unknown = true
45
- while @trie.match?(word) && pos < terminal
46
- if @dictionary.key?(word)
47
- @dictionary[word].each do |el|
48
- lattice.insert(start, start + word.length,
49
- word, el[0].to_i, el[1].to_i, el[2].to_i,
50
- el[3..-1])
43
+ step = terminal - start
44
+
45
+ query = sentence[start..-1] || ''
46
+ result = trie.common_prefix_search(query)
47
+ unless result.empty?
48
+ words, indices = result
49
+ unless words.empty?
50
+ step = INT_MAX
51
+ words.each_with_index do |word, i|
52
+ features[indices[i]].each do |el|
53
+ lattice.insert(start, start + word.length, word, false, el[0].to_i, el[1].to_i, el[2].to_i, el[3..-1])
54
+ end
55
+ step = word.length if word.length < step
51
56
  end
52
- is_unknown = false
53
57
  end
54
- pos += 1
55
- word = sentence[start..pos]
56
- end
57
-
58
- unless is_unknown
59
- start += 1
60
- next
61
58
  end
62
59
 
63
- word = sentence[start]
64
- char_type = CharDef.char_type(sentence[start])
65
- char_cate = CharDef.char_category(sentence[start])
66
- if char_cate[:group] == 1
67
- unk_terminal = char_cate[:length].zero? ? terminal : start + char_cate[:length]
60
+ word = sentence[start] || ''
61
+ char_cate = CharDef.char_category(sentence[start] || '')
62
+ char_type = CharDef.char_type(sentence[start] || '')
63
+ if char_cate[:invoke]
64
+ unk_terminal = start + (char_cate[:group] ? CharDef::MAX_GROUPING_SIZE : char_cate[:length])
65
+ unk_terminal = terminal if terminal < unk_terminal
68
66
  pos = start + 1
69
- while pos < unk_terminal && char_type == CharDef.char_type(sentence[pos])
70
- word << sentence[pos]
67
+ while pos < unk_terminal && char_type == CharDef.char_type(sentence[pos] || '')
68
+ word << (sentence[pos] || '')
71
69
  pos += 1
72
70
  end
73
71
  end
74
- @unknown_dictionary[char_type].each do |el|
75
- lattice.insert(start, start + word.length,
76
- word, el[0].to_i, el[1].to_i, el[2].to_i,
77
- el[3..-1])
72
+ unknowns[char_type].each do |el|
73
+ lattice.insert(start, start + word.length, word, true,
74
+ el[0].to_i, el[1].to_i, el[2].to_i, el[3..-1])
78
75
  end
79
- start += 1
76
+ step = word.length if word.length < step
77
+
78
+ start += step
80
79
  end
81
80
 
82
81
  viterbi(lattice)
83
82
  end
84
83
 
84
+ def inspect
85
+ to_s
86
+ end
87
+
85
88
  private
86
89
 
90
+ DICTIONARY_PATH = "#{__dir__}/../../dict/sysdic.gz"
91
+ DICTIONARY_KEY = 'eb921bf5e67f5733188527b21adbf9dabdda0c7a'
87
92
  INT_MAX = 2**(([42].pack('i').size * 16) - 2) - 1
88
93
 
89
- private_constant :INT_MAX
94
+ private_constant :DICTIONARY_PATH, :DICTIONARY_KEY, :INT_MAX
95
+
96
+ attr_reader :trie
97
+
98
+ def features
99
+ @sysdic[:features]
100
+ end
101
+
102
+ def unknowns
103
+ @sysdic[:unknowns]
104
+ end
105
+
106
+ def connect_cost(r_id, l_id)
107
+ @sysdic[:concosts][r_id][l_id]
108
+ end
90
109
 
91
110
  def viterbi(lattice)
92
- bos = lattice.end_nodes[0].first
111
+ bos = lattice.end_nodes[0][0]
93
112
  bos.min_cost = 0
94
113
  bos.min_prev = nil
95
114
 
@@ -98,7 +117,7 @@ module Suika
98
117
  rnode.min_cost = INT_MAX
99
118
  rnode.min_prev = nil
100
119
  lattice.end_nodes[n].each do |lnode|
101
- cost = lnode.min_cost + @cost_mat[lnode.right_id][rnode.left_id] + rnode.cost
120
+ cost = lnode.min_cost + connect_cost(lnode.right_id, rnode.left_id) + rnode.cost
102
121
  if cost < rnode.min_cost
103
122
  rnode.min_cost = cost
104
123
  rnode.min_prev = lnode
@@ -107,13 +126,14 @@ module Suika
107
126
  end
108
127
  end
109
128
 
110
- eos = lattice.begin_nodes[-1].first
129
+ eos = lattice.begin_nodes[-1][0]
111
130
  prev_node = eos.min_prev
112
131
  res = []
113
132
  until prev_node.nil?
114
- res.append("#{prev_node.surface}\t#{prev_node.attrs.join(', ')}") if prev_node.surface != 'BOS' && prev_node.surface != 'EOS'
133
+ res.push("#{prev_node.surface}\t#{prev_node.attrs.join(',')}") if prev_node.surface != 'BOS' && prev_node.surface != 'EOS'
115
134
  prev_node = prev_node.min_prev
116
135
  end
136
+
117
137
  res.reverse
118
138
  end
119
139
  end
data/lib/suika/version.rb CHANGED
@@ -3,5 +3,5 @@
3
3
  # Suika is a Japanese morphological analyzer written in pure Ruby.
4
4
  module Suika
5
5
  # The version of Suika you are using.
6
- VERSION = '0.1.2'
6
+ VERSION = '0.3.1'
7
7
  end
data/sig/suika.rbs ADDED
@@ -0,0 +1,3 @@
1
+ module Suika
2
+ VERSION: String
3
+ end
@@ -0,0 +1,25 @@
1
+ module Suika
2
+ class CharDef
3
+ def self.char_type: (String ch) -> String
4
+ def self.char_category: (String ch) -> { invoke: bool, group: bool, length: Integer }
5
+
6
+ MAX_GROUPING_SIZE: Integer
7
+
8
+ private
9
+
10
+ #CHAR_CATEGORY: Hash[String, { invoke: bool, group: bool, length: Integer }]
11
+ CHAR_CATEGORY: Hash[String, untyped]
12
+ CHAR_TYPES: Array[String]
13
+
14
+ SPACE: Array[Range[Integer]]
15
+ NUMERIC: Array[Range[Integer]]
16
+ SYMBOL: Array[Range[Integer]]
17
+ ALPHA: Array[Range[Integer]]
18
+ CYRILLIC: Array[Range[Integer]]
19
+ GREEK: Array[Range[Integer]]
20
+ HIRAGANA: Array[Range[Integer]]
21
+ KATAKANA: Array[Range[Integer]]
22
+ KANJI: Array[Range[Integer]]
23
+ KANJINUMERIC: Array[Range[Integer]]
24
+ end
25
+ end
@@ -0,0 +1,11 @@
1
+ module Suika
2
+ class Lattice
3
+ attr_reader begin_nodes: Array[Array[::Suika::Node]]
4
+ attr_reader end_nodes: Array[Array[::Suika::Node]]
5
+ attr_reader length: Integer
6
+
7
+ def initialize: (Integer length) -> void
8
+ def insert: (Integer begin_id, Integer end_id, String surface, bool unknown,
9
+ Integer left_id, Integer right_id, Integer cost, Array[String] attrs) -> void
10
+ end
11
+ end
@@ -0,0 +1,18 @@
1
+ module Suika
2
+ class Node
3
+ attr_accessor surface: String
4
+ attr_accessor unknown: bool
5
+ attr_accessor min_cost: Integer
6
+ # attr_accessor min_prev: ::Suika::Node?
7
+ attr_accessor min_prev: untyped
8
+ attr_accessor left_id: Integer
9
+ attr_accessor right_id: Integer
10
+ attr_accessor cost: Integer
11
+ attr_accessor attrs: Array[String]
12
+
13
+ def initialize: (?surface: String surface, ?unknown: bool unknown,
14
+ ?min_cost: Integer min_cost, ?min_prev: ::Suika::Node? min_prev,
15
+ ?left_id: ::Integer left_id, ?right_id: ::Integer right_id,
16
+ ?cost: ::Integer cost, ?attrs: Array[String] attrs) -> void
17
+ end
18
+ end
@@ -0,0 +1,24 @@
1
+ module Suika
2
+ class Tagger
3
+ def initialize: () -> void
4
+ def parse: (String sentence) -> Array[String]
5
+ def inspect: () -> String
6
+
7
+ private
8
+
9
+ DICTIONARY_PATH: String
10
+ DICTIONARY_KEY: String
11
+ INT_MAX: untyped
12
+
13
+ attr_reader trie: ::DartsClone::DoubleArray
14
+
15
+ # type feature = [Integer, Integer, Integer, String, String, String, String, String, String, String]
16
+
17
+ # def features: () -> Array[Array[feature]]
18
+ def features: () -> Array[Array[untyped]]
19
+ # def unknowns: () -> Hash[String, Array[feature]]
20
+ def unknowns: () -> Hash[String, Array[untyped]]
21
+ def connect_cost: (Integer r_id, Integer l_id) -> Integer
22
+ def viterbi: (::Suika::Lattice lattice) -> Array[String]
23
+ end
24
+ end
data/suika.gemspec CHANGED
@@ -12,11 +12,10 @@ Gem::Specification.new do |spec|
12
12
  spec.description = 'Suika is a Japanese morphological analyzer written in pure Ruby.'
13
13
  spec.homepage = 'https://github.com/yoshoku/suika'
14
14
  spec.license = 'BSD-3-Clause'
15
- spec.required_ruby_version = Gem::Requirement.new('>= 2.3.0')
16
15
 
17
16
  spec.metadata['homepage_uri'] = spec.homepage
18
17
  spec.metadata['source_code_uri'] = spec.homepage
19
- spec.metadata['changelog_uri'] = 'https://github.com/yoshoku/magro/blob/master/CHANGELOG.md'
18
+ spec.metadata['changelog_uri'] = 'https://github.com/yoshoku/suika/blob/main/CHANGELOG.md'
20
19
  spec.metadata['documentation_uri'] = 'https://rubydoc.info/gems/suika'
21
20
 
22
21
  # Specify which files should be added to the gem when it is released.
@@ -28,5 +27,5 @@ Gem::Specification.new do |spec|
28
27
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
29
28
  spec.require_paths = ['lib']
30
29
 
31
- spec.add_runtime_dependency 'rambling-trie', '~> 2.1'
30
+ spec.add_runtime_dependency 'dartsclone', '>= 0.2.0'
32
31
  end
metadata CHANGED
@@ -1,29 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: suika
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshoku
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-07-05 00:00:00.000000000 Z
11
+ date: 2021-07-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: rambling-trie
14
+ name: dartsclone
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - "~>"
17
+ - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: '2.1'
19
+ version: 0.2.0
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - "~>"
24
+ - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: '2.1'
26
+ version: 0.2.0
27
27
  description: Suika is a Japanese morphological analyzer written in pure Ruby.
28
28
  email:
29
29
  - yoshoku@outlook.com
@@ -31,10 +31,12 @@ executables: []
31
31
  extensions: []
32
32
  extra_rdoc_files: []
33
33
  files:
34
+ - ".coveralls.yml"
35
+ - ".github/workflows/build.yml"
36
+ - ".github/workflows/coverage.yml"
34
37
  - ".gitignore"
35
38
  - ".rspec"
36
39
  - ".rubocop.yml"
37
- - ".travis.yml"
38
40
  - CHANGELOG.md
39
41
  - CODE_OF_CONDUCT.md
40
42
  - Gemfile
@@ -42,14 +44,21 @@ files:
42
44
  - NOTICE.txt
43
45
  - README.md
44
46
  - Rakefile
47
+ - Steepfile
45
48
  - bin/console
46
49
  - bin/setup
47
- - dict/ipadic.gz
50
+ - dict/sysdic.gz
48
51
  - lib/suika.rb
49
52
  - lib/suika/char_def.rb
50
53
  - lib/suika/lattice.rb
54
+ - lib/suika/node.rb
51
55
  - lib/suika/tagger.rb
52
56
  - lib/suika/version.rb
57
+ - sig/suika.rbs
58
+ - sig/suika/char_def.rbs
59
+ - sig/suika/lattice.rbs
60
+ - sig/suika/node.rbs
61
+ - sig/suika/tagger.rbs
53
62
  - suika.gemspec
54
63
  homepage: https://github.com/yoshoku/suika
55
64
  licenses:
@@ -57,9 +66,9 @@ licenses:
57
66
  metadata:
58
67
  homepage_uri: https://github.com/yoshoku/suika
59
68
  source_code_uri: https://github.com/yoshoku/suika
60
- changelog_uri: https://github.com/yoshoku/magro/blob/master/CHANGELOG.md
69
+ changelog_uri: https://github.com/yoshoku/suika/blob/main/CHANGELOG.md
61
70
  documentation_uri: https://rubydoc.info/gems/suika
62
- post_install_message:
71
+ post_install_message:
63
72
  rdoc_options: []
64
73
  require_paths:
65
74
  - lib
@@ -67,15 +76,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
67
76
  requirements:
68
77
  - - ">="
69
78
  - !ruby/object:Gem::Version
70
- version: 2.3.0
79
+ version: '0'
71
80
  required_rubygems_version: !ruby/object:Gem::Requirement
72
81
  requirements:
73
82
  - - ">="
74
83
  - !ruby/object:Gem::Version
75
84
  version: '0'
76
85
  requirements: []
77
- rubygems_version: 3.1.2
78
- signing_key:
86
+ rubygems_version: 3.2.21
87
+ signing_key:
79
88
  specification_version: 4
80
89
  summary: Suika is a Japanese morphological analyzer written in pure Ruby.
81
90
  test_files: []
data/.travis.yml DELETED
@@ -1,6 +0,0 @@
1
- ---
2
- language: ruby
3
- cache: bundler
4
- rvm:
5
- - 2.7.0
6
- before_install: gem install bundler -v 2.1.2