suika 0.1.4 → 0.2.0

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2aafd394952b381595891fab5652ed86c1c71a18d09befd1ffa2cd4bfdb05e8e
4
- data.tar.gz: 33385a74bac5c831a7116aa34ee31c5470154c928e971926b7f0f112b3cf1264
3
+ metadata.gz: bfb2e78e15c648ee309868bdfd3f386a66b1cff633cb546880132cdb9b8f3806
4
+ data.tar.gz: d398f4de11a4af80b7c62c4e468fa2e3f9393bbb2211e0f2e317a08ed05c73b5
5
5
  SHA512:
6
- metadata.gz: 80de1122cb9aae4a3313bb1ee95594435415dcacb197ef9bbf33a2695f57b9cc14c8f466aab19c8f487f1f4c3e4107ecd3851fad217921f61511655b5309fb21
7
- data.tar.gz: e0e9b76d04c847f6c1a8013db4b8a05ab9050dbde59d25c76d66fee078ac6b2ca230652b03ee61f3d97b7eebdc1139a34fb790fc5c954b072b55747e1933415a
6
+ metadata.gz: 6fea777a4229725a174aa0955bcf5f775cc6e319fa73cf6f204385b3a47d84997b74b8798f78269f442aa11dcf151b1319c079c315dfea103dcd7a148cf0b5c5
7
+ data.tar.gz: 17d2fd7f248c965b6d585542c4ec4cb90e87663f96b0522f06f1c3e94c55a18e59f08e4ab1e7724f9dd48a114c2c0f35a0499100fb8850a89d2822de87927988
@@ -3,7 +3,8 @@ require:
3
3
  - rubocop-rspec
4
4
 
5
5
  AllCops:
6
- TargetRubyVersion: 2.4
6
+ NewCops: enable
7
+ TargetRubyVersion: 2.5
7
8
  DisplayCopNames: true
8
9
  DisplayStyleGuide: true
9
10
  Exclude:
@@ -1,6 +1,12 @@
1
1
  ---
2
+ os: linux
3
+ dist: xenial
2
4
  language: ruby
3
5
  cache: bundler
4
6
  rvm:
5
- - 2.7.0
6
- before_install: gem install bundler -v 2.1.2
7
+ - '2.5'
8
+ - '2.6'
9
+ - '2.7'
10
+
11
+ before_install:
12
+ - gem install bundler -v 2.1.4
@@ -1,3 +1,9 @@
1
+ ## 0.2.0
2
+
3
+ ### Breaking Change
4
+ - Change to use dartsclone for trie library.
5
+
6
+
1
7
  ## 0.1.4
2
8
 
3
9
  ### Bug Fixes
@@ -1,15 +1,15 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- suika (0.1.2)
5
- rambling-trie (~> 2.1)
4
+ suika (0.2.0)
5
+ dartsclone (>= 0.2.0)
6
6
 
7
7
  GEM
8
8
  remote: https://rubygems.org/
9
9
  specs:
10
+ dartsclone (0.2.0)
10
11
  diff-lcs (1.4.4)
11
12
  rake (12.3.3)
12
- rambling-trie (2.1.1)
13
13
  rspec (3.9.0)
14
14
  rspec-core (~> 3.9.0)
15
15
  rspec-expectations (~> 3.9.0)
data/README.md CHANGED
@@ -1,5 +1,6 @@
1
1
  # Suika
2
2
 
3
+ [![Build Status](https://travis-ci.org/yoshoku/suika.svg?branch=master)](https://travis-ci.org/yoshoku/suika)
3
4
  [![Gem Version](https://badge.fury.io/rb/suika.svg)](https://badge.fury.io/rb/suika)
4
5
  [![BSD 3-Clause License](https://img.shields.io/badge/License-BSD%203--Clause-orange.svg)](https://github.com/yoshoku/suika/blob/master/LICENSE.txt)
5
6
  [![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://rubydoc.info/gems/suika)
@@ -51,6 +52,25 @@ sentences.each do |sentence|
51
52
  end
52
53
  ```
53
54
 
55
+ ## Test
56
+ Suika was able to parse all sentences in the [Livedoor news corpus](https://www.rondhuit.com/download.html#ldcc)
57
+ without any error.
58
+
59
+ ```ruby
60
+ require 'suika'
61
+
62
+ tagger = Suika::Tagger.new
63
+
64
+ Dir.glob('ldcc-20140209/text/*/*.txt').each do |filename|
65
+ File.foreach(filename) do |sentence|
66
+ sentence.strip!
67
+ puts tagger.parse(sentence) unless sentence.empty?
68
+ end
69
+ end
70
+ ```
71
+
72
+ ![suika_test](https://user-images.githubusercontent.com/5562409/90264778-8f593f80-de8c-11ea-81f1-20831e3c8b12.gif)
73
+
54
74
  ## Contributing
55
75
 
56
76
  Bug reports and pull requests are welcome on GitHub at https://github.com/yoshoku/suika.
@@ -2,5 +2,6 @@
2
2
 
3
3
  require 'suika/version'
4
4
  require 'suika/char_def'
5
+ require 'suika/node'
5
6
  require 'suika/lattice'
6
7
  require 'suika/tagger'
@@ -4,8 +4,6 @@ module Suika
4
4
  # @!visibility private
5
5
  class Lattice
6
6
  # @!visibility private
7
- Node = Struct.new(:surface, :unknown, :min_cost, :min_prev, :left_id, :right_id, :cost, :attrs, keyword_init: true)
8
-
9
7
  attr_reader :begin_nodes, :end_nodes, :length
10
8
 
11
9
  # @!visibility private
@@ -14,16 +12,16 @@ module Suika
14
12
  @begin_nodes = Array.new(length + 1) { [] }
15
13
  @end_nodes = Array.new(length + 1) { [] }
16
14
  bos = Node.new(surface: 'BOS', unknown: false, left_id: 0, right_id: 0, cost: 0, attrs: [])
17
- @end_nodes[0].append(bos)
15
+ @end_nodes[0].push(bos)
18
16
  eos = Node.new(surface: 'EOS', unknown: false, left_id: 0, right_id: 0, cost: 0, attrs: [])
19
- @begin_nodes[length].append(eos)
17
+ @begin_nodes[length].push(eos)
20
18
  end
21
19
 
22
20
  # @!visibility private
23
21
  def insert(begin_id, end_id, surface, unknown, left_id, right_id, cost, attrs)
24
22
  node = Node.new(surface: surface, unknown: unknown, left_id: left_id, right_id: right_id, cost: cost, attrs: attrs)
25
- @begin_nodes[begin_id].append(node)
26
- @end_nodes[end_id].append(node)
23
+ @begin_nodes[begin_id].push(node)
24
+ @end_nodes[end_id].push(node)
27
25
  end
28
26
  end
29
27
  end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Suika
4
+ # @!visibility private
5
+ class Node
6
+ # @!visibility private
7
+ attr_accessor :surface, :unknown, :min_cost, :min_prev, :left_id, :right_id, :cost, :attrs
8
+
9
+ # @!visibility private
10
+ def initialize(surface: '', unknown: false, min_cost: 0, min_prev: nil, left_id: 0, right_id: 0, cost: 0, attrs: [])
11
+ @surface = surface
12
+ @unknown = unknown
13
+ @min_cost = min_cost
14
+ @min_prev = min_prev
15
+ @left_id = left_id
16
+ @right_id = right_id
17
+ @cost = cost
18
+ @attrs = attrs
19
+ end
20
+ end
21
+ end
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'rambling-trie'
3
+ require 'dartsclone'
4
+ require 'rubygems/package'
4
5
  require 'zlib'
5
6
 
6
7
  module Suika
@@ -23,11 +24,11 @@ module Suika
23
24
  class Tagger
24
25
  # Create a new tagger by loading the built-in binary dictionary.
25
26
  def initialize
26
- ipadic = Marshal.load(Zlib::GzipReader.open(__dir__ + '/../../dict/ipadic.gz', &:read))
27
- @trie = ipadic[:trie]
28
- @dictionary = ipadic[:dictionary]
29
- @unknown_dictionary = ipadic[:unknown_dictionary]
30
- @cost_mat = ipadic[:cost_matrix]
27
+ raise IOError, 'SHA1 digest of dictionary file does not match.' unless DICTIONARY_KEY == Digest::SHA1.file(DICTIONARY_PATH).to_s
28
+
29
+ @sysdic = Marshal.load(Zlib::GzipReader.open(DICTIONARY_PATH, &:read))
30
+ @trie = DartsClone::DoubleArray.new
31
+ @trie.set_array(@sysdic[:trie])
31
32
  end
32
33
 
33
34
  # Parse the given sentence.
@@ -39,39 +40,40 @@ module Suika
39
40
  terminal = sentence.length
40
41
 
41
42
  while start < terminal
42
- word = sentence[start]
43
- pos = start
44
- matched = false
45
- while @trie.match?(word) && pos < terminal
46
- if @dictionary.key?(word)
47
- matched = true
48
- @dictionary[word].each do |el|
43
+ step = terminal - start
44
+
45
+ query = sentence[start..-1]
46
+ result = trie.common_prefix_search(query)
47
+ unless result.empty?
48
+ words, indices = result
49
+ words.each_with_index do |word, i|
50
+ features[indices[i]].each do |el|
49
51
  lattice.insert(start, start + word.length, word, false,
50
52
  el[0].to_i, el[1].to_i, el[2].to_i, el[3..-1])
51
53
  end
52
54
  end
53
- pos += 1
54
- word = sentence[start..pos]
55
+ step = words.map(&:size).min
55
56
  end
56
57
 
57
58
  word = sentence[start]
58
59
  char_cate = CharDef.char_category(sentence[start])
59
- unless !char_cate[:invoke] && matched
60
+ char_type = CharDef.char_type(sentence[start])
61
+ if char_cate[:invoke]
60
62
  char_length = char_cate[:group] ? CharDef::MAX_GROUPING_SIZE : char_cate[:length]
61
63
  unk_terminal = [start + char_length, terminal].min
62
64
  pos = start + 1
63
- char_type = CharDef.char_type(sentence[start])
64
65
  while pos < unk_terminal && char_type == CharDef.char_type(sentence[pos])
65
66
  word << sentence[pos]
66
67
  pos += 1
67
68
  end
68
- @unknown_dictionary[char_type].each do |el|
69
- lattice.insert(start, start + word.length, word, true,
70
- el[0].to_i, el[1].to_i, el[2].to_i, el[3..-1])
71
- end
72
69
  end
70
+ unknowns[char_type].each do |el|
71
+ lattice.insert(start, start + word.length, word, true,
72
+ el[0].to_i, el[1].to_i, el[2].to_i, el[3..-1])
73
+ end
74
+ step = [step, word.length].min
73
75
 
74
- start += 1
76
+ start += step
75
77
  end
76
78
 
77
79
  viterbi(lattice)
@@ -79,9 +81,25 @@ module Suika
79
81
 
80
82
  private
81
83
 
84
+ DICTIONARY_PATH = "#{__dir__}/../../dict/sysdic.gz"
85
+ DICTIONARY_KEY = '562e53853b8a5b9f4857536b0748847a0878ebf0'
82
86
  INT_MAX = 2**(([42].pack('i').size * 16) - 2) - 1
83
87
 
84
- private_constant :INT_MAX
88
+ private_constant :DICTIONARY_PATH, :DICTIONARY_KEY, :INT_MAX
89
+
90
+ attr_reader :trie
91
+
92
+ def features
93
+ @sysdic[:dictionary]
94
+ end
95
+
96
+ def unknowns
97
+ @sysdic[:unknown_dictionary]
98
+ end
99
+
100
+ def costmat
101
+ @sysdic[:cost_matrix]
102
+ end
85
103
 
86
104
  def viterbi(lattice)
87
105
  bos = lattice.end_nodes[0].first
@@ -93,7 +111,7 @@ module Suika
93
111
  rnode.min_cost = INT_MAX
94
112
  rnode.min_prev = nil
95
113
  lattice.end_nodes[n].each do |lnode|
96
- cost = lnode.min_cost + @cost_mat[lnode.right_id][rnode.left_id] + rnode.cost
114
+ cost = lnode.min_cost + costmat[lnode.right_id][rnode.left_id] + rnode.cost
97
115
  if cost < rnode.min_cost
98
116
  rnode.min_cost = cost
99
117
  rnode.min_prev = lnode
@@ -106,7 +124,7 @@ module Suika
106
124
  prev_node = eos.min_prev
107
125
  res = []
108
126
  until prev_node.nil?
109
- res.append("#{prev_node.surface}\t#{prev_node.attrs.join(',')}") if prev_node.surface != 'BOS' && prev_node.surface != 'EOS'
127
+ res.push("#{prev_node.surface}\t#{prev_node.attrs.join(',')}") if prev_node.surface != 'BOS' && prev_node.surface != 'EOS'
110
128
  prev_node = prev_node.min_prev
111
129
  end
112
130
 
@@ -3,5 +3,5 @@
3
3
  # Suika is a Japanese morphological analyzer written in pure Ruby.
4
4
  module Suika
5
5
  # The version of Suika you are using.
6
- VERSION = '0.1.4'
6
+ VERSION = '0.2.0'
7
7
  end
@@ -12,7 +12,6 @@ Gem::Specification.new do |spec|
12
12
  spec.description = 'Suika is a Japanese morphological analyzer written in pure Ruby.'
13
13
  spec.homepage = 'https://github.com/yoshoku/suika'
14
14
  spec.license = 'BSD-3-Clause'
15
- spec.required_ruby_version = Gem::Requirement.new('>= 2.3.0')
16
15
 
17
16
  spec.metadata['homepage_uri'] = spec.homepage
18
17
  spec.metadata['source_code_uri'] = spec.homepage
@@ -28,5 +27,5 @@ Gem::Specification.new do |spec|
28
27
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
29
28
  spec.require_paths = ['lib']
30
29
 
31
- spec.add_runtime_dependency 'rambling-trie', '~> 2.1'
30
+ spec.add_runtime_dependency 'dartsclone', '>= 0.2.0'
32
31
  end
metadata CHANGED
@@ -1,29 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: suika
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.4
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshoku
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-07-08 00:00:00.000000000 Z
11
+ date: 2020-10-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: rambling-trie
14
+ name: dartsclone
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - "~>"
17
+ - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: '2.1'
19
+ version: 0.2.0
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - "~>"
24
+ - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: '2.1'
26
+ version: 0.2.0
27
27
  description: Suika is a Japanese morphological analyzer written in pure Ruby.
28
28
  email:
29
29
  - yoshoku@outlook.com
@@ -45,10 +45,11 @@ files:
45
45
  - Rakefile
46
46
  - bin/console
47
47
  - bin/setup
48
- - dict/ipadic.gz
48
+ - dict/sysdic.gz
49
49
  - lib/suika.rb
50
50
  - lib/suika/char_def.rb
51
51
  - lib/suika/lattice.rb
52
+ - lib/suika/node.rb
52
53
  - lib/suika/tagger.rb
53
54
  - lib/suika/version.rb
54
55
  - suika.gemspec
@@ -68,7 +69,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
68
69
  requirements:
69
70
  - - ">="
70
71
  - !ruby/object:Gem::Version
71
- version: 2.3.0
72
+ version: '0'
72
73
  required_rubygems_version: !ruby/object:Gem::Requirement
73
74
  requirements:
74
75
  - - ">="