suika 0.1.4 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2aafd394952b381595891fab5652ed86c1c71a18d09befd1ffa2cd4bfdb05e8e
4
- data.tar.gz: 33385a74bac5c831a7116aa34ee31c5470154c928e971926b7f0f112b3cf1264
3
+ metadata.gz: bfb2e78e15c648ee309868bdfd3f386a66b1cff633cb546880132cdb9b8f3806
4
+ data.tar.gz: d398f4de11a4af80b7c62c4e468fa2e3f9393bbb2211e0f2e317a08ed05c73b5
5
5
  SHA512:
6
- metadata.gz: 80de1122cb9aae4a3313bb1ee95594435415dcacb197ef9bbf33a2695f57b9cc14c8f466aab19c8f487f1f4c3e4107ecd3851fad217921f61511655b5309fb21
7
- data.tar.gz: e0e9b76d04c847f6c1a8013db4b8a05ab9050dbde59d25c76d66fee078ac6b2ca230652b03ee61f3d97b7eebdc1139a34fb790fc5c954b072b55747e1933415a
6
+ metadata.gz: 6fea777a4229725a174aa0955bcf5f775cc6e319fa73cf6f204385b3a47d84997b74b8798f78269f442aa11dcf151b1319c079c315dfea103dcd7a148cf0b5c5
7
+ data.tar.gz: 17d2fd7f248c965b6d585542c4ec4cb90e87663f96b0522f06f1c3e94c55a18e59f08e4ab1e7724f9dd48a114c2c0f35a0499100fb8850a89d2822de87927988
@@ -3,7 +3,8 @@ require:
3
3
  - rubocop-rspec
4
4
 
5
5
  AllCops:
6
- TargetRubyVersion: 2.4
6
+ NewCops: enable
7
+ TargetRubyVersion: 2.5
7
8
  DisplayCopNames: true
8
9
  DisplayStyleGuide: true
9
10
  Exclude:
@@ -1,6 +1,12 @@
1
1
  ---
2
+ os: linux
3
+ dist: xenial
2
4
  language: ruby
3
5
  cache: bundler
4
6
  rvm:
5
- - 2.7.0
6
- before_install: gem install bundler -v 2.1.2
7
+ - '2.5'
8
+ - '2.6'
9
+ - '2.7'
10
+
11
+ before_install:
12
+ - gem install bundler -v 2.1.4
@@ -1,3 +1,9 @@
1
+ ## 0.2.0
2
+
3
+ ### Breaking Change
4
+ - Change to use dartsclone for trie library.
5
+
6
+
1
7
  ## 0.1.4
2
8
 
3
9
  ### Bug Fixes
@@ -1,15 +1,15 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- suika (0.1.2)
5
- rambling-trie (~> 2.1)
4
+ suika (0.2.0)
5
+ dartsclone (>= 0.2.0)
6
6
 
7
7
  GEM
8
8
  remote: https://rubygems.org/
9
9
  specs:
10
+ dartsclone (0.2.0)
10
11
  diff-lcs (1.4.4)
11
12
  rake (12.3.3)
12
- rambling-trie (2.1.1)
13
13
  rspec (3.9.0)
14
14
  rspec-core (~> 3.9.0)
15
15
  rspec-expectations (~> 3.9.0)
data/README.md CHANGED
@@ -1,5 +1,6 @@
1
1
  # Suika
2
2
 
3
+ [![Build Status](https://travis-ci.org/yoshoku/suika.svg?branch=master)](https://travis-ci.org/yoshoku/suika)
3
4
  [![Gem Version](https://badge.fury.io/rb/suika.svg)](https://badge.fury.io/rb/suika)
4
5
  [![BSD 3-Clause License](https://img.shields.io/badge/License-BSD%203--Clause-orange.svg)](https://github.com/yoshoku/suika/blob/master/LICENSE.txt)
5
6
  [![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://rubydoc.info/gems/suika)
@@ -51,6 +52,25 @@ sentences.each do |sentence|
51
52
  end
52
53
  ```
53
54
 
55
+ ## Test
56
+ Suika was able to parse all sentences in the [Livedoor news corpus](https://www.rondhuit.com/download.html#ldcc)
57
+ without any error.
58
+
59
+ ```ruby
60
+ require 'suika'
61
+
62
+ tagger = Suika::Tagger.new
63
+
64
+ Dir.glob('ldcc-20140209/text/*/*.txt').each do |filename|
65
+ File.foreach(filename) do |sentence|
66
+ sentence.strip!
67
+ puts tagger.parse(sentence) unless sentence.empty?
68
+ end
69
+ end
70
+ ```
71
+
72
+ ![suika_test](https://user-images.githubusercontent.com/5562409/90264778-8f593f80-de8c-11ea-81f1-20831e3c8b12.gif)
73
+
54
74
  ## Contributing
55
75
 
56
76
  Bug reports and pull requests are welcome on GitHub at https://github.com/yoshoku/suika.
@@ -2,5 +2,6 @@
2
2
 
3
3
  require 'suika/version'
4
4
  require 'suika/char_def'
5
+ require 'suika/node'
5
6
  require 'suika/lattice'
6
7
  require 'suika/tagger'
@@ -4,8 +4,6 @@ module Suika
4
4
  # @!visibility private
5
5
  class Lattice
6
6
  # @!visibility private
7
- Node = Struct.new(:surface, :unknown, :min_cost, :min_prev, :left_id, :right_id, :cost, :attrs, keyword_init: true)
8
-
9
7
  attr_reader :begin_nodes, :end_nodes, :length
10
8
 
11
9
  # @!visibility private
@@ -14,16 +12,16 @@ module Suika
14
12
  @begin_nodes = Array.new(length + 1) { [] }
15
13
  @end_nodes = Array.new(length + 1) { [] }
16
14
  bos = Node.new(surface: 'BOS', unknown: false, left_id: 0, right_id: 0, cost: 0, attrs: [])
17
- @end_nodes[0].append(bos)
15
+ @end_nodes[0].push(bos)
18
16
  eos = Node.new(surface: 'EOS', unknown: false, left_id: 0, right_id: 0, cost: 0, attrs: [])
19
- @begin_nodes[length].append(eos)
17
+ @begin_nodes[length].push(eos)
20
18
  end
21
19
 
22
20
  # @!visibility private
23
21
  def insert(begin_id, end_id, surface, unknown, left_id, right_id, cost, attrs)
24
22
  node = Node.new(surface: surface, unknown: unknown, left_id: left_id, right_id: right_id, cost: cost, attrs: attrs)
25
- @begin_nodes[begin_id].append(node)
26
- @end_nodes[end_id].append(node)
23
+ @begin_nodes[begin_id].push(node)
24
+ @end_nodes[end_id].push(node)
27
25
  end
28
26
  end
29
27
  end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Suika
4
+ # @!visibility private
5
+ class Node
6
+ # @!visibility private
7
+ attr_accessor :surface, :unknown, :min_cost, :min_prev, :left_id, :right_id, :cost, :attrs
8
+
9
+ # @!visibility private
10
+ def initialize(surface: '', unknown: false, min_cost: 0, min_prev: nil, left_id: 0, right_id: 0, cost: 0, attrs: [])
11
+ @surface = surface
12
+ @unknown = unknown
13
+ @min_cost = min_cost
14
+ @min_prev = min_prev
15
+ @left_id = left_id
16
+ @right_id = right_id
17
+ @cost = cost
18
+ @attrs = attrs
19
+ end
20
+ end
21
+ end
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'rambling-trie'
3
+ require 'dartsclone'
4
+ require 'rubygems/package'
4
5
  require 'zlib'
5
6
 
6
7
  module Suika
@@ -23,11 +24,11 @@ module Suika
23
24
  class Tagger
24
25
  # Create a new tagger by loading the built-in binary dictionary.
25
26
  def initialize
26
- ipadic = Marshal.load(Zlib::GzipReader.open(__dir__ + '/../../dict/ipadic.gz', &:read))
27
- @trie = ipadic[:trie]
28
- @dictionary = ipadic[:dictionary]
29
- @unknown_dictionary = ipadic[:unknown_dictionary]
30
- @cost_mat = ipadic[:cost_matrix]
27
+ raise IOError, 'SHA1 digest of dictionary file does not match.' unless DICTIONARY_KEY == Digest::SHA1.file(DICTIONARY_PATH).to_s
28
+
29
+ @sysdic = Marshal.load(Zlib::GzipReader.open(DICTIONARY_PATH, &:read))
30
+ @trie = DartsClone::DoubleArray.new
31
+ @trie.set_array(@sysdic[:trie])
31
32
  end
32
33
 
33
34
  # Parse the given sentence.
@@ -39,39 +40,40 @@ module Suika
39
40
  terminal = sentence.length
40
41
 
41
42
  while start < terminal
42
- word = sentence[start]
43
- pos = start
44
- matched = false
45
- while @trie.match?(word) && pos < terminal
46
- if @dictionary.key?(word)
47
- matched = true
48
- @dictionary[word].each do |el|
43
+ step = terminal - start
44
+
45
+ query = sentence[start..-1]
46
+ result = trie.common_prefix_search(query)
47
+ unless result.empty?
48
+ words, indices = result
49
+ words.each_with_index do |word, i|
50
+ features[indices[i]].each do |el|
49
51
  lattice.insert(start, start + word.length, word, false,
50
52
  el[0].to_i, el[1].to_i, el[2].to_i, el[3..-1])
51
53
  end
52
54
  end
53
- pos += 1
54
- word = sentence[start..pos]
55
+ step = words.map(&:size).min
55
56
  end
56
57
 
57
58
  word = sentence[start]
58
59
  char_cate = CharDef.char_category(sentence[start])
59
- unless !char_cate[:invoke] && matched
60
+ char_type = CharDef.char_type(sentence[start])
61
+ if char_cate[:invoke]
60
62
  char_length = char_cate[:group] ? CharDef::MAX_GROUPING_SIZE : char_cate[:length]
61
63
  unk_terminal = [start + char_length, terminal].min
62
64
  pos = start + 1
63
- char_type = CharDef.char_type(sentence[start])
64
65
  while pos < unk_terminal && char_type == CharDef.char_type(sentence[pos])
65
66
  word << sentence[pos]
66
67
  pos += 1
67
68
  end
68
- @unknown_dictionary[char_type].each do |el|
69
- lattice.insert(start, start + word.length, word, true,
70
- el[0].to_i, el[1].to_i, el[2].to_i, el[3..-1])
71
- end
72
69
  end
70
+ unknowns[char_type].each do |el|
71
+ lattice.insert(start, start + word.length, word, true,
72
+ el[0].to_i, el[1].to_i, el[2].to_i, el[3..-1])
73
+ end
74
+ step = [step, word.length].min
73
75
 
74
- start += 1
76
+ start += step
75
77
  end
76
78
 
77
79
  viterbi(lattice)
@@ -79,9 +81,25 @@ module Suika
79
81
 
80
82
  private
81
83
 
84
+ DICTIONARY_PATH = "#{__dir__}/../../dict/sysdic.gz"
85
+ DICTIONARY_KEY = '562e53853b8a5b9f4857536b0748847a0878ebf0'
82
86
  INT_MAX = 2**(([42].pack('i').size * 16) - 2) - 1
83
87
 
84
- private_constant :INT_MAX
88
+ private_constant :DICTIONARY_PATH, :DICTIONARY_KEY, :INT_MAX
89
+
90
+ attr_reader :trie
91
+
92
+ def features
93
+ @sysdic[:dictionary]
94
+ end
95
+
96
+ def unknowns
97
+ @sysdic[:unknown_dictionary]
98
+ end
99
+
100
+ def costmat
101
+ @sysdic[:cost_matrix]
102
+ end
85
103
 
86
104
  def viterbi(lattice)
87
105
  bos = lattice.end_nodes[0].first
@@ -93,7 +111,7 @@ module Suika
93
111
  rnode.min_cost = INT_MAX
94
112
  rnode.min_prev = nil
95
113
  lattice.end_nodes[n].each do |lnode|
96
- cost = lnode.min_cost + @cost_mat[lnode.right_id][rnode.left_id] + rnode.cost
114
+ cost = lnode.min_cost + costmat[lnode.right_id][rnode.left_id] + rnode.cost
97
115
  if cost < rnode.min_cost
98
116
  rnode.min_cost = cost
99
117
  rnode.min_prev = lnode
@@ -106,7 +124,7 @@ module Suika
106
124
  prev_node = eos.min_prev
107
125
  res = []
108
126
  until prev_node.nil?
109
- res.append("#{prev_node.surface}\t#{prev_node.attrs.join(',')}") if prev_node.surface != 'BOS' && prev_node.surface != 'EOS'
127
+ res.push("#{prev_node.surface}\t#{prev_node.attrs.join(',')}") if prev_node.surface != 'BOS' && prev_node.surface != 'EOS'
110
128
  prev_node = prev_node.min_prev
111
129
  end
112
130
 
@@ -3,5 +3,5 @@
3
3
  # Suika is a Japanese morphological analyzer written in pure Ruby.
4
4
  module Suika
5
5
  # The version of Suika you are using.
6
- VERSION = '0.1.4'
6
+ VERSION = '0.2.0'
7
7
  end
@@ -12,7 +12,6 @@ Gem::Specification.new do |spec|
12
12
  spec.description = 'Suika is a Japanese morphological analyzer written in pure Ruby.'
13
13
  spec.homepage = 'https://github.com/yoshoku/suika'
14
14
  spec.license = 'BSD-3-Clause'
15
- spec.required_ruby_version = Gem::Requirement.new('>= 2.3.0')
16
15
 
17
16
  spec.metadata['homepage_uri'] = spec.homepage
18
17
  spec.metadata['source_code_uri'] = spec.homepage
@@ -28,5 +27,5 @@ Gem::Specification.new do |spec|
28
27
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
29
28
  spec.require_paths = ['lib']
30
29
 
31
- spec.add_runtime_dependency 'rambling-trie', '~> 2.1'
30
+ spec.add_runtime_dependency 'dartsclone', '>= 0.2.0'
32
31
  end
metadata CHANGED
@@ -1,29 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: suika
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.4
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshoku
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-07-08 00:00:00.000000000 Z
11
+ date: 2020-10-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: rambling-trie
14
+ name: dartsclone
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - "~>"
17
+ - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: '2.1'
19
+ version: 0.2.0
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - "~>"
24
+ - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: '2.1'
26
+ version: 0.2.0
27
27
  description: Suika is a Japanese morphological analyzer written in pure Ruby.
28
28
  email:
29
29
  - yoshoku@outlook.com
@@ -45,10 +45,11 @@ files:
45
45
  - Rakefile
46
46
  - bin/console
47
47
  - bin/setup
48
- - dict/ipadic.gz
48
+ - dict/sysdic.gz
49
49
  - lib/suika.rb
50
50
  - lib/suika/char_def.rb
51
51
  - lib/suika/lattice.rb
52
+ - lib/suika/node.rb
52
53
  - lib/suika/tagger.rb
53
54
  - lib/suika/version.rb
54
55
  - suika.gemspec
@@ -68,7 +69,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
68
69
  requirements:
69
70
  - - ">="
70
71
  - !ruby/object:Gem::Version
71
- version: 2.3.0
72
+ version: '0'
72
73
  required_rubygems_version: !ruby/object:Gem::Requirement
73
74
  requirements:
74
75
  - - ">="