suika 0.1.2 → 0.1.3

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registry.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1fb292a3b36ec1dde8f93da624092876220ec33fa49ff635cbaff93fa9337137
4
- data.tar.gz: 0e2c0cc53c6f25099dfa455f85d618491819dd98cc33047a6aebd4fb5858d13a
3
+ metadata.gz: 108d59930a3a546de3a603d4c7de5851249264a4eddd76c8a350407b2d0b353a
4
+ data.tar.gz: d879db93ee8593466c415ba9064b50221ee2355fa0caedcd301fe07de9ab4880
5
5
  SHA512:
6
- metadata.gz: 1f54af2d9955f7c562df25d0d724c0b8f1460959b4d35d835c9412a7897e329a95016a7bc151c7125a8b4be9788a18dc7265cc4b4355ad7c1a5b1c828af0f622
7
- data.tar.gz: a7b1a9a484b51d03ec92a09a2e4ee1a24e1cfcbaafe27f8cd7cbc0b7b02989c0960ebc1799974816adf6399e2e21fa02e06cad1fb8a584db4238e08f93baf93b
6
+ metadata.gz: aceff27ac13d1c7b03ae286e2a24c02f5fe75d987e5ea91879777e66adf8155b8d92d185399e2fe31bf6dd7e6448fb175d3ed0b864c55e2df2ff30b6dcaad5a3
7
+ data.tar.gz: a92d9161f73a761c9621fa720725dbebdfa74da1d8b892ab3ecab8aa9ee835d98bf42341818219b0fdf1dd3a164a1206f306f0e8dcad9030b7137776ed5142e0
@@ -1,3 +1,7 @@
1
+ # 0.1.3
2
+ - Fix unknown word processing.
3
+ - Remove redundant spaces from output.
4
+
1
5
  # 0.1.2
2
6
  - Fix local variable typo in Tagger.parse.
3
7
 
@@ -0,0 +1,36 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ suika (0.1.2)
5
+ rambling-trie (~> 2.1)
6
+
7
+ GEM
8
+ remote: https://rubygems.org/
9
+ specs:
10
+ diff-lcs (1.4.4)
11
+ rake (12.3.3)
12
+ rambling-trie (2.1.1)
13
+ rspec (3.9.0)
14
+ rspec-core (~> 3.9.0)
15
+ rspec-expectations (~> 3.9.0)
16
+ rspec-mocks (~> 3.9.0)
17
+ rspec-core (3.9.2)
18
+ rspec-support (~> 3.9.3)
19
+ rspec-expectations (3.9.2)
20
+ diff-lcs (>= 1.2.0, < 2.0)
21
+ rspec-support (~> 3.9.0)
22
+ rspec-mocks (3.9.1)
23
+ diff-lcs (>= 1.2.0, < 2.0)
24
+ rspec-support (~> 3.9.0)
25
+ rspec-support (3.9.3)
26
+
27
+ PLATFORMS
28
+ ruby
29
+
30
+ DEPENDENCIES
31
+ rake (~> 12.0)
32
+ rspec (~> 3.0)
33
+ suika!
34
+
35
+ BUNDLED WITH
36
+ 2.1.2
data/README.md CHANGED
@@ -30,13 +30,13 @@ require 'suika'
30
30
  tagger = Suika::Tagger.new
31
31
  tagger.parse('すもももももももものうち').each { |token| puts token }
32
32
 
33
- # すもも 名詞, 一般, *, *, *, *, すもも, スモモ, スモモ
34
- # も 助詞, 係助詞, *, *, *, *, も, モ, モ
35
- # もも 名詞, 一般, *, *, *, *, もも, モモ, モモ
36
- # も 助詞, 係助詞, *, *, *, *, も, モ, モ
37
- # もも 名詞, 一般, *, *, *, *, もも, モモ, モモ
38
- # の 助詞, 連体化, *, *, *, *, の, ノ, ノ
39
- # うち 名詞, 非自立, 副詞可能, *, *, *, うち, ウチ, ウチ
33
+ # すもも 名詞,一般,*,*,*,*,すもも,スモモ,スモモ
34
+ # も 助詞,係助詞,*,*,*,*,も,モ,モ
35
+ # もも 名詞,一般,*,*,*,*,もも,モモ,モモ
36
+ # も 助詞,係助詞,*,*,*,*,も,モ,モ
37
+ # もも 名詞,一般,*,*,*,*,もも,モモ,モモ
38
+ # の 助詞,連体化,*,*,*,*,の,ノ,ノ
39
+ # うち 名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ
40
40
  ```
41
41
 
42
42
  Since the Tagger class loads the binary dictionary at initialization, it is recommended to reuse the instance.
@@ -16,39 +16,41 @@ module Suika
16
16
  CHAR_CATEGORY[char_type(ch)]
17
17
  end
18
18
 
19
+ MAX_GROUPING_SIZE = 24
20
+
19
21
  CHAR_CATEGORY = {
20
22
  'DEFAULT' => {
21
- invoke: 0, group: 1, length: 0
23
+ invoke: false, group: true, length: 0
22
24
  },
23
25
  'SPACE' => {
24
- invoke: 0, group: 1, length: 0
26
+ invoke: false, group: true, length: 0
25
27
  },
26
28
  'KANJI' => {
27
- invoke: 0, group: 0, length: 2
29
+ invoke: false, group: false, length: 2
28
30
  },
29
31
  'SYMBOL' => {
30
- invoke: 1, group: 1, length: 0
32
+ invoke: true, group: true, length: 0
31
33
  },
32
34
  'NUMERIC' => {
33
- invoke: 1, group: 1, length: 0
35
+ invoke: true, group: true, length: 0
34
36
  },
35
37
  'ALPHA' => {
36
- invoke: 1, group: 1, length: 0
38
+ invoke: true, group: true, length: 0
37
39
  },
38
40
  'HIRAGANA' => {
39
- invoke: 0, group: 1, length: 2
41
+ invoke: false, group: true, length: 2
40
42
  },
41
43
  'KATAKANA' => {
42
- invoke: 1, group: 1, length: 2
44
+ invoke: true, group: true, length: 2
43
45
  },
44
46
  'KANJINUMERIC' => {
45
- invoke: 1, group: 1, length: 0
47
+ invoke: true, group: true, length: 0
46
48
  },
47
49
  'GREEK' => {
48
- invoke: 1, group: 1, length: 0
50
+ invoke: true, group: true, length: 0
49
51
  },
50
52
  'CYRILLIC' => {
51
- invoke: 1, group: 1, length: 0
53
+ invoke: true, group: true, length: 0
52
54
  }
53
55
  }.freeze
54
56
 
@@ -4,7 +4,7 @@ module Suika
4
4
  # @!visibility private
5
5
  class Lattice
6
6
  # @!visibility private
7
- Node = Struct.new(:surface, :min_cost, :min_prev, :left_id, :right_id, :cost, :attrs, keyword_init: true)
7
+ Node = Struct.new(:surface, :unknown, :min_cost, :min_prev, :left_id, :right_id, :cost, :attrs, keyword_init: true)
8
8
 
9
9
  attr_reader :begin_nodes, :end_nodes, :length
10
10
 
@@ -13,15 +13,15 @@ module Suika
13
13
  @length = length
14
14
  @begin_nodes = Array.new(length + 1) { [] }
15
15
  @end_nodes = Array.new(length + 1) { [] }
16
- bos = Node.new(surface: 'BOS', left_id: 0, right_id: 0, cost: 0, attrs: [])
16
+ bos = Node.new(surface: 'BOS', unknown: false, left_id: 0, right_id: 0, cost: 0, attrs: [])
17
17
  @end_nodes[0].append(bos)
18
- eos = Node.new(surface: 'EOS', left_id: 0, right_id: 0, cost: 0, attrs: [])
18
+ eos = Node.new(surface: 'EOS', unknown: false, left_id: 0, right_id: 0, cost: 0, attrs: [])
19
19
  @begin_nodes[length].append(eos)
20
20
  end
21
21
 
22
22
  # @!visibility private
23
- def insert(begin_id, end_id, surface, left_id, right_id, cost, attrs)
24
- node = Node.new(surface: surface, left_id: left_id, right_id: right_id, cost: cost, attrs: attrs)
23
+ def insert(begin_id, end_id, surface, unknown, left_id, right_id, cost, attrs)
24
+ node = Node.new(surface: surface, unknown: unknown, left_id: left_id, right_id: right_id, cost: cost, attrs: attrs)
25
25
  @begin_nodes[begin_id].append(node)
26
26
  @end_nodes[end_id].append(node)
27
27
  end
@@ -12,13 +12,13 @@ module Suika
12
12
  # tagger = Suika::Tagger.new
13
13
  # tagger.parse('すもももももももものうち').each { |token| puts token }
14
14
  #
15
- # # すもも 名詞, 一般, *, *, *, *, すもも, スモモ, スモモ
16
- # # も 助詞, 係助詞, *, *, *, *, も, モ, モ
17
- # # もも 名詞, 一般, *, *, *, *, もも, モモ, モモ
18
- # # も 助詞, 係助詞, *, *, *, *, も, モ, モ
19
- # # もも 名詞, 一般, *, *, *, *, もも, モモ, モモ
20
- # # の 助詞, 連体化, *, *, *, *, の, ノ, ノ
21
- # # うち 名詞, 非自立, 副詞可能, *, *, *, うち, ウチ, ウチ
15
+ # # すもも 名詞,一般,*,*,*,*,すもも,スモモ,スモモ
16
+ # # も 助詞,係助詞,*,*,*,*,も,モ,モ
17
+ # # もも 名詞,一般,*,*,*,*,もも,モモ,モモ
18
+ # # も 助詞,係助詞,*,*,*,*,も,モ,モ
19
+ # # もも 名詞,一般,*,*,*,*,もも,モモ,モモ
20
+ # # の 助詞,連体化,*,*,*,*,の,ノ,ノ
21
+ # # うち 名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ
22
22
  #
23
23
  class Tagger
24
24
  # Create a new tagger by loading the built-in binary dictionary.
@@ -41,41 +41,36 @@ module Suika
41
41
  while start < terminal
42
42
  word = sentence[start]
43
43
  pos = start
44
- is_unknown = true
44
+ matched = false
45
45
  while @trie.match?(word) && pos < terminal
46
46
  if @dictionary.key?(word)
47
+ matched = true
47
48
  @dictionary[word].each do |el|
48
- lattice.insert(start, start + word.length,
49
- word, el[0].to_i, el[1].to_i, el[2].to_i,
50
- el[3..-1])
49
+ lattice.insert(start, start + word.length, word, false,
50
+ el[0].to_i, el[1].to_i, el[2].to_i, el[3..-1])
51
51
  end
52
- is_unknown = false
53
52
  end
54
53
  pos += 1
55
54
  word = sentence[start..pos]
56
55
  end
57
56
 
58
- unless is_unknown
59
- start += 1
60
- next
61
- end
62
-
63
57
  word = sentence[start]
64
- char_type = CharDef.char_type(sentence[start])
65
58
  char_cate = CharDef.char_category(sentence[start])
66
- if char_cate[:group] == 1
67
- unk_terminal = char_cate[:length].zero? ? terminal : start + char_cate[:length]
59
+ unless !char_cate[:invoke] && matched
60
+ char_length = char_cate[:group] ? CharDef::MAX_GROUPING_SIZE : char_cate[:length]
61
+ unk_terminal = [start + char_length, terminal].min
68
62
  pos = start + 1
63
+ char_type = CharDef.char_type(sentence[start])
69
64
  while pos < unk_terminal && char_type == CharDef.char_type(sentence[pos])
70
65
  word << sentence[pos]
71
66
  pos += 1
72
67
  end
68
+ @unknown_dictionary[char_type].each do |el|
69
+ lattice.insert(start, start + word.length, word, true,
70
+ el[0].to_i, el[1].to_i, el[2].to_i, el[3..-1])
71
+ end
73
72
  end
74
- @unknown_dictionary[char_type].each do |el|
75
- lattice.insert(start, start + word.length,
76
- word, el[0].to_i, el[1].to_i, el[2].to_i,
77
- el[3..-1])
78
- end
73
+
79
74
  start += 1
80
75
  end
81
76
 
@@ -111,9 +106,10 @@ module Suika
111
106
  prev_node = eos.min_prev
112
107
  res = []
113
108
  until prev_node.nil?
114
- res.append("#{prev_node.surface}\t#{prev_node.attrs.join(', ')}") if prev_node.surface != 'BOS' && prev_node.surface != 'EOS'
109
+ res.append("#{prev_node.surface}\t#{prev_node.attrs.join(',')}") if prev_node.surface != 'BOS' && prev_node.surface != 'EOS'
115
110
  prev_node = prev_node.min_prev
116
111
  end
112
+
117
113
  res.reverse
118
114
  end
119
115
  end
@@ -3,5 +3,5 @@
3
3
  # Suika is a Japanese morphological analyzer written in pure Ruby.
4
4
  module Suika
5
5
  # The version of Suika you are using.
6
- VERSION = '0.1.2'
6
+ VERSION = '0.1.3'
7
7
  end
@@ -16,7 +16,7 @@ Gem::Specification.new do |spec|
16
16
 
17
17
  spec.metadata['homepage_uri'] = spec.homepage
18
18
  spec.metadata['source_code_uri'] = spec.homepage
19
- spec.metadata['changelog_uri'] = 'https://github.com/yoshoku/magro/blob/master/CHANGELOG.md'
19
+ spec.metadata['changelog_uri'] = 'https://github.com/yoshoku/suika/blob/master/CHANGELOG.md'
20
20
  spec.metadata['documentation_uri'] = 'https://rubydoc.info/gems/suika'
21
21
 
22
22
  # Specify which files should be added to the gem when it is released.
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: suika
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshoku
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-07-05 00:00:00.000000000 Z
11
+ date: 2020-07-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rambling-trie
@@ -38,6 +38,7 @@ files:
38
38
  - CHANGELOG.md
39
39
  - CODE_OF_CONDUCT.md
40
40
  - Gemfile
41
+ - Gemfile.lock
41
42
  - LICENSE.txt
42
43
  - NOTICE.txt
43
44
  - README.md
@@ -57,7 +58,7 @@ licenses:
57
58
  metadata:
58
59
  homepage_uri: https://github.com/yoshoku/suika
59
60
  source_code_uri: https://github.com/yoshoku/suika
60
- changelog_uri: https://github.com/yoshoku/magro/blob/master/CHANGELOG.md
61
+ changelog_uri: https://github.com/yoshoku/suika/blob/master/CHANGELOG.md
61
62
  documentation_uri: https://rubydoc.info/gems/suika
62
63
  post_install_message:
63
64
  rdoc_options: []