suika 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1fb292a3b36ec1dde8f93da624092876220ec33fa49ff635cbaff93fa9337137
4
- data.tar.gz: 0e2c0cc53c6f25099dfa455f85d618491819dd98cc33047a6aebd4fb5858d13a
3
+ metadata.gz: 108d59930a3a546de3a603d4c7de5851249264a4eddd76c8a350407b2d0b353a
4
+ data.tar.gz: d879db93ee8593466c415ba9064b50221ee2355fa0caedcd301fe07de9ab4880
5
5
  SHA512:
6
- metadata.gz: 1f54af2d9955f7c562df25d0d724c0b8f1460959b4d35d835c9412a7897e329a95016a7bc151c7125a8b4be9788a18dc7265cc4b4355ad7c1a5b1c828af0f622
7
- data.tar.gz: a7b1a9a484b51d03ec92a09a2e4ee1a24e1cfcbaafe27f8cd7cbc0b7b02989c0960ebc1799974816adf6399e2e21fa02e06cad1fb8a584db4238e08f93baf93b
6
+ metadata.gz: aceff27ac13d1c7b03ae286e2a24c02f5fe75d987e5ea91879777e66adf8155b8d92d185399e2fe31bf6dd7e6448fb175d3ed0b864c55e2df2ff30b6dcaad5a3
7
+ data.tar.gz: a92d9161f73a761c9621fa720725dbebdfa74da1d8b892ab3ecab8aa9ee835d98bf42341818219b0fdf1dd3a164a1206f306f0e8dcad9030b7137776ed5142e0
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
1
+ # 0.1.3
2
+ - Fix unknown word processing.
3
+ - Remove redundant spaces from output.
4
+
1
5
  # 0.1.2
2
6
  - Fix local variable typo in Tagger.parse.
3
7
 
data/Gemfile.lock ADDED
@@ -0,0 +1,36 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ suika (0.1.2)
5
+ rambling-trie (~> 2.1)
6
+
7
+ GEM
8
+ remote: https://rubygems.org/
9
+ specs:
10
+ diff-lcs (1.4.4)
11
+ rake (12.3.3)
12
+ rambling-trie (2.1.1)
13
+ rspec (3.9.0)
14
+ rspec-core (~> 3.9.0)
15
+ rspec-expectations (~> 3.9.0)
16
+ rspec-mocks (~> 3.9.0)
17
+ rspec-core (3.9.2)
18
+ rspec-support (~> 3.9.3)
19
+ rspec-expectations (3.9.2)
20
+ diff-lcs (>= 1.2.0, < 2.0)
21
+ rspec-support (~> 3.9.0)
22
+ rspec-mocks (3.9.1)
23
+ diff-lcs (>= 1.2.0, < 2.0)
24
+ rspec-support (~> 3.9.0)
25
+ rspec-support (3.9.3)
26
+
27
+ PLATFORMS
28
+ ruby
29
+
30
+ DEPENDENCIES
31
+ rake (~> 12.0)
32
+ rspec (~> 3.0)
33
+ suika!
34
+
35
+ BUNDLED WITH
36
+ 2.1.2
data/README.md CHANGED
@@ -30,13 +30,13 @@ require 'suika'
30
30
  tagger = Suika::Tagger.new
31
31
  tagger.parse('すもももももももものうち').each { |token| puts token }
32
32
 
33
- # すもも 名詞, 一般, *, *, *, *, すもも, スモモ, スモモ
34
- # も 助詞, 係助詞, *, *, *, *, も, モ, モ
35
- # もも 名詞, 一般, *, *, *, *, もも, モモ, モモ
36
- # も 助詞, 係助詞, *, *, *, *, も, モ, モ
37
- # もも 名詞, 一般, *, *, *, *, もも, モモ, モモ
38
- # の 助詞, 連体化, *, *, *, *, の, ノ, ノ
39
- # うち 名詞, 非自立, 副詞可能, *, *, *, うち, ウチ, ウチ
33
+ # すもも 名詞,一般,*,*,*,*,すもも,スモモ,スモモ
34
+ # も 助詞,係助詞,*,*,*,*,も,モ,モ
35
+ # もも 名詞,一般,*,*,*,*,もも,モモ,モモ
36
+ # も 助詞,係助詞,*,*,*,*,も,モ,モ
37
+ # もも 名詞,一般,*,*,*,*,もも,モモ,モモ
38
+ # の 助詞,連体化,*,*,*,*,の,ノ,ノ
39
+ # うち 名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ
40
40
  ```
41
41
 
42
42
  Since the Tagger class loads the binary dictionary at initialization, it is recommended to reuse the instance.
data/lib/suika/char_def.rb CHANGED
@@ -16,39 +16,41 @@ module Suika
16
16
  CHAR_CATEGORY[char_type(ch)]
17
17
  end
18
18
 
19
+ MAX_GROUPING_SIZE = 24
20
+
19
21
  CHAR_CATEGORY = {
20
22
  'DEFAULT' => {
21
- invoke: 0, group: 1, length: 0
23
+ invoke: false, group: true, length: 0
22
24
  },
23
25
  'SPACE' => {
24
- invoke: 0, group: 1, length: 0
26
+ invoke: false, group: true, length: 0
25
27
  },
26
28
  'KANJI' => {
27
- invoke: 0, group: 0, length: 2
29
+ invoke: false, group: false, length: 2
28
30
  },
29
31
  'SYMBOL' => {
30
- invoke: 1, group: 1, length: 0
32
+ invoke: true, group: true, length: 0
31
33
  },
32
34
  'NUMERIC' => {
33
- invoke: 1, group: 1, length: 0
35
+ invoke: true, group: true, length: 0
34
36
  },
35
37
  'ALPHA' => {
36
- invoke: 1, group: 1, length: 0
38
+ invoke: true, group: true, length: 0
37
39
  },
38
40
  'HIRAGANA' => {
39
- invoke: 0, group: 1, length: 2
41
+ invoke: false, group: true, length: 2
40
42
  },
41
43
  'KATAKANA' => {
42
- invoke: 1, group: 1, length: 2
44
+ invoke: true, group: true, length: 2
43
45
  },
44
46
  'KANJINUMERIC' => {
45
- invoke: 1, group: 1, length: 0
47
+ invoke: true, group: true, length: 0
46
48
  },
47
49
  'GREEK' => {
48
- invoke: 1, group: 1, length: 0
50
+ invoke: true, group: true, length: 0
49
51
  },
50
52
  'CYRILLIC' => {
51
- invoke: 1, group: 1, length: 0
53
+ invoke: true, group: true, length: 0
52
54
  }
53
55
  }.freeze
54
56
 
data/lib/suika/lattice.rb CHANGED
@@ -4,7 +4,7 @@ module Suika
4
4
  # @!visibility private
5
5
  class Lattice
6
6
  # @!visibility private
7
- Node = Struct.new(:surface, :min_cost, :min_prev, :left_id, :right_id, :cost, :attrs, keyword_init: true)
7
+ Node = Struct.new(:surface, :unknown, :min_cost, :min_prev, :left_id, :right_id, :cost, :attrs, keyword_init: true)
8
8
 
9
9
  attr_reader :begin_nodes, :end_nodes, :length
10
10
 
@@ -13,15 +13,15 @@ module Suika
13
13
  @length = length
14
14
  @begin_nodes = Array.new(length + 1) { [] }
15
15
  @end_nodes = Array.new(length + 1) { [] }
16
- bos = Node.new(surface: 'BOS', left_id: 0, right_id: 0, cost: 0, attrs: [])
16
+ bos = Node.new(surface: 'BOS', unknown: false, left_id: 0, right_id: 0, cost: 0, attrs: [])
17
17
  @end_nodes[0].append(bos)
18
- eos = Node.new(surface: 'EOS', left_id: 0, right_id: 0, cost: 0, attrs: [])
18
+ eos = Node.new(surface: 'EOS', unknown: false, left_id: 0, right_id: 0, cost: 0, attrs: [])
19
19
  @begin_nodes[length].append(eos)
20
20
  end
21
21
 
22
22
  # @!visibility private
23
- def insert(begin_id, end_id, surface, left_id, right_id, cost, attrs)
24
- node = Node.new(surface: surface, left_id: left_id, right_id: right_id, cost: cost, attrs: attrs)
23
+ def insert(begin_id, end_id, surface, unknown, left_id, right_id, cost, attrs)
24
+ node = Node.new(surface: surface, unknown: unknown, left_id: left_id, right_id: right_id, cost: cost, attrs: attrs)
25
25
  @begin_nodes[begin_id].append(node)
26
26
  @end_nodes[end_id].append(node)
27
27
  end
data/lib/suika/tagger.rb CHANGED
@@ -12,13 +12,13 @@ module Suika
12
12
  # tagger = Suika::Tagger.new
13
13
  # tagger.parse('すもももももももものうち').each { |token| puts token }
14
14
  #
15
- # # すもも 名詞, 一般, *, *, *, *, すもも, スモモ, スモモ
16
- # # も 助詞, 係助詞, *, *, *, *, も, モ, モ
17
- # # もも 名詞, 一般, *, *, *, *, もも, モモ, モモ
18
- # # も 助詞, 係助詞, *, *, *, *, も, モ, モ
19
- # # もも 名詞, 一般, *, *, *, *, もも, モモ, モモ
20
- # # の 助詞, 連体化, *, *, *, *, の, ノ, ノ
21
- # # うち 名詞, 非自立, 副詞可能, *, *, *, うち, ウチ, ウチ
15
+ # # すもも 名詞,一般,*,*,*,*,すもも,スモモ,スモモ
16
+ # # も 助詞,係助詞,*,*,*,*,も,モ,モ
17
+ # # もも 名詞,一般,*,*,*,*,もも,モモ,モモ
18
+ # # も 助詞,係助詞,*,*,*,*,も,モ,モ
19
+ # # もも 名詞,一般,*,*,*,*,もも,モモ,モモ
20
+ # # の 助詞,連体化,*,*,*,*,の,ノ,ノ
21
+ # # うち 名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ
22
22
  #
23
23
  class Tagger
24
24
  # Create a new tagger by loading the built-in binary dictionary.
@@ -41,41 +41,36 @@ module Suika
41
41
  while start < terminal
42
42
  word = sentence[start]
43
43
  pos = start
44
- is_unknown = true
44
+ matched = false
45
45
  while @trie.match?(word) && pos < terminal
46
46
  if @dictionary.key?(word)
47
+ matched = true
47
48
  @dictionary[word].each do |el|
48
- lattice.insert(start, start + word.length,
49
- word, el[0].to_i, el[1].to_i, el[2].to_i,
50
- el[3..-1])
49
+ lattice.insert(start, start + word.length, word, false,
50
+ el[0].to_i, el[1].to_i, el[2].to_i, el[3..-1])
51
51
  end
52
- is_unknown = false
53
52
  end
54
53
  pos += 1
55
54
  word = sentence[start..pos]
56
55
  end
57
56
 
58
- unless is_unknown
59
- start += 1
60
- next
61
- end
62
-
63
57
  word = sentence[start]
64
- char_type = CharDef.char_type(sentence[start])
65
58
  char_cate = CharDef.char_category(sentence[start])
66
- if char_cate[:group] == 1
67
- unk_terminal = char_cate[:length].zero? ? terminal : start + char_cate[:length]
59
+ unless !char_cate[:invoke] && matched
60
+ char_length = char_cate[:group] ? CharDef::MAX_GROUPING_SIZE : char_cate[:length]
61
+ unk_terminal = [start + char_length, terminal].min
68
62
  pos = start + 1
63
+ char_type = CharDef.char_type(sentence[start])
69
64
  while pos < unk_terminal && char_type == CharDef.char_type(sentence[pos])
70
65
  word << sentence[pos]
71
66
  pos += 1
72
67
  end
68
+ @unknown_dictionary[char_type].each do |el|
69
+ lattice.insert(start, start + word.length, word, true,
70
+ el[0].to_i, el[1].to_i, el[2].to_i, el[3..-1])
71
+ end
73
72
  end
74
- @unknown_dictionary[char_type].each do |el|
75
- lattice.insert(start, start + word.length,
76
- word, el[0].to_i, el[1].to_i, el[2].to_i,
77
- el[3..-1])
78
- end
73
+
79
74
  start += 1
80
75
  end
81
76
 
@@ -111,9 +106,10 @@ module Suika
111
106
  prev_node = eos.min_prev
112
107
  res = []
113
108
  until prev_node.nil?
114
- res.append("#{prev_node.surface}\t#{prev_node.attrs.join(', ')}") if prev_node.surface != 'BOS' && prev_node.surface != 'EOS'
109
+ res.append("#{prev_node.surface}\t#{prev_node.attrs.join(',')}") if prev_node.surface != 'BOS' && prev_node.surface != 'EOS'
115
110
  prev_node = prev_node.min_prev
116
111
  end
112
+
117
113
  res.reverse
118
114
  end
119
115
  end
data/lib/suika/version.rb CHANGED
@@ -3,5 +3,5 @@
3
3
  # Suika is a Japanese morphological analyzer written in pure Ruby.
4
4
  module Suika
5
5
  # The version of Suika you are using.
6
- VERSION = '0.1.2'
6
+ VERSION = '0.1.3'
7
7
  end
data/suika.gemspec CHANGED
@@ -16,7 +16,7 @@ Gem::Specification.new do |spec|
16
16
 
17
17
  spec.metadata['homepage_uri'] = spec.homepage
18
18
  spec.metadata['source_code_uri'] = spec.homepage
19
- spec.metadata['changelog_uri'] = 'https://github.com/yoshoku/magro/blob/master/CHANGELOG.md'
19
+ spec.metadata['changelog_uri'] = 'https://github.com/yoshoku/suika/blob/master/CHANGELOG.md'
20
20
  spec.metadata['documentation_uri'] = 'https://rubydoc.info/gems/suika'
21
21
 
22
22
  # Specify which files should be added to the gem when it is released.
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: suika
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshoku
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-07-05 00:00:00.000000000 Z
11
+ date: 2020-07-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rambling-trie
@@ -38,6 +38,7 @@ files:
38
38
  - CHANGELOG.md
39
39
  - CODE_OF_CONDUCT.md
40
40
  - Gemfile
41
+ - Gemfile.lock
41
42
  - LICENSE.txt
42
43
  - NOTICE.txt
43
44
  - README.md
@@ -57,7 +58,7 @@ licenses:
57
58
  metadata:
58
59
  homepage_uri: https://github.com/yoshoku/suika
59
60
  source_code_uri: https://github.com/yoshoku/suika
60
- changelog_uri: https://github.com/yoshoku/magro/blob/master/CHANGELOG.md
61
+ changelog_uri: https://github.com/yoshoku/suika/blob/master/CHANGELOG.md
61
62
  documentation_uri: https://rubydoc.info/gems/suika
62
63
  post_install_message:
63
64
  rdoc_options: []