kampyo 0.2.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0d96499aeb214750eddaa4eecdede3a629b05dcd05755c9a3ca798be84f400e3
4
- data.tar.gz: '094e4f87fe8c86726c46deb081c4d2fd4f7e356c62d1dfbd15384c9bb3eca6f0'
3
+ metadata.gz: 7f831a683b8d1f6bf690fffe535b7f301c4e2ed9de3b9bbb58efe65b12c7e38b
4
+ data.tar.gz: 62e652f7d9a8d0d953247b2b2e8926844dc92ea668d3b94a102ac577c57c2101
5
5
  SHA512:
6
- metadata.gz: cd8aab55671360c9acb592050383f717fe1a93c4efd3233943b2bd3b7ffb7585c90090561b11c0dde13010c44743ab2e89842b4f317d1428d37b58a1438a0a3a
7
- data.tar.gz: 3d85e4fed2888ab5c3322da3685bdedafbfc8fb8c79c6b9f17eb578c4fe1ec793b9c85a34b803b3f1d58e722be3c109a935bf1a5b4f7078bbb005e341ae285cb
6
+ metadata.gz: eb42364413b79900b512d8e830091334ffd7bee17ac19cf2e3b1c1387139861a7d0324a3ab05c65f3a45f8f690286cf3ade7ac2d0735cea0c971bedf09f46644
7
+ data.tar.gz: f18179269d98c2ff02c1bd18d928da1dc0190e4d9fbdad22f88b35c10a918d4b04f6fc2ae1d925bfe0bfe576fcb26e3c08bb92432ceb138cfabab1accb11e8f5
data/README.base.md CHANGED
@@ -20,8 +20,8 @@ Unique features already implemented are subject and predicate, and stylistic inf
20
20
  Basic usage with Cabocha.
21
21
 
22
22
  ```
23
- text = Kampyo::Text.new
24
- text.cabocha_parser("今日は雨です")
23
+ cabocha = Kampyo::Cabocha.new
24
+ cabocha.parser("今日は雨です")
25
25
  ```
26
26
 
27
27
  You will get the following result.
@@ -38,8 +38,8 @@ You will get the following result.
38
38
  Guess the subject, predicate and sentence system.
39
39
 
40
40
  ```
41
- text = Kampyo::Text.new
42
- text.analysis(text.cabocha_parser("今日は雨です"))
41
+ cabocha = Kampyo::Cabocha.new
42
+ cabocha.analysis(cabocha.parser("今日は雨です"))
43
43
  ```
44
44
 
45
45
  You will get the following result.
data/README.ja.md CHANGED
@@ -15,8 +15,8 @@ Unique features already implemented are subject and predicate, and stylistic inf
15
15
  Basic usage with Cabocha.
16
16
 
17
17
  ```
18
- text = Kampyo::Text.new
19
- text.cabocha_parser("今日は雨です")
18
+ cabocha = Kampyo::Cabocha.new
19
+ cabocha.parser("今日は雨です")
20
20
  ```
21
21
 
22
22
  You will get the following result.
@@ -33,8 +33,8 @@ You will get the following result.
33
33
  Guess the subject, predicate and sentence system.
34
34
 
35
35
  ```
36
- text = Kampyo::Text.new
37
- text.analysis(text.cabocha_parser("今日は雨です"))
36
+ cabocha = Kampyo::Cabocha.new
37
+ cabocha.analysis(cabocha.parser("今日は雨です"))
38
38
  ```
39
39
 
40
40
  You will get the following result.
data/README.md CHANGED
@@ -15,8 +15,8 @@ Unique features already implemented are subject and predicate, and stylistic inf
15
15
  Basic usage with Cabocha.
16
16
 
17
17
  ```
18
- text = Kampyo::Text.new
19
- text.cabocha_parser("今日は雨です")
18
+ cabocha = Kampyo::Cabocha.new
19
+ cabocha.parser("今日は雨です")
20
20
  ```
21
21
 
22
22
  You will get the following result.
@@ -33,8 +33,8 @@ You will get the following result.
33
33
  Guess the subject, predicate and sentence system.
34
34
 
35
35
  ```
36
- text = Kampyo::Text.new
37
- text.analysis(text.cabocha_parser("今日は雨です"))
36
+ cabocha = Kampyo::Cabocha.new
37
+ cabocha.analysis(cabocha.parser("今日は雨です"))
38
38
  ```
39
39
 
40
40
  You will get the following result.
@@ -0,0 +1,172 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'kampyo'
4
+ require 'kampyo/string'
5
+ require 'kampyo/text'
6
+ require 'cabocha'
7
+
8
+ # Kampyo
9
+ module Kampyo
10
+ # Text
11
+ class Cabocha < Text # rubocop:disable Metrics/ClassLength
12
+ # rubocop:todo Metrics/MethodLength
13
+ def parser(input) # rubocop:todo Metrics/AbcSize, Metrics/MethodLength
14
+ # <sentence>
15
+ # <chunk id="0" link="1" rel="D" score="0.000000" head="0" func="1">
16
+ # <tok id="0" feature="名詞,副詞可能,*,*,*,*,今日,キョウ,キョー" ne="B-DATE">今日</tok>
17
+ # <tok id="1" feature="助詞,係助詞,*,*,*,*,は,ハ,ワ" ne="O">は</tok>
18
+ # </chunk>
19
+ # <chunk id="1" link="-1" rel="D" score="0.000000" head="2" func="3">
20
+ # <tok id="2" feature="名詞,一般,*,*,*,*,雨,アメ,アメ" ne="O">雨</tok>
21
+ # <tok id="3" feature="助動詞,*,*,*,特殊・デス,基本形,です,デス,デス" ne="O">です</tok>
22
+ # </chunk>
23
+ # </sentence>
24
+
25
+ parser = CaboCha::Parser.new
26
+ tree = parser.parse(input)
27
+
28
+ chunks = []
29
+ tokens = []
30
+ token_position = 0
31
+ (0..tree.chunk_size - 1).each do |i| # rubocop:todo Metrics/BlockLength
32
+ chunk = tree.chunk(i)
33
+ token_size = chunk.token_size
34
+
35
+ chunks << {
36
+ id: chunks.size + 1,
37
+ link: chunk.link >= 0 ? chunk.link + 1 : -1,
38
+ score: chunk.score
39
+ }
40
+
41
+ (token_position..token_position + token_size - 1).each do |j|
42
+ token = tree.token(j)
43
+
44
+ surface = token.surface.to_utf8
45
+ feature0 = token.feature_list(0).to_utf8
46
+ feature1 = token.feature_list(1).to_utf8
47
+ feature6 = token.feature_list(6).to_utf8
48
+ feature7 = token.feature_list(7).to_utf8
49
+
50
+ tokens << {
51
+ id: tokens.size + 1,
52
+ chunk: i + 1,
53
+ surface: surface,
54
+ feature1: feature0,
55
+ feature2: feature1,
56
+ baseform: feature6,
57
+ reading: feature7,
58
+ ext_reading: ext_reading(feature7)
59
+ }
60
+ end
61
+
62
+ token_position += token_size
63
+ end
64
+
65
+ { chunks: chunks, tokens: tokens }
66
+ end
67
+ # rubocop:enable Metrics/MethodLength
68
+
69
+ def analysis(cabocha) # rubocop:todo Metrics/CyclomaticComplexity, Metrics/AbcSize, Metrics/MethodLength, Metrics/PerceivedComplexity
70
+ chunks = cabocha[:chunks]
71
+ tokens = cabocha[:tokens]
72
+
73
+ subject_token = nil
74
+ predicate_token = nil
75
+ tod = '*'
76
+
77
+ # 述語の候補
78
+ predicate_chunk = chunks.find { |item| item[:link] == -1 }
79
+ predicate_tokens = tokens.select { |item| item[:chunk] == predicate_chunk[:id] }
80
+
81
+ # 主語の候補
82
+ subject_chunk = chunks.find do |item|
83
+ item[:link] == predicate_chunk[:id]
84
+ end
85
+ unless subject_chunk.nil?
86
+ subject_tokens = tokens.select do |item|
87
+ item[:chunk] == subject_chunk[:id] &&
88
+ item[:feature1] == '名詞' &&
89
+ %w[一般 固有名詞 サ変接続 接尾 数 副詞可能].include?(item[:feature2])
90
+ end
91
+
92
+ subject_tokens.each do |token|
93
+ next_token = tokens.find do |item|
94
+ item[:id] == token[:id] + 1 &&
95
+ item[:chunk] == token[:chunk] &&
96
+ %w[は って も が].include?(item[:baseform])
97
+ end
98
+
99
+ next if next_token.nil?
100
+
101
+ # 主語として確定する
102
+ subject_token = token
103
+ next
104
+ end
105
+ end
106
+
107
+ predicate_tokens.each do |token| # rubocop:todo Metrics/BlockLength
108
+ if %w[形容詞 動詞 名詞].include?(token[:feature1]) &&
109
+ %w[一般 自立 サ変接続 接尾].include?(token[:feature2]) &&
110
+ token[:baseform] != '*'
111
+ # 述語として確定する
112
+
113
+ predicate_token = token
114
+ end
115
+
116
+ next unless %w[動詞 終助詞 助動詞 助詞].include?(token[:feature1])
117
+
118
+ # 文体系を確定する
119
+ tods = {
120
+ 'れる' => '受身・尊敬・可能・自発',
121
+ 'られる' => '受身・尊敬・可能・自発',
122
+ 'せる' => '使役',
123
+ 'させる' => '使役',
124
+ 'ない' => '打消',
125
+ 'ぬ' => '打消',
126
+ 'ん' => '打消',
127
+ 'う' => '推量・意志・勧誘',
128
+ 'よう' => '推量・意志・勧誘',
129
+ 'まい' => '打消推量・打消意志',
130
+ 'たい' => '希望',
131
+ 'たがる' => '希望',
132
+ 'た' => '過去・完了・存在・確認',
133
+ 'ます' => '丁寧',
134
+ 'そうだ' => '様態・伝聞',
135
+ 'らしい' => '推定',
136
+ 'ようだ' => '比況・例示・推定',
137
+ 'だ' => '断定',
138
+ 'です' => '断定',
139
+ 'な' => '禁止・感動',
140
+ 'か' => '疑問・反語・感動',
141
+ 'の' => '断定・質問',
142
+ 'よ' => '強意・呼びかけ',
143
+ 'ぞ' => '強意',
144
+ 'も' => '確信',
145
+ 'ね' => '感動・念押し',
146
+ 'わ' => '感動・強意',
147
+ 'さ' => '断定'
148
+ }
149
+
150
+ tod = tods[token[:baseform]]
151
+ end
152
+
153
+ last_token = predicate_tokens.last
154
+
155
+ if predicate_token.nil? && ['助動詞'].include?(last_token[:feature1])
156
+ # 述語が確定していないとき最後の助動詞を述語として確定する
157
+ predicate_token = last_token
158
+ end
159
+
160
+ if ['?', '?'].include?(last_token[:surface])
161
+ # 述語の最後の形態素が?のとき
162
+ tod = '疑問・反語・感動'
163
+ end
164
+
165
+ {
166
+ subject: subject_token,
167
+ predicate: predicate_token,
168
+ tod: tod
169
+ }
170
+ end
171
+ end
172
+ end
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'kampyo'
4
+ require 'kampyo/string'
5
+ require 'kampyo/text'
6
+ require 'mecab'
7
+
8
+ # Kampyo
9
+ module Kampyo
10
+ # Text
11
+ class Mecab < Text
12
+ # rubocop:todo Metrics/MethodLength
13
+ def parser(input) # rubocop:todo Metrics/AbcSize, Metrics/MethodLength
14
+ result = []
15
+ parser = MeCab::Tagger.new
16
+ node = parser.parseToNode(input)
17
+ while node
18
+ features = node.feature.split(',')
19
+ if features[0] != 'BOS/EOS'
20
+ result << {
21
+ id: result.size + 1,
22
+ chunk: 0,
23
+ surface: node.surface,
24
+ feature1: features[0],
25
+ feature2: features[1],
26
+ baseform: features[6],
27
+ reading: features[7],
28
+ ext_reading: ext_reading(features[7]),
29
+ cost: node.cost,
30
+ wcost: node.wcost,
31
+ right_context: node.rcAttr,
32
+ left_context: node.lcAttr
33
+ }
34
+ end
35
+ node = node.next
36
+ end
37
+
38
+ result
39
+ end
40
+ # rubocop:enable Metrics/MethodLength
41
+ end
42
+ end
data/lib/kampyo/string.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ # String
3
4
  class String
4
5
  def to_utf8
5
6
  force_encoding('UTF-8')
data/lib/kampyo/text.rb CHANGED
@@ -2,201 +2,15 @@
2
2
 
3
3
  require 'kampyo'
4
4
  require 'kampyo/string'
5
- require 'cabocha'
6
- require 'mecab'
7
5
 
6
+ # Kampyo
8
7
  module Kampyo
8
+ # Text
9
9
  class Text
10
10
  def initialize; end
11
11
 
12
- def cabocha_parser(input)
13
- # <sentence>
14
- # <chunk id="0" link="1" rel="D" score="0.000000" head="0" func="1">
15
- # <tok id="0" feature="名詞,副詞可能,*,*,*,*,今日,キョウ,キョー" ne="B-DATE">今日</tok>
16
- # <tok id="1" feature="助詞,係助詞,*,*,*,*,は,ハ,ワ" ne="O">は</tok>
17
- # </chunk>
18
- # <chunk id="1" link="-1" rel="D" score="0.000000" head="2" func="3">
19
- # <tok id="2" feature="名詞,一般,*,*,*,*,雨,アメ,アメ" ne="O">雨</tok>
20
- # <tok id="3" feature="助動詞,*,*,*,特殊・デス,基本形,です,デス,デス" ne="O">です</tok>
21
- # </chunk>
22
- # </sentence>
23
-
24
- parser = CaboCha::Parser.new
25
- tree = parser.parse(input)
26
-
27
- chunks = []
28
- tokens = []
29
- token_position = 0
30
- (0..tree.chunk_size - 1).each do |i|
31
- chunk = tree.chunk(i)
32
- token_size = chunk.token_size
33
-
34
- chunks << {
35
- id: chunks.size + 1,
36
- link: chunk.link >= 0 ? chunk.link + 1 : -1,
37
- score: chunk.score
38
- }
39
-
40
- (token_position..token_position + token_size - 1).each do |j|
41
- token = tree.token(j)
42
-
43
- surface = token.surface.to_utf8
44
- feature0 = token.feature_list(0).to_utf8
45
- feature1 = token.feature_list(1).to_utf8
46
- feature6 = token.feature_list(6).to_utf8
47
- feature7 = token.feature_list(7).to_utf8
48
-
49
- tokens << {
50
- id: tokens.size + 1,
51
- chunk: i + 1,
52
- surface: surface,
53
- feature1: feature0,
54
- feature2: feature1,
55
- baseform: feature6,
56
- reading: feature7,
57
- ext_reading: ext_reading(feature7)
58
- }
59
- end
60
-
61
- token_position += token_size
62
- end
63
-
64
- { chunks: chunks, tokens: tokens }
65
- end
66
-
67
- def mecab_parser(input)
68
- result = []
69
- parser = MeCab::Tagger.new
70
- node = parser.parseToNode(input)
71
- while node
72
- features = node.feature.split(',')
73
- if features[0] != 'BOS/EOS'
74
- result << {
75
- id: result.size + 1,
76
- chunk: 0,
77
- surface: node.surface,
78
- feature1: features[0],
79
- feature2: features[1],
80
- baseform: features[6],
81
- reading: features[7],
82
- ext_reading: ext_reading(features[7]),
83
- cost: node.cost,
84
- wcost: node.wcost,
85
- right_context: node.rcAttr,
86
- left_context: node.lcAttr
87
- }
88
- end
89
- node = node.next
90
- end
91
-
92
- result
93
- end
94
-
95
12
  def ext_reading(feature)
96
13
  (feature =~ /\A[\p{katakana}|ー]+\z/).nil? ? feature : nil
97
14
  end
98
-
99
- def analysis(cabocha)
100
- chunks = cabocha[:chunks]
101
- tokens = cabocha[:tokens]
102
-
103
- subject_token = nil
104
- predicate_token = nil
105
- tod = '*'
106
-
107
- # 述語の候補
108
- predicate_chunk = chunks.find { |item| item[:link] == -1 }
109
- predicate_tokens = tokens.select { |item| item[:chunk] == predicate_chunk[:id] }
110
-
111
- # 主語の候補
112
- subject_chunk = chunks.find do |item|
113
- item[:link] == predicate_chunk[:id]
114
- end
115
- unless subject_chunk.nil?
116
- subject_tokens = tokens.select do |item|
117
- item[:chunk] == subject_chunk[:id] &&
118
- item[:feature1] == '名詞' &&
119
- %w[一般 固有名詞 サ変接続 接尾 数 副詞可能].include?(item[:feature2])
120
- end
121
-
122
- subject_tokens.each do |token|
123
- next_token = tokens.find do |item|
124
- item[:id] == token[:id] + 1 &&
125
- item[:chunk] == token[:chunk] &&
126
- %w[は って も が].include?(item[:baseform])
127
- end
128
-
129
- next if next_token.nil?
130
-
131
- # 主語として確定する
132
- subject_token = token
133
- next
134
- end
135
- end
136
-
137
- predicate_tokens.each do |token|
138
- if %w[形容詞 動詞 名詞].include?(token[:feature1]) &&
139
- %w[一般 自立 サ変接続 接尾].include?(token[:feature2]) &&
140
- token[:baseform] != '*'
141
- # 述語として確定する
142
-
143
- predicate_token = token
144
- end
145
-
146
- next unless %w[動詞 終助詞 助動詞 助詞].include?(token[:feature1])
147
-
148
- # 文体系を確定する
149
- tods = {
150
- 'れる' => '受身・尊敬・可能・自発',
151
- 'られる' => '受身・尊敬・可能・自発',
152
- 'せる' => '使役',
153
- 'させる' => '使役',
154
- 'ない' => '打消',
155
- 'ぬ' => '打消',
156
- 'ん' => '打消',
157
- 'う' => '推量・意志・勧誘',
158
- 'よう' => '推量・意志・勧誘',
159
- 'まい' => '打消推量・打消意志',
160
- 'たい' => '希望',
161
- 'たがる' => '希望',
162
- 'た' => '過去・完了・存在・確認',
163
- 'ます' => '丁寧',
164
- 'そうだ' => '様態・伝聞',
165
- 'らしい' => '推定',
166
- 'ようだ' => '比況・例示・推定',
167
- 'だ' => '断定',
168
- 'です' => '断定',
169
- 'な' => '禁止・感動',
170
- 'か' => '疑問・反語・感動',
171
- 'の' => '断定・質問',
172
- 'よ' => '強意・呼びかけ',
173
- 'ぞ' => '強意',
174
- 'も' => '確信',
175
- 'ね' => '感動・念押し',
176
- 'わ' => '感動・強意',
177
- 'さ' => '断定'
178
- }
179
-
180
- tod = tods[token[:baseform]]
181
- end
182
-
183
- last_token = predicate_tokens.last
184
-
185
- if predicate_token.nil? && ['助動詞'].include?(last_token[:feature1])
186
- # 述語が確定していないとき最後の助動詞を述語として確定する
187
- predicate_token = last_token
188
- end
189
-
190
- if ['?', '?'].include?(last_token[:surface])
191
- # 述語の最後の形態素が?のとき
192
- tod = '疑問・反語・感動'
193
- end
194
-
195
- {
196
- subject: subject_token,
197
- predicate: predicate_token,
198
- tod: tod
199
- }
200
- end
201
15
  end
202
16
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kampyo
4
- VERSION = '0.2.0'
4
+ VERSION = '1.0.0'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kampyo
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - arthur87
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-03-12 00:00:00.000000000 Z
11
+ date: 2025-06-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rubocop
@@ -66,6 +66,8 @@ files:
66
66
  - README.md
67
67
  - Rakefile
68
68
  - lib/kampyo.rb
69
+ - lib/kampyo/cabocha.rb
70
+ - lib/kampyo/mecab.rb
69
71
  - lib/kampyo/string.rb
70
72
  - lib/kampyo/text.rb
71
73
  - lib/kampyo/version.rb