kokugo_tagger 0.0.6 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 07d91224845f955eb5e2433d059361d99f2de2f4
- data.tar.gz: fbe4d1d40cd9f533f745c28bcacc3954aee77f58
+ metadata.gz: 572d91b93afea39ab298c267f0068436e62670c1
+ data.tar.gz: a5ffe79141ca346aa6c346b7514dab584eb234eb
  SHA512:
- metadata.gz: 700d2dcf6d0dab285c7036ea32ba3e88ae3f5f51edb5434dd0a32c61745a6455ef0aad9a67074452c82a6036485136c021c8ecd05a50c30393351430e9e9a157
- data.tar.gz: 713b0e7ff1872541a8fc5fd6dcd681cb58fb8d2d988cef6694650ee18d53f6a0529946cac4f4fce117ae4630286ba86015a923f901966553e6dea0c2ad9cda95
+ metadata.gz: 57414c4a4ec004c4f8a1483118ef638d58d744e1b0b71a9e7cb43a1f89e660de3fd9fe4408dc350901f5afb7c3094a2ad83b8461c406766b8b554b9345c0f411
+ data.tar.gz: 459cc09789690d93f943d027af74481aecce5ebd3283181c77c809c1f83f0e6fef56db379c10da69c3bc7a313a9c0711a3ce35fe5a04957601bbc9a24ddcf861
data/README.md CHANGED
@@ -1,26 +1,51 @@
  # KokugoTagger

- TODO: Write a gem description
+ Adds dependency labels that follow Japanese school grammar (学校文法) to files in CaboCha format.

  ## Installation

- Add this line to your application's Gemfile:
+ The following tools must be installed and available on your PATH beforehand:

- ```ruby
- gem 'kokugo_tagger'
- ```
+ - Ruby
+ - YamCha

- And then execute:
-
- $ bundle
-
- Or install it yourself as:
+ Then install the gem from the command line:

  $ gem install kokugo_tagger

  ## Usage

- TODO: Write usage instructions here
+ Only UTF-8/UniDic CaboCha-format data is supported. To label an existing CaboCha-format file, run:
+
+ $ cat neko.cabocha | kokugo_tagger > output.cabocha
+
+ For plain text, combine it with CaboCha as shown below (CaboCha must be installed separately):
+
+ $ cat neko.txt | cabocha -f1 | kokugo_tagger > output.cabocha
+
+ It can also be run interactively on the command line:
+
+ $ cabocha -f1 | kokugo_tagger
+ 吾輩は猫である
+ * 0 1S 0/1 0.000000
+ 吾輩 代名詞,*,*,*,*,*,ワガハイ,我が輩,吾輩,ワガハイ,吾輩,ワガハイ,混,*,*,*,*,ワガハイ,ワガハイ,ワガハイ,ワガハイ,*,*,0,*,* O
+ は 助詞,係助詞,*,*,*,*,ハ,は,は,ワ,は,ワ,和,*,*,*,*,ハ,ハ,ハ,ハ,*,*,*,"動詞%F2@0,名詞%F1,形容詞%F2@-1",* O
+ * 1 -1X 2/2 0.000000
+ 猫 名詞,普通名詞,一般,*,*,*,ネコ,猫,猫,ネコ,猫,ネコ,和,*,*,*,*,ネコ,ネコ,ネコ,ネコ,*,*,1,C4,* O
+ で 助動詞,*,*,*,助動詞-ダ,連用形-一般,ダ,だ,で,デ,だ,ダ,和,*,*,*,*,デ,ダ,デ,ダ,*,*,*,名詞%F1,* O
+ ある 動詞,非自立可能,*,*,五段-ラ行,終止形-一般,アル,有る,ある,アル,ある,アル,和,*,*,*,*,アル,アル,アル,アル,*,*,1,C3,* O
+ EOS
+
+ The eight dependency labels are:
+
+ - S: subject (主語)
+ - R: adverbial modifier (連用修飾語)
+ - T: adnominal modifier (連体修飾語)
+ - Z: conjunctive element (接続語)
+ - D: independent element (独立語)
+ - H: coordinate relation (並立の関係)
+ - J: auxiliary relation (補助の関係)
+ - X: other, e.g. end of sentence (その他(文末など))

  ## Contributing

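The shell pipelines in the new README can also be driven from Ruby. The sketch below is illustrative only: it assumes the gem, `cabocha`, and `yamcha` are installed and on PATH, and the file name `neko.txt` is a placeholder.

```ruby
require 'kokugo_tagger'

# Feed a plain-text file through CaboCha, then hand the CaboCha-format
# output to the same library entry point the executable uses.
IO.popen('cabocha -f1', 'r+', encoding: 'UTF-8') do |cabocha|
  cabocha.puts File.read('neko.txt', encoding: 'UTF-8')
  cabocha.close_write
  # annotate reads CaboCha-format lines from an IO and prints the
  # labeled CaboCha output (with the S/R/T/... letters) to stdout.
  KokugoTagger.annotate(cabocha, nil, 'UTF-8')
end
```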
data/bin/kokugo_tagger CHANGED
@@ -1,6 +1,27 @@
  #!/usr/bin/env ruby

+ require 'optparse'
  require 'kokugo_tagger'

- Encoding.default_external = 'UTF-8'
- KokugoTagger.annotate ARGF
+ $enc = "UTF-8"
+ $model = nil
+ $learn = nil
+ $valid = nil
+
+ OptionParser.new do |opt|
+ Version = "0.1.0"
+ opt.on('-e STR', '--encoding STR', String, 'encoding'){|str| $enc = str }
+ opt.on('-m FILE', '--model FILE', String, 'model file'){|file| $model = file }
+ opt.on('-l DIR', '--learn DIR', String, 'train corpus directory'){|dir| $learn = dir }
+ opt.on('-v DIR', '--valid DIR', String, 'K-fold cross-validation'){|dir| $learn = dir; $valid = 3 }
+ opt.parse!
+ end
+
+ # $stdin.set_encoding 'UTF-8'
+ if $learn and $valid
+ KokugoTagger.validation $learn, $enc, $valid
+ elsif $learn
+ KokugoTagger.learn $learn, $model, $enc
+ else
+ KokugoTagger.annotate $stdin, $model, $enc
+ end
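For reference, the three branches of the rewritten executable map directly onto library calls. A minimal sketch under stated assumptions: `corpus_dir` and the model name are placeholders, and YamCha must be installed for any branch to do real work.

```ruby
require 'kokugo_tagger'

case ARGV.shift
when 'learn'     # roughly `kokugo_tagger -l corpus_dir -m kokugo`
  KokugoTagger.learn('corpus_dir', 'kokugo', 'UTF-8')
when 'validate'  # roughly `kokugo_tagger -v corpus_dir`
  KokugoTagger.validation('corpus_dir', 'UTF-8', 3)
else             # roughly `kokugo_tagger < input.cabocha`
  KokugoTagger.annotate($stdin, nil, 'UTF-8')
end
```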
data/lib/kokugo_tagger/cabocha.rb ADDED
@@ -0,0 +1,75 @@
+ # -*- coding: utf-8 -*-
+
+ module KokugoTagger::Cabocha
+ module_function
+ def parse(file = @source)
+ document = KokugoTagger::Cabocha::Document.new
+ sentence, chunk = nil, nil
+ file.each_line do |line|
+ sentence ||= KokugoTagger::Cabocha::Sentence.new
+ case line
+ when /^EOS/
+ sentence.each{|chunk| chunk.detect_structure} # custom determination of sem_head and syn_head
+ document << sentence
+ sentence, chunk = nil, nil
+ when /^\*/
+ chunk = KokugoTagger::Cabocha::Chunk.new(line)
+ sentence << chunk
+ chunk.sentence = sentence
+ when /^#/
+ # nothing
+ else
+ token = KokugoTagger::Cabocha::Token.new(line)
+ chunk.tokens << token
+ end
+ end
+ return document
+ end
+ end
+
+ class KokugoTagger::Cabocha::Document < Array
+ end
+
+ class KokugoTagger::Cabocha::Sentence < Array
+ end
+
+ class KokugoTagger::Cabocha::Chunk
+ attr_accessor :info, :id, :link, :rel, :sem_head_id, :syn_head_id, :ext, :tokens, :sentence
+ def initialize(line)
+ @info = line.chomp.split(/\s/)[1..-1]
+ @id = @info[0].to_i
+ @link = @info[1].to_i
+ @rel = @info[1].delete("-0-9")
+ @sem_head_id = @info[2].split('/')[0].to_i
+ @syn_head_id = @info[2].split('/')[1].to_i
+ @ext = @info[4]
+ @tokens = []
+ end
+ def detect_structure
+ sem_head_id, syn_head_id = 0, 0
+ @tokens.each_with_index do |token, num|
+ if token.pos =~ /^(助詞|助動詞)/
+ syn_head_id = num
+ elsif token.pos !~ /^(補助記号|空白)/
+ sem_head_id = num if sem_head_id == syn_head_id
+ syn_head_id = num
+ end
+ end
+ @sem_head_id, @syn_head_id = sem_head_id, syn_head_id
+ end
+ def text
+ @tokens.map{|token| token.text}.join
+ end
+ end
+
+ class KokugoTagger::Cabocha::Token
+ attr_accessor :info, :text, :pos, :ctype, :cform
+ def initialize(line)
+ text, info = line.chomp.split("\t")
+ @info = info.split(",")
+ @text = text
+ @pos = @info[0, 4].delete_if{|s| s == '*'}.join('-')
+ @ctype = @info[4]
+ @cform = @info[5]
+ end
+ end
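The parser added above can be exercised on its own, without YamCha. A rough sketch, assuming the gem is installed; the sentence is invented, and the UniDic feature strings are truncated to the first six fields, which is all Token#initialize reads here.

```ruby
require 'kokugo_tagger'

# Token lines need a TAB between the surface form and the feature string.
lines = []
lines << "* 0 1D 0/1 0.000000"
lines << "猫\t名詞,普通名詞,一般,*,*,*"
lines << "が\t助詞,格助詞,*,*,*,*"
lines << "* 1 -1D 0/0 0.000000"
lines << "鳴く\t動詞,一般,*,*,五段-カ行,終止形-一般"
lines << "EOS"

document = KokugoTagger::Cabocha.parse(lines.join("\n") + "\n")
document.each do |sentence|            # a Sentence is an Array of Chunks
  sentence.each do |chunk|
    head = chunk.tokens[chunk.sem_head_id]
    puts "#{chunk.id} -> #{chunk.link}#{chunk.rel} #{chunk.text} (head: #{head.text})"
  end
end
```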
data/lib/kokugo_tagger/kokugo.model ADDED
Binary file
data/lib/kokugo_tagger/tagger.rb CHANGED
@@ -1,151 +1,241 @@
  # -*- coding: utf-8 -*-
- require 'csv'

  module KokugoTagger
  module_function
- def annotate(file)
- file.each_line do |line|
- next unless data = CabochaParser.parse(line)
- method_name = data[:type].downcase.to_sym
- method(method_name).call(data) if methods.include?(method_name)
- puts line
+ def annotate(source, model, enc)
+ enc ||= "UTF-8"
+ source.set_encoding enc
+ source_0, source_1 = KokugoTagger::Duplicator.duplicate(source, 2)
+ converter = KokugoTagger::Converter.connect(source_0)
+ yamcha = KokugoTagger::Yamcha.connect(converter, model)
+ merger = KokugoTagger::Merger.connect(yamcha, source_1)
+ merger.each_line{|line| puts line }
+ end
+ def convert(source)
+ converter = KokugoTagger::Converter.connect(source)
+ converter.each_line{|line| puts line }
+ end
+ def learn(source_dir, model, enc)
+ model ||= "kokugo"
+ enc ||= "UTF-8"
+ model = File.basename(model, ".model")
+ KokugoTagger::Learner.learn source_dir, model, enc
+ end
+ def validation(source_dir, enc, k)
+ enc ||= "UTF-8"
+ k_num ||= 3
+ KokugoTagger::Learner.validation source_dir, enc, k
+ end
+ end
+
+ module KokugoTagger::Duplicator
+ module_function
+ def duplicate(source, number = 2)
+ pipes = number.times.map{ IO.pipe("UTF-8") }
+ Thread.fork do
+ source.each_line { |line| pipes.each{|r, w| w.puts line } }
+ pipes.each{|r, w| w.close}
+ end.abort_on_exception = true
+ return pipes.map{|r, w| r }
+ end
+ end
+
+ module KokugoTagger::Converter
+ module_function
+ def connect(source)
+ read, write = IO.pipe("UTF-8")
+ Thread.fork do
+ self.process source, write
+ write.close
+ end.abort_on_exception = true
+ return read
+ end
+ def process(source, output)
+ buffer = ""
+ source.each_line do |line|
+ buffer << line
+ self.flush(buffer, output) if line.chomp == "EOS"
  end
  end
- def chunk(data)
- @chunks ||= []
- @chunks << @chunk = data
- @lpos ||= 0
- @chunk.update start:@lpos, end:@lpos, text:'', pos:nil, pred:nil, conj:nil
- end
- def token(data)
- @lpos += data[:text].size
- @chunk[:end] = @lpos
- @chunk[:text] += data[:text]
- pos data
- cform data
- end
- def segment_s(data)
- @segments ||= []
- @segments << data
- @last_item = data
- end
- def group_s(data)
- @groups ||= []
- @groups << data
- @last_item = data
- end
- def attr(data)
- @last_item[:attributes] ||= []
- @last_item[:attributes] << data
- end
- def eos(data)
- return unless @chunks
- before_eos
- @chunks.each do |chunk|
- puts '#! SEGMENT_S bccwj-kok:Bnst %d %d "%s"' % [chunk[:start], chunk[:end], chunk[:text]]
- puts '#! ATTR bccwj-kok:pred "%s述語"' % chunk[:pos] if chunk[:pred]
- puts '#! ATTR bccwj-kok:conj "%s"' % chunk[:conj] if chunk[:conj]
+ def flush(buffer, output)
+ document = KokugoTagger::Cabocha.parse(buffer)
+ document.each do |sentence|
+ sentence.each { |chunk| output.puts chunk_features(chunk).join(" ") }
+ output.puts
  end
- @chunks, @chunk, @lpos, @segments, @groups = nil
- end
- def pos(token)
- case token[:pos]
- when /^(名詞|代名詞|接尾辞-名詞的)/
- @chunk.update pos:'名詞', pred:nil, conj:nil
- when /^(形状詞|接尾辞-形状詞的)/
- @chunk.update pos:'形状詞', pred:nil, conj:nil
- when /^連体詞/
- @chunk.update pos:'連体詞', pred:nil, conj:'修飾(連体)'
- when /^副詞/
- @chunk.update pos:'副詞', pred:nil, conj:'修飾(連用)'
- when /^接続詞/
- @chunk.update pos:'接続詞', pred:nil, conj:'接続'
- when /^感動詞/
- @chunk.update pos:'感動詞', pred:nil, conj:'独立'
- when /^(動詞|接尾辞-動詞的)/
- @chunk.update pos:'動詞', pred:true, conj:nil
- when /^(形容詞|接尾辞-形容詞的)/
- @chunk.update pos:'形容詞', pred:true, conj:nil
- when /^助動詞/
- @chunk.update pred:true, conj:nil
- when /^助詞-格助詞/
- case token[:text]
- when 'が'
- @chunk.update conj:'主語'
- when 'を', 'に'
- @chunk.update conj:'補語'
- when 'の', 'との', 'という', 'といった'
- @chunk.update conj:'修飾(連体)'
- else
- @chunk.update conj:'修飾(連用)'
+ buffer.clear
+ end
+ def token_features(token)
+ return %w(* * * *) unless token
+ text, pos, cform = token.text, token.pos.split('-'), token.cform.split('-')
+ features = [text, pos[0], pos[1], cform[0]].map{|f| f || '*'}
+ return features
+ end
+ def chunk_features(chunk)
+ tokens = [sem_head(chunk), case_marker(chunk), syn_head(chunk), punct(chunk), sem_head(link_to(chunk))]
+ features = tokens.map{|token| token_features(token)}
+ return [chunk.text, features, chunk.rel].flatten
+ # return [chunk.text, features, chunk.ext].flatten
+ end
+ def sem_head(chunk)
+ return nil unless chunk
+ chunk.tokens[chunk.sem_head_id]
+ end
+ def case_marker(chunk)
+ return nil unless chunk
+ chunk.tokens.find{|token| token.pos.split('-')[1] == '格助詞'}
+ end
+ def syn_head(chunk)
+ return nil unless chunk
+ chunk.tokens[chunk.syn_head_id]
+ end
+ def punct(chunk)
+ return nil unless chunk
+ chunk.tokens[chunk.syn_head_id + 1]
+ end
+ def link_to(chunk)
+ chunk.sentence[chunk.link] if chunk.link != -1
+ end
+ end
+
+ module KokugoTagger::Yamcha
+ module_function
+ def connect(source, model)
+ model ||= File.dirname(__FILE__) + "/kokugo.model"
+ io = IO.popen("yamcha -m \"#{model}\"", "r+", encoding: "UTF-8")
+ Thread.fork {
+ source.each_line{|line| io.puts line }
+ io.close_write
+ }.abort_on_exception = true
+ return io
+ end
+ end
+
+ module KokugoTagger::Merger
+ module_function
+ def connect(yamcha, cabocha)
+ read, write = IO.pipe("UTF-8")
+ Thread.fork do
+ self.process yamcha, cabocha, write
+ write.close
+ end.abort_on_exception = true
+ return read
+ end
+ def process(yamcha, cabocha, output)
+ cabocha.each_line do |line|
+ if line[0] == "*"
+ record = yamcha.gets
+ record = yamcha.gets until record.chomp != ""
+ letter = record.chomp.split("\t").last.upcase
+ line.sub! /[A-Z]+/, letter
  end
- when /^(助詞-副助詞|助詞-係助詞)/
- @chunk.update conj:'修飾(連用)'
- when /^助詞-接続詞/
- @chunk.update pred:true, conj:'接続'
- when /^助詞-終助詞/
- @chunk.update pred:true, conj:nil
- when /^助詞-準体助詞/
- @chunk.update conj:nil
+ output.puts line
  end
  end
- def cform(token)
- case token[:cform]
- when /^語幹/
- when /^(未然形|連用形|仮定形|已然形)/
- @chunk.update conj:'接続'
- when /^(意志推量形|連体形)/
- @chunk.update conj:'修飾(連体)'
- when /^(終止形|命令形)/
- @chunk.update conj:nil
- end
- end
- def before_eos
- # BCCWJ-DepPara
- @chunks.each do |chunk|
- chunk[:conj] = [chunk[:conj], '断片'].compact.join(':') if chunk[:rel] == 'F'
- chunk[:conj] = [chunk[:conj], '文節内'].compact.join(':') if chunk[:rel] == 'B'
- chunk[:conj] = '文末' if chunk[:rel] == 'Z'
+ end
+
+ module KokugoTagger::Learner
+ module_function
+ def learn(source_dir, model, enc)
+ model ||= "kokugo"
+ convert source_dir, "train.data", enc
+ yamcha_learn "train.data", model
+ end
+ def validation(source_dir, enc, k)
+ convert source_dir, "train.data", enc
+ filenames = split("train.data", enc, k)
+ filenames.each_with_index do |filename, n|
+ others = filenames - [filename]
+ concat others, "temp.data"
+ model = "test"
+ yamcha_learn "temp.data", model
+ result = "result.#{n}.data"
+ system "cat #{filename} | yamcha -m test.model > #{result}"
  end
- # parallel and apposition relations
- @groups ||= []
- @segments ||= []
- @groups.each do |group|
- next unless group[:name] =~ /^(Parallel|Apposition)$/
- members = group[:member].map{|n| n.to_i}
- members = @segments.values_at(*members)
- chunk_ids = members.map do |segment|
- _end = segment[:end].to_i
- chunk = @chunks.find{|c| c[:start] < _end and c[:end] >= _end}
- chunk[:id].to_i if chunk
- end
- chunk_ids = chunk_ids.compact.uniq.sort
- if chunk_ids.size > 1
- conj = {'Parallel' => '並立', 'Apposition' => '同格'}[group[:name]]
- chunk_ids[0..-2].each{|cid| @chunks[cid][:conj] = conj}
+ data_set = []
+ filenames = Array.new(k){|n| "result.#{n}.data"}
+ filenames.each do |filename|
+ sentence = 0
+ chunk = 0
+ accuracy = 0
+ label_data = Hash.new{|h, k| h[k] = Hash.new(0)}
+ File.foreach(filename, encoding: enc) do |line|
+ line.chomp!
+ if line.empty?
+ sentence += 1
+ else
+ t, p = *line.split(/\s/)[-2, 2]
+ acc = (t == p)
+ chunk += 1
+ accuracy += 1 if acc
+ label_data[t][[true, acc]] += 1
+ label_data[p][[acc, true]] += 1
+ end
  end
+ data_set << [sentence, chunk, accuracy, label_data]
  end
- # For chunks that could not be given an attribute, fill it in from the dependency structure
- # (a chunk that receives an adverbial element is treated as a predicate)
- @chunks.each do |chunk|
- chunk[:pred] ||= @chunks.any?{|_chunk| _chunk[:link] == chunk[:id] && _chunk[:conj] =~ /^(主語|補語|修飾\(連用\)|接続)$/}
+ report = open("validation.txt", "w:UTF-8")
+ report.puts "# #{k}-fold cross-validation"
+ report.puts
+ report.puts "## test files"
+ k.times do |n|
+ sentence, chunk = *data_set[n]
+ report.puts "train.#{n}.data: #{sentence} sentences. #{chunk} chunks."
  end
- # a chunk that depends on a predicate is treated as an adverbial modifier (修飾(連用))
- @chunks.each do |chunk|
- chunk[:conj] = '修飾(連用)' if chunk[:conj] == nil && @chunks.any?{|_chunk| _chunk[:id] == chunk[:link] && _chunk[:pred]}
+ report.puts
+ k.times do |n|
+ sentence, chunk, accuracy, label_data = *data_set[n]
+ report.puts "## train.#{n}.data"
+ report.puts "accuracy: #{accuracy.to_f / chunk}"
+ report.puts "labels:"
+ report.puts %w(label tp tn fp fn recall precision f-score accuracy).join("\t")
+ label_data.sort.each do |label, data|
+ tp = data[[true, true]]
+ tn = data[[true, false]]
+ fp = data[[false, true]]
+ fn = chunk - (tp + tn + fp)
+ recall = tp.to_f / (tp + tn)
+ precision = tp.to_f / (tp + fp)
+ f = 2 * precision * recall / (precision + recall)
+ acc = (tp + fn).to_f / chunk
+ report.puts %w(%s %d %d %d %d %.2f %.2f %.2f %.2f).join("\t") % [label, tp, tn, fp, fn, recall, precision, f, acc]
+ end
+ report.puts
  end
- # chunks annotated with predicate-argument structure become complements (補語)
- @chunks.each do |chunk|
- next if chunk[:link] == '-1' || chunk[:arg] == nil
- next unless chunk[:conj] == nil || chunk[:conj] == '修飾(連用)'
- pred = @chunks[chunk[:link].to_i]
- if chunk[:arg] == 'Ga' and pred[:passive] == nil
- chunk[:conj] = '主語'
- elsif chunk[:arg] == 'O' and pred[:passive] == '直接'
- chunk[:conj] = '主語'
- else
- chunk[:conj] = '補語'
+ end
+ def convert(source_dir, target_filename, enc)
+ target = open(target_filename, "w:#{enc}")
+ source_filenames = Dir.glob(source_dir + "/*.cabocha")
+ # source_filenames = source_filenames[0, 2] # debug
+ source_filenames.each do |filename|
+ source = open(filename, encoding: enc)
+ converter = KokugoTagger::Converter.connect(source)
+ converter.each_line{|line| target.puts line }
+ source.close
+ end
+ target.close
+ end
+ def split(source_filename, enc, k)
+ basename = File.basename(source_filename, ".data")
+ target_filenames = Array.new(k){|n| "#{basename}.#{n}.data"}
+ targets = target_filenames.map{|filename| open(filename, "w:#{enc}")}
+ index = 0
+ File.foreach(source_filename, encoding: enc) do |line|
+ targets[index].puts line
+ if line.chomp.empty?
+ index += 1
+ index = 0 if index == k
  end
  end
+ targets.each{|f| f.close}
+ return target_filenames
+ end
+ def concat(source_filenames, target_filename)
+ system "cat #{source_filenames.join(" ")} > #{target_filename}"
+ end
+ def yamcha_learn(train_data, model)
+ libexecdir = `yamcha-config --libexecdir`.chomp
+ system "cp #{libexecdir}/Makefile ."
+ system "make CORPUS=#{train_data} MODEL=#{model} FEATURE=\"F:0..0:1..\" SVM_PARAM=\"-t 1 -d 2 -c 1\" MULTI_CLASS=1 train"
  end
  end
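The rows that Converter feeds to YamCha can be inspected without YamCha installed: each chunk becomes one space-separated row holding the chunk text, four features (surface form, first two POS levels, conjugation form) for each of five context tokens (semantic head, case particle, syntactic head, the token after the syntactic head, and the semantic head of the destination chunk), and the current dependency letter, with a blank line between sentences. A rough sketch, reusing the toy sentence from the parser example above:

```ruby
require 'kokugo_tagger'

lines = []
lines << "* 0 1D 0/1 0.000000"
lines << "猫\t名詞,普通名詞,一般,*,*,*"
lines << "が\t助詞,格助詞,*,*,*,*"
lines << "* 1 -1D 0/0 0.000000"
lines << "鳴く\t動詞,一般,*,*,五段-カ行,終止形-一般"
lines << "EOS"

# Converter.connect returns the read end of a pipe; a background thread
# writes one feature row per chunk and a blank line per sentence.
rows = KokugoTagger::Converter.connect(lines.join("\n") + "\n")
rows.each_line { |row| print row }
```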
data/lib/kokugo_tagger/version.rb CHANGED
@@ -1,3 +1,3 @@
  module KokugoTagger
- VERSION = "0.0.6"
+ VERSION = "1.0.3"
  end
data/lib/kokugo_tagger.rb CHANGED
@@ -1,7 +1,7 @@
  require "kokugo_tagger/version"
- require "kokugo_tagger/parser"
+ require "kokugo_tagger/cabocha"
  require "kokugo_tagger/tagger"

  module KokugoTagger
  # Your code goes here...
- end
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: kokugo_tagger
  version: !ruby/object:Gem::Version
- version: 0.0.6
+ version: 1.0.3
  platform: ruby
  authors:
  - Mizuho IMADA
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2015-08-31 00:00:00.000000000 Z
+ date: 2016-02-16 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: bundler
@@ -54,7 +54,8 @@ files:
  - bin/kokugo_tagger
  - kokugo_tagger.gemspec
  - lib/kokugo_tagger.rb
- - lib/kokugo_tagger/parser.rb
+ - lib/kokugo_tagger/cabocha.rb
+ - lib/kokugo_tagger/kokugo.model
  - lib/kokugo_tagger/tagger.rb
  - lib/kokugo_tagger/version.rb
  homepage: ''
@@ -77,9 +78,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  version: '0'
  requirements: []
  rubyforge_project:
- rubygems_version: 2.4.5
+ rubygems_version: 2.5.2
  signing_key:
  specification_version: 4
  summary: Write a short summary. Required.
  test_files: []
- has_rdoc:
data/lib/kokugo_tagger/parser.rb DELETED
@@ -1,47 +0,0 @@
- # -*- coding: utf-8 -*-
- require 'csv'
-
- module CabochaParser
- def parse(line)
- case line.chomp
- when /^#/
- return parse_excab(line)
- when /^\*/
- return parse_chunk(line)
- when 'EOS'
- return {type: 'EOS'}
- when ''
- return nil
- else
- return parse_token(line)
- end
- end
- def parse_excab(line)
- line = line.gsub('\"', '""')
- null, type, *data = CSV.parse_line(line.chomp, col_sep:' ')
- case type
- when 'SEGMENT', 'SEGMENT_S', 'LINK', 'LINK_S'
- excab = {type: type, name: data[0], start: data[1].to_i, end: data[2].to_i, comment: data[3]}
- when 'GROUP', 'GROUP_S'
- excab = {type: type, name: data[0], member: data[1..-2], comment: data[-1]}
- when 'ATTR'
- excab = {type: type, name: data[0], value: data[1]}
- end
- return excab
- end
- def parse_chunk(line)
- null, id, dep, part, score = line.chomp.split("\s")
- link, rel = dep[0..-2], dep[-1]
- head, func = part.split('/')
- chunk = {type: 'CHUNK', id: id, link: link, rel: rel, head: head, func: func, score: score}
- return chunk
- end
- def parse_token(line)
- text, attrs, ne = line.chomp.split("\t")
- attrs = CSV.parse_line(attrs, col_sep:',')
- pos = attrs[0, 4].delete_if{|item| item.empty?}.join('-')
- token = {type: 'TOKEN', text: text, ne: ne, pos: pos, ctype: attrs[4], cform: attrs[5]}
- return token
- end
- module_function :parse, :parse_excab, :parse_chunk, :parse_token
- end