kokugo_tagger 0.0.6 → 1.0.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 07d91224845f955eb5e2433d059361d99f2de2f4
4
- data.tar.gz: fbe4d1d40cd9f533f745c28bcacc3954aee77f58
3
+ metadata.gz: 572d91b93afea39ab298c267f0068436e62670c1
4
+ data.tar.gz: a5ffe79141ca346aa6c346b7514dab584eb234eb
5
5
  SHA512:
6
- metadata.gz: 700d2dcf6d0dab285c7036ea32ba3e88ae3f5f51edb5434dd0a32c61745a6455ef0aad9a67074452c82a6036485136c021c8ecd05a50c30393351430e9e9a157
7
- data.tar.gz: 713b0e7ff1872541a8fc5fd6dcd681cb58fb8d2d988cef6694650ee18d53f6a0529946cac4f4fce117ae4630286ba86015a923f901966553e6dea0c2ad9cda95
6
+ metadata.gz: 57414c4a4ec004c4f8a1483118ef638d58d744e1b0b71a9e7cb43a1f89e660de3fd9fe4408dc350901f5afb7c3094a2ad83b8461c406766b8b554b9345c0f411
7
+ data.tar.gz: 459cc09789690d93f943d027af74481aecce5ebd3283181c77c809c1f83f0e6fef56db379c10da69c3bc7a313a9c0711a3ce35fe5a04957601bbc9a24ddcf861
data/README.md CHANGED
@@ -1,26 +1,51 @@
1
1
  # KokugoTagger
2
2
 
3
- TODO: Write a gem description
3
+ cabocha形式のファイルに対して、学校文法に準拠した係り受けラベルを付与します。
4
4
 
5
5
  ## Installation
6
6
 
7
- Add this line to your application's Gemfile:
7
+ 事前に以下のツールをインストールし、パスを通しておく必要があります。
8
8
 
9
- ```ruby
10
- gem 'kokugo_tagger'
11
- ```
9
+ - Ruby
10
+ - YamCha
12
11
 
13
- And then execute:
14
-
15
- $ bundle
16
-
17
- Or install it yourself as:
12
+ コマンドラインから以下のように入力し、インストールしてください。
18
13
 
19
14
  $ gem install kokugo_tagger
20
15
 
21
16
  ## Usage
22
17
 
23
- TODO: Write usage instructions here
18
+ UTF-8/UniDicのCaboCha形式データにのみ対応しています。CaboCha形式のファイルにラベルを付与する場合は、次のように実行してください。
19
+
20
+ $ cat neko.cabocha | kokugo_tagger > output.cabocha
21
+
22
+ プレーンテキストの場合は、次のようにCaboChaと組み合わせて下さい。CaboChaは別途インストールしてください。
23
+
24
+ $ cat neko.txt | cabocha -f1 | kokugo_tagger > output.cabocha
25
+
26
+ コマンドライン上で対話的に実行することもできます。
27
+
28
+ $ cabocha -f1 | kokugo_tagger
29
+ 吾輩は猫である
30
+ * 0 1S 0/1 0.000000
31
+ 吾輩 代名詞,*,*,*,*,*,ワガハイ,我が輩,吾輩,ワガハイ,吾輩,ワガハイ,混,*,*,*,*,ワガハイ,ワガハイ,ワガハイ,ワガハイ,*,*,0,*,* O
32
+ は 助詞,係助詞,*,*,*,*,ハ,は,は,ワ,は,ワ,和,*,*,*,*,ハ,ハ,ハ,ハ,*,*,*,"動詞%F2@0,名詞%F1,形容詞%F2@-1",* O
33
+ * 1 -1X 2/2 0.000000
34
+ 猫 名詞,普通名詞,一般,*,*,*,ネコ,猫,猫,ネコ,猫,ネコ,和,*,*,*,*,ネコ,ネコ,ネコ,ネコ,*,*,1,C4,* O
35
+ で 助動詞,*,*,*,助動詞-ダ,連用形-一般,ダ,だ,で,デ,だ,ダ,和,*,*,*,*,デ,ダ,デ,ダ,*,*,*,名詞%F1,* O
36
+ ある 動詞,非自立可能,*,*,五段-ラ行,終止形-一般,アル,有る,ある,アル,ある,アル,和,*,*,*,*,アル,アル,アル,アル,*,*,1,C3,* O
37
+ EOS
38
+
39
+ 係り受けラベルは以下の8種類です。
40
+
41
+ - S: 主語
42
+ - R: 連用修飾語
43
+ - T: 連体修飾語
44
+ - Z: 接続語
45
+ - D: 独立語
46
+ - H: 並立の関係
47
+ - J: 補助の関係
48
+ - X: その他(文末など)
24
49
 
25
50
  ## Contributing
26
51
 
data/bin/kokugo_tagger CHANGED
@@ -1,6 +1,27 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
+ require 'optparse'
3
4
  require 'kokugo_tagger'
4
5
 
5
- Encoding.default_external = 'UTF-8'
6
- KokugoTagger.annotate ARGF
6
+ $enc = "UTF-8"
7
+ $model = nil
8
+ $learn = nil
9
+ $valid = nil
10
+
11
+ OptionParser.new do |opt|
12
+ Version = "0.1.0"
13
+ opt.on('-e STR', '--encoding STR', String, 'encoding'){|str| $enc = str }
14
+ opt.on('-m FILE', '--model FILE', String, 'model file'){|file| $model = file }
15
+ opt.on('-l DIR', '--learn DIR', String, 'train corpus directory'){|dir| $learn = dir }
16
+ opt.on('-v DIR', '--valid DIR', String, 'K-fold cross-validation'){|dir| $learn = dir; $valid = 3 }
17
+ opt.parse!
18
+ end
19
+
20
+ # $stdin.set_encoding 'UTF-8'
21
+ if $learn and $valid
22
+ KokugoTagger.validation $learn, $enc, $valid
23
+ elsif $learn
24
+ KokugoTagger.learn $learn, $model, $enc
25
+ else
26
+ KokugoTagger.annotate $stdin, $model, $enc
27
+ end
@@ -0,0 +1,75 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ module KokugoTagger::Cabocha
4
+ module_function
5
+ def parse(file = @source)
6
+ document = KokugoTagger::Cabocha::Document.new
7
+ sentence, chunk = nil, nil
8
+ file.each_line do |line|
9
+ sentence ||= KokugoTagger::Cabocha::Sentence.new
10
+ case line
11
+ when /^EOS/
12
+ sentence.each{|chunk| chunk.detect_structure} # sem_headとsyn_headの独自判定
13
+ document << sentence
14
+ sentence, chunk = nil, nil
15
+ when /^\*/
16
+ chunk = KokugoTagger::Cabocha::Chunk.new(line)
17
+ sentence << chunk
18
+ chunk.sentence = sentence
19
+ when /^#/
20
+ # nothing
21
+ else
22
+ token = KokugoTagger::Cabocha::Token.new(line)
23
+ chunk.tokens << token
24
+ end
25
+ end
26
+ return document
27
+ end
28
+ end
29
+
30
+ class KokugoTagger::Cabocha::Document < Array
31
+ end
32
+
33
+ class KokugoTagger::Cabocha::Sentence < Array
34
+ end
35
+
36
+ class KokugoTagger::Cabocha::Chunk
37
+ attr_accessor :info, :id, :link, :rel, :sem_head_id, :syn_head_id, :ext, :tokens, :sentence
38
+ def initialize(line)
39
+ @info = line.chomp.split(/\s/)[1..-1]
40
+ @id = @info[0].to_i
41
+ @link = @info[1].to_i
42
+ @rel = @info[1].delete("-0-9")
43
+ @sem_head_id = @info[2].split('/')[0].to_i
44
+ @syn_head_id = @info[2].split('/')[1].to_i
45
+ @ext = @info[4]
46
+ @tokens = []
47
+ end
48
+ def detect_structure
49
+ sem_head_id, syn_head_id = 0, 0
50
+ @tokens.each_with_index do |token, num|
51
+ if token.pos =~ /^(助詞|助動詞)/
52
+ syn_head_id = num
53
+ elsif token.pos !~ /^(補助記号|空白)/
54
+ sem_head_id = num if sem_head_id == syn_head_id
55
+ syn_head_id = num
56
+ end
57
+ end
58
+ @sem_head_id, @syn_head_id = sem_head_id, syn_head_id
59
+ end
60
+ def text
61
+ @tokens.map{|token| token.text}.join
62
+ end
63
+ end
64
+
65
+ class KokugoTagger::Cabocha::Token
66
+ attr_accessor :info, :text, :pos, :ctype, :cform
67
+ def initialize(line)
68
+ text, info = line.chomp.split("\t")
69
+ @info = info.split(",")
70
+ @text = text
71
+ @pos = @info[0, 4].delete_if{|s| s == '*'}.join('-')
72
+ @ctype = @info[4]
73
+ @cform = @info[5]
74
+ end
75
+ end
Binary file
@@ -1,151 +1,241 @@
1
1
  # -*- coding: utf-8 -*-
2
- require 'csv'
3
2
 
4
3
  module KokugoTagger
5
4
  module_function
6
- def annotate(file)
7
- file.each_line do |line|
8
- next unless data = CabochaParser.parse(line)
9
- method_name = data[:type].downcase.to_sym
10
- method(method_name).call(data) if methods.include?(method_name)
11
- puts line
5
+ def annotate(source, model, enc)
6
+ enc ||= "UTF-8"
7
+ source.set_encoding enc
8
+ source_0, source_1 = KokugoTagger::Duplicator.duplicate(source, 2)
9
+ converter = KokugoTagger::Converter.connect(source_0)
10
+ yamcha = KokugoTagger::Yamcha.connect(converter, model)
11
+ merger = KokugoTagger::Merger.connect(yamcha, source_1)
12
+ merger.each_line{|line| puts line }
13
+ end
14
+ def convert(source)
15
+ converter = KokugoTagger::Converter.connect(source)
16
+ converter.each_line{|line| puts line }
17
+ end
18
+ def learn(source_dir, model, enc)
19
+ model ||= "kokugo"
20
+ enc ||= "UTF-8"
21
+ model = File.basename(model, ".model")
22
+ KokugoTagger::Learner.learn source_dir, model, enc
23
+ end
24
+ def validation(source_dir, enc, k)
25
+ enc ||= "UTF-8"
26
+ k_num ||= 3
27
+ KokugoTagger::Learner.validation source_dir, enc, k
28
+ end
29
+ end
30
+
31
+ module KokugoTagger::Duplicator
32
+ module_function
33
+ def duplicate(source, number = 2)
34
+ pipes = number.times.map{ IO.pipe("UTF-8") }
35
+ Thread.fork do
36
+ source.each_line { |line| pipes.each{|r, w| w.puts line } }
37
+ pipes.each{|r, w| w.close}
38
+ end.abort_on_exception = true
39
+ return pipes.map{|r, w| r }
40
+ end
41
+ end
42
+
43
+ module KokugoTagger::Converter
44
+ module_function
45
+ def connect(source)
46
+ read, write = IO.pipe("UTF-8")
47
+ Thread.fork do
48
+ self.process source, write
49
+ write.close
50
+ end.abort_on_exception = true
51
+ return read
52
+ end
53
+ def process(source, output)
54
+ buffer = ""
55
+ source.each_line do |line|
56
+ buffer << line
57
+ self.flush(buffer, output) if line.chomp == "EOS"
12
58
  end
13
59
  end
14
- def chunk(data)
15
- @chunks ||= []
16
- @chunks << @chunk = data
17
- @lpos ||= 0
18
- @chunk.update start:@lpos, end:@lpos, text:'', pos:nil, pred:nil, conj:nil
19
- end
20
- def token(data)
21
- @lpos += data[:text].size
22
- @chunk[:end] = @lpos
23
- @chunk[:text] += data[:text]
24
- pos data
25
- cform data
26
- end
27
- def segment_s(data)
28
- @segments ||= []
29
- @segments << data
30
- @last_item = data
31
- end
32
- def group_s(data)
33
- @groups ||= []
34
- @groups << data
35
- @last_item = data
36
- end
37
- def attr(data)
38
- @last_item[:attributes] ||= []
39
- @last_item[:attributes] << data
40
- end
41
- def eos(data)
42
- return unless @chunks
43
- before_eos
44
- @chunks.each do |chunk|
45
- puts '#! SEGMENT_S bccwj-kok:Bnst %d %d "%s"' % [chunk[:start], chunk[:end], chunk[:text]]
46
- puts '#! ATTR bccwj-kok:pred "%s述語"' % chunk[:pos] if chunk[:pred]
47
- puts '#! ATTR bccwj-kok:conj "%s"' % chunk[:conj] if chunk[:conj]
60
+ def flush(buffer, output)
61
+ document = KokugoTagger::Cabocha.parse(buffer)
62
+ document.each do |sentence|
63
+ sentence.each { |chunk| output.puts chunk_features(chunk).join(" ") }
64
+ output.puts
48
65
  end
49
- @chunks, @chunk, @lpos, @segments, @groups = nil
50
- end
51
- def pos(token)
52
- case token[:pos]
53
- when /^(名詞|代名詞|接尾辞-名詞的)/
54
- @chunk.update pos:'名詞', pred:nil, conj:nil
55
- when /^(形状詞|接尾辞-形状詞的)/
56
- @chunk.update pos:'形状詞', pred:nil, conj:nil
57
- when /^連体詞/
58
- @chunk.update pos:'連体詞', pred:nil, conj:'修飾(連体)'
59
- when /^副詞/
60
- @chunk.update pos:'副詞', pred:nil, conj:'修飾(連用)'
61
- when /^接続詞/
62
- @chunk.update pos:'接続詞', pred:nil, conj:'接続'
63
- when /^感動詞/
64
- @chunk.update pos:'感動詞', pred:nil, conj:'独立'
65
- when /^(動詞|接尾辞-動詞的)/
66
- @chunk.update pos:'動詞', pred:true, conj:nil
67
- when /^(形容詞|接尾辞-形容詞的)/
68
- @chunk.update pos:'形容詞', pred:true, conj:nil
69
- when /^助動詞/
70
- @chunk.update pred:true, conj:nil
71
- when /^助詞-格助詞/
72
- case token[:text]
73
- when 'が'
74
- @chunk.update conj:'主語'
75
- when 'を', 'に'
76
- @chunk.update conj:'補語'
77
- when 'の', 'との', 'という', 'といった'
78
- @chunk.update conj:'修飾(連体)'
79
- else
80
- @chunk.update conj:'修飾(連用)'
66
+ buffer.clear
67
+ end
68
+ def token_features(token)
69
+ return %w(* * * *) unless token
70
+ text, pos, cform = token.text, token.pos.split('-'), token.cform.split('-')
71
+ features = [text, pos[0], pos[1], cform[0]].map{|f| f || '*'}
72
+ return features
73
+ end
74
+ def chunk_features(chunk)
75
+ tokens = [sem_head(chunk), case_marker(chunk), syn_head(chunk), punct(chunk), sem_head(link_to(chunk))]
76
+ features = tokens.map{|token| token_features(token)}
77
+ return [chunk.text, features, chunk.rel].flatten
78
+ # return [chunk.text, features, chunk.ext].flatten
79
+ end
80
+ def sem_head(chunk)
81
+ return nil unless chunk
82
+ chunk.tokens[chunk.sem_head_id]
83
+ end
84
+ def case_marker(chunk)
85
+ return nil unless chunk
86
+ chunk.tokens.find{|token| token.pos.split('-')[1] == '格助詞'}
87
+ end
88
+ def syn_head(chunk)
89
+ return nil unless chunk
90
+ chunk.tokens[chunk.syn_head_id]
91
+ end
92
+ def punct(chunk)
93
+ return nil unless chunk
94
+ chunk.tokens[chunk.syn_head_id + 1]
95
+ end
96
+ def link_to(chunk)
97
+ chunk.sentence[chunk.link] if chunk.link != -1
98
+ end
99
+ end
100
+
101
+ module KokugoTagger::Yamcha
102
+ module_function
103
+ def connect(source, model)
104
+ model ||= File.dirname(__FILE__) + "/kokugo.model"
105
+ io = IO.popen("yamcha -m \"#{model}\"", "r+", encoding: "UTF-8")
106
+ Thread.fork {
107
+ source.each_line{|line| io.puts line }
108
+ io.close_write
109
+ }.abort_on_exception = true
110
+ return io
111
+ end
112
+ end
113
+
114
+ module KokugoTagger::Merger
115
+ module_function
116
+ def connect(yamcha, cabocha)
117
+ read, write = IO.pipe("UTF-8")
118
+ Thread.fork do
119
+ self.process yamcha, cabocha, write
120
+ write.close
121
+ end.abort_on_exception = true
122
+ return read
123
+ end
124
+ def process(yamcha, cabocha, output)
125
+ cabocha.each_line do |line|
126
+ if line[0] == "*"
127
+ record = yamcha.gets
128
+ record = yamcha.gets until record.chomp != ""
129
+ letter = record.chomp.split("\t").last.upcase
130
+ line.sub! /[A-Z]+/, letter
81
131
  end
82
- when /^(助詞-副助詞|助詞-係助詞)/
83
- @chunk.update conj:'修飾(連用)'
84
- when /^助詞-接続詞/
85
- @chunk.update pred:true, conj:'接続'
86
- when /^助詞-終助詞/
87
- @chunk.update pred:true, conj:nil
88
- when /^助詞-準体助詞/
89
- @chunk.update conj:nil
132
+ output.puts line
90
133
  end
91
134
  end
92
- def cform(token)
93
- case token[:cform]
94
- when /^語幹/
95
- when /^(未然形|連用形|仮定形|已然形)/
96
- @chunk.update conj:'接続'
97
- when /^(意志推量形|連体形)/
98
- @chunk.update conj:'修飾(連体)'
99
- when /^(終止形|命令形)/
100
- @chunk.update conj:nil
101
- end
102
- end
103
- def before_eos
104
- # BCCWJ-DepPara
105
- @chunks.each do |chunk|
106
- chunk[:conj] = [chunk[:conj], '断片'].compact.join(':') if chunk[:rel] == 'F'
107
- chunk[:conj] = [chunk[:conj], '文節内'].compact.join(':') if chunk[:rel] == 'B'
108
- chunk[:conj] = '文末' if chunk[:rel] == 'Z'
135
+ end
136
+
137
+ module KokugoTagger::Learner
138
+ module_function
139
+ def learn(source_dir, model, enc)
140
+ model ||= "kokugo"
141
+ convert source_dir, "train.data", enc
142
+ yamcha_learn "train.data", model
143
+ end
144
+ def validation(source_dir, enc, k)
145
+ convert source_dir, "train.data", enc
146
+ filenames = split("train.data", enc, k)
147
+ filenames.each_with_index do |filename, n|
148
+ others = filenames - [filename]
149
+ concat others, "temp.data"
150
+ model = "test"
151
+ yamcha_learn "temp.data", model
152
+ result = "result.#{n}.data"
153
+ system "cat #{filename} | yamcha -m test.model > #{result}"
109
154
  end
110
- # 並列・同格関係
111
- @groups ||= []
112
- @segments ||= []
113
- @groups.each do |group|
114
- next unless group[:name] =~ /^(Parallel|Apposition)$/
115
- members = group[:member].map{|n| n.to_i}
116
- members = @segments.values_at(*members)
117
- chunk_ids = members.map do |segment|
118
- _end = segment[:end].to_i
119
- chunk = @chunks.find{|c| c[:start] < _end and c[:end] >= _end}
120
- chunk[:id].to_i if chunk
121
- end
122
- chunk_ids = chunk_ids.compact.uniq.sort
123
- if chunk_ids.size > 1
124
- conj = {'Parallel' => '並立', 'Apposition' => '同格'}[group[:name]]
125
- chunk_ids[0..-2].each{|cid| @chunks[cid][:conj] = conj}
155
+ data_set = []
156
+ filenames = Array.new(k){|n| "result.#{n}.data"}
157
+ filenames.each do |filename|
158
+ sentence = 0
159
+ chunk = 0
160
+ accuracy = 0
161
+ label_data = Hash.new{|h, k| h[k] = Hash.new(0)}
162
+ File.foreach(filename, encoding: enc) do |line|
163
+ line.chomp!
164
+ if line.empty?
165
+ sentence += 1
166
+ else
167
+ t, p = *line.split(/\s/)[-2, 2]
168
+ acc = (t == p)
169
+ chunk += 1
170
+ accuracy += 1 if acc
171
+ label_data[t][[true, acc]] += 1
172
+ label_data[p][[acc, true]] += 1
173
+ end
126
174
  end
175
+ data_set << [sentence, chunk, accuracy, label_data]
127
176
  end
128
- # 属性を付与できなかった文節に対して、係り受けを利用して属性を補完
129
- # 連用成分を受ける文節を述語とみなす
130
- @chunks.each do |chunk|
131
- chunk[:pred] ||= @chunks.any?{|_chunk| _chunk[:link] == chunk[:id] && _chunk[:conj] =~ /^(主語|補語|修飾\(連用\)|接続)$/}
177
+ report = open("validation.txt", "w:UTF-8")
178
+ report.puts "# #{k}-fold cross-validation"
179
+ report.puts
180
+ report.puts "## test files"
181
+ k.times do |n|
182
+ sentence, chunk = *data_set[n]
183
+ report.puts "train.#{n}.data: #{sentence} sentences. #{chunk} chunks."
132
184
  end
133
- # 述語にかかる文節を修飾(連用)とみなす
134
- @chunks.each do |chunk|
135
- chunk[:conj] = '修飾(連用)' if chunk[:conj] == nil && @chunks.any?{|_chunk| _chunk[:id] == chunk[:link] && _chunk[:pred]}
185
+ report.puts
186
+ k.times do |n|
187
+ sentence, chunk, accuracy, label_data = *data_set[n]
188
+ report.puts "## train.#{n}.data"
189
+ report.puts "accuracy: #{accuracy.to_f / chunk}"
190
+ report.puts "labels:"
191
+ report.puts %w(label tp tn fp fn recall precision f-score accuracy).join("\t")
192
+ label_data.sort.each do |label, data|
193
+ tp = data[[true, true]]
194
+ tn = data[[true, false]]
195
+ fp = data[[false, true]]
196
+ fn = chunk - (tp + tn + fp)
197
+ recall = tp.to_f / (tp + tn)
198
+ precision = tp.to_f / (tp + fp)
199
+ f = 2 * precision * recall / (precision + recall)
200
+ acc = (tp + fn).to_f / chunk
201
+ report.puts %w(%s %d %d %d %d %.2f %.2f %.2f %.2f).join("\t") % [label, tp, tn, fp, fn, recall, precision, f, acc]
202
+ end
203
+ report.puts
136
204
  end
137
- # 述語項構造が付与されている文節を補語にする
138
- @chunks.each do |chunk|
139
- next if chunk[:link] == '-1' || chunk[:arg] == nil
140
- next unless chunk[:conj] == nil || chunk[:conj] == '修飾(連用)'
141
- pred = @chunks[chunk[:link].to_i]
142
- if chunk[:arg] == 'Ga' and pred[:passive] == nil
143
- chunk[:conj] = '主語'
144
- elsif chunk[:arg] == 'O' and pred[:passive] == '直接'
145
- chunk[:conj] = '主語'
146
- else
147
- chunk[:conj] = '補語'
205
+ end
206
+ def convert(source_dir, target_filename, enc)
207
+ target = open(target_filename, "w:#{enc}")
208
+ source_filenames = Dir.glob(source_dir + "/*.cabocha")
209
+ # source_filenames = source_filenames[0, 2] # debug
210
+ source_filenames.each do |filename|
211
+ source = open(filename, encoding: enc)
212
+ converter = KokugoTagger::Converter.connect(source)
213
+ converter.each_line{|line| target.puts line }
214
+ source.close
215
+ end
216
+ target.close
217
+ end
218
+ def split(source_filename, enc, k)
219
+ basename = File.basename(source_filename, ".data")
220
+ target_filenames = Array.new(k){|n| "#{basename}.#{n}.data"}
221
+ targets = target_filenames.map{|filename| open(filename, "w:#{enc}")}
222
+ index = 0
223
+ File.foreach(source_filename, encoding: enc) do |line|
224
+ targets[index].puts line
225
+ if line.chomp.empty?
226
+ index += 1
227
+ index = 0 if index == k
148
228
  end
149
229
  end
230
+ targets.each{|f| f.close}
231
+ return target_filenames
232
+ end
233
+ def concat(source_filenames, target_filename)
234
+ system "cat #{source_filenames.join(" ")} > #{target_filename}"
235
+ end
236
+ def yamcha_learn(train_data, model)
237
+ libexecdir = `yamcha-config --libexecdir`.chomp
238
+ system "cp #{libexecdir}/Makefile ."
239
+ system "make CORPUS=#{train_data} MODEL=#{model} FEATURE=\"F:0..0:1..\" SVM_PARAM=\"-t 1 -d 2 -c 1\" MULTI_CLASS=1 train"
150
240
  end
151
241
  end
@@ -1,3 +1,3 @@
1
1
  module KokugoTagger
2
- VERSION = "0.0.6"
2
+ VERSION = "1.0.3"
3
3
  end
data/lib/kokugo_tagger.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  require "kokugo_tagger/version"
2
- require "kokugo_tagger/parser"
2
+ require "kokugo_tagger/cabocha"
3
3
  require "kokugo_tagger/tagger"
4
4
 
5
5
  module KokugoTagger
6
6
  # Your code goes here...
7
- end
7
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kokugo_tagger
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.6
4
+ version: 1.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mizuho IMADA
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-08-31 00:00:00.000000000 Z
11
+ date: 2016-02-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -54,7 +54,8 @@ files:
54
54
  - bin/kokugo_tagger
55
55
  - kokugo_tagger.gemspec
56
56
  - lib/kokugo_tagger.rb
57
- - lib/kokugo_tagger/parser.rb
57
+ - lib/kokugo_tagger/cabocha.rb
58
+ - lib/kokugo_tagger/kokugo.model
58
59
  - lib/kokugo_tagger/tagger.rb
59
60
  - lib/kokugo_tagger/version.rb
60
61
  homepage: ''
@@ -77,9 +78,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
77
78
  version: '0'
78
79
  requirements: []
79
80
  rubyforge_project:
80
- rubygems_version: 2.4.5
81
+ rubygems_version: 2.5.2
81
82
  signing_key:
82
83
  specification_version: 4
83
84
  summary: Write a short summary. Required.
84
85
  test_files: []
85
- has_rdoc:
@@ -1,47 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- require 'csv'
3
-
4
- module CabochaParser
5
- def parse(line)
6
- case line.chomp
7
- when /^#/
8
- return parse_excab(line)
9
- when /^\*/
10
- return parse_chunk(line)
11
- when 'EOS'
12
- return {type: 'EOS'}
13
- when ''
14
- return nil
15
- else
16
- return parse_token(line)
17
- end
18
- end
19
- def parse_excab(line)
20
- line = line.gsub('\"', '""')
21
- null, type, *data = CSV.parse_line(line.chomp, col_sep:' ')
22
- case type
23
- when 'SEGMENT', 'SEGMENT_S', 'LINK', 'LINK_S'
24
- excab = {type: type, name: data[0], start: data[1].to_i, end: data[2].to_i, comment: data[3]}
25
- when 'GROUP', 'GROUP_S'
26
- excab = {type: type, name: data[0], member: data[1..-2], comment: data[-1]}
27
- when 'ATTR'
28
- excab = {type: type, name: data[0], value: data[1]}
29
- end
30
- return excab
31
- end
32
- def parse_chunk(line)
33
- null, id, dep, part, score = line.chomp.split("\s")
34
- link, rel = dep[0..-2], dep[-1]
35
- head, func = part.split('/')
36
- chunk = {type: 'CHUNK', id: id, link: link, rel: rel, head: head, func: func, score: score}
37
- return chunk
38
- end
39
- def parse_token(line)
40
- text, attrs, ne = line.chomp.split("\t")
41
- attrs = CSV.parse_line(attrs, col_sep:',')
42
- pos = attrs[0, 4].delete_if{|item| item.empty?}.join('-')
43
- token = {type: 'TOKEN', text: text, ne: ne, pos: pos, ctype: attrs[4], cform: attrs[5]}
44
- return token
45
- end
46
- module_function :parse, :parse_excab, :parse_chunk, :parse_token
47
- end