kokugo_tagger 0.0.6 → 1.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +36 -11
- data/bin/kokugo_tagger +23 -2
- data/lib/kokugo_tagger/cabocha.rb +75 -0
- data/lib/kokugo_tagger/kokugo.model +0 -0
- data/lib/kokugo_tagger/tagger.rb +222 -132
- data/lib/kokugo_tagger/version.rb +1 -1
- data/lib/kokugo_tagger.rb +2 -2
- metadata +5 -5
- data/lib/kokugo_tagger/parser.rb +0 -47
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 572d91b93afea39ab298c267f0068436e62670c1
|
4
|
+
data.tar.gz: a5ffe79141ca346aa6c346b7514dab584eb234eb
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 57414c4a4ec004c4f8a1483118ef638d58d744e1b0b71a9e7cb43a1f89e660de3fd9fe4408dc350901f5afb7c3094a2ad83b8461c406766b8b554b9345c0f411
|
7
|
+
data.tar.gz: 459cc09789690d93f943d027af74481aecce5ebd3283181c77c809c1f83f0e6fef56db379c10da69c3bc7a313a9c0711a3ce35fe5a04957601bbc9a24ddcf861
|
data/README.md
CHANGED
@@ -1,26 +1,51 @@
|
|
1
1
|
# KokugoTagger
|
2
2
|
|
3
|
-
|
3
|
+
cabocha形式のファイルに対して、学校文法に準拠した係り受けラベルを付与します。
|
4
4
|
|
5
5
|
## Installation
|
6
6
|
|
7
|
-
|
7
|
+
事前に以下のツールをインストールし、パスを通しておく必要があります。
|
8
8
|
|
9
|
-
|
10
|
-
|
11
|
-
```
|
9
|
+
- Ruby
|
10
|
+
- YamCha
|
12
11
|
|
13
|
-
|
14
|
-
|
15
|
-
$ bundle
|
16
|
-
|
17
|
-
Or install it yourself as:
|
12
|
+
コマンドラインから以下のように入力し、インストールしてください。
|
18
13
|
|
19
14
|
$ gem install kokugo_tagger
|
20
15
|
|
21
16
|
## Usage
|
22
17
|
|
23
|
-
|
18
|
+
UTF-8/UniDicのCaboCha形式データにのみ対応しています。CaboCha形式のファイルにラベルを付与する場合は、次のように実行してください。
|
19
|
+
|
20
|
+
$ cat neko.cabocha | kokugo_tagger > output.cabocha
|
21
|
+
|
22
|
+
プレーンテキストの場合は、次のようにCaboChaと組み合わせて下さい。CaboChaは別途インストールしてください。
|
23
|
+
|
24
|
+
$ cat neko.txt | cabocha -f1 | kokugo_tagger > output.cabocha
|
25
|
+
|
26
|
+
コマンドライン上で対話的に実行することもできます。
|
27
|
+
|
28
|
+
$ cabocha -f1 | kokugo_tagger
|
29
|
+
吾輩は猫である
|
30
|
+
* 0 1S 0/1 0.000000
|
31
|
+
吾輩 代名詞,*,*,*,*,*,ワガハイ,我が輩,吾輩,ワガハイ,吾輩,ワガハイ,混,*,*,*,*,ワガハイ,ワガハイ,ワガハイ,ワガハイ,*,*,0,*,* O
|
32
|
+
は 助詞,係助詞,*,*,*,*,ハ,は,は,ワ,は,ワ,和,*,*,*,*,ハ,ハ,ハ,ハ,*,*,*,"動詞%F2@0,名詞%F1,形容詞%F2@-1",* O
|
33
|
+
* 1 -1X 2/2 0.000000
|
34
|
+
猫 名詞,普通名詞,一般,*,*,*,ネコ,猫,猫,ネコ,猫,ネコ,和,*,*,*,*,ネコ,ネコ,ネコ,ネコ,*,*,1,C4,* O
|
35
|
+
で 助動詞,*,*,*,助動詞-ダ,連用形-一般,ダ,だ,で,デ,だ,ダ,和,*,*,*,*,デ,ダ,デ,ダ,*,*,*,名詞%F1,* O
|
36
|
+
ある 動詞,非自立可能,*,*,五段-ラ行,終止形-一般,アル,有る,ある,アル,ある,アル,和,*,*,*,*,アル,アル,アル,アル,*,*,1,C3,* O
|
37
|
+
EOS
|
38
|
+
|
39
|
+
係り受けラベルは以下の8種類です。
|
40
|
+
|
41
|
+
- S: 主語
|
42
|
+
- R: 連用修飾語
|
43
|
+
- T: 連体修飾語
|
44
|
+
- Z: 接続語
|
45
|
+
- D: 独立語
|
46
|
+
- H: 並立の関係
|
47
|
+
- J: 補助の関係
|
48
|
+
- X: その他(文末など)
|
24
49
|
|
25
50
|
## Contributing
|
26
51
|
|
data/bin/kokugo_tagger
CHANGED
@@ -1,6 +1,27 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
+
require 'optparse'
|
3
4
|
require 'kokugo_tagger'
|
4
5
|
|
5
|
-
|
6
|
-
|
6
|
+
# Command-line settings, kept in globals exactly as before.
#   $enc   - character encoding of the input/corpus (default "UTF-8")
#   $model - model file given with -m (nil when not given)
#   $learn - corpus directory given with -l or -v (nil when not given)
#   $valid - fold count; set to 3 only when -v is given
$enc, $model, $learn, $valid = "UTF-8", nil, nil, nil

# OptionParser reads the top-level Version constant for --version.
Version = "0.1.0"

parser = OptionParser.new
parser.on('-e STR', '--encoding STR', String, 'encoding') do |str|
  $enc = str
end
parser.on('-m FILE', '--model FILE', String, 'model file') do |file|
  $model = file
end
parser.on('-l DIR', '--learn DIR', String, 'train corpus directory') do |dir|
  $learn = dir
end
parser.on('-v DIR', '--valid DIR', String, 'K-fold cross-validation') do |dir|
  $learn = dir
  $valid = 3
end
parser.parse!

# $stdin.set_encoding 'UTF-8'

# Dispatch: without a corpus directory annotate stdin; with one, either
# cross-validate (-v) or train a model (-l).
if !$learn
  KokugoTagger.annotate($stdin, $model, $enc)
elsif $valid
  KokugoTagger.validation($learn, $enc, $valid)
else
  KokugoTagger.learn($learn, $model, $enc)
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
|
3
|
+
module KokugoTagger
  # Reader for CaboCha lattice-format text (chunk headers, token lines, EOS).
  # Defined in nested form so the file loads even if it is required before
  # the rest of the KokugoTagger namespace.
  module Cabocha
    module_function

    # Parses CaboCha-format text into a Document of Sentences of Chunks.
    #
    # file - any object responding to #each_line (an IO or a String).
    #
    # Line handling:
    #   * blank lines are ignored;
    #   * a line that is exactly "EOS" (trailing whitespace allowed) ends the
    #     sentence -- the same test Converter.process uses to flush -- so a
    #     token whose surface merely starts with "EOS" stays a token;
    #   * only "* <digits> ..." is a chunk header, so a token whose surface
    #     form is "*" ("*<TAB>features") is kept as a token;
    #   * lines starting with "#" are skipped;
    #   * every other line is a token of the current chunk.
    #
    # Returns the Document. Raises ArgumentError when a token line appears
    # before any chunk header of its sentence.
    def parse(file = @source)
      document = KokugoTagger::Cabocha::Document.new
      sentence = nil
      chunk = nil
      file.each_line do |line|
        content = line.chomp
        next if content.empty?
        sentence ||= KokugoTagger::Cabocha::Sentence.new
        case content
        when /\AEOS\s*\z/
          # Recompute sem_head / syn_head with this library's own rules.
          sentence.each(&:detect_structure)
          document << sentence
          sentence = nil
          chunk = nil
        when /\A\* \d/
          chunk = KokugoTagger::Cabocha::Chunk.new(line)
          sentence << chunk
          chunk.sentence = sentence
        when /\A#/
          # comment / extended annotation line: nothing to do
        else
          unless chunk
            raise ArgumentError, "token line appears before any chunk header: #{content.inspect}"
          end
          chunk.tokens << KokugoTagger::Cabocha::Token.new(line)
        end
      end
      document
    end
  end
end
|
29
|
+
|
30
|
+
# A parsed document: a plain Array whose elements are the sentences that
# KokugoTagger::Cabocha.parse appends, in input order.
class KokugoTagger::Cabocha::Document < Array; end
|
32
|
+
|
33
|
+
# A parsed sentence: a plain Array whose elements are the chunks that
# KokugoTagger::Cabocha.parse appends, in input order.
class KokugoTagger::Cabocha::Sentence < Array; end
|
35
|
+
|
36
|
+
module KokugoTagger
  module Cabocha
    # One chunk of a CaboCha sentence, built from a header line such as
    # "* 0 1S 0/1 0.000000" plus the token lines that follow it.
    #
    # Attributes:
    #   info        - header fields after the leading "*"
    #   id          - chunk index (first field)
    #   link        - index of the chunk this one depends on (-1 for none)
    #   rel         - dependency label: the second field with digits and "-"
    #                 removed
    #   sem_head_id - index into tokens of the semantic head
    #   syn_head_id - index into tokens of the syntactic head
    #   ext         - fifth header field, nil when absent
    #   tokens      - tokens belonging to the chunk
    #   sentence    - owning sentence (assigned by the parser)
    class Chunk
      attr_accessor :info, :id, :link, :rel, :sem_head_id, :syn_head_id, :ext, :tokens, :sentence

      # line - the chunk header line. Missing fields no longer raise: absent
      # numeric fields read as 0 and an absent label as "".
      def initialize(line)
        @info = line.chomp.split(/\s/)[1..-1] || []
        dependency = @info[1].to_s
        heads = @info[2].to_s.split('/')
        @id = @info[0].to_i
        @link = dependency.to_i
        # "-0-9" is a character set: a literal "-" plus the digits 0-9.
        @rel = dependency.delete("-0-9")
        @sem_head_id = heads[0].to_i
        @syn_head_id = heads[1].to_i
        @ext = @info[4]
        @tokens = []
      end

      # Recomputes sem_head_id / syn_head_id from the tokens' parts of speech,
      # replacing the values read from the header.
      #
      # Particles and auxiliary verbs only move the syntactic head. Tokens
      # that are neither those nor supplementary symbols / whitespace move
      # the syntactic head too, and also move the semantic head as long as
      # the two heads still coincide.
      def detect_structure
        sem_head_id = 0
        syn_head_id = 0
        @tokens.each_with_index do |token, num|
          if token.pos =~ /^(助詞|助動詞)/
            syn_head_id = num
          elsif token.pos !~ /^(補助記号|空白)/
            sem_head_id = num if sem_head_id == syn_head_id
            syn_head_id = num
          end
        end
        @sem_head_id, @syn_head_id = sem_head_id, syn_head_id
      end

      # Surface text of the chunk: the tokens' texts concatenated.
      def text
        @tokens.map(&:text).join
      end
    end
  end
end
|
64
|
+
|
65
|
+
module KokugoTagger
  module Cabocha
    # One token line: "<surface>\t<comma-separated features>".
    #
    # Attributes:
    #   text  - surface form (nil for a line with no content)
    #   info  - feature fields, split on ","
    #   pos   - first four feature fields joined with "-", skipping "*"
    #   ctype - fifth feature field
    #   cform - sixth feature field
    class Token
      attr_accessor :info, :text, :pos, :ctype, :cform

      # line - the raw token line. A line without a tab (or an empty line)
      # yields an empty feature list instead of raising NoMethodError.
      #
      # NOTE(review): features are split on every comma, so a quoted field
      # that itself contains commas is broken apart. Only fields 0-5 are read
      # here; confirm none of those can be quoted.
      def initialize(line)
        text, info = line.chomp.split("\t")
        @info = info.to_s.split(",")
        @text = text
        @pos = @info[0, 4].reject { |field| field == '*' }.join('-')
        @ctype = @info[4]
        @cform = @info[5]
      end
    end
  end
end
|
Binary file
|
data/lib/kokugo_tagger/tagger.rb
CHANGED
@@ -1,151 +1,241 @@
|
|
1
1
|
# -*- coding: utf-8 -*-
|
2
|
-
require 'csv'
|
3
2
|
|
4
3
|
# Top-level entry points used by the command-line script.
module KokugoTagger
  module_function

  # Labels CaboCha-format text read from +source+ and prints the result to
  # standard output.
  #
  # source - readable IO; its external encoding is set to +enc+
  # model  - YamCha model file, or nil to let Yamcha.connect pick its default
  # enc    - encoding name, or nil for "UTF-8"
  #
  # The input is duplicated into two streams: one is converted to feature
  # rows and piped through YamCha, the other is merged with YamCha's output.
  def annotate(source, model, enc)
    enc ||= "UTF-8"
    source.set_encoding enc
    source_0, source_1 = KokugoTagger::Duplicator.duplicate(source, 2)
    converter = KokugoTagger::Converter.connect(source_0)
    yamcha = KokugoTagger::Yamcha.connect(converter, model)
    merger = KokugoTagger::Merger.connect(yamcha, source_1)
    merger.each_line { |line| puts line }
  end

  # Prints the feature rows produced by Converter for +source+.
  def convert(source)
    converter = KokugoTagger::Converter.connect(source)
    converter.each_line { |line| puts line }
  end

  # Trains a model from the corpus in +source_dir+.
  #
  # model - model name or file; defaults to "kokugo". Only its base name
  #         without a ".model" suffix is passed on, so any directory part
  #         is dropped.
  # enc   - corpus encoding, or nil for "UTF-8"
  def learn(source_dir, model, enc)
    model ||= "kokugo"
    enc ||= "UTF-8"
    model = File.basename(model, ".model")
    KokugoTagger::Learner.learn source_dir, model, enc
  end

  # Runs k-fold cross-validation over the corpus in +source_dir+.
  #
  # enc - corpus encoding, or nil for "UTF-8"
  # k   - number of folds, or nil for 3 (the default used to be assigned to
  #       an unused local, so a nil +k+ was passed through unchanged)
  def validation(source_dir, enc, k)
    enc ||= "UTF-8"
    k ||= 3
    KokugoTagger::Learner.validation source_dir, enc, k
  end
end
|
30
|
+
|
31
|
+
module KokugoTagger
  # Fans one line-oriented input out to several independent readers.
  module Duplicator
    module_function

    # Copies every line of +source+ into +number+ pipes on a background
    # thread and returns the read ends.
    #
    # source - object responding to #each_line
    # number - how many copies to produce (default 2)
    #
    # The write ends are closed when copying finishes, and also when it
    # fails, so readers always reach end-of-file. An exception in the copying
    # thread aborts the process (abort_on_exception).
    def duplicate(source, number = 2)
      pipes = Array.new(number) { IO.pipe("UTF-8") }
      writers = pipes.map(&:last)
      worker = Thread.new do
        begin
          source.each_line { |line| writers.each { |writer| writer.puts line } }
        ensure
          writers.each { |writer| writer.close unless writer.closed? }
        end
      end
      worker.abort_on_exception = true
      pipes.map(&:first)
    end
  end
end
|
42
|
+
|
43
|
+
module KokugoTagger
  # Turns CaboCha-format text into space-separated feature rows, one row per
  # chunk, with an empty line after every sentence.
  module Converter
    module_function

    # Starts converting +source+ on a background thread and returns the read
    # end of a pipe carrying the feature rows. The write end is closed when
    # conversion finishes or fails; an exception in the thread aborts the
    # process (abort_on_exception).
    def connect(source)
      read, write = IO.pipe("UTF-8")
      worker = Thread.new do
        begin
          process source, write
        ensure
          write.close unless write.closed?
        end
      end
      worker.abort_on_exception = true
      read
    end

    # Buffers lines from +source+ and converts the buffer each time a line
    # that is exactly "EOS" arrives. Lines after the last "EOS" are never
    # converted.
    def process(source, output)
      buffer = ""
      source.each_line do |line|
        buffer << line
        flush(buffer, output) if line.chomp == "EOS"
      end
    end

    # Parses +buffer+, writes one feature row per chunk and a blank line per
    # sentence to +output+, then empties the buffer in place.
    def flush(buffer, output)
      KokugoTagger::Cabocha.parse(buffer).each do |sentence|
        sentence.each { |chunk| output.puts chunk_features(chunk).join(" ") }
        output.puts
      end
      buffer.clear
    end

    # Four features of a token: surface text, the first two levels of its
    # part of speech, and the first part of its conjugation form. Missing
    # values -- including a nil token, pos or cform -- become "*".
    def token_features(token)
      return %w(* * * *) unless token
      pos = token.pos.to_s.split('-')
      cform = token.cform.to_s.split('-')
      [token.text, pos[0], pos[1], cform[0]].map { |feature| feature || '*' }
    end

    # Feature row of a chunk: its text, the token features of (semantic head,
    # case particle, syntactic head, the token after the syntactic head,
    # semantic head of the chunk it links to), then its current label.
    def chunk_features(chunk)
      tokens = [sem_head(chunk), case_marker(chunk), syn_head(chunk), punct(chunk), sem_head(link_to(chunk))]
      features = tokens.map { |token| token_features(token) }
      [chunk.text, features, chunk.rel].flatten
      # return [chunk.text, features, chunk.ext].flatten
    end

    # Token at the chunk's semantic head index; nil for a nil chunk.
    def sem_head(chunk)
      return nil unless chunk
      chunk.tokens[chunk.sem_head_id]
    end

    # First token whose second part-of-speech level is 格助詞 (case
    # particle); nil when there is none or the chunk is nil.
    def case_marker(chunk)
      return nil unless chunk
      chunk.tokens.find { |token| token.pos.split('-')[1] == '格助詞' }
    end

    # Token at the chunk's syntactic head index; nil for a nil chunk.
    def syn_head(chunk)
      return nil unless chunk
      chunk.tokens[chunk.syn_head_id]
    end

    # Token immediately after the syntactic head, if any.
    def punct(chunk)
      return nil unless chunk
      chunk.tokens[chunk.syn_head_id + 1]
    end

    # Chunk this chunk depends on, or nil when its link is -1.
    def link_to(chunk)
      chunk.sentence[chunk.link] if chunk.link != -1
    end
  end
end
|
100
|
+
|
101
|
+
module KokugoTagger
  # Bridge to the external `yamcha` command.
  module Yamcha
    module_function

    # Starts `yamcha -m <model>` and feeds it every line of +source+ on a
    # background thread.
    #
    # source - object responding to #each_line (the feature rows)
    # model  - model file path; nil selects "kokugo.model" next to this file
    #
    # Returns the process IO; read YamCha's output from it. The command is
    # passed as an argument list, so the model path is never interpreted by
    # a shell (spaces, quotes or "$" in the path are safe). As a result, a
    # missing `yamcha` executable raises Errno::ENOENT here instead of
    # producing an empty stream.
    def connect(source, model)
      model ||= File.dirname(__FILE__) + "/kokugo.model"
      io = IO.popen(["yamcha", "-m", model], "r+", encoding: "UTF-8")
      feeder = Thread.new do
        begin
          source.each_line { |line| io.puts line }
        ensure
          # Signal end of input even if feeding failed, so yamcha can finish.
          io.close_write unless io.closed?
        end
      end
      feeder.abort_on_exception = true
      io
    end
  end
end
|
113
|
+
|
114
|
+
module KokugoTagger
  # Writes the labels predicted by YamCha back into the CaboCha text.
  module Merger
    module_function

    # Starts merging on a background thread and returns the read end of a
    # pipe carrying the relabelled CaboCha text. The write end is closed
    # when merging finishes or fails; an exception in the thread aborts the
    # process (abort_on_exception).
    def connect(yamcha, cabocha)
      read, write = IO.pipe("UTF-8")
      worker = Thread.new do
        begin
          process yamcha, cabocha, write
        ensure
          write.close unless write.closed?
        end
      end
      worker.abort_on_exception = true
      read
    end

    # Copies +cabocha+ to +output+ line by line. For every chunk header
    # ("* <id> <link><LABEL> ...") the next non-empty record is read from
    # +yamcha+ and its last tab-separated column, upper-cased, replaces the
    # label in the header's dependency field.
    #
    # Only "* <digits> " lines count as headers, so a token whose surface
    # form is "*" is copied unchanged and does not consume a record. The
    # replacement is anchored to the dependency field so no other upper-case
    # text on the line can be rewritten.
    #
    # Raises EOFError when +yamcha+ ends before every chunk has a label.
    def process(yamcha, cabocha, output)
      cabocha.each_line do |line|
        if line =~ /\A\* \d+ /
          record = yamcha.gets
          # Skip the empty lines that separate sentences in yamcha's output.
          record = yamcha.gets while record && record.chomp.empty?
          raise EOFError, "yamcha output ended before every chunk received a label" unless record
          letter = record.chomp.split("\t").last.upcase
          line = line.sub(/\A(\* \d+ -?\d+)[A-Z]+/) { Regexp.last_match(1) + letter }
        end
        output.puts line
      end
    end
  end
end
|
136
|
+
|
137
|
+
module KokugoTagger
  # Training and k-fold cross-validation on top of the YamCha tool chain.
  # All intermediate files are written to the current working directory.
  module Learner
    module_function

    # Converts the corpus in +source_dir+ to "train.data" and trains +model+
    # (default "kokugo") from it.
    def learn(source_dir, model, enc)
      model ||= "kokugo"
      convert source_dir, "train.data", enc
      yamcha_learn "train.data", model
    end

    # k-fold cross-validation: converts the corpus, splits it into +k+
    # folds, trains a "test" model on all folds but one, tags the held-out
    # fold into "result.<n>.data", and writes the statistics of every fold
    # to "validation.txt".
    def validation(source_dir, enc, k)
      convert source_dir, "train.data", enc
      filenames = split("train.data", enc, k)
      filenames.each_with_index do |filename, n|
        others = filenames - [filename]
        concat others, "temp.data"
        model = "test"
        yamcha_learn "temp.data", model
        result = "result.#{n}.data"
        system "cat #{filename} | yamcha -m test.model > #{result}"
      end
      data_set = Array.new(k) { |n| tally("result.#{n}.data", enc) }
      # Block form so the report is flushed and closed even on error.
      File.open("validation.txt", "w:UTF-8") { |report| write_report(report, data_set, k) }
    end

    # Counts the outcomes in one tagged result file.
    #
    # Every non-empty line is one chunk whose last two whitespace-separated
    # columns are the gold label and the predicted label; every empty line
    # ends a sentence.
    #
    # Returns [sentences, chunks, correct_chunks, label_data], where
    # label_data[label] holds :tp (gold and prediction are both label),
    # :fn (gold is label, prediction differs) and :fp (prediction is label,
    # gold differs). A correct chunk is counted exactly once as :tp.
    def tally(filename, enc)
      sentence = 0
      chunk = 0
      accuracy = 0
      label_data = Hash.new { |hash, label| hash[label] = Hash.new(0) }
      File.foreach(filename, encoding: enc) do |line|
        line.chomp!
        if line.empty?
          sentence += 1
        else
          gold, predicted = *line.split(/\s/)[-2, 2]
          chunk += 1
          if gold == predicted
            accuracy += 1
            label_data[gold][:tp] += 1
          else
            label_data[gold][:fn] += 1
            label_data[predicted][:fp] += 1
          end
        end
      end
      [sentence, chunk, accuracy, label_data]
    end

    # Writes the cross-validation report for +data_set+ (one #tally result
    # per fold) to +report+. Per label: tn is every chunk not counted as
    # tp, fp or fn; recall = tp / (tp + fn); precision = tp / (tp + fp);
    # accuracy = (tp + tn) / chunks. Ratios with a zero denominator are
    # printed as NaN.
    def write_report(report, data_set, k)
      report.puts "# #{k}-fold cross-validation"
      report.puts
      report.puts "## test files"
      data_set.each_with_index do |(sentence, chunk), n|
        report.puts "train.#{n}.data: #{sentence} sentences. #{chunk} chunks."
      end
      report.puts
      data_set.each_with_index do |(_sentence, chunk, accuracy, label_data), n|
        report.puts "## train.#{n}.data"
        report.puts "accuracy: #{accuracy.to_f / chunk}"
        report.puts "labels:"
        report.puts %w(label tp tn fp fn recall precision f-score accuracy).join("\t")
        label_data.sort.each do |label, data|
          tp, fp, fn = data[:tp], data[:fp], data[:fn]
          tn = chunk - (tp + fp + fn)
          recall = tp.to_f / (tp + fn)
          precision = tp.to_f / (tp + fp)
          f = 2 * precision * recall / (precision + recall)
          acc = (tp + tn).to_f / chunk
          report.puts %w(%s %d %d %d %d %.2f %.2f %.2f %.2f).join("\t") % [label, tp, tn, fp, fn, recall, precision, f, acc]
        end
        report.puts
      end
    end

    # Converts every "*.cabocha" file in +source_dir+ to feature rows and
    # appends them to +target_filename+. Files are processed in sorted order
    # so the resulting data (and the folds cut from it) do not depend on
    # directory listing order. File.open is used instead of Kernel#open so a
    # file name can never be interpreted as a command.
    def convert(source_dir, target_filename, enc)
      File.open(target_filename, "w:#{enc}") do |target|
        Dir.glob(source_dir + "/*.cabocha").sort.each do |filename|
          File.open(filename, encoding: enc) do |source|
            converter = KokugoTagger::Converter.connect(source)
            begin
              converter.each_line { |line| target.puts line }
            ensure
              converter.close
            end
          end
        end
      end
    end

    # Distributes the sentences of +source_filename+ round-robin over +k+
    # files named "<basename>.<n>.data"; a sentence ends at an empty line.
    # Returns the list of file names.
    def split(source_filename, enc, k)
      basename = File.basename(source_filename, ".data")
      target_filenames = Array.new(k) { |n| "#{basename}.#{n}.data" }
      targets = target_filenames.map { |filename| File.open(filename, "w:#{enc}") }
      begin
        index = 0
        File.foreach(source_filename, encoding: enc) do |line|
          targets[index].puts line
          index = (index + 1) % k if line.chomp.empty?
        end
      ensure
        targets.each(&:close)
      end
      target_filenames
    end

    # Concatenates +source_filenames+ into +target_filename+ using `cat`.
    def concat(source_filenames, target_filename)
      system "cat #{source_filenames.join(" ")} > #{target_filename}"
    end

    # Trains a YamCha model: copies YamCha's Makefile into the current
    # directory and runs its "train" target. Commands are passed as argument
    # lists so the model name and paths are not interpreted by a shell.
    def yamcha_learn(train_data, model)
      libexecdir = `yamcha-config --libexecdir`.chomp
      system "cp", "#{libexecdir}/Makefile", "."
      system "make", "CORPUS=#{train_data}", "MODEL=#{model}", "FEATURE=F:0..0:1..", "SVM_PARAM=-t 1 -d 2 -c 1", "MULTI_CLASS=1", "train"
    end
  end
end
|
data/lib/kokugo_tagger.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kokugo_tagger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 1.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Mizuho IMADA
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2016-02-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -54,7 +54,8 @@ files:
|
|
54
54
|
- bin/kokugo_tagger
|
55
55
|
- kokugo_tagger.gemspec
|
56
56
|
- lib/kokugo_tagger.rb
|
57
|
-
- lib/kokugo_tagger/
|
57
|
+
- lib/kokugo_tagger/cabocha.rb
|
58
|
+
- lib/kokugo_tagger/kokugo.model
|
58
59
|
- lib/kokugo_tagger/tagger.rb
|
59
60
|
- lib/kokugo_tagger/version.rb
|
60
61
|
homepage: ''
|
@@ -77,9 +78,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
77
78
|
version: '0'
|
78
79
|
requirements: []
|
79
80
|
rubyforge_project:
|
80
|
-
rubygems_version: 2.
|
81
|
+
rubygems_version: 2.5.2
|
81
82
|
signing_key:
|
82
83
|
specification_version: 4
|
83
84
|
summary: Write a short summary. Required.
|
84
85
|
test_files: []
|
85
|
-
has_rdoc:
|
data/lib/kokugo_tagger/parser.rb
DELETED
@@ -1,47 +0,0 @@
|
|
1
|
-
# -*- coding: utf-8 -*-
|
2
|
-
require 'csv'
|
3
|
-
|
4
|
-
module CabochaParser
|
5
|
-
def parse(line)
|
6
|
-
case line.chomp
|
7
|
-
when /^#/
|
8
|
-
return parse_excab(line)
|
9
|
-
when /^\*/
|
10
|
-
return parse_chunk(line)
|
11
|
-
when 'EOS'
|
12
|
-
return {type: 'EOS'}
|
13
|
-
when ''
|
14
|
-
return nil
|
15
|
-
else
|
16
|
-
return parse_token(line)
|
17
|
-
end
|
18
|
-
end
|
19
|
-
def parse_excab(line)
|
20
|
-
line = line.gsub('\"', '""')
|
21
|
-
null, type, *data = CSV.parse_line(line.chomp, col_sep:' ')
|
22
|
-
case type
|
23
|
-
when 'SEGMENT', 'SEGMENT_S', 'LINK', 'LINK_S'
|
24
|
-
excab = {type: type, name: data[0], start: data[1].to_i, end: data[2].to_i, comment: data[3]}
|
25
|
-
when 'GROUP', 'GROUP_S'
|
26
|
-
excab = {type: type, name: data[0], member: data[1..-2], comment: data[-1]}
|
27
|
-
when 'ATTR'
|
28
|
-
excab = {type: type, name: data[0], value: data[1]}
|
29
|
-
end
|
30
|
-
return excab
|
31
|
-
end
|
32
|
-
def parse_chunk(line)
|
33
|
-
null, id, dep, part, score = line.chomp.split("\s")
|
34
|
-
link, rel = dep[0..-2], dep[-1]
|
35
|
-
head, func = part.split('/')
|
36
|
-
chunk = {type: 'CHUNK', id: id, link: link, rel: rel, head: head, func: func, score: score}
|
37
|
-
return chunk
|
38
|
-
end
|
39
|
-
def parse_token(line)
|
40
|
-
text, attrs, ne = line.chomp.split("\t")
|
41
|
-
attrs = CSV.parse_line(attrs, col_sep:',')
|
42
|
-
pos = attrs[0, 4].delete_if{|item| item.empty?}.join('-')
|
43
|
-
token = {type: 'TOKEN', text: text, ne: ne, pos: pos, ctype: attrs[4], cform: attrs[5]}
|
44
|
-
return token
|
45
|
-
end
|
46
|
-
module_function :parse, :parse_excab, :parse_chunk, :parse_token
|
47
|
-
end
|