kokugo_tagger 0.0.6 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +36 -11
- data/bin/kokugo_tagger +23 -2
- data/lib/kokugo_tagger/cabocha.rb +75 -0
- data/lib/kokugo_tagger/kokugo.model +0 -0
- data/lib/kokugo_tagger/tagger.rb +222 -132
- data/lib/kokugo_tagger/version.rb +1 -1
- data/lib/kokugo_tagger.rb +2 -2
- metadata +5 -5
- data/lib/kokugo_tagger/parser.rb +0 -47
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 572d91b93afea39ab298c267f0068436e62670c1
|
4
|
+
data.tar.gz: a5ffe79141ca346aa6c346b7514dab584eb234eb
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 57414c4a4ec004c4f8a1483118ef638d58d744e1b0b71a9e7cb43a1f89e660de3fd9fe4408dc350901f5afb7c3094a2ad83b8461c406766b8b554b9345c0f411
|
7
|
+
data.tar.gz: 459cc09789690d93f943d027af74481aecce5ebd3283181c77c809c1f83f0e6fef56db379c10da69c3bc7a313a9c0711a3ce35fe5a04957601bbc9a24ddcf861
|
data/README.md
CHANGED
@@ -1,26 +1,51 @@
|
|
1
1
|
# KokugoTagger
|
2
2
|
|
3
|
-
|
3
|
+
cabocha形式のファイルに対して、学校文法に準拠した係り受けラベルを付与します。
|
4
4
|
|
5
5
|
## Installation
|
6
6
|
|
7
|
-
|
7
|
+
事前に以下のツールをインストールし、パスを通しておく必要があります。
|
8
8
|
|
9
|
-
|
10
|
-
|
11
|
-
```
|
9
|
+
- Ruby
|
10
|
+
- YamCha
|
12
11
|
|
13
|
-
|
14
|
-
|
15
|
-
$ bundle
|
16
|
-
|
17
|
-
Or install it yourself as:
|
12
|
+
コマンドラインから以下のように入力し、インストールしてください。
|
18
13
|
|
19
14
|
$ gem install kokugo_tagger
|
20
15
|
|
21
16
|
## Usage
|
22
17
|
|
23
|
-
|
18
|
+
UTF-8/UniDicのCaboCha形式データにのみ対応しています。CaboCha形式のファイルにラベルを付与する場合は、次のように実行してください。
|
19
|
+
|
20
|
+
$ cat neko.cabocha | kokugo_tagger > output.cabocha
|
21
|
+
|
22
|
+
プレーンテキストの場合は、次のようにCaboChaと組み合わせて下さい。CaboChaは別途インストールしてください。
|
23
|
+
|
24
|
+
$ cat neko.txt | cabocha -f1 | kokugo_tagger > output.cabocha
|
25
|
+
|
26
|
+
コマンドライン上で対話的に実行することもできます。
|
27
|
+
|
28
|
+
$ cabocha -f1 | kokugo_tagger
|
29
|
+
吾輩は猫である
|
30
|
+
* 0 1S 0/1 0.000000
|
31
|
+
吾輩 代名詞,*,*,*,*,*,ワガハイ,我が輩,吾輩,ワガハイ,吾輩,ワガハイ,混,*,*,*,*,ワガハイ,ワガハイ,ワガハイ,ワガハイ,*,*,0,*,* O
|
32
|
+
は 助詞,係助詞,*,*,*,*,ハ,は,は,ワ,は,ワ,和,*,*,*,*,ハ,ハ,ハ,ハ,*,*,*,"動詞%F2@0,名詞%F1,形容詞%F2@-1",* O
|
33
|
+
* 1 -1X 2/2 0.000000
|
34
|
+
猫 名詞,普通名詞,一般,*,*,*,ネコ,猫,猫,ネコ,猫,ネコ,和,*,*,*,*,ネコ,ネコ,ネコ,ネコ,*,*,1,C4,* O
|
35
|
+
で 助動詞,*,*,*,助動詞-ダ,連用形-一般,ダ,だ,で,デ,だ,ダ,和,*,*,*,*,デ,ダ,デ,ダ,*,*,*,名詞%F1,* O
|
36
|
+
ある 動詞,非自立可能,*,*,五段-ラ行,終止形-一般,アル,有る,ある,アル,ある,アル,和,*,*,*,*,アル,アル,アル,アル,*,*,1,C3,* O
|
37
|
+
EOS
|
38
|
+
|
39
|
+
係り受けラベルは以下の8種類です。
|
40
|
+
|
41
|
+
- S: 主語
|
42
|
+
- R: 連用修飾語
|
43
|
+
- T: 連体修飾語
|
44
|
+
- Z: 接続語
|
45
|
+
- D: 独立語
|
46
|
+
- H: 並立の関係
|
47
|
+
- J: 補助の関係
|
48
|
+
- X: その他(文末など)
|
24
49
|
|
25
50
|
## Contributing
|
26
51
|
|
data/bin/kokugo_tagger
CHANGED
@@ -1,6 +1,27 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
+
require 'optparse'
|
3
4
|
require 'kokugo_tagger'
|
4
5
|
|
5
|
-
|
6
|
-
|
6
|
+
# Command-line entry point: collect the options into globals, then dispatch
# to cross-validation, training, or annotation of standard input.
$enc   = "UTF-8" # text encoding handed to every KokugoTagger entry point
$model = nil     # model file given with -m (nil is passed through as-is)
$learn = nil     # corpus directory given with -l or -v
$valid = nil     # fold count; only -v sets it

OptionParser.new do |opt|
  # NOTE(review): optparse reports a top-level Version constant for --version;
  # this value does not match the gem version -- confirm which is intended.
  Version = "0.1.0"
  opt.on('-e STR', '--encoding STR', String, 'encoding') do |str|
    $enc = str
  end
  opt.on('-m FILE', '--model FILE', String, 'model file') do |file|
    $model = file
  end
  opt.on('-l DIR', '--learn DIR', String, 'train corpus directory') do |dir|
    $learn = dir
  end
  opt.on('-v DIR', '--valid DIR', String, 'K-fold cross-validation') do |dir|
    # Validation reuses the corpus-directory slot and fixes the fold count at 3.
    $learn = dir
    $valid = 3
  end
  opt.parse!
end

# $stdin.set_encoding 'UTF-8'
if $learn && $valid
  KokugoTagger.validation($learn, $enc, $valid)
elsif $learn
  KokugoTagger.learn($learn, $model, $enc)
else
  KokugoTagger.annotate($stdin, $model, $enc)
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
|
3
|
+
module KokugoTagger::Cabocha
  module_function

  # Parses CaboCha lattice-format text into a Document.
  #
  # file - any object responding to each_line (an IO or a String).
  #
  # Line handling:
  #   * a line that is exactly "EOS" closes the current sentence;
  #   * a line starting with "*" opens a new chunk;
  #   * lines starting with "#" and whitespace-only lines are skipped;
  #   * every other line is a token of the current chunk.
  #
  # Returns a KokugoTagger::Cabocha::Document with one Sentence per EOS line.
  # Lines after the last EOS are not included in the result.
  # Raises ArgumentError when a token line appears before any chunk header.
  def parse(file = @source)
    document = KokugoTagger::Cabocha::Document.new
    sentence = nil
    chunk = nil
    file.each_line do |line|
      sentence ||= KokugoTagger::Cabocha::Sentence.new
      case line
      when /\AEOS\s*\z/
        # Match the whole line: a token whose surface merely starts with
        # "EOS" must not terminate the sentence.
        # Re-derive sem_head / syn_head with this library's own rule.
        sentence.each { |parsed_chunk| parsed_chunk.detect_structure }
        document << sentence
        sentence = nil
        chunk = nil
      when /^\*/
        chunk = KokugoTagger::Cabocha::Chunk.new(line)
        sentence << chunk
        chunk.sentence = sentence
      when /^#/, /\A\s*\z/
        # comment or blank line: nothing to do
      else
        unless chunk
          raise ArgumentError, "token line before any chunk header: #{line.inspect}"
        end
        chunk.tokens << KokugoTagger::Cabocha::Token.new(line)
      end
    end
    document
  end
end
|
29
|
+
|
30
|
+
# Array subclass used as the container for parsed sentences; adds no behaviour.
class KokugoTagger::Cabocha::Document < Array; end
|
32
|
+
|
33
|
+
# Array subclass used as the container for the chunks of one sentence; adds no behaviour.
class KokugoTagger::Cabocha::Sentence < Array; end
|
35
|
+
|
36
|
+
# One chunk, built from a header line such as "* 0 1S 0/1 0.000000".
class KokugoTagger::Cabocha::Chunk
  attr_accessor :info, :id, :link, :rel, :sem_head_id, :syn_head_id, :ext, :tokens, :sentence

  # line - the chunk header line. The leading "*" field is dropped and the
  #        remaining whitespace-separated fields are kept in @info.
  def initialize(line)
    fields = line.chomp.split(/\s/)
    @info = fields[1..-1]
    @id = @info[0].to_i
    dependency = @info[1]
    @link = dependency.to_i             # leading integer of e.g. "1S" / "-1X"
    @rel = dependency.delete("-0-9")    # what remains once sign and digits are removed
    heads = @info[2].split('/')
    @sem_head_id = heads[0].to_i
    @syn_head_id = heads[1].to_i
    @ext = @info[4]                     # optional fifth field; nil when absent
    @tokens = []
  end

  # Recomputes @sem_head_id and @syn_head_id from the tokens' parts of speech,
  # replacing the values read from the header line.
  def detect_structure
    sem = 0
    syn = 0
    @tokens.each_with_index do |token, index|
      case token.pos
      when /^(助詞|助動詞)/
        # particle / auxiliary verb: only the syntactic head moves
        syn = index
      when /^(補助記号|空白)/
        # supplementary symbols and whitespace never become a head
      else
        # sem follows the index only while no particle/auxiliary has moved syn past it
        sem = index if sem == syn
        syn = index
      end
    end
    @sem_head_id, @syn_head_id = sem, syn
  end

  # Surface text of the chunk: all token texts concatenated.
  def text
    @tokens.map(&:text).join
  end
end
|
64
|
+
|
65
|
+
# One token, built from a "surface<TAB>feature,feature,..." line.
class KokugoTagger::Cabocha::Token
  attr_accessor :info, :text, :pos, :ctype, :cform

  # line - a token line. Anything after a second tab is ignored.
  #
  # The feature list is split on every comma (quoted fields are not honoured);
  # only the first six fields are interpreted here.
  #
  # A line without a feature column yields an empty @info instead of raising,
  # and missing conjugation fields fall back to '*', the placeholder the
  # feature list itself uses for "not applicable".
  def initialize(line)
    text, info = line.chomp.split("\t")
    @text = text.to_s
    @info = info.to_s.split(",")
    # Part of speech: first four fields joined with "-", placeholders dropped.
    @pos = @info[0, 4].reject { |field| field == '*' }.join('-')
    @ctype = @info[4] || '*'
    @cform = @info[5] || '*'
  end
end
|
Binary file
|
data/lib/kokugo_tagger/tagger.rb
CHANGED
@@ -1,151 +1,241 @@
|
|
1
1
|
# -*- coding: utf-8 -*-
|
2
|
-
require 'csv'
|
3
2
|
|
4
3
|
# Public entry points of the tagger; each one only fills in defaults and
# delegates to the worker modules under KokugoTagger.
module KokugoTagger
  module_function

  # Labels the CaboCha-format text read from +source+ and prints the result.
  #
  # source - IO to read; its external encoding is set to +enc+.
  # model  - model file path, passed to KokugoTagger::Yamcha.connect as given.
  # enc    - encoding name; nil means "UTF-8".
  def annotate(source, model, enc)
    enc ||= "UTF-8"
    source.set_encoding enc
    # The input is needed twice: once as features for YamCha and once as the
    # text into which the predicted labels are merged.
    source_0, source_1 = KokugoTagger::Duplicator.duplicate(source, 2)
    converter = KokugoTagger::Converter.connect(source_0)
    yamcha = KokugoTagger::Yamcha.connect(converter, model)
    merger = KokugoTagger::Merger.connect(yamcha, source_1)
    merger.each_line { |line| puts line }
  end

  # Prints the feature rows that KokugoTagger::Converter produces for +source+.
  def convert(source)
    converter = KokugoTagger::Converter.connect(source)
    converter.each_line { |line| puts line }
  end

  # Trains a model from the corpus in +source_dir+.
  #
  # model - model name; nil means "kokugo". Any directory part and a trailing
  #         ".model" are stripped before it is handed to the learner.
  # enc   - encoding name; nil means "UTF-8".
  def learn(source_dir, model, enc)
    model ||= "kokugo"
    enc ||= "UTF-8"
    model = File.basename(model, ".model")
    KokugoTagger::Learner.learn source_dir, model, enc
  end

  # Runs k-fold cross-validation on the corpus in +source_dir+.
  #
  # enc - encoding name; nil means "UTF-8".
  # k   - number of folds; nil means 3.
  def validation(source_dir, enc, k)
    enc ||= "UTF-8"
    k ||= 3 # was assigned to an unused local, so the default never applied
    KokugoTagger::Learner.validation source_dir, enc, k
  end
end
|
30
|
+
|
31
|
+
module KokugoTagger::Duplicator
  module_function

  # Fans +source+ out into +number+ pipes.
  #
  # A background thread copies every line of +source+ to each pipe and closes
  # the write ends once the source is exhausted.
  #
  # Returns the Array of read ends, one per copy.
  def duplicate(source, number = 2)
    readers = []
    writers = []
    number.times do
      reader, writer = IO.pipe("UTF-8")
      readers << reader
      writers << writer
    end
    feeder = Thread.fork do
      source.each_line do |line|
        writers.each { |writer| writer.puts line }
      end
      writers.each { |writer| writer.close }
    end
    feeder.abort_on_exception = true
    return readers
  end
end
|
42
|
+
|
43
|
+
# Turns CaboCha-format text into one feature row per chunk.
module KokugoTagger::Converter
  module_function

  # Starts a background thread that converts +source+ and returns the read
  # end of the pipe the feature rows are written to.
  def connect(source)
    read, write = IO.pipe("UTF-8")
    worker = Thread.fork do
      self.process source, write
      write.close
    end
    worker.abort_on_exception = true
    return read
  end

  # Buffers +source+ line by line and converts the buffer each time a line
  # that is exactly "EOS" arrives. Lines after the last EOS are not emitted.
  def process(source, output)
    buffer = ""
    source.each_line do |line|
      buffer << line
      self.flush(buffer, output) if line.chomp == "EOS"
    end
  end

  # Parses +buffer+, writes one space-separated feature row per chunk followed
  # by an empty line per sentence, then empties the buffer.
  def flush(buffer, output)
    document = KokugoTagger::Cabocha.parse(buffer)
    document.each do |sentence|
      sentence.each { |chunk| output.puts chunk_features(chunk).join(" ") }
      output.puts
    end
    buffer.clear
  end

  # Features of one token: surface text, the first two part-of-speech levels
  # and the first component of the conjugation form. Missing values, and a
  # missing token, are reported as "*".
  def token_features(token)
    return %w(* * * *) unless token
    # to_s guards against tokens whose feature list was too short to carry a
    # part of speech or conjugation form (nil would raise on split).
    pos = token.pos.to_s.split('-')
    cform = token.cform.to_s.split('-')
    [token.text, pos[0], pos[1], cform[0]].map { |feature| feature || '*' }
  end

  # Feature row of a chunk: its text, the features of five tokens (semantic
  # head, case particle, syntactic head, the token after the syntactic head,
  # and the semantic head of the chunk it links to), then the chunk's current
  # relation label.
  def chunk_features(chunk)
    tokens = [sem_head(chunk), case_marker(chunk), syn_head(chunk), punct(chunk), sem_head(link_to(chunk))]
    features = tokens.map { |token| token_features(token) }
    return [chunk.text, features, chunk.rel].flatten
    # return [chunk.text, features, chunk.ext].flatten
  end

  # Token at the chunk's semantic-head index, or nil without a chunk.
  def sem_head(chunk)
    return nil unless chunk
    chunk.tokens[chunk.sem_head_id]
  end

  # First token whose second part-of-speech level is 格助詞 (case particle).
  def case_marker(chunk)
    return nil unless chunk
    chunk.tokens.find { |token| token.pos.split('-')[1] == '格助詞' }
  end

  # Token at the chunk's syntactic-head index, or nil without a chunk.
  def syn_head(chunk)
    return nil unless chunk
    chunk.tokens[chunk.syn_head_id]
  end

  # Token immediately after the syntactic head, if any.
  # NOTE(review): presumably trailing punctuation, going by the name -- the
  # code does not check the part of speech.
  def punct(chunk)
    return nil unless chunk
    chunk.tokens[chunk.syn_head_id + 1]
  end

  # Chunk this chunk depends on, or nil when the link is -1.
  def link_to(chunk)
    chunk.sentence[chunk.link] if chunk.link != -1
  end
end
|
100
|
+
|
101
|
+
# Runs the external `yamcha` command as a filter.
module KokugoTagger::Yamcha
  module_function

  # Starts `yamcha -m MODEL`, feeds it every line of +source+ from a
  # background thread, and returns the process IO for reading its output.
  #
  # source - object responding to each_line that yields feature rows.
  # model  - model file path; nil selects "kokugo.model" next to this file.
  #
  # The command is passed as an argument array, so the model path is never
  # interpreted by a shell (spaces, quotes and metacharacters are safe).
  # Raises Errno::ENOENT when the yamcha executable cannot be found.
  def connect(source, model)
    model ||= File.join(File.dirname(__FILE__), "kokugo.model")
    io = IO.popen(["yamcha", "-m", model], "r+", encoding: "UTF-8")
    feeder = Thread.fork do
      source.each_line { |line| io.puts line }
      # Signal end of input so yamcha can finish and flush its output.
      io.close_write
    end
    feeder.abort_on_exception = true
    return io
  end
end
|
113
|
+
|
114
|
+
# Writes the labels predicted by YamCha back into the CaboCha text.
module KokugoTagger::Merger
  module_function

  # Starts a background thread that merges +yamcha+ output into +cabocha+
  # and returns the read end of the pipe carrying the merged text.
  def connect(yamcha, cabocha)
    read, write = IO.pipe("UTF-8")
    worker = Thread.fork do
      self.process yamcha, cabocha, write
      write.close
    end
    worker.abort_on_exception = true
    return read
  end

  # Copies +cabocha+ to +output+ line by line. For every line starting with
  # "*", the next non-empty record is read from +yamcha+; its last
  # tab-separated field, upcased, replaces the first run of capital letters
  # in the line.
  #
  # Raises EOFError when +yamcha+ runs out of records before +cabocha+ runs
  # out of chunk headers (previously an opaque NoMethodError on nil).
  def process(yamcha, cabocha, output)
    cabocha.each_line do |line|
      if line[0] == "*"
        record = yamcha.gets
        # Skip the empty lines that separate sentences in the yamcha output.
        record = yamcha.gets while record && record.chomp == ""
        unless record
          raise EOFError, "yamcha output ended before every chunk was labelled"
        end
        letter = record.chomp.split("\t").last.upcase
        # Block form keeps the label from being read as a replacement pattern.
        line.sub!(/[A-Z]+/) { letter }
      end
      output.puts line
    end
  end
end
|
136
|
+
|
137
|
+
# Training and k-fold cross-validation built on the external YamCha tools.
# All intermediate and result files are created in the current directory.
module KokugoTagger::Learner
  module_function

  # Converts the corpus in +source_dir+ to "train.data" and trains +model+
  # (nil means "kokugo") from it.
  def learn(source_dir, model, enc)
    model ||= "kokugo"
    convert source_dir, "train.data", enc
    yamcha_learn "train.data", model
  end

  # Runs k-fold cross-validation and writes the report to "validation.txt".
  #
  # The corpus is converted to "train.data" and dealt sentence by sentence
  # into k fold files. Each fold is tagged with a model trained on the other
  # folds; the tagged output goes to "result.N.data".
  def validation(source_dir, enc, k)
    convert source_dir, "train.data", enc
    filenames = split("train.data", enc, k)
    filenames.each_with_index do |filename, n|
      others = filenames - [filename]
      concat others, "temp.data"
      model = "test"
      yamcha_learn "temp.data", model
      result = "result.#{n}.data"
      system "cat #{filename} | yamcha -m test.model > #{result}"
    end
    data_set = Array.new(k) { |n| fold_statistics("result.#{n}.data", enc) }
    write_report "validation.txt", k, data_set
  end

  # Tallies one tagged fold file.
  #
  # Empty lines count sentences. On every other line the last two
  # whitespace-separated fields are compared; the per-label tallies treat the
  # first as the gold label and the second as the prediction.
  #
  # Returns [sentences, chunks, correct, label_data], where label_data maps a
  # label to a Hash with the counts :tp, :fn and :fp.
  def fold_statistics(filename, enc)
    sentence = 0
    chunk = 0
    accuracy = 0
    label_data = Hash.new { |hash, label| hash[label] = Hash.new(0) }
    File.foreach(filename, encoding: enc) do |line|
      line.chomp!
      if line.empty?
        sentence += 1
      else
        gold, predicted = *line.split(/\s/)[-2, 2]
        chunk += 1
        if gold == predicted
          accuracy += 1
          # Count a hit once; counting it for both the gold and the predicted
          # label doubled every true positive.
          label_data[gold][:tp] += 1
        else
          label_data[gold][:fn] += 1
          label_data[predicted][:fp] += 1
        end
      end
    end
    [sentence, chunk, accuracy, label_data]
  end

  # Writes the cross-validation report for +data_set+ (one entry per fold, as
  # returned by fold_statistics) to +path+. Ratios with a zero denominator are
  # printed as NaN.
  def write_report(path, k, data_set)
    File.open(path, "w:UTF-8") do |report|
      report.puts "# #{k}-fold cross-validation"
      report.puts
      report.puts "## test files"
      k.times do |n|
        sentence, chunk = *data_set[n]
        report.puts "train.#{n}.data: #{sentence} sentences. #{chunk} chunks."
      end
      report.puts
      k.times do |n|
        _sentence, chunk, accuracy, label_data = *data_set[n]
        report.puts "## train.#{n}.data"
        report.puts "accuracy: #{accuracy.to_f / chunk}"
        report.puts "labels:"
        report.puts %w(label tp tn fp fn recall precision f-score accuracy).join("\t")
        label_data.sort.each do |label, data|
          tp = data[:tp]
          fn = data[:fn]
          fp = data[:fp]
          tn = chunk - (tp + fn + fp) # every chunk not involving this label
          recall = tp.to_f / (tp + fn)
          precision = tp.to_f / (tp + fp)
          f = 2 * precision * recall / (precision + recall)
          acc = (tp + tn).to_f / chunk
          # Values are listed in the same order as the header columns.
          report.puts %w(%s %d %d %d %d %.2f %.2f %.2f %.2f).join("\t") % [label, tp, tn, fp, fn, recall, precision, f, acc]
        end
        report.puts
      end
    end
  end

  # Converts every "*.cabocha" file in +source_dir+ (in sorted order, so the
  # output is reproducible) into feature rows appended to +target_filename+.
  def convert(source_dir, target_filename, enc)
    File.open(target_filename, "w:#{enc}") do |target|
      source_filenames = Dir.glob(source_dir + "/*.cabocha").sort
      # source_filenames = source_filenames[0, 2] # debug
      source_filenames.each do |filename|
        File.open(filename, encoding: enc) do |source|
          converter = KokugoTagger::Converter.connect(source)
          converter.each_line { |line| target.puts line }
          converter.close
        end
      end
    end
  end

  # Deals the sentences of +source_filename+ round-robin into k files named
  # "<basename>.N.data"; an empty line ends a sentence.
  #
  # Returns the Array of fold file names.
  def split(source_filename, enc, k)
    basename = File.basename(source_filename, ".data")
    target_filenames = Array.new(k) { |n| "#{basename}.#{n}.data" }
    targets = target_filenames.map { |filename| File.open(filename, "w:#{enc}") }
    begin
      index = 0
      File.foreach(source_filename, encoding: enc) do |line|
        targets[index].puts line
        index = (index + 1) % k if line.chomp.empty?
      end
    ensure
      targets.each { |f| f.close }
    end
    return target_filenames
  end

  # Concatenates +source_filenames+ into +target_filename+ with `cat`.
  def concat(source_filenames, target_filename)
    system "cat #{source_filenames.join(" ")} > #{target_filename}"
  end

  # Trains +model+ from +train_data+ by copying the Makefile from YamCha's
  # libexec directory into the current directory and running its train target.
  def yamcha_learn(train_data, model)
    libexecdir = `yamcha-config --libexecdir`.chomp
    system "cp #{libexecdir}/Makefile ."
    system "make CORPUS=#{train_data} MODEL=#{model} FEATURE=\"F:0..0:1..\" SVM_PARAM=\"-t 1 -d 2 -c 1\" MULTI_CLASS=1 train"
  end
end
|
data/lib/kokugo_tagger.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kokugo_tagger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 1.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Mizuho IMADA
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2016-02-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -54,7 +54,8 @@ files:
|
|
54
54
|
- bin/kokugo_tagger
|
55
55
|
- kokugo_tagger.gemspec
|
56
56
|
- lib/kokugo_tagger.rb
|
57
|
-
- lib/kokugo_tagger/
|
57
|
+
- lib/kokugo_tagger/cabocha.rb
|
58
|
+
- lib/kokugo_tagger/kokugo.model
|
58
59
|
- lib/kokugo_tagger/tagger.rb
|
59
60
|
- lib/kokugo_tagger/version.rb
|
60
61
|
homepage: ''
|
@@ -77,9 +78,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
77
78
|
version: '0'
|
78
79
|
requirements: []
|
79
80
|
rubyforge_project:
|
80
|
-
rubygems_version: 2.
|
81
|
+
rubygems_version: 2.5.2
|
81
82
|
signing_key:
|
82
83
|
specification_version: 4
|
83
84
|
summary: Write a short summary. Required.
|
84
85
|
test_files: []
|
85
|
-
has_rdoc:
|
data/lib/kokugo_tagger/parser.rb
DELETED
@@ -1,47 +0,0 @@
|
|
1
|
-
# -*- coding: utf-8 -*-
|
2
|
-
require 'csv'
|
3
|
-
|
4
|
-
# Line-oriented parser for CaboCha output with "#!" extension lines.
# Every parse_* method takes one line and returns a Hash describing it.
module CabochaParser
  module_function

  # Dispatches on the kind of line.
  #
  # Returns a Hash with a :type key, or nil for an empty line and for "#"
  # lines whose record type is not recognised.
  def parse(line)
    case line.chomp
    when /^#/
      parse_excab(line)
    when /^\*/
      parse_chunk(line)
    when 'EOS'
      {type: 'EOS'}
    when ''
      nil
    else
      parse_token(line)
    end
  end

  # Parses an extension line such as
  #   #! SEGMENT_S name 0 3 "comment"
  # Fields are space-separated; double-quoted fields may contain spaces.
  def parse_excab(line)
    # Rewrite backslash-escaped quotes as doubled quotes, the form CSV expects.
    line = line.gsub('\"', '""')
    _marker, type, *data = CSV.parse_line(line.chomp, col_sep: ' ')
    case type
    when 'SEGMENT', 'SEGMENT_S', 'LINK', 'LINK_S'
      {type: type, name: data[0], start: data[1].to_i, end: data[2].to_i, comment: data[3]}
    when 'GROUP', 'GROUP_S'
      {type: type, name: data[0], member: data[1..-2], comment: data[-1]}
    when 'ATTR'
      {type: type, name: data[0], value: data[1]}
    end
  end

  # Parses a chunk header such as "* 0 1D 0/1 0.5". All values stay Strings;
  # the dependency field is split into link (all but the last character) and
  # rel (the last character).
  def parse_chunk(line)
    _marker, id, dep, part, score = line.chomp.split(' ')
    link, rel = dep[0..-2], dep[-1]
    head, func = part.split('/')
    {type: 'CHUNK', id: id, link: link, rel: rel, head: head, func: func, score: score}
  end

  # Parses a token line "surface<TAB>features<TAB>ne".
  #
  # The part of speech joins the non-empty values among the first four
  # features with "-". CSV yields nil for an empty field, so nil is skipped
  # as well (calling empty? on it used to raise), and a line without a
  # feature column yields an empty part of speech instead of raising.
  def parse_token(line)
    text, attrs, ne = line.chomp.split("\t")
    attrs = Array(CSV.parse_line(attrs.to_s, col_sep: ','))
    pos = attrs[0, 4].reject { |item| item.nil? || item.empty? }.join('-')
    {type: 'TOKEN', text: text, ne: ne, pos: pos, ctype: attrs[4], cform: attrs[5]}
  end
end
|