nekoneko_gen 0.1.1 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/nekoneko_gen/arow.rb +22 -38
- data/lib/nekoneko_gen/classifier.rb +21 -0
- data/lib/nekoneko_gen/classifier_factory.rb +22 -0
- data/lib/nekoneko_gen/linear_classifier.rb +86 -0
- data/lib/nekoneko_gen/mlp.rb +176 -0
- data/lib/nekoneko_gen/pa.rb +68 -0
- data/lib/nekoneko_gen/text_classifier_generator.rb +39 -40
- data/lib/nekoneko_gen/version.rb +1 -1
- data/lib/nekoneko_gen.rb +30 -8
- data/test/nekoneko_gen_test.rb +69 -41
- metadata +13 -8
data/lib/nekoneko_gen/arow.rb
CHANGED
@@ -1,72 +1,56 @@
|
|
1
1
|
# -*- coding: utf-8 -*-
|
2
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'linear_classifier'))
|
3
|
+
|
2
4
|
module NekonekoGen
  # Adaptive Regularization of Weight Vector
  #
  # One weight Hash (@w), one bias (@bias) and matching confidence values
  # (@cov per feature, @covb for the bias, all starting at 1.0) are kept per
  # one-vs-rest unit. When k == 2 a single unit is used, otherwise one unit
  # per label. The shared training loop (update) comes from the superclass
  # and calls update_at for each unit.
  class Arow < LinearClassifier
    R = 6.0                 # default regularization parameter (used when options[:c] is not given)
    DEFAULT_ITERATION = 20  # value reported by default_iteration

    # k       - number of labels.
    # options - Hash; options[:c] overrides R.
    def initialize(k, options = {})
      @r = options[:c] || R
      @k = k
      units = @k == 2 ? 1 : @k
      @cov = (0...units).map { Hash.new(1.0) }
      @covb = (0...units).map { 1.0 }
      @w = (0...units).map { Hash.new(0.0) }
      @bias = (0...units).map { 0.0 }
    end

    # Trains unit i on one example.
    #
    # i     - index of the unit to update.
    # vec   - feature => value pairs (iterated with each/map as |k, v|).
    # label - label of the example; the unit is trained towards +1 when
    #         label == i and towards -1 otherwise.
    #
    # Returns 1.0 when the score computed *before* the update had the wrong
    # sign for this unit, 0.0 otherwise.
    def update_at(i, vec, label)
      w = @w[i]
      cov = @cov[i]
      covb = @covb[i]
      y = label == i ? 1 : -1
      score = @bias[i] + dot(vec, w)
      alpha = 1.0 - y * score
      if alpha > 0.0
        r_inv = 1.0 / @r
        # Seed the sum with 0.0: a bare reduce(:+) returns nil for an empty
        # vec, which made the addition of covb raise NoMethodError.
        var = vec.map { |k, v| cov[k] * v * v }.reduce(0.0, :+) + covb
        alpha *= (1.0 / (var + @r)) * y
        vec.each do |k, v|
          # The weight step uses the confidence value from before this
          # example; only afterwards is the confidence itself updated.
          w[k] += alpha * cov[k] * v
          cov[k] = 1.0 / ((1.0 / cov[k]) + (v * v * r_inv))
        end
        # The bias is treated like a feature whose value is always 1.
        @bias[i] += alpha * covb
        @covb[i] = 1.0 / ((1.0 / covb) + r_inv)
      end
      score * y < 0.0 ? 1.0 : 0.0
    end

    # Number of training passes used when the caller does not specify one.
    def default_iteration
      DEFAULT_ITERATION
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
module NekonekoGen
  # Abstract base class of the trainable classifiers.
  #
  # Every method below only raises NotImplementedError; concrete classifiers
  # override them and assign @k, which is exposed through #k.
  class Classifier
    # Value of @k (nil until a subclass assigns it).
    attr_reader :k

    # Returns source code defining the trained parameters.
    #
    # lang            - target language of the generated code.
    # index_converter - callable applied to feature keys before they are
    #                   written out.
    #
    # Both arguments are optional so that the (lang, index_converter) call
    # style used by the subclasses reaches this method and raises
    # NotImplementedError instead of an ArgumentError about arity.
    def parameter_code(lang = nil, index_converter = nil)
      raise NotImplementedError, "#{self.class}#parameter_code is not implemented"
    end

    # Returns source code of the classify method for the given language.
    # lang is optional for the same reason as in parameter_code.
    def classify_method_code(lang = nil)
      raise NotImplementedError, "#{self.class}#classify_method_code is not implemented"
    end

    # Trains on one example (vec with its label).
    def update(vec, label)
      raise NotImplementedError, "#{self.class}#update is not implemented"
    end

    # Reports a feature count; i defaults to -1.
    def features(i = -1)
      raise NotImplementedError, "#{self.class}#features is not implemented"
    end

    # Number of training passes to use when the caller gives none.
    def default_iteration
      raise NotImplementedError, "#{self.class}#default_iteration is not implemented"
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'arow'))
|
3
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'pa'))
|
4
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'mlp'))
|
5
|
+
|
6
|
+
module NekonekoGen
  # Builds the classifier selected by options[:method].
  module ClassifierFactory
    # k       - number of labels, passed through to the classifier.
    # options - Hash passed through to the classifier; options[:method]
    #           selects the class (:arow, :pa, :pa1, :pa2 or :mlp) and
    #           defaults to :arow. nil is treated like an empty Hash.
    #
    # Returns the new classifier instance.
    # Raises ArgumentError (naming the offending value) for any other method.
    def self.create(k, options = {})
      options ||= {}
      method = options[:method] || :arow
      case method
      when :arow
        Arow.new(k, options)
      when :pa, :pa1, :pa2
        PA.new(k, options)
      when :mlp
        MLP.new(k, options)
      else
        raise ArgumentError,
              "unknown method: #{method.inspect} (expected :arow, :pa, :pa1, :pa2 or :mlp)"
      end
    end
  end
end
|
@@ -0,0 +1,86 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require 'json'
|
3
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'classifier'))
|
4
|
+
|
5
|
+
module NekonekoGen
  # Behaviour shared by the linear classifiers: one weight Hash (@w) and one
  # bias (@bias) per unit. Subclasses provide update_at(i, vec, label), which
  # trains a single unit and returns its loss.
  class LinearClassifier < Classifier
    attr_reader :w, :bias

    # Sum of w[key] * value over the pairs of vec; pairs whose weight lookup
    # is nil or false contribute nothing.
    def dot(vec, w)
      vec.reduce(0.0) do |sum, (key, value)|
        weight = w[key]
        weight ? sum + weight * value : sum
      end
    end

    # Deletes, in place, every weight whose magnitude is below
    # Float::EPSILON. Returns @w.
    def strip!
      @w.each do |weights|
        weights.reject! { |_key, value| value.abs < Float::EPSILON }
      end
      @w
    end

    # Trains on one example. With k == 2 only unit 0 is updated and its loss
    # is returned; otherwise every unit is updated in index order and the
    # mean of the losses is returned.
    def update(vec, label)
      return update_at(0, vec, label) if @k == 2

      scale = 1.0 / @k
      @k.times.reduce(0.0) do |loss, i|
        loss + update_at(i, vec, label) * scale
      end
    end

    # Number of stored weights: of unit i, or of all units together when i
    # is negative (the default).
    def features(i = -1)
      return w[i].size unless i < 0

      w.map { |weights| weights.size }.reduce(0, :+)
    end

    # Returns Ruby source defining BIAS and W for the generated classifier.
    #
    # lang            - only :ruby (or nil) is supported; anything else
    #                   raises NotImplementedError.
    # index_converter - callable applied to every weight key before output.
    #
    # Note that this calls strip!, so tiny weights are removed from @w.
    def parameter_code(lang, index_converter = lambda{|i| i})
      raise NotImplementedError unless (lang || :ruby) == :ruby

      wvec = strip!.map do |weights|
        converted = {}
        weights.each do |key, value|
          converted[index_converter.call(key)] = value
        end
        converted
      end
      <<CODE
BIAS = #{bias.inspect}
W = JSON.load(#{wvec.to_json.inspect})
CODE
    end

    # Returns Ruby source of the generated self.classify method, which reads
    # the constants K, BIAS and W.
    #
    # lang - only :ruby (or nil) is supported; anything else raises
    #        NotImplementedError.
    def classify_method_code(lang)
      raise NotImplementedError unless (lang || :ruby) == :ruby

      <<CODE
def self.classify(vec)
if (K == 2)
BIAS[0] + W[0].values_at(*vec).compact.reduce(0.0, :+) > 0.0 ? 0 : 1
else
W.each_with_index.map {|w, i|
[BIAS[i] + w.values_at(*vec).compact.reduce(0.0, :+), i]
}.max.pop
end
end
CODE
    end
  end
end
|
@@ -0,0 +1,176 @@
|
|
1
|
+
require 'json'
|
2
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'classifier'))
|
3
|
+
|
4
|
+
module NekonekoGen
  # Multi Layer Perceptron
  #
  # A network with a single hidden layer. The weights feeding each hidden
  # unit live in a Hash (feature key => weight) whose entries are created
  # with a random value on first access; the weights feeding each output unit
  # are plain arrays. With k == 2 there is one output unit, otherwise one per
  # label.
  class MLP < Classifier
    IR = 0.4                # step size for the weights/biases feeding the hidden units
    HR = 0.1                # step size for the weights/biases feeding the output units
    NOISE_VAR = 0.3         # factor the random noise is multiplied by
    MARGIN = 0.2            # a correct output still triggers training unless it is this close to its target
    DEFAULT_ITERATION = 40  # value reported by default_iteration

    # Hidden-unit count used when options[:c] is not given.
    def default_hidden_unit
      @k
    end

    # k       - number of labels.
    # options - Hash; options[:c] (converted with to_i) sets the number of
    #           hidden units.
    def initialize(k, options = {})
      @k = k
      @output_units = @k == 2 ? 1 : @k
      @hidden_units = (options[:c] || default_hidden_unit).to_i
      @input = []
      @hidden = []
      @input_bias = []
      @hidden_bias = []
      @hidden_units.times do |i|
        @input[i] = Hash.new { |hash, key| hash[key] = default_value }
        @input_bias[i] = default_value
      end
      @output_units.times do |i|
        @hidden[i] = []
        @hidden_units.times do |j|
          @hidden[i][j] = default_value
        end
        @hidden_bias[i] = default_value
      end
    end

    # Runs one example through the network and, if needed, adjusts the
    # weights.
    #
    # vec   - feature => value pairs.
    # label - label of the example.
    #
    # Returns 1.0 when the prediction made before the adjustment differs from
    # label, 0.0 otherwise.
    def update(vec, label)
      # Forward pass. The sums are seeded with 0.0 because a bare reduce(:+)
      # returns nil for an empty collection (e.g. an empty vec), which made
      # the following addition raise.
      input_y = @hidden_units.times.map do |i|
        w = @input[i]
        sigmoid(@input_bias[i] + vec.map { |k, v| w[k] * v }.reduce(0.0, :+) + noise)
      end
      hidden_y = @output_units.times.map do |i|
        @hidden_bias[i] + input_y.zip(@hidden[i]).map { |a, b| a * b }.reduce(0.0, :+)
      end
      output_y = hidden_y.map { |y| sigmoid(y) }

      loss = 0.0
      dotrain = false
      if @output_units == 1
        # Single output: above 0.5 means label 0.
        l = output_y[0] > 0.5 ? 0 : 1
        if label == 0
          dotrain = true if output_y[0] < 1.0 - MARGIN
        else
          dotrain = true if output_y[0] > MARGIN
        end
        loss = (label == l) ? 0.0 : 1.0
      else
        max_p, l = output_y.each_with_index.max
        if l == label
          dotrain = true if max_p < 1.0 - MARGIN
        else
          loss = 1.0
          dotrain = true
        end
      end

      if dotrain
        # Error term of every output unit, computed from its pre-sigmoid
        # value. NOTE(review): the derivation of this expression is not
        # visible here; it is kept exactly as it was.
        output_bp = @output_units.times.map do |i|
          y = hidden_y[i]
          yt = (label == i) ? 1.0 : 0.0
          expy = Math.exp(y)
          -((2.0 * yt - 1.0) * expy + yt) / (Math.exp(2.0 * y) + 2.0 * expy + 1.0)
        end
        # Error term of every hidden unit. This must be computed before
        # @hidden is modified below, because it reads the current weights.
        hidden_bp = @hidden_units.times.map do |j|
          y = 0.0
          @output_units.times do |i|
            y += output_bp[i] * @hidden[i][j]
          end
          y * (1.0 - input_y[j]) * input_y[j]
        end
        @output_units.times do |j|
          hidden = @hidden[j]
          @hidden_units.times do |i|
            hidden[i] -= HR * input_y[i] * output_bp[j]
          end
          @hidden_bias[j] -= HR * output_bp[j]
        end
        @hidden_units.times do |i|
          input = @input[i]
          vec.each do |k, v|
            input[k] -= IR * v * hidden_bp[i]
          end
          @input_bias[i] -= IR * hidden_bp[i]
        end
      end
      loss
    end

    # Total number of weights stored across all hidden units. The argument
    # is accepted for interface compatibility and is not used.
    def features(i = -1)
      @input.map { |v| v.size }.reduce(0, :+)
    end

    def sigmoid(a)
      1.0 / (1.0 + Math.exp(-a))
    end

    # Random initial value in the range [-0.5, 0.5).
    def default_value
      (rand - 0.5)
    end

    # Random noise scaled by NOISE_VAR.
    # The logarithm is taken of (1.0 - rand) rather than rand: rand can
    # return exactly 0.0, and Math.log(0.0) is -Infinity.
    def noise
      (Math.sqrt(-2.0 * Math.log(1.0 - rand)) * Math.sin(2.0 * Math::PI * rand)) * NOISE_VAR
    end

    # Number of training passes used when the caller does not specify one.
    def default_iteration
      DEFAULT_ITERATION
    end

    # Returns Ruby source defining the trained parameters as constants.
    #
    # lang            - only :ruby (or nil) is supported; anything else
    #                   raises NotImplementedError.
    # index_converter - callable applied to every input weight key before
    #                   output.
    def parameter_code(lang, index_converter = lambda{|i| i})
      raise NotImplementedError unless (lang || :ruby) == :ruby

      wvec = @input.map {|w|
        w.reduce({}) {|h, kv| h[index_converter.call(kv[0])] = kv[1]; h }
      }
      <<CODE
HIDDEN_UNITS = #{@hidden_units}
INPUT_BIAS = #{@input_bias.inspect}
HIDDEN_BIAS = #{@hidden_bias.inspect}
INPUT_W = JSON.load(#{wvec.to_json.inspect})
HIDDEN_W = #{@hidden.inspect}
CODE
    end

    # Returns Ruby source of the generated self.classify and self.sigmoid
    # methods, which read the constants written by parameter_code plus K.
    #
    # lang - only :ruby (or nil) is supported; anything else raises
    #        NotImplementedError.
    def classify_method_code(lang)
      raise NotImplementedError unless (lang || :ruby) == :ruby

      <<CODE
def self.classify(vec)
input_y = []
output_y = []
HIDDEN_UNITS.times do |i|
input_y[i] = sigmoid(INPUT_BIAS[i] +
INPUT_W[i].values_at(*vec).compact.reduce(0.0, :+))
end
if (K == 2)
HIDDEN_BIAS[0] +
input_y.zip(HIDDEN_W[0]).map{|a, b| a * b }.reduce(:+) > 0.0 ? 0 : 1
else
K.times.map{|i|
[HIDDEN_BIAS[i] + input_y.zip(HIDDEN_W[i]).map{|a, b| a * b }.reduce(:+), i]
}.max.pop
end
end
def self.sigmoid(a)
1.0 / (1.0 + Math.exp(-a))
end
CODE
    end
  end
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'linear_classifier'))
|
3
|
+
|
4
|
+
module NekonekoGen
  # Passive Aggressive
  #
  # One weight Hash (@w) and one bias (@bias) are kept per one-vs-rest unit.
  # When k == 2 a single unit is used, otherwise one unit per label.
  # options[:method] selects the step-size rule (:pa, :pa1 or :pa2).
  class PA < LinearClassifier
    C = 1.0                 # default aggressiveness parameter (used when options[:c] is not given)
    NORM = 2.0 # norm + BIAS (presumably a unit-norm input plus the constant bias input; confirm against the caller)
    DEFAULT_ITERATION = 20  # value reported by default_iteration

    # k       - number of labels.
    # options - Hash; options[:c] overrides C, options[:method] picks the
    #           step-size rule. Any method other than :pa or :pa1
    #           (including none) selects pa2.
    def initialize(k, options = {})
      @k = k
      @c = options[:c] || C
      units = @k == 2 ? 1 : @k
      @w = (0...units).map { Hash.new(0.0) }
      @bias = (0...units).map { 0.0 }
      @tau =
        case options[:method]
        when :pa
          lambda { |y, l| pa(y, l) }
        when :pa1
          lambda { |y, l| pa1(y, l) }
        else
          lambda { |y, l| pa2(y, l) }
        end
    end

    # PA-II step: the loss divided by (NORM + 1 / (2 * C)), signed by y.
    # The previous expression, l / NORM + 0.5 / @c, added the regularization
    # term to the step instead of to the denominator.
    def pa2(y, l)
      y * l / (NORM + 0.5 / @c)
    end

    # PA-I step: the plain step capped at C, signed by y.
    def pa1(y, l)
      y * [@c, (l / NORM)].min
    end

    # Plain PA step: the loss divided by NORM, signed by y.
    def pa(y, l)
      y * l / NORM
    end

    # Trains unit i on one example.
    #
    # i     - index of the unit to update.
    # vec   - feature => value pairs.
    # label - label of the example; the unit is trained towards +1 when
    #         label == i and towards -1 otherwise.
    #
    # Returns 1.0 when the score computed *before* the update had the wrong
    # sign for this unit, 0.0 otherwise.
    def update_at(i, vec, label)
      y = label == i ? 1 : -1
      w = @w[i]
      score = @bias[i] + dot(vec, w)
      l = 1.0 - score * y
      if l > 0.0
        alpha = @tau.call(y, l)
        vec.each do |k, v|
          w[k] += alpha * v
        end
        # The bias is treated like a feature whose value is always 1.
        @bias[i] += alpha
      end
      y * score < 0.0 ? 1.0 : 0.0
    end

    # Number of training passes used when the caller does not specify one.
    def default_iteration
      DEFAULT_ITERATION
    end
  end
end
|
@@ -1,9 +1,8 @@
|
|
1
1
|
# -*- coding: utf-8 -*-
|
2
|
-
require 'json'
|
3
2
|
require 'nkf'
|
4
3
|
require 'bimyou_segmenter'
|
5
4
|
|
6
|
-
require File.expand_path(File.join(File.dirname(__FILE__), '
|
5
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'classifier_factory'))
|
7
6
|
|
8
7
|
module NekonekoGen
|
9
8
|
class TextClassifierGenerator
|
@@ -15,16 +14,15 @@ module NekonekoGen
|
|
15
14
|
@files = files
|
16
15
|
@word2id = {}
|
17
16
|
@id2word = {}
|
18
|
-
@
|
19
|
-
|
17
|
+
@classifier = ClassifierFactory.create(files.size, options)
|
20
18
|
@name = safe_name(@filename).split("_").map(&:capitalize).join
|
21
19
|
@labels = files.map {|file| "#{safe_name(file).upcase}"}
|
22
20
|
end
|
23
21
|
|
24
|
-
def train(iteration =
|
25
|
-
iteration ||=
|
22
|
+
def train(iteration = nil)
|
23
|
+
iteration ||= @classifier.default_iteration
|
26
24
|
data = []
|
27
|
-
@
|
25
|
+
@classifier.k.times do |i|
|
28
26
|
t = Time.now
|
29
27
|
data[i] = []
|
30
28
|
print "loading #{@files[i]}... "
|
@@ -49,31 +47,35 @@ module NekonekoGen
|
|
49
47
|
t = Time.now
|
50
48
|
print sprintf("step %3d...", step)
|
51
49
|
|
52
|
-
@
|
50
|
+
@classifier.k.times.map do |i|
|
53
51
|
sampling(data[i], samples).map {|vec| [vec, i] }
|
54
52
|
end.flatten(1).shuffle!.each do |v|
|
55
|
-
loss += @
|
53
|
+
loss += @classifier.update(v[0], v[1])
|
56
54
|
c += 1
|
57
55
|
end
|
58
56
|
print sprintf(" %.6f, %.4fs\n", 1.0 - loss / c.to_f, Time.now - t)
|
59
57
|
end
|
60
|
-
@
|
61
|
-
|
62
|
-
|
63
|
-
@arow.w.each_with_index do |w, i|
|
64
|
-
puts "#{@labels[i]} : #{w.size} features"
|
58
|
+
if (@classifier.k > 2)
|
59
|
+
@classifier.k.times do |i|
|
60
|
+
puts "#{@labels[i]} : #{@classifier.features(i)} features"
|
65
61
|
end
|
66
62
|
else
|
67
|
-
puts "#{@labels[0]}, #{@labels[1]} : #{@
|
63
|
+
puts "#{@labels[0]}, #{@labels[1]} : #{@classifier.features(0)} features"
|
68
64
|
end
|
69
65
|
puts "done nyan! "
|
70
66
|
end
|
71
|
-
def generate
|
72
|
-
|
73
|
-
|
74
|
-
|
67
|
+
def generate(lang = :ruby)
|
68
|
+
lang ||= :ruby
|
69
|
+
case lang
|
70
|
+
when :ruby
|
71
|
+
generate_ruby_code
|
72
|
+
else
|
73
|
+
raise NotImplementedError
|
74
|
+
end
|
75
|
+
@name
|
76
|
+
end
|
77
|
+
def generate_ruby_code
|
75
78
|
labels = @labels.each_with_index.map{|v, i| " #{v} = #{i}"}.join("\n")
|
76
|
-
|
77
79
|
File.open(@filename, "w") do |f|
|
78
80
|
f.write <<MODEL
|
79
81
|
# -*- coding: utf-8 -*-
|
@@ -82,9 +84,21 @@ require 'json'
|
|
82
84
|
require 'bimyou_segmenter'
|
83
85
|
|
84
86
|
class #{@name}
|
87
|
+
def self.k
|
88
|
+
K
|
89
|
+
end
|
85
90
|
def self.predict(text)
|
91
|
+
classify(fv(text))
|
92
|
+
end
|
93
|
+
|
94
|
+
#{labels}
|
95
|
+
LABELS = #{@labels.inspect}
|
96
|
+
K = #{@classifier.k}
|
97
|
+
|
98
|
+
private
|
99
|
+
def self.fv(text)
|
86
100
|
prev = nil
|
87
|
-
|
101
|
+
BimyouSegmenter.segment(text).map do |word|
|
88
102
|
if (prev)
|
89
103
|
if (NGRAM_TARGET =~ word)
|
90
104
|
nword = [prev + word, word]
|
@@ -101,27 +115,14 @@ class #{@name}
|
|
101
115
|
word
|
102
116
|
end
|
103
117
|
end.flatten(1)
|
104
|
-
vec << " bias "
|
105
|
-
if (W.size == 1)
|
106
|
-
W[0].values_at(*vec).compact.reduce(:+) > 0.0 ? 0 : 1
|
107
|
-
else
|
108
|
-
W.each_with_index.map {|w,i|
|
109
|
-
[w.values_at(*vec).compact.reduce(:+), i]
|
110
|
-
}.max.pop
|
111
|
-
end
|
112
118
|
end
|
113
|
-
|
114
|
-
W.size == 1 ? 2 : W.size
|
115
|
-
end
|
116
|
-
#{labels}
|
117
|
-
LABELS = #{@labels.inspect}
|
119
|
+
#{@classifier.classify_method_code(:ruby)}
|
118
120
|
|
119
|
-
private
|
120
121
|
NGRAM_TARGET = Regexp.new('(^[ァ-ヾ]+$)|(^[a-zA-Z\\-_a-zA-Z‐_0-90-9]+$)|' +
|
121
122
|
'(^[々〇ヵヶ' + [0x3400].pack('U') + '-' + [0x9FFF].pack('U') +
|
122
123
|
[0xF900].pack('U') + '-' + [0xFAFF].pack('U') +
|
123
|
-
|
124
|
-
|
124
|
+
[0x20000].pack('U') + '-' + [0x2FFFF].pack('U') + ']+$)')
|
125
|
+
#{@classifier.parameter_code(:ruby, lambda{|id| id2word(id) })}
|
125
126
|
end
|
126
127
|
MODEL
|
127
128
|
end
|
@@ -143,8 +144,6 @@ MODEL
|
|
143
144
|
end
|
144
145
|
def fv(text)
|
145
146
|
vec = Hash.new(0)
|
146
|
-
vec[word2id(" bias ")] = 1
|
147
|
-
|
148
147
|
prev = nil
|
149
148
|
words = BimyouSegmenter.segment(text, :white_space => true).map do |word|
|
150
149
|
if (prev)
|
@@ -170,7 +169,7 @@ MODEL
|
|
170
169
|
vec
|
171
170
|
end
|
172
171
|
def normalize(vec)
|
173
|
-
norm = Math.sqrt(vec.
|
172
|
+
norm = Math.sqrt(vec.values.map{|v| v * v }.reduce(:+))
|
174
173
|
if (norm > 0.0)
|
175
174
|
s = 1.0 / norm
|
176
175
|
vec.each do |k, v|
|
data/lib/nekoneko_gen/version.rb
CHANGED
data/lib/nekoneko_gen.rb
CHANGED
@@ -5,29 +5,51 @@ require 'optparse'
|
|
5
5
|
require 'fileutils'
|
6
6
|
|
7
7
|
module NekonekoGen
|
8
|
-
DEFAULT_ITERATION = 20
|
9
|
-
|
10
8
|
def self.run(argv)
|
11
|
-
iteration =
|
9
|
+
iteration = nil
|
12
10
|
rubyfile = nil
|
13
11
|
quiet = false
|
14
|
-
|
15
|
-
$stdout.sync = true
|
16
12
|
|
13
|
+
$stdout.sync = true
|
14
|
+
method = nil
|
15
|
+
c = nil
|
17
16
|
opt = OptionParser.new do |o|
|
18
17
|
o.on('-n NAME', 'new classifier name') do |v|
|
19
18
|
rubyfile = File.join(File.dirname(v), File.basename(v, ".*") + ".rb")
|
20
19
|
FileUtils.touch(rubyfile)
|
21
20
|
end
|
22
|
-
o.on('-i N', "iteration
|
21
|
+
o.on('-i N', "iteration (default: auto)") do |v|
|
23
22
|
iteration = v.to_i.abs
|
24
23
|
end
|
24
|
+
o.on('-m METHOD', "machine learning method [AROW|PA2|MLP] (default AROW)") do |v|
|
25
|
+
if (v)
|
26
|
+
case v.downcase
|
27
|
+
when 'arow'
|
28
|
+
method = :arow
|
29
|
+
when 'pa1'
|
30
|
+
method = :pa1
|
31
|
+
when 'pa2'
|
32
|
+
method = :pa2
|
33
|
+
when 'mlp'
|
34
|
+
method = :mlp
|
35
|
+
else
|
36
|
+
warn opt
|
37
|
+
return -1
|
38
|
+
end
|
39
|
+
else
|
40
|
+
warn opt
|
41
|
+
return -1
|
42
|
+
end
|
43
|
+
end
|
44
|
+
o.on('-p C', "parameter (default AROW::R=6.0, PA2::C=1.0, MLP::HIDDEN_UNIT=K)") do |v|
|
45
|
+
c = v.to_f
|
46
|
+
end
|
25
47
|
o.on('-q', "quiet") do
|
26
48
|
quiet = true
|
27
49
|
end
|
28
50
|
end
|
29
51
|
opt.version = NekonekoGen::VERSION
|
30
|
-
opt.banner = "Usage: nekoneko_gen -n
|
52
|
+
opt.banner = "Usage: nekoneko_gen [OPTIONS] -n NAME FILE1 FILE2 [FILES...]"
|
31
53
|
files = opt.parse(argv)
|
32
54
|
|
33
55
|
unless (rubyfile)
|
@@ -45,7 +67,7 @@ module NekonekoGen
|
|
45
67
|
end
|
46
68
|
end
|
47
69
|
|
48
|
-
gen = NekonekoGen::TextClassifierGenerator.new(rubyfile, files)
|
70
|
+
gen = NekonekoGen::TextClassifierGenerator.new(rubyfile, files, {:method => method, :c => c})
|
49
71
|
if (quiet)
|
50
72
|
gen.quiet = true
|
51
73
|
end
|
data/test/nekoneko_gen_test.rb
CHANGED
@@ -6,118 +6,146 @@ class NekonekoGenTest < Test::Unit::TestCase
|
|
6
6
|
@file0 = File.join(File.dirname(__FILE__), 'class0.txt')
|
7
7
|
@file1 = File.join(File.dirname(__FILE__), 'class1.txt')
|
8
8
|
@file2 = File.join(File.dirname(__FILE__), 'class2.txt')
|
9
|
-
@
|
10
|
-
@output_file3 = File.join(Dir.tmpdir, "nekoneko_test3_classifier.rb")
|
9
|
+
@clean_files = []
|
11
10
|
end
|
12
11
|
def teardown
|
13
|
-
|
12
|
+
@clean_files.each do |file|
|
13
|
+
if (File.exist?(file))
|
14
|
+
File.unlink(file)
|
15
|
+
end
|
16
|
+
end
|
14
17
|
end
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
18
|
+
|
19
|
+
def test_mlp
|
20
|
+
gen2('mlp', {:method => :mlp})
|
21
|
+
gen3('mlp', {:method => :mlp})
|
22
|
+
end
|
23
|
+
def test_pa2
|
24
|
+
gen2('pa2', {:method => :pa2})
|
25
|
+
gen3('pa2', {:method => :pa2})
|
26
|
+
end
|
27
|
+
def test_arow
|
28
|
+
gen2('arow', {:method => :arow})
|
29
|
+
gen3('arow',{:method => :arow})
|
30
|
+
end
|
31
|
+
|
32
|
+
def clean!(a, b)
|
33
|
+
if (File.exist?(a))
|
34
|
+
File.unlink(a)
|
19
35
|
end
|
20
|
-
|
21
|
-
File.unlink(
|
22
|
-
rescue
|
36
|
+
if (File.exist?(b))
|
37
|
+
File.unlink(b)
|
23
38
|
end
|
24
|
-
end
|
39
|
+
end
|
25
40
|
|
26
|
-
def
|
27
|
-
|
41
|
+
def gen2(prefix, options)
|
42
|
+
p "---- #{prefix} generate 2class"
|
43
|
+
output_file2 = File.join(Dir.tmpdir, "nekoneko_test2_#{prefix}_classifier.rb")
|
44
|
+
output_file3 = File.join(Dir.tmpdir, "nekoneko_test3_#{prefix}_classifier.rb")
|
28
45
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
gen.generate
|
46
|
+
clean!(output_file2, output_file3)
|
47
|
+
@clean_files << output_file2
|
48
|
+
@clean_files << output_file3
|
33
49
|
|
34
|
-
|
35
|
-
|
50
|
+
gen = NekonekoGen::TextClassifierGenerator.new(output_file2, [@file0, @file1], options)
|
51
|
+
gen.train
|
52
|
+
modname = gen.generate
|
53
|
+
|
54
|
+
unless (File.exist?(output_file2))
|
55
|
+
assert_equal "#{output_file2} not found", nil
|
36
56
|
end
|
37
57
|
|
38
58
|
begin
|
39
|
-
load
|
59
|
+
load output_file2
|
40
60
|
|
61
|
+
mod = Kernel.const_get(modname)
|
41
62
|
ok = 0
|
42
63
|
count = 0
|
43
64
|
File.open(@file0) do |f|
|
44
65
|
until f.eof?
|
45
|
-
if (
|
66
|
+
if (mod.predict(f.readline) == mod::CLASS0)
|
46
67
|
ok += 1
|
47
68
|
end
|
48
69
|
count += 1
|
49
70
|
end
|
50
71
|
end
|
51
|
-
puts "#{
|
72
|
+
puts "#{mod::LABELS[0]}: #{ok.to_f / count}"
|
52
73
|
assert ok.to_f / count > 0.9
|
53
|
-
|
74
|
+
|
54
75
|
ok = 0
|
55
76
|
count = 0
|
56
77
|
File.open(@file1) do |f|
|
57
78
|
until f.eof?
|
58
|
-
if (
|
79
|
+
if (mod.predict(f.readline) == mod::CLASS1)
|
59
80
|
ok += 1
|
60
81
|
end
|
61
82
|
count += 1
|
62
83
|
end
|
63
84
|
end
|
64
|
-
puts "#{
|
85
|
+
puts "#{mod::LABELS[1]}: #{ok.to_f / count}"
|
65
86
|
assert ok.to_f / count > 0.9
|
66
87
|
end
|
67
88
|
end
|
68
89
|
|
69
|
-
def
|
70
|
-
|
90
|
+
def gen3(prefix, options)
|
91
|
+
p "---- #{prefix} generate 3class"
|
92
|
+
output_file2 = File.join(Dir.tmpdir, "nekoneko_test2_#{prefix}_classifier.rb")
|
93
|
+
output_file3 = File.join(Dir.tmpdir, "nekoneko_test3_#{prefix}_classifier.rb")
|
94
|
+
|
95
|
+
clean!(output_file2, output_file3)
|
96
|
+
@clean_files << output_file2
|
97
|
+
@clean_files << output_file3
|
71
98
|
|
72
|
-
gen = NekonekoGen::TextClassifierGenerator.new(
|
73
|
-
|
74
|
-
gen.train
|
75
|
-
gen.generate
|
99
|
+
gen = NekonekoGen::TextClassifierGenerator.new(output_file3,
|
100
|
+
[@file0, @file1, @file2], options)
|
101
|
+
gen.train
|
102
|
+
modname = gen.generate
|
76
103
|
|
77
|
-
unless (File.exist?(
|
78
|
-
assert_equal "#{
|
104
|
+
unless (File.exist?(output_file3))
|
105
|
+
assert_equal "#{output_file3} not found", nil
|
79
106
|
end
|
80
107
|
|
81
108
|
begin
|
82
|
-
load
|
83
|
-
|
109
|
+
load output_file3
|
110
|
+
|
111
|
+
mod = Kernel.const_get(modname)
|
84
112
|
ok = 0
|
85
113
|
count = 0
|
86
114
|
File.open(@file0) do |f|
|
87
115
|
until f.eof?
|
88
|
-
if (
|
116
|
+
if (mod.predict(f.readline) == mod::CLASS0)
|
89
117
|
ok += 1
|
90
118
|
end
|
91
119
|
count += 1
|
92
120
|
end
|
93
121
|
end
|
94
|
-
puts "#{
|
122
|
+
puts "#{mod::LABELS[0]}: #{ok.to_f / count}"
|
95
123
|
assert ok.to_f / count > 0.9
|
96
124
|
|
97
125
|
ok = 0
|
98
126
|
count = 0
|
99
127
|
File.open(@file1) do |f|
|
100
128
|
until f.eof?
|
101
|
-
if (
|
129
|
+
if (mod.predict(f.readline) == mod::CLASS1)
|
102
130
|
ok += 1
|
103
131
|
end
|
104
132
|
count += 1
|
105
133
|
end
|
106
134
|
end
|
107
|
-
puts "#{
|
135
|
+
puts "#{mod::LABELS[1]}: #{ok.to_f / count}"
|
108
136
|
assert ok.to_f / count > 0.9
|
109
137
|
|
110
138
|
ok = 0
|
111
139
|
count = 0
|
112
140
|
File.open(@file2) do |f|
|
113
141
|
until f.eof?
|
114
|
-
if (
|
142
|
+
if (mod.predict(f.readline) == mod::CLASS2)
|
115
143
|
ok += 1
|
116
144
|
end
|
117
145
|
count += 1
|
118
146
|
end
|
119
147
|
end
|
120
|
-
puts "#{
|
148
|
+
puts "#{mod::LABELS[2]}: #{ok.to_f / count}"
|
121
149
|
assert ok.to_f / count > 0.9
|
122
150
|
end
|
123
151
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nekoneko_gen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-06-01 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bimyou_segmenter
|
16
|
-
requirement: &
|
16
|
+
requirement: &14306440 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *14306440
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: json
|
27
|
-
requirement: &
|
27
|
+
requirement: &14304220 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *14304220
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: test-unit
|
38
|
-
requirement: &
|
38
|
+
requirement: &14303060 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,7 +43,7 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *14303060
|
47
47
|
description: Japanese Text Classifier Generator
|
48
48
|
email:
|
49
49
|
- nagadomi@nurs.or.jp
|
@@ -60,6 +60,11 @@ files:
|
|
60
60
|
- bin/nekoneko_gen
|
61
61
|
- lib/nekoneko_gen.rb
|
62
62
|
- lib/nekoneko_gen/arow.rb
|
63
|
+
- lib/nekoneko_gen/classifier.rb
|
64
|
+
- lib/nekoneko_gen/classifier_factory.rb
|
65
|
+
- lib/nekoneko_gen/linear_classifier.rb
|
66
|
+
- lib/nekoneko_gen/mlp.rb
|
67
|
+
- lib/nekoneko_gen/pa.rb
|
63
68
|
- lib/nekoneko_gen/text_classifier_generator.rb
|
64
69
|
- lib/nekoneko_gen/version.rb
|
65
70
|
- nekoneko_gen.gemspec
|