nekoneko_gen 0.1.1 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/nekoneko_gen/arow.rb +22 -38
- data/lib/nekoneko_gen/classifier.rb +21 -0
- data/lib/nekoneko_gen/classifier_factory.rb +22 -0
- data/lib/nekoneko_gen/linear_classifier.rb +86 -0
- data/lib/nekoneko_gen/mlp.rb +176 -0
- data/lib/nekoneko_gen/pa.rb +68 -0
- data/lib/nekoneko_gen/text_classifier_generator.rb +39 -40
- data/lib/nekoneko_gen/version.rb +1 -1
- data/lib/nekoneko_gen.rb +30 -8
- data/test/nekoneko_gen_test.rb +69 -41
- metadata +13 -8
data/lib/nekoneko_gen/arow.rb
CHANGED
@@ -1,72 +1,56 @@
|
|
1
1
|
# -*- coding: utf-8 -*-
|
2
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'linear_classifier'))
|
3
|
+
|
2
4
|
module NekonekoGen
|
3
|
-
|
5
|
+
# Adaptive Regularization of Weight Vectors
|
6
|
+
class Arow < LinearClassifier
|
4
7
|
R = 6.0
|
5
|
-
|
8
|
+
DEFAULT_ITERATION = 20
|
9
|
+
|
6
10
|
def initialize(k, options = {})
|
7
|
-
@r = options[:
|
11
|
+
@r = options[:c] || R
|
8
12
|
@k = k
|
9
13
|
@cov = []
|
14
|
+
@covb = []
|
10
15
|
@w = []
|
16
|
+
@bias = []
|
11
17
|
if (@k == 2)
|
12
18
|
@cov[0] = Hash.new(1.0)
|
13
19
|
@w[0] = Hash.new(0.0)
|
20
|
+
@covb[0] = 1.0
|
21
|
+
@bias[0] = 0.0
|
14
22
|
else
|
15
23
|
k.times do |i|
|
16
24
|
@cov[i] = Hash.new(1.0)
|
17
25
|
@w[i] = Hash.new(0.0)
|
26
|
+
@covb[i] = 1.0
|
27
|
+
@bias[i] = 0.0
|
18
28
|
end
|
19
29
|
end
|
20
30
|
end
|
21
|
-
def update(vec, label)
|
22
|
-
loss = 0.0
|
23
|
-
if (@k == 2)
|
24
|
-
loss = update_at(0, vec, label)
|
25
|
-
else
|
26
|
-
nega = rand(@k - 1)
|
27
|
-
if (nega == label)
|
28
|
-
nega += 1
|
29
|
-
end
|
30
|
-
s = 1.0 / @k
|
31
|
-
@k.times do |i|
|
32
|
-
loss += update_at(i, vec, label) * s
|
33
|
-
end
|
34
|
-
end
|
35
|
-
loss
|
36
|
-
end
|
37
|
-
def strip!
|
38
|
-
@w.each do |w|
|
39
|
-
w.reject!{|k,v| v.abs <= Float::EPSILON }
|
40
|
-
end
|
41
|
-
@w
|
42
|
-
end
|
43
|
-
|
44
|
-
private
|
45
|
-
def dot(vec, w)
|
46
|
-
dot = 0.0
|
47
|
-
vec.each do |k, v|
|
48
|
-
if (a = w[k])
|
49
|
-
dot += a * v
|
50
|
-
end
|
51
|
-
end
|
52
|
-
dot
|
53
|
-
end
|
54
31
|
def update_at(i, vec, label)
|
55
32
|
w = @w[i]
|
56
33
|
cov = @cov[i]
|
34
|
+
covb = @covb[i]
|
35
|
+
bias = @bias[i]
|
57
36
|
y = label == i ? 1 : -1
|
58
|
-
score = dot(vec, w)
|
37
|
+
score = bias + dot(vec, w)
|
59
38
|
alpha = 1.0 - y * score
|
60
39
|
if (alpha > 0.0)
|
61
40
|
r_inv= 1.0 / @r
|
62
|
-
var = vec.map
|
41
|
+
var = vec.map{|k, v| cov[k] * v * v }.reduce(:+) + covb
|
63
42
|
alpha *= (1.0 / (var + @r)) * y
|
64
43
|
vec.each do |k, v|
|
65
44
|
w[k] += alpha * cov[k] * v
|
66
45
|
cov[k] = 1.0 / ((1.0 / cov[k]) + (v * v * r_inv))
|
67
46
|
end
|
47
|
+
@bias[i] += alpha * covb
|
48
|
+
@covb[i] = 1.0 / ((1.0 / covb) + r_inv)
|
68
49
|
end
|
69
50
|
score * y < 0.0 ? 1.0 : 0.0
|
70
51
|
end
|
52
|
+
def default_iteration
|
53
|
+
DEFAULT_ITERATION
|
54
|
+
end
|
71
55
|
end
|
72
56
|
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
module NekonekoGen
|
3
|
+
class Classifier
|
4
|
+
attr_reader :k
|
5
|
+
def parameter_code(index_converter = nil)
|
6
|
+
raise NotImplementedError
|
7
|
+
end
|
8
|
+
def classify_method_code
|
9
|
+
raise NotImplementedError
|
10
|
+
end
|
11
|
+
def update(vec, label)
|
12
|
+
raise NotImplementedError
|
13
|
+
end
|
14
|
+
def features(i = -1)
|
15
|
+
raise NotImplementedError
|
16
|
+
end
|
17
|
+
def default_iteration
|
18
|
+
raise NotImplementedError
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'arow'))
|
3
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'pa'))
|
4
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'mlp'))
|
5
|
+
|
6
|
+
module NekonekoGen
|
7
|
+
module ClassifierFactory
|
8
|
+
def self.create(k, options)
|
9
|
+
method = options[:method] || :arow
|
10
|
+
case (method)
|
11
|
+
when :arow
|
12
|
+
Arow.new(k, options)
|
13
|
+
when :pa, :pa1, :pa2
|
14
|
+
PA.new(k, options)
|
15
|
+
when :mlp
|
16
|
+
MLP.new(k, options)
|
17
|
+
else
|
18
|
+
raise ArgumentError
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
@@ -0,0 +1,86 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require 'json'
|
3
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'classifier'))
|
4
|
+
|
5
|
+
module NekonekoGen
|
6
|
+
class LinearClassifier < Classifier
|
7
|
+
attr_reader :w, :bias
|
8
|
+
def dot(vec, w)
|
9
|
+
dot = 0.0
|
10
|
+
vec.each do |k, v|
|
11
|
+
if (a = w[k])
|
12
|
+
dot += a * v
|
13
|
+
end
|
14
|
+
end
|
15
|
+
dot
|
16
|
+
end
|
17
|
+
def strip!
|
18
|
+
@w.each {|w|
|
19
|
+
w.reject!{|k,v|
|
20
|
+
if (v.abs < Float::EPSILON)
|
21
|
+
# p v
|
22
|
+
true
|
23
|
+
else
|
24
|
+
false
|
25
|
+
end
|
26
|
+
}
|
27
|
+
}
|
28
|
+
@w
|
29
|
+
end
|
30
|
+
def update(vec, label)
|
31
|
+
loss = 0.0
|
32
|
+
if (@k == 2)
|
33
|
+
loss = update_at(0, vec, label)
|
34
|
+
else
|
35
|
+
s = 1.0 / @k
|
36
|
+
@k.times do |i|
|
37
|
+
loss += update_at(i, vec, label) * s
|
38
|
+
end
|
39
|
+
end
|
40
|
+
loss
|
41
|
+
end
|
42
|
+
def features(i = -1)
|
43
|
+
if (i < 0)
|
44
|
+
w.reduce(0){|sum, v| sum + v.size }
|
45
|
+
else
|
46
|
+
w[i].size
|
47
|
+
end
|
48
|
+
end
|
49
|
+
def parameter_code(lang, index_converter = lambda{|i| i})
|
50
|
+
lang ||= :ruby
|
51
|
+
case lang
|
52
|
+
when :ruby
|
53
|
+
else
|
54
|
+
raise NotImplementedError
|
55
|
+
end
|
56
|
+
|
57
|
+
wvec = self.strip!.map {|w|
|
58
|
+
w.reduce({}) {|h, kv| h[index_converter.call(kv[0])] = kv[1]; h }
|
59
|
+
}
|
60
|
+
<<CODE
|
61
|
+
BIAS = #{self.bias.inspect}
|
62
|
+
W = JSON.load(#{wvec.to_json.inspect})
|
63
|
+
CODE
|
64
|
+
end
|
65
|
+
def classify_method_code(lang)
|
66
|
+
lang ||= :ruby
|
67
|
+
case lang
|
68
|
+
when :ruby
|
69
|
+
else
|
70
|
+
raise NotImplementedError
|
71
|
+
end
|
72
|
+
|
73
|
+
<<CODE
|
74
|
+
def self.classify(vec)
|
75
|
+
if (K == 2)
|
76
|
+
BIAS[0] + W[0].values_at(*vec).compact.reduce(0.0, :+) > 0.0 ? 0 : 1
|
77
|
+
else
|
78
|
+
W.each_with_index.map {|w, i|
|
79
|
+
[BIAS[i] + w.values_at(*vec).compact.reduce(0.0, :+), i]
|
80
|
+
}.max.pop
|
81
|
+
end
|
82
|
+
end
|
83
|
+
CODE
|
84
|
+
end
|
85
|
+
end
|
86
|
+
end
|
@@ -0,0 +1,176 @@
|
|
1
|
+
require 'json'
|
2
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'classifier'))
|
3
|
+
|
4
|
+
module NekonekoGen
|
5
|
+
# Multi Layer Perceptron
|
6
|
+
class MLP < Classifier
|
7
|
+
IR = 0.4
|
8
|
+
HR = 0.1
|
9
|
+
NOISE_VAR = 0.3
|
10
|
+
MARGIN = 0.2
|
11
|
+
DEFAULT_ITERATION = 40
|
12
|
+
|
13
|
+
def default_hidden_unit
|
14
|
+
@k
|
15
|
+
end
|
16
|
+
def initialize(k, options)
|
17
|
+
@k = k
|
18
|
+
@output_units = @k == 2 ? 1 : @k
|
19
|
+
@hidden_units = (options[:c] || default_hidden_unit).to_i
|
20
|
+
@input = []
|
21
|
+
@hidden = []
|
22
|
+
@input_bias = []
|
23
|
+
@hidden_bias = []
|
24
|
+
@hidden_units.times do |i|
|
25
|
+
@input[i] = Hash.new {|hash, key| hash[key] = default_value }
|
26
|
+
@input_bias[i] = default_value
|
27
|
+
end
|
28
|
+
@output_units.times do |i|
|
29
|
+
@hidden[i] = []
|
30
|
+
@hidden_units.times do |j|
|
31
|
+
@hidden[i][j] = default_value
|
32
|
+
end
|
33
|
+
@hidden_bias[i] = default_value
|
34
|
+
end
|
35
|
+
end
|
36
|
+
def update(vec, label)
|
37
|
+
input_y = []
|
38
|
+
hidden_y = []
|
39
|
+
output_y = []
|
40
|
+
|
41
|
+
input_y = @hidden_units.times.map do |i|
|
42
|
+
w = @input[i]
|
43
|
+
sigmoid(@input_bias[i] + vec.map{|k, v| w[k] * v}.reduce(:+) + noise)
|
44
|
+
end
|
45
|
+
hidden_y = @output_units.times.map do |i|
|
46
|
+
@hidden_bias[i] + input_y.zip(@hidden[i]).map{|a, b| a * b }.reduce(:+)
|
47
|
+
end
|
48
|
+
output_y = @output_units.times.map do |i|
|
49
|
+
sigmoid(hidden_y[i])
|
50
|
+
end
|
51
|
+
|
52
|
+
loss = 0.0
|
53
|
+
dotrain = false
|
54
|
+
if (@output_units == 1)
|
55
|
+
if (output_y[0] > 0.5)
|
56
|
+
l = 0
|
57
|
+
else
|
58
|
+
l = 1
|
59
|
+
end
|
60
|
+
if (label == 0)
|
61
|
+
if (output_y[0] < 1.0 - MARGIN)
|
62
|
+
dotrain = true
|
63
|
+
end
|
64
|
+
else
|
65
|
+
if (output_y[0] > MARGIN)
|
66
|
+
dotrain = true
|
67
|
+
end
|
68
|
+
end
|
69
|
+
loss = (label == l) ? 0.0 : 1.0
|
70
|
+
else
|
71
|
+
max_p, l = output_y.each_with_index.max
|
72
|
+
if (l == label)
|
73
|
+
if (max_p < 1.0 - MARGIN)
|
74
|
+
dotrain = true
|
75
|
+
end
|
76
|
+
else
|
77
|
+
loss = 1.0
|
78
|
+
dotrain = true
|
79
|
+
end
|
80
|
+
end
|
81
|
+
if (dotrain)
|
82
|
+
output_bp = @output_units.times.map do |i|
|
83
|
+
y = hidden_y[i]
|
84
|
+
yt = (label == i) ? 1.0 : 0.0
|
85
|
+
expy = Math.exp(y)
|
86
|
+
-((2.0 * yt - 1.0) * expy + yt) / (Math.exp(2.0 * y) + 2.0 * expy + 1.0)
|
87
|
+
end
|
88
|
+
hidden_bp = @hidden_units.times.map do |j|
|
89
|
+
y = 0.0
|
90
|
+
@output_units.times do |i|
|
91
|
+
y += output_bp[i] * @hidden[i][j]
|
92
|
+
end
|
93
|
+
y * (1.0 - input_y[j]) * input_y[j]
|
94
|
+
end
|
95
|
+
@output_units.times do |j|
|
96
|
+
hidden = @hidden[j]
|
97
|
+
@hidden_units.times do |i|
|
98
|
+
hidden[i] -= HR * input_y[i] * output_bp[j]
|
99
|
+
end
|
100
|
+
@hidden_bias[j] -= HR * output_bp[j]
|
101
|
+
end
|
102
|
+
@hidden_units.times do |i|
|
103
|
+
input = @input[i]
|
104
|
+
vec.each do |k, v|
|
105
|
+
input[k] -= IR * v * hidden_bp[i]
|
106
|
+
end
|
107
|
+
@input_bias[i] -= IR * hidden_bp[i]
|
108
|
+
end
|
109
|
+
end
|
110
|
+
loss
|
111
|
+
end
|
112
|
+
def features(i = -1)
|
113
|
+
@input.map{|v| v.size }.reduce(:+)
|
114
|
+
end
|
115
|
+
def sigmoid(a)
|
116
|
+
1.0 / (1.0 + Math.exp(-a))
|
117
|
+
end
|
118
|
+
def default_value
|
119
|
+
(rand - 0.5)
|
120
|
+
end
|
121
|
+
def noise
|
122
|
+
(Math.sqrt(-2.0 * Math.log(rand)) * Math.sin(2.0 * Math::PI * rand)) * NOISE_VAR
|
123
|
+
end
|
124
|
+
def default_iteration
|
125
|
+
DEFAULT_ITERATION
|
126
|
+
end
|
127
|
+
def parameter_code(lang, index_converter = lambda{|i| i})
|
128
|
+
lang ||= :ruby
|
129
|
+
case lang
|
130
|
+
when :ruby
|
131
|
+
else
|
132
|
+
raise NotImplementedError
|
133
|
+
end
|
134
|
+
|
135
|
+
wvec = @input.map {|w|
|
136
|
+
w.reduce({}) {|h, kv| h[index_converter.call(kv[0])] = kv[1]; h }
|
137
|
+
}
|
138
|
+
<<CODE
|
139
|
+
HIDDEN_UNITS = #{@hidden_units}
|
140
|
+
INPUT_BIAS = #{@input_bias.inspect}
|
141
|
+
HIDDEN_BIAS = #{@hidden_bias.inspect}
|
142
|
+
INPUT_W = JSON.load(#{wvec.to_json.inspect})
|
143
|
+
HIDDEN_W = #{@hidden.inspect}
|
144
|
+
CODE
|
145
|
+
end
|
146
|
+
def classify_method_code(lang)
|
147
|
+
lang ||= :ruby
|
148
|
+
case lang
|
149
|
+
when :ruby
|
150
|
+
else
|
151
|
+
raise NotImplementedError
|
152
|
+
end
|
153
|
+
<<CODE
|
154
|
+
def self.classify(vec)
|
155
|
+
input_y = []
|
156
|
+
output_y = []
|
157
|
+
HIDDEN_UNITS.times do |i|
|
158
|
+
input_y[i] = sigmoid(INPUT_BIAS[i] +
|
159
|
+
INPUT_W[i].values_at(*vec).compact.reduce(0.0, :+))
|
160
|
+
end
|
161
|
+
if (K == 2)
|
162
|
+
HIDDEN_BIAS[0] +
|
163
|
+
input_y.zip(HIDDEN_W[0]).map{|a, b| a * b }.reduce(:+) > 0.0 ? 0 : 1
|
164
|
+
else
|
165
|
+
K.times.map{|i|
|
166
|
+
[HIDDEN_BIAS[i] + input_y.zip(HIDDEN_W[i]).map{|a, b| a * b }.reduce(:+), i]
|
167
|
+
}.max.pop
|
168
|
+
end
|
169
|
+
end
|
170
|
+
def self.sigmoid(a)
|
171
|
+
1.0 / (1.0 + Math.exp(-a))
|
172
|
+
end
|
173
|
+
CODE
|
174
|
+
end
|
175
|
+
end
|
176
|
+
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'linear_classifier'))
|
3
|
+
|
4
|
+
module NekonekoGen
|
5
|
+
# Passive Aggressive
|
6
|
+
class PA < LinearClassifier
|
7
|
+
C = 1.0
|
8
|
+
NORM = 2.0 # norm + BIAS
|
9
|
+
DEFAULT_ITERATION = 20
|
10
|
+
|
11
|
+
def initialize(k, options = {})
|
12
|
+
@k = k
|
13
|
+
@c = options[:c] || C
|
14
|
+
@w = []
|
15
|
+
@bias = []
|
16
|
+
if (@k == 2)
|
17
|
+
@w[0] = Hash.new(0.0)
|
18
|
+
@bias[0] = 0.0
|
19
|
+
else
|
20
|
+
k.times do |i|
|
21
|
+
@w[i] = Hash.new(0.0)
|
22
|
+
@bias[i] = 0.0
|
23
|
+
end
|
24
|
+
end
|
25
|
+
if options[:method]
|
26
|
+
@tau =
|
27
|
+
case options[:method]
|
28
|
+
when :pa
|
29
|
+
lambda{|y, l| pa(y, l)}
|
30
|
+
when :pa1
|
31
|
+
lambda{|y, l| pa1(y, l)}
|
32
|
+
when :pa2
|
33
|
+
lambda{|y, l| pa2(y, l)}
|
34
|
+
else
|
35
|
+
lambda{|y, l| pa2(y, l)}
|
36
|
+
end
|
37
|
+
else
|
38
|
+
@tau = lambda{|y, l| pa2(y, l)}
|
39
|
+
end
|
40
|
+
end
|
41
|
+
def pa2(y, l)
|
42
|
+
y * (l / NORM + 0.5 / @c)
|
43
|
+
end
|
44
|
+
def pa1(y, l)
|
45
|
+
y * [@c, (l / NORM)].min
|
46
|
+
end
|
47
|
+
def pa(y, l)
|
48
|
+
y * l / NORM
|
49
|
+
end
|
50
|
+
def update_at(i, vec, label)
|
51
|
+
y = label == i ? 1 : -1
|
52
|
+
w = @w[i]
|
53
|
+
score = @bias[i] + dot(vec, w)
|
54
|
+
l = 1.0 - score * y
|
55
|
+
if (l > 0.0)
|
56
|
+
alpha = @tau.call(y, l)
|
57
|
+
vec.each do |k, v|
|
58
|
+
w[k] += alpha * v
|
59
|
+
end
|
60
|
+
@bias[i] += alpha
|
61
|
+
end
|
62
|
+
y * score < 0.0 ? 1.0 : 0.0
|
63
|
+
end
|
64
|
+
def default_iteration
|
65
|
+
DEFAULT_ITERATION
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
@@ -1,9 +1,8 @@
|
|
1
1
|
# -*- coding: utf-8 -*-
|
2
|
-
require 'json'
|
3
2
|
require 'nkf'
|
4
3
|
require 'bimyou_segmenter'
|
5
4
|
|
6
|
-
require File.expand_path(File.join(File.dirname(__FILE__), '
|
5
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'classifier_factory'))
|
7
6
|
|
8
7
|
module NekonekoGen
|
9
8
|
class TextClassifierGenerator
|
@@ -15,16 +14,15 @@ module NekonekoGen
|
|
15
14
|
@files = files
|
16
15
|
@word2id = {}
|
17
16
|
@id2word = {}
|
18
|
-
@
|
19
|
-
|
17
|
+
@classifier = ClassifierFactory.create(files.size, options)
|
20
18
|
@name = safe_name(@filename).split("_").map(&:capitalize).join
|
21
19
|
@labels = files.map {|file| "#{safe_name(file).upcase}"}
|
22
20
|
end
|
23
21
|
|
24
|
-
def train(iteration =
|
25
|
-
iteration ||=
|
22
|
+
def train(iteration = nil)
|
23
|
+
iteration ||= @classifier.default_iteration
|
26
24
|
data = []
|
27
|
-
@
|
25
|
+
@classifier.k.times do |i|
|
28
26
|
t = Time.now
|
29
27
|
data[i] = []
|
30
28
|
print "loading #{@files[i]}... "
|
@@ -49,31 +47,35 @@ module NekonekoGen
|
|
49
47
|
t = Time.now
|
50
48
|
print sprintf("step %3d...", step)
|
51
49
|
|
52
|
-
@
|
50
|
+
@classifier.k.times.map do |i|
|
53
51
|
sampling(data[i], samples).map {|vec| [vec, i] }
|
54
52
|
end.flatten(1).shuffle!.each do |v|
|
55
|
-
loss += @
|
53
|
+
loss += @classifier.update(v[0], v[1])
|
56
54
|
c += 1
|
57
55
|
end
|
58
56
|
print sprintf(" %.6f, %.4fs\n", 1.0 - loss / c.to_f, Time.now - t)
|
59
57
|
end
|
60
|
-
@
|
61
|
-
|
62
|
-
|
63
|
-
@arow.w.each_with_index do |w, i|
|
64
|
-
puts "#{@labels[i]} : #{w.size} features"
|
58
|
+
if (@classifier.k > 2)
|
59
|
+
@classifier.k.times do |i|
|
60
|
+
puts "#{@labels[i]} : #{@classifier.features(i)} features"
|
65
61
|
end
|
66
62
|
else
|
67
|
-
puts "#{@labels[0]}, #{@labels[1]} : #{@
|
63
|
+
puts "#{@labels[0]}, #{@labels[1]} : #{@classifier.features(0)} features"
|
68
64
|
end
|
69
65
|
puts "done nyan! "
|
70
66
|
end
|
71
|
-
def generate
|
72
|
-
|
73
|
-
|
74
|
-
|
67
|
+
def generate(lang = :ruby)
|
68
|
+
lang ||= :ruby
|
69
|
+
case lang
|
70
|
+
when :ruby
|
71
|
+
generate_ruby_code
|
72
|
+
else
|
73
|
+
raise NotImplementedError
|
74
|
+
end
|
75
|
+
@name
|
76
|
+
end
|
77
|
+
def generate_ruby_code
|
75
78
|
labels = @labels.each_with_index.map{|v, i| " #{v} = #{i}"}.join("\n")
|
76
|
-
|
77
79
|
File.open(@filename, "w") do |f|
|
78
80
|
f.write <<MODEL
|
79
81
|
# -*- coding: utf-8 -*-
|
@@ -82,9 +84,21 @@ require 'json'
|
|
82
84
|
require 'bimyou_segmenter'
|
83
85
|
|
84
86
|
class #{@name}
|
87
|
+
def self.k
|
88
|
+
K
|
89
|
+
end
|
85
90
|
def self.predict(text)
|
91
|
+
classify(fv(text))
|
92
|
+
end
|
93
|
+
|
94
|
+
#{labels}
|
95
|
+
LABELS = #{@labels.inspect}
|
96
|
+
K = #{@classifier.k}
|
97
|
+
|
98
|
+
private
|
99
|
+
def self.fv(text)
|
86
100
|
prev = nil
|
87
|
-
|
101
|
+
BimyouSegmenter.segment(text).map do |word|
|
88
102
|
if (prev)
|
89
103
|
if (NGRAM_TARGET =~ word)
|
90
104
|
nword = [prev + word, word]
|
@@ -101,27 +115,14 @@ class #{@name}
|
|
101
115
|
word
|
102
116
|
end
|
103
117
|
end.flatten(1)
|
104
|
-
vec << " bias "
|
105
|
-
if (W.size == 1)
|
106
|
-
W[0].values_at(*vec).compact.reduce(:+) > 0.0 ? 0 : 1
|
107
|
-
else
|
108
|
-
W.each_with_index.map {|w,i|
|
109
|
-
[w.values_at(*vec).compact.reduce(:+), i]
|
110
|
-
}.max.pop
|
111
|
-
end
|
112
118
|
end
|
113
|
-
|
114
|
-
W.size == 1 ? 2 : W.size
|
115
|
-
end
|
116
|
-
#{labels}
|
117
|
-
LABELS = #{@labels.inspect}
|
119
|
+
#{@classifier.classify_method_code(:ruby)}
|
118
120
|
|
119
|
-
private
|
120
121
|
NGRAM_TARGET = Regexp.new('(^[ァ-ヾ]+$)|(^[a-zA-Z\\-_a-zA-Z‐_0-90-9]+$)|' +
|
121
122
|
'(^[々〇ヵヶ' + [0x3400].pack('U') + '-' + [0x9FFF].pack('U') +
|
122
123
|
[0xF900].pack('U') + '-' + [0xFAFF].pack('U') +
|
123
|
-
|
124
|
-
|
124
|
+
[0x20000].pack('U') + '-' + [0x2FFFF].pack('U') + ']+$)')
|
125
|
+
#{@classifier.parameter_code(:ruby, lambda{|id| id2word(id) })}
|
125
126
|
end
|
126
127
|
MODEL
|
127
128
|
end
|
@@ -143,8 +144,6 @@ MODEL
|
|
143
144
|
end
|
144
145
|
def fv(text)
|
145
146
|
vec = Hash.new(0)
|
146
|
-
vec[word2id(" bias ")] = 1
|
147
|
-
|
148
147
|
prev = nil
|
149
148
|
words = BimyouSegmenter.segment(text, :white_space => true).map do |word|
|
150
149
|
if (prev)
|
@@ -170,7 +169,7 @@ MODEL
|
|
170
169
|
vec
|
171
170
|
end
|
172
171
|
def normalize(vec)
|
173
|
-
norm = Math.sqrt(vec.
|
172
|
+
norm = Math.sqrt(vec.values.map{|v| v * v }.reduce(:+))
|
174
173
|
if (norm > 0.0)
|
175
174
|
s = 1.0 / norm
|
176
175
|
vec.each do |k, v|
|
data/lib/nekoneko_gen/version.rb
CHANGED
data/lib/nekoneko_gen.rb
CHANGED
@@ -5,29 +5,51 @@ require 'optparse'
|
|
5
5
|
require 'fileutils'
|
6
6
|
|
7
7
|
module NekonekoGen
|
8
|
-
DEFAULT_ITERATION = 20
|
9
|
-
|
10
8
|
def self.run(argv)
|
11
|
-
iteration =
|
9
|
+
iteration = nil
|
12
10
|
rubyfile = nil
|
13
11
|
quiet = false
|
14
|
-
|
15
|
-
$stdout.sync = true
|
16
12
|
|
13
|
+
$stdout.sync = true
|
14
|
+
method = nil
|
15
|
+
c = nil
|
17
16
|
opt = OptionParser.new do |o|
|
18
17
|
o.on('-n NAME', 'new classifier name') do |v|
|
19
18
|
rubyfile = File.join(File.dirname(v), File.basename(v, ".*") + ".rb")
|
20
19
|
FileUtils.touch(rubyfile)
|
21
20
|
end
|
22
|
-
o.on('-i N', "iteration
|
21
|
+
o.on('-i N', "iteration (default: auto)") do |v|
|
23
22
|
iteration = v.to_i.abs
|
24
23
|
end
|
24
|
+
o.on('-m METHOD', "machine learning method [AROW|PA2|MLP] (default AROW)") do |v|
|
25
|
+
if (v)
|
26
|
+
case v.downcase
|
27
|
+
when 'arow'
|
28
|
+
method = :arow
|
29
|
+
when 'pa1'
|
30
|
+
method = :pa1
|
31
|
+
when 'pa2'
|
32
|
+
method = :pa2
|
33
|
+
when 'mlp'
|
34
|
+
method = :mlp
|
35
|
+
else
|
36
|
+
warn opt
|
37
|
+
return -1
|
38
|
+
end
|
39
|
+
else
|
40
|
+
warn opt
|
41
|
+
return -1
|
42
|
+
end
|
43
|
+
end
|
44
|
+
o.on('-p C', "parameter (default AROW::R=6.0, PA2::C=1.0, MLP::HIDDEN_UNIT=K)") do |v|
|
45
|
+
c = v.to_f
|
46
|
+
end
|
25
47
|
o.on('-q', "quiet") do
|
26
48
|
quiet = true
|
27
49
|
end
|
28
50
|
end
|
29
51
|
opt.version = NekonekoGen::VERSION
|
30
|
-
opt.banner = "Usage: nekoneko_gen -n
|
52
|
+
opt.banner = "Usage: nekoneko_gen [OPTIONS] -n NAME FILE1 FILE2 [FILES...]"
|
31
53
|
files = opt.parse(argv)
|
32
54
|
|
33
55
|
unless (rubyfile)
|
@@ -45,7 +67,7 @@ module NekonekoGen
|
|
45
67
|
end
|
46
68
|
end
|
47
69
|
|
48
|
-
gen = NekonekoGen::TextClassifierGenerator.new(rubyfile, files)
|
70
|
+
gen = NekonekoGen::TextClassifierGenerator.new(rubyfile, files, {:method => method, :c => c})
|
49
71
|
if (quiet)
|
50
72
|
gen.quiet = true
|
51
73
|
end
|
data/test/nekoneko_gen_test.rb
CHANGED
@@ -6,118 +6,146 @@ class NekonekoGenTest < Test::Unit::TestCase
|
|
6
6
|
@file0 = File.join(File.dirname(__FILE__), 'class0.txt')
|
7
7
|
@file1 = File.join(File.dirname(__FILE__), 'class1.txt')
|
8
8
|
@file2 = File.join(File.dirname(__FILE__), 'class2.txt')
|
9
|
-
@
|
10
|
-
@output_file3 = File.join(Dir.tmpdir, "nekoneko_test3_classifier.rb")
|
9
|
+
@clean_files = []
|
11
10
|
end
|
12
11
|
def teardown
|
13
|
-
|
12
|
+
@clean_files.each do |file|
|
13
|
+
if (File.exist?(file))
|
14
|
+
File.unlink(file)
|
15
|
+
end
|
16
|
+
end
|
14
17
|
end
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
18
|
+
|
19
|
+
def test_mlp
|
20
|
+
gen2('mlp', {:method => :mlp})
|
21
|
+
gen3('mlp', {:method => :mlp})
|
22
|
+
end
|
23
|
+
def test_pa2
|
24
|
+
gen2('pa2', {:method => :pa2})
|
25
|
+
gen3('pa2', {:method => :pa2})
|
26
|
+
end
|
27
|
+
def test_arow
|
28
|
+
gen2('arow', {:method => :arow})
|
29
|
+
gen3('arow',{:method => :arow})
|
30
|
+
end
|
31
|
+
|
32
|
+
def clean!(a, b)
|
33
|
+
if (File.exist?(a))
|
34
|
+
File.unlink(a)
|
19
35
|
end
|
20
|
-
|
21
|
-
File.unlink(
|
22
|
-
rescue
|
36
|
+
if (File.exist?(b))
|
37
|
+
File.unlink(b)
|
23
38
|
end
|
24
|
-
end
|
39
|
+
end
|
25
40
|
|
26
|
-
def
|
27
|
-
|
41
|
+
def gen2(prefix, options)
|
42
|
+
p "---- #{prefix} generate 2class"
|
43
|
+
output_file2 = File.join(Dir.tmpdir, "nekoneko_test2_#{prefix}_classifier.rb")
|
44
|
+
output_file3 = File.join(Dir.tmpdir, "nekoneko_test3_#{prefix}_classifier.rb")
|
28
45
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
gen.generate
|
46
|
+
clean!(output_file2, output_file3)
|
47
|
+
@clean_files << output_file2
|
48
|
+
@clean_files << output_file3
|
33
49
|
|
34
|
-
|
35
|
-
|
50
|
+
gen = NekonekoGen::TextClassifierGenerator.new(output_file2, [@file0, @file1], options)
|
51
|
+
gen.train
|
52
|
+
modname = gen.generate
|
53
|
+
|
54
|
+
unless (File.exist?(output_file2))
|
55
|
+
assert_equal "#{output_file2} not found", nil
|
36
56
|
end
|
37
57
|
|
38
58
|
begin
|
39
|
-
load
|
59
|
+
load output_file2
|
40
60
|
|
61
|
+
mod = Kernel.const_get(modname)
|
41
62
|
ok = 0
|
42
63
|
count = 0
|
43
64
|
File.open(@file0) do |f|
|
44
65
|
until f.eof?
|
45
|
-
if (
|
66
|
+
if (mod.predict(f.readline) == mod::CLASS0)
|
46
67
|
ok += 1
|
47
68
|
end
|
48
69
|
count += 1
|
49
70
|
end
|
50
71
|
end
|
51
|
-
puts "#{
|
72
|
+
puts "#{mod::LABELS[0]}: #{ok.to_f / count}"
|
52
73
|
assert ok.to_f / count > 0.9
|
53
|
-
|
74
|
+
|
54
75
|
ok = 0
|
55
76
|
count = 0
|
56
77
|
File.open(@file1) do |f|
|
57
78
|
until f.eof?
|
58
|
-
if (
|
79
|
+
if (mod.predict(f.readline) == mod::CLASS1)
|
59
80
|
ok += 1
|
60
81
|
end
|
61
82
|
count += 1
|
62
83
|
end
|
63
84
|
end
|
64
|
-
puts "#{
|
85
|
+
puts "#{mod::LABELS[1]}: #{ok.to_f / count}"
|
65
86
|
assert ok.to_f / count > 0.9
|
66
87
|
end
|
67
88
|
end
|
68
89
|
|
69
|
-
def
|
70
|
-
|
90
|
+
def gen3(prefix, options)
|
91
|
+
p "---- #{prefix} generate 3class"
|
92
|
+
output_file2 = File.join(Dir.tmpdir, "nekoneko_test2_#{prefix}_classifier.rb")
|
93
|
+
output_file3 = File.join(Dir.tmpdir, "nekoneko_test3_#{prefix}_classifier.rb")
|
94
|
+
|
95
|
+
clean!(output_file2, output_file3)
|
96
|
+
@clean_files << output_file2
|
97
|
+
@clean_files << output_file3
|
71
98
|
|
72
|
-
gen = NekonekoGen::TextClassifierGenerator.new(
|
73
|
-
|
74
|
-
gen.train
|
75
|
-
gen.generate
|
99
|
+
gen = NekonekoGen::TextClassifierGenerator.new(output_file3,
|
100
|
+
[@file0, @file1, @file2], options)
|
101
|
+
gen.train
|
102
|
+
modname = gen.generate
|
76
103
|
|
77
|
-
unless (File.exist?(
|
78
|
-
assert_equal "#{
|
104
|
+
unless (File.exist?(output_file3))
|
105
|
+
assert_equal "#{output_file3} not found", nil
|
79
106
|
end
|
80
107
|
|
81
108
|
begin
|
82
|
-
load
|
83
|
-
|
109
|
+
load output_file3
|
110
|
+
|
111
|
+
mod = Kernel.const_get(modname)
|
84
112
|
ok = 0
|
85
113
|
count = 0
|
86
114
|
File.open(@file0) do |f|
|
87
115
|
until f.eof?
|
88
|
-
if (
|
116
|
+
if (mod.predict(f.readline) == mod::CLASS0)
|
89
117
|
ok += 1
|
90
118
|
end
|
91
119
|
count += 1
|
92
120
|
end
|
93
121
|
end
|
94
|
-
puts "#{
|
122
|
+
puts "#{mod::LABELS[0]}: #{ok.to_f / count}"
|
95
123
|
assert ok.to_f / count > 0.9
|
96
124
|
|
97
125
|
ok = 0
|
98
126
|
count = 0
|
99
127
|
File.open(@file1) do |f|
|
100
128
|
until f.eof?
|
101
|
-
if (
|
129
|
+
if (mod.predict(f.readline) == mod::CLASS1)
|
102
130
|
ok += 1
|
103
131
|
end
|
104
132
|
count += 1
|
105
133
|
end
|
106
134
|
end
|
107
|
-
puts "#{
|
135
|
+
puts "#{mod::LABELS[1]}: #{ok.to_f / count}"
|
108
136
|
assert ok.to_f / count > 0.9
|
109
137
|
|
110
138
|
ok = 0
|
111
139
|
count = 0
|
112
140
|
File.open(@file2) do |f|
|
113
141
|
until f.eof?
|
114
|
-
if (
|
142
|
+
if (mod.predict(f.readline) == mod::CLASS2)
|
115
143
|
ok += 1
|
116
144
|
end
|
117
145
|
count += 1
|
118
146
|
end
|
119
147
|
end
|
120
|
-
puts "#{
|
148
|
+
puts "#{mod::LABELS[2]}: #{ok.to_f / count}"
|
121
149
|
assert ok.to_f / count > 0.9
|
122
150
|
end
|
123
151
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nekoneko_gen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-06-01 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bimyou_segmenter
|
16
|
-
requirement: &
|
16
|
+
requirement: &14306440 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *14306440
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: json
|
27
|
-
requirement: &
|
27
|
+
requirement: &14304220 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *14304220
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: test-unit
|
38
|
-
requirement: &
|
38
|
+
requirement: &14303060 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,7 +43,7 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *14303060
|
47
47
|
description: Japanese Text Classifier Generator
|
48
48
|
email:
|
49
49
|
- nagadomi@nurs.or.jp
|
@@ -60,6 +60,11 @@ files:
|
|
60
60
|
- bin/nekoneko_gen
|
61
61
|
- lib/nekoneko_gen.rb
|
62
62
|
- lib/nekoneko_gen/arow.rb
|
63
|
+
- lib/nekoneko_gen/classifier.rb
|
64
|
+
- lib/nekoneko_gen/classifier_factory.rb
|
65
|
+
- lib/nekoneko_gen/linear_classifier.rb
|
66
|
+
- lib/nekoneko_gen/mlp.rb
|
67
|
+
- lib/nekoneko_gen/pa.rb
|
63
68
|
- lib/nekoneko_gen/text_classifier_generator.rb
|
64
69
|
- lib/nekoneko_gen/version.rb
|
65
70
|
- nekoneko_gen.gemspec
|