nekoneko_gen 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,18 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ *~
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in nekoneko_gen.gemspec
4
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 nagadomi@nurs.or.jp
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,126 @@
1
+ # NekonekoGen
2
+
3
+ Easy to Use Ruby Text Classifier Generator.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'nekoneko_gen'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install nekoneko_gen
18
+
19
+ ## Usage
20
+
21
+ % mkdir data
22
+ % cd data
23
+ % wget -i http://www.udp.jp/misc/2ch_data/index1.txt
24
+ ...
25
+ % cd ..
26
+ % nekoneko_gen -n game_thread_classifier data/dragon_quest.txt data/loveplus.txt
27
+ loading data/dragon_quest.txt... 35.5426s
28
+ loading data/loveplus.txt... 36.0522s
29
+ step 0... 0.879858, 3.7805s
30
+ step 1... 0.919624, 2.2018s
31
+ step 2... 0.932147, 2.1174s
32
+ step 3... 0.940959, 2.0569s
33
+ step 4... 0.946985, 1.8876s
34
+ step 5... 0.950891, 1.8564s
35
+ step 6... 0.953541, 1.8398s
36
+ step 7... 0.955464, 1.8204s
37
+ step 8... 0.957427, 1.8008s
38
+ step 9... 0.959056, 1.7912s
39
+ step 10... 0.961098, 1.8027s
40
+ step 11... 0.961745, 1.7716s
41
+ step 12... 0.962943, 1.7633s
42
+ step 13... 0.963610, 1.7477s
43
+ step 14... 0.964611, 1.6216s
44
+ step 15... 0.965259, 1.7291s
45
+ step 16... 0.965730, 1.7271s
46
+ step 17... 0.966613, 1.7225s
47
+ step 18... 0.967241, 1.5861s
48
+ step 19... 0.967712, 1.7113s
49
+ DRAGON_QUEST, LOVEPLUS : 71573 features
50
+ done nyan!
51
+
52
+ % ls -la
53
+ ...
54
+ -rw-r--r-- 1 ore users 2555555 2012-05-28 08:10 game_thread_classifier.rb
55
+ ...
56
+
57
+ % cat > console.rb
58
+ # coding: utf-8
59
+ if (RUBY_VERSION < '1.9.0')
60
+ $KCODE = 'u'
61
+ end
62
+ require './game_thread_classifier'
63
+
64
+ $stdout.sync = true
65
+ loop do
66
+ print "> "
67
+ line = $stdin.readline
68
+ label = GameThreadClassifier.predict(line)
69
+ puts "#{GameThreadClassifier::LABELS[label]}の話題です!!!"
70
+ end
71
+ ^D
72
+
73
+ % ruby console.rb
74
+ > 彼女からメールが来た
75
+ LOVEPLUSの話題です!!!
76
+ > 日曜日はデートしてました
77
+ LOVEPLUSの話題です!!!
78
+ > 金欲しい
79
+ DRAGON_QUESTの話題です!!!
80
+ > 王様になりたい
81
+ DRAGON_QUESTの話題です!!!
82
+ > スライム
83
+ DRAGON_QUESTの話題です!!!
84
+ > スライムを彼女にプレゼント
85
+ LOVEPLUSの話題です!!!
86
+
87
+ % cat > test.rb
88
+ if (RUBY_VERSION < '1.9.0')
89
+ $KCODE = 'u'
90
+ end
91
+ require './game_thread_classifier'
92
+
93
+ labels = Array.new(GameThreadClassifier.k, 0)
94
+ file = ARGV.shift
95
+ File.open(file) do |f|
96
+ until f.eof?
97
+ l = f.readline.chomp
98
+ label = GameThreadClassifier.predict(l)
99
+ labels[label] += 1
100
+ end
101
+ end
102
+ count = labels.reduce(:+)
103
+ labels.each_with_index do |c, i|
104
+ printf "%16s: %f\n", GameThreadClassifier::LABELS[i], c.to_f / count.to_f
105
+ end
106
+ ^D
107
+
108
+ % ruby test.rb data/dragon_quest_test.txt
109
+ DRAGON_QUEST: 0.932000
110
+ LOVEPLUS: 0.068000
111
+ % ruby test.rb data/loveplus_test.txt
112
+ DRAGON_QUEST: 0.124000
113
+ LOVEPLUS: 0.876000
114
+ % ruby test.rb data/dragon_quest_test2.txt
115
+ DRAGON_QUEST: 0.988000
116
+ LOVEPLUS: 0.012000
117
+ % ruby test.rb data/loveplus_test2.txt
118
+ DRAGON_QUEST: 0.012048
119
+ LOVEPLUS: 0.987952
120
+
121
+
122
+ % nekoneko_gen -n game_thread_classifier data/dragon_quest.txt data/loveplus.txt data/skyrim.txt data/mhf.txt
123
+ ...
124
+ ...
125
+ ...
126
+
data/Rakefile ADDED
@@ -0,0 +1,10 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
3
+
4
+ require 'rake/testtask'
5
# Defines the `test` Rake task. It puts the `test` directory on the load
# path and runs every file matching test/**/*_test.rb, with verbose output
# and Ruby warnings enabled.
Rake::TestTask.new(:test) do |task|
  task.libs.push('test')
  task.test_files = Dir.glob("test/**/*_test.rb")
  task.verbose = true
  task.warning = true
end
data/bin/nekoneko_gen ADDED
@@ -0,0 +1,9 @@
1
+ #!/usr/bin/env ruby
2
+ # coding: utf-8
3
# On Ruby older than 1.9, RubyGems has to be loaded explicitly and $KCODE
# is set to 'u' before the library is required.
if RUBY_VERSION < "1.9.0"
  require 'rubygems'
  $KCODE = 'u'
end
require 'nekoneko_gen'

# Run the command line tool and use its return value as the process
# exit status.
status = NekonekoGen.run(ARGV)
exit(status)
@@ -0,0 +1,71 @@
1
+ module NekonekoGen
2
# Online linear classifier with per-feature confidence (an AROW-style
# update: each weight has its own variance that shrinks as the feature is
# seen).
#
# Feature vectors are Hashes mapping a feature key to a numeric value.
# With k == 2 a single weight vector is kept (label 0 is the positive
# side); otherwise one weight vector per label is trained one-vs-rest.
class Arow
  # Default regularization parameter, used when options[:r] is not given.
  R = 6.0

  # k - number of labels.
  # w - Array of weight Hashes (one entry when k == 2, else k entries).
  attr_accessor :k, :w

  # k       - number of labels.
  # options - Hash; :r overrides the regularization parameter R.
  def initialize(k, options = {})
    @r = options[:r] || R
    @k = k
    models = (@k == 2) ? 1 : @k
    # Unseen features start with variance 1.0 and weight 0.0.
    @cov = Array.new(models) { Hash.new(1.0) }
    @w = Array.new(models) { Hash.new(0.0) }
  end

  # Trains on a single example.
  #
  # vec   - Hash of feature key => value (may be empty).
  # label - Integer label of the example (0...k).
  #
  # Returns the 0/1 loss measured before the update: 0.0 or 1.0 in the
  # binary case, the mean over all k one-vs-rest models otherwise.
  def update(vec, label)
    return update_at(0, vec, label) if @k == 2

    scale = 1.0 / @k
    loss = 0.0
    @k.times do |i|
      loss += update_at(i, vec, label) * scale
    end
    loss
  end

  # Drops weights whose magnitude is not larger than Float::EPSILON from
  # every weight vector (in place).
  #
  # Returns the Array of weight Hashes.
  def strip!
    @w.each do |weights|
      weights.reject! {|_key, value| value.abs <= Float::EPSILON }
    end
    @w
  end

  private

  # Sparse dot product of the feature Hash +vec+ with the weight Hash +w+.
  # Keys for which +w+ yields nil are skipped.
  def dot(vec, w)
    total = 0.0
    vec.each do |key, value|
      weight = w[key]
      total += weight * value if weight
    end
    total
  end

  # Updates model +i+ with one example and returns 1.0 if the model
  # misclassified it before the update (score on the wrong side of zero),
  # else 0.0.
  def update_at(i, vec, label)
    w = @w[i]
    cov = @cov[i]
    y = (label == i) ? 1 : -1
    score = dot(vec, w)
    alpha = 1.0 - y * score
    # Only update when the example is inside the margin.
    if alpha > 0.0
      r_inv = 1.0 / @r
      # Start the sum at 0.0 so that an empty feature vector gives a
      # variance of 0.0 instead of nil.
      var = vec.inject(0.0) {|sum, (key, value)| sum + cov[key] * value * value }
      alpha *= (1.0 / (var + @r)) * y
      vec.each do |key, value|
        w[key] += alpha * cov[key] * value
        cov[key] = 1.0 / ((1.0 / cov[key]) + (value * value * r_inv))
      end
    end
    score * y < 0.0 ? 1.0 : 0.0
  end
end
71
+ end
@@ -0,0 +1,226 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'json'
3
+ require 'bimyou_segmenter'
4
+ require File.expand_path(File.join(File.dirname(__FILE__), 'arow'))
5
+
6
+ module NekonekoGen
7
+ class TextClassifierGenerator
8
+ attr_accessor :quiet
9
# Sets up a generator that trains on +files+ and writes a classifier to
# +filename+.
#
# filename - output path; its sanitized basename, split on "_" and
#            capitalized piece by piece, becomes the generated class name.
# files    - training file paths, one per label; each label name is the
#            upper-cased sanitized basename of its file.
# options  - Hash handed unchanged to Arow.new.
def initialize(filename, files, options = {})
  @filename = filename
  @files = files
  @options = options
  @quiet = false

  # Bidirectional vocabulary: feature string <-> integer id.
  @word2id = {}
  @id2word = {}

  @arow = Arow.new(files.size, options)
  @name = safe_name(@filename).split("_").map {|part| part.capitalize }.join
  @labels = files.map {|file| safe_name(file).upcase }
end
21
+
22
# Loads every training file, then runs +iteration+ passes of online
# training over a shuffled sample in which every label contributes as many
# examples as the smallest label has.
#
# iteration - number of passes (nil falls back to 20).
#
# Progress (load times, per-step accuracy and time, feature counts) is
# reported through puts/print, which are silenced when @quiet is set.
def train(iteration = 20)
  iteration ||= 20

  # One normalized feature vector per line, grouped by label index.
  data = []
  @arow.k.times do |label|
    started = Time.now
    data[label] = []
    print "loading #{@files[label]}... "
    File.open(@files[label]) do |io|
      until io.eof?
        features = fv(io.readline.chomp)
        data[label] << normalize(features) if features.size > 0
      end
    end
    puts sprintf("%.4fs", Time.now - started)
  end

  samples = data.map {|vectors| vectors.size }.min
  iteration.times do |step|
    started = Time.now
    print sprintf("step %3d...", step)

    # Draw the per-label samples, tag each with its label, then shuffle.
    batch = []
    @arow.k.times do |label|
      sampling(data[label], samples).each {|vec| batch << [vec, label] }
    end
    batch.shuffle!

    loss = 0.0
    batch.each do |vec, label|
      loss += @arow.update(vec, label)
    end
    print sprintf(" %.6f, %.4fs\n", 1.0 - loss / batch.size.to_f, Time.now - started)
  end
  @arow.strip!

  if @arow.k > 2
    @arow.w.each_with_index do |weights, label|
      puts "#{@labels[label]} : #{weights.size} features"
    end
  else
    puts "#{@labels[0]}, #{@labels[1]} : #{@arow.w[0].size} features"
  end
  puts "done nyan! "
end
65
# Writes the trained model to @filename as a standalone Ruby class named
# @name.
#
# The generated class exposes:
#   predict(text) - index of the predicted label,
#   k             - number of labels,
#   LABELS        - label names, plus one integer constant per label.
# The weights are embedded as a JSON string keyed by feature string.
#
# The generated predict sums weights starting from 0.0, so a text in which
# no token (not even the bias) has a stored weight scores 0.0 instead of
# raising on nil.
def generate
  # Translate feature ids back to their strings for the embedded model.
  wv = @arow.w.map do |w|
    w.reduce({}) {|h, kv| h[id2word(kv[0])] = kv[1]; h }
  end
  labels = @labels.each_with_index.map {|v, i| " #{v} = #{i}" }.join("\n")

  File.open(@filename, "w") do |f|
    f.write <<MODEL
# -*- coding: utf-8 -*-
require 'rubygems'
require 'json'
require 'bimyou_segmenter'

class #{@name}
  def self.predict(text)
    prev = nil
    vec = BimyouSegmenter.segment(text).map do |word|
      if (prev)
        if (NGRAM_TARGET =~ word)
          nword = [prev + word, word]
          prev = word
          nword
        else
          prev = nil
          word
        end
      else
        if (NGRAM_TARGET =~ word)
          prev = word
        end
        word
      end
    end.flatten(1)
    vec << " bias "
    if (W.size == 1)
      W[0].values_at(*vec).compact.reduce(0.0, :+) > 0.0 ? 0 : 1
    else
      W.each_with_index.map {|w,i|
        [w.values_at(*vec).compact.reduce(0.0, :+), i]
      }.max.pop
    end
  end
  def self.k
    W.size == 1 ? 2 : W.size
  end
#{labels}
  LABELS = #{@labels.inspect}

  private
  NGRAM_TARGET = Regexp.new('(^[ァ-ヾ]+$)|(^[a-zA-Z\\-_a-zA-Z‐_0-90-9]+$)|' +
                            '(^[々〇ヵヶ' + [0x3400].pack('U') + '-' + [0x9FFF].pack('U') +
                            [0xF900].pack('U') + '-' + [0xFAFF].pack('U') +
                            [0x20000].pack('U') + '-' + [0x2FFFF].pack('U') + ']+$)')
  W = JSON.load(#{wv.to_json.inspect})
end
MODEL
  end
end
124
+
125
+ private
126
# Looks up the feature string registered for the integer +id+.
# Returns nil when the id has not been assigned by word2id.
def id2word(id)
  @id2word[id]
end
129
# Returns the integer id of +word+, registering it on first sight.
#
# New ids are handed out sequentially (the current vocabulary size), and
# the reverse mapping in @id2word is recorded at the same time.
def word2id(word)
  known = @word2id[word]
  return known if known

  fresh = @word2id.size
  @word2id[word] = fresh
  @id2word[fresh] = word
  fresh
end
139
# Builds the sparse feature vector (feature id => count) for one line of
# text.
#
# Features are:
#   * a constant " bias " feature with value 1,
#   * every token returned by BimyouSegmenter.segment,
#   * for two consecutive tokens that both match NGRAM_TARGET, their
#     concatenation (emitted just before the second token).
# Features listed in STOP_WORDS are dropped. Ids are assigned through
# word2id in the order the features are encountered.
def fv(text)
  vec = Hash.new(0)
  vec[word2id(" bias ")] = 1

  prev = nil
  BimyouSegmenter.segment(text, :white_space => true).each do |word|
    features = [word]
    if NGRAM_TARGET =~ word
      # Chain with the previous n-gram candidate, if there is one.
      features.unshift(prev + word) if prev
      prev = word
    else
      prev = nil
    end

    features.each do |feature|
      vec[word2id(feature)] += 1 unless STOP_WORDS[feature]
    end
  end
  vec
end
167
# Scales +vec+ (feature id => value) in place to unit Euclidean length.
# A vector whose length is not greater than zero is left untouched.
#
# Returns the same Hash that was passed in.
def normalize(vec)
  squares = 0
  vec.each_value {|value| squares = squares + value * value }
  length = Math.sqrt(squares)
  return vec unless length > 0.0

  scale = 1.0 / length
  vec.each do |key, value|
    vec[key] = value * scale
  end
  vec
end
177
# Returns a sample of +a+ aimed at +n+ elements: over-samples when +a+ is
# shorter than +n+, otherwise under-samples.
def sampling(a, n)
  a.size < n ? over_sampling(a, n) : under_sampling(a, n)
end
184
# Grows +a+ to exactly +n+ elements by repeating its own elements.
#
# The result is as many whole copies of +a+ as fit into +n+, followed by a
# random selection of distinct elements for the remainder, so elements are
# duplicated as evenly as possible. For n <= 2 * a.size this is +a+ plus
# (n - a.size) distinct random elements. Larger +n+ needs the extra whole
# copies because Array#sample never returns more elements than the array
# holds.
#
# a - Array to sample from.
# n - target size; must not be smaller than a.size.
#
# Returns +a+ itself when it already has +n+ elements or is empty (there
# is nothing to repeat), otherwise a new Array of +n+ elements.
# Raises ArgumentError when +a+ has more than +n+ elements.
def over_sampling(a, n)
  return a if a.size == n || a.empty?
  if a.size > n
    raise ArgumentError, "cannot over-sample #{a.size} elements to #{n}"
  end

  copies, rest = n.divmod(a.size)
  # Array#sample is missing on Ruby 1.8.7; fall back to shuffling.
  extra = a.respond_to?(:sample) ? a.sample(rest) : a.shuffle[0, rest]
  a * copies + extra
end
195
# Picks +n+ random distinct elements of +a+.
#
# Returns +a+ itself when it already has exactly +n+ elements. Uses
# Array#sample when available and a shuffled prefix otherwise.
def under_sampling(a, n)
  return a if a.size == n
  return a.sample(n) if a.respond_to?(:sample)

  a.shuffle[0, n]
end
206
# Derives an identifier-friendly name from a file path: takes the basename
# without its last extension, turns "-" into "_", and removes every
# character other than ASCII letters, digits and "_".
def safe_name(filename)
  base = File.basename(filename, ".*")
  underscored = base.tr('-', '_')
  underscored.gsub(/[^a-zA-Z_0-9]/, '')
end
209
# Writes +s+ with Kernel.puts unless the generator is in quiet mode.
def puts(s)
  Kernel.puts(s) unless @quiet
end
214
# Writes +s+ with Kernel.print unless the generator is in quiet mode.
def print(s)
  Kernel.print(s) unless @quiet
end
219
+ NGRAM_TARGET = Regexp.new('(^[ァ-ヾ]+$)|(^[a-zA-Z\-_a-zA-Z‐_0-90-9]+$)|' +
220
+ '(^[々〇ヵヶ' + [0x3400].pack('U') + '-' + [0x9FFF].pack('U') +
221
+ [0xF900].pack('U') + '-' + [0xFAFF].pack('U') +
222
+ [0x20000].pack('U') + '-' + [0x2FFFF].pack('U') + ']+$)')
223
+ WORD_COUNT_THRESH = 2
224
+ STOP_WORDS = {"の"=>1, "に"=>1, "て"=>1, "が"=>1, "た"=>1, "は"=>1, "で"=>1, "を"=>1, "と"=>1, "か"=>1, "も"=>1, "ない"=>1, "だ"=>1, "な"=>1, "です"=>1, "から"=>1, "ます"=>1, "う"=>1, "けど"=>1, "って"=>1, "ば"=>1, "よ"=>1, "まし"=>1, "たら"=>1, "ね"=>1, "ん"=>1, "なら"=>1, "でしょ"=>1, "とか"=>1, "じゃ"=>1, "まで"=>1, "ので"=>1, "ませ"=>1, "だけ"=>1, "へ"=>1, "なく"=>1, "という"=>1, "や"=>1, "でも"=>1, "ござい"=>1, "し"=>1, "たい"=>1, "だろ"=>1, "なかっ"=>1, "ある"=>1, "ず"=>1, "たり"=>1, "だっ"=>1, "しか"=>1, "くらい"=>1, "かも"=>1, "ながら"=>1, "でし"=>1, "また"=>1, "より"=>1, "のに"=>1, "わ"=>1, "など"=>1, "として"=>1, "ぬ"=>1, "あっ"=>1, "らしい"=>1, "ばかり"=>1, "ほど"=>1, "ぞ"=>1, "しかし"=>1, "なけれ"=>1, "ただ"=>1, "つ"=>1, "けれども"=>1, "んで"=>1, "ぐらい"=>1, "なんて"=>1, "について"=>1, "そうして"=>1, "ましょ"=>1, "さえ"=>1, "のみ"=>1, "たく"=>1, "あり"=>1, "る"=>1, "なんか"=>1, "べき"=>1, "だって"=>1, "それとも"=>1, "ちゃ"=>1, "なぁ"=>1, "それから"=>1, "さ"=>1, "ぜ"=>1, "によって"=>1, "ねえ"=>1, "っけ"=>1, "やら"=>1, "だから"=>1, "とも"=>1, "いや"=>1, "なり"=>1, "それでも"=>1, "なあ"=>1, "まい"=>1, "つつ"=>1, "そして"=>1, "それで"=>1, "かい"=>1, "すると"=>1, "しかも"=>1, "あろ"=>1, "らしく"=>1, "ずつ"=>1, "り"=>1, "たる"=>1, "又"=>1, "ねぇ"=>1, "に対して"=>1, "け"=>1, "こそ"=>1, "もしくは"=>1, "なきゃ"=>1, "だら"=>1, "そこで"=>1, "すら"=>1, "実は"=>1, "ところが"=>1, "なる"=>1, "による"=>1, "御座い"=>1, "じゃん"=>1, "つまり"=>1, "けれど"=>1, "ただし"=>1, "だの"=>1, "たかっ"=>1, "ざる"=>1, "ごとく"=>1, "に対する"=>1, "とかいう"=>1, "かしら"=>1, "なくっ"=>1, "そりゃ"=>1, "または"=>1, "べ"=>1, "にて"=>1, "において"=>1, "たろ"=>1, "無い"=>1, "あれ"=>1, "なぞ"=>1, "っと"=>1, "き"=>1, "にとって"=>1, "たって"=>1, "じ"=>1, "あるいは"=>1, "ど"=>1, "っす"=>1, "だり"=>1, "又は"=>1, "ばっかり"=>1, "てか"=>1, "けども"=>1, "と共に"=>1, "れ"=>1, "なかろ"=>1, "なお"=>1, "ものの"=>1, "に関する"=>1, "ばっか"=>1, "こうして"=>1, "程"=>1, "べし"=>1, "たとえば"=>1, "ども"=>1, "一方"=>1, "それでは"=>1, "かつ"=>1, "やし"=>1, "だけど"=>1, "なんぞ"=>1, "べく"=>1, "迄"=>1, "如く"=>1, "ってか"=>1, "すなわち"=>1, "さて"=>1, "どころか"=>1, "では"=>1, "を以て"=>1, "かぁ"=>1, "のう"=>1, "らしかっ"=>1, "そしたら"=>1, "にゃ"=>1, "まじ"=>1, "るる"=>1, "らし"=>1, "やん"=>1, "たけれ"=>1, "らしき"=>1, "しも"=>1, "べから"=>1, "或いは"=>1, "及び"=>1, "だが"=>1, "ごとき"=>1, "なし"=>1, "如き"=>1, "ねん"=>1, "但し"=>1, "ござる"=>1, "いえ"=>1, "故に"=>1, "即ち"=>1, 
"やっ"=>1, "なき"=>1, "無かっ"=>1, "なけりゃ"=>1, "即"=>1, "よって"=>1, "或は"=>1, "および"=>1, "尚"=>1, "否"=>1, "じゃろ"=>1, "っしょ"=>1, "尤も"=>1, "だに"=>1, "やす"=>1, "ござん"=>1, "ついで"=>1, "へん"=>1, "じゃっ"=>1, "わい"=>1, "次に"=>1, "之"=>1, "ける"=>1, "然し"=>1, "もっとも"=>1, "そうしたら"=>1, "無く"=>1, "やろ"=>1, "亦"=>1, "っし"=>1, "に対し"=>1, "乃至"=>1, "なれ"=>1, "御座る"=>1, "御座ん"=>1, "とう"=>1, "てえ"=>1, "但"=>1, "どし"=>1, "ざり"=>1, "といふ"=>1, "たれ"=>1, "したら"=>1, "もん"=>1, "やせ"=>1, "たくっ"=>1, "若しくは"=>1, "ずん"=>1, "あら"=>1, "ざれ"=>1, "無かろ"=>1, "無けれ"=>1, "ごとし"=>1, "たきゃ"=>1, "どす"=>1, "けり"=>1, "まじき"=>1, "ますれ"=>1, "たき"=>1, "てん"=>1, "たけりゃ"=>1, "無き"=>1, "無"=>1, "如し"=>1, "あん"=>1, "御座っ"=>1, "ありゃ"=>1, "かな"=>1, "ばかし"=>1}
225
+ end
226
+ end