nekoneko_gen 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,18 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ *~
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in nekoneko_gen.gemspec
4
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 nagadomi@nurs.or.jp
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,126 @@
1
+ # NekonekoGen
2
+
3
+ Easy to Use Ruby Text Classifier Generator.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'nekoneko_gen'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install nekoneko_gen
18
+
19
+ ## Usage
20
+
21
+ % mkdir data
22
+ % cd data
23
+ % wget -i http://www.udp.jp/misc/2ch_data/index1.txt
24
+ ...
25
+ % cd ..
26
+ % nekoneko_gen -n game_thread_classifier data/dragon_quest.txt data/loveplus.txt
27
+ loading data/dragon_quest.txt... 35.5426s
28
+ loading data/loveplus.txt... 36.0522s
29
+ step 0... 0.879858, 3.7805s
30
+ step 1... 0.919624, 2.2018s
31
+ step 2... 0.932147, 2.1174s
32
+ step 3... 0.940959, 2.0569s
33
+ step 4... 0.946985, 1.8876s
34
+ step 5... 0.950891, 1.8564s
35
+ step 6... 0.953541, 1.8398s
36
+ step 7... 0.955464, 1.8204s
37
+ step 8... 0.957427, 1.8008s
38
+ step 9... 0.959056, 1.7912s
39
+ step 10... 0.961098, 1.8027s
40
+ step 11... 0.961745, 1.7716s
41
+ step 12... 0.962943, 1.7633s
42
+ step 13... 0.963610, 1.7477s
43
+ step 14... 0.964611, 1.6216s
44
+ step 15... 0.965259, 1.7291s
45
+ step 16... 0.965730, 1.7271s
46
+ step 17... 0.966613, 1.7225s
47
+ step 18... 0.967241, 1.5861s
48
+ step 19... 0.967712, 1.7113s
49
+ DRAGON_QUEST, LOVEPLUS : 71573 features
50
+ done nyan!
51
+
52
+ % ls -la
53
+ ...
54
+ -rw-r--r-- 1 ore users 2555555 2012-05-28 08:10 game_thread_classifier.rb
55
+ ...
56
+
57
+ % cat > console.rb
58
+ # coding: utf-8
59
+ if (RUBY_VERSION < '1.9.0')
60
+ $KCODE = 'u'
61
+ end
62
+ require './game_thread_classifier'
63
+
64
+ $stdout.sync = true
65
+ loop do
66
+ print "> "
67
+ line = $stdin.readline
68
+ label = GameThreadClassifier.predict(line)
69
+ puts "#{GameThreadClassifier::LABELS[label]}の話題です!!!"
70
+ end
71
+ ^D
72
+
73
+ % ruby console.rb
74
+ > 彼女からメールが来た
75
+ LOVEPLUSの話題です!!!
76
+ > 日曜日はデートしてました
77
+ LOVEPLUSの話題です!!!
78
+ > 金欲しい
79
+ DRAGON_QUESTの話題です!!!
80
+ > 王様になりたい
81
+ DRAGON_QUESTの話題です!!!
82
+ > スライム
83
+ DRAGON_QUESTの話題です!!!
84
+ > スライムを彼女にプレゼント
85
+ LOVEPLUSの話題です!!!
86
+
87
+ % cat > test.rb
88
+ if (RUBY_VERSION < '1.9.0')
89
+ $KCODE = 'u'
90
+ end
91
+ require './game_thread_classifier'
92
+
93
+ labels = Array.new(GameThreadClassifier.k, 0)
94
+ file = ARGV.shift
95
+ File.open(file) do |f|
96
+ until f.eof?
97
+ l = f.readline.chomp
98
+ label = GameThreadClassifier.predict(l)
99
+ labels[label] += 1
100
+ end
101
+ end
102
+ count = labels.reduce(:+)
103
+ labels.each_with_index do |c, i|
104
+ printf "%16s: %f\n", GameThreadClassifier::LABELS[i], c.to_f / count.to_f
105
+ end
106
+ ^D
107
+
108
+ % ruby test.rb data/dragon_quest_test.txt
109
+ DRAGON_QUEST: 0.932000
110
+ LOVEPLUS: 0.068000
111
+ % ruby test.rb data/loveplus_test.txt
112
+ DRAGON_QUEST: 0.124000
113
+ LOVEPLUS: 0.876000
114
+ % ruby test.rb data/dragon_quest_test2.txt
115
+ DRAGON_QUEST: 0.988000
116
+ LOVEPLUS: 0.012000
117
+ % ruby test.rb data/loveplus_test2.txt
118
+ DRAGON_QUEST: 0.012048
119
+ LOVEPLUS: 0.987952
120
+
121
+
122
+ % nekoneko_gen -n game_thread_classifier data/dragon_quest.txt data/loveplus.txt data/skyrim.txt data/mhf.txt
123
+ ...
124
+ ...
125
+ ...
126
+
data/Rakefile ADDED
@@ -0,0 +1,10 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
3
+
4
+ require 'rake/testtask'
5
+ Rake::TestTask.new(:test) do |test|
6
+ test.libs << 'test'
7
+ test.test_files = Dir.glob("test/**/*_test.rb")
8
+ test.verbose = true
9
+ test.warning = true
10
+ end
data/bin/nekoneko_gen ADDED
@@ -0,0 +1,9 @@
1
#!/usr/bin/env ruby
# coding: utf-8
# Command-line launcher: hands ARGV to NekonekoGen.run and exits with
# whatever status it returns.

# Rubies older than 1.9 need rubygems loaded explicitly and $KCODE set so
# that strings are treated as UTF-8.
legacy_ruby = RUBY_VERSION < "1.9.0"
if legacy_ruby
  require 'rubygems'
  $KCODE = 'u'
end
require 'nekoneko_gen'

status = NekonekoGen.run(ARGV)
exit(status)
@@ -0,0 +1,71 @@
1
module NekonekoGen
  # Online linear classifier that keeps, per model, a sparse weight vector and
  # a sparse per-feature confidence ("covariance") value. The update rule has
  # the shape of the diagonal AROW algorithm (presumably what the class name
  # refers to).
  #
  # For k == 2 a single model is trained (label 0 is the positive class).
  # For any other k, one model per label is trained one-vs-rest.
  class Arow
    # Default regularization constant used when options[:r] is not given.
    R = 6.0

    # k: number of labels; w: array of weight hashes (feature id => weight).
    attr_accessor :k, :w

    # k       - number of labels.
    # options - Hash; :r overrides the regularization constant R.
    def initialize(k, options = {})
      @r = options[:r] || R
      @k = k
      # The binary case needs only one model; its sign decides the label.
      models = @k == 2 ? 1 : @k
      @cov = Array.new(models) { Hash.new(1.0) }
      @w = Array.new(models) { Hash.new(0.0) }
    end

    # Feeds one training example to the model(s).
    #
    # vec   - enumerable of [feature_id, value] pairs (typically a Hash).
    # label - Integer label of the example.
    #
    # Returns the 0/1 loss observed *before* the update; for k != 2 this is
    # the mean loss over the k one-vs-rest models.
    def update(vec, label)
      return update_at(0, vec, label) if @k == 2

      scale = 1.0 / @k
      @k.times.reduce(0.0) do |loss, i|
        loss + update_at(i, vec, label) * scale
      end
    end

    # Drops weights whose magnitude is at most Float::EPSILON from every
    # model, in place. Returns the array of weight hashes.
    def strip!
      @w.each do |weights|
        weights.reject! { |_feature, value| value.abs <= Float::EPSILON }
      end
      @w
    end

    private

    # Sparse dot product of vec with the weight hash w. Features for which w
    # yields nil (possible if w was replaced by a plain Hash) are skipped.
    def dot(vec, w)
      sum = 0.0
      vec.each do |k, v|
        weight = w[k]
        sum += weight * v if weight
      end
      sum
    end

    # Updates model i with one example and returns 1.0 if the model's score
    # had the wrong sign for this example before the update, else 0.0.
    def update_at(i, vec, label)
      w = @w[i]
      cov = @cov[i]
      y = label == i ? 1 : -1
      score = dot(vec, w)
      hinge = 1.0 - y * score
      # Only examples inside the margin (hinge loss > 0) change the model.
      if hinge > 0.0
        r_inv = 1.0 / @r
        # Seed with 0.0 so an empty vec gives 0.0 instead of nil, which used
        # to raise NoMethodError on `var + @r`.
        var = vec.reduce(0.0) { |sum, (k, v)| sum + cov[k] * v * v }
        alpha = hinge * (1.0 / (var + @r)) * y
        vec.each do |k, v|
          w[k] += alpha * cov[k] * v
          # Shrink the confidence value of every feature that was just seen.
          cov[k] = 1.0 / ((1.0 / cov[k]) + (v * v * r_inv))
        end
      end
      score * y < 0.0 ? 1.0 : 0.0
    end
  end
end
@@ -0,0 +1,226 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'json'
3
+ require 'bimyou_segmenter'
4
+ require File.expand_path(File.join(File.dirname(__FILE__), 'arow'))
5
+
6
module NekonekoGen
  # Trains an Arow model from one text file per label (one document per line)
  # and writes a self-contained Ruby source file defining a classifier class
  # with the learned weights embedded as JSON.
  class TextClassifierGenerator
    # When true, progress output from #train is suppressed.
    attr_accessor :quiet

    # filename - path of the Ruby file #generate will write; its sanitized
    #            basename, CamelCased, becomes the generated class name.
    # files    - training file paths; the sanitized, upcased basename of each
    #            becomes a label constant, and its index the label id.
    # options  - Hash passed through to Arow.new.
    def initialize(filename, files, options = {})
      @quiet = false
      @options = options
      @filename = filename
      @files = files
      @word2id = {}
      @id2word = {}
      @arow = Arow.new(files.size, options)

      @name = safe_name(@filename).split("_").map(&:capitalize).join
      @labels = files.map { |file| safe_name(file).upcase }
    end

    # Loads every training file, then runs `iteration` passes over a shuffled,
    # class-balanced sample, printing the pre-update accuracy of each pass.
    #
    # iteration - number of passes; nil falls back to 20.
    def train(iteration = 20)
      iteration ||= 20
      data = []
      @arow.k.times do |i|
        t = Time.now
        data[i] = []
        print "loading #{@files[i]}... "
        File.open(@files[i]) do |f|
          f.each_line do |line|
            vec = fv(line.chomp)
            data[i] << normalize(vec) if vec.size > 0
          end
        end
        puts sprintf("%.4fs", Time.now - t)
      end
      # Every class contributes as many examples as the smallest class has.
      samples = data.map { |v| v.size }.min
      iteration.times do |step|
        loss = 0.0
        c = 0
        t = Time.now
        print sprintf("step %3d...", step)

        batch = @arow.k.times.map do |i|
          sampling(data[i], samples).map { |vec| [vec, i] }
        end.flatten(1).shuffle!
        batch.each do |vec, label|
          loss += @arow.update(vec, label)
          c += 1
        end
        print sprintf(" %.6f, %.4fs\n", 1.0 - loss / c.to_f, Time.now - t)
      end
      @arow.strip!

      if (@arow.k > 2)
        @arow.w.each_with_index do |w, i|
          puts "#{@labels[i]} : #{w.size} features"
        end
      else
        puts "#{@labels[0]}, #{@labels[1]} : #{@arow.w[0].size} features"
      end
      puts "done nyan! "
    end

    # Writes the classifier source to @filename. The generated class exposes
    # .predict(text) (returns a label index), .k, one constant per label and
    # LABELS.
    #
    # In the generated predict, scores are summed starting from 0.0 so that
    # input with no known feature yields a score of 0.0 instead of nil (which
    # used to raise inside predict).
    def generate
      # Translate feature ids back to the words the generated code looks up.
      wv = @arow.w.map {|w|
        w.reduce({}) {|h, kv| h[id2word(kv[0])] = kv[1]; h }
      }
      labels = @labels.each_with_index.map{|v, i| " #{v} = #{i}"}.join("\n")

      File.open(@filename, "w") do |f|
        f.write <<MODEL
# -*- coding: utf-8 -*-
require 'rubygems'
require 'json'
require 'bimyou_segmenter'

class #{@name}
  def self.predict(text)
    prev = nil
    vec = BimyouSegmenter.segment(text).map do |word|
      if (prev)
        if (NGRAM_TARGET =~ word)
          nword = [prev + word, word]
          prev = word
          nword
        else
          prev = nil
          word
        end
      else
        if (NGRAM_TARGET =~ word)
          prev = word
        end
        word
      end
    end.flatten(1)
    vec << " bias "
    if (W.size == 1)
      W[0].values_at(*vec).compact.reduce(0.0, :+) > 0.0 ? 0 : 1
    else
      W.each_with_index.map {|w,i|
        [w.values_at(*vec).compact.reduce(0.0, :+), i]
      }.max.pop
    end
  end
  def self.k
    W.size == 1 ? 2 : W.size
  end
  #{labels}
  LABELS = #{@labels.inspect}

  private
  NGRAM_TARGET = Regexp.new('(^[ァ-ヾ]+$)|(^[a-zA-Z\\-_a-zA-Z‐_0-90-9]+$)|' +
'(^[々〇ヵヶ' + [0x3400].pack('U') + '-' + [0x9FFF].pack('U') +
[0xF900].pack('U') + '-' + [0xFAFF].pack('U') +
[0x20000].pack('U') + '-' + [0x2FFFF].pack('U') + ']+$)')
  W = JSON.load(#{wv.to_json.inspect})
end
MODEL
      end
    end

    private

    # Word for a feature id, or nil if the id was never assigned.
    def id2word(id)
      @id2word[id]
    end

    # Feature id for word, assigning the next free id on first sight.
    def word2id(word)
      if (word_id = @word2id[word])
        word_id
      else
        word_id = @word2id.size
        @word2id[word] = word_id
        @id2word[word_id] = word
        word_id
      end
    end

    # Builds the raw feature vector (feature id => count) for one document.
    # Features are the segmenter's tokens, a bigram for each pair of adjacent
    # tokens that both match NGRAM_TARGET, and a constant " bias " feature;
    # tokens listed in STOP_WORDS are dropped.
    def fv(text)
      vec = Hash.new(0)
      vec[word2id(" bias ")] = 1

      prev = nil
      BimyouSegmenter.segment(text, :white_space => true).map do |word|
        if (prev)
          if (NGRAM_TARGET =~ word)
            nword = [prev + word, word]
            prev = word
            nword
          else
            prev = nil
            word
          end
        else
          if (NGRAM_TARGET =~ word)
            prev = word
          end
          word
        end
      end.flatten(1).reject do |word|
        STOP_WORDS[word]
      end.each do |word|
        vec[word2id(word)] += 1
      end
      vec
    end

    # Scales vec in place to unit Euclidean length (left untouched when its
    # norm is zero) and returns it.
    def normalize(vec)
      norm = Math.sqrt(vec.each_value.reduce(0){|a, v| a + v * v })
      if (norm > 0.0)
        s = 1.0 / norm
        vec.each do |k, v|
          vec[k] = v * s
        end
      end
      vec
    end

    # Returns n examples drawn from a, growing or shrinking it as needed.
    def sampling(a, n)
      if (a.size < n)
        over_sampling(a, n)
      else
        under_sampling(a, n)
      end
    end

    # Pads a up to n elements with random repeats of its own elements.
    # Repeats are drawn with replacement so the result reaches n even when
    # n is more than twice a.size (Array#sample(count) never returns more
    # than a.size elements). An empty or already large enough a is returned
    # as is.
    def over_sampling(a, n)
      return a if a.empty? || a.size >= n

      a + Array.new(n - a.size) { a[rand(a.size)] }
    end

    # Picks n random elements of a without replacement.
    def under_sampling(a, n)
      if (a.size == n)
        a
      else
        if (a.respond_to?(:sample))
          a.sample(n)
        else
          # Array#sample is missing on old Rubies.
          a.shuffle[0, n]
        end
      end
    end

    # Basename without extension, "-" turned into "_", and everything outside
    # [a-zA-Z_0-9] removed.
    def safe_name(filename)
      File.basename(filename, ".*").gsub('-','_').gsub(/[^a-zA-Z_0-9]/, '')
    end

    # Kernel#puts unless quiet.
    def puts(s)
      unless (@quiet)
        Kernel.puts s
      end
    end

    # Kernel#print unless quiet.
    def print(s)
      unless (@quiet)
        Kernel.print s
      end
    end

    # Tokens made up entirely of katakana, of alphanumerics/hyphen/underscore,
    # or of kanji; adjacent tokens matching this are also joined into bigrams.
    # NOTE(review): the repeated a-zA-Z and 0-9 ranges look like they were
    # meant to be the full-width forms -- confirm against the upstream source.
    NGRAM_TARGET = Regexp.new('(^[ァ-ヾ]+$)|(^[a-zA-Z\-_a-zA-Z‐_0-90-9]+$)|' +
                              '(^[々〇ヵヶ' + [0x3400].pack('U') + '-' + [0x9FFF].pack('U') +
                              [0xF900].pack('U') + '-' + [0xFAFF].pack('U') +
                              [0x20000].pack('U') + '-' + [0x2FFFF].pack('U') + ']+$)')
    # Not referenced anywhere in this class.
    WORD_COUNT_THRESH = 2
    # Tokens excluded from training features (looked up as hash keys).
    STOP_WORDS = {
      "の"=>1, "に"=>1, "て"=>1, "が"=>1, "た"=>1, "は"=>1, "で"=>1, "を"=>1, "と"=>1, "か"=>1,
      "も"=>1, "ない"=>1, "だ"=>1, "な"=>1, "です"=>1, "から"=>1, "ます"=>1, "う"=>1, "けど"=>1, "って"=>1,
      "ば"=>1, "よ"=>1, "まし"=>1, "たら"=>1, "ね"=>1, "ん"=>1, "なら"=>1, "でしょ"=>1, "とか"=>1, "じゃ"=>1,
      "まで"=>1, "ので"=>1, "ませ"=>1, "だけ"=>1, "へ"=>1, "なく"=>1, "という"=>1, "や"=>1, "でも"=>1, "ござい"=>1,
      "し"=>1, "たい"=>1, "だろ"=>1, "なかっ"=>1, "ある"=>1, "ず"=>1, "たり"=>1, "だっ"=>1, "しか"=>1, "くらい"=>1,
      "かも"=>1, "ながら"=>1, "でし"=>1, "また"=>1, "より"=>1, "のに"=>1, "わ"=>1, "など"=>1, "として"=>1, "ぬ"=>1,
      "あっ"=>1, "らしい"=>1, "ばかり"=>1, "ほど"=>1, "ぞ"=>1, "しかし"=>1, "なけれ"=>1, "ただ"=>1, "つ"=>1, "けれども"=>1,
      "んで"=>1, "ぐらい"=>1, "なんて"=>1, "について"=>1, "そうして"=>1, "ましょ"=>1, "さえ"=>1, "のみ"=>1, "たく"=>1, "あり"=>1,
      "る"=>1, "なんか"=>1, "べき"=>1, "だって"=>1, "それとも"=>1, "ちゃ"=>1, "なぁ"=>1, "それから"=>1, "さ"=>1, "ぜ"=>1,
      "によって"=>1, "ねえ"=>1, "っけ"=>1, "やら"=>1, "だから"=>1, "とも"=>1, "いや"=>1, "なり"=>1, "それでも"=>1, "なあ"=>1,
      "まい"=>1, "つつ"=>1, "そして"=>1, "それで"=>1, "かい"=>1, "すると"=>1, "しかも"=>1, "あろ"=>1, "らしく"=>1, "ずつ"=>1,
      "り"=>1, "たる"=>1, "又"=>1, "ねぇ"=>1, "に対して"=>1, "け"=>1, "こそ"=>1, "もしくは"=>1, "なきゃ"=>1, "だら"=>1,
      "そこで"=>1, "すら"=>1, "実は"=>1, "ところが"=>1, "なる"=>1, "による"=>1, "御座い"=>1, "じゃん"=>1, "つまり"=>1, "けれど"=>1,
      "ただし"=>1, "だの"=>1, "たかっ"=>1, "ざる"=>1, "ごとく"=>1, "に対する"=>1, "とかいう"=>1, "かしら"=>1, "なくっ"=>1, "そりゃ"=>1,
      "または"=>1, "べ"=>1, "にて"=>1, "において"=>1, "たろ"=>1, "無い"=>1, "あれ"=>1, "なぞ"=>1, "っと"=>1, "き"=>1,
      "にとって"=>1, "たって"=>1, "じ"=>1, "あるいは"=>1, "ど"=>1, "っす"=>1, "だり"=>1, "又は"=>1, "ばっかり"=>1, "てか"=>1,
      "けども"=>1, "と共に"=>1, "れ"=>1, "なかろ"=>1, "なお"=>1, "ものの"=>1, "に関する"=>1, "ばっか"=>1, "こうして"=>1, "程"=>1,
      "べし"=>1, "たとえば"=>1, "ども"=>1, "一方"=>1, "それでは"=>1, "かつ"=>1, "やし"=>1, "だけど"=>1, "なんぞ"=>1, "べく"=>1,
      "迄"=>1, "如く"=>1, "ってか"=>1, "すなわち"=>1, "さて"=>1, "どころか"=>1, "では"=>1, "を以て"=>1, "かぁ"=>1, "のう"=>1,
      "らしかっ"=>1, "そしたら"=>1, "にゃ"=>1, "まじ"=>1, "るる"=>1, "らし"=>1, "やん"=>1, "たけれ"=>1, "らしき"=>1, "しも"=>1,
      "べから"=>1, "或いは"=>1, "及び"=>1, "だが"=>1, "ごとき"=>1, "なし"=>1, "如き"=>1, "ねん"=>1, "但し"=>1, "ござる"=>1,
      "いえ"=>1, "故に"=>1, "即ち"=>1,
      "やっ"=>1, "なき"=>1, "無かっ"=>1, "なけりゃ"=>1, "即"=>1, "よって"=>1, "或は"=>1, "および"=>1, "尚"=>1, "否"=>1,
      "じゃろ"=>1, "っしょ"=>1, "尤も"=>1, "だに"=>1, "やす"=>1, "ござん"=>1, "ついで"=>1, "へん"=>1, "じゃっ"=>1, "わい"=>1,
      "次に"=>1, "之"=>1, "ける"=>1, "然し"=>1, "もっとも"=>1, "そうしたら"=>1, "無く"=>1, "やろ"=>1, "亦"=>1, "っし"=>1,
      "に対し"=>1, "乃至"=>1, "なれ"=>1, "御座る"=>1, "御座ん"=>1, "とう"=>1, "てえ"=>1, "但"=>1, "どし"=>1, "ざり"=>1,
      "といふ"=>1, "たれ"=>1, "したら"=>1, "もん"=>1, "やせ"=>1, "たくっ"=>1, "若しくは"=>1, "ずん"=>1, "あら"=>1, "ざれ"=>1,
      "無かろ"=>1, "無けれ"=>1, "ごとし"=>1, "たきゃ"=>1, "どす"=>1, "けり"=>1, "まじき"=>1, "ますれ"=>1, "たき"=>1, "てん"=>1,
      "たけりゃ"=>1, "無き"=>1, "無"=>1, "如し"=>1, "あん"=>1, "御座っ"=>1, "ありゃ"=>1, "かな"=>1, "ばかし"=>1
    }
  end
end