nysol-mining 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/mbopt.rb +522 -0
- data/bin/mburst.rb +716 -0
- data/bin/mgfeatures.rb +340 -0
- data/bin/mglmnet.rb +843 -0
- data/bin/mgnfeatures.rb +369 -0
- data/bin/mgpmetis.rb +449 -0
- data/bin/midxmine.rb +484 -0
- data/bin/mnb.rb +631 -0
- data/bin/mnetsimile.rb +572 -0
- data/bin/mnewman.rb +345 -0
- data/bin/msketchsort.rb +243 -0
- data/bin/msm.rb +172 -0
- data/ext/sketchsortrun/Main.cpp +161 -0
- data/ext/sketchsortrun/Main.hpp +24 -0
- data/ext/sketchsortrun/SketchSort.cpp +526 -0
- data/ext/sketchsortrun/SketchSort.hpp +138 -0
- data/ext/sketchsortrun/extconf.rb +26 -0
- data/ext/sketchsortrun/sketchsortrun.cpp +56 -0
- data/lib/nysol/mining.rb +24 -0
- metadata +89 -0
data/bin/mnb.rb
ADDED
@@ -0,0 +1,631 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#encoding:utf-8
|
3
|
+
|
4
|
+
require 'rubygems'
|
5
|
+
require 'nysol/mcmd'
|
6
|
+
|
7
|
+
# Fixed bugs in the handling of w= and class= (2015-12-09)
|
8
|
+
$version=1.2
|
9
|
+
$revision="###VERSION###"
|
10
|
+
|
11
|
+
|
12
|
+
# Prints the command-line manual to STDERR and terminates the process.
#
# The manual is kept in sync with the keyword lists handed to MCMD::Margs in
# the two modes of this script: model building accepts cv=, ts= and -verbose,
# and predict mode takes its field names from the stored param.csv, so
# tid=/item= are not passed on the predict command line.
def help

STDERR.puts <<EOF
------------------------------
mnb.rb version #{$version}
------------------------------
概要) Naive Bayesを利用した分類器
特徴) 1) ベイズの定理による確率モデルを用いた教師あり学習の分類器
2) アイテムの頻度情報を扱えるようにMultinomial Naive Bayesを利用
3) ラプラススムージングによりゼロ頻度問題を調整(注1参照)
4) complement Naive Bayesも利用可能
用法1) モデル構築モード
mnb.rb [tid=] [item=] [w=] [class=] i= O= [seed=] [-complement] [cv=|ts=] [T=] [-verbose] [--help]
用法2) 予測モード
mnb.rb -predict i= I= o= [-complement] [T=] [-verbose] [--help]

例) mnb.rb tid=tid item=word w=freq class=class i=train.csv O=output seed=1 cv=10
mnb.rb i=test.csv I=output o=rsl_predict_model -predict

## モデル構築モード
i= : 入力データのファイル名【必須】
tid= : 1つのサンプルを表す項目名【デフォルト:"tid"】
item= : 1つの変数を表す項目名【デフォルト:"item"】
w= : 変数の重み項目名 【オプション】
: 指定しなければ、全行1とする。
class= : 目的変数の項目名(i=上の項目名)【デフォルト:"class"】
seed= : 乱数の種(0以上の整数,交差検証に影響)【オプション:default=-1(時間依存)】
O= : 出力ディレクトリ名 【必須】
-complement : complement Naive Bayesで実行【オプション】
cv= : 交差検証の分割数。値を省略して cv= のみ指定した場合は10【オプション】
ts= : テストサンプル法でテストに回すサンプルの割合(%)。値を省略して ts= のみ指定した場合は33.3【オプション】
: cv=,ts= のどちらも指定しない場合は、全データを訓練と検証の両方に用いる。

その他
T= : 作業ディレクトリ【デフォルト:"/tmp"】
-verbose : 内部のMCMDのコマンドメッセージを表示
--help : ヘルプの表示


## 予測モード(-predict)
I= : モデル構築モードでの出力先ディレクトリパス 【必須】
o= : 予測結果出力ファイル名 [必須]
i= : 未知データのファイル名 [必須]
tid=,item=,w= については、モデル構築モードと同じ項目名を持つ入力ファイルが必要である。

注1) ゼロ頻度問題は、テストで初めて出現したアイテムを含む場合に確率がゼロになる問題

利用例)
$ more train.csv
tid,item,freq,class
1,w1,2,M
1,w2,4,M
10,w1,1,F
11,w2,1,F
11,w1,2,F
12,w1,4,M
12,w2,4,M
13,w3,3,M
13,w2,2,M
13,w1,4,M
14,w1,5,M
14,w2,3,M
14,w3,2,M
15,w1,1,F
16,w1,2,F
16,w2,1,F
18,w2,4,F
18,w1,2,F
19,w2,2,F
19,w1,1,F
19,w3,3,F
2,w2,2,M
2,w1,3,M
2,w3,3,M
20,w1,1,F
20,w2,3,F
20,w3,2,F
4,w3,2,M
4,w2,3,M
4,w1,3,M
5,w1,1,F
6,w2,1,F
6,w1,1,F
7,w1,3,M
7,w2,4,M
8,w2,2,M
8,w3,3,M
8,w1,4,M
9,w1,3,M
9,w3,2,M
9,w2,3,M
17,w2,1,M
17,w1,2,M
3,w1,1,F
3,w2,1,F

$ mnb.rb tid=tid item=word w=freq class=class i=trainData.csv O=model seed=1
#MSG# separating data 1; 2014/08/18 12:17:38
#MSG# separating data 2; 2014/08/18 12:17:38
#MSG# separating data 3; 2014/08/18 12:17:38
#MSG# separating data 4; 2014/08/18 12:17:38
#MSG# separating data 5; 2014/08/18 12:17:38
#MSG# separating data 6; 2014/08/18 12:17:38
#MSG# separating data 7; 2014/08/18 12:17:38
#MSG# separating data 8; 2014/08/18 12:17:38
#MSG# separating data 9; 2014/08/18 12:17:38
#MSG# separating data 10; 2014/08/18 12:17:38
#MSG# Naive Bayes start using training data 1; 2014/08/18 12:17:38
#MSG# Naive Bayes start using test data 1; 2014/08/18 12:17:38
#END# ./mnb.rb tid=tid item=word w=freq class=class i=trainData.csv O=model seed=1; 2014/08/18 12:17:39
#MSG# Naive Bayes start using original data; 2014/08/18 12:17:39
#END# ./mnb.rb tid=tid item=word w=freq class=class i=trainData.csv O=model seed=1; 2014/08/18 12:17:39

$ more model/rsl_model.csv
tid,F,M,class,predictCls
1,0.5149523047,0.4850476955,M,F
10,0.4929065867,0.5070934133,F,M
11,0.5019607343,0.4980392657,F,F
12,0.5089038694,0.4910961304,M,F
13,0.4918393826,0.5081606174,M,M
14,0.4966021486,0.5033978514,M,M
15,0.4929065867,0.5070934133,F,M
...
...

Copyright(c) NYSOL 2012- All Rights Reserved.
EOF
  exit
end
|
139
|
+
|
140
|
+
# Reports the script version and revision on STDERR, then exits.
# An unexpanded revision placeholder (one still containing "VERSION") is
# reported as revision "0".
def ver()
  if $revision =~ /VERSION/
    $revision = "0"
  end
  STDERR.puts "version #{$version} revision #{$revision}"
  exit
end
|
145
|
+
|
146
|
+
# Informational flags are handled before any argument parsing.
# The usage text lists --help as the last argument, so the flags are honoured
# at any position rather than only as the first argument.
# Running without arguments prints the manual as well.
help() if ARGV.empty? || ARGV.include?("--help")
ver() if ARGV.include?("--version")
|
148
|
+
|
149
|
+
|
150
|
+
# Builds one train/test split (test-sample method) from `ifile`.
#
# The input is sorted by class and sample id and split into one file per
# class value. From each class file `ratio` percent of the samples (keyed by
# the sample-id field) is drawn at random into the test part and the rest into
# the training part. The parts are concatenated into
# "#{oPath}/1_test.csv" and "#{oPath}/1_train.csv".
#
# ifile : input CSV path
# oPath : directory receiving the split (also used for scratch files xxt*)
# ratio : value passed to mselrand p=
# seed  : value passed to mselrand S=
#
# Reads @cls and @tid for the class / sample-id field names, so the split
# also works when class= or tid= were given non-default names.
def mktsData(ifile,oPath,ratio,seed)
  system "mkdir -p #{oPath}"

  # one file per class value; the msep placeholder must name the actual class field
  f=""
  f << "msortf f=#{@cls},#{@tid} i=#{ifile} |"
  f << "msep d='#{oPath}/xxts-${#{@cls}}'"
  system(f)

  # draw the test samples class by class so that both parts keep the class mix
  Dir::glob("#{oPath}/xxts-*").each {|ef|
    fName=File::basename(ef)
    system "mselrand k=#{@tid} p=#{ratio} -B S=#{seed} i=#{ef} o=#{oPath}/xxtest-#{fName} u=#{oPath}/xxtrain-#{fName}"
  }
  system "mcat i=#{oPath}/xxtest-* o=#{oPath}/1_test.csv"
  system "mcat i=#{oPath}/xxtrain-* o=#{oPath}/1_train.csv"

  # NOTE(review): this glob also matches unrelated xxt* files when oPath is a shared directory such as /tmp
  system "rm #{oPath}/xxt*"
end
|
172
|
+
|
173
|
+
# Splits `train` into `foldNum` cross-validation folds, stratified by class.
#
# Every sample id receives a random number; within each class the samples are
# numbered in random order and mapped onto fold numbers 1..foldNum. For each
# fold k the rows with that number become "#{oPath}/k_test.csv" and all other
# rows become "#{oPath}/k_train.csv".
#
# train   : input CSV path
# oPath   : directory receiving the fold files
# foldNum : number of folds
# seed    : value passed to mrand S=
#
# Reads @tid, @cls and @wf (scratch-file prefix). Aborts with exit status 1
# when the smallest class has fewer samples than there are folds.
def mkcvData(train,oPath,foldNum,seed)
  system "mkdir -p #{oPath}"

  if @tid # transaction (long) format
    system "msortf f=#{@tid} i=#{train} o=#{@wf}-xx1"
    system "mrand k=#{@tid} a=rand S=#{seed} i=#{@wf}-xx1 o=#{@wf}-rand"

    # number of samples per class
    f=""
    f << "muniq k=#{@tid} i=#{@wf}-xx1 |"
    f << "msortf f=#{@cls} |mcount k=#{@cls} a=keyCnt o=#{@wf}-keyCnt"
    system(f)

    # smallest class size (ascending numeric sort, first row); read straight
    # from the pipeline, no pager involved
    keyCnt=`msortf f=keyCnt%n i=#{@wf}-keyCnt |mbest -q |mcut f=keyCnt -nfno`
    if keyCnt.to_i < foldNum
      MCMD::errorLog("#{File.basename($0)}: the number of tid is less than the number of fold")
      exit 1
    end
  end

  f=""
  f << "msortf f=#{@cls},rand i=#{@wf}-rand |"
  # running number inside each class; rows of one sample share the number
  f << "mnumber k=#{@cls} s=#{@cls},rand S=1 a=keyLine e=same |"
  f << "mjoin k=#{@cls} f=keyCnt m=#{@wf}-keyCnt |"
  # fold number = ceil(position / fold size); the small epsilon keeps the last row inside fold foldNum
  f << "mcal c='ceil(${keyLine} / (${keyCnt} / #{foldNum}+0.00001),1)' a=val o=#{@wf}-xx2 "
  system(f)

  # rows of fold k go to the test file, everything else to the training file
  (1..foldNum).each{|fold|
    MCMD::msgLog("separating data #{fold}")
    system "msel c='${val} == #{fold}' i=#{@wf}-xx2 u=#{@wf}-train-#{fold} o=#{@wf}-test-#{fold}"
  }

  # drop the helper columns before handing the folds over
  (1..foldNum).each{|fold|
    system "mcut -r f=rand,keyLine,keyCnt,val i=#{@wf}-train-#{fold} o=#{oPath}/#{fold}_train.csv"
    system "mcut -r f=rand,keyLine,keyCnt,val i=#{@wf}-test-#{fold} o=#{oPath}/#{fold}_test.csv"
  }

  system "rm #{@wf}-*"
end
|
216
|
+
|
217
|
+
|
218
|
+
# Prepares the per-item, per-class weights for complement Naive Bayes.
#
# For every (item, class) pair listed in "#{@wf}-xxwordClass" the resulting
# file "#{@wf}-xxsum" holds, in column wCnt, the total weight of the item over
# all rows minus the weight it has inside that class.
#
# input : CSV path of the training rows
# Reads @item, @cls, @w (weight field) and @wf (scratch-file prefix).
def mkCompliData(input)
  # total weight of each item over the whole input
  totalPerItem = [
    "mcut f=#{@item},#{@w} i=#{input}",
    "msortf f=#{@item}",
    "msum k=#{@item} f=#{@w}:totalWord o=#{@wf}-xxtotalWord"
  ]
  system(totalPerItem.join(" |"))

  # weight of each item inside each class
  weightPerClass = [
    "msortf f=#{@item},#{@cls} i=#{input}",
    "msum k=#{@item},#{@cls} f=#{@w}:wCnt o=#{@wf}-xxfreq"
  ]
  system(weightPerClass.join(" |"))

  # complement = total - in-class weight; pairs never observed count as 0 in-class
  complementSteps = [
    "mjoin -n k=#{@item},#{@cls} f=wCnt m=#{@wf}-xxfreq i=#{@wf}-xxwordClass",
    "mnullto f=wCnt v=0",
    "msortf f=#{@item}",
    "mjoin k=#{@item} f=totalWord m=#{@wf}-xxtotalWord",
    "mcal c='${totalWord}-${wCnt}' a=compWcnt",
    "mcut f=#{@item},#{@cls},compWcnt:wCnt",
    "mfldname -q o=#{@wf}-xxsum"
  ]
  system(complementSteps.join(" |"))
end
|
241
|
+
|
242
|
+
# Prepares the per-item, per-class weights for plain Naive Bayes.
#
# For every (item, class) pair listed in "#{@wf}-xxwordClass" the resulting
# file "#{@wf}-xxsum" holds, in column wCnt, the summed weight of the item
# inside that class (0 for pairs that never occur).
#
# input : CSV path of the training rows
# Reads @item, @cls, @w (weight field) and @wf (scratch-file prefix).
def mkNormalData(input)
  # weight of each item inside each class
  weightPerClass = [
    "msortf f=#{@item},#{@cls} i=#{input}",
    "msum k=#{@item},#{@cls} f=#{@w}:wCnt o=#{@wf}-xxfreq"
  ]
  system(weightPerClass.join(" |"))

  # spread the sums over the full item x class grid, filling gaps with 0
  fillSteps = [
    "mjoin -n k=#{@item},#{@cls} f=wCnt m=#{@wf}-xxfreq i=#{@wf}-xxwordClass",
    "mnullto f=wCnt v=0",
    "mcut f=#{@item},#{@cls},wCnt",
    "mfldname -q o=#{@wf}-xxsum"
  ]
  system(fillSteps.join(" |"))
end
|
255
|
+
|
256
|
+
# Writes the accuracy summary of one prediction file.
#
# Each row of `input` is tagged "Match" when the class field equals
# predictCls and "Unmatch" otherwise; the tags are counted and divided by the
# total row count. The result (ans, cnt, totalCnt, accRate) is written to
# "#{outdir}/rsl_acc_#{oname}".
#
# input  : prediction CSV containing the class field and predictCls
# outdir : directory receiving the summary
# oname  : suffix of the summary file name
# Reads @cls and @wf (scratch-file prefix).
def calAcc(input,outdir,oname)
  totalFile = "#{@wf}-xxtotalCnt"
  system "mcount a=totalCnt i=#{input} o=#{totalFile}"

  steps = [
    "mcal c='if($s{#{@cls}}==$s{predictCls},\"Match\",\"Unmatch\")' a=ans i=#{input}",
    "msortf f=ans",
    "mcount k=ans a=cnt",
    "mproduct f=totalCnt m=#{totalFile}",
    "mcal c='${cnt}/${totalCnt}' a=accRate",
    "mcut f=ans,cnt,totalCnt,accRate",
    "mfldname -q o=#{outdir}/rsl_acc_#{oname}"
  ]
  system(steps.join(" |"))
end
|
267
|
+
|
268
|
+
|
269
|
+
# Collects the per-fold accuracy summaries into a list and an average.
#
# All "#{outdir}/rsl_acc_*_#{type}.csv" files are concatenated, the "Match"
# rows are kept, and the source file name is reduced to the part between
# "rsl_acc_" and "_#{type}.csv" (stored in column "test"). The list goes to
# "#{outdir}/acclist.csv", the mean accRate to "#{outdir}/acc.csv". Afterwards
# every "rsl_acc_*_*.csv" in outdir is removed.
#
# outdir : directory holding the summaries and receiving the results
# type   : file-name suffix selecting the summaries (the script passes "test")
def calAccAvg(outdir,type)
  f=""
  f << "mcat i=#{outdir}/rsl_acc_*_#{type}.csv -add_fname |"
  f << "mselstr f=ans v=Match |"
  # strip everything up to the last "/" so relative and absolute paths are handled alike
  f << "msed f=fileName c=\".*/\" v=\"\" |"
  f << "msed f=fileName c=\"rsl_acc_\" v=\"\" |"
  # the suffix follows `type`, matching the selection in the mcat step
  f << "msed f=fileName c=\"_#{type}.csv\" v=\"\" |"
  f << "mcut f=fileName:test,ans,cnt,totalCnt,accRate |"
  f << "mfldname -q o=#{outdir}/acclist.csv"
  system(f)

  # NOTE(review): a fold without any "Match" row is absent from acclist.csv and therefore from the mean
  system "mavg f=accRate i=#{outdir}/acclist.csv |mcut f=accRate |mfldname -q o=#{outdir}/acc.csv"

  system "rm #{outdir}/rsl_acc_*_*.csv"
end
|
285
|
+
|
286
|
+
# Records the parameters of a model-building run in "#{oPath}/param.csv".
#
# The file has the header "param,val" followed by one row per parameter.
# Predict mode reads the tid=, item=, w=, class= and -complement rows back
# from this file. A fold count of 1 (meaning no cross validation was
# requested) is written as an empty cv= value.
#
# The file is opened with File.open in block form so the handle is closed
# even when a write fails, and so the path is never interpreted as a command
# the way Kernel#open treats a leading "|".
def writeParam(temp,ifile,oPath,tid,item,w,cls,complement,ts,foldNum,seed)
  cv = (foldNum == 1) ? nil : foldNum
  rows = [
    ["param",       "val"],
    ["i=",          ifile],
    ["O=",          oPath],
    ["tid=",        tid],
    ["item=",       item],
    ["w=",          w],
    ["class=",      cls],
    ["-complement", complement],
    ["ts=",         ts],
    ["cv=",         cv],
    ["seed=",       seed],
    ["T=",          temp]
  ]
  File.open("#{oPath}/param.csv", "w") {|fw|
    rows.each {|name, value| fw.puts "#{name},#{value}" }
  }
end
|
303
|
+
|
304
|
+
|
305
|
+
# Trains and/or applies the (complement) Naive Bayes model on `input`.
#
# input      : CSV path with the sample-id, item and weight fields (plus the
#              class field outside -predict mode)
# output     : file name of the result inside odir
# odir       : model directory; receives clsWord.csv, totalCnt.csv,
#              clsProb.csv and category.csv when a model is built and is read
#              from otherwise
# complement : truthy selects complement Naive Bayes
# trainFlg   : true / "true" builds the model from `input` before scoring;
#              nil / false / "false" scores `input` against the model already
#              stored in odir
#
# Callers in this script pass the flag as the strings "true" and "false".
# Every String is truthy in Ruby, so the text "false" is decoded explicitly;
# otherwise a validation pass would silently rebuild the model from the
# held-out data and the reported accuracy would not be a held-out accuracy.
#
# Reads @tid, @item, @cls, @w and @wf (scratch-file prefix). Outside -predict
# mode the result keeps the true class next to predictCls and calAcc is run
# on it. All "#{@wf}-*" scratch files are removed at the end.
def run(input,output,odir,complement,trainFlg)
  buildModel = trainFlg ? trainFlg.to_s != "false" : false
  # only -predict input comes without the class field
  labelled = ARGV.index("-predict").nil?

  if buildModel
    # every item x class combination
    system "mcut f=#{@item} i=#{input} |msortf f=#{@item} |muniq k=#{@item} o=#{@wf}-xxword"
    system "mcut f=#{@cls} i=#{input} |msortf f=#{@cls} |muniq k=#{@cls} o=#{@wf}-xxclass"
    system "mproduct f=#{@cls} m=#{@wf}-xxclass i=#{@wf}-xxword o=#{@wf}-xxwordClass"

    if complement
      mkCompliData(input)
    else
      mkNormalData(input)
    end

    # total weight per class
    f=""
    f << "mcut f=#{@cls},wCnt i=#{@wf}-xxsum |"
    f << "msortf f=#{@cls} |"
    f << "msum k=#{@cls} f=wCnt:total |"
    f << "mcut f=total,#{@cls} |"
    f << "mfldname -q o=#{@wf}-xxtotal"
    system(f)

    # number of distinct items, needed for the smoothing term
    f=""
    f << "mcut f=#{@item} i=#{input} |"
    f << "msortf f=#{@item} |"
    f << "muniq k=#{@item} |"
    f << "mcount a=wCategory |"
    f << "mcut f=wCategory o=#{@wf}-xxcategory"
    system(f)

    # log prior ln Pr[c] = ln(samples in class / all samples)
    f=""
    f << "mcut f=#{@tid},#{@cls} i=#{input} |"
    f << "msortf f=#{@cls},#{@tid} |muniq k=#{@cls},#{@tid} o=#{@wf}-xx1"
    system(f)
    system "mcount a=totalId i=#{@wf}-xx1 o=#{@wf}-xxtotalID"
    f=""
    f << "mcount k=#{@cls} a=memberNum i=#{@wf}-xx1 |"
    f << "mproduct f=totalId m=#{@wf}-xxtotalID |"
    f << "mcal c='ln(${memberNum}/${totalId})' a=prob |"
    f << "mcut -r f=#{@tid} |"
    f << "mfldname -q o=#{@wf}-xxprob"
    system(f)

    # keep the model for later validation / prediction passes
    system "cp #{@wf}-xxsum #{odir}/clsWord.csv"
    system "cp #{@wf}-xxtotal #{odir}/totalCnt.csv"
    system "cp #{@wf}-xxprob #{odir}/clsProb.csv"
    system "cp #{@wf}-xxcategory #{odir}/category.csv"

    sumFile   = "#{@wf}-xxsum"
    totalFile = "#{@wf}-xxtotal"
    probFile  = "#{@wf}-xxprob"
  else
    sumFile   = "#{odir}/clsWord.csv"
    totalFile = "#{odir}/totalCnt.csv"
    probFile  = "#{odir}/clsProb.csv"
  end

  # Expand every input row by every candidate class (keyCls), attaching the
  # class-wise item weight (wCnt), the class total (total) and the number of
  # distinct items (wCategory). Example rows of the resulting xxdat:
  #   id,word,freq,class,wCategory,wCnt,keyCls,total
  #   1,w1,2,M,3,4,F,30
  #   1,w1,2,M,3,6,M,30
  # i.e. sample 1 (true class M) contains w1 twice, 3 distinct items exist,
  # and w1 carries weight 4 within candidate class F.
  f=""
  f << "mproduct f=wCategory m=#{odir}/category.csv i=#{input} |"
  f << "msortf f=#{@item} |"
  f << "mnjoin k=#{@item} f=wCnt,#{@cls}:keyCls m=#{sumFile} |"
  f << "msortf f=keyCls |"
  f << "mnjoin k=keyCls K=#{@cls} f=total m=#{totalFile} |"
  if labelled
    f << "msortf f=#{@tid},#{@item},#{@cls} o=#{@wf}-xxdat"
  else
    f << "msortf f=#{@tid},#{@item} o=#{@wf}-xxdat"
  end
  system(f)

  # score(c) = ln Pr[c] + sum_j x_j * ln((wCnt_cj + 1) / (total_c + wCategory))
  f=""
  f << "msortf f=keyCls i=#{@wf}-xxdat |"
  f << "mjoin k=keyCls K=#{@cls} f=prob m=#{probFile} |"
  f << "mcal c='${#{@w}}*ln((${wCnt}+1)/(${total}+${wCategory}))' a=2term |"
  f << "msortf f=#{@tid},keyCls,#{@item} |"
  f << "msum k=#{@tid},keyCls f=2term:sumVal |"
  f << "mcal c='${prob}+${sumVal}' a=probCls o=#{@wf}-xxprobCls"
  system(f)

  # Pick one class per sample: the lowest score for complement NB (least
  # likely to be outside the class), the highest score otherwise. The scratch
  # name is fixed so that it never depends on the caller-supplied output name.
  bestFile = "#{@wf}-xxbest"
  f=""
  if complement
    f << "msortf f=#{@tid},probCls%n i=#{@wf}-xxprobCls |"
  else
    f << "msortf f=#{@tid},probCls%nr i=#{@wf}-xxprobCls |"
  end
  f << "mbest k=#{@tid} -q R=1 |"
  if labelled
    f << "mcut f=#{@tid},#{@cls},keyCls:predictCls o=#{bestFile}"
  else
    f << "mcut f=#{@tid},keyCls:predictCls o=#{bestFile}"
  end
  system(f)

  # per-class score share, one column per class
  f=""
  f << "mcut f=#{@tid},keyCls,probCls i=#{@wf}-xxprobCls |"
  f << "msum k=#{@tid} f=probCls:sumProb o=#{@wf}-xxProbagg"
  system(f)

  # NOTE(review): for a negative sum each share is 1 - score/sum, which adds up
  # to (number of classes - 1) per sample, i.e. to 1 only for two classes - confirm intent.
  f=""
  f << "mjoin k=#{@tid} f=sumProb m=#{@wf}-xxProbagg i=#{@wf}-xxprobCls |"
  f << "mcal c='if(${sumProb}<0,1-(${probCls}/${sumProb}),${probCls}/${sumProb})' a=probability |"
  f << "mcut f=#{@tid},keyCls,probability |"
  f << "mcross k=#{@tid} f=probability s=keyCls |mcut -r f=fld o=#{@wf}-xxcross"
  system(f)

  # final table: shares plus prediction (and the true class when known)
  f=""
  if labelled
    f << "mjoin k=#{@tid} f=#{@cls},predictCls m=#{bestFile} i=#{@wf}-xxcross |"
  else
    f << "mjoin k=#{@tid} f=predictCls m=#{bestFile} i=#{@wf}-xxcross |"
  end
  f << "mfldname -q o=#{odir}/#{output}"
  system(f)

  if labelled
    # accuracy = correctly classified samples / all classified samples
    calAcc("#{odir}/#{output}","#{odir}",File.basename(input))
  end

  system "rm #{@wf}-*"
end
|
453
|
+
|
454
|
+
##################################
|
455
|
+
# model predict mode
|
456
|
+
##################################
|
457
|
+
if ARGV.index("-predict")
  # ---- predict mode: score unknown data against a stored model ----
  args=MCMD::Margs.new(ARGV,"i=,I=,o=,w=,T=,-complement,-verbose,-predict","i=,I=,o=")

  # restrict MCMD output to warnings and errors unless -verbose is given
  unless args.bool("-verbose")
    ENV["KG_VerboseLevel"]="2"
    ENV["KG_ScpVerboseLevel"]="3"
  end

  # scratch-file location
  tmpArg = args.str("T=")
  @temp = tmpArg ? tmpArg.sub(/\/$/,"") : "/tmp"
  @wf="#{@temp}/mcmd-mnb"

  ifileBase = args.file("i=","r")
  iPath = args.file("I=","r")
  ofile = args.file("o=","w")

  # field names and the complement switch come from the model's param.csv
  mcomplement=nil
  MCMD::Mcsvin.new("i=#{iPath}/param.csv"){|csv|
    csv.each{|val|
      case val["param"]
      when "tid="        then @tid = val["val"]
      when "item="       then @item = val["val"]
      when "w="          then @w = val["val"]
      when "class="      then @cls = val["val"]
      when "-complement" then mcomplement = val["val"]
      end
    }
  }

  # a model built without w= used the synthetic weight field "unit" (all 1)
  if @w=="unit"
    system "msetstr v=1 a=#{@w} i=#{ifileBase} o=#{@wf}_testFile"
    ifile="#{@wf}_testFile"
  else
    ifile="#{ifileBase}"
  end

  complement=args.bool("-complement")

  # the model must be applied with the same complement setting it was built with
  if mcomplement.to_s != complement.to_s
    MCMD::errorLog("#{File.basename($0)}: The complement option is different from usage of the model construction")
    exit 1
  end

  nbName = complement ? "Complement Naive Bayes" : "Naive Bayes"
  MCMD::msgLog("#{nbName} start using test data")
  run(ifile,"#{ofile}",iPath,complement,nil)
  MCMD::endLog("#{$0} #{args.argv.join(' ')}")

  # the unit-weight copy is not covered by run's "#{@wf}-*" cleanup
  File.delete(ifile) if ifile != ifileBase && File.exist?(ifile)

else
  # ---- model building mode ----
  args=MCMD::Margs.new(ARGV,"i=,O=,-complement,item=,tid=,w=,class=,seed=,cv=,ts=,T=,-verbose","i=,O=")

  # restrict MCMD output to warnings and errors unless -verbose is given
  unless args.bool("-verbose")
    ENV["KG_VerboseLevel"]="2"
    ENV["KG_ScpVerboseLevel"]="3"
  end

  # scratch-file location
  tmpArg = args.str("T=")
  @temp = tmpArg ? tmpArg.sub(/\/$/,"") : "/tmp"

  ifileBase = args.file("i=","r")
  oPath = args.file("O=", "w")
  system "mkdir -p #{oPath}"

  # file name of the model result inside oPath
  ofile ="rsl_model.csv"

  # field names (defaults: tid, item, class)
  @tid = args.field("tid=" , ifileBase, "tid" , 1,1)["names"][0]
  @item = args.field("item=", ifileBase, "item", 1,1)["names"][0]
  @cls = args.field("class=", ifileBase, "class", 1,1)["names"][0]

  @wf="#{@temp}/mcmd-mnb"

  @w = args.field("w=", ifileBase, nil, 1,1)
  if @w # weight field given
    @w = @w["names"][0]
    ifile="#{ifileBase}"
  else # no weight field: give every row the weight 1
    @w = "unit"
    system "msetstr v=1 a=#{@w} i=#{ifileBase} o=#{@wf}_input"
    ifile="#{@wf}_input"
  end

  complement=args.bool("-complement")

  # ts= absent -> nil, "ts=" without value -> 0.0, "ts=10" -> 10.0
  ts=args.float("ts=")

  foldNum=args.int("cv=")

  # random seed
  seed =args.int("seed=", -1)

  if ts
    ts=33.3 if ts == 0.0 # bare "ts=" selects the default share
    mktsData(ifile,"#{@temp}",ts,seed)
    foldNum=1 # test-sample method: a single split
  elsif foldNum
    foldNum=10 if foldNum == 0 # bare "cv=" selects 10 folds
    mkcvData(ifile,"#{@temp}",foldNum,seed)
  else # no validation requested: the whole input is both training and test data
    system "cp #{ifile} #{@temp}/1_train.csv"
    system "cp #{ifile} #{@temp}/1_test.csv"
    foldNum=1
  end

  writeParam(@temp,ifile,oPath,@tid,@item,@w,@cls,complement,ts,foldNum,seed)

  nbName = complement ? "Complement Naive Bayes" : "Naive Bayes"

  (1..foldNum).each{|fold|
    traFile ="#{@temp}/#{fold}_train.csv"
    testFile="#{@temp}/#{fold}_test.csv"

    MCMD::msgLog("#{nbName} start using training data #{fold}")
    run(traFile,ofile,oPath,complement,"true") # training pass
    MCMD::msgLog("#{nbName} start using test data #{fold}")
    run(testFile,ofile,oPath,complement,"false") # validation pass on the held-out file
    MCMD::endLog("#{$0} #{args.argv.join(' ')}")
  }

  calAccAvg(oPath,"test")

  # final model: built from the complete input
  MCMD::msgLog("#{nbName} start using original data")
  run(ifile,ofile,oPath,complement,"true")
  MCMD::endLog("#{$0} #{args.argv.join(' ')}")

  # NOTE(review): these globs also match unrelated *_train.csv / *_test.csv files in a shared T= directory
  system "rm #{@temp}/*_train.csv"
  system "rm #{@temp}/*_test.csv"

  # the unit-weight copy is not covered by run's "#{@wf}-*" cleanup
  File.delete(ifile) if ifile != ifileBase && File.exist?(ifile)
end
|