nysol-mining 3.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,484 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: utf-8
3
+
4
# Release history
#   1.0 initial release: 2017/01/15
$version  = "1.0"
# Placeholder text; ver() reports it as "0" while it still contains "VERSION".
$revision = "###VERSION###"
7
+
8
# Print the usage text to STDERR and terminate the process.
#
# The text documents the options that the argument parser of this script
# actually accepts (-noidx, -verbose, item=, ...).
def help
  STDERR.puts <<EOF
----------------------------
midxmine.rb version #{$version}
----------------------------
description) construct a regression model with optimally indexed itemset sequences
features) 1) using elastic-net regression (ridge to lasso regression)
          2) exploring the best alphabet-index, which is a mapping function from item to group
          3) enumerate frequent itemset sequences, and use them as input variables for a model
          4) linear regression and logistic regression can be chosen
usage1) model building mode
  midxmine.rb [-noidx] i= tid= time= item= s= c= class= [family=binomial] [alpha=1.0] [idxSize=2] [seed=] O= [T=] [-verbose] [--help]
usage2) prediction mode (not implemented yet)
  midxmine.rb -predict i= I= o= [T=] [-verbose] [--help]

### model building mode
# parameters for input data
i=       : transaction data file (mandatory)
tid=     : field name for transaction ID in i= file (mandatory)
time=    : field name for time in i= file (mandatory)
item=    : field name for item in i= file (mandatory)

# parameters for class data
c=       : target variable file (mandatory)
         : this file must have the same field name as tid= in i= file (mandatory)
class=   : field name for target variable in c= file (mandatory)

# parameters for itemset sequence enumeration
s=       : minimum support for enumerating itemset sequences (mandatory)

# parameters for regression
family=  : link function for generalized linear regression model
           "binomial" or "gaussian" can be chosen
alpha=   : weight of L1 and L2 regularization in elastic-net
           1.0: lasso regression (L1)
           0.0: ridge regression (L2)
           NOTE: family= and alpha= are accepted, but the current implementation
                 always fits family=binomial with alpha=1.0

# parameters for indexing
idxSize= : index size
seed=    : random seed for initial index
-noidx   : do not explore the index; every item is used as its own group

O=       : directory name for output (mandatory)

### prediction mode (not implemented yet)

### other parameters
T=       : directory name for temporary files (default=/tmp)
-verbose : show messages of mcmd
--help   : show help

necessary software)
  1) R
  2) glmnet package in R
  3) arulesSequences package in R

example)
$ cat zaki.csv
tid,time,item
1,10,C
1,10,D
1,15,A
1,15,B
1,15,C
1,20,A
1,20,B
1,20,F
1,25,A
1,25,C
1,25,D
1,25,F
2,15,A
2,15,B
2,15,F
2,20,E
3,10,A
3,10,B
3,10,F
4,10,D
4,10,G
4,10,H
4,20,B
4,20,F
4,25,A
4,25,G
4,25,H
$ cat zaki_c.csv
tid,class
1,1
2,1
3,0
4,0

$ midxmine.rb i=zaki.csv c=zaki_c.csv O=result1 tid=tid item=item time=time class=class idxSize=2 seed=111 s=0.1↩

$ ls result1
alphabetIndex.csv
beta.txt
coef.png
const.txt
info.txt
lambda.png
model.obj

# Copyright(c) NYSOL 2012- All Rights Reserved.
EOF
  exit
end
116
+
117
# Report the script version and revision on STDERR, then terminate.
# While $revision still holds its placeholder text (anything containing
# "VERSION") it is replaced by "0" before printing.
def ver()
  unless ($revision =~ /VERSION/).nil?
    $revision = "0"
  end
  STDERR.puts "version #{$version} revision #{$revision}"
  exit
end
122
+
123
# Show the help when no argument is given or when --help appears anywhere on
# the command line (the usage text lists it last, not first).
help() if ARGV.empty? or ARGV.include?("--help")
# Likewise accept --version at any position.
ver() if ARGV.include?("--version")
125
+
126
+ require "rubygems"
127
+ require "nysol/mcmd"
128
+ require "json"
129
+
130
# Make sure the required R libraries can be executed; stop with status 1
# as soon as one of them fails the check.
["glmnet", "arulesSequences"].each{|rlib|
  exit(1) unless MCMD::chkRexe(rlib)
}
133
+
134
# Alphabet index: a mapping from every distinct item ("alphabet") of the
# transaction file to a group number in 0...idxSize, stored as an array whose
# i-th element is the group of the i-th alphabet.
#
# The object also remembers which candidate indexes it has already handed out
# so that the local search does not evaluate the same index twice.
class Index
  # number of distinct alphabets read from the input file
  attr_reader :size

  # Print the internal state to STDOUT (debugging aid).
  def show
    puts "@ifile=#{@ifile}"
    puts "@idxSize=#{@idxSize}"
    puts "@seed=#{@seed}"
    puts "@alphabets=#{@alphabets}"
  end

  # constructor
  #   ifile   : CSV file that has an "item" field
  #   idxSize : number of groups
  #   seed    : random seed; a fresh one is generated when nil/false
  def initialize(ifile,idxSize,seed)
    @ifile=ifile
    @idxSize=idxSize

    # setting up random object
    @seed=seed || Random.new_seed
    @random=Random.new(@seed)
    # candidate indexes already returned by #adjacents (array => true)
    @done={}

    # setting alphabet vector and its size
    temp=MCMD::Mtemp.new
    xxitem=temp.file
    f=""
    f << "mcut f=item i=#{ifile} |"
    f << "muniq k=item o=#{xxitem}"
    system(f)
    @alphabets=[]
    MCMD::Mcsvin.new("i=#{xxitem}").each{|flds|
      @alphabets << flds["item"]
    }
    @size=@alphabets.size
  end

  # generate the initial index
  #   noidx=true  : identity index (the i-th alphabet gets group i)
  #   noidx=false : random index that uses every group 0...idxSize at least once
  # Raises ArgumentError when idxSize exceeds the number of alphabets, because
  # no index could then use every group (the rejection loop would never end).
  def firstIdx(noidx)
    return (0...@alphabets.size).to_a if noidx

    if @idxSize > @alphabets.size
      raise ArgumentError, "idxSize (#{@idxSize}) must not exceed the number of distinct items (#{@alphabets.size})"
    end

    index=nil
    loop do
      index=Array.new(@alphabets.size){ @random.rand(@idxSize) }
      # accept the draw only when all groups are represented
      break if index.uniq.size >= @idxSize
    end
    return index
  end

  # enumerating adjacent indexes
  # An adjacent index differs from `index` in exactly one position, whose group
  # number is moved by -1 or +1 while staying inside 0...idxSize.
  # Indexes returned by an earlier call are skipped.
  # With noidx the given index itself is the only candidate.
  def adjacents(index,noidx)
    return [index] if noidx

    adjIndexes=[]
    index.each_index{|pos|
      [-1,+1].each{|dir|
        num=index[pos]+dir
        next if num < 0 or num >= @idxSize
        adjIndex=index.dup
        adjIndex[pos]=num
        next if @done.key?(adjIndex)
        # remember the candidate so that later calls do not return it again
        @done[adjIndex]=true
        adjIndexes << adjIndex
      }
    }
    return adjIndexes
  end

  # write alphabet-index to oFile (CSV with the fields alphabet,index)
  def writeAlphaIndex(index,oFile)
    MCMD::Mcsvout.new("o=#{oFile} f=alphabet,index"){|oCSV|
      @alphabets.each_with_index{|alphabet,i|
        oCSV.write([alphabet,index[i]])
      }
    }
  end
end
233
+
234
# Rewrite the transaction data so that each item is replaced by its index value.
#   ifile       : transaction CSV with the fields tid,time,item
#   idxObj      : object whose writeAlphaIndex(index, file) dumps the alphabet/index table
#   index       : index to apply
#   convTraFile : output path; written with the fields tid,time,size,index and
#                 without a header line (-nfno)
# Returns whatever Kernel#system returns for the command pipeline.
def convTra(ifile,idxObj,index,convTraFile)
  work=MCMD::Mtemp.new
  mapFile=work.file
  idxObj.writeAlphaIndex(index,mapFile)

  stages=[
    "mjoin k=item K=alphabet m=#{mapFile} f=index i=#{ifile} |",
    "mcut f=tid,time,index |",
    "muniq k=tid,time,index |",
    "mtra k=tid,time f=index |",
    "mvcount vf=index:size |",
    "mcut f=tid,time,size,index -nfno o=#{convTraFile}"
  ]
  system(stages.join)
end
248
+
249
# estimate the best lambda
# 1. enumerate frequent sequences using all data
# 2. construct regression model with the sequences as input variable
#    cross validation is used for getting the best lambda
# 3. return deviance and lambda
#
#   convTra    : indexed transaction file (as written by convTra())
#   minSupport : minimum support handed to cspade
#   yFile      : CSV file with the target variable
#   seed       : R random seed; not set when nil/false
# Returns [dev, lam]: the cross-validated error at lambda.min and lambda.min
# itself. When R does not produce a usable result, [Float::MAX, nil] is returned.
def mkCVmodel(convTra,minSupport,yFile,seed)
  temp=MCMD::Mtemp.new
  xxscp=temp.file
  xxdev=temp.file
  xxlam=temp.file

  scp= <<"EOS"
library(arulesSequences)
library(glmnet)
EOS
  scp << "set.seed(#{seed})\n" if seed

  scp << <<"EOS"
x <- read_baskets(con="#{convTra}", sep=",",info=c("sequenceID","eventID","SIZE"))
as(x, "data.frame")
s1 <- cspade(x, parameter = list(support = #{minSupport}), control = list(verbose = TRUE))
xMTX=as(as(supportingTransactions(s1,x),"ngCMatrix"),"matrix")
yMTX=as.matrix(read.csv("#{yFile}"))
model = cv.glmnet(xMTX,yMTX,family="binomial",alpha=1.0)
mm=which(model$lambda==model$lambda.min)
write.table(model$cvm[mm]   ,"#{xxdev}", quote=F, col.names=F,row.names=F)
write.table(model$lambda.min,"#{xxlam}", quote=F, col.names=F,row.names=F)
EOS

  File.open(xxscp,"w"){|fpw| fpw.puts scp}
  # POSIX redirection: "&>" is a bashism and would run R in the background
  # when /bin/sh is not bash.
  system "R --vanilla -q --slave < #{xxscp} >/dev/null 2>&1"

  # if all fields have same value for all records, glmnet fail and it doesn't output the result.
  dev=Float::MAX
  lam=nil
  if File.exist?(xxdev) and File.exist?(xxlam)
    begin
      # strict parsing: "NA" or an empty file must not be read as 0.0,
      # which would look like a perfect model
      d=Float(File.read(xxdev).strip)
      l=Float(File.read(xxlam).strip)
      dev,lam=d,l
    rescue ArgumentError
      # unusable output: keep the failure values
    end
  end
  return dev,lam
end
303
+
304
# construct a regression model with specified lambda
#
#   convTra    : indexed transaction file (as written by convTra())
#   lam        : lambda used for the final glmnet fit
#   minSupport : minimum support handed to cspade
#   yFile      : CSV file with the target variable
#   oPath      : output directory; receives lambda.png, model.obj, const.txt,
#                beta.txt, coef.png and info.txt
# Returns [dev, lam]: the deviance of the fitted model, computed as
# (1-dev.ratio)*nulldev, and the lambda used. When R does not run to the end,
# [Float::MAX, nil] is returned.
def mkModel(convTra,lam,minSupport,yFile,oPath)
  temp=MCMD::Mtemp.new
  xxscp=temp.file
  xxdev=temp.file
  xxlam=temp.file

  scp= <<"EOS"
library(arulesSequences)
library(glmnet)
x <- read_baskets(con="#{convTra}", sep=",",info=c("sequenceID","eventID","SIZE"))
as(x, "data.frame")
s1 <- cspade(x, parameter = list(support = #{minSupport}), control = list(verbose = TRUE))
xMTX=as(as(supportingTransactions(s1,x),"ngCMatrix"),"matrix")
yMTX=as.matrix(read.csv("#{yFile}"))

cv = cv.glmnet(xMTX,yMTX,family="binomial",alpha=1.0)
png("#{oPath}/lambda.png")
plot(cv)
supmsg=dev.off()

model = glmnet(xMTX,yMTX,family="binomial",alpha=1.0,lambda=#{lam})
save(model ,file="#{oPath}/model.obj")
write.csv(as.matrix(model$a0),file="#{oPath}/const.txt",quote=FALSE)
write.csv(as.matrix(model$beta),file="#{oPath}/beta.txt",quote=FALSE)
png("#{oPath}/coef.png")
plot(model,"lambda")
supmsg=dev.off()

info=as.data.frame(model$nobs)
colnames(info)=c("nobs")
info$lambda=#{lam}
info$devRatio=model$dev.ratio
info$nulldev=model$nulldev
write.table(info,"#{oPath}/info.txt", quote=F, sep=",", col.names=T,row.names=F, append=F)

write.table((1-model$dev.ratio)*model$nulldev,"#{xxdev}", quote=F, col.names=F,row.names=F)
write.table(#{lam},"#{xxlam}", quote=F, col.names=F,row.names=F)
EOS

  File.open(xxscp,"w"){|fpw| fpw.puts scp}
  # POSIX redirection: "&>" is a bashism and would run R in the background
  # when /bin/sh is not bash.
  system "R --vanilla -q --slave < #{xxscp} >/dev/null 2>&1"

  # the two result files are written last, so they exist only when the whole
  # script succeeded
  dev=Float::MAX
  usedLam=nil
  if File.exist?(xxdev) and File.exist?(xxlam)
    begin
      d=Float(File.read(xxdev).strip)
      l=Float(File.read(xxlam).strip)
      dev,usedLam=d,l
    rescue ArgumentError
      # unusable output: keep the failure values
    end
  end
  return dev,usedLam
end
359
+
360
#################################################################################################
#### Entry point
# Model building mode runs a local search over alphabet indexes:
#   1. start from an initial index
#   2. evaluate every adjacent index with a cross-validated model (in parallel)
#   3. move to the best adjacent index when it lowers the deviance, else stop
#   4. build the final model with the best index and lambda
st=Time.new
args=nil

########################
## predict mode
if ARGV.index("-predict")
  # documented in the help text, but there is no implementation yet
  STDERR.puts "prediction mode is not implemented yet"
########################
#### model building mode
else
  args=MCMD::Margs.new(ARGV,"-noidx,i=,c=,tid=,time=,item=,s=,class=,alpha=,family=,O=,idxSize=,seed=,mp=,T=,-verbose","tid=,item=,time=,c=,s=,class=,i=,O=")

  # restrict mcmd messages to warnings and errors unless -verbose is given
  ENV["KG_VerboseLevel"]="2" unless args.bool("-verbose")
  ENV["KG_ScpVerboseLevel"]="3" unless args.bool("-verbose")

  # work file path (a trailing slash is removed)
  tmpPath=args.str("T=")
  ENV["KG_TmpPath"]=tmpPath.sub(/\/$/,"") if tmpPath

  ifile=args.file("i=","r")
  cfile=args.file("c=","r")
  oPath=args.file("O=","w")

  tid  =args.field("tid="  ,ifile,nil,1,1)["names"].join(",")
  item =args.field("item=" ,ifile,nil,1,1)["names"].join(",")
  klass=args.field("class=",cfile,nil,1,1)["names"].join(",")
  time =args.field("time=" ,ifile,nil,1,1)
  time =time["names"].join(",") if time

  # ---- other parameters
  # NOTE: alpha and family are parsed (and range-checked) here but are not
  # forwarded to the model builders called below.
  alpha     =args.float("alpha=",1.0,0.0,1.0)
  family    =args.str("family=","binomial")
  # minimum support is a ratio (the documented example is s=0.1), so it must
  # not be truncated to an integer
  minSupport=args.float("s=")
  seed      =args.int("seed=")
  idxSize   =args.int("idxSize=",2)
  # passed to meach below; presumably the number of parallel workers -- TODO confirm
  mp        =args.int("mp=",8)
  noidx     =args.bool("-noidx")
  MCMD::mkDir(oPath)

  wf=MCMD::Mtemp.new
  xxifile  =wf.file
  xxyfile  =wf.file
  xxconvTra=wf.file
  xxrsl    =wf.file

  # target variable: sort the class file by tid and keep the class field only
  f=""
  f << "msortf f=#{tid} i=#{cfile} |"
  f << "mcut f=#{klass}:klass o=#{xxyfile}"
  system(f)

  # transaction data: rename the fields to tid/time/item, drop duplicates, sort
  if time
    f=""
    f << "mcut f=#{tid}:tid,#{time}:time,#{item}:item i=#{ifile} |"
    f << "muniq k=tid,time,item |"
    f << "msortf f=tid,time | mfldname -q o=#{xxifile}"
    system(f)
  else
    f=""
    f << "mcut f=#{tid}:tid,#{item}:item i=#{ifile} |"
    f << "muniq k=tid,item |"
    f << "msortf f=tid o=#{xxifile}"
    system(f)
  end

  idxObj=Index.new(xxifile,idxSize,seed)
  bestMSE=Float::MAX   # smallest deviance found so far
  bestLAM=nil
  bestIDX=idxObj.firstIdx(noidx)
  STDERR.puts "#{bestIDX.join("")} initial index"

  loop do
    indexes=idxObj.adjacents(bestIDX,noidx)

    # find the better model in multiple indexes
    # each evaluation stores its result in its own JSON file, which is read
    # back after all evaluations have finished
    (0...indexes.size).to_a.meach(mp){|i|
      convTra(xxifile,idxObj,indexes[i],"#{xxconvTra}_#{i}")
      dev,lam=mkCVmodel("#{xxconvTra}_#{i}",minSupport,xxyfile,seed)
      File.open("#{xxrsl}_#{i}",'w'){|fpw|
        JSON.dump([dev,lam],fpw)
      }
      STDERR.puts "#{indexes[i].join("")} deviance[#{i}]=#{dev}"
    }

    updated=false
    (0...indexes.size).each{|i|
      dev,lam=File.open("#{xxrsl}_#{i}"){|fpr| JSON.load(fpr)}
      if bestMSE>dev
        updated=true
        bestMSE=dev
        bestLAM=lam
        bestIDX=indexes[i]
      end
    }

    # remove the per-candidate work files of this round
    File.delete(*Dir.glob("#{xxrsl}_*"))
    File.delete(*Dir.glob("#{xxconvTra}_*"))

    if updated
      STDERR.puts "#{bestIDX.join("")} improved (deviance=#{bestMSE} lambda=#{bestLAM})"
    else
      STDERR.puts "not improved and finished for exploring"
      break
    end
  end

  if bestLAM
    convTra(xxifile,idxObj,bestIDX,xxconvTra)
    mkModel(xxconvTra,bestLAM,minSupport,xxyfile,oPath)
    idxObj.writeAlphaIndex(bestIDX,"#{oPath}/alphabetIndex.csv")
  else
    STDERR.puts "it could not find any good model"
  end
end

STDERR.puts "elapsed time : #{Time.new-st} seconds"

# end message (args is nil when the arguments were never parsed, i.e. in predict mode)
MCMD::endLog(args.cmdline) if args
484
+