nysol-mining 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,484 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: utf-8
3
+
4
# Release history
#   1.0  2017/01/15  initial release
$version  = "1.0"
# Revision placeholder; ver() reports it as "0" for as long as it still contains "VERSION".
$revision = "###VERSION###"
7
+
8
# Prints the command-line manual to STDERR and terminates the process.
#
# The text documents the options accepted by the argument parser of the model
# building mode (including -noidx, mp= and -verbose) and the files written to O=.
def help
  STDERR.puts <<~EOF
    ----------------------------
    midxmine.rb version #{$version}
    ----------------------------
    description) construct a regression model with optimally indexed itemset sequences
    features) 1) using elastic-net regression (ridge to lasso regression)
              2) exploring the best alphabet-index, which is a mapping function from item to group
              3) enumerating frequent itemset sequences, and using them as input variables for a model
              4) linear regression and logistic regression can be chosen
    usage1) model building mode
      midxmine.rb [-noidx] i= tid= time= item= s= c= class= [family=binomial] [alpha=1.0] [idxSize=2] [seed=] [mp=8] O= [T=] [-verbose] [--help]
    usage2) prediction mode (not implemented yet)
      midxmine.rb -predict i= I= o= [T=] [-verbose] [--help]

    ### model building mode
      # parameters for input data
      i=       : transaction data file (mandatory)
      tid=     : field name for transaction ID in i= file (mandatory)
      time=    : field name for time in i= file (mandatory)
      item=    : field name for item in i= file (mandatory)

      # parameters for class data
      c=       : target variable file (mandatory)
               : this file must have the same field name as tid= in i= file (mandatory)
      class=   : field name for target variable in c= file (mandatory)

      # parameters for itemset sequence enumeration
      s=       : minimum support for enumerating itemset sequences (mandatory)

      # parameters for regression
      family=  : link function for generalized linear regression model
                 "binomial" or "gaussian" can be chosen
      alpha=   : weight of L1 and L2 regularization in elastic-net
                 1.0: lasso regression (L1)
                 0.0: ridge regression (L2)

      # parameters for indexing
      idxSize= : index size (default=2)
      seed=    : random seed for initial index
      -noidx   : do not explore indexes; every item is used as its own group
      mp=      : number of candidate indexes evaluated in parallel (default=8)

      O=       : directory name for output (mandatory)

    ### prediction mode (not implemented yet)

    ### other parameters
      T=       : directory name for temporary files (default=/tmp)
      -verbose : show messages of mcmd
      --help   : show help

    necessary software)
      1) R
      2) glmnet package in R
      3) arulesSequences package in R

    example)
    $ cat zaki.csv
    tid,time,item
    1,10,C
    1,10,D
    1,15,A
    1,15,B
    1,15,C
    1,20,A
    1,20,B
    1,20,F
    1,25,A
    1,25,C
    1,25,D
    1,25,F
    2,15,A
    2,15,B
    2,15,F
    2,20,E
    3,10,A
    3,10,B
    3,10,F
    4,10,D
    4,10,G
    4,10,H
    4,20,B
    4,20,F
    4,25,A
    4,25,G
    4,25,H
    $ cat zaki_c.csv
    tid,class
    1,1
    2,1
    3,0
    4,0

    $ midxmine.rb i=zaki.csv c=zaki_c.csv O=result1 tid=tid item=item time=time class=class idxSize=2 seed=111 s=0.1↩

    $ ls result1
    alphabetIndex.csv
    beta.txt
    coef.png
    const.txt
    info.txt
    lambda.png
    model.obj

    # Copyright(c) NYSOL 2012- All Rights Reserved.
  EOF
  exit
end
116
+
117
# Reports the script version and revision on STDERR, then terminates the process.
# A revision that still contains the "VERSION" placeholder is replaced by "0" first.
def ver
  $revision = "0" if /VERSION/ =~ $revision
  STDERR.puts format("version %s revision %s", $version, $revision)
  exit
end
122
+
123
# Show the manual when called without arguments or with --help as the first argument;
# show the version when the first argument is --version.
help if ARGV.empty? || ARGV[0] == "--help"
ver if ARGV[0] == "--version"
125
+
126
+ require "rubygems"
127
+ require "nysol/mcmd"
128
+ require "json"
129
+
130
# Check that the required R libraries can be run; abort with status 1 at the first one that cannot.
%w[glmnet arulesSequences].each do |pkg|
  exit(1) unless MCMD::chkRexe(pkg)
end
133
+
134
# Alphabet index: assigns every distinct item ("alphabet") of the input file a
# group number in 0...idxSize and enumerates neighbouring assignments for a
# local search.
#
# An index is an Array of Integers; element i is the group of the i-th alphabet.
class Index
  # Number of distinct items found in the input file.
  attr_reader :size

  # Prints the internal state to STDOUT (debugging aid).
  def show
    puts "@ifile=#{@ifile}"
    puts "@idxSize=#{@idxSize}"
    puts "@seed=#{@seed}"
    puts "@alphabets=#{@alphabets}"
  end

  # ifile   : CSV file that has an "item" field
  # idxSize : number of groups items are mapped to
  # seed    : random seed for the initial index; a fresh seed is drawn when nil
  def initialize(ifile, idxSize, seed)
    @ifile = ifile
    @idxSize = idxSize

    # setting up random object
    @seed = seed || Random.new_seed
    @random = Random.new(@seed)
    # indexes already handed out by #adjacents
    @done = []

    # setting alphabet vector and its size
    temp = MCMD::Mtemp.new
    xxitem = temp.file
    system("mcut f=item i=#{ifile} |muniq k=item o=#{xxitem}")
    @alphabets = []
    MCMD::Mcsvin.new("i=#{xxitem}").each { |flds| @alphabets << flds["item"] }
    @size = @alphabets.size
  end

  # Returns the initial index.
  #
  # noidx true : every alphabet gets its own group (0, 1, 2, ...).
  # otherwise  : groups are drawn at random, and drawing is repeated until all
  #              idxSize groups are used at least once.
  #
  # Raises ArgumentError when there are fewer alphabets than groups, because no
  # draw could ever use every group (this used to loop forever).
  def firstIdx(noidx)
    return (0...@alphabets.size).to_a if noidx

    if @alphabets.size < @idxSize
      raise ArgumentError,
            "idxSize (#{@idxSize}) is larger than the number of distinct items (#{@alphabets.size})"
    end

    loop do
      index = Array.new(@alphabets.size) { @random.rand(@idxSize) }
      return index if index.uniq.size >= @idxSize
    end
  end

  # Enumerates the indexes that differ from +index+ in exactly one position by
  # +1 or -1 (staying inside 0...idxSize).
  #
  # Indexes returned by an earlier call are skipped, and every index returned
  # here is remembered so that it is not returned again.
  # With noidx true the only "adjacent" index is +index+ itself.
  def adjacents(index, noidx)
    return [index] if noidx

    adjIndexes = []
    index.each_index do |pos|
      [-1, +1].each do |dir|
        num = index[pos] + dir
        next if num < 0 || num >= @idxSize

        adjIndex = index.dup
        adjIndex[pos] = num
        next if @done.include?(adjIndex)

        @done << adjIndex
        adjIndexes << adjIndex
      end
    end
    adjIndexes
  end

  # write alphabet-index to oFile (CSV with the fields alphabet,index)
  def writeAlphaIndex(index, oFile)
    MCMD::Mcsvout.new("o=#{oFile} f=alphabet,index") do |oCSV|
      @alphabets.each_with_index do |alphabet, i|
        oCSV.write([alphabet, index[i]])
      end
    end
  end
end
233
+
234
# Builds the indexed transaction file used for sequence mining.
#
# ifile       : transaction CSV with the fields tid, time and item
# idxObj      : object answering writeAlphaIndex(index, path)
# index       : alphabet index to apply
# convTraFile : path of the file the command pipeline writes
#
# The alphabet-index is first written to a work file, then joined onto the
# items by a single mcmd pipeline. Returns whatever Kernel#system returns.
def convTra(ifile, idxObj, index, convTraFile)
  workspace = MCMD::Mtemp.new
  mapFile = workspace.file
  idxObj.writeAlphaIndex(index, mapFile)

  stages = [
    "mjoin k=item K=alphabet m=#{mapFile} f=index i=#{ifile}",
    "mcut f=tid,time,index",
    "muniq k=tid,time,index",
    "mtra k=tid,time f=index",
    "mvcount vf=index:size",
    "mcut f=tid,time,size,index -nfno o=#{convTraFile}"
  ]
  system(stages.join(" |"))
end
248
+
249
# Reads a single number that the R script wrote to +path+.
# Returns nil when the file is missing or does not hold a finite number
# (e.g. empty, "NA", "NaN").
def readRNumber(path)
  return nil unless File.exist?(path)
  Float(File.read(path).strip)
rescue ArgumentError
  nil
end

# estimate the best lambda
#  1. enumerate frequent sequences using all data
#  2. construct regression model with the sequences as input variable
#     cross validation is used for getting the best lambda
#  3. return deviance and lambda
#
# convTra    : indexed transaction file read by read_baskets
# minSupport : minimum support handed to cspade
# yFile      : CSV file holding the target variable
# seed       : value for R's set.seed; no seed is set when nil
# family     : glmnet family (default "binomial", the previously hard-coded value)
# alpha      : elastic-net mixing parameter (default 1.0, the previously hard-coded value)
#
# Returns [deviance, lambda]. When R produced no usable result (glmnet fails,
# for instance, if all fields have the same value for all records) the result
# is [Float::MAX, nil].
def mkCVmodel(convTra, minSupport, yFile, seed, family = "binomial", alpha = 1.0)
  temp = MCMD::Mtemp.new
  xxscp = temp.file
  xxdev = temp.file
  xxlam = temp.file

  scp = String.new
  scp << "library(arulesSequences)\n"
  scp << "library(glmnet)\n"
  scp << "\tset.seed(#{seed})\n" if seed
  scp << <<~EOS
    x <- read_baskets(con="#{convTra}", sep=",",info=c("sequenceID","eventID","SIZE"))
    as(x, "data.frame")
    s1 <- cspade(x, parameter = list(support = #{minSupport}), control = list(verbose = TRUE))
    xMTX=as(as(supportingTransactions(s1,x),"ngCMatrix"),"matrix")
    yMTX=as.matrix(read.csv("#{yFile}"))
    model = cv.glmnet(xMTX,yMTX,family="#{family}",alpha=#{alpha})
    mm=which(model$lambda==model$lambda.min)
    write.table(model$cvm[mm] ,"#{xxdev}", quote=F, col.names=F,row.names=F)
    write.table(model$lambda.min,"#{xxlam}", quote=F, col.names=F,row.names=F)
  EOS

  File.write(xxscp, scp)
  # ">/dev/null 2>&1" instead of "&>/dev/null": the latter is a bash extension and a
  # POSIX sh parses it as "run in background", so results would be read too early.
  system "R --vanilla -q --slave < #{xxscp} >/dev/null 2>&1"

  dev = readRNumber(xxdev)
  lam = readRNumber(xxlam)
  return Float::MAX, nil if dev.nil? || lam.nil?
  [dev, lam]
end
303
+
304
# construct a regression model with specified lambda
#
# convTra    : indexed transaction file read by read_baskets
# lam        : lambda used for the final glmnet fit
# minSupport : minimum support handed to cspade
# yFile      : CSV file holding the target variable
# oPath      : existing directory that receives lambda.png, model.obj,
#              const.txt, beta.txt, coef.png and info.txt
# family     : glmnet family (default "binomial", the previously hard-coded value)
# alpha      : elastic-net mixing parameter (default 1.0, the previously hard-coded value)
#
# Returns [Float::MAX, nil]. The R script reports no deviance back to Ruby, so
# this pair is what the method has always returned; it is kept only so that the
# return shape stays the same.
def mkModel(convTra, lam, minSupport, yFile, oPath, family = "binomial", alpha = 1.0)
  temp = MCMD::Mtemp.new
  xxscp = temp.file

  scp = <<~EOS
    library(arulesSequences)
    library(glmnet)
    x <- read_baskets(con="#{convTra}", sep=",",info=c("sequenceID","eventID","SIZE"))
    as(x, "data.frame")
    s1 <- cspade(x, parameter = list(support = #{minSupport}), control = list(verbose = TRUE))
    xMTX=as(as(supportingTransactions(s1,x),"ngCMatrix"),"matrix")
    yMTX=as.matrix(read.csv("#{yFile}"))

    cv = cv.glmnet(xMTX,yMTX,family="#{family}",alpha=#{alpha})
    png("#{oPath}/lambda.png")
    plot(cv)
    supmsg=dev.off()

    model = glmnet(xMTX,yMTX,family="#{family}",alpha=#{alpha},lambda=#{lam})
    save(model ,file="#{oPath}/model.obj")
    write.csv(as.matrix(model$a0),file="#{oPath}/const.txt",quote=FALSE)
    write.csv(as.matrix(model$beta),file="#{oPath}/beta.txt",quote=FALSE)
    png("#{oPath}/coef.png")
    plot(model,"lambda")
    supmsg=dev.off()

    info=as.data.frame(model$nobs)
    colnames(info)=c("nobs")
    info$lambda=#{lam}
    info$devRatio=model$dev.ratio
    info$nulldev=model$nulldev
    write.table(info,"#{oPath}/info.txt", quote=F, sep=",", col.names=T,row.names=F, append=F)
  EOS

  File.write(xxscp, scp)
  # ">/dev/null 2>&1" instead of "&>/dev/null": the latter is a bash extension and a
  # POSIX sh parses it as "run in background", so the method could return before R ends.
  system "R --vanilla -q --slave < #{xxscp} >/dev/null 2>&1"

  return Float::MAX, nil
end
359
+
360
#################################################################################################
#### Entry point
st = Time.new

########################
## predict mode
# Not implemented. Stop with a clear message instead of falling through to the
# end of the script, where the argument object of the model building mode is needed.
if ARGV.index("-predict")
  STDERR.puts "prediction mode (-predict) is not implemented yet"
  exit(1)
end

########################
#### model building mode
args = MCMD::Margs.new(
  ARGV,
  "-noidx,i=,c=,tid=,time=,item=,s=,class=,alpha=,family=,O=,idxSize=,seed=,mp=,T=,-verbose",
  "tid=,item=,time=,c=,s=,class=,i=,O="
)

# unless -verbose is given, mcmd reports warnings and errors only
ENV["KG_VerboseLevel"] = "2" unless args.bool("-verbose")
ENV["KG_ScpVerboseLevel"] = "3" unless args.bool("-verbose")

# path for work files
tmpPath = args.str("T=")
ENV["KG_TmpPath"] = tmpPath.sub(/\/$/, "") if tmpPath

ifile = args.file("i=", "r")
cfile = args.file("c=", "r")
oPath = args.file("O=", "w")

tid   = args.field("tid=",   ifile, nil, 1, 1)["names"].join(",")
item  = args.field("item=",  ifile, nil, 1, 1)["names"].join(",")
klass = args.field("class=", cfile, nil, 1, 1)["names"].join(",")
time  = args.field("time=",  ifile, nil, 1, 1)
time  = time["names"].join(",") if time

# ---- other parameters
# NOTE(review): alpha and family are parsed here but never passed on to
# mkCVmodel/mkModel below, so they currently have no effect on the model.
alpha   = args.float("alpha=", 1.0, 0.0, 1.0)
family  = args.str("family=", "binomial")
# the minimum support is a fraction (e.g. s=0.1), so it must not be parsed as an integer
minSupport = args.float("s=")
seed    = args.int("seed=")
idxSize = args.int("idxSize=", 2)
mp      = args.int("mp=", 8)
noidx   = args.bool("-noidx")
MCMD::mkDir(oPath)

wf = MCMD::Mtemp.new
xxifile   = wf.file
xxyfile   = wf.file
xxconvTra = wf.file
xxrsl     = wf.file

# target variable, ordered by transaction ID
f = ""
f << "msortf f=#{tid} i=#{cfile} |"
f << "mcut f=#{klass}:klass o=#{xxyfile}"
system(f)

# transaction data with normalized field names (tid, time, item)
if time
  f = ""
  f << "mcut f=#{tid}:tid,#{time}:time,#{item}:item i=#{ifile} |"
  f << "muniq k=tid,time,item |"
  f << "msortf f=tid,time | mfldname -q o=#{xxifile}"
  system(f)
else
  f = ""
  f << "mcut f=#{tid}:tid,#{item}:item i=#{ifile} |"
  f << "muniq k=tid,item |"
  f << "msortf f=tid o=#{xxifile}"
  system(f)
end

idxObj  = Index.new(xxifile, idxSize, seed)
bestMSE = Float::MAX
bestLAM = nil
bestIDX = idxObj.firstIdx(noidx)
STDERR.puts "#{bestIDX.join("")} initial index"

# local search: move to the best adjacent index until no candidate improves the deviance
loop do
  indexes = idxObj.adjacents(bestIDX, noidx)

  # find the better model in multiple indexes
  # each candidate hands its result back through a JSON file
  (0...indexes.size).to_a.meach(mp) { |i|
    convTra(xxifile, idxObj, indexes[i], "#{xxconvTra}_#{i}")
    dev, lam = mkCVmodel("#{xxconvTra}_#{i}", minSupport, xxyfile, seed)
    File.open("#{xxrsl}_#{i}", 'w') { |fpw|
      JSON.dump([dev, lam], fpw)
    }
    STDERR.puts "#{indexes[i].join("")} deviance[#{i}]=#{dev}"
  }

  updated = false
  indexes.each_index do |i|
    rslFile = "#{xxrsl}_#{i}"
    traFile = "#{xxconvTra}_#{i}"

    # a candidate that left no result is treated as "no model" instead of aborting the search
    if File.exist?(rslFile)
      dev, lam = File.open(rslFile) { |fpr| JSON.load(fpr) }
      if dev && bestMSE > dev
        updated = true
        bestMSE = dev
        bestLAM = lam
        bestIDX = indexes[i]
      end
      File.delete(rslFile)
    end
    # per-candidate work files are not removed by anything else in this script
    File.delete(traFile) if File.exist?(traFile)
  end

  if updated
    STDERR.puts "#{bestIDX.join("")} improved (deviance=#{bestMSE} lambda=#{bestLAM})"
  else
    STDERR.puts "not improved and finished for exploring"
    break
  end
end

if bestLAM
  convTra(xxifile, idxObj, bestIDX, xxconvTra)
  mkModel(xxconvTra, bestLAM, minSupport, xxyfile, oPath)
  idxObj.writeAlphaIndex(bestIDX, "#{oPath}/alphabetIndex.csv")
else
  STDERR.puts "it could not find any good model"
end

STDERR.puts "elapsed time : #{Time.new-st} seconds"

# end message
MCMD::endLog(args.cmdline)
484
+