nysol-mining 3.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,340 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: utf-8
3
+
4
+ # 1.0 initial development: 2015/10/20
5
+ # 1.1 minor modifications: 2015/12/09
6
+ $version="1.1"
7
+ $revision="###VERSION###"
8
+ CMD="mgfeatures.rb"
9
+
10
# Print the usage text of this command to STDERR and terminate the process.
# The text interpolates the file-level constant CMD and the global $version.
# The option list below mirrors the keywords the entry point of this script
# reads (the feature parameters, -directed, mp=, T= and -verbose).
def help
STDERR.puts <<EOF
----------------------------
#{CMD} version #{$version}
----------------------------
summary) calculation graph features by igraph
feature) output the following graph features
node_size : number of nodes
edge_size : number of edges
degree0_node_size : number of nodes with 0 degree
mean_degree : mean of degree
median_degree : median of degree
min_degree : min of degree
max_degree : max of degree
graph_density : graph density
transitivity : so called clustering coefficient
average_shortest_path : mean of shortest path length for all pairs of nodes
diameter : max of shortest path length for all pairs of nodes

format) #{CMD} I=|(ei= [ni=]) ef= [nf=] O= [-directed] [diameter=] [graph_density=] [average_shortest_path=] [mp=] [T=] [-verbose] [--help]
I= : path name of input files
: file extension of edge file must be ".edge" in this path
: file extension of node file must be ".node" in this path
ei= : input file name of edge (cannot be specified with I=)
ef= : field name of edge (two nodes)
ni= : input file name of nodes (cannot be specified with I=)
: if omitted, only edge file is used
nf= : field name of node
-directed : assume a directed graph
O= : output path

## parameter for each feature (see igraph manual in detail)
diameter=unconnected=[TRUE|FALSE],directed=[TRUE|FALSE]
graph_density=loops=[FALSE|TRUE]
average_shortest_path=unconnected=[TRUE|FALSE],directed=[TRUE|FALSE]

## others
mp= : Number of processes for parallel processing (default:4)
T= : working directory (default:/tmp)
-verbose : show the END messages of MCMD
--help : show help

required software)
1) R
2) igraph package for R

example)
$ cat data/dat1.edge
v1,v2
E,J
E,A
J,D
J,A
J,H
D,H
D,F
H,F
A,F
B,H
$ cat data/dat1.node
v
A
B
C
D
E
F
G
H
I
J
$ #{CMD} I=./data O=result1 ef=v1,v2 nf=v
#MSG# converting graph files into a pair of numbered nodes ...; 2015/10/20 14:57:26
#END# ../bin/mgfeatures.rb I=./data O=result1 ef=v1,v2 nf=v; 2015/10/20 14:57:27
$ cat result1/dat1.csv
id,node_size,edge_size,degree0_node_size,mean_degree,median_degree,min_degree,max_degree,graph_density,transitivity,average_shortest_path,diameter
dat1,10,10,3,2,2.5,0,4,0.222222222222222,0.409090909090909,1.61904761904762,3

# without specifying nf= (node file isn't used)
$ #{CMD} I=./data O=result1 ef=v1,v2
#MSG# converting graph files into a pair of numbered nodes ...; 2015/10/20 14:57:26
#END# ../bin/mgfeatures.rb I=./data O=result1 ef=v1,v2; 2015/10/20 14:57:27
$ cat result1/dat1.csv
id,node_size,edge_size,degree0_node_size,mean_degree,median_degree,min_degree,max_degree,graph_density,transitivity,average_shortest_path,diameter
dat1,7,10,0,2.85714285714286,3,1,4,0.476190476190476,0.409090909090909,1.61904761904762,3

# Copyright(c) NYSOL 2012- All Rights Reserved.
EOF
exit
end
102
+
103
# Report the script version and revision on STDERR, then exit.
# When $revision still contains the text "VERSION" (the unreplaced
# placeholder), it is overwritten with "0" before printing.
def ver()
  placeholder = /VERSION/
  $revision = "0" if placeholder =~ $revision
  STDERR.puts "version #{$version} revision #{$revision}"
  exit
end
108
+
109
+ help() if ARGV[0]=="--help" or ARGV.size <= 0
110
+ ver() if ARGV[0]=="--version"
111
+
112
+ require "rubygems"
113
+ require "nysol/mcmd"
114
+
115
+ # confirm if R library is installed
116
+ exit(1) unless(MCMD::chkRexe("igraph"))
117
+
118
+ ####
119
+ # converting original graph file with text to one with integer
120
+ # output #{numFile} and #{mapFile}, then return the number of nodes of the graph
121
+ #
122
+ # ei ni xxnum xxmap
123
+ # v1,v2 v node%1,flag%0,num
124
+ # E,J A 0 3 A,0,0
125
+ # E,A B 0 4 B,0,1
126
+ # J,D C 0 6 D,0,2
127
+ # J,A D => 1 5 E,0,3
128
+ # J,H E 2 4 F,0,4
129
+ # D,H F 2 5 H,0,5
130
+ # D,F G 2 6 J,0,6
131
+ # H,F H 3 6 C,1,7
132
+ # A,F I 4 5 G,1,8
133
+ # B,H J 5 6 I,1,9
134
+ #
135
+ # return value is 10 (nodes)
136
+ # "flag" on xxmap: 0:nodes in "ei", 1:nodes only in "ni".
137
# Convert a graph whose nodes are text labels into numbered form.
#
# ni, nf   : node file and its node field; the node file is read only when nf is given
# ei       : edge file; ef1 and ef2 are its two node fields
# numFile  : output, one "num1 num2" pair (space separated) per edge
# mapFile  : output, node,flag,num table numbered from 0
#            (flag 0: node taken from ei, flag 1: node taken from ni)
# Returns the record count of mapFile as reported by MCMD::mrecount.
def g2pair(ni,nf,ei,ef1,ef2,numFile,mapFile)
  work = MCMD::Mtemp.new
  headNodes, tailNodes, listedNodes = Array.new(3) { work.file }

  # one single-column node list per source, tagged with its flag
  sources = [[ef1, ei, 0, headNodes], [ef2, ei, 0, tailNodes]]
  sources << [nf, ni, 1, listedNodes] if nf
  sources.each do |field, input, flag, out|
    system "mcut f=#{field}:node i=#{input} | msetstr v=#{flag} a=flag o=#{out}"
  end

  # merge the lists into one row per node
  merge =
    if nf
      ["mcat i=#{headNodes},#{tailNodes},#{listedNodes} f=node,flag |",
       "mbest k=node s=flag from=0 size=1 |"]
    else
      ["mcat i=#{headNodes},#{tailNodes} f=node,flag |",
       "muniq k=node |"]
    end
  # isolated nodes are set to the end of position in mapping file.
  # S= must start from 0 (but inside R vertex number will be added one)
  merge << "mnumber s=flag,node a=num S=0 o=#{mapFile}"
  system(merge.join)

  # replace both edge end points by their numbers and write the pair list
  system([
    "mcut f=#{ef1},#{ef2} i=#{ei} |",
    "msortf f=#{ef1} |",
    "mjoin k=#{ef1} K=node m=#{mapFile} f=num:num1 |",
    "msortf f=#{ef2} |",
    "mjoin k=#{ef2} K=node m=#{mapFile} f=num:num2 |",
    "mcut f=num1,num2 |",
    "msortf f=num1%n,num2%n -nfno | tr ',' ' ' >#{numFile}"
  ].join)

  return MCMD::mrecount("i=#{mapFile}")
end
176
+
177
+ ####
178
+ # generating the R script for graph features
179
+ # pars: parameters for each graph feature
180
# Write the R script that computes the graph features with igraph.
#
# directed : truthy for a directed graph (passed to read.graph as TRUE/FALSE)
# pars     : hash of extra argument text per feature ("graph_density",
#            "average_shortest_path", "diameter"); each value is pasted right
#            after the graph argument, a missing key pastes nothing
# eFile    : edge list file read by R
# nodeSize : number of vertices given to read.graph
# oFile    : csv file the R script writes
# scpFile  : path the generated script is written to
def genRscript(directed,pars,eFile,nodeSize,oFile,scpFile)
  dir = directed ? "TRUE" : "FALSE"

  File.write(scpFile, <<EOF)
library(igraph)
## reading edge file
g=read.graph("#{eFile}",format="edgelist",directed=#{dir},n=#{nodeSize})

####
deg=degree(g)
node_size=vcount(g)
edge_size=ecount(g)
mean_degree=mean(deg)
median_degree=median(deg)
min_degree=min(deg)
max_degree=max(deg)
degree0_node_size=length(deg[deg==0])
graph_density=graph.density(g #{pars["graph_density"]})
average_shortest_path=average.path.length(g #{pars["average_shortest_path"]})

#### diameter
diameter=diameter(g #{pars["diameter"]})
transitivity=transitivity(g)

dat=data.frame(
node_size=node_size,
edge_size=edge_size,
degree0_node_size=degree0_node_size,
mean_degree=mean_degree,
median_degree=median_degree,
min_degree=min_degree,
max_degree=max_degree,
graph_density=graph_density,
transitivity=transitivity,
average_shortest_path=average_shortest_path,
diameter=diameter
)
write.csv(dat,file="#{oFile}",quote=FALSE,row.names=FALSE)
EOF
end
225
+
226
+
227
+ #################################################################################################
228
+ #### Entry point
229
+
230
# Accepted keywords. "T=" and "average_shortest_path=" are listed because both
# are read below with args.str under exactly these names.
args=MCMD::Margs.new(ARGV,"I=,ei=,ef=,ni=,nf=,o=,O=,-directed,diameter=,graph_density=,average_shortest_path=,-verbose,mp=,T=","ef=,O=")

# suppress the end message of MCMD
ENV["KG_VerboseLevel"]="2" unless args.bool("-verbose")

# work file path (a trailing slash is dropped)
workPath = args.str("T=")
ENV["KG_TmpPath"] = workPath.sub(/\/$/,"") if workPath

# setting variables for edge file(s) and its field name
iPath = args.file("I=","r")
oPath = args.file("O=","w")

edgeFiles=nil
if iPath
  edgeFiles = Dir["#{iPath}/*.edge"]
  if edgeFiles.empty?
    raise "#ERROR# no edge file is found matching with #{iPath}/*.edge"
  end
else
  # check for nil before splitting, so that a missing ei= is reported
  # with the intended message
  eiFiles = args.file("ei=","r")
  raise "#ERROR# ei= or I= is mandatory" unless eiFiles
  edgeFiles = eiFiles.split # edge file name
end
ef = args.field("ef=", edgeFiles[0])
ef1,ef2=ef["names"]

# setting variables for node file(s) and its field name.
# if nf= is not specified, only edge files are used for generating a graph.
ni=nil
nf=nil
if iPath
  nodeFile0=edgeFiles[0].sub(/\.edge$/,".node")
  if File.exist?(nodeFile0)
    nf = args.field("nf=", nodeFile0)
    nf = nf["names"][0] if nf
  elsif args.str("nf=")
    raise "#ERROR# nf= is specified, but no node file is found matching with #{iPath}/*.node"
  end
else
  ni = args.file("ni=","r") # node file name
  if ni
    nf = args.field("nf=", ni)
    raise "#ERROR# nf= is mandatory, when ni= is specified" unless nf
    nf=nf["names"][0]
  end
end

directed=args.bool("-directed")
MP=args.int("mp=",4)

# extra argument text pasted into the corresponding igraph call
pars={}
["diameter","graph_density","average_shortest_path"].each{|feature|
  par=args.str("#{feature}=")
  pars[feature]=",#{par}" if par
}

MCMD::mkDir(oPath)

edgeFiles.meach(MP){|edgeFile|
  baseName=edgeFile.sub(/\.edge$/,"")
  name=baseName.sub(/^.*\//,"")

  # with ni= the given node file is used; with I= the node file is the
  # sibling of the edge file (same name, ".node" extension)
  nodeFile = ni || edgeFile.sub(/\.edge$/,".node")

  # convert the original graph to one igraph can handle
  wf=MCMD::Mtemp.new
  xxnum=wf.file
  xxmap=wf.file
  xxout=wf.file
  xxscp=wf.file
  nodeSize=g2pair(nodeFile,nf,edgeFile,ef1,ef2,xxnum,xxmap)

  # generate R script, and run
  genRscript(directed,pars,xxnum, nodeSize, xxout, xxscp)
  if args.bool("-verbose")
    system "R --vanilla -q < #{xxscp}"
  else
    system "R --vanilla -q --slave < #{xxscp} 2>/dev/null "
  end

  # store the result
  system "msetstr v=#{name} a=id i=#{xxout} | mcut -x f=0L,0-1L o=#{oPath}/#{name}.csv"
}

# end message
MCMD::endLog(args.cmdline)
340
+
@@ -0,0 +1,843 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: utf-8
3
+
4
+ # 1.1 fix the bug about dimension mismatch: 2014/09/03
5
+ # 1.2 add -nocv mode, use JSON: 2015/01/11
6
+ # 1.3 add -z option, add original strings for xvar in coeff.csv : 2015/03/02
7
+ # 1.4 bug fix about NaN problem in specifying -z option : 2015/03/02
8
+ # 1.5 bug fix for key mismatching problem between x and y files : 2015/03/03
9
+ # 1.6 bug fix for prediction in logistic regression : 2015/03/27
10
+ $version="1.6"
11
+ $revision="###VERSION###"
12
+
13
# Print the usage text (written in Japanese) to STDERR and terminate the process.
# The text interpolates the global $version.
# The two model-building usages are labelled a) and b), the sparse usage shows
# the -sparse flag it requires, and the verbosity flag is written as -verbose,
# which is the keyword both argument lists of this script accept.
def help
STDERR.puts <<EOF
----------------------------
mglmnet.rb version #{$version}
----------------------------
概要) Rのglmnetパッケージを利用した正則化法による回帰モデルの構築
特徴) 1) リッジ回帰、LASSO、elastic-netの正則化を指定可能
2) 入力データは、サンプルID,featureID,値の3項目CSVデータ
3) 交差検証(CV)によりlambdaを決定することで最適モデルを構築可
4) 線形回帰、ロジスティック回帰、ポアッソン回帰など指定可能
5) Rスクリプトを書く必要はない
用法1) モデル構築モード
a)スパースマトリックスによる入力ファイル指定の場合
mglmnet.rb -sparse [family=] [alpha=] i= k= x= [v=] c= y= exposure= O= [-z] [seed=] [-nocv] [param=] [T=] [-verbose] [--help]
b)マトリックスによる入力ファイル指定の場合
mglmnet.rb [family=] [alpha=] i= x= y= exposure= O= [-z] [seed=] [-nocv] [param=] [T=] [-verbose] [--help]
用法2) 予測モード
mglmnet.rb -predict [lambda=] i= I= o= [param=] [T=] [-verbose] [--help]

### モデル構築モード

## 入力ファイルの指定は、スパースマトリックスによる方法とマトリックスによる方法の2通りある。
## (k=が指定されていれば、スパースマトリックスとみなされる)
# a) スパースマトリックスによる入力ファイルの指定
-sparse : このオプションが指定されて初めてスパースマトリックスモードとみなされる【必須】
i= : スパースマトリックス型入力データファイル名【必須】
k= : 1つのサンプルを表す項目名【必須】
x= : 説明変数を表す項目名【必須】
v= : 説明変数の値項目名【オプション】
: 指定しなければ、全行1(すなわちダミー変数)となる。
c= : 目的変数データファイル名【選択必須】
y= : 目的変数の項目名【必須】
: family=で指定した内容により以下に示す値である必要がある。
: gaussian: 実数
: poisson: 正の整数
: binomial: 2つのクラス値(文字列でも可)
: multinomial: 複数クラス値(文字列でも可)

# b) マトリックスによる入力ファイルの指定
i= : マトリックス型入力データファイル名【必須】
x= : 説明変数項目名リスト【必須】
y= : 目的変数項目名【選択必須】

O= : 出力ディレクトリ名【必須】

## モデル構築関連
family= : リンク関数【デフォルト:"gaussian"】
: gaussian: 線形回帰
: poisson: ポアソン回帰
: binomial: ロジスティック回帰
: multinomial: 多項ロジスティック回帰
alpha= : elastic-netにおけるL1とL2正則化項の荷重【デフォルト:1.0】
: 1.0でL1正則化、0でL2正則化(リッジ回帰)、0<alpha<1でelastic-net
seed= : 乱数の種(0以上の整数,交差検証に影響)【オプション:default=-1(時間依存)】
-z : 内部で説明変数を標準化する。
: スケールの異なる変数の係数を比較したい場合に利用する。
: -zをつけて作成されたモデルで予測することには意味がないことに注意する。
-nocv : 交差検証をしない *注)

### 予測モード(-predictを指定することで予測モードとして動作する)
I= : モデル構築モードでの出力先ディレクトリパス【必須】
: 利用するファイルは以下のとおり。
: map_var2vno.csv: データの変換に利用
: model.robj: 回帰モデルRオブジェクト
lambda= : 正則化項の重み【必須:複数指定可】
: 0以上の実数値を与える以外に、以下の2つは特殊な意味を持つシンボルとして指定できる
: min: CVにおけるエラー最小モデルに対応するlambda
: 1se: lambda.min+1*standard errorのモデルに対応するlambda
o= : 予測結果ファイル名
: key,目的変数予測値...
: lambda=で指定した各lambdaに対応する予測値全てを出力する
i= : 予測対象入力ファイル名
: フォーマットと項目名は、モデル構築モードで利用したものに完全に一致しなければならない。

## その他
T= : 作業ディレクトリ【デフォルト:"/tmp"】
-verbose : 内部のMCMDのコマンドメッセージを表示
--help : ヘルプの表示


注) 交差検証(CV)をしてもしなくても、複数のlambda値に対する回帰係数の推定は行われる。
CVをすることで、lambda別に構築される回帰モデルの予測エラーを推定する。
そして、エラー最小化という意味における最適なlambdaを得ることが可能となる。
よって-nocvを指定した場合、CVによる予測エラーの推定を行わないため、
予測モードにおいてlambda="min,1se"は指定できない。

必要なソフトウェア)
1) R
2) Rのglmnetパッケージ

入力データ)
例:
key,var,val
1,a,1
1,c,2
1,e,1
2,c,2
2,d,1
3,a,2
3,e,3
3,d,6


モデル構築モードでの出力データ)
1) model.robj : 回帰モデルのRオブジェクト
2) model_info.csv : 回帰モデルに関する各種情報
3) coef.csv : lambda別係数一覧
4) coef.png : lambda別係数チャート
5) lambda_stats.csv : lambda別の各種情報(deviance,係数が非0のfeature数、推定誤差など)
6) lambda_error.png : lambda別エラーチャート
7) map_var2vno.csv : i=のx=項目とR内部のfeature番号の対応表
8) scp.R : 実行されたRスクリプト
注: 6)は-nocvを指定時には出力されない
注: 2)と5)は-nocvを指定時には一部出力されない

予測モードでの出力データ例)
predict.csv
key_num,id,lambda_1se,lambda_0.01,lambda_min
1,20070701_5604,724.004406058175,743.068998688436,742.831291756625
2,20070701_5605,832.022338347663,959.170798180323,957.54590188041
3,20070701_5606,978.945506261202,1012.07069692832,1011.86134342746
4,20070701_5607,866.708820321008,786.158661840733,787.19246417122

例)
# モデル構築
$ mglmnet.rb c=yaki_tanka.csv i=yaki_features.csv O=result1 k=id x=商品名 y=tanka -sparse

# 上のモデル構築で使ったデータを使って予測
$ mglmnet.rb -predict i=yaki_features.csv k=id x=商品名 I=result1 lambda=1se,0.01,min o=result1/predict.csv

# Copyright(c) NYSOL 2012- All Rights Reserved.
EOF
exit
end
148
+
149
# Show the script version and revision on STDERR and exit.
# A $revision that still holds the unreplaced placeholder (it contains
# "VERSION") is reset to "0" first.
def ver()
  unreplaced = /VERSION/ =~ $revision
  $revision = "0" if unreplaced
  STDERR.puts "version #{$version} revision #{$revision}"
  exit
end
154
+
155
+ help() if ARGV[0]=="--help" or ARGV.size <= 0
156
+ ver() if ARGV[0]=="--version"
157
+
158
+ require "rubygems"
159
+ require "nysol/mcmd"
160
+ require "json"
161
+
162
+ # Rライブラリ実行可能確認
163
+ exit(1) unless(MCMD::chkRexe("glmnet"))
164
+
165
+ # separating the input file (ifile) into three following files
166
+ # 1) independent variables (x=) => xxvar
167
+ # 2) objective variable (y=) only when model building mode =>xxy
168
+ # 3) exposure variable if it's specified (exposure=) =>xxexposure
169
# Split a matrix-type input file into the work files used by the R script.
#
# ifile      : input csv file
# key        : key field; its distinct values are numbered from 1 into xxkey2num
# var        : explanatory field list, cut into xxvar
# exposure   : exposure field, cut into xxexposure (skipped when nil)
# yVar, xxy  : objective field and its output file (skipped when xxy is nil)
def matrix(ifile,key,var,xxvar,exposure,xxexposure,xxkey2num,yVar=nil,xxy=nil)
  MCMD::msgLog("#{File.basename($0)}: cleaning a matrix data `#{ifile}' ...")

  # every plain column extraction has the same shape
  extract = lambda { |fields, out| system("mcut f=#{fields} i=#{ifile} o=#{out}") }

  extract.call(var, xxvar)

  system([
    "mcut f=#{key}:key i=#{ifile} |",
    "msortf f=key |",
    "muniq k=key |",
    "mnumber s=key a=key_num S=1 o=#{xxkey2num}"
  ].join)

  extract.call(yVar, xxy) if xxy
  extract.call(exposure, xxexposure) if exposure
end
194
+
195
+ # traデータからsparseMatrix用データを作成する
196
+ # 1) var2numがnullならば、ifileからvar2numを生成。指定されていれば、そのファイルを使って変換
197
+ # 2) key2num変換表を出力(key2num)
198
+ # 3) key,varがnumに変換されたデータを出力(row,col,val)
199
# Build the row / column / value lists of a sparse matrix from transaction data.
#
# ifile       : input file with key, var (and optionally val) fields
# val         : value field; when nil every record gets the value 1
# cFile       : when given, only records whose key also appears in cFile are kept
# predictMode : when truthy the existing var2num table is used as is,
#               otherwise var2num is generated from the data
# var2num     : var -> vno table (numbered from 1)
# key2num     : output, key -> key_num table (numbered from 1)
# xxrow, xxcol, xxval : outputs, the key_num / vno / val columns
# Returns the record count of key2num; raises when no record is left.
def smatrix(ifile,key,var,val,cFile,predictMode,var2num,key2num,xxrow,xxcol,xxval)
  prog = File.basename($0)
  MCMD::msgLog("#{prog}: cleaning a sparse matrix data `#{ifile}' ...")

  work   = MCMD::Mtemp.new
  xxbase = work.file
  xxa    = work.file

  # 1) set the value to "1" when v= is not specified.
  # 2) make unique by key and var
  stages =
    if val
      ["mcut f=#{key}:key,#{var}:var,#{val}:val i=#{ifile} |"]
    else
      ["mcut f=#{key}:key,#{var}:var i=#{ifile} |", "msetstr v=1 a=val |"]
    end
  stages << "mcommon k=key K=#{key} m=#{cFile} |" if cFile
  stages << "msortf f=key,var |" << "muniq k=key,var o=#{xxbase}"
  system(stages.join)

  if MCMD::mrecount("i=#{xxbase}") == 0
    raise "#ERROR# common records between x and y files are not found"
  end

  unless predictMode
    MCMD::msgLog("#{prog}: creating a mapping table of variables and their number...")
    system([
      "mcut f=var i=#{xxbase} |",
      "msortf f=var |",
      "muniq k=var |",
      "mnumber s=var a=vno S=1 o=#{var2num}"
    ].join)
  end

  MCMD::msgLog("#{prog}: creating a mapping table of key and its number...")
  system([
    "mcut f=key i=#{xxbase} |",
    "msortf f=key |",
    "muniq k=key |",
    "mnumber s=key a=key_num S=1 o=#{key2num}"
  ].join)

  MCMD::msgLog("#{prog}: creating index list and value list for initializing a sparse matrix...")
  system([
    "mcut f=key,var,val i=#{xxbase} |",
    "mjoin k=key m=#{key2num} f=key_num |",
    "msortf f=var |",
    "mjoin k=var m=#{var2num} f=vno o=#{xxa}"
  ].join)

  # one single-column file per sparse matrix component
  [["key_num", xxrow], ["vno", xxcol], ["val", xxval]].each do |field, out|
    system "mcut f=#{field} i=#{xxa} o=#{out}"
  end

  return MCMD::mrecount("i=#{key2num}")
end
264
+
265
# Write the objective variable file: key_num,y records sorted numerically by
# key_num, where key_num is looked up from xxkey2num by the key field of cfile.
def mkYvar(cfile,key,yVar,xxkey2num,xxy)
  system([
    "mcut f=#{key}:key,#{yVar}:y i=#{cfile} |",
    "msortf f=key |",
    "mjoin k=key m=#{xxkey2num} f=key_num |",
    "mcut f=key_num,y |",
    "msortf f=key_num%n o=#{xxy}"
  ].join)
end
274
+
275
# Write the exposure column, ordered numerically by key_num, to xxexposure.
# Nothing is done (and nil is returned) when xxexposure is nil.
def mkExposure(cfile,key,exposure,xxkey2num,xxexposure)
  return unless xxexposure

  system([
    "mcut f=#{key}:key,#{exposure}:exposure i=#{cfile} |",
    "msortf f=key |",
    "mjoin k=key m=#{xxkey2num} f=key_num |",
    "mcut f=key_num,exposure |",
    "msortf f=key_num%n |",
    "mcut f=exposure o=#{xxexposure}"
  ].join)
end
287
+
288
+
289
+ #################################################################################################
290
+ #### generate R scripts
291
+
292
+ #######################
293
+ ## pre-process
294
# Opening lines of every generated R script: load glmnet and, when a seed
# is given, fix the random seed.
def scpHeader(seed=nil)
  header = ["library(glmnet)\n"]
  header << "set.seed(#{seed})\n" if seed
  header.join
end
300
+
301
+ #######################
302
+ ## prediction script using an existing model (sparse matrix version)
303
+ ## xxmodel: model file (model.robj)
304
+ ## rowSize: the number of variables on dataset
305
+ ## lmdVar: lambda values. It make prediction using the model with given each lambda value
306
+ ## exposureFile: exposure file (nil unless poisson regression)
307
+ ## xxoFile: output file name
308
# R script fragment that predicts with a saved model (sparse matrix input).
#
# xxmodel      : model file loaded with load()
# rowSize      : first element of the dims given to sparseMatrix
# lmdVar       : text placed inside c(...) as the lambda argument of predict
# exposureFile : csv file whose log is passed as offset (nil for none)
# xxoFile      : csv file the predictions are written to
# nocv         : truthy when the model is a plain glmnet fit; the column count
#                is then read from model$beta instead of model$glmnet.fit$beta
def scpPrdSMX(xxmodel,rowSize,lmdVar,exposureFile,xxoFile,nocv)
  bar = "#####################################\n"
  fitRef = nocv ? 'model' : 'model$glmnet.fit'

  prediction =
    if exposureFile
      ["exposureMTX= as.matrix(read.csv(\"#{exposureFile}\"))\n",
       "prd = predict(model,xMTX,c(#{lmdVar}),offset=log(exposureMTX))\n"]
    else
      ["prd = predict(model, xMTX,c(#{lmdVar}))\n"]
    end

  [
    "library(glmnet)\n",
    bar,
    "## loading the model and make prediction\n",
    "load(\"#{xxmodel}\")\n",
    "dims= c(#{rowSize},#{fitRef}$beta@Dim[1])\n",
    "xMTX=sparseMatrix(i=row,j=col,x=val,dims=dims)\n",
    bar,
    "## predict and output the result\n",
    *prediction,
    "write.csv(prd,file=\"#{xxoFile}\",quote=FALSE)\n"
  ].join
end
332
+
333
+ #######################
334
+ ## prediction script using an existing model (matrix version)
335
+ ## xxmodel: model file (model.robj)
336
+ ## lmdVar: lambda values. It make prediction using the model with given each lambda value
337
+ ## exposureFile: exposure file (nil unless poisson regression)
338
+ ## xxoFile: output file name
339
# R script fragment that predicts with a saved model (dense matrix input).
#
# xxmodel      : model file loaded with load()
# lmdVar       : text placed inside c(...) as the lambda argument of predict
# exposureFile : csv file whose log is passed as offset (nil for none)
# xxoFile      : csv file the predictions are written to
def scpPrdMTX(xxmodel,lmdVar,exposureFile,xxoFile)
  bar = "#####################################\n"

  script = "library(glmnet)\n" + bar +
           "## loading the model and make prediction\n" +
           "load(\"#{xxmodel}\")\n" + bar +
           "## setting sparseMatrix from csv data\n" + bar +
           "## predict and output the result\n"

  offset = ""
  if exposureFile
    script += "exposureMTX= as.matrix(read.csv(\"#{exposureFile}\"))\n"
    offset = ",offset=log(exposureMTX)"
  end
  script += "prd = predict(model,xMTX,c(#{lmdVar})#{offset})\n"
  script += "write.csv(prd,file=\"#{xxoFile}\",quote=FALSE)\n"

  script
end
359
+
360
# R script fragment that reads the row / column / value files and builds the
# sparse matrix xMTX; when yFile is given it also reads the objective
# variable into the vector yMTX.
def scpInpSMX(rowFile,colFile,valFile,yFile=nil)
  bar = "#####################################\n"

  lines = [
    bar,
    "## loading csv data and setting sparseMatrix\n",
    "row = read.csv(\"#{rowFile}\")$key_num\n",
    "col = read.csv(\"#{colFile}\")$vno\n",
    "val = read.csv(\"#{valFile}\")$val\n",
    "xMTX=sparseMatrix(i=row,j=col,x=val)\n"
  ]
  if yFile
    lines.push(
      bar,
      "## setting csv data into a vector of objective valiable\n",
      "yMTX= read.csv(\"#{yFile}\",header=T)\n",
      "yMTX = yMTX$y # as a vector\n"
    )
  end

  lines.join
end
377
+
378
# R script fragment that reads xFile into the matrix xMTX and, when yFile is
# given, yFile into the matrix yMTX.
def scpInpMTX(xFile,yFile=nil)
  lines = [
    "#####################################\n",
    "## loading csv data setting matrix\n",
    "xMTX= as.matrix(read.csv(\"#{xFile}\"))\n"
  ]
  lines << "yMTX= as.matrix(read.csv(\"#{yFile}\"))\n" if yFile
  lines.join
end
389
+
390
# R script fragment that builds the model (sparse matrix version).
#
# family, alpha : passed to glmnet / cv.glmnet as family= and alpha=
# param         : accepted for interface compatibility; not used in this function
# exposureFile  : csv file whose log is passed as offset (nil for none)
# nocv          : truthy -> glmnet() and fit=model,
#                 otherwise cv.glmnet() and fit=model$glmnet.fit
# doZ           : truthy -> standardize the columns of xMTX before fitting
#
# Standardization is emitted exactly once, immediately followed by the
# replacement of NaN by 0, so the model never receives NaN in xMTX.
def scpExeSMX(family,alpha,param,exposureFile,nocv,doZ)
  bar = "#####################################\n"
  cvStr = nocv ? "" : "cv."

  scp=""
  if doZ
    scp << bar
    scp << "## standardizing x variables\n"
    scp << "xMTX=apply(xMTX,2,scale)\n"
    scp << "xMTX[is.na(xMTX)] <- 0\n"
  end

  scp << bar
  scp << "## building a model with cross validation\n"
  if exposureFile
    scp << "exposureMTX= as.matrix(read.csv(\"#{exposureFile}\"))\n"
    scp << "model = #{cvStr}glmnet(xMTX,yMTX,family=\"#{family}\",alpha=#{alpha},offset=log(exposureMTX))\n"
  else
    scp << "model = #{cvStr}glmnet(xMTX,yMTX,family=\"#{family}\",alpha=#{alpha})\n"
  end
  scp << (nocv ? "fit=model\n" : "fit=model$glmnet.fit\n")

  return scp
end
418
+
419
# R script fragment that builds the model (dense matrix version).
#
# family, alpha : passed to glmnet / cv.glmnet as family= and alpha=
# param         : accepted for interface compatibility; not used in this function
# exposureFile  : csv file whose log is passed as offset (nil for none)
# nocv          : truthy -> glmnet() and fit=model,
#                 otherwise cv.glmnet() and fit=model$glmnet.fit
# doZ           : truthy -> standardize the columns of xMTX before fitting
#
# After standardizing, NaN values (produced by R's scale for columns without
# variance) are replaced by 0, as the sparse matrix version of this function does.
def scpExeMTX(family,alpha,param,exposureFile,nocv,doZ)
  bar = "#####################################\n"
  cvStr = nocv ? "" : "cv."

  scp=""
  if doZ
    scp << bar
    scp << "## standardizing x variables\n"
    scp << "xMTX=apply(xMTX,2,scale)\n"
    scp << "xMTX[is.na(xMTX)] <- 0\n"
  end

  scp << bar
  scp << "## building a model with cross validation\n"
  if exposureFile
    scp << "exposureMTX= as.matrix(read.csv(\"#{exposureFile}\"))\n"
    scp << "model = #{cvStr}glmnet(xMTX,yMTX,family=\"#{family}\",alpha=#{alpha},offset=log(exposureMTX))\n"
  else
    scp << "model = #{cvStr}glmnet(xMTX,yMTX,family=\"#{family}\",alpha=#{alpha})\n"
  end
  scp << (nocv ? "fit=model\n" : "fit=model$glmnet.fit\n")

  return scp
end
444
+
445
# R script fragment that saves the fitted model and writes its reports.
#
# modelFile     : file for save(model)
# coefFile      : csv of fit$beta
# coefPNG       : png of plot(fit,"lambda")
# constFile     : csv of fit$a0
# lambdaFile    : csv of the per-lambda table (lambda, df, dev.ratio, cvm, cvsd, cvup, cvlo)
# lambdaErrFile : png of plot(model)
# infoFile      : csv of the model info (nobs, colsize, nulldev, lambda_min, lambda_1se)
def scpResult(modelFile,coefFile,coefPNG,constFile,lambdaFile,lambdaErrFile,infoFile)
  <<RSCRIPT
#####################################
## output serialized objects of the model
save(model ,file="#{modelFile}")

#####################################
## output coefficients on each lambda
write.csv(as.matrix(fit$beta),file="#{coefFile}",quote=FALSE)
write.csv(as.matrix(fit$a0),file="#{constFile}",quote=FALSE)

png("#{coefPNG}")
plot(fit,"lambda")
supmsg=dev.off()

#####################################
## setting the model info
info=as.data.frame(fit$nobs)
colnames(info)=c("nobs")
info$colsize=fit$dim[2]
info$nulldev=fit$nulldev

#####################################
## output results of cv in csv format
stats=as.data.frame(c(1:length(model$lambda)))
colnames(stats)=c("sno")
stats$lambda=model$lambda
stats$df=model$df
stats$dev.ratio=fit$dev.ratio
# stats$dev.ratio=fit$dev.ratio[1:length(model$lambda)]

stats$cvm=model$cvm
stats$cvsd=model$cvsd
stats$cvup=model$cvup
stats$cvlo=model$cvlo
write.csv(stats,"#{lambdaFile}",row.names=FALSE,quote=FALSE)

info$lambda_min=model$lambda.min
info$lambda_1se=model$lambda.1se
write.csv(info,"#{infoFile}",row.names=F,quote=FALSE)

png("#{lambdaErrFile}")
plot(model)
supmsg=dev.off()
RSCRIPT
end
493
+
494
+ #################################################################################################
495
+ #### post processing
496
+
497
# Summarize the coefficients of every lambda into "#{oPath}/coef.csv".
#
# xxcoef    : coefficient matrix written by R (one row per variable, columns s0,s1,...)
# xxconst   : constant terms written by R (one row per lambda, column V1)
# xxlambda  : per-lambda table with sno and lambda fields
# xxvar2num : var -> vno table, used only when isSMX is truthy
# isSMX     : truthy -> add the original variable name (vname) to the output
# Records whose coefficient equals the string 0 are dropped.
def coeff(xxcoef,xxconst,xxlambda,xxvar2num,isSMX,oPath)
  MCMD::msgLog("#{File.basename($0)}: summarizing coefficients on each lambda...")

  work = MCMD::Mtemp.new
  xxnum2var    = work.file
  xxcoefv      = work.file
  xxconstv     = work.file
  xxmap        = work.file
  xxsno2lambda = work.file

  # coefficient matrix (",s0" / "V1,34.42..." / ...) -> vno,sno,coef records
  system([
    "mnullto f=0 v=vno -nfn i=#{xxcoef} |",
    "msetstr v=coef a=fld |",
    "mcross k=vno s=fld f=s* a=sno o=#{xxcoefv}"
  ].join)

  # constant file (",V1" / "s0,1.75" / ...) -> CONSTANT,sno,coef records
  system([
    "mnullto f=0 v=sno -nfn i=#{xxconst} |",
    "msetstr v=CONSTANT a=vno |",
    "mcut f=vno,sno,V1:coef o=#{xxconstv}"
  ].join)

  # sno of the lambda table minus one, paired with lambda
  system([
    "mcut f=sno:sno_,lambda i=#{xxlambda} |",
    "mcal c='${sno_}-1' a=sno |",
    "msortf f=sno o=#{xxsno2lambda}"
  ].join)

  # "V" + vno, the variable id form used in the coefficient records
  system("mcal c='\"V\"+$s{vno}' a=vnov i=#{xxvar2num} o=#{xxmap}") if isSMX

  naming =
    if isSMX
      ["mjoin k=vno K=vnov m=#{xxmap} f=var:vname -n |",
       "mcut f=lambda,vno,vname,coef |"]
    else
      ["mcut f=lambda,vno,coef |"]
    end

  system([
    "mcat i=#{xxconstv},#{xxcoefv} |",
    "msed f=sno c=s v= |",
    "mselstr f=coef v=0 -r|",
    "msortf f=sno |",
    "mjoin k=sno m=#{xxsno2lambda} f=lambda -n |",
    *naming,
    "msortf f=lambda%nr,vno o=#{oPath}/coef.csv"
  ].join)
end
557
+
558
+ #######################
559
+ ## convert the prediction file generated in R into the output file (csv).
560
+ ## 1) the first column is converted from ID made in R into key field name on input data.
561
+ ## 2) each column correspond to prediction for each lambda specified.
562
+ ## 3) exp(predicted value) if poisson
563
# Convert the prediction csv written by R into the final output file.
#
# xxpredict : csv written by R; column 0 is the row id, the following columns
#             hold one prediction per lambda
# lmd       : comma separated lambda names; output columns are lambda_<name>
# xxkey2num : key -> key_num table, used to restore the original key
# key       : name given to the key column of the output
# family    : "poisson"  -> each prediction p is replaced by exp(p)
#             "binomial" -> each prediction p is replaced by 1/(1+exp(-p))
#             anything else leaves the predictions unchanged
# ofile     : output csv file
def outputPrediction(xxpredict,lmd,xxkey2num,key,family,ofile)
  predFlds = lmd.split(",").map { |name| "lambda_#{name}" }
  numbered = predFlds.each_with_index.map { |fld, idx| "#{idx + 1}:#{fld}" }
  fldNames = (["0:key_num"] + numbered).join(",")

  work = MCMD::Mtemp.new
  xxnum2key = work.file
  system "msortf f=key_num i=#{xxkey2num} o=#{xxnum2key}"

  # R output has a header like ",1,2,3"; columns are picked by position
  stages = [
    "mcut f=#{fldNames} -nfni i=#{xxpredict} |",
    "mdelnull f=key_num |"
  ]

  # inverse link applied to every prediction column
  formula =
    case family
    when "poisson"  then lambda { |fld| "exp(${#{fld}})" }
    when "binomial" then lambda { |fld| "1/(1+exp((-1)*${#{fld}}))" }
    end
  if formula
    predFlds.each do |fld|
      stages << "mcal c='#{formula.call(fld)}' a=xxnew_#{fld} |"
      stages << "mcut -r f=#{fld} |"
      stages << "mfldname f=xxnew_#{fld}:#{fld} |"
    end
  end

  # join the key field
  stages << "msortf f=key_num |"
  stages << "mjoin k=key_num m=#{xxnum2key} f=key |"
  stages << "msortf f=key |"
  stages << "mcut f=key:#{key},lambda* o=#{ofile}"
  system(stages.join)
end
610
+
611
+
612
+ #################################################################################################
613
+ #### Entry point: "-predict" in ARGV selects predict mode; otherwise model building mode runs
614
+
615
+ ########################
616
+ ## predict mode: reloads build_params.json from I=, generates and runs an R prediction script using I=/model.robj, writes predictions to o=
617
+ if ARGV.index("-predict")
618
+ args=MCMD::Margs.new(ARGV,"i=,I=,o=,c=,lambda=,-sparse,-predict,T=,-verbose,T=","i=,I=") # NOTE(review): "T=" is listed twice in the option list
619
+
620
+ # mcmd messages: warnings and errors only (both settings are skipped when -verbose is given)
621
+ ENV["KG_VerboseLevel"]="2" unless args.bool("-verbose")
622
+ ENV["KG_ScpVerboseLevel"]="3" unless args.bool("-verbose")
623
+
624
+ # work file path: T= (with one trailing "/" removed) is exported as KG_TmpPath
625
+ if args.str("T=")!=nil then
626
+ ENV["KG_TmpPath"] = args.str("T=").sub(/\/$/,"")
627
+ end
628
+
629
+ iPath = args.file("I=","r")
630
+ params=nil
631
+ File.open("#{iPath}/build_params.json","r"){|fpr| # file written by model building mode (same keys as the kv hash there)
632
+ params=JSON.load(fpr)
633
+ }
634
+
635
+ ifile = args.file("i=","r")
636
+ cfile = args.file("c=","r")
637
+ var=params["var"]
638
+ val=params["val"]
639
+ key=params["key"]
640
+ exposure=params["exposure"]
641
+ family=params["family"]
642
+ isSMX=params["sparse"] # sparse/dense is taken from the saved build parameters, not from the -sparse flag of this run
643
+ nocv=params["nocv"]
644
+
645
+ ofile = args.file("o=","w")
646
+ lmd = args.str("lambda=","min") # presumably "min" is the default when lambda= is omitted -- confirm against MCMD::Margs#str
647
+ if nocv then # lambda names "min"/"1se" are rejected when the saved nocv flag is set
648
+ if lmd=~/min/ or lmd=~/1se/
649
+ raise "#ERROR# `min' or `1se' in lambda= parameter cannot be specified because the model built without cross-validation"
650
+ end
651
+ end
652
+ lmdVar= lmd.gsub("min","model$lambda.min").gsub("1se","model$lambda.1se") # rewrite "min"/"1se" into the R expressions model$lambda.min / model$lambda.1se
653
+
654
+ wf=MCMD::Mtemp.new
655
+ o={}
656
+ o["xxscp"] =wf.file
657
+ o["xxofile"] =wf.file
658
+ xxkey2num =wf.file
659
+
660
+ if isSMX then
661
+ xxrow =wf.file
662
+ xxcol =wf.file
663
+ xxval =wf.file
664
+ xxexposure=nil
665
+ xxexposure=wf.file if exposure
666
+
667
+ rowSize=smatrix(ifile,key,var,val,nil,true,"#{iPath}/map_var2vno.csv",xxkey2num,xxrow,xxcol,xxval) # passes the map_var2vno.csv that model building mode copied into its output path
668
+ mkExposure(cfile,key,exposure,xxkey2num,xxexposure) # NOTE(review): called even when exposure is nil (xxexposure is nil then) -- confirm mkExposure handles that
669
+
670
+ scp0=scpHeader()
671
+ scp1=scpInpSMX(xxrow,xxcol,xxval)
672
+ scp2=scpPrdSMX("#{iPath}/model.robj",rowSize,lmdVar,xxexposure,o["xxofile"],nocv)
673
+ else
674
+ xxvar=wf.file
675
+ xxexposure=nil
676
+ xxexposure=wf.file if exposure
677
+
678
+ matrix(ifile,key,var,xxvar,exposure,xxexposure,xxkey2num)
679
+
680
+ scp0=scpHeader()
681
+ scp1=scpInpMTX(xxvar)
682
+ scp2=scpPrdMTX("#{iPath}/model.robj",lmdVar,xxexposure,o["xxofile"])
683
+ end
684
+
685
+ # writing the R script (header + input part + prediction part) to a work file
686
+ File.open(o["xxscp"],"w"){|fpw|
687
+ fpw.puts "#{scp0}#{scp1}#{scp2}"
688
+ }
689
+
690
+ MCMD::msgLog("#{File.basename($0)}: executing R script...")
691
+ if args.bool("-verbose") then
692
+ system "R --vanilla -q < #{o['xxscp']}" # NOTE(review): the return value of system (R exit status) is not checked
693
+ else
694
+ system "R --vanilla -q --slave < #{o['xxscp']} 2>/dev/null "
695
+ end
696
+
697
+
698
+ # output prediction
699
+ outputPrediction(o["xxofile"],lmd,xxkey2num,key,family,ofile)
700
+
701
+ ########################
702
+ #### model building mode: generates and runs an R script, then saves the results and build_params.json under O=
703
+ else
704
+
705
+ args=MCMD::Margs.new(ARGV,"i=,c=,O=,k=,x=,v=,y=,alpha=,family=,exposure=,os=,T=,-verbose,-z,T=,param=,seed=,-sparse,-nocv","k=,x=,y=,i=,O=") # NOTE(review): "T=" is listed twice in the option list
706
+
707
+ # mcmd messages: warnings and errors only (both settings are skipped when -verbose is given)
708
+ ENV["KG_VerboseLevel"]="2" unless args.bool("-verbose")
709
+ ENV["KG_ScpVerboseLevel"]="3" unless args.bool("-verbose")
710
+
711
+ # work file path: T= (with one trailing "/" removed) is exported as KG_TmpPath
712
+ if args.str("T=")!=nil then
713
+ ENV["KG_TmpPath"] = args.str("T=").sub(/\/$/,"")
714
+ end
715
+
716
+ ifile =args.file("i=","r")
717
+ cfile =args.file("c=","r")
718
+ oPath =args.file("O=", "w")
719
+ osfile=args.file("os=","r") # NOTE(review): osfile is not referenced again in the visible part of this script
720
+
721
+ # flag for sparse matrix (-sparse given) or matrix input
722
+ isSMX=args.bool("-sparse")
723
+
724
+ doZ=args.bool("-z")
725
+
726
+ var=nil
727
+ key=nil
728
+ val=nil
729
+ yVar=nil
730
+ # ---- parameters for sparse matrix: k=, x=, v= are looked up in ifile; y= and exposure= in cfile
731
+ if isSMX then
732
+ key = args.field("k=" , ifile, nil , 1,1)["names"].join(",")
733
+ var = args.field("x=", ifile, nil, 1,1)["names"][0]
734
+ val = args.field("v=", ifile, nil, 1,1)
735
+ val = val["names"][0] if val
736
+ yVar = args.field("y=", cfile, nil, 1,1)["names"][0]
737
+ exposure= args.field("exposure=",cfile,nil,1,1)
738
+ exposure=exposure["names"][0] if exposure
739
+
740
+ # ---- parameters for matrix: all fields (k=, x=, y=, exposure=) are looked up in ifile
741
+ else
742
+ key = args.field("k=", ifile, nil, 1)["names"].join(",")
743
+ var = args.field("x=", ifile, nil, 1)["names"].join(",")
744
+ yVar = args.field("y=", ifile, nil, 1,1)["names"][0]
745
+ exposure= args.field("exposure=",ifile,nil,1,1)
746
+ exposure=exposure["names"][0] if exposure
747
+ end
748
+
749
+ # ---- other parameters
750
+ alpha = args.float("alpha=", 1.0, 0.0, 1.0)
751
+ family = args.str("family=", "gaussian")
752
+ seed = args.int("seed=", -1)
753
+ nocv = args.bool("-nocv")
754
+ param = args.str("param=")
755
+ param = ","+param if param # a leading comma is prepended; presumably so it can be appended to an R argument list -- confirm in scpExeSMX/scpExeMTX
756
+
757
+ if family!="poisson" and exposure
758
+ raise "#ERROR# `exposure=' can be specified with only `family=poisson'"
759
+ end
760
+
761
+ MCMD::mkDir(oPath)
762
+
763
+ wf=MCMD::Mtemp.new
764
+ o={}
765
+ o["xxvar2num"] =wf.file
766
+ o["xxmodel"] =wf.file
767
+ o["xxcoef"] =wf.file
768
+ o["xxcoefPNG"] =wf.file
769
+ o["xxconst"] =wf.file
770
+ o["xxlambda"] =wf.file
771
+ o["xxlambdaPNG"]=wf.file
772
+ o["xxinfo"] =wf.file
773
+ o["xxscp"] =wf.file
774
+
775
+ # convert tra file to sparse matrix
776
+ # this branch is taken when -sparse is given (isSMX); it is not decided by whether c= is supplied
777
+ if isSMX then
778
+ xxrow =wf.file
779
+ xxcol =wf.file
780
+ xxval =wf.file
781
+ xxy =wf.file
782
+ xxexposure=wf.file if exposure
783
+ xxkey2num=wf.file
784
+
785
+ smatrix(ifile,key,var,val,cfile,false,o["xxvar2num"],xxkey2num,xxrow,xxcol,xxval)
786
+ mkYvar(cfile,key,yVar,xxkey2num,xxy)
787
+ mkExposure(cfile,key,exposure,xxkey2num,xxexposure) # NOTE(review): called even when exposure is nil (xxexposure is nil then) -- confirm mkExposure handles that
788
+
789
+ scp0=scpHeader(seed)
790
+ scp1=scpInpSMX(xxrow,xxcol,xxval,xxy)
791
+ scp2=scpExeSMX(family,alpha,param,xxexposure,nocv,doZ)
792
+ scp3=scpResult(o["xxmodel"],o["xxcoef"],o["xxcoefPNG"],o["xxconst"],o["xxlambda"],o["xxlambdaPNG"],o["xxinfo"])
793
+
794
+ system "cp #{o['xxvar2num']} #{oPath}/map_var2vno.csv" # predict mode passes this file from I= to smatrix
795
+ # otherwise it's assumed as matrix data
796
+ else
797
+ xxvar=wf.file
798
+ xxy =wf.file
799
+ xxexposure=nil
800
+ xxexposure=wf.file if exposure
801
+ xxkey2num=wf.file
802
+
803
+ matrix(ifile,key,var,xxvar,exposure,xxexposure,xxkey2num,yVar,xxy)
804
+
805
+ scp0=scpHeader(seed)
806
+ scp1=scpInpMTX(xxvar,xxy)
807
+ scp2=scpExeMTX(family,alpha,param,xxexposure,nocv,doZ)
808
+ scp3=scpResult(o["xxmodel"],o["xxcoef"],o["xxcoefPNG"],o["xxconst"],o["xxlambda"],o["xxlambdaPNG"],o["xxinfo"])
809
+ end
810
+
811
+ # writing the R script (header + input part + model fitting part + result part) to a work file
812
+ File.open(o["xxscp"],"w"){|fpw|
813
+ fpw.puts "#{scp0}#{scp1}#{scp2}#{scp3}"
814
+ }
815
+ MCMD::msgLog("#{File.basename($0)}: executing R script...")
816
+ if args.bool("-verbose") then
817
+ system "R --vanilla -q < #{o['xxscp']}" # NOTE(review): the return value of system (R exit status) is not checked
818
+ else
819
+ system "R --vanilla -q --slave < #{o['xxscp']} 2>/dev/null "
820
+ end
821
+
822
+
823
+
824
+ # saving all results to oPath (lambda.png is copied only when -nocv is not given)
825
+ coeff(o["xxcoef"],o["xxconst"],o["xxlambda"],o['xxvar2num'],isSMX,oPath)
826
+ system "cp #{o['xxmodel']} #{oPath}/model.robj"
827
+ system "cp #{o['xxcoefPNG']} #{oPath}/coef.png"
828
+ system "cp #{o['xxlambdaPNG']} #{oPath}/lambda.png" unless nocv
829
+ system "cp #{o['xxinfo']} #{oPath}/info.csv"
830
+ system("cp #{o["xxscp"]} #{oPath}/scp.R")
831
+ system "mcut f=sno -r i=#{o['xxlambda']} o=#{oPath}/lambda.csv"
832
+
833
+ # field names and build options, saved as build_params.json so that predict mode can reload them
834
+ kv={"var"=>var,"val"=>val,"key"=>key,"exposure"=>exposure,"family"=>family,"sparse"=>isSMX,"nocv"=>nocv}
835
+ File.open("#{oPath}/build_params.json","w"){|fpw|
836
+ JSON.dump(kv,fpw)
837
+ }
838
+
839
+ end
840
+
841
+ # end message (args is assigned in whichever branch above was executed)
842
+ MCMD::endLog(args.cmdline)
843
+