nysol-mining 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/mbopt.rb +522 -0
- data/bin/mburst.rb +716 -0
- data/bin/mgfeatures.rb +340 -0
- data/bin/mglmnet.rb +843 -0
- data/bin/mgnfeatures.rb +369 -0
- data/bin/mgpmetis.rb +449 -0
- data/bin/midxmine.rb +484 -0
- data/bin/mnb.rb +631 -0
- data/bin/mnetsimile.rb +572 -0
- data/bin/mnewman.rb +345 -0
- data/bin/msketchsort.rb +243 -0
- data/bin/msm.rb +172 -0
- data/ext/sketchsortrun/Main.cpp +161 -0
- data/ext/sketchsortrun/Main.hpp +24 -0
- data/ext/sketchsortrun/SketchSort.cpp +526 -0
- data/ext/sketchsortrun/SketchSort.hpp +138 -0
- data/ext/sketchsortrun/extconf.rb +26 -0
- data/ext/sketchsortrun/sketchsortrun.cpp +56 -0
- data/lib/nysol/mining.rb +24 -0
- metadata +89 -0
data/bin/mgfeatures.rb
ADDED
@@ -0,0 +1,340 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: utf-8
|
3
|
+
|
4
|
+
# 1.0 initial development: 2015/10/20
|
5
|
+
# 1.1 minor modifications: 2015/12/09
|
6
|
+
$version="1.1"
|
7
|
+
$revision="###VERSION###"
|
8
|
+
CMD="mgfeatures.rb"
|
9
|
+
|
10
|
+
# Print the usage message of this command to STDERR and terminate the process.
# The text interpolates the file-level CMD constant and the $version global.
# The option list mirrors the keywords accepted by the entry point of this
# script (e.g. -verbose, not -mcmdenv).
def help
STDERR.puts <<EOF
----------------------------
#{CMD} version #{$version}
----------------------------
summary) calculate graph features using igraph
feature) output the following graph features
  node_size             : number of nodes
  edge_size             : number of edges
  degree0_node_size     : number of nodes with 0 degree
  mean_degree           : mean of degree
  median_degree         : median of degree
  min_degree            : min of degree
  max_degree            : max of degree
  graph_density         : graph density
  transitivity          : so called clustering coefficient
  average_shortest_path : mean of shortest path length for all pairs of nodes
  diameter              : max of shortest path length for all pairs of nodes

format) #{CMD} I=|(ei= [ni=]) ef= [nf=] O= [-directed] [diameter=] [graph_density=] [average_shortest_path=] [mp=] [T=] [-verbose] [--help]
  I=        : path name of input files
            : file extension of edge file must be ".edge" in this path
            : file extension of node file must be ".node" in this path
  ei=       : input file name of edge (cannot be specified with I=)
  ef=       : field name of edge (two nodes)
  ni=       : input file name of nodes (cannot be specified with I=)
            : if omitted, only edge file is used
  nf=       : field name of node
  -directed : assume a directed graph
  O=        : output path

  ## parameter for each feature (see igraph manual in detail)
  diameter=unconnected=[TRUE|FALSE],directed=[TRUE|FALSE]
  graph_density=loops=[FALSE|TRUE]
  average_shortest_path=unconnected=[TRUE|FALSE],directed=[TRUE|FALSE]

  ## others
  mp=       : Number of processes for parallel processing
  T=        : working directory (default:/tmp)
  -verbose  : show the END messages of MCMD
  --help    : show help

required software)
  1) R
  2) igraph package for R

example)
$ cat data/dat1.edge
v1,v2
E,J
E,A
J,D
J,A
J,H
D,H
D,F
H,F
A,F
B,H
$ cat data/dat1.node
v
A
B
C
D
E
F
G
H
I
J
$ #{CMD} I=./data O=result1 ef=v1,v2 nf=v
#MSG# converting graph files into a pair of numbered nodes ...; 2015/10/20 14:57:26
#END# ../bin/mgfeatures.rb I=./data O=result1 ef=v1,v2 nf=v; 2015/10/20 14:57:27
$ cat result1/dat1.csv
id,node_size,edge_size,degree0_node_size,mean_degree,median_degree,min_degree,max_degree,graph_density,transitivity,average_shortest_path,diameter
dat1,10,10,3,2,2.5,0,4,0.222222222222222,0.409090909090909,1.61904761904762,3

# without specifying nf= (node file isn't used)
$ #{CMD} I=./data O=result1 ef=v1,v2
#MSG# converting graph files into a pair of numbered nodes ...; 2015/10/20 14:57:26
#END# ../bin/mgfeatures.rb I=./data O=result1 ef=v1,v2; 2015/10/20 14:57:27
$ cat result1/dat1.csv
id,node_size,edge_size,degree0_node_size,mean_degree,median_degree,min_degree,max_degree,graph_density,transitivity,average_shortest_path,diameter
dat1,7,10,0,2.85714285714286,3,1,4,0.476190476190476,0.409090909090909,1.61904761904762,3

# Copyright(c) NYSOL 2012- All Rights Reserved.
EOF
exit
end
|
102
|
+
|
103
|
+
# Print the version and revision of this command to STDERR, then exit.
# An unreplaced revision placeholder (one still containing "VERSION") is
# reported as revision "0"; the $revision global is updated accordingly.
def ver()
  if /VERSION/ =~ $revision
    $revision = "0"
  end
  message = "version " + $version.to_s + " revision " + $revision.to_s
  STDERR.puts(message)
  exit
end
|
108
|
+
|
109
|
+
# Show the usage when no argument is given or --help comes first;
# show the version when --version comes first.
if ARGV.empty? || ARGV[0] == "--help"
  help()
end
if ARGV[0] == "--version"
  ver()
end
|
111
|
+
|
112
|
+
require "rubygems"
|
113
|
+
require "nysol/mcmd"
|
114
|
+
|
115
|
+
# confirm if R library is installed
|
116
|
+
exit(1) unless(MCMD::chkRexe("igraph"))
|
117
|
+
|
118
|
+
####
|
119
|
+
# converting original graph file with text to one with integer
|
120
|
+
# output #{numFile} and #{mapFile}, then return the number of nodes of the graph
|
121
|
+
#
|
122
|
+
# ei ni xxnum xxmap
|
123
|
+
# v1,v2 v node%1,flag%0,num
|
124
|
+
# E,J A 0 3 A,0,0
|
125
|
+
# E,A B 0 4 B,0,1
|
126
|
+
# J,D C 0 6 D,0,2
|
127
|
+
# J,A D => 1 5 E,0,3
|
128
|
+
# J,H E 2 4 F,0,4
|
129
|
+
# D,H F 2 5 H,0,5
|
130
|
+
# D,F G 2 6 J,0,6
|
131
|
+
# H,F H 3 6 C,1,7
|
132
|
+
# A,F I 4 5 G,1,8
|
133
|
+
# B,H J 5 6 I,1,9
|
134
|
+
#
|
135
|
+
# return value is 10 (nodes)
|
136
|
+
# "flag" on xxmap: 0:nodes in "ei", 1:nodes only in "ni".
|
137
|
+
# Convert a graph written with text node names into integer-numbered files.
#
# ni      : node file name (only read when nf is given)
# nf      : node field name on ni, or nil to build the graph from edges only
# ei      : edge file name
# ef1,ef2 : the two node field names on ei
# numFile : output, space separated pairs of node numbers (one edge per line)
# mapFile : output, mapping table of node,flag,num (num starts from 0)
#
# Returns the number of records written to mapFile, i.e. the number of nodes.
def g2pair(ni,nf,ei,ef1,ef2,numFile,mapFile)
  tmp=MCMD::Mtemp.new
  nodes1=tmp.file
  nodes2=tmp.file
  nodes3=tmp.file

  # nodes found in the edge file are flagged 0
  [[ef1,nodes1],[ef2,nodes2]].each{|fld,out|
    system "mcut f=#{fld}:node i=#{ei} | msetstr v=0 a=flag o=#{out}"
  }
  # nodes listed in the node file are flagged 1
  if nf
    system "mcut f=#{nf}:node i=#{ni} | msetstr v=1 a=flag o=#{nodes3}"
  end

  # one record per node name; when a name has both flags, flag 0 is kept
  dedup = if nf
    [
      "mcat i=#{nodes1},#{nodes2},#{nodes3} f=node,flag |",
      "mbest k=node s=flag from=0 size=1 |",
    ]
  else
    [
      "mcat i=#{nodes1},#{nodes2} f=node,flag |",
      "muniq k=node |",
    ]
  end
  # isolated nodes (flag=1) are numbered last.
  # S= must start from 0 (inside R the vertex number will be added one)
  numbering = "mnumber s=flag,node a=num S=0 o=#{mapFile}"
  system((dedup + [numbering]).join)

  # replace both node names on each edge by their numbers
  edge_cmd = [
    "mcut f=#{ef1},#{ef2} i=#{ei} |",
    "msortf f=#{ef1} |",
    "mjoin k=#{ef1} K=node m=#{mapFile} f=num:num1 |",
    "msortf f=#{ef2} |",
    "mjoin k=#{ef2} K=node m=#{mapFile} f=num:num2 |",
    "mcut f=num1,num2 |",
    "msortf f=num1%n,num2%n -nfno | tr ',' ' ' >#{numFile}",
  ].join
  system(edge_cmd)

  MCMD::mrecount("i=#{mapFile}")
end
|
176
|
+
|
177
|
+
####
|
178
|
+
# generating the R script for graph features
|
179
|
+
# pars: parameters for each graph feature
|
180
|
+
# Write an R script that computes the graph features with igraph.
#
# directed : truthy to read the edge list as a directed graph
# pars     : hash of extra R arguments per feature ("graph_density",
#            "average_shortest_path", "diameter"); each value is pasted
#            right after the graph argument, a missing key adds nothing
# eFile    : edge list file read by read.graph
# nodeSize : number of vertices handed to read.graph as n=
# oFile    : csv file the R script writes the features to
# scpFile  : path the R script itself is written to
def genRscript(directed,pars,eFile,nodeSize,oFile,scpFile)
  directed_flag = directed ? "TRUE" : "FALSE"

  density_opt  = pars["graph_density"]
  path_opt     = pars["average_shortest_path"]
  diameter_opt = pars["diameter"]

  # every feature becomes one column of the output data frame, in this order
  features = %w[
    node_size edge_size degree0_node_size mean_degree median_degree
    min_degree max_degree graph_density transitivity
    average_shortest_path diameter
  ]
  columns = features.map{|name| "#{name}=#{name}"}.join(",\n")

  script = <<RSCRIPT
library(igraph)
## reading edge file
g=read.graph("#{eFile}",format="edgelist",directed=#{directed_flag},n=#{nodeSize})

####
deg=degree(g)
node_size=vcount(g)
edge_size=ecount(g)
mean_degree=mean(deg)
median_degree=median(deg)
min_degree=min(deg)
max_degree=max(deg)
degree0_node_size=length(deg[deg==0])
graph_density=graph.density(g #{density_opt})
average_shortest_path=average.path.length(g #{path_opt})

#### diameter
diameter=diameter(g #{diameter_opt})
transitivity=transitivity(g)

dat=data.frame(
#{columns}
)
write.csv(dat,file="#{oFile}",quote=FALSE,row.names=FALSE)
RSCRIPT

  File.write(scpFile,script)
end
|
225
|
+
|
226
|
+
|
227
|
+
#################################################################################################
|
228
|
+
#### Entry point
|
229
|
+
|
230
|
+
# Declare the accepted parameters. Every "name=" keyword carries its trailing
# "=" (average_shortest_path= included) and T=, which is read below, is declared.
# NOTE(review): the second list ("ef=,O=") presumably names the mandatory
# parameters - confirm against MCMD::Margs.
args=MCMD::Margs.new(ARGV,"I=,ei=,ef=,ni=,nf=,o=,O=,-directed,diameter=,graph_density=,average_shortest_path=,-verbose,mp=,T=","ef=,O=")

# suppress the end message of MCMD
ENV["KG_VerboseLevel"]="2" unless args.bool("-verbose")

# work file path
tmpPath=args.str("T=")
ENV["KG_TmpPath"] = tmpPath.sub(/\/$/,"") if tmpPath

# setting variables for edge file(s) and its field name
iPath = args.file("I=","r")
oPath = args.file("O=","w")

edgeFiles=nil
if iPath then
  edgeFiles = Dir["#{iPath}/*.edge"]
  if edgeFiles.size==0 then
    raise "#ERROR# no edge file is found matching with #{iPath}/*.edge"
  end
else
  # check for nil before splitting, so that a missing ei= is reported
  # by the message below and not by a NoMethodError
  eiFile = args.file("ei=","r")
  unless eiFile
    raise "#ERROR# ei= or I= is mandatory"
  end
  edgeFiles = eiFile.split # edge file name
end
ef = args.field("ef=", edgeFiles[0])
ef1,ef2=ef["names"]

# setting variables for node file(s) and its field name.
# if nf= is not specified, only edge files are used for generating a graph.
ni=nil
nf=nil
if iPath then
  nodeFile0=edgeFiles[0].sub(/\.edge$/,".node")
  if File.exist?(nodeFile0)
    nf = args.field("nf=", nodeFile0)
    nf = nf["names"][0] if nf
  elsif args.str("nf=")
    raise "#ERROR# nf= is specified, but no node file is found matching with #{iPath}/*.node"
  end
else
  ni = args.file("ni=","r") # node file name
  if ni
    nf = args.field("nf=", ni)
    unless nf
      raise "#ERROR# nf= is mandatory, when ni= is specified"
    end
    nf=nf["names"][0]
  end
end

directed=args.bool("-directed")
MP=args.int("mp=",4)

# extra R arguments per feature; each one is pasted after the graph argument
pars={}
["diameter","graph_density","average_shortest_path"].each{|feature|
  par=args.str("#{feature}=")
  pars[feature]=",#{par}" if par
}

MCMD::mkDir(oPath)

edgeFiles.meach(MP){|edgeFile|
  # output name: the edge file name without its directory and ".edge"
  name=edgeFile.sub(/\.edge$/,"").sub(/^.*\//,"")

  # with I= the node file sits next to the edge file;
  # otherwise the file given by ni= is used (nil when not specified)
  nodeFile = iPath ? edgeFile.sub(/\.edge$/,".node") : ni

  # convert the original graph to one igraph can handle
  wf=MCMD::Mtemp.new
  xxnum=wf.file
  xxmap=wf.file
  xxout=wf.file
  xxscp=wf.file
  nodeSize=g2pair(nodeFile,nf,edgeFile,ef1,ef2,xxnum,xxmap)

  # generate R script, and run
  genRscript(directed,pars,xxnum, nodeSize, xxout, xxscp)
  if args.bool("-verbose") then
    system "R --vanilla -q < #{xxscp}"
  else
    system "R --vanilla -q --slave < #{xxscp} 2>/dev/null "
  end

  # store the result
  system "msetstr v=#{name} a=id i=#{xxout} | mcut -x f=0L,0-1L o=#{oPath}/#{name}.csv"
}

# end message
MCMD::endLog(args.cmdline)
|
340
|
+
|
data/bin/mglmnet.rb
ADDED
@@ -0,0 +1,843 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: utf-8
|
3
|
+
|
4
|
+
# 1.1 fix the bug about dimension mismatch: 2014/09/03
|
5
|
+
# 1.2 add -nocv mode, use JSON: 2015/01/11
|
6
|
+
# 1.3 add -z option, add original strings for xvar in coeff.csv : 2015/03/02
|
7
|
+
# 1.4 bug fix about NaN problem in specifying -z option : 2015/03/02
|
8
|
+
# 1.5 bug fix for key mismatching problem between x and y files : 2015/03/03
|
9
|
+
# 1.6 bug fix for prediction in logistic regression : 2015/03/27
|
10
|
+
$version="1.6"
|
11
|
+
$revision="###VERSION###"
|
12
|
+
|
13
|
+
# Print the usage message (written in Japanese) to STDERR and terminate the
# process. The text interpolates the $version global.
# The two usage variants of the model building mode are labelled a) and b),
# and the sparse-matrix synopsis lists -sparse, which the option list below
# marks as required.
def help
STDERR.puts <<EOF
----------------------------
mglmnet.rb version #{$version}
----------------------------
概要) Rのglmnetパッケージを利用した正則化法による回帰モデルの構築
特徴) 1) リッジ回帰、LASSO、elastic-netの正則化を指定可能
      2) 入力データは、サンプルID,featureID,値の3項目CSVデータ
      3) 交差検証(CV)によりlambdaを決定することで最適モデルを構築可
      4) 線形回帰、ロジスティック回帰、ポアッソン回帰など指定可能
      5) Rスクリプトを書く必要はない
用法1) モデル構築モード
  a)スパースマトリックスによる入力ファイル指定の場合
    mglmnet.rb -sparse [family=] [alpha=] i= k= x= [v=] c= y= exposure= O= [-z] [seed=] [-nocv] [param=] [T=] [-mcmdenv] [--help]
  b)マトリックスによる入力ファイル指定の場合
    mglmnet.rb [family=] [alpha=] i= x= y= exposure= O= [-z] [seed=] [-nocv] [param=] [T=] [-mcmdenv] [--help]
用法2) 予測モード
    mglmnet.rb -predict [lambda=] i= I= o= [param=] [T=] [-mcmdenv] [--help]

  ### モデル構築モード

  ## 入力ファイルの指定は、スパースマトリックスによる方法とマトリックスによる方法の2通りある。
  ## (k=が指定されていれば、スパースマトリックスとみなされる)
  # a) スパースマトリックスによる入力ファイルの指定
  -sparse : このオプションが指定されて初めてスパースマトリックスモードとみなされる【必須】
  i= : スパースマトリックス型入力データファイル名【必須】
  k= : 1つのサンプルを表す項目名【必須】
  x= : 説明変数を表す項目名【必須】
  v= : 説明変数の値項目名【オプション】
     : 指定しなければ、全行1(すなわちダミー変数)となる。
  c= : 目的変数データファイル名【選択必須】
  y= : 目的変数の項目名【必須】
     : family=で指定した内容により以下に示す値である必要がある。
     : gaussian: 実数
     : poisson: 正の整数
     : binomial: 2つのクラス値(文字列でも可)
     : multinomial: 複数クラス値(文字列でも可)

  # b) マトリックスによる入力ファイルの指定
  i= : マトリックス型入力データファイル名【必須】
  x= : 説明変数項目名リスト【必須】
  y= : 目的変数項目名【選択必須】

  O= : 出力ディレクトリ名【必須】

  ## モデル構築関連
  family= : リンク関数【デフォルト:"gaussian"】
          : gaussian: 線形回帰
          : poisson: ポアソン回帰
          : binomial: ロジスティック回帰
          : multinomial: 多項ロジスティック回帰
  alpha= : elastic-netにおけるL1とL2正則化項の荷重【デフォルト:1.0】
         : 1.0でL1正則化、0でL2正則化(リッジ回帰)、0<alpha<1でelastic-net
  seed= : 乱数の種(0以上の整数,交差検証に影響)【オプション:default=-1(時間依存)】
  -z : 内部で説明変数を標準化する。
     : スケールの異なる変数の係数を比較したい場合に利用する。
     : -zをつけて作成されたモデルで予測することには意味がないことに注意する。
  -nocv : 交差検証をしない *注)

  ### 予測モード(-predictを指定することで予測モードとして動作する)
  I= : モデル構築モードでの出力先ディレクトリパス【必須】
     : 利用するファイルは以下のとおり。
     : map_var2vno.csv: データの変換に利用
     : model.robj: 回帰モデルRオブジェクト
  lambda= : 正則化項の重み【必須:複数指定可】
          : 0以上の実数値を与える以外に、以下の2つは特殊な意味を持つシンボルとして指定できる
          : min: CVにおけるエラー最小モデルに対応するlambda
          : 1se: lambda.min+1*standard errorのモデルに対応するlambda
  o= : 予測結果ファイル名
     : key,目的変数予測値...
     : lambda=で指定した各lambdaに対応する予測値全てを出力する
  i= : 予測対象入力ファイル名
     : フォーマットと項目名は、モデル構築モードで利用したものに完全に一致しなければならない。

  ## その他
  T= : 作業ディレクトリ【デフォルト:"/tmp"】
  -mcmdenv : 内部のMCMDのコマンドメッセージを表示
  --help : ヘルプの表示


注) 交差検証(CV)をしてもしなくても、複数のlambda値に対する回帰係数の推定は行われる。
    CVをすることで、lambda別に構築される回帰モデルの予測エラーを推定する。
    そして、エラー最小化という意味における最適なlambdaを得ることが可能となる。
    よって-nocvを指定した場合、CVによる予測エラーの推定を行わないため、
    予測モードにおいてlambda="min,1se"は指定できない。

必要なソフトウェア)
  1) R
  2) Rのglmnetパッケージ

入力データ)
例:
key,var,val
1,a,1
1,c,2
1,e,1
2,c,2
2,d,1
3,a,2
3,e,3
3,d,6


モデル構築モードでの出力データ)
  1) model.robj : 回帰モデルのRオブジェクト
  2) model_info.csv : 回帰モデルに関する各種情報
  3) coef.csv : lambda別係数一覧
  4) coef.png : lambda別係数チャート
  5) lambda_stats.csv : lambda別の各種情報(deviance,係数が非0のfeature数、推定誤差など)
  6) lambda_error.png : lambda別エラーチャート
  7) map_var2vno.csv : i=のx=項目とR内部のfeature番号の対応表
  8) scp.R : 実行されたRスクリプト
  注: 6)は-nocvを指定時には出力されない
  注: 2)と5)は-nocvを指定時には一部出力されない

予測モードでの出力データ例)
predict.csv
key_num,id,lambda_1se,lambda_0.01,lambda_min
1,20070701_5604,724.004406058175,743.068998688436,742.831291756625
2,20070701_5605,832.022338347663,959.170798180323,957.54590188041
3,20070701_5606,978.945506261202,1012.07069692832,1011.86134342746
4,20070701_5607,866.708820321008,786.158661840733,787.19246417122

例)
# モデル構築
$ mglmnet.rb c=yaki_tanka.csv i=yaki_features.csv O=result1 k=id x=商品名 y=tanka -sparse

# 上のモデル構築で使ったデータを使って予測
$ mglmnet.rb -predict i=yaki_features.csv k=id x=商品名 I=result1 lambda=1se,0.01,min o=result1/predict.csv

# Copyright(c) NYSOL 2012- All Rights Reserved.
EOF
exit
end
|
148
|
+
|
149
|
+
# Report version and revision on STDERR and terminate the process.
# While $revision still holds the build placeholder (anything containing
# "VERSION") it is replaced by "0" before printing.
def ver()
  placeholder = ($revision =~ /VERSION/)
  $revision = "0" if placeholder
  STDERR.puts(["version", $version, "revision", $revision].join(" "))
  exit
end
|
154
|
+
|
155
|
+
# Without any argument, or with --help first, print the usage;
# with --version first, print the version.
if ARGV.empty? || ARGV[0] == "--help"
  help()
end
if ARGV[0] == "--version"
  ver()
end
|
157
|
+
|
158
|
+
require "rubygems"
|
159
|
+
require "nysol/mcmd"
|
160
|
+
require "json"
|
161
|
+
|
162
|
+
# confirm if R library is installed
|
163
|
+
exit(1) unless(MCMD::chkRexe("glmnet"))
|
164
|
+
|
165
|
+
# separating the input file (ifile) into three following files
|
166
|
+
# 1) independent variables (x=) => xxvar
|
167
|
+
# 2) objective variable (y=) only when model building mode =>xxy
|
168
|
+
# 3) exposure variable if it's specified (exposure=) =>xxexposure
|
169
|
+
# Split a matrix type input file into the files handed to R.
#
# ifile      : input csv file
# key        : field name identifying a sample
# var        : field name(s) of the explanatory variables, written to xxvar
# exposure   : field name of the exposure variable (nil to skip), written to xxexposure
# xxkey2num  : output, unique keys numbered from 1 (fields key,key_num)
# yVar / xxy : objective variable field and its output file; only written
#              when xxy is given
def matrix(ifile,key,var,xxvar,exposure,xxexposure,xxkey2num,yVar=nil,xxy=nil)
  MCMD::msgLog("#{File.basename($0)}: cleaning a matrix data `#{ifile}' ...")

  # explanatory variables
  system("mcut f=#{var} i=#{ifile} o=#{xxvar}")

  # mapping table from key to sequential number
  key_table = [
    "mcut f=#{key}:key i=#{ifile} |",
    "msortf f=key |",
    "muniq k=key |",
    "mnumber s=key a=key_num S=1 o=#{xxkey2num}",
  ]
  system(key_table.join)

  # objective variable
  system("mcut f=#{yVar} i=#{ifile} o=#{xxy}") if xxy

  # exposure variable
  system("mcut f=#{exposure} i=#{ifile} o=#{xxexposure}") if exposure
end
|
194
|
+
|
195
|
+
# traデータからsparseMatrix用データを作成する
|
196
|
+
# 1) var2numがnullならば、ifileからvar2numを生成。指定されていれば、そのファイルを使って変換
|
197
|
+
# 2) key2num変換表を出力(key2num)
|
198
|
+
# 3) key,varがnumに変換されたデータを出力(row,col,val)
|
199
|
+
# Build the files needed to initialize a sparse matrix in R from
# transaction style data (one record per key and variable).
#
# ifile       : input csv file
# key,var,val : field names of sample key, variable and value; when val is
#               nil every record gets the value 1
# cFile       : if given, only keys also found in this file are kept
# predictMode : when truthy, var2num is used as it is; otherwise it is
#               generated here from the variables found in ifile
# var2num     : mapping table of var and vno (numbered from 1)
# key2num     : output, mapping table of key and key_num (numbered from 1)
# xxrow,xxcol,xxval : output, row index / column index / value lists
#
# Returns the number of records of key2num.
# Raises when no record is left after cleaning.
def smatrix(ifile,key,var,val,cFile,predictMode,var2num,key2num,xxrow,xxcol,xxval)
  prog=File.basename($0)
  MCMD::msgLog("#{prog}: cleaning a sparse matrix data `#{ifile}' ...")

  tmp=MCMD::Mtemp.new
  base  =tmp.file
  joined=tmp.file

  # 1) add the value "1" when v= is not specified.
  # 2) make unique by key and var
  clean=[]
  if val
    clean << "mcut f=#{key}:key,#{var}:var,#{val}:val i=#{ifile} |"
  else
    clean << "mcut f=#{key}:key,#{var}:var i=#{ifile} |"
    clean << "msetstr v=1 a=val |"
  end
  clean << "mcommon k=key K=#{key} m=#{cFile} |" if cFile
  clean << "msortf f=key,var |"
  clean << "muniq k=key,var o=#{base}"
  system(clean.join)

  if MCMD::mrecount("i=#{base}")==0
    raise "#ERROR# common records between x and y files are not found"
  end

  unless predictMode
    MCMD::msgLog("#{prog}: creating a mapping table of variables and their number...")
    system([
      "mcut f=var i=#{base} |",
      "msortf f=var |",
      "muniq k=var |",
      "mnumber s=var a=vno S=1 o=#{var2num}",
    ].join)
  end

  MCMD::msgLog("#{prog}: creating a mapping table of key and its number...")
  system([
    "mcut f=key i=#{base} |",
    "msortf f=key |",
    "muniq k=key |",
    "mnumber s=key a=key_num S=1 o=#{key2num}",
  ].join)

  MCMD::msgLog("#{prog}: creating index list and value list for initializing a sparse matrix...")
  system([
    "mcut f=key,var,val i=#{base} |",
    "mjoin k=key m=#{key2num} f=key_num |",
    "msortf f=var |",
    "mjoin k=var m=#{var2num} f=vno o=#{joined}",
  ].join)

  # one single-column file each for row index, column index and value
  [["key_num",xxrow],["vno",xxcol],["val",xxval]].each{|fld,out|
    system "mcut f=#{fld} i=#{joined} o=#{out}"
  }

  MCMD::mrecount("i=#{key2num}")
end
|
264
|
+
|
265
|
+
# Write the objective variable file for R.
# The key of each record of cfile is replaced by its number taken from
# xxkey2num, and the records (fields key_num,y) are written to xxy in
# numeric order of key_num.
def mkYvar(cfile,key,yVar,xxkey2num,xxy)
  steps = [
    "mcut f=#{key}:key,#{yVar}:y i=#{cfile} |",
    "msortf f=key |",
    "mjoin k=key m=#{xxkey2num} f=key_num |",
    "mcut f=key_num,y |",
    "msortf f=key_num%n o=#{xxy}",
  ]
  system(steps.join)
end
|
274
|
+
|
275
|
+
# Write the exposure variable file for R; nothing is done (and nil is
# returned) when no output file name is given.
# The records of cfile are ordered numerically by the key number taken from
# xxkey2num, and only the exposure field is written to xxexposure.
def mkExposure(cfile,key,exposure,xxkey2num,xxexposure)
  return nil unless xxexposure

  steps = [
    "mcut f=#{key}:key,#{exposure}:exposure i=#{cfile} |",
    "msortf f=key |",
    "mjoin k=key m=#{xxkey2num} f=key_num |",
    "mcut f=key_num,exposure |",
    "msortf f=key_num%n |",
    "mcut f=exposure o=#{xxexposure}",
  ]
  system(steps.join)
end
|
287
|
+
|
288
|
+
|
289
|
+
#################################################################################################
|
290
|
+
#### generate R scripts
|
291
|
+
|
292
|
+
#######################
|
293
|
+
## pre-process
|
294
|
+
# Return the first lines of the R script: loading glmnet and, when a seed
# is given, fixing the random seed.
def scpHeader(seed=nil)
  lines = ["library(glmnet)"]
  lines.push("set.seed(#{seed})") if seed
  lines.map{|line| line + "\n"}.join
end
|
300
|
+
|
301
|
+
#######################
|
302
|
+
## prediction script using an existing model (sparse matrix version)
|
303
|
+
## xxmodel: model file (model.robj)
|
304
|
+
## rowSize: the number of rows (samples) of the dataset
|
305
|
+
## lmdVar: lambda values. It make prediction using the model with given each lambda value
|
306
|
+
## exposureFile: exposure file (nil unless poisson regression)
|
307
|
+
## xxoFile: output file name
|
308
|
+
# Return the R script text that loads a saved model and predicts on a sparse
# matrix. The script refers to the R variables row, col and val, which it
# does not define itself.
#
# xxmodel      : model file loaded with load()
# rowSize      : number of rows of the sparse matrix
# lmdVar       : lambda value(s), pasted into c(...)
# exposureFile : exposure csv file used as offset, or nil
# xxoFile      : csv file the prediction is written to
# nocv         : truthy when the model was built without cross validation;
#                the number of columns is then read from model$beta
#                rather than from model$glmnet.fit$beta
def scpPrdSMX(xxmodel,rowSize,lmdVar,exposureFile,xxoFile,nocv)
  bar = "#####################################"
  beta = nocv ? "model$beta" : "model$glmnet.fit$beta"

  lines = [
    "library(glmnet)",
    bar,
    "## loading the model and make prediction",
    "load(\"#{xxmodel}\")",
    "dims= c(#{rowSize},#{beta}@Dim[1])",
    "xMTX=sparseMatrix(i=row,j=col,x=val,dims=dims)",
    bar,
    "## predict and output the result",
  ]
  if exposureFile
    lines << "exposureMTX= as.matrix(read.csv(\"#{exposureFile}\"))"
    lines << "prd = predict(model,xMTX,c(#{lmdVar}),offset=log(exposureMTX))"
  else
    lines << "prd = predict(model, xMTX,c(#{lmdVar}))"
  end
  lines << "write.csv(prd,file=\"#{xxoFile}\",quote=FALSE)"

  lines.map{|line| line + "\n"}.join
end
|
332
|
+
|
333
|
+
#######################
|
334
|
+
## prediction script using an existing model (matrix version)
|
335
|
+
## xxmodel: model file (model.robj)
|
336
|
+
## lmdVar: lambda values. It make prediction using the model with given each lambda value
|
337
|
+
## exposureFile: exposure file (nil unless poisson regression)
|
338
|
+
## xxoFile: output file name
|
339
|
+
# Return the R script text that loads a saved model and predicts on the R
# variable xMTX, which the script does not define itself.
#
# xxmodel      : model file loaded with load()
# lmdVar       : lambda value(s), pasted into c(...)
# exposureFile : exposure csv file used as offset, or nil
# xxoFile      : csv file the prediction is written to
def scpPrdMTX(xxmodel,lmdVar,exposureFile,xxoFile)
  bar = "#####################################"

  prediction = if exposureFile
    [
      "exposureMTX= as.matrix(read.csv(\"#{exposureFile}\"))",
      "prd = predict(model,xMTX,c(#{lmdVar}),offset=log(exposureMTX))",
    ]
  else
    ["prd = predict(model,xMTX,c(#{lmdVar}))"]
  end

  lines = [
    "library(glmnet)",
    bar,
    "## loading the model and make prediction",
    "load(\"#{xxmodel}\")",
    bar,
    "## setting sparseMatrix from csv data",
    bar,
    "## predict and output the result",
  ] + prediction + ["write.csv(prd,file=\"#{xxoFile}\",quote=FALSE)"]

  lines.map{|line| line + "\n"}.join
end
|
359
|
+
|
360
|
+
# Return the R script text that reads the row index, column index and value
# files (fields key_num, vno and val) and builds the sparse matrix xMTX.
# When yFile is given, its field y is additionally read into yMTX.
def scpInpSMX(rowFile,colFile,valFile,yFile=nil)
  bar = "#####################################"

  lines = [
    bar,
    "## loading csv data and setting sparseMatrix",
    "row = read.csv(\"#{rowFile}\")$key_num",
    "col = read.csv(\"#{colFile}\")$vno",
    "val = read.csv(\"#{valFile}\")$val",
    "xMTX=sparseMatrix(i=row,j=col,x=val)",
  ]
  if yFile
    lines.concat([
      bar,
      "## setting csv data into a vector of objective valiable",
      "yMTX= read.csv(\"#{yFile}\",header=T)",
      "yMTX = yMTX$y # as a vector",
    ])
  end

  lines.map{|line| line + "\n"}.join
end
|
377
|
+
|
378
|
+
# Return the R script text that reads xFile into the matrix xMTX and,
# when yFile is given, yFile into the matrix yMTX.
def scpInpMTX(xFile,yFile=nil)
  lines = [
    "#####################################",
    "## loading csv data setting matrix",
    "xMTX= as.matrix(read.csv(\"#{xFile}\"))",
  ]
  lines << "yMTX= as.matrix(read.csv(\"#{yFile}\"))" if yFile

  lines.map{|line| line + "\n"}.join
end
|
389
|
+
|
390
|
+
# Return the R script text that builds the regression model from xMTX and
# yMTX (sparse matrix input) and stores the fitted glmnet object in `fit`.
#
# family       : value of the family= argument of glmnet
# alpha        : value of the alpha= argument of glmnet
# param        : not used by this method
# exposureFile : exposure csv file used as offset=log(exposure), or nil
# nocv         : truthy to call glmnet instead of cv.glmnet
# doZ          : truthy to standardize the columns of xMTX beforehand
def scpExeSMX(family,alpha,param,exposureFile,nocv,doZ)
  cvStr = nocv ? "" : "cv."
  bar = "#####################################\n"

  scp=""
  if doZ
    scp << bar
    scp << "## standardizing x variables\n"
    # scale once; scaling a second time only repeats the same computation
    scp << "xMTX=apply(xMTX,2,scale)\n"
    # a column without variance becomes NaN by scale(); set it to 0
    scp << "xMTX[is.na(xMTX)] <- 0\n"
  end

  scp << bar
  scp << "## building a model with cross validation\n"
  if exposureFile
    scp << "exposureMTX= as.matrix(read.csv(\"#{exposureFile}\"))\n"
    scp << "model = #{cvStr}glmnet(xMTX,yMTX,family=\"#{family}\",alpha=#{alpha},offset=log(exposureMTX))\n"
  else
    scp << "model = #{cvStr}glmnet(xMTX,yMTX,family=\"#{family}\",alpha=#{alpha})\n"
  end

  # cv.glmnet keeps the underlying fit in $glmnet.fit
  scp << (nocv ? "fit=model\n" : "fit=model$glmnet.fit\n")

  return scp
end
|
418
|
+
|
419
|
+
# Return the R script text that builds the regression model from xMTX and
# yMTX (matrix input) and stores the fitted glmnet object in `fit`.
#
# family       : value of the family= argument of glmnet
# alpha        : value of the alpha= argument of glmnet
# param        : not used by this method
# exposureFile : exposure csv file used as offset=log(exposure), or nil
# nocv         : truthy to call glmnet instead of cv.glmnet
# doZ          : truthy to standardize the columns of xMTX beforehand
def scpExeMTX(family,alpha,param,exposureFile,nocv,doZ)
  cvStr = nocv ? "" : "cv."
  bar = "#####################################\n"

  scp=""
  if doZ
    scp << bar
    scp << "## standardizing x variables\n"
    scp << "xMTX=apply(xMTX,2,scale)\n"
    # a column without variance becomes NaN by scale(); set it to 0,
    # as the sparse matrix version of this script does
    scp << "xMTX[is.na(xMTX)] <- 0\n"
  end

  scp << bar
  scp << "## building a model with cross validation\n"
  if exposureFile
    scp << "exposureMTX= as.matrix(read.csv(\"#{exposureFile}\"))\n"
    scp << "model = #{cvStr}glmnet(xMTX,yMTX,family=\"#{family}\",alpha=#{alpha},offset=log(exposureMTX))\n"
  else
    scp << "model = #{cvStr}glmnet(xMTX,yMTX,family=\"#{family}\",alpha=#{alpha})\n"
  end

  # cv.glmnet keeps the underlying fit in $glmnet.fit
  scp << (nocv ? "fit=model\n" : "fit=model$glmnet.fit\n")

  return scp
end
|
444
|
+
|
445
|
+
# Return the R script text that saves the results of model building.
# The script expects the R variables `model` and `fit` to exist.
#
# modelFile     : file the model object is saved to
# coefFile      : csv of fit$beta (coefficients on each lambda)
# coefPNG       : png of plot(fit,"lambda")
# constFile     : csv of fit$a0
# lambdaFile    : csv of the statistics on each lambda
# lambdaErrFile : png of plot(model)
# infoFile      : csv of the model information
def scpResult(modelFile,coefFile,coefPNG,constFile,lambdaFile,lambdaErrFile,infoFile)
  bar = "#####################################"

  save_model = [
    bar,
    "## output serialized objects of the model",
    "save(model ,file=\"#{modelFile}\")",
    "",
  ]

  coefficients = [
    bar,
    "## output coefficients on each lambda",
    "write.csv(as.matrix(fit$beta),file=\"#{coefFile}\",quote=FALSE)",
    "write.csv(as.matrix(fit$a0),file=\"#{constFile}\",quote=FALSE)",
    "",
    "png(\"#{coefPNG}\")",
    "plot(fit,\"lambda\")",
    "supmsg=dev.off()",
    "",
  ]

  model_info = [
    bar,
    "## setting the model info",
    "info=as.data.frame(fit$nobs)",
    "colnames(info)=c(\"nobs\")",
    "info$colsize=fit$dim[2]",
    "info$nulldev=fit$nulldev",
    "",
  ]

  lambda_stats = [
    bar,
    "## output results of cv in csv format",
    "stats=as.data.frame(c(1:length(model$lambda)))",
    "colnames(stats)=c(\"sno\")",
    "stats$lambda=model$lambda",
    "stats$df=model$df",
    "stats$dev.ratio=fit$dev.ratio",
    "# stats$dev.ratio=fit$dev.ratio[1:length(model$lambda)]",
    "",
    "stats$cvm=model$cvm",
    "stats$cvsd=model$cvsd",
    "stats$cvup=model$cvup",
    "stats$cvlo=model$cvlo",
    "write.csv(stats,\"#{lambdaFile}\",row.names=FALSE,quote=FALSE)",
    "",
  ]

  info_and_chart = [
    "info$lambda_min=model$lambda.min",
    "info$lambda_1se=model$lambda.1se",
    "write.csv(info,\"#{infoFile}\",row.names=F,quote=FALSE)",
    "",
    "png(\"#{lambdaErrFile}\")",
    "plot(model)",
    "supmsg=dev.off()",
  ]

  sections = save_model + coefficients + model_info + lambda_stats + info_and_chart
  sections.map{|line| line + "\n"}.join
end
|
493
|
+
|
494
|
+
#################################################################################################
|
495
|
+
#### post processing
|
496
|
+
|
497
|
+
# Summarizes the coefficient files written by the R script into
# "#{oPath}/coef.csv", one row per (lambda, variable) pair.
#
# xxcoef    : coefficient CSV from R (first row ",s0,s1,...", then one row per variable)
# xxconst   : constant-term CSV from R (first row ",V1", then one row per lambda step)
# xxlambda  : CSV containing at least the fields sno and lambda
# xxvar2num : CSV containing the fields vno and var (read only when isSMX is true)
# isSMX     : when true, the variable name is joined in as field vname
# oPath     : directory that receives coef.csv
#
# All work is done by external M-Command pipelines run through the shell;
# their exit status is not inspected. Returns the value of the last
# system call.
def coeff(xxcoef,xxconst,xxlambda,xxvar2num,isSMX,oPath)
  MCMD::msgLog("#{File.basename($0)}: summarizing coefficients on each lambda...")

  wf=MCMD::Mtemp.new
  xxnum2var = wf.file # never referenced below; kept so the temp-file sequence is unchanged
  xxcoefv   = wf.file
  xxconstv  = wf.file
  xxmap     = wf.file

  # --- coefficient matrix -> one row per (variable, lambda step)
  # input:
  #   ,s0
  #   V1,34.4221918005038
  #   V2,42.2816648447219
  # output:
  #   vno,sno,coef
  #   V1,s0,0
  #   V1,s1,1.49990093015948
  #   V1,s2,2.93162904818885
  #   V1,s3,4.29828291361314
  coefCmd = [
    "mnullto f=0 v=vno -nfn i=#{xxcoef} ",
    "msetstr v=coef a=fld ",
    "mcross k=vno s=fld f=s* a=sno o=#{xxcoefv}"
  ]
  system(coefCmd.join("|"))

  # --- constant file (model$a0) -> rows labelled CONSTANT
  # input:
  #   ,V1
  #   s0,1.75
  #   s1,0.625074302380394
  #   s2,-0.448721786141637
  #   s3,-1.47371218520986
  constCmd = [
    "mnullto f=0 v=sno -nfn i=#{xxconst} ",
    "msetstr v=CONSTANT a=vno ",
    "mcut f=vno,sno,V1:coef o=#{xxconstv}"
  ]
  system(constCmd.join("|"))

  # --- lookup table: (sno - 1) -> lambda
  xxsno2lambda = wf.file
  lambdaCmd = [
    "mcut f=sno:sno_,lambda i=#{xxlambda} ",
    "mcal c='${sno_}-1' a=sno ",
    "msortf f=sno o=#{xxsno2lambda}"
  ]
  system(lambdaCmd.join("|"))

  # --- sparse input only: add the "V"-prefixed variable number used as join key
  if isSMX
    system("mcal c='\"V\"+$s{vno}' a=vnov i=#{xxvar2num} o=#{xxmap}")
  end

  # --- merge constants and coefficients, attach lambda (and variable name)
  fieldCut = if isSMX
    ["mjoin k=vno K=vnov m=#{xxmap} f=var:vname -n ", "mcut f=lambda,vno,vname,coef "]
  else
    ["mcut f=lambda,vno,coef "]
  end

  mergeCmd = [
    "mcat i=#{xxconstv},#{xxcoefv} ",
    "msed f=sno c=s v= ",        # strips the "s" from the step labels
    "mselstr f=coef v=0 -r",      # presumably removes rows whose coef is exactly 0 - confirm against mselstr docs
    "msortf f=sno ",
    "mjoin k=sno m=#{xxsno2lambda} f=lambda -n "
  ] + fieldCut + [
    "msortf f=lambda%nr,vno o=#{oPath}/coef.csv"
  ]
  system(mergeCmd.join("|"))
end
|
557
|
+
|
558
|
+
#######################
|
559
|
+
## convert the prediction file generated in R into the output file (csv).
|
560
|
+
## 1) the first column is converted from ID made in R into key field name on input data.
|
561
|
+
## 2) each column corresponds to the prediction for each lambda specified.
|
562
|
+
## 3) exp(predicted value) if poisson
|
563
|
+
# Converts the prediction CSV written by R into the final output CSV.
#
# xxpredict : headerless prediction CSV from R; column 0 is the row number,
#             columns 1..n hold the predictions, e.g.
#               ,1,2,3
#               1,724.004406058175,743.068998688436,742.831291756625
#               2,832.022338347663,959.170798180323,957.54590188041
# lmd       : comma separated lambda specification; the i-th element names
#             the i-th prediction column as "lambda_<element>"
# xxkey2num : CSV containing the fields key_num and key
# key       : name given to the key field in the output
# family    : "poisson"  -> values are replaced by exp(value)
#             "binomial" -> values are replaced by 1/(1+exp(-value))
#             anything else -> values are written unchanged
# ofile     : output CSV path
#
# Returns the value of the final system call.
def outputPrediction(xxpredict,lmd,xxkey2num,key,family,ofile)

  predFlds = lmd.split(",").map{|ele| "lambda_#{ele}"}
  numbered = predFlds.each_with_index.map{|name,idx| "#{idx+1}:#{name}"}
  fldNames = (["0:key_num"] + numbered).join(",")

  wf=MCMD::Mtemp.new
  xxnum2key=wf.file
  system "msortf f=key_num i=#{xxkey2num} o=#{xxnum2key}"

  # mcal expression applied to every prediction field (nil = leave as is)
  transform =
    case family
    when "poisson"  then lambda{|fld| "exp(${#{fld}})"}
    when "binomial" then lambda{|fld| "1/(1+exp((-1)*${#{fld}}))"}
    end

  cmd = ""
  cmd << "mcut f=#{fldNames} -nfni i=#{xxpredict} |"
  cmd << "mdelnull f=key_num |"

  if transform
    predFlds.each{|fld|
      # compute into a scratch field, drop the original, rename the scratch field back
      cmd << "mcal c='#{transform.call(fld)}' a=xxnew_#{fld} |"
      cmd << "mcut -r f=#{fld} |"
      cmd << "mfldname f=xxnew_#{fld}:#{fld} |"
    }
  end

  # join the key field
  cmd << "msortf f=key_num |"
  cmd << "mjoin k=key_num m=#{xxnum2key} f=key |"
  cmd << "msortf f=key |"
  cmd << "mcut f=key:#{key},lambda* o=#{ofile}"
  system(cmd)
end
|
610
|
+
|
611
|
+
|
612
|
+
#################################################################################################
|
613
|
+
#### Entry point
|
614
|
+
|
615
|
+
########################
|
616
|
+
## predict mode
|
617
|
+
# Runs the generated R script and aborts when R does not finish successfully.
# Without this check a failed R run (whose messages are discarded unless
# -verbose is given) would go unnoticed and the post-processing below would
# work on missing files while the command still logged a normal end.
run_r_script = lambda{|scpFile,verbose|
  cmd = if verbose
    "R --vanilla -q < #{scpFile}"
  else
    "R --vanilla -q --slave < #{scpFile} 2>/dev/null "
  end
  unless system(cmd)
    raise "#ERROR# R script execution failed (re-run with -verbose to see the messages from R)"
  end
}

########################
## predict mode
if ARGV.index("-predict")
  args=MCMD::Margs.new(ARGV,"i=,I=,o=,c=,lambda=,-sparse,-predict,T=,-verbose,T=","i=,I=")

  # show only warnings and errors from mcmd
  ENV["KG_VerboseLevel"]="2" unless args.bool("-verbose")
  ENV["KG_ScpVerboseLevel"]="3" unless args.bool("-verbose")

  # work file path (a trailing slash is removed)
  if args.str("T=")!=nil then
    ENV["KG_TmpPath"] = args.str("T=").sub(/\/$/,"")
  end

  # model directory written in model building mode
  iPath = args.file("I=","r")
  params=nil
  File.open("#{iPath}/build_params.json","r"){|fpr|
    params=JSON.load(fpr)
  }

  ifile = args.file("i=","r")
  cfile = args.file("c=","r")

  # settings recorded when the model was built
  var=params["var"]
  val=params["val"]
  key=params["key"]
  exposure=params["exposure"]
  family=params["family"]
  isSMX=params["sparse"]
  nocv=params["nocv"]

  ofile = args.file("o=","w")
  lmd   = args.str("lambda=","min")
  if nocv then
    if lmd=~/min/ or lmd=~/1se/
      raise "#ERROR# `min' or `1se' in lambda= parameter cannot be specified because the model built without cross-validation"
    end
  end
  # replace the keywords by the R expressions holding those lambda values
  lmdVar= lmd.gsub("min","model$lambda.min").gsub("1se","model$lambda.1se")

  wf=MCMD::Mtemp.new
  o={}
  o["xxscp"]      =wf.file
  o["xxofile"]    =wf.file
  xxkey2num       =wf.file

  if isSMX then
    xxrow =wf.file
    xxcol =wf.file
    xxval =wf.file
    xxexposure=nil
    xxexposure=wf.file if exposure

    rowSize=smatrix(ifile,key,var,val,nil,true,"#{iPath}/map_var2vno.csv",xxkey2num,xxrow,xxcol,xxval)
    mkExposure(cfile,key,exposure,xxkey2num,xxexposure)

    scp0=scpHeader()
    scp1=scpInpSMX(xxrow,xxcol,xxval)
    scp2=scpPrdSMX("#{iPath}/model.robj",rowSize,lmdVar,xxexposure,o["xxofile"],nocv)
  else
    xxvar=wf.file
    xxexposure=nil
    xxexposure=wf.file if exposure

    matrix(ifile,key,var,xxvar,exposure,xxexposure,xxkey2num)

    scp0=scpHeader()
    scp1=scpInpMTX(xxvar)
    scp2=scpPrdMTX("#{iPath}/model.robj",lmdVar,xxexposure,o["xxofile"])
  end

  # writing the R script
  File.open(o["xxscp"],"w"){|fpw|
    fpw.puts "#{scp0}#{scp1}#{scp2}"
  }

  MCMD::msgLog("#{File.basename($0)}: executing R script...")
  run_r_script.call(o["xxscp"],args.bool("-verbose"))

  # output prediction
  outputPrediction(o["xxofile"],lmd,xxkey2num,key,family,ofile)

########################
#### model building mode
else

  args=MCMD::Margs.new(ARGV,"i=,c=,O=,k=,x=,v=,y=,alpha=,family=,exposure=,os=,T=,-verbose,-z,T=,param=,seed=,-sparse,-nocv","k=,x=,y=,i=,O=")

  # show only warnings and errors from mcmd
  ENV["KG_VerboseLevel"]="2" unless args.bool("-verbose")
  ENV["KG_ScpVerboseLevel"]="3" unless args.bool("-verbose")

  # work file path (a trailing slash is removed)
  if args.str("T=")!=nil then
    ENV["KG_TmpPath"] = args.str("T=").sub(/\/$/,"")
  end

  ifile =args.file("i=","r")
  cfile =args.file("c=","r")
  oPath =args.file("O=", "w")
  osfile=args.file("os=","r") # not referenced again in this section

  # flag for sparse matrix or matrix
  isSMX=args.bool("-sparse")

  doZ=args.bool("-z")

  var=nil
  key=nil
  val=nil
  yVar=nil
  # ---- parameters for sparse matrix (y= and exposure= are looked up in the c= file)
  if isSMX then
    key = args.field("k=" , ifile, nil , 1,1)["names"].join(",")
    var = args.field("x=", ifile, nil, 1,1)["names"][0]
    val = args.field("v=", ifile, nil, 1,1)
    val = val["names"][0] if val
    yVar = args.field("y=", cfile, nil, 1,1)["names"][0]
    exposure= args.field("exposure=",cfile,nil,1,1)
    exposure=exposure["names"][0] if exposure

  # ---- parameters for matrix (everything is looked up in the i= file)
  else
    key = args.field("k=", ifile, nil, 1)["names"].join(",")
    var = args.field("x=", ifile, nil, 1)["names"].join(",")
    yVar = args.field("y=", ifile, nil, 1,1)["names"][0]
    exposure= args.field("exposure=",ifile,nil,1,1)
    exposure=exposure["names"][0] if exposure
  end

  # ---- other parameters
  alpha  = args.float("alpha=", 1.0, 0.0, 1.0)
  family = args.str("family=", "gaussian")
  seed   = args.int("seed=", -1)
  nocv   = args.bool("-nocv")
  param  = args.str("param=")
  param  = ","+param if param # the leading comma is added here for later use in the R script

  if family!="poisson" and exposure
    raise "#ERROR# `exposure=' can be specified with only `family=poisson'"
  end

  MCMD::mkDir(oPath)

  wf=MCMD::Mtemp.new
  o={}
  o["xxvar2num"]  =wf.file
  o["xxmodel"]    =wf.file
  o["xxcoef"]     =wf.file
  o["xxcoefPNG"]  =wf.file
  o["xxconst"]    =wf.file
  o["xxlambda"]   =wf.file
  o["xxlambdaPNG"]=wf.file
  o["xxinfo"]     =wf.file
  o["xxscp"]      =wf.file

  # convert tra file to sparse matrix
  # the input is handled as a sparse matrix when -sparse is specified.
  if isSMX then
    xxrow =wf.file
    xxcol =wf.file
    xxval =wf.file
    xxy   =wf.file
    xxexposure=wf.file if exposure # stays nil when exposure= is not given
    xxkey2num=wf.file

    smatrix(ifile,key,var,val,cfile,false,o["xxvar2num"],xxkey2num,xxrow,xxcol,xxval)
    mkYvar(cfile,key,yVar,xxkey2num,xxy)
    mkExposure(cfile,key,exposure,xxkey2num,xxexposure)

    scp0=scpHeader(seed)
    scp1=scpInpSMX(xxrow,xxcol,xxval,xxy)
    scp2=scpExeSMX(family,alpha,param,xxexposure,nocv,doZ)
    scp3=scpResult(o["xxmodel"],o["xxcoef"],o["xxcoefPNG"],o["xxconst"],o["xxlambda"],o["xxlambdaPNG"],o["xxinfo"])

    # the variable map is read back from the model directory in predict mode
    system "cp #{o['xxvar2num']} #{oPath}/map_var2vno.csv"
  # otherwise it's assumed as matrix data
  else
    xxvar=wf.file
    xxy  =wf.file
    xxexposure=nil
    xxexposure=wf.file if exposure
    xxkey2num=wf.file

    matrix(ifile,key,var,xxvar,exposure,xxexposure,xxkey2num,yVar,xxy)

    scp0=scpHeader(seed)
    scp1=scpInpMTX(xxvar,xxy)
    scp2=scpExeMTX(family,alpha,param,xxexposure,nocv,doZ)
    scp3=scpResult(o["xxmodel"],o["xxcoef"],o["xxcoefPNG"],o["xxconst"],o["xxlambda"],o["xxlambdaPNG"],o["xxinfo"])
  end

  # writing the R script
  File.open(o["xxscp"],"w"){|fpw|
    fpw.puts "#{scp0}#{scp1}#{scp2}#{scp3}"
  }
  MCMD::msgLog("#{File.basename($0)}: executing R script...")
  run_r_script.call(o["xxscp"],args.bool("-verbose"))

  # saving all results to oPath
  coeff(o["xxcoef"],o["xxconst"],o["xxlambda"],o['xxvar2num'],isSMX,oPath)
  system "cp #{o['xxmodel']} #{oPath}/model.robj"
  system "cp #{o['xxcoefPNG']} #{oPath}/coef.png"
  system "cp #{o['xxlambdaPNG']} #{oPath}/lambda.png" unless nocv
  system "cp #{o['xxinfo']} #{oPath}/info.csv"
  system("cp #{o["xxscp"]} #{oPath}/scp.R")
  system "mcut f=sno -r i=#{o['xxlambda']} o=#{oPath}/lambda.csv"

  # build settings, read back in predict mode from build_params.json
  kv={"var"=>var,"val"=>val,"key"=>key,"exposure"=>exposure,"family"=>family,"sparse"=>isSMX,"nocv"=>nocv}
  File.open("#{oPath}/build_params.json","w"){|fpw|
    JSON.dump(kv,fpw)
  }

end

# end message
MCMD::endLog(args.cmdline)
|
843
|
+
|