nysol-mining 3.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/bin/mbopt.rb +522 -0
- data/bin/mburst.rb +716 -0
- data/bin/mgfeatures.rb +340 -0
- data/bin/mglmnet.rb +843 -0
- data/bin/mgnfeatures.rb +369 -0
- data/bin/mgpmetis.rb +449 -0
- data/bin/midxmine.rb +484 -0
- data/bin/mnb.rb +631 -0
- data/bin/mnetsimile.rb +572 -0
- data/bin/mnewman.rb +345 -0
- data/bin/msketchsort.rb +243 -0
- data/bin/msm.rb +172 -0
- data/ext/sketchsortrun/Main.cpp +161 -0
- data/ext/sketchsortrun/Main.hpp +24 -0
- data/ext/sketchsortrun/SketchSort.cpp +526 -0
- data/ext/sketchsortrun/SketchSort.hpp +138 -0
- data/ext/sketchsortrun/extconf.rb +26 -0
- data/ext/sketchsortrun/sketchsortrun.cpp +56 -0
- data/lib/nysol/mining.rb +24 -0
- metadata +89 -0
data/bin/mnetsimile.rb
ADDED
@@ -0,0 +1,572 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: utf-8
|
3
|
+
|
4
|
+
# 1.0 initial development: 2015/02/03
|
5
|
+
# 1.1 multi-processing: 2015/03/01
|
6
|
+
# 1.2 bug fix (0 division, sequence) : 2015/03/01
|
7
|
+
$version="1.2"
|
8
|
+
$revision="###VERSION###"
|
9
|
+
|
10
|
+
# Print the command help (in Japanese) to STDERR and terminate the process.
#
# Reads the global $version for the banner line.  Never returns: the method
# always ends with `exit`.
#
# Notes on the help text itself:
# * The heredoc is an interpolating one, so a literal backslash must be written
#   as "\\" -- a bare "\f" would be emitted as a form-feed character.
# * Option names, defaults and output file names listed here follow what the
#   script actually reads and writes (I=, mp=, -verbose, featuresGraph.csv,
#   featuresNode.csv, pvalues_ks.csv, pvalues_wx.csv).
# * The similarity formula includes the division by the vector size d, as
#   done in canberraSim.
def help

STDERR.puts <<EOF
----------------------------
mnetsimile.rb version #{$version}
----------------------------
概要) NetSimileによるグラフ特徴量の計算および複数グラフの類似度計算
特徴) 1) グラフの全節点について7つの特徴量を計算する。
2) 全節点の7つの特徴量について5つの統計量(中央値,平均,標準偏差,歪度,尖度)を計算しグラフの特徴量(35次元ベクトル)とする。
3) グラフ間類似度を上記35次元ベクトルのcanberraDistanceにより定義する。*注1)
4) 指定した全グラフペアの類似度を出力する。

参考文献) M. Berlingerio, D. Koutra, T. Eliassi-Rad, and C. Faloutsos,
“Netsimile: A scalable approach to size-independent network similarity,” CoRR, vol. abs/1209.2684, 2012.

用法) mnetsimile.rb I= O= [nf=] [ef=] [mode=sequence|allpairs|features] [-edge] [mp=] [T=] [-verbose] [--help]
I= : 複数のグラフファイルを格納したディレクトリパス【必須】*注2)
nf= : 節点データ上の節点項目名(省略時は"node")
ef= : 枝データ上の2つの節点項目名(省略時は"node1,node2")
O= : 出力パス【必須】
mode= : 動作モード【オプション】(省略時は"allpairs")
: sequence : ファイル名のアルファベット順のグラフシーケンスとして見なし
: 隣り合うグラフ同士のみを比較する。
: allpairs : 全グラフペアを比較する。
: features : 各グラフの特徴量のみ出力し、グラフ間の類似度は計算しない。
-edge : 節点ファイルは利用せず、枝ファイルのみを利用する(I=のディレクトリに節点ファイルはなくてもよい)
mp= : 並列処理の数(省略時は1)

## その他
T= : 作業ディレクトリ【デフォルト:"/tmp"】
-verbose : 内部のMCMDのコマンドメッセージを表示
--help : ヘルプの表示

*注1) グラフ間類似度(P,Q)=1.0-canberraDistance(P,Q)= 1.0 - (1/d) * sum_{i=1}^{d} \\frac{|P_i-Q_i|}{|P_i|+|Q_i|}
*注2) グラフファイルは節点ファイルと枝ファイルによって指定する。
それぞれファイルの拡張子は".node"、".edge"でなければならない。

必要なソフトウェア)
1) R
2) Rのigraphパッケージ

入力データ)
例: graphsディレクトリ内の4ファイル
g1.edge g1.node g2.edge g2.node
node1,node2 node node1,node2 node
E,J A E,J A
E,A B E,A B
J,D C J,D C
J,A D J,A D
J,H E J,H E
D,H F D,H F
D,F G H,F G
H,F H A,F H
A,F I B,H I
B,H J J

# 以下のコマンドを実行することで得られる出力ファイル群
$ mnetsimile.rb I=graphs O=result

出力データ1) グラフ別35の特徴量(7特徴量×5統計量)
featuresGraph.csv (featureおよびstat項目の値の意味は後述)
gid,feature,stat,value
g1,cc,median,0.3333333333
g1,cc,mean,0.4285714286
g1,cc,usd,0.3170632437
g1,cc,uskew,0.8631849195
g1,cc,ukurt,1.244875346
g1,ccN,median,0.3333333333
g1,ccN,mean,0.4166666667
:
dat2,cc,median,0.3333333333
dat2,cc,mean,0.4047619048
dat2,cc,usd,0.4287918305
:

注) gid: グラフID(拡張子を除いたファイル名)

出力データ2) グラフ+節点別の7特徴量
featuresNode.csv (feature項目の値の意味は後述)
gid%0,fid,node%1,feature,value
dat1,A_deg,A,deg,3
dat1,A_cc,A,cc,0.333333333333333
dat1,A_degN,A,degN,3
dat1,A_ccN,A,ccN,0.555555555555556
:
注) fid項目は、node項目とfeature項目を結合した項目
注) 孤立節点(他の節点と接続のない1つの節点)の特徴量は全て0と定義する。
孤立節点を無視したければ-edgeオプションを指定して実行すれば良い。

出力データ3) グラフ間類似度行列
similarity.csv(特徴量35次元ベクトルのcanberraSimilarity)
gid1,gid2,similarity
dat1,dat2,0.6593442055

出力データ4) 7つの特徴量について、グラフ間比較における有意確率(サンプルは節点)
pvalues_ks.csv : two-sample Kolmogorov-Smirnov test (分布の差の検定)
gid1,gid2,deg,cc,degN,ccN,eEgo,eoEgo,nEgo
dat1,dat2,0.937502699053248,0.937502699053248,0.937502699053248,0.541243098374871,0.937502699053248,0.937502699053248,0.937502699053248

pvalues_wx.csv : two-sample Wilcoxon test (中央値の差の検定)
gid1,gid2,deg,cc,degN,ccN,eEgo,eoEgo,nEgo
dat1,dat2,0.64316810166757,0.687070906822053,0.846328946368719,0.257155612551595,0.436141208362552,0.429488461717429,0.62639648401305

7つの特徴量の項目名:
deg : 次数
cc : クラスタ係数
degN : 近傍節点の平均次数
ccN : 近傍節点の平均クラスタ係数
eEgo : egoネットワークの枝数
eoEgo : egoネットワークに接続された枝数
nEgo : egoネットワークに接続された節点数

注) 詳細な定義は、上述の参考文献を参照のこと。

5つの統計量
median : 中央値
mean : 平均
usd : 標準偏差
uskew : 歪度
ukurt : 尖度

例)
$ mnetsimile.rb I=graphs O=result mode=sequence ef=v1,v2 nf=v

# Copyright(c) NYSOL 2012- All Rights Reserved.
EOF
  exit
end
|
137
|
+
|
138
|
+
# Print the script version and revision to STDERR and terminate the process.
#
# When $revision still holds the unexpanded placeholder (it contains the text
# "VERSION"), it is replaced by "0" before printing.
def ver()
  if /VERSION/ =~ $revision
    $revision = "0"
  end
  STDERR.puts "version " + $version.to_s + " revision " + $revision.to_s
  exit
end
|
143
|
+
|
144
|
+
help() if ARGV[0]=="--help" or ARGV.size <= 0
|
145
|
+
ver() if ARGV[0]=="--version"
|
146
|
+
|
147
|
+
require "rubygems"
|
148
|
+
require "nysol/mcmd"
|
149
|
+
|
150
|
+
# Rライブラリ実行可能確認
|
151
|
+
exit(1) unless(MCMD::chkRexe("igraph"))
|
152
|
+
|
153
|
+
# Stub: performs no work and terminates the process immediately.
# The three arguments are accepted but never used.
def genRtest(eFile,oFile,scpFile)
  Kernel.exit
end
|
156
|
+
|
157
|
+
# Generate the R (igraph) script that computes the seven per-node features
# (deg, cc, degN, ccN, eEgo, eoEgo, nEgo) for one graph.
#
# eFile   : path of the edge list the R script reads (format="edgelist")
# oFile   : path of the CSV file the R script writes with write.csv
# scpFile : path the generated R script is written to
#
# Only the script file is produced here; running R is left to the caller.
#
# The neighbour lists with the node itself removed are built with lapply, not
# sapply: sapply simplifies its result to a matrix whenever every node has the
# same number of neighbours (e.g. a ring), and the later per-node sapply calls
# would then iterate over single matrix cells instead of one list per node.
def genRscript(eFile,oFile,scpFile)
  r_proc = <<EOF
library(igraph)
## reading edge file
g=read.graph("#{eFile}",format="edgelist",directed=FALSE)

#### (1) d_i : degree of node_i
deg=degree(g)

#### (2) c_i : clustering coefficient of node_i
cc=transitivity(g,type="local",isolates="zero")

## neighbors list with order 1 or 2(1 hop or 2 hops) of node_i
nei1=neighborhood(g,order=1)
nei2=neighborhood(g,order=2)

## delete myself(first element) from the 1 hope list
delFirst=function(x){x[-1]}
nei1del=lapply(nei1,FUN=delFirst)

#### (3) d_{N(i)} : mean of degree for neighbors of node_i
f=function(x){mean(deg[unlist(x)])}
degN=sapply(nei1del,FUN=f)

#### (4) c_{N(i)} : mean of clustering coefficient for neighbors of node_i
f=function(x){mean(cc[unlist(x)])}
ccN=sapply(nei1del,FUN=f)

## get ego-network of node_i
f=function(x){induced.subgraph(g,vids=unlist(x))}
ego=lapply(nei1,FUN=f)

#### (5) E_{Ego(i)} : number of edges for egonetwork of node_i
eEgo=sapply(ego,FUN=ecount)

#### (6) E^o_{Ego(i)} : number of edges outgoing from egonetwork of node_i
f=function(x){
  # induced subgraph by 1 hop neighbor
  isg1=induced.subgraph(g,vids=unlist(nei1[x]))
  # induced subgraph by 2 hops neighbor
  isg2=induced.subgraph(g,vids=unlist(nei2[x]))
  # difference 2hops neighbor from 1 hop neighbor
  isgDiff=induced.subgraph(g,vids=setdiff(unlist(nei2[x]),unlist(nei1[x])))
  length(E(isg2))-length(E(isg1))-length(E(isgDiff))
}
eoEgo=sapply(rep(1:vcount(g)),FUN=f)

#### (7) N(Ego(i)) : number of nodes for neighbors of egonetwork_i
f=function(x){length(setdiff(unlist(nei2[x]),unlist(nei1[x])))}
nEgo=sapply(rep(1:vcount(g)),FUN=f)

dat=data.frame(deg=deg, cc=cc, degN=degN, ccN=ccN, eEgo=eEgo, eoEgo=eoEgo, nEgo=nEgo)
write.csv(dat,file="#{oFile}",quote=FALSE)
EOF

  File.open(scpFile,"w"){|fpw|
    fpw.write(r_proc)
  }
end
|
216
|
+
|
217
|
+
# Convert one graph (<baseName>.edge plus, optionally, <baseName>.node) into
# the numbered edge list that the generated R script reads, by running a
# series of MCMD shell pipelines.
#
# baseName : graph file path without the ".edge" / ".node" extension
# edgeFlg  : when true the node file is not used even if it exists
# numFile  : output; pairs of node numbers (nid1 nid2), space separated
# mapFile  : output; table relating the original node label ("node") to the
#            number ("nid") used in numFile
# isoFile  : output; nodes that were not matched against the set of nodes
#            appearing in the edge file (presumably the isolated nodes --
#            relies on the semantics of `mcommon -r`, confirm in MCMD docs)
#
# Reads the globals $ef1 / $ef2 (the two node field names of the edge file)
# and $nf (the node field name of the node file).
# Returns the result of the last `system` call.
def conv2num(baseName,edgeFlg,numFile,mapFile,isoFile)
  nFile="#{baseName}.node"
  eFile="#{baseName}.edge"

  wf=MCMD::Mtemp.new
  xxn1=wf.file
  xxn2=wf.file
  xxn3=wf.file
  xxeNodeMF=wf.file

  # create a nodes list that are included in node and edge data
  system "mcut f=#{$ef1}:node i=#{eFile} o=#{xxn1}"
  system "mcut f=#{$ef2}:node i=#{eFile} o=#{xxn2}"
  # File.exist? is used because File.exists? was removed in Ruby 3.2.
  if edgeFlg or not File.exist?(nFile)
    # no node file to use: write a header-only node list
    system "echo node >#{xxn3}"
  else
    system "mcut f=#{$nf}:node i=#{nFile} o=#{xxn3}"
  end

  # xxeNodeMF : nodes list that are included in edge
  system "mcat i=#{xxn1},#{xxn2} | muniq k=node | msetstr v=1 a=eNode o=#{xxeNodeMF}"

  # isolate nodes list
  system "mcat i=#{xxn1},#{xxn2},#{xxn3} | mcommon k=node m=#{xxeNodeMF} -r o=#{isoFile}"

  # create a mapping table between the original node label and the number iGraph will use
  f=""
  f << "mcat i=#{xxn1},#{xxn2},#{xxn3} |"
  f << "muniq k=node |"
  f << "mjoin k=node m=#{xxeNodeMF} f=eNode |"
  f << "mnullto f=eNode v=0 |"
  f << "mnumber s=eNode%r,node a=nid o=#{mapFile}"
  system(f)

  # create a data file that R script read
  f=""
  f << "mjoin k=#{$ef1} K=node m=#{mapFile} f=nid:nid1 i=#{eFile} |"
  f << "mjoin k=#{$ef2} K=node m=#{mapFile} f=nid:nid2 |"
  f << "mcut f=nid1,nid2 -nfno |"
  f << "tr ',' ' ' >#{numFile}"
  system(f)
end
|
259
|
+
|
260
|
+
|
261
|
+
#################################################################################################
|
262
|
+
#### Entry point
|
263
|
+
|
264
|
+
args=MCMD::Margs.new(ARGV,"i=,I=,ef=,nf=,O=,mode=,-edge,mp=,T=,-verbose,T=","O=")
|
265
|
+
|
266
|
+
# mcmdのメッセージは警告とエラーのみ
|
267
|
+
ENV["KG_VerboseLevel"]="2" unless args.bool("-verbose")
|
268
|
+
ENV["KG_ScpVerboseLevel"]="3" unless args.bool("-verbose")
|
269
|
+
|
270
|
+
#ワークファイルパス
|
271
|
+
if args.str("T=")!=nil then
|
272
|
+
ENV["KG_TmpPath"] = args.str("T=").sub(/\/$/,"")
|
273
|
+
end
|
274
|
+
|
275
|
+
iPath = args.file("I=","r")
|
276
|
+
oPath = args.file("O=","w")
|
277
|
+
mode = args.str("mode=","allpairs")
|
278
|
+
# ---- edge field names (two nodes)
|
279
|
+
ef = args.str("ef=", "node1,node2")
|
280
|
+
ef = ef.split(",")
|
281
|
+
if ef.size!=2 then
|
282
|
+
raise "#ERROR# ef= must take two field names"
|
283
|
+
end
|
284
|
+
$ef1=ef[0]
|
285
|
+
$ef2=ef[1]
|
286
|
+
|
287
|
+
# ---- node field name
|
288
|
+
nf = args.str("nf=","node")
|
289
|
+
$nf=nf
|
290
|
+
|
291
|
+
edgeFlg=args.bool("-edge")
|
292
|
+
$mp=args.int("mp=",1)
|
293
|
+
|
294
|
+
nfFile="#{oPath}/featuresNode.csv"
|
295
|
+
gfFile="#{oPath}/featuresGraph.csv"
|
296
|
+
MCMD::mkDir(oPath)
|
297
|
+
|
298
|
+
wf=MCMD::Mtemp.new
|
299
|
+
numFile=Array.new($mp)
|
300
|
+
mapFile=Array.new($mp)
|
301
|
+
scpFile=Array.new($mp)
|
302
|
+
feaFile=Array.new($mp)
|
303
|
+
isoFile=Array.new($mp)
|
304
|
+
isoFeatures=Array.new($mp)
|
305
|
+
xxtmp=Array.new($mp)
|
306
|
+
|
307
|
+
nfPath=Array.new($mp)
|
308
|
+
gfPath=Array.new($mp)
|
309
|
+
|
310
|
+
(0...$mp).each{|i|
|
311
|
+
numFile[i]=wf.file
|
312
|
+
mapFile[i]=wf.file
|
313
|
+
scpFile[i]=wf.file
|
314
|
+
feaFile[i]=wf.file
|
315
|
+
isoFile[i]=wf.file
|
316
|
+
|
317
|
+
isoFeatures[i]=wf.file
|
318
|
+
nfPath[i] =wf.file
|
319
|
+
xxtmp[i] =wf.file
|
320
|
+
gfPath[i] =wf.file
|
321
|
+
|
322
|
+
MCMD::mkDir(nfPath[i])
|
323
|
+
MCMD::mkDir(gfPath[i])
|
324
|
+
}
|
325
|
+
|
326
|
+
|
327
|
+
files = Dir["#{iPath}/*.edge"]
|
328
|
+
files.sort! # 419行目のnames.sortと同じ順番を保証するため
|
329
|
+
files.meach($mp){|file,count,pno|
|
330
|
+
|
331
|
+
MCMD::msgLog("START fearture extraction: #{file} #{pno} #{isoFeatures[pno]}")
|
332
|
+
|
333
|
+
baseName=file.sub(/\.edge$/,"")
|
334
|
+
name=baseName.sub(/^.*\//,"")
|
335
|
+
|
336
|
+
conv2num(baseName,edgeFlg,numFile[pno],mapFile[pno],isoFile[pno])
|
337
|
+
|
338
|
+
# isolate node
|
339
|
+
f=""
|
340
|
+
f << "msetstr v=#{name},0,0,0,0,0,0,0 a=gid,deg,cc,degN,ccN,eEgo,eoEgo,nEgo i=#{isoFile[pno]} |"
|
341
|
+
f << "mcut f=gid,node:#{$nf},deg,cc,degN,ccN,eEgo,eoEgo,nEgo o=#{isoFeatures[pno]}"
|
342
|
+
system(f)
|
343
|
+
|
344
|
+
genRscript(numFile[pno], feaFile[pno],scpFile[pno])
|
345
|
+
if args.bool("-verbose") then
|
346
|
+
system "R --vanilla -q < #{scpFile[pno]} "
|
347
|
+
else
|
348
|
+
system "R --vanilla -q --slave < #{scpFile[pno]} 2>/dev/null"
|
349
|
+
end
|
350
|
+
|
351
|
+
f=""
|
352
|
+
f << "mnullto f=0 v=seq -nfn i=#{feaFile[pno]} |"
|
353
|
+
f << "mcal c='${seq}-1' a=nid |"
|
354
|
+
f << "mjoin k=nid f=node m=#{mapFile[pno]} |"
|
355
|
+
f << "msetstr a=gid v=#{name} |"
|
356
|
+
f << "mcut f=gid,node:#{$nf},deg,cc,degN,ccN,eEgo,eoEgo,nEgo o=#{nfPath[pno]}/#{name}"
|
357
|
+
system(f)
|
358
|
+
|
359
|
+
system "mcat i=#{nfPath[pno]}/#{name},#{isoFeatures[pno]} o=#{xxtmp[pno]}"
|
360
|
+
system "cp #{xxtmp[pno]} #{nfPath[pno]}/#{name}"
|
361
|
+
|
362
|
+
f=""
|
363
|
+
f << "msummary c=median,mean,usd,uskew,ukurt f=deg,cc,degN,ccN,eEgo,eoEgo,nEgo i=#{nfPath[pno]}/#{name} |"
|
364
|
+
f << "mfldname f=fld:feature |"
|
365
|
+
f << "msetstr v=value a=value |"
|
366
|
+
f << "mcross k=feature a=stat f=median,mean,usd,uskew,ukurt s=value |"
|
367
|
+
f << "msetstr a=gid v=#{name} |"
|
368
|
+
f << "mcut f=gid,feature,stat,value o=#{gfPath[pno]}/#{name}"
|
369
|
+
system(f)
|
370
|
+
|
371
|
+
}
|
372
|
+
|
373
|
+
gfStr=[]
|
374
|
+
gfPath.each{|path| gfStr << "#{path}/*" }
|
375
|
+
nfStr=[]
|
376
|
+
nfPath.each{|path| nfStr << "#{path}/*" }
|
377
|
+
|
378
|
+
f=""
|
379
|
+
f << "mcat i=#{gfStr.join(',')} |"
|
380
|
+
f << "mcut f=gid,feature,stat,value |"
|
381
|
+
f << "msortf f=gid,feature,stat o=#{gfFile}"
|
382
|
+
system(f)
|
383
|
+
|
384
|
+
f=""
|
385
|
+
f << "mcat i=#{nfStr.join(',')} |"
|
386
|
+
f << "msetstr v=value a=value |"
|
387
|
+
f << "mcross k=gid,#{$nf} s=value a=feature f=deg,cc,degN,ccN,eEgo,eoEgo,nEgo |"
|
388
|
+
f << "mcal c='$s{#{$nf}}+\"_\"+$s{feature}' a=fid |"
|
389
|
+
f << "mcut f=gid,fid,#{$nf},feature,value |"
|
390
|
+
f << "msortf f=gid,fid,#{$nf},feature o=#{nfFile}"
|
391
|
+
system(f)
|
392
|
+
|
393
|
+
# NOTE: currently unused.
# Read the CSV file `file` through MCMD::Mcsvin and return an array holding
# the "value" field of every row, in file order (values are kept as read,
# without numeric conversion).
def getFeatures(file)
  values=[]
  MCMD::Mcsvin.new("i=#{file}") do |csv|
    csv.each { |row| values.push(row["value"]) }
  end
  return values
end
|
403
|
+
|
404
|
+
features={}
|
405
|
+
names=[]
|
406
|
+
Dir.glob(gfStr).each{|file|
|
407
|
+
name=file.sub(/^.*\//,"")
|
408
|
+
vector=[]
|
409
|
+
MCMD::Mcsvin.new("i=#{file}"){|csv|
|
410
|
+
csv.each{|flds|
|
411
|
+
vector << flds["value"].to_f
|
412
|
+
}
|
413
|
+
}
|
414
|
+
if vector.size==35
|
415
|
+
names << name
|
416
|
+
features[name]=vector
|
417
|
+
else
|
418
|
+
MCMD::warningLog("internal warning: vector size must be 35, but #{vector.size} in file #{name}")
|
419
|
+
end
|
420
|
+
}
|
421
|
+
names.sort!
|
422
|
+
|
423
|
+
# Canberra similarity between two numeric vectors of equal length:
#
#   1.0 - (1/d) * sum_i |p_i - q_i| / (|p_i| + |q_i|)      (d = p.size)
#
# p, q : arrays of numbers; q must have at least p.size elements
# Returns a Float.  A dimension where both values are 0 contributes nothing
# to the distance (avoids 0/0).  Two empty vectors are treated as identical
# and give 1.0 instead of raising ZeroDivisionError.
def canberraSim(p,q)
  return 1.0 if p.empty?

  # accumulate as Float so that integer inputs are not truncated by
  # integer division (e.g. 2/4 == 0)
  dist=0.0
  p.each_index{|i|
    den=p[i].abs+q[i].abs
    next if den==0
    dist += (p[i]-q[i]).abs.to_f/den
  }
  return 1.0-dist/p.size
end
|
436
|
+
|
437
|
+
# NOTE: currently unused.
# Concatenate the node-feature files of two graphs, write an R script that
# fits a linear-kernel C-svc model (kernlab::ksvm, 3-fold cross validation)
# with gid as the class label, and run it with R.
#
# path         : directory holding the per-graph node-feature files
# name1, name2 : file names of the two graphs inside `path`
#
# The input files are expected to carry the fields
#   gid,node,deg,cc,degN,ccN,eEgo,eoEgo,nEgo
#
# As written, the process exits right after R has run, so the return
# statement below is never reached.
def svm(path,name1,name2)
  tmp=MCMD::Mtemp.new
  dsFile=tmp.file
  scriptFile=tmp.file

  cmd=[
    "mcat i=#{path}/#{name1},#{path}/#{name2} |",
    "mcut f=gid,deg,cc,degN,ccN,eEgo,eoEgo,nEgo o=#{dsFile}"
  ].join
  system(cmd)

  rCode = <<EOF
library(kernlab)
library(mlbench)
d=read.csv("#{dsFile}")
y=d$gid
x=as.matrix(d[,2:8])
model=ksvm(x,y,type="C-svc",kernel="vanilladot",cross=3)
print(model)
str(model)
EOF

  File.open(scriptFile,"w") do |out|
    out.write(rCode)
  end

  system "R --vanilla -q < #{scriptFile} "
  exit
  # placeholder values; unreachable because of the exit above
  return 0.1,0.9
end
|
472
|
+
|
473
|
+
# NOTE: currently unused (its only call site is commented out).
# Run two-sample Kolmogorov-Smirnov and Wilcoxon tests in R on each of the
# seven node features of two graphs and return the p-values.
#
# paths        : where the per-graph node-feature files live; a single entry
#                or an array of entries, each either a directory or a glob
#                pattern such as "dir/*" (the form the commented-out caller
#                passes)
# name1, name2 : file names of the two graphs to compare
#
# The input files are expected to carry the fields
#   gid,node,deg,cc,degN,ccN,eEgo,eoEgo,nEgo
#
# Returns two arrays (KS, Wilcoxon), each laid out as
#   [name1, name2, deg, cc, degN, ccN, eEgo, eoEgo, nEgo]
# with the p-values as read from the CSV written by R.
# Raises RuntimeError when a feature file cannot be found under `paths`.
#
# The original body interpolated an undefined local `path` and therefore
# raised NameError on every call; the files are now resolved from `paths`.
def test(paths,name1,name2)
  wf=MCMD::Mtemp.new
  xxks=wf.file
  xxwx=wf.file
  xxscp=wf.file

  # find the feature file called `name` under any of the given paths
  locate=lambda{|name|
    candidates=Array(paths).flat_map{|p|
      File.directory?(p) ? [File.join(p,name)] : Dir.glob(p)
    }
    hit=candidates.find{|f| File.basename(f)==name && File.file?(f) }
    raise "#ERROR# feature file not found: #{name}" unless hit
    hit
  }
  file1=locate.call(name1)
  file2=locate.call(name2)

  r_proc = <<EOF
## reading edge file
d1=read.csv("#{file1}")
d2=read.csv("#{file2}")

ks_deg  =ks.test(d1$deg  , d2$deg   ,exact=TRUE)
ks_cc   =ks.test(d1$cc   , d2$cc    ,exact=TRUE)
ks_degN =ks.test(d1$degN , d2$degN  ,exact=TRUE)
ks_ccN  =ks.test(d1$ccN  , d2$ccN   ,exact=TRUE)
ks_eEgo =ks.test(d1$eEgo , d2$eEgo  ,exact=TRUE)
ks_eoEgo=ks.test(d1$eoEgo, d2$eoEgo ,exact=TRUE)
ks_nEgo =ks.test(d1$nEgo , d2$nEgo  ,exact=TRUE)

wx_deg  =wilcox.test(d1$deg  , d2$deg   ,exact=TRUE)
wx_cc   =wilcox.test(d1$cc   , d2$cc    ,exact=TRUE)
wx_degN =wilcox.test(d1$degN , d2$degN  ,exact=TRUE)
wx_ccN  =wilcox.test(d1$ccN  , d2$ccN   ,exact=TRUE)
wx_eEgo =wilcox.test(d1$eEgo , d2$eEgo  ,exact=TRUE)
wx_eoEgo=wilcox.test(d1$eoEgo, d2$eoEgo ,exact=TRUE)
wx_nEgo =wilcox.test(d1$nEgo , d2$nEgo  ,exact=TRUE)

ks_dat=data.frame(deg=ks_deg$p.value, cc=ks_cc$p.value, degN=ks_degN$p.value, ccN=ks_ccN$p.value, eEgo=ks_eEgo$p.value, eoEgo=ks_eoEgo$p.value, nEgo=ks_nEgo$p.value)
wx_dat=data.frame(deg=wx_deg$p.value, cc=wx_cc$p.value, degN=wx_degN$p.value, ccN=wx_ccN$p.value, eEgo=wx_eEgo$p.value, eoEgo=wx_eoEgo$p.value, nEgo=wx_nEgo$p.value)
print(ks_dat)
write.csv(ks_dat,file="#{xxks}",quote=FALSE,row.names=FALSE)
write.csv(wx_dat,file="#{xxwx}",quote=FALSE,row.names=FALSE)
EOF

  File.open(xxscp,"w"){|fpw|
    fpw.write(r_proc)
  }

  system "R --vanilla -q < #{xxscp} "

  # read one p-value CSV into [name1, name2, p-values...] (per row)
  features=%w[deg cc degN ccN eEgo eoEgo nEgo]
  readPvalues=lambda{|file|
    row=[]
    MCMD::Mcsvin.new("i=#{file}"){|csv| csv.each{|flds|
      row << name1
      row << name2
      features.each{|fld| row << flds[fld] }
    }}
    row
  }

  ksv=readPvalues.call(xxks)
  wxv=readPvalues.call(xxwx)
  return ksv,wxv
end
|
544
|
+
|
545
|
+
# skip calculation of similarity if mode=="features"
|
546
|
+
unless mode=="features" then
|
547
|
+
MCMD::Mcsvout.new("o=#{oPath}/similarity.csv f=gid1,gid2,similarity"){|oCSV|
|
548
|
+
MCMD::Mcsvout.new("o=#{oPath}/pvalues_ks.csv f=gid1,gid2,deg,cc,degN,ccN,eEgo,eoEgo,nEgo"){|ksCSV|
|
549
|
+
MCMD::Mcsvout.new("o=#{oPath}/pvalues_wx.csv f=gid1,gid2,deg,cc,degN,ccN,eEgo,eoEgo,nEgo"){|wxCSV|
|
550
|
+
(0...names.size-1).each{|i|
|
551
|
+
(i...names.size).each{|j|
|
552
|
+
next if i==j
|
553
|
+
next if mode=="sequence" and i+1!=j
|
554
|
+
MCMD::msgLog("START similarity calcuration: #{names[i]} and #{names[j]}")
|
555
|
+
g1=features[names[i]]
|
556
|
+
g2=features[names[j]]
|
557
|
+
sim=canberraSim(g1,g2)
|
558
|
+
#ks_pvalues,wx_pvalues=test(nfStr,names[i],names[j])
|
559
|
+
#prob1,prob2=svm(nfPath,names[i],names[j])
|
560
|
+
oCSV.write( [ names[i],names[j],sim ] )
|
561
|
+
#ksCSV.write( ks_pvalues )
|
562
|
+
#wxCSV.write( wx_pvalues )
|
563
|
+
}
|
564
|
+
}
|
565
|
+
}}}
|
566
|
+
end
|
567
|
+
|
568
|
+
#wf.rm
|
569
|
+
|
570
|
+
# end message
|
571
|
+
MCMD::endLog(args.cmdline)
|
572
|
+
|