nysol-mining 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/mbopt.rb +522 -0
- data/bin/mburst.rb +716 -0
- data/bin/mgfeatures.rb +340 -0
- data/bin/mglmnet.rb +843 -0
- data/bin/mgnfeatures.rb +369 -0
- data/bin/mgpmetis.rb +449 -0
- data/bin/midxmine.rb +484 -0
- data/bin/mnb.rb +631 -0
- data/bin/mnetsimile.rb +572 -0
- data/bin/mnewman.rb +345 -0
- data/bin/msketchsort.rb +243 -0
- data/bin/msm.rb +172 -0
- data/ext/sketchsortrun/Main.cpp +161 -0
- data/ext/sketchsortrun/Main.hpp +24 -0
- data/ext/sketchsortrun/SketchSort.cpp +526 -0
- data/ext/sketchsortrun/SketchSort.hpp +138 -0
- data/ext/sketchsortrun/extconf.rb +26 -0
- data/ext/sketchsortrun/sketchsortrun.cpp +56 -0
- data/lib/nysol/mining.rb +24 -0
- metadata +89 -0
data/bin/mnetsimile.rb
ADDED
@@ -0,0 +1,572 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: utf-8
|
3
|
+
|
4
|
+
# 1.0 initial development: 2015/02/03
|
5
|
+
# 1.1 multi-processing: 2015/03/01
|
6
|
+
# 1.2 bug fix (0 division, sequence) : 2015/03/01
|
7
|
+
# Script version; interpolated into the help banner and printed by `ver`.
$version="1.2"
|
8
|
+
# Revision placeholder; `ver` reports "0" while this still matches /VERSION/.
$revision="###VERSION###"
|
9
|
+
|
10
|
+
# Prints the usage text (in Japanese) to STDERR and terminates the process.
#
# Reads the global $version for the banner. The text is a double-quoted
# heredoc, so backslash escapes and #{...} are interpreted: the similarity
# formula is therefore written without backslashes (a literal "\frac" would be
# emitted as a form-feed character followed by "rac").
#
# The option names and output file names listed here mirror what the script
# actually parses and writes (-verbose, mp=, featuresGraph.csv,
# featuresNode.csv, pvalues_ks.csv, pvalues_wx.csv).
def help

STDERR.puts <<EOF
----------------------------
mnetsimile.rb version #{$version}
----------------------------
概要) NetSimileによるグラフ特徴量の計算および複数グラフの類似度計算
特徴) 1) グラフの全節点について7つの特徴量を計算する。
2) 全節点の7つの特徴量について5つの統計量(中央値,平均,標準偏差,歪度,尖度)を計算しグラフの特徴量(35次元ベクトル)とする。
3) グラフ間類似度を上記35次元ベクトルのcanberraDistanceにより定義する。*注1)
4) 指定した全グラフペアの類似度を出力する。

参考文献) M. Berlingerio, D. Koutra, T. Eliassi-Rad, and C. Faloutsos,
“Netsimile: A scalable approach to size-independent network similarity,” CoRR, vol. abs/1209.2684, 2012.

用法) mnetsimile.rb i=|I= O= mode=sequence|allpairs|features -edge [mp=] [T=] [-verbose] [--help]
I= : 複数のグラフファイルを格納したディレクトリパス【必須】*注2)
nf= : 節点データ上の節点項目名(省略時は"node")
ef= : 枝データ上の2つの節点項目名(省略時は"node1,node2")
O= : 出力パス【必須】
mode= : 動作モード【オプション】
: sequence : ファイル名のアルファベット順のグラフシーケンスとして見なし
: 隣り合うグラフ同士のみを比較する。
: allpairs : 全グラフペアを比較する。
: features : 各グラフの特徴量のみ出力し、グラフ間の類似度は計算しない。
-edge : 節点ファイルは利用せず、枝ファイルのみを利用する(I=のディレクトリに節点ファイルはなくてもよい)

## その他
mp= : 並列処理の数【デフォルト:1】
T= : 作業ディレクトリ【デフォルト:"/tmp"】
-verbose : 内部のMCMDのコマンドメッセージを表示
--help : ヘルプの表示

*注1) グラフ間類似度(P,Q)=1.0-canberraDistance(P,Q)= 1.0 - (1/d) * sum_{i=1}^{d} |P_i-Q_i| / (|P_i|+|Q_i|)
*注2) グラフファイルは節点ファイルと枝ファイルによって指定する。
それぞれファイルの拡張子は".node"、".edge"でなければならない。

必要なソフトウェア)
1) R
2) Rのigraphパッケージ

入力データ)
例: graphsディレクトリ内の4ファイル
g1.edge g1.node g2.edge g2.node
node1,node2 node node1,node2 node
E,J A E,J A
E,A B E,A B
J,D C J,D C
J,A D J,A D
J,H E J,H E
D,H F D,H F
D,F G H,F G
H,F H A,F H
A,F I B,H I
B,H J J

# 以下のコマンドを実行することで得られる出力ファイル群
$ mnetsimile.rb I=graphs O=result

出力データ1) グラフ別35の特徴量(7特徴量×5統計量)
featuresGraph.csv (featureおよびstat項目の値の意味は後述)
gid,feature,stat,value
g1,cc,median,0.3333333333
g1,cc,mean,0.4285714286
g1,cc,usd,0.3170632437
g1,cc,uskew,0.8631849195
g1,cc,ukurt,1.244875346
g1,ccN,median,0.3333333333
g1,ccN,mean,0.4166666667
:
dat2,cc,median,0.3333333333
dat2,cc,mean,0.4047619048
dat2,cc,usd,0.4287918305
:

注) gid: グラフID(拡張子を除いたファイル名)

出力データ2) グラフ+節点別の7特徴量
featuresNode.csv (feature項目の値の意味は後述)
gid%0,fid,node%1,feature,value
dat1,A_deg,A,deg,3
dat1,A_cc,A,cc,0.333333333333333
dat1,A_degN,A,degN,3
dat1,A_ccN,A,ccN,0.555555555555556
:
注) fid項目は、node項目とfeature項目を結合した項目
注) 孤立節点(他の節点と接続のない1つの節点)の特徴量は全て0と定義する。
孤立節点を無視したければ-edgeオプションを指定して実行すれば良い。

出力データ3) グラフ間類似度行列
similarity.csv(特徴量35次元ベクトルのcanberraSimilarity)
gid1,gid2,similarity
dat1,dat2,0.6593442055

出力データ4) 7つの類似度について、グラフ間比較における有意確率(サンプルは節点)
pvalues_ks.csv : two-sample Kolmogorov-Smirnov test (分布の差の検定)
gid1,gid2,deg,cc,degN,ccN,eEgo,eoEgo,nEgo
dat1,dat2,0.937502699053248,0.937502699053248,0.937502699053248,0.541243098374871,0.937502699053248,0.937502699053248,0.937502699053248

pvalues_wx.csv : two-sample Wilcoxon test (中央値の差の検定)
gid1,gid2,deg,cc,degN,ccN,eEgo,eoEgo,nEgo
dat1,dat2,0.64316810166757,0.687070906822053,0.846328946368719,0.257155612551595,0.436141208362552,0.429488461717429,0.62639648401305

7つの特徴量の項目名:
deg : 次数
cc : クラスタ係数
degN : 近傍節点の平均次数
ccN : 近傍節点の平均クラスタ係数
eEgo : egoネットワークの枝数
eoEgo : egoネットワークに接続された枝数
nEgo : egoネットワークに接続された節点数

注) 詳細な定義は、上述の参考文献を参照のこと。

5つの統計量
median : 中央値
mean : 平均
usd : 標準偏差
uskew : 歪度
ukurt : 尖度

例)
$ mnetsimile.rb I=graphs O=result mode=sequence ef=v1,v2 nf=v

# Copyright(c) NYSOL 2012- All Rights Reserved.
EOF
  exit
end
|
137
|
+
|
138
|
+
# Prints "version <$version> revision <$revision>" to STDERR and exits.
# While $revision still matches /VERSION/ (the unreplaced placeholder) it is
# overwritten with "0" before printing.
def ver()
  if $revision =~ /VERSION/
    $revision = "0"
  end
  STDERR.puts "version #{$version} revision #{$revision}"
  exit
end
|
143
|
+
|
144
|
+
# No arguments (ARGV[0] is nil) or a leading --help shows the usage text;
# a leading --version shows the version. Both helpers terminate the process.
case ARGV[0]
when nil, "--help" then help()
when "--version"   then ver()
end
|
146
|
+
|
147
|
+
require "rubygems"
|
148
|
+
require "nysol/mcmd"
|
149
|
+
|
150
|
+
# Check that the R library can be run; exit with status 1 when MCMD::chkRexe("igraph") is falsy.
|
151
|
+
exit(1) unless(MCMD::chkRexe("igraph"))
|
152
|
+
|
153
|
+
# Stub: ignores all three arguments and terminates the process via `exit`.
# No call to it is visible in this file.
def genRtest(eFile,oFile,scpFile)
|
154
|
+
exit
|
155
|
+
end
|
156
|
+
|
157
|
+
# Writes an R (igraph) script that computes the seven per-node features used by
# this tool for one graph.
#
# eFile   : path the R script reads as an undirected "edgelist" graph
# oFile   : path the R script writes the feature table to (write.csv)
# scpFile : path where the generated R script itself is stored
#
# The neighbour lists with the node itself removed are built with lapply.
# sapply would simplify the result to a matrix whenever every vertex has the
# same number (>1) of neighbours (e.g. a ring or a triangle); the following
# per-node sapply calls would then iterate over matrix cells instead of nodes
# and data.frame() would fail on columns of different length.
def genRscript(eFile,oFile,scpFile)
  r_proc = <<EOF
library(igraph)
## reading edge file
g=read.graph("#{eFile}",format="edgelist",directed=FALSE)

#### (1) d_i : degree of node_i
deg=degree(g)

#### (2) c_i : clustering coefficient of node_i
cc=transitivity(g,type="local",isolates="zero")

## neighbors list with order 1 or 2(1 hop or 2 hops) of node_i
nei1=neighborhood(g,order=1)
nei2=neighborhood(g,order=2)

## delete myself(first element) from the 1 hope list
## (lapply keeps one list element per node; sapply could collapse to a matrix)
delFirst=function(x){x[-1]}
nei1del=lapply(nei1,FUN=delFirst)

#### (3) d_{N(i)} : mean of degree for neighbors of node_i
f=function(x){mean(deg[unlist(x)])}
degN=sapply(nei1del,FUN=f)

#### (4) c_{N(i)} : mean of clustering coefficient for neighbors of node_i
f=function(x){mean(cc[unlist(x)])}
ccN=sapply(nei1del,FUN=f)

## get ego-network of node_i
f=function(x){induced.subgraph(g,vids=unlist(x))}
ego=lapply(nei1,FUN=f)

#### (5) E_{Ego(i)} : number of edges for egonetwork of node_i
eEgo=sapply(ego,FUN=ecount)

#### (6) E^o_{Ego(i)} : number of edges outgoing from egonetwork of node_i
f=function(x){
  # induced subgraph by 1 hop neighbor
  isg1=induced.subgraph(g,vids=unlist(nei1[x]))
  # induced subgraph by 2 hops neighbor
  isg2=induced.subgraph(g,vids=unlist(nei2[x]))
  # difference 2hops neighbor from 1 hop neighbor
  isgDiff=induced.subgraph(g,vids=setdiff(unlist(nei2[x]),unlist(nei1[x])))
  length(E(isg2))-length(E(isg1))-length(E(isgDiff))
}
eoEgo=sapply(rep(1:vcount(g)),FUN=f)

#### (7) N(Ego(i)) : number of nodes for neighbors of egonetwork_i
f=function(x){length(setdiff(unlist(nei2[x]),unlist(nei1[x])))}
nEgo=sapply(rep(1:vcount(g)),FUN=f)

dat=data.frame(deg=deg, cc=cc, degN=degN, ccN=ccN, eEgo=eEgo, eoEgo=eoEgo, nEgo=nEgo)
write.csv(dat,file="#{oFile}",quote=FALSE)
EOF

  File.open(scpFile,"w"){|fpw|
    fpw.write(r_proc)
  }
end
|
216
|
+
|
217
|
+
# Converts one graph (<baseName>.edge and optionally <baseName>.node) into the
# numeric form read by the generated R script, by running MCMD commands through
# the shell.
#
# baseName : graph path without extension
# edgeFlg  : when truthy the node file is ignored even if it exists
# numFile  : output; edges as "nid1 nid2" pairs, one per line, no header
# mapFile  : output; mapping between original node labels and the number (nid)
#            iGraph will use
# isoFile  : output; list of isolate nodes (nodes not found in the edge file)
#
# Reads the globals $ef1/$ef2 (edge field names) and $nf (node field name).
# Uses File.exist? because File.exists? no longer exists in Ruby 3.2+.
def conv2num(baseName,edgeFlg,numFile,mapFile,isoFile)
  nFile="#{baseName}.node"
  eFile="#{baseName}.edge"

  wf=MCMD::Mtemp.new
  xxn1=wf.file
  xxn2=wf.file
  xxn3=wf.file
  xxeNodeMF=wf.file

  # create a nodes list that are included in node and edge data
  system "mcut f=#{$ef1}:node i=#{eFile} o=#{xxn1}"
  system "mcut f=#{$ef2}:node i=#{eFile} o=#{xxn2}"
  if edgeFlg || !File.exist?(nFile)
    # no node data to use: a header-only file stands in for the node list
    system "echo node >#{xxn3}"
  else
    system "mcut f=#{$nf}:node i=#{nFile} o=#{xxn3}"
  end

  # xxeNodeMF : nodes list that are included in edge
  system "mcat i=#{xxn1},#{xxn2} | muniq k=node | msetstr v=1 a=eNode o=#{xxeNodeMF}"

  # isolate nodes list
  system "mcat i=#{xxn1},#{xxn2},#{xxn3} | mcommon k=node m=#{xxeNodeMF} -r o=#{isoFile}"

  # create a mapping table between the original node label and the number iGraph will use
  f=""
  f << "mcat i=#{xxn1},#{xxn2},#{xxn3} |"
  f << "muniq k=node |"
  f << "mjoin k=node m=#{xxeNodeMF} f=eNode |"
  f << "mnullto f=eNode v=0 |"
  f << "mnumber s=eNode%r,node a=nid o=#{mapFile}"
  system(f)

  # create a data file that R script read
  f=""
  f << "mjoin k=#{$ef1} K=node m=#{mapFile} f=nid:nid1 i=#{eFile} |"
  f << "mjoin k=#{$ef2} K=node m=#{mapFile} f=nid:nid2 |"
  f << "mcut f=nid1,nid2 -nfno |"
  f << "tr ',' ' ' >#{numFile}"
  system(f)
end
|
259
|
+
|
260
|
+
|
261
|
+
#################################################################################################
|
262
|
+
#### Entry point
|
263
|
+
|
264
|
+
# ---- command line parsing and work-file preparation
args=MCMD::Margs.new(ARGV,"i=,I=,ef=,nf=,O=,mode=,-edge,mp=,T=,-verbose,T=","O=")

# Unless -verbose is given, restrict mcmd messages to warnings and errors.
unless args.bool("-verbose")
  ENV["KG_VerboseLevel"]="2"
  ENV["KG_ScpVerboseLevel"]="3"
end

# Work-file directory: T= with one trailing slash removed.
tmpDir = args.str("T=")
ENV["KG_TmpPath"] = tmpDir.sub(/\/$/,"") unless tmpDir.nil?

iPath = args.file("I=","r")
oPath = args.file("O=","w")
mode  = args.str("mode=","allpairs")

# ---- edge field names (exactly two, comma separated)
ef = args.str("ef=", "node1,node2").split(",")
raise "#ERROR# ef= must take two field names" unless ef.size==2
$ef1,$ef2 = ef

# ---- node field name
nf = args.str("nf=","node")
$nf = nf

edgeFlg = args.bool("-edge")
$mp     = args.int("mp=",1)

nfFile="#{oPath}/featuresNode.csv"
gfFile="#{oPath}/featuresGraph.csv"
MCMD::mkDir(oPath)

# One set of work files/directories per parallel slot (index 0...$mp).
wf=MCMD::Mtemp.new
numFile,mapFile,scpFile,feaFile,isoFile,isoFeatures,xxtmp,nfPath,gfPath = Array.new(9){ Array.new($mp) }

$mp.times do |slot|
  numFile[slot]=wf.file
  mapFile[slot]=wf.file
  scpFile[slot]=wf.file
  feaFile[slot]=wf.file
  isoFile[slot]=wf.file

  isoFeatures[slot]=wf.file
  nfPath[slot]=wf.file
  xxtmp[slot]=wf.file
  gfPath[slot]=wf.file

  # nfPath/gfPath hold one result file per graph, so they are directories.
  MCMD::mkDir(nfPath[slot])
  MCMD::mkDir(gfPath[slot])
end
|
325
|
+
|
326
|
+
|
327
|
+
# ---- per-graph feature extraction, distributed over $mp slots by meach
# The edge files are sorted so that the processing order matches the order
# produced later by names.sort!.
files = Dir["#{iPath}/*.edge"].sort
files.meach($mp) do |file,_count,pno|

  MCMD::msgLog("START fearture extraction: #{file} #{pno} #{isoFeatures[pno]}")

  stem = file.sub(/\.edge$/,"")     # path without the .edge extension
  gid  = stem.sub(/^.*\//,"")       # file name only; used as the graph id

  conv2num(stem,edgeFlg,numFile[pno],mapFile[pno],isoFile[pno])

  # isolate nodes: every feature is set to 0
  system([
    "msetstr v=#{gid},0,0,0,0,0,0,0 a=gid,deg,cc,degN,ccN,eEgo,eoEgo,nEgo i=#{isoFile[pno]} |",
    "mcut f=gid,node:#{$nf},deg,cc,degN,ccN,eEgo,eoEgo,nEgo o=#{isoFeatures[pno]}"
  ].join)

  # run the generated R script; R's output is shown only with -verbose
  genRscript(numFile[pno], feaFile[pno],scpFile[pno])
  rCmd = if args.bool("-verbose")
    "R --vanilla -q < #{scpFile[pno]} "
  else
    "R --vanilla -q --slave < #{scpFile[pno]} 2>/dev/null"
  end
  system rCmd

  # map R's 1-based row sequence back to the nid / original node label
  system([
    "mnullto f=0 v=seq -nfn i=#{feaFile[pno]} |",
    "mcal c='${seq}-1' a=nid |",
    "mjoin k=nid f=node m=#{mapFile[pno]} |",
    "msetstr a=gid v=#{gid} |",
    "mcut f=gid,node:#{$nf},deg,cc,degN,ccN,eEgo,eoEgo,nEgo o=#{nfPath[pno]}/#{gid}"
  ].join)

  # append the isolate-node rows to the per-node feature file
  system "mcat i=#{nfPath[pno]}/#{gid},#{isoFeatures[pno]} o=#{xxtmp[pno]}"
  system "cp #{xxtmp[pno]} #{nfPath[pno]}/#{gid}"

  # graph-level features: five statistics for each of the seven node features
  system([
    "msummary c=median,mean,usd,uskew,ukurt f=deg,cc,degN,ccN,eEgo,eoEgo,nEgo i=#{nfPath[pno]}/#{gid} |",
    "mfldname f=fld:feature |",
    "msetstr v=value a=value |",
    "mcross k=feature a=stat f=median,mean,usd,uskew,ukurt s=value |",
    "msetstr a=gid v=#{gid} |",
    "mcut f=gid,feature,stat,value o=#{gfPath[pno]}/#{gid}"
  ].join)

end
|
372
|
+
|
373
|
+
# ---- merge the per-slot result directories into the two output CSV files
# Glob patterns covering every per-graph file of every slot.
gfStr = gfPath.map{|dir| "#{dir}/*" }
nfStr = nfPath.map{|dir| "#{dir}/*" }

# featuresGraph.csv : gid,feature,stat,value
system([
  "mcat i=#{gfStr.join(',')} |",
  "mcut f=gid,feature,stat,value |",
  "msortf f=gid,feature,stat o=#{gfFile}"
].join)

# featuresNode.csv : one row per graph, node and feature; fid = "<node>_<feature>"
system([
  "mcat i=#{nfStr.join(',')} |",
  "msetstr v=value a=value |",
  "mcross k=gid,#{$nf} s=value a=feature f=deg,cc,degN,ccN,eEgo,eoEgo,nEgo |",
  "mcal c='$s{#{$nf}}+\"_\"+$s{feature}' a=fid |",
  "mcut f=gid,fid,#{$nf},feature,value |",
  "msortf f=gid,fid,#{$nf},feature o=#{nfFile}"
].join)
|
392
|
+
|
393
|
+
# NOTE: currently unused.
# Reads the CSV +file+ with MCMD::Mcsvin and returns the "value" field of every
# row, in read order, exactly as the reader yields it (no numeric conversion).
def getFeatures(file)
  values = []
  MCMD::Mcsvin.new("i=#{file}") do |csv|
    csv.each { |row| values.push(row["value"]) }
  end
  values
end
|
403
|
+
|
404
|
+
# ---- load the graph-level feature vector of every graph
# features : graph id => array of the "value" column converted with to_f
# names    : ids of the graphs whose vector has exactly 35 entries, sorted
features={}
names=[]
Dir.glob(gfStr).each do |path|
  gid = path.sub(/^.*\//,"")
  values = []
  MCMD::Mcsvin.new("i=#{path}") do |csv|
    csv.each { |row| values << row["value"].to_f }
  end

  # a graph without the full 35 values is reported and left out
  unless values.size==35
    MCMD::warningLog("internal warning: vector size must be 35, but #{values.size} in file #{gid}")
    next
  end

  names << gid
  features[gid]=values
end
names.sort!
|
422
|
+
|
423
|
+
# Similarity of two numeric vectors based on the Canberra distance:
#
#   1.0 - (1/d) * sum_i |p_i - q_i| / (|p_i| + |q_i|)      (d = p.size)
#
# p, q : arrays of numbers; q is indexed by the positions of p.
# Returns a Float. Terms whose denominator is 0 (both components are 0)
# contribute nothing. Two empty vectors are treated as identical (1.0) instead
# of raising ZeroDivisionError.
#
# Each term is computed in floating point, so Integer inputs are not truncated
# by integer division.
def canberraSim(p,q)
  return 1.0 if p.empty?

  dist = 0.0
  p.each_index do |i|
    den = p[i].abs + q[i].abs
    next if den == 0
    dist += (p[i] - q[i]).abs.to_f / den
  end
  1.0 - dist / p.size
end
|
436
|
+
|
437
|
+
# NOTE: currently unused (its only call site is commented out).
# Concatenates the per-node feature files <path>/<name1> and <path>/<name2>,
# keeps gid plus the seven feature columns, and runs an R script that fits and
# prints a kernlab ksvm model with gid as the class label.
#
# NOTE(review): the method calls `exit` right after running R, so the return
# statement below is never reached and the process ends here.
def svm(path,name1,name2)
  wf=MCMD::Mtemp.new
  xxds=wf.file
  xxscp=wf.file

  # input columns: gid,node,deg,cc,degN,ccN,eEgo,eoEgo,nEgo
  system([
    "mcat i=#{path}/#{name1},#{path}/#{name2} |",
    "mcut f=gid,deg,cc,degN,ccN,eEgo,eoEgo,nEgo o=#{xxds}"
  ].join)

  r_proc = <<EOF
library(kernlab)
library(mlbench)
d=read.csv("#{xxds}")
y=d$gid
x=as.matrix(d[,2:8])
model=ksvm(x,y,type="C-svc",kernel="vanilladot",cross=3)
print(model)
str(model)
EOF

  File.write(xxscp, r_proc)

  system "R --vanilla -q < #{xxscp} "
  exit
  # unreachable placeholder values
  return 0.1,0.9
end
|
472
|
+
|
473
|
+
# Resolves the per-node feature file of graph +name+ from +paths+.
# +paths+ may be a single entry or an array; each entry is either a directory
# (the file is looked up inside it) or a glob pattern such as "dir/*" (the
# match whose base name equals +name+ is taken).
# Raises RuntimeError when no such file is found.
def locateNodeFeatureFile(paths,name)
  Array(paths).each do |entry|
    if File.directory?(entry)
      candidate = File.join(entry,name)
      return candidate if File.exist?(candidate)
    else
      hit = Dir.glob(entry).find { |f| File.basename(f) == name }
      return hit if hit
    end
  end
  raise "#ERROR# node feature file not found: #{name}"
end

# NOTE: currently unused (its only call site is commented out).
# Compares the seven node features of two graphs with two-sample
# Kolmogorov-Smirnov and Wilcoxon tests run in R.
#
# paths : directory/directories or glob pattern(s) holding the per-node feature
#         files (columns: gid,node,deg,cc,degN,ccN,eEgo,eoEgo,nEgo)
# name1, name2 : graph ids, i.e. base names of the two feature files
#
# Returns [ksv, wxv]; each is [name1, name2, p(deg), p(cc), p(degN), p(ccN),
# p(eEgo), p(eoEgo), p(nEgo)] with the p-values as read back from the CSV
# written by R.
#
# The R script used to interpolate an undefined local `path` (the parameter is
# `paths`), which raised NameError on every call; the two input files are now
# resolved from `paths` first.
def test(paths,name1,name2)
  wf=MCMD::Mtemp.new
  xxks=wf.file
  xxwx=wf.file
  xxscp=wf.file

  file1=locateNodeFeatureFile(paths,name1)
  file2=locateNodeFeatureFile(paths,name2)

  r_proc = <<EOF
## reading node feature files
d1=read.csv("#{file1}")
d2=read.csv("#{file2}")

ks_deg  =ks.test(d1$deg  , d2$deg   ,exact=TRUE)
ks_cc   =ks.test(d1$cc   , d2$cc    ,exact=TRUE)
ks_degN =ks.test(d1$degN , d2$degN  ,exact=TRUE)
ks_ccN  =ks.test(d1$ccN  , d2$ccN   ,exact=TRUE)
ks_eEgo =ks.test(d1$eEgo , d2$eEgo  ,exact=TRUE)
ks_eoEgo=ks.test(d1$eoEgo, d2$eoEgo ,exact=TRUE)
ks_nEgo =ks.test(d1$nEgo , d2$nEgo  ,exact=TRUE)

wx_deg  =wilcox.test(d1$deg  , d2$deg   ,exact=TRUE)
wx_cc   =wilcox.test(d1$cc   , d2$cc    ,exact=TRUE)
wx_degN =wilcox.test(d1$degN , d2$degN  ,exact=TRUE)
wx_ccN  =wilcox.test(d1$ccN  , d2$ccN   ,exact=TRUE)
wx_eEgo =wilcox.test(d1$eEgo , d2$eEgo  ,exact=TRUE)
wx_eoEgo=wilcox.test(d1$eoEgo, d2$eoEgo ,exact=TRUE)
wx_nEgo =wilcox.test(d1$nEgo , d2$nEgo  ,exact=TRUE)

ks_dat=data.frame(deg=ks_deg$p.value, cc=ks_cc$p.value, degN=ks_degN$p.value, ccN=ks_ccN$p.value, eEgo=ks_eEgo$p.value, eoEgo=ks_eoEgo$p.value, nEgo=ks_nEgo$p.value)
wx_dat=data.frame(deg=wx_deg$p.value, cc=wx_cc$p.value, degN=wx_degN$p.value, ccN=wx_ccN$p.value, eEgo=wx_eEgo$p.value, eoEgo=wx_eoEgo$p.value, nEgo=wx_nEgo$p.value)
print(ks_dat)
write.csv(ks_dat,file="#{xxks}",quote=FALSE,row.names=FALSE)
write.csv(wx_dat,file="#{xxwx}",quote=FALSE,row.names=FALSE)
EOF

  File.open(xxscp,"w"){|fpw|
    fpw.write(r_proc)
  }

  system "R --vanilla -q < #{xxscp} "

  # Reads a p-value table written by R and flattens it into
  # [name1, name2, <seven p-values>] for every row it contains.
  featureNames=%w[deg cc degN ccN eEgo eoEgo nEgo]
  readPvalues=lambda do |csvFile|
    row=[]
    MCMD::Mcsvin.new("i=#{csvFile}"){|csv| csv.each{|flds|
      row << name1
      row << name2
      featureNames.each{|fld| row << flds[fld] }
    }}
    row
  end

  ksv=readPvalues.call(xxks)
  wxv=readPvalues.call(xxwx)
  return ksv,wxv
end
|
544
|
+
|
545
|
+
# ---- graph-to-graph similarity (skipped entirely when mode=="features")
# The three output files are opened together. The statistical tests (test/svm)
# are currently disabled, so this block writes rows only to similarity.csv and
# nothing to the two p-value files.
if mode!="features"
  MCMD::Mcsvout.new("o=#{oPath}/similarity.csv f=gid1,gid2,similarity") do |oCSV|
    MCMD::Mcsvout.new("o=#{oPath}/pvalues_ks.csv f=gid1,gid2,deg,cc,degN,ccN,eEgo,eoEgo,nEgo") do |_ksCSV|
      MCMD::Mcsvout.new("o=#{oPath}/pvalues_wx.csv f=gid1,gid2,deg,cc,degN,ccN,eEgo,eoEgo,nEgo") do |_wxCSV|
        # sequence : only neighbouring graphs in sorted order
        # otherwise: every pair (first, second) with first before second
        pairs = mode=="sequence" ? names.each_cons(2) : names.combination(2)
        pairs.each do |gidA,gidB|
          MCMD::msgLog("START similarity calcuration: #{gidA} and #{gidB}")
          sim=canberraSim(features[gidA],features[gidB])
          oCSV.write( [ gidA,gidB,sim ] )
        end
      end
    end
  end
end

#wf.rm

# end message
MCMD::endLog(args.cmdline)
|
572
|
+
|