nysol-mining 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,345 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: utf-8
3
+
4
+ require "rubygems"
5
+ require "nysol/mcmd"
6
+
7
# Version string printed by the help banner and by --version.
$version = "1.0"

# Revision placeholder; ver() reports it as "0" while it still contains "VERSION".
$revision = "###VERSION###"

# Script name interpolated into the help text.
CMD = "mnewman.rb"
10
+
11
# Print the usage text (in Japanese) to STDERR and terminate the process.
#
# The text interpolates the CMD constant and the $version global.
# Fixes relative to the previous text:
#   * the usage example used "${CMD}", which Ruby prints literally; it is now "$ #{CMD}"
#   * ni= is shown as optional in the synopsis, matching the argument handling
#   * "modurarity" typo corrected to "modularity"
def help
STDERR.puts <<EOF
----------------------------
#{CMD} version #{$version}
----------------------------
概要) newman クラスタリング
特徴) 1) modularityの最適化を用いたクラスタリングが実施できる。
2) 辺の媒介中心性を利用したグラフ分割によるクラスタリングが実施できる。
書式) #{CMD} ei= ef= [ni=] [nf=] [ew=] [al=] o= [-directed]

ei= : 枝データファイル
ef= : 枝データ上の2つの節点項目名
ni= : 節点データファイル
nf= : 節点データ上の節点項目名
ew= : 枝ファイル上の重み項目名【省略時は全ての枝の重みを1と見なす】
al= : クラスタリングアルゴリズム。省略時はmoが選択される。
mo:(modularity optimization) modularityを最適化するための貪欲法によるクラスタリング
無向グラフでのみ指定可能。igraphのcluster_fast_greedyを利用
eb:(edge betweenness) 辺の媒介中心性を計算し最もそれが高い辺を取り除くことでグラフを分割する。
分割数はmodularityが最大となるように決定される。igraphのcluster_edge_betweennessを利用
-directed : 有向グラフ
o= : クラスタ

その他
T= : ワークディレクトリ(default:/tmp)
-verbose : show the END messages of MCMD and R used in this command
--help : ヘルプの表示

必要なソフトウェア)
1) R
2) igraph package for R

入力データ)
節点ペアのCSVファイル(ファイル名はei=にて指定)
例)
$ cat data/dat1.edge
n1,n2
a,b
a,c
a,d
a,e
a,f
a,g
b,c
b,d
b,e
b,f
c,h
d,g
e,f

$ cat data/dat.node
node
a
b
c
d
e
f
g

$ #{CMD} ei=data/dat1.edge ef=n1,n2 al=mo o=rsl01
#END# mnewman.rb ei=./data/dat1.edge ef=n1,n2 al=mo o=rsl01; 2016/01/24 01:54:25

$ cat rsl01
node,cls
a,2
b,1
c,3
d,2
e,1
f,1
g,2
h,3

# Copyright(c) NYSOL 2012- All Rights Reserved.
EOF
exit
end
90
+
91
# Print "version <v> revision <r>" to STDERR and terminate.
# A revision that still holds the unreplaced "VERSION" placeholder is reset to "0" first.
def ver()
  if $revision =~ /VERSION/
    $revision = "0"
  end
  banner = "version #{$version} revision #{$revision}"
  STDERR.puts banner
  exit
end
96
+
97
# Informational switches are handled before any argument parsing.
if ARGV.size <= 0 || ARGV[0] == "--help"
  help()
end
if ARGV[0] == "--version"
  ver()
end

# Abort with status 1 when the igraph check performed by MCMD::chkRexe fails.
r_ready = MCMD::chkRexe("igraph")
exit(1) unless r_ready
102
+
103
+
104
####
# Convert the original graph files (node names as text) into integer-numbered form.
# Writes numFile, mapFile and weightFile, then returns the number of nodes of the graph.
#
#  ei    ni   xxnum   xxmap
#  v1,v2 v            node%1,flag%0,num
#  E,J   A    0 3     A,0,0
#  E,A   B    0 4     B,0,1
#  J,D   C    0 6     D,0,2
#  J,A   D => 1 5     E,0,3
#  J,H   E    2 4     F,0,4
#  D,H   F    2 5     H,0,5
#  D,F   G    2 6     J,0,6
#  H,F   H    3 6     C,1,7
#  A,F   I    4 5     G,1,8
#  B,H   J    5 6     I,1,9
#
# return value is 10 (nodes)
# "flag" on xxmap: 0:nodes in "ei", 1:nodes only in "ni".
#
# Parameters:
#   ni, nf      node file and its node field name (nf may be nil: edge file only)
#   ei          edge file
#   ef1, ef2    the two node field names on the edge file
#   ew          weight field name on the edge file, or nil for unit weights
#   numFile     output: space separated "num1 num2" pairs, no header
#   mapFile     output: CSV written by mnumber with the node,flag,num mapping
#   weightFile  output: CSV with one weight column (header is ew, or "weight" when ew is nil)
#
# The weight column travels through the same pipeline as the edge pair so that
# row i of weightFile always belongs to row i of numFile. Previously weightFile
# was cut from ei in its original row order while numFile was re-sorted.
#
# NOTE(review): mfsort orders num1/num2 inside each row, which looks like it
# discards edge direction for -directed runs — confirm against the MCMD manual.
def g2pair(ni,nf,ei,ef1,ef2,ew,numFile,mapFile,weightFile)
  tmp = MCMD::Mtemp.new
  from_ef1 = tmp.file
  from_ef2 = tmp.file
  from_ni  = tmp.file
  numbered = tmp.file

  # Collect node names: flag=0 for nodes seen on edges, flag=1 for nodes listed in ni.
  system "mcut f=#{ef1}:node i=#{ei} | msetstr v=0 a=flag o=#{from_ef1}"
  system "mcut f=#{ef2}:node i=#{ei} | msetstr v=0 a=flag o=#{from_ef2}"
  system "mcut f=#{nf}:node i=#{ni} | msetstr v=1 a=flag o=#{from_ni}" if nf

  node_sources = [from_ef1, from_ef2]
  if nf
    node_sources << from_ni
    # keep one row per node, preferring flag=0
    dedup = "mbest k=node s=flag from=0 size=1"
  else
    dedup = "muniq k=node"
  end

  # isolated nodes are set to the end of position in mapping file.
  # S= must start from 0 (but inside R vertex number will be added one)
  system [
    "mcat i=#{node_sources.join(',')} f=node,flag",
    dedup,
    "mnumber s=flag,node a=num S=0 o=#{mapFile}"
  ].join(" | ")

  # Select the edge pair together with its weight (a constant 1 when ew is omitted).
  wcol = ew || "weight"
  head = if ew
    ["mcut f=#{ef1},#{ef2},#{wcol} i=#{ei}"]
  else
    ["msetstr v=1 a=#{wcol} i=#{ei}", "mcut f=#{ef1},#{ef2},#{wcol}"]
  end

  system (head + [
    "msortf f=#{ef1}",
    "mjoin k=#{ef1} K=node m=#{mapFile} f=num:num1",
    "msortf f=#{ef2}",
    "mjoin k=#{ef2} K=node m=#{mapFile} f=num:num2",
    "mcut f=num1,num2,#{wcol}",
    "mfsort f=num1,num2",
    "msortf f=num1%n,num2%n o=#{numbered}"
  ]).join(" | ")

  # Both outputs are cut from the same sorted file, so their rows stay aligned.
  system "mcut f=num1,num2 -nfno i=#{numbered} | tr ',' ' ' >#{numFile}"
  system "mcut f=#{wcol} i=#{numbered} o=#{weightFile}"

  return MCMD::mrecount("i=#{mapFile}")
end
169
+
170
# Write the clustering result keyed by the original node names.
#
#   xxmap  mapping file with node,flag,num columns
#   xxout  CSV written by the R script with a single cls column
#   ofile  destination CSV with node,cls columns
#
# Rows of xxout are numbered from 0 with mnumber, and that number is joined to
# the num column of xxmap. The previously allocated but unused second temp
# file has been removed.
def convOrg(xxmap,xxout,ofile)
  tmp = MCMD::Mtemp.new
  numbered = tmp.file

  system "mnumber S=0 a=num -q i=#{xxout} o=#{numbered}"
  system "mjoin k=num f=cls m=#{numbered} i=#{xxmap} |mcut f=node,cls o=#{ofile}"
end
184
+
185
+
186
+
187
# Generate the R (igraph) script that clusters the numbered graph and write it to scpFile.
#
#   directed  truthy for a directed graph (rejected when al=="mo")
#   eFile     edge list file read with read.graph(format="edgelist")
#   wFile     CSV file holding the edge weights
#   ew        name of the weight column inside wFile
#   nodeSize  number of vertices passed to read.graph as n=
#   al        "mo" selects cluster_fast_greedy; any other value selects cluster_edge_betweenness
#   oFile     path the R script writes the per-vertex cluster id (cls) to
#   oInfo     path the R script writes cluster sizes and the modularity value to
#   scpFile   path of the generated script
#
# Raises RuntimeError when -directed is combined with al=="mo".
#
# Both algorithms share one script template; only the clustering call differs.
# The mo call now spells the igraph argument as weights= instead of relying on
# R's partial matching of weight=.
def genRscript(directed,eFile,wFile,ew,nodeSize,al,oFile,oInfo,scpFile)
  dir = directed ? "TRUE" : "FALSE"

  if al=="mo"
    raise "#ERROR# can't use -directed option with al=\"mo\" " if directed
    clustering = "nc=cluster_fast_greedy(g,weights=E(g)$weight,merges=T,modularity=T,membership=T)"
  else # eb (edge betweenness)
    clustering = "nc=cluster_edge_betweenness(g,weights=E(g)$weight,directed=#{dir},bridges=T,merges=T,modularity=T,edge.betweenness=T,membership=T)"
  end

  r_proc = <<EOF
library(igraph)
# reading edge file
g=read.graph("#{eFile}",format="edgelist",directed=#{dir},n=#{nodeSize})
# reading weight file
w=read.csv("#{wFile}")
E(g)$weight=as.list(w$"#{ew}")
# do clustering
#{clustering}

# 置換
ms=cbind(membership(nc))
# Community sizes:
cs=sizes(nc)

# modularity:
mq=modularity(nc)

dat=data.frame( cls=ms )
colnames(dat)=c("cls")

info=data.frame(cs, mq)
colnames(info)=c("cls","size","modurarityQ")

write.csv(dat,file="#{oFile}",quote=FALSE,row.names=FALSE)
write.csv(info,file="#{oInfo}",quote=FALSE,row.names=FALSE)
EOF

  File.open(scpFile,"w"){|fpw|
    fpw.write(r_proc)
  }
end
260
+
261
+
262
+
263
#################################################################################################
#### Entry point
#
# Parses the command line, converts the graph to numbered form, runs the
# generated R script and maps the clusters back to the original node names.
# Changes from the previous version:
#   * a missing or malformed ef= raises a clear error instead of failing on nil
#   * genRscript receives "weight" as the column name when ew= is omitted
#     (the header g2pair writes in that case) instead of nil
#   * a failing R run raises instead of silently continuing to convOrg

args=MCMD::Margs.new(ARGV,"ei=,ef=,ni=,nf=,ew=,al=,o=,-directed,T=,-verbose,--help","ei=")

ENV["KG_VerboseLevel"]="2" unless args.bool("-verbose")

# work file path (a trailing slash is stripped)
tmp_path = args.str("T=")
ENV["KG_TmpPath"] = tmp_path.sub(/\/$/,"") if tmp_path

# output file
ofile = args.file("o=","w")

# edge data: file name and the two node field names
edgeFile = args.file("ei=","r")
raise "#ERROR# ei= is mandatory" unless edgeFile

ef = args.field("ef=", edgeFile)
raise "#ERROR# ef= is mandatory" unless ef
raise "#ERROR# ef= takes exactly two field names" unless ef["names"].size==2
ef1,ef2 = ef["names"]

# ---- edge weight (optional, exactly one field)
ew = args.field("ew=", edgeFile, nil, 1,1)
ew = ew["names"][0] if ew

# node data
# if ni= is not specified, only the edge file is used for generating a graph.
nf = nil
nodeFile = args.file("ni=","r")
if nodeFile
  nf = args.field("nf=", nodeFile)
  raise "#ERROR# nf= is mandatory, when ni= is specified" unless nf
  nf = nf["names"][0]
end

# algorithm (default: mo)
al = args.str("al=","mo")
raise "#ERROR# al= can specify mo|eb" unless al=="mo" || al=="eb"

# directed or undirected graph
directed = args.bool("-directed")

# convert the original graph to one igraph can handle
wf=MCMD::Mtemp.new
xxnum   =wf.file
xxmap   =wf.file
xxout   =wf.file
xxscp   =wf.file
xxinfo  =wf.file
xxweight=wf.file

nodeSize=g2pair(nodeFile,nf,edgeFile,ef1,ef2,ew,xxnum,xxmap,xxweight)

# generate R script, and run
# g2pair names the weight column "weight" when ew= was not given.
genRscript(directed,xxnum,xxweight,ew || "weight",nodeSize,al,xxout,xxinfo,xxscp)

r_ok = if args.bool("-verbose")
  system "R --vanilla -q < #{xxscp}"
else
  system "R --vanilla -q --slave < #{xxscp} 2>/dev/null"
end
raise "#ERROR# R script failed; rerun with -verbose to see the R messages" unless r_ok

# map back to the original node names and write the result
convOrg(xxmap,xxout,ofile)

MCMD::endLog(args.cmdline)
344
+
345
+
@@ -0,0 +1,243 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: utf-8
3
+
4
+ require 'rubygems'
5
+ require 'nysol/mcmd'
6
+ require 'nysol/mining'
7
+ require 'fileutils'
8
+
9
+
10
# Release history
#   1.0  initial development                                 2014/12/02
#   1.1  added the seed for org sketchsort and msketchsort   2015/10/24
#   1.2  added error process from sketchsort                 2016/11/5
$version = "1.2"
14
+
15
# Print the usage text (in Japanese) to STDERR and terminate the process.
#
# Fixes relative to the previous text:
#   * the synopsis interpolated $cmd, which is never assigned, so the command
#     name printed empty; it is now written out
#   * the synopsis listed dist= twice and omitted seed= and -uc
#   * "Haming" typo corrected to "Hamming"
def help

STDERR.puts <<EOF
---------------------------------
msketchsort.rb version #{$version}
---------------------------------
概要) スケッチソートを利用した全ベクトルペアの距離計算
特徴) データに含まれる全ベクトル間の距離を高速に計算できる。
窓を指定することで比較するベクトルの範囲を限定することができる。

書式) msketchsort.rb e= tid= [dist=C|H] [th=] [mr=] [wf=] [ws=] [seed=] [-uc] i= [o=] [--help]

e= : ベクトルの各要素となる項目名【必須】ex) e=val1,val2,val3,val4
tid= : ベクトルを識別するための項目名(i=上の項目名)【必須】
dist= : ベクトル間の距離計算の方法。(省略時は C が指定される)
C (cosine distance): コサイン距離 (th=0-2)
H (Hamming distance): ハミング距離 (th=1- )
th= : dist=で指定された距離計算について、ここで指定された値以下のペアを出力する。省略時は0.01が設定される。
mr= : ペアを逃す確率を指定 (missing ratio) False Negative。省略時は0.00001が設定される。
wf= : ウィンドウ項目。ex) 日付
ws= : ウィンドウサイズの上限(0以上の整数)【0で制限なし,default:0】
wfで指定した窓に含まれる全ペアを窓をずらしながら計算する。
i= : 入力ファイル
o= : 出力ファイル
seed= : 乱数の種(1以上の整数,default:1)
-uc : データ点を0を中心に移動させない


例1: input1.csv
tid,val1,val2,val3,val4,val5
0,4,9,1,8,7
1,2,6,3,4,10
2,3,10,1,7,4
3,2,8,1,3,10
4,4,7,2,3,10
5,8,4,3,1,9
6,6,7,5,1,9
7,5,4,2,6,7
8,3,10,1,5,9
9,9,1,8,7,3
10,5,2,3,10,9
11,4,9,1,8,7

$ msketchsort.rb i=input1.csv tid=tid e=val1,val2,val3,val4,val5 o=out1.csv
SketchSort version 0.0.8
Written by Yasuo Tabei

deciding parameters such that the missing edge ratio is no more than 1e-05
decided parameters:
hamming distance threshold: 1
number of blocks: 4
number of chunks: 14
.
.
.

$ more out1.csv
distance,tid,tid2
5.96046e-08,0,11



例2: input2.csv
eCode,tgdate,term,val1,val2,val3,val4,val5
1990,20100120,0,4,9,1,8,7
2499,20100120,0,2,6,3,4,10
2784,20100120,0,3,10,1,7,4
3109,20100120,0,2,8,1,3,10
3114,20100120,0,4,7,2,3,10
6364,20100120,0,8,4,3,1,9
8154,20100120,0,6,7,5,1,9
8703,20100120,0,5,4,2,6,7
9959,20100120,0,3,10,1,5,9
1990,20100121,1,9,1,8,7,3
2499,20100121,1,5,2,3,10,9
2784,20100121,1,4,9,1,8,7
3594,20100122,2,4,9,1,8,7


$ msketchsort.rb i=input2.csv tid=eCode,tgdate e=val1,val2,val3,val4,val5 th=0.05 wf=term ws=1 o=out2.csv
SketchSort version 0.0.8
Written by Yasuo Tabei

deciding parameters such that the missing edge ratio is no more than 1e-05
decided parameters:
hamming distance threshold: 1
number of blocks: 4
number of chunks: 14
.
.
.

$ more out2.csv
distance,eCode,tgdate,eCode2,tgdate2
0,1990,20100120,2784,20100121
0,2784,20100121,3594,20100122


# Copyright(c) NYSOL 2012- All Rights Reserved.
EOF
exit
end
117
+
118
# Print "version <v>" to STDERR and terminate.
def ver()
  banner = "version #{$version}"
  STDERR.puts banner
  exit
end
122
+
123
# Informational switches are handled before argument parsing.
help() if ARGV[0]=="--help" or ARGV.size <= 0
ver() if ARGV[0]=="--version"

# Accepted keywords, then mandatory keywords.
# -mcmdenv is queried below, so it is now declared; the duplicated dist= entry
# and the trailing comma were dropped.
args=MCMD::Margs.new(ARGV,"e=,tid=,dist=,th=,mr=,wf=,ws=,i=,o=,T=,seed=,-uc,-mcmdenv","tid=,i=,e=")

# limit MCMD messages to warnings and errors unless -mcmdenv is given
ENV["KG_VerboseLevel"]="2" unless args.bool("-mcmdenv")

# work file path (a trailing slash is stripped)
tmp_path = args.str("T=")
ENV["KG_TmpPath"] = tmp_path.sub(/\/$/,"") if tmp_path

ifile = args.file("i=","r")
ofile = args.file("o=","w")
elem  = args.str("e=")
tidH  = args.field("tid=",ifile)  # check field
tid   = args.str("tid=")
dist  = args.str("dist=","C")
th    = args.float("th=",0.01)
mr    = args.float("mr=",0.00001)
wfH   = args.field("wf=",ifile)   # check field
wf    = args.str("wf=")
ws    = args.int("ws=",0)
seed  = args.int("seed=",1)
uc    = args.bool("-uc")

# Shared by mkdata and doSsort: temp-file factory, and a timestamp used to
# build field names for the generated line-number and window columns.
@workf=MCMD::Mtemp.new
@pt=Time.now.to_i

# Only the two documented distance types are accepted.
unless dist=="C" || dist=="H"
  MCMD::errorLog("#{File.basename($0)}: dist= can specify C|H")
  exit(1)
end

# The Hamming threshold must be at least 1; report failure with a non-zero status.
if dist=="H" && th < 1.0
  MCMD::errorLog("#{File.basename($0)}: The range of th= is different")
  exit(1)
end
159
+
160
# Convert the input CSV into the plain-text format passed to sketchsort.
#
#   ifile  input CSV
#   elem   comma separated names of the vector element fields
#   tid    comma separated names of the vector id fields
#   wf     window field name, or nil to use a generated constant column of 0
#
# Returns [sdata, xxmap]:
#   sdata  header-less, space separated file: window value followed by the elements
#   xxmap  CSV mapping the generated line number ("<@pt>line") to the tid fields
#
# Reads @workf (temp-file factory) and @pt (prefix for generated field names).
# The emitted commands are unchanged; the redundant `unless wf` guard and the
# duplicated map-file command were folded away.
def mkdata(ifile,elem,tid,wf)
  numbered = @workf.file
  vectors  = @workf.file
  xxmap    = @workf.file
  sdata    = @workf.file

  ln = "#{@pt}line"

  # make the line number
  system "mnumber S=0 a=#{ln} -q i=#{ifile} o=#{numbered}"

  if wf
    system "mcut f=#{wf},#{tid},#{elem} i=#{numbered} o=#{vectors}"
  else
    # no window field given: add a constant column so every row shares window 0
    wf = "#{@pt}wf"
    system "msetstr v=0 a=#{wf} i=#{numbered} |mcut f=#{wf},#{tid},#{elem} o=#{vectors}"
  end
  system "mcut f=#{ln},#{tid} i=#{numbered} o=#{xxmap}"

  # make the data for sketchsort
  system "mcut f=#{wf},#{elem} -nfno i=#{vectors} |sed 's/,/ /g' >#{sdata}"

  return sdata,xxmap
end
193
+
194
# Run sketchsort on sdata and write the resulting pairs, keyed by tid, to ofile.
#
#   sdata  input file in sketchsort format
#   ofile  output CSV (interpolated as o=<ofile> into the final MCMD command)
#   map    CSV mapping "<@pt>line" to the tid fields
#   ws     window size passed as -windowsize
#   uc     truthy to omit -centering
#   th     distance threshold
#   dist   "C" (cosine, -cosdist) or "H" (Hamming, -hamdist)
#   mr     missing ratio passed as -missingratio
#   tid    comma separated tid field names; the second member of each pair is
#          output with "2" appended to every name
#   seed   random seed passed as -seed
#
# Raises RuntimeError for an unknown dist or when run_sketchsort reports failure.
#
# The executed argument string is now echoed to STDERR rather than STDOUT, so
# it cannot be mixed into the data stream, and it is built once instead of
# being duplicated in both branches.
def doSsort(sdata,ofile,map,ws,uc,th,dist,mr,tid,seed)
  xx3=@workf.file

  distance = case dist
             when "C" then "-cosdist"
             when "H" then "-hamdist"
             else raise "#ERROR# dist= can specify C|H"
             end

  opts = ["-auto"]
  opts << "-centering" unless uc
  opts.concat([distance, th, "-missingratio", mr, "-windowsize", ws, "-seed", seed, sdata, xx3])
  ss_args = opts.join(" ")

  status = NYSOL_MINING::run_sketchsort(ss_args)
  STDERR.puts "sketchsort #{ss_args}"
  raise "#ERROR# checking sketchsort messages" unless status

  # field list naming the tid fields of the second vector: a,b -> a:a2,b:b2
  tid2 = tid.split(",").map { |name| "#{name}:#{name}2" }.join(",")

  f = [
    "sed 's/ /,/g' <#{xx3} |",
    "mcut -nfni f=0:eline1,1:eline2,2:distance |",
    "mfsort f=eline* |",
    # look up the tid fields that belong to each line number
    "mjoin k=eline1 K=#{@pt}line f=#{tid} m=#{map} |",
    "mjoin k=eline2 K=#{@pt}line f=#{tid2} m=#{map} |",
    "msortf f=eline1%n,eline2%n |",
    "mcut -r f=eline1,eline2 |",
    "msortf f=#{tid} |",
    "mfldname -q o=#{ofile}"
  ].join
  system(f)
end
238
+
239
# Build the sketchsort input, run it, then log completion.
sdata, xxmap = mkdata(ifile, elem, tid, wf)
doSsort(sdata, ofile, xxmap, ws, uc, th, dist, mr, tid, seed)
MCMD::endLog(args.cmdline)