nysol-mining 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/mbopt.rb +522 -0
- data/bin/mburst.rb +716 -0
- data/bin/mgfeatures.rb +340 -0
- data/bin/mglmnet.rb +843 -0
- data/bin/mgnfeatures.rb +369 -0
- data/bin/mgpmetis.rb +449 -0
- data/bin/midxmine.rb +484 -0
- data/bin/mnb.rb +631 -0
- data/bin/mnetsimile.rb +572 -0
- data/bin/mnewman.rb +345 -0
- data/bin/msketchsort.rb +243 -0
- data/bin/msm.rb +172 -0
- data/ext/sketchsortrun/Main.cpp +161 -0
- data/ext/sketchsortrun/Main.hpp +24 -0
- data/ext/sketchsortrun/SketchSort.cpp +526 -0
- data/ext/sketchsortrun/SketchSort.hpp +138 -0
- data/ext/sketchsortrun/extconf.rb +26 -0
- data/ext/sketchsortrun/sketchsortrun.cpp +56 -0
- data/lib/nysol/mining.rb +24 -0
- metadata +89 -0
data/bin/mnewman.rb
ADDED
@@ -0,0 +1,345 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: utf-8
|
3
|
+
|
4
|
+
require "rubygems"
|
5
|
+
require "nysol/mcmd"
|
6
|
+
|
7
|
+
$version="1.0"
|
8
|
+
$revision="###VERSION###"
|
9
|
+
CMD="mnewman.rb"
|
10
|
+
|
11
|
+
# Print the usage text of this command on STDERR and terminate the process.
#
# The text interpolates the CMD constant and the $version global, both of
# which are assigned near the top of this script.
# The sample command line is written as "$ #{CMD}" (shell prompt followed by
# the interpolated command name); "${CMD}" is not Ruby interpolation and
# would be printed literally.
def help
  STDERR.puts <<EOF
----------------------------
#{CMD} version #{$version}
----------------------------
概要) newman クラスタリング
特徴) 1) modularityの最適化を用いたクラスタリングが実施できる。
2) 辺の媒介中心性を利用したグラフ分割によるクラスタリングが実施できる。
書式) #{CMD} ei= ef= [ni=] [nf=] [ew=] [al=] o= [-directed]

ei= : 枝データファイル
ef= : 枝データ上の2つの節点項目名
ni= : 節点データファイル
nf= : 節点データ上の節点項目名
ew= : 枝ファイル上の重み項目名【省略時は全ての枝の重みを1と見なす】
al= : クラスタリングアルゴリズム。省略時はmoが選択される。
mo:(modularity optimization) modularityを最適化するための貪欲法によるクラスタリング
無向グラフでのみ指定可能。igraphのcluster_fast_greedyを利用
eb:(edge betweenness) 辺の媒介中心性を計算し最もそれが高い辺を取り除くことでグラフを分割する。
分割数はmodularityが最大となるように決定される。igraphのcluster_edge_betweennessを利用
-directed : 有向グラフ
o= : クラスタ

その他
T= : ワークディレクトリ(default:/tmp)
-verbose : show the END messages of MCMD and R used in this command
--help : ヘルプの表示

必要なソフトウェア)
1) R
2) igraph package for R

入力データ)
節点ペアのCSVファイル(ファイル名はei=にて指定)
例)
$ cat data/dat1.edge
n1,n2
a,b
a,c
a,d
a,e
a,f
a,g
b,c
b,d
b,e
b,f
c,h
d,g
e,f

$ cat data/dat.node
node
a
b
c
d
e
f
g

$ #{CMD} ei=data/dat1.edge ef=n1,n2 al=mo o=rsl01
#END# mnewman.rb ei=./data/dat1.edge ef=n1,n2 al=mo o=rsl01; 2016/01/24 01:54:25

$ cat rsl01
node,cls
a,2
b,1
c,3
d,2
e,1
f,1
g,2
h,3

# Copyright(c) NYSOL 2012- All Rights Reserved.
EOF
  exit
end
|
90
|
+
|
91
|
+
# Print the version and revision on STDERR, then exit.
# A revision that still contains the text "VERSION" (the unsubstituted
# placeholder) is replaced by "0" before printing.
def ver()
  placeholder = $revision =~ /VERSION/
  $revision = "0" if placeholder
  STDERR.puts format("version %s revision %s", $version, $revision)
  exit
end
|
96
|
+
|
97
|
+
# Handle the informational switches before any real work is done:
# no arguments at all, or a leading --help, shows the usage text.
help() if ARGV.empty? || ARGV[0] == "--help"
ver() if ARGV[0] == "--version"

# Stop with a failure status unless the check for the R "igraph" library passes.
exit(1) unless MCMD::chkRexe("igraph")
|
102
|
+
|
103
|
+
|
104
|
+
####
# Convert a graph whose nodes are text labels into integer-numbered files.
# Writes numFile, mapFile and weightFile, then returns the number of nodes.
#
#   ei     ni    xxnum   xxmap
#   v1,v2  v             node%1,flag%0,num
#   E,J    A     0 3     A,0,0
#   E,A    B     0 4     B,0,1
#   J,D    C     0 6     D,0,2
#   J,A    D  => 1 5     E,0,3
#   J,H    E     2 4     F,0,4
#   D,H    F     2 5     H,0,5
#   D,F    G     2 6     J,0,6
#   H,F    H     3 6     C,1,7
#   A,F    I     4 5     G,1,8
#   B,H    J     5 6     I,1,9
#
# The return value for this example is 10 (nodes).
# "flag" on xxmap: 0:nodes in "ei", 1:nodes only in "ni".
#
# Parameters:
#   ni, nf       - node file and its node field name; when nf is nil only the
#                  edge file is used to collect nodes
#   ei, ef1, ef2 - edge file and the names of its two node fields
#   ew           - weight field name on ei; when nil every edge gets the
#                  weight 1 in a field named "weight"
#   numFile      - output: header-less, space separated pairs of node numbers
#   mapFile      - output: CSV with the fields node,flag,num
#   weightFile   - output: CSV with the single weight field, one row per edge
#
# The edge rows are re-sorted before numFile is written, so the weight is
# carried through the same pipeline and split off afterwards. Cutting the
# weights straight from ei (in its original row order) would attach them to
# the wrong edges whenever ei is not already in the sorted order.
def g2pair(ni,nf,ei,ef1,ef2,ew,numFile,mapFile,weightFile)
  #MCMD::msgLog("converting graph files into a pair of numbered nodes ...")
  wf=MCMD::Mtemp.new
  wf1=wf.file
  wf2=wf.file
  wf3=wf.file
  wf4=wf.file

  # collect every node name: flag=0 for nodes found on edges, flag=1 for the node file
  system "mcut f=#{ef1}:node i=#{ei} | msetstr v=0 a=flag o=#{wf1}"
  system "mcut f=#{ef2}:node i=#{ei} | msetstr v=0 a=flag o=#{wf2}"
  system "mcut f=#{nf}:node i=#{ni} | msetstr v=1 a=flag o=#{wf3}" if nf

  f=""
  if nf
    f << "mcat i=#{wf1},#{wf2},#{wf3} f=node,flag |"
    # one row per node, preferring the row with the smallest flag
    f << "mbest k=node s=flag from=0 size=1 |"
  else
    f << "mcat i=#{wf1},#{wf2} f=node,flag |"
    f << "muniq k=node |"
  end
  # isolated nodes are set to the end of position in mapping file.
  # S= must start from 0 (but inside R vertex number will be added one)
  f << "mnumber s=flag,node a=num S=0 o=#{mapFile}"
  system(f)

  # name of the weight field that travels with each edge
  wfld = ew || "weight"

  f=""
  if ew
    f << "mcut f=#{ef1},#{ef2},#{ew} i=#{ei} |"
  else
    f << "mcut f=#{ef1},#{ef2} i=#{ei} |"
    f << "msetstr v=1 a=#{wfld} |"
  end
  f << "msortf f=#{ef1} |"
  f << "mjoin k=#{ef1} K=node m=#{mapFile} f=num:num1 |"
  f << "msortf f=#{ef2} |"
  f << "mjoin k=#{ef2} K=node m=#{mapFile} f=num:num2 |"
  f << "mcut f=num1,num2,#{wfld} |"
  # NOTE(review): mfsort presumably reorders the two numbers inside each row,
  # which would drop the edge direction for -directed graphs -- confirm.
  f << "mfsort f=num1,num2 |"
  f << "msortf f=num1%n,num2%n o=#{wf4}"
  system(f)

  # split the sorted rows into the edge list and the weights (same row order)
  system "mcut f=num1,num2 -nfno i=#{wf4} | tr ',' ' ' >#{numFile}"
  system "mcut f=#{wfld} i=#{wf4} o=#{weightFile}"

  nodeSize=MCMD::mrecount("i=#{mapFile}")

  return nodeSize
end
|
169
|
+
|
170
|
+
# Translate the clustering result back to the original node names.
#
# Parameters:
#   xxmap - CSV holding at least the fields "node" and "num"
#   xxout - CSV holding the field "cls"; its rows are numbered from 0 in the
#           "num" field before joining
#   ofile - output CSV with the fields node,cls
#
# Returns the result of the final system call.
def convOrg(xxmap,xxout,ofile)
  wf=MCMD::Mtemp.new
  xx1=wf.file

  # number the result rows from 0
  # (presumably -q keeps them in file order -- confirm against the mnumber manual)
  system "mnumber S=0 a=num -q i=#{xxout} o=#{xx1}"

  # attach the cluster to each node through the shared "num" field
  f=""
  f << "mjoin k=num f=cls m=#{xx1} i=#{xxmap} |"
  f << "mcut f=node,cls o=#{ofile}"
  system(f)
end
|
184
|
+
|
185
|
+
|
186
|
+
|
187
|
+
# Write the R (igraph) script that clusters the graph into scpFile.
#
# Parameters:
#   directed - truthy to read the graph as directed
#   eFile    - edge list file passed to read.graph
#   wFile    - CSV file with the edge weights
#   ew       - column of wFile that holds the weights
#   nodeSize - number of vertices passed to read.graph
#   al       - "mo" selects cluster_fast_greedy, anything else
#              cluster_edge_betweenness
#   oFile    - CSV path the script writes the membership ("cls") to
#   oInfo    - CSV path the script writes cluster sizes and modularity to
#   scpFile  - path of the generated script
#
# Raises RuntimeError when al is "mo" and directed is set.
# Returns the number of bytes written.
def genRscript(directed,eFile,wFile,ew,nodeSize,al,oFile,oInfo,scpFile)
  greedy = (al == "mo")
  raise "#ERROR# can't use -directed option with al=\"mo\" " if greedy && directed

  dir = directed ? "TRUE" : "FALSE"

  #system "cp #{eFile} xxedge"
  #system "cp #{wFile} xxweight"

  # the clustering call is the only line that differs between the algorithms
  clustering =
    if greedy
      'nc=cluster_fast_greedy(g,weight=E(g)$weight,merges=T,modularity=T,membership=T)'
    else # eb (edge betweenness)
      "nc=cluster_edge_betweenness(g,weights=E(g)$weight,directed=#{dir},bridges=T,merges=T,modularity=T,edge.betweenness=T,membership=T)"
    end

  r_proc = <<RSCRIPT
library(igraph)
# reading edge file
g=read.graph("#{eFile}",format="edgelist",directed=#{dir},n=#{nodeSize})
# reading weight file
w=read.csv("#{wFile}")
E(g)$weight=as.list(w$"#{ew}")
# do clustering
#{clustering}

# 置換
ms=cbind(membership(nc))
# Community sizes:
cs=sizes(nc)

# modularity:
mq=modularity(nc)

dat=data.frame( cls=ms )
colnames(dat)=c("cls")

info=data.frame(cs, mq)
colnames(info)=c("cls","size","modurarityQ")

write.csv(dat,file="#{oFile}",quote=FALSE,row.names=FALSE)
write.csv(info,file="#{oInfo}",quote=FALSE,row.names=FALSE)
RSCRIPT

  File.write(scpFile, r_proc)
end
|
260
|
+
|
261
|
+
|
262
|
+
|
263
|
+
#################################################################################################
|
264
|
+
#### Entry point
|
265
|
+
|
266
|
+
args=MCMD::Margs.new(ARGV,"ei=,ef=,ni=,nf=,ew=,al=,o=,-directed,T=,-verbose,--help","ei=")

verbose=args.bool("-verbose")
ENV["KG_VerboseLevel"]="2" unless verbose

# work file path (a trailing slash is removed)
tmpPath=args.str("T=")
ENV["KG_TmpPath"] = tmpPath.sub(/\/$/,"") if tmpPath

# output file
ofile =args.file("o=","w")

# edge data: file name and the names of its two node fields
edgeFile = args.file("ei=","r") # edge file name
raise "#ERROR# ei= is mandatory" unless edgeFile

ef = args.field("ef=", edgeFile)
# both node fields are needed to build the edge list
if ef.nil? or ef["names"].size < 2
  raise "#ERROR# ef= must specify two field names"
end
ef1,ef2=ef["names"]

# ---- edge weight
ew = args.field("ew=", edgeFile, nil, 1,1)
ew = ew["names"][0] if ew

# node data
# if ni= is not specified, only the edge file is used for generating a graph.
nf=nil
nodeFile = args.file("ni=","r") # node file name
if nodeFile
  nf = args.field("nf=", nodeFile)
  raise "#ERROR# nf= is mandatory, when ni= is specified" unless nf
  nf=nf["names"][0]
end

# algorithm
al = args.str("al=","mo") # Default=mo
unless al=="mo" or al=="eb"
  raise "#ERROR# al= can specify mo|eb"
end

# directed or undirected graph
directed=args.bool("-directed")

# convert the original graph to one igraph can handle
wf=MCMD::Mtemp.new
xxnum =wf.file
xxmap =wf.file
xxout =wf.file
xxscp =wf.file
xxinfo=wf.file
xxweight=wf.file

nodeSize=g2pair(nodeFile,nf,edgeFile,ef1,ef2,ew,xxnum,xxmap,xxweight)

# generate R script, and run
# When ew= is omitted g2pair writes the weights into a field named "weight",
# so that name has to be handed to the R script instead of nil
# (nil would be rendered as an empty column name).
genRscript(directed,xxnum,xxweight,ew || "weight",nodeSize,al,xxout,xxinfo,xxscp)

rStatus =
  if verbose
    system "R --vanilla -q < #{xxscp}"
  else
    system "R --vanilla -q --slave < #{xxscp} 2>/dev/null"
  end
# without the R result there is nothing to convert back
raise "#ERROR# R script failed (run with -verbose to see the R messages)" unless rStatus

# convert back to the original node names and write the output
convOrg(xxmap,xxout,ofile)

#system "cp #{xxweight} xxweight"
#system "cp #{xxmap} xxmap"
#system "cp #{xxout} xxdat"
#system "cp #{xxinfo} xxinfo"

MCMD::endLog(args.cmdline)
|
344
|
+
|
345
|
+
|
data/bin/msketchsort.rb
ADDED
@@ -0,0 +1,243 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: utf-8
|
3
|
+
|
4
|
+
require 'rubygems'
|
5
|
+
require 'nysol/mcmd'
|
6
|
+
require 'nysol/mining'
|
7
|
+
require 'fileutils'
|
8
|
+
|
9
|
+
|
10
|
+
# 1.0: initial development 2014/12/02
|
11
|
+
# 1.1: added the seed for org sketchsort and msketchsort 2015/10/24
|
12
|
+
# 1.2: added error process from sketchsort 2016/11/5
|
13
|
+
$version="1.2"
|
14
|
+
|
15
|
+
# Print the usage text of msketchsort.rb on STDERR and terminate the process.
#
# The text interpolates the $version global assigned near the top of this
# script. The command name in the usage line is written out literally: the
# $cmd global is not assigned in the visible part of this script, and
# interpolating it rendered the name as an empty string.
def help

  STDERR.puts <<EOF
---------------------------------
msketchsort.rb version #{$version}
---------------------------------
概要) スケッチソートを利用した全ベクトルペアの距離計算
特徴) データに含まれる全ベクトル間の距離を高速に計算できる。
窓を指定することで比較するベクトルの範囲を限定することができる。

書式) msketchsort.rb e= tid= [dist=C|H] [th=] [mr=] [wf=] [ws=] [seed=] [-uc] i= [o=] [--help]

e= : ベクトルの各要素となる項目名【必須】ex) e=val1,val2,val3,val4
tid= : ベクトルを識別するための項目名(i=上の項目名)【必須】
dist= : ベクトル間の距離計算の方法。(省略時は C が指定される)
C (cosine distance): コサイン距離 (th=0-2)
H (Hamming distance): ハミング距離 (th=1- )
th= : dist=で指定された距離計算について、ここで指定された値以下のペアを出力する。省略時は0.01が設定される。
mr= : ペアを逃す確率を指定 (missing ratio) False Negative。省略時は0.00001が設定される。
wf= : ウィンドウ項目。ex) 日付
ws= : ウィンドウサイズの上限(0以上の整数)【0で制限なし,default:0】
wfで指定した窓に含まれる全ペアを窓をずらしながら計算する。
i= : 入力ファイル
o= : 出力ファイル
seed= : 乱数の種(1以上の整数,default:1)
-uc : データ点を0を中心に移動させない


例1: input1.csv
tid,val1,val2,val3,val4,val5
0,4,9,1,8,7
1,2,6,3,4,10
2,3,10,1,7,4
3,2,8,1,3,10
4,4,7,2,3,10
5,8,4,3,1,9
6,6,7,5,1,9
7,5,4,2,6,7
8,3,10,1,5,9
9,9,1,8,7,3
10,5,2,3,10,9
11,4,9,1,8,7

$ msketchsort.rb i=input1.csv tid=tid e=val1,val2,val3,val4,val5 o=out1.csv
SketchSort version 0.0.8
Written by Yasuo Tabei

deciding parameters such that the missing edge ratio is no more than 1e-05
decided parameters:
hamming distance threshold: 1
number of blocks: 4
number of chunks: 14
.
.
.

$ more out1.csv
distance,tid,tid2
5.96046e-08,0,11



例2: input2.csv
eCode,tgdate,term,val1,val2,val3,val4,val5
1990,20100120,0,4,9,1,8,7
2499,20100120,0,2,6,3,4,10
2784,20100120,0,3,10,1,7,4
3109,20100120,0,2,8,1,3,10
3114,20100120,0,4,7,2,3,10
6364,20100120,0,8,4,3,1,9
8154,20100120,0,6,7,5,1,9
8703,20100120,0,5,4,2,6,7
9959,20100120,0,3,10,1,5,9
1990,20100121,1,9,1,8,7,3
2499,20100121,1,5,2,3,10,9
2784,20100121,1,4,9,1,8,7
3594,20100122,2,4,9,1,8,7


$ msketchsort.rb i=input2.csv tid=eCode,tgdate e=val1,val2,val3,val4,val5 th=0.05 wf=term ws=1 o=out2.csv
SketchSort version 0.0.8
Written by Yasuo Tabei

deciding parameters such that the missing edge ratio is no more than 1e-05
decided parameters:
hamming distance threshold: 1
number of blocks: 4
number of chunks: 14
.
.
.

$ more out2.csv
distance,eCode,tgdate,eCode2,tgdate2
0,1990,20100120,2784,20100121
0,2784,20100121,3594,20100122


# Copyright(c) NYSOL 2012- All Rights Reserved.
EOF
  exit
end
|
117
|
+
|
118
|
+
# Print the script version (the $version global) on STDERR, then exit.
def ver()
  STDERR.puts("version " + $version.to_s)
  exit
end
|
122
|
+
|
123
|
+
help() if ARGV[0]=="--help" or ARGV.size <= 0
ver() if ARGV[0]=="--version"

# accepted options (each listed once) and the mandatory ones
args=MCMD::Margs.new(ARGV,"e=,tid=,dist=,th=,mr=,wf=,ws=,i=,o=,T=,seed=,-uc","tid=,i=,e=")

# restrict the MCMD messages to warnings and errors
# NOTE(review): -mcmdenv is not in the option list given to Margs above;
# confirm how Margs#bool treats an undeclared switch.
ENV["KG_VerboseLevel"]="2" unless args.bool("-mcmdenv")

# work file path (a trailing slash is removed)
tmpPath=args.str("T=")
ENV["KG_TmpPath"] = tmpPath.sub(/\/$/,"") if tmpPath

ifile = args.file("i=","r")
ofile = args.file("o=","w")
elem  = args.str("e=")
tidH  = args.field("tid=",ifile) # check field
tid   = args.str("tid=")
dist  = args.str("dist=","C")
th    = args.float("th=",0.01)
mr    = args.float("mr=",0.00001)
wfH   = args.field("wf=",ifile)  # check field
wf    = args.str("wf=")
ws    = args.int("ws=",0)
seed  = args.int("seed=",1)
uc    = args.bool("-uc")

# shared by mkdata and doSsort: temp-file factory and a time stamp used as a
# prefix for the helper field names they add
@workf=MCMD::Mtemp.new
@pt=Time.now.to_i

# only the two distances described in the help text are supported
unless dist=="C" or dist=="H"
  MCMD::errorLog("#{File.basename($0)}: dist= can specify C|H")
  exit(1)
end

# the Hamming threshold starts from 1; report the error with a failure status
if dist=="H" and th <1.0
  MCMD::errorLog("#{File.basename($0)}: The range of th= is different")
  exit(1)
end
|
159
|
+
|
160
|
+
# Convert the input CSV into the files used for the sketchsort run.
#
# Parameters:
#   ifile - input CSV
#   elem  - comma separated names of the vector element fields
#   tid   - comma separated names of the vector id fields
#   wf    - window field name, or nil to add a constant window field (value 0)
#
# Returns [sdata, xxmap]:
#   sdata - header-less, space separated file with the window field followed
#           by the element fields
#   xxmap - CSV with the line number field ("<@pt>line", numbered from 0) and
#           the tid fields
def mkdata(ifile,elem,tid,wf)
  numbered=@workf.file
  selected=@workf.file
  xxmap=@workf.file
  sdata=@workf.file

  ln="#{@pt}line"

  # make the line number
  system "mnumber S=0 a=#{ln} -q i=#{ifile} o=#{numbered}"

  if wf
    system "mcut f=#{wf},#{tid},#{elem} i=#{numbered} o=#{selected}"
  else
    # no window field given: add one that holds 0 on every row
    wf="#{@pt}wf"
    system "msetstr v=0 a=#{wf} i=#{numbered} |mcut f=#{wf},#{tid},#{elem} o=#{selected}"
  end

  # line number -> tid lookup table
  system "mcut f=#{ln},#{tid} i=#{numbered} o=#{xxmap}"

  # make the data for sketchsort
  system "mcut f=#{wf},#{elem} -nfno i=#{selected} |sed 's/,/ /g' >#{sdata}"

  [sdata,xxmap]
end
|
193
|
+
|
194
|
+
# Run sketchsort on sdata and write the resulting pairs, identified by their
# tid values, to ofile.
#
# Parameters:
#   sdata - data file for sketchsort
#   ofile - output CSV
#   map   - CSV mapping the line number field ("<@pt>line") to the tid fields
#   ws    - value for -windowsize
#   uc    - truthy to omit the -centering option
#   th    - distance threshold
#   dist  - "C" (-cosdist) or "H" (-hamdist)
#   mr    - value for -missingratio
#   tid   - comma separated names of the vector id fields
#   seed  - value for -seed
#
# Raises RuntimeError for an unknown dist, or when run_sketchsort reports a
# failure. Returns the result of the final system call.
def doSsort(sdata,ofile,map,ws,uc,th,dist,mr,tid,seed)
  xx3=@workf.file

  # an unknown dist would otherwise leave a gap in the option string
  distance =
    case dist
    when "C" then "-cosdist"
    when "H" then "-hamdist"
    else raise "#ERROR# dist= can specify C|H"
    end

  centering = uc ? "" : "-centering "
  opts="-auto #{centering}#{distance} #{th} -missingratio #{mr} -windowsize #{ws} -seed #{seed} #{sdata} #{xx3}"

  status=NYSOL_MINING::run_sketchsort(opts)
  # diagnostic echo of the executed command; kept off stdout so it cannot mix
  # with result data written there
  STDERR.puts "sketchsort #{opts}"
  raise "#ERROR# checking sketchsort messages" unless status

  # field list that renames each tid field to "<name>2" for the second vector
  tid2=tid.split(",").map{|val| "#{val}:#{val}2"}.join(",")

  f=""
  f << "sed 's/ /,/g' <#{xx3} |"
  f << "mcut -nfni f=0:eline1,1:eline2,2:distance |"
  f << "mfsort f=eline* |"
  # look up the tid that corresponds to each line number
  f << "mjoin k=eline1 K=#{@pt}line f=#{tid} m=#{map} |"
  f << "mjoin k=eline2 K=#{@pt}line f=#{tid2} m=#{map} |"
  f << "msortf f=eline1%n,eline2%n |"
  f << "mcut -r f=eline1,eline2 |"
  f << "msortf f=#{tid} |"
  f << "mfldname -q o=#{ofile}"
  system(f)
end
|
238
|
+
|
239
|
+
# Prepare the sketchsort input, run the pair search, then write the end log.
prepared=mkdata(ifile,elem,tid,wf)
sketchInput=prepared[0]
lineMap=prepared[1]

doSsort(sketchInput,ofile,lineMap,ws,uc,th,dist,mr,tid,seed)

MCMD::endLog(args.cmdline)
|