nysol-take 3.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/bin/mbiclique.rb +317 -0
- data/bin/mbipolish.rb +362 -0
- data/bin/mccomp.rb +235 -0
- data/bin/mclique.rb +295 -0
- data/bin/mclique2g.rb +105 -0
- data/bin/mcliqueInfo.rb +203 -0
- data/bin/mfriends.rb +202 -0
- data/bin/mgdiff.rb +252 -0
- data/bin/mhifriend.rb +456 -0
- data/bin/mhipolish.rb +465 -0
- data/bin/mitemset.rb +168 -0
- data/bin/mpal.rb +410 -0
- data/bin/mpolishing.rb +399 -0
- data/bin/msequence.rb +165 -0
- data/bin/mtra2g.rb +476 -0
- data/bin/mtra2gc.rb +360 -0
- data/ext/grhfilrun/extconf.rb +12 -0
- data/ext/grhfilrun/grhfilrun.c +85 -0
- data/ext/grhfilrun/src/_sspc.c +358 -0
- data/ext/grhfilrun/src/aheap.c +545 -0
- data/ext/grhfilrun/src/aheap.h +251 -0
- data/ext/grhfilrun/src/base.c +92 -0
- data/ext/grhfilrun/src/base.h +59 -0
- data/ext/grhfilrun/src/fstar.c +497 -0
- data/ext/grhfilrun/src/fstar.h +80 -0
- data/ext/grhfilrun/src/grhfil.c +214 -0
- data/ext/grhfilrun/src/itemset.c +713 -0
- data/ext/grhfilrun/src/itemset.h +170 -0
- data/ext/grhfilrun/src/problem.c +415 -0
- data/ext/grhfilrun/src/problem.h +179 -0
- data/ext/grhfilrun/src/queue.c +533 -0
- data/ext/grhfilrun/src/queue.h +182 -0
- data/ext/grhfilrun/src/sample.c +19 -0
- data/ext/grhfilrun/src/sspc.c +597 -0
- data/ext/grhfilrun/src/sspc2.c +491 -0
- data/ext/grhfilrun/src/stdlib2.c +1482 -0
- data/ext/grhfilrun/src/stdlib2.h +892 -0
- data/ext/grhfilrun/src/trsact.c +817 -0
- data/ext/grhfilrun/src/trsact.h +160 -0
- data/ext/grhfilrun/src/vec.c +745 -0
- data/ext/grhfilrun/src/vec.h +172 -0
- data/ext/lcmrun/extconf.rb +20 -0
- data/ext/lcmrun/lcmrun.cpp +99 -0
- data/ext/lcmrun/src/aheap.c +216 -0
- data/ext/lcmrun/src/aheap.h +111 -0
- data/ext/lcmrun/src/base.c +92 -0
- data/ext/lcmrun/src/base.h +59 -0
- data/ext/lcmrun/src/itemset.c +496 -0
- data/ext/lcmrun/src/itemset.h +157 -0
- data/ext/lcmrun/src/lcm.c +427 -0
- data/ext/lcmrun/src/problem.c +349 -0
- data/ext/lcmrun/src/problem.h +177 -0
- data/ext/lcmrun/src/queue.c +528 -0
- data/ext/lcmrun/src/queue.h +176 -0
- data/ext/lcmrun/src/sgraph.c +359 -0
- data/ext/lcmrun/src/sgraph.h +173 -0
- data/ext/lcmrun/src/stdlib2.c +1282 -0
- data/ext/lcmrun/src/stdlib2.h +823 -0
- data/ext/lcmrun/src/trsact.c +747 -0
- data/ext/lcmrun/src/trsact.h +159 -0
- data/ext/lcmrun/src/vec.c +731 -0
- data/ext/lcmrun/src/vec.h +171 -0
- data/ext/lcmseq0run/extconf.rb +20 -0
- data/ext/lcmseq0run/lcmseq0run.cpp +59 -0
- data/ext/lcmseq0run/src/aheap.c +216 -0
- data/ext/lcmseq0run/src/aheap.h +111 -0
- data/ext/lcmseq0run/src/base.c +92 -0
- data/ext/lcmseq0run/src/base.h +59 -0
- data/ext/lcmseq0run/src/itemset.c +518 -0
- data/ext/lcmseq0run/src/itemset.h +157 -0
- data/ext/lcmseq0run/src/itemset_zero.c +522 -0
- data/ext/lcmseq0run/src/lcm_seq.c +446 -0
- data/ext/lcmseq0run/src/lcm_seq_zero.c +446 -0
- data/ext/lcmseq0run/src/problem.c +439 -0
- data/ext/lcmseq0run/src/problem.h +179 -0
- data/ext/lcmseq0run/src/problem_zero.c +439 -0
- data/ext/lcmseq0run/src/queue.c +533 -0
- data/ext/lcmseq0run/src/queue.h +182 -0
- data/ext/lcmseq0run/src/stdlib2.c +1350 -0
- data/ext/lcmseq0run/src/stdlib2.h +864 -0
- data/ext/lcmseq0run/src/trsact.c +747 -0
- data/ext/lcmseq0run/src/trsact.h +159 -0
- data/ext/lcmseq0run/src/vec.c +779 -0
- data/ext/lcmseq0run/src/vec.h +172 -0
- data/ext/lcmseqrun/extconf.rb +20 -0
- data/ext/lcmseqrun/lcmseqrun.cpp +101 -0
- data/ext/lcmseqrun/src/aheap.c +216 -0
- data/ext/lcmseqrun/src/aheap.h +111 -0
- data/ext/lcmseqrun/src/base.c +92 -0
- data/ext/lcmseqrun/src/base.h +59 -0
- data/ext/lcmseqrun/src/itemset.c +518 -0
- data/ext/lcmseqrun/src/itemset.h +157 -0
- data/ext/lcmseqrun/src/itemset_zero.c +522 -0
- data/ext/lcmseqrun/src/lcm_seq.c +447 -0
- data/ext/lcmseqrun/src/lcm_seq_zero.c +446 -0
- data/ext/lcmseqrun/src/problem.c +439 -0
- data/ext/lcmseqrun/src/problem.h +179 -0
- data/ext/lcmseqrun/src/problem_zero.c +439 -0
- data/ext/lcmseqrun/src/queue.c +533 -0
- data/ext/lcmseqrun/src/queue.h +182 -0
- data/ext/lcmseqrun/src/stdlib2.c +1350 -0
- data/ext/lcmseqrun/src/stdlib2.h +864 -0
- data/ext/lcmseqrun/src/trsact.c +747 -0
- data/ext/lcmseqrun/src/trsact.h +159 -0
- data/ext/lcmseqrun/src/vec.c +779 -0
- data/ext/lcmseqrun/src/vec.h +172 -0
- data/ext/lcmtransrun/extconf.rb +18 -0
- data/ext/lcmtransrun/lcmtransrun.cpp +264 -0
- data/ext/macerun/extconf.rb +20 -0
- data/ext/macerun/macerun.cpp +57 -0
- data/ext/macerun/src/aheap.c +217 -0
- data/ext/macerun/src/aheap.h +112 -0
- data/ext/macerun/src/itemset.c +491 -0
- data/ext/macerun/src/itemset.h +158 -0
- data/ext/macerun/src/mace.c +503 -0
- data/ext/macerun/src/problem.c +346 -0
- data/ext/macerun/src/problem.h +174 -0
- data/ext/macerun/src/queue.c +529 -0
- data/ext/macerun/src/queue.h +177 -0
- data/ext/macerun/src/sgraph.c +360 -0
- data/ext/macerun/src/sgraph.h +174 -0
- data/ext/macerun/src/stdlib2.c +993 -0
- data/ext/macerun/src/stdlib2.h +811 -0
- data/ext/macerun/src/vec.c +634 -0
- data/ext/macerun/src/vec.h +170 -0
- data/ext/sspcrun/extconf.rb +20 -0
- data/ext/sspcrun/src/_sspc.c +358 -0
- data/ext/sspcrun/src/aheap.c +545 -0
- data/ext/sspcrun/src/aheap.h +251 -0
- data/ext/sspcrun/src/base.c +92 -0
- data/ext/sspcrun/src/base.h +59 -0
- data/ext/sspcrun/src/fstar.c +496 -0
- data/ext/sspcrun/src/fstar.h +80 -0
- data/ext/sspcrun/src/grhfil.c +213 -0
- data/ext/sspcrun/src/itemset.c +713 -0
- data/ext/sspcrun/src/itemset.h +170 -0
- data/ext/sspcrun/src/problem.c +415 -0
- data/ext/sspcrun/src/problem.h +179 -0
- data/ext/sspcrun/src/queue.c +533 -0
- data/ext/sspcrun/src/queue.h +182 -0
- data/ext/sspcrun/src/sample.c +19 -0
- data/ext/sspcrun/src/sspc.c +598 -0
- data/ext/sspcrun/src/sspc2.c +491 -0
- data/ext/sspcrun/src/stdlib2.c +1482 -0
- data/ext/sspcrun/src/stdlib2.h +892 -0
- data/ext/sspcrun/src/trsact.c +817 -0
- data/ext/sspcrun/src/trsact.h +160 -0
- data/ext/sspcrun/src/vec.c +745 -0
- data/ext/sspcrun/src/vec.h +172 -0
- data/ext/sspcrun/sspcrun.cpp +54 -0
- data/lib/nysol/enumLcmEp.rb +338 -0
- data/lib/nysol/enumLcmEsp.rb +284 -0
- data/lib/nysol/enumLcmIs.rb +275 -0
- data/lib/nysol/enumLcmSeq.rb +143 -0
- data/lib/nysol/items.rb +201 -0
- data/lib/nysol/seqDB.rb +256 -0
- data/lib/nysol/take.rb +39 -0
- data/lib/nysol/taxonomy.rb +113 -0
- data/lib/nysol/traDB.rb +257 -0
- metadata +239 -0
data/bin/mhifriend.rb
ADDED
@@ -0,0 +1,456 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: utf-8
|
3
|
+
|
4
|
+
require "rubygems"
|
5
|
+
require "nysol/mcmd"
|
6
|
+
|
7
|
+
# 1.0 initial development: 2016/12/26
|
8
|
+
# Name of the running script with any leading directory part removed.
$cmd = $0.sub(%r{.*/}, "")
# Version string of this command.
$version = "1.0"
|
10
|
+
|
11
|
+
# Prints the usage text of this command to STDERR and terminates the process
# through Kernel#exit (which raises SystemExit).
#
# The text interpolates the globals $cmd (command name) and $version.
# Corrections relative to the earlier text:
# * the example command line no longer appends "hifriend.rb" to $cmd,
# * the number of selectable similarity measures is four (S, J, P, C),
# * the node/edge output-format labels match the fields written by the
#   node and edge output files shown in the example,
# * the example writes to the same file names it later prints.
def help
  STDERR.puts <<EOF
----------------------------
#{$cmd} version #{$version}
----------------------------
概要) hierarchical friend: トランザクションデータにfriendによるpolishを階層的に適用する。

書式) #{$cmd} i= tid= item= [class=] [no=] eo= s=|S= [-node_support] [rank=] [sim=] [maxLevel=] [T=] [--help]
i= : トランザクションデータファイル【必須】
tid= : トランザクションID項目名【必須】
item= : アイテム項目名【必須】
no= : 出力ファイル(節点)
eo= : 出力ファイル(辺:節点ペア)
s= : 最小支持度(全トランザクション数に対する割合による指定): 0以上1以下の実数
S= : 最小支持度(トランザクション数による指定): 1以上の整数
-node_support : 節点にもs=,S=の条件を適用する。指定しなければ全てのitemを節点として出力する。
以上のパラメータ mtra2gc.rbのパラメータであり、詳細は同コマンドヘルプを参照のこと。

rank= : 枝を張る条件で、双方向類似枝の上位何個までを選択するか(デフォルト:3)
sim= : rank=で利用する類似度を指定する。(デフォルト:S)
指定できる類似度は以下の4つのいずれか一つ。
S:Support, J: Jaccard, P:normalized PMI, C:Confidence

maxLevel= : 階層化の回数上限(デフォルト:0,収束するまで)


その他
T= : ワークディレクトリ(default:/tmp)
--help : ヘルプの表示

入力ファイル形式)
トランザクションIDとアイテムの2項目によるトランザクションデータ。

no=,eo=の出力形式)
節点ファイル: cluster,node,support,frequency,total
枝ファイル: cluster%0,node1%1,node2%2,support(sim=で指定した類似度)

例)
$ cat tra1.csv
id,item
1,a
1,b
1,d
1,e
2,a
2,b
2,e
3,a
3,d
3,e
6,b
6,d
7,d
7,e
4,c
4,f
4,b
5,c
5,f
5,e
8,g
8,h
9,g
9,h
0,i
a,j
a,c
a,a

$ #{$cmd} i=tra1.csv no=node.csv eo=edge.csv tid=id item=item sim=S S=2 rank=3
$ cat edge.csv
cluster%0,node1%1,node2%2,support
#1_1,a,b,0.1818181818
#1_1,a,d,0.1818181818
#1_1,a,e,0.2727272727
#1_1,b,d,0.1818181818
#1_1,b,e,0.1818181818
#1_1,d,e,0.2727272727
#1_2,c,f,0.1818181818
#1_3,g,h,0.1818181818
#2_1,#1_1,#1_2,0.2727272727

$ cat node.csv
cluster%0,node%1,support,frequency,total
,i,0.09090909091,1,11
,j,0.09090909091,1,11
#1_1,a,0.3636363636,4,11
#1_1,b,0.3636363636,4,11
#1_1,d,0.3636363636,4,11
#1_1,e,0.4545454545,5,11
#1_2,c,0.2727272727,3,11
#1_2,f,0.1818181818,2,11
#1_3,g,0.1818181818,2,11
#1_3,h,0.1818181818,2,11
#2_1,#1_1,0.7272727273,8,11
#2_1,#1_2,0.2727272727,3,11

# Copyright(c) NYSOL 2012- All Rights Reserved.
EOF
  exit
end
|
113
|
+
|
114
|
+
# Writes "version <$version>" to STDERR and terminates the process via exit.
def ver
  STDERR.puts(format("version %s", $version))
  exit
end
|
118
|
+
|
119
|
+
# Show the usage or the version and stop when asked to, or when no argument is given.
help() if ARGV.empty? || ARGV[0] == "--help"
ver() if ARGV[0] == "--version"

# Second argument: accepted options; third argument: mandatory options.
# T= and -mcmdenv are read further down, so they are listed as accepted here
# (NOTE(review): Margs presumably rejects options missing from this list — confirm
# against the nysol/mcmd documentation).
args = MCMD::Margs.new(
  ARGV,
  "i=,tid=,item=,no=,eo=,s=,S=,-node_support,rank=,sim=,maxLevel=,T=,-num,-verbose,-mcmdenv",
  "i=,tid=,item=,eo=,no="
)

# Limit mcmd messages to warnings and errors unless -mcmdenv is given.
ENV["KG_VerboseLevel"] = "2" unless args.bool("-mcmdenv")

# Work file path (a single trailing slash is removed).
workDir = args.str("T=")
ENV["KG_TmpPath"] = workDir.sub(/\/$/, "") if workDir

traFile = args.str("i=")
idFN    = args.str("tid=")
itemFN  = args.str("item=")
numtp   = args.bool("-num")

onFile = args.file("no=", "w")
oeFile = args.file("eo=", "w")

# mtra2gc parameters
sp1 = args.str("s=")
sp2 = args.str("S=")
node_support = args.bool("-node_support")

# friend parameters
# sim= falls back to "S", the default documented in the help text.
sim = args.str("sim=") || "S"
simStr = {
  "S" => "support",
  "J" => "jaccard",
  "P" => "PMI",
  "C" => "confidence"
}[sim]
unless simStr
  MCMD::errorLog("sim= takes S, J, P or C")
  raise ArgumentError, "sim= takes S, J, P or C"
end
rank = args.str("rank=") # ranking (optional)

maxLevel = args.int("maxLevel=", 0) # upper limit of hierarchy levels, 0 when omitted
|
161
|
+
|
162
|
+
|
163
|
+
# traファイルから類似度グラフを作成
|
164
|
+
# Runs mtra2gc.rb on traFile with the extra options in paramTra2gc, writing the
# node file to onFile and the edge file to oeFile.
# Returns whatever Kernel#system returns for that command.
#
# Sample edge output noted by the original author:
#   node1%0,node2%1,support,frequency,frequency1,frequency2,total,lift,jaccard,PMI
#   a,b,0.1818181818,2,4,4,11,1.375,0.3333333333,0.1868039815
#   g,h,0.1818181818,2,2,2,11,5.5,1,1
def runTra2gc(traFile,paramTra2gc,onFile,oeFile)
  command = "mtra2gc.rb #{paramTra2gc}"
  command << " i=#{traFile}"
  command << " no=#{onFile} eo=#{oeFile}"
  system(command)
end
|
177
|
+
|
178
|
+
# Runs mfriends.rb on the similarity graph (nodes simgN, edges simgE) with the
# extra options in paramPolish, writing nodes to polishN and edges to polishE.
# Returns whatever Kernel#system returns for that command.
#
# Sample polishE content noted by the original author:
#   node1%0,node2%1,jaccard
#   a,b,0.3333333333
#   g,h,1
def runPolish(simgN,simgE,paramPolish,polishN,polishE)
  command = "mfriends.rb -udout -directed ef=node1,node2 nf=node #{paramPolish}"
  command << " ni=#{simgN} ei=#{simgE}"
  command << " eo=#{polishE} no=#{polishN}"
  system(command)
end
|
192
|
+
|
193
|
+
# Runs mccomp.rb on the node file niFile and edge file eiFile with the extra
# options in paramCluster, writing the result to oFile.
# Returns whatever Kernel#system returns for that command.
#
# Sample output noted by the original author:
#   id%0,node,size
#   1,a,4
#   2,c,2
#   4,i,1
def runClustering(niFile,eiFile,paramCluster,oFile)
  command = "mccomp.rb nf=node ef=node1,node2 #{paramCluster}"
  command << " ni=#{niFile} ei=#{eiFile}"
  command << " o=#{oFile}"
  system(command)
end
|
208
|
+
|
209
|
+
# Builds a node-to-cluster master file (mFile) from clusterFile and rewrites the
# items of traFile with it, writing the converted transactions to oFile.
#
# idFN / itemFN : transaction id and item field names of traFile
# level         : hierarchy level, used in the "#<level>_<id>" cluster labels
# maxNo         : when not nil, clusters are labelled with numbers continuing
#                 after maxNo instead of "#<level>_<id>" strings
#
# Returns maxNo: unchanged (nil) in the label mode, otherwise the maximum of the
# cluster field of the newly numbered clusters as reported by mstats.
def runConvert(traFile,idFN,itemFN,level,clusterFile,mFile,oFile,maxNo)
  temp = MCMD::Mtemp.new
  sizeFile    = temp.file
  singleFile  = temp.file
  numberedMap = temp.file
  singleMap   = temp.file

  # Node-cluster master: a cluster holding a single node keeps the original
  # item as its cluster value.
  system "mcount k=id a=freq i=#{clusterFile} o=#{sizeFile}"
  withSize = "mjoin k=id m=#{sizeFile} f=freq i=#{clusterFile} |"

  if maxNo
    buildMap = [
      withSize,
      "mselnum c='(1,]' f=freq u=#{singleFile} |",
      "mnumber k=id -B S=1 a=num |",
      "mcal c='${num}+#{maxNo}' a=cluster o=#{numberedMap};",
      "mcal c='$s{node}' a=cluster i=#{singleFile} o=#{singleMap};",
      "mcat f=node,freq,cluster i=#{numberedMap},#{singleMap} o=#{mFile}"
    ].join
    system(buildMap)
    # largest cluster number handed out so far
    maxNo = `mstats c=max f=cluster i=#{numberedMap}|mcut f=cluster -nfno `.chomp.to_i
  else
    system(withSize + "mcal c='if(${freq}==1,$s{node},\"##{level}_\"+$s{id})' a=cluster o=#{mFile}")
  end

  # Replace the items of the transactions by their cluster.
  convert = [
    "mjoin k=#{itemFN} K=node m=#{mFile} f=cluster i=#{traFile} -n |",
    "mcal c='if(isnull($s{cluster}),$s{#{itemFN}},$s{cluster})' a=newItem |",
    "mcut f=#{itemFN},cluster -r |",
    "mfldname f=newItem:#{itemFN} |",
    "muniq k=#{idFN},#{itemFN} o=#{oFile}"
  ].join
  system(convert)

  maxNo
end
|
252
|
+
|
253
|
+
# Saves the node information of one level: the node field of polishN is joined
# with the cluster field of ncMap and with support, frequency and total from
# simgN, and the result is written to oFile.
# Returns whatever Kernel#system returns for the pipeline.
def runSaveNode(polishN,simgN,ncMap,oFile)
  pipeline = [
    "mcut f=node i=#{polishN}",
    "mjoin k=node m=#{ncMap} f=cluster",
    "mjoin k=node m=#{simgN} f=support,frequency,total o=#{oFile}"
  ].join(" |")
  system(pipeline)
end
|
262
|
+
|
263
|
+
# Saves the edge information of one level: node1, node2 and the similarity field
# simStr of polishE are joined (node1 against node) with the cluster field of
# ncMap and written to oFile. simgE is accepted but not used.
# Returns whatever Kernel#system returns for the pipeline.
def runSaveEdge(polishE,simgE,ncMap,simStr,oFile)
  pipeline = [
    "mcut f=node1,node2,#{simStr} i=#{polishE}",
    "mjoin k=node1 K=node m=#{ncMap} f=cluster",
    "mcut f=node1,node2,cluster,#{simStr} o=#{oFile}"
  ].join(" |")
  system(pipeline)
end
|
270
|
+
|
271
|
+
##########################################
|
272
|
+
# iFileのitemFN項目のitem番号最大値を取得
|
273
|
+
# Returns the maximum of field itemFN in iFile as an Integer: the output of
# "mstats c=max | mcut -nfno" is chomped and converted with to_i.
def getMaxNo(iFile,itemFN)
  `mstats c=max f=#{itemFN} i=#{iFile} |mcut f=#{itemFN} -nfno `.chomp.to_i
end
|
281
|
+
|
282
|
+
# Writes the final node file oFile from the per-level files matching iPath/node*.
#
# Rows describing an isolated node as its own cluster are dropped at every
# level. Condition used for dropping a row:
#   node == cluster and (node occurs in more than one row or node is a cluster)
# "node is a cluster" is tested as node > lastItemNo when lastItemNo is given,
# otherwise as "node starts with #".
# In the remaining rows a cluster equal to the node is replaced by an empty value.
#
# Sample of the concatenated input noted by the original author:
#   node%0,support,frequency,total,cluster
#   #1_1,0.7272727273,8,11,#2_1
#   #1_3,0.1818181818,2,11,#1_3
#   i,0.09090909091,1,11,i
#
# Returns whatever Kernel#system returns for the last pipeline.
def outputNode(iPath,lastItemNo,oFile)
  temp = MCMD::Mtemp.new
  merged = temp.file
  counts = temp.file

  system("mcat i=#{iPath}/node* |muniq k=cluster,node o=#{merged}")
  system "mcut f=node i=#{merged} | mcount k=node a=freq o=#{counts}"

  clusterTest = lastItemNo ? "${node}>#{lastItemNo}" : "left($s{node},1)==\"#\""
  stages = [
    "mjoin k=node m=#{counts} i=#{merged} |",
    "msel c='$s{node}==$s{cluster} && (${freq}>1 || #{clusterTest})' -r |",
    "mcal c='if($s{node}==$s{cluster},\"\",$s{cluster})' a=newClust|",
    "mcut f=newClust:cluster,node,support,frequency,total |",
    "msortf f=cluster,node o=#{oFile}"
  ]
  system(stages.join)
end
|
312
|
+
|
313
|
+
# Writes the final edge file oFile: the per-level files matching iPath/edge* are
# concatenated, cut to cluster,node1,node2 and the similarity field simStr, and
# sorted by cluster,node1,node2.
#
# Sample of the concatenated input noted by the original author:
#   node1%0,node2%1,jaccard,cluster
#   a,b,0.3333333333,#1_1
#   a,e,0.5,#1_1
#
# Returns whatever Kernel#system returns for the pipeline.
def outputEdge(iPath,simStr,oFile)
  pipeline = [
    "mcat i=#{iPath}/edge*",
    "mcut f=cluster,node1,node2,#{simStr}",
    "msortf f=cluster,node1,node2 o=#{oFile}"
  ].join(" |")
  system(pipeline)
end
|
325
|
+
|
326
|
+
# Hierarchical polishing loop.
#
# Starting from a copy of traFile, each level
#   1. builds a similarity graph (runTra2gc),
#   2. polishes it (runPolish),
#   3. stops when the polished edge file has no record,
#   4. clusters the polished graph (runClustering),
#   5. converts the transaction items to cluster items (runConvert),
#   6. saves the node and edge files of the level as oPath/node_<level> and
#      oPath/edge_<level>,
# and then continues with the converted transactions.
#
# maxLevel : upper limit of levels; 0 means no limit
# maxNo    : passed to runConvert and replaced by its return value at each level
#
# Returns nil.
def hiPolish(traFile,idFN,itemFN,simStr,paramTra2gc,paramPolish,paramCluster,maxLevel,oPath,maxNo)
  temp = MCMD::Mtemp.new
  curTra         = temp.file
  nextTra        = temp.file
  graphNodes     = temp.file
  graphEdges     = temp.file
  polishedNodes  = temp.file
  polishedEdges  = temp.file
  clusters       = temp.file
  nodeClusterMap = temp.file

  system "cp #{traFile} #{curTra}"
  level = 1
  # the loop condition is the check of the iteration limit
  until maxLevel != 0 && level > maxLevel
    # similarity graph, e.g.
    #   node1%0,node2%1,frequency,frequency1,frequency2,total,support,confidence,lift,jaccard,PMI
    #   a,b,2,4,4,11,0.1818181818,0.5,1.375,0.3333333333,0.1868039815
    runTra2gc(curTra, paramTra2gc, graphNodes, graphEdges)

    # polish, e.g.
    #   node1%0,node2%1,support
    #   a,b,0.1818181818
    runPolish(graphNodes, graphEdges, paramPolish, polishedNodes, polishedEdges)

    # stop condition: no edge is left after polishing
    break if MCMD::mrecount("i=#{polishedEdges}") == 0

    # clustering (connected components etc.), e.g.
    #   id%0,node,size
    #   1,a,4
    runClustering(polishedNodes, polishedEdges, paramCluster, clusters)

    # convert the items of the transactions to cluster items, e.g.
    #   map:  id%0,node,size,freq,cluster / 1,a,4,4,#1_1
    #   tra:  id%0,item%1 / 0,i / 1,#1_1
    maxNo = runConvert(curTra, idFN, itemFN, level, clusters, nodeClusterMap, nextTra, maxNo)

    # save node and edge files of this level, e.g.
    #   node: node%0,cluster,support,frequency,total / a,#1_1,0.3636363636,4,11
    #   edge: node1%0,node2%1,cluster,support / a,b,#1_1,0.1818181818
    runSaveNode(polishedNodes, graphNodes, nodeClusterMap, "#{oPath}/node_#{level}")
    runSaveEdge(polishedEdges, graphEdges, nodeClusterMap, simStr, "#{oPath}/edge_#{level}")

    level += 1
    system "cp #{nextTra} #{curTra}"
  end
end
|
411
|
+
|
412
|
+
### parameters for mtra2gc
tra2gcOpts = []
tra2gcOpts << "tid=#{idFN}" if idFN
tra2gcOpts << "item=#{itemFN}" if itemFN
tra2gcOpts << "s=#{sp1}" if sp1
tra2gcOpts << "S=#{sp2}" if sp2
# sim=C th=0 enumerates both directions so that the confidence of each
# direction is listed. The output doubles in size, which is made up for by
# running mfriends with -directed.
tra2gcOpts << "sim=C" << "th=0"
tra2gcOpts << "-node_support" if node_support
tra2gcOpts << "-num" if numtp
# every option is preceded by one blank
paramTra2gc = tra2gcOpts.map { |opt| " #{opt}" }.join

### parameters for polish
paramPolish = " sim=#{simStr}"
paramPolish += " rank=#{rank}" if rank

### parameters for clustering
paramCluster = ""

temp = MCMD::Mtemp.new
xxhis = temp.file
MCMD::mkDir(xxhis, true) # directory holding the history of the merge process

# With -num the largest numeric item of the original transactions is looked up
# first. lastItemNo keeps that value; maxNo is the value handed to hiPolish.
lastItemNo = numtp ? getMaxNo(traFile, itemFN) : nil
maxNo = lastItemNo

# run the hierarchical polishing
hiPolish(traFile, idFN, itemFN, simStr, paramTra2gc, paramPolish, paramCluster, maxLevel, xxhis, maxNo)

# final output of nodes and edges
outputNode(xxhis, lastItemNo, onFile)
outputEdge(xxhis, simStr, oeFile)

# end message
MCMD::endLog(args.cmdline)
|
456
|
+
|