nysol-mining 3.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/bin/mbopt.rb +522 -0
- data/bin/mburst.rb +716 -0
- data/bin/mgfeatures.rb +340 -0
- data/bin/mglmnet.rb +843 -0
- data/bin/mgnfeatures.rb +369 -0
- data/bin/mgpmetis.rb +449 -0
- data/bin/midxmine.rb +484 -0
- data/bin/mnb.rb +631 -0
- data/bin/mnetsimile.rb +572 -0
- data/bin/mnewman.rb +345 -0
- data/bin/msketchsort.rb +243 -0
- data/bin/msm.rb +172 -0
- data/ext/sketchsortrun/Main.cpp +161 -0
- data/ext/sketchsortrun/Main.hpp +24 -0
- data/ext/sketchsortrun/SketchSort.cpp +526 -0
- data/ext/sketchsortrun/SketchSort.hpp +138 -0
- data/ext/sketchsortrun/extconf.rb +26 -0
- data/ext/sketchsortrun/sketchsortrun.cpp +56 -0
- data/lib/nysol/mining.rb +24 -0
- metadata +89 -0
data/bin/msm.rb
ADDED
@@ -0,0 +1,172 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: utf-8
|
3
|
+
|
4
|
+
require "rubygems"
|
5
|
+
require "nysol/mcmd"
|
6
|
+
require "json"
|
7
|
+
|
8
|
+
# 1.0: first release: 2015/5/5
|
9
|
+
$version="1.0"
|
10
|
+
$revision="###VERSION###"
|
11
|
+
|
12
|
+
# Prints the command usage (in Japanese) to STDERR and terminates the script.
#
# The option list mirrors the MCMD::Margs call made by this script, which
# accepts f=,h=,i=,o=,O=,-debug,-mcmdenv,T= and passes "f=,h=,i=,o=" as its
# third argument (presumably the mandatory set -- confirm against Margs).
# The previous text omitted o=, flagged the optional O= as required and did
# not mention T= / -mcmdenv.
def help

STDERR.puts <<EOF
----------------------------
msm.rb version #{$version}
----------------------------
概要) shift mean clustering
特徴) 1) RパッケージLPCMを利用している。
用法) msm.rb f= i= h= o= [O=] [T=] [-debug] [-mcmdenv] [--help]

f= : i=ファイル上の変数項目名【必須】
i= : 入力ファイル名【必須】
h= : band width【必須】
o= : 出力ファイル名【必須】
O= : 出力パス【任意】
T= : ワークファイルパス【任意】
-debug : Rの実行結果を表示
-mcmdenv : KG_VerboseLevelを変更しない(既定は警告とエラーのみ表示)

その他
--help : ヘルプの表示

必要なソフトウェア)
1) R
2) RのLPCMパッケージ

# Copyright(c) NYSOL 2012- All Rights Reserved.
EOF
  exit
end
|
39
|
+
|
40
|
+
# Prints the script version and revision to STDERR, then exits.
# A revision that still contains the VERSION placeholder is reported as "0".
def ver()
  if $revision =~ /VERSION/
    $revision = "0"
  end
  STDERR.puts "version #{$version} revision #{$revision}"
  exit
end
|
45
|
+
|
46
|
+
# --- command line handling -------------------------------------------------
help if ARGV.empty? || ARGV[0] == "--help"
ver if ARGV[0] == "--version"

args = MCMD::Margs.new(ARGV, "f=,h=,i=,o=,O=,-debug,-mcmdenv,T=", "f=,h=,i=,o=")

# Restrict mcmd messages to warnings and errors unless -mcmdenv is given.
unless args.bool("-mcmdenv")
  ENV["KG_VerboseLevel"] = "2"
end

# Abort when R / the LPCM package cannot be run.
exit(1) unless MCMD::chkRexe("LPCM")

# Work-file directory (T=), stored without a trailing slash.
tmpPath = args.str("T=")
unless tmpPath.nil?
  ENV["KG_TmpPath"] = tmpPath.sub(/\/$/, "")
end

iFile = args.file("i=", "r")
oFile = args.file("o=", "w")
flds  = args.field("f=", iFile)
names = flds["names"].join(",")
newnames = flds["newNames"]

# Every field given in f= must carry a new (output) name.
raise "#ERROR# f= parameter takes new field names for output." if newnames.include?(nil)

bw     = args.float("h=")
oPath  = args.file("O=", "w")
$debug = args.bool("-debug")

if oPath
  MCMD::mkDir(oPath)
end
|
76
|
+
|
77
|
+
# Runs mean-shift clustering (LPCM::ms) in R on a CSV file.
#
# The generated R script reads +csv+, subtracts the column means from every
# row, clusters with bandwidth +bw+, adds the column means back to the cluster
# centers and writes "<wp>/xxcluster" (centers) and "<wp>/xxlabel" (labels).
#
# names :: field names (not used by the R script; kept for the caller's signature)
# bw    :: bandwidth handed to ms(h=...)
# csv   :: path of the input CSV
# wp    :: existing directory that receives xxcluster / xxlabel
#
# Returns the result of Kernel#system for the R invocation.
def runR(names,bw,csv,wp)
  wf=MCMD::Mtemp.new
  scp=wf.file

  r_scp = <<EOF
library('LPCM')
d=read.csv("#{csv}")
cm=colMeans(d)
#print(cm)
sftM=function(x){return(x-cm)}
sftP=function(x){return(x+cm)}
dd=t(apply(d,1,sftM))
#print(dd)
model=ms(dd,h=#{bw},plotms=F)

center=t(apply(model$cluster.center,1,sftP))
#print(model$cluster)
#print(center)

#ms.self.coverage(d, taumin=0.02, taumax=0.5, gridsize=25,
#thr=0.0001, scaled=TRUE, cluster=FALSE, plot.type="o",
#or.labels=NULL, print=FALSE)

#print(model)
#write.csv(model$cluster.center,"#{wp}/xxcluster")
write.csv(center,"#{wp}/xxcluster")
write.csv(model$cluster.label ,"#{wp}/xxlabel")

#png("#{wp}/gpr.png")
# plot(model,as="improv")
#dev.off()
EOF

  File.open(scp,"w"){|fpw| fpw.write r_scp}

  cmd = "R --vanilla -q < #{scp}"
  # Kernel#system runs the string through /bin/sh. "&>" is a bash extension:
  # a POSIX sh parses "cmd &>/dev/null" as "cmd &" plus an empty redirect,
  # which would start R in the background and return immediately.
  cmd << " >/dev/null 2>&1" unless $debug
  system cmd
end
|
117
|
+
|
118
|
+
# cluster.csv
|
119
|
+
# "","V1","V2"
|
120
|
+
# "1",0.107262943725142,0.0329636308034888
|
121
|
+
# "2",-0.655560794404871,-0.448416202492924
|
122
|
+
# "3",-0.218883486000835,0.44341544263141
|
123
|
+
|
124
|
+
# label.csv
|
125
|
+
# "","x"
|
126
|
+
# "1",1
|
127
|
+
# "2",1
|
128
|
+
# "3",1
|
129
|
+
|
130
|
+
# --- main pipeline -----------------------------------------------------------
# Work files / work directory.
wf=MCMD::Mtemp.new
xxbase  =wf.file
xxwp    =wf.file
xxcmf   =wf.file
xxlabel =wf.file
MCMD::mkDir(xxwp)

# Extract the clustering variables and run mean-shift in R.
system "mcut f=#{names} i=#{iFile} o=#{xxbase}"

runR(names,bw,xxbase,xxwp)

# "<column number>:<new name>" specs: column 0 of R's CSV is the row name,
# so the variables start at column 1.
nn = newnames.each_with_index.map{|name,i| "#{i+1}:#{name}"}

# cluster master file
# ("tail -n +2" drops R's header line; the historical "tail +2" form is
#  rejected by current tail implementations.)
f=""
f << "tail -n +2 <#{xxwp}/xxcluster |"
f << "mcut f=0:cluster,#{nn.join(",")} -nfni o=#{xxcmf}"
system(f)

# label file
f=""
f << "tail -n +2 <#{xxwp}/xxlabel |"
f << "mcut f=1:cluster -nfni o=#{xxlabel}"
system(f)

# join cmf and label file to ifile
f=""
f << "mpaste m=#{xxlabel} i=#{iFile} |"
f << "mjoin k=cluster m=#{xxcmf} o=#{oFile}"
system(f)

# Optionally keep the cluster master and label files.
if oPath then
  system "cp #{xxcmf} #{oPath}/cluster.csv"
  system "cp #{xxlabel} #{oPath}/label.csv"
end

# completion log message
MCMD::endLog(args.cmdline)
|
172
|
+
|
@@ -0,0 +1,161 @@
|
|
1
|
+
/*
|
2
|
+
* Main.cpp
|
3
|
+
* Copyright (c) 2011 Yasuo Tabei All Rights Reserved.
|
4
|
+
*
|
5
|
+
* Permission is hereby granted, free of charge, to any person
|
6
|
+
* obtaining a copy of this software and associated documentation
|
7
|
+
* files (the "Software"), to deal in the Software without
|
8
|
+
* restriction, including without limitation the rights to use,
|
9
|
+
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
|
11
|
+
* conditions:
|
12
|
+
*
|
13
|
+
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
|
14
|
+
*
|
15
|
+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
17
|
+
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE and
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
18
|
+
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
19
|
+
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
20
|
+
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
21
|
+
* OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
*/
|
23
|
+
|
24
|
+
#include "SketchSort.hpp"
|
25
|
+
|
26
|
+
#include <iostream>
|
27
|
+
#include <cstdlib>
|
28
|
+
|
29
|
+
/* Globals */
|
30
|
+
void usage();
|
31
|
+
void version();
|
32
|
+
void parse_parameters (int argc, char **argv);
|
33
|
+
|
34
|
+
char *fname, *oname;
|
35
|
+
int hamDist = 1;
|
36
|
+
int numblocks = 4;
|
37
|
+
int numchunks = 3;
|
38
|
+
float cosDist = 0.01;
|
39
|
+
bool autoFlag = false;
|
40
|
+
float missingratio = 0.0001;
|
41
|
+
bool centering = false;
|
42
|
+
int windowsize = 0;
|
43
|
+
int seed = 1;
|
44
|
+
|
45
|
+
|
46
|
+
/*******************************************************************************/
|
47
|
+
#ifndef _NO_MAIN_
|
48
|
+
#define _NO_MAIN_
|
49
|
+
int main(int argc, char **argv)
|
50
|
+
{
|
51
|
+
version();
|
52
|
+
|
53
|
+
parse_parameters(argc, argv);
|
54
|
+
|
55
|
+
SketchSort sketchsort;
|
56
|
+
sketchsort.run(fname, oname, numblocks, hamDist, cosDist, numchunks, autoFlag, missingratio, centering, windowsize, seed);
|
57
|
+
|
58
|
+
return 0;
|
59
|
+
}
|
60
|
+
|
61
|
+
#endif
|
62
|
+
/*******************************************************************************/
|
63
|
+
|
64
|
+
|
65
|
+
int sketchsort_main (int argc, char **argv){
|
66
|
+
|
67
|
+
parse_parameters(argc, argv);
|
68
|
+
|
69
|
+
SketchSort sketchsort;
|
70
|
+
sketchsort.run(fname, oname, numblocks, hamDist, cosDist, numchunks, autoFlag, missingratio, centering, windowsize, seed);
|
71
|
+
|
72
|
+
return 0;
|
73
|
+
}
|
74
|
+
|
75
|
+
|
76
|
+
/* Writes the program name/version and author, followed by an empty line,
 * to standard error. */
void version(){
  std::ostream &err = std::cerr;
  err << "SketchSort version 0.0.8" << std::endl;
  err << "Written by Yasuo Tabei" << std::endl;
  err << std::endl;
}
|
80
|
+
|
81
|
+
void usage(){
|
82
|
+
std::cerr << std::endl
|
83
|
+
<< "Usage: sketchsort [OPTION]... INFILE OUTFILE" << std::endl << std::endl
|
84
|
+
<< " where [OPTION]... is a list of zero or more optional arguments" << std::endl
|
85
|
+
<< " INFILE is the name of an input file" << std::endl
|
86
|
+
<< " OUTFILE is the name of an output file" << std::endl << std::endl
|
87
|
+
<< "Additional arguments (input and output files may be specified):" << std::endl
|
88
|
+
<< " -hamdist [maximum hamming distance]" << std::endl
|
89
|
+
<< " (default: " << hamDist << ")" << std::endl
|
90
|
+
<< " -numblocks [the number of blocks]" << std::endl
|
91
|
+
<< " (default: " << numblocks << ")" << std::endl
|
92
|
+
<< " -cosdist [maximum cosine distance]" << std::endl
|
93
|
+
<< " (default: " << cosDist << ")" << std::endl
|
94
|
+
<< " -numchunks [the number of chunks]" << std::endl
|
95
|
+
<< " (default: " << numchunks << ")" << std::endl
|
96
|
+
<< " -auto " << std::endl
|
97
|
+
<< " -missingratio " << std::endl
|
98
|
+
<< " (default: " << missingratio << ")" << std::endl
|
99
|
+
<< " -centering" << std::endl
|
100
|
+
<< " -windowsize" << std::endl
|
101
|
+
<< " (default: " << windowsize << ")" << std::endl
|
102
|
+
<< " -seed " << std::endl
|
103
|
+
<< std::endl;
|
104
|
+
exit(0);
|
105
|
+
}
|
106
|
+
|
107
|
+
void parse_parameters (int argc, char **argv){
|
108
|
+
if (argc == 1) usage();
|
109
|
+
int argno;
|
110
|
+
for (argno = 1; argno < argc; argno++){
|
111
|
+
if (argv[argno][0] == '-'){
|
112
|
+
if (!strcmp (argv[argno], "-version")){
|
113
|
+
version();
|
114
|
+
}
|
115
|
+
else if (!strcmp (argv[argno], "-auto")) {
|
116
|
+
autoFlag = true;
|
117
|
+
}
|
118
|
+
else if (!strcmp (argv[argno], "-centering")) {
|
119
|
+
centering = true;
|
120
|
+
}
|
121
|
+
else if (!strcmp (argv[argno], "-numblocks")) {
|
122
|
+
if (argno == argc - 1) std::cerr << "Must specify minimum support after -numblocks" << std::endl;
|
123
|
+
numblocks = atoi(argv[++argno]);
|
124
|
+
}
|
125
|
+
else if (!strcmp (argv[argno], "-hamdist")) {
|
126
|
+
if (argno == argc - 1) std::cerr << "Must specify hamming distance threshold after -hamdist" << std::endl;
|
127
|
+
hamDist = atoi(argv[++argno]);
|
128
|
+
}
|
129
|
+
else if (!strcmp (argv[argno], "-cosdist")) {
|
130
|
+
if (argno == argc - 1) std::cerr << "Must specify cosine distance threshold size after -cosdist" << std::endl;
|
131
|
+
cosDist = atof(argv[++argno]);
|
132
|
+
}
|
133
|
+
else if (!strcmp (argv[argno], "-numchunks")) {
|
134
|
+
if (argno == argc - 1) std::cerr << "Must specify number of chunks after -numchunks" << std::endl;
|
135
|
+
numchunks = atoi(argv[++argno]);
|
136
|
+
}
|
137
|
+
else if (!strcmp (argv[argno], "-missingratio")) {
|
138
|
+
if (argno == argc - 1) std::cerr << "Must specify missing edge ratio after -missingratio" << std::endl;
|
139
|
+
missingratio = atof(argv[++argno]);
|
140
|
+
}
|
141
|
+
else if (!strcmp (argv[argno], "-seed")) {
|
142
|
+
if (argno == argc - 1) std::cerr << "Must specify initial seed after -seed" << std::endl;
|
143
|
+
seed = atoi(argv[++argno]);
|
144
|
+
}
|
145
|
+
else if (!strcmp (argv[argno], "-windowsize")) {
|
146
|
+
if (argno == argc - 1) std::cerr << "Must specify windowsize after -windowsize" << std::endl;
|
147
|
+
windowsize = atoi(argv[++argno]);
|
148
|
+
}
|
149
|
+
else {
|
150
|
+
usage();
|
151
|
+
}
|
152
|
+
} else {
|
153
|
+
break;
|
154
|
+
}
|
155
|
+
}
|
156
|
+
if (argno > argc)
|
157
|
+
usage();
|
158
|
+
|
159
|
+
fname = argv[argno];
|
160
|
+
oname = argv[argno + 1];
|
161
|
+
}
|
@@ -0,0 +1,24 @@
|
|
1
|
+
/*
|
2
|
+
* Main.hpp
|
3
|
+
* Copyright (c) 2011 Yasuo Tabei All Rights Reserved.
|
4
|
+
*
|
5
|
+
* Permission is hereby granted, free of charge, to any person
|
6
|
+
* obtaining a copy of this software and associated documentation
|
7
|
+
* files (the "Software"), to deal in the Software without
|
8
|
+
* restriction, including without limitation the rights to use,
|
9
|
+
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
|
11
|
+
* conditions:
|
12
|
+
*
|
13
|
+
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
|
14
|
+
*
|
15
|
+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
17
|
+
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE and
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
18
|
+
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
19
|
+
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
20
|
+
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
21
|
+
* OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
*/
|
23
|
+
|
24
|
+
int sketchsort_main (int argc, char **argv);
|
@@ -0,0 +1,526 @@
|
|
1
|
+
/*
|
2
|
+
* SketchSort.cpp
|
3
|
+
* Copyright (c) 2011 Yasuo Tabei All Rights Reserved.
|
4
|
+
*
|
5
|
+
* Permission is hereby granted, free of charge, to any person
|
6
|
+
* obtaining a copy of this software and associated documentation
|
7
|
+
* files (the "Software"), to deal in the Software without
|
8
|
+
* restriction, including without limitation the rights to use,
|
9
|
+
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
|
11
|
+
* conditions:
|
12
|
+
*
|
13
|
+
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
|
14
|
+
*
|
15
|
+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
17
|
+
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE and
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
18
|
+
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
19
|
+
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
20
|
+
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
21
|
+
* OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
*/
|
23
|
+
|
24
|
+
#include "SketchSort.hpp"
|
25
|
+
|
26
|
+
/* Maps a value to a single bit: 1 when strictly positive, 0 otherwise. */
template<class T>
inline uint8_t sign(T val) {
  return (val > 0) ? 1 : 0;
}
|
32
|
+
|
33
|
+
/* Returns a1 when it is strictly greater than a2, otherwise a2. */
template<class T>
inline T max(T a1, T a2) {
  return (a1 > a2) ? a1 : a2;
}
|
39
|
+
|
40
|
+
/* Strict-weak ordering on (int, float) pairs that compares only the float
 * component, ascending. */
bool cmp(const std::pair<int, float> &p1, const std::pair<int, float> &p2) {
  const float lhs = p1.second;
  const float rhs = p2.second;
  return lhs < rhs;
}
|
43
|
+
|
44
|
+
void SketchSort::readFeature(const char *fname, unsigned int _windowsize) {
|
45
|
+
std::ifstream ifs(fname);
|
46
|
+
|
47
|
+
if (!ifs) {
|
48
|
+
std::cerr << "can not open " << fname << std::endl;
|
49
|
+
exit(0);
|
50
|
+
}
|
51
|
+
|
52
|
+
dim = 0;
|
53
|
+
float val = 0.f;
|
54
|
+
uint64_t lineCnt = 0;
|
55
|
+
std::string line;
|
56
|
+
while (std::getline(ifs, line)) {
|
57
|
+
fvs.resize(fvs.size() + 1);
|
58
|
+
tws.resize(fvs.size() + 1);
|
59
|
+
boost::numeric::ublas::vector<float> &fv = fvs[fvs.size() - 1];
|
60
|
+
uint32_t counter = 0;
|
61
|
+
std::istringstream is(line);
|
62
|
+
if (_windowsize > 0){
|
63
|
+
is >> val;
|
64
|
+
tws[lineCnt++] = val;
|
65
|
+
}
|
66
|
+
if (dim != 0) {
|
67
|
+
fv.resize(dim);
|
68
|
+
while (is >> val) {
|
69
|
+
fv[counter++]= val;
|
70
|
+
}
|
71
|
+
if (counter != dim) {
|
72
|
+
std::cerr << "dimesions of the input vector should be same!" << std::endl;
|
73
|
+
std::cerr << line << std::endl;
|
74
|
+
std::cerr << "dim:" << dim << " dim:" << counter << std::endl;
|
75
|
+
exit(1);
|
76
|
+
}
|
77
|
+
} else {
|
78
|
+
while (is >> val) {
|
79
|
+
fv.resize(counter + 1);
|
80
|
+
fv[counter] = val;
|
81
|
+
counter++;
|
82
|
+
}
|
83
|
+
dim = counter;
|
84
|
+
}
|
85
|
+
}
|
86
|
+
}
|
87
|
+
|
88
|
+
void SketchSort::centeringData() {
|
89
|
+
size_t dim = fvs[0].size();
|
90
|
+
size_t numData = fvs.size();
|
91
|
+
float mean;
|
92
|
+
for (size_t i = 0; i < dim; i++) {
|
93
|
+
mean = 0.f;
|
94
|
+
for (size_t j = 0; j < numData; j++) {
|
95
|
+
mean += fvs[j][i];
|
96
|
+
}
|
97
|
+
mean /= (float)numData;
|
98
|
+
for (size_t j = 0; j < numData; j++) {
|
99
|
+
fvs[j][i] -= mean;
|
100
|
+
}
|
101
|
+
}
|
102
|
+
}
|
103
|
+
|
104
|
+
/* sparse random projection
|
105
|
+
int SketchSort::projectVectors(unsigned int projectDim, std::vector<uint8_t*> &sig, params &param) {
|
106
|
+
|
107
|
+
p = new boost::pool<>(sizeof(uint8_t));
|
108
|
+
sig.resize(fvs.size());
|
109
|
+
param.ids.resize(fvs.size());
|
110
|
+
for (size_t i = 0; i < sig.size(); i++) {
|
111
|
+
// sig[i] = new uint32_t[projectDim + 1];
|
112
|
+
sig[i] = (uint8_t*)p->ordered_malloc(projectDim + 1);
|
113
|
+
param.ids[i] = i;
|
114
|
+
}
|
115
|
+
|
116
|
+
boost::mt19937 gen(static_cast<unsigned long>(time(0)));
|
117
|
+
boost::uniform_real<> dst(0.f, 1.f);
|
118
|
+
boost::variate_generator<boost::mt19937&, boost::uniform_real<> > rand(gen, dst);
|
119
|
+
// double tiny = 1.0/1.79e+308;
|
120
|
+
std::vector<std::pair<int, float> > randMat;
|
121
|
+
float s = sqrt(float(dim));
|
122
|
+
// float s = dim/log(dim);
|
123
|
+
float thr = 1.f/(2*s);
|
124
|
+
float coff = sqrt(s);
|
125
|
+
for (size_t i = 0; i < projectDim; i++) {
|
126
|
+
randMat.clear();
|
127
|
+
for (size_t j = 0; j < dim; j++) {
|
128
|
+
float r = rand();
|
129
|
+
if (r < thr) {
|
130
|
+
randMat.push_back(std::make_pair(j, coff));
|
131
|
+
} else if (r < 2*thr) {
|
132
|
+
randMat.push_back(std::make_pair(j, -coff));
|
133
|
+
}
|
134
|
+
}
|
135
|
+
|
136
|
+
for (size_t j = 0; j < fvs.size(); j++) {
|
137
|
+
boost::numeric::ublas::vector<float> &fv = fvs[j];
|
138
|
+
double proc = 0.f;
|
139
|
+
for (size_t k = 0; k < randMat.size(); k++) {
|
140
|
+
proc += fv[randMat[k].first] * randMat[k].second;
|
141
|
+
}
|
142
|
+
sig[j][i+1] = sign(proc);
|
143
|
+
}
|
144
|
+
}
|
145
|
+
param.seq_len = projectDim;
|
146
|
+
param.num_seq = fvs.size();
|
147
|
+
|
148
|
+
return 1;
|
149
|
+
}
|
150
|
+
*/
|
151
|
+
|
152
|
+
int SketchSort::projectVectors(unsigned int projectDim, std::vector<uint8_t*> &sig, unsigned int _seed, params ¶m) {
|
153
|
+
std::vector<float> randMat;
|
154
|
+
p = new boost::pool<>(sizeof(uint8_t));
|
155
|
+
sig.resize(fvs.size());
|
156
|
+
param.ids.resize(fvs.size());
|
157
|
+
for (size_t i = 0; i < sig.size(); i++) {
|
158
|
+
// sig[i] = new uint32_t[projectDim + 1];
|
159
|
+
sig[i] = (uint8_t*)p->ordered_malloc(projectDim + 1);
|
160
|
+
param.ids[i] = i;
|
161
|
+
}
|
162
|
+
boost::mt19937 gen(static_cast<unsigned long>(_seed));
|
163
|
+
//boost::mt19937 gen(static_cast<unsigned long>(time(0)));
|
164
|
+
boost::normal_distribution<> dst(0.f, 1.f);
|
165
|
+
boost::variate_generator<boost::mt19937&, boost::normal_distribution<> > rand(gen, dst);
|
166
|
+
|
167
|
+
// double tiny = 1.0/1.79e+308;
|
168
|
+
randMat.resize(dim + 1);
|
169
|
+
for (size_t i = 0; i < projectDim; i++) {
|
170
|
+
for (size_t j = 0; j <= dim; j++) {
|
171
|
+
randMat[j] = rand();
|
172
|
+
}
|
173
|
+
|
174
|
+
for (size_t j = 0; j < fvs.size(); j++) {
|
175
|
+
boost::numeric::ublas::vector<float> &fv = fvs[j];
|
176
|
+
double proc = 0.f;
|
177
|
+
for (size_t k = 0; k < fv.size(); k++)
|
178
|
+
proc += fv[k] * randMat[k];
|
179
|
+
|
180
|
+
sig[j][i+1] = sign(proc);
|
181
|
+
}
|
182
|
+
}
|
183
|
+
param.seq_len = projectDim;
|
184
|
+
param.num_seq = fvs.size();
|
185
|
+
|
186
|
+
return 1;
|
187
|
+
}
|
188
|
+
|
189
|
+
inline float SketchSort::checkCos(unsigned int id1, unsigned int id2) {
|
190
|
+
++numCosDist;
|
191
|
+
boost::numeric::ublas::vector<float> &fv_1 = fvs[id1];
|
192
|
+
boost::numeric::ublas::vector<float> &fv_2 = fvs[id2];
|
193
|
+
float sum = boost::numeric::ublas::inner_prod(fv_1, fv_2);
|
194
|
+
|
195
|
+
return (1.f - sum*(norms[id1]*norms[id2]));
|
196
|
+
}
|
197
|
+
|
198
|
+
inline void SketchSort::sort(std::vector<uint8_t*> &sig, int spos, int epos, int l, int r, params ¶m) {
|
199
|
+
if (r - l + 1 > 50) radixsort(sig, spos, epos, l, r, param);
|
200
|
+
else insertionSort(sig, spos, epos, l, r, param);
|
201
|
+
}
|
202
|
+
|
203
|
+
inline void SketchSort::radixsort(std::vector<uint8_t*> &sig, int spos, int epos, int l, int r, params ¶m) {
|
204
|
+
unsigned int *c = param.counter;
|
205
|
+
std::vector<unsigned int> &ids = param.ids;
|
206
|
+
std::vector<uint8_t*> newsig(r - l + 1);
|
207
|
+
std::vector<unsigned int> newids(r - l + 1);
|
208
|
+
unsigned int tmp;
|
209
|
+
int tpos = spos - 1;
|
210
|
+
while (++tpos <= epos) {
|
211
|
+
for (int i = 0; i < num_char; i++) *(c + i) = 0;
|
212
|
+
for (int i = l; i <= r; i++) c[sig[i][tpos]]++;
|
213
|
+
for (int i = 1; i < num_char; i++) *(c + i) += *(c + i - 1);
|
214
|
+
for (int i = r; i >= l; --i) {
|
215
|
+
tmp = --c[sig[i][tpos]] + l;
|
216
|
+
newids[tmp - l] = ids[i];
|
217
|
+
newsig[tmp - l] = sig[i];
|
218
|
+
}
|
219
|
+
if (++tpos <= epos) {
|
220
|
+
for (int i = 0; i < num_char; i++) *(c + i) = 0;
|
221
|
+
for (int i = l; i <= r; i++) c[newsig[i - l][tpos]]++;
|
222
|
+
for (int i = 1; i < num_char; i++) *(c + i) += *(c + i - 1);
|
223
|
+
for (int i = r; i >= l; --i) {
|
224
|
+
tmp = --c[newsig[i - l][tpos]] + l;
|
225
|
+
ids[tmp] = newids[i - l];
|
226
|
+
sig[tmp] = newsig[i - l];
|
227
|
+
}
|
228
|
+
}
|
229
|
+
else {
|
230
|
+
for (int i = l; i <= r; i++) {
|
231
|
+
ids[i] = newids[i - l];
|
232
|
+
sig[i] = newsig[i - l];
|
233
|
+
}
|
234
|
+
return;
|
235
|
+
}
|
236
|
+
}
|
237
|
+
}
|
238
|
+
|
239
|
+
inline void SketchSort::insertionSort(std::vector<uint8_t*> &sig, int spos, int epos, int l, int r, params ¶m) {
|
240
|
+
int i, j;
|
241
|
+
uint8_t *pivot, pval;
|
242
|
+
unsigned int pid;
|
243
|
+
std::vector<unsigned int> &ids = param.ids;
|
244
|
+
for (int tpos = spos; tpos <= epos; tpos++) {
|
245
|
+
for (i = l + 1; i <= r; i++) {
|
246
|
+
pivot = sig[i]; pval = sig[i][tpos]; pid = ids[i];
|
247
|
+
for (j = i; j > l && sig[j-1][tpos] > pval; j--) {
|
248
|
+
sig[j] = sig[j-1];
|
249
|
+
ids[j] = ids[j-1];
|
250
|
+
}
|
251
|
+
sig[j] = pivot;
|
252
|
+
ids[j] = pid;
|
253
|
+
}
|
254
|
+
}
|
255
|
+
}
|
256
|
+
|
257
|
+
inline void SketchSort::classify(std::vector<uint8_t*> &sig, int spos, int epos, int l, int r, int bpos, params ¶m, unsigned int _windowsize) {
|
258
|
+
int n_l = l, n_r = r;
|
259
|
+
for (int iter = l + 1; iter <= r; iter++) {
|
260
|
+
if (!std::equal(sig[n_l] + spos, sig[n_l] + epos + 1, sig[iter] + spos)) {
|
261
|
+
n_r = iter - 1;
|
262
|
+
if (n_r - n_l >= 1)
|
263
|
+
multi_classification(sig, bpos + 1, n_l, n_r, param, _windowsize);
|
264
|
+
n_l = iter;
|
265
|
+
}
|
266
|
+
}
|
267
|
+
if (r - n_l >= 1)
|
268
|
+
multi_classification(sig, bpos + 1, n_l, r, param, _windowsize);
|
269
|
+
}
|
270
|
+
|
271
|
+
inline bool SketchSort::calc_chunk_hamdist(uint8_t *seq1, uint8_t *seq2, const params ¶m) {
|
272
|
+
++numHamDist;
|
273
|
+
unsigned int d = 0;
|
274
|
+
for (size_t i = 1; i <= param.chunk_len; i++)
|
275
|
+
if (*seq1++ != *seq2++ && ++d > param.chunk_dist) return false;
|
276
|
+
return true;
|
277
|
+
}
|
278
|
+
|
279
|
+
inline bool SketchSort::check_chunk_canonical(uint8_t *seq1, uint8_t *seq2, const params ¶m) {
|
280
|
+
unsigned int d = 0;
|
281
|
+
int end = param.pchunks[param.cchunk].start - 1;
|
282
|
+
int j = 1;
|
283
|
+
int tend = param.pchunks[j].end;
|
284
|
+
int i = 0;
|
285
|
+
|
286
|
+
while (++i <= end) {
|
287
|
+
if ((d += abs(seq1[i] - seq2[i])) > param.chunk_dist) {
|
288
|
+
while (++i <= tend) d += abs(seq1[i] - seq2[i]);
|
289
|
+
// if (seq1[i] != seq2[i]) ++d;
|
290
|
+
d = 0;
|
291
|
+
tend = param.pchunks[++j].end;
|
292
|
+
i = param.pchunks[j].start - 1;
|
293
|
+
continue;
|
294
|
+
}
|
295
|
+
if (tend == i)
|
296
|
+
return false;
|
297
|
+
}
|
298
|
+
return true;
|
299
|
+
}
|
300
|
+
|
301
|
+
inline bool SketchSort::check_canonical(uint8_t *seq1, uint8_t *seq2, const params ¶m) {
|
302
|
+
size_t sb = 1, eb = 1;
|
303
|
+
size_t b;
|
304
|
+
for (size_t i = 0, size = param.blocks.size(); i < size; i++) {
|
305
|
+
eb = param.blocks[i];
|
306
|
+
for (b = sb; b < eb; b++) {
|
307
|
+
if (std::equal(seq1 + param.pos[b].start, seq1 + param.pos[b].end + 1, seq2 + param.pos[b].start))
|
308
|
+
return false;
|
309
|
+
}
|
310
|
+
sb = param.blocks[i] + 1;
|
311
|
+
}
|
312
|
+
return true;
|
313
|
+
}
|
314
|
+
|
315
|
+
inline void SketchSort::report(std::vector<uint8_t*> &sig, int l, int r, params ¶m, unsigned int _windowsize) {
|
316
|
+
// std::cout << "report" << std::endl;
|
317
|
+
float cosDist;
|
318
|
+
for (int i = l; i < r; i++) {
|
319
|
+
for (int j = i + 1; j <= r; j++) {
|
320
|
+
unsigned int span = abs(tws[param.ids[j]]-tws[param.ids[i]]);
|
321
|
+
if (_windowsize != 0 && ( span > _windowsize || span == 0 )){
|
322
|
+
//if (_windowsize != 0 && span > _windowsize)
|
323
|
+
continue;
|
324
|
+
}
|
325
|
+
if (check_canonical(sig[i], sig[j], param) &&
|
326
|
+
calc_chunk_hamdist(sig[i] + param.start_chunk, sig[j] + param.start_chunk, param) &&
|
327
|
+
check_chunk_canonical(sig[i], sig[j], param) &&
|
328
|
+
((cosDist = checkCos(param.ids[i], param.ids[j])) <= param.cosDist)) {
|
329
|
+
(*param.os) << param.ids[i] << " " << param.ids[j] << " " << cosDist << std::endl;
|
330
|
+
}
|
331
|
+
}
|
332
|
+
}
|
333
|
+
}
|
334
|
+
|
335
|
+
void SketchSort::multi_classification(std::vector<uint8_t*> &sig, int maxind, int l, int r, params ¶m, unsigned int _windowsize) {
|
336
|
+
|
337
|
+
if (param.blocks.size() == param.numblocks - param.chunk_dist) {
|
338
|
+
report(sig, l, r, param, _windowsize);
|
339
|
+
return;
|
340
|
+
}
|
341
|
+
|
342
|
+
for (int bpos = maxind; bpos <= (int)param.numblocks; bpos++) {
|
343
|
+
|
344
|
+
if (param.blocks.size() + (param.numblocks - bpos + 1) < param.numblocks - param.chunk_dist) { // pruning
|
345
|
+
// std::cerr << "return " << std::endl;
|
346
|
+
return;
|
347
|
+
}
|
348
|
+
param.blocks.push_back(bpos);
|
349
|
+
sort(sig, param.pos[bpos].start, param.pos[bpos].end, l, r, param);
|
350
|
+
classify(sig, param.pos[bpos].start, param.pos[bpos].end, l, r, bpos, param, _windowsize);
|
351
|
+
param.blocks.pop_back();
|
352
|
+
}
|
353
|
+
}
|
354
|
+
|
355
|
+
/* Binomial coefficient C(n, m) as a double; returns 1.0 for m <= 0.
 *
 * Uses C(n, i + 1) = C(n, i) * (n - i) / (i + 1), so every intermediate value
 * is itself a binomial coefficient. The previous version multiplied by the
 * integer quotient (n - i) / (m - i), which truncates (e.g. C(5, 2) gave 8). */
double combination(int n, int m) {
  double sum = 1.0;
  for (int i = 0; i < m; i++) {
    sum = sum * (double)(n - i) / (double)(i + 1);
  }
  return sum;
}
|
362
|
+
|
363
|
+
double SketchSort::calcMissingEdgeRatio(params ¶m) {
|
364
|
+
double sum = 0.f;
|
365
|
+
double prob = acos(1.0 - param.cosDist)/M_PI;
|
366
|
+
for (unsigned int k = 0; k <= param.chunk_dist; k++) {
|
367
|
+
sum += (combination(param.projectDim, k) * pow(prob, k) * pow(1 - prob, param.projectDim - k));
|
368
|
+
}
|
369
|
+
return pow(1.0 - sum, param.numchunks);
|
370
|
+
}
|
371
|
+
|
372
|
+
void SketchSort::preComputeNorms() {
|
373
|
+
norms.resize(fvs.size());
|
374
|
+
float sum;
|
375
|
+
for (size_t i = 0; i < fvs.size(); i++) {
|
376
|
+
boost::numeric::ublas::vector<float> &fv = fvs[i];
|
377
|
+
sum = 0.f;
|
378
|
+
for (size_t j = 0; j < fv.size(); j++) {
|
379
|
+
sum += pow(fv[j], 2);
|
380
|
+
}
|
381
|
+
norms[i] = 1.f/sqrt(sum);
|
382
|
+
}
|
383
|
+
}
|
384
|
+
|
385
|
+
void SketchSort::decideParameters(float _missingratio, params ¶m) {
|
386
|
+
unsigned int hamDist = 1;
|
387
|
+
unsigned int numBlocks = hamDist + 3;
|
388
|
+
unsigned int numchunks = 0;
|
389
|
+
|
390
|
+
do {
|
391
|
+
if (numchunks > 30) {
|
392
|
+
hamDist += 1;
|
393
|
+
numBlocks = hamDist + 3;
|
394
|
+
numchunks = 0;
|
395
|
+
}
|
396
|
+
numchunks += 1;
|
397
|
+
param.chunk_dist = hamDist;
|
398
|
+
param.numblocks = numBlocks;
|
399
|
+
param.numchunks = numchunks;
|
400
|
+
} while (calcMissingEdgeRatio(param) >= _missingratio);
|
401
|
+
}
|
402
|
+
|
403
|
+
void SketchSort::run(const char *fname, const char *oname,
|
404
|
+
unsigned int _numblocks,
|
405
|
+
unsigned int _dist,
|
406
|
+
float _cosDist,
|
407
|
+
unsigned int _numchunks,
|
408
|
+
bool _autoFlag,
|
409
|
+
float _missingratio,
|
410
|
+
bool _centering,
|
411
|
+
unsigned int _windowsize,
|
412
|
+
unsigned int _seed)
|
413
|
+
{
|
414
|
+
params param;
|
415
|
+
param.numblocks = _numblocks;
|
416
|
+
param.numchunks = _numchunks;
|
417
|
+
param.chunk_dist = _dist;
|
418
|
+
param.cosDist = _cosDist;
|
419
|
+
num_char = 2;
|
420
|
+
param.projectDim = 32;
|
421
|
+
|
422
|
+
numSort = 0;
|
423
|
+
numCosDist = 0;
|
424
|
+
numHamDist = 0;
|
425
|
+
|
426
|
+
if (_autoFlag) {
|
427
|
+
// std::cerr << "deciding parameters such that the missing edge ratio is no more than " << _missingratio << std::endl;
|
428
|
+
decideParameters(_missingratio, param);
|
429
|
+
// std::cout << "decided parameters:" << std::endl;
|
430
|
+
// std::cout << "hamming distance threshold: " << param.chunk_dist << std::endl;
|
431
|
+
// std::cout << "number of blocks: " << param.numblocks << std::endl;
|
432
|
+
// std::cout << "number of chunks: " << param.numchunks << std::endl;
|
433
|
+
// std::cout << std::endl;
|
434
|
+
}
|
435
|
+
|
436
|
+
std::ofstream ofs(oname);
|
437
|
+
param.os = &ofs;
|
438
|
+
|
439
|
+
//std::cout << "missing edge ratio:" << calcMissingEdgeRatio(param) << std::endl;
|
440
|
+
|
441
|
+
//std::cerr << "start reading" << std::endl;
|
442
|
+
double readstart = clock();
|
443
|
+
readFeature(fname,_windowsize);
|
444
|
+
double readend = clock();
|
445
|
+
//std::cerr << "end reading" << std::endl;
|
446
|
+
//std::cout << "readtime:" << (readend - readstart)/(double)CLOCKS_PER_SEC << std::endl;
|
447
|
+
|
448
|
+
if (_centering) {
|
449
|
+
//std::cerr << "start making input-data centered at 0" << std::endl;
|
450
|
+
double centeringstart = clock();
|
451
|
+
centeringData();
|
452
|
+
double centeringend = clock();
|
453
|
+
//std::cerr << "end making input-data centered at 0" << std::endl;
|
454
|
+
//std::cout << "centering time:" << (centeringend - centeringstart)/(double)CLOCKS_PER_SEC << std::endl;
|
455
|
+
|
456
|
+
}
|
457
|
+
|
458
|
+
|
459
|
+
double totalstart = clock();
|
460
|
+
preComputeNorms();
|
461
|
+
//param.projectDim = 2*(int)log(dim);
|
462
|
+
|
463
|
+
param.counter = new unsigned int[num_char];
|
464
|
+
|
465
|
+
//std::cout << "number of data:" << fvs.size() << std::endl;
|
466
|
+
//std::cout << "data dimension:" << dim << std::endl;
|
467
|
+
//std::cout << "projected dimension:" << param.projectDim << std::endl;
|
468
|
+
//std::cout << "length of strings:" << param.projectDim * param.numchunks << std::endl;
|
469
|
+
//std::cout << "number of chunks:" << param.numchunks << std::endl;
|
470
|
+
|
471
|
+
double projectstart = clock();
|
472
|
+
//std::cerr << "start projection" << std::endl;
|
473
|
+
std::vector<uint8_t*> sig;
|
474
|
+
projectVectors(param.projectDim * param.numchunks, sig, _seed, param);
|
475
|
+
//read(fname, sig, param);
|
476
|
+
//std::cerr << "end projection" << std::endl;
|
477
|
+
double projectend = clock();
|
478
|
+
//std::cout << "projecttime:" << (projectend - projectstart)/(double)CLOCKS_PER_SEC << std::endl;
|
479
|
+
|
480
|
+
param.pchunks = new pstat[param.numchunks + 1];
|
481
|
+
for (int i = 1; i <= (int)param.numchunks; i++) {
|
482
|
+
param.pchunks[i].start = (int)ceil((double)param.seq_len*((double)(i - 1)/(double)param.numchunks)) + 1;
|
483
|
+
param.pchunks[i].end = (int)ceil((double)param.seq_len*(double)i/(double)param.numchunks);
|
484
|
+
}
|
485
|
+
|
486
|
+
double msmtime = 0.0;
|
487
|
+
|
488
|
+
|
489
|
+
//std::cerr << "chunk distance:" << param.chunk_dist << std::endl;
|
490
|
+
//std::cerr << "the number of blocks:" << param.numblocks << std::endl;
|
491
|
+
param.pos = new pstat[param.numblocks + 1];
|
492
|
+
for (int i = 1; i <= (int) param.numchunks; i++) {
|
493
|
+
param.chunk_len = param.pchunks[i].end - param.pchunks[i].start + 1;
|
494
|
+
param.start_chunk = param.pchunks[i].start;
|
495
|
+
param.end_chunk = param.pchunks[i].end;
|
496
|
+
param.cchunk = i;
|
497
|
+
for (int j = 1; j <= (int)param.numblocks; j++) {
|
498
|
+
param.pos[j].start = (int)ceil((double)param.chunk_len*((double)(j - 1)/(double)param.numblocks)) + param.pchunks[i].start;
|
499
|
+
param.pos[j].end = (int)ceil((double)param.chunk_len*(double)j/(double)param.numblocks) + param.pchunks[i].start - 1;
|
500
|
+
}
|
501
|
+
//std::cerr << "start enumeration chunk no " << i << std::endl;
|
502
|
+
double msmstart = clock();
|
503
|
+
//std::cout << "sig=" << sig << std::endl;
|
504
|
+
//std::cout << "param.num_seq=" << param.num_seq << std::endl;
|
505
|
+
//std::cout << "param=" << param << std::endl;
|
506
|
+
multi_classification(sig, 1, 0, param.num_seq - 1, param, _windowsize);
|
507
|
+
double msmend = clock();
|
508
|
+
msmtime += (msmend - msmstart)/(double)CLOCKS_PER_SEC;
|
509
|
+
}
|
510
|
+
//std::cout << "msmtime:" << msmtime << std::endl;
|
511
|
+
|
512
|
+
double totalend = clock();
|
513
|
+
//std::cout << "cputime:" << (totalend - totalstart)/(double)CLOCKS_PER_SEC << std::endl;
|
514
|
+
|
515
|
+
//std::cout << "numSort:" << combination(param.numblocks, param.chunk_dist) * param.numchunks << std::endl;
|
516
|
+
//std::cout << "numHamDist:" << numHamDist << std::endl;
|
517
|
+
//std::cout << "numCosDist:" << numCosDist << std::endl;
|
518
|
+
ofs.close();
|
519
|
+
// destructor
|
520
|
+
delete p;
|
521
|
+
delete[] param.counter;
|
522
|
+
delete[] param.pchunks;
|
523
|
+
delete[] param.pos;
|
524
|
+
|
525
|
+
return;
|
526
|
+
}
|