nysol-mining 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/mbopt.rb +522 -0
- data/bin/mburst.rb +716 -0
- data/bin/mgfeatures.rb +340 -0
- data/bin/mglmnet.rb +843 -0
- data/bin/mgnfeatures.rb +369 -0
- data/bin/mgpmetis.rb +449 -0
- data/bin/midxmine.rb +484 -0
- data/bin/mnb.rb +631 -0
- data/bin/mnetsimile.rb +572 -0
- data/bin/mnewman.rb +345 -0
- data/bin/msketchsort.rb +243 -0
- data/bin/msm.rb +172 -0
- data/ext/sketchsortrun/Main.cpp +161 -0
- data/ext/sketchsortrun/Main.hpp +24 -0
- data/ext/sketchsortrun/SketchSort.cpp +526 -0
- data/ext/sketchsortrun/SketchSort.hpp +138 -0
- data/ext/sketchsortrun/extconf.rb +26 -0
- data/ext/sketchsortrun/sketchsortrun.cpp +56 -0
- data/lib/nysol/mining.rb +24 -0
- metadata +89 -0
data/bin/msm.rb
ADDED
@@ -0,0 +1,172 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: utf-8
|
3
|
+
|
4
|
+
require "rubygems"
|
5
|
+
require "nysol/mcmd"
|
6
|
+
require "json"
|
7
|
+
|
8
|
+
# 1.0: first release: 2015/5/5
|
9
|
+
$version="1.0"
|
10
|
+
$revision="###VERSION###"
|
11
|
+
|
12
|
+
# Prints the usage text to STDERR and terminates the process.
#
# The text lists every parameter the argument parser of this script accepts
# (f=, i=, o=, h=, O=, T=, -debug, -mcmdenv) and marks the mandatory ones
# (f=, i=, o=, h=). Reads the global $version.
def help

STDERR.puts <<EOF
----------------------------
msm.rb version #{$version}
----------------------------
概要) mean shift clustering
特徴) 1) RパッケージLPCMを利用している。
用法) msm.rb f= i= o= h= [O=] [T=] [-debug] [-mcmdenv] [--help]

  f= : i=ファイル上の変数項目名(項目名:出力項目名 の形式で指定)【必須】
  i= : 入力ファイル名【必須】
  o= : 出力ファイル名【必須】
  h= : band width【必須】
  O= : 出力パス(cluster.csv,label.csvを出力)
  -debug : Rの実行結果を表示

  その他
  T= : ワークディレクトリ
  -mcmdenv : 内部のMCMDのコマンドメッセージを表示
  --help : ヘルプの表示

必要なソフトウェア)
  1) R
  2) RのLPCMパッケージ

# Copyright(c) NYSOL 2012- All Rights Reserved.
EOF
  exit
end
|
39
|
+
|
40
|
+
# Reports the script version and revision on STDERR, then exits.
#
# When $revision still holds the unsubstituted placeholder (it contains
# "VERSION"), it is replaced by "0" before being reported.
def ver
  if /VERSION/ =~ $revision
    $revision = "0"
  end
  STDERR.puts("version " + $version.to_s + " revision " + $revision.to_s)
  exit
end
|
45
|
+
|
46
|
+
# Show the usage when called without arguments or with --help,
# and the version when called with --version.
help if ARGV.empty? || ARGV[0] == "--help"
ver if ARGV[0] == "--version"

# First list: every accepted parameter; second list: the mandatory ones.
args = MCMD::Margs.new(ARGV, "f=,h=,i=,o=,O=,-debug,-mcmdenv,T=", "f=,h=,i=,o=")

# Unless -mcmdenv is given, limit mcmd messages to warnings and errors.
unless args.bool("-mcmdenv")
  ENV["KG_VerboseLevel"] = "2"
end

# Check that the R library LPCM can be used; give up otherwise.
exit(1) unless MCMD::chkRexe("LPCM")

# Work file path: drop one trailing slash and export it as KG_TmpPath.
tmp_dir = args.str("T=")
ENV["KG_TmpPath"] = tmp_dir.sub(/\/$/, "") unless tmp_dir.nil?

iFile = args.file("i=", "r")
oFile = args.file("o=", "w")
flds = args.field("f=", iFile)
names = flds["names"].join(",")
newnames = flds["newNames"]
# Every field listed in f= must carry a new name for the output.
raise "#ERROR# f= parameter takes new field names for output." if newnames.include?(nil)

bw = args.float("h=")
oPath = args.file("O=", "w")
$debug = args.bool("-debug")

# Create the optional output directory up front.
MCMD::mkDir(oPath) if oPath
|
76
|
+
|
77
|
+
# Runs mean shift clustering (ms() of the R package LPCM) on a CSV file.
#
# names : variable names (not used inside this method; kept for callers)
# bw    : bandwidth, passed to ms() as h=
# csv   : path of the CSV file R loads with read.csv
# wp    : directory R writes into; it receives "xxcluster" (cluster centers
#         with the column means added back) and "xxlabel" (cluster labels)
#
# The generated R script centers the data on its column means before
# calling ms() and shifts the cluster centers back afterwards.
#
# Returns the result of Kernel#system. R's output is shown only when the
# global $debug is set.
def runR(names,bw,csv,wp)
  wf=MCMD::Mtemp.new
  scp=wf.file

  r_scp = <<EOF
library('LPCM')
d=read.csv("#{csv}")
cm=colMeans(d)
#print(cm)
sftM=function(x){return(x-cm)}
sftP=function(x){return(x+cm)}
dd=t(apply(d,1,sftM))
#print(dd)
model=ms(dd,h=#{bw},plotms=F)

center=t(apply(model$cluster.center,1,sftP))
#print(model$cluster)
#print(center)

#ms.self.coverage(d, taumin=0.02, taumax=0.5, gridsize=25,
#thr=0.0001, scaled=TRUE, cluster=FALSE, plot.type="o",
#or.labels=NULL, print=FALSE)

#print(model)
#write.csv(model$cluster.center,"#{wp}/xxcluster")
write.csv(center,"#{wp}/xxcluster")
write.csv(model$cluster.label ,"#{wp}/xxlabel")

#png("#{wp}/gpr.png")
# plot(model,as="improv")
#dev.off()
EOF

  File.open(scp,"w"){|fpw| fpw.write r_scp}

  # Redirect through Kernel#system options instead of shell syntax: the
  # former "&>/dev/null" is a bash extension, and a POSIX /bin/sh reads it as
  # "run in background" followed by an empty redirection, so the caller would
  # continue before R has written its result files.
  redirect = { :in => scp }
  redirect[[:out, :err]] = File::NULL unless $debug
  system("R", "--vanilla", "-q", redirect)
end
|
117
|
+
|
118
|
+
# cluster.csv
|
119
|
+
# "","V1","V2"
|
120
|
+
# "1",0.107262943725142,0.0329636308034888
|
121
|
+
# "2",-0.655560794404871,-0.448416202492924
|
122
|
+
# "3",-0.218883486000835,0.44341544263141
|
123
|
+
|
124
|
+
# label.csv
|
125
|
+
# "","x"
|
126
|
+
# "1",1
|
127
|
+
# "2",1
|
128
|
+
# "3",1
|
129
|
+
|
130
|
+
# Work files:
#   xxbase  - the f= variables cut out of the input file (input for R)
#   xxwp    - directory that receives R's result files
#   xxcmf   - cluster master file (cluster id and cluster centers)
#   xxlabel - cluster id of every input row
wf=MCMD::Mtemp.new
xxbase =wf.file
xxwp =wf.file
xxcmf =wf.file
xxlabel =wf.file
MCMD::mkDir(xxwp)

system "mcut f=#{names} i=#{iFile} o=#{xxbase}"

runR(names,bw,xxbase,xxwp)

# Map column numbers 1..n of R's cluster file to the new names given in f=.
nn = newnames.each_with_index.map{|name,i| "#{i+1}:#{name}"}

# cluster master file
# "tail -n +2" skips the header line written by R. The former "tail +2" is
# the obsolete spelling, which not every tail implementation accepts.
f=""
f << "tail -n +2 <#{xxwp}/xxcluster |"
f << "mcut f=0:cluster,#{nn.join(",")} -nfni o=#{xxcmf}"
system(f)

# label file
f=""
f << "tail -n +2 <#{xxwp}/xxlabel |"
f << "mcut f=1:cluster -nfni o=#{xxlabel}"
system(f)

# join cmf and label file to ifile
f=""
f << "mpaste m=#{xxlabel} i=#{iFile} |"
f << "mjoin k=cluster m=#{xxcmf} o=#{oFile}"
system(f)

# Optionally keep the intermediate results in the O= directory.
if oPath then
  system "cp #{xxcmf} #{oPath}/cluster.csv"
  system "cp #{xxlabel} #{oPath}/label.csv"
end

# end message
MCMD::endLog(args.cmdline)
|
172
|
+
|
@@ -0,0 +1,161 @@
|
|
1
|
+
/*
|
2
|
+
* Main.cpp
|
3
|
+
* Copyright (c) 2011 Yasuo Tabei All Rights Reserved.
|
4
|
+
*
|
5
|
+
* Permission is hereby granted, free of charge, to any person
|
6
|
+
* obtaining a copy of this software and associated documentation
|
7
|
+
* files (the "Software"), to deal in the Software without
|
8
|
+
* restriction, including without limitation the rights to use,
|
9
|
+
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
 * copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following
|
11
|
+
* conditions:
|
12
|
+
*
|
13
|
+
 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
14
|
+
*
|
15
|
+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
17
|
+
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE and NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
18
|
+
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
19
|
+
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
20
|
+
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
21
|
+
* OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
*/
|
23
|
+
|
24
|
+
#include "SketchSort.hpp"
|
25
|
+
|
26
|
+
#include <iostream>
|
27
|
+
#include <cstdlib>
|
28
|
+
|
29
|
+
/* Globals */
|
30
|
+
void usage();
|
31
|
+
void version();
|
32
|
+
void parse_parameters (int argc, char **argv);
|
33
|
+
|
34
|
+
char *fname, *oname;
|
35
|
+
int hamDist = 1;
|
36
|
+
int numblocks = 4;
|
37
|
+
int numchunks = 3;
|
38
|
+
float cosDist = 0.01;
|
39
|
+
bool autoFlag = false;
|
40
|
+
float missingratio = 0.0001;
|
41
|
+
bool centering = false;
|
42
|
+
int windowsize = 0;
|
43
|
+
int seed = 1;
|
44
|
+
|
45
|
+
|
46
|
+
/*******************************************************************************/
|
47
|
+
#ifndef _NO_MAIN_
|
48
|
+
#define _NO_MAIN_
|
49
|
+
int main(int argc, char **argv)
|
50
|
+
{
|
51
|
+
version();
|
52
|
+
|
53
|
+
parse_parameters(argc, argv);
|
54
|
+
|
55
|
+
SketchSort sketchsort;
|
56
|
+
sketchsort.run(fname, oname, numblocks, hamDist, cosDist, numchunks, autoFlag, missingratio, centering, windowsize, seed);
|
57
|
+
|
58
|
+
return 0;
|
59
|
+
}
|
60
|
+
|
61
|
+
#endif
|
62
|
+
/*******************************************************************************/
|
63
|
+
|
64
|
+
|
65
|
+
int sketchsort_main (int argc, char **argv){
|
66
|
+
|
67
|
+
parse_parameters(argc, argv);
|
68
|
+
|
69
|
+
SketchSort sketchsort;
|
70
|
+
sketchsort.run(fname, oname, numblocks, hamDist, cosDist, numchunks, autoFlag, missingratio, centering, windowsize, seed);
|
71
|
+
|
72
|
+
return 0;
|
73
|
+
}
|
74
|
+
|
75
|
+
|
76
|
+
// Writes the program name, version and author to standard error,
// followed by an empty line.
void version(){
  std::ostream &err = std::cerr;
  err << "SketchSort version 0.0.8" << std::endl;
  err << "Written by Yasuo Tabei" << std::endl;
  err << std::endl;
}
|
80
|
+
|
81
|
+
void usage(){
|
82
|
+
std::cerr << std::endl
|
83
|
+
<< "Usage: sketchsort [OPTION]... INFILE OUTFILE" << std::endl << std::endl
|
84
|
+
<< " where [OPTION]... is a list of zero or more optional arguments" << std::endl
|
85
|
+
<< " INFILE is the name of an input file" << std::endl
|
86
|
+
<< " OUTFILE is the name of an output file" << std::endl << std::endl
|
87
|
+
<< "Additional arguments (input and output files may be specified):" << std::endl
|
88
|
+
<< " -hamdist [maximum hamming distance]" << std::endl
|
89
|
+
<< " (default: " << hamDist << ")" << std::endl
|
90
|
+
<< " -numblocks [the number of blocks]" << std::endl
|
91
|
+
<< " (default: " << numblocks << ")" << std::endl
|
92
|
+
<< " -cosdist [maximum cosine distance]" << std::endl
|
93
|
+
<< " (default: " << cosDist << ")" << std::endl
|
94
|
+
<< " -numchunks [the number of chunks]" << std::endl
|
95
|
+
<< " (default: " << numchunks << ")" << std::endl
|
96
|
+
<< " -auto " << std::endl
|
97
|
+
<< " -missingratio " << std::endl
|
98
|
+
<< " (default: " << missingratio << ")" << std::endl
|
99
|
+
<< " -centering" << std::endl
|
100
|
+
<< " -windowsize" << std::endl
|
101
|
+
<< " (default: " << windowsize << ")" << std::endl
|
102
|
+
<< " -seed " << std::endl
|
103
|
+
<< std::endl;
|
104
|
+
exit(0);
|
105
|
+
}
|
106
|
+
|
107
|
+
void parse_parameters (int argc, char **argv){
|
108
|
+
if (argc == 1) usage();
|
109
|
+
int argno;
|
110
|
+
for (argno = 1; argno < argc; argno++){
|
111
|
+
if (argv[argno][0] == '-'){
|
112
|
+
if (!strcmp (argv[argno], "-version")){
|
113
|
+
version();
|
114
|
+
}
|
115
|
+
else if (!strcmp (argv[argno], "-auto")) {
|
116
|
+
autoFlag = true;
|
117
|
+
}
|
118
|
+
else if (!strcmp (argv[argno], "-centering")) {
|
119
|
+
centering = true;
|
120
|
+
}
|
121
|
+
else if (!strcmp (argv[argno], "-numblocks")) {
|
122
|
+
if (argno == argc - 1) std::cerr << "Must specify minimum support after -numblocks" << std::endl;
|
123
|
+
numblocks = atoi(argv[++argno]);
|
124
|
+
}
|
125
|
+
else if (!strcmp (argv[argno], "-hamdist")) {
|
126
|
+
if (argno == argc - 1) std::cerr << "Must specify hamming distance threshold after -hamdist" << std::endl;
|
127
|
+
hamDist = atoi(argv[++argno]);
|
128
|
+
}
|
129
|
+
else if (!strcmp (argv[argno], "-cosdist")) {
|
130
|
+
if (argno == argc - 1) std::cerr << "Must specify cosine distance threshold size after -cosdist" << std::endl;
|
131
|
+
cosDist = atof(argv[++argno]);
|
132
|
+
}
|
133
|
+
else if (!strcmp (argv[argno], "-numchunks")) {
|
134
|
+
if (argno == argc - 1) std::cerr << "Must specify number of chunks after -numchunks" << std::endl;
|
135
|
+
numchunks = atoi(argv[++argno]);
|
136
|
+
}
|
137
|
+
else if (!strcmp (argv[argno], "-missingratio")) {
|
138
|
+
if (argno == argc - 1) std::cerr << "Must specify missing edge ratio after -missingratio" << std::endl;
|
139
|
+
missingratio = atof(argv[++argno]);
|
140
|
+
}
|
141
|
+
else if (!strcmp (argv[argno], "-seed")) {
|
142
|
+
if (argno == argc - 1) std::cerr << "Must specify initial seed after -seed" << std::endl;
|
143
|
+
seed = atoi(argv[++argno]);
|
144
|
+
}
|
145
|
+
else if (!strcmp (argv[argno], "-windowsize")) {
|
146
|
+
if (argno == argc - 1) std::cerr << "Must specify windowsize after -windowsize" << std::endl;
|
147
|
+
windowsize = atoi(argv[++argno]);
|
148
|
+
}
|
149
|
+
else {
|
150
|
+
usage();
|
151
|
+
}
|
152
|
+
} else {
|
153
|
+
break;
|
154
|
+
}
|
155
|
+
}
|
156
|
+
if (argno > argc)
|
157
|
+
usage();
|
158
|
+
|
159
|
+
fname = argv[argno];
|
160
|
+
oname = argv[argno + 1];
|
161
|
+
}
|
@@ -0,0 +1,24 @@
|
|
1
|
+
/*
|
2
|
+
 * Main.hpp
|
3
|
+
* Copyright (c) 2011 Yasuo Tabei All Rights Reserved.
|
4
|
+
*
|
5
|
+
* Permission is hereby granted, free of charge, to any person
|
6
|
+
* obtaining a copy of this software and associated documentation
|
7
|
+
* files (the "Software"), to deal in the Software without
|
8
|
+
* restriction, including without limitation the rights to use,
|
9
|
+
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
 * copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following
|
11
|
+
* conditions:
|
12
|
+
*
|
13
|
+
 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
14
|
+
*
|
15
|
+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
17
|
+
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE and NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
18
|
+
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
19
|
+
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
20
|
+
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
21
|
+
* OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
*/
|
23
|
+
|
24
|
+
int sketchsort_main (int argc, char **argv);
|
@@ -0,0 +1,526 @@
|
|
1
|
+
/*
|
2
|
+
* SketchSort.cpp
|
3
|
+
* Copyright (c) 2011 Yasuo Tabei All Rights Reserved.
|
4
|
+
*
|
5
|
+
* Permission is hereby granted, free of charge, to any person
|
6
|
+
* obtaining a copy of this software and associated documentation
|
7
|
+
* files (the "Software"), to deal in the Software without
|
8
|
+
* restriction, including without limitation the rights to use,
|
9
|
+
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
 * copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following
|
11
|
+
* conditions:
|
12
|
+
*
|
13
|
+
 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
14
|
+
*
|
15
|
+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
17
|
+
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE and NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
18
|
+
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
19
|
+
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
20
|
+
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
21
|
+
* OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
*/
|
23
|
+
|
24
|
+
#include "SketchSort.hpp"
|
25
|
+
|
26
|
+
// Maps a value to one signature bit: 1 when it is strictly positive,
// 0 otherwise (zero included).
template<class T>
inline uint8_t sign(T val) {
  return (val > 0) ? 1 : 0;
}
|
32
|
+
|
33
|
+
// Returns the larger of two values; the second one when they compare equal.
template<class T>
inline T max(T a1, T a2) {
  return (a1 > a2) ? a1 : a2;
}
|
39
|
+
|
40
|
+
// Strict ordering of (int, float) pairs by their float component only.
bool cmp(const std::pair<int, float> &p1, const std::pair<int, float> &p2) {
  const float lhs = p1.second;
  const float rhs = p2.second;
  return lhs < rhs;
}
|
43
|
+
|
44
|
+
void SketchSort::readFeature(const char *fname, unsigned int _windowsize) {
|
45
|
+
std::ifstream ifs(fname);
|
46
|
+
|
47
|
+
if (!ifs) {
|
48
|
+
std::cerr << "can not open " << fname << std::endl;
|
49
|
+
exit(0);
|
50
|
+
}
|
51
|
+
|
52
|
+
dim = 0;
|
53
|
+
float val = 0.f;
|
54
|
+
uint64_t lineCnt = 0;
|
55
|
+
std::string line;
|
56
|
+
while (std::getline(ifs, line)) {
|
57
|
+
fvs.resize(fvs.size() + 1);
|
58
|
+
tws.resize(fvs.size() + 1);
|
59
|
+
boost::numeric::ublas::vector<float> &fv = fvs[fvs.size() - 1];
|
60
|
+
uint32_t counter = 0;
|
61
|
+
std::istringstream is(line);
|
62
|
+
if (_windowsize > 0){
|
63
|
+
is >> val;
|
64
|
+
tws[lineCnt++] = val;
|
65
|
+
}
|
66
|
+
if (dim != 0) {
|
67
|
+
fv.resize(dim);
|
68
|
+
while (is >> val) {
|
69
|
+
fv[counter++]= val;
|
70
|
+
}
|
71
|
+
if (counter != dim) {
|
72
|
+
std::cerr << "dimesions of the input vector should be same!" << std::endl;
|
73
|
+
std::cerr << line << std::endl;
|
74
|
+
std::cerr << "dim:" << dim << " dim:" << counter << std::endl;
|
75
|
+
exit(1);
|
76
|
+
}
|
77
|
+
} else {
|
78
|
+
while (is >> val) {
|
79
|
+
fv.resize(counter + 1);
|
80
|
+
fv[counter] = val;
|
81
|
+
counter++;
|
82
|
+
}
|
83
|
+
dim = counter;
|
84
|
+
}
|
85
|
+
}
|
86
|
+
}
|
87
|
+
|
88
|
+
void SketchSort::centeringData() {
|
89
|
+
size_t dim = fvs[0].size();
|
90
|
+
size_t numData = fvs.size();
|
91
|
+
float mean;
|
92
|
+
for (size_t i = 0; i < dim; i++) {
|
93
|
+
mean = 0.f;
|
94
|
+
for (size_t j = 0; j < numData; j++) {
|
95
|
+
mean += fvs[j][i];
|
96
|
+
}
|
97
|
+
mean /= (float)numData;
|
98
|
+
for (size_t j = 0; j < numData; j++) {
|
99
|
+
fvs[j][i] -= mean;
|
100
|
+
}
|
101
|
+
}
|
102
|
+
}
|
103
|
+
|
104
|
+
/* sparse random projection
|
105
|
+
int SketchSort::projectVectors(unsigned int projectDim, std::vector<uint8_t*> &sig, params ¶m) {
|
106
|
+
|
107
|
+
p = new boost::pool<>(sizeof(uint8_t));
|
108
|
+
sig.resize(fvs.size());
|
109
|
+
param.ids.resize(fvs.size());
|
110
|
+
for (size_t i = 0; i < sig.size(); i++) {
|
111
|
+
// sig[i] = new uint32_t[projectDim + 1];
|
112
|
+
sig[i] = (uint8_t*)p->ordered_malloc(projectDim + 1);
|
113
|
+
param.ids[i] = i;
|
114
|
+
}
|
115
|
+
|
116
|
+
boost::mt19937 gen(static_cast<unsigned long>(time(0)));
|
117
|
+
boost::uniform_real<> dst(0.f, 1.f);
|
118
|
+
boost::variate_generator<boost::mt19937&, boost::uniform_real<> > rand(gen, dst);
|
119
|
+
// double tiny = 1.0/1.79e+308;
|
120
|
+
std::vector<std::pair<int, float> > randMat;
|
121
|
+
float s = sqrt(float(dim));
|
122
|
+
// float s = dim/log(dim);
|
123
|
+
float thr = 1.f/(2*s);
|
124
|
+
float coff = sqrt(s);
|
125
|
+
for (size_t i = 0; i < projectDim; i++) {
|
126
|
+
randMat.clear();
|
127
|
+
for (size_t j = 0; j < dim; j++) {
|
128
|
+
float r = rand();
|
129
|
+
if (r < thr) {
|
130
|
+
randMat.push_back(std::make_pair(j, coff));
|
131
|
+
} else if (r < 2*thr) {
|
132
|
+
randMat.push_back(std::make_pair(j, -coff));
|
133
|
+
}
|
134
|
+
}
|
135
|
+
|
136
|
+
for (size_t j = 0; j < fvs.size(); j++) {
|
137
|
+
boost::numeric::ublas::vector<float> &fv = fvs[j];
|
138
|
+
double proc = 0.f;
|
139
|
+
for (size_t k = 0; k < randMat.size(); k++) {
|
140
|
+
proc += fv[randMat[k].first] * randMat[k].second;
|
141
|
+
}
|
142
|
+
sig[j][i+1] = sign(proc);
|
143
|
+
}
|
144
|
+
}
|
145
|
+
param.seq_len = projectDim;
|
146
|
+
param.num_seq = fvs.size();
|
147
|
+
|
148
|
+
return 1;
|
149
|
+
}
|
150
|
+
*/
|
151
|
+
|
152
|
+
int SketchSort::projectVectors(unsigned int projectDim, std::vector<uint8_t*> &sig, unsigned int _seed, params ¶m) {
|
153
|
+
std::vector<float> randMat;
|
154
|
+
p = new boost::pool<>(sizeof(uint8_t));
|
155
|
+
sig.resize(fvs.size());
|
156
|
+
param.ids.resize(fvs.size());
|
157
|
+
for (size_t i = 0; i < sig.size(); i++) {
|
158
|
+
// sig[i] = new uint32_t[projectDim + 1];
|
159
|
+
sig[i] = (uint8_t*)p->ordered_malloc(projectDim + 1);
|
160
|
+
param.ids[i] = i;
|
161
|
+
}
|
162
|
+
boost::mt19937 gen(static_cast<unsigned long>(_seed));
|
163
|
+
//boost::mt19937 gen(static_cast<unsigned long>(time(0)));
|
164
|
+
boost::normal_distribution<> dst(0.f, 1.f);
|
165
|
+
boost::variate_generator<boost::mt19937&, boost::normal_distribution<> > rand(gen, dst);
|
166
|
+
|
167
|
+
// double tiny = 1.0/1.79e+308;
|
168
|
+
randMat.resize(dim + 1);
|
169
|
+
for (size_t i = 0; i < projectDim; i++) {
|
170
|
+
for (size_t j = 0; j <= dim; j++) {
|
171
|
+
randMat[j] = rand();
|
172
|
+
}
|
173
|
+
|
174
|
+
for (size_t j = 0; j < fvs.size(); j++) {
|
175
|
+
boost::numeric::ublas::vector<float> &fv = fvs[j];
|
176
|
+
double proc = 0.f;
|
177
|
+
for (size_t k = 0; k < fv.size(); k++)
|
178
|
+
proc += fv[k] * randMat[k];
|
179
|
+
|
180
|
+
sig[j][i+1] = sign(proc);
|
181
|
+
}
|
182
|
+
}
|
183
|
+
param.seq_len = projectDim;
|
184
|
+
param.num_seq = fvs.size();
|
185
|
+
|
186
|
+
return 1;
|
187
|
+
}
|
188
|
+
|
189
|
+
inline float SketchSort::checkCos(unsigned int id1, unsigned int id2) {
|
190
|
+
++numCosDist;
|
191
|
+
boost::numeric::ublas::vector<float> &fv_1 = fvs[id1];
|
192
|
+
boost::numeric::ublas::vector<float> &fv_2 = fvs[id2];
|
193
|
+
float sum = boost::numeric::ublas::inner_prod(fv_1, fv_2);
|
194
|
+
|
195
|
+
return (1.f - sum*(norms[id1]*norms[id2]));
|
196
|
+
}
|
197
|
+
|
198
|
+
inline void SketchSort::sort(std::vector<uint8_t*> &sig, int spos, int epos, int l, int r, params ¶m) {
|
199
|
+
if (r - l + 1 > 50) radixsort(sig, spos, epos, l, r, param);
|
200
|
+
else insertionSort(sig, spos, epos, l, r, param);
|
201
|
+
}
|
202
|
+
|
203
|
+
inline void SketchSort::radixsort(std::vector<uint8_t*> &sig, int spos, int epos, int l, int r, params ¶m) {
|
204
|
+
unsigned int *c = param.counter;
|
205
|
+
std::vector<unsigned int> &ids = param.ids;
|
206
|
+
std::vector<uint8_t*> newsig(r - l + 1);
|
207
|
+
std::vector<unsigned int> newids(r - l + 1);
|
208
|
+
unsigned int tmp;
|
209
|
+
int tpos = spos - 1;
|
210
|
+
while (++tpos <= epos) {
|
211
|
+
for (int i = 0; i < num_char; i++) *(c + i) = 0;
|
212
|
+
for (int i = l; i <= r; i++) c[sig[i][tpos]]++;
|
213
|
+
for (int i = 1; i < num_char; i++) *(c + i) += *(c + i - 1);
|
214
|
+
for (int i = r; i >= l; --i) {
|
215
|
+
tmp = --c[sig[i][tpos]] + l;
|
216
|
+
newids[tmp - l] = ids[i];
|
217
|
+
newsig[tmp - l] = sig[i];
|
218
|
+
}
|
219
|
+
if (++tpos <= epos) {
|
220
|
+
for (int i = 0; i < num_char; i++) *(c + i) = 0;
|
221
|
+
for (int i = l; i <= r; i++) c[newsig[i - l][tpos]]++;
|
222
|
+
for (int i = 1; i < num_char; i++) *(c + i) += *(c + i - 1);
|
223
|
+
for (int i = r; i >= l; --i) {
|
224
|
+
tmp = --c[newsig[i - l][tpos]] + l;
|
225
|
+
ids[tmp] = newids[i - l];
|
226
|
+
sig[tmp] = newsig[i - l];
|
227
|
+
}
|
228
|
+
}
|
229
|
+
else {
|
230
|
+
for (int i = l; i <= r; i++) {
|
231
|
+
ids[i] = newids[i - l];
|
232
|
+
sig[i] = newsig[i - l];
|
233
|
+
}
|
234
|
+
return;
|
235
|
+
}
|
236
|
+
}
|
237
|
+
}
|
238
|
+
|
239
|
+
inline void SketchSort::insertionSort(std::vector<uint8_t*> &sig, int spos, int epos, int l, int r, params ¶m) {
|
240
|
+
int i, j;
|
241
|
+
uint8_t *pivot, pval;
|
242
|
+
unsigned int pid;
|
243
|
+
std::vector<unsigned int> &ids = param.ids;
|
244
|
+
for (int tpos = spos; tpos <= epos; tpos++) {
|
245
|
+
for (i = l + 1; i <= r; i++) {
|
246
|
+
pivot = sig[i]; pval = sig[i][tpos]; pid = ids[i];
|
247
|
+
for (j = i; j > l && sig[j-1][tpos] > pval; j--) {
|
248
|
+
sig[j] = sig[j-1];
|
249
|
+
ids[j] = ids[j-1];
|
250
|
+
}
|
251
|
+
sig[j] = pivot;
|
252
|
+
ids[j] = pid;
|
253
|
+
}
|
254
|
+
}
|
255
|
+
}
|
256
|
+
|
257
|
+
inline void SketchSort::classify(std::vector<uint8_t*> &sig, int spos, int epos, int l, int r, int bpos, params ¶m, unsigned int _windowsize) {
|
258
|
+
int n_l = l, n_r = r;
|
259
|
+
for (int iter = l + 1; iter <= r; iter++) {
|
260
|
+
if (!std::equal(sig[n_l] + spos, sig[n_l] + epos + 1, sig[iter] + spos)) {
|
261
|
+
n_r = iter - 1;
|
262
|
+
if (n_r - n_l >= 1)
|
263
|
+
multi_classification(sig, bpos + 1, n_l, n_r, param, _windowsize);
|
264
|
+
n_l = iter;
|
265
|
+
}
|
266
|
+
}
|
267
|
+
if (r - n_l >= 1)
|
268
|
+
multi_classification(sig, bpos + 1, n_l, r, param, _windowsize);
|
269
|
+
}
|
270
|
+
|
271
|
+
inline bool SketchSort::calc_chunk_hamdist(uint8_t *seq1, uint8_t *seq2, const params ¶m) {
|
272
|
+
++numHamDist;
|
273
|
+
unsigned int d = 0;
|
274
|
+
for (size_t i = 1; i <= param.chunk_len; i++)
|
275
|
+
if (*seq1++ != *seq2++ && ++d > param.chunk_dist) return false;
|
276
|
+
return true;
|
277
|
+
}
|
278
|
+
|
279
|
+
inline bool SketchSort::check_chunk_canonical(uint8_t *seq1, uint8_t *seq2, const params ¶m) {
|
280
|
+
unsigned int d = 0;
|
281
|
+
int end = param.pchunks[param.cchunk].start - 1;
|
282
|
+
int j = 1;
|
283
|
+
int tend = param.pchunks[j].end;
|
284
|
+
int i = 0;
|
285
|
+
|
286
|
+
while (++i <= end) {
|
287
|
+
if ((d += abs(seq1[i] - seq2[i])) > param.chunk_dist) {
|
288
|
+
while (++i <= tend) d += abs(seq1[i] - seq2[i]);
|
289
|
+
// if (seq1[i] != seq2[i]) ++d;
|
290
|
+
d = 0;
|
291
|
+
tend = param.pchunks[++j].end;
|
292
|
+
i = param.pchunks[j].start - 1;
|
293
|
+
continue;
|
294
|
+
}
|
295
|
+
if (tend == i)
|
296
|
+
return false;
|
297
|
+
}
|
298
|
+
return true;
|
299
|
+
}
|
300
|
+
|
301
|
+
inline bool SketchSort::check_canonical(uint8_t *seq1, uint8_t *seq2, const params ¶m) {
|
302
|
+
size_t sb = 1, eb = 1;
|
303
|
+
size_t b;
|
304
|
+
for (size_t i = 0, size = param.blocks.size(); i < size; i++) {
|
305
|
+
eb = param.blocks[i];
|
306
|
+
for (b = sb; b < eb; b++) {
|
307
|
+
if (std::equal(seq1 + param.pos[b].start, seq1 + param.pos[b].end + 1, seq2 + param.pos[b].start))
|
308
|
+
return false;
|
309
|
+
}
|
310
|
+
sb = param.blocks[i] + 1;
|
311
|
+
}
|
312
|
+
return true;
|
313
|
+
}
|
314
|
+
|
315
|
+
inline void SketchSort::report(std::vector<uint8_t*> &sig, int l, int r, params ¶m, unsigned int _windowsize) {
|
316
|
+
// std::cout << "report" << std::endl;
|
317
|
+
float cosDist;
|
318
|
+
for (int i = l; i < r; i++) {
|
319
|
+
for (int j = i + 1; j <= r; j++) {
|
320
|
+
unsigned int span = abs(tws[param.ids[j]]-tws[param.ids[i]]);
|
321
|
+
if (_windowsize != 0 && ( span > _windowsize || span == 0 )){
|
322
|
+
//if (_windowsize != 0 && span > _windowsize)
|
323
|
+
continue;
|
324
|
+
}
|
325
|
+
if (check_canonical(sig[i], sig[j], param) &&
|
326
|
+
calc_chunk_hamdist(sig[i] + param.start_chunk, sig[j] + param.start_chunk, param) &&
|
327
|
+
check_chunk_canonical(sig[i], sig[j], param) &&
|
328
|
+
((cosDist = checkCos(param.ids[i], param.ids[j])) <= param.cosDist)) {
|
329
|
+
(*param.os) << param.ids[i] << " " << param.ids[j] << " " << cosDist << std::endl;
|
330
|
+
}
|
331
|
+
}
|
332
|
+
}
|
333
|
+
}
|
334
|
+
|
335
|
+
void SketchSort::multi_classification(std::vector<uint8_t*> &sig, int maxind, int l, int r, params ¶m, unsigned int _windowsize) {
|
336
|
+
|
337
|
+
if (param.blocks.size() == param.numblocks - param.chunk_dist) {
|
338
|
+
report(sig, l, r, param, _windowsize);
|
339
|
+
return;
|
340
|
+
}
|
341
|
+
|
342
|
+
for (int bpos = maxind; bpos <= (int)param.numblocks; bpos++) {
|
343
|
+
|
344
|
+
if (param.blocks.size() + (param.numblocks - bpos + 1) < param.numblocks - param.chunk_dist) { // pruning
|
345
|
+
// std::cerr << "return " << std::endl;
|
346
|
+
return;
|
347
|
+
}
|
348
|
+
param.blocks.push_back(bpos);
|
349
|
+
sort(sig, param.pos[bpos].start, param.pos[bpos].end, l, r, param);
|
350
|
+
classify(sig, param.pos[bpos].start, param.pos[bpos].end, l, r, bpos, param, _windowsize);
|
351
|
+
param.blocks.pop_back();
|
352
|
+
}
|
353
|
+
}
|
354
|
+
|
355
|
+
// Binomial coefficient "n choose m" as a double.
//
// Computed with the multiplicative formula, one factor (n - i) / (i + 1) per
// step, carried out in floating point. The former (n-i)/(m-i) was evaluated
// as an integer division and truncated, e.g. C(5,2) came out as 8.
// Returns 1 when m is 0 or negative.
double combination(int n, int m) {
  double result = 1.0;
  for (int i = 0; i < m; i++) {
    // After this step result equals C(n, i + 1).
    result = result * (n - i) / (i + 1);
  }
  return result;
}
|
362
|
+
|
363
|
+
double SketchSort::calcMissingEdgeRatio(params ¶m) {
|
364
|
+
double sum = 0.f;
|
365
|
+
double prob = acos(1.0 - param.cosDist)/M_PI;
|
366
|
+
for (unsigned int k = 0; k <= param.chunk_dist; k++) {
|
367
|
+
sum += (combination(param.projectDim, k) * pow(prob, k) * pow(1 - prob, param.projectDim - k));
|
368
|
+
}
|
369
|
+
return pow(1.0 - sum, param.numchunks);
|
370
|
+
}
|
371
|
+
|
372
|
+
void SketchSort::preComputeNorms() {
|
373
|
+
norms.resize(fvs.size());
|
374
|
+
float sum;
|
375
|
+
for (size_t i = 0; i < fvs.size(); i++) {
|
376
|
+
boost::numeric::ublas::vector<float> &fv = fvs[i];
|
377
|
+
sum = 0.f;
|
378
|
+
for (size_t j = 0; j < fv.size(); j++) {
|
379
|
+
sum += pow(fv[j], 2);
|
380
|
+
}
|
381
|
+
norms[i] = 1.f/sqrt(sum);
|
382
|
+
}
|
383
|
+
}
|
384
|
+
|
385
|
+
void SketchSort::decideParameters(float _missingratio, params ¶m) {
|
386
|
+
unsigned int hamDist = 1;
|
387
|
+
unsigned int numBlocks = hamDist + 3;
|
388
|
+
unsigned int numchunks = 0;
|
389
|
+
|
390
|
+
do {
|
391
|
+
if (numchunks > 30) {
|
392
|
+
hamDist += 1;
|
393
|
+
numBlocks = hamDist + 3;
|
394
|
+
numchunks = 0;
|
395
|
+
}
|
396
|
+
numchunks += 1;
|
397
|
+
param.chunk_dist = hamDist;
|
398
|
+
param.numblocks = numBlocks;
|
399
|
+
param.numchunks = numchunks;
|
400
|
+
} while (calcMissingEdgeRatio(param) >= _missingratio);
|
401
|
+
}
|
402
|
+
|
403
|
+
void SketchSort::run(const char *fname, const char *oname,
|
404
|
+
unsigned int _numblocks,
|
405
|
+
unsigned int _dist,
|
406
|
+
float _cosDist,
|
407
|
+
unsigned int _numchunks,
|
408
|
+
bool _autoFlag,
|
409
|
+
float _missingratio,
|
410
|
+
bool _centering,
|
411
|
+
unsigned int _windowsize,
|
412
|
+
unsigned int _seed)
|
413
|
+
{
|
414
|
+
params param;
|
415
|
+
param.numblocks = _numblocks;
|
416
|
+
param.numchunks = _numchunks;
|
417
|
+
param.chunk_dist = _dist;
|
418
|
+
param.cosDist = _cosDist;
|
419
|
+
num_char = 2;
|
420
|
+
param.projectDim = 32;
|
421
|
+
|
422
|
+
numSort = 0;
|
423
|
+
numCosDist = 0;
|
424
|
+
numHamDist = 0;
|
425
|
+
|
426
|
+
if (_autoFlag) {
|
427
|
+
// std::cerr << "deciding parameters such that the missing edge ratio is no more than " << _missingratio << std::endl;
|
428
|
+
decideParameters(_missingratio, param);
|
429
|
+
// std::cout << "decided parameters:" << std::endl;
|
430
|
+
// std::cout << "hamming distance threshold: " << param.chunk_dist << std::endl;
|
431
|
+
// std::cout << "number of blocks: " << param.numblocks << std::endl;
|
432
|
+
// std::cout << "number of chunks: " << param.numchunks << std::endl;
|
433
|
+
// std::cout << std::endl;
|
434
|
+
}
|
435
|
+
|
436
|
+
std::ofstream ofs(oname);
|
437
|
+
param.os = &ofs;
|
438
|
+
|
439
|
+
//std::cout << "missing edge ratio:" << calcMissingEdgeRatio(param) << std::endl;
|
440
|
+
|
441
|
+
//std::cerr << "start reading" << std::endl;
|
442
|
+
double readstart = clock();
|
443
|
+
readFeature(fname,_windowsize);
|
444
|
+
double readend = clock();
|
445
|
+
//std::cerr << "end reading" << std::endl;
|
446
|
+
//std::cout << "readtime:" << (readend - readstart)/(double)CLOCKS_PER_SEC << std::endl;
|
447
|
+
|
448
|
+
if (_centering) {
|
449
|
+
//std::cerr << "start making input-data centered at 0" << std::endl;
|
450
|
+
double centeringstart = clock();
|
451
|
+
centeringData();
|
452
|
+
double centeringend = clock();
|
453
|
+
//std::cerr << "end making input-data centered at 0" << std::endl;
|
454
|
+
//std::cout << "centering time:" << (centeringend - centeringstart)/(double)CLOCKS_PER_SEC << std::endl;
|
455
|
+
|
456
|
+
}
|
457
|
+
|
458
|
+
|
459
|
+
double totalstart = clock();
|
460
|
+
preComputeNorms();
|
461
|
+
//param.projectDim = 2*(int)log(dim);
|
462
|
+
|
463
|
+
param.counter = new unsigned int[num_char];
|
464
|
+
|
465
|
+
//std::cout << "number of data:" << fvs.size() << std::endl;
|
466
|
+
//std::cout << "data dimension:" << dim << std::endl;
|
467
|
+
//std::cout << "projected dimension:" << param.projectDim << std::endl;
|
468
|
+
//std::cout << "length of strings:" << param.projectDim * param.numchunks << std::endl;
|
469
|
+
//std::cout << "number of chunks:" << param.numchunks << std::endl;
|
470
|
+
|
471
|
+
double projectstart = clock();
|
472
|
+
//std::cerr << "start projection" << std::endl;
|
473
|
+
std::vector<uint8_t*> sig;
|
474
|
+
projectVectors(param.projectDim * param.numchunks, sig, _seed, param);
|
475
|
+
//read(fname, sig, param);
|
476
|
+
//std::cerr << "end projection" << std::endl;
|
477
|
+
double projectend = clock();
|
478
|
+
//std::cout << "projecttime:" << (projectend - projectstart)/(double)CLOCKS_PER_SEC << std::endl;
|
479
|
+
|
480
|
+
param.pchunks = new pstat[param.numchunks + 1];
|
481
|
+
for (int i = 1; i <= (int)param.numchunks; i++) {
|
482
|
+
param.pchunks[i].start = (int)ceil((double)param.seq_len*((double)(i - 1)/(double)param.numchunks)) + 1;
|
483
|
+
param.pchunks[i].end = (int)ceil((double)param.seq_len*(double)i/(double)param.numchunks);
|
484
|
+
}
|
485
|
+
|
486
|
+
double msmtime = 0.0;
|
487
|
+
|
488
|
+
|
489
|
+
//std::cerr << "chunk distance:" << param.chunk_dist << std::endl;
|
490
|
+
//std::cerr << "the number of blocks:" << param.numblocks << std::endl;
|
491
|
+
param.pos = new pstat[param.numblocks + 1];
|
492
|
+
for (int i = 1; i <= (int) param.numchunks; i++) {
|
493
|
+
param.chunk_len = param.pchunks[i].end - param.pchunks[i].start + 1;
|
494
|
+
param.start_chunk = param.pchunks[i].start;
|
495
|
+
param.end_chunk = param.pchunks[i].end;
|
496
|
+
param.cchunk = i;
|
497
|
+
for (int j = 1; j <= (int)param.numblocks; j++) {
|
498
|
+
param.pos[j].start = (int)ceil((double)param.chunk_len*((double)(j - 1)/(double)param.numblocks)) + param.pchunks[i].start;
|
499
|
+
param.pos[j].end = (int)ceil((double)param.chunk_len*(double)j/(double)param.numblocks) + param.pchunks[i].start - 1;
|
500
|
+
}
|
501
|
+
//std::cerr << "start enumeration chunk no " << i << std::endl;
|
502
|
+
double msmstart = clock();
|
503
|
+
//std::cout << "sig=" << sig << std::endl;
|
504
|
+
//std::cout << "param.num_seq=" << param.num_seq << std::endl;
|
505
|
+
//std::cout << "param=" << param << std::endl;
|
506
|
+
multi_classification(sig, 1, 0, param.num_seq - 1, param, _windowsize);
|
507
|
+
double msmend = clock();
|
508
|
+
msmtime += (msmend - msmstart)/(double)CLOCKS_PER_SEC;
|
509
|
+
}
|
510
|
+
//std::cout << "msmtime:" << msmtime << std::endl;
|
511
|
+
|
512
|
+
double totalend = clock();
|
513
|
+
//std::cout << "cputime:" << (totalend - totalstart)/(double)CLOCKS_PER_SEC << std::endl;
|
514
|
+
|
515
|
+
//std::cout << "numSort:" << combination(param.numblocks, param.chunk_dist) * param.numchunks << std::endl;
|
516
|
+
//std::cout << "numHamDist:" << numHamDist << std::endl;
|
517
|
+
//std::cout << "numCosDist:" << numCosDist << std::endl;
|
518
|
+
ofs.close();
|
519
|
+
// destructor
|
520
|
+
delete p;
|
521
|
+
delete[] param.counter;
|
522
|
+
delete[] param.pchunks;
|
523
|
+
delete[] param.pos;
|
524
|
+
|
525
|
+
return;
|
526
|
+
}
|