nysol-mining 3.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,172 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: utf-8
3
+
4
+ require "rubygems"
5
+ require "nysol/mcmd"
6
+ require "json"
7
+
8
+ # 1.0: first release: 2015/5/5
9
+ $version="1.0"
10
+ $revision="###VERSION###"
11
+
12
# Prints the usage text of msm.rb to STDERR and terminates the script.
#
# The text lists every parameter the script accepts: f=, i=, o= and h= are
# mandatory (they are in the mandatory list handed to MCMD::Margs), while
# O=, T= and -debug are optional.
def help
  STDERR.puts <<EOF
----------------------------
msm.rb version #{$version}
----------------------------
概要) mean shift clustering
特徴) 1) RパッケージLPCMを利用している。
用法) msm.rb f= i= o= h= [O=] [T=] [-debug] [--help]

  f=     : i=ファイル上の変数項目名(項目名:新項目名の形式で指定)【必須】
  i=     : 入力ファイル名【必須】
  o=     : 出力ファイル名【必須】
  h=     : band width【必須】
  O=     : 出力パス(cluster.csv,label.csvを出力)【任意】
  T=     : ワークファイルパス【任意】
  -debug : Rの実行結果を表示

  その他
  --help    : ヘルプの表示
  --version : バージョンの表示

必要なソフトウェア)
  1) R
  2) RのLPCMパッケージ

# Copyright(c) NYSOL 2012- All Rights Reserved.
EOF
  exit
end
39
+
40
# Prints the script version and revision to STDERR and terminates.
# While the revision still holds the unexpanded build placeholder
# (anything containing "VERSION") it is reported as "0".
def ver
  placeholder = /VERSION/ =~ $revision
  $revision = "0" if placeholder
  STDERR.puts "version #{$version} revision #{$revision}"
  exit
end
45
+
46
# Usage / version requests are answered before any argument parsing.
help if ARGV.empty? || ARGV[0] == "--help"
ver if ARGV[0] == "--version"

# Accepted parameters first, mandatory parameters second.
args = MCMD::Margs.new(ARGV, "f=,h=,i=,o=,O=,-debug,-mcmdenv,T=", "f=,h=,i=,o=")

# Restrict mcmd messages to warnings and errors unless -mcmdenv is given.
ENV["KG_VerboseLevel"] = "2" unless args.bool("-mcmdenv")

# Make sure R and its LPCM package can be executed.
exit(1) unless MCMD::chkRexe("LPCM")

# Work file path (a trailing slash is dropped).
tmp_path = args.str("T=")
ENV["KG_TmpPath"] = tmp_path.sub(/\/$/, "") unless tmp_path.nil?

iFile    = args.file("i=", "r")
oFile    = args.file("o=", "w")
flds     = args.field("f=", iFile)
names    = flds["names"].join(",")
newnames = flds["newNames"]
# Every field given in f= needs a new name for the output.
raise "#ERROR# f= parameter takes new field names for output." if newnames.include?(nil)

bw     = args.float("h=")
oPath  = args.file("O=", "w")
$debug = args.bool("-debug")

MCMD::mkDir(oPath) if oPath
76
+
77
# Runs the LPCM mean-shift clustering in R.
#
# names : field names selected by f= (not used inside this method)
# bw    : bandwidth handed to LPCM's ms() as h=
# csv   : path of the CSV file R reads the data from
# wp    : existing work directory; R writes "xxcluster" (cluster centers,
#         shifted back by the column means) and "xxlabel" (one cluster label
#         per input row) into it
#
# Returns the result of Kernel#system: true when R exited with status 0,
# false on a non-zero status, nil when R could not be started.
def runR(names,bw,csv,wp)
  wf=MCMD::Mtemp.new
  scp=wf.file

  r_scp = <<EOF
library('LPCM')
d=read.csv("#{csv}")
cm=colMeans(d)
#print(cm)
sftM=function(x){return(x-cm)}
sftP=function(x){return(x+cm)}
dd=t(apply(d,1,sftM))
#print(dd)
model=ms(dd,h=#{bw},plotms=F)

center=t(apply(model$cluster.center,1,sftP))
#print(model$cluster)
#print(center)

#ms.self.coverage(d, taumin=0.02, taumax=0.5, gridsize=25,
#thr=0.0001, scaled=TRUE, cluster=FALSE, plot.type="o",
#or.labels=NULL, print=FALSE)

#print(model)
#write.csv(model$cluster.center,"#{wp}/xxcluster")
write.csv(center,"#{wp}/xxcluster")
write.csv(model$cluster.label ,"#{wp}/xxlabel")

#png("#{wp}/gpr.png")
# plot(model,as="improv")
#dev.off()
EOF

  File.open(scp,"w"){|fpw| fpw.write r_scp}

  # R is started without a shell. The former "&>/dev/null" is a bash-only
  # redirection; a POSIX /bin/sh parses it as "run in background", so the
  # caller could continue before R had written its result files.
  redirect = {:in => scp}
  unless $debug
    # Without -debug everything R prints is discarded.
    redirect[:out] = File::NULL
    redirect[:err] = File::NULL
  end
  system("R", "--vanilla", "-q", redirect)
end
117
+
118
+ # cluster.csv
119
+ # "","V1","V2"
120
+ # "1",0.107262943725142,0.0329636308034888
121
+ # "2",-0.655560794404871,-0.448416202492924
122
+ # "3",-0.218883486000835,0.44341544263141
123
+
124
+ # label.csv
125
+ # "","x"
126
+ # "1",1
127
+ # "2",1
128
+ # "3",1
129
+
130
# Runs one shell command line and aborts the script when it fails, so that a
# broken step is reported instead of silently producing incomplete output.
run_cmd = lambda do |cmd|
  system(cmd) or raise "#ERROR# command failed: #{cmd}"
end

wf=MCMD::Mtemp.new
xxbase  = wf.file   # input reduced to the fields given in f=
xxwp    = wf.file   # work directory R writes its results into
xxcmf   = wf.file   # cluster master: cluster number + center coordinates
xxlabel = wf.file   # cluster label of every input row
MCMD::mkDir(xxwp)

run_cmd.call("mcut f=#{names} i=#{iFile} o=#{xxbase}")

runR(names,bw,xxbase,xxwp) or raise "#ERROR# execution of R (LPCM) failed."

# "<column number>:<new field name>" for every clustered variable
nn = newnames.each_with_index.map { |name, idx| "#{idx + 1}:#{name}" }

# cluster master file
# "tail -n +2" skips the header line written by write.csv; the historical
# "tail +2" spelling is not accepted by every tail implementation.
run_cmd.call("tail -n +2 <#{xxwp}/xxcluster |" \
             "mcut f=0:cluster,#{nn.join(",")} -nfni o=#{xxcmf}")

# label file
run_cmd.call("tail -n +2 <#{xxwp}/xxlabel |" \
             "mcut f=1:cluster -nfni o=#{xxlabel}")

# paste the labels onto the input rows, then join the cluster master
run_cmd.call("mpaste m=#{xxlabel} i=#{iFile} |" \
             "mjoin k=cluster m=#{xxcmf} o=#{oFile}")

if oPath
  run_cmd.call("cp #{xxcmf} #{oPath}/cluster.csv")
  run_cmd.call("cp #{xxlabel} #{oPath}/label.csv")
end

# end message
MCMD::endLog(args.cmdline)
172
+
@@ -0,0 +1,161 @@
1
+ /*
2
+ * Main.cpp
3
+ * Copyright (c) 2011 Yasuo Tabei All Rights Reserved.
4
+ *
5
+ * Permission is hereby granted, free of charge, to any person
6
+ * obtaining a copy of this software and associated documentation
7
+ * files (the "Software"), to deal in the Software without
8
+ * restriction, including without limitation the rights to use,
9
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ * copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following
11
+ * conditions:
12
+ *
13
+ * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software.
14
+ *
15
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE and * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
18
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
19
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21
+ * OTHER DEALINGS IN THE SOFTWARE.
22
+ */
23
+
24
#include "SketchSort.hpp"

#include <cstdlib>
#include <cstring>
#include <iostream>
28
+
29
/* Globals */
// Forward declarations of the helpers defined further down in this file.
void usage();
void version();
void parse_parameters (int argc, char **argv);

// Command-line state: filled in by parse_parameters() and handed to
// SketchSort::run() by main() / sketchsort_main(). The initial values are
// the defaults printed by usage().
char *fname, *oname;          // INFILE and OUTFILE positional arguments
int hamDist = 1;              // -hamdist
int numblocks = 4;            // -numblocks
int numchunks = 3;            // -numchunks
float cosDist = 0.01;         // -cosdist
bool autoFlag = false;        // -auto
float missingratio = 0.0001;  // -missingratio
bool centering = false;       // -centering
int windowsize = 0;           // -windowsize
int seed = 1;                 // -seed
44
+
45
+
46
+ /*******************************************************************************/
47
+ #ifndef _NO_MAIN_
48
+ #define _NO_MAIN_
49
+ int main(int argc, char **argv)
50
+ {
51
+ version();
52
+
53
+ parse_parameters(argc, argv);
54
+
55
+ SketchSort sketchsort;
56
+ sketchsort.run(fname, oname, numblocks, hamDist, cosDist, numchunks, autoFlag, missingratio, centering, windowsize, seed);
57
+
58
+ return 0;
59
+ }
60
+
61
+ #endif
62
+ /*******************************************************************************/
63
+
64
+
65
+ int sketchsort_main (int argc, char **argv){
66
+
67
+ parse_parameters(argc, argv);
68
+
69
+ SketchSort sketchsort;
70
+ sketchsort.run(fname, oname, numblocks, hamDist, cosDist, numchunks, autoFlag, missingratio, centering, windowsize, seed);
71
+
72
+ return 0;
73
+ }
74
+
75
+
76
/* Writes the program name, version and author to standard error. */
void version(){
  std::cerr << "SketchSort version 0.0.8\n"
            << "Written by Yasuo Tabei\n\n"
            << std::flush;
}
80
+
81
+ void usage(){
82
+ std::cerr << std::endl
83
+ << "Usage: sketchsort [OPTION]... INFILE OUTFILE" << std::endl << std::endl
84
+ << " where [OPTION]... is a list of zero or more optional arguments" << std::endl
85
+ << " INFILE is the name of an input file" << std::endl
86
+ << " OUTFILE is the name of an output file" << std::endl << std::endl
87
+ << "Additional arguments (input and output files may be specified):" << std::endl
88
+ << " -hamdist [maximum hamming distance]" << std::endl
89
+ << " (default: " << hamDist << ")" << std::endl
90
+ << " -numblocks [the number of blocks]" << std::endl
91
+ << " (default: " << numblocks << ")" << std::endl
92
+ << " -cosdist [maximum cosine distance]" << std::endl
93
+ << " (default: " << cosDist << ")" << std::endl
94
+ << " -numchunks [the number of chunks]" << std::endl
95
+ << " (default: " << numchunks << ")" << std::endl
96
+ << " -auto " << std::endl
97
+ << " -missingratio " << std::endl
98
+ << " (default: " << missingratio << ")" << std::endl
99
+ << " -centering" << std::endl
100
+ << " -windowsize" << std::endl
101
+ << " (default: " << windowsize << ")" << std::endl
102
+ << " -seed " << std::endl
103
+ << std::endl;
104
+ exit(0);
105
+ }
106
+
107
+ void parse_parameters (int argc, char **argv){
108
+ if (argc == 1) usage();
109
+ int argno;
110
+ for (argno = 1; argno < argc; argno++){
111
+ if (argv[argno][0] == '-'){
112
+ if (!strcmp (argv[argno], "-version")){
113
+ version();
114
+ }
115
+ else if (!strcmp (argv[argno], "-auto")) {
116
+ autoFlag = true;
117
+ }
118
+ else if (!strcmp (argv[argno], "-centering")) {
119
+ centering = true;
120
+ }
121
+ else if (!strcmp (argv[argno], "-numblocks")) {
122
+ if (argno == argc - 1) std::cerr << "Must specify minimum support after -numblocks" << std::endl;
123
+ numblocks = atoi(argv[++argno]);
124
+ }
125
+ else if (!strcmp (argv[argno], "-hamdist")) {
126
+ if (argno == argc - 1) std::cerr << "Must specify hamming distance threshold after -hamdist" << std::endl;
127
+ hamDist = atoi(argv[++argno]);
128
+ }
129
+ else if (!strcmp (argv[argno], "-cosdist")) {
130
+ if (argno == argc - 1) std::cerr << "Must specify cosine distance threshold size after -cosdist" << std::endl;
131
+ cosDist = atof(argv[++argno]);
132
+ }
133
+ else if (!strcmp (argv[argno], "-numchunks")) {
134
+ if (argno == argc - 1) std::cerr << "Must specify number of chunks after -numchunks" << std::endl;
135
+ numchunks = atoi(argv[++argno]);
136
+ }
137
+ else if (!strcmp (argv[argno], "-missingratio")) {
138
+ if (argno == argc - 1) std::cerr << "Must specify missing edge ratio after -missingratio" << std::endl;
139
+ missingratio = atof(argv[++argno]);
140
+ }
141
+ else if (!strcmp (argv[argno], "-seed")) {
142
+ if (argno == argc - 1) std::cerr << "Must specify initial seed after -seed" << std::endl;
143
+ seed = atoi(argv[++argno]);
144
+ }
145
+ else if (!strcmp (argv[argno], "-windowsize")) {
146
+ if (argno == argc - 1) std::cerr << "Must specify windowsize after -windowsize" << std::endl;
147
+ windowsize = atoi(argv[++argno]);
148
+ }
149
+ else {
150
+ usage();
151
+ }
152
+ } else {
153
+ break;
154
+ }
155
+ }
156
+ if (argno > argc)
157
+ usage();
158
+
159
+ fname = argv[argno];
160
+ oname = argv[argno + 1];
161
+ }
@@ -0,0 +1,24 @@
1
+ /*
2
+ * Main.cpp
3
+ * Copyright (c) 2011 Yasuo Tabei All Rights Reserved.
4
+ *
5
+ * Permission is hereby granted, free of charge, to any person
6
+ * obtaining a copy of this software and associated documentation
7
+ * files (the "Software"), to deal in the Software without
8
+ * restriction, including without limitation the rights to use,
9
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ * copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following
11
+ * conditions:
12
+ *
13
+ * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software.
14
+ *
15
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE and * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
18
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
19
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21
+ * OTHER DEALINGS IN THE SOFTWARE.
22
+ */
23
+
24
/*
 * Entry point for embedding SketchSort (defined in Main.cpp): parses the
 * command-line style arguments, runs SketchSort and returns 0. Unlike
 * main() it does not print the version banner first.
 */
int sketchsort_main (int argc, char **argv);
@@ -0,0 +1,526 @@
1
+ /*
2
+ * SketchSort.cpp
3
+ * Copyright (c) 2011 Yasuo Tabei All Rights Reserved.
4
+ *
5
+ * Permission is hereby granted, free of charge, to any person
6
+ * obtaining a copy of this software and associated documentation
7
+ * files (the "Software"), to deal in the Software without
8
+ * restriction, including without limitation the rights to use,
9
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ * copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following
11
+ * conditions:
12
+ *
13
+ * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software.
14
+ *
15
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE and * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
18
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
19
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21
+ * OTHER DEALINGS IN THE SOFTWARE.
22
+ */
23
+
24
+ #include "SketchSort.hpp"
25
+
26
/* Maps a value to one signature bit: 1 when val is strictly positive, else 0. */
template<class T>
inline uint8_t sign(T val) {
  return (val > 0) ? 1 : 0;
}
32
+
33
/* Returns the larger of the two arguments; a2 when they compare equal. */
template<class T>
inline T max(T a1, T a2) {
  return (a1 > a2) ? a1 : a2;
}
39
+
40
/* Strict-weak ordering on (index, value) pairs: orders by ascending value only. */
bool cmp(const std::pair<int, float> &p1, const std::pair<int, float> &p2) {
  const float lhs = p1.second;
  const float rhs = p2.second;
  return lhs < rhs;
}
43
+
44
+ void SketchSort::readFeature(const char *fname, unsigned int _windowsize) {
45
+ std::ifstream ifs(fname);
46
+
47
+ if (!ifs) {
48
+ std::cerr << "can not open " << fname << std::endl;
49
+ exit(0);
50
+ }
51
+
52
+ dim = 0;
53
+ float val = 0.f;
54
+ uint64_t lineCnt = 0;
55
+ std::string line;
56
+ while (std::getline(ifs, line)) {
57
+ fvs.resize(fvs.size() + 1);
58
+ tws.resize(fvs.size() + 1);
59
+ boost::numeric::ublas::vector<float> &fv = fvs[fvs.size() - 1];
60
+ uint32_t counter = 0;
61
+ std::istringstream is(line);
62
+ if (_windowsize > 0){
63
+ is >> val;
64
+ tws[lineCnt++] = val;
65
+ }
66
+ if (dim != 0) {
67
+ fv.resize(dim);
68
+ while (is >> val) {
69
+ fv[counter++]= val;
70
+ }
71
+ if (counter != dim) {
72
+ std::cerr << "dimesions of the input vector should be same!" << std::endl;
73
+ std::cerr << line << std::endl;
74
+ std::cerr << "dim:" << dim << " dim:" << counter << std::endl;
75
+ exit(1);
76
+ }
77
+ } else {
78
+ while (is >> val) {
79
+ fv.resize(counter + 1);
80
+ fv[counter] = val;
81
+ counter++;
82
+ }
83
+ dim = counter;
84
+ }
85
+ }
86
+ }
87
+
88
+ void SketchSort::centeringData() {
89
+ size_t dim = fvs[0].size();
90
+ size_t numData = fvs.size();
91
+ float mean;
92
+ for (size_t i = 0; i < dim; i++) {
93
+ mean = 0.f;
94
+ for (size_t j = 0; j < numData; j++) {
95
+ mean += fvs[j][i];
96
+ }
97
+ mean /= (float)numData;
98
+ for (size_t j = 0; j < numData; j++) {
99
+ fvs[j][i] -= mean;
100
+ }
101
+ }
102
+ }
103
+
104
+ /* sparse random projection
105
+ int SketchSort::projectVectors(unsigned int projectDim, std::vector<uint8_t*> &sig, params &param) {
106
+
107
+ p = new boost::pool<>(sizeof(uint8_t));
108
+ sig.resize(fvs.size());
109
+ param.ids.resize(fvs.size());
110
+ for (size_t i = 0; i < sig.size(); i++) {
111
+ // sig[i] = new uint32_t[projectDim + 1];
112
+ sig[i] = (uint8_t*)p->ordered_malloc(projectDim + 1);
113
+ param.ids[i] = i;
114
+ }
115
+
116
+ boost::mt19937 gen(static_cast<unsigned long>(time(0)));
117
+ boost::uniform_real<> dst(0.f, 1.f);
118
+ boost::variate_generator<boost::mt19937&, boost::uniform_real<> > rand(gen, dst);
119
+ // double tiny = 1.0/1.79e+308;
120
+ std::vector<std::pair<int, float> > randMat;
121
+ float s = sqrt(float(dim));
122
+ // float s = dim/log(dim);
123
+ float thr = 1.f/(2*s);
124
+ float coff = sqrt(s);
125
+ for (size_t i = 0; i < projectDim; i++) {
126
+ randMat.clear();
127
+ for (size_t j = 0; j < dim; j++) {
128
+ float r = rand();
129
+ if (r < thr) {
130
+ randMat.push_back(std::make_pair(j, coff));
131
+ } else if (r < 2*thr) {
132
+ randMat.push_back(std::make_pair(j, -coff));
133
+ }
134
+ }
135
+
136
+ for (size_t j = 0; j < fvs.size(); j++) {
137
+ boost::numeric::ublas::vector<float> &fv = fvs[j];
138
+ double proc = 0.f;
139
+ for (size_t k = 0; k < randMat.size(); k++) {
140
+ proc += fv[randMat[k].first] * randMat[k].second;
141
+ }
142
+ sig[j][i+1] = sign(proc);
143
+ }
144
+ }
145
+ param.seq_len = projectDim;
146
+ param.num_seq = fvs.size();
147
+
148
+ return 1;
149
+ }
150
+ */
151
+
152
+ int SketchSort::projectVectors(unsigned int projectDim, std::vector<uint8_t*> &sig, unsigned int _seed, params &param) {
153
+ std::vector<float> randMat;
154
+ p = new boost::pool<>(sizeof(uint8_t));
155
+ sig.resize(fvs.size());
156
+ param.ids.resize(fvs.size());
157
+ for (size_t i = 0; i < sig.size(); i++) {
158
+ // sig[i] = new uint32_t[projectDim + 1];
159
+ sig[i] = (uint8_t*)p->ordered_malloc(projectDim + 1);
160
+ param.ids[i] = i;
161
+ }
162
+ boost::mt19937 gen(static_cast<unsigned long>(_seed));
163
+ //boost::mt19937 gen(static_cast<unsigned long>(time(0)));
164
+ boost::normal_distribution<> dst(0.f, 1.f);
165
+ boost::variate_generator<boost::mt19937&, boost::normal_distribution<> > rand(gen, dst);
166
+
167
+ // double tiny = 1.0/1.79e+308;
168
+ randMat.resize(dim + 1);
169
+ for (size_t i = 0; i < projectDim; i++) {
170
+ for (size_t j = 0; j <= dim; j++) {
171
+ randMat[j] = rand();
172
+ }
173
+
174
+ for (size_t j = 0; j < fvs.size(); j++) {
175
+ boost::numeric::ublas::vector<float> &fv = fvs[j];
176
+ double proc = 0.f;
177
+ for (size_t k = 0; k < fv.size(); k++)
178
+ proc += fv[k] * randMat[k];
179
+
180
+ sig[j][i+1] = sign(proc);
181
+ }
182
+ }
183
+ param.seq_len = projectDim;
184
+ param.num_seq = fvs.size();
185
+
186
+ return 1;
187
+ }
188
+
189
+ inline float SketchSort::checkCos(unsigned int id1, unsigned int id2) {
190
+ ++numCosDist;
191
+ boost::numeric::ublas::vector<float> &fv_1 = fvs[id1];
192
+ boost::numeric::ublas::vector<float> &fv_2 = fvs[id2];
193
+ float sum = boost::numeric::ublas::inner_prod(fv_1, fv_2);
194
+
195
+ return (1.f - sum*(norms[id1]*norms[id2]));
196
+ }
197
+
198
+ inline void SketchSort::sort(std::vector<uint8_t*> &sig, int spos, int epos, int l, int r, params &param) {
199
+ if (r - l + 1 > 50) radixsort(sig, spos, epos, l, r, param);
200
+ else insertionSort(sig, spos, epos, l, r, param);
201
+ }
202
+
203
/*
 * Radix sort of the signatures sig[l..r] on the positions spos..epos,
 * one counting-sort pass per position, processed from spos upwards.
 * param.ids is permuted together with sig.
 *
 * param.counter is used as the histogram and must provide num_char slots.
 * Passes alternate between two buffers: an odd pass moves sig/ids into
 * newsig/newids, the following even pass moves them back.
 */
inline void SketchSort::radixsort(std::vector<uint8_t*> &sig, int spos, int epos, int l, int r, params &param) {
  unsigned int *c = param.counter;
  std::vector<unsigned int> &ids = param.ids;
  std::vector<uint8_t*> newsig(r - l + 1);
  std::vector<unsigned int> newids(r - l + 1);
  unsigned int tmp;
  int tpos = spos - 1;
  while (++tpos <= epos) {
    // pass A: histogram of the symbol at tpos, then prefix sums
    for (int i = 0; i < num_char; i++) *(c + i) = 0;
    for (int i = l; i <= r; i++) c[sig[i][tpos]]++;
    for (int i = 1; i < num_char; i++) *(c + i) += *(c + i - 1);
    // walk backwards so that equal symbols keep their relative order
    for (int i = r; i >= l; --i) {
      tmp = --c[sig[i][tpos]] + l;
      newids[tmp - l] = ids[i];
      newsig[tmp - l] = sig[i];
    }
    if (++tpos <= epos) {
      // pass B: same counting sort on the next position, back into sig/ids
      for (int i = 0; i < num_char; i++) *(c + i) = 0;
      for (int i = l; i <= r; i++) c[newsig[i - l][tpos]]++;
      for (int i = 1; i < num_char; i++) *(c + i) += *(c + i - 1);
      for (int i = r; i >= l; --i) {
        tmp = --c[newsig[i - l][tpos]] + l;
        ids[tmp] = newids[i - l];
        sig[tmp] = newsig[i - l];
      }
    }
    else {
      // odd number of positions: the result is in the scratch buffers,
      // copy it back before returning
      for (int i = l; i <= r; i++) {
        ids[i] = newids[i - l];
        sig[i] = newsig[i - l];
      }
      return;
    }
  }
}
238
+
239
+ inline void SketchSort::insertionSort(std::vector<uint8_t*> &sig, int spos, int epos, int l, int r, params &param) {
240
+ int i, j;
241
+ uint8_t *pivot, pval;
242
+ unsigned int pid;
243
+ std::vector<unsigned int> &ids = param.ids;
244
+ for (int tpos = spos; tpos <= epos; tpos++) {
245
+ for (i = l + 1; i <= r; i++) {
246
+ pivot = sig[i]; pval = sig[i][tpos]; pid = ids[i];
247
+ for (j = i; j > l && sig[j-1][tpos] > pval; j--) {
248
+ sig[j] = sig[j-1];
249
+ ids[j] = ids[j-1];
250
+ }
251
+ sig[j] = pivot;
252
+ ids[j] = pid;
253
+ }
254
+ }
255
+ }
256
+
257
+ inline void SketchSort::classify(std::vector<uint8_t*> &sig, int spos, int epos, int l, int r, int bpos, params &param, unsigned int _windowsize) {
258
+ int n_l = l, n_r = r;
259
+ for (int iter = l + 1; iter <= r; iter++) {
260
+ if (!std::equal(sig[n_l] + spos, sig[n_l] + epos + 1, sig[iter] + spos)) {
261
+ n_r = iter - 1;
262
+ if (n_r - n_l >= 1)
263
+ multi_classification(sig, bpos + 1, n_l, n_r, param, _windowsize);
264
+ n_l = iter;
265
+ }
266
+ }
267
+ if (r - n_l >= 1)
268
+ multi_classification(sig, bpos + 1, n_l, r, param, _windowsize);
269
+ }
270
+
271
+ inline bool SketchSort::calc_chunk_hamdist(uint8_t *seq1, uint8_t *seq2, const params &param) {
272
+ ++numHamDist;
273
+ unsigned int d = 0;
274
+ for (size_t i = 1; i <= param.chunk_len; i++)
275
+ if (*seq1++ != *seq2++ && ++d > param.chunk_dist) return false;
276
+ return true;
277
+ }
278
+
279
/*
 * Examines the chunks that precede the current chunk (param.cchunk).
 *
 * Returns false as soon as one earlier chunk is walked to its end while the
 * accumulated difference between seq1 and seq2 stays within
 * param.chunk_dist; returns true when every earlier chunk exceeds it.
 * NOTE(review): presumably this keeps a pair from being reported again for
 * a later chunk - confirm against the algorithm description.
 */
inline bool SketchSort::check_chunk_canonical(uint8_t *seq1, uint8_t *seq2, const params &param) {
  unsigned int d = 0;
  // last position before the current chunk
  int end = param.pchunks[param.cchunk].start - 1;
  int j = 1;
  int tend = param.pchunks[j].end;
  int i = 0;

  while (++i <= end) {
    if ((d += abs(seq1[i] - seq2[i])) > param.chunk_dist) {
      // chunk j is already over the limit: run i to the end of the chunk
      while (++i <= tend) d += abs(seq1[i] - seq2[i]);
      // if (seq1[i] != seq2[i]) ++d;
      // start over with the next chunk; i is set one before its start
      // because the loop condition increments it again
      d = 0;
      tend = param.pchunks[++j].end;
      i = param.pchunks[j].start - 1;
      continue;
    }
    // reached the end of chunk j without exceeding the limit
    if (tend == i)
      return false;
  }
  return true;
}
300
+
301
+ inline bool SketchSort::check_canonical(uint8_t *seq1, uint8_t *seq2, const params &param) {
302
+ size_t sb = 1, eb = 1;
303
+ size_t b;
304
+ for (size_t i = 0, size = param.blocks.size(); i < size; i++) {
305
+ eb = param.blocks[i];
306
+ for (b = sb; b < eb; b++) {
307
+ if (std::equal(seq1 + param.pos[b].start, seq1 + param.pos[b].end + 1, seq2 + param.pos[b].start))
308
+ return false;
309
+ }
310
+ sb = param.blocks[i] + 1;
311
+ }
312
+ return true;
313
+ }
314
+
315
+ inline void SketchSort::report(std::vector<uint8_t*> &sig, int l, int r, params &param, unsigned int _windowsize) {
316
+ // std::cout << "report" << std::endl;
317
+ float cosDist;
318
+ for (int i = l; i < r; i++) {
319
+ for (int j = i + 1; j <= r; j++) {
320
+ unsigned int span = abs(tws[param.ids[j]]-tws[param.ids[i]]);
321
+ if (_windowsize != 0 && ( span > _windowsize || span == 0 )){
322
+ //if (_windowsize != 0 && span > _windowsize)
323
+ continue;
324
+ }
325
+ if (check_canonical(sig[i], sig[j], param) &&
326
+ calc_chunk_hamdist(sig[i] + param.start_chunk, sig[j] + param.start_chunk, param) &&
327
+ check_chunk_canonical(sig[i], sig[j], param) &&
328
+ ((cosDist = checkCos(param.ids[i], param.ids[j])) <= param.cosDist)) {
329
+ (*param.os) << param.ids[i] << " " << param.ids[j] << " " << cosDist << std::endl;
330
+ }
331
+ }
332
+ }
333
+ }
334
+
335
/*
 * Recursive enumeration step over the range sig[l..r].
 *
 * Once param.blocks holds numblocks - chunk_dist selected blocks, the range
 * is handed to report(). Otherwise every block number from maxind up to
 * numblocks is tried in turn: the block is pushed onto param.blocks, the
 * range is sorted on that block's positions, classify() recurses into the
 * runs that are identical on the block, and the block is popped again.
 */
void SketchSort::multi_classification(std::vector<uint8_t*> &sig, int maxind, int l, int r, params &param, unsigned int _windowsize) {

  if (param.blocks.size() == param.numblocks - param.chunk_dist) {
    report(sig, l, r, param, _windowsize);
    return;
  }

  for (int bpos = maxind; bpos <= (int)param.numblocks; bpos++) {

    // not enough blocks left to reach numblocks - chunk_dist selections
    if (param.blocks.size() + (param.numblocks - bpos + 1) < param.numblocks - param.chunk_dist) { // pruning
      // std::cerr << "return " << std::endl;
      return;
    }
    param.blocks.push_back(bpos);
    sort(sig, param.pos[bpos].start, param.pos[bpos].end, l, r, param);
    classify(sig, param.pos[bpos].start, param.pos[bpos].end, l, r, bpos, param, _windowsize);
    param.blocks.pop_back();
  }
}
354
+
355
/*
 * Returns the binomial coefficient C(n, m) as a double.
 *
 * Uses C(n, i + 1) = C(n, i) * (n - i) / (i + 1), so every intermediate
 * value is itself a binomial coefficient. The previous form multiplied by
 * (n - i) / (m - i) in integer arithmetic, which truncates each factor
 * (C(5, 2) came out as 8 instead of 10).
 *
 * Returns 1 for m <= 0 and 0 for m > n >= 0.
 */
double combination(int n, int m) {
  double result = 1.0;
  for (int i = 0; i < m; i++) {
    result = result * (double)(n - i) / (double)(i + 1);
  }
  return result;
}
362
+
363
+ double SketchSort::calcMissingEdgeRatio(params &param) {
364
+ double sum = 0.f;
365
+ double prob = acos(1.0 - param.cosDist)/M_PI;
366
+ for (unsigned int k = 0; k <= param.chunk_dist; k++) {
367
+ sum += (combination(param.projectDim, k) * pow(prob, k) * pow(1 - prob, param.projectDim - k));
368
+ }
369
+ return pow(1.0 - sum, param.numchunks);
370
+ }
371
+
372
+ void SketchSort::preComputeNorms() {
373
+ norms.resize(fvs.size());
374
+ float sum;
375
+ for (size_t i = 0; i < fvs.size(); i++) {
376
+ boost::numeric::ublas::vector<float> &fv = fvs[i];
377
+ sum = 0.f;
378
+ for (size_t j = 0; j < fv.size(); j++) {
379
+ sum += pow(fv[j], 2);
380
+ }
381
+ norms[i] = 1.f/sqrt(sum);
382
+ }
383
+ }
384
+
385
+ void SketchSort::decideParameters(float _missingratio, params &param) {
386
+ unsigned int hamDist = 1;
387
+ unsigned int numBlocks = hamDist + 3;
388
+ unsigned int numchunks = 0;
389
+
390
+ do {
391
+ if (numchunks > 30) {
392
+ hamDist += 1;
393
+ numBlocks = hamDist + 3;
394
+ numchunks = 0;
395
+ }
396
+ numchunks += 1;
397
+ param.chunk_dist = hamDist;
398
+ param.numblocks = numBlocks;
399
+ param.numchunks = numchunks;
400
+ } while (calcMissingEdgeRatio(param) >= _missingratio);
401
+ }
402
+
403
+ void SketchSort::run(const char *fname, const char *oname,
404
+ unsigned int _numblocks,
405
+ unsigned int _dist,
406
+ float _cosDist,
407
+ unsigned int _numchunks,
408
+ bool _autoFlag,
409
+ float _missingratio,
410
+ bool _centering,
411
+ unsigned int _windowsize,
412
+ unsigned int _seed)
413
+ {
414
+ params param;
415
+ param.numblocks = _numblocks;
416
+ param.numchunks = _numchunks;
417
+ param.chunk_dist = _dist;
418
+ param.cosDist = _cosDist;
419
+ num_char = 2;
420
+ param.projectDim = 32;
421
+
422
+ numSort = 0;
423
+ numCosDist = 0;
424
+ numHamDist = 0;
425
+
426
+ if (_autoFlag) {
427
+ // std::cerr << "deciding parameters such that the missing edge ratio is no more than " << _missingratio << std::endl;
428
+ decideParameters(_missingratio, param);
429
+ // std::cout << "decided parameters:" << std::endl;
430
+ // std::cout << "hamming distance threshold: " << param.chunk_dist << std::endl;
431
+ // std::cout << "number of blocks: " << param.numblocks << std::endl;
432
+ // std::cout << "number of chunks: " << param.numchunks << std::endl;
433
+ // std::cout << std::endl;
434
+ }
435
+
436
+ std::ofstream ofs(oname);
437
+ param.os = &ofs;
438
+
439
+ //std::cout << "missing edge ratio:" << calcMissingEdgeRatio(param) << std::endl;
440
+
441
+ //std::cerr << "start reading" << std::endl;
442
+ double readstart = clock();
443
+ readFeature(fname,_windowsize);
444
+ double readend = clock();
445
+ //std::cerr << "end reading" << std::endl;
446
+ //std::cout << "readtime:" << (readend - readstart)/(double)CLOCKS_PER_SEC << std::endl;
447
+
448
+ if (_centering) {
449
+ //std::cerr << "start making input-data centered at 0" << std::endl;
450
+ double centeringstart = clock();
451
+ centeringData();
452
+ double centeringend = clock();
453
+ //std::cerr << "end making input-data centered at 0" << std::endl;
454
+ //std::cout << "centering time:" << (centeringend - centeringstart)/(double)CLOCKS_PER_SEC << std::endl;
455
+
456
+ }
457
+
458
+
459
+ double totalstart = clock();
460
+ preComputeNorms();
461
+ //param.projectDim = 2*(int)log(dim);
462
+
463
+ param.counter = new unsigned int[num_char];
464
+
465
+ //std::cout << "number of data:" << fvs.size() << std::endl;
466
+ //std::cout << "data dimension:" << dim << std::endl;
467
+ //std::cout << "projected dimension:" << param.projectDim << std::endl;
468
+ //std::cout << "length of strings:" << param.projectDim * param.numchunks << std::endl;
469
+ //std::cout << "number of chunks:" << param.numchunks << std::endl;
470
+
471
+ double projectstart = clock();
472
+ //std::cerr << "start projection" << std::endl;
473
+ std::vector<uint8_t*> sig;
474
+ projectVectors(param.projectDim * param.numchunks, sig, _seed, param);
475
+ //read(fname, sig, param);
476
+ //std::cerr << "end projection" << std::endl;
477
+ double projectend = clock();
478
+ //std::cout << "projecttime:" << (projectend - projectstart)/(double)CLOCKS_PER_SEC << std::endl;
479
+
480
+ param.pchunks = new pstat[param.numchunks + 1];
481
+ for (int i = 1; i <= (int)param.numchunks; i++) {
482
+ param.pchunks[i].start = (int)ceil((double)param.seq_len*((double)(i - 1)/(double)param.numchunks)) + 1;
483
+ param.pchunks[i].end = (int)ceil((double)param.seq_len*(double)i/(double)param.numchunks);
484
+ }
485
+
486
+ double msmtime = 0.0;
487
+
488
+
489
+ //std::cerr << "chunk distance:" << param.chunk_dist << std::endl;
490
+ //std::cerr << "the number of blocks:" << param.numblocks << std::endl;
491
+ param.pos = new pstat[param.numblocks + 1];
492
+ for (int i = 1; i <= (int) param.numchunks; i++) {
493
+ param.chunk_len = param.pchunks[i].end - param.pchunks[i].start + 1;
494
+ param.start_chunk = param.pchunks[i].start;
495
+ param.end_chunk = param.pchunks[i].end;
496
+ param.cchunk = i;
497
+ for (int j = 1; j <= (int)param.numblocks; j++) {
498
+ param.pos[j].start = (int)ceil((double)param.chunk_len*((double)(j - 1)/(double)param.numblocks)) + param.pchunks[i].start;
499
+ param.pos[j].end = (int)ceil((double)param.chunk_len*(double)j/(double)param.numblocks) + param.pchunks[i].start - 1;
500
+ }
501
+ //std::cerr << "start enumeration chunk no " << i << std::endl;
502
+ double msmstart = clock();
503
+ //std::cout << "sig=" << sig << std::endl;
504
+ //std::cout << "param.num_seq=" << param.num_seq << std::endl;
505
+ //std::cout << "param=" << param << std::endl;
506
+ multi_classification(sig, 1, 0, param.num_seq - 1, param, _windowsize);
507
+ double msmend = clock();
508
+ msmtime += (msmend - msmstart)/(double)CLOCKS_PER_SEC;
509
+ }
510
+ //std::cout << "msmtime:" << msmtime << std::endl;
511
+
512
+ double totalend = clock();
513
+ //std::cout << "cputime:" << (totalend - totalstart)/(double)CLOCKS_PER_SEC << std::endl;
514
+
515
+ //std::cout << "numSort:" << combination(param.numblocks, param.chunk_dist) * param.numchunks << std::endl;
516
+ //std::cout << "numHamDist:" << numHamDist << std::endl;
517
+ //std::cout << "numCosDist:" << numCosDist << std::endl;
518
+ ofs.close();
519
+ // destructor
520
+ delete p;
521
+ delete[] param.counter;
522
+ delete[] param.pchunks;
523
+ delete[] param.pos;
524
+
525
+ return;
526
+ }