nysol-mining 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,172 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: utf-8
3
+
4
+ require "rubygems"
5
+ require "nysol/mcmd"
6
+ require "json"
7
+
8
+ # 1.0: first release: 2015/5/5
9
+ $version="1.0"
10
+ $revision="###VERSION###"
11
+
12
# Print the command usage to STDERR and terminate the process.
#
# The banner interpolates the global $version. The parameter list mirrors the
# MCMD::Margs specification used by this script: f=, h=, i= and o= are
# mandatory, while O=, T= and -debug are optional.
#
# Never returns: `exit` raises SystemExit once the text has been written.
def help
  STDERR.puts <<EOF
----------------------------
msm.rb version #{$version}
----------------------------
概要) shift mean clustering
特徴) 1) RパッケージLPCMを利用している。
用法) msm.rb f= i= o= h= [O=] [T=] [-debug] [--help]

f= : i=ファイル上の変数項目名【必須】
i= : 入力ファイル名【必須】
o= : 出力ファイル名【必須】
h= : band width【必須】
O= : 出力パス【任意】
T= : ワークディレクトリ【任意】
-debug : Rの実行結果を表示

その他
--help : ヘルプの表示
--version : バージョンの表示

必要なソフトウェア)
1) R
2) RのLPCMパッケージ

# Copyright(c) NYSOL 2012- All Rights Reserved.
EOF
  exit
end
39
+
40
# Print "version <$version> revision <$revision>" to STDERR and terminate.
#
# When $revision still contains the text "VERSION" (the unreplaced
# "###VERSION###" placeholder) it is overwritten with "0" before printing.
# Never returns: `exit` raises SystemExit.
def ver()
  unreplaced = !(/VERSION/ =~ $revision).nil?
  $revision = "0" if unreplaced
  STDERR.puts "version #{$version} revision #{$revision}"
  exit
end
45
+
46
# ---- command-line handling -------------------------------------------------
# Show the usage when called without arguments or with --help, and the
# version banner when called with --version (both helpers exit).
help if ARGV.empty? || ARGV[0] == "--help"
ver if ARGV[0] == "--version"

args = MCMD::Margs.new(
  ARGV,
  "f=,h=,i=,o=,O=,-debug,-mcmdenv,T=",
  "f=,h=,i=,o="
)

# Limit mcmd messages to warnings and errors unless -mcmdenv is given.
unless args.bool("-mcmdenv")
  ENV["KG_VerboseLevel"] = "2"
end

# Stop with status 1 when R / the LPCM package check fails.
exit(1) unless MCMD::chkRexe("LPCM")

# Work-file directory: T= with one trailing slash removed.
unless args.str("T=").nil?
  ENV["KG_TmpPath"] = args.str("T=").sub(/\/$/,"")
end

iFile = args.file("i=", "r")
oFile = args.file("o=", "w")

# f= must carry an output name for every field.
flds     = args.field("f=", iFile)
names    = flds["names"].join(",")
newnames = flds["newNames"]
raise "#ERROR# f= parameter takes new field names for output." if newnames.include?(nil)

bw     = args.float("h=")
oPath  = args.file("O=", "w")
$debug = args.bool("-debug")

# Create the optional output directory up front.
if oPath
  MCMD::mkDir(oPath)
end
76
+
77
# Run LPCM mean-shift clustering in R on a CSV file.
#
# The generated R script centres the data on its column means, calls
# ms(..., h=bw, plotms=F), shifts the cluster centres back by the same means
# and writes two CSV files into +wp+: "xxcluster" (centres) and "xxlabel"
# (cluster label of every input row).
#
# names - field names from f= (not used by the R script; kept so existing
#         call sites keep working)
# bw    - bandwidth, interpolated into the R call as h=
# csv   - path of the CSV file that R reads
# wp    - existing directory that receives xxcluster / xxlabel
#
# Returns the value of Kernel#system for the R invocation.
def runR(names,bw,csv,wp)
  wf=MCMD::Mtemp.new
  scp=wf.file

  r_scp = <<EOF
library('LPCM')
d=read.csv("#{csv}")
cm=colMeans(d)
#print(cm)
sftM=function(x){return(x-cm)}
sftP=function(x){return(x+cm)}
dd=t(apply(d,1,sftM))
#print(dd)
model=ms(dd,h=#{bw},plotms=F)

center=t(apply(model$cluster.center,1,sftP))
#print(model$cluster)
#print(center)

#ms.self.coverage(d, taumin=0.02, taumax=0.5, gridsize=25,
#thr=0.0001, scaled=TRUE, cluster=FALSE, plot.type="o",
#or.labels=NULL, print=FALSE)

#print(model)
#write.csv(model$cluster.center,"#{wp}/xxcluster")
write.csv(center,"#{wp}/xxcluster")
write.csv(model$cluster.label ,"#{wp}/xxlabel")

#png("#{wp}/gpr.png")
# plot(model,as="improv")
#dev.off()
EOF

  File.open(scp,"w"){|fpw| fpw.write r_scp}

  # Kernel#system hands a command containing redirections to /bin/sh.
  # "&>" is a bash extension; a POSIX shell such as dash parses
  # "cmd &>/dev/null" as "run cmd in the background", so R would still be
  # running when the caller reads its output. Use the portable form instead.
  silence = $debug ? "" : " >/dev/null 2>&1"
  system "R --vanilla -q < #{scp}#{silence}"
end
117
+
118
+ # cluster.csv
119
+ # "","V1","V2"
120
+ # "1",0.107262943725142,0.0329636308034888
121
+ # "2",-0.655560794404871,-0.448416202492924
122
+ # "3",-0.218883486000835,0.44341544263141
123
+
124
+ # label.csv
125
+ # "","x"
126
+ # "1",1
127
+ # "2",1
128
+ # "3",1
129
+
130
# ---- main pipeline ---------------------------------------------------------
# Work files: xxbase (selected input fields), xxwp (directory for the R
# output), xxcmf (cluster master) and xxlabel (label of every input row).
wf=MCMD::Mtemp.new
xxbase =wf.file
xxwp =wf.file
xxcmf =wf.file
xxlabel =wf.file
MCMD::mkDir(xxwp)

# Extract the fields named by f= into the CSV handed to R.
system "mcut f=#{names} i=#{iFile} o=#{xxbase}"

# Without the R output the remaining steps cannot produce a valid result,
# so report the failure instead of continuing silently.
unless runR(names,bw,xxbase,xxwp)
  raise "#ERROR# failed to run the R script (use -debug to show the R output)."
end

# "<column number>:<new field name>" for every field given in f=.
nn = newnames.each_with_index.map { |name, idx| "#{idx + 1}:#{name}" }

# cluster master file
# "tail -n +2" drops the header row written by write.csv; the historical
# "tail +2" spelling is not accepted by every tail implementation.
system(
  "tail -n +2 <#{xxwp}/xxcluster |" \
  "mcut f=0:cluster,#{nn.join(",")} -nfni o=#{xxcmf}"
)

# label file
system(
  "tail -n +2 <#{xxwp}/xxlabel |" \
  "mcut f=1:cluster -nfni o=#{xxlabel}"
)

# join cmf and label file to ifile
system(
  "mpaste m=#{xxlabel} i=#{iFile} |" \
  "mjoin k=cluster m=#{xxcmf} o=#{oFile}"
)

# Optionally keep the intermediate tables in the O= directory.
if oPath then
  system "cp #{xxcmf} #{oPath}/cluster.csv"
  system "cp #{xxlabel} #{oPath}/label.csv"
end

# end-of-run log message
MCMD::endLog(args.cmdline)
172
+
@@ -0,0 +1,161 @@
1
+ /*
2
+ * Main.cpp
3
+ * Copyright (c) 2011 Yasuo Tabei All Rights Reserved.
4
+ *
5
+ * Permission is hereby granted, free of charge, to any person
6
+ * obtaining a copy of this software and associated documentation
7
+ * files (the "Software"), to deal in the Software without
8
+ * restriction, including without limitation the rights to use,
9
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
11
+ * conditions:
12
+ *
13
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
14
+ *
15
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE and
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
18
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
19
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21
+ * OTHER DEALINGS IN THE SOFTWARE.
22
+ */
23
+
24
+ #include "SketchSort.hpp"
25
+
26
+ #include <iostream>
27
+ #include <cstdlib>
28
+
29
/* Forward declarations of the helpers defined further down in this file */
void usage();
void version();
void parse_parameters(int argc, char **argv);

/* Settings shared between parse_parameters() and the SketchSort::run() call.
 * The initial values below are the defaults printed by usage(). */
char *fname;                   // INFILE  (set by parse_parameters)
char *oname;                   // OUTFILE (set by parse_parameters)
int   hamDist      = 1;        // -hamdist
int   numblocks    = 4;        // -numblocks
int   numchunks    = 3;        // -numchunks
float cosDist      = 0.01;     // -cosdist
bool  autoFlag     = false;    // -auto
float missingratio = 0.0001;   // -missingratio
bool  centering    = false;    // -centering
int   windowsize   = 0;        // -windowsize
int   seed         = 1;        // -seed
44
+
45
+
46
+ /*******************************************************************************/
47
+ #ifndef _NO_MAIN_
48
+ #define _NO_MAIN_
49
+ int main(int argc, char **argv)
50
+ {
51
+ version();
52
+
53
+ parse_parameters(argc, argv);
54
+
55
+ SketchSort sketchsort;
56
+ sketchsort.run(fname, oname, numblocks, hamDist, cosDist, numchunks, autoFlag, missingratio, centering, windowsize, seed);
57
+
58
+ return 0;
59
+ }
60
+
61
+ #endif
62
+ /*******************************************************************************/
63
+
64
+
65
+ int sketchsort_main (int argc, char **argv){
66
+
67
+ parse_parameters(argc, argv);
68
+
69
+ SketchSort sketchsort;
70
+ sketchsort.run(fname, oname, numblocks, hamDist, cosDist, numchunks, autoFlag, missingratio, centering, windowsize, seed);
71
+
72
+ return 0;
73
+ }
74
+
75
+
76
// Write the two-line program banner, followed by an empty line, to stderr.
void version(){
  std::cerr << "SketchSort version 0.0.8" << std::endl;
  std::cerr << "Written by Yasuo Tabei" << std::endl;
  std::cerr << std::endl;
}
80
+
81
+ void usage(){
82
+ std::cerr << std::endl
83
+ << "Usage: sketchsort [OPTION]... INFILE OUTFILE" << std::endl << std::endl
84
+ << " where [OPTION]... is a list of zero or more optional arguments" << std::endl
85
+ << " INFILE is the name of an input file" << std::endl
86
+ << " OUTFILE is the name of an output file" << std::endl << std::endl
87
+ << "Additional arguments (input and output files may be specified):" << std::endl
88
+ << " -hamdist [maximum hamming distance]" << std::endl
89
+ << " (default: " << hamDist << ")" << std::endl
90
+ << " -numblocks [the number of blocks]" << std::endl
91
+ << " (default: " << numblocks << ")" << std::endl
92
+ << " -cosdist [maximum cosine distance]" << std::endl
93
+ << " (default: " << cosDist << ")" << std::endl
94
+ << " -numchunks [the number of chunks]" << std::endl
95
+ << " (default: " << numchunks << ")" << std::endl
96
+ << " -auto " << std::endl
97
+ << " -missingratio " << std::endl
98
+ << " (default: " << missingratio << ")" << std::endl
99
+ << " -centering" << std::endl
100
+ << " -windowsize" << std::endl
101
+ << " (default: " << windowsize << ")" << std::endl
102
+ << " -seed " << std::endl
103
+ << std::endl;
104
+ exit(0);
105
+ }
106
+
107
+ void parse_parameters (int argc, char **argv){
108
+ if (argc == 1) usage();
109
+ int argno;
110
+ for (argno = 1; argno < argc; argno++){
111
+ if (argv[argno][0] == '-'){
112
+ if (!strcmp (argv[argno], "-version")){
113
+ version();
114
+ }
115
+ else if (!strcmp (argv[argno], "-auto")) {
116
+ autoFlag = true;
117
+ }
118
+ else if (!strcmp (argv[argno], "-centering")) {
119
+ centering = true;
120
+ }
121
+ else if (!strcmp (argv[argno], "-numblocks")) {
122
+ if (argno == argc - 1) std::cerr << "Must specify minimum support after -numblocks" << std::endl;
123
+ numblocks = atoi(argv[++argno]);
124
+ }
125
+ else if (!strcmp (argv[argno], "-hamdist")) {
126
+ if (argno == argc - 1) std::cerr << "Must specify hamming distance threshold after -hamdist" << std::endl;
127
+ hamDist = atoi(argv[++argno]);
128
+ }
129
+ else if (!strcmp (argv[argno], "-cosdist")) {
130
+ if (argno == argc - 1) std::cerr << "Must specify cosine distance threshold size after -cosdist" << std::endl;
131
+ cosDist = atof(argv[++argno]);
132
+ }
133
+ else if (!strcmp (argv[argno], "-numchunks")) {
134
+ if (argno == argc - 1) std::cerr << "Must specify number of chunks after -numchunks" << std::endl;
135
+ numchunks = atoi(argv[++argno]);
136
+ }
137
+ else if (!strcmp (argv[argno], "-missingratio")) {
138
+ if (argno == argc - 1) std::cerr << "Must specify missing edge ratio after -missingratio" << std::endl;
139
+ missingratio = atof(argv[++argno]);
140
+ }
141
+ else if (!strcmp (argv[argno], "-seed")) {
142
+ if (argno == argc - 1) std::cerr << "Must specify initial seed after -seed" << std::endl;
143
+ seed = atoi(argv[++argno]);
144
+ }
145
+ else if (!strcmp (argv[argno], "-windowsize")) {
146
+ if (argno == argc - 1) std::cerr << "Must specify windowsize after -windowsize" << std::endl;
147
+ windowsize = atoi(argv[++argno]);
148
+ }
149
+ else {
150
+ usage();
151
+ }
152
+ } else {
153
+ break;
154
+ }
155
+ }
156
+ if (argno > argc)
157
+ usage();
158
+
159
+ fname = argv[argno];
160
+ oname = argv[argno + 1];
161
+ }
@@ -0,0 +1,24 @@
1
+ /*
2
+ * Main.cpp
3
+ * Copyright (c) 2011 Yasuo Tabei All Rights Reserved.
4
+ *
5
+ * Permission is hereby granted, free of charge, to any person
6
+ * obtaining a copy of this software and associated documentation
7
+ * files (the "Software"), to deal in the Software without
8
+ * restriction, including without limitation the rights to use,
9
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
11
+ * conditions:
12
+ *
13
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
14
+ *
15
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE and
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
18
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
19
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21
+ * OTHER DEALINGS IN THE SOFTWARE.
22
+ */
23
+
24
+ int sketchsort_main (int argc, char **argv);
@@ -0,0 +1,526 @@
1
+ /*
2
+ * SketchSort.cpp
3
+ * Copyright (c) 2011 Yasuo Tabei All Rights Reserved.
4
+ *
5
+ * Permission is hereby granted, free of charge, to any person
6
+ * obtaining a copy of this software and associated documentation
7
+ * files (the "Software"), to deal in the Software without
8
+ * restriction, including without limitation the rights to use,
9
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
11
+ * conditions:
12
+ *
13
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
14
+ *
15
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE and
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
18
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
19
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21
+ * OTHER DEALINGS IN THE SOFTWARE.
22
+ */
23
+
24
+ #include "SketchSort.hpp"
25
+
26
// Map a value to one bit: 1 when it is strictly positive, 0 otherwise
// (zero and negative values both give 0).
template<class T>
inline uint8_t sign(T val) {
  return (val > 0) ? 1 : 0;
}
32
+
33
// Return the larger of the two arguments; a2 is returned when they compare
// equal.
template<class T>
inline T max(T a1, T a2) {
  return (a1 > a2) ? a1 : a2;
}
39
+
40
// Ordering predicate for (int, float) pairs: true when the first pair's
// float component is strictly smaller than the second's. The int component
// is ignored.
bool cmp(const std::pair<int, float> &p1, const std::pair<int, float> &p2) {
  const float lhs = p1.second;
  const float rhs = p2.second;
  return lhs < rhs;
}
43
+
44
+ void SketchSort::readFeature(const char *fname, unsigned int _windowsize) {
45
+ std::ifstream ifs(fname);
46
+
47
+ if (!ifs) {
48
+ std::cerr << "can not open " << fname << std::endl;
49
+ exit(0);
50
+ }
51
+
52
+ dim = 0;
53
+ float val = 0.f;
54
+ uint64_t lineCnt = 0;
55
+ std::string line;
56
+ while (std::getline(ifs, line)) {
57
+ fvs.resize(fvs.size() + 1);
58
+ tws.resize(fvs.size() + 1);
59
+ boost::numeric::ublas::vector<float> &fv = fvs[fvs.size() - 1];
60
+ uint32_t counter = 0;
61
+ std::istringstream is(line);
62
+ if (_windowsize > 0){
63
+ is >> val;
64
+ tws[lineCnt++] = val;
65
+ }
66
+ if (dim != 0) {
67
+ fv.resize(dim);
68
+ while (is >> val) {
69
+ fv[counter++]= val;
70
+ }
71
+ if (counter != dim) {
72
+ std::cerr << "dimesions of the input vector should be same!" << std::endl;
73
+ std::cerr << line << std::endl;
74
+ std::cerr << "dim:" << dim << " dim:" << counter << std::endl;
75
+ exit(1);
76
+ }
77
+ } else {
78
+ while (is >> val) {
79
+ fv.resize(counter + 1);
80
+ fv[counter] = val;
81
+ counter++;
82
+ }
83
+ dim = counter;
84
+ }
85
+ }
86
+ }
87
+
88
+ void SketchSort::centeringData() {
89
+ size_t dim = fvs[0].size();
90
+ size_t numData = fvs.size();
91
+ float mean;
92
+ for (size_t i = 0; i < dim; i++) {
93
+ mean = 0.f;
94
+ for (size_t j = 0; j < numData; j++) {
95
+ mean += fvs[j][i];
96
+ }
97
+ mean /= (float)numData;
98
+ for (size_t j = 0; j < numData; j++) {
99
+ fvs[j][i] -= mean;
100
+ }
101
+ }
102
+ }
103
+
104
+ /* sparse random projection
105
+ int SketchSort::projectVectors(unsigned int projectDim, std::vector<uint8_t*> &sig, params &param) {
106
+
107
+ p = new boost::pool<>(sizeof(uint8_t));
108
+ sig.resize(fvs.size());
109
+ param.ids.resize(fvs.size());
110
+ for (size_t i = 0; i < sig.size(); i++) {
111
+ // sig[i] = new uint32_t[projectDim + 1];
112
+ sig[i] = (uint8_t*)p->ordered_malloc(projectDim + 1);
113
+ param.ids[i] = i;
114
+ }
115
+
116
+ boost::mt19937 gen(static_cast<unsigned long>(time(0)));
117
+ boost::uniform_real<> dst(0.f, 1.f);
118
+ boost::variate_generator<boost::mt19937&, boost::uniform_real<> > rand(gen, dst);
119
+ // double tiny = 1.0/1.79e+308;
120
+ std::vector<std::pair<int, float> > randMat;
121
+ float s = sqrt(float(dim));
122
+ // float s = dim/log(dim);
123
+ float thr = 1.f/(2*s);
124
+ float coff = sqrt(s);
125
+ for (size_t i = 0; i < projectDim; i++) {
126
+ randMat.clear();
127
+ for (size_t j = 0; j < dim; j++) {
128
+ float r = rand();
129
+ if (r < thr) {
130
+ randMat.push_back(std::make_pair(j, coff));
131
+ } else if (r < 2*thr) {
132
+ randMat.push_back(std::make_pair(j, -coff));
133
+ }
134
+ }
135
+
136
+ for (size_t j = 0; j < fvs.size(); j++) {
137
+ boost::numeric::ublas::vector<float> &fv = fvs[j];
138
+ double proc = 0.f;
139
+ for (size_t k = 0; k < randMat.size(); k++) {
140
+ proc += fv[randMat[k].first] * randMat[k].second;
141
+ }
142
+ sig[j][i+1] = sign(proc);
143
+ }
144
+ }
145
+ param.seq_len = projectDim;
146
+ param.num_seq = fvs.size();
147
+
148
+ return 1;
149
+ }
150
+ */
151
+
152
+ int SketchSort::projectVectors(unsigned int projectDim, std::vector<uint8_t*> &sig, unsigned int _seed, params &param) {
153
+ std::vector<float> randMat;
154
+ p = new boost::pool<>(sizeof(uint8_t));
155
+ sig.resize(fvs.size());
156
+ param.ids.resize(fvs.size());
157
+ for (size_t i = 0; i < sig.size(); i++) {
158
+ // sig[i] = new uint32_t[projectDim + 1];
159
+ sig[i] = (uint8_t*)p->ordered_malloc(projectDim + 1);
160
+ param.ids[i] = i;
161
+ }
162
+ boost::mt19937 gen(static_cast<unsigned long>(_seed));
163
+ //boost::mt19937 gen(static_cast<unsigned long>(time(0)));
164
+ boost::normal_distribution<> dst(0.f, 1.f);
165
+ boost::variate_generator<boost::mt19937&, boost::normal_distribution<> > rand(gen, dst);
166
+
167
+ // double tiny = 1.0/1.79e+308;
168
+ randMat.resize(dim + 1);
169
+ for (size_t i = 0; i < projectDim; i++) {
170
+ for (size_t j = 0; j <= dim; j++) {
171
+ randMat[j] = rand();
172
+ }
173
+
174
+ for (size_t j = 0; j < fvs.size(); j++) {
175
+ boost::numeric::ublas::vector<float> &fv = fvs[j];
176
+ double proc = 0.f;
177
+ for (size_t k = 0; k < fv.size(); k++)
178
+ proc += fv[k] * randMat[k];
179
+
180
+ sig[j][i+1] = sign(proc);
181
+ }
182
+ }
183
+ param.seq_len = projectDim;
184
+ param.num_seq = fvs.size();
185
+
186
+ return 1;
187
+ }
188
+
189
+ inline float SketchSort::checkCos(unsigned int id1, unsigned int id2) {
190
+ ++numCosDist;
191
+ boost::numeric::ublas::vector<float> &fv_1 = fvs[id1];
192
+ boost::numeric::ublas::vector<float> &fv_2 = fvs[id2];
193
+ float sum = boost::numeric::ublas::inner_prod(fv_1, fv_2);
194
+
195
+ return (1.f - sum*(norms[id1]*norms[id2]));
196
+ }
197
+
198
+ inline void SketchSort::sort(std::vector<uint8_t*> &sig, int spos, int epos, int l, int r, params &param) {
199
+ if (r - l + 1 > 50) radixsort(sig, spos, epos, l, r, param);
200
+ else insertionSort(sig, spos, epos, l, r, param);
201
+ }
202
+
203
+ inline void SketchSort::radixsort(std::vector<uint8_t*> &sig, int spos, int epos, int l, int r, params &param) {
204
+ unsigned int *c = param.counter;
205
+ std::vector<unsigned int> &ids = param.ids;
206
+ std::vector<uint8_t*> newsig(r - l + 1);
207
+ std::vector<unsigned int> newids(r - l + 1);
208
+ unsigned int tmp;
209
+ int tpos = spos - 1;
210
+ while (++tpos <= epos) {
211
+ for (int i = 0; i < num_char; i++) *(c + i) = 0;
212
+ for (int i = l; i <= r; i++) c[sig[i][tpos]]++;
213
+ for (int i = 1; i < num_char; i++) *(c + i) += *(c + i - 1);
214
+ for (int i = r; i >= l; --i) {
215
+ tmp = --c[sig[i][tpos]] + l;
216
+ newids[tmp - l] = ids[i];
217
+ newsig[tmp - l] = sig[i];
218
+ }
219
+ if (++tpos <= epos) {
220
+ for (int i = 0; i < num_char; i++) *(c + i) = 0;
221
+ for (int i = l; i <= r; i++) c[newsig[i - l][tpos]]++;
222
+ for (int i = 1; i < num_char; i++) *(c + i) += *(c + i - 1);
223
+ for (int i = r; i >= l; --i) {
224
+ tmp = --c[newsig[i - l][tpos]] + l;
225
+ ids[tmp] = newids[i - l];
226
+ sig[tmp] = newsig[i - l];
227
+ }
228
+ }
229
+ else {
230
+ for (int i = l; i <= r; i++) {
231
+ ids[i] = newids[i - l];
232
+ sig[i] = newsig[i - l];
233
+ }
234
+ return;
235
+ }
236
+ }
237
+ }
238
+
239
+ inline void SketchSort::insertionSort(std::vector<uint8_t*> &sig, int spos, int epos, int l, int r, params &param) {
240
+ int i, j;
241
+ uint8_t *pivot, pval;
242
+ unsigned int pid;
243
+ std::vector<unsigned int> &ids = param.ids;
244
+ for (int tpos = spos; tpos <= epos; tpos++) {
245
+ for (i = l + 1; i <= r; i++) {
246
+ pivot = sig[i]; pval = sig[i][tpos]; pid = ids[i];
247
+ for (j = i; j > l && sig[j-1][tpos] > pval; j--) {
248
+ sig[j] = sig[j-1];
249
+ ids[j] = ids[j-1];
250
+ }
251
+ sig[j] = pivot;
252
+ ids[j] = pid;
253
+ }
254
+ }
255
+ }
256
+
257
+ inline void SketchSort::classify(std::vector<uint8_t*> &sig, int spos, int epos, int l, int r, int bpos, params &param, unsigned int _windowsize) {
258
+ int n_l = l, n_r = r;
259
+ for (int iter = l + 1; iter <= r; iter++) {
260
+ if (!std::equal(sig[n_l] + spos, sig[n_l] + epos + 1, sig[iter] + spos)) {
261
+ n_r = iter - 1;
262
+ if (n_r - n_l >= 1)
263
+ multi_classification(sig, bpos + 1, n_l, n_r, param, _windowsize);
264
+ n_l = iter;
265
+ }
266
+ }
267
+ if (r - n_l >= 1)
268
+ multi_classification(sig, bpos + 1, n_l, r, param, _windowsize);
269
+ }
270
+
271
+ inline bool SketchSort::calc_chunk_hamdist(uint8_t *seq1, uint8_t *seq2, const params &param) {
272
+ ++numHamDist;
273
+ unsigned int d = 0;
274
+ for (size_t i = 1; i <= param.chunk_len; i++)
275
+ if (*seq1++ != *seq2++ && ++d > param.chunk_dist) return false;
276
+ return true;
277
+ }
278
+
279
+ inline bool SketchSort::check_chunk_canonical(uint8_t *seq1, uint8_t *seq2, const params &param) {
280
+ unsigned int d = 0;
281
+ int end = param.pchunks[param.cchunk].start - 1;
282
+ int j = 1;
283
+ int tend = param.pchunks[j].end;
284
+ int i = 0;
285
+
286
+ while (++i <= end) {
287
+ if ((d += abs(seq1[i] - seq2[i])) > param.chunk_dist) {
288
+ while (++i <= tend) d += abs(seq1[i] - seq2[i]);
289
+ // if (seq1[i] != seq2[i]) ++d;
290
+ d = 0;
291
+ tend = param.pchunks[++j].end;
292
+ i = param.pchunks[j].start - 1;
293
+ continue;
294
+ }
295
+ if (tend == i)
296
+ return false;
297
+ }
298
+ return true;
299
+ }
300
+
301
+ inline bool SketchSort::check_canonical(uint8_t *seq1, uint8_t *seq2, const params &param) {
302
+ size_t sb = 1, eb = 1;
303
+ size_t b;
304
+ for (size_t i = 0, size = param.blocks.size(); i < size; i++) {
305
+ eb = param.blocks[i];
306
+ for (b = sb; b < eb; b++) {
307
+ if (std::equal(seq1 + param.pos[b].start, seq1 + param.pos[b].end + 1, seq2 + param.pos[b].start))
308
+ return false;
309
+ }
310
+ sb = param.blocks[i] + 1;
311
+ }
312
+ return true;
313
+ }
314
+
315
+ inline void SketchSort::report(std::vector<uint8_t*> &sig, int l, int r, params &param, unsigned int _windowsize) {
316
+ // std::cout << "report" << std::endl;
317
+ float cosDist;
318
+ for (int i = l; i < r; i++) {
319
+ for (int j = i + 1; j <= r; j++) {
320
+ unsigned int span = abs(tws[param.ids[j]]-tws[param.ids[i]]);
321
+ if (_windowsize != 0 && ( span > _windowsize || span == 0 )){
322
+ //if (_windowsize != 0 && span > _windowsize)
323
+ continue;
324
+ }
325
+ if (check_canonical(sig[i], sig[j], param) &&
326
+ calc_chunk_hamdist(sig[i] + param.start_chunk, sig[j] + param.start_chunk, param) &&
327
+ check_chunk_canonical(sig[i], sig[j], param) &&
328
+ ((cosDist = checkCos(param.ids[i], param.ids[j])) <= param.cosDist)) {
329
+ (*param.os) << param.ids[i] << " " << param.ids[j] << " " << cosDist << std::endl;
330
+ }
331
+ }
332
+ }
333
+ }
334
+
335
+ void SketchSort::multi_classification(std::vector<uint8_t*> &sig, int maxind, int l, int r, params &param, unsigned int _windowsize) {
336
+
337
+ if (param.blocks.size() == param.numblocks - param.chunk_dist) {
338
+ report(sig, l, r, param, _windowsize);
339
+ return;
340
+ }
341
+
342
+ for (int bpos = maxind; bpos <= (int)param.numblocks; bpos++) {
343
+
344
+ if (param.blocks.size() + (param.numblocks - bpos + 1) < param.numblocks - param.chunk_dist) { // pruning
345
+ // std::cerr << "return " << std::endl;
346
+ return;
347
+ }
348
+ param.blocks.push_back(bpos);
349
+ sort(sig, param.pos[bpos].start, param.pos[bpos].end, l, r, param);
350
+ classify(sig, param.pos[bpos].start, param.pos[bpos].end, l, r, bpos, param, _windowsize);
351
+ param.blocks.pop_back();
352
+ }
353
+ }
354
+
355
// Binomial coefficient C(n, m) as a double.
//
// Computed as the product of (n - i) / (m - i) for i = 0 .. m - 1. Each
// factor is evaluated in floating point: with integer operands the division
// would truncate (for example 5 / 2 == 2) and give a wrong result such as
// C(5, 2) == 8. Returns 1 when m <= 0.
double combination(int n, int m) {
  double result = 1.0;
  for (int i = 0; i < m; i++) {
    result *= static_cast<double>(n - i) / static_cast<double>(m - i);
  }
  return result;
}
362
+
363
+ double SketchSort::calcMissingEdgeRatio(params &param) {
364
+ double sum = 0.f;
365
+ double prob = acos(1.0 - param.cosDist)/M_PI;
366
+ for (unsigned int k = 0; k <= param.chunk_dist; k++) {
367
+ sum += (combination(param.projectDim, k) * pow(prob, k) * pow(1 - prob, param.projectDim - k));
368
+ }
369
+ return pow(1.0 - sum, param.numchunks);
370
+ }
371
+
372
+ void SketchSort::preComputeNorms() {
373
+ norms.resize(fvs.size());
374
+ float sum;
375
+ for (size_t i = 0; i < fvs.size(); i++) {
376
+ boost::numeric::ublas::vector<float> &fv = fvs[i];
377
+ sum = 0.f;
378
+ for (size_t j = 0; j < fv.size(); j++) {
379
+ sum += pow(fv[j], 2);
380
+ }
381
+ norms[i] = 1.f/sqrt(sum);
382
+ }
383
+ }
384
+
385
+ void SketchSort::decideParameters(float _missingratio, params &param) {
386
+ unsigned int hamDist = 1;
387
+ unsigned int numBlocks = hamDist + 3;
388
+ unsigned int numchunks = 0;
389
+
390
+ do {
391
+ if (numchunks > 30) {
392
+ hamDist += 1;
393
+ numBlocks = hamDist + 3;
394
+ numchunks = 0;
395
+ }
396
+ numchunks += 1;
397
+ param.chunk_dist = hamDist;
398
+ param.numblocks = numBlocks;
399
+ param.numchunks = numchunks;
400
+ } while (calcMissingEdgeRatio(param) >= _missingratio);
401
+ }
402
+
403
+ void SketchSort::run(const char *fname, const char *oname,
404
+ unsigned int _numblocks,
405
+ unsigned int _dist,
406
+ float _cosDist,
407
+ unsigned int _numchunks,
408
+ bool _autoFlag,
409
+ float _missingratio,
410
+ bool _centering,
411
+ unsigned int _windowsize,
412
+ unsigned int _seed)
413
+ {
414
+ params param;
415
+ param.numblocks = _numblocks;
416
+ param.numchunks = _numchunks;
417
+ param.chunk_dist = _dist;
418
+ param.cosDist = _cosDist;
419
+ num_char = 2;
420
+ param.projectDim = 32;
421
+
422
+ numSort = 0;
423
+ numCosDist = 0;
424
+ numHamDist = 0;
425
+
426
+ if (_autoFlag) {
427
+ // std::cerr << "deciding parameters such that the missing edge ratio is no more than " << _missingratio << std::endl;
428
+ decideParameters(_missingratio, param);
429
+ // std::cout << "decided parameters:" << std::endl;
430
+ // std::cout << "hamming distance threshold: " << param.chunk_dist << std::endl;
431
+ // std::cout << "number of blocks: " << param.numblocks << std::endl;
432
+ // std::cout << "number of chunks: " << param.numchunks << std::endl;
433
+ // std::cout << std::endl;
434
+ }
435
+
436
+ std::ofstream ofs(oname);
437
+ param.os = &ofs;
438
+
439
+ //std::cout << "missing edge ratio:" << calcMissingEdgeRatio(param) << std::endl;
440
+
441
+ //std::cerr << "start reading" << std::endl;
442
+ double readstart = clock();
443
+ readFeature(fname,_windowsize);
444
+ double readend = clock();
445
+ //std::cerr << "end reading" << std::endl;
446
+ //std::cout << "readtime:" << (readend - readstart)/(double)CLOCKS_PER_SEC << std::endl;
447
+
448
+ if (_centering) {
449
+ //std::cerr << "start making input-data centered at 0" << std::endl;
450
+ double centeringstart = clock();
451
+ centeringData();
452
+ double centeringend = clock();
453
+ //std::cerr << "end making input-data centered at 0" << std::endl;
454
+ //std::cout << "centering time:" << (centeringend - centeringstart)/(double)CLOCKS_PER_SEC << std::endl;
455
+
456
+ }
457
+
458
+
459
+ double totalstart = clock();
460
+ preComputeNorms();
461
+ //param.projectDim = 2*(int)log(dim);
462
+
463
+ param.counter = new unsigned int[num_char];
464
+
465
+ //std::cout << "number of data:" << fvs.size() << std::endl;
466
+ //std::cout << "data dimension:" << dim << std::endl;
467
+ //std::cout << "projected dimension:" << param.projectDim << std::endl;
468
+ //std::cout << "length of strings:" << param.projectDim * param.numchunks << std::endl;
469
+ //std::cout << "number of chunks:" << param.numchunks << std::endl;
470
+
471
+ double projectstart = clock();
472
+ //std::cerr << "start projection" << std::endl;
473
+ std::vector<uint8_t*> sig;
474
+ projectVectors(param.projectDim * param.numchunks, sig, _seed, param);
475
+ //read(fname, sig, param);
476
+ //std::cerr << "end projection" << std::endl;
477
+ double projectend = clock();
478
+ //std::cout << "projecttime:" << (projectend - projectstart)/(double)CLOCKS_PER_SEC << std::endl;
479
+
480
+ param.pchunks = new pstat[param.numchunks + 1];
481
+ for (int i = 1; i <= (int)param.numchunks; i++) {
482
+ param.pchunks[i].start = (int)ceil((double)param.seq_len*((double)(i - 1)/(double)param.numchunks)) + 1;
483
+ param.pchunks[i].end = (int)ceil((double)param.seq_len*(double)i/(double)param.numchunks);
484
+ }
485
+
486
+ double msmtime = 0.0;
487
+
488
+
489
+ //std::cerr << "chunk distance:" << param.chunk_dist << std::endl;
490
+ //std::cerr << "the number of blocks:" << param.numblocks << std::endl;
491
+ param.pos = new pstat[param.numblocks + 1];
492
+ for (int i = 1; i <= (int) param.numchunks; i++) {
493
+ param.chunk_len = param.pchunks[i].end - param.pchunks[i].start + 1;
494
+ param.start_chunk = param.pchunks[i].start;
495
+ param.end_chunk = param.pchunks[i].end;
496
+ param.cchunk = i;
497
+ for (int j = 1; j <= (int)param.numblocks; j++) {
498
+ param.pos[j].start = (int)ceil((double)param.chunk_len*((double)(j - 1)/(double)param.numblocks)) + param.pchunks[i].start;
499
+ param.pos[j].end = (int)ceil((double)param.chunk_len*(double)j/(double)param.numblocks) + param.pchunks[i].start - 1;
500
+ }
501
+ //std::cerr << "start enumeration chunk no " << i << std::endl;
502
+ double msmstart = clock();
503
+ //std::cout << "sig=" << sig << std::endl;
504
+ //std::cout << "param.num_seq=" << param.num_seq << std::endl;
505
+ //std::cout << "param=" << param << std::endl;
506
+ multi_classification(sig, 1, 0, param.num_seq - 1, param, _windowsize);
507
+ double msmend = clock();
508
+ msmtime += (msmend - msmstart)/(double)CLOCKS_PER_SEC;
509
+ }
510
+ //std::cout << "msmtime:" << msmtime << std::endl;
511
+
512
+ double totalend = clock();
513
+ //std::cout << "cputime:" << (totalend - totalstart)/(double)CLOCKS_PER_SEC << std::endl;
514
+
515
+ //std::cout << "numSort:" << combination(param.numblocks, param.chunk_dist) * param.numchunks << std::endl;
516
+ //std::cout << "numHamDist:" << numHamDist << std::endl;
517
+ //std::cout << "numCosDist:" << numCosDist << std::endl;
518
+ ofs.close();
519
+ // destructor
520
+ delete p;
521
+ delete[] param.counter;
522
+ delete[] param.pchunks;
523
+ delete[] param.pos;
524
+
525
+ return;
526
+ }