effspm 0.2.8__cp39-cp39-win_amd64.whl → 0.3.3__cp39-cp39-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- effspm/_effspm.cp39-win_amd64.pyd +0 -0
- effspm/_effspm.cpp +961 -210
- effspm/btminer/src/build_mdd.cpp +42 -17
- effspm/btminer/src/build_mdd.hpp +13 -19
- effspm/btminer/src/freq_miner.cpp +134 -49
- effspm/btminer/src/freq_miner.hpp +16 -0
- effspm/btminer/src/load_inst.cpp +211 -126
- effspm/btminer/src/load_inst.hpp +22 -4
- effspm/btminer/src/main.cpp +83 -0
- effspm/btminer/src/utility.cpp +26 -41
- effspm/btminer/src/utility.hpp +6 -30
- effspm/freq_miner.hpp +2 -1
- effspm/htminer/src/build_mdd.cpp +46 -124
- effspm/htminer/src/build_mdd.hpp +56 -49
- effspm/htminer/src/freq_miner.cpp +341 -307
- effspm/htminer/src/freq_miner.hpp +39 -40
- effspm/htminer/src/load_inst.cpp +287 -336
- effspm/htminer/src/load_inst.hpp +23 -6
- effspm/htminer/src/main.cpp +97 -0
- effspm/htminer/src/utility.cpp +38 -57
- effspm/htminer/src/utility.hpp +9 -64
- effspm/largebm/src/build_mdd.cpp +69 -110
- effspm/largebm/src/build_mdd.hpp +22 -37
- effspm/largebm/src/freq_miner.cpp +241 -291
- effspm/largebm/src/freq_miner.hpp +25 -36
- effspm/largebm/src/load_inst.cpp +20 -26
- effspm/largebm/src/load_inst.hpp +24 -34
- effspm/largebm/src/main.cpp +95 -0
- effspm/largebm/src/utility.cpp +11 -21
- effspm/largebm/src/utility.hpp +7 -10
- effspm/largehm/src/build_mdd.cpp +75 -110
- effspm/largehm/src/build_mdd.hpp +53 -73
- effspm/largehm/src/freq_miner.cpp +134 -191
- effspm/largehm/src/freq_miner.hpp +37 -60
- effspm/largehm/src/load_inst.cpp +137 -174
- effspm/largehm/src/load_inst.hpp +13 -50
- effspm/largehm/src/main.cpp +95 -0
- effspm/largehm/src/utility.cpp +46 -28
- effspm/largehm/src/utility.hpp +18 -16
- effspm/largepp/src/freq_miner.cpp +184 -156
- effspm/largepp/src/freq_miner.hpp +11 -36
- effspm/largepp/src/load_inst.cpp +32 -12
- effspm/largepp/src/load_inst.hpp +15 -9
- effspm/largepp/src/main.cpp +108 -0
- effspm/largepp/src/pattern.hpp +31 -0
- effspm/load_inst.cpp +8 -8
- effspm/load_inst.hpp +1 -1
- effspm/main.cpp +103 -0
- {effspm-0.2.8.dist-info → effspm-0.3.3.dist-info}/METADATA +1 -1
- effspm-0.3.3.dist-info/RECORD +60 -0
- effspm-0.2.8.dist-info/RECORD +0 -53
- {effspm-0.2.8.dist-info → effspm-0.3.3.dist-info}/WHEEL +0 -0
- {effspm-0.2.8.dist-info → effspm-0.3.3.dist-info}/licenses/LICENSE +0 -0
- {effspm-0.2.8.dist-info → effspm-0.3.3.dist-info}/top_level.txt +0 -0
effspm/htminer/src/load_inst.cpp
CHANGED
|
@@ -1,394 +1,345 @@
|
|
|
1
1
|
#include <iostream>
|
|
2
|
-
#include <fstream>
|
|
3
2
|
#include <sstream>
|
|
4
|
-
#include <
|
|
5
|
-
#include <
|
|
3
|
+
#include <algorithm>
|
|
4
|
+
#include <math.h>
|
|
6
5
|
#include "load_inst.hpp"
|
|
7
|
-
#include "freq_miner.hpp"
|
|
8
6
|
#include "utility.hpp"
|
|
9
7
|
#include "build_mdd.hpp"
|
|
8
|
+
#include "freq_miner.hpp"
|
|
10
9
|
|
|
11
10
|
namespace htminer {
|
|
12
|
-
using namespace std;
|
|
13
11
|
|
|
14
|
-
|
|
15
|
-
// unsigned long long E = 0;
|
|
12
|
+
using namespace std;
|
|
16
13
|
|
|
17
|
-
//
|
|
14
|
+
// ✅ Fix types here: M, mlim are unsigned int; N, L, theta, E are unsigned long long
|
|
15
|
+
unsigned int M = 0, mlim = 0;
|
|
16
|
+
unsigned long long N = 0, L = 0, theta = 0, E = 0;
|
|
18
17
|
|
|
19
18
|
bool itmset_exists = 0;
|
|
20
19
|
|
|
21
|
-
vector<int>
|
|
22
|
-
|
|
23
|
-
|
|
20
|
+
vector<int> item_dic;
|
|
21
|
+
vector<Pattern> DFS;
|
|
22
|
+
vector<VPattern> VDFS;
|
|
23
|
+
|
|
24
|
+
string out_file;
|
|
25
|
+
string folder;
|
|
26
|
+
|
|
27
|
+
bool b_disp = 0;
|
|
28
|
+
bool b_write = 0;
|
|
29
|
+
bool use_dic = 0;
|
|
30
|
+
bool just_build = 0;
|
|
31
|
+
bool pre_pro = 1;
|
|
32
|
+
|
|
33
|
+
unsigned int time_limit = 10 * 3600;
|
|
34
|
+
clock_t start_time;
|
|
24
35
|
|
|
25
36
|
void Load_items_pre(string &inst_name);
|
|
26
37
|
bool Load_items(string &inst_name);
|
|
27
38
|
bool Preprocess(string& inst, double thresh);
|
|
28
39
|
|
|
29
|
-
bool Load_instance(
|
|
30
|
-
// Debug: entry
|
|
31
|
-
// std::cerr << "[HTMiner::Load_instance] called with file=\"" << items_file
|
|
32
|
-
// << "\" minsup=" << thresh << std::endl;
|
|
33
|
-
|
|
34
|
-
// ── RESET EVERYTHING BEFORE BUILDING ───────────────────────────────────
|
|
35
|
-
Tree.clear();
|
|
36
|
-
CTree.clear();
|
|
37
|
-
VTree.clear();
|
|
38
|
-
DFS.clear();
|
|
39
|
-
VDFS.clear();
|
|
40
|
-
N = 0;
|
|
41
|
-
M = 0;
|
|
42
|
-
L = 0;
|
|
43
|
-
E = 0;
|
|
44
|
-
itmset_exists = false;
|
|
40
|
+
/**
 * Reads the sequence file `items_file` and builds the MDD from it.
 *
 * With `pre_pro` set, the file is read twice: Preprocess() gathers the
 * statistics and chooses the threshold, then Load_items_pre() inserts the
 * filtered sequences. Otherwise Load_items() inserts every sequence in a
 * single pass and the threshold is derived afterwards from the line count.
 *
 * @param items_file  path of the input file.
 * @param thresh      support threshold; values below 1 are taken as a
 *                    fraction of the number of sequences, other values as an
 *                    absolute count.
 * @return true on success, false when the file could not be read.
 */
bool Load_instance(string& items_file, double thresh) {

    clock_t phase_start = clock();

    // The trie needs its root node before any sequence is inserted.
    Tree.emplace_back(0, 0, 0);

    if (!pre_pro) {
        // Single-pass path: load everything, then fix the threshold.
        if (!Load_items(items_file))
            return false;

        theta = (thresh < 1) ? (unsigned long long)ceil(thresh * N)
                             : (unsigned long long)thresh;
    }
    else {
        // Two-pass path: statistics first, insertion second.
        if (!Preprocess(items_file, thresh))
            return false;

        if (b_disp)
            cout << "\nPreprocess done in " << give_time(clock() - phase_start) << " seconds\n\n";

        // One DFS entry per kept item, initialised with -1, -2, ..., -L.
        DFS.reserve((size_t)L);
        for (int item = 0; item < (int)L; ++item)
            DFS.emplace_back(-item - 1);

        phase_start = clock();

        Load_items_pre(items_file);

        // Only very large trees are worth trimming back to their exact size.
        if (Tree.size() > 100000000) {
            Tree.shrink_to_fit();
            CTree.shrink_to_fit();
            VTree.shrink_to_fit();
        }
    }

    if (b_disp) {
        cout << "\nMDD Database built in " << give_time(clock() - phase_start) << " seconds\n\n";
        cout << "Found " << N << " sequence, with max line len " << M
             << ", and " << L << " items, and " << E << " enteries\n";
    }

    return true;
}
|
|
121
84
|
|
|
122
|
-
|
|
123
85
|
/**
 * First pass over the input file.
 *
 * Counts sequences, per-item support and per-position statistics, derives the
 * absolute threshold `theta`, builds the renumbering table `item_dic` for the
 * items whose support reaches `theta`, and chooses the split point `mlim`.
 * On success L holds the number of kept items and N, M and E are reset to 0
 * so that Load_items_pre() can recount them on the filtered data.
 *
 * @param inst    path of the input file.
 * @param thresh  support threshold; values below 1 are taken as a fraction of
 *                the number of lines read, other values as an absolute count.
 * @return 1 on success, 0 when the file cannot be opened.
 */
bool Preprocess(string &inst, double thresh) {

    // MN[p]: number of tokens seen at 0-based position p (first 99 positions only).
    vector<unsigned long long int> MN(100, 0);
    // ML[p][k]: true when item k+1 was seen at position p.
    vector<vector<bool>> ML(100, vector<bool>(1000000, 0));

    ifstream file(inst);

    // freq[k]: number of lines in which item k+1 occurs at least once.
    vector<unsigned int> freq(1000000, 0);
    // counted[k]: number of the last line that incremented freq[k].
    vector<unsigned long long int> counted(1000000, 0);

    if (file.good()) {
        string line;
        int ditem;
        while (getline(file, line) &&
               give_time(clock() - start_time) < time_limit) {
            ++N;

            istringstream word(line);
            string itm;
            int size_m = 0;
            while (word >> itm) {
                ditem = stoi(itm);

                // Ids are used as 1-based indices below; a literal 0 would
                // address element -1 of every table, so it is skipped.
                if (ditem == 0)
                    continue;

                ++size_m;

                if (ditem > 0)
                    itmset_exists = 1;
                else
                    ditem *= -1;

                const size_t id = static_cast<size_t>(ditem);

                if (static_cast<size_t>(size_m) < MN.size()) {
                    ++MN[size_m - 1];
                    // Grow this position's row on demand for ids above its size.
                    if (ML[size_m - 1].size() < id)
                        ML[size_m - 1].resize(id, false);
                    ML[size_m - 1][id - 1] = 1;
                }

                if (L < static_cast<unsigned long long>(ditem))
                    L = static_cast<unsigned long long>(ditem);

                if (freq.size() < static_cast<size_t>(L)) {
                    freq.resize(static_cast<size_t>(L), 0);
                    counted.resize(static_cast<size_t>(L), 0);
                }

                // Count each item at most once per line.
                if (counted[id - 1] != N) {
                    ++freq[id - 1];
                    counted[id - 1] = N;
                }
            }
            if (size_m > static_cast<int>(M))
                M = static_cast<unsigned int>(size_m);
        }
    }
    else {
        cout << "!!!!!! No such file exists: " << inst << " !!!!!!\n";
        return 0;
    }

    if (thresh < 1)
        theta = static_cast<unsigned long long>(ceil(thresh * N));
    else
        theta = static_cast<unsigned long long>(thresh);

    // Renumber the items that reach the threshold as 1..real_L.
    int real_L = 0;
    item_dic = vector<int>(static_cast<size_t>(L), -1);
    vector<bool> item_in(static_cast<size_t>(L), 0);
    for (size_t i = 0; i < static_cast<size_t>(L); ++i) {
        if (freq[i] >= theta) {
            item_dic[i] = ++real_L;
            item_in[i] = 1;
        }
    }

    if (b_disp)
        cout << "Original number of items: " << L
             << " Reduced to: " << real_L << endl;

    // Search for the split point. LpM is the running product of
    // (kept items seen at a position) * coef over positions 0..i.
    unsigned long long int LpM = 1;
    mlim = M;
    const int ulim = std::min(3 + real_L / 5, 10);
    const int coef = 1 + 1 * itmset_exists;
    for (int i = 0; i + ulim < static_cast<int>(MN.size()) && i + ulim < static_cast<int>(M); ++i) {
        // A row is only as long as the largest id seen at that position, so
        // it can be shorter than L; never read past its end.
        const size_t row_len = std::min<size_t>(ML[i].size(), static_cast<size_t>(L));
        unsigned long long int ml = 0;
        for (size_t j = 0; j < row_len; ++j) {
            if (ML[i][j] && item_in[j])
                ++ml;
        }
        LpM *= ml * coef;

        if (LpM * ulim > MN[i]) {
            while (i + ulim - 1 < static_cast<int>(MN.size()) && i + ulim - 1 < static_cast<int>(M)) {
                // Position i-1 does not exist for i == 0; reading MN[-1] was
                // out of bounds, so the test only applies from i == 1 on.
                // NOTE(review): confirm that skipping i == 0 is the intended rule.
                if (i > 0 &&
                    MN[i - 1] - MN[i + ulim - 1] < MN[i + ulim - 1] &&
                    MN[i + ulim - 1] < 600000000) {
                    mlim = i - 1;
                    break;
                }
                i += 1;
            }
            break;
        }
    }

    if (mlim < M) {
        for (int i = 0; i < real_L; ++i)
            VDFS.emplace_back(i);
        // Pre-size the containers only for very large inputs.
        if (MN[mlim + ulim] > 100000000) {
            CTree.reserve(MN[mlim + ulim] / 2);
            VTree.reserve(MN[mlim + ulim] / 2);
            Tree.reserve((N - MN[mlim + ulim]) * 2);
        }
    }
    else if (N > 100000000)
        Tree.reserve(500000000);

    // Hand the counters over to the second pass: L becomes the number of kept
    // items; N, M and E are recounted on the filtered data. E is reset too,
    // otherwise the reported entry count would add both passes together.
    L = static_cast<unsigned long long>(real_L);
    N = 0;
    M = 0;
    E = 0;

    return 1;
}
|
|
386
229
|
|
|
387
230
|
|
|
231
|
+
/**
 * Second pass over the input file, used after Preprocess().
 *
 * Every token is translated through `item_dic`; tokens without an entry are
 * dropped. The surviving tokens of a line are split at `mlim` into two
 * vectors that are passed to Build_MDD(). N, M and E are updated from the
 * filtered data. A file that cannot be opened is silently ignored.
 *
 * @param inst_name  path of the input file.
 */
void Load_items_pre(string &inst_name) {

    ifstream file(inst_name);

    if (file.good()) {
        string line;
        int ditem;
        while (getline(file, line) &&
               give_time(clock() - start_time) < time_limit) {
            istringstream word(line);
            string itm;
            vector<int> temp_vec;
            vector<int> temp_lim;
            // Remembers that a dropped token was negative, so that the sign
            // can be moved onto the next token that is kept.
            bool sgn = 0;
            while (word >> itm) {

                ditem = stoi(itm);

                // Ids are 1-based; 0 would address item_dic[-1].
                if (ditem == 0)
                    continue;

                const size_t slot = static_cast<size_t>(std::abs(ditem)) - 1;

                // Dropped item, or an id outside the table built by
                // Preprocess(): skip it, but keep track of its sign.
                if (slot >= item_dic.size() || item_dic[slot] == -1) {
                    if (!sgn)
                        sgn = ditem < 0;
                    continue;
                }

                // Renumber, preserving the original sign.
                ditem = (ditem > 0) ? item_dic[slot] : -item_dic[slot];

                if (sgn) {
                    if (ditem > 0)
                        ditem = -ditem;
                    sgn = 0;
                }

                if (temp_vec.size() <= mlim)
                    temp_vec.push_back(ditem);
                else
                    temp_lim.push_back(ditem);

                ++E;
            }

            // Lines left without any kept token are not counted or inserted.
            if (temp_vec.empty())
                continue;

            ++N;

            if (temp_vec.size() + temp_lim.size() > M)
                M = static_cast<unsigned int>(temp_vec.size() + temp_lim.size());

            Build_MDD(temp_vec, temp_lim);
        }
    }
}
|
|
290
|
+
|
|
291
|
+
/**
 * Single-pass loader used when preprocessing is disabled.
 *
 * Every line is split at `mlim` into two vectors that are passed to
 * Build_MDD(). While reading, N (lines), L (largest absolute item id),
 * M (longest line, in tokens) and E (tokens) are updated, `itmset_exists` is
 * set when a positive token is seen, and DFS is extended so that it has one
 * element per item id.
 *
 * @param inst_name  path of the input file.
 * @return 1 on success, 0 when the file cannot be opened.
 */
bool Load_items(string &inst_name) {

    ifstream file(inst_name);

    if (!file.good()) {
        cout << "!!!!!! No such file exists: " << inst_name << " !!!!!!\n";
        return 0;
    }

    string line;
    while (getline(file, line) &&
           give_time(clock() - start_time) < time_limit) {
        ++N;

        istringstream word(line);
        string itm;
        vector<int> temp_vec;
        vector<int> temp_lim;
        while (word >> itm) {
            const int ditem = stoi(itm);
            if (ditem > 0)
                itmset_exists = 1;

            const unsigned long long id = (unsigned long long)std::abs(ditem);
            if (L < id) {
                L = id;
                // Extend DFS with -1, -2, ... up to the new largest id. No
                // exact-size reserve here: it would reallocate every time L
                // grows; emplace_back already grows the buffer geometrically.
                while (DFS.size() < static_cast<size_t>(L))
                    DFS.emplace_back(-static_cast<int>(DFS.size()) - 1);
            }

            if (temp_vec.size() < mlim)
                temp_vec.push_back(ditem);
            else
                temp_lim.push_back(ditem);

            ++E;
        }

        // M is the full line length. The previous code compared against the
        // total but stored only temp_vec.size(), so M stayed 0 when mlim was 0.
        const size_t line_len = temp_vec.size() + temp_lim.size();
        if (line_len > M)
            M = static_cast<unsigned int>(line_len);

        Build_MDD(temp_vec, temp_lim);
    }

    return 1;
}
|
|
394
344
|
|
|
345
|
+
} // namespace htminer
|