effspm 0.1.7__cp312-cp312-win_amd64.whl → 0.2.6__cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- effspm/__init__.py +10 -2
- effspm/_effspm.cp312-win_amd64.pyd +0 -0
- effspm/_effspm.cpp +609 -0
- effspm/btminer/src/build_mdd.cpp +63 -0
- effspm/btminer/src/build_mdd.hpp +40 -0
- effspm/btminer/src/freq_miner.cpp +179 -0
- effspm/btminer/src/freq_miner.hpp +39 -0
- effspm/btminer/src/load_inst.cpp +200 -0
- effspm/btminer/src/load_inst.hpp +25 -0
- effspm/btminer/src/utility.cpp +65 -0
- effspm/btminer/src/utility.hpp +40 -0
- effspm/freq_miner.hpp +4 -1
- effspm/htminer/src/build_mdd.cpp +192 -0
- effspm/htminer/src/build_mdd.hpp +64 -0
- effspm/htminer/src/freq_miner.cpp +350 -0
- effspm/htminer/src/freq_miner.hpp +60 -0
- effspm/htminer/src/load_inst.cpp +394 -0
- effspm/htminer/src/load_inst.hpp +23 -0
- effspm/htminer/src/utility.cpp +72 -0
- effspm/htminer/src/utility.hpp +77 -0
- effspm/largebm/src/build_mdd.cpp +137 -0
- effspm/largebm/src/build_mdd.hpp +47 -0
- effspm/largebm/src/freq_miner.cpp +349 -0
- effspm/largebm/src/freq_miner.hpp +48 -0
- effspm/largebm/src/load_inst.cpp +230 -0
- effspm/largebm/src/load_inst.hpp +45 -0
- effspm/largebm/src/utility.cpp +45 -0
- effspm/largebm/src/utility.hpp +18 -0
- effspm/largehm/src/build_mdd.cpp +174 -0
- effspm/largehm/src/build_mdd.hpp +93 -0
- effspm/largehm/src/freq_miner.cpp +445 -0
- effspm/largehm/src/freq_miner.hpp +77 -0
- effspm/largehm/src/load_inst.cpp +357 -0
- effspm/largehm/src/load_inst.hpp +64 -0
- effspm/largehm/src/utility.cpp +38 -0
- effspm/largehm/src/utility.hpp +29 -0
- effspm/largepp/src/freq_miner.cpp +170 -0
- effspm/largepp/src/freq_miner.hpp +43 -0
- effspm/largepp/src/load_inst.cpp +219 -0
- effspm/largepp/src/load_inst.hpp +28 -0
- effspm/largepp/src/utility.cpp +34 -0
- effspm/largepp/src/utility.hpp +21 -0
- effspm/load_inst.hpp +2 -1
- effspm-0.2.6.dist-info/METADATA +237 -0
- effspm-0.2.6.dist-info/RECORD +53 -0
- {effspm-0.1.7.dist-info → effspm-0.2.6.dist-info}/WHEEL +1 -1
- effspm/_core.cp312-win_amd64.pyd +0 -0
- effspm-0.1.7.dist-info/METADATA +0 -38
- effspm-0.1.7.dist-info/RECORD +0 -14
- {effspm-0.1.7.dist-info → effspm-0.2.6.dist-info}/licenses/LICENSE +0 -0
- {effspm-0.1.7.dist-info → effspm-0.2.6.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,357 @@
|
|
|
1
|
+
#include <iostream>
|
|
2
|
+
#include <sstream>
|
|
3
|
+
#include <algorithm>
|
|
4
|
+
#include <fstream>
|
|
5
|
+
#include <cmath>
|
|
6
|
+
#include <ctime>
|
|
7
|
+
|
|
8
|
+
#include "load_inst.hpp"
|
|
9
|
+
#include "utility.hpp"
|
|
10
|
+
#include "build_mdd.hpp"
|
|
11
|
+
#include "freq_miner.hpp"
|
|
12
|
+
|
|
13
|
+
namespace largehm {
|
|
14
|
+
using namespace std;
|
|
15
|
+
|
|
16
|
+
string out_file;
|
|
17
|
+
string folder;
|
|
18
|
+
|
|
19
|
+
bool b_disp = false;
|
|
20
|
+
bool b_write = false;
|
|
21
|
+
bool use_dic = false;
|
|
22
|
+
bool use_list = false;
|
|
23
|
+
bool just_build = false;
|
|
24
|
+
bool pre_pro = false;
|
|
25
|
+
bool itmset_exists = false;
|
|
26
|
+
|
|
27
|
+
unsigned int M = 0;
|
|
28
|
+
unsigned int L = 0;
|
|
29
|
+
unsigned int mlim = 0;
|
|
30
|
+
unsigned int time_limit = 0;
|
|
31
|
+
|
|
32
|
+
unsigned long long int N = 0;
|
|
33
|
+
unsigned long long int theta = 0;
|
|
34
|
+
unsigned long long int E = 0;
|
|
35
|
+
|
|
36
|
+
clock_t start_time = 0;
|
|
37
|
+
|
|
38
|
+
vector<vector<int>> items;
|
|
39
|
+
|
|
40
|
+
vector<int> item_dic;
|
|
41
|
+
vector<Pattern> DFS;
|
|
42
|
+
vector<VPattern> VDFS;
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
// Resets every piece of loader/miner state, reads `items_file` through
// Load_items (which feeds Build_MDD line by line) and derives the absolute
// support threshold `theta` from `thresh`:
//   thresh <  1.0  → fraction of the N lines read, rounded up
//   otherwise      → used as an absolute count (truncated).
// Returns false when Load_items fails (input file not readable), true otherwise.
bool Load_instance(string& items_file, double thresh) {
    // Drop whatever a previous run may have left behind.
    Tree.clear();
    VTree.clear();
    CTree.clear();
    DFS.clear();
    VDFS.clear();
    item_dic.clear();
    items.clear();

    N = 0;
    M = 0;
    L = 0;
    E = 0;
    theta = 0;
    itmset_exists = false;

    const clock_t load_begin = clock();

    // Fresh dummy root node.
    Tree.emplace_back(0, 0, 0);

    // Both modes read the file the same way.
    if (!Load_items(items_file))
        return false;

    // Only the non-preprocessing mode tops up the miner stacks here.
    if (!pre_pro) {
        DFS.reserve(L);
        for (size_t slot = DFS.size(); slot < L; ++slot)
            DFS.emplace_back(-static_cast<int>(slot) - 1);

        VDFS.reserve(L);
        for (size_t slot = VDFS.size(); slot < L; ++slot)
            VDFS.emplace_back(static_cast<int>(slot));
    }

    theta = (thresh < 1.0)
                ? static_cast<unsigned long long>(ceil(thresh * N))
                : static_cast<unsigned long long>(thresh);

    cout << "\nMDD Database built in " << give_time(clock() - load_begin) << " seconds\n\n";
    cout << "Found " << N << " sequence, with max line len " << M
         << ", and " << L << " items, and " << E << " enteries\n";
    // cout << "Total Trie nodes: " << Tree.size()
    //      << " Total CTree nodes: " << CTree.size()
    //      << " Total VTree nodes: " << VTree.size() << endl;

    return true;
}
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
// First pass over the input file `inst`.
//
// Gathers per-item line frequencies, computes `theta` from `thresh`, builds
// `item_dic` (original id - 1 → compressed id, or -1 for items whose line
// frequency is below theta) and picks the prefix cut-off `mlim` from the
// per-position statistics. Before returning it stores the number of retained
// items in L and resets N and M to 0.
//
// Returns false when the file cannot be opened, true otherwise.
//
// Compared with the previous version, three out-of-range accesses are closed:
//   * a literal `0` token (item ids are used as 1-based indices) is skipped
//     instead of indexing element -1;
//   * the per-position scan never reads past the end of a row of ML;
//   * the "lines reaching the previous position" count is N when the current
//     position is the first one, instead of reading MN[-1].
bool Preprocess(string &inst, double thresh) {
    // MN[p]    : number of lines that have a token at 0-based position p
    //            (only positions below MN.size() - 1 are profiled).
    // ML[p][k] : whether item k + 1 was seen at position p.
    vector<unsigned long long int> MN(100, 0);
    vector<vector<bool>> ML(100, vector<bool>(1000000, false));

    ifstream file(inst);
    if (!file.good()) {
        cout << "!!!!!! No such file exists: " << inst << " !!!!!!\n";
        return false;
    }

    // freq[k]    : number of lines containing item k + 1.
    // counted[k] : number (N) of the last line that bumped freq[k], so an
    //              item is counted at most once per line.
    vector<unsigned long long int> freq(1000000, 0ULL);
    vector<unsigned long long int> counted(1000000, 0ULL);

    string line;
    while (getline(file, line) && give_time(clock() - start_time) < time_limit) {
        ++N;
        if (N % 10000000 == 0)
            cout << "N: " << N << endl;

        istringstream word(line);
        string itm;
        int size_m = 0;
        while (word >> itm) {
            int ditem = stoi(itm);

            // 0 is not a valid 1-based item id; using it would index slot -1.
            if (ditem == 0)
                continue;
            ++size_m;

            if (ditem > 0)
                itmset_exists = true;
            else
                ditem = -ditem;

            const size_t slot = static_cast<size_t>(ditem) - 1;

            if (size_m < static_cast<int>(MN.size())) {
                vector<bool> &seen_here = ML[size_m - 1];
                ++MN[size_m - 1];
                if (seen_here.size() <= slot)
                    seen_here.resize(slot + 1, false);
                seen_here[slot] = true;
            }

            if (L < static_cast<unsigned int>(ditem))
                L = static_cast<unsigned int>(ditem);

            if (freq.size() <= slot) {
                freq.resize(slot + 1, 0ULL);
                counted.resize(slot + 1, 0ULL);
            }
            if (counted[slot] != N) {
                ++freq[slot];
                counted[slot] = N;
            }
        }
        if (size_m > static_cast<int>(M))
            M = size_m;
    }

    if (thresh < 1.0) {
        theta = static_cast<unsigned long long>(ceil(thresh * N));
    } else {
        theta = static_cast<unsigned long long>(thresh);
    }

    // Assign compressed ids 1..real_L to the items that meet the threshold.
    int real_L = 0;
    item_dic.assign(L, -1);
    vector<bool> item_in(L, false);
    for (int i = 0; i < static_cast<int>(L); ++i) {
        if (freq[i] >= theta) {
            item_dic[i] = ++real_L;
            item_in[i] = true;
        }
    }

    cout << "Original number of items: " << L << " Reduced to: " << real_L << endl;

    unsigned long long int LpM = 1;
    mlim = M;
    int orgmlim = 0;
    const int ulim = min(1 + real_L / 4, 10);

    for (int i = 0; i + ulim < static_cast<int>(MN.size()) && i + ulim < static_cast<int>(M); ++i) {
        // Count retained items seen at position i. Rows of ML only grow on
        // demand, so a row may be shorter than L; slots past its end were
        // never seen at this position.
        const vector<bool> &seen_here = ML[i];
        const size_t scan_end = min(static_cast<size_t>(L), seen_here.size());
        unsigned long long int ml = 0;
        for (size_t j = 0; j < scan_end; ++j) {
            if (seen_here[j] && item_in[j])
                ++ml;
        }

        LpM *= ml * (1 + itmset_exists);
        cout << ml << " " << LpM << " " << MN[i] << endl;

        if (LpM * ulim > MN[i]) {
            orgmlim = i;
            while (i + ulim - 1 < static_cast<int>(MN.size()) && i + ulim - 1 < static_cast<int>(M)) {
                // Lines reaching the previous position; every one of the N
                // lines "reaches" the position before the first.
                const unsigned long long int reach_prev = (i == 0) ? N : MN[i - 1];
                const unsigned long long int reach_far = MN[i + ulim - 1];

                cout << (reach_prev - reach_far) << " "
                     << reach_far << endl;
                if ((reach_prev - reach_far) < reach_far
                    && reach_far < 600000000) {
                    // NOTE(review): for i == 0 this stores -1 into the
                    // unsigned mlim (i.e. its maximum value) — confirm that
                    // this "no cut-off" outcome is intended.
                    mlim = i - 1;
                    break;
                }
                ++i;
            }
            break;
        }
    }

    cout << "M is: " << M << " Mlim is: " << mlim
         << " ulim is: " << ulim
         << " original mlim is: " << orgmlim
         << " guess is: "
         << round((log(N) - log(6)) / log(real_L)) << endl;

    if (mlim < M) {
        for (int i = 0; i < real_L; ++i)
            VDFS.emplace_back(i);
    }

    L = static_cast<unsigned int>(real_L);
    N = 0;
    M = 0;
    return true;
}
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
// Second pass over the input file `inst_name`, run after Preprocess.
//
// Every token is translated through `item_dic`; items without a compressed id
// are dropped. When a dropped token was negative, the next retained token of
// the same line is forced negative so that marker is not lost. The
// translated line is split at `mlim` into two vectors and handed to
// Build_MDD. Lines with no retained item are skipped and do not count
// towards N.
//
// Returns false when the file cannot be opened, true otherwise.
//
// Compared with the previous version the dictionary lookup is bounds-checked:
// a `0` token, an id larger than the dictionary, or an empty dictionary are
// treated like a dropped item instead of reading outside `item_dic`.
bool Load_items_pre(string &inst_name) {
    ifstream file(inst_name);
    if (!file.good()) {
        cout << "!!!!!! No such file exists: " << inst_name << " !!!!!!\n";
        return false;
    }

    string line;
    while (getline(file, line) && give_time(clock() - start_time) < time_limit) {
        istringstream word(line);
        string itm;
        vector<int> temp_vec;
        vector<int> temp_lim;
        bool sgn = false;  // a negative token was dropped and its sign is still pending

        // L is final from Preprocess
        while (word >> itm) {
            int ditem = stoi(itm);

            // Magnitude computed in a wider type so the most negative int is safe too.
            const unsigned long long id =
                ditem < 0 ? static_cast<unsigned long long>(-static_cast<long long>(ditem))
                          : static_cast<unsigned long long>(ditem);
            const bool retained =
                id >= 1 && id <= item_dic.size() && item_dic[id - 1] != -1;

            if (!retained) {
                if (!sgn)
                    sgn = (ditem < 0);
                continue;
            }

            // Translate to the compressed id, keeping the token's sign.
            ditem = (ditem > 0) ? item_dic[id - 1] : -item_dic[id - 1];

            if (sgn) {
                if (ditem > 0)
                    ditem = -ditem;
                sgn = false;
            }

            // NOTE(review): this keeps mlim + 1 tokens in the first vector,
            // whereas Load_items keeps mlim — confirm the difference is intended.
            if (temp_vec.size() <= static_cast<size_t>(mlim))
                temp_vec.push_back(ditem);
            else
                temp_lim.push_back(ditem);
        }

        if (temp_vec.empty())
            continue;

        ++N;
        if (N % 10000000 == 0)
            cout << N << endl;

        const size_t line_len = temp_vec.size() + temp_lim.size();
        if (line_len > static_cast<size_t>(M))
            M = static_cast<unsigned int>(line_len);

        // Make sure DFS / VDFS hold one entry per item before Build_MDD runs.
        while (DFS.size() < L)
            DFS.emplace_back(-static_cast<int>(DFS.size()) - 1);
        while (VDFS.size() < L)
            VDFS.emplace_back(static_cast<int>(VDFS.size()));

        Build_MDD(temp_vec, temp_lim);
    }

    return true;
}
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
// Single-pass loader: reads `inst_name` line by line, updates the database
// statistics (N, M, L, E, itmset_exists), splits every line at `mlim` into
// two vectors and passes them to Build_MDD.
// Returns false when the file cannot be opened, true otherwise.
bool Load_items(string &inst_name) {
    ifstream file(inst_name);
    if (!file.good()) {
        cout << "!!!!!! No such file exists: " << inst_name << " !!!!!!\n";
        return false;
    }

    // Tops DFS / VDFS up to one entry per item id in [1, L].
    const auto grow_stacks = [] {
        while (DFS.size() < L)
            DFS.emplace_back(-static_cast<int>(DFS.size()) - 1);
        while (VDFS.size() < L)
            VDFS.emplace_back(static_cast<int>(VDFS.size()));
    };

    string line;
    while (getline(file, line) && give_time(clock() - start_time) < time_limit) {
        ++N;
        if (N % 1000000 == 0)
            cout << "Found " << N << " sequence, with max line len "
                 << M << ", and " << L << " items, and " << E
                 << " enteries\n";

        istringstream word(line);
        string itm;
        vector<int> temp_vec;
        vector<int> temp_lim;

        while (word >> itm) {
            const int ditem = stoi(itm);

            if (ditem > 0)
                itmset_exists = true;

            const unsigned int item_id = static_cast<unsigned int>(std::abs(ditem));
            if (L < item_id) {
                L = item_id;
                // A new largest id needs its stack slots right away.
                grow_stacks();
            }

            // The first mlim tokens go to temp_vec, the rest to temp_lim.
            vector<int> &target = (temp_vec.size() < static_cast<size_t>(mlim)) ? temp_vec : temp_lim;
            target.push_back(ditem);
        }

        const size_t line_len = temp_vec.size() + temp_lim.size();
        E += static_cast<unsigned long long>(line_len);
        if (line_len > static_cast<size_t>(M))
            M = static_cast<unsigned int>(line_len);

        // Make sure the stacks are complete before Build_MDD runs.
        grow_stacks();

        Build_MDD(temp_vec, temp_lim);
    }

    return true;
}
|
|
356
|
+
|
|
357
|
+
} // namespace largehm
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
#ifndef LARGEHM_LOAD_INST_HPP
|
|
2
|
+
#define LARGEHM_LOAD_INST_HPP
|
|
3
|
+
|
|
4
|
+
#include <string>
|
|
5
|
+
#include <vector>
|
|
6
|
+
#include <fstream>
|
|
7
|
+
#include <ctime> // for clock_t
|
|
8
|
+
|
|
9
|
+
// We need Pattern and VPattern, so include freq_miner.hpp here:
|
|
10
|
+
#include "freq_miner.hpp"
|
|
11
|
+
|
|
12
|
+
namespace largehm {
|
|
13
|
+
|
|
14
|
+
//
|
|
15
|
+
// ─── Globals & Function Prototypes ───────────────────────────────────────────
|
|
16
|
+
//
|
|
17
|
+
|
|
18
|
+
// Output/folder:
|
|
19
|
+
extern std::string out_file;
|
|
20
|
+
extern std::string folder;
|
|
21
|
+
|
|
22
|
+
// Flags:
|
|
23
|
+
extern bool b_disp;
|
|
24
|
+
extern bool b_write;
|
|
25
|
+
extern bool use_dic;
|
|
26
|
+
extern bool use_list;
|
|
27
|
+
extern bool just_build;
|
|
28
|
+
extern bool pre_pro;
|
|
29
|
+
extern bool itmset_exists;
|
|
30
|
+
|
|
31
|
+
// Database statistics:
|
|
32
|
+
extern unsigned int M;
|
|
33
|
+
extern unsigned int L;
|
|
34
|
+
extern unsigned int mlim;
|
|
35
|
+
extern unsigned int time_limit;
|
|
36
|
+
|
|
37
|
+
extern unsigned long long int N;
|
|
38
|
+
extern unsigned long long int theta;
|
|
39
|
+
extern unsigned long long int E;
|
|
40
|
+
|
|
41
|
+
// Timing:
|
|
42
|
+
extern clock_t start_time;
|
|
43
|
+
|
|
44
|
+
// In‐memory sequences (only if “in‐memory” mode):
|
|
45
|
+
extern std::vector<std::vector<int>> items;
|
|
46
|
+
|
|
47
|
+
// Preprocessing dictionary (maps original → compressed IDs):
|
|
48
|
+
extern std::vector<int> item_dic;
|
|
49
|
+
|
|
50
|
+
// DFS stacks used by the miner (Pattern / VPattern):
|
|
51
|
+
extern std::vector<Pattern> DFS;
|
|
52
|
+
extern std::vector<VPattern> VDFS;
|
|
53
|
+
|
|
54
|
+
// Internal loader functions:
|
|
55
|
+
bool Load_items_pre(std::string &inst_name);
|
|
56
|
+
bool Load_items(std::string &inst_name);
|
|
57
|
+
bool Preprocess(std::string &inst, double thresh);
|
|
58
|
+
|
|
59
|
+
// Main entry‐point for loading & building the MDD:
|
|
60
|
+
bool Load_instance(std::string &items_file, double thresh);
|
|
61
|
+
|
|
62
|
+
} // namespace largehm
|
|
63
|
+
|
|
64
|
+
#endif // LARGEHM_LOAD_INST_HPP
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
#include "utility.hpp"
|
|
2
|
+
#include "build_mdd.hpp"
|
|
3
|
+
#include "load_inst.hpp"
|
|
4
|
+
#include <iostream>
|
|
5
|
+
namespace largehm {
|
|
6
|
+
std::vector<std::vector<int>> collected;
|
|
7
|
+
bool check_parent(unsigned long long int cur_anct, unsigned long long int str_pnt, unsigned long long int start, vector<unsigned long long int>& strpnt_vec) {
|
|
8
|
+
|
|
9
|
+
vector<unsigned long long int> ancestors;
|
|
10
|
+
|
|
11
|
+
while (abs(Tree[cur_anct].itmset) > abs(Tree[str_pnt].itmset)) {
|
|
12
|
+
if (Tree[cur_anct].item > 0)
|
|
13
|
+
ancestors.push_back(cur_anct);
|
|
14
|
+
cur_anct = Tree[cur_anct].anct;
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
if (abs(Tree[cur_anct].itmset) == abs(Tree[str_pnt].itmset))
|
|
18
|
+
return 1;
|
|
19
|
+
else {
|
|
20
|
+
for (vector<unsigned long long int>::reverse_iterator it = ancestors.rbegin(); it != ancestors.rend(); ++it) {
|
|
21
|
+
for (unsigned int i = start; i < strpnt_vec.size(); ++i) {
|
|
22
|
+
if (strpnt_vec[i] == *it)
|
|
23
|
+
return 1;
|
|
24
|
+
}
|
|
25
|
+
}
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
return 0;
|
|
29
|
+
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
// float give_time(clock_t kk) {
|
|
35
|
+
// float ll = ((float)kk) / CLOCKS_PER_SEC;
|
|
36
|
+
// return ll;
|
|
37
|
+
// }
|
|
38
|
+
}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
|
|
3
|
+
#include <vector>
|
|
4
|
+
#include <time.h>
|
|
5
|
+
#include <string>
|
|
6
|
+
#include "build_mdd.hpp"
|
|
7
|
+
|
|
8
|
+
namespace largehm {
|
|
9
|
+
using namespace std;
|
|
10
|
+
|
|
11
|
+
extern std::vector<std::vector<int>> collected;
|
|
12
|
+
|
|
13
|
+
// Helpers to clear and fetch collected patterns from Python:
|
|
14
|
+
/// Removes every entry from the shared `collected` buffer.
inline void ClearCollected() { collected.clear(); }
|
|
17
|
+
inline const std::vector<std::vector<int>>& GetCollected() {
|
|
18
|
+
return collected;
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
// A small timer helper:
|
|
22
|
+
/// Converts a tick count `kk` (as returned by clock()) into seconds.
inline float give_time(clock_t kk) {
    return static_cast<float>(kk) / CLOCKS_PER_SEC;
}
|
|
26
|
+
bool check_parent(unsigned long long int cur_anct, unsigned long long int str_pnt, unsigned long long int start, vector<unsigned long long int>& strpnt_vec);
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
}
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
#include <iostream>
|
|
2
|
+
#include <time.h>
|
|
3
|
+
#include "freq_miner.hpp"
|
|
4
|
+
#include "utility.hpp"
|
|
5
|
+
|
|
6
|
+
namespace largepp // ← BEGIN namespacing
|
|
7
|
+
{
|
|
8
|
+
void Out_patt(vector<int>& seq, unsigned int freq);
|
|
9
|
+
void Extend_patt(Pattern& _patt);
|
|
10
|
+
|
|
11
|
+
unsigned long long int num_patt = 0;
|
|
12
|
+
|
|
13
|
+
Pattern _patt;
|
|
14
|
+
|
|
15
|
+
void Freq_miner() {
|
|
16
|
+
|
|
17
|
+
vector<int> islist;
|
|
18
|
+
|
|
19
|
+
for (int i = 0; i < L; ++i) {
|
|
20
|
+
if (DFS[i].freq >= theta)
|
|
21
|
+
islist.push_back(i);
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
for (int i = 0; i < DFS.size(); ++i) {
|
|
25
|
+
DFS[i].ilist = islist;
|
|
26
|
+
DFS[i].slist = islist;
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
while (!DFS.empty() && give_time(clock() - start_time) < time_limit) {
|
|
30
|
+
if (DFS.back().freq >= theta)
|
|
31
|
+
Extend_patt(DFS.back());
|
|
32
|
+
else
|
|
33
|
+
DFS.pop_back();
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
// Extends the pattern on top of the DFS stack by one item.
//
// The pattern is swapped into the file-level `_patt` and popped from DFS.
// For each recorded occurrence (seq_ID[i], str_pnt[i]) the rest of that
// sequence is scanned and candidate extensions are accumulated in `pot_patt`:
// slots [0, L) are filled from the pattern's ilist, slots [L, 2L) from its
// slist. Candidates whose frequency reaches `theta` are pushed onto DFS
// (ilist candidates append item + 1, slist candidates append -(item) - 1),
// reported through Out_patt when b_disp or b_write is set, and counted in
// num_patt.
void Extend_patt(Pattern& _pattern) {
    swap(_patt, _pattern);
    DFS.pop_back();

    // Membership bitmaps for the two candidate lists.
    vector<bool> s_allowed(L, false);
    vector<bool> i_allowed(L, false);
    for (int id : _patt.slist)
        s_allowed[id] = true;
    for (int id : _patt.ilist)
        i_allowed[id] = true;

    vector<Pattern> pot_patt(L * 2);

    // Index of the last non-positive entry of the pattern.
    int last_neg = _patt.seq.size() - 1;
    while (_patt.seq[last_neg] > 0)
        --last_neg;

    for (int occ = 0; occ < _patt.str_pnt.size(); ++occ) {
        // Per-occurrence guard consulted before recording a candidate in the
        // second scan below.
        vector<bool> found(L * 2, false);

        unsigned long long int seq = _patt.seq_ID[occ];
        const auto& row = items[seq];

        // Scan 1: the run of positive entries directly after the occurrence.
        unsigned int j = _patt.str_pnt[occ] + 1;
        for (; j < row.size() && row[j] > 0; ++j) {
            const int cur_itm = row[j];
            if (!i_allowed[cur_itm - 1])
                continue;
            Pattern& cand = pot_patt[cur_itm - 1];
            cand.seq_ID.push_back(seq);
            cand.str_pnt.push_back(j);
            ++cand.freq;
            found[cur_itm - 1] = true;
        }

        // Scan 2: everything after that run.
        int num_itmfnd = 0;
        for (int k = j; k < row.size(); ++k) {
            const int cur_itm = abs(row[k]);
            if (row[k] < 0)
                num_itmfnd = 0;

            const auto s_slot = L + cur_itm - 1;
            if (s_allowed[cur_itm - 1] && !found[s_slot]) {
                Pattern& cand = pot_patt[s_slot];
                cand.seq_ID.push_back(seq);
                cand.str_pnt.push_back(k);
                ++cand.freq;
                found[s_slot] = true;
            }

            if (num_itmfnd == _patt.seq.size() - last_neg) {
                // All entries of the pattern from last_neg on have been matched.
                if (i_allowed[cur_itm - 1] && !found[cur_itm - 1]) {
                    Pattern& cand = pot_patt[cur_itm - 1];
                    cand.seq_ID.push_back(seq);
                    cand.str_pnt.push_back(k);
                    ++cand.freq;
                    found[cur_itm - 1] = true;
                }
            } else if (cur_itm == abs(_patt.seq[last_neg + num_itmfnd])) {
                ++num_itmfnd;
            }
        }
    }

    // Keep only the candidates that reach the support threshold.
    vector<int> slistp;
    vector<int> ilistp;

    for (int id : _patt.ilist) {
        if (pot_patt[id].freq >= theta)
            ilistp.push_back(id);
    }

    for (int id : _patt.slist) {
        if (pot_patt[id + L].freq >= theta)
            slistp.push_back(id);
    }

    // Candidates taken from slots [0, L).
    for (int id : ilistp) {
        DFS.emplace_back();
        Pattern& next = DFS.back();
        swap(next, pot_patt[id]);
        next.seq = _patt.seq;
        next.seq.push_back(id + 1);
        next.slist = slistp;
        next.ilist = ilistp;
        if (b_disp || b_write)
            Out_patt(next.seq, next.freq);
        ++num_patt;
    }

    // Candidates taken from slots [L, 2L); both lists become slistp.
    for (int id : slistp) {
        DFS.emplace_back();
        Pattern& next = DFS.back();
        swap(next, pot_patt[id + L]);
        next.seq = _patt.seq;
        next.seq.push_back(-id - 1);
        next.slist = slistp;
        next.ilist = slistp;
        if (b_disp || b_write)
            Out_patt(next.seq, next.freq);
        ++num_patt;
    }
}
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
// Records the pattern `seq` in `collected` and, depending on the flags,
// prints it to stdout (b_disp) and/or appends it to `out_file` (b_write),
// followed by a line carrying its frequency `freq`.
void Out_patt(vector<int>& seq, unsigned int freq) {
    largepp::collected.push_back(seq);

    // The output file is opened in append mode for every pattern written.
    ofstream sink;
    if (b_write)
        sink.open(out_file, std::ios::app);

    for (const int item : seq) {
        if (b_disp)
            cout << item << " ";
        if (b_write)
            sink << item << " ";
    }

    if (b_disp)
        cout << endl;
    if (b_write)
        sink << endl;

    if (b_disp)
        cout << "************** Freq: " << freq << endl;
    if (b_write) {
        sink << "************** Freq: " << freq << endl;
        sink.close();
    }
}
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
|
|
3
|
+
#include "load_inst.hpp"
|
|
4
|
+
namespace largepp // ← BEGIN namespacing
|
|
5
|
+
{
|
|
6
|
+
void Freq_miner();
|
|
7
|
+
|
|
8
|
+
// One node of the pattern search: the pattern itself, where it occurs, and
// which items may still be appended to it.
class Pattern {
public:
    std::vector<int> seq;                        // the pattern's entries
    std::vector<unsigned int> str_pnt;           // position of each occurrence inside its sequence
    std::vector<unsigned long long int> seq_ID;  // sequence index of each occurrence

    std::vector<int> slist;  // candidate items for slots [L, 2L) in Extend_patt
    std::vector<int> ilist;  // candidate items for slots [0, L) in Extend_patt

    unsigned long long int freq = 0;  // number of recorded occurrences

    // Pattern made of `_seq` followed by `item`.
    Pattern(std::vector<int>& _seq, int item) {
        seq.reserve(_seq.size());
        seq.insert(seq.end(), _seq.begin(), _seq.end());
        seq.push_back(item);
    }

    // Pattern holding the single entry `item`.
    Pattern(int item) : seq{item} {}

    // Empty pattern with frequency 0.
    Pattern() = default;
};
|
|
39
|
+
|
|
40
|
+
extern vector<Pattern> DFS; //DFS queue of potential patterns to extend
|
|
41
|
+
|
|
42
|
+
extern unsigned long long int num_patt;
|
|
43
|
+
}
|