effspm 0.1.11__cp313-cp313-macosx_10_13_universal2.whl → 0.2.1__cp313-cp313-macosx_10_13_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of effspm might be problematic. Click here for more details.
- effspm/__init__.py +3 -3
- effspm/_effspm.cpp +437 -13
- effspm/_effspm.cpython-313-darwin.so +0 -0
- effspm/btminer/src/freq_miner.cpp +3 -0
- effspm/btminer/src/load_inst.cpp +10 -4
- effspm/btminer/src/load_inst.hpp +2 -0
- effspm/btminer/src/utility.cpp +31 -33
- effspm/btminer/src/utility.hpp +9 -13
- effspm/htminer/src/build_mdd.cpp +192 -0
- effspm/htminer/src/build_mdd.hpp +64 -0
- effspm/htminer/src/freq_miner.cpp +350 -0
- effspm/htminer/src/freq_miner.hpp +60 -0
- effspm/htminer/src/load_inst.cpp +381 -0
- effspm/htminer/src/load_inst.hpp +23 -0
- effspm/htminer/src/main.cpp +96 -0
- effspm/htminer/src/utility.cpp +72 -0
- effspm/htminer/src/utility.hpp +77 -0
- effspm/largebm/src/build_mdd.cpp +137 -0
- effspm/largebm/src/build_mdd.hpp +47 -0
- effspm/largebm/src/freq_miner.cpp +342 -0
- effspm/largebm/src/freq_miner.hpp +48 -0
- effspm/largebm/src/load_inst.cpp +235 -0
- effspm/largebm/src/load_inst.hpp +45 -0
- effspm/largebm/src/main.cpp +95 -0
- effspm/largebm/src/utility.cpp +45 -0
- effspm/largebm/src/utility.hpp +18 -0
- effspm/largehm/src/build_mdd.cpp +173 -0
- effspm/largehm/src/build_mdd.hpp +93 -0
- effspm/largehm/src/freq_miner.cpp +441 -0
- effspm/largehm/src/freq_miner.hpp +77 -0
- effspm/largehm/src/load_inst.cpp +357 -0
- effspm/largehm/src/load_inst.hpp +64 -0
- effspm/largehm/src/main.cpp +95 -0
- effspm/largehm/src/utility.cpp +38 -0
- effspm/largehm/src/utility.hpp +29 -0
- effspm/largepp/src/freq_miner.cpp +170 -0
- effspm/largepp/src/freq_miner.hpp +43 -0
- effspm/largepp/src/load_inst.cpp +219 -0
- effspm/largepp/src/load_inst.hpp +28 -0
- effspm/largepp/src/main.cpp +108 -0
- effspm/largepp/src/utility.cpp +33 -0
- effspm/largepp/src/utility.hpp +20 -0
- {effspm-0.1.11.dist-info → effspm-0.2.1.dist-info}/METADATA +1 -1
- effspm-0.2.1.dist-info/RECORD +59 -0
- {effspm-0.1.11.dist-info → effspm-0.2.1.dist-info}/WHEEL +1 -1
- effspm-0.1.11.dist-info/RECORD +0 -25
- {effspm-0.1.11.dist-info → effspm-0.2.1.dist-info}/licenses/LICENSE +0 -0
- {effspm-0.1.11.dist-info → effspm-0.2.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
|
|
3
|
+
#include "load_inst.hpp"
|
|
4
|
+
#include "build_mdd.hpp"
|
|
5
|
+
|
|
6
|
+
namespace htminer {
|
|
7
|
+
void Freq_miner();
|
|
8
|
+
|
|
9
|
+
// One entry of the DFS pattern stack.
//   seq     - the pattern's item sequence (seeded with a single item by Pattern(int)).
//   str_pnt - unsigned positions associated with the pattern
//             (NOTE(review): presumably node indices into the tree; confirm in freq_miner.cpp).
//   list    - auxiliary int list (its use is not visible in this header).
//   freq    - occurrence counter, always starts at 0.
class Pattern {
public:
    vector<int> seq;
    vector<unsigned int> str_pnt;
    vector<int> list;

    unsigned long long int freq;

    // Seed pattern holding exactly one item.
    Pattern(int item) : seq{item}, freq(0) {}

    // Empty pattern with room pre-reserved for _pnt entries in str_pnt.
    // _res is not used; it only distinguishes this overload from Pattern(int).
    Pattern(size_t _pnt, bool _res) : freq(0) {
        (void)_res;
        str_pnt.reserve(_pnt);
    }

    // Empty pattern.
    Pattern() : freq(0) {}
};
|
|
34
|
+
|
|
35
|
+
// One entry of the VDFS stack.
//   ass_patt - unsigned value supplied by the one-argument constructor
//              (NOTE(review): name suggests "associated pattern" index; confirm in freq_miner.cpp).
//   str_pnt  - signed positions associated with the entry.
//   seq_ID   - unsigned ids associated with the entry.
class VPattern {
public:
    // Default-initialised to 0 so that objects created through VPattern() or
    // VPattern(size_t, bool) - e.g. by vector::resize - never expose an
    // indeterminate value.
    unsigned long long int ass_patt = 0;

    vector<int> str_pnt;
    vector<unsigned int> seq_ID;

    // Entry bound to the given value.
    VPattern(unsigned long long int _patt) : ass_patt(_patt) {}

    // Entry with room pre-reserved for _pnt elements in str_pnt.
    // The bool only distinguishes this overload; its value is not used.
    VPattern(size_t _pnt, bool a) {
        (void)a;
        str_pnt.reserve(_pnt);
    }

    // Empty entry (ass_patt == 0).
    VPattern() {}
};
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
extern unsigned long long int num_patt;
|
|
56
|
+
extern vector<Pattern> DFS;
|
|
57
|
+
extern vector<VPattern> VDFS;
|
|
58
|
+
|
|
59
|
+
}
|
|
60
|
+
|
|
@@ -0,0 +1,381 @@
|
|
|
1
|
+
#include <iostream>
|
|
2
|
+
#include <fstream>
|
|
3
|
+
#include <sstream>
|
|
4
|
+
#include <cmath>
|
|
5
|
+
#include <ctime>
|
|
6
|
+
#include "load_inst.hpp"
|
|
7
|
+
#include "freq_miner.hpp"
|
|
8
|
+
#include "utility.hpp"
|
|
9
|
+
#include "build_mdd.hpp"
|
|
10
|
+
|
|
11
|
+
namespace htminer {
|
|
12
|
+
using namespace std;
|
|
13
|
+
|
|
14
|
+
// unsigned int M = 0, mlim;
|
|
15
|
+
// unsigned long long E = 0;
|
|
16
|
+
|
|
17
|
+
// unsigned long long int N = 0, L = 0, theta;
|
|
18
|
+
|
|
19
|
+
// Becomes true as soon as any positive item id has been read from the input.
bool itmset_exists = false;

// Item re-numbering table filled by Preprocess: entry k is the new 1-based id
// of original item k + 1, or -1 when that item was dropped.
vector<int> item_dic;
|
|
22
|
+
// vector<Pattern> DFS;
|
|
23
|
+
// vector<VPattern> VDFS;
|
|
24
|
+
|
|
25
|
+
void Load_items_pre(string &inst_name);
|
|
26
|
+
bool Load_items(string &inst_name);
|
|
27
|
+
bool Preprocess(string& inst, double thresh);
|
|
28
|
+
|
|
29
|
+
bool Load_instance(std::string& items_file, double thresh) {
|
|
30
|
+
// Debug: entry
|
|
31
|
+
// std::cerr << "[HTMiner::Load_instance] called with file=\"" << items_file
|
|
32
|
+
// << "\" minsup=" << thresh << std::endl;
|
|
33
|
+
|
|
34
|
+
std::clock_t kk = std::clock();
|
|
35
|
+
|
|
36
|
+
// Initialize root of Tree
|
|
37
|
+
Tree.emplace_back(0, 0, 0);
|
|
38
|
+
Tree[0].itmset = 1;
|
|
39
|
+
{
|
|
40
|
+
std::vector<unsigned int> dummyAnc(L, 0);
|
|
41
|
+
std::vector<int> dummyItems;
|
|
42
|
+
CTree.emplace_back(dummyAnc, dummyItems);
|
|
43
|
+
VTree.emplace_back(); // calls VArc() default ctor
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
// ─── Initialize DFS and VDFS to length L so Add_vec can index them ─
|
|
47
|
+
DFS.clear();
|
|
48
|
+
DFS.reserve(L);
|
|
49
|
+
for (unsigned int i = 0; i < L; ++i)
|
|
50
|
+
DFS.emplace_back(-static_cast<int>(i) - 1);
|
|
51
|
+
VDFS.clear();
|
|
52
|
+
VDFS.resize(L);
|
|
53
|
+
|
|
54
|
+
if (pre_pro) {
|
|
55
|
+
// std::cerr << "[HTMiner::Load_instance] pre_pro flag is true; calling Preprocess" << std::endl;
|
|
56
|
+
if (!Preprocess(items_file, thresh)) {
|
|
57
|
+
// std::cerr << "[HTMiner::Load_instance] Preprocess failed; returning false" << std::endl;
|
|
58
|
+
return false;
|
|
59
|
+
}
|
|
60
|
+
// std::cout << "\nPreprocess done in " << give_time(std::clock() - kk) << " seconds\n\n";
|
|
61
|
+
|
|
62
|
+
// Reserve DFS stack
|
|
63
|
+
DFS.reserve(L);
|
|
64
|
+
for (int i = 0; i < static_cast<int>(L); ++i)
|
|
65
|
+
DFS.emplace_back(-i - 1);
|
|
66
|
+
|
|
67
|
+
kk = std::clock();
|
|
68
|
+
//std::cerr << "[HTMiner::Load_instance] calling Load_items_pre" << std::endl;
|
|
69
|
+
Load_items_pre(items_file);
|
|
70
|
+
|
|
71
|
+
// If Tree is very large, shrink vectors
|
|
72
|
+
if (Tree.size() > 100000000) {
|
|
73
|
+
Tree.shrink_to_fit();
|
|
74
|
+
CTree.shrink_to_fit();
|
|
75
|
+
VTree.shrink_to_fit();
|
|
76
|
+
// std::cerr << "[HTMiner::Load_instance] Shrunk Tree, CTree, VTree to fit" << std::endl;
|
|
77
|
+
}
|
|
78
|
+
// std::cerr << "[HTMiner::Load_instance] Load_items_pre completed; N=" << N
|
|
79
|
+
// << " M=" << M << " L=" << L << " E=" << E << std::endl;
|
|
80
|
+
}
|
|
81
|
+
else if (!Load_items(items_file)) {
|
|
82
|
+
// std::cerr << "[HTMiner::Load_instance] Load_items failed; returning false" << std::endl;
|
|
83
|
+
return false;
|
|
84
|
+
}
|
|
85
|
+
else {
|
|
86
|
+
// Calculate absolute theta if user passed a fraction
|
|
87
|
+
if (thresh < 1.0) {
|
|
88
|
+
theta = static_cast<unsigned long long>(std::ceil(thresh * N));
|
|
89
|
+
//std::cerr << "[HTMiner::Load_instance] Computed theta = ceil(" << thresh << " * " << N << ") = " << theta << std::endl;
|
|
90
|
+
} else {
|
|
91
|
+
theta = static_cast<unsigned long long>(thresh);
|
|
92
|
+
// std::cerr << "[HTMiner::Load_instance] Using theta = " << theta << " (absolute)" << std::endl;
|
|
93
|
+
}
|
|
94
|
+
// std::cerr << "[HTMiner::Load_instance] No preprocessing; Load_items succeeded; N=" << N
|
|
95
|
+
// << " M=" << M << " L=" << L << " E=" << E << std::endl;
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
std::cout << "\nMDD Database built in " << give_time(std::clock() - kk) << " seconds\n\n";
|
|
99
|
+
std::cout << "Found " << N << " sequence, with max line len " << M
|
|
100
|
+
<< ", and " << L << " items, and " << E << " enteries\n";
|
|
101
|
+
// std::cout << "Total Trie nodes: " << Tree.size()
|
|
102
|
+
// << " Total CTree nodes: " << CTree.size()
|
|
103
|
+
// << " Total VTree nodes: " << VTree.size() << std::endl;
|
|
104
|
+
|
|
105
|
+
// std::cerr << "[HTMiner::Load_instance] Exiting normally; returning true" << std::endl;
|
|
106
|
+
return true;
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
bool Preprocess(string &inst, double thresh) {
|
|
111
|
+
|
|
112
|
+
//std::cerr << "[HTMiner::Preprocess] called with \"" << inst << "\" minsup=" << thresh << std::endl; //DEBUG
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
vector<unsigned long long int> MN(100, 0);
|
|
116
|
+
vector<vector<bool>> ML(100, vector<bool>(1000000, 0));
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
ifstream file(inst);
|
|
120
|
+
|
|
121
|
+
vector<unsigned int> freq(1000000, 0);
|
|
122
|
+
vector<unsigned long long int> counted(1000000, 0);
|
|
123
|
+
|
|
124
|
+
if (file.good()) {
|
|
125
|
+
string line;
|
|
126
|
+
int ditem;
|
|
127
|
+
while (getline(file, line) && give_time(clock() - start_time) < time_limit) {
|
|
128
|
+
++N;
|
|
129
|
+
|
|
130
|
+
//if (N % 10000000 == 0)
|
|
131
|
+
//cout << "N: " << N << endl;
|
|
132
|
+
istringstream word(line);
|
|
133
|
+
string itm;
|
|
134
|
+
int size_m = 0;
|
|
135
|
+
while (word >> itm) {
|
|
136
|
+
++size_m;
|
|
137
|
+
ditem = stoi(itm);
|
|
138
|
+
|
|
139
|
+
if (ditem > 0)
|
|
140
|
+
itmset_exists = 1;
|
|
141
|
+
else
|
|
142
|
+
ditem *= -1;
|
|
143
|
+
|
|
144
|
+
if (size_m < MN.size()) {
|
|
145
|
+
++MN[size_m - 1];
|
|
146
|
+
if (ML[size_m - 1].size() < ditem) {
|
|
147
|
+
ML[size_m - 1].reserve(ditem);
|
|
148
|
+
while (ML[size_m - 1].size() < ditem)
|
|
149
|
+
ML[size_m - 1].push_back(0);
|
|
150
|
+
}
|
|
151
|
+
ML[size_m - 1][ditem - 1] = 1;
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
if (L < ditem)
|
|
155
|
+
L = ditem;
|
|
156
|
+
|
|
157
|
+
if (freq.size() < L) {
|
|
158
|
+
freq.reserve(L);
|
|
159
|
+
counted.reserve(L);
|
|
160
|
+
while (freq.size() < L) {
|
|
161
|
+
freq.push_back(0);
|
|
162
|
+
counted.push_back(0);
|
|
163
|
+
}
|
|
164
|
+
}
|
|
165
|
+
|
|
166
|
+
if (counted[ditem - 1] != N) {
|
|
167
|
+
++freq[ditem - 1];
|
|
168
|
+
counted[ditem - 1] = N;
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
}
|
|
172
|
+
if (size_m > M)
|
|
173
|
+
M = size_m;
|
|
174
|
+
}
|
|
175
|
+
}
|
|
176
|
+
else {
|
|
177
|
+
//cout << "!!!!!! No such file exists: " << inst << " !!!!!!\n";
|
|
178
|
+
return 0;
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
if (thresh < 1)
|
|
182
|
+
theta = ceil(thresh * N);
|
|
183
|
+
else
|
|
184
|
+
theta = thresh;
|
|
185
|
+
|
|
186
|
+
int real_L = 0;
|
|
187
|
+
item_dic = vector<int>(L, -1);
|
|
188
|
+
vector<bool> item_in(L, 0);
|
|
189
|
+
for (int i = 0; i < L; ++i) {
|
|
190
|
+
if (freq[i] >= theta) {
|
|
191
|
+
item_dic[i] = ++real_L;
|
|
192
|
+
item_in[i] = 1;
|
|
193
|
+
}
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
//cout << "Original number of items: " << L << " Reduced to: " << real_L << endl;
|
|
197
|
+
|
|
198
|
+
unsigned long long int LpM = 1;
|
|
199
|
+
mlim = M;
|
|
200
|
+
int orgmlim;
|
|
201
|
+
int ulim = min(3 + real_L / 5, 10);
|
|
202
|
+
unsigned long long int ml;
|
|
203
|
+
int coef = 1 + 1 * itmset_exists;
|
|
204
|
+
for (int i = 0; i + ulim < MN.size() && i + ulim < M; ++i) {
|
|
205
|
+
ml = 0;
|
|
206
|
+
for (int j = 0; j < L; ++j) {
|
|
207
|
+
if (ML[i][j] && item_in[j])
|
|
208
|
+
++ml;
|
|
209
|
+
}
|
|
210
|
+
LpM *= ml * coef;
|
|
211
|
+
// cout << ml << " " << LpM << " " << MN[i] << endl;
|
|
212
|
+
if (LpM * ulim > MN[i]) {
|
|
213
|
+
if (6 * (MN[i] - LpM) >= 5 * MN[i])
|
|
214
|
+
orgmlim = i;
|
|
215
|
+
while (i + ulim - 1 < MN.size() && i + ulim - 1 < M) {
|
|
216
|
+
// cout << MN[i - 1] - MN[i + ulim - 1] << " " << MN[i + ulim - 1] << endl;
|
|
217
|
+
if (MN[i - 1] - MN[i + ulim - 1] < MN[i + ulim - 1] && MN[i + ulim - 1] < 600000000 ) {
|
|
218
|
+
mlim = i - 1;
|
|
219
|
+
break;
|
|
220
|
+
}
|
|
221
|
+
i += 1;
|
|
222
|
+
}
|
|
223
|
+
break;
|
|
224
|
+
}
|
|
225
|
+
}
|
|
226
|
+
|
|
227
|
+
// cout << "M is: " << M << " Mlim is: " << mlim << " ulim is: " << ulim << " original mlim is: " << orgmlim << " guess is: " << round((log(N) - log(6)) / log(real_L)) << endl;
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
if (mlim < M) {
|
|
231
|
+
for (int i = 0; i < real_L; ++i)
|
|
232
|
+
VDFS.emplace_back(i);
|
|
233
|
+
if (MN[mlim + ulim] > 100000000) {
|
|
234
|
+
CTree.reserve(MN[mlim + ulim] / 2);
|
|
235
|
+
VTree.reserve(MN[mlim + ulim] / 2);
|
|
236
|
+
Tree.reserve((N - MN[mlim + ulim]) * 2);
|
|
237
|
+
}
|
|
238
|
+
}
|
|
239
|
+
else if (N > 100000000)
|
|
240
|
+
Tree.reserve(500000000);
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
L = real_L;
|
|
244
|
+
N = 0;
|
|
245
|
+
M = 0;
|
|
246
|
+
|
|
247
|
+
return 1;
|
|
248
|
+
}
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
void Load_items_pre(string &inst_name) {
|
|
252
|
+
|
|
253
|
+
ifstream file(inst_name);
|
|
254
|
+
|
|
255
|
+
if (file.good()) {
|
|
256
|
+
string line;
|
|
257
|
+
int ditem;
|
|
258
|
+
while (getline(file, line) && give_time(clock() - start_time) < time_limit) {
|
|
259
|
+
istringstream word(line);
|
|
260
|
+
string itm;
|
|
261
|
+
vector<int> temp_vec;
|
|
262
|
+
vector<int> temp_lim;
|
|
263
|
+
bool sgn = 0;
|
|
264
|
+
while (word >> itm) {
|
|
265
|
+
|
|
266
|
+
ditem = stoi(itm);
|
|
267
|
+
|
|
268
|
+
if (item_dic[abs(ditem) - 1] == -1) {
|
|
269
|
+
if (!sgn)
|
|
270
|
+
sgn = ditem < 0;
|
|
271
|
+
continue;
|
|
272
|
+
}
|
|
273
|
+
else {
|
|
274
|
+
if (ditem > 0)
|
|
275
|
+
ditem = item_dic[ditem - 1];
|
|
276
|
+
else
|
|
277
|
+
ditem = -item_dic[-ditem - 1];
|
|
278
|
+
}
|
|
279
|
+
|
|
280
|
+
if (sgn) {
|
|
281
|
+
if (ditem > 0)
|
|
282
|
+
ditem = -ditem;
|
|
283
|
+
sgn = 0;
|
|
284
|
+
}
|
|
285
|
+
|
|
286
|
+
if (temp_vec.size() <= mlim)
|
|
287
|
+
temp_vec.push_back(ditem);
|
|
288
|
+
else
|
|
289
|
+
temp_lim.push_back(ditem);
|
|
290
|
+
}
|
|
291
|
+
|
|
292
|
+
if (temp_vec.empty())
|
|
293
|
+
continue;
|
|
294
|
+
|
|
295
|
+
++N;
|
|
296
|
+
//if (N % 1000000 == 0)
|
|
297
|
+
// cout << N << " " << Tree.size() << " " << CTree.size() << " " << VTree.size() << endl;
|
|
298
|
+
|
|
299
|
+
if (temp_vec.size() + temp_lim.size() > M)
|
|
300
|
+
M = temp_vec.size() + temp_lim.size();
|
|
301
|
+
|
|
302
|
+
Build_MDD(temp_vec, temp_lim);
|
|
303
|
+
}
|
|
304
|
+
}
|
|
305
|
+
|
|
306
|
+
}
|
|
307
|
+
|
|
308
|
+
bool Load_items(std::string &inst_name) {
|
|
309
|
+
//std::cerr << "[HTMiner::Load_items] called with filename=\"" << inst_name << "\"" << std::endl; // DEBUG
|
|
310
|
+
unsigned int mlim = 0;
|
|
311
|
+
// reset the global M and E here instead of redeclaring:
|
|
312
|
+
M = 0;
|
|
313
|
+
E = 0;
|
|
314
|
+
|
|
315
|
+
std::ifstream file(inst_name);
|
|
316
|
+
if (!file.good()) {
|
|
317
|
+
//std::cerr << "[HTMiner::Load_items] ERROR: No such file: " << inst_name << std::endl;
|
|
318
|
+
return false;
|
|
319
|
+
}
|
|
320
|
+
|
|
321
|
+
std::string line;
|
|
322
|
+
int ditem;
|
|
323
|
+
while (std::getline(file, line) && give_time(std::clock() - start_time) < time_limit) {
|
|
324
|
+
++N;
|
|
325
|
+
if (N % 1000000 == 0) {
|
|
326
|
+
// std::cerr << "[HTMiner::Load_items] Read " << N << " sequences so far; "
|
|
327
|
+
// << "current M=" << M << ", L=" << L << ", E=" << E << std::endl;
|
|
328
|
+
}
|
|
329
|
+
|
|
330
|
+
std::istringstream word(line);
|
|
331
|
+
std::vector<int> temp_vec;
|
|
332
|
+
std::vector<int> temp_lim;
|
|
333
|
+
while (word >> ditem) {
|
|
334
|
+
E+=1;
|
|
335
|
+
if (ditem > 0)
|
|
336
|
+
itmset_exists = 1;
|
|
337
|
+
|
|
338
|
+
if (std::abs(ditem) > static_cast<int>(L)) {
|
|
339
|
+
L = std::abs(ditem);
|
|
340
|
+
//std::cerr << "[HTMiner::Load_items] Updated L to " << L << " (expanding DFS and VDFS)" << std::endl;
|
|
341
|
+
// Expand DFS
|
|
342
|
+
while (DFS.size() < L) {
|
|
343
|
+
DFS.reserve(L);
|
|
344
|
+
DFS.emplace_back(-static_cast<int>(DFS.size()) - 1);
|
|
345
|
+
}
|
|
346
|
+
// ALSO expand VDFS so that VDFS[cur_itm-1] is valid later
|
|
347
|
+
VDFS.resize(L);
|
|
348
|
+
}
|
|
349
|
+
|
|
350
|
+
if (temp_vec.size() < mlim)
|
|
351
|
+
temp_vec.push_back(ditem);
|
|
352
|
+
else
|
|
353
|
+
temp_lim.push_back(ditem);
|
|
354
|
+
}
|
|
355
|
+
|
|
356
|
+
if (temp_vec.size() + temp_lim.size() > M) {
|
|
357
|
+
M = temp_vec.size() + temp_lim.size();
|
|
358
|
+
//std::cerr << "[HTMiner::Load_items] Updated M to " << M << std::endl;
|
|
359
|
+
}
|
|
360
|
+
|
|
361
|
+
// std::cerr << "[HTMiner::Load_items] Calling Build_MDD with temp_vec size=" << temp_vec.size()
|
|
362
|
+
// << ", temp_lim size=" << temp_lim.size() << std::endl;
|
|
363
|
+
Build_MDD(temp_vec, temp_lim);
|
|
364
|
+
//std::cerr << "[HTMiner::Load_items] Build_MDD returned; Tree size now=" << Tree.size() << std::endl;
|
|
365
|
+
}
|
|
366
|
+
|
|
367
|
+
// std::cerr << "[HTMiner::Load_items] Finished loading. Final counts: N=" << N
|
|
368
|
+
// << ", M=" << M << ", L=" << L << ", E=" << E << std::endl;
|
|
369
|
+
return true;
|
|
370
|
+
}
|
|
371
|
+
|
|
372
|
+
}
|
|
373
|
+
|
|
374
|
+
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
|
|
3
|
+
#include <vector>
|
|
4
|
+
#include <string>
|
|
5
|
+
#include <fstream>
|
|
6
|
+
#include <map>
|
|
7
|
+
#include <unordered_set>
|
|
8
|
+
#include <unordered_map>
|
|
9
|
+
namespace htminer {
|
|
10
|
+
using namespace std;
|
|
11
|
+
|
|
12
|
+
bool Load_instance(string& items_file, double thresh);
|
|
13
|
+
|
|
14
|
+
extern string out_file, folder;
|
|
15
|
+
|
|
16
|
+
extern bool b_disp, b_write, use_dic, just_build, pre_pro, itmset_exists;
|
|
17
|
+
|
|
18
|
+
extern unsigned int M, mlim, time_limit;
|
|
19
|
+
|
|
20
|
+
extern unsigned long long int N, L, theta;
|
|
21
|
+
|
|
22
|
+
extern clock_t start_time;
|
|
23
|
+
}
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
#include <iostream>
|
|
2
|
+
#include <time.h>
|
|
3
|
+
#include <string.h>
|
|
4
|
+
#include <string>
|
|
5
|
+
#include "load_inst.hpp"
|
|
6
|
+
#include "build_mdd.hpp"
|
|
7
|
+
#include "utility.hpp"
|
|
8
|
+
#include "freq_miner.hpp"
|
|
9
|
+
|
|
10
|
+
using namespace std;
|
|
11
|
+
|
|
12
|
+
// Output file name selected with -out.
string out_file;

// Command-line switches; only preprocessing is enabled by default.
bool b_disp = false;
bool b_write = false;
bool use_dic = false;
bool just_build = false;
bool pre_pro = true;

// Time limit in seconds (default: ten hours); overridden by -time.
unsigned int time_limit = 10 * 3600;

// Reference point for the time-limit checks; set in main().
clock_t start_time;

// Prefix prepended to the input file name; set with -folder.
string folder;
|
|
21
|
+
|
|
22
|
+
int main(int argc, char* argv[]) {
|
|
23
|
+
|
|
24
|
+
string VV, attr;
|
|
25
|
+
|
|
26
|
+
double thresh = 0;
|
|
27
|
+
for (int i = 1; i<argc; i++) {
|
|
28
|
+
if (argv[i][0] != '-' || isdigit(argv[i][1]))
|
|
29
|
+
continue;
|
|
30
|
+
else if (strcmp(argv[i], "-thr") == 0)
|
|
31
|
+
thresh = stod(argv[i + 1]);
|
|
32
|
+
else if (strcmp(argv[i], "-file") == 0)
|
|
33
|
+
VV = argv[i + 1];
|
|
34
|
+
else if (strcmp(argv[i], "-time") == 0)
|
|
35
|
+
time_limit = stoi(argv[i + 1]);
|
|
36
|
+
else if (strcmp(argv[i], "-jbuild") == 0)
|
|
37
|
+
just_build = 1;
|
|
38
|
+
else if (strcmp(argv[i], "-folder") == 0)
|
|
39
|
+
folder = argv[i + 1];
|
|
40
|
+
else if (strcmp(argv[i], "-npre") == 0)
|
|
41
|
+
pre_pro = 0;
|
|
42
|
+
else if (strcmp(argv[i], "-dic") == 0)
|
|
43
|
+
use_dic = 1;
|
|
44
|
+
else if (strcmp(argv[i], "-out") == 0) {
|
|
45
|
+
if (i + 1 == argc || argv[i + 1][0] == '-')
|
|
46
|
+
b_disp = 1;
|
|
47
|
+
else if (argv[i + 1][0] == '+') {
|
|
48
|
+
b_disp = 1;
|
|
49
|
+
b_write = 1;
|
|
50
|
+
if (strlen(argv[i + 1]) > 1) {
|
|
51
|
+
out_file = argv[i + 1];
|
|
52
|
+
out_file = out_file.substr(1, out_file.size() - 1);
|
|
53
|
+
}
|
|
54
|
+
else
|
|
55
|
+
out_file = VV;
|
|
56
|
+
}
|
|
57
|
+
else {
|
|
58
|
+
b_write = 1;
|
|
59
|
+
out_file = argv[i + 1];
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
else
|
|
64
|
+
cout << "Command " << argv[i] << " not recognized and skipped.\n";
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
cout << "\n********************** " << VV << "**********************\n";
|
|
70
|
+
|
|
71
|
+
string item_file = folder + VV + ".txt";
|
|
72
|
+
|
|
73
|
+
cout << "loading instances...\n";
|
|
74
|
+
|
|
75
|
+
clock_t start_time_all = clock();
|
|
76
|
+
start_time = clock();
|
|
77
|
+
|
|
78
|
+
if (!Load_instance(item_file, thresh)) {
|
|
79
|
+
cout << "Files invalid, exiting.\n";
|
|
80
|
+
cin.get();
|
|
81
|
+
return 0;
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
start_time = clock();
|
|
85
|
+
|
|
86
|
+
if (!just_build && give_time(clock() - start_time) < time_limit) {
|
|
87
|
+
Freq_miner();
|
|
88
|
+
if (give_time(clock() - start_time) >= time_limit)
|
|
89
|
+
cout << "TIME LIMIT REACHED\n";
|
|
90
|
+
cout << "Mining Complete\n\nFound a total of " << num_patt << " patterns\n";
|
|
91
|
+
cout << "\nTotal CPU time " << give_time(clock() - start_time_all) << " seconds\n\n";
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
return 0;
|
|
96
|
+
}
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
#include "utility.hpp"
|
|
2
|
+
#include "load_inst.hpp"
|
|
3
|
+
#include "freq_miner.hpp"
|
|
4
|
+
#include <vector>
|
|
5
|
+
namespace htminer {
|
|
6
|
+
|
|
7
|
+
// ─── Flag‐like globals ──────────────────────────────────────────────────────
|
|
8
|
+
bool use_list = false;
|
|
9
|
+
bool just_build = false;
|
|
10
|
+
bool b_disp = false;
|
|
11
|
+
bool b_write = false;
|
|
12
|
+
bool use_dic = false;
|
|
13
|
+
bool pre_pro = false;
|
|
14
|
+
|
|
15
|
+
unsigned int time_limit = 0;
|
|
16
|
+
std::string out_file = "";
|
|
17
|
+
std::clock_t start_time = 0;
|
|
18
|
+
|
|
19
|
+
// ─── Dataset‐level globals ─────────────────────────────────────────────────
|
|
20
|
+
std::vector<std::vector<int>> items;
|
|
21
|
+
unsigned long long N = 0;
|
|
22
|
+
unsigned long long L = 0;
|
|
23
|
+
unsigned long long theta = 0;
|
|
24
|
+
unsigned int M = 0;
|
|
25
|
+
unsigned long long E = 0;
|
|
26
|
+
unsigned int mlim = 0;
|
|
27
|
+
// ─── DFS stacks ─────────────────────────────────────────────────────────────
|
|
28
|
+
std::vector<Pattern> DFS;
|
|
29
|
+
std::vector<VPattern> VDFS;
|
|
30
|
+
|
|
31
|
+
// ─── Collected patterns storage ────────────────────────────────────────────
|
|
32
|
+
// Storage for the collected patterns, one inner vector per pattern.
std::vector<std::vector<int>> collectedPatterns;

// Read-only access to the collected patterns. The reference stays valid for
// the lifetime of the program; its contents change whenever
// collectedPatterns is modified.
auto GetCollected() -> const std::vector<std::vector<int>>& {
    const std::vector<std::vector<int>>& collected = collectedPatterns;
    return collected;
}
|
|
36
|
+
|
|
37
|
+
// ─── give_time and check_parent get their definitions here (as provided) ───
|
|
38
|
+
// Converts a tick count obtained from std::clock() into seconds.
float give_time(std::clock_t kk) {
    const float ticks = static_cast<float>(kk);
    const float ticks_per_second = static_cast<float>(CLOCKS_PER_SEC);
    return ticks / ticks_per_second;
}
|
|
41
|
+
bool check_parent(unsigned int cur_anct, unsigned int str_pnt, unsigned int start, vector<unsigned int>& strpnt_vec) {
|
|
42
|
+
|
|
43
|
+
vector<unsigned int> ancestors;
|
|
44
|
+
|
|
45
|
+
while (abs(Tree[cur_anct].itmset) > abs(Tree[str_pnt].itmset)) {
|
|
46
|
+
if (Tree[cur_anct].item > 0)
|
|
47
|
+
ancestors.push_back(cur_anct);
|
|
48
|
+
cur_anct = Tree[cur_anct].anct;
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
if (abs(Tree[cur_anct].itmset) == abs(Tree[str_pnt].itmset))
|
|
52
|
+
return 1;
|
|
53
|
+
else {
|
|
54
|
+
for (vector<unsigned int>::reverse_iterator it = ancestors.rbegin(); it != ancestors.rend(); ++it) {
|
|
55
|
+
for (unsigned int i = start; i < strpnt_vec.size(); ++i) {
|
|
56
|
+
if (strpnt_vec[i] == *it)
|
|
57
|
+
return 1;
|
|
58
|
+
}
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
return 0;
|
|
64
|
+
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
}
|