effspm 0.2.7__cp312-cp312-win_amd64.whl → 0.3.3__cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- effspm/_effspm.cp312-win_amd64.pyd +0 -0
- effspm/_effspm.cpp +961 -210
- effspm/btminer/src/build_mdd.cpp +42 -17
- effspm/btminer/src/build_mdd.hpp +13 -19
- effspm/btminer/src/freq_miner.cpp +134 -49
- effspm/btminer/src/freq_miner.hpp +16 -0
- effspm/btminer/src/load_inst.cpp +211 -126
- effspm/btminer/src/load_inst.hpp +22 -4
- effspm/btminer/src/main.cpp +83 -0
- effspm/btminer/src/utility.cpp +26 -41
- effspm/btminer/src/utility.hpp +6 -30
- effspm/freq_miner.hpp +2 -1
- effspm/htminer/src/build_mdd.cpp +46 -124
- effspm/htminer/src/build_mdd.hpp +56 -49
- effspm/htminer/src/freq_miner.cpp +341 -307
- effspm/htminer/src/freq_miner.hpp +39 -40
- effspm/htminer/src/load_inst.cpp +287 -336
- effspm/htminer/src/load_inst.hpp +23 -6
- effspm/htminer/src/main.cpp +97 -0
- effspm/htminer/src/utility.cpp +38 -57
- effspm/htminer/src/utility.hpp +9 -64
- effspm/largebm/src/build_mdd.cpp +69 -110
- effspm/largebm/src/build_mdd.hpp +22 -37
- effspm/largebm/src/freq_miner.cpp +241 -291
- effspm/largebm/src/freq_miner.hpp +25 -36
- effspm/largebm/src/load_inst.cpp +20 -26
- effspm/largebm/src/load_inst.hpp +24 -34
- effspm/largebm/src/main.cpp +95 -0
- effspm/largebm/src/utility.cpp +11 -21
- effspm/largebm/src/utility.hpp +7 -10
- effspm/largehm/src/build_mdd.cpp +75 -110
- effspm/largehm/src/build_mdd.hpp +53 -73
- effspm/largehm/src/freq_miner.cpp +134 -191
- effspm/largehm/src/freq_miner.hpp +37 -60
- effspm/largehm/src/load_inst.cpp +137 -174
- effspm/largehm/src/load_inst.hpp +13 -50
- effspm/largehm/src/main.cpp +95 -0
- effspm/largehm/src/utility.cpp +46 -28
- effspm/largehm/src/utility.hpp +18 -16
- effspm/largepp/src/freq_miner.cpp +184 -156
- effspm/largepp/src/freq_miner.hpp +11 -36
- effspm/largepp/src/load_inst.cpp +32 -12
- effspm/largepp/src/load_inst.hpp +15 -9
- effspm/largepp/src/main.cpp +108 -0
- effspm/largepp/src/pattern.hpp +31 -0
- effspm/load_inst.cpp +8 -8
- effspm/load_inst.hpp +1 -1
- effspm/main.cpp +103 -0
- {effspm-0.2.7.dist-info → effspm-0.3.3.dist-info}/METADATA +1 -1
- effspm-0.3.3.dist-info/RECORD +60 -0
- effspm-0.2.7.dist-info/RECORD +0 -53
- {effspm-0.2.7.dist-info → effspm-0.3.3.dist-info}/WHEEL +0 -0
- {effspm-0.2.7.dist-info → effspm-0.3.3.dist-info}/licenses/LICENSE +0 -0
- {effspm-0.2.7.dist-info → effspm-0.3.3.dist-info}/top_level.txt +0 -0
effspm/_effspm.cpp
CHANGED
|
@@ -2,49 +2,97 @@
|
|
|
2
2
|
|
|
3
3
|
#include <pybind11/pybind11.h>
|
|
4
4
|
#include <pybind11/stl.h>
|
|
5
|
-
|
|
5
|
+
|
|
6
6
|
#include <iostream>
|
|
7
|
+
#include <fstream>
|
|
8
|
+
#include <cstdio> // std::remove
|
|
9
|
+
#include <vector>
|
|
10
|
+
#include <string>
|
|
11
|
+
#include <ctime>
|
|
12
|
+
#include <cmath>
|
|
7
13
|
|
|
14
|
+
namespace py = pybind11;
|
|
8
15
|
|
|
9
|
-
// PrefixProjection headers
|
|
16
|
+
// PrefixProjection headers (global namespace)
|
|
10
17
|
#include "freq_miner.hpp"
|
|
11
18
|
#include "load_inst.hpp"
|
|
12
19
|
#include "utility.hpp"
|
|
13
20
|
|
|
14
|
-
// BTMiner (
|
|
21
|
+
// BTMiner (namespaced)
|
|
15
22
|
#include "btminer/src/freq_miner.hpp"
|
|
16
23
|
#include "btminer/src/load_inst.hpp"
|
|
17
24
|
#include "btminer/src/utility.hpp"
|
|
18
25
|
#include "btminer/src/build_mdd.hpp"
|
|
19
26
|
|
|
20
|
-
// HTMiner (
|
|
21
|
-
#include "htminer/src/build_mdd.hpp"
|
|
27
|
+
// HTMiner (namespaced)
|
|
28
|
+
#include "htminer/src/build_mdd.hpp"
|
|
22
29
|
#include "htminer/src/freq_miner.hpp"
|
|
23
30
|
#include "htminer/src/load_inst.hpp"
|
|
24
31
|
#include "htminer/src/utility.hpp"
|
|
25
32
|
|
|
26
|
-
|
|
33
|
+
// LargePrefixProjection
|
|
27
34
|
#include "largepp/src/freq_miner.hpp"
|
|
28
35
|
#include "largepp/src/load_inst.hpp"
|
|
29
36
|
#include "largepp/src/utility.hpp"
|
|
30
37
|
|
|
38
|
+
// LargeBTMiner
|
|
31
39
|
#include "largebm/src/freq_miner.hpp"
|
|
32
40
|
#include "largebm/src/load_inst.hpp"
|
|
33
41
|
#include "largebm/src/utility.hpp"
|
|
34
42
|
#include "largebm/src/build_mdd.hpp"
|
|
35
43
|
|
|
44
|
+
// LargeHTMiner
|
|
36
45
|
#include "largehm/src/freq_miner.hpp"
|
|
37
46
|
#include "largehm/src/load_inst.hpp"
|
|
38
47
|
#include "largehm/src/utility.hpp"
|
|
39
48
|
#include "largehm/src/build_mdd.hpp"
|
|
40
49
|
|
|
50
|
+
namespace {
|
|
51
|
+
|
|
52
|
+
// Scope guard for a temporary file: when the guard is destroyed, the file
// named by `path` is deleted with std::remove. An empty `path` means the guard
// owns nothing and the destructor does nothing.
//
// The guard is move-only. A copy would carry the same path, so the file would
// be deleted when the first copy is destroyed and std::remove would be called
// a second time on the same name when the other copy is destroyed.
struct TempFile {
    // File to delete on destruction; empty means "nothing owned".
    std::string path;

    TempFile() = default;

    TempFile(const TempFile&) = delete;
    TempFile& operator=(const TempFile&) = delete;

    // Take over the file guarded by `other`; `other` is left owning nothing.
    TempFile(TempFile&& other) noexcept { path.swap(other.path); }

    // Delete the file currently guarded (if any), then take over the one
    // guarded by `other`; `other` is left owning nothing.
    TempFile& operator=(TempFile&& other) noexcept {
        if (this != &other) {
            discard();
            path.swap(other.path);
        }
        return *this;
    }

    ~TempFile() { discard(); }

private:
    // Delete the guarded file, if any, and forget its name.
    void discard() noexcept {
        if (!path.empty()) {
            std::remove(path.c_str());  // best effort: the result is ignored
            path.clear();
        }
    }
};
|
|
61
|
+
|
|
62
|
+
// Write `seqs` to a newly named temporary text file and return its path.
//
// Format: one sequence per line, items separated by a single space; an empty
// sequence produces an empty line. The returned path is the name produced by
// std::tmpnam with ".txt" appended. The caller is responsible for deleting
// the file.
//
// Throws std::runtime_error when no temporary name can be generated, when the
// file cannot be opened, or when writing/closing the file fails. On a write
// failure the partially written file is removed before the exception is
// thrown, so callers never receive the path of a truncated file.
//
// NOTE(review): std::tmpnam only generates a name; another process could
// create a file with that name before it is opened here. Consider an
// exclusive-create API if the build's language level allows one.
std::string write_temp_seq_file(const std::vector<std::vector<int>>& seqs) {
    char tmp_name[L_tmpnam];
    if (!std::tmpnam(tmp_name)) {
        throw std::runtime_error("Failed to create temporary file name");
    }
    std::string path = std::string(tmp_name) + ".txt";

    std::ofstream ofs(path);
    if (!ofs) {
        throw std::runtime_error("Failed to open temporary file for writing: " + path);
    }

    for (const auto& seq : seqs) {
        const char* separator = "";
        for (const int item : seq) {
            ofs << separator << item;
            separator = " ";
        }
        ofs << '\n';
    }

    // close() flushes buffered output; a failed write or flush leaves the
    // stream in a failed state, which is checked below.
    ofs.close();
    if (!ofs) {
        std::remove(path.c_str());  // do not leave a truncated file behind
        throw std::runtime_error("Failed to write temporary file: " + path);
    }
    return path;
}
|
|
87
|
+
|
|
88
|
+
} // anonymous namespace
|
|
41
89
|
|
|
42
90
|
|
|
43
91
|
PYBIND11_MODULE(_effspm, m) {
|
|
44
|
-
m.doc() = "Unified SPM library: PrefixProjection, BTMiner, HTMiner";
|
|
92
|
+
m.doc() = "Unified SPM library: PrefixProjection, BTMiner, HTMiner, Large* variants";
|
|
45
93
|
|
|
46
94
|
// ─────────────────────────────────────────────────────────────
|
|
47
|
-
// PrefixProjection
|
|
95
|
+
// PrefixProjection (works directly on Python lists or files)
|
|
48
96
|
// ─────────────────────────────────────────────────────────────
|
|
49
97
|
m.def("PrefixProjection",
|
|
50
98
|
[](py::object data,
|
|
@@ -59,7 +107,7 @@ PYBIND11_MODULE(_effspm, m) {
|
|
|
59
107
|
::pre_pro = preproc;
|
|
60
108
|
::use_dic = use_dic;
|
|
61
109
|
::use_list = false;
|
|
62
|
-
::b_disp = verbose;
|
|
110
|
+
::b_disp = verbose; // controls prints in original code
|
|
63
111
|
::b_write = !out_file.empty();
|
|
64
112
|
::out_file = out_file;
|
|
65
113
|
|
|
@@ -69,7 +117,7 @@ PYBIND11_MODULE(_effspm, m) {
|
|
|
69
117
|
if (py::isinstance<py::str>(data)) {
|
|
70
118
|
std::string path = data.cast<std::string>();
|
|
71
119
|
if (!Load_instance(path, minsup))
|
|
72
|
-
throw std::runtime_error("
|
|
120
|
+
throw std::runtime_error("PrefixProjection: failed to load file: " + path);
|
|
73
121
|
} else {
|
|
74
122
|
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
75
123
|
items = std::move(seqs);
|
|
@@ -113,9 +161,355 @@ PYBIND11_MODULE(_effspm, m) {
|
|
|
113
161
|
);
|
|
114
162
|
|
|
115
163
|
// ─────────────────────────────────────────────────────────────
|
|
116
|
-
// BTMiner
|
|
164
|
+
// BTMiner (always uses professor's Load_instance)
|
|
165
|
+
// ─────────────────────────────────────────────────────────────
|
|
166
|
+
// ─────────────────────────────────────────────────────────────
|
|
167
|
+
// BTMiner (always uses professor's Load_instance)
|
|
168
|
+
// ─────────────────────────────────────────────────────────────
|
|
169
|
+
/*m.def("BTMiner",
|
|
170
|
+
[](py::object data,
|
|
171
|
+
double minsup,
|
|
172
|
+
unsigned int time_limit,
|
|
173
|
+
bool preproc,
|
|
174
|
+
bool use_dic,
|
|
175
|
+
bool verbose,
|
|
176
|
+
const std::string &out_file)
|
|
177
|
+
{
|
|
178
|
+
// 1) Configure professor globals
|
|
179
|
+
btminer::time_limit = static_cast<int>(time_limit);
|
|
180
|
+
btminer::pre_pro = preproc;
|
|
181
|
+
btminer::use_dic = use_dic;
|
|
182
|
+
btminer::b_disp = verbose;
|
|
183
|
+
btminer::b_write = !out_file.empty();
|
|
184
|
+
btminer::out_file = out_file;
|
|
185
|
+
btminer::N_mult = 1;
|
|
186
|
+
btminer::M_mult = 1;
|
|
187
|
+
btminer::just_build = false;
|
|
188
|
+
|
|
189
|
+
// 2) HARD RESET of *known* global state for BTMiner
|
|
190
|
+
// (Only touch what we know exists in btminer namespace)
|
|
191
|
+
btminer::ClearCollected(); // clear collected patterns
|
|
192
|
+
btminer::Tree.clear(); // clear MDD tree
|
|
193
|
+
btminer::DFS.clear(); // clear DFS patterns
|
|
194
|
+
|
|
195
|
+
btminer::M = 0;
|
|
196
|
+
btminer::L = 0;
|
|
197
|
+
btminer::N = 0;
|
|
198
|
+
btminer::theta = 0;
|
|
199
|
+
btminer::E = 0;
|
|
200
|
+
btminer::num_patt = 0; // reset pattern counter if defined
|
|
201
|
+
|
|
202
|
+
// NOTE: we do NOT reinsert root here; btminer::Load_instance()
|
|
203
|
+
// is responsible for calling Tree.emplace_back(0,0,0) as needed.
|
|
204
|
+
|
|
205
|
+
btminer::start_time = std::clock();
|
|
206
|
+
|
|
207
|
+
// 3) Handle input (path or list-of-lists)
|
|
208
|
+
TempFile tmp;
|
|
209
|
+
std::string path;
|
|
210
|
+
|
|
211
|
+
if (py::isinstance<py::str>(data)) {
|
|
212
|
+
// File path: use directly
|
|
213
|
+
path = data.cast<std::string>();
|
|
214
|
+
} else {
|
|
215
|
+
// Python list → write to a temp file in professor’s format
|
|
216
|
+
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
217
|
+
tmp.path = write_temp_seq_file(seqs);
|
|
218
|
+
path = tmp.path;
|
|
219
|
+
}
|
|
220
|
+
|
|
221
|
+
if (verbose) {
|
|
222
|
+
std::cerr << "[BTMiner] path=" << path
|
|
223
|
+
<< " minsup=" << minsup
|
|
224
|
+
<< " preproc=" << preproc
|
|
225
|
+
<< " use_dic=" << use_dic
|
|
226
|
+
<< std::endl;
|
|
227
|
+
}
|
|
228
|
+
|
|
229
|
+
// 4) Build MDD + run miner
|
|
230
|
+
if (!btminer::Load_instance(path, minsup)) {
|
|
231
|
+
throw std::runtime_error("BTMiner: failed to load instance from: " + path);
|
|
232
|
+
}
|
|
233
|
+
|
|
234
|
+
btminer::Freq_miner();
|
|
235
|
+
|
|
236
|
+
// 5) Return results
|
|
237
|
+
py::dict out;
|
|
238
|
+
out["patterns"] = btminer::GetCollected();
|
|
239
|
+
out["num_patterns"] = btminer::num_patt;
|
|
240
|
+
out["time"] = btminer::give_time(std::clock() - btminer::start_time);
|
|
241
|
+
out["N"] = btminer::N;
|
|
242
|
+
out["L"] = btminer::L;
|
|
243
|
+
out["theta"] = btminer::theta;
|
|
244
|
+
return out;
|
|
245
|
+
},
|
|
246
|
+
py::arg("data"),
|
|
247
|
+
py::arg("minsup") = 0.01,
|
|
248
|
+
py::arg("time_limit") = 36000,
|
|
249
|
+
py::arg("preproc") = false,
|
|
250
|
+
py::arg("use_dic") = false,
|
|
251
|
+
py::arg("verbose") = false,
|
|
252
|
+
py::arg("out_file") = ""
|
|
253
|
+
); */
|
|
254
|
+
m.def("BTMiner",
|
|
255
|
+
[](py::object data,
|
|
256
|
+
double minsup,
|
|
257
|
+
unsigned int time_limit,
|
|
258
|
+
bool preproc,
|
|
259
|
+
bool use_dic,
|
|
260
|
+
bool verbose,
|
|
261
|
+
const std::string &out_file)
|
|
262
|
+
{
|
|
263
|
+
// 1) Configure professor globals
|
|
264
|
+
btminer::time_limit = static_cast<int>(time_limit);
|
|
265
|
+
btminer::pre_pro = preproc;
|
|
266
|
+
btminer::use_dic = use_dic;
|
|
267
|
+
btminer::b_disp = verbose;
|
|
268
|
+
btminer::b_write = !out_file.empty();
|
|
269
|
+
btminer::out_file = out_file;
|
|
270
|
+
btminer::N_mult = 1;
|
|
271
|
+
btminer::M_mult = 1;
|
|
272
|
+
btminer::just_build = false;
|
|
273
|
+
|
|
274
|
+
// 2) HARD RESET of *known* global state for BTMiner
|
|
275
|
+
btminer::ClearCollected(); // clear collected patterns
|
|
276
|
+
btminer::Tree.clear(); // clear MDD tree
|
|
277
|
+
btminer::DFS.clear(); // clear DFS patterns
|
|
278
|
+
|
|
279
|
+
// clear all frequency / mapping / item structures
|
|
280
|
+
btminer::freq.clear();
|
|
281
|
+
btminer::item_dic.clear();
|
|
282
|
+
btminer::item_map.clear();
|
|
283
|
+
btminer::item_map_rev.clear();
|
|
284
|
+
btminer::items.clear(); // if you have this defined anywhere
|
|
285
|
+
|
|
286
|
+
// reset scalar globals
|
|
287
|
+
btminer::M = 0;
|
|
288
|
+
btminer::L = 0;
|
|
289
|
+
btminer::N = 0;
|
|
290
|
+
btminer::theta = 0;
|
|
291
|
+
btminer::E = 0;
|
|
292
|
+
btminer::num_patt = 0;
|
|
293
|
+
btminer::num_nodes = 0;
|
|
294
|
+
btminer::cur_node = 0;
|
|
295
|
+
// N_mult, M_mult, flags are set just above
|
|
296
|
+
|
|
297
|
+
btminer::start_time = std::clock();
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
// 3) Handle input (path or list-of-lists)
|
|
301
|
+
TempFile tmp;
|
|
302
|
+
std::string path;
|
|
303
|
+
|
|
304
|
+
if (py::isinstance<py::str>(data)) {
|
|
305
|
+
// File path: use directly
|
|
306
|
+
path = data.cast<std::string>();
|
|
307
|
+
} else {
|
|
308
|
+
// Python list → write to a temp file in professor’s format
|
|
309
|
+
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
310
|
+
tmp.path = write_temp_seq_file(seqs);
|
|
311
|
+
path = tmp.path;
|
|
312
|
+
}
|
|
313
|
+
|
|
314
|
+
if (verbose) {
|
|
315
|
+
std::cerr << "[BTMiner] path=" << path
|
|
316
|
+
<< " minsup=" << minsup
|
|
317
|
+
<< " preproc=" << preproc
|
|
318
|
+
<< " use_dic=" << use_dic
|
|
319
|
+
<< std::endl;
|
|
320
|
+
}
|
|
321
|
+
|
|
322
|
+
// 4) Build MDD + run miner
|
|
323
|
+
if (!btminer::Load_instance(path, minsup)) {
|
|
324
|
+
throw std::runtime_error("BTMiner: failed to load instance from: " + path);
|
|
325
|
+
}
|
|
326
|
+
|
|
327
|
+
btminer::Freq_miner();
|
|
328
|
+
|
|
329
|
+
// 5) Return results
|
|
330
|
+
py::dict out;
|
|
331
|
+
out["patterns"] = btminer::GetCollected();
|
|
332
|
+
out["num_patterns"] = btminer::num_patt;
|
|
333
|
+
out["time"] = btminer::give_time(std::clock() - btminer::start_time);
|
|
334
|
+
out["N"] = btminer::N;
|
|
335
|
+
out["L"] = btminer::L;
|
|
336
|
+
out["theta"] = btminer::theta;
|
|
337
|
+
return out;
|
|
338
|
+
},
|
|
339
|
+
py::arg("data"),
|
|
340
|
+
py::arg("minsup") = 0.01,
|
|
341
|
+
py::arg("time_limit") = 36000,
|
|
342
|
+
py::arg("preproc") = false,
|
|
343
|
+
py::arg("use_dic") = false,
|
|
344
|
+
py::arg("verbose") = false,
|
|
345
|
+
py::arg("out_file") = ""
|
|
346
|
+
);
|
|
347
|
+
|
|
348
|
+
// ─────────────────────────────────────────────────────────────
|
|
349
|
+
// HTMiner (works on files; we use a temp file for in-memory data)
|
|
350
|
+
// ─────────────────────────────────────────────────────────────
|
|
351
|
+
// ─────────────────────────────────────────────────────────────
|
|
352
|
+
// HTMiner (always uses professor's Load_instance; pre_pro forced ON)
|
|
353
|
+
// ─────────────────────────────────────────────────────────────
|
|
354
|
+
m.def("HTMiner",
|
|
355
|
+
[](py::object data,
|
|
356
|
+
double minsup,
|
|
357
|
+
unsigned int time_limit,
|
|
358
|
+
bool /*preproc*/, // Python arg is ignored internally
|
|
359
|
+
bool use_dic,
|
|
360
|
+
bool verbose,
|
|
361
|
+
const std::string &out_file)
|
|
362
|
+
{
|
|
363
|
+
using namespace htminer;
|
|
364
|
+
|
|
365
|
+
// ───────── Global parameter setup ─────────
|
|
366
|
+
htminer::time_limit = time_limit;
|
|
367
|
+
|
|
368
|
+
// IMPORTANT: always run with preprocessing ON,
|
|
369
|
+
// regardless of the Python `preproc` flag.
|
|
370
|
+
htminer::pre_pro = true;
|
|
371
|
+
htminer::use_dic = use_dic;
|
|
372
|
+
htminer::just_build = false;
|
|
373
|
+
htminer::b_disp = verbose;
|
|
374
|
+
htminer::b_write = !out_file.empty();
|
|
375
|
+
htminer::out_file = out_file;
|
|
376
|
+
|
|
377
|
+
// ───────── HARD RESET of HTMiner globals ─────────
|
|
378
|
+
htminer::ClearCollected();
|
|
379
|
+
htminer::Tree.clear();
|
|
380
|
+
htminer::VTree.clear();
|
|
381
|
+
htminer::CTree.clear();
|
|
382
|
+
htminer::DFS.clear();
|
|
383
|
+
htminer::VDFS.clear();
|
|
384
|
+
htminer::item_dic.clear();
|
|
385
|
+
|
|
386
|
+
htminer::M = 0;
|
|
387
|
+
htminer::N = 0;
|
|
388
|
+
htminer::L = 0;
|
|
389
|
+
htminer::E = 0;
|
|
390
|
+
htminer::theta = 0;
|
|
391
|
+
htminer::mlim = 0;
|
|
392
|
+
htminer::itmset_exists = false;
|
|
393
|
+
|
|
394
|
+
// NOTE: do NOT add a root arc here;
|
|
395
|
+
// htminer::Load_instance() already does Tree.emplace_back(0,0,0)
|
|
396
|
+
htminer::start_time = std::clock();
|
|
397
|
+
|
|
398
|
+
// ───────── Handle input (path or in-memory sequences) ─────────
|
|
399
|
+
TempFile tmp;
|
|
400
|
+
std::string path;
|
|
401
|
+
|
|
402
|
+
if (py::isinstance<py::str>(data)) {
|
|
403
|
+
// data is a file path
|
|
404
|
+
path = data.cast<std::string>();
|
|
405
|
+
} else {
|
|
406
|
+
// data is a list[list[int]] → write a temp file in the same text format
|
|
407
|
+
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
408
|
+
tmp.path = write_temp_seq_file(seqs);
|
|
409
|
+
path = tmp.path;
|
|
410
|
+
}
|
|
411
|
+
|
|
412
|
+
if (verbose) {
|
|
413
|
+
std::cerr << "[HTMiner] path=" << path
|
|
414
|
+
<< " minsup=" << minsup
|
|
415
|
+
<< " preproc(always)=true"
|
|
416
|
+
<< " use_dic=" << use_dic
|
|
417
|
+
<< std::endl;
|
|
418
|
+
}
|
|
419
|
+
|
|
420
|
+
// ───────── Build MDD via professor's loader ─────────
|
|
421
|
+
if (!htminer::Load_instance(path, minsup)) {
|
|
422
|
+
throw std::runtime_error("HTMiner: failed to load instance from: " + path);
|
|
423
|
+
}
|
|
424
|
+
|
|
425
|
+
// ───────── Run miner ─────────
|
|
426
|
+
htminer::Freq_miner();
|
|
427
|
+
|
|
428
|
+
// ───────── Return results ─────────
|
|
429
|
+
py::dict out;
|
|
430
|
+
out["patterns"] = htminer::GetCollected();
|
|
431
|
+
out["time"] = htminer::give_time(std::clock() - htminer::start_time);
|
|
432
|
+
return out;
|
|
433
|
+
},
|
|
434
|
+
py::arg("data"),
|
|
435
|
+
py::arg("minsup") = 0.01,
|
|
436
|
+
py::arg("time_limit") = 36000,
|
|
437
|
+
py::arg("preproc") = false, // kept for API symmetry, but IGNORED
|
|
438
|
+
py::arg("use_dic") = false,
|
|
439
|
+
py::arg("verbose") = false,
|
|
440
|
+
py::arg("out_file") = ""
|
|
441
|
+
);
|
|
442
|
+
|
|
443
|
+
// ─────────────────────────────────────────────────────────────
|
|
444
|
+
// LargePrefixProjection (already has its own Load_py)
|
|
117
445
|
// ─────────────────────────────────────────────────────────────
|
|
118
|
-
|
|
446
|
+
m.def("LargePrefixProjection",
|
|
447
|
+
[](py::object data,
|
|
448
|
+
double minsup,
|
|
449
|
+
unsigned int time_limit,
|
|
450
|
+
bool preproc,
|
|
451
|
+
bool use_dic,
|
|
452
|
+
bool verbose,
|
|
453
|
+
const std::string &out_file)
|
|
454
|
+
{
|
|
455
|
+
// 1) Configure global flags
|
|
456
|
+
largepp::time_limit = time_limit;
|
|
457
|
+
largepp::pre_pro = preproc;
|
|
458
|
+
largepp::use_dic = use_dic;
|
|
459
|
+
largepp::use_list = true; // LargePrefixProjection is list-based
|
|
460
|
+
largepp::b_disp = verbose;
|
|
461
|
+
largepp::b_write = !out_file.empty();
|
|
462
|
+
largepp::out_file = out_file;
|
|
463
|
+
largepp::just_build = false;
|
|
464
|
+
|
|
465
|
+
// 2) HARD RESET of largepp global state
|
|
466
|
+
// (only touch symbols that actually exist in largepp)
|
|
467
|
+
largepp::ClearCollected(); // clear previously collected patterns
|
|
468
|
+
|
|
469
|
+
// If these exist in largepp::load_inst.hpp / utility.hpp they’ll compile;
|
|
470
|
+
// if the compiler complains about any of them, just comment that line out.
|
|
471
|
+
largepp::items.clear(); // transaction DB
|
|
472
|
+
largepp::DFS.clear(); // DFS pattern stack, if list-based miner uses it
|
|
473
|
+
|
|
474
|
+
largepp::M = 0;
|
|
475
|
+
largepp::L = 0;
|
|
476
|
+
largepp::N = 0;
|
|
477
|
+
largepp::theta = 0;
|
|
478
|
+
largepp::E = 0;
|
|
479
|
+
largepp::num_patt = 0;
|
|
480
|
+
|
|
481
|
+
largepp::start_time = std::clock();
|
|
482
|
+
|
|
483
|
+
// 3) Handle input (path or Python list)
|
|
484
|
+
if (py::isinstance<py::str>(data)) {
|
|
485
|
+
std::string fname = data.cast<std::string>();
|
|
486
|
+
largepp::Load_instance(fname, minsup);
|
|
487
|
+
} else {
|
|
488
|
+
largepp::Load_py(data, minsup);
|
|
489
|
+
}
|
|
490
|
+
|
|
491
|
+
// 4) Run miner
|
|
492
|
+
largepp::Freq_miner();
|
|
493
|
+
|
|
494
|
+
// 5) Return results
|
|
495
|
+
py::dict out;
|
|
496
|
+
out["patterns"] = largepp::GetCollected();
|
|
497
|
+
out["time"] = largepp::give_time(std::clock() - largepp::start_time);
|
|
498
|
+
return out;
|
|
499
|
+
},
|
|
500
|
+
py::arg("data"),
|
|
501
|
+
py::arg("minsup") = 0.01,
|
|
502
|
+
py::arg("time_limit") = 36000,
|
|
503
|
+
py::arg("preproc") = false,
|
|
504
|
+
py::arg("use_dic") = false,
|
|
505
|
+
py::arg("verbose") = false,
|
|
506
|
+
py::arg("out_file") = ""
|
|
507
|
+
);
|
|
508
|
+
|
|
509
|
+
// ─────────────────────────────────────────────────────────────
|
|
510
|
+
// LargeBTMiner (always uses professor's largebm::Load_instance)
|
|
511
|
+
// ─────────────────────────────────────────────────────────────
|
|
512
|
+
m.def("LargeBTMiner",
|
|
119
513
|
[](py::object data,
|
|
120
514
|
double minsup,
|
|
121
515
|
unsigned int time_limit,
|
|
@@ -124,140 +518,270 @@ PYBIND11_MODULE(_effspm, m) {
|
|
|
124
518
|
bool verbose,
|
|
125
519
|
const std::string &out_file)
|
|
126
520
|
{
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
521
|
+
using namespace largebm;
|
|
522
|
+
|
|
523
|
+
largebm::time_limit = time_limit;
|
|
524
|
+
largebm::pre_pro = preproc;
|
|
525
|
+
largebm::use_dic = use_dic;
|
|
526
|
+
largebm::use_list = false; // MDD-based
|
|
527
|
+
largebm::b_disp = verbose;
|
|
528
|
+
largebm::b_write = !out_file.empty();
|
|
529
|
+
largebm::out_file = out_file;
|
|
530
|
+
largebm::just_build = false;
|
|
531
|
+
|
|
532
|
+
largebm::ClearCollected();
|
|
533
|
+
largebm::items.clear();
|
|
534
|
+
largebm::item_dic.clear();
|
|
535
|
+
largebm::inv_item_dic.clear();
|
|
536
|
+
largebm::Tree.clear();
|
|
537
|
+
largebm::DFS.clear();
|
|
538
|
+
|
|
539
|
+
largebm::start_time = std::clock();
|
|
134
540
|
|
|
135
|
-
|
|
136
|
-
|
|
541
|
+
TempFile tmp;
|
|
542
|
+
std::string path;
|
|
137
543
|
|
|
138
544
|
if (py::isinstance<py::str>(data)) {
|
|
139
|
-
|
|
140
|
-
if (!btminer::Load_instance(path, minsup))
|
|
141
|
-
throw std::runtime_error("Failed to load file: " + path);
|
|
545
|
+
path = data.cast<std::string>();
|
|
142
546
|
} else {
|
|
143
547
|
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
144
|
-
|
|
145
|
-
|
|
548
|
+
tmp.path = write_temp_seq_file(seqs);
|
|
549
|
+
path = tmp.path;
|
|
550
|
+
}
|
|
146
551
|
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
552
|
+
if (verbose) {
|
|
553
|
+
std::cerr << "[LargeBTMiner] path=" << path
|
|
554
|
+
<< " minsup=" << minsup
|
|
555
|
+
<< " preproc=" << preproc
|
|
556
|
+
<< " use_dic=" << use_dic
|
|
557
|
+
<< std::endl;
|
|
558
|
+
}
|
|
152
559
|
|
|
153
|
-
|
|
560
|
+
if (!largebm::Load_instance(path, minsup)) {
|
|
561
|
+
throw std::runtime_error("LargeBTMiner: failed to load instance from: " + path);
|
|
562
|
+
}
|
|
154
563
|
|
|
155
|
-
|
|
156
|
-
btminer::DFS.reserve(btminer::L);
|
|
157
|
-
for (unsigned int i = 0; i < btminer::L; ++i)
|
|
158
|
-
btminer::DFS.emplace_back(-static_cast<int>(i) - 1);
|
|
564
|
+
largebm::Freq_miner();
|
|
159
565
|
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
566
|
+
py::dict out;
|
|
567
|
+
out["patterns"] = largebm::GetCollected();
|
|
568
|
+
out["time"] = largebm::give_time(std::clock() - largebm::start_time);
|
|
569
|
+
return out;
|
|
570
|
+
},
|
|
571
|
+
py::arg("data"),
|
|
572
|
+
py::arg("minsup") = 0.01,
|
|
573
|
+
py::arg("time_limit") = 36000,
|
|
574
|
+
py::arg("preproc") = false,
|
|
575
|
+
py::arg("use_dic") = false,
|
|
576
|
+
py::arg("verbose") = false,
|
|
577
|
+
py::arg("out_file") = ""
|
|
578
|
+
);
|
|
579
|
+
|
|
580
|
+
// ─────────────────────────────────────────────────────────────
|
|
581
|
+
// LargeHTMiner (always uses professor's largehm::Load_instance; pre_pro forced ON)
|
|
582
|
+
// ─────────────────────────────────────────────────────────────
|
|
583
|
+
// ─────────────────────────────────────────────────────────────
|
|
584
|
+
// LargeHTMiner (professor's Large HTMiner, namespaced as largehm)
|
|
585
|
+
// ─────────────────────────────────────────────────────────────
|
|
586
|
+
m.def("LargeHTMiner",
|
|
587
|
+
[](py::object data,
|
|
588
|
+
double minsup,
|
|
589
|
+
unsigned int time_limit,
|
|
590
|
+
bool /*preproc*/, // kept for API symmetry; ignored
|
|
591
|
+
bool use_dic,
|
|
592
|
+
bool verbose,
|
|
593
|
+
const std::string &out_file)
|
|
594
|
+
{
|
|
595
|
+
using namespace largehm;
|
|
596
|
+
|
|
597
|
+
// 1) Global configuration (mirror professor's style)
|
|
598
|
+
largehm::time_limit = time_limit;
|
|
599
|
+
largehm::pre_pro = true; // always preprocess
|
|
600
|
+
largehm::use_dic = use_dic;
|
|
601
|
+
largehm::just_build = false;
|
|
602
|
+
largehm::b_disp = verbose;
|
|
603
|
+
largehm::b_write = !out_file.empty();
|
|
604
|
+
largehm::out_file = out_file;
|
|
605
|
+
|
|
606
|
+
// 2) HARD RESET of all global state for a fresh run
|
|
607
|
+
largehm::ClearCollected(); // our helper in largehm::utility.cpp
|
|
608
|
+
|
|
609
|
+
largehm::M = 0;
|
|
610
|
+
largehm::L = 0;
|
|
611
|
+
largehm::mlim = 0;
|
|
612
|
+
largehm::N = 0;
|
|
613
|
+
largehm::theta = 0;
|
|
614
|
+
largehm::E = 0;
|
|
615
|
+
largehm::itmset_exists = false;
|
|
616
|
+
|
|
617
|
+
// containers
|
|
618
|
+
// (item_dic reset is optional and not strictly needed here)
|
|
619
|
+
largehm::DFS.clear();
|
|
620
|
+
largehm::VDFS.clear();
|
|
621
|
+
largehm::Tree.clear();
|
|
622
|
+
largehm::VTree.clear();
|
|
623
|
+
largehm::CTree.clear();
|
|
624
|
+
|
|
625
|
+
largehm::start_time = std::clock();
|
|
626
|
+
|
|
627
|
+
// 3) Handle input (file path or Python list)
|
|
628
|
+
TempFile tmp;
|
|
629
|
+
std::string path;
|
|
630
|
+
|
|
631
|
+
if (py::isinstance<py::str>(data)) {
|
|
632
|
+
path = data.cast<std::string>();
|
|
633
|
+
} else {
|
|
634
|
+
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
635
|
+
tmp.path = write_temp_seq_file(seqs);
|
|
636
|
+
path = tmp.path;
|
|
637
|
+
}
|
|
638
|
+
|
|
639
|
+
if (verbose) {
|
|
640
|
+
std::cerr << "[LargeHTMiner] path=" << path
|
|
641
|
+
<< " minsup=" << minsup
|
|
642
|
+
<< " preproc(always)=true"
|
|
643
|
+
<< " use_dic=" << use_dic
|
|
644
|
+
<< std::endl;
|
|
166
645
|
}
|
|
167
646
|
|
|
168
|
-
|
|
647
|
+
// 4) Build MDD / load instance.
|
|
648
|
+
// NOTE: Load_instance() itself does Tree.emplace_back(0,0,0),
|
|
649
|
+
// so we DO NOT create a root node here.
|
|
650
|
+
if (!largehm::Load_instance(path, minsup)) {
|
|
651
|
+
throw std::runtime_error("LargeHTMiner: failed to load instance from: " + path);
|
|
652
|
+
}
|
|
169
653
|
|
|
654
|
+
// 5) Run miner (same timing logic as original main)
|
|
655
|
+
if (!largehm::just_build &&
|
|
656
|
+
largehm::give_time(std::clock() - largehm::start_time) < largehm::time_limit)
|
|
657
|
+
{
|
|
658
|
+
largehm::Freq_miner();
|
|
659
|
+
if (largehm::give_time(std::clock() - largehm::start_time) >= largehm::time_limit) {
|
|
660
|
+
std::cout << "TIME LIMIT REACHED\n";
|
|
661
|
+
}
|
|
662
|
+
}
|
|
663
|
+
|
|
664
|
+
// 6) Return collected patterns + runtime
|
|
170
665
|
py::dict out;
|
|
171
|
-
out["patterns"] =
|
|
172
|
-
out["time"] =
|
|
666
|
+
out["patterns"] = largehm::GetCollected();
|
|
667
|
+
out["time"] = largehm::give_time(std::clock() - largehm::start_time);
|
|
173
668
|
return out;
|
|
174
669
|
},
|
|
175
670
|
py::arg("data"),
|
|
176
671
|
py::arg("minsup") = 0.01,
|
|
177
672
|
py::arg("time_limit") = 36000,
|
|
178
|
-
py::arg("preproc") = false,
|
|
673
|
+
py::arg("preproc") = false, // kept for API symmetry
|
|
179
674
|
py::arg("use_dic") = false,
|
|
180
675
|
py::arg("verbose") = false,
|
|
181
676
|
py::arg("out_file") = ""
|
|
182
677
|
);
|
|
183
678
|
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
679
|
+
|
|
680
|
+
}
|
|
681
|
+
|
|
682
|
+
|
|
683
|
+
/*#include <pybind11/pybind11.h>
|
|
684
|
+
#include <pybind11/stl.h>
|
|
685
|
+
namespace py = pybind11;
|
|
686
|
+
#include <iostream>
|
|
687
|
+
|
|
688
|
+
|
|
689
|
+
// PrefixProjection headers
|
|
690
|
+
#include "freq_miner.hpp"
|
|
691
|
+
#include "load_inst.hpp"
|
|
692
|
+
#include "utility.hpp"
|
|
693
|
+
|
|
694
|
+
// BTMiner (wrapped in its own namespace in source files)
|
|
695
|
+
#include "btminer/src/freq_miner.hpp"
|
|
696
|
+
#include "btminer/src/load_inst.hpp"
|
|
697
|
+
#include "btminer/src/utility.hpp"
|
|
698
|
+
#include "btminer/src/build_mdd.hpp"
|
|
699
|
+
|
|
700
|
+
// HTMiner (wrapped in its own namespace in source files)
|
|
701
|
+
#include "htminer/src/build_mdd.hpp" // ← ensure HTMiner MDD builder is available
|
|
702
|
+
#include "htminer/src/freq_miner.hpp"
|
|
703
|
+
#include "htminer/src/load_inst.hpp"
|
|
704
|
+
#include "htminer/src/utility.hpp"
|
|
705
|
+
|
|
706
|
+
|
|
707
|
+
#include "largepp/src/freq_miner.hpp"
|
|
708
|
+
#include "largepp/src/load_inst.hpp"
|
|
709
|
+
#include "largepp/src/utility.hpp"
|
|
710
|
+
|
|
711
|
+
|
|
712
|
+
#include "largebm/src/freq_miner.hpp"
|
|
713
|
+
#include "largebm/src/load_inst.hpp"
|
|
714
|
+
#include "largebm/src/utility.hpp"
|
|
715
|
+
#include "largebm/src/build_mdd.hpp"
|
|
716
|
+
|
|
717
|
+
#include "largehm/src/freq_miner.hpp"
|
|
718
|
+
#include "largehm/src/load_inst.hpp"
|
|
719
|
+
#include "largehm/src/utility.hpp"
|
|
720
|
+
#include "largehm/src/build_mdd.hpp"
|
|
721
|
+
|
|
722
|
+
|
|
723
|
+
|
|
724
|
+
PYBIND11_MODULE(_effspm, m) {
|
|
725
|
+
m.doc() = "Unified SPM library: PrefixProjection, BTMiner, HTMiner";
|
|
726
|
+
|
|
727
|
+
// ─────────────────────────────────────────────────────────────
|
|
728
|
+
// PrefixProjection
|
|
729
|
+
// ─────────────────────────────────────────────────────────────
|
|
730
|
+
m.def("PrefixProjection",
|
|
188
731
|
[](py::object data,
|
|
189
|
-
double minsup,
|
|
190
|
-
|
|
191
|
-
bool
|
|
732
|
+
double minsup,
|
|
733
|
+
unsigned int time_limit,
|
|
734
|
+
bool preproc,
|
|
735
|
+
bool use_dic,
|
|
736
|
+
bool verbose,
|
|
737
|
+
const std::string &out_file)
|
|
192
738
|
{
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
// 2) load sequences (either from filename or from Python list)
|
|
739
|
+
::time_limit = time_limit;
|
|
740
|
+
::pre_pro = preproc;
|
|
741
|
+
::use_dic = use_dic;
|
|
742
|
+
::use_list = false;
|
|
743
|
+
::b_disp = verbose;
|
|
744
|
+
::b_write = !out_file.empty();
|
|
745
|
+
::out_file = out_file;
|
|
746
|
+
|
|
747
|
+
ClearCollected();
|
|
748
|
+
start_time = std::clock();
|
|
749
|
+
|
|
206
750
|
if (py::isinstance<py::str>(data)) {
|
|
207
751
|
std::string path = data.cast<std::string>();
|
|
208
|
-
if (!
|
|
752
|
+
if (!Load_instance(path, minsup))
|
|
209
753
|
throw std::runtime_error("Failed to load file: " + path);
|
|
210
754
|
} else {
|
|
211
755
|
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
212
|
-
|
|
213
|
-
|
|
756
|
+
items = std::move(seqs);
|
|
757
|
+
N = items.size();
|
|
214
758
|
|
|
215
|
-
// compute L (max item ID), M (max sequence length), E (total entries)
|
|
216
759
|
int max_id = 0;
|
|
217
|
-
|
|
218
|
-
htminer::E = 0;
|
|
219
|
-
for (auto &seq : htminer::items) {
|
|
220
|
-
htminer::M = std::max<unsigned int>(htminer::M, seq.size());
|
|
760
|
+
for (auto &seq : items)
|
|
221
761
|
for (int x : seq)
|
|
222
762
|
max_id = std::max(max_id, std::abs(x));
|
|
223
|
-
|
|
763
|
+
L = max_id;
|
|
764
|
+
|
|
765
|
+
theta = (minsup < 1.0) ? std::ceil(minsup * N) : minsup;
|
|
766
|
+
|
|
767
|
+
DFS.clear();
|
|
768
|
+
DFS.reserve(L);
|
|
769
|
+
for (unsigned int i = 0; i < L; ++i)
|
|
770
|
+
DFS.emplace_back(-static_cast<int>(i) - 1);
|
|
771
|
+
|
|
772
|
+
M = 0;
|
|
773
|
+
E = 0;
|
|
774
|
+
for (auto &seq : items) {
|
|
775
|
+
M = std::max<unsigned int>(M, seq.size());
|
|
776
|
+
E += seq.size();
|
|
224
777
|
}
|
|
225
|
-
htminer::L = max_id;
|
|
226
|
-
htminer::theta = (minsup < 1.0)
|
|
227
|
-
? static_cast<unsigned long long>(std::ceil(minsup * htminer::N))
|
|
228
|
-
: static_cast<unsigned long long>(minsup);
|
|
229
|
-
|
|
230
|
-
// build empty DFS stack (size L) as HTMiner expects
|
|
231
|
-
htminer::DFS.clear();
|
|
232
|
-
htminer::DFS.reserve(htminer::L);
|
|
233
|
-
for (unsigned int i = 0; i < static_cast<unsigned int>(htminer::L); ++i)
|
|
234
|
-
htminer::DFS.emplace_back(-static_cast<int>(i) - 1);
|
|
235
|
-
|
|
236
|
-
// initialize VDFS if HTMiner needs it
|
|
237
|
-
htminer::VDFS.clear();
|
|
238
|
-
htminer::VDFS.resize(htminer::L);
|
|
239
778
|
}
|
|
240
779
|
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
// std::cout << "[HTMiner] dumping all collected patterns:\n";
|
|
245
|
-
// for (size_t i = 0; i < htminer::collectedPatterns.size(); ++i) {
|
|
246
|
-
// const auto &seq = htminer::collectedPatterns[i];
|
|
247
|
-
// std::cout << "Pattern " << i << ": { ";
|
|
248
|
-
// for (int x : seq) {
|
|
249
|
-
// std::cout << x << " ";
|
|
250
|
-
// }
|
|
251
|
-
// std::cout << "}\n";
|
|
252
|
-
//}
|
|
253
|
-
std::cout << " total patterns = "
|
|
254
|
-
<< htminer::collectedPatterns.size() << "\n";
|
|
255
|
-
// ─────────────────────────────────────────────────
|
|
256
|
-
|
|
257
|
-
// 4) return patterns + elapsed time
|
|
780
|
+
Freq_miner();
|
|
781
|
+
|
|
258
782
|
py::dict out;
|
|
259
|
-
out["patterns"] =
|
|
260
|
-
out["time"] =
|
|
783
|
+
out["patterns"] = GetCollected();
|
|
784
|
+
out["time"] = give_time(std::clock() - start_time);
|
|
261
785
|
return out;
|
|
262
786
|
},
|
|
263
787
|
py::arg("data"),
|
|
@@ -268,8 +792,223 @@ std::cout << " total patterns = "
|
|
|
268
792
|
py::arg("verbose") = false,
|
|
269
793
|
py::arg("out_file") = ""
|
|
270
794
|
);
|
|
795
|
+
m.def("BTMiner",
|
|
796
|
+
[](py::object data,
|
|
797
|
+
double minsup,
|
|
798
|
+
unsigned int time_limit,
|
|
799
|
+
bool preproc,
|
|
800
|
+
bool use_dic,
|
|
801
|
+
bool verbose,
|
|
802
|
+
const std::string &out_file)
|
|
803
|
+
{
|
|
804
|
+
// We are calling the *professor* BTMiner, now namespaced as btminer::.
|
|
805
|
+
// So we only set the globals the professor code actually has.
|
|
806
|
+
|
|
807
|
+
// 1) configure professor globals
|
|
808
|
+
btminer::time_limit = static_cast<int>(time_limit);
|
|
809
|
+
btminer::pre_pro = preproc;
|
|
810
|
+
btminer::use_dic = use_dic;
|
|
811
|
+
btminer::b_disp = verbose;
|
|
812
|
+
btminer::b_write = !out_file.empty();
|
|
813
|
+
btminer::out_file = out_file;
|
|
814
|
+
btminer::N_mult = 1; // professor uses these too
|
|
815
|
+
btminer::M_mult = 1;
|
|
816
|
+
btminer::just_build = false; // we want full mining
|
|
817
|
+
|
|
818
|
+
btminer::start_time = std::clock();
|
|
819
|
+
|
|
820
|
+
// 2) load data
|
|
821
|
+
//
|
|
822
|
+
// Professor’s code is primarily file-based (Load_instance(const string&, double)).
|
|
823
|
+
// So: if user passes a file path → use the professor loader directly.
|
|
824
|
+
// If user passes a Python list-of-lists → we will build the MDD the same
|
|
825
|
+
// way professor’s loader does, but without changing his logic.
|
|
826
|
+
if (py::isinstance<py::str>(data)) {
|
|
827
|
+
// ----- FILE MODE -----
|
|
828
|
+
std::string path = data.cast<std::string>();
|
|
829
|
+
|
|
830
|
+
if (verbose) {
|
|
831
|
+
std::cerr << "[BT][binding] file=" << path
|
|
832
|
+
<< " minsup=" << minsup
|
|
833
|
+
<< " preproc=" << preproc << std::endl;
|
|
834
|
+
}
|
|
835
|
+
|
|
836
|
+
if (!btminer::Load_instance(path, minsup)) {
|
|
837
|
+
throw std::runtime_error("BTMiner: failed to load file: " + path);
|
|
838
|
+
}
|
|
839
|
+
} else {
|
|
840
|
+
// ----- PYTHON LIST MODE -----
|
|
841
|
+
//
|
|
842
|
+
// We mimic professor’s loader:
|
|
843
|
+
// - create root in Tree
|
|
844
|
+
// - compute N, M, L
|
|
845
|
+
// - compute theta from minsup
|
|
846
|
+
// - seed DFS (one Pattern per item, as in Preprocess branch)
|
|
847
|
+
// - call Build_MDD(...) for each sequence
|
|
848
|
+
//
|
|
849
|
+
// This DOES NOT change his mining logic; it just drives it from memory.
|
|
850
|
+
|
|
851
|
+
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
852
|
+
|
|
853
|
+
// clear MDD and globals to a known state
|
|
854
|
+
btminer::Tree.clear();
|
|
855
|
+
btminer::Tree.emplace_back(0, 0, 0); // root (exactly like professor)
|
|
856
|
+
|
|
857
|
+
// compute basic stats
|
|
858
|
+
int max_id = 0;
|
|
859
|
+
int max_len = 0;
|
|
860
|
+
int seq_count = 0;
|
|
861
|
+
long long entries = 0;
|
|
862
|
+
|
|
863
|
+
for (const auto &s : seqs) {
|
|
864
|
+
if (s.empty()) continue;
|
|
865
|
+
++seq_count;
|
|
866
|
+
max_len = std::max<int>(max_len, static_cast<int>(s.size()));
|
|
867
|
+
for (int x : s) {
|
|
868
|
+
max_id = std::max(max_id, std::abs(x));
|
|
869
|
+
++entries;
|
|
870
|
+
}
|
|
871
|
+
}
|
|
872
|
+
|
|
873
|
+
btminer::N = seq_count;
|
|
874
|
+
btminer::M = max_len;
|
|
875
|
+
btminer::L = max_id;
|
|
876
|
+
btminer::E = static_cast<int>(entries);
|
|
877
|
+
|
|
878
|
+
// theta = abs support
|
|
879
|
+
if (minsup < 1.0)
|
|
880
|
+
btminer::theta = static_cast<int>(std::ceil(minsup * btminer::N * btminer::N_mult));
|
|
881
|
+
else
|
|
882
|
+
btminer::theta = static_cast<int>(minsup);
|
|
883
|
+
|
|
884
|
+
// seed DFS exactly like professor does in the preprocessed branch:
|
|
885
|
+
btminer::DFS.clear();
|
|
886
|
+
btminer::DFS.reserve(btminer::L);
|
|
887
|
+
for (int i = 0; i < btminer::L; ++i)
|
|
888
|
+
btminer::DFS.emplace_back(-i - 1);
|
|
889
|
+
|
|
890
|
+
// now build the MDD, sequence by sequence
|
|
891
|
+
for (const auto &s : seqs) {
|
|
892
|
+
if (s.empty()) continue;
|
|
893
|
+
// professor’s Build_MDD takes a vector<int> by non-const ref
|
|
894
|
+
std::vector<int> tmp = s;
|
|
895
|
+
btminer::Build_MDD(tmp);
|
|
896
|
+
}
|
|
897
|
+
|
|
898
|
+
if (verbose) {
|
|
899
|
+
std::cerr << "[BT][binding] PY mode: N=" << btminer::N
|
|
900
|
+
<< " L=" << btminer::L
|
|
901
|
+
<< " M=" << btminer::M
|
|
902
|
+
<< " E=" << btminer::E
|
|
903
|
+
<< " theta=" << btminer::theta
|
|
904
|
+
<< " Tree.size()=" << btminer::Tree.size()
|
|
905
|
+
<< std::endl;
|
|
906
|
+
}
|
|
907
|
+
}
|
|
908
|
+
|
|
909
|
+
// 3) run professor’s miner
|
|
910
|
+
btminer::Freq_miner();
|
|
911
|
+
|
|
912
|
+
// 4) build python result
|
|
913
|
+
// 4) build python result
|
|
914
|
+
py::dict out;
|
|
915
|
+
out["patterns"] = btminer::GetCollected(); // ← NEW
|
|
916
|
+
out["num_patterns"] = btminer::num_patt;
|
|
917
|
+
out["time"] = btminer::give_time(std::clock() - btminer::start_time);
|
|
918
|
+
out["N"] = btminer::N;
|
|
919
|
+
out["L"] = btminer::L;
|
|
920
|
+
out["theta"] = btminer::theta;
|
|
921
|
+
return out;
|
|
922
|
+
|
|
923
|
+
},
|
|
924
|
+
py::arg("data"),
|
|
925
|
+
py::arg("minsup") = 0.01,
|
|
926
|
+
py::arg("time_limit") = 36000,
|
|
927
|
+
py::arg("preproc") = false,
|
|
928
|
+
py::arg("use_dic") = false,
|
|
929
|
+
py::arg("verbose") = false,
|
|
930
|
+
py::arg("out_file") = ""
|
|
931
|
+
);
|
|
932
|
+
|
|
933
|
+
|
|
934
|
+
|
|
935
|
+
|
|
936
|
+
// HTMiner
|
|
937
|
+
// ─────────────────────────────────────────────────────────────
|
|
938
|
+
// HTMiner
|
|
939
|
+
m.def("HTMiner",
|
|
940
|
+
[](py::object data,
|
|
941
|
+
double minsup, unsigned int time_limit,
|
|
942
|
+
bool preproc, bool use_dic,
|
|
943
|
+
bool verbose, const std::string &out_file)
|
|
944
|
+
{
|
|
945
|
+
htminer::time_limit = time_limit;
|
|
946
|
+
htminer::pre_pro = preproc;
|
|
947
|
+
htminer::use_dic = use_dic;
|
|
948
|
+
htminer::just_build = false;
|
|
949
|
+
htminer::use_list = false;
|
|
950
|
+
htminer::b_disp = verbose;
|
|
951
|
+
htminer::b_write = !out_file.empty();
|
|
952
|
+
htminer::out_file = out_file;
|
|
953
|
+
htminer::ClearCollected();
|
|
954
|
+
htminer::start_time = std::clock();
|
|
955
|
+
|
|
956
|
+
if (py::isinstance<py::str>(data)) {
|
|
957
|
+
std::string path = data.cast<std::string>();
|
|
958
|
+
if (!htminer::Load_instance(path, minsup))
|
|
959
|
+
throw std::runtime_error("Failed to load file: " + path);
|
|
960
|
+
} else {
|
|
961
|
+
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
962
|
+
htminer::items = std::move(seqs);
|
|
963
|
+
htminer::N = htminer::items.size();
|
|
964
|
+
|
|
965
|
+
int max_id = 0;
|
|
966
|
+
htminer::M = 0;
|
|
967
|
+
htminer::E = 0;
|
|
968
|
+
for (auto &seq : htminer::items) {
|
|
969
|
+
htminer::M = std::max<unsigned int>(htminer::M, seq.size());
|
|
970
|
+
for (int x : seq)
|
|
971
|
+
max_id = std::max(max_id, std::abs(x));
|
|
972
|
+
htminer::E += seq.size();
|
|
973
|
+
}
|
|
974
|
+
htminer::L = max_id;
|
|
975
|
+
htminer::theta = (minsup < 1.0)
|
|
976
|
+
? static_cast<unsigned long long>(std::ceil(minsup * htminer::N))
|
|
977
|
+
: static_cast<unsigned long long>(minsup);
|
|
978
|
+
|
|
979
|
+
htminer::DFS.clear();
|
|
980
|
+
htminer::DFS.reserve(htminer::L);
|
|
981
|
+
for (unsigned int i = 0; i < static_cast<unsigned int>(htminer::L); ++i)
|
|
982
|
+
htminer::DFS.emplace_back(-static_cast<int>(i) - 1);
|
|
983
|
+
|
|
984
|
+
htminer::VDFS.clear();
|
|
985
|
+
htminer::VDFS.resize(htminer::L);
|
|
986
|
+
}
|
|
987
|
+
|
|
988
|
+
htminer::Freq_miner();
|
|
989
|
+
|
|
990
|
+
// 👇 now really respects verbose
|
|
991
|
+
if (verbose) {
|
|
992
|
+
std::cout << " total patterns = "
|
|
993
|
+
<< htminer::collectedPatterns.size() << "\n";
|
|
994
|
+
}
|
|
995
|
+
|
|
996
|
+
py::dict out;
|
|
997
|
+
out["patterns"] = htminer::GetCollected();
|
|
998
|
+
out["time"] = htminer::give_time(std::clock() - htminer::start_time);
|
|
999
|
+
return out;
|
|
1000
|
+
},
|
|
1001
|
+
py::arg("data"),
|
|
1002
|
+
py::arg("minsup") = 0.01,
|
|
1003
|
+
py::arg("time_limit") = 36000,
|
|
1004
|
+
py::arg("preproc") = false,
|
|
1005
|
+
py::arg("use_dic") = false,
|
|
1006
|
+
py::arg("verbose") = false,
|
|
1007
|
+
py::arg("out_file") = ""
|
|
1008
|
+
);
|
|
1009
|
+
|
|
271
1010
|
|
|
272
|
-
|
|
1011
|
+
m.def("LargePrefixProjection",
|
|
273
1012
|
[](py::object data,
|
|
274
1013
|
double minsup,
|
|
275
1014
|
unsigned int time_limit,
|
|
@@ -281,28 +1020,30 @@ std::cout << " total patterns = "
|
|
|
281
1020
|
largepp::time_limit = time_limit;
|
|
282
1021
|
largepp::pre_pro = preproc;
|
|
283
1022
|
largepp::use_dic = use_dic;
|
|
284
|
-
largepp::use_list = true;
|
|
1023
|
+
largepp::use_list = true;
|
|
285
1024
|
largepp::b_disp = verbose;
|
|
286
1025
|
largepp::b_write = !out_file.empty();
|
|
287
1026
|
largepp::out_file = out_file;
|
|
288
|
-
largepp::just_build = false;
|
|
1027
|
+
largepp::just_build = false;
|
|
289
1028
|
|
|
290
1029
|
largepp::ClearCollected();
|
|
291
1030
|
largepp::start_time = std::clock();
|
|
292
|
-
std::string fname = data.cast<std::string>();
|
|
293
|
-
/* 1) load instance (py list or filename) */
|
|
294
|
-
if (py::isinstance<py::str>(data))
|
|
295
|
-
|
|
296
|
-
largepp::Load_instance(fname, minsup);
|
|
297
|
-
else
|
|
298
|
-
largepp::Load_py(data, minsup); // helper you’ll expose
|
|
299
|
-
|
|
300
|
-
std::vector<unsigned long long> dbg;
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
1031
|
|
|
1032
|
+
// 👇 this was the last noisy one
|
|
1033
|
+
if (verbose) {
|
|
1034
|
+
std::cerr << " minsup=" << minsup
|
|
1035
|
+
<< " preproc=" << preproc
|
|
1036
|
+
<< " verbose=" << verbose
|
|
1037
|
+
<< " out_file=" << (out_file.empty() ? "(none)" : out_file)
|
|
1038
|
+
<< " use_dic=" << use_dic << "\n";
|
|
1039
|
+
}
|
|
305
1040
|
|
|
1041
|
+
if (py::isinstance<py::str>(data)) {
|
|
1042
|
+
std::string fname = data.cast<std::string>();
|
|
1043
|
+
largepp::Load_instance(fname, minsup);
|
|
1044
|
+
} else {
|
|
1045
|
+
largepp::Load_py(data, minsup);
|
|
1046
|
+
}
|
|
306
1047
|
|
|
307
1048
|
largepp::Freq_miner();
|
|
308
1049
|
|
|
@@ -320,6 +1061,8 @@ std::cout << " total patterns = "
|
|
|
320
1061
|
py::arg("out_file") = ""
|
|
321
1062
|
);
|
|
322
1063
|
|
|
1064
|
+
|
|
1065
|
+
|
|
323
1066
|
// ─────────────────────────────────────────────────────────────
|
|
324
1067
|
// LargeBTMiner -- Python wrapper for the largebm implementation
|
|
325
1068
|
// ─────────────────────────────────────────────────────────────
|
|
@@ -404,100 +1147,108 @@ std::cout << " total patterns = "
|
|
|
404
1147
|
|
|
405
1148
|
|
|
406
1149
|
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
1150
|
+
// ─────────────────────────────────────────────────────────────────────────
|
|
1151
|
+
// LargeBTMiner (MDD-based)
|
|
1152
|
+
// ─────────────────────────────────────────────────────────────────────────
|
|
1153
|
+
/*m.def("LargeBTMiner",
|
|
1154
|
+
[](py::object data,
|
|
1155
|
+
double minsup,
|
|
1156
|
+
unsigned int time_limit,
|
|
1157
|
+
bool preproc,
|
|
1158
|
+
bool use_dic,
|
|
1159
|
+
bool verbose,
|
|
1160
|
+
const std::string &out_file)
|
|
1161
|
+
{
|
|
1162
|
+
using namespace largebm;
|
|
1163
|
+
|
|
1164
|
+
// 0) Set global flags and timers
|
|
1165
|
+
largebm::time_limit = time_limit;
|
|
1166
|
+
largebm::pre_pro = preproc;
|
|
1167
|
+
largebm::use_dic = use_dic;
|
|
1168
|
+
largebm::use_list = false; // large-mode → always MDD
|
|
1169
|
+
largebm::b_disp = verbose;
|
|
1170
|
+
largebm::b_write = !out_file.empty();
|
|
1171
|
+
largebm::out_file = out_file;
|
|
1172
|
+
largebm::just_build = false;
|
|
1173
|
+
|
|
1174
|
+
// 0.1) Clear any leftover data/state from previous runs
|
|
1175
|
+
largebm::items.clear();
|
|
1176
|
+
largebm::item_dic.clear();
|
|
1177
|
+
largebm::inv_item_dic.clear();
|
|
1178
|
+
largebm::Tree.clear();
|
|
1179
|
+
largebm::DFS.clear();
|
|
1180
|
+
largebm::ClearCollected();
|
|
1181
|
+
|
|
1182
|
+
// 1) Load sequences (either from filename or from Python list)
|
|
1183
|
+
if (py::isinstance<py::str>(data)) {
|
|
1184
|
+
// ─────────── FILE-BASED MODE ───────────
|
|
1185
|
+
std::string path = data.cast<std::string>();
|
|
1186
|
+
if (!largebm::Load_instance(path, minsup))
|
|
1187
|
+
throw std::runtime_error("Failed to load file: " + path);
|
|
425
1188
|
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
largebm::
|
|
430
|
-
largebm::
|
|
431
|
-
largebm::DFS.clear();
|
|
432
|
-
largebm::ClearCollected();
|
|
1189
|
+
} else {
|
|
1190
|
+
// ────────── IN-MEMORY MODE ──────────
|
|
1191
|
+
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
1192
|
+
largebm::items = std::move(seqs);
|
|
1193
|
+
largebm::N = largebm::items.size();
|
|
433
1194
|
|
|
434
|
-
// 1)
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
1195
|
+
// 1.1) Compute basic DB statistics (M, E, L) and absolute support θ
|
|
1196
|
+
int max_id = 0;
|
|
1197
|
+
largebm::M = 0;
|
|
1198
|
+
largebm::E = 0;
|
|
1199
|
+
for (auto &seq : largebm::items) {
|
|
1200
|
+
largebm::M = std::max<unsigned int>(largebm::M, static_cast<unsigned int>(seq.size()));
|
|
1201
|
+
largebm::E += static_cast<unsigned long long>(seq.size());
|
|
1202
|
+
for (int x : seq) max_id = std::max(max_id, std::abs(x));
|
|
1203
|
+
}
|
|
1204
|
+
largebm::L = static_cast<unsigned int>(max_id);
|
|
1205
|
+
largebm::theta = (minsup < 1.0)
|
|
1206
|
+
? static_cast<unsigned long long>(std::ceil(minsup * largebm::N))
|
|
1207
|
+
: static_cast<unsigned long long>(minsup);
|
|
440
1208
|
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
largebm::
|
|
445
|
-
largebm::N = largebm::items.size();
|
|
1209
|
+
// 1.2) Initialize DFS buffer (size = L)
|
|
1210
|
+
largebm::DFS.reserve(largebm::L);
|
|
1211
|
+
for (unsigned int i = 0; i < largebm::L; ++i)
|
|
1212
|
+
largebm::DFS.emplace_back(-static_cast<int>(i) - 1);
|
|
446
1213
|
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
largebm::E += static_cast<unsigned long long>(seq.size());
|
|
454
|
-
for (int x : seq) max_id = std::max(max_id, std::abs(x));
|
|
455
|
-
}
|
|
456
|
-
largebm::L = static_cast<unsigned int>(max_id);
|
|
457
|
-
largebm::theta = (minsup < 1.0)
|
|
458
|
-
? static_cast<unsigned long long>(std::ceil(minsup * largebm::N))
|
|
459
|
-
: static_cast<unsigned long long>(minsup);
|
|
460
|
-
|
|
461
|
-
// 1.2) Initialize DFS buffer (size = L)
|
|
462
|
-
largebm::DFS.reserve(largebm::L);
|
|
463
|
-
for (unsigned int i = 0; i < largebm::L; ++i)
|
|
464
|
-
largebm::DFS.emplace_back(-static_cast<int>(i) - 1);
|
|
465
|
-
|
|
466
|
-
// 1.3) Build the MDD “Tree”
|
|
467
|
-
// Insert one dummy root node (item=0, freq=0, anct=0)
|
|
468
|
-
largebm::Tree.emplace_back(0, 0, 0);
|
|
469
|
-
for (auto &seq : largebm::items)
|
|
470
|
-
largebm::Build_MDD(const_cast<std::vector<int>&>(seq));
|
|
471
|
-
}
|
|
1214
|
+
// 1.3) Build the MDD “Tree”
|
|
1215
|
+
// Insert one dummy root node (item=0, freq=0, anct=0)
|
|
1216
|
+
largebm::Tree.emplace_back(0, 0, 0);
|
|
1217
|
+
for (auto &seq : largebm::items)
|
|
1218
|
+
largebm::Build_MDD(const_cast<std::vector<int>&>(seq));
|
|
1219
|
+
}
|
|
472
1220
|
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
}
|
|
480
|
-
largebm::inv_item_dic = std::move(inv);
|
|
1221
|
+
// 2) Rebuild inverse-dictionary from fresh item_dic
|
|
1222
|
+
{
|
|
1223
|
+
std::vector<int> inv(largebm::item_dic.size() + 1);
|
|
1224
|
+
for (int old = 1; old <= static_cast<int>(largebm::item_dic.size()); ++old) {
|
|
1225
|
+
int cid = largebm::item_dic[old - 1];
|
|
1226
|
+
if (cid > 0) inv[cid] = old;
|
|
481
1227
|
}
|
|
1228
|
+
largebm::inv_item_dic = std::move(inv);
|
|
1229
|
+
}
|
|
482
1230
|
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
1231
|
+
// 3) Start timing and run the miner
|
|
1232
|
+
largebm::start_time = std::clock();
|
|
1233
|
+
largebm::Freq_miner();
|
|
1234
|
+
|
|
1235
|
+
// 4) Collect results and elapsed time
|
|
1236
|
+
const auto& pats = largebm::GetCollected();
|
|
1237
|
+
|
|
1238
|
+
py::dict out;
|
|
1239
|
+
out["patterns"] = pats;
|
|
1240
|
+
out["time"] = largebm::give_time(std::clock() - largebm::start_time);
|
|
1241
|
+
return out;
|
|
1242
|
+
},
|
|
1243
|
+
py::arg("data"),
|
|
1244
|
+
py::arg("minsup") = 0.01,
|
|
1245
|
+
py::arg("time_limit") = 36000,
|
|
1246
|
+
py::arg("preproc") = false,
|
|
1247
|
+
py::arg("use_dic") = false,
|
|
1248
|
+
py::arg("verbose") = false,
|
|
1249
|
+
py::arg("out_file") = ""
|
|
1250
|
+
);
|
|
486
1251
|
|
|
487
|
-
// 4) Collect results and elapsed time
|
|
488
|
-
py::dict out;
|
|
489
|
-
out["patterns"] = largebm::GetCollected();
|
|
490
|
-
out["time"] = largebm::give_time(std::clock() - largebm::start_time);
|
|
491
|
-
return out;
|
|
492
|
-
},
|
|
493
|
-
py::arg("data"),
|
|
494
|
-
py::arg("minsup") = 0.01,
|
|
495
|
-
py::arg("time_limit") = 36000,
|
|
496
|
-
py::arg("preproc") = false,
|
|
497
|
-
py::arg("use_dic") = false,
|
|
498
|
-
py::arg("verbose") = false,
|
|
499
|
-
py::arg("out_file") = ""
|
|
500
|
-
);
|
|
501
1252
|
|
|
502
1253
|
|
|
503
1254
|
m.def("LargeHTMiner",
|
|
@@ -606,4 +1357,4 @@ m.def("LargeHTMiner",
|
|
|
606
1357
|
|
|
607
1358
|
|
|
608
1359
|
|
|
609
|
-
}
|
|
1360
|
+
} */
|