effspm 0.2.8__cp39-cp39-macosx_10_9_x86_64.whl → 0.3.2__cp39-cp39-macosx_10_9_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- effspm/_effspm.cpp +850 -210
- effspm/_effspm.cpython-39-darwin.so +0 -0
- effspm/btminer/src/build_mdd.cpp +42 -17
- effspm/btminer/src/build_mdd.hpp +13 -19
- effspm/btminer/src/freq_miner.cpp +134 -49
- effspm/btminer/src/freq_miner.hpp +16 -0
- effspm/btminer/src/load_inst.cpp +202 -126
- effspm/btminer/src/load_inst.hpp +22 -4
- effspm/btminer/src/main.cpp +83 -0
- effspm/btminer/src/utility.cpp +26 -41
- effspm/btminer/src/utility.hpp +6 -30
- effspm/freq_miner.hpp +2 -1
- effspm/htminer/src/build_mdd.cpp +46 -124
- effspm/htminer/src/build_mdd.hpp +56 -49
- effspm/htminer/src/freq_miner.cpp +341 -307
- effspm/htminer/src/freq_miner.hpp +39 -40
- effspm/htminer/src/load_inst.cpp +287 -336
- effspm/htminer/src/load_inst.hpp +23 -6
- effspm/htminer/src/main.cpp +97 -0
- effspm/htminer/src/utility.cpp +38 -57
- effspm/htminer/src/utility.hpp +9 -64
- effspm/largebm/src/build_mdd.cpp +69 -110
- effspm/largebm/src/build_mdd.hpp +22 -37
- effspm/largebm/src/freq_miner.cpp +241 -291
- effspm/largebm/src/freq_miner.hpp +25 -36
- effspm/largebm/src/load_inst.cpp +20 -26
- effspm/largebm/src/load_inst.hpp +24 -34
- effspm/largebm/src/main.cpp +95 -0
- effspm/largebm/src/utility.cpp +11 -21
- effspm/largebm/src/utility.hpp +7 -10
- effspm/largehm/src/build_mdd.cpp +75 -110
- effspm/largehm/src/build_mdd.hpp +53 -73
- effspm/largehm/src/freq_miner.cpp +134 -191
- effspm/largehm/src/freq_miner.hpp +37 -60
- effspm/largehm/src/load_inst.cpp +137 -174
- effspm/largehm/src/load_inst.hpp +13 -50
- effspm/largehm/src/main.cpp +95 -0
- effspm/largehm/src/utility.cpp +46 -28
- effspm/largehm/src/utility.hpp +18 -16
- effspm/largepp/src/freq_miner.cpp +184 -156
- effspm/largepp/src/freq_miner.hpp +11 -36
- effspm/largepp/src/load_inst.cpp +32 -12
- effspm/largepp/src/load_inst.hpp +15 -9
- effspm/largepp/src/main.cpp +108 -0
- effspm/largepp/src/pattern.hpp +31 -0
- effspm/load_inst.cpp +8 -8
- effspm/load_inst.hpp +1 -1
- effspm/main.cpp +103 -0
- {effspm-0.2.8.dist-info → effspm-0.3.2.dist-info}/METADATA +1 -1
- effspm-0.3.2.dist-info/RECORD +60 -0
- effspm-0.2.8.dist-info/RECORD +0 -53
- {effspm-0.2.8.dist-info → effspm-0.3.2.dist-info}/WHEEL +0 -0
- {effspm-0.2.8.dist-info → effspm-0.3.2.dist-info}/licenses/LICENSE +0 -0
- {effspm-0.2.8.dist-info → effspm-0.3.2.dist-info}/top_level.txt +0 -0
effspm/_effspm.cpp
CHANGED
|
@@ -2,49 +2,97 @@
|
|
|
2
2
|
|
|
3
3
|
#include <pybind11/pybind11.h>
|
|
4
4
|
#include <pybind11/stl.h>
|
|
5
|
-
|
|
5
|
+
|
|
6
6
|
#include <iostream>
|
|
7
|
+
#include <fstream>
|
|
8
|
+
#include <cstdio> // std::remove
|
|
9
|
+
#include <vector>
|
|
10
|
+
#include <string>
|
|
11
|
+
#include <ctime>
|
|
12
|
+
#include <cmath>
|
|
7
13
|
|
|
14
|
+
namespace py = pybind11;
|
|
8
15
|
|
|
9
|
-
// PrefixProjection headers
|
|
16
|
+
// PrefixProjection headers (global namespace)
|
|
10
17
|
#include "freq_miner.hpp"
|
|
11
18
|
#include "load_inst.hpp"
|
|
12
19
|
#include "utility.hpp"
|
|
13
20
|
|
|
14
|
-
// BTMiner (
|
|
21
|
+
// BTMiner (namespaced)
|
|
15
22
|
#include "btminer/src/freq_miner.hpp"
|
|
16
23
|
#include "btminer/src/load_inst.hpp"
|
|
17
24
|
#include "btminer/src/utility.hpp"
|
|
18
25
|
#include "btminer/src/build_mdd.hpp"
|
|
19
26
|
|
|
20
|
-
// HTMiner (
|
|
21
|
-
#include "htminer/src/build_mdd.hpp"
|
|
27
|
+
// HTMiner (namespaced)
|
|
28
|
+
#include "htminer/src/build_mdd.hpp"
|
|
22
29
|
#include "htminer/src/freq_miner.hpp"
|
|
23
30
|
#include "htminer/src/load_inst.hpp"
|
|
24
31
|
#include "htminer/src/utility.hpp"
|
|
25
32
|
|
|
26
|
-
|
|
33
|
+
// LargePrefixProjection
|
|
27
34
|
#include "largepp/src/freq_miner.hpp"
|
|
28
35
|
#include "largepp/src/load_inst.hpp"
|
|
29
36
|
#include "largepp/src/utility.hpp"
|
|
30
37
|
|
|
38
|
+
// LargeBTMiner
|
|
31
39
|
#include "largebm/src/freq_miner.hpp"
|
|
32
40
|
#include "largebm/src/load_inst.hpp"
|
|
33
41
|
#include "largebm/src/utility.hpp"
|
|
34
42
|
#include "largebm/src/build_mdd.hpp"
|
|
35
43
|
|
|
44
|
+
// LargeHTMiner
|
|
36
45
|
#include "largehm/src/freq_miner.hpp"
|
|
37
46
|
#include "largehm/src/load_inst.hpp"
|
|
38
47
|
#include "largehm/src/utility.hpp"
|
|
39
48
|
#include "largehm/src/build_mdd.hpp"
|
|
40
49
|
|
|
50
|
+
namespace {
|
|
51
|
+
|
|
52
|
+
// Scope guard for a temporary file on disk.
//
// When the guard is destroyed, the file named by `path` is removed, unless
// `path` is empty (the default), in which case the guard owns nothing.
//
// The guard is non-copyable (and therefore also non-movable): a copy would
// name the same file, and whichever copy was destroyed first would delete it
// while the other owner still expected the file to exist.
struct TempFile {
    std::string path;  // file to delete on destruction; empty = owns nothing

    TempFile() = default;
    TempFile(const TempFile&) = delete;
    TempFile& operator=(const TempFile&) = delete;

    ~TempFile() {
        if (!path.empty()) {
            // Best effort: the result is ignored because a destructor must
            // not throw and there is no useful recovery if removal fails.
            std::remove(path.c_str());
        }
    }
};
|
|
61
|
+
|
|
62
|
+
// Writes in-memory sequences to a newly named temporary text file and
// returns its path.
//
// File format: one sequence per line, items separated by a single space,
// every line (including the line for an empty sequence) terminated by '\n'.
//
// @param seqs  sequences to serialise; may be empty (an empty file is written)
// @return      path of the written file (always ends in ".txt"); the caller
//              is responsible for deleting it
// @throws std::runtime_error if no temporary name could be generated, the
//              file could not be opened, or the data could not be written
//              completely (the partial file is removed in that case)
std::string write_temp_seq_file(const std::vector<std::vector<int>>& seqs) {
    // NOTE(review): std::tmpnam only generates a name; another process can
    // create the same file before we open it. Kept to avoid new platform
    // dependencies -- consider mkstemp()/GetTempFileName() behind an #ifdef.
    char tmp_name[L_tmpnam];
    if (!std::tmpnam(tmp_name)) {
        throw std::runtime_error("Failed to create temporary file name");
    }
    std::string path = std::string(tmp_name) + ".txt";

    std::ofstream ofs(path);
    if (!ofs) {
        throw std::runtime_error("Failed to open temporary file for writing: " + path);
    }

    for (const auto& seq : seqs) {
        bool first = true;
        for (const int item : seq) {
            if (!first) ofs << ' ';
            ofs << item;
            first = false;
        }
        ofs << '\n';
    }

    // close() flushes buffered data; any earlier write error or a failed
    // flush leaves the stream in a failed state. Without this check a
    // truncated file would be handed to the miner and silently mined.
    ofs.close();
    if (ofs.fail()) {
        std::remove(path.c_str());
        throw std::runtime_error("Failed to write temporary file: " + path);
    }
    return path;
}
|
|
87
|
+
|
|
88
|
+
} // anonymous namespace
|
|
41
89
|
|
|
42
90
|
|
|
43
91
|
PYBIND11_MODULE(_effspm, m) {
|
|
44
|
-
m.doc() = "Unified SPM library: PrefixProjection, BTMiner, HTMiner";
|
|
92
|
+
m.doc() = "Unified SPM library: PrefixProjection, BTMiner, HTMiner, Large* variants";
|
|
45
93
|
|
|
46
94
|
// ─────────────────────────────────────────────────────────────
|
|
47
|
-
// PrefixProjection
|
|
95
|
+
// PrefixProjection (works directly on Python lists or files)
|
|
48
96
|
// ─────────────────────────────────────────────────────────────
|
|
49
97
|
m.def("PrefixProjection",
|
|
50
98
|
[](py::object data,
|
|
@@ -59,7 +107,7 @@ PYBIND11_MODULE(_effspm, m) {
|
|
|
59
107
|
::pre_pro = preproc;
|
|
60
108
|
::use_dic = use_dic;
|
|
61
109
|
::use_list = false;
|
|
62
|
-
::b_disp = verbose;
|
|
110
|
+
::b_disp = verbose; // controls prints in original code
|
|
63
111
|
::b_write = !out_file.empty();
|
|
64
112
|
::out_file = out_file;
|
|
65
113
|
|
|
@@ -69,7 +117,7 @@ PYBIND11_MODULE(_effspm, m) {
|
|
|
69
117
|
if (py::isinstance<py::str>(data)) {
|
|
70
118
|
std::string path = data.cast<std::string>();
|
|
71
119
|
if (!Load_instance(path, minsup))
|
|
72
|
-
throw std::runtime_error("
|
|
120
|
+
throw std::runtime_error("PrefixProjection: failed to load file: " + path);
|
|
73
121
|
} else {
|
|
74
122
|
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
75
123
|
items = std::move(seqs);
|
|
@@ -113,9 +161,197 @@ PYBIND11_MODULE(_effspm, m) {
|
|
|
113
161
|
);
|
|
114
162
|
|
|
115
163
|
// ─────────────────────────────────────────────────────────────
|
|
116
|
-
// BTMiner
|
|
164
|
+
// BTMiner (always uses professor's Load_instance)
|
|
165
|
+
// ─────────────────────────────────────────────────────────────
|
|
166
|
+
// ─────────────────────────────────────────────────────────────
|
|
167
|
+
// BTMiner (always uses professor's Load_instance)
|
|
168
|
+
// ─────────────────────────────────────────────────────────────
|
|
169
|
+
m.def("BTMiner",
|
|
170
|
+
[](py::object data,
|
|
171
|
+
double minsup,
|
|
172
|
+
unsigned int time_limit,
|
|
173
|
+
bool preproc,
|
|
174
|
+
bool use_dic,
|
|
175
|
+
bool verbose,
|
|
176
|
+
const std::string &out_file)
|
|
177
|
+
{
|
|
178
|
+
// 1) Configure professor globals
|
|
179
|
+
btminer::time_limit = static_cast<int>(time_limit);
|
|
180
|
+
btminer::pre_pro = preproc;
|
|
181
|
+
btminer::use_dic = use_dic;
|
|
182
|
+
btminer::b_disp = verbose;
|
|
183
|
+
btminer::b_write = !out_file.empty();
|
|
184
|
+
btminer::out_file = out_file;
|
|
185
|
+
btminer::N_mult = 1;
|
|
186
|
+
btminer::M_mult = 1;
|
|
187
|
+
btminer::just_build = false;
|
|
188
|
+
|
|
189
|
+
// 2) HARD RESET of *known* global state for BTMiner
|
|
190
|
+
// (Only touch what we know exists in btminer namespace)
|
|
191
|
+
btminer::ClearCollected(); // clear collected patterns
|
|
192
|
+
btminer::Tree.clear(); // clear MDD tree
|
|
193
|
+
btminer::DFS.clear(); // clear DFS patterns
|
|
194
|
+
|
|
195
|
+
btminer::M = 0;
|
|
196
|
+
btminer::L = 0;
|
|
197
|
+
btminer::N = 0;
|
|
198
|
+
btminer::theta = 0;
|
|
199
|
+
btminer::E = 0;
|
|
200
|
+
btminer::num_patt = 0; // reset pattern counter if defined
|
|
201
|
+
|
|
202
|
+
// NOTE: we do NOT reinsert root here; btminer::Load_instance()
|
|
203
|
+
// is responsible for calling Tree.emplace_back(0,0,0) as needed.
|
|
204
|
+
|
|
205
|
+
btminer::start_time = std::clock();
|
|
206
|
+
|
|
207
|
+
// 3) Handle input (path or list-of-lists)
|
|
208
|
+
TempFile tmp;
|
|
209
|
+
std::string path;
|
|
210
|
+
|
|
211
|
+
if (py::isinstance<py::str>(data)) {
|
|
212
|
+
// File path: use directly
|
|
213
|
+
path = data.cast<std::string>();
|
|
214
|
+
} else {
|
|
215
|
+
// Python list → write to a temp file in professor’s format
|
|
216
|
+
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
217
|
+
tmp.path = write_temp_seq_file(seqs);
|
|
218
|
+
path = tmp.path;
|
|
219
|
+
}
|
|
220
|
+
|
|
221
|
+
if (verbose) {
|
|
222
|
+
std::cerr << "[BTMiner] path=" << path
|
|
223
|
+
<< " minsup=" << minsup
|
|
224
|
+
<< " preproc=" << preproc
|
|
225
|
+
<< " use_dic=" << use_dic
|
|
226
|
+
<< std::endl;
|
|
227
|
+
}
|
|
228
|
+
|
|
229
|
+
// 4) Build MDD + run miner
|
|
230
|
+
if (!btminer::Load_instance(path, minsup)) {
|
|
231
|
+
throw std::runtime_error("BTMiner: failed to load instance from: " + path);
|
|
232
|
+
}
|
|
233
|
+
|
|
234
|
+
btminer::Freq_miner();
|
|
235
|
+
|
|
236
|
+
// 5) Return results
|
|
237
|
+
py::dict out;
|
|
238
|
+
out["patterns"] = btminer::GetCollected();
|
|
239
|
+
out["num_patterns"] = btminer::num_patt;
|
|
240
|
+
out["time"] = btminer::give_time(std::clock() - btminer::start_time);
|
|
241
|
+
out["N"] = btminer::N;
|
|
242
|
+
out["L"] = btminer::L;
|
|
243
|
+
out["theta"] = btminer::theta;
|
|
244
|
+
return out;
|
|
245
|
+
},
|
|
246
|
+
py::arg("data"),
|
|
247
|
+
py::arg("minsup") = 0.01,
|
|
248
|
+
py::arg("time_limit") = 36000,
|
|
249
|
+
py::arg("preproc") = false,
|
|
250
|
+
py::arg("use_dic") = false,
|
|
251
|
+
py::arg("verbose") = false,
|
|
252
|
+
py::arg("out_file") = ""
|
|
253
|
+
);
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
// ─────────────────────────────────────────────────────────────
|
|
257
|
+
// HTMiner (works on files; we use a temp file for in-memory data)
|
|
258
|
+
// ─────────────────────────────────────────────────────────────
|
|
259
|
+
// ─────────────────────────────────────────────────────────────
|
|
260
|
+
// HTMiner (always uses professor's Load_instance; pre_pro forced ON)
|
|
261
|
+
// ─────────────────────────────────────────────────────────────
|
|
262
|
+
m.def("HTMiner",
|
|
263
|
+
[](py::object data,
|
|
264
|
+
double minsup,
|
|
265
|
+
unsigned int time_limit,
|
|
266
|
+
bool /*preproc*/, // Python arg is ignored internally
|
|
267
|
+
bool use_dic,
|
|
268
|
+
bool verbose,
|
|
269
|
+
const std::string &out_file)
|
|
270
|
+
{
|
|
271
|
+
using namespace htminer;
|
|
272
|
+
|
|
273
|
+
// ───────── Global parameter setup ─────────
|
|
274
|
+
htminer::time_limit = time_limit;
|
|
275
|
+
|
|
276
|
+
// IMPORTANT: always run with preprocessing ON,
|
|
277
|
+
// regardless of the Python `preproc` flag.
|
|
278
|
+
htminer::pre_pro = true;
|
|
279
|
+
htminer::use_dic = use_dic;
|
|
280
|
+
htminer::just_build = false;
|
|
281
|
+
htminer::b_disp = verbose;
|
|
282
|
+
htminer::b_write = !out_file.empty();
|
|
283
|
+
htminer::out_file = out_file;
|
|
284
|
+
|
|
285
|
+
// ───────── HARD RESET of HTMiner globals ─────────
|
|
286
|
+
htminer::ClearCollected();
|
|
287
|
+
htminer::Tree.clear();
|
|
288
|
+
htminer::VTree.clear();
|
|
289
|
+
htminer::CTree.clear();
|
|
290
|
+
htminer::DFS.clear();
|
|
291
|
+
htminer::VDFS.clear();
|
|
292
|
+
htminer::item_dic.clear();
|
|
293
|
+
|
|
294
|
+
htminer::M = 0;
|
|
295
|
+
htminer::N = 0;
|
|
296
|
+
htminer::L = 0;
|
|
297
|
+
htminer::E = 0;
|
|
298
|
+
htminer::theta = 0;
|
|
299
|
+
htminer::mlim = 0;
|
|
300
|
+
htminer::itmset_exists = false;
|
|
301
|
+
|
|
302
|
+
// NOTE: do NOT add a root arc here;
|
|
303
|
+
// htminer::Load_instance() already does Tree.emplace_back(0,0,0)
|
|
304
|
+
htminer::start_time = std::clock();
|
|
305
|
+
|
|
306
|
+
// ───────── Handle input (path or in-memory sequences) ─────────
|
|
307
|
+
TempFile tmp;
|
|
308
|
+
std::string path;
|
|
309
|
+
|
|
310
|
+
if (py::isinstance<py::str>(data)) {
|
|
311
|
+
// data is a file path
|
|
312
|
+
path = data.cast<std::string>();
|
|
313
|
+
} else {
|
|
314
|
+
// data is a list[list[int]] → write a temp file in the same text format
|
|
315
|
+
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
316
|
+
tmp.path = write_temp_seq_file(seqs);
|
|
317
|
+
path = tmp.path;
|
|
318
|
+
}
|
|
319
|
+
|
|
320
|
+
if (verbose) {
|
|
321
|
+
std::cerr << "[HTMiner] path=" << path
|
|
322
|
+
<< " minsup=" << minsup
|
|
323
|
+
<< " preproc(always)=true"
|
|
324
|
+
<< " use_dic=" << use_dic
|
|
325
|
+
<< std::endl;
|
|
326
|
+
}
|
|
327
|
+
|
|
328
|
+
// ───────── Build MDD via professor's loader ─────────
|
|
329
|
+
if (!htminer::Load_instance(path, minsup)) {
|
|
330
|
+
throw std::runtime_error("HTMiner: failed to load instance from: " + path);
|
|
331
|
+
}
|
|
332
|
+
|
|
333
|
+
// ───────── Run miner ─────────
|
|
334
|
+
htminer::Freq_miner();
|
|
335
|
+
|
|
336
|
+
// ───────── Return results ─────────
|
|
337
|
+
py::dict out;
|
|
338
|
+
out["patterns"] = htminer::GetCollected();
|
|
339
|
+
out["time"] = htminer::give_time(std::clock() - htminer::start_time);
|
|
340
|
+
return out;
|
|
341
|
+
},
|
|
342
|
+
py::arg("data"),
|
|
343
|
+
py::arg("minsup") = 0.01,
|
|
344
|
+
py::arg("time_limit") = 36000,
|
|
345
|
+
py::arg("preproc") = false, // kept for API symmetry, but IGNORED
|
|
346
|
+
py::arg("use_dic") = false,
|
|
347
|
+
py::arg("verbose") = false,
|
|
348
|
+
py::arg("out_file") = ""
|
|
349
|
+
);
|
|
350
|
+
|
|
351
|
+
// ─────────────────────────────────────────────────────────────
|
|
352
|
+
// LargePrefixProjection (already has its own Load_py)
|
|
117
353
|
// ─────────────────────────────────────────────────────────────
|
|
118
|
-
m.def("
|
|
354
|
+
m.def("LargePrefixProjection",
|
|
119
355
|
[](py::object data,
|
|
120
356
|
double minsup,
|
|
121
357
|
unsigned int time_limit,
|
|
@@ -124,140 +360,317 @@ PYBIND11_MODULE(_effspm, m) {
|
|
|
124
360
|
bool verbose,
|
|
125
361
|
const std::string &out_file)
|
|
126
362
|
{
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
363
|
+
largepp::time_limit = time_limit;
|
|
364
|
+
largepp::pre_pro = preproc;
|
|
365
|
+
largepp::use_dic = use_dic;
|
|
366
|
+
largepp::use_list = true; // large prefix uses list-based mining
|
|
367
|
+
largepp::b_disp = verbose;
|
|
368
|
+
largepp::b_write = !out_file.empty();
|
|
369
|
+
largepp::out_file = out_file;
|
|
370
|
+
largepp::just_build = false;
|
|
371
|
+
|
|
372
|
+
largepp::ClearCollected();
|
|
373
|
+
largepp::start_time = std::clock();
|
|
134
374
|
|
|
135
|
-
|
|
136
|
-
|
|
375
|
+
if (py::isinstance<py::str>(data)) {
|
|
376
|
+
std::string fname = data.cast<std::string>();
|
|
377
|
+
largepp::Load_instance(fname, minsup);
|
|
378
|
+
} else {
|
|
379
|
+
largepp::Load_py(data, minsup);
|
|
380
|
+
}
|
|
381
|
+
|
|
382
|
+
largepp::Freq_miner();
|
|
383
|
+
|
|
384
|
+
py::dict out;
|
|
385
|
+
out["patterns"] = largepp::GetCollected();
|
|
386
|
+
out["time"] = largepp::give_time(std::clock() - largepp::start_time);
|
|
387
|
+
return out;
|
|
388
|
+
},
|
|
389
|
+
py::arg("data"),
|
|
390
|
+
py::arg("minsup") = 0.01,
|
|
391
|
+
py::arg("time_limit") = 36000,
|
|
392
|
+
py::arg("preproc") = false,
|
|
393
|
+
py::arg("use_dic") = false,
|
|
394
|
+
py::arg("verbose") = false,
|
|
395
|
+
py::arg("out_file") = ""
|
|
396
|
+
);
|
|
397
|
+
|
|
398
|
+
// ─────────────────────────────────────────────────────────────
|
|
399
|
+
// LargeBTMiner (always uses professor's largebm::Load_instance)
|
|
400
|
+
// ─────────────────────────────────────────────────────────────
|
|
401
|
+
m.def("LargeBTMiner",
|
|
402
|
+
[](py::object data,
|
|
403
|
+
double minsup,
|
|
404
|
+
unsigned int time_limit,
|
|
405
|
+
bool preproc,
|
|
406
|
+
bool use_dic,
|
|
407
|
+
bool verbose,
|
|
408
|
+
const std::string &out_file)
|
|
409
|
+
{
|
|
410
|
+
using namespace largebm;
|
|
411
|
+
|
|
412
|
+
largebm::time_limit = time_limit;
|
|
413
|
+
largebm::pre_pro = preproc;
|
|
414
|
+
largebm::use_dic = use_dic;
|
|
415
|
+
largebm::use_list = false; // MDD-based
|
|
416
|
+
largebm::b_disp = verbose;
|
|
417
|
+
largebm::b_write = !out_file.empty();
|
|
418
|
+
largebm::out_file = out_file;
|
|
419
|
+
largebm::just_build = false;
|
|
420
|
+
|
|
421
|
+
largebm::ClearCollected();
|
|
422
|
+
largebm::items.clear();
|
|
423
|
+
largebm::item_dic.clear();
|
|
424
|
+
largebm::inv_item_dic.clear();
|
|
425
|
+
largebm::Tree.clear();
|
|
426
|
+
largebm::DFS.clear();
|
|
427
|
+
|
|
428
|
+
largebm::start_time = std::clock();
|
|
429
|
+
|
|
430
|
+
TempFile tmp;
|
|
431
|
+
std::string path;
|
|
137
432
|
|
|
138
433
|
if (py::isinstance<py::str>(data)) {
|
|
139
|
-
|
|
140
|
-
if (!btminer::Load_instance(path, minsup))
|
|
141
|
-
throw std::runtime_error("Failed to load file: " + path);
|
|
434
|
+
path = data.cast<std::string>();
|
|
142
435
|
} else {
|
|
143
436
|
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
144
|
-
|
|
145
|
-
|
|
437
|
+
tmp.path = write_temp_seq_file(seqs);
|
|
438
|
+
path = tmp.path;
|
|
439
|
+
}
|
|
146
440
|
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
441
|
+
if (verbose) {
|
|
442
|
+
std::cerr << "[LargeBTMiner] path=" << path
|
|
443
|
+
<< " minsup=" << minsup
|
|
444
|
+
<< " preproc=" << preproc
|
|
445
|
+
<< " use_dic=" << use_dic
|
|
446
|
+
<< std::endl;
|
|
447
|
+
}
|
|
152
448
|
|
|
153
|
-
|
|
449
|
+
if (!largebm::Load_instance(path, minsup)) {
|
|
450
|
+
throw std::runtime_error("LargeBTMiner: failed to load instance from: " + path);
|
|
451
|
+
}
|
|
154
452
|
|
|
155
|
-
|
|
156
|
-
btminer::DFS.reserve(btminer::L);
|
|
157
|
-
for (unsigned int i = 0; i < btminer::L; ++i)
|
|
158
|
-
btminer::DFS.emplace_back(-static_cast<int>(i) - 1);
|
|
453
|
+
largebm::Freq_miner();
|
|
159
454
|
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
455
|
+
py::dict out;
|
|
456
|
+
out["patterns"] = largebm::GetCollected();
|
|
457
|
+
out["time"] = largebm::give_time(std::clock() - largebm::start_time);
|
|
458
|
+
return out;
|
|
459
|
+
},
|
|
460
|
+
py::arg("data"),
|
|
461
|
+
py::arg("minsup") = 0.01,
|
|
462
|
+
py::arg("time_limit") = 36000,
|
|
463
|
+
py::arg("preproc") = false,
|
|
464
|
+
py::arg("use_dic") = false,
|
|
465
|
+
py::arg("verbose") = false,
|
|
466
|
+
py::arg("out_file") = ""
|
|
467
|
+
);
|
|
468
|
+
|
|
469
|
+
// ─────────────────────────────────────────────────────────────
|
|
470
|
+
// LargeHTMiner (always uses professor's largehm::Load_instance; pre_pro forced ON)
|
|
471
|
+
// ─────────────────────────────────────────────────────────────
|
|
472
|
+
// ─────────────────────────────────────────────────────────────
|
|
473
|
+
// LargeHTMiner (professor's Large HTMiner, namespaced as largehm)
|
|
474
|
+
// ─────────────────────────────────────────────────────────────
|
|
475
|
+
m.def("LargeHTMiner",
|
|
476
|
+
[](py::object data,
|
|
477
|
+
double minsup,
|
|
478
|
+
unsigned int time_limit,
|
|
479
|
+
bool /*preproc*/, // kept for API symmetry; ignored
|
|
480
|
+
bool use_dic,
|
|
481
|
+
bool verbose,
|
|
482
|
+
const std::string &out_file)
|
|
483
|
+
{
|
|
484
|
+
using namespace largehm;
|
|
485
|
+
|
|
486
|
+
// 1) Global configuration (mirror professor's style)
|
|
487
|
+
largehm::time_limit = time_limit;
|
|
488
|
+
largehm::pre_pro = true; // always preprocess
|
|
489
|
+
largehm::use_dic = use_dic;
|
|
490
|
+
largehm::just_build = false;
|
|
491
|
+
largehm::b_disp = verbose;
|
|
492
|
+
largehm::b_write = !out_file.empty();
|
|
493
|
+
largehm::out_file = out_file;
|
|
494
|
+
|
|
495
|
+
// 2) HARD RESET of all global state for a fresh run
|
|
496
|
+
largehm::ClearCollected(); // our helper in largehm::utility.cpp
|
|
497
|
+
|
|
498
|
+
largehm::M = 0;
|
|
499
|
+
largehm::L = 0;
|
|
500
|
+
largehm::mlim = 0;
|
|
501
|
+
largehm::N = 0;
|
|
502
|
+
largehm::theta = 0;
|
|
503
|
+
largehm::E = 0;
|
|
504
|
+
largehm::itmset_exists = false;
|
|
505
|
+
|
|
506
|
+
// containers
|
|
507
|
+
// (item_dic reset is optional and not strictly needed here)
|
|
508
|
+
largehm::DFS.clear();
|
|
509
|
+
largehm::VDFS.clear();
|
|
510
|
+
largehm::Tree.clear();
|
|
511
|
+
largehm::VTree.clear();
|
|
512
|
+
largehm::CTree.clear();
|
|
513
|
+
|
|
514
|
+
largehm::start_time = std::clock();
|
|
515
|
+
|
|
516
|
+
// 3) Handle input (file path or Python list)
|
|
517
|
+
TempFile tmp;
|
|
518
|
+
std::string path;
|
|
519
|
+
|
|
520
|
+
if (py::isinstance<py::str>(data)) {
|
|
521
|
+
path = data.cast<std::string>();
|
|
522
|
+
} else {
|
|
523
|
+
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
524
|
+
tmp.path = write_temp_seq_file(seqs);
|
|
525
|
+
path = tmp.path;
|
|
166
526
|
}
|
|
167
527
|
|
|
168
|
-
|
|
528
|
+
if (verbose) {
|
|
529
|
+
std::cerr << "[LargeHTMiner] path=" << path
|
|
530
|
+
<< " minsup=" << minsup
|
|
531
|
+
<< " preproc(always)=true"
|
|
532
|
+
<< " use_dic=" << use_dic
|
|
533
|
+
<< std::endl;
|
|
534
|
+
}
|
|
535
|
+
|
|
536
|
+
// 4) Build MDD / load instance.
|
|
537
|
+
// NOTE: Load_instance() itself does Tree.emplace_back(0,0,0),
|
|
538
|
+
// so we DO NOT create a root node here.
|
|
539
|
+
if (!largehm::Load_instance(path, minsup)) {
|
|
540
|
+
throw std::runtime_error("LargeHTMiner: failed to load instance from: " + path);
|
|
541
|
+
}
|
|
542
|
+
|
|
543
|
+
// 5) Run miner (same timing logic as original main)
|
|
544
|
+
if (!largehm::just_build &&
|
|
545
|
+
largehm::give_time(std::clock() - largehm::start_time) < largehm::time_limit)
|
|
546
|
+
{
|
|
547
|
+
largehm::Freq_miner();
|
|
548
|
+
if (largehm::give_time(std::clock() - largehm::start_time) >= largehm::time_limit) {
|
|
549
|
+
std::cout << "TIME LIMIT REACHED\n";
|
|
550
|
+
}
|
|
551
|
+
}
|
|
169
552
|
|
|
553
|
+
// 6) Return collected patterns + runtime
|
|
170
554
|
py::dict out;
|
|
171
|
-
out["patterns"] =
|
|
172
|
-
out["time"] =
|
|
555
|
+
out["patterns"] = largehm::GetCollected();
|
|
556
|
+
out["time"] = largehm::give_time(std::clock() - largehm::start_time);
|
|
173
557
|
return out;
|
|
174
558
|
},
|
|
175
559
|
py::arg("data"),
|
|
176
560
|
py::arg("minsup") = 0.01,
|
|
177
561
|
py::arg("time_limit") = 36000,
|
|
178
|
-
py::arg("preproc") = false,
|
|
562
|
+
py::arg("preproc") = false, // kept for API symmetry
|
|
179
563
|
py::arg("use_dic") = false,
|
|
180
564
|
py::arg("verbose") = false,
|
|
181
565
|
py::arg("out_file") = ""
|
|
182
566
|
);
|
|
183
567
|
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
568
|
+
|
|
569
|
+
}
|
|
570
|
+
|
|
571
|
+
|
|
572
|
+
/*#include <pybind11/pybind11.h>
|
|
573
|
+
#include <pybind11/stl.h>
|
|
574
|
+
namespace py = pybind11;
|
|
575
|
+
#include <iostream>
|
|
576
|
+
|
|
577
|
+
|
|
578
|
+
// PrefixProjection headers
|
|
579
|
+
#include "freq_miner.hpp"
|
|
580
|
+
#include "load_inst.hpp"
|
|
581
|
+
#include "utility.hpp"
|
|
582
|
+
|
|
583
|
+
// BTMiner (wrapped in its own namespace in source files)
|
|
584
|
+
#include "btminer/src/freq_miner.hpp"
|
|
585
|
+
#include "btminer/src/load_inst.hpp"
|
|
586
|
+
#include "btminer/src/utility.hpp"
|
|
587
|
+
#include "btminer/src/build_mdd.hpp"
|
|
588
|
+
|
|
589
|
+
// HTMiner (wrapped in its own namespace in source files)
|
|
590
|
+
#include "htminer/src/build_mdd.hpp" // ← ensure HTMiner MDD builder is available
|
|
591
|
+
#include "htminer/src/freq_miner.hpp"
|
|
592
|
+
#include "htminer/src/load_inst.hpp"
|
|
593
|
+
#include "htminer/src/utility.hpp"
|
|
594
|
+
|
|
595
|
+
|
|
596
|
+
#include "largepp/src/freq_miner.hpp"
|
|
597
|
+
#include "largepp/src/load_inst.hpp"
|
|
598
|
+
#include "largepp/src/utility.hpp"
|
|
599
|
+
|
|
600
|
+
|
|
601
|
+
#include "largebm/src/freq_miner.hpp"
|
|
602
|
+
#include "largebm/src/load_inst.hpp"
|
|
603
|
+
#include "largebm/src/utility.hpp"
|
|
604
|
+
#include "largebm/src/build_mdd.hpp"
|
|
605
|
+
|
|
606
|
+
#include "largehm/src/freq_miner.hpp"
|
|
607
|
+
#include "largehm/src/load_inst.hpp"
|
|
608
|
+
#include "largehm/src/utility.hpp"
|
|
609
|
+
#include "largehm/src/build_mdd.hpp"
|
|
610
|
+
|
|
611
|
+
|
|
612
|
+
|
|
613
|
+
PYBIND11_MODULE(_effspm, m) {
|
|
614
|
+
m.doc() = "Unified SPM library: PrefixProjection, BTMiner, HTMiner";
|
|
615
|
+
|
|
616
|
+
// ─────────────────────────────────────────────────────────────
|
|
617
|
+
// PrefixProjection
|
|
618
|
+
// ─────────────────────────────────────────────────────────────
|
|
619
|
+
m.def("PrefixProjection",
|
|
188
620
|
[](py::object data,
|
|
189
|
-
double minsup,
|
|
190
|
-
|
|
191
|
-
bool
|
|
621
|
+
double minsup,
|
|
622
|
+
unsigned int time_limit,
|
|
623
|
+
bool preproc,
|
|
624
|
+
bool use_dic,
|
|
625
|
+
bool verbose,
|
|
626
|
+
const std::string &out_file)
|
|
192
627
|
{
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
// 2) load sequences (either from filename or from Python list)
|
|
628
|
+
::time_limit = time_limit;
|
|
629
|
+
::pre_pro = preproc;
|
|
630
|
+
::use_dic = use_dic;
|
|
631
|
+
::use_list = false;
|
|
632
|
+
::b_disp = verbose;
|
|
633
|
+
::b_write = !out_file.empty();
|
|
634
|
+
::out_file = out_file;
|
|
635
|
+
|
|
636
|
+
ClearCollected();
|
|
637
|
+
start_time = std::clock();
|
|
638
|
+
|
|
206
639
|
if (py::isinstance<py::str>(data)) {
|
|
207
640
|
std::string path = data.cast<std::string>();
|
|
208
|
-
if (!
|
|
641
|
+
if (!Load_instance(path, minsup))
|
|
209
642
|
throw std::runtime_error("Failed to load file: " + path);
|
|
210
643
|
} else {
|
|
211
644
|
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
212
|
-
|
|
213
|
-
|
|
645
|
+
items = std::move(seqs);
|
|
646
|
+
N = items.size();
|
|
214
647
|
|
|
215
|
-
// compute L (max item ID), M (max sequence length), E (total entries)
|
|
216
648
|
int max_id = 0;
|
|
217
|
-
|
|
218
|
-
htminer::E = 0;
|
|
219
|
-
for (auto &seq : htminer::items) {
|
|
220
|
-
htminer::M = std::max<unsigned int>(htminer::M, seq.size());
|
|
649
|
+
for (auto &seq : items)
|
|
221
650
|
for (int x : seq)
|
|
222
651
|
max_id = std::max(max_id, std::abs(x));
|
|
223
|
-
|
|
652
|
+
L = max_id;
|
|
653
|
+
|
|
654
|
+
theta = (minsup < 1.0) ? std::ceil(minsup * N) : minsup;
|
|
655
|
+
|
|
656
|
+
DFS.clear();
|
|
657
|
+
DFS.reserve(L);
|
|
658
|
+
for (unsigned int i = 0; i < L; ++i)
|
|
659
|
+
DFS.emplace_back(-static_cast<int>(i) - 1);
|
|
660
|
+
|
|
661
|
+
M = 0;
|
|
662
|
+
E = 0;
|
|
663
|
+
for (auto &seq : items) {
|
|
664
|
+
M = std::max<unsigned int>(M, seq.size());
|
|
665
|
+
E += seq.size();
|
|
224
666
|
}
|
|
225
|
-
htminer::L = max_id;
|
|
226
|
-
htminer::theta = (minsup < 1.0)
|
|
227
|
-
? static_cast<unsigned long long>(std::ceil(minsup * htminer::N))
|
|
228
|
-
: static_cast<unsigned long long>(minsup);
|
|
229
|
-
|
|
230
|
-
// build empty DFS stack (size L) as HTMiner expects
|
|
231
|
-
htminer::DFS.clear();
|
|
232
|
-
htminer::DFS.reserve(htminer::L);
|
|
233
|
-
for (unsigned int i = 0; i < static_cast<unsigned int>(htminer::L); ++i)
|
|
234
|
-
htminer::DFS.emplace_back(-static_cast<int>(i) - 1);
|
|
235
|
-
|
|
236
|
-
// initialize VDFS if HTMiner needs it
|
|
237
|
-
htminer::VDFS.clear();
|
|
238
|
-
htminer::VDFS.resize(htminer::L);
|
|
239
667
|
}
|
|
240
668
|
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
// std::cout << "[HTMiner] dumping all collected patterns:\n";
|
|
245
|
-
// for (size_t i = 0; i < htminer::collectedPatterns.size(); ++i) {
|
|
246
|
-
// const auto &seq = htminer::collectedPatterns[i];
|
|
247
|
-
// std::cout << "Pattern " << i << ": { ";
|
|
248
|
-
// for (int x : seq) {
|
|
249
|
-
// std::cout << x << " ";
|
|
250
|
-
// }
|
|
251
|
-
// std::cout << "}\n";
|
|
252
|
-
//}
|
|
253
|
-
std::cout << " total patterns = "
|
|
254
|
-
<< htminer::collectedPatterns.size() << "\n";
|
|
255
|
-
// ─────────────────────────────────────────────────
|
|
256
|
-
|
|
257
|
-
// 4) return patterns + elapsed time
|
|
669
|
+
Freq_miner();
|
|
670
|
+
|
|
258
671
|
py::dict out;
|
|
259
|
-
out["patterns"] =
|
|
260
|
-
out["time"] =
|
|
672
|
+
out["patterns"] = GetCollected();
|
|
673
|
+
out["time"] = give_time(std::clock() - start_time);
|
|
261
674
|
return out;
|
|
262
675
|
},
|
|
263
676
|
py::arg("data"),
|
|
@@ -268,8 +681,223 @@ std::cout << " total patterns = "
|
|
|
268
681
|
py::arg("verbose") = false,
|
|
269
682
|
py::arg("out_file") = ""
|
|
270
683
|
);
|
|
684
|
+
m.def("BTMiner",
|
|
685
|
+
[](py::object data,
|
|
686
|
+
double minsup,
|
|
687
|
+
unsigned int time_limit,
|
|
688
|
+
bool preproc,
|
|
689
|
+
bool use_dic,
|
|
690
|
+
bool verbose,
|
|
691
|
+
const std::string &out_file)
|
|
692
|
+
{
|
|
693
|
+
// We are calling the *professor* BTMiner, now namespaced as btminer::.
|
|
694
|
+
// So we only set the globals the professor code actually has.
|
|
695
|
+
|
|
696
|
+
// 1) configure professor globals
|
|
697
|
+
btminer::time_limit = static_cast<int>(time_limit);
|
|
698
|
+
btminer::pre_pro = preproc;
|
|
699
|
+
btminer::use_dic = use_dic;
|
|
700
|
+
btminer::b_disp = verbose;
|
|
701
|
+
btminer::b_write = !out_file.empty();
|
|
702
|
+
btminer::out_file = out_file;
|
|
703
|
+
btminer::N_mult = 1; // professor uses these too
|
|
704
|
+
btminer::M_mult = 1;
|
|
705
|
+
btminer::just_build = false; // we want full mining
|
|
706
|
+
|
|
707
|
+
btminer::start_time = std::clock();
|
|
708
|
+
|
|
709
|
+
// 2) load data
|
|
710
|
+
//
|
|
711
|
+
// Professor’s code is primarily file-based (Load_instance(const string&, double)).
|
|
712
|
+
// So: if user passes a file path → use the professor loader directly.
|
|
713
|
+
// If user passes a Python list-of-lists → we will build the MDD the same
|
|
714
|
+
// way professor’s loader does, but without changing his logic.
|
|
715
|
+
if (py::isinstance<py::str>(data)) {
|
|
716
|
+
// ----- FILE MODE -----
|
|
717
|
+
std::string path = data.cast<std::string>();
|
|
271
718
|
|
|
272
|
-
|
|
719
|
+
if (verbose) {
|
|
720
|
+
std::cerr << "[BT][binding] file=" << path
|
|
721
|
+
<< " minsup=" << minsup
|
|
722
|
+
<< " preproc=" << preproc << std::endl;
|
|
723
|
+
}
|
|
724
|
+
|
|
725
|
+
if (!btminer::Load_instance(path, minsup)) {
|
|
726
|
+
throw std::runtime_error("BTMiner: failed to load file: " + path);
|
|
727
|
+
}
|
|
728
|
+
} else {
|
|
729
|
+
// ----- PYTHON LIST MODE -----
|
|
730
|
+
//
|
|
731
|
+
// We mimic professor’s loader:
|
|
732
|
+
// - create root in Tree
|
|
733
|
+
// - compute N, M, L
|
|
734
|
+
// - compute theta from minsup
|
|
735
|
+
// - seed DFS (one Pattern per item, as in Preprocess branch)
|
|
736
|
+
// - call Build_MDD(...) for each sequence
|
|
737
|
+
//
|
|
738
|
+
// This DOES NOT change his mining logic; it just drives it from memory.
|
|
739
|
+
|
|
740
|
+
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
741
|
+
|
|
742
|
+
// clear MDD and globals to a known state
|
|
743
|
+
btminer::Tree.clear();
|
|
744
|
+
btminer::Tree.emplace_back(0, 0, 0); // root (exactly like professor)
|
|
745
|
+
|
|
746
|
+
// compute basic stats
|
|
747
|
+
int max_id = 0;
|
|
748
|
+
int max_len = 0;
|
|
749
|
+
int seq_count = 0;
|
|
750
|
+
long long entries = 0;
|
|
751
|
+
|
|
752
|
+
for (const auto &s : seqs) {
|
|
753
|
+
if (s.empty()) continue;
|
|
754
|
+
++seq_count;
|
|
755
|
+
max_len = std::max<int>(max_len, static_cast<int>(s.size()));
|
|
756
|
+
for (int x : s) {
|
|
757
|
+
max_id = std::max(max_id, std::abs(x));
|
|
758
|
+
++entries;
|
|
759
|
+
}
|
|
760
|
+
}
|
|
761
|
+
|
|
762
|
+
btminer::N = seq_count;
|
|
763
|
+
btminer::M = max_len;
|
|
764
|
+
btminer::L = max_id;
|
|
765
|
+
btminer::E = static_cast<int>(entries);
|
|
766
|
+
|
|
767
|
+
// theta = abs support
|
|
768
|
+
if (minsup < 1.0)
|
|
769
|
+
btminer::theta = static_cast<int>(std::ceil(minsup * btminer::N * btminer::N_mult));
|
|
770
|
+
else
|
|
771
|
+
btminer::theta = static_cast<int>(minsup);
|
|
772
|
+
|
|
773
|
+
// seed DFS exactly like professor does in the preprocessed branch:
|
|
774
|
+
btminer::DFS.clear();
|
|
775
|
+
btminer::DFS.reserve(btminer::L);
|
|
776
|
+
for (int i = 0; i < btminer::L; ++i)
|
|
777
|
+
btminer::DFS.emplace_back(-i - 1);
|
|
778
|
+
|
|
779
|
+
// now build the MDD, sequence by sequence
|
|
780
|
+
for (const auto &s : seqs) {
|
|
781
|
+
if (s.empty()) continue;
|
|
782
|
+
// professor’s Build_MDD takes a vector<int> by non-const ref
|
|
783
|
+
std::vector<int> tmp = s;
|
|
784
|
+
btminer::Build_MDD(tmp);
|
|
785
|
+
}
|
|
786
|
+
|
|
787
|
+
if (verbose) {
|
|
788
|
+
std::cerr << "[BT][binding] PY mode: N=" << btminer::N
|
|
789
|
+
<< " L=" << btminer::L
|
|
790
|
+
<< " M=" << btminer::M
|
|
791
|
+
<< " E=" << btminer::E
|
|
792
|
+
<< " theta=" << btminer::theta
|
|
793
|
+
<< " Tree.size()=" << btminer::Tree.size()
|
|
794
|
+
<< std::endl;
|
|
795
|
+
}
|
|
796
|
+
}
|
|
797
|
+
|
|
798
|
+
// 3) run professor’s miner
|
|
799
|
+
btminer::Freq_miner();
|
|
800
|
+
|
|
801
|
+
// 4) build python result
|
|
802
|
+
// 4) build python result
|
|
803
|
+
py::dict out;
|
|
804
|
+
out["patterns"] = btminer::GetCollected(); // ← NEW
|
|
805
|
+
out["num_patterns"] = btminer::num_patt;
|
|
806
|
+
out["time"] = btminer::give_time(std::clock() - btminer::start_time);
|
|
807
|
+
out["N"] = btminer::N;
|
|
808
|
+
out["L"] = btminer::L;
|
|
809
|
+
out["theta"] = btminer::theta;
|
|
810
|
+
return out;
|
|
811
|
+
|
|
812
|
+
},
|
|
813
|
+
py::arg("data"),
|
|
814
|
+
py::arg("minsup") = 0.01,
|
|
815
|
+
py::arg("time_limit") = 36000,
|
|
816
|
+
py::arg("preproc") = false,
|
|
817
|
+
py::arg("use_dic") = false,
|
|
818
|
+
py::arg("verbose") = false,
|
|
819
|
+
py::arg("out_file") = ""
|
|
820
|
+
);
|
|
821
|
+
|
|
822
|
+
|
|
823
|
+
|
|
824
|
+
|
|
825
|
+
// HTMiner
|
|
826
|
+
// ─────────────────────────────────────────────────────────────
|
|
827
|
+
// HTMiner
|
|
828
|
+
m.def("HTMiner",
|
|
829
|
+
[](py::object data,
|
|
830
|
+
double minsup, unsigned int time_limit,
|
|
831
|
+
bool preproc, bool use_dic,
|
|
832
|
+
bool verbose, const std::string &out_file)
|
|
833
|
+
{
|
|
834
|
+
htminer::time_limit = time_limit;
|
|
835
|
+
htminer::pre_pro = preproc;
|
|
836
|
+
htminer::use_dic = use_dic;
|
|
837
|
+
htminer::just_build = false;
|
|
838
|
+
htminer::use_list = false;
|
|
839
|
+
htminer::b_disp = verbose;
|
|
840
|
+
htminer::b_write = !out_file.empty();
|
|
841
|
+
htminer::out_file = out_file;
|
|
842
|
+
htminer::ClearCollected();
|
|
843
|
+
htminer::start_time = std::clock();
|
|
844
|
+
|
|
845
|
+
if (py::isinstance<py::str>(data)) {
|
|
846
|
+
std::string path = data.cast<std::string>();
|
|
847
|
+
if (!htminer::Load_instance(path, minsup))
|
|
848
|
+
throw std::runtime_error("Failed to load file: " + path);
|
|
849
|
+
} else {
|
|
850
|
+
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
851
|
+
htminer::items = std::move(seqs);
|
|
852
|
+
htminer::N = htminer::items.size();
|
|
853
|
+
|
|
854
|
+
int max_id = 0;
|
|
855
|
+
htminer::M = 0;
|
|
856
|
+
htminer::E = 0;
|
|
857
|
+
for (auto &seq : htminer::items) {
|
|
858
|
+
htminer::M = std::max<unsigned int>(htminer::M, seq.size());
|
|
859
|
+
for (int x : seq)
|
|
860
|
+
max_id = std::max(max_id, std::abs(x));
|
|
861
|
+
htminer::E += seq.size();
|
|
862
|
+
}
|
|
863
|
+
htminer::L = max_id;
|
|
864
|
+
htminer::theta = (minsup < 1.0)
|
|
865
|
+
? static_cast<unsigned long long>(std::ceil(minsup * htminer::N))
|
|
866
|
+
: static_cast<unsigned long long>(minsup);
|
|
867
|
+
|
|
868
|
+
htminer::DFS.clear();
|
|
869
|
+
htminer::DFS.reserve(htminer::L);
|
|
870
|
+
for (unsigned int i = 0; i < static_cast<unsigned int>(htminer::L); ++i)
|
|
871
|
+
htminer::DFS.emplace_back(-static_cast<int>(i) - 1);
|
|
872
|
+
|
|
873
|
+
htminer::VDFS.clear();
|
|
874
|
+
htminer::VDFS.resize(htminer::L);
|
|
875
|
+
}
|
|
876
|
+
|
|
877
|
+
htminer::Freq_miner();
|
|
878
|
+
|
|
879
|
+
// 👇 now really respects verbose
|
|
880
|
+
if (verbose) {
|
|
881
|
+
std::cout << " total patterns = "
|
|
882
|
+
<< htminer::collectedPatterns.size() << "\n";
|
|
883
|
+
}
|
|
884
|
+
|
|
885
|
+
py::dict out;
|
|
886
|
+
out["patterns"] = htminer::GetCollected();
|
|
887
|
+
out["time"] = htminer::give_time(std::clock() - htminer::start_time);
|
|
888
|
+
return out;
|
|
889
|
+
},
|
|
890
|
+
py::arg("data"),
|
|
891
|
+
py::arg("minsup") = 0.01,
|
|
892
|
+
py::arg("time_limit") = 36000,
|
|
893
|
+
py::arg("preproc") = false,
|
|
894
|
+
py::arg("use_dic") = false,
|
|
895
|
+
py::arg("verbose") = false,
|
|
896
|
+
py::arg("out_file") = ""
|
|
897
|
+
);
|
|
898
|
+
|
|
899
|
+
|
|
900
|
+
m.def("LargePrefixProjection",
|
|
273
901
|
[](py::object data,
|
|
274
902
|
double minsup,
|
|
275
903
|
unsigned int time_limit,
|
|
@@ -281,28 +909,30 @@ std::cout << " total patterns = "
|
|
|
281
909
|
largepp::time_limit = time_limit;
|
|
282
910
|
largepp::pre_pro = preproc;
|
|
283
911
|
largepp::use_dic = use_dic;
|
|
284
|
-
largepp::use_list = true;
|
|
912
|
+
largepp::use_list = true;
|
|
285
913
|
largepp::b_disp = verbose;
|
|
286
914
|
largepp::b_write = !out_file.empty();
|
|
287
915
|
largepp::out_file = out_file;
|
|
288
|
-
largepp::just_build = false;
|
|
916
|
+
largepp::just_build = false;
|
|
289
917
|
|
|
290
918
|
largepp::ClearCollected();
|
|
291
919
|
largepp::start_time = std::clock();
|
|
292
|
-
std::string fname = data.cast<std::string>();
|
|
293
|
-
/* 1) load instance (py list or filename) */
|
|
294
|
-
if (py::isinstance<py::str>(data))
|
|
295
|
-
|
|
296
|
-
largepp::Load_instance(fname, minsup);
|
|
297
|
-
else
|
|
298
|
-
largepp::Load_py(data, minsup); // helper you’ll expose
|
|
299
|
-
|
|
300
|
-
std::vector<unsigned long long> dbg;
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
920
|
|
|
921
|
+
// 👇 this was the last noisy one
|
|
922
|
+
if (verbose) {
|
|
923
|
+
std::cerr << " minsup=" << minsup
|
|
924
|
+
<< " preproc=" << preproc
|
|
925
|
+
<< " verbose=" << verbose
|
|
926
|
+
<< " out_file=" << (out_file.empty() ? "(none)" : out_file)
|
|
927
|
+
<< " use_dic=" << use_dic << "\n";
|
|
928
|
+
}
|
|
305
929
|
|
|
930
|
+
if (py::isinstance<py::str>(data)) {
|
|
931
|
+
std::string fname = data.cast<std::string>();
|
|
932
|
+
largepp::Load_instance(fname, minsup);
|
|
933
|
+
} else {
|
|
934
|
+
largepp::Load_py(data, minsup);
|
|
935
|
+
}
|
|
306
936
|
|
|
307
937
|
largepp::Freq_miner();
|
|
308
938
|
|
|
@@ -320,6 +950,8 @@ std::cout << " total patterns = "
|
|
|
320
950
|
py::arg("out_file") = ""
|
|
321
951
|
);
|
|
322
952
|
|
|
953
|
+
|
|
954
|
+
|
|
323
955
|
// ─────────────────────────────────────────────────────────────
|
|
324
956
|
// LargeBTMiner -- Python wrapper for the largebm implementation
|
|
325
957
|
// ─────────────────────────────────────────────────────────────
|
|
@@ -404,100 +1036,108 @@ std::cout << " total patterns = "
|
|
|
404
1036
|
|
|
405
1037
|
|
|
406
1038
|
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
1039
|
+
// ─────────────────────────────────────────────────────────────────────────
|
|
1040
|
+
// LargeBTMiner (MDD-based)
|
|
1041
|
+
// ─────────────────────────────────────────────────────────────────────────
|
|
1042
|
+
/*m.def("LargeBTMiner",
|
|
1043
|
+
[](py::object data,
|
|
1044
|
+
double minsup,
|
|
1045
|
+
unsigned int time_limit,
|
|
1046
|
+
bool preproc,
|
|
1047
|
+
bool use_dic,
|
|
1048
|
+
bool verbose,
|
|
1049
|
+
const std::string &out_file)
|
|
1050
|
+
{
|
|
1051
|
+
using namespace largebm;
|
|
1052
|
+
|
|
1053
|
+
// 0) Set global flags and timers
|
|
1054
|
+
largebm::time_limit = time_limit;
|
|
1055
|
+
largebm::pre_pro = preproc;
|
|
1056
|
+
largebm::use_dic = use_dic;
|
|
1057
|
+
largebm::use_list = false; // large-mode → always MDD
|
|
1058
|
+
largebm::b_disp = verbose;
|
|
1059
|
+
largebm::b_write = !out_file.empty();
|
|
1060
|
+
largebm::out_file = out_file;
|
|
1061
|
+
largebm::just_build = false;
|
|
1062
|
+
|
|
1063
|
+
// 0.1) Clear any leftover data/state from previous runs
|
|
1064
|
+
largebm::items.clear();
|
|
1065
|
+
largebm::item_dic.clear();
|
|
1066
|
+
largebm::inv_item_dic.clear();
|
|
1067
|
+
largebm::Tree.clear();
|
|
1068
|
+
largebm::DFS.clear();
|
|
1069
|
+
largebm::ClearCollected();
|
|
1070
|
+
|
|
1071
|
+
// 1) Load sequences (either from filename or from Python list)
|
|
1072
|
+
if (py::isinstance<py::str>(data)) {
|
|
1073
|
+
// ─────────── FILE-BASED MODE ───────────
|
|
1074
|
+
std::string path = data.cast<std::string>();
|
|
1075
|
+
if (!largebm::Load_instance(path, minsup))
|
|
1076
|
+
throw std::runtime_error("Failed to load file: " + path);
|
|
425
1077
|
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
largebm::
|
|
430
|
-
largebm::
|
|
431
|
-
largebm::DFS.clear();
|
|
432
|
-
largebm::ClearCollected();
|
|
1078
|
+
} else {
|
|
1079
|
+
// ────────── IN-MEMORY MODE ──────────
|
|
1080
|
+
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
1081
|
+
largebm::items = std::move(seqs);
|
|
1082
|
+
largebm::N = largebm::items.size();
|
|
433
1083
|
|
|
434
|
-
// 1)
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
1084
|
+
// 1.1) Compute basic DB statistics (M, E, L) and absolute support θ
|
|
1085
|
+
int max_id = 0;
|
|
1086
|
+
largebm::M = 0;
|
|
1087
|
+
largebm::E = 0;
|
|
1088
|
+
for (auto &seq : largebm::items) {
|
|
1089
|
+
largebm::M = std::max<unsigned int>(largebm::M, static_cast<unsigned int>(seq.size()));
|
|
1090
|
+
largebm::E += static_cast<unsigned long long>(seq.size());
|
|
1091
|
+
for (int x : seq) max_id = std::max(max_id, std::abs(x));
|
|
1092
|
+
}
|
|
1093
|
+
largebm::L = static_cast<unsigned int>(max_id);
|
|
1094
|
+
largebm::theta = (minsup < 1.0)
|
|
1095
|
+
? static_cast<unsigned long long>(std::ceil(minsup * largebm::N))
|
|
1096
|
+
: static_cast<unsigned long long>(minsup);
|
|
440
1097
|
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
largebm::
|
|
445
|
-
largebm::N = largebm::items.size();
|
|
1098
|
+
// 1.2) Initialize DFS buffer (size = L)
|
|
1099
|
+
largebm::DFS.reserve(largebm::L);
|
|
1100
|
+
for (unsigned int i = 0; i < largebm::L; ++i)
|
|
1101
|
+
largebm::DFS.emplace_back(-static_cast<int>(i) - 1);
|
|
446
1102
|
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
largebm::E += static_cast<unsigned long long>(seq.size());
|
|
454
|
-
for (int x : seq) max_id = std::max(max_id, std::abs(x));
|
|
455
|
-
}
|
|
456
|
-
largebm::L = static_cast<unsigned int>(max_id);
|
|
457
|
-
largebm::theta = (minsup < 1.0)
|
|
458
|
-
? static_cast<unsigned long long>(std::ceil(minsup * largebm::N))
|
|
459
|
-
: static_cast<unsigned long long>(minsup);
|
|
460
|
-
|
|
461
|
-
// 1.2) Initialize DFS buffer (size = L)
|
|
462
|
-
largebm::DFS.reserve(largebm::L);
|
|
463
|
-
for (unsigned int i = 0; i < largebm::L; ++i)
|
|
464
|
-
largebm::DFS.emplace_back(-static_cast<int>(i) - 1);
|
|
465
|
-
|
|
466
|
-
// 1.3) Build the MDD “Tree”
|
|
467
|
-
// Insert one dummy root node (item=0, freq=0, anct=0)
|
|
468
|
-
largebm::Tree.emplace_back(0, 0, 0);
|
|
469
|
-
for (auto &seq : largebm::items)
|
|
470
|
-
largebm::Build_MDD(const_cast<std::vector<int>&>(seq));
|
|
471
|
-
}
|
|
1103
|
+
// 1.3) Build the MDD “Tree”
|
|
1104
|
+
// Insert one dummy root node (item=0, freq=0, anct=0)
|
|
1105
|
+
largebm::Tree.emplace_back(0, 0, 0);
|
|
1106
|
+
for (auto &seq : largebm::items)
|
|
1107
|
+
largebm::Build_MDD(const_cast<std::vector<int>&>(seq));
|
|
1108
|
+
}
|
|
472
1109
|
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
}
|
|
480
|
-
largebm::inv_item_dic = std::move(inv);
|
|
1110
|
+
// 2) Rebuild inverse-dictionary from fresh item_dic
|
|
1111
|
+
{
|
|
1112
|
+
std::vector<int> inv(largebm::item_dic.size() + 1);
|
|
1113
|
+
for (int old = 1; old <= static_cast<int>(largebm::item_dic.size()); ++old) {
|
|
1114
|
+
int cid = largebm::item_dic[old - 1];
|
|
1115
|
+
if (cid > 0) inv[cid] = old;
|
|
481
1116
|
}
|
|
1117
|
+
largebm::inv_item_dic = std::move(inv);
|
|
1118
|
+
}
|
|
482
1119
|
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
1120
|
+
// 3) Start timing and run the miner
|
|
1121
|
+
largebm::start_time = std::clock();
|
|
1122
|
+
largebm::Freq_miner();
|
|
1123
|
+
|
|
1124
|
+
// 4) Collect results and elapsed time
|
|
1125
|
+
const auto& pats = largebm::GetCollected();
|
|
1126
|
+
|
|
1127
|
+
py::dict out;
|
|
1128
|
+
out["patterns"] = pats;
|
|
1129
|
+
out["time"] = largebm::give_time(std::clock() - largebm::start_time);
|
|
1130
|
+
return out;
|
|
1131
|
+
},
|
|
1132
|
+
py::arg("data"),
|
|
1133
|
+
py::arg("minsup") = 0.01,
|
|
1134
|
+
py::arg("time_limit") = 36000,
|
|
1135
|
+
py::arg("preproc") = false,
|
|
1136
|
+
py::arg("use_dic") = false,
|
|
1137
|
+
py::arg("verbose") = false,
|
|
1138
|
+
py::arg("out_file") = ""
|
|
1139
|
+
);
|
|
486
1140
|
|
|
487
|
-
// 4) Collect results and elapsed time
|
|
488
|
-
py::dict out;
|
|
489
|
-
out["patterns"] = largebm::GetCollected();
|
|
490
|
-
out["time"] = largebm::give_time(std::clock() - largebm::start_time);
|
|
491
|
-
return out;
|
|
492
|
-
},
|
|
493
|
-
py::arg("data"),
|
|
494
|
-
py::arg("minsup") = 0.01,
|
|
495
|
-
py::arg("time_limit") = 36000,
|
|
496
|
-
py::arg("preproc") = false,
|
|
497
|
-
py::arg("use_dic") = false,
|
|
498
|
-
py::arg("verbose") = false,
|
|
499
|
-
py::arg("out_file") = ""
|
|
500
|
-
);
|
|
501
1141
|
|
|
502
1142
|
|
|
503
1143
|
m.def("LargeHTMiner",
|
|
@@ -606,4 +1246,4 @@ m.def("LargeHTMiner",
|
|
|
606
1246
|
|
|
607
1247
|
|
|
608
1248
|
|
|
609
|
-
}
|
|
1249
|
+
} */
|