effspm 0.2.7__cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl → 0.3.1__cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- effspm/_effspm.cpp +818 -200
- effspm/_effspm.cpython-310-i386-linux-gnu.so +0 -0
- effspm/btminer/src/build_mdd.cpp +42 -17
- effspm/btminer/src/build_mdd.hpp +13 -19
- effspm/btminer/src/freq_miner.cpp +134 -49
- effspm/btminer/src/freq_miner.hpp +16 -0
- effspm/btminer/src/load_inst.cpp +202 -126
- effspm/btminer/src/load_inst.hpp +22 -4
- effspm/btminer/src/main.cpp +83 -0
- effspm/btminer/src/utility.cpp +26 -41
- effspm/btminer/src/utility.hpp +6 -30
- effspm/freq_miner.hpp +2 -1
- effspm/htminer/src/build_mdd.cpp +46 -124
- effspm/htminer/src/build_mdd.hpp +56 -49
- effspm/htminer/src/freq_miner.cpp +341 -307
- effspm/htminer/src/freq_miner.hpp +39 -40
- effspm/htminer/src/load_inst.cpp +287 -336
- effspm/htminer/src/load_inst.hpp +23 -6
- effspm/htminer/src/main.cpp +97 -0
- effspm/htminer/src/utility.cpp +38 -57
- effspm/htminer/src/utility.hpp +9 -64
- effspm/largebm/src/build_mdd.cpp +69 -110
- effspm/largebm/src/build_mdd.hpp +22 -37
- effspm/largebm/src/freq_miner.cpp +241 -291
- effspm/largebm/src/freq_miner.hpp +25 -36
- effspm/largebm/src/load_inst.cpp +20 -26
- effspm/largebm/src/load_inst.hpp +24 -34
- effspm/largebm/src/main.cpp +95 -0
- effspm/largebm/src/utility.cpp +11 -21
- effspm/largebm/src/utility.hpp +7 -10
- effspm/largehm/src/build_mdd.cpp +75 -110
- effspm/largehm/src/build_mdd.hpp +53 -73
- effspm/largehm/src/freq_miner.cpp +134 -191
- effspm/largehm/src/freq_miner.hpp +37 -60
- effspm/largehm/src/load_inst.cpp +137 -174
- effspm/largehm/src/load_inst.hpp +13 -50
- effspm/largehm/src/main.cpp +95 -0
- effspm/largehm/src/utility.cpp +46 -28
- effspm/largehm/src/utility.hpp +18 -16
- effspm/largepp/src/freq_miner.cpp +184 -156
- effspm/largepp/src/freq_miner.hpp +11 -36
- effspm/largepp/src/load_inst.cpp +32 -12
- effspm/largepp/src/load_inst.hpp +15 -9
- effspm/largepp/src/main.cpp +108 -0
- effspm/largepp/src/pattern.hpp +31 -0
- effspm/load_inst.cpp +8 -8
- effspm/load_inst.hpp +1 -1
- effspm/main.cpp +103 -0
- {effspm-0.2.7.dist-info → effspm-0.3.1.dist-info}/METADATA +1 -1
- effspm-0.3.1.dist-info/RECORD +60 -0
- effspm-0.2.7.dist-info/RECORD +0 -53
- {effspm-0.2.7.dist-info → effspm-0.3.1.dist-info}/WHEEL +0 -0
- {effspm-0.2.7.dist-info → effspm-0.3.1.dist-info}/licenses/LICENSE +0 -0
- {effspm-0.2.7.dist-info → effspm-0.3.1.dist-info}/top_level.txt +0 -0
effspm/_effspm.cpp
CHANGED
|
@@ -2,49 +2,97 @@
|
|
|
2
2
|
|
|
3
3
|
#include <pybind11/pybind11.h>
|
|
4
4
|
#include <pybind11/stl.h>
|
|
5
|
-
|
|
5
|
+
|
|
6
6
|
#include <iostream>
|
|
7
|
+
#include <fstream>
|
|
8
|
+
#include <cstdio> // std::remove
|
|
9
|
+
#include <vector>
|
|
10
|
+
#include <string>
|
|
11
|
+
#include <ctime>
|
|
12
|
+
#include <cmath>
|
|
7
13
|
|
|
14
|
+
namespace py = pybind11;
|
|
8
15
|
|
|
9
|
-
// PrefixProjection headers
|
|
16
|
+
// PrefixProjection headers (global namespace)
|
|
10
17
|
#include "freq_miner.hpp"
|
|
11
18
|
#include "load_inst.hpp"
|
|
12
19
|
#include "utility.hpp"
|
|
13
20
|
|
|
14
|
-
// BTMiner (
|
|
21
|
+
// BTMiner (namespaced)
|
|
15
22
|
#include "btminer/src/freq_miner.hpp"
|
|
16
23
|
#include "btminer/src/load_inst.hpp"
|
|
17
24
|
#include "btminer/src/utility.hpp"
|
|
18
25
|
#include "btminer/src/build_mdd.hpp"
|
|
19
26
|
|
|
20
|
-
// HTMiner (
|
|
21
|
-
#include "htminer/src/build_mdd.hpp"
|
|
27
|
+
// HTMiner (namespaced)
|
|
28
|
+
#include "htminer/src/build_mdd.hpp"
|
|
22
29
|
#include "htminer/src/freq_miner.hpp"
|
|
23
30
|
#include "htminer/src/load_inst.hpp"
|
|
24
31
|
#include "htminer/src/utility.hpp"
|
|
25
32
|
|
|
26
|
-
|
|
33
|
+
// LargePrefixProjection
|
|
27
34
|
#include "largepp/src/freq_miner.hpp"
|
|
28
35
|
#include "largepp/src/load_inst.hpp"
|
|
29
36
|
#include "largepp/src/utility.hpp"
|
|
30
37
|
|
|
38
|
+
// LargeBTMiner
|
|
31
39
|
#include "largebm/src/freq_miner.hpp"
|
|
32
40
|
#include "largebm/src/load_inst.hpp"
|
|
33
41
|
#include "largebm/src/utility.hpp"
|
|
34
42
|
#include "largebm/src/build_mdd.hpp"
|
|
35
43
|
|
|
44
|
+
// LargeHTMiner
|
|
36
45
|
#include "largehm/src/freq_miner.hpp"
|
|
37
46
|
#include "largehm/src/load_inst.hpp"
|
|
38
47
|
#include "largehm/src/utility.hpp"
|
|
39
48
|
#include "largehm/src/build_mdd.hpp"
|
|
40
49
|
|
|
50
|
+
namespace {
|
|
51
|
+
|
|
52
|
+
// Scope guard for a temporary file on disk.
//
// Holds the path of a file that should be deleted when the guard goes out of
// scope. An empty `path` means "nothing to delete", which is the state of a
// default-constructed guard. Typical use in this file:
//
//     TempFile tmp;
//     tmp.path = write_temp_seq_file(seqs);
//
// The guard is deliberately non-copyable and non-movable: two guards holding
// the same path would each try to delete the file, and the first one to be
// destroyed would pull the file out from under the other.
struct TempFile {
    std::string path;  // file to delete on destruction; empty = no file owned

    TempFile() = default;
    TempFile(const TempFile&) = delete;
    TempFile& operator=(const TempFile&) = delete;
    TempFile(TempFile&&) = delete;
    TempFile& operator=(TempFile&&) = delete;

    ~TempFile() {
        if (!path.empty()) {
            // Best-effort cleanup: the return value is intentionally ignored
            // because a destructor has no way to report the failure.
            std::remove(path.c_str());
        }
    }
};
|
|
61
|
+
|
|
62
|
+
// Write in-memory sequences (a Python list[list[int]] after conversion) to a
// newly named temporary text file so they can be handed to a path-based
// Load_instance(): one sequence per line, items separated by single spaces.
// An empty sequence produces an empty line.
//
// Returns the path of the file that was written. The caller owns the file and
// is responsible for deleting it (e.g. by storing the path in a TempFile).
//
// Throws std::runtime_error when no temporary name can be generated, when the
// file cannot be opened, or when writing/flushing fails. On a write failure
// the partially written file is removed before throwing, so a truncated
// data set is never passed on to the miner and nothing is left on disk.
//
// NOTE(review): std::tmpnam only produces a name; the file itself is created
// by the later open, and the ".txt" suffix is appended after the name was
// generated. Another process could therefore claim the path in between.
// Kept as-is because it is portable; consider an atomic alternative such as
// mkstemp() on POSIX-only builds.
std::string write_temp_seq_file(const std::vector<std::vector<int>>& seqs) {
    char tmp_name[L_tmpnam];
    if (!std::tmpnam(tmp_name)) {
        throw std::runtime_error("Failed to create temporary file name");
    }
    std::string path = std::string(tmp_name) + ".txt";

    std::ofstream ofs(path);
    if (!ofs) {
        throw std::runtime_error("Failed to open temporary file for writing: " + path);
    }

    for (const auto& seq : seqs) {
        const char* separator = "";
        for (const int item : seq) {
            ofs << separator << item;
            separator = " ";
        }
        ofs << '\n';
    }

    // close() flushes buffered data; any earlier write error or a failed
    // flush (e.g. disk full) leaves the stream in a failed state.
    ofs.close();
    if (ofs.fail()) {
        std::remove(path.c_str());
        throw std::runtime_error("Failed to write temporary file: " + path);
    }
    return path;
}
|
|
87
|
+
|
|
88
|
+
} // anonymous namespace
|
|
41
89
|
|
|
42
90
|
|
|
43
91
|
PYBIND11_MODULE(_effspm, m) {
|
|
44
|
-
m.doc() = "Unified SPM library: PrefixProjection, BTMiner, HTMiner";
|
|
92
|
+
m.doc() = "Unified SPM library: PrefixProjection, BTMiner, HTMiner, Large* variants";
|
|
45
93
|
|
|
46
94
|
// ─────────────────────────────────────────────────────────────
|
|
47
|
-
// PrefixProjection
|
|
95
|
+
// PrefixProjection (works directly on Python lists or files)
|
|
48
96
|
// ─────────────────────────────────────────────────────────────
|
|
49
97
|
m.def("PrefixProjection",
|
|
50
98
|
[](py::object data,
|
|
@@ -59,7 +107,7 @@ PYBIND11_MODULE(_effspm, m) {
|
|
|
59
107
|
::pre_pro = preproc;
|
|
60
108
|
::use_dic = use_dic;
|
|
61
109
|
::use_list = false;
|
|
62
|
-
::b_disp = verbose;
|
|
110
|
+
::b_disp = verbose; // controls prints in original code
|
|
63
111
|
::b_write = !out_file.empty();
|
|
64
112
|
::out_file = out_file;
|
|
65
113
|
|
|
@@ -69,7 +117,7 @@ PYBIND11_MODULE(_effspm, m) {
|
|
|
69
117
|
if (py::isinstance<py::str>(data)) {
|
|
70
118
|
std::string path = data.cast<std::string>();
|
|
71
119
|
if (!Load_instance(path, minsup))
|
|
72
|
-
throw std::runtime_error("
|
|
120
|
+
throw std::runtime_error("PrefixProjection: failed to load file: " + path);
|
|
73
121
|
} else {
|
|
74
122
|
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
75
123
|
items = std::move(seqs);
|
|
@@ -113,7 +161,7 @@ PYBIND11_MODULE(_effspm, m) {
|
|
|
113
161
|
);
|
|
114
162
|
|
|
115
163
|
// ─────────────────────────────────────────────────────────────
|
|
116
|
-
// BTMiner
|
|
164
|
+
// BTMiner (always uses professor's Load_instance)
|
|
117
165
|
// ─────────────────────────────────────────────────────────────
|
|
118
166
|
m.def("BTMiner",
|
|
119
167
|
[](py::object data,
|
|
@@ -124,52 +172,54 @@ PYBIND11_MODULE(_effspm, m) {
|
|
|
124
172
|
bool verbose,
|
|
125
173
|
const std::string &out_file)
|
|
126
174
|
{
|
|
127
|
-
|
|
175
|
+
// Configure professor globals
|
|
176
|
+
btminer::time_limit = static_cast<int>(time_limit);
|
|
128
177
|
btminer::pre_pro = preproc;
|
|
129
178
|
btminer::use_dic = use_dic;
|
|
130
|
-
btminer::use_list = false;
|
|
131
179
|
btminer::b_disp = verbose;
|
|
132
180
|
btminer::b_write = !out_file.empty();
|
|
133
181
|
btminer::out_file = out_file;
|
|
182
|
+
btminer::N_mult = 1;
|
|
183
|
+
btminer::M_mult = 1;
|
|
184
|
+
btminer::just_build = false;
|
|
134
185
|
|
|
135
186
|
btminer::ClearCollected();
|
|
136
187
|
btminer::start_time = std::clock();
|
|
137
188
|
|
|
189
|
+
TempFile tmp;
|
|
190
|
+
std::string path;
|
|
191
|
+
|
|
138
192
|
if (py::isinstance<py::str>(data)) {
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
throw std::runtime_error("Failed to load file: " + path);
|
|
193
|
+
// File path: use directly
|
|
194
|
+
path = data.cast<std::string>();
|
|
142
195
|
} else {
|
|
196
|
+
// Python list → write to a temp file in the same format
|
|
143
197
|
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
int max_id = 0;
|
|
148
|
-
for (auto &seq : btminer::items)
|
|
149
|
-
for (int x : seq)
|
|
150
|
-
max_id = std::max(max_id, std::abs(x));
|
|
151
|
-
btminer::L = max_id;
|
|
152
|
-
|
|
153
|
-
btminer::theta = (minsup < 1.0) ? std::ceil(minsup * btminer::N) : minsup;
|
|
198
|
+
tmp.path = write_temp_seq_file(seqs);
|
|
199
|
+
path = tmp.path;
|
|
200
|
+
}
|
|
154
201
|
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
202
|
+
if (verbose) {
|
|
203
|
+
std::cerr << "[BTMiner] path=" << path
|
|
204
|
+
<< " minsup=" << minsup
|
|
205
|
+
<< " preproc=" << preproc
|
|
206
|
+
<< " use_dic=" << use_dic
|
|
207
|
+
<< std::endl;
|
|
208
|
+
}
|
|
159
209
|
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
for (auto &seq : btminer::items) {
|
|
163
|
-
btminer::M = std::max<unsigned int>(btminer::M, seq.size());
|
|
164
|
-
btminer::E += seq.size();
|
|
165
|
-
}
|
|
210
|
+
if (!btminer::Load_instance(path, minsup)) {
|
|
211
|
+
throw std::runtime_error("BTMiner: failed to load instance from: " + path);
|
|
166
212
|
}
|
|
167
213
|
|
|
168
214
|
btminer::Freq_miner();
|
|
169
215
|
|
|
170
216
|
py::dict out;
|
|
171
|
-
out["patterns"]
|
|
172
|
-
out["
|
|
217
|
+
out["patterns"] = btminer::GetCollected();
|
|
218
|
+
out["num_patterns"] = btminer::num_patt;
|
|
219
|
+
out["time"] = btminer::give_time(std::clock() - btminer::start_time);
|
|
220
|
+
out["N"] = btminer::N;
|
|
221
|
+
out["L"] = btminer::L;
|
|
222
|
+
out["theta"] = btminer::theta;
|
|
173
223
|
return out;
|
|
174
224
|
},
|
|
175
225
|
py::arg("data"),
|
|
@@ -181,83 +231,424 @@ PYBIND11_MODULE(_effspm, m) {
|
|
|
181
231
|
py::arg("out_file") = ""
|
|
182
232
|
);
|
|
183
233
|
|
|
184
|
-
// ─────────────────────────────────────────────────────────────
|
|
185
|
-
// HTMiner
|
|
234
|
+
// ─────────────────────────────────────────────────────────────
|
|
235
|
+
// HTMiner (works on files; we use a temp file for in-memory data)
|
|
236
|
+
// ─────────────────────────────────────────────────────────────
|
|
237
|
+
// ─────────────────────────────────────────────────────────────
|
|
238
|
+
// HTMiner (always uses professor's Load_instance; pre_pro forced ON)
|
|
186
239
|
// ─────────────────────────────────────────────────────────────
|
|
187
240
|
m.def("HTMiner",
|
|
241
|
+
[](py::object data,
|
|
242
|
+
double minsup,
|
|
243
|
+
unsigned int time_limit,
|
|
244
|
+
bool /*preproc*/, // Python arg is ignored internally
|
|
245
|
+
bool use_dic,
|
|
246
|
+
bool verbose,
|
|
247
|
+
const std::string &out_file)
|
|
248
|
+
{
|
|
249
|
+
using namespace htminer;
|
|
250
|
+
|
|
251
|
+
// ───────── Global parameter setup ─────────
|
|
252
|
+
htminer::time_limit = time_limit;
|
|
253
|
+
|
|
254
|
+
// IMPORTANT: always run with preprocessing ON,
|
|
255
|
+
// regardless of the Python `preproc` flag.
|
|
256
|
+
htminer::pre_pro = true;
|
|
257
|
+
htminer::use_dic = use_dic;
|
|
258
|
+
htminer::just_build = false;
|
|
259
|
+
htminer::b_disp = verbose;
|
|
260
|
+
htminer::b_write = !out_file.empty();
|
|
261
|
+
htminer::out_file = out_file;
|
|
262
|
+
|
|
263
|
+
// ───────── HARD RESET of HTMiner globals ─────────
|
|
264
|
+
htminer::ClearCollected();
|
|
265
|
+
htminer::Tree.clear();
|
|
266
|
+
htminer::VTree.clear();
|
|
267
|
+
htminer::CTree.clear();
|
|
268
|
+
htminer::DFS.clear();
|
|
269
|
+
htminer::VDFS.clear();
|
|
270
|
+
htminer::item_dic.clear();
|
|
271
|
+
|
|
272
|
+
htminer::M = 0;
|
|
273
|
+
htminer::N = 0;
|
|
274
|
+
htminer::L = 0;
|
|
275
|
+
htminer::E = 0;
|
|
276
|
+
htminer::theta = 0;
|
|
277
|
+
htminer::mlim = 0;
|
|
278
|
+
htminer::itmset_exists = false;
|
|
279
|
+
|
|
280
|
+
// NOTE: do NOT add a root arc here;
|
|
281
|
+
// htminer::Load_instance() already does Tree.emplace_back(0,0,0)
|
|
282
|
+
htminer::start_time = std::clock();
|
|
283
|
+
|
|
284
|
+
// ───────── Handle input (path or in-memory sequences) ─────────
|
|
285
|
+
TempFile tmp;
|
|
286
|
+
std::string path;
|
|
287
|
+
|
|
288
|
+
if (py::isinstance<py::str>(data)) {
|
|
289
|
+
// data is a file path
|
|
290
|
+
path = data.cast<std::string>();
|
|
291
|
+
} else {
|
|
292
|
+
// data is a list[list[int]] → write a temp file in the same text format
|
|
293
|
+
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
294
|
+
tmp.path = write_temp_seq_file(seqs);
|
|
295
|
+
path = tmp.path;
|
|
296
|
+
}
|
|
297
|
+
|
|
298
|
+
if (verbose) {
|
|
299
|
+
std::cerr << "[HTMiner] path=" << path
|
|
300
|
+
<< " minsup=" << minsup
|
|
301
|
+
<< " preproc(always)=true"
|
|
302
|
+
<< " use_dic=" << use_dic
|
|
303
|
+
<< std::endl;
|
|
304
|
+
}
|
|
305
|
+
|
|
306
|
+
// ───────── Build MDD via professor's loader ─────────
|
|
307
|
+
if (!htminer::Load_instance(path, minsup)) {
|
|
308
|
+
throw std::runtime_error("HTMiner: failed to load instance from: " + path);
|
|
309
|
+
}
|
|
310
|
+
|
|
311
|
+
// ───────── Run miner ─────────
|
|
312
|
+
htminer::Freq_miner();
|
|
313
|
+
|
|
314
|
+
// ───────── Return results ─────────
|
|
315
|
+
py::dict out;
|
|
316
|
+
out["patterns"] = htminer::GetCollected();
|
|
317
|
+
out["time"] = htminer::give_time(std::clock() - htminer::start_time);
|
|
318
|
+
return out;
|
|
319
|
+
},
|
|
320
|
+
py::arg("data"),
|
|
321
|
+
py::arg("minsup") = 0.01,
|
|
322
|
+
py::arg("time_limit") = 36000,
|
|
323
|
+
py::arg("preproc") = false, // kept for API symmetry, but IGNORED
|
|
324
|
+
py::arg("use_dic") = false,
|
|
325
|
+
py::arg("verbose") = false,
|
|
326
|
+
py::arg("out_file") = ""
|
|
327
|
+
);
|
|
328
|
+
|
|
329
|
+
// ─────────────────────────────────────────────────────────────
|
|
330
|
+
// LargePrefixProjection (already has its own Load_py)
|
|
331
|
+
// ─────────────────────────────────────────────────────────────
|
|
332
|
+
m.def("LargePrefixProjection",
|
|
188
333
|
[](py::object data,
|
|
189
|
-
double minsup,
|
|
190
|
-
|
|
191
|
-
bool
|
|
334
|
+
double minsup,
|
|
335
|
+
unsigned int time_limit,
|
|
336
|
+
bool preproc,
|
|
337
|
+
bool use_dic,
|
|
338
|
+
bool verbose,
|
|
339
|
+
const std::string &out_file)
|
|
192
340
|
{
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
341
|
+
largepp::time_limit = time_limit;
|
|
342
|
+
largepp::pre_pro = preproc;
|
|
343
|
+
largepp::use_dic = use_dic;
|
|
344
|
+
largepp::use_list = true; // large prefix uses list-based mining
|
|
345
|
+
largepp::b_disp = verbose;
|
|
346
|
+
largepp::b_write = !out_file.empty();
|
|
347
|
+
largepp::out_file = out_file;
|
|
348
|
+
largepp::just_build = false;
|
|
349
|
+
|
|
350
|
+
largepp::ClearCollected();
|
|
351
|
+
largepp::start_time = std::clock();
|
|
352
|
+
|
|
353
|
+
if (py::isinstance<py::str>(data)) {
|
|
354
|
+
std::string fname = data.cast<std::string>();
|
|
355
|
+
largepp::Load_instance(fname, minsup);
|
|
356
|
+
} else {
|
|
357
|
+
largepp::Load_py(data, minsup);
|
|
358
|
+
}
|
|
359
|
+
|
|
360
|
+
largepp::Freq_miner();
|
|
361
|
+
|
|
362
|
+
py::dict out;
|
|
363
|
+
out["patterns"] = largepp::GetCollected();
|
|
364
|
+
out["time"] = largepp::give_time(std::clock() - largepp::start_time);
|
|
365
|
+
return out;
|
|
366
|
+
},
|
|
367
|
+
py::arg("data"),
|
|
368
|
+
py::arg("minsup") = 0.01,
|
|
369
|
+
py::arg("time_limit") = 36000,
|
|
370
|
+
py::arg("preproc") = false,
|
|
371
|
+
py::arg("use_dic") = false,
|
|
372
|
+
py::arg("verbose") = false,
|
|
373
|
+
py::arg("out_file") = ""
|
|
374
|
+
);
|
|
375
|
+
|
|
376
|
+
// ─────────────────────────────────────────────────────────────
|
|
377
|
+
// LargeBTMiner (always uses professor's largebm::Load_instance)
|
|
378
|
+
// ─────────────────────────────────────────────────────────────
|
|
379
|
+
m.def("LargeBTMiner",
|
|
380
|
+
[](py::object data,
|
|
381
|
+
double minsup,
|
|
382
|
+
unsigned int time_limit,
|
|
383
|
+
bool preproc,
|
|
384
|
+
bool use_dic,
|
|
385
|
+
bool verbose,
|
|
386
|
+
const std::string &out_file)
|
|
387
|
+
{
|
|
388
|
+
using namespace largebm;
|
|
389
|
+
|
|
390
|
+
largebm::time_limit = time_limit;
|
|
391
|
+
largebm::pre_pro = preproc;
|
|
392
|
+
largebm::use_dic = use_dic;
|
|
393
|
+
largebm::use_list = false; // MDD-based
|
|
394
|
+
largebm::b_disp = verbose;
|
|
395
|
+
largebm::b_write = !out_file.empty();
|
|
396
|
+
largebm::out_file = out_file;
|
|
397
|
+
largebm::just_build = false;
|
|
398
|
+
|
|
399
|
+
largebm::ClearCollected();
|
|
400
|
+
largebm::items.clear();
|
|
401
|
+
largebm::item_dic.clear();
|
|
402
|
+
largebm::inv_item_dic.clear();
|
|
403
|
+
largebm::Tree.clear();
|
|
404
|
+
largebm::DFS.clear();
|
|
405
|
+
|
|
406
|
+
largebm::start_time = std::clock();
|
|
407
|
+
|
|
408
|
+
TempFile tmp;
|
|
409
|
+
std::string path;
|
|
410
|
+
|
|
411
|
+
if (py::isinstance<py::str>(data)) {
|
|
412
|
+
path = data.cast<std::string>();
|
|
413
|
+
} else {
|
|
414
|
+
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
415
|
+
tmp.path = write_temp_seq_file(seqs);
|
|
416
|
+
path = tmp.path;
|
|
417
|
+
}
|
|
418
|
+
|
|
419
|
+
if (verbose) {
|
|
420
|
+
std::cerr << "[LargeBTMiner] path=" << path
|
|
421
|
+
<< " minsup=" << minsup
|
|
422
|
+
<< " preproc=" << preproc
|
|
423
|
+
<< " use_dic=" << use_dic
|
|
424
|
+
<< std::endl;
|
|
425
|
+
}
|
|
426
|
+
|
|
427
|
+
if (!largebm::Load_instance(path, minsup)) {
|
|
428
|
+
throw std::runtime_error("LargeBTMiner: failed to load instance from: " + path);
|
|
429
|
+
}
|
|
430
|
+
|
|
431
|
+
largebm::Freq_miner();
|
|
432
|
+
|
|
433
|
+
py::dict out;
|
|
434
|
+
out["patterns"] = largebm::GetCollected();
|
|
435
|
+
out["time"] = largebm::give_time(std::clock() - largebm::start_time);
|
|
436
|
+
return out;
|
|
437
|
+
},
|
|
438
|
+
py::arg("data"),
|
|
439
|
+
py::arg("minsup") = 0.01,
|
|
440
|
+
py::arg("time_limit") = 36000,
|
|
441
|
+
py::arg("preproc") = false,
|
|
442
|
+
py::arg("use_dic") = false,
|
|
443
|
+
py::arg("verbose") = false,
|
|
444
|
+
py::arg("out_file") = ""
|
|
445
|
+
);
|
|
446
|
+
|
|
447
|
+
// ─────────────────────────────────────────────────────────────
|
|
448
|
+
// LargeHTMiner (always uses professor's largehm::Load_instance; pre_pro forced ON)
|
|
449
|
+
// ─────────────────────────────────────────────────────────────
|
|
450
|
+
// ─────────────────────────────────────────────────────────────
|
|
451
|
+
// LargeHTMiner (professor's Large HTMiner, namespaced as largehm)
|
|
452
|
+
// ─────────────────────────────────────────────────────────────
|
|
453
|
+
m.def("LargeHTMiner",
|
|
454
|
+
[](py::object data,
|
|
455
|
+
double minsup,
|
|
456
|
+
unsigned int time_limit,
|
|
457
|
+
bool /*preproc*/, // kept for API symmetry; ignored
|
|
458
|
+
bool use_dic,
|
|
459
|
+
bool verbose,
|
|
460
|
+
const std::string &out_file)
|
|
461
|
+
{
|
|
462
|
+
using namespace largehm;
|
|
463
|
+
|
|
464
|
+
// 1) Global configuration (mirror professor's style)
|
|
465
|
+
largehm::time_limit = time_limit;
|
|
466
|
+
largehm::pre_pro = true; // always preprocess
|
|
467
|
+
largehm::use_dic = use_dic;
|
|
468
|
+
largehm::just_build = false;
|
|
469
|
+
largehm::b_disp = verbose;
|
|
470
|
+
largehm::b_write = !out_file.empty();
|
|
471
|
+
largehm::out_file = out_file;
|
|
472
|
+
|
|
473
|
+
// 2) HARD RESET of all global state for a fresh run
|
|
474
|
+
largehm::ClearCollected(); // our helper in largehm::utility.cpp
|
|
475
|
+
|
|
476
|
+
largehm::M = 0;
|
|
477
|
+
largehm::L = 0;
|
|
478
|
+
largehm::mlim = 0;
|
|
479
|
+
largehm::N = 0;
|
|
480
|
+
largehm::theta = 0;
|
|
481
|
+
largehm::E = 0;
|
|
482
|
+
largehm::itmset_exists = false;
|
|
483
|
+
|
|
484
|
+
// containers
|
|
485
|
+
// (item_dic reset is optional and not strictly needed here)
|
|
486
|
+
largehm::DFS.clear();
|
|
487
|
+
largehm::VDFS.clear();
|
|
488
|
+
largehm::Tree.clear();
|
|
489
|
+
largehm::VTree.clear();
|
|
490
|
+
largehm::CTree.clear();
|
|
491
|
+
|
|
492
|
+
largehm::start_time = std::clock();
|
|
493
|
+
|
|
494
|
+
// 3) Handle input (file path or Python list)
|
|
495
|
+
TempFile tmp;
|
|
496
|
+
std::string path;
|
|
497
|
+
|
|
498
|
+
if (py::isinstance<py::str>(data)) {
|
|
499
|
+
path = data.cast<std::string>();
|
|
500
|
+
} else {
|
|
501
|
+
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
502
|
+
tmp.path = write_temp_seq_file(seqs);
|
|
503
|
+
path = tmp.path;
|
|
504
|
+
}
|
|
505
|
+
|
|
506
|
+
if (verbose) {
|
|
507
|
+
std::cerr << "[LargeHTMiner] path=" << path
|
|
508
|
+
<< " minsup=" << minsup
|
|
509
|
+
<< " preproc(always)=true"
|
|
510
|
+
<< " use_dic=" << use_dic
|
|
511
|
+
<< std::endl;
|
|
512
|
+
}
|
|
513
|
+
|
|
514
|
+
// 4) Build MDD / load instance.
|
|
515
|
+
// NOTE: Load_instance() itself does Tree.emplace_back(0,0,0),
|
|
516
|
+
// so we DO NOT create a root node here.
|
|
517
|
+
if (!largehm::Load_instance(path, minsup)) {
|
|
518
|
+
throw std::runtime_error("LargeHTMiner: failed to load instance from: " + path);
|
|
519
|
+
}
|
|
520
|
+
|
|
521
|
+
// 5) Run miner (same timing logic as original main)
|
|
522
|
+
if (!largehm::just_build &&
|
|
523
|
+
largehm::give_time(std::clock() - largehm::start_time) < largehm::time_limit)
|
|
524
|
+
{
|
|
525
|
+
largehm::Freq_miner();
|
|
526
|
+
if (largehm::give_time(std::clock() - largehm::start_time) >= largehm::time_limit) {
|
|
527
|
+
std::cout << "TIME LIMIT REACHED\n";
|
|
528
|
+
}
|
|
529
|
+
}
|
|
530
|
+
|
|
531
|
+
// 6) Return collected patterns + runtime
|
|
532
|
+
py::dict out;
|
|
533
|
+
out["patterns"] = largehm::GetCollected();
|
|
534
|
+
out["time"] = largehm::give_time(std::clock() - largehm::start_time);
|
|
535
|
+
return out;
|
|
536
|
+
},
|
|
537
|
+
py::arg("data"),
|
|
538
|
+
py::arg("minsup") = 0.01,
|
|
539
|
+
py::arg("time_limit") = 36000,
|
|
540
|
+
py::arg("preproc") = false, // kept for API symmetry
|
|
541
|
+
py::arg("use_dic") = false,
|
|
542
|
+
py::arg("verbose") = false,
|
|
543
|
+
py::arg("out_file") = ""
|
|
544
|
+
);
|
|
545
|
+
|
|
546
|
+
|
|
547
|
+
}
|
|
548
|
+
|
|
549
|
+
|
|
550
|
+
/*#include <pybind11/pybind11.h>
|
|
551
|
+
#include <pybind11/stl.h>
|
|
552
|
+
namespace py = pybind11;
|
|
553
|
+
#include <iostream>
|
|
554
|
+
|
|
555
|
+
|
|
556
|
+
// PrefixProjection headers
|
|
557
|
+
#include "freq_miner.hpp"
|
|
558
|
+
#include "load_inst.hpp"
|
|
559
|
+
#include "utility.hpp"
|
|
560
|
+
|
|
561
|
+
// BTMiner (wrapped in its own namespace in source files)
|
|
562
|
+
#include "btminer/src/freq_miner.hpp"
|
|
563
|
+
#include "btminer/src/load_inst.hpp"
|
|
564
|
+
#include "btminer/src/utility.hpp"
|
|
565
|
+
#include "btminer/src/build_mdd.hpp"
|
|
566
|
+
|
|
567
|
+
// HTMiner (wrapped in its own namespace in source files)
|
|
568
|
+
#include "htminer/src/build_mdd.hpp" // ← ensure HTMiner MDD builder is available
|
|
569
|
+
#include "htminer/src/freq_miner.hpp"
|
|
570
|
+
#include "htminer/src/load_inst.hpp"
|
|
571
|
+
#include "htminer/src/utility.hpp"
|
|
572
|
+
|
|
573
|
+
|
|
574
|
+
#include "largepp/src/freq_miner.hpp"
|
|
575
|
+
#include "largepp/src/load_inst.hpp"
|
|
576
|
+
#include "largepp/src/utility.hpp"
|
|
577
|
+
|
|
578
|
+
|
|
579
|
+
#include "largebm/src/freq_miner.hpp"
|
|
580
|
+
#include "largebm/src/load_inst.hpp"
|
|
581
|
+
#include "largebm/src/utility.hpp"
|
|
582
|
+
#include "largebm/src/build_mdd.hpp"
|
|
583
|
+
|
|
584
|
+
#include "largehm/src/freq_miner.hpp"
|
|
585
|
+
#include "largehm/src/load_inst.hpp"
|
|
586
|
+
#include "largehm/src/utility.hpp"
|
|
587
|
+
#include "largehm/src/build_mdd.hpp"
|
|
588
|
+
|
|
589
|
+
|
|
590
|
+
|
|
591
|
+
PYBIND11_MODULE(_effspm, m) {
|
|
592
|
+
m.doc() = "Unified SPM library: PrefixProjection, BTMiner, HTMiner";
|
|
593
|
+
|
|
594
|
+
// ─────────────────────────────────────────────────────────────
|
|
595
|
+
// PrefixProjection
|
|
596
|
+
// ─────────────────────────────────────────────────────────────
|
|
597
|
+
m.def("PrefixProjection",
|
|
598
|
+
[](py::object data,
|
|
599
|
+
double minsup,
|
|
600
|
+
unsigned int time_limit,
|
|
601
|
+
bool preproc,
|
|
602
|
+
bool use_dic,
|
|
603
|
+
bool verbose,
|
|
604
|
+
const std::string &out_file)
|
|
605
|
+
{
|
|
606
|
+
::time_limit = time_limit;
|
|
607
|
+
::pre_pro = preproc;
|
|
608
|
+
::use_dic = use_dic;
|
|
609
|
+
::use_list = false;
|
|
610
|
+
::b_disp = verbose;
|
|
611
|
+
::b_write = !out_file.empty();
|
|
612
|
+
::out_file = out_file;
|
|
613
|
+
|
|
614
|
+
ClearCollected();
|
|
615
|
+
start_time = std::clock();
|
|
616
|
+
|
|
206
617
|
if (py::isinstance<py::str>(data)) {
|
|
207
618
|
std::string path = data.cast<std::string>();
|
|
208
|
-
if (!
|
|
619
|
+
if (!Load_instance(path, minsup))
|
|
209
620
|
throw std::runtime_error("Failed to load file: " + path);
|
|
210
621
|
} else {
|
|
211
622
|
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
212
|
-
|
|
213
|
-
|
|
623
|
+
items = std::move(seqs);
|
|
624
|
+
N = items.size();
|
|
214
625
|
|
|
215
|
-
// compute L (max item ID), M (max sequence length), E (total entries)
|
|
216
626
|
int max_id = 0;
|
|
217
|
-
|
|
218
|
-
htminer::E = 0;
|
|
219
|
-
for (auto &seq : htminer::items) {
|
|
220
|
-
htminer::M = std::max<unsigned int>(htminer::M, seq.size());
|
|
627
|
+
for (auto &seq : items)
|
|
221
628
|
for (int x : seq)
|
|
222
629
|
max_id = std::max(max_id, std::abs(x));
|
|
223
|
-
|
|
630
|
+
L = max_id;
|
|
631
|
+
|
|
632
|
+
theta = (minsup < 1.0) ? std::ceil(minsup * N) : minsup;
|
|
633
|
+
|
|
634
|
+
DFS.clear();
|
|
635
|
+
DFS.reserve(L);
|
|
636
|
+
for (unsigned int i = 0; i < L; ++i)
|
|
637
|
+
DFS.emplace_back(-static_cast<int>(i) - 1);
|
|
638
|
+
|
|
639
|
+
M = 0;
|
|
640
|
+
E = 0;
|
|
641
|
+
for (auto &seq : items) {
|
|
642
|
+
M = std::max<unsigned int>(M, seq.size());
|
|
643
|
+
E += seq.size();
|
|
224
644
|
}
|
|
225
|
-
htminer::L = max_id;
|
|
226
|
-
htminer::theta = (minsup < 1.0)
|
|
227
|
-
? static_cast<unsigned long long>(std::ceil(minsup * htminer::N))
|
|
228
|
-
: static_cast<unsigned long long>(minsup);
|
|
229
|
-
|
|
230
|
-
// build empty DFS stack (size L) as HTMiner expects
|
|
231
|
-
htminer::DFS.clear();
|
|
232
|
-
htminer::DFS.reserve(htminer::L);
|
|
233
|
-
for (unsigned int i = 0; i < static_cast<unsigned int>(htminer::L); ++i)
|
|
234
|
-
htminer::DFS.emplace_back(-static_cast<int>(i) - 1);
|
|
235
|
-
|
|
236
|
-
// initialize VDFS if HTMiner needs it
|
|
237
|
-
htminer::VDFS.clear();
|
|
238
|
-
htminer::VDFS.resize(htminer::L);
|
|
239
645
|
}
|
|
240
646
|
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
// std::cout << "[HTMiner] dumping all collected patterns:\n";
|
|
245
|
-
// for (size_t i = 0; i < htminer::collectedPatterns.size(); ++i) {
|
|
246
|
-
// const auto &seq = htminer::collectedPatterns[i];
|
|
247
|
-
// std::cout << "Pattern " << i << ": { ";
|
|
248
|
-
// for (int x : seq) {
|
|
249
|
-
// std::cout << x << " ";
|
|
250
|
-
// }
|
|
251
|
-
// std::cout << "}\n";
|
|
252
|
-
//}
|
|
253
|
-
std::cout << " total patterns = "
|
|
254
|
-
<< htminer::collectedPatterns.size() << "\n";
|
|
255
|
-
// ─────────────────────────────────────────────────
|
|
256
|
-
|
|
257
|
-
// 4) return patterns + elapsed time
|
|
647
|
+
Freq_miner();
|
|
648
|
+
|
|
258
649
|
py::dict out;
|
|
259
|
-
out["patterns"] =
|
|
260
|
-
out["time"] =
|
|
650
|
+
out["patterns"] = GetCollected();
|
|
651
|
+
out["time"] = give_time(std::clock() - start_time);
|
|
261
652
|
return out;
|
|
262
653
|
},
|
|
263
654
|
py::arg("data"),
|
|
@@ -268,8 +659,223 @@ std::cout << " total patterns = "
|
|
|
268
659
|
py::arg("verbose") = false,
|
|
269
660
|
py::arg("out_file") = ""
|
|
270
661
|
);
|
|
662
|
+
m.def("BTMiner",
|
|
663
|
+
[](py::object data,
|
|
664
|
+
double minsup,
|
|
665
|
+
unsigned int time_limit,
|
|
666
|
+
bool preproc,
|
|
667
|
+
bool use_dic,
|
|
668
|
+
bool verbose,
|
|
669
|
+
const std::string &out_file)
|
|
670
|
+
{
|
|
671
|
+
// We are calling the *professor* BTMiner, now namespaced as btminer::.
|
|
672
|
+
// So we only set the globals the professor code actually has.
|
|
673
|
+
|
|
674
|
+
// 1) configure professor globals
|
|
675
|
+
btminer::time_limit = static_cast<int>(time_limit);
|
|
676
|
+
btminer::pre_pro = preproc;
|
|
677
|
+
btminer::use_dic = use_dic;
|
|
678
|
+
btminer::b_disp = verbose;
|
|
679
|
+
btminer::b_write = !out_file.empty();
|
|
680
|
+
btminer::out_file = out_file;
|
|
681
|
+
btminer::N_mult = 1; // professor uses these too
|
|
682
|
+
btminer::M_mult = 1;
|
|
683
|
+
btminer::just_build = false; // we want full mining
|
|
684
|
+
|
|
685
|
+
btminer::start_time = std::clock();
|
|
686
|
+
|
|
687
|
+
// 2) load data
|
|
688
|
+
//
|
|
689
|
+
// Professor’s code is primarily file-based (Load_instance(const string&, double)).
|
|
690
|
+
// So: if user passes a file path → use the professor loader directly.
|
|
691
|
+
// If user passes a Python list-of-lists → we will build the MDD the same
|
|
692
|
+
// way professor’s loader does, but without changing his logic.
|
|
693
|
+
if (py::isinstance<py::str>(data)) {
|
|
694
|
+
// ----- FILE MODE -----
|
|
695
|
+
std::string path = data.cast<std::string>();
|
|
271
696
|
|
|
272
|
-
|
|
697
|
+
if (verbose) {
|
|
698
|
+
std::cerr << "[BT][binding] file=" << path
|
|
699
|
+
<< " minsup=" << minsup
|
|
700
|
+
<< " preproc=" << preproc << std::endl;
|
|
701
|
+
}
|
|
702
|
+
|
|
703
|
+
if (!btminer::Load_instance(path, minsup)) {
|
|
704
|
+
throw std::runtime_error("BTMiner: failed to load file: " + path);
|
|
705
|
+
}
|
|
706
|
+
} else {
|
|
707
|
+
// ----- PYTHON LIST MODE -----
|
|
708
|
+
//
|
|
709
|
+
// We mimic professor’s loader:
|
|
710
|
+
// - create root in Tree
|
|
711
|
+
// - compute N, M, L
|
|
712
|
+
// - compute theta from minsup
|
|
713
|
+
// - seed DFS (one Pattern per item, as in Preprocess branch)
|
|
714
|
+
// - call Build_MDD(...) for each sequence
|
|
715
|
+
//
|
|
716
|
+
// This DOES NOT change his mining logic; it just drives it from memory.
|
|
717
|
+
|
|
718
|
+
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
719
|
+
|
|
720
|
+
// clear MDD and globals to a known state
|
|
721
|
+
btminer::Tree.clear();
|
|
722
|
+
btminer::Tree.emplace_back(0, 0, 0); // root (exactly like professor)
|
|
723
|
+
|
|
724
|
+
// compute basic stats
|
|
725
|
+
int max_id = 0;
|
|
726
|
+
int max_len = 0;
|
|
727
|
+
int seq_count = 0;
|
|
728
|
+
long long entries = 0;
|
|
729
|
+
|
|
730
|
+
for (const auto &s : seqs) {
|
|
731
|
+
if (s.empty()) continue;
|
|
732
|
+
++seq_count;
|
|
733
|
+
max_len = std::max<int>(max_len, static_cast<int>(s.size()));
|
|
734
|
+
for (int x : s) {
|
|
735
|
+
max_id = std::max(max_id, std::abs(x));
|
|
736
|
+
++entries;
|
|
737
|
+
}
|
|
738
|
+
}
|
|
739
|
+
|
|
740
|
+
btminer::N = seq_count;
|
|
741
|
+
btminer::M = max_len;
|
|
742
|
+
btminer::L = max_id;
|
|
743
|
+
btminer::E = static_cast<int>(entries);
|
|
744
|
+
|
|
745
|
+
// theta = abs support
|
|
746
|
+
if (minsup < 1.0)
|
|
747
|
+
btminer::theta = static_cast<int>(std::ceil(minsup * btminer::N * btminer::N_mult));
|
|
748
|
+
else
|
|
749
|
+
btminer::theta = static_cast<int>(minsup);
|
|
750
|
+
|
|
751
|
+
// seed DFS exactly like professor does in the preprocessed branch:
|
|
752
|
+
btminer::DFS.clear();
|
|
753
|
+
btminer::DFS.reserve(btminer::L);
|
|
754
|
+
for (int i = 0; i < btminer::L; ++i)
|
|
755
|
+
btminer::DFS.emplace_back(-i - 1);
|
|
756
|
+
|
|
757
|
+
// now build the MDD, sequence by sequence
|
|
758
|
+
for (const auto &s : seqs) {
|
|
759
|
+
if (s.empty()) continue;
|
|
760
|
+
// professor’s Build_MDD takes a vector<int> by non-const ref
|
|
761
|
+
std::vector<int> tmp = s;
|
|
762
|
+
btminer::Build_MDD(tmp);
|
|
763
|
+
}
|
|
764
|
+
|
|
765
|
+
if (verbose) {
|
|
766
|
+
std::cerr << "[BT][binding] PY mode: N=" << btminer::N
|
|
767
|
+
<< " L=" << btminer::L
|
|
768
|
+
<< " M=" << btminer::M
|
|
769
|
+
<< " E=" << btminer::E
|
|
770
|
+
<< " theta=" << btminer::theta
|
|
771
|
+
<< " Tree.size()=" << btminer::Tree.size()
|
|
772
|
+
<< std::endl;
|
|
773
|
+
}
|
|
774
|
+
}
|
|
775
|
+
|
|
776
|
+
// 3) run professor’s miner
|
|
777
|
+
btminer::Freq_miner();
|
|
778
|
+
|
|
779
|
+
// 4) build python result
|
|
780
|
+
// 4) build python result
|
|
781
|
+
py::dict out;
|
|
782
|
+
out["patterns"] = btminer::GetCollected(); // ← NEW
|
|
783
|
+
out["num_patterns"] = btminer::num_patt;
|
|
784
|
+
out["time"] = btminer::give_time(std::clock() - btminer::start_time);
|
|
785
|
+
out["N"] = btminer::N;
|
|
786
|
+
out["L"] = btminer::L;
|
|
787
|
+
out["theta"] = btminer::theta;
|
|
788
|
+
return out;
|
|
789
|
+
|
|
790
|
+
},
|
|
791
|
+
py::arg("data"),
|
|
792
|
+
py::arg("minsup") = 0.01,
|
|
793
|
+
py::arg("time_limit") = 36000,
|
|
794
|
+
py::arg("preproc") = false,
|
|
795
|
+
py::arg("use_dic") = false,
|
|
796
|
+
py::arg("verbose") = false,
|
|
797
|
+
py::arg("out_file") = ""
|
|
798
|
+
);
|
|
799
|
+
|
|
800
|
+
|
|
801
|
+
|
|
802
|
+
|
|
803
|
+
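// ─────────────────────────────────────────────────────────────
|
|
804
|
+
// HTMiner -- Python wrapper for the htminer implementation
|
|
805
|
+
// ─────────────────────────────────────────────────────────────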
|
|
806
|
+
m.def("HTMiner",
|
|
807
|
+
[](py::object data,
|
|
808
|
+
double minsup, unsigned int time_limit,
|
|
809
|
+
bool preproc, bool use_dic,
|
|
810
|
+
bool verbose, const std::string &out_file)
|
|
811
|
+
{
|
|
812
|
+
htminer::time_limit = time_limit;
|
|
813
|
+
htminer::pre_pro = preproc;
|
|
814
|
+
htminer::use_dic = use_dic;
|
|
815
|
+
htminer::just_build = false;
|
|
816
|
+
htminer::use_list = false;
|
|
817
|
+
htminer::b_disp = verbose;
|
|
818
|
+
htminer::b_write = !out_file.empty();
|
|
819
|
+
htminer::out_file = out_file;
|
|
820
|
+
htminer::ClearCollected();
|
|
821
|
+
htminer::start_time = std::clock();
|
|
822
|
+
|
|
823
|
+
if (py::isinstance<py::str>(data)) {
|
|
824
|
+
std::string path = data.cast<std::string>();
|
|
825
|
+
if (!htminer::Load_instance(path, minsup))
|
|
826
|
+
throw std::runtime_error("Failed to load file: " + path);
|
|
827
|
+
} else {
|
|
828
|
+
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
829
|
+
htminer::items = std::move(seqs);
|
|
830
|
+
htminer::N = htminer::items.size();
|
|
831
|
+
|
|
832
|
+
int max_id = 0;
|
|
833
|
+
htminer::M = 0;
|
|
834
|
+
htminer::E = 0;
|
|
835
|
+
for (auto &seq : htminer::items) {
|
|
836
|
+
htminer::M = std::max<unsigned int>(htminer::M, seq.size());
|
|
837
|
+
for (int x : seq)
|
|
838
|
+
max_id = std::max(max_id, std::abs(x));
|
|
839
|
+
htminer::E += seq.size();
|
|
840
|
+
}
|
|
841
|
+
htminer::L = max_id;
|
|
842
|
+
htminer::theta = (minsup < 1.0)
|
|
843
|
+
? static_cast<unsigned long long>(std::ceil(minsup * htminer::N))
|
|
844
|
+
: static_cast<unsigned long long>(minsup);
|
|
845
|
+
|
|
846
|
+
htminer::DFS.clear();
|
|
847
|
+
htminer::DFS.reserve(htminer::L);
|
|
848
|
+
for (unsigned int i = 0; i < static_cast<unsigned int>(htminer::L); ++i)
|
|
849
|
+
htminer::DFS.emplace_back(-static_cast<int>(i) - 1);
|
|
850
|
+
|
|
851
|
+
htminer::VDFS.clear();
|
|
852
|
+
htminer::VDFS.resize(htminer::L);
|
|
853
|
+
}
|
|
854
|
+
|
|
855
|
+
htminer::Freq_miner();
|
|
856
|
+
|
|
857
|
+
// 👇 now really respects verbose
|
|
858
|
+
if (verbose) {
|
|
859
|
+
std::cout << " total patterns = "
|
|
860
|
+
<< htminer::collectedPatterns.size() << "\n";
|
|
861
|
+
}
|
|
862
|
+
|
|
863
|
+
py::dict out;
|
|
864
|
+
out["patterns"] = htminer::GetCollected();
|
|
865
|
+
out["time"] = htminer::give_time(std::clock() - htminer::start_time);
|
|
866
|
+
return out;
|
|
867
|
+
},
|
|
868
|
+
py::arg("data"),
|
|
869
|
+
py::arg("minsup") = 0.01,
|
|
870
|
+
py::arg("time_limit") = 36000,
|
|
871
|
+
py::arg("preproc") = false,
|
|
872
|
+
py::arg("use_dic") = false,
|
|
873
|
+
py::arg("verbose") = false,
|
|
874
|
+
py::arg("out_file") = ""
|
|
875
|
+
);
|
|
876
|
+
|
|
877
|
+
|
|
878
|
+
m.def("LargePrefixProjection",
|
|
273
879
|
[](py::object data,
|
|
274
880
|
double minsup,
|
|
275
881
|
unsigned int time_limit,
|
|
@@ -281,28 +887,30 @@ std::cout << " total patterns = "
|
|
|
281
887
|
largepp::time_limit = time_limit;
|
|
282
888
|
largepp::pre_pro = preproc;
|
|
283
889
|
largepp::use_dic = use_dic;
|
|
284
|
-
largepp::use_list = true;
|
|
890
|
+
largepp::use_list = true;
|
|
285
891
|
largepp::b_disp = verbose;
|
|
286
892
|
largepp::b_write = !out_file.empty();
|
|
287
893
|
largepp::out_file = out_file;
|
|
288
|
-
largepp::just_build = false;
|
|
894
|
+
largepp::just_build = false;
|
|
289
895
|
|
|
290
896
|
largepp::ClearCollected();
|
|
291
897
|
largepp::start_time = std::clock();
|
|
292
|
-
std::string fname = data.cast<std::string>();
|
|
293
|
-
/* 1) load instance (py list or filename) */
|
|
294
|
-
if (py::isinstance<py::str>(data))
|
|
295
|
-
|
|
296
|
-
largepp::Load_instance(fname, minsup);
|
|
297
|
-
else
|
|
298
|
-
largepp::Load_py(data, minsup); // helper you’ll expose
|
|
299
|
-
|
|
300
|
-
std::vector<unsigned long long> dbg;
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
898
|
|
|
899
|
+
// 👇 this was the last noisy one
|
|
900
|
+
if (verbose) {
|
|
901
|
+
std::cerr << " minsup=" << minsup
|
|
902
|
+
<< " preproc=" << preproc
|
|
903
|
+
<< " verbose=" << verbose
|
|
904
|
+
<< " out_file=" << (out_file.empty() ? "(none)" : out_file)
|
|
905
|
+
<< " use_dic=" << use_dic << "\n";
|
|
906
|
+
}
|
|
305
907
|
|
|
908
|
+
if (py::isinstance<py::str>(data)) {
|
|
909
|
+
std::string fname = data.cast<std::string>();
|
|
910
|
+
largepp::Load_instance(fname, minsup);
|
|
911
|
+
} else {
|
|
912
|
+
largepp::Load_py(data, minsup);
|
|
913
|
+
}
|
|
306
914
|
|
|
307
915
|
largepp::Freq_miner();
|
|
308
916
|
|
|
@@ -320,6 +928,8 @@ std::cout << " total patterns = "
|
|
|
320
928
|
py::arg("out_file") = ""
|
|
321
929
|
);
|
|
322
930
|
|
|
931
|
+
|
|
932
|
+
|
|
323
933
|
// ─────────────────────────────────────────────────────────────
|
|
324
934
|
// LargeBTMiner -- Python wrapper for the largebm implementation
|
|
325
935
|
// ─────────────────────────────────────────────────────────────
|
|
@@ -404,100 +1014,108 @@ std::cout << " total patterns = "
|
|
|
404
1014
|
|
|
405
1015
|
|
|
406
1016
|
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
1017
|
+
// ─────────────────────────────────────────────────────────────────────────
|
|
1018
|
+
// LargeBTMiner (MDD-based)
|
|
1019
|
+
// ─────────────────────────────────────────────────────────────────────────
|
|
1020
|
+
/*m.def("LargeBTMiner",
|
|
1021
|
+
[](py::object data,
|
|
1022
|
+
double minsup,
|
|
1023
|
+
unsigned int time_limit,
|
|
1024
|
+
bool preproc,
|
|
1025
|
+
bool use_dic,
|
|
1026
|
+
bool verbose,
|
|
1027
|
+
const std::string &out_file)
|
|
1028
|
+
{
|
|
1029
|
+
using namespace largebm;
|
|
1030
|
+
|
|
1031
|
+
// 0) Set global flags and timers
|
|
1032
|
+
largebm::time_limit = time_limit;
|
|
1033
|
+
largebm::pre_pro = preproc;
|
|
1034
|
+
largebm::use_dic = use_dic;
|
|
1035
|
+
largebm::use_list = false; // large-mode → always MDD
|
|
1036
|
+
largebm::b_disp = verbose;
|
|
1037
|
+
largebm::b_write = !out_file.empty();
|
|
1038
|
+
largebm::out_file = out_file;
|
|
1039
|
+
largebm::just_build = false;
|
|
1040
|
+
|
|
1041
|
+
// 0.1) Clear any leftover data/state from previous runs
|
|
1042
|
+
largebm::items.clear();
|
|
1043
|
+
largebm::item_dic.clear();
|
|
1044
|
+
largebm::inv_item_dic.clear();
|
|
1045
|
+
largebm::Tree.clear();
|
|
1046
|
+
largebm::DFS.clear();
|
|
1047
|
+
largebm::ClearCollected();
|
|
1048
|
+
|
|
1049
|
+
// 1) Load sequences (either from filename or from Python list)
|
|
1050
|
+
if (py::isinstance<py::str>(data)) {
|
|
1051
|
+
// ─────────── FILE-BASED MODE ───────────
|
|
1052
|
+
std::string path = data.cast<std::string>();
|
|
1053
|
+
if (!largebm::Load_instance(path, minsup))
|
|
1054
|
+
throw std::runtime_error("Failed to load file: " + path);
|
|
425
1055
|
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
largebm::
|
|
430
|
-
largebm::
|
|
431
|
-
largebm::DFS.clear();
|
|
432
|
-
largebm::ClearCollected();
|
|
1056
|
+
} else {
|
|
1057
|
+
// ────────── IN-MEMORY MODE ──────────
|
|
1058
|
+
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
1059
|
+
largebm::items = std::move(seqs);
|
|
1060
|
+
largebm::N = largebm::items.size();
|
|
433
1061
|
|
|
434
|
-
// 1)
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
1062
|
+
// 1.1) Compute basic DB statistics (M, E, L) and absolute support θ
|
|
1063
|
+
int max_id = 0;
|
|
1064
|
+
largebm::M = 0;
|
|
1065
|
+
largebm::E = 0;
|
|
1066
|
+
for (auto &seq : largebm::items) {
|
|
1067
|
+
largebm::M = std::max<unsigned int>(largebm::M, static_cast<unsigned int>(seq.size()));
|
|
1068
|
+
largebm::E += static_cast<unsigned long long>(seq.size());
|
|
1069
|
+
for (int x : seq) max_id = std::max(max_id, std::abs(x));
|
|
1070
|
+
}
|
|
1071
|
+
largebm::L = static_cast<unsigned int>(max_id);
|
|
1072
|
+
largebm::theta = (minsup < 1.0)
|
|
1073
|
+
? static_cast<unsigned long long>(std::ceil(minsup * largebm::N))
|
|
1074
|
+
: static_cast<unsigned long long>(minsup);
|
|
440
1075
|
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
largebm::
|
|
445
|
-
largebm::N = largebm::items.size();
|
|
1076
|
+
// 1.2) Initialize DFS buffer (size = L)
|
|
1077
|
+
largebm::DFS.reserve(largebm::L);
|
|
1078
|
+
for (unsigned int i = 0; i < largebm::L; ++i)
|
|
1079
|
+
largebm::DFS.emplace_back(-static_cast<int>(i) - 1);
|
|
446
1080
|
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
largebm::E += static_cast<unsigned long long>(seq.size());
|
|
454
|
-
for (int x : seq) max_id = std::max(max_id, std::abs(x));
|
|
455
|
-
}
|
|
456
|
-
largebm::L = static_cast<unsigned int>(max_id);
|
|
457
|
-
largebm::theta = (minsup < 1.0)
|
|
458
|
-
? static_cast<unsigned long long>(std::ceil(minsup * largebm::N))
|
|
459
|
-
: static_cast<unsigned long long>(minsup);
|
|
460
|
-
|
|
461
|
-
// 1.2) Initialize DFS buffer (size = L)
|
|
462
|
-
largebm::DFS.reserve(largebm::L);
|
|
463
|
-
for (unsigned int i = 0; i < largebm::L; ++i)
|
|
464
|
-
largebm::DFS.emplace_back(-static_cast<int>(i) - 1);
|
|
465
|
-
|
|
466
|
-
// 1.3) Build the MDD “Tree”
|
|
467
|
-
// Insert one dummy root node (item=0, freq=0, anct=0)
|
|
468
|
-
largebm::Tree.emplace_back(0, 0, 0);
|
|
469
|
-
for (auto &seq : largebm::items)
|
|
470
|
-
largebm::Build_MDD(const_cast<std::vector<int>&>(seq));
|
|
471
|
-
}
|
|
1081
|
+
// 1.3) Build the MDD “Tree”
|
|
1082
|
+
// Insert one dummy root node (item=0, freq=0, anct=0)
|
|
1083
|
+
largebm::Tree.emplace_back(0, 0, 0);
|
|
1084
|
+
for (auto &seq : largebm::items)
|
|
1085
|
+
largebm::Build_MDD(const_cast<std::vector<int>&>(seq));
|
|
1086
|
+
}
|
|
472
1087
|
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
}
|
|
480
|
-
largebm::inv_item_dic = std::move(inv);
|
|
1088
|
+
// 2) Rebuild inverse-dictionary from fresh item_dic
|
|
1089
|
+
{
|
|
1090
|
+
std::vector<int> inv(largebm::item_dic.size() + 1);
|
|
1091
|
+
for (int old = 1; old <= static_cast<int>(largebm::item_dic.size()); ++old) {
|
|
1092
|
+
int cid = largebm::item_dic[old - 1];
|
|
1093
|
+
if (cid > 0) inv[cid] = old;
|
|
481
1094
|
}
|
|
1095
|
+
largebm::inv_item_dic = std::move(inv);
|
|
1096
|
+
}
|
|
482
1097
|
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
1098
|
+
// 3) Start timing and run the miner
|
|
1099
|
+
largebm::start_time = std::clock();
|
|
1100
|
+
largebm::Freq_miner();
|
|
1101
|
+
|
|
1102
|
+
// 4) Collect results and elapsed time
|
|
1103
|
+
const auto& pats = largebm::GetCollected();
|
|
1104
|
+
|
|
1105
|
+
py::dict out;
|
|
1106
|
+
out["patterns"] = pats;
|
|
1107
|
+
out["time"] = largebm::give_time(std::clock() - largebm::start_time);
|
|
1108
|
+
return out;
|
|
1109
|
+
},
|
|
1110
|
+
py::arg("data"),
|
|
1111
|
+
py::arg("minsup") = 0.01,
|
|
1112
|
+
py::arg("time_limit") = 36000,
|
|
1113
|
+
py::arg("preproc") = false,
|
|
1114
|
+
py::arg("use_dic") = false,
|
|
1115
|
+
py::arg("verbose") = false,
|
|
1116
|
+
py::arg("out_file") = ""
|
|
1117
|
+
);
|
|
486
1118
|
|
|
487
|
-
// 4) Collect results and elapsed time
|
|
488
|
-
py::dict out;
|
|
489
|
-
out["patterns"] = largebm::GetCollected();
|
|
490
|
-
out["time"] = largebm::give_time(std::clock() - largebm::start_time);
|
|
491
|
-
return out;
|
|
492
|
-
},
|
|
493
|
-
py::arg("data"),
|
|
494
|
-
py::arg("minsup") = 0.01,
|
|
495
|
-
py::arg("time_limit") = 36000,
|
|
496
|
-
py::arg("preproc") = false,
|
|
497
|
-
py::arg("use_dic") = false,
|
|
498
|
-
py::arg("verbose") = false,
|
|
499
|
-
py::arg("out_file") = ""
|
|
500
|
-
);
|
|
501
1119
|
|
|
502
1120
|
|
|
503
1121
|
m.def("LargeHTMiner",
|
|
@@ -606,4 +1224,4 @@ m.def("LargeHTMiner",
|
|
|
606
1224
|
|
|
607
1225
|
|
|
608
1226
|
|
|
609
|
-
}
|
|
1227
|
+
} */
|