effspm 0.3.0__cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl → 0.3.2__cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- effspm/_effspm.cpp +572 -2
- effspm/_effspm.cpython-311-i386-linux-gnu.so +0 -0
- effspm/btminer/src/load_inst.cpp +11 -10
- effspm/btminer/src/main.cpp +83 -0
- effspm/htminer/src/build_mdd.cpp +41 -66
- effspm/htminer/src/build_mdd.hpp +56 -49
- effspm/htminer/src/freq_miner.cpp +341 -307
- effspm/htminer/src/freq_miner.hpp +39 -40
- effspm/htminer/src/load_inst.cpp +287 -336
- effspm/htminer/src/load_inst.hpp +23 -6
- effspm/htminer/src/main.cpp +97 -0
- effspm/htminer/src/utility.cpp +38 -57
- effspm/htminer/src/utility.hpp +9 -64
- effspm/largebm/src/main.cpp +95 -0
- effspm/largehm/src/build_mdd.cpp +75 -110
- effspm/largehm/src/build_mdd.hpp +53 -73
- effspm/largehm/src/freq_miner.cpp +132 -173
- effspm/largehm/src/freq_miner.hpp +37 -60
- effspm/largehm/src/load_inst.cpp +136 -191
- effspm/largehm/src/load_inst.hpp +13 -50
- effspm/largehm/src/main.cpp +95 -0
- effspm/largehm/src/utility.cpp +46 -28
- effspm/largehm/src/utility.hpp +18 -16
- effspm/largepp/src/load_inst.cpp +5 -4
- effspm/largepp/src/main.cpp +108 -0
- effspm/load_inst.cpp +8 -8
- effspm/main.cpp +103 -0
- {effspm-0.3.0.dist-info → effspm-0.3.2.dist-info}/METADATA +1 -1
- effspm-0.3.2.dist-info/RECORD +60 -0
- effspm-0.3.0.dist-info/RECORD +0 -54
- {effspm-0.3.0.dist-info → effspm-0.3.2.dist-info}/WHEEL +0 -0
- {effspm-0.3.0.dist-info → effspm-0.3.2.dist-info}/licenses/LICENSE +0 -0
- {effspm-0.3.0.dist-info → effspm-0.3.2.dist-info}/top_level.txt +0 -0
effspm/_effspm.cpp
CHANGED
|
@@ -2,6 +2,575 @@
|
|
|
2
2
|
|
|
3
3
|
#include <pybind11/pybind11.h>
|
|
4
4
|
#include <pybind11/stl.h>
|
|
5
|
+
|
|
6
|
+
#include <iostream>
|
|
7
|
+
#include <fstream>
|
|
8
|
+
#include <cstdio> // std::remove
|
|
9
|
+
#include <vector>
|
|
10
|
+
#include <string>
|
|
11
|
+
#include <ctime>
|
|
12
|
+
#include <cmath>
|
|
13
|
+
|
|
14
|
+
namespace py = pybind11;
|
|
15
|
+
|
|
16
|
+
// PrefixProjection headers (global namespace)
|
|
17
|
+
#include "freq_miner.hpp"
|
|
18
|
+
#include "load_inst.hpp"
|
|
19
|
+
#include "utility.hpp"
|
|
20
|
+
|
|
21
|
+
// BTMiner (namespaced)
|
|
22
|
+
#include "btminer/src/freq_miner.hpp"
|
|
23
|
+
#include "btminer/src/load_inst.hpp"
|
|
24
|
+
#include "btminer/src/utility.hpp"
|
|
25
|
+
#include "btminer/src/build_mdd.hpp"
|
|
26
|
+
|
|
27
|
+
// HTMiner (namespaced)
|
|
28
|
+
#include "htminer/src/build_mdd.hpp"
|
|
29
|
+
#include "htminer/src/freq_miner.hpp"
|
|
30
|
+
#include "htminer/src/load_inst.hpp"
|
|
31
|
+
#include "htminer/src/utility.hpp"
|
|
32
|
+
|
|
33
|
+
// LargePrefixProjection
|
|
34
|
+
#include "largepp/src/freq_miner.hpp"
|
|
35
|
+
#include "largepp/src/load_inst.hpp"
|
|
36
|
+
#include "largepp/src/utility.hpp"
|
|
37
|
+
|
|
38
|
+
// LargeBTMiner
|
|
39
|
+
#include "largebm/src/freq_miner.hpp"
|
|
40
|
+
#include "largebm/src/load_inst.hpp"
|
|
41
|
+
#include "largebm/src/utility.hpp"
|
|
42
|
+
#include "largebm/src/build_mdd.hpp"
|
|
43
|
+
|
|
44
|
+
// LargeHTMiner
|
|
45
|
+
#include "largehm/src/freq_miner.hpp"
|
|
46
|
+
#include "largehm/src/load_inst.hpp"
|
|
47
|
+
#include "largehm/src/utility.hpp"
|
|
48
|
+
#include "largehm/src/build_mdd.hpp"
|
|
49
|
+
|
|
50
|
+
namespace {
|
|
51
|
+
|
|
52
|
+
// Scope guard for a temporary file: when the guard is destroyed, the file
// named by `path` is deleted (nothing happens while `path` is empty).
//
// Copying is disabled: two guards holding the same path would delete the
// file while the other guard's owner may still be reading it.
struct TempFile {
    std::string path;  // file to delete on destruction; empty means "no file"

    TempFile() = default;
    TempFile(const TempFile&) = delete;
    TempFile& operator=(const TempFile&) = delete;

    ~TempFile() {
        if (!path.empty()) {
            // Best effort: the result of std::remove is intentionally ignored,
            // a destructor has no way to report the failure.
            std::remove(path.c_str());
        }
    }
};
|
|
61
|
+
|
|
62
|
+
// Writes `seqs` to a freshly named temporary text file and returns its path.
//
// File format: one sequence per line, items separated by single spaces,
// every line (including the one for an empty sequence) terminated by '\n'.
//
// Throws std::runtime_error when no temporary name can be generated, when the
// file cannot be opened, or when writing/closing the file fails. On a write
// failure the partially written file is removed before throwing.
//
// The caller owns the returned file and is responsible for deleting it.
//
// NOTE(review): std::tmpnam only produces a name; another process could create
// the same file before it is opened here. Consider a platform-specific
// atomic alternative (e.g. mkstemp on POSIX) if that matters.
std::string write_temp_seq_file(const std::vector<std::vector<int>>& seqs) {
    char tmp_name[L_tmpnam];
    if (!std::tmpnam(tmp_name)) {
        throw std::runtime_error("Failed to create temporary file name");
    }
    std::string path = std::string(tmp_name) + ".txt";

    std::ofstream ofs(path);
    if (!ofs) {
        throw std::runtime_error("Failed to open temporary file for writing: " + path);
    }

    for (const auto& seq : seqs) {
        const char* separator = "";
        for (const int item : seq) {
            ofs << separator << item;
            separator = " ";
        }
        ofs << '\n';
    }

    // close() flushes; any earlier write error or a failed flush leaves the
    // stream in a failed state, which must not go unnoticed because the
    // loader would otherwise read a truncated database.
    ofs.close();
    if (ofs.fail()) {
        std::remove(path.c_str());
        throw std::runtime_error("Failed to write temporary file: " + path);
    }
    return path;
}
|
|
87
|
+
|
|
88
|
+
} // anonymous namespace
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
PYBIND11_MODULE(_effspm, m) {
|
|
92
|
+
m.doc() = "Unified SPM library: PrefixProjection, BTMiner, HTMiner, Large* variants";
|
|
93
|
+
|
|
94
|
+
// ─────────────────────────────────────────────────────────────
|
|
95
|
+
// PrefixProjection (works directly on Python lists or files)
|
|
96
|
+
// ─────────────────────────────────────────────────────────────
|
|
97
|
+
m.def("PrefixProjection",
|
|
98
|
+
[](py::object data,
|
|
99
|
+
double minsup,
|
|
100
|
+
unsigned int time_limit,
|
|
101
|
+
bool preproc,
|
|
102
|
+
bool use_dic,
|
|
103
|
+
bool verbose,
|
|
104
|
+
const std::string &out_file)
|
|
105
|
+
{
|
|
106
|
+
::time_limit = time_limit;
|
|
107
|
+
::pre_pro = preproc;
|
|
108
|
+
::use_dic = use_dic;
|
|
109
|
+
::use_list = false;
|
|
110
|
+
::b_disp = verbose; // controls prints in original code
|
|
111
|
+
::b_write = !out_file.empty();
|
|
112
|
+
::out_file = out_file;
|
|
113
|
+
|
|
114
|
+
ClearCollected();
|
|
115
|
+
start_time = std::clock();
|
|
116
|
+
|
|
117
|
+
if (py::isinstance<py::str>(data)) {
|
|
118
|
+
std::string path = data.cast<std::string>();
|
|
119
|
+
if (!Load_instance(path, minsup))
|
|
120
|
+
throw std::runtime_error("PrefixProjection: failed to load file: " + path);
|
|
121
|
+
} else {
|
|
122
|
+
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
123
|
+
items = std::move(seqs);
|
|
124
|
+
N = items.size();
|
|
125
|
+
|
|
126
|
+
int max_id = 0;
|
|
127
|
+
for (auto &seq : items)
|
|
128
|
+
for (int x : seq)
|
|
129
|
+
max_id = std::max(max_id, std::abs(x));
|
|
130
|
+
L = max_id;
|
|
131
|
+
|
|
132
|
+
theta = (minsup < 1.0) ? std::ceil(minsup * N) : minsup;
|
|
133
|
+
|
|
134
|
+
DFS.clear();
|
|
135
|
+
DFS.reserve(L);
|
|
136
|
+
for (unsigned int i = 0; i < L; ++i)
|
|
137
|
+
DFS.emplace_back(-static_cast<int>(i) - 1);
|
|
138
|
+
|
|
139
|
+
M = 0;
|
|
140
|
+
E = 0;
|
|
141
|
+
for (auto &seq : items) {
|
|
142
|
+
M = std::max<unsigned int>(M, seq.size());
|
|
143
|
+
E += seq.size();
|
|
144
|
+
}
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
Freq_miner();
|
|
148
|
+
|
|
149
|
+
py::dict out;
|
|
150
|
+
out["patterns"] = GetCollected();
|
|
151
|
+
out["time"] = give_time(std::clock() - start_time);
|
|
152
|
+
return out;
|
|
153
|
+
},
|
|
154
|
+
py::arg("data"),
|
|
155
|
+
py::arg("minsup") = 0.01,
|
|
156
|
+
py::arg("time_limit") = 36000,
|
|
157
|
+
py::arg("preproc") = false,
|
|
158
|
+
py::arg("use_dic") = false,
|
|
159
|
+
py::arg("verbose") = false,
|
|
160
|
+
py::arg("out_file") = ""
|
|
161
|
+
);
|
|
162
|
+
|
|
163
|
+
// ─────────────────────────────────────────────────────────────
|
|
164
|
+
// BTMiner (always uses professor's Load_instance)
|
|
165
|
+
// ─────────────────────────────────────────────────────────────
|
|
166
|
+
// ─────────────────────────────────────────────────────────────
|
|
167
|
+
// BTMiner (always uses professor's Load_instance)
|
|
168
|
+
// ─────────────────────────────────────────────────────────────
|
|
169
|
+
m.def("BTMiner",
|
|
170
|
+
[](py::object data,
|
|
171
|
+
double minsup,
|
|
172
|
+
unsigned int time_limit,
|
|
173
|
+
bool preproc,
|
|
174
|
+
bool use_dic,
|
|
175
|
+
bool verbose,
|
|
176
|
+
const std::string &out_file)
|
|
177
|
+
{
|
|
178
|
+
// 1) Configure professor globals
|
|
179
|
+
btminer::time_limit = static_cast<int>(time_limit);
|
|
180
|
+
btminer::pre_pro = preproc;
|
|
181
|
+
btminer::use_dic = use_dic;
|
|
182
|
+
btminer::b_disp = verbose;
|
|
183
|
+
btminer::b_write = !out_file.empty();
|
|
184
|
+
btminer::out_file = out_file;
|
|
185
|
+
btminer::N_mult = 1;
|
|
186
|
+
btminer::M_mult = 1;
|
|
187
|
+
btminer::just_build = false;
|
|
188
|
+
|
|
189
|
+
// 2) HARD RESET of *known* global state for BTMiner
|
|
190
|
+
// (Only touch what we know exists in btminer namespace)
|
|
191
|
+
btminer::ClearCollected(); // clear collected patterns
|
|
192
|
+
btminer::Tree.clear(); // clear MDD tree
|
|
193
|
+
btminer::DFS.clear(); // clear DFS patterns
|
|
194
|
+
|
|
195
|
+
btminer::M = 0;
|
|
196
|
+
btminer::L = 0;
|
|
197
|
+
btminer::N = 0;
|
|
198
|
+
btminer::theta = 0;
|
|
199
|
+
btminer::E = 0;
|
|
200
|
+
btminer::num_patt = 0; // reset pattern counter if defined
|
|
201
|
+
|
|
202
|
+
// NOTE: we do NOT reinsert root here; btminer::Load_instance()
|
|
203
|
+
// is responsible for calling Tree.emplace_back(0,0,0) as needed.
|
|
204
|
+
|
|
205
|
+
btminer::start_time = std::clock();
|
|
206
|
+
|
|
207
|
+
// 3) Handle input (path or list-of-lists)
|
|
208
|
+
TempFile tmp;
|
|
209
|
+
std::string path;
|
|
210
|
+
|
|
211
|
+
if (py::isinstance<py::str>(data)) {
|
|
212
|
+
// File path: use directly
|
|
213
|
+
path = data.cast<std::string>();
|
|
214
|
+
} else {
|
|
215
|
+
// Python list → write to a temp file in professor’s format
|
|
216
|
+
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
217
|
+
tmp.path = write_temp_seq_file(seqs);
|
|
218
|
+
path = tmp.path;
|
|
219
|
+
}
|
|
220
|
+
|
|
221
|
+
if (verbose) {
|
|
222
|
+
std::cerr << "[BTMiner] path=" << path
|
|
223
|
+
<< " minsup=" << minsup
|
|
224
|
+
<< " preproc=" << preproc
|
|
225
|
+
<< " use_dic=" << use_dic
|
|
226
|
+
<< std::endl;
|
|
227
|
+
}
|
|
228
|
+
|
|
229
|
+
// 4) Build MDD + run miner
|
|
230
|
+
if (!btminer::Load_instance(path, minsup)) {
|
|
231
|
+
throw std::runtime_error("BTMiner: failed to load instance from: " + path);
|
|
232
|
+
}
|
|
233
|
+
|
|
234
|
+
btminer::Freq_miner();
|
|
235
|
+
|
|
236
|
+
// 5) Return results
|
|
237
|
+
py::dict out;
|
|
238
|
+
out["patterns"] = btminer::GetCollected();
|
|
239
|
+
out["num_patterns"] = btminer::num_patt;
|
|
240
|
+
out["time"] = btminer::give_time(std::clock() - btminer::start_time);
|
|
241
|
+
out["N"] = btminer::N;
|
|
242
|
+
out["L"] = btminer::L;
|
|
243
|
+
out["theta"] = btminer::theta;
|
|
244
|
+
return out;
|
|
245
|
+
},
|
|
246
|
+
py::arg("data"),
|
|
247
|
+
py::arg("minsup") = 0.01,
|
|
248
|
+
py::arg("time_limit") = 36000,
|
|
249
|
+
py::arg("preproc") = false,
|
|
250
|
+
py::arg("use_dic") = false,
|
|
251
|
+
py::arg("verbose") = false,
|
|
252
|
+
py::arg("out_file") = ""
|
|
253
|
+
);
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
// ─────────────────────────────────────────────────────────────
|
|
257
|
+
// HTMiner (works on files; we use a temp file for in-memory data)
|
|
258
|
+
// ─────────────────────────────────────────────────────────────
|
|
259
|
+
// ─────────────────────────────────────────────────────────────
|
|
260
|
+
// HTMiner (always uses professor's Load_instance; pre_pro forced ON)
|
|
261
|
+
// ─────────────────────────────────────────────────────────────
|
|
262
|
+
m.def("HTMiner",
|
|
263
|
+
[](py::object data,
|
|
264
|
+
double minsup,
|
|
265
|
+
unsigned int time_limit,
|
|
266
|
+
bool /*preproc*/, // Python arg is ignored internally
|
|
267
|
+
bool use_dic,
|
|
268
|
+
bool verbose,
|
|
269
|
+
const std::string &out_file)
|
|
270
|
+
{
|
|
271
|
+
using namespace htminer;
|
|
272
|
+
|
|
273
|
+
// ───────── Global parameter setup ─────────
|
|
274
|
+
htminer::time_limit = time_limit;
|
|
275
|
+
|
|
276
|
+
// IMPORTANT: always run with preprocessing ON,
|
|
277
|
+
// regardless of the Python `preproc` flag.
|
|
278
|
+
htminer::pre_pro = true;
|
|
279
|
+
htminer::use_dic = use_dic;
|
|
280
|
+
htminer::just_build = false;
|
|
281
|
+
htminer::b_disp = verbose;
|
|
282
|
+
htminer::b_write = !out_file.empty();
|
|
283
|
+
htminer::out_file = out_file;
|
|
284
|
+
|
|
285
|
+
// ───────── HARD RESET of HTMiner globals ─────────
|
|
286
|
+
htminer::ClearCollected();
|
|
287
|
+
htminer::Tree.clear();
|
|
288
|
+
htminer::VTree.clear();
|
|
289
|
+
htminer::CTree.clear();
|
|
290
|
+
htminer::DFS.clear();
|
|
291
|
+
htminer::VDFS.clear();
|
|
292
|
+
htminer::item_dic.clear();
|
|
293
|
+
|
|
294
|
+
htminer::M = 0;
|
|
295
|
+
htminer::N = 0;
|
|
296
|
+
htminer::L = 0;
|
|
297
|
+
htminer::E = 0;
|
|
298
|
+
htminer::theta = 0;
|
|
299
|
+
htminer::mlim = 0;
|
|
300
|
+
htminer::itmset_exists = false;
|
|
301
|
+
|
|
302
|
+
// NOTE: do NOT add a root arc here;
|
|
303
|
+
// htminer::Load_instance() already does Tree.emplace_back(0,0,0)
|
|
304
|
+
htminer::start_time = std::clock();
|
|
305
|
+
|
|
306
|
+
// ───────── Handle input (path or in-memory sequences) ─────────
|
|
307
|
+
TempFile tmp;
|
|
308
|
+
std::string path;
|
|
309
|
+
|
|
310
|
+
if (py::isinstance<py::str>(data)) {
|
|
311
|
+
// data is a file path
|
|
312
|
+
path = data.cast<std::string>();
|
|
313
|
+
} else {
|
|
314
|
+
// data is a list[list[int]] → write a temp file in the same text format
|
|
315
|
+
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
316
|
+
tmp.path = write_temp_seq_file(seqs);
|
|
317
|
+
path = tmp.path;
|
|
318
|
+
}
|
|
319
|
+
|
|
320
|
+
if (verbose) {
|
|
321
|
+
std::cerr << "[HTMiner] path=" << path
|
|
322
|
+
<< " minsup=" << minsup
|
|
323
|
+
<< " preproc(always)=true"
|
|
324
|
+
<< " use_dic=" << use_dic
|
|
325
|
+
<< std::endl;
|
|
326
|
+
}
|
|
327
|
+
|
|
328
|
+
// ───────── Build MDD via professor's loader ─────────
|
|
329
|
+
if (!htminer::Load_instance(path, minsup)) {
|
|
330
|
+
throw std::runtime_error("HTMiner: failed to load instance from: " + path);
|
|
331
|
+
}
|
|
332
|
+
|
|
333
|
+
// ───────── Run miner ─────────
|
|
334
|
+
htminer::Freq_miner();
|
|
335
|
+
|
|
336
|
+
// ───────── Return results ─────────
|
|
337
|
+
py::dict out;
|
|
338
|
+
out["patterns"] = htminer::GetCollected();
|
|
339
|
+
out["time"] = htminer::give_time(std::clock() - htminer::start_time);
|
|
340
|
+
return out;
|
|
341
|
+
},
|
|
342
|
+
py::arg("data"),
|
|
343
|
+
py::arg("minsup") = 0.01,
|
|
344
|
+
py::arg("time_limit") = 36000,
|
|
345
|
+
py::arg("preproc") = false, // kept for API symmetry, but IGNORED
|
|
346
|
+
py::arg("use_dic") = false,
|
|
347
|
+
py::arg("verbose") = false,
|
|
348
|
+
py::arg("out_file") = ""
|
|
349
|
+
);
|
|
350
|
+
|
|
351
|
+
// ─────────────────────────────────────────────────────────────
|
|
352
|
+
// LargePrefixProjection (already has its own Load_py)
|
|
353
|
+
// ─────────────────────────────────────────────────────────────
|
|
354
|
+
m.def("LargePrefixProjection",
|
|
355
|
+
[](py::object data,
|
|
356
|
+
double minsup,
|
|
357
|
+
unsigned int time_limit,
|
|
358
|
+
bool preproc,
|
|
359
|
+
bool use_dic,
|
|
360
|
+
bool verbose,
|
|
361
|
+
const std::string &out_file)
|
|
362
|
+
{
|
|
363
|
+
largepp::time_limit = time_limit;
|
|
364
|
+
largepp::pre_pro = preproc;
|
|
365
|
+
largepp::use_dic = use_dic;
|
|
366
|
+
largepp::use_list = true; // large prefix uses list-based mining
|
|
367
|
+
largepp::b_disp = verbose;
|
|
368
|
+
largepp::b_write = !out_file.empty();
|
|
369
|
+
largepp::out_file = out_file;
|
|
370
|
+
largepp::just_build = false;
|
|
371
|
+
|
|
372
|
+
largepp::ClearCollected();
|
|
373
|
+
largepp::start_time = std::clock();
|
|
374
|
+
|
|
375
|
+
if (py::isinstance<py::str>(data)) {
|
|
376
|
+
std::string fname = data.cast<std::string>();
|
|
377
|
+
largepp::Load_instance(fname, minsup);
|
|
378
|
+
} else {
|
|
379
|
+
largepp::Load_py(data, minsup);
|
|
380
|
+
}
|
|
381
|
+
|
|
382
|
+
largepp::Freq_miner();
|
|
383
|
+
|
|
384
|
+
py::dict out;
|
|
385
|
+
out["patterns"] = largepp::GetCollected();
|
|
386
|
+
out["time"] = largepp::give_time(std::clock() - largepp::start_time);
|
|
387
|
+
return out;
|
|
388
|
+
},
|
|
389
|
+
py::arg("data"),
|
|
390
|
+
py::arg("minsup") = 0.01,
|
|
391
|
+
py::arg("time_limit") = 36000,
|
|
392
|
+
py::arg("preproc") = false,
|
|
393
|
+
py::arg("use_dic") = false,
|
|
394
|
+
py::arg("verbose") = false,
|
|
395
|
+
py::arg("out_file") = ""
|
|
396
|
+
);
|
|
397
|
+
|
|
398
|
+
// ─────────────────────────────────────────────────────────────
|
|
399
|
+
// LargeBTMiner (always uses professor's largebm::Load_instance)
|
|
400
|
+
// ─────────────────────────────────────────────────────────────
|
|
401
|
+
m.def("LargeBTMiner",
|
|
402
|
+
[](py::object data,
|
|
403
|
+
double minsup,
|
|
404
|
+
unsigned int time_limit,
|
|
405
|
+
bool preproc,
|
|
406
|
+
bool use_dic,
|
|
407
|
+
bool verbose,
|
|
408
|
+
const std::string &out_file)
|
|
409
|
+
{
|
|
410
|
+
using namespace largebm;
|
|
411
|
+
|
|
412
|
+
largebm::time_limit = time_limit;
|
|
413
|
+
largebm::pre_pro = preproc;
|
|
414
|
+
largebm::use_dic = use_dic;
|
|
415
|
+
largebm::use_list = false; // MDD-based
|
|
416
|
+
largebm::b_disp = verbose;
|
|
417
|
+
largebm::b_write = !out_file.empty();
|
|
418
|
+
largebm::out_file = out_file;
|
|
419
|
+
largebm::just_build = false;
|
|
420
|
+
|
|
421
|
+
largebm::ClearCollected();
|
|
422
|
+
largebm::items.clear();
|
|
423
|
+
largebm::item_dic.clear();
|
|
424
|
+
largebm::inv_item_dic.clear();
|
|
425
|
+
largebm::Tree.clear();
|
|
426
|
+
largebm::DFS.clear();
|
|
427
|
+
|
|
428
|
+
largebm::start_time = std::clock();
|
|
429
|
+
|
|
430
|
+
TempFile tmp;
|
|
431
|
+
std::string path;
|
|
432
|
+
|
|
433
|
+
if (py::isinstance<py::str>(data)) {
|
|
434
|
+
path = data.cast<std::string>();
|
|
435
|
+
} else {
|
|
436
|
+
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
437
|
+
tmp.path = write_temp_seq_file(seqs);
|
|
438
|
+
path = tmp.path;
|
|
439
|
+
}
|
|
440
|
+
|
|
441
|
+
if (verbose) {
|
|
442
|
+
std::cerr << "[LargeBTMiner] path=" << path
|
|
443
|
+
<< " minsup=" << minsup
|
|
444
|
+
<< " preproc=" << preproc
|
|
445
|
+
<< " use_dic=" << use_dic
|
|
446
|
+
<< std::endl;
|
|
447
|
+
}
|
|
448
|
+
|
|
449
|
+
if (!largebm::Load_instance(path, minsup)) {
|
|
450
|
+
throw std::runtime_error("LargeBTMiner: failed to load instance from: " + path);
|
|
451
|
+
}
|
|
452
|
+
|
|
453
|
+
largebm::Freq_miner();
|
|
454
|
+
|
|
455
|
+
py::dict out;
|
|
456
|
+
out["patterns"] = largebm::GetCollected();
|
|
457
|
+
out["time"] = largebm::give_time(std::clock() - largebm::start_time);
|
|
458
|
+
return out;
|
|
459
|
+
},
|
|
460
|
+
py::arg("data"),
|
|
461
|
+
py::arg("minsup") = 0.01,
|
|
462
|
+
py::arg("time_limit") = 36000,
|
|
463
|
+
py::arg("preproc") = false,
|
|
464
|
+
py::arg("use_dic") = false,
|
|
465
|
+
py::arg("verbose") = false,
|
|
466
|
+
py::arg("out_file") = ""
|
|
467
|
+
);
|
|
468
|
+
|
|
469
|
+
// ─────────────────────────────────────────────────────────────
|
|
470
|
+
// LargeHTMiner (always uses professor's largehm::Load_instance; pre_pro forced ON)
|
|
471
|
+
// ─────────────────────────────────────────────────────────────
|
|
472
|
+
// ─────────────────────────────────────────────────────────────
|
|
473
|
+
// LargeHTMiner (professor's Large HTMiner, namespaced as largehm)
|
|
474
|
+
// ─────────────────────────────────────────────────────────────
|
|
475
|
+
m.def("LargeHTMiner",
|
|
476
|
+
[](py::object data,
|
|
477
|
+
double minsup,
|
|
478
|
+
unsigned int time_limit,
|
|
479
|
+
bool /*preproc*/, // kept for API symmetry; ignored
|
|
480
|
+
bool use_dic,
|
|
481
|
+
bool verbose,
|
|
482
|
+
const std::string &out_file)
|
|
483
|
+
{
|
|
484
|
+
using namespace largehm;
|
|
485
|
+
|
|
486
|
+
// 1) Global configuration (mirror professor's style)
|
|
487
|
+
largehm::time_limit = time_limit;
|
|
488
|
+
largehm::pre_pro = true; // always preprocess
|
|
489
|
+
largehm::use_dic = use_dic;
|
|
490
|
+
largehm::just_build = false;
|
|
491
|
+
largehm::b_disp = verbose;
|
|
492
|
+
largehm::b_write = !out_file.empty();
|
|
493
|
+
largehm::out_file = out_file;
|
|
494
|
+
|
|
495
|
+
// 2) HARD RESET of all global state for a fresh run
|
|
496
|
+
largehm::ClearCollected(); // our helper in largehm::utility.cpp
|
|
497
|
+
|
|
498
|
+
largehm::M = 0;
|
|
499
|
+
largehm::L = 0;
|
|
500
|
+
largehm::mlim = 0;
|
|
501
|
+
largehm::N = 0;
|
|
502
|
+
largehm::theta = 0;
|
|
503
|
+
largehm::E = 0;
|
|
504
|
+
largehm::itmset_exists = false;
|
|
505
|
+
|
|
506
|
+
// containers
|
|
507
|
+
// (item_dic reset is optional and not strictly needed here)
|
|
508
|
+
largehm::DFS.clear();
|
|
509
|
+
largehm::VDFS.clear();
|
|
510
|
+
largehm::Tree.clear();
|
|
511
|
+
largehm::VTree.clear();
|
|
512
|
+
largehm::CTree.clear();
|
|
513
|
+
|
|
514
|
+
largehm::start_time = std::clock();
|
|
515
|
+
|
|
516
|
+
// 3) Handle input (file path or Python list)
|
|
517
|
+
TempFile tmp;
|
|
518
|
+
std::string path;
|
|
519
|
+
|
|
520
|
+
if (py::isinstance<py::str>(data)) {
|
|
521
|
+
path = data.cast<std::string>();
|
|
522
|
+
} else {
|
|
523
|
+
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
524
|
+
tmp.path = write_temp_seq_file(seqs);
|
|
525
|
+
path = tmp.path;
|
|
526
|
+
}
|
|
527
|
+
|
|
528
|
+
if (verbose) {
|
|
529
|
+
std::cerr << "[LargeHTMiner] path=" << path
|
|
530
|
+
<< " minsup=" << minsup
|
|
531
|
+
<< " preproc(always)=true"
|
|
532
|
+
<< " use_dic=" << use_dic
|
|
533
|
+
<< std::endl;
|
|
534
|
+
}
|
|
535
|
+
|
|
536
|
+
// 4) Build MDD / load instance.
|
|
537
|
+
// NOTE: Load_instance() itself does Tree.emplace_back(0,0,0),
|
|
538
|
+
// so we DO NOT create a root node here.
|
|
539
|
+
if (!largehm::Load_instance(path, minsup)) {
|
|
540
|
+
throw std::runtime_error("LargeHTMiner: failed to load instance from: " + path);
|
|
541
|
+
}
|
|
542
|
+
|
|
543
|
+
// 5) Run miner (same timing logic as original main)
|
|
544
|
+
if (!largehm::just_build &&
|
|
545
|
+
largehm::give_time(std::clock() - largehm::start_time) < largehm::time_limit)
|
|
546
|
+
{
|
|
547
|
+
largehm::Freq_miner();
|
|
548
|
+
if (largehm::give_time(std::clock() - largehm::start_time) >= largehm::time_limit) {
|
|
549
|
+
std::cout << "TIME LIMIT REACHED\n";
|
|
550
|
+
}
|
|
551
|
+
}
|
|
552
|
+
|
|
553
|
+
// 6) Return collected patterns + runtime
|
|
554
|
+
py::dict out;
|
|
555
|
+
out["patterns"] = largehm::GetCollected();
|
|
556
|
+
out["time"] = largehm::give_time(std::clock() - largehm::start_time);
|
|
557
|
+
return out;
|
|
558
|
+
},
|
|
559
|
+
py::arg("data"),
|
|
560
|
+
py::arg("minsup") = 0.01,
|
|
561
|
+
py::arg("time_limit") = 36000,
|
|
562
|
+
py::arg("preproc") = false, // kept for API symmetry
|
|
563
|
+
py::arg("use_dic") = false,
|
|
564
|
+
py::arg("verbose") = false,
|
|
565
|
+
py::arg("out_file") = ""
|
|
566
|
+
);
|
|
567
|
+
|
|
568
|
+
|
|
569
|
+
}
|
|
570
|
+
|
|
571
|
+
|
|
572
|
+
/*#include <pybind11/pybind11.h>
|
|
573
|
+
#include <pybind11/stl.h>
|
|
5
574
|
namespace py = pybind11;
|
|
6
575
|
#include <iostream>
|
|
7
576
|
|
|
@@ -28,6 +597,7 @@ namespace py = pybind11;
|
|
|
28
597
|
#include "largepp/src/load_inst.hpp"
|
|
29
598
|
#include "largepp/src/utility.hpp"
|
|
30
599
|
|
|
600
|
+
|
|
31
601
|
#include "largebm/src/freq_miner.hpp"
|
|
32
602
|
#include "largebm/src/load_inst.hpp"
|
|
33
603
|
#include "largebm/src/utility.hpp"
|
|
@@ -469,7 +1039,7 @@ m.def("HTMiner",
|
|
|
469
1039
|
// ─────────────────────────────────────────────────────────────────────────
|
|
470
1040
|
// LargeBTMiner (MDD-based)
|
|
471
1041
|
// ─────────────────────────────────────────────────────────────────────────
|
|
472
|
-
m.def("LargeBTMiner",
|
|
1042
|
+
/*m.def("LargeBTMiner",
|
|
473
1043
|
[](py::object data,
|
|
474
1044
|
double minsup,
|
|
475
1045
|
unsigned int time_limit,
|
|
@@ -676,4 +1246,4 @@ m.def("LargeHTMiner",
|
|
|
676
1246
|
|
|
677
1247
|
|
|
678
1248
|
|
|
679
|
-
}
|
|
1249
|
+
} */
|
|
Binary file
|
effspm/btminer/src/load_inst.cpp
CHANGED
|
@@ -67,8 +67,8 @@ bool Load_instance(string &items_file, double thresh) {
|
|
|
67
67
|
if (pre_pro) {
|
|
68
68
|
if (!Preprocess(items_file, thresh))
|
|
69
69
|
return false;
|
|
70
|
-
|
|
71
|
-
|
|
70
|
+
if (b_disp)
|
|
71
|
+
cout << "\nPreprocess done in " << give_time(clock() - kk) << " seconds\n\n";
|
|
72
72
|
|
|
73
73
|
// build empty DFS of size L
|
|
74
74
|
DFS.clear();
|
|
@@ -88,12 +88,13 @@ bool Load_instance(string &items_file, double thresh) {
|
|
|
88
88
|
else
|
|
89
89
|
theta = static_cast<int>(thresh);
|
|
90
90
|
}
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
91
|
+
if (b_disp)
|
|
92
|
+
cout << "\nMDD Database built in " << give_time(clock() - kk) << " seconds\n\n";
|
|
93
|
+
if (b_disp)
|
|
94
|
+
cout << "Found " << N * N_mult
|
|
95
|
+
<< " sequence, with max line len " << M
|
|
96
|
+
<< ", and " << L << " items, and " << E << " enteries\n";
|
|
97
|
+
//cout << "Total MDD nodes: " << Tree.size() << endl;
|
|
97
98
|
|
|
98
99
|
return true;
|
|
99
100
|
}
|
|
@@ -147,8 +148,8 @@ bool Preprocess(string &inst, double thresh) {
|
|
|
147
148
|
if (freq[i] >= theta)
|
|
148
149
|
item_dic[i] = ++real_L;
|
|
149
150
|
}
|
|
150
|
-
|
|
151
|
-
|
|
151
|
+
if (b_disp)
|
|
152
|
+
cout << "Original number of items: " << L
|
|
152
153
|
<< " Reduced to: " << real_L << endl;
|
|
153
154
|
|
|
154
155
|
L = real_L;
|