effspm 0.3.0__cp312-cp312-macosx_10_9_x86_64.whl → 0.3.1__cp312-cp312-macosx_10_9_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- effspm/_effspm.cpp +550 -2
- effspm/_effspm.cpython-312-darwin.so +0 -0
- effspm/btminer/src/load_inst.cpp +11 -10
- effspm/btminer/src/main.cpp +83 -0
- effspm/htminer/src/build_mdd.cpp +41 -66
- effspm/htminer/src/build_mdd.hpp +56 -49
- effspm/htminer/src/freq_miner.cpp +341 -307
- effspm/htminer/src/freq_miner.hpp +39 -40
- effspm/htminer/src/load_inst.cpp +287 -336
- effspm/htminer/src/load_inst.hpp +23 -6
- effspm/htminer/src/main.cpp +97 -0
- effspm/htminer/src/utility.cpp +38 -57
- effspm/htminer/src/utility.hpp +9 -64
- effspm/largebm/src/main.cpp +95 -0
- effspm/largehm/src/build_mdd.cpp +75 -110
- effspm/largehm/src/build_mdd.hpp +53 -73
- effspm/largehm/src/freq_miner.cpp +132 -173
- effspm/largehm/src/freq_miner.hpp +37 -60
- effspm/largehm/src/load_inst.cpp +136 -191
- effspm/largehm/src/load_inst.hpp +13 -50
- effspm/largehm/src/main.cpp +95 -0
- effspm/largehm/src/utility.cpp +46 -28
- effspm/largehm/src/utility.hpp +18 -16
- effspm/largepp/src/load_inst.cpp +5 -4
- effspm/largepp/src/main.cpp +108 -0
- effspm/load_inst.cpp +8 -8
- effspm/main.cpp +103 -0
- {effspm-0.3.0.dist-info → effspm-0.3.1.dist-info}/METADATA +1 -1
- effspm-0.3.1.dist-info/RECORD +60 -0
- effspm-0.3.0.dist-info/RECORD +0 -54
- {effspm-0.3.0.dist-info → effspm-0.3.1.dist-info}/WHEEL +0 -0
- {effspm-0.3.0.dist-info → effspm-0.3.1.dist-info}/licenses/LICENSE +0 -0
- {effspm-0.3.0.dist-info → effspm-0.3.1.dist-info}/top_level.txt +0 -0
effspm/_effspm.cpp
CHANGED
|
@@ -2,6 +2,553 @@
|
|
|
2
2
|
|
|
3
3
|
#include <pybind11/pybind11.h>
|
|
4
4
|
#include <pybind11/stl.h>
|
|
5
|
+
|
|
6
|
+
#include <iostream>
|
|
7
|
+
#include <fstream>
|
|
8
|
+
#include <cstdio> // std::remove
|
|
9
|
+
#include <vector>
|
|
10
|
+
#include <string>
|
|
11
|
+
#include <ctime>
|
|
12
|
+
#include <cmath>
|
|
13
|
+
|
|
14
|
+
namespace py = pybind11;
|
|
15
|
+
|
|
16
|
+
// PrefixProjection headers (global namespace)
|
|
17
|
+
#include "freq_miner.hpp"
|
|
18
|
+
#include "load_inst.hpp"
|
|
19
|
+
#include "utility.hpp"
|
|
20
|
+
|
|
21
|
+
// BTMiner (namespaced)
|
|
22
|
+
#include "btminer/src/freq_miner.hpp"
|
|
23
|
+
#include "btminer/src/load_inst.hpp"
|
|
24
|
+
#include "btminer/src/utility.hpp"
|
|
25
|
+
#include "btminer/src/build_mdd.hpp"
|
|
26
|
+
|
|
27
|
+
// HTMiner (namespaced)
|
|
28
|
+
#include "htminer/src/build_mdd.hpp"
|
|
29
|
+
#include "htminer/src/freq_miner.hpp"
|
|
30
|
+
#include "htminer/src/load_inst.hpp"
|
|
31
|
+
#include "htminer/src/utility.hpp"
|
|
32
|
+
|
|
33
|
+
// LargePrefixProjection
|
|
34
|
+
#include "largepp/src/freq_miner.hpp"
|
|
35
|
+
#include "largepp/src/load_inst.hpp"
|
|
36
|
+
#include "largepp/src/utility.hpp"
|
|
37
|
+
|
|
38
|
+
// LargeBTMiner
|
|
39
|
+
#include "largebm/src/freq_miner.hpp"
|
|
40
|
+
#include "largebm/src/load_inst.hpp"
|
|
41
|
+
#include "largebm/src/utility.hpp"
|
|
42
|
+
#include "largebm/src/build_mdd.hpp"
|
|
43
|
+
|
|
44
|
+
// LargeHTMiner
|
|
45
|
+
#include "largehm/src/freq_miner.hpp"
|
|
46
|
+
#include "largehm/src/load_inst.hpp"
|
|
47
|
+
#include "largehm/src/utility.hpp"
|
|
48
|
+
#include "largehm/src/build_mdd.hpp"
|
|
49
|
+
|
|
50
|
+
namespace {
|
|
51
|
+
|
|
52
|
+
// Scope guard for a temporary file: when the guard is destroyed, the file
// named by `path` is deleted, unless `path` is empty.
//
// Intended use is `TempFile tmp; tmp.path = <file to clean up>;`.
// Copying is disabled so that two guards can never hold (and later try to
// delete) the same path.
struct TempFile {
    // File to delete on destruction; an empty string means "nothing to delete".
    std::string path;

    TempFile() = default;
    TempFile(const TempFile&) = delete;
    TempFile& operator=(const TempFile&) = delete;

    ~TempFile() {
        if (!path.empty()) {
            // Best effort: the result of std::remove is ignored because a
            // destructor has no way to report the failure.
            std::remove(path.c_str());
        }
    }
};
|
|
61
|
+
|
|
62
|
+
// Writes in-memory sequences to a newly named temporary text file and returns
// the file's path.
//
// File format: one sequence per line, items separated by single spaces, every
// line (including the line of an empty sequence) terminated by '\n'.
//
// @param seqs  sequences to serialise; may be empty (produces an empty file).
// @return      path of the written file (the generated temp name + ".txt").
//              The caller is responsible for deleting it.
// @throws std::runtime_error if no temporary name can be generated, the file
//         cannot be opened, or writing/closing the file fails. In the last
//         case the partially written file is removed before throwing.
std::string write_temp_seq_file(const std::vector<std::vector<int>>& seqs) {
    char tmp_name[L_tmpnam];
    // NOTE(review): std::tmpnam only produces a name; the file is created
    // later by std::ofstream, so another process could create it in between.
    // Consider an atomic create-and-open primitive if that matters here.
    if (!std::tmpnam(tmp_name)) {
        throw std::runtime_error("Failed to create temporary file name");
    }
    std::string path = std::string(tmp_name) + ".txt";

    std::ofstream ofs(path);
    if (!ofs) {
        throw std::runtime_error("Failed to open temporary file for writing: " + path);
    }

    for (const auto& seq : seqs) {
        for (size_t i = 0; i < seq.size(); ++i) {
            if (i) ofs << ' ';
            ofs << seq[i];
        }
        ofs << '\n';
    }

    // close() flushes the buffer; checking the stream afterwards catches both
    // earlier write errors and a failed flush, so that a truncated file is
    // never handed to the caller as if it were complete.
    ofs.close();
    if (!ofs) {
        std::remove(path.c_str());
        throw std::runtime_error("Failed to write temporary file: " + path);
    }
    return path;
}
|
|
87
|
+
|
|
88
|
+
} // anonymous namespace
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
PYBIND11_MODULE(_effspm, m) {
|
|
92
|
+
m.doc() = "Unified SPM library: PrefixProjection, BTMiner, HTMiner, Large* variants";
|
|
93
|
+
|
|
94
|
+
// ─────────────────────────────────────────────────────────────
|
|
95
|
+
// PrefixProjection (works directly on Python lists or files)
|
|
96
|
+
// ─────────────────────────────────────────────────────────────
|
|
97
|
+
m.def("PrefixProjection",
|
|
98
|
+
[](py::object data,
|
|
99
|
+
double minsup,
|
|
100
|
+
unsigned int time_limit,
|
|
101
|
+
bool preproc,
|
|
102
|
+
bool use_dic,
|
|
103
|
+
bool verbose,
|
|
104
|
+
const std::string &out_file)
|
|
105
|
+
{
|
|
106
|
+
::time_limit = time_limit;
|
|
107
|
+
::pre_pro = preproc;
|
|
108
|
+
::use_dic = use_dic;
|
|
109
|
+
::use_list = false;
|
|
110
|
+
::b_disp = verbose; // controls prints in original code
|
|
111
|
+
::b_write = !out_file.empty();
|
|
112
|
+
::out_file = out_file;
|
|
113
|
+
|
|
114
|
+
ClearCollected();
|
|
115
|
+
start_time = std::clock();
|
|
116
|
+
|
|
117
|
+
if (py::isinstance<py::str>(data)) {
|
|
118
|
+
std::string path = data.cast<std::string>();
|
|
119
|
+
if (!Load_instance(path, minsup))
|
|
120
|
+
throw std::runtime_error("PrefixProjection: failed to load file: " + path);
|
|
121
|
+
} else {
|
|
122
|
+
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
123
|
+
items = std::move(seqs);
|
|
124
|
+
N = items.size();
|
|
125
|
+
|
|
126
|
+
int max_id = 0;
|
|
127
|
+
for (auto &seq : items)
|
|
128
|
+
for (int x : seq)
|
|
129
|
+
max_id = std::max(max_id, std::abs(x));
|
|
130
|
+
L = max_id;
|
|
131
|
+
|
|
132
|
+
theta = (minsup < 1.0) ? std::ceil(minsup * N) : minsup;
|
|
133
|
+
|
|
134
|
+
DFS.clear();
|
|
135
|
+
DFS.reserve(L);
|
|
136
|
+
for (unsigned int i = 0; i < L; ++i)
|
|
137
|
+
DFS.emplace_back(-static_cast<int>(i) - 1);
|
|
138
|
+
|
|
139
|
+
M = 0;
|
|
140
|
+
E = 0;
|
|
141
|
+
for (auto &seq : items) {
|
|
142
|
+
M = std::max<unsigned int>(M, seq.size());
|
|
143
|
+
E += seq.size();
|
|
144
|
+
}
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
Freq_miner();
|
|
148
|
+
|
|
149
|
+
py::dict out;
|
|
150
|
+
out["patterns"] = GetCollected();
|
|
151
|
+
out["time"] = give_time(std::clock() - start_time);
|
|
152
|
+
return out;
|
|
153
|
+
},
|
|
154
|
+
py::arg("data"),
|
|
155
|
+
py::arg("minsup") = 0.01,
|
|
156
|
+
py::arg("time_limit") = 36000,
|
|
157
|
+
py::arg("preproc") = false,
|
|
158
|
+
py::arg("use_dic") = false,
|
|
159
|
+
py::arg("verbose") = false,
|
|
160
|
+
py::arg("out_file") = ""
|
|
161
|
+
);
|
|
162
|
+
|
|
163
|
+
// ─────────────────────────────────────────────────────────────
|
|
164
|
+
// BTMiner (always uses professor's Load_instance)
|
|
165
|
+
// ─────────────────────────────────────────────────────────────
|
|
166
|
+
m.def("BTMiner",
|
|
167
|
+
[](py::object data,
|
|
168
|
+
double minsup,
|
|
169
|
+
unsigned int time_limit,
|
|
170
|
+
bool preproc,
|
|
171
|
+
bool use_dic,
|
|
172
|
+
bool verbose,
|
|
173
|
+
const std::string &out_file)
|
|
174
|
+
{
|
|
175
|
+
// Configure professor globals
|
|
176
|
+
btminer::time_limit = static_cast<int>(time_limit);
|
|
177
|
+
btminer::pre_pro = preproc;
|
|
178
|
+
btminer::use_dic = use_dic;
|
|
179
|
+
btminer::b_disp = verbose;
|
|
180
|
+
btminer::b_write = !out_file.empty();
|
|
181
|
+
btminer::out_file = out_file;
|
|
182
|
+
btminer::N_mult = 1;
|
|
183
|
+
btminer::M_mult = 1;
|
|
184
|
+
btminer::just_build = false;
|
|
185
|
+
|
|
186
|
+
btminer::ClearCollected();
|
|
187
|
+
btminer::start_time = std::clock();
|
|
188
|
+
|
|
189
|
+
TempFile tmp;
|
|
190
|
+
std::string path;
|
|
191
|
+
|
|
192
|
+
if (py::isinstance<py::str>(data)) {
|
|
193
|
+
// File path: use directly
|
|
194
|
+
path = data.cast<std::string>();
|
|
195
|
+
} else {
|
|
196
|
+
// Python list → write to a temp file in the same format
|
|
197
|
+
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
198
|
+
tmp.path = write_temp_seq_file(seqs);
|
|
199
|
+
path = tmp.path;
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
if (verbose) {
|
|
203
|
+
std::cerr << "[BTMiner] path=" << path
|
|
204
|
+
<< " minsup=" << minsup
|
|
205
|
+
<< " preproc=" << preproc
|
|
206
|
+
<< " use_dic=" << use_dic
|
|
207
|
+
<< std::endl;
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
if (!btminer::Load_instance(path, minsup)) {
|
|
211
|
+
throw std::runtime_error("BTMiner: failed to load instance from: " + path);
|
|
212
|
+
}
|
|
213
|
+
|
|
214
|
+
btminer::Freq_miner();
|
|
215
|
+
|
|
216
|
+
py::dict out;
|
|
217
|
+
out["patterns"] = btminer::GetCollected();
|
|
218
|
+
out["num_patterns"] = btminer::num_patt;
|
|
219
|
+
out["time"] = btminer::give_time(std::clock() - btminer::start_time);
|
|
220
|
+
out["N"] = btminer::N;
|
|
221
|
+
out["L"] = btminer::L;
|
|
222
|
+
out["theta"] = btminer::theta;
|
|
223
|
+
return out;
|
|
224
|
+
},
|
|
225
|
+
py::arg("data"),
|
|
226
|
+
py::arg("minsup") = 0.01,
|
|
227
|
+
py::arg("time_limit") = 36000,
|
|
228
|
+
py::arg("preproc") = false,
|
|
229
|
+
py::arg("use_dic") = false,
|
|
230
|
+
py::arg("verbose") = false,
|
|
231
|
+
py::arg("out_file") = ""
|
|
232
|
+
);
|
|
233
|
+
|
|
234
|
+
// ─────────────────────────────────────────────────────────────
|
|
235
|
+
// HTMiner (works on files; we use a temp file for in-memory data)
|
|
236
|
+
// ─────────────────────────────────────────────────────────────
|
|
237
|
+
// ─────────────────────────────────────────────────────────────
|
|
238
|
+
// HTMiner (always uses professor's Load_instance; pre_pro forced ON)
|
|
239
|
+
// ─────────────────────────────────────────────────────────────
|
|
240
|
+
m.def("HTMiner",
|
|
241
|
+
[](py::object data,
|
|
242
|
+
double minsup,
|
|
243
|
+
unsigned int time_limit,
|
|
244
|
+
bool /*preproc*/, // Python arg is ignored internally
|
|
245
|
+
bool use_dic,
|
|
246
|
+
bool verbose,
|
|
247
|
+
const std::string &out_file)
|
|
248
|
+
{
|
|
249
|
+
using namespace htminer;
|
|
250
|
+
|
|
251
|
+
// ───────── Global parameter setup ─────────
|
|
252
|
+
htminer::time_limit = time_limit;
|
|
253
|
+
|
|
254
|
+
// IMPORTANT: always run with preprocessing ON,
|
|
255
|
+
// regardless of the Python `preproc` flag.
|
|
256
|
+
htminer::pre_pro = true;
|
|
257
|
+
htminer::use_dic = use_dic;
|
|
258
|
+
htminer::just_build = false;
|
|
259
|
+
htminer::b_disp = verbose;
|
|
260
|
+
htminer::b_write = !out_file.empty();
|
|
261
|
+
htminer::out_file = out_file;
|
|
262
|
+
|
|
263
|
+
// ───────── HARD RESET of HTMiner globals ─────────
|
|
264
|
+
htminer::ClearCollected();
|
|
265
|
+
htminer::Tree.clear();
|
|
266
|
+
htminer::VTree.clear();
|
|
267
|
+
htminer::CTree.clear();
|
|
268
|
+
htminer::DFS.clear();
|
|
269
|
+
htminer::VDFS.clear();
|
|
270
|
+
htminer::item_dic.clear();
|
|
271
|
+
|
|
272
|
+
htminer::M = 0;
|
|
273
|
+
htminer::N = 0;
|
|
274
|
+
htminer::L = 0;
|
|
275
|
+
htminer::E = 0;
|
|
276
|
+
htminer::theta = 0;
|
|
277
|
+
htminer::mlim = 0;
|
|
278
|
+
htminer::itmset_exists = false;
|
|
279
|
+
|
|
280
|
+
// NOTE: do NOT add a root arc here;
|
|
281
|
+
// htminer::Load_instance() already does Tree.emplace_back(0,0,0)
|
|
282
|
+
htminer::start_time = std::clock();
|
|
283
|
+
|
|
284
|
+
// ───────── Handle input (path or in-memory sequences) ─────────
|
|
285
|
+
TempFile tmp;
|
|
286
|
+
std::string path;
|
|
287
|
+
|
|
288
|
+
if (py::isinstance<py::str>(data)) {
|
|
289
|
+
// data is a file path
|
|
290
|
+
path = data.cast<std::string>();
|
|
291
|
+
} else {
|
|
292
|
+
// data is a list[list[int]] → write a temp file in the same text format
|
|
293
|
+
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
294
|
+
tmp.path = write_temp_seq_file(seqs);
|
|
295
|
+
path = tmp.path;
|
|
296
|
+
}
|
|
297
|
+
|
|
298
|
+
if (verbose) {
|
|
299
|
+
std::cerr << "[HTMiner] path=" << path
|
|
300
|
+
<< " minsup=" << minsup
|
|
301
|
+
<< " preproc(always)=true"
|
|
302
|
+
<< " use_dic=" << use_dic
|
|
303
|
+
<< std::endl;
|
|
304
|
+
}
|
|
305
|
+
|
|
306
|
+
// ───────── Build MDD via professor's loader ─────────
|
|
307
|
+
if (!htminer::Load_instance(path, minsup)) {
|
|
308
|
+
throw std::runtime_error("HTMiner: failed to load instance from: " + path);
|
|
309
|
+
}
|
|
310
|
+
|
|
311
|
+
// ───────── Run miner ─────────
|
|
312
|
+
htminer::Freq_miner();
|
|
313
|
+
|
|
314
|
+
// ───────── Return results ─────────
|
|
315
|
+
py::dict out;
|
|
316
|
+
out["patterns"] = htminer::GetCollected();
|
|
317
|
+
out["time"] = htminer::give_time(std::clock() - htminer::start_time);
|
|
318
|
+
return out;
|
|
319
|
+
},
|
|
320
|
+
py::arg("data"),
|
|
321
|
+
py::arg("minsup") = 0.01,
|
|
322
|
+
py::arg("time_limit") = 36000,
|
|
323
|
+
py::arg("preproc") = false, // kept for API symmetry, but IGNORED
|
|
324
|
+
py::arg("use_dic") = false,
|
|
325
|
+
py::arg("verbose") = false,
|
|
326
|
+
py::arg("out_file") = ""
|
|
327
|
+
);
|
|
328
|
+
|
|
329
|
+
// ─────────────────────────────────────────────────────────────
|
|
330
|
+
// LargePrefixProjection (already has its own Load_py)
|
|
331
|
+
// ─────────────────────────────────────────────────────────────
|
|
332
|
+
m.def("LargePrefixProjection",
|
|
333
|
+
[](py::object data,
|
|
334
|
+
double minsup,
|
|
335
|
+
unsigned int time_limit,
|
|
336
|
+
bool preproc,
|
|
337
|
+
bool use_dic,
|
|
338
|
+
bool verbose,
|
|
339
|
+
const std::string &out_file)
|
|
340
|
+
{
|
|
341
|
+
largepp::time_limit = time_limit;
|
|
342
|
+
largepp::pre_pro = preproc;
|
|
343
|
+
largepp::use_dic = use_dic;
|
|
344
|
+
largepp::use_list = true; // large prefix uses list-based mining
|
|
345
|
+
largepp::b_disp = verbose;
|
|
346
|
+
largepp::b_write = !out_file.empty();
|
|
347
|
+
largepp::out_file = out_file;
|
|
348
|
+
largepp::just_build = false;
|
|
349
|
+
|
|
350
|
+
largepp::ClearCollected();
|
|
351
|
+
largepp::start_time = std::clock();
|
|
352
|
+
|
|
353
|
+
if (py::isinstance<py::str>(data)) {
|
|
354
|
+
std::string fname = data.cast<std::string>();
|
|
355
|
+
largepp::Load_instance(fname, minsup);
|
|
356
|
+
} else {
|
|
357
|
+
largepp::Load_py(data, minsup);
|
|
358
|
+
}
|
|
359
|
+
|
|
360
|
+
largepp::Freq_miner();
|
|
361
|
+
|
|
362
|
+
py::dict out;
|
|
363
|
+
out["patterns"] = largepp::GetCollected();
|
|
364
|
+
out["time"] = largepp::give_time(std::clock() - largepp::start_time);
|
|
365
|
+
return out;
|
|
366
|
+
},
|
|
367
|
+
py::arg("data"),
|
|
368
|
+
py::arg("minsup") = 0.01,
|
|
369
|
+
py::arg("time_limit") = 36000,
|
|
370
|
+
py::arg("preproc") = false,
|
|
371
|
+
py::arg("use_dic") = false,
|
|
372
|
+
py::arg("verbose") = false,
|
|
373
|
+
py::arg("out_file") = ""
|
|
374
|
+
);
|
|
375
|
+
|
|
376
|
+
// ─────────────────────────────────────────────────────────────
|
|
377
|
+
// LargeBTMiner (always uses professor's largebm::Load_instance)
|
|
378
|
+
// ─────────────────────────────────────────────────────────────
|
|
379
|
+
m.def("LargeBTMiner",
|
|
380
|
+
[](py::object data,
|
|
381
|
+
double minsup,
|
|
382
|
+
unsigned int time_limit,
|
|
383
|
+
bool preproc,
|
|
384
|
+
bool use_dic,
|
|
385
|
+
bool verbose,
|
|
386
|
+
const std::string &out_file)
|
|
387
|
+
{
|
|
388
|
+
using namespace largebm;
|
|
389
|
+
|
|
390
|
+
largebm::time_limit = time_limit;
|
|
391
|
+
largebm::pre_pro = preproc;
|
|
392
|
+
largebm::use_dic = use_dic;
|
|
393
|
+
largebm::use_list = false; // MDD-based
|
|
394
|
+
largebm::b_disp = verbose;
|
|
395
|
+
largebm::b_write = !out_file.empty();
|
|
396
|
+
largebm::out_file = out_file;
|
|
397
|
+
largebm::just_build = false;
|
|
398
|
+
|
|
399
|
+
largebm::ClearCollected();
|
|
400
|
+
largebm::items.clear();
|
|
401
|
+
largebm::item_dic.clear();
|
|
402
|
+
largebm::inv_item_dic.clear();
|
|
403
|
+
largebm::Tree.clear();
|
|
404
|
+
largebm::DFS.clear();
|
|
405
|
+
|
|
406
|
+
largebm::start_time = std::clock();
|
|
407
|
+
|
|
408
|
+
TempFile tmp;
|
|
409
|
+
std::string path;
|
|
410
|
+
|
|
411
|
+
if (py::isinstance<py::str>(data)) {
|
|
412
|
+
path = data.cast<std::string>();
|
|
413
|
+
} else {
|
|
414
|
+
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
415
|
+
tmp.path = write_temp_seq_file(seqs);
|
|
416
|
+
path = tmp.path;
|
|
417
|
+
}
|
|
418
|
+
|
|
419
|
+
if (verbose) {
|
|
420
|
+
std::cerr << "[LargeBTMiner] path=" << path
|
|
421
|
+
<< " minsup=" << minsup
|
|
422
|
+
<< " preproc=" << preproc
|
|
423
|
+
<< " use_dic=" << use_dic
|
|
424
|
+
<< std::endl;
|
|
425
|
+
}
|
|
426
|
+
|
|
427
|
+
if (!largebm::Load_instance(path, minsup)) {
|
|
428
|
+
throw std::runtime_error("LargeBTMiner: failed to load instance from: " + path);
|
|
429
|
+
}
|
|
430
|
+
|
|
431
|
+
largebm::Freq_miner();
|
|
432
|
+
|
|
433
|
+
py::dict out;
|
|
434
|
+
out["patterns"] = largebm::GetCollected();
|
|
435
|
+
out["time"] = largebm::give_time(std::clock() - largebm::start_time);
|
|
436
|
+
return out;
|
|
437
|
+
},
|
|
438
|
+
py::arg("data"),
|
|
439
|
+
py::arg("minsup") = 0.01,
|
|
440
|
+
py::arg("time_limit") = 36000,
|
|
441
|
+
py::arg("preproc") = false,
|
|
442
|
+
py::arg("use_dic") = false,
|
|
443
|
+
py::arg("verbose") = false,
|
|
444
|
+
py::arg("out_file") = ""
|
|
445
|
+
);
|
|
446
|
+
|
|
447
|
+
// ─────────────────────────────────────────────────────────────
|
|
448
|
+
// LargeHTMiner (always uses professor's largehm::Load_instance; pre_pro forced ON)
|
|
449
|
+
// ─────────────────────────────────────────────────────────────
|
|
450
|
+
// ─────────────────────────────────────────────────────────────
|
|
451
|
+
// LargeHTMiner (professor's Large HTMiner, namespaced as largehm)
|
|
452
|
+
// ─────────────────────────────────────────────────────────────
|
|
453
|
+
m.def("LargeHTMiner",
|
|
454
|
+
[](py::object data,
|
|
455
|
+
double minsup,
|
|
456
|
+
unsigned int time_limit,
|
|
457
|
+
bool /*preproc*/, // kept for API symmetry; ignored
|
|
458
|
+
bool use_dic,
|
|
459
|
+
bool verbose,
|
|
460
|
+
const std::string &out_file)
|
|
461
|
+
{
|
|
462
|
+
using namespace largehm;
|
|
463
|
+
|
|
464
|
+
// 1) Global configuration (mirror professor's style)
|
|
465
|
+
largehm::time_limit = time_limit;
|
|
466
|
+
largehm::pre_pro = true; // always preprocess
|
|
467
|
+
largehm::use_dic = use_dic;
|
|
468
|
+
largehm::just_build = false;
|
|
469
|
+
largehm::b_disp = verbose;
|
|
470
|
+
largehm::b_write = !out_file.empty();
|
|
471
|
+
largehm::out_file = out_file;
|
|
472
|
+
|
|
473
|
+
// 2) HARD RESET of all global state for a fresh run
|
|
474
|
+
largehm::ClearCollected(); // our helper in largehm::utility.cpp
|
|
475
|
+
|
|
476
|
+
largehm::M = 0;
|
|
477
|
+
largehm::L = 0;
|
|
478
|
+
largehm::mlim = 0;
|
|
479
|
+
largehm::N = 0;
|
|
480
|
+
largehm::theta = 0;
|
|
481
|
+
largehm::E = 0;
|
|
482
|
+
largehm::itmset_exists = false;
|
|
483
|
+
|
|
484
|
+
// containers
|
|
485
|
+
// (item_dic reset is optional and not strictly needed here)
|
|
486
|
+
largehm::DFS.clear();
|
|
487
|
+
largehm::VDFS.clear();
|
|
488
|
+
largehm::Tree.clear();
|
|
489
|
+
largehm::VTree.clear();
|
|
490
|
+
largehm::CTree.clear();
|
|
491
|
+
|
|
492
|
+
largehm::start_time = std::clock();
|
|
493
|
+
|
|
494
|
+
// 3) Handle input (file path or Python list)
|
|
495
|
+
TempFile tmp;
|
|
496
|
+
std::string path;
|
|
497
|
+
|
|
498
|
+
if (py::isinstance<py::str>(data)) {
|
|
499
|
+
path = data.cast<std::string>();
|
|
500
|
+
} else {
|
|
501
|
+
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
502
|
+
tmp.path = write_temp_seq_file(seqs);
|
|
503
|
+
path = tmp.path;
|
|
504
|
+
}
|
|
505
|
+
|
|
506
|
+
if (verbose) {
|
|
507
|
+
std::cerr << "[LargeHTMiner] path=" << path
|
|
508
|
+
<< " minsup=" << minsup
|
|
509
|
+
<< " preproc(always)=true"
|
|
510
|
+
<< " use_dic=" << use_dic
|
|
511
|
+
<< std::endl;
|
|
512
|
+
}
|
|
513
|
+
|
|
514
|
+
// 4) Build MDD / load instance.
|
|
515
|
+
// NOTE: Load_instance() itself does Tree.emplace_back(0,0,0),
|
|
516
|
+
// so we DO NOT create a root node here.
|
|
517
|
+
if (!largehm::Load_instance(path, minsup)) {
|
|
518
|
+
throw std::runtime_error("LargeHTMiner: failed to load instance from: " + path);
|
|
519
|
+
}
|
|
520
|
+
|
|
521
|
+
// 5) Run miner (same timing logic as original main)
|
|
522
|
+
if (!largehm::just_build &&
|
|
523
|
+
largehm::give_time(std::clock() - largehm::start_time) < largehm::time_limit)
|
|
524
|
+
{
|
|
525
|
+
largehm::Freq_miner();
|
|
526
|
+
if (largehm::give_time(std::clock() - largehm::start_time) >= largehm::time_limit) {
|
|
527
|
+
std::cout << "TIME LIMIT REACHED\n";
|
|
528
|
+
}
|
|
529
|
+
}
|
|
530
|
+
|
|
531
|
+
// 6) Return collected patterns + runtime
|
|
532
|
+
py::dict out;
|
|
533
|
+
out["patterns"] = largehm::GetCollected();
|
|
534
|
+
out["time"] = largehm::give_time(std::clock() - largehm::start_time);
|
|
535
|
+
return out;
|
|
536
|
+
},
|
|
537
|
+
py::arg("data"),
|
|
538
|
+
py::arg("minsup") = 0.01,
|
|
539
|
+
py::arg("time_limit") = 36000,
|
|
540
|
+
py::arg("preproc") = false, // kept for API symmetry
|
|
541
|
+
py::arg("use_dic") = false,
|
|
542
|
+
py::arg("verbose") = false,
|
|
543
|
+
py::arg("out_file") = ""
|
|
544
|
+
);
|
|
545
|
+
|
|
546
|
+
|
|
547
|
+
}
|
|
548
|
+
|
|
549
|
+
|
|
550
|
+
/*#include <pybind11/pybind11.h>
|
|
551
|
+
#include <pybind11/stl.h>
|
|
5
552
|
namespace py = pybind11;
|
|
6
553
|
#include <iostream>
|
|
7
554
|
|
|
@@ -28,6 +575,7 @@ namespace py = pybind11;
|
|
|
28
575
|
#include "largepp/src/load_inst.hpp"
|
|
29
576
|
#include "largepp/src/utility.hpp"
|
|
30
577
|
|
|
578
|
+
|
|
31
579
|
#include "largebm/src/freq_miner.hpp"
|
|
32
580
|
#include "largebm/src/load_inst.hpp"
|
|
33
581
|
#include "largebm/src/utility.hpp"
|
|
@@ -469,7 +1017,7 @@ m.def("HTMiner",
|
|
|
469
1017
|
// ─────────────────────────────────────────────────────────────────────────
|
|
470
1018
|
// LargeBTMiner (MDD-based)
|
|
471
1019
|
// ─────────────────────────────────────────────────────────────────────────
|
|
472
|
-
m.def("LargeBTMiner",
|
|
1020
|
+
/*m.def("LargeBTMiner",
|
|
473
1021
|
[](py::object data,
|
|
474
1022
|
double minsup,
|
|
475
1023
|
unsigned int time_limit,
|
|
@@ -676,4 +1224,4 @@ m.def("LargeHTMiner",
|
|
|
676
1224
|
|
|
677
1225
|
|
|
678
1226
|
|
|
679
|
-
}
|
|
1227
|
+
} */
|
|
Binary file
|
effspm/btminer/src/load_inst.cpp
CHANGED
|
@@ -67,8 +67,8 @@ bool Load_instance(string &items_file, double thresh) {
|
|
|
67
67
|
if (pre_pro) {
|
|
68
68
|
if (!Preprocess(items_file, thresh))
|
|
69
69
|
return false;
|
|
70
|
-
|
|
71
|
-
|
|
70
|
+
if (b_disp)
|
|
71
|
+
cout << "\nPreprocess done in " << give_time(clock() - kk) << " seconds\n\n";
|
|
72
72
|
|
|
73
73
|
// build empty DFS of size L
|
|
74
74
|
DFS.clear();
|
|
@@ -88,12 +88,13 @@ bool Load_instance(string &items_file, double thresh) {
|
|
|
88
88
|
else
|
|
89
89
|
theta = static_cast<int>(thresh);
|
|
90
90
|
}
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
91
|
+
if (b_disp)
|
|
92
|
+
cout << "\nMDD Database built in " << give_time(clock() - kk) << " seconds\n\n";
|
|
93
|
+
if (b_disp)
|
|
94
|
+
cout << "Found " << N * N_mult
|
|
95
|
+
<< " sequence, with max line len " << M
|
|
96
|
+
<< ", and " << L << " items, and " << E << " enteries\n";
|
|
97
|
+
//cout << "Total MDD nodes: " << Tree.size() << endl;
|
|
97
98
|
|
|
98
99
|
return true;
|
|
99
100
|
}
|
|
@@ -147,8 +148,8 @@ bool Preprocess(string &inst, double thresh) {
|
|
|
147
148
|
if (freq[i] >= theta)
|
|
148
149
|
item_dic[i] = ++real_L;
|
|
149
150
|
}
|
|
150
|
-
|
|
151
|
-
|
|
151
|
+
if (b_disp)
|
|
152
|
+
cout << "Original number of items: " << L
|
|
152
153
|
<< " Reduced to: " << real_L << endl;
|
|
153
154
|
|
|
154
155
|
L = real_L;
|