effspm 0.2.8__cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl → 0.3.0__cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- effspm/_effspm.cpp +310 -240
- effspm/_effspm.cpython-311-i386-linux-gnu.so +0 -0
- effspm/btminer/src/build_mdd.cpp +42 -17
- effspm/btminer/src/build_mdd.hpp +13 -19
- effspm/btminer/src/freq_miner.cpp +134 -49
- effspm/btminer/src/freq_miner.hpp +16 -0
- effspm/btminer/src/load_inst.cpp +196 -121
- effspm/btminer/src/load_inst.hpp +22 -4
- effspm/btminer/src/utility.cpp +26 -41
- effspm/btminer/src/utility.hpp +6 -30
- effspm/freq_miner.hpp +2 -1
- effspm/htminer/src/build_mdd.cpp +33 -86
- effspm/largebm/src/build_mdd.cpp +69 -110
- effspm/largebm/src/build_mdd.hpp +22 -37
- effspm/largebm/src/freq_miner.cpp +241 -291
- effspm/largebm/src/freq_miner.hpp +25 -36
- effspm/largebm/src/load_inst.cpp +20 -26
- effspm/largebm/src/load_inst.hpp +24 -34
- effspm/largebm/src/utility.cpp +11 -21
- effspm/largebm/src/utility.hpp +7 -10
- effspm/largehm/src/freq_miner.cpp +62 -78
- effspm/largehm/src/load_inst.cpp +79 -61
- effspm/largepp/src/freq_miner.cpp +184 -156
- effspm/largepp/src/freq_miner.hpp +11 -36
- effspm/largepp/src/load_inst.cpp +27 -8
- effspm/largepp/src/load_inst.hpp +15 -9
- effspm/largepp/src/pattern.hpp +31 -0
- effspm/load_inst.hpp +1 -1
- {effspm-0.2.8.dist-info → effspm-0.3.0.dist-info}/METADATA +1 -1
- effspm-0.3.0.dist-info/RECORD +54 -0
- effspm-0.2.8.dist-info/RECORD +0 -53
- {effspm-0.2.8.dist-info → effspm-0.3.0.dist-info}/WHEEL +0 -0
- {effspm-0.2.8.dist-info → effspm-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {effspm-0.2.8.dist-info → effspm-0.3.0.dist-info}/top_level.txt +0 -0
effspm/_effspm.cpp
CHANGED
|
@@ -111,165 +111,223 @@ PYBIND11_MODULE(_effspm, m) {
|
|
|
111
111
|
py::arg("verbose") = false,
|
|
112
112
|
py::arg("out_file") = ""
|
|
113
113
|
);
|
|
114
|
+
m.def("BTMiner",
|
|
115
|
+
[](py::object data,
|
|
116
|
+
double minsup,
|
|
117
|
+
unsigned int time_limit,
|
|
118
|
+
bool preproc,
|
|
119
|
+
bool use_dic,
|
|
120
|
+
bool verbose,
|
|
121
|
+
const std::string &out_file)
|
|
122
|
+
{
|
|
123
|
+
// We are calling the *professor* BTMiner, now namespaced as btminer::.
|
|
124
|
+
// So we only set the globals the professor code actually has.
|
|
125
|
+
|
|
126
|
+
// 1) configure professor globals
|
|
127
|
+
btminer::time_limit = static_cast<int>(time_limit);
|
|
128
|
+
btminer::pre_pro = preproc;
|
|
129
|
+
btminer::use_dic = use_dic;
|
|
130
|
+
btminer::b_disp = verbose;
|
|
131
|
+
btminer::b_write = !out_file.empty();
|
|
132
|
+
btminer::out_file = out_file;
|
|
133
|
+
btminer::N_mult = 1; // professor uses these too
|
|
134
|
+
btminer::M_mult = 1;
|
|
135
|
+
btminer::just_build = false; // we want full mining
|
|
136
|
+
|
|
137
|
+
btminer::start_time = std::clock();
|
|
138
|
+
|
|
139
|
+
// 2) load data
|
|
140
|
+
//
|
|
141
|
+
// Professor’s code is primarily file-based (Load_instance(const string&, double)).
|
|
142
|
+
// So: if user passes a file path → use the professor loader directly.
|
|
143
|
+
// If user passes a Python list-of-lists → we will build the MDD the same
|
|
144
|
+
// way professor’s loader does, but without changing his logic.
|
|
145
|
+
if (py::isinstance<py::str>(data)) {
|
|
146
|
+
// ----- FILE MODE -----
|
|
147
|
+
std::string path = data.cast<std::string>();
|
|
114
148
|
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
double minsup,
|
|
121
|
-
unsigned int time_limit,
|
|
122
|
-
bool preproc,
|
|
123
|
-
bool use_dic,
|
|
124
|
-
bool verbose,
|
|
125
|
-
const std::string &out_file)
|
|
126
|
-
{
|
|
127
|
-
btminer::time_limit = time_limit;
|
|
128
|
-
btminer::pre_pro = preproc;
|
|
129
|
-
btminer::use_dic = use_dic;
|
|
130
|
-
btminer::use_list = false;
|
|
131
|
-
btminer::b_disp = verbose;
|
|
132
|
-
btminer::b_write = !out_file.empty();
|
|
133
|
-
btminer::out_file = out_file;
|
|
149
|
+
if (verbose) {
|
|
150
|
+
std::cerr << "[BT][binding] file=" << path
|
|
151
|
+
<< " minsup=" << minsup
|
|
152
|
+
<< " preproc=" << preproc << std::endl;
|
|
153
|
+
}
|
|
134
154
|
|
|
135
|
-
btminer::
|
|
136
|
-
|
|
155
|
+
if (!btminer::Load_instance(path, minsup)) {
|
|
156
|
+
throw std::runtime_error("BTMiner: failed to load file: " + path);
|
|
157
|
+
}
|
|
158
|
+
} else {
|
|
159
|
+
// ----- PYTHON LIST MODE -----
|
|
160
|
+
//
|
|
161
|
+
// We mimic professor’s loader:
|
|
162
|
+
// - create root in Tree
|
|
163
|
+
// - compute N, M, L
|
|
164
|
+
// - compute theta from minsup
|
|
165
|
+
// - seed DFS (one Pattern per item, as in Preprocess branch)
|
|
166
|
+
// - call Build_MDD(...) for each sequence
|
|
167
|
+
//
|
|
168
|
+
// This DOES NOT change his mining logic; it just drives it from memory.
|
|
137
169
|
|
|
138
|
-
|
|
139
|
-
std::string path = data.cast<std::string>();
|
|
140
|
-
if (!btminer::Load_instance(path, minsup))
|
|
141
|
-
throw std::runtime_error("Failed to load file: " + path);
|
|
142
|
-
} else {
|
|
143
|
-
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
144
|
-
btminer::items = std::move(seqs);
|
|
145
|
-
btminer::N = btminer::items.size();
|
|
170
|
+
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
146
171
|
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
max_id = std::max(max_id, std::abs(x));
|
|
151
|
-
btminer::L = max_id;
|
|
172
|
+
// clear MDD and globals to a known state
|
|
173
|
+
btminer::Tree.clear();
|
|
174
|
+
btminer::Tree.emplace_back(0, 0, 0); // root (exactly like professor)
|
|
152
175
|
|
|
153
|
-
|
|
176
|
+
// compute basic stats
|
|
177
|
+
int max_id = 0;
|
|
178
|
+
int max_len = 0;
|
|
179
|
+
int seq_count = 0;
|
|
180
|
+
long long entries = 0;
|
|
181
|
+
|
|
182
|
+
for (const auto &s : seqs) {
|
|
183
|
+
if (s.empty()) continue;
|
|
184
|
+
++seq_count;
|
|
185
|
+
max_len = std::max<int>(max_len, static_cast<int>(s.size()));
|
|
186
|
+
for (int x : s) {
|
|
187
|
+
max_id = std::max(max_id, std::abs(x));
|
|
188
|
+
++entries;
|
|
189
|
+
}
|
|
190
|
+
}
|
|
154
191
|
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
192
|
+
btminer::N = seq_count;
|
|
193
|
+
btminer::M = max_len;
|
|
194
|
+
btminer::L = max_id;
|
|
195
|
+
btminer::E = static_cast<int>(entries);
|
|
196
|
+
|
|
197
|
+
// theta = abs support
|
|
198
|
+
if (minsup < 1.0)
|
|
199
|
+
btminer::theta = static_cast<int>(std::ceil(minsup * btminer::N * btminer::N_mult));
|
|
200
|
+
else
|
|
201
|
+
btminer::theta = static_cast<int>(minsup);
|
|
202
|
+
|
|
203
|
+
// seed DFS exactly like professor does in the preprocessed branch:
|
|
204
|
+
btminer::DFS.clear();
|
|
205
|
+
btminer::DFS.reserve(btminer::L);
|
|
206
|
+
for (int i = 0; i < btminer::L; ++i)
|
|
207
|
+
btminer::DFS.emplace_back(-i - 1);
|
|
208
|
+
|
|
209
|
+
// now build the MDD, sequence by sequence
|
|
210
|
+
for (const auto &s : seqs) {
|
|
211
|
+
if (s.empty()) continue;
|
|
212
|
+
// professor’s Build_MDD takes a vector<int> by non-const ref
|
|
213
|
+
std::vector<int> tmp = s;
|
|
214
|
+
btminer::Build_MDD(tmp);
|
|
215
|
+
}
|
|
159
216
|
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
217
|
+
if (verbose) {
|
|
218
|
+
std::cerr << "[BT][binding] PY mode: N=" << btminer::N
|
|
219
|
+
<< " L=" << btminer::L
|
|
220
|
+
<< " M=" << btminer::M
|
|
221
|
+
<< " E=" << btminer::E
|
|
222
|
+
<< " theta=" << btminer::theta
|
|
223
|
+
<< " Tree.size()=" << btminer::Tree.size()
|
|
224
|
+
<< std::endl;
|
|
166
225
|
}
|
|
226
|
+
}
|
|
167
227
|
|
|
168
|
-
|
|
228
|
+
// 3) run professor’s miner
|
|
229
|
+
btminer::Freq_miner();
|
|
169
230
|
|
|
231
|
+
// 4) build python result
|
|
232
|
+
// 4) build python result
|
|
170
233
|
py::dict out;
|
|
171
|
-
out["patterns"]
|
|
172
|
-
out["
|
|
234
|
+
out["patterns"] = btminer::GetCollected(); // ← NEW
|
|
235
|
+
out["num_patterns"] = btminer::num_patt;
|
|
236
|
+
out["time"] = btminer::give_time(std::clock() - btminer::start_time);
|
|
237
|
+
out["N"] = btminer::N;
|
|
238
|
+
out["L"] = btminer::L;
|
|
239
|
+
out["theta"] = btminer::theta;
|
|
173
240
|
return out;
|
|
174
|
-
},
|
|
175
|
-
py::arg("data"),
|
|
176
|
-
py::arg("minsup") = 0.01,
|
|
177
|
-
py::arg("time_limit") = 36000,
|
|
178
|
-
py::arg("preproc") = false,
|
|
179
|
-
py::arg("use_dic") = false,
|
|
180
|
-
py::arg("verbose") = false,
|
|
181
|
-
py::arg("out_file") = ""
|
|
182
|
-
);
|
|
183
241
|
|
|
184
|
-
|
|
242
|
+
},
|
|
243
|
+
py::arg("data"),
|
|
244
|
+
py::arg("minsup") = 0.01,
|
|
245
|
+
py::arg("time_limit") = 36000,
|
|
246
|
+
py::arg("preproc") = false,
|
|
247
|
+
py::arg("use_dic") = false,
|
|
248
|
+
py::arg("verbose") = false,
|
|
249
|
+
py::arg("out_file") = ""
|
|
250
|
+
);
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
|
|
185
255
|
// HTMiner
|
|
186
256
|
// ─────────────────────────────────────────────────────────────
|
|
257
|
+
// HTMiner
|
|
187
258
|
m.def("HTMiner",
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
htminer::start_time = std::clock();
|
|
204
|
-
|
|
205
|
-
// 2) load sequences (either from filename or from Python list)
|
|
206
|
-
if (py::isinstance<py::str>(data)) {
|
|
207
|
-
std::string path = data.cast<std::string>();
|
|
208
|
-
if (!htminer::Load_instance(path, minsup))
|
|
209
|
-
throw std::runtime_error("Failed to load file: " + path);
|
|
210
|
-
} else {
|
|
211
|
-
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
212
|
-
htminer::items = std::move(seqs);
|
|
213
|
-
htminer::N = htminer::items.size();
|
|
259
|
+
[](py::object data,
|
|
260
|
+
double minsup, unsigned int time_limit,
|
|
261
|
+
bool preproc, bool use_dic,
|
|
262
|
+
bool verbose, const std::string &out_file)
|
|
263
|
+
{
|
|
264
|
+
htminer::time_limit = time_limit;
|
|
265
|
+
htminer::pre_pro = preproc;
|
|
266
|
+
htminer::use_dic = use_dic;
|
|
267
|
+
htminer::just_build = false;
|
|
268
|
+
htminer::use_list = false;
|
|
269
|
+
htminer::b_disp = verbose;
|
|
270
|
+
htminer::b_write = !out_file.empty();
|
|
271
|
+
htminer::out_file = out_file;
|
|
272
|
+
htminer::ClearCollected();
|
|
273
|
+
htminer::start_time = std::clock();
|
|
214
274
|
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
htminer::
|
|
232
|
-
htminer::DFS.reserve(htminer::L);
|
|
233
|
-
for (unsigned int i = 0; i < static_cast<unsigned int>(htminer::L); ++i)
|
|
234
|
-
htminer::DFS.emplace_back(-static_cast<int>(i) - 1);
|
|
235
|
-
|
|
236
|
-
// initialize VDFS if HTMiner needs it
|
|
237
|
-
htminer::VDFS.clear();
|
|
238
|
-
htminer::VDFS.resize(htminer::L);
|
|
275
|
+
if (py::isinstance<py::str>(data)) {
|
|
276
|
+
std::string path = data.cast<std::string>();
|
|
277
|
+
if (!htminer::Load_instance(path, minsup))
|
|
278
|
+
throw std::runtime_error("Failed to load file: " + path);
|
|
279
|
+
} else {
|
|
280
|
+
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
281
|
+
htminer::items = std::move(seqs);
|
|
282
|
+
htminer::N = htminer::items.size();
|
|
283
|
+
|
|
284
|
+
int max_id = 0;
|
|
285
|
+
htminer::M = 0;
|
|
286
|
+
htminer::E = 0;
|
|
287
|
+
for (auto &seq : htminer::items) {
|
|
288
|
+
htminer::M = std::max<unsigned int>(htminer::M, seq.size());
|
|
289
|
+
for (int x : seq)
|
|
290
|
+
max_id = std::max(max_id, std::abs(x));
|
|
291
|
+
htminer::E += seq.size();
|
|
239
292
|
}
|
|
293
|
+
htminer::L = max_id;
|
|
294
|
+
htminer::theta = (minsup < 1.0)
|
|
295
|
+
? static_cast<unsigned long long>(std::ceil(minsup * htminer::N))
|
|
296
|
+
: static_cast<unsigned long long>(minsup);
|
|
297
|
+
|
|
298
|
+
htminer::DFS.clear();
|
|
299
|
+
htminer::DFS.reserve(htminer::L);
|
|
300
|
+
for (unsigned int i = 0; i < static_cast<unsigned int>(htminer::L); ++i)
|
|
301
|
+
htminer::DFS.emplace_back(-static_cast<int>(i) - 1);
|
|
302
|
+
|
|
303
|
+
htminer::VDFS.clear();
|
|
304
|
+
htminer::VDFS.resize(htminer::L);
|
|
305
|
+
}
|
|
240
306
|
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
// for (int x : seq) {
|
|
249
|
-
// std::cout << x << " ";
|
|
250
|
-
// }
|
|
251
|
-
// std::cout << "}\n";
|
|
252
|
-
//}
|
|
253
|
-
std::cout << " total patterns = "
|
|
254
|
-
<< htminer::collectedPatterns.size() << "\n";
|
|
255
|
-
// ─────────────────────────────────────────────────
|
|
256
|
-
|
|
257
|
-
// 4) return patterns + elapsed time
|
|
258
|
-
py::dict out;
|
|
259
|
-
out["patterns"] = htminer::GetCollected();
|
|
260
|
-
out["time"] = htminer::give_time(std::clock() - htminer::start_time);
|
|
261
|
-
return out;
|
|
262
|
-
},
|
|
263
|
-
py::arg("data"),
|
|
264
|
-
py::arg("minsup") = 0.01,
|
|
265
|
-
py::arg("time_limit") = 36000,
|
|
266
|
-
py::arg("preproc") = false,
|
|
267
|
-
py::arg("use_dic") = false,
|
|
268
|
-
py::arg("verbose") = false,
|
|
269
|
-
py::arg("out_file") = ""
|
|
270
|
-
);
|
|
307
|
+
htminer::Freq_miner();
|
|
308
|
+
|
|
309
|
+
// 👇 now really respects verbose
|
|
310
|
+
if (verbose) {
|
|
311
|
+
std::cout << " total patterns = "
|
|
312
|
+
<< htminer::collectedPatterns.size() << "\n";
|
|
313
|
+
}
|
|
271
314
|
|
|
272
|
-
|
|
315
|
+
py::dict out;
|
|
316
|
+
out["patterns"] = htminer::GetCollected();
|
|
317
|
+
out["time"] = htminer::give_time(std::clock() - htminer::start_time);
|
|
318
|
+
return out;
|
|
319
|
+
},
|
|
320
|
+
py::arg("data"),
|
|
321
|
+
py::arg("minsup") = 0.01,
|
|
322
|
+
py::arg("time_limit") = 36000,
|
|
323
|
+
py::arg("preproc") = false,
|
|
324
|
+
py::arg("use_dic") = false,
|
|
325
|
+
py::arg("verbose") = false,
|
|
326
|
+
py::arg("out_file") = ""
|
|
327
|
+
);
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
m.def("LargePrefixProjection",
|
|
273
331
|
[](py::object data,
|
|
274
332
|
double minsup,
|
|
275
333
|
unsigned int time_limit,
|
|
@@ -281,28 +339,30 @@ std::cout << " total patterns = "
|
|
|
281
339
|
largepp::time_limit = time_limit;
|
|
282
340
|
largepp::pre_pro = preproc;
|
|
283
341
|
largepp::use_dic = use_dic;
|
|
284
|
-
largepp::use_list = true;
|
|
342
|
+
largepp::use_list = true;
|
|
285
343
|
largepp::b_disp = verbose;
|
|
286
344
|
largepp::b_write = !out_file.empty();
|
|
287
345
|
largepp::out_file = out_file;
|
|
288
|
-
largepp::just_build = false;
|
|
346
|
+
largepp::just_build = false;
|
|
289
347
|
|
|
290
348
|
largepp::ClearCollected();
|
|
291
349
|
largepp::start_time = std::clock();
|
|
292
|
-
std::string fname = data.cast<std::string>();
|
|
293
|
-
/* 1) load instance (py list or filename) */
|
|
294
|
-
if (py::isinstance<py::str>(data))
|
|
295
|
-
|
|
296
|
-
largepp::Load_instance(fname, minsup);
|
|
297
|
-
else
|
|
298
|
-
largepp::Load_py(data, minsup); // helper you’ll expose
|
|
299
|
-
|
|
300
|
-
std::vector<unsigned long long> dbg;
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
350
|
|
|
351
|
+
// 👇 this was the last noisy one
|
|
352
|
+
if (verbose) {
|
|
353
|
+
std::cerr << " minsup=" << minsup
|
|
354
|
+
<< " preproc=" << preproc
|
|
355
|
+
<< " verbose=" << verbose
|
|
356
|
+
<< " out_file=" << (out_file.empty() ? "(none)" : out_file)
|
|
357
|
+
<< " use_dic=" << use_dic << "\n";
|
|
358
|
+
}
|
|
305
359
|
|
|
360
|
+
if (py::isinstance<py::str>(data)) {
|
|
361
|
+
std::string fname = data.cast<std::string>();
|
|
362
|
+
largepp::Load_instance(fname, minsup);
|
|
363
|
+
} else {
|
|
364
|
+
largepp::Load_py(data, minsup);
|
|
365
|
+
}
|
|
306
366
|
|
|
307
367
|
largepp::Freq_miner();
|
|
308
368
|
|
|
@@ -320,6 +380,8 @@ std::cout << " total patterns = "
|
|
|
320
380
|
py::arg("out_file") = ""
|
|
321
381
|
);
|
|
322
382
|
|
|
383
|
+
|
|
384
|
+
|
|
323
385
|
// ─────────────────────────────────────────────────────────────
|
|
324
386
|
// LargeBTMiner -- Python wrapper for the largebm implementation
|
|
325
387
|
// ─────────────────────────────────────────────────────────────
|
|
@@ -404,100 +466,108 @@ std::cout << " total patterns = "
|
|
|
404
466
|
|
|
405
467
|
|
|
406
468
|
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
469
|
+
// ─────────────────────────────────────────────────────────────────────────
|
|
470
|
+
// LargeBTMiner (MDD-based)
|
|
471
|
+
// ─────────────────────────────────────────────────────────────────────────
|
|
472
|
+
m.def("LargeBTMiner",
|
|
473
|
+
[](py::object data,
|
|
474
|
+
double minsup,
|
|
475
|
+
unsigned int time_limit,
|
|
476
|
+
bool preproc,
|
|
477
|
+
bool use_dic,
|
|
478
|
+
bool verbose,
|
|
479
|
+
const std::string &out_file)
|
|
480
|
+
{
|
|
481
|
+
using namespace largebm;
|
|
482
|
+
|
|
483
|
+
// 0) Set global flags and timers
|
|
484
|
+
largebm::time_limit = time_limit;
|
|
485
|
+
largebm::pre_pro = preproc;
|
|
486
|
+
largebm::use_dic = use_dic;
|
|
487
|
+
largebm::use_list = false; // large-mode → always MDD
|
|
488
|
+
largebm::b_disp = verbose;
|
|
489
|
+
largebm::b_write = !out_file.empty();
|
|
490
|
+
largebm::out_file = out_file;
|
|
491
|
+
largebm::just_build = false;
|
|
492
|
+
|
|
493
|
+
// 0.1) Clear any leftover data/state from previous runs
|
|
494
|
+
largebm::items.clear();
|
|
495
|
+
largebm::item_dic.clear();
|
|
496
|
+
largebm::inv_item_dic.clear();
|
|
497
|
+
largebm::Tree.clear();
|
|
498
|
+
largebm::DFS.clear();
|
|
499
|
+
largebm::ClearCollected();
|
|
500
|
+
|
|
501
|
+
// 1) Load sequences (either from filename or from Python list)
|
|
502
|
+
if (py::isinstance<py::str>(data)) {
|
|
503
|
+
// ─────────── FILE-BASED MODE ───────────
|
|
504
|
+
std::string path = data.cast<std::string>();
|
|
505
|
+
if (!largebm::Load_instance(path, minsup))
|
|
506
|
+
throw std::runtime_error("Failed to load file: " + path);
|
|
440
507
|
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
508
|
+
} else {
|
|
509
|
+
// ────────── IN-MEMORY MODE ──────────
|
|
510
|
+
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
511
|
+
largebm::items = std::move(seqs);
|
|
512
|
+
largebm::N = largebm::items.size();
|
|
446
513
|
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
}
|
|
456
|
-
largebm::L = static_cast<unsigned int>(max_id);
|
|
457
|
-
largebm::theta = (minsup < 1.0)
|
|
458
|
-
? static_cast<unsigned long long>(std::ceil(minsup * largebm::N))
|
|
459
|
-
: static_cast<unsigned long long>(minsup);
|
|
460
|
-
|
|
461
|
-
// 1.2) Initialize DFS buffer (size = L)
|
|
462
|
-
largebm::DFS.reserve(largebm::L);
|
|
463
|
-
for (unsigned int i = 0; i < largebm::L; ++i)
|
|
464
|
-
largebm::DFS.emplace_back(-static_cast<int>(i) - 1);
|
|
465
|
-
|
|
466
|
-
// 1.3) Build the MDD “Tree”
|
|
467
|
-
// Insert one dummy root node (item=0, freq=0, anct=0)
|
|
468
|
-
largebm::Tree.emplace_back(0, 0, 0);
|
|
469
|
-
for (auto &seq : largebm::items)
|
|
470
|
-
largebm::Build_MDD(const_cast<std::vector<int>&>(seq));
|
|
514
|
+
// 1.1) Compute basic DB statistics (M, E, L) and absolute support θ
|
|
515
|
+
int max_id = 0;
|
|
516
|
+
largebm::M = 0;
|
|
517
|
+
largebm::E = 0;
|
|
518
|
+
for (auto &seq : largebm::items) {
|
|
519
|
+
largebm::M = std::max<unsigned int>(largebm::M, static_cast<unsigned int>(seq.size()));
|
|
520
|
+
largebm::E += static_cast<unsigned long long>(seq.size());
|
|
521
|
+
for (int x : seq) max_id = std::max(max_id, std::abs(x));
|
|
471
522
|
}
|
|
523
|
+
largebm::L = static_cast<unsigned int>(max_id);
|
|
524
|
+
largebm::theta = (minsup < 1.0)
|
|
525
|
+
? static_cast<unsigned long long>(std::ceil(minsup * largebm::N))
|
|
526
|
+
: static_cast<unsigned long long>(minsup);
|
|
472
527
|
|
|
473
|
-
// 2)
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
528
|
+
// 1.2) Initialize DFS buffer (size = L)
|
|
529
|
+
largebm::DFS.reserve(largebm::L);
|
|
530
|
+
for (unsigned int i = 0; i < largebm::L; ++i)
|
|
531
|
+
largebm::DFS.emplace_back(-static_cast<int>(i) - 1);
|
|
532
|
+
|
|
533
|
+
// 1.3) Build the MDD “Tree”
|
|
534
|
+
// Insert one dummy root node (item=0, freq=0, anct=0)
|
|
535
|
+
largebm::Tree.emplace_back(0, 0, 0);
|
|
536
|
+
for (auto &seq : largebm::items)
|
|
537
|
+
largebm::Build_MDD(const_cast<std::vector<int>&>(seq));
|
|
538
|
+
}
|
|
539
|
+
|
|
540
|
+
// 2) Rebuild inverse-dictionary from fresh item_dic
|
|
541
|
+
{
|
|
542
|
+
std::vector<int> inv(largebm::item_dic.size() + 1);
|
|
543
|
+
for (int old = 1; old <= static_cast<int>(largebm::item_dic.size()); ++old) {
|
|
544
|
+
int cid = largebm::item_dic[old - 1];
|
|
545
|
+
if (cid > 0) inv[cid] = old;
|
|
481
546
|
}
|
|
547
|
+
largebm::inv_item_dic = std::move(inv);
|
|
548
|
+
}
|
|
482
549
|
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
550
|
+
// 3) Start timing and run the miner
|
|
551
|
+
largebm::start_time = std::clock();
|
|
552
|
+
largebm::Freq_miner();
|
|
553
|
+
|
|
554
|
+
// 4) Collect results and elapsed time
|
|
555
|
+
const auto& pats = largebm::GetCollected();
|
|
556
|
+
|
|
557
|
+
py::dict out;
|
|
558
|
+
out["patterns"] = pats;
|
|
559
|
+
out["time"] = largebm::give_time(std::clock() - largebm::start_time);
|
|
560
|
+
return out;
|
|
561
|
+
},
|
|
562
|
+
py::arg("data"),
|
|
563
|
+
py::arg("minsup") = 0.01,
|
|
564
|
+
py::arg("time_limit") = 36000,
|
|
565
|
+
py::arg("preproc") = false,
|
|
566
|
+
py::arg("use_dic") = false,
|
|
567
|
+
py::arg("verbose") = false,
|
|
568
|
+
py::arg("out_file") = ""
|
|
569
|
+
);
|
|
486
570
|
|
|
487
|
-
// 4) Collect results and elapsed time
|
|
488
|
-
py::dict out;
|
|
489
|
-
out["patterns"] = largebm::GetCollected();
|
|
490
|
-
out["time"] = largebm::give_time(std::clock() - largebm::start_time);
|
|
491
|
-
return out;
|
|
492
|
-
},
|
|
493
|
-
py::arg("data"),
|
|
494
|
-
py::arg("minsup") = 0.01,
|
|
495
|
-
py::arg("time_limit") = 36000,
|
|
496
|
-
py::arg("preproc") = false,
|
|
497
|
-
py::arg("use_dic") = false,
|
|
498
|
-
py::arg("verbose") = false,
|
|
499
|
-
py::arg("out_file") = ""
|
|
500
|
-
);
|
|
501
571
|
|
|
502
572
|
|
|
503
573
|
m.def("LargeHTMiner",
|
|
Binary file
|