effspm 0.1.5__cp310-cp310-win_amd64.whl → 0.3.0__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. effspm/__init__.py +9 -2
  2. effspm/_core.cpp +91 -13
  3. effspm/_effspm.cp310-win_amd64.pyd +0 -0
  4. effspm/_effspm.cpp +679 -0
  5. effspm/btminer/src/build_mdd.cpp +88 -0
  6. effspm/btminer/src/build_mdd.hpp +34 -0
  7. effspm/btminer/src/freq_miner.cpp +264 -0
  8. effspm/btminer/src/freq_miner.hpp +55 -0
  9. effspm/btminer/src/load_inst.cpp +275 -0
  10. effspm/btminer/src/load_inst.hpp +43 -0
  11. effspm/btminer/src/utility.cpp +50 -0
  12. effspm/btminer/src/utility.hpp +16 -0
  13. effspm/freq_miner.hpp +7 -1
  14. effspm/htminer/src/build_mdd.cpp +139 -0
  15. effspm/htminer/src/build_mdd.hpp +64 -0
  16. effspm/htminer/src/freq_miner.cpp +350 -0
  17. effspm/htminer/src/freq_miner.hpp +60 -0
  18. effspm/htminer/src/load_inst.cpp +394 -0
  19. effspm/htminer/src/load_inst.hpp +23 -0
  20. effspm/htminer/src/utility.cpp +72 -0
  21. effspm/htminer/src/utility.hpp +77 -0
  22. effspm/largebm/src/build_mdd.cpp +96 -0
  23. effspm/largebm/src/build_mdd.hpp +32 -0
  24. effspm/largebm/src/freq_miner.cpp +299 -0
  25. effspm/largebm/src/freq_miner.hpp +37 -0
  26. effspm/largebm/src/load_inst.cpp +224 -0
  27. effspm/largebm/src/load_inst.hpp +35 -0
  28. effspm/largebm/src/utility.cpp +35 -0
  29. effspm/largebm/src/utility.hpp +15 -0
  30. effspm/largehm/src/build_mdd.cpp +174 -0
  31. effspm/largehm/src/build_mdd.hpp +93 -0
  32. effspm/largehm/src/freq_miner.cpp +429 -0
  33. effspm/largehm/src/freq_miner.hpp +77 -0
  34. effspm/largehm/src/load_inst.cpp +375 -0
  35. effspm/largehm/src/load_inst.hpp +64 -0
  36. effspm/largehm/src/utility.cpp +38 -0
  37. effspm/largehm/src/utility.hpp +29 -0
  38. effspm/largepp/src/freq_miner.cpp +198 -0
  39. effspm/largepp/src/freq_miner.hpp +18 -0
  40. effspm/largepp/src/load_inst.cpp +238 -0
  41. effspm/largepp/src/load_inst.hpp +34 -0
  42. effspm/largepp/src/pattern.hpp +31 -0
  43. effspm/largepp/src/utility.cpp +34 -0
  44. effspm/largepp/src/utility.hpp +21 -0
  45. effspm/load_inst.hpp +18 -12
  46. effspm-0.3.0.dist-info/METADATA +237 -0
  47. effspm-0.3.0.dist-info/RECORD +54 -0
  48. {effspm-0.1.5.dist-info → effspm-0.3.0.dist-info}/WHEEL +1 -1
  49. effspm/_core.cp310-win_amd64.pyd +0 -0
  50. effspm-0.1.5.dist-info/METADATA +0 -38
  51. effspm-0.1.5.dist-info/RECORD +0 -14
  52. {effspm-0.1.5.dist-info → effspm-0.3.0.dist-info}/licenses/LICENSE +0 -0
  53. {effspm-0.1.5.dist-info → effspm-0.3.0.dist-info}/top_level.txt +0 -0
effspm/_effspm.cpp ADDED
@@ -0,0 +1,679 @@
1
+ // _effspm.cpp
2
+
3
+ #include <pybind11/pybind11.h>
4
+ #include <pybind11/stl.h>
5
+ namespace py = pybind11;
6
+ #include <iostream>
7
+
8
+
9
+ // PrefixProjection headers
10
+ #include "freq_miner.hpp"
11
+ #include "load_inst.hpp"
12
+ #include "utility.hpp"
13
+
14
+ // BTMiner (wrapped in its own namespace in source files)
15
+ #include "btminer/src/freq_miner.hpp"
16
+ #include "btminer/src/load_inst.hpp"
17
+ #include "btminer/src/utility.hpp"
18
+ #include "btminer/src/build_mdd.hpp"
19
+
20
+ // HTMiner (wrapped in its own namespace in source files)
21
+ #include "htminer/src/build_mdd.hpp" // ← ensure HTMiner MDD builder is available
22
+ #include "htminer/src/freq_miner.hpp"
23
+ #include "htminer/src/load_inst.hpp"
24
+ #include "htminer/src/utility.hpp"
25
+
26
+
27
+ #include "largepp/src/freq_miner.hpp"
28
+ #include "largepp/src/load_inst.hpp"
29
+ #include "largepp/src/utility.hpp"
30
+
31
+ #include "largebm/src/freq_miner.hpp"
32
+ #include "largebm/src/load_inst.hpp"
33
+ #include "largebm/src/utility.hpp"
34
+ #include "largebm/src/build_mdd.hpp"
35
+
36
+ #include "largehm/src/freq_miner.hpp"
37
+ #include "largehm/src/load_inst.hpp"
38
+ #include "largehm/src/utility.hpp"
39
+ #include "largehm/src/build_mdd.hpp"
40
+
41
+
42
+
43
+ PYBIND11_MODULE(_effspm, m) {
44
+ m.doc() = "Unified SPM library: PrefixProjection, BTMiner, HTMiner";
45
+
46
+ // ─────────────────────────────────────────────────────────────
47
+ // PrefixProjection
48
+ // ─────────────────────────────────────────────────────────────
49
+ m.def("PrefixProjection",
50
+ [](py::object data,
51
+ double minsup,
52
+ unsigned int time_limit,
53
+ bool preproc,
54
+ bool use_dic,
55
+ bool verbose,
56
+ const std::string &out_file)
57
+ {
58
+ ::time_limit = time_limit;
59
+ ::pre_pro = preproc;
60
+ ::use_dic = use_dic;
61
+ ::use_list = false;
62
+ ::b_disp = verbose;
63
+ ::b_write = !out_file.empty();
64
+ ::out_file = out_file;
65
+
66
+ ClearCollected();
67
+ start_time = std::clock();
68
+
69
+ if (py::isinstance<py::str>(data)) {
70
+ std::string path = data.cast<std::string>();
71
+ if (!Load_instance(path, minsup))
72
+ throw std::runtime_error("Failed to load file: " + path);
73
+ } else {
74
+ auto seqs = data.cast<std::vector<std::vector<int>>>();
75
+ items = std::move(seqs);
76
+ N = items.size();
77
+
78
+ int max_id = 0;
79
+ for (auto &seq : items)
80
+ for (int x : seq)
81
+ max_id = std::max(max_id, std::abs(x));
82
+ L = max_id;
83
+
84
+ theta = (minsup < 1.0) ? std::ceil(minsup * N) : minsup;
85
+
86
+ DFS.clear();
87
+ DFS.reserve(L);
88
+ for (unsigned int i = 0; i < L; ++i)
89
+ DFS.emplace_back(-static_cast<int>(i) - 1);
90
+
91
+ M = 0;
92
+ E = 0;
93
+ for (auto &seq : items) {
94
+ M = std::max<unsigned int>(M, seq.size());
95
+ E += seq.size();
96
+ }
97
+ }
98
+
99
+ Freq_miner();
100
+
101
+ py::dict out;
102
+ out["patterns"] = GetCollected();
103
+ out["time"] = give_time(std::clock() - start_time);
104
+ return out;
105
+ },
106
+ py::arg("data"),
107
+ py::arg("minsup") = 0.01,
108
+ py::arg("time_limit") = 36000,
109
+ py::arg("preproc") = false,
110
+ py::arg("use_dic") = false,
111
+ py::arg("verbose") = false,
112
+ py::arg("out_file") = ""
113
+ );
114
+ m.def("BTMiner",
115
+ [](py::object data,
116
+ double minsup,
117
+ unsigned int time_limit,
118
+ bool preproc,
119
+ bool use_dic,
120
+ bool verbose,
121
+ const std::string &out_file)
122
+ {
123
+ // We are calling the *professor* BTMiner, now namespaced as btminer::.
124
+ // So we only set the globals the professor code actually has.
125
+
126
+ // 1) configure professor globals
127
+ btminer::time_limit = static_cast<int>(time_limit);
128
+ btminer::pre_pro = preproc;
129
+ btminer::use_dic = use_dic;
130
+ btminer::b_disp = verbose;
131
+ btminer::b_write = !out_file.empty();
132
+ btminer::out_file = out_file;
133
+ btminer::N_mult = 1; // professor uses these too
134
+ btminer::M_mult = 1;
135
+ btminer::just_build = false; // we want full mining
136
+
137
+ btminer::start_time = std::clock();
138
+
139
+ // 2) load data
140
+ //
141
+ // Professor’s code is primarily file-based (Load_instance(const string&, double)).
142
+ // So: if user passes a file path → use the professor loader directly.
143
+ // If user passes a Python list-of-lists → we will build the MDD the same
144
+ // way professor’s loader does, but without changing his logic.
145
+ if (py::isinstance<py::str>(data)) {
146
+ // ----- FILE MODE -----
147
+ std::string path = data.cast<std::string>();
148
+
149
+ if (verbose) {
150
+ std::cerr << "[BT][binding] file=" << path
151
+ << " minsup=" << minsup
152
+ << " preproc=" << preproc << std::endl;
153
+ }
154
+
155
+ if (!btminer::Load_instance(path, minsup)) {
156
+ throw std::runtime_error("BTMiner: failed to load file: " + path);
157
+ }
158
+ } else {
159
+ // ----- PYTHON LIST MODE -----
160
+ //
161
+ // We mimic professor’s loader:
162
+ // - create root in Tree
163
+ // - compute N, M, L
164
+ // - compute theta from minsup
165
+ // - seed DFS (one Pattern per item, as in Preprocess branch)
166
+ // - call Build_MDD(...) for each sequence
167
+ //
168
+ // This DOES NOT change his mining logic; it just drives it from memory.
169
+
170
+ auto seqs = data.cast<std::vector<std::vector<int>>>();
171
+
172
+ // clear MDD and globals to a known state
173
+ btminer::Tree.clear();
174
+ btminer::Tree.emplace_back(0, 0, 0); // root (exactly like professor)
175
+
176
+ // compute basic stats
177
+ int max_id = 0;
178
+ int max_len = 0;
179
+ int seq_count = 0;
180
+ long long entries = 0;
181
+
182
+ for (const auto &s : seqs) {
183
+ if (s.empty()) continue;
184
+ ++seq_count;
185
+ max_len = std::max<int>(max_len, static_cast<int>(s.size()));
186
+ for (int x : s) {
187
+ max_id = std::max(max_id, std::abs(x));
188
+ ++entries;
189
+ }
190
+ }
191
+
192
+ btminer::N = seq_count;
193
+ btminer::M = max_len;
194
+ btminer::L = max_id;
195
+ btminer::E = static_cast<int>(entries);
196
+
197
+ // theta = abs support
198
+ if (minsup < 1.0)
199
+ btminer::theta = static_cast<int>(std::ceil(minsup * btminer::N * btminer::N_mult));
200
+ else
201
+ btminer::theta = static_cast<int>(minsup);
202
+
203
+ // seed DFS exactly like professor does in the preprocessed branch:
204
+ btminer::DFS.clear();
205
+ btminer::DFS.reserve(btminer::L);
206
+ for (int i = 0; i < btminer::L; ++i)
207
+ btminer::DFS.emplace_back(-i - 1);
208
+
209
+ // now build the MDD, sequence by sequence
210
+ for (const auto &s : seqs) {
211
+ if (s.empty()) continue;
212
+ // professor’s Build_MDD takes a vector<int> by non-const ref
213
+ std::vector<int> tmp = s;
214
+ btminer::Build_MDD(tmp);
215
+ }
216
+
217
+ if (verbose) {
218
+ std::cerr << "[BT][binding] PY mode: N=" << btminer::N
219
+ << " L=" << btminer::L
220
+ << " M=" << btminer::M
221
+ << " E=" << btminer::E
222
+ << " theta=" << btminer::theta
223
+ << " Tree.size()=" << btminer::Tree.size()
224
+ << std::endl;
225
+ }
226
+ }
227
+
228
+ // 3) run professor’s miner
229
+ btminer::Freq_miner();
230
+
231
+ // 4) build python result
232
+ // 4) build python result
233
+ py::dict out;
234
+ out["patterns"] = btminer::GetCollected(); // ← NEW
235
+ out["num_patterns"] = btminer::num_patt;
236
+ out["time"] = btminer::give_time(std::clock() - btminer::start_time);
237
+ out["N"] = btminer::N;
238
+ out["L"] = btminer::L;
239
+ out["theta"] = btminer::theta;
240
+ return out;
241
+
242
+ },
243
+ py::arg("data"),
244
+ py::arg("minsup") = 0.01,
245
+ py::arg("time_limit") = 36000,
246
+ py::arg("preproc") = false,
247
+ py::arg("use_dic") = false,
248
+ py::arg("verbose") = false,
249
+ py::arg("out_file") = ""
250
+ );
251
+
252
+
253
+
254
+
255
+ // ─────────────────────────────────────────────────────────────
256
+ // HTMiner
257
+ // ─────────────────────────────────────────────────────────────
258
+ m.def("HTMiner",
259
+ [](py::object data,
260
+ double minsup, unsigned int time_limit,
261
+ bool preproc, bool use_dic,
262
+ bool verbose, const std::string &out_file)
263
+ {
264
+ htminer::time_limit = time_limit;
265
+ htminer::pre_pro = preproc;
266
+ htminer::use_dic = use_dic;
267
+ htminer::just_build = false;
268
+ htminer::use_list = false;
269
+ htminer::b_disp = verbose;
270
+ htminer::b_write = !out_file.empty();
271
+ htminer::out_file = out_file;
272
+ htminer::ClearCollected();
273
+ htminer::start_time = std::clock();
274
+
275
+ if (py::isinstance<py::str>(data)) {
276
+ std::string path = data.cast<std::string>();
277
+ if (!htminer::Load_instance(path, minsup))
278
+ throw std::runtime_error("Failed to load file: " + path);
279
+ } else {
280
+ auto seqs = data.cast<std::vector<std::vector<int>>>();
281
+ htminer::items = std::move(seqs);
282
+ htminer::N = htminer::items.size();
283
+
284
+ int max_id = 0;
285
+ htminer::M = 0;
286
+ htminer::E = 0;
287
+ for (auto &seq : htminer::items) {
288
+ htminer::M = std::max<unsigned int>(htminer::M, seq.size());
289
+ for (int x : seq)
290
+ max_id = std::max(max_id, std::abs(x));
291
+ htminer::E += seq.size();
292
+ }
293
+ htminer::L = max_id;
294
+ htminer::theta = (minsup < 1.0)
295
+ ? static_cast<unsigned long long>(std::ceil(minsup * htminer::N))
296
+ : static_cast<unsigned long long>(minsup);
297
+
298
+ htminer::DFS.clear();
299
+ htminer::DFS.reserve(htminer::L);
300
+ for (unsigned int i = 0; i < static_cast<unsigned int>(htminer::L); ++i)
301
+ htminer::DFS.emplace_back(-static_cast<int>(i) - 1);
302
+
303
+ htminer::VDFS.clear();
304
+ htminer::VDFS.resize(htminer::L);
305
+ }
306
+
307
+ htminer::Freq_miner();
308
+
309
+ // 👇 now really respects verbose
310
+ if (verbose) {
311
+ std::cout << " total patterns = "
312
+ << htminer::collectedPatterns.size() << "\n";
313
+ }
314
+
315
+ py::dict out;
316
+ out["patterns"] = htminer::GetCollected();
317
+ out["time"] = htminer::give_time(std::clock() - htminer::start_time);
318
+ return out;
319
+ },
320
+ py::arg("data"),
321
+ py::arg("minsup") = 0.01,
322
+ py::arg("time_limit") = 36000,
323
+ py::arg("preproc") = false,
324
+ py::arg("use_dic") = false,
325
+ py::arg("verbose") = false,
326
+ py::arg("out_file") = ""
327
+ );
328
+
329
+
330
+ m.def("LargePrefixProjection",
331
+ [](py::object data,
332
+ double minsup,
333
+ unsigned int time_limit,
334
+ bool preproc,
335
+ bool use_dic,
336
+ bool verbose,
337
+ const std::string &out_file)
338
+ {
339
+ largepp::time_limit = time_limit;
340
+ largepp::pre_pro = preproc;
341
+ largepp::use_dic = use_dic;
342
+ largepp::use_list = true;
343
+ largepp::b_disp = verbose;
344
+ largepp::b_write = !out_file.empty();
345
+ largepp::out_file = out_file;
346
+ largepp::just_build = false;
347
+
348
+ largepp::ClearCollected();
349
+ largepp::start_time = std::clock();
350
+
351
+ // 👇 this was the last noisy one
352
+ if (verbose) {
353
+ std::cerr << " minsup=" << minsup
354
+ << " preproc=" << preproc
355
+ << " verbose=" << verbose
356
+ << " out_file=" << (out_file.empty() ? "(none)" : out_file)
357
+ << " use_dic=" << use_dic << "\n";
358
+ }
359
+
360
+ if (py::isinstance<py::str>(data)) {
361
+ std::string fname = data.cast<std::string>();
362
+ largepp::Load_instance(fname, minsup);
363
+ } else {
364
+ largepp::Load_py(data, minsup);
365
+ }
366
+
367
+ largepp::Freq_miner();
368
+
369
+ py::dict out;
370
+ out["patterns"] = largepp::GetCollected();
371
+ out["time"] = largepp::give_time(std::clock() - largepp::start_time);
372
+ return out;
373
+ },
374
+ py::arg("data"),
375
+ py::arg("minsup") = 0.01,
376
+ py::arg("time_limit") = 36000,
377
+ py::arg("preproc") = false,
378
+ py::arg("use_dic") = false,
379
+ py::arg("verbose") = false,
380
+ py::arg("out_file") = ""
381
+ );
382
+
383
+
384
+
385
+ // ─────────────────────────────────────────────────────────────
386
+ // LargeBTMiner -- Python wrapper for the largebm implementation
387
+ // ─────────────────────────────────────────────────────────────
388
+ // m.def(
389
+ // "LargeBTMiner",
390
+ // [](py::object data,
391
+ // double minsup ,
392
+ // unsigned int time_limit,
393
+ // bool preproc ,
394
+ // bool use_dic,
395
+ // bool verbose,
396
+ // const std::string &out_file )
397
+ // {
398
+ // /* 1) Global flags */
399
+ // largebm::time_limit = time_limit;
400
+ // largebm::pre_pro = preproc;
401
+ // largebm::use_dic = use_dic;
402
+ // largebm::use_list = false; // large-mode → always MDD
403
+ // largebm::just_build = false;
404
+ // largebm::b_disp = verbose;
405
+ // largebm::b_write = !out_file.empty();
406
+ // largebm::out_file = out_file;
407
+
408
+ // /* 2) Reset per-run state */
409
+ // largebm::ClearCollected();
410
+ // largebm::start_time = std::clock();
411
+
412
+ // /* 3) Load the DB (file path or in-memory list<list<int>>) */
413
+ // if (py::isinstance<py::str>(data)) {
414
+ // std::string path = data.cast<std::string>();
415
+ // if (!largebm::Load_instance(path, minsup))
416
+ // throw std::runtime_error("Failed to load file: " + path);
417
+ // } else {
418
+ // // In-memory sequences
419
+ // largebm::items = std::move(data.cast<std::vector<std::vector<int>>>());
420
+ // largebm::N = static_cast<unsigned int>(largebm::items.size());
421
+
422
+ // /* -- basic stats -- */
423
+ // int max_id = 0;
424
+ // largebm::M = 0;
425
+ // largebm::E = 0;
426
+ // for ( auto &seq : largebm::items) {
427
+ // largebm::M = std::max<unsigned int>(largebm::M,
428
+ // static_cast<unsigned int>(seq.size()));
429
+ // largebm::E += static_cast<unsigned long long>(seq.size());
430
+ // for (int x : seq) max_id = std::max(max_id, std::abs(x));
431
+ // }
432
+ // largebm::L = static_cast<unsigned int>(max_id);
433
+ // largebm::theta = (minsup < 1.0)
434
+ // ? static_cast<unsigned long long>(std::ceil(minsup * largebm::N))
435
+ // : static_cast<unsigned long long>(minsup);
436
+
437
+ // /* -- DFS buffer (size = L) -- */
438
+ // largebm::DFS.clear();
439
+ // largebm::DFS.reserve(largebm::L);
440
+ // for (unsigned int i = 0; i < largebm::L; ++i)
441
+ // largebm::DFS.emplace_back(-static_cast<int>(i) - 1);
442
+
443
+ // /* -- Build the MDD -- */
444
+ // largebm::Tree.clear();
445
+ // largebm::Tree.emplace_back(0, 0, 0); // dummy root
446
+ // for ( auto &seq : largebm::items)
447
+ // largebm::Build_MDD(seq);
448
+ // }
449
+
450
+ // /* 4) Mine and return results */
451
+ // largebm::Freq_miner();
452
+
453
+ // py::dict out;
454
+ // out["patterns"] = largebm::GetCollected();
455
+ // out["time"] = largebm::give_time(std::clock() - largebm::start_time);
456
+ // return out;
457
+ // },
458
+ // py::arg("data"),
459
+ // py::arg("minsup") = 0.01,
460
+ // py::arg("time_limit") = 36000,
461
+ // py::arg("preproc") = false,
462
+ // py::arg("use_dic") = false,
463
+ // py::arg("verbose") = false,
464
+ // py::arg("out_file") = ""
465
+ // );
466
+
467
+
468
+
469
+ // ─────────────────────────────────────────────────────────────────────────
470
+ // LargeBTMiner (MDD-based)
471
+ // ─────────────────────────────────────────────────────────────────────────
472
+ m.def("LargeBTMiner",
473
+ [](py::object data,
474
+ double minsup,
475
+ unsigned int time_limit,
476
+ bool preproc,
477
+ bool use_dic,
478
+ bool verbose,
479
+ const std::string &out_file)
480
+ {
481
+ using namespace largebm;
482
+
483
+ // 0) Set global flags and timers
484
+ largebm::time_limit = time_limit;
485
+ largebm::pre_pro = preproc;
486
+ largebm::use_dic = use_dic;
487
+ largebm::use_list = false; // large-mode → always MDD
488
+ largebm::b_disp = verbose;
489
+ largebm::b_write = !out_file.empty();
490
+ largebm::out_file = out_file;
491
+ largebm::just_build = false;
492
+
493
+ // 0.1) Clear any leftover data/state from previous runs
494
+ largebm::items.clear();
495
+ largebm::item_dic.clear();
496
+ largebm::inv_item_dic.clear();
497
+ largebm::Tree.clear();
498
+ largebm::DFS.clear();
499
+ largebm::ClearCollected();
500
+
501
+ // 1) Load sequences (either from filename or from Python list)
502
+ if (py::isinstance<py::str>(data)) {
503
+ // ─────────── FILE-BASED MODE ───────────
504
+ std::string path = data.cast<std::string>();
505
+ if (!largebm::Load_instance(path, minsup))
506
+ throw std::runtime_error("Failed to load file: " + path);
507
+
508
+ } else {
509
+ // ────────── IN-MEMORY MODE ──────────
510
+ auto seqs = data.cast<std::vector<std::vector<int>>>();
511
+ largebm::items = std::move(seqs);
512
+ largebm::N = largebm::items.size();
513
+
514
+ // 1.1) Compute basic DB statistics (M, E, L) and absolute support θ
515
+ int max_id = 0;
516
+ largebm::M = 0;
517
+ largebm::E = 0;
518
+ for (auto &seq : largebm::items) {
519
+ largebm::M = std::max<unsigned int>(largebm::M, static_cast<unsigned int>(seq.size()));
520
+ largebm::E += static_cast<unsigned long long>(seq.size());
521
+ for (int x : seq) max_id = std::max(max_id, std::abs(x));
522
+ }
523
+ largebm::L = static_cast<unsigned int>(max_id);
524
+ largebm::theta = (minsup < 1.0)
525
+ ? static_cast<unsigned long long>(std::ceil(minsup * largebm::N))
526
+ : static_cast<unsigned long long>(minsup);
527
+
528
+ // 1.2) Initialize DFS buffer (size = L)
529
+ largebm::DFS.reserve(largebm::L);
530
+ for (unsigned int i = 0; i < largebm::L; ++i)
531
+ largebm::DFS.emplace_back(-static_cast<int>(i) - 1);
532
+
533
+ // 1.3) Build the MDD “Tree”
534
+ // Insert one dummy root node (item=0, freq=0, anct=0)
535
+ largebm::Tree.emplace_back(0, 0, 0);
536
+ for (auto &seq : largebm::items)
537
+ largebm::Build_MDD(const_cast<std::vector<int>&>(seq));
538
+ }
539
+
540
+ // 2) Rebuild inverse-dictionary from fresh item_dic
541
+ {
542
+ std::vector<int> inv(largebm::item_dic.size() + 1);
543
+ for (int old = 1; old <= static_cast<int>(largebm::item_dic.size()); ++old) {
544
+ int cid = largebm::item_dic[old - 1];
545
+ if (cid > 0) inv[cid] = old;
546
+ }
547
+ largebm::inv_item_dic = std::move(inv);
548
+ }
549
+
550
+ // 3) Start timing and run the miner
551
+ largebm::start_time = std::clock();
552
+ largebm::Freq_miner();
553
+
554
+ // 4) Collect results and elapsed time
555
+ const auto& pats = largebm::GetCollected();
556
+
557
+ py::dict out;
558
+ out["patterns"] = pats;
559
+ out["time"] = largebm::give_time(std::clock() - largebm::start_time);
560
+ return out;
561
+ },
562
+ py::arg("data"),
563
+ py::arg("minsup") = 0.01,
564
+ py::arg("time_limit") = 36000,
565
+ py::arg("preproc") = false,
566
+ py::arg("use_dic") = false,
567
+ py::arg("verbose") = false,
568
+ py::arg("out_file") = ""
569
+ );
570
+
571
+
572
+
573
+ m.def("LargeHTMiner",
574
+ [](py::object data,
575
+ double minsup,
576
+ unsigned int time_limit,
577
+ bool preproc,
578
+ bool use_dic,
579
+ bool verbose,
580
+ const std::string &out_file)
581
+ {
582
+ // 0) Set global flags and timers:
583
+ largehm::time_limit = time_limit;
584
+ largehm::pre_pro = preproc;
585
+ largehm::use_dic = use_dic;
586
+ largehm::use_list = true; // force in‐memory mode
587
+ largehm::b_disp = verbose;
588
+ largehm::b_write = !out_file.empty();
589
+ largehm::out_file = out_file;
590
+ largehm::just_build = false;
591
+
592
+ largehm::ClearCollected();
593
+ largehm::start_time = std::clock();
594
+
595
+ if (py::isinstance<py::str>(data)) {
596
+ // ───────────── FILE‐BASED MODE ─────────────
597
+ // Force mlim so that every item lands in temp_vec (never temp_lim):
598
+ largehm::mlim = UINT_MAX;
599
+
600
+ std::string path = data.cast<std::string>();
601
+ if (! largehm::Load_instance(path, minsup))
602
+ throw std::runtime_error("Failed to load file: " + path);
603
+ }
604
+ else {
605
+ // ───────────── IN‐MEMORY MODE ─────────────
606
+ auto seqs = data.cast<std::vector<std::vector<int>>>();
607
+ largehm::items = std::move(seqs);
608
+ largehm::N = largehm::items.size();
609
+
610
+ // 1) Compute L = maximum absolute item ID
611
+ int max_id = 0;
612
+ for (auto &seq : largehm::items)
613
+ for (int x : seq)
614
+ max_id = std::max(max_id, std::abs(x));
615
+ largehm::L = static_cast<unsigned int>(max_id);
616
+
617
+ // 2) Compute theta as absolute support threshold
618
+ largehm::theta = (minsup < 1.0)
619
+ ? static_cast<unsigned long long>(std::ceil(minsup * largehm::N))
620
+ : static_cast<unsigned long long>(minsup);
621
+
622
+ // 3) Initialize DFS (size = L)
623
+ largehm::DFS.clear();
624
+ largehm::DFS.reserve(largehm::L);
625
+ for (unsigned int i = 0; i < largehm::L; ++i)
626
+ largehm::DFS.emplace_back(-static_cast<int>(i) - 1);
627
+
628
+ // 4) Compute M (max sequence length) and E (total entries)
629
+ largehm::M = 0;
630
+ largehm::E = 0;
631
+ for (auto &seq : largehm::items) {
632
+ largehm::M = std::max<unsigned int>(
633
+ largehm::M, static_cast<unsigned int>(seq.size()));
634
+ largehm::E += seq.size();
635
+ }
636
+
637
+ // 5) ─── Build the MDD “manually” ───
638
+ largehm::Tree.clear();
639
+ largehm::VTree.clear();
640
+ largehm::CTree.clear();
641
+
642
+ // Insert exactly one dummy root node (chld=0, sibl=0, freq=0):
643
+ largehm::Tree.emplace_back(0,0,0);
644
+
645
+ // For each sequence “seq”, insert into MDD by placing a single −1 sentinel:
646
+ for (auto &seq : largehm::items) {
647
+ // Copy the item IDs:
648
+ std::vector<int> temp_vec = seq;
649
+ // Only a single “−1” is needed to force the suffix insertion:
650
+ std::vector<int> temp_lim(1, -1);
651
+
652
+ largehm::Build_MDD(temp_vec, temp_lim);
653
+ }
654
+
655
+
656
+ }
657
+
658
+ // 6) Run the frequency miner (Tree is now properly built):
659
+ largehm::Freq_miner();
660
+
661
+ // 7) Return results to Python:
662
+ py::dict out;
663
+ out["patterns"] = largehm::GetCollected();
664
+ out["time"] = largehm::give_time(std::clock() - largehm::start_time);
665
+ return out;
666
+ },
667
+ py::arg("data"),
668
+ py::arg("minsup") = 0.01,
669
+ py::arg("time_limit") = 36000,
670
+ py::arg("preproc") = false,
671
+ py::arg("use_dic") = false,
672
+ py::arg("verbose") = false,
673
+ py::arg("out_file") = ""
674
+ );
675
+
676
+
677
+
678
+
679
+ }