effspm 0.1.7__cp310-cp310-macosx_11_0_arm64.whl → 0.2.6__cp310-cp310-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. effspm/__init__.py +10 -2
  2. effspm/_effspm.cpp +609 -0
  3. effspm/_effspm.cpython-310-darwin.so +0 -0
  4. effspm/btminer/src/build_mdd.cpp +63 -0
  5. effspm/btminer/src/build_mdd.hpp +40 -0
  6. effspm/btminer/src/freq_miner.cpp +179 -0
  7. effspm/btminer/src/freq_miner.hpp +39 -0
  8. effspm/btminer/src/load_inst.cpp +200 -0
  9. effspm/btminer/src/load_inst.hpp +25 -0
  10. effspm/btminer/src/utility.cpp +65 -0
  11. effspm/btminer/src/utility.hpp +40 -0
  12. effspm/freq_miner.hpp +4 -1
  13. effspm/htminer/src/build_mdd.cpp +192 -0
  14. effspm/htminer/src/build_mdd.hpp +64 -0
  15. effspm/htminer/src/freq_miner.cpp +350 -0
  16. effspm/htminer/src/freq_miner.hpp +60 -0
  17. effspm/htminer/src/load_inst.cpp +394 -0
  18. effspm/htminer/src/load_inst.hpp +23 -0
  19. effspm/htminer/src/utility.cpp +72 -0
  20. effspm/htminer/src/utility.hpp +77 -0
  21. effspm/largebm/src/build_mdd.cpp +137 -0
  22. effspm/largebm/src/build_mdd.hpp +47 -0
  23. effspm/largebm/src/freq_miner.cpp +349 -0
  24. effspm/largebm/src/freq_miner.hpp +48 -0
  25. effspm/largebm/src/load_inst.cpp +230 -0
  26. effspm/largebm/src/load_inst.hpp +45 -0
  27. effspm/largebm/src/utility.cpp +45 -0
  28. effspm/largebm/src/utility.hpp +18 -0
  29. effspm/largehm/src/build_mdd.cpp +174 -0
  30. effspm/largehm/src/build_mdd.hpp +93 -0
  31. effspm/largehm/src/freq_miner.cpp +445 -0
  32. effspm/largehm/src/freq_miner.hpp +77 -0
  33. effspm/largehm/src/load_inst.cpp +357 -0
  34. effspm/largehm/src/load_inst.hpp +64 -0
  35. effspm/largehm/src/utility.cpp +38 -0
  36. effspm/largehm/src/utility.hpp +29 -0
  37. effspm/largepp/src/freq_miner.cpp +170 -0
  38. effspm/largepp/src/freq_miner.hpp +43 -0
  39. effspm/largepp/src/load_inst.cpp +219 -0
  40. effspm/largepp/src/load_inst.hpp +28 -0
  41. effspm/largepp/src/utility.cpp +34 -0
  42. effspm/largepp/src/utility.hpp +21 -0
  43. effspm/load_inst.hpp +2 -1
  44. effspm-0.2.6.dist-info/METADATA +237 -0
  45. effspm-0.2.6.dist-info/RECORD +53 -0
  46. {effspm-0.1.7.dist-info → effspm-0.2.6.dist-info}/WHEEL +1 -2
  47. effspm/_core.cpython-310-darwin.so +0 -0
  48. effspm-0.1.7.dist-info/METADATA +0 -38
  49. effspm-0.1.7.dist-info/RECORD +0 -14
  50. {effspm-0.1.7.dist-info → effspm-0.2.6.dist-info}/licenses/LICENSE +0 -0
  51. {effspm-0.1.7.dist-info → effspm-0.2.6.dist-info}/top_level.txt +0 -0
effspm/__init__.py CHANGED
@@ -1,3 +1,11 @@
1
- from ._core import PrefixProjection
2
1
 
3
- __all__ = ['PrefixProjection']
2
+ from ._effspm import PrefixProjection, HTMiner, LargeHTMiner, BTMiner, LargeBTMiner, LargePrefixProjection
3
+
4
+ __all__ = [
5
+ "PrefixProjection",
6
+ "HTMiner",
7
+ "LargeHTMiner",
8
+ "BTMiner",
9
+ "LargeBTMiner",
10
+ "LargePrefixProjection",
11
+ ]
effspm/_effspm.cpp ADDED
@@ -0,0 +1,609 @@
1
+ // _effspm.cpp
2
+
3
+ #include <pybind11/pybind11.h>
4
+ #include <pybind11/stl.h>
5
+ namespace py = pybind11;
6
+ #include <iostream>
7
+
8
+
9
+ // PrefixProjection headers
10
+ #include "freq_miner.hpp"
11
+ #include "load_inst.hpp"
12
+ #include "utility.hpp"
13
+
14
+ // BTMiner (wrapped in its own namespace in source files)
15
+ #include "btminer/src/freq_miner.hpp"
16
+ #include "btminer/src/load_inst.hpp"
17
+ #include "btminer/src/utility.hpp"
18
+ #include "btminer/src/build_mdd.hpp"
19
+
20
+ // HTMiner (wrapped in its own namespace in source files)
21
+ #include "htminer/src/build_mdd.hpp" // ← ensure HTMiner MDD builder is available
22
+ #include "htminer/src/freq_miner.hpp"
23
+ #include "htminer/src/load_inst.hpp"
24
+ #include "htminer/src/utility.hpp"
25
+
26
+
27
+ #include "largepp/src/freq_miner.hpp"
28
+ #include "largepp/src/load_inst.hpp"
29
+ #include "largepp/src/utility.hpp"
30
+
31
+ #include "largebm/src/freq_miner.hpp"
32
+ #include "largebm/src/load_inst.hpp"
33
+ #include "largebm/src/utility.hpp"
34
+ #include "largebm/src/build_mdd.hpp"
35
+
36
+ #include "largehm/src/freq_miner.hpp"
37
+ #include "largehm/src/load_inst.hpp"
38
+ #include "largehm/src/utility.hpp"
39
+ #include "largehm/src/build_mdd.hpp"
40
+
41
+
42
+
43
+ PYBIND11_MODULE(_effspm, m) {
44
+ m.doc() = "Unified SPM library: PrefixProjection, BTMiner, HTMiner";
45
+
46
+ // ─────────────────────────────────────────────────────────────
47
+ // PrefixProjection
48
+ // ─────────────────────────────────────────────────────────────
49
+ m.def("PrefixProjection",
50
+ [](py::object data,
51
+ double minsup,
52
+ unsigned int time_limit,
53
+ bool preproc,
54
+ bool use_dic,
55
+ bool verbose,
56
+ const std::string &out_file)
57
+ {
58
+ ::time_limit = time_limit;
59
+ ::pre_pro = preproc;
60
+ ::use_dic = use_dic;
61
+ ::use_list = false;
62
+ ::b_disp = verbose;
63
+ ::b_write = !out_file.empty();
64
+ ::out_file = out_file;
65
+
66
+ ClearCollected();
67
+ start_time = std::clock();
68
+
69
+ if (py::isinstance<py::str>(data)) {
70
+ std::string path = data.cast<std::string>();
71
+ if (!Load_instance(path, minsup))
72
+ throw std::runtime_error("Failed to load file: " + path);
73
+ } else {
74
+ auto seqs = data.cast<std::vector<std::vector<int>>>();
75
+ items = std::move(seqs);
76
+ N = items.size();
77
+
78
+ int max_id = 0;
79
+ for (auto &seq : items)
80
+ for (int x : seq)
81
+ max_id = std::max(max_id, std::abs(x));
82
+ L = max_id;
83
+
84
+ theta = (minsup < 1.0) ? std::ceil(minsup * N) : minsup;
85
+
86
+ DFS.clear();
87
+ DFS.reserve(L);
88
+ for (unsigned int i = 0; i < L; ++i)
89
+ DFS.emplace_back(-static_cast<int>(i) - 1);
90
+
91
+ M = 0;
92
+ E = 0;
93
+ for (auto &seq : items) {
94
+ M = std::max<unsigned int>(M, seq.size());
95
+ E += seq.size();
96
+ }
97
+ }
98
+
99
+ Freq_miner();
100
+
101
+ py::dict out;
102
+ out["patterns"] = GetCollected();
103
+ out["time"] = give_time(std::clock() - start_time);
104
+ return out;
105
+ },
106
+ py::arg("data"),
107
+ py::arg("minsup") = 0.01,
108
+ py::arg("time_limit") = 36000,
109
+ py::arg("preproc") = false,
110
+ py::arg("use_dic") = false,
111
+ py::arg("verbose") = false,
112
+ py::arg("out_file") = ""
113
+ );
114
+
115
+ // ─────────────────────────────────────────────────────────────
116
+ // BTMiner
117
+ // ─────────────────────────────────────────────────────────────
118
+ m.def("BTMiner",
119
+ [](py::object data,
120
+ double minsup,
121
+ unsigned int time_limit,
122
+ bool preproc,
123
+ bool use_dic,
124
+ bool verbose,
125
+ const std::string &out_file)
126
+ {
127
+ btminer::time_limit = time_limit;
128
+ btminer::pre_pro = preproc;
129
+ btminer::use_dic = use_dic;
130
+ btminer::use_list = false;
131
+ btminer::b_disp = verbose;
132
+ btminer::b_write = !out_file.empty();
133
+ btminer::out_file = out_file;
134
+
135
+ btminer::ClearCollected();
136
+ btminer::start_time = std::clock();
137
+
138
+ if (py::isinstance<py::str>(data)) {
139
+ std::string path = data.cast<std::string>();
140
+ if (!btminer::Load_instance(path, minsup))
141
+ throw std::runtime_error("Failed to load file: " + path);
142
+ } else {
143
+ auto seqs = data.cast<std::vector<std::vector<int>>>();
144
+ btminer::items = std::move(seqs);
145
+ btminer::N = btminer::items.size();
146
+
147
+ int max_id = 0;
148
+ for (auto &seq : btminer::items)
149
+ for (int x : seq)
150
+ max_id = std::max(max_id, std::abs(x));
151
+ btminer::L = max_id;
152
+
153
+ btminer::theta = (minsup < 1.0) ? std::ceil(minsup * btminer::N) : minsup;
154
+
155
+ btminer::DFS.clear();
156
+ btminer::DFS.reserve(btminer::L);
157
+ for (unsigned int i = 0; i < btminer::L; ++i)
158
+ btminer::DFS.emplace_back(-static_cast<int>(i) - 1);
159
+
160
+ btminer::M = 0;
161
+ btminer::E = 0;
162
+ for (auto &seq : btminer::items) {
163
+ btminer::M = std::max<unsigned int>(btminer::M, seq.size());
164
+ btminer::E += seq.size();
165
+ }
166
+ }
167
+
168
+ btminer::Freq_miner();
169
+
170
+ py::dict out;
171
+ out["patterns"] = btminer::GetCollected();
172
+ out["time"] = btminer::give_time(std::clock() - btminer::start_time);
173
+ return out;
174
+ },
175
+ py::arg("data"),
176
+ py::arg("minsup") = 0.01,
177
+ py::arg("time_limit") = 36000,
178
+ py::arg("preproc") = false,
179
+ py::arg("use_dic") = false,
180
+ py::arg("verbose") = false,
181
+ py::arg("out_file") = ""
182
+ );
183
+
184
+ // ─────────────────────────────────────────────────────────────
185
+ // HTMiner
186
+ // ─────────────────────────────────────────────────────────────
187
+ m.def("HTMiner",
188
+ [](py::object data,
189
+ double minsup, unsigned int time_limit,
190
+ bool preproc, bool use_dic,
191
+ bool verbose, const std::string &out_file)
192
+ {
193
+ // 1) set HTMiner globals (declared in htminer/src/utility.hpp)
194
+ htminer::time_limit = time_limit;
195
+ htminer::pre_pro = preproc;
196
+ htminer::use_dic = use_dic;
197
+ htminer::just_build = false; // or true if you want “build only”
198
+ htminer::use_list = false; // HTMiner always uses MDD‐based mode
199
+ htminer::b_disp = verbose;
200
+ htminer::b_write = !out_file.empty();
201
+ htminer::out_file = out_file;
202
+ htminer::ClearCollected(); // clear any leftover patterns
203
+ htminer::start_time = std::clock();
204
+
205
+ // 2) load sequences (either from filename or from Python list)
206
+ if (py::isinstance<py::str>(data)) {
207
+ std::string path = data.cast<std::string>();
208
+ if (!htminer::Load_instance(path, minsup))
209
+ throw std::runtime_error("Failed to load file: " + path);
210
+ } else {
211
+ auto seqs = data.cast<std::vector<std::vector<int>>>();
212
+ htminer::items = std::move(seqs);
213
+ htminer::N = htminer::items.size();
214
+
215
+ // compute L (max item ID), M (max sequence length), E (total entries)
216
+ int max_id = 0;
217
+ htminer::M = 0;
218
+ htminer::E = 0;
219
+ for (auto &seq : htminer::items) {
220
+ htminer::M = std::max<unsigned int>(htminer::M, seq.size());
221
+ for (int x : seq)
222
+ max_id = std::max(max_id, std::abs(x));
223
+ htminer::E += seq.size();
224
+ }
225
+ htminer::L = max_id;
226
+ htminer::theta = (minsup < 1.0)
227
+ ? static_cast<unsigned long long>(std::ceil(minsup * htminer::N))
228
+ : static_cast<unsigned long long>(minsup);
229
+
230
+ // build empty DFS stack (size L) as HTMiner expects
231
+ htminer::DFS.clear();
232
+ htminer::DFS.reserve(htminer::L);
233
+ for (unsigned int i = 0; i < static_cast<unsigned int>(htminer::L); ++i)
234
+ htminer::DFS.emplace_back(-static_cast<int>(i) - 1);
235
+
236
+ // initialize VDFS if HTMiner needs it
237
+ htminer::VDFS.clear();
238
+ htminer::VDFS.resize(htminer::L);
239
+ }
240
+
241
+ // 3) run the mining algorithm
242
+ htminer::Freq_miner();
243
+
244
+ // std::cout << "[HTMiner] dumping all collected patterns:\n";
245
+ // for (size_t i = 0; i < htminer::collectedPatterns.size(); ++i) {
246
+ // const auto &seq = htminer::collectedPatterns[i];
247
+ // std::cout << "Pattern " << i << ": { ";
248
+ // for (int x : seq) {
249
+ // std::cout << x << " ";
250
+ // }
251
+ // std::cout << "}\n";
252
+ //}
253
+ std::cout << " total patterns = "
254
+ << htminer::collectedPatterns.size() << "\n";
255
+ // ─────────────────────────────────────────────────
256
+
257
+ // 4) return patterns + elapsed time
258
+ py::dict out;
259
+ out["patterns"] = htminer::GetCollected();
260
+ out["time"] = htminer::give_time(std::clock() - htminer::start_time);
261
+ return out;
262
+ },
263
+ py::arg("data"),
264
+ py::arg("minsup") = 0.01,
265
+ py::arg("time_limit") = 36000,
266
+ py::arg("preproc") = false,
267
+ py::arg("use_dic") = false,
268
+ py::arg("verbose") = false,
269
+ py::arg("out_file") = ""
270
+ );
271
+
272
+ m.def("LargePrefixProjection",
273
+ [](py::object data,
274
+ double minsup,
275
+ unsigned int time_limit,
276
+ bool preproc,
277
+ bool use_dic,
278
+ bool verbose,
279
+ const std::string &out_file)
280
+ {
281
+ largepp::time_limit = time_limit;
282
+ largepp::pre_pro = preproc;
283
+ largepp::use_dic = use_dic;
284
+ largepp::use_list = true; // ← key difference
285
+ largepp::b_disp = verbose;
286
+ largepp::b_write = !out_file.empty();
287
+ largepp::out_file = out_file;
288
+ largepp::just_build = false;
289
+
290
+ largepp::ClearCollected();
291
+ largepp::start_time = std::clock();
292
+ std::string fname = data.cast<std::string>();
293
+ /* 1) load instance (py list or filename) */
294
+ if (py::isinstance<py::str>(data))
295
+
296
+ largepp::Load_instance(fname, minsup);
297
+ else
298
+ largepp::Load_py(data, minsup); // helper you’ll expose
299
+
300
+ std::vector<unsigned long long> dbg;
301
+
302
+
303
+
304
+
305
+
306
+
307
+ largepp::Freq_miner();
308
+
309
+ py::dict out;
310
+ out["patterns"] = largepp::GetCollected();
311
+ out["time"] = largepp::give_time(std::clock() - largepp::start_time);
312
+ return out;
313
+ },
314
+ py::arg("data"),
315
+ py::arg("minsup") = 0.01,
316
+ py::arg("time_limit") = 36000,
317
+ py::arg("preproc") = false,
318
+ py::arg("use_dic") = false,
319
+ py::arg("verbose") = false,
320
+ py::arg("out_file") = ""
321
+ );
322
+
323
+ // ─────────────────────────────────────────────────────────────
324
+ // LargeBTMiner -- Python wrapper for the largebm implementation
325
+ // ─────────────────────────────────────────────────────────────
326
+ // m.def(
327
+ // "LargeBTMiner",
328
+ // [](py::object data,
329
+ // double minsup ,
330
+ // unsigned int time_limit,
331
+ // bool preproc ,
332
+ // bool use_dic,
333
+ // bool verbose,
334
+ // const std::string &out_file )
335
+ // {
336
+ // /* 1) Global flags */
337
+ // largebm::time_limit = time_limit;
338
+ // largebm::pre_pro = preproc;
339
+ // largebm::use_dic = use_dic;
340
+ // largebm::use_list = false; // large-mode → always MDD
341
+ // largebm::just_build = false;
342
+ // largebm::b_disp = verbose;
343
+ // largebm::b_write = !out_file.empty();
344
+ // largebm::out_file = out_file;
345
+
346
+ // /* 2) Reset per-run state */
347
+ // largebm::ClearCollected();
348
+ // largebm::start_time = std::clock();
349
+
350
+ // /* 3) Load the DB (file path or in-memory list<list<int>>) */
351
+ // if (py::isinstance<py::str>(data)) {
352
+ // std::string path = data.cast<std::string>();
353
+ // if (!largebm::Load_instance(path, minsup))
354
+ // throw std::runtime_error("Failed to load file: " + path);
355
+ // } else {
356
+ // // In-memory sequences
357
+ // largebm::items = std::move(data.cast<std::vector<std::vector<int>>>());
358
+ // largebm::N = static_cast<unsigned int>(largebm::items.size());
359
+
360
+ // /* -- basic stats -- */
361
+ // int max_id = 0;
362
+ // largebm::M = 0;
363
+ // largebm::E = 0;
364
+ // for ( auto &seq : largebm::items) {
365
+ // largebm::M = std::max<unsigned int>(largebm::M,
366
+ // static_cast<unsigned int>(seq.size()));
367
+ // largebm::E += static_cast<unsigned long long>(seq.size());
368
+ // for (int x : seq) max_id = std::max(max_id, std::abs(x));
369
+ // }
370
+ // largebm::L = static_cast<unsigned int>(max_id);
371
+ // largebm::theta = (minsup < 1.0)
372
+ // ? static_cast<unsigned long long>(std::ceil(minsup * largebm::N))
373
+ // : static_cast<unsigned long long>(minsup);
374
+
375
+ // /* -- DFS buffer (size = L) -- */
376
+ // largebm::DFS.clear();
377
+ // largebm::DFS.reserve(largebm::L);
378
+ // for (unsigned int i = 0; i < largebm::L; ++i)
379
+ // largebm::DFS.emplace_back(-static_cast<int>(i) - 1);
380
+
381
+ // /* -- Build the MDD -- */
382
+ // largebm::Tree.clear();
383
+ // largebm::Tree.emplace_back(0, 0, 0); // dummy root
384
+ // for ( auto &seq : largebm::items)
385
+ // largebm::Build_MDD(seq);
386
+ // }
387
+
388
+ // /* 4) Mine and return results */
389
+ // largebm::Freq_miner();
390
+
391
+ // py::dict out;
392
+ // out["patterns"] = largebm::GetCollected();
393
+ // out["time"] = largebm::give_time(std::clock() - largebm::start_time);
394
+ // return out;
395
+ // },
396
+ // py::arg("data"),
397
+ // py::arg("minsup") = 0.01,
398
+ // py::arg("time_limit") = 36000,
399
+ // py::arg("preproc") = false,
400
+ // py::arg("use_dic") = false,
401
+ // py::arg("verbose") = false,
402
+ // py::arg("out_file") = ""
403
+ // );
404
+
405
+
406
+
407
+ m.def("LargeBTMiner",
408
+ [](py::object data,
409
+ double minsup,
410
+ unsigned int time_limit,
411
+ bool preproc,
412
+ bool use_dic,
413
+ bool verbose,
414
+ const std::string &out_file)
415
+ {
416
+ // 0) Set global flags and timers
417
+ largebm::time_limit = time_limit;
418
+ largebm::pre_pro = preproc;
419
+ largebm::use_dic = use_dic;
420
+ largebm::use_list = false; // large‑mode → always MDD
421
+ largebm::b_disp = verbose;
422
+ largebm::b_write = !out_file.empty();
423
+ largebm::out_file = out_file;
424
+ largebm::just_build = false;
425
+
426
+ // 0.1) Clear any leftover data/state from previous runs
427
+ largebm::items.clear();
428
+ largebm::item_dic.clear();
429
+ largebm::inv_item_dic.clear();
430
+ largebm::Tree.clear();
431
+ largebm::DFS.clear();
432
+ largebm::ClearCollected();
433
+
434
+ // 1) Load sequences (either from filename or from Python list)
435
+ if (py::isinstance<py::str>(data)) {
436
+ // ─────────── FILE‑BASED MODE ───────────
437
+ std::string path = data.cast<std::string>();
438
+ if (!largebm::Load_instance(path, minsup))
439
+ throw std::runtime_error("Failed to load file: " + path);
440
+
441
+ } else {
442
+ // ────────── IN‑MEMORY MODE ──────────
443
+ auto seqs = data.cast<std::vector<std::vector<int>>>();
444
+ largebm::items = std::move(seqs);
445
+ largebm::N = largebm::items.size();
446
+
447
+ // 1.1) Compute basic DB statistics (M, E, L) and absolute support θ
448
+ int max_id = 0;
449
+ largebm::M = 0;
450
+ largebm::E = 0;
451
+ for (auto &seq : largebm::items) {
452
+ largebm::M = std::max<unsigned int>(largebm::M, static_cast<unsigned int>(seq.size()));
453
+ largebm::E += static_cast<unsigned long long>(seq.size());
454
+ for (int x : seq) max_id = std::max(max_id, std::abs(x));
455
+ }
456
+ largebm::L = static_cast<unsigned int>(max_id);
457
+ largebm::theta = (minsup < 1.0)
458
+ ? static_cast<unsigned long long>(std::ceil(minsup * largebm::N))
459
+ : static_cast<unsigned long long>(minsup);
460
+
461
+ // 1.2) Initialize DFS buffer (size = L)
462
+ largebm::DFS.reserve(largebm::L);
463
+ for (unsigned int i = 0; i < largebm::L; ++i)
464
+ largebm::DFS.emplace_back(-static_cast<int>(i) - 1);
465
+
466
+ // 1.3) Build the MDD “Tree”
467
+ // Insert one dummy root node (item=0, freq=0, anct=0)
468
+ largebm::Tree.emplace_back(0, 0, 0);
469
+ for (auto &seq : largebm::items)
470
+ largebm::Build_MDD(const_cast<std::vector<int>&>(seq));
471
+ }
472
+
473
+ // 2) Rebuild inverse‑dictionary from fresh item_dic
474
+ {
475
+ std::vector<int> inv(largebm::item_dic.size() + 1);
476
+ for (int old = 1; old <= static_cast<int>(largebm::item_dic.size()); ++old) {
477
+ int cid = largebm::item_dic[old - 1];
478
+ if (cid > 0) inv[cid] = old;
479
+ }
480
+ largebm::inv_item_dic = std::move(inv);
481
+ }
482
+
483
+ // 3) Start timing and run the miner
484
+ largebm::start_time = std::clock();
485
+ largebm::Freq_miner();
486
+
487
+ // 4) Collect results and elapsed time
488
+ py::dict out;
489
+ out["patterns"] = largebm::GetCollected();
490
+ out["time"] = largebm::give_time(std::clock() - largebm::start_time);
491
+ return out;
492
+ },
493
+ py::arg("data"),
494
+ py::arg("minsup") = 0.01,
495
+ py::arg("time_limit") = 36000,
496
+ py::arg("preproc") = false,
497
+ py::arg("use_dic") = false,
498
+ py::arg("verbose") = false,
499
+ py::arg("out_file") = ""
500
+ );
501
+
502
+
503
+ m.def("LargeHTMiner",
504
+ [](py::object data,
505
+ double minsup,
506
+ unsigned int time_limit,
507
+ bool preproc,
508
+ bool use_dic,
509
+ bool verbose,
510
+ const std::string &out_file)
511
+ {
512
+ // 0) Set global flags and timers:
513
+ largehm::time_limit = time_limit;
514
+ largehm::pre_pro = preproc;
515
+ largehm::use_dic = use_dic;
516
+ largehm::use_list = true; // force in‐memory mode
517
+ largehm::b_disp = verbose;
518
+ largehm::b_write = !out_file.empty();
519
+ largehm::out_file = out_file;
520
+ largehm::just_build = false;
521
+
522
+ largehm::ClearCollected();
523
+ largehm::start_time = std::clock();
524
+
525
+ if (py::isinstance<py::str>(data)) {
526
+ // ───────────── FILE‐BASED MODE ─────────────
527
+ // Force mlim so that every item lands in temp_vec (never temp_lim):
528
+ largehm::mlim = UINT_MAX;
529
+
530
+ std::string path = data.cast<std::string>();
531
+ if (! largehm::Load_instance(path, minsup))
532
+ throw std::runtime_error("Failed to load file: " + path);
533
+ }
534
+ else {
535
+ // ───────────── IN‐MEMORY MODE ─────────────
536
+ auto seqs = data.cast<std::vector<std::vector<int>>>();
537
+ largehm::items = std::move(seqs);
538
+ largehm::N = largehm::items.size();
539
+
540
+ // 1) Compute L = maximum absolute item ID
541
+ int max_id = 0;
542
+ for (auto &seq : largehm::items)
543
+ for (int x : seq)
544
+ max_id = std::max(max_id, std::abs(x));
545
+ largehm::L = static_cast<unsigned int>(max_id);
546
+
547
+ // 2) Compute theta as absolute support threshold
548
+ largehm::theta = (minsup < 1.0)
549
+ ? static_cast<unsigned long long>(std::ceil(minsup * largehm::N))
550
+ : static_cast<unsigned long long>(minsup);
551
+
552
+ // 3) Initialize DFS (size = L)
553
+ largehm::DFS.clear();
554
+ largehm::DFS.reserve(largehm::L);
555
+ for (unsigned int i = 0; i < largehm::L; ++i)
556
+ largehm::DFS.emplace_back(-static_cast<int>(i) - 1);
557
+
558
+ // 4) Compute M (max sequence length) and E (total entries)
559
+ largehm::M = 0;
560
+ largehm::E = 0;
561
+ for (auto &seq : largehm::items) {
562
+ largehm::M = std::max<unsigned int>(
563
+ largehm::M, static_cast<unsigned int>(seq.size()));
564
+ largehm::E += seq.size();
565
+ }
566
+
567
+ // 5) ─── Build the MDD “manually” ───
568
+ largehm::Tree.clear();
569
+ largehm::VTree.clear();
570
+ largehm::CTree.clear();
571
+
572
+ // Insert exactly one dummy root node (chld=0, sibl=0, freq=0):
573
+ largehm::Tree.emplace_back(0,0,0);
574
+
575
+ // For each sequence “seq”, insert into MDD by placing a single −1 sentinel:
576
+ for (auto &seq : largehm::items) {
577
+ // Copy the item IDs:
578
+ std::vector<int> temp_vec = seq;
579
+ // Only a single “−1” is needed to force the suffix insertion:
580
+ std::vector<int> temp_lim(1, -1);
581
+
582
+ largehm::Build_MDD(temp_vec, temp_lim);
583
+ }
584
+
585
+
586
+ }
587
+
588
+ // 6) Run the frequency miner (Tree is now properly built):
589
+ largehm::Freq_miner();
590
+
591
+ // 7) Return results to Python:
592
+ py::dict out;
593
+ out["patterns"] = largehm::GetCollected();
594
+ out["time"] = largehm::give_time(std::clock() - largehm::start_time);
595
+ return out;
596
+ },
597
+ py::arg("data"),
598
+ py::arg("minsup") = 0.01,
599
+ py::arg("time_limit") = 36000,
600
+ py::arg("preproc") = false,
601
+ py::arg("use_dic") = false,
602
+ py::arg("verbose") = false,
603
+ py::arg("out_file") = ""
604
+ );
605
+
606
+
607
+
608
+
609
+ }
Binary file
@@ -0,0 +1,63 @@
1
+ #include <vector>
2
+ #include <iostream>
3
+ #include <unordered_map>
4
+ #include "load_inst.hpp"
5
+ #include "build_mdd.hpp"
6
+ #include "freq_miner.hpp"
7
+ #include "utility.hpp"
8
+
9
+ namespace btminer {
10
+
11
+ int Add_arc(int item, int last_arc, int& itmset, std::unordered_map<int, int>& ancest_map);
12
+ std::vector<Arc> Tree;
13
+
14
+ void Build_MDD(std::vector<int>& items) {
15
+ std::unordered_map<int, int> ancest_map;
16
+ int last_arc = 0, itmset = 0;
17
+ for (auto it = items.begin(); it != items.end(); ++it)
18
+ last_arc = Add_arc(*it, last_arc, itmset, ancest_map);
19
+ }
20
+
21
+ int Add_arc(int item, int last_arc, int& itmset, std::unordered_map<int, int>& ancest_map) {
22
+ int anct;
23
+ auto p = ancest_map.find(abs(item));
24
+ if (p == ancest_map.end())
25
+ anct = 0;
26
+ else
27
+ anct = p->second;
28
+
29
+ if (item < 0)
30
+ ++itmset;
31
+
32
+ int last_sibl = Tree[last_arc].chld;
33
+
34
+ if (last_sibl == -1) {
35
+ Tree.emplace_back(item, itmset, anct);
36
+ last_sibl = Tree.size() - 1;
37
+ Tree[last_arc].chld = last_sibl;
38
+ if (anct == 0)
39
+ DFS[abs(item) - 1].str_pnt.push_back(last_sibl);
40
+ } else {
41
+ while (Tree[last_sibl].item != item) {
42
+ if (Tree[last_sibl].sibl == -1) {
43
+ Tree.emplace_back(item, itmset, anct);
44
+ Tree[last_sibl].sibl = Tree.size() - 1;
45
+ last_sibl = Tree.size() - 1;
46
+ if (anct == 0)
47
+ DFS[abs(item) - 1].str_pnt.push_back(last_sibl);
48
+ break;
49
+ }
50
+ last_sibl = Tree[last_sibl].sibl;
51
+ }
52
+ }
53
+
54
+ if (anct == 0)
55
+ ++DFS[abs(item) - 1].freq;
56
+
57
+ ++Tree[last_sibl].freq;
58
+ ancest_map[abs(item)] = last_sibl;
59
+
60
+ return last_sibl;
61
+ }
62
+
63
+ } // namespace btminer