effspm 0.2.2__tar.gz → 0.2.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- effspm-0.2.6/MANIFEST.in +5 -0
- {effspm-0.2.2/effspm.egg-info → effspm-0.2.6}/PKG-INFO +1 -1
- effspm-0.2.6/effspm/__init__.py +11 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm/_effspm.cpp +90 -58
- {effspm-0.2.2 → effspm-0.2.6}/effspm/freq_miner.hpp +4 -1
- {effspm-0.2.2 → effspm-0.2.6}/effspm/htminer/src/load_inst.cpp +2 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm/largebm/src/freq_miner.cpp +13 -6
- {effspm-0.2.2 → effspm-0.2.6}/effspm/largebm/src/load_inst.cpp +72 -77
- {effspm-0.2.2 → effspm-0.2.6}/effspm/largehm/src/build_mdd.cpp +10 -9
- {effspm-0.2.2 → effspm-0.2.6}/effspm/largehm/src/freq_miner.cpp +15 -11
- {effspm-0.2.2 → effspm-0.2.6}/effspm/largehm/src/freq_miner.hpp +7 -7
- {effspm-0.2.2 → effspm-0.2.6}/effspm/largepp/src/utility.cpp +1 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm/largepp/src/utility.hpp +1 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm/load_inst.hpp +2 -1
- {effspm-0.2.2 → effspm-0.2.6/effspm.egg-info}/PKG-INFO +1 -1
- {effspm-0.2.2 → effspm-0.2.6}/effspm.egg-info/SOURCES.txt +0 -6
- {effspm-0.2.2 → effspm-0.2.6}/pyproject.toml +1 -2
- {effspm-0.2.2 → effspm-0.2.6}/setup.py +1 -1
- effspm-0.2.6/tests/test.py +22 -0
- effspm-0.2.6/tests/test_basic.py +19 -0
- effspm-0.2.2/MANIFEST.in +0 -12
- effspm-0.2.2/effspm/__init__.py +0 -3
- effspm-0.2.2/effspm/btminer/src/main.cpp +0 -92
- effspm-0.2.2/effspm/htminer/src/main.cpp +0 -96
- effspm-0.2.2/effspm/largebm/src/main.cpp +0 -95
- effspm-0.2.2/effspm/largehm/src/main.cpp +0 -95
- effspm-0.2.2/effspm/largepp/src/main.cpp +0 -108
- effspm-0.2.2/effspm/main.cpp +0 -103
- effspm-0.2.2/tests/test.py +0 -31
- effspm-0.2.2/tests/test_basic.py +0 -37
- {effspm-0.2.2 → effspm-0.2.6}/LICENSE +0 -0
- {effspm-0.2.2 → effspm-0.2.6}/README.md +0 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm/_core.cpp +0 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm/btminer/src/build_mdd.cpp +0 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm/btminer/src/build_mdd.hpp +0 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm/btminer/src/freq_miner.cpp +0 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm/btminer/src/freq_miner.hpp +0 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm/btminer/src/load_inst.cpp +0 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm/btminer/src/load_inst.hpp +0 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm/btminer/src/utility.cpp +0 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm/btminer/src/utility.hpp +0 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm/freq_miner.cpp +0 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm/htminer/src/build_mdd.cpp +0 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm/htminer/src/build_mdd.hpp +0 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm/htminer/src/freq_miner.cpp +0 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm/htminer/src/freq_miner.hpp +0 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm/htminer/src/load_inst.hpp +0 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm/htminer/src/utility.cpp +0 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm/htminer/src/utility.hpp +0 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm/largebm/src/build_mdd.cpp +0 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm/largebm/src/build_mdd.hpp +0 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm/largebm/src/freq_miner.hpp +0 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm/largebm/src/load_inst.hpp +0 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm/largebm/src/utility.cpp +0 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm/largebm/src/utility.hpp +0 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm/largehm/src/build_mdd.hpp +0 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm/largehm/src/load_inst.cpp +0 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm/largehm/src/load_inst.hpp +0 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm/largehm/src/utility.cpp +0 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm/largehm/src/utility.hpp +0 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm/largepp/src/freq_miner.cpp +0 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm/largepp/src/freq_miner.hpp +0 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm/largepp/src/load_inst.cpp +0 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm/largepp/src/load_inst.hpp +0 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm/load_inst.cpp +0 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm/utility.cpp +0 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm/utility.hpp +0 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm.egg-info/dependency_links.txt +0 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm.egg-info/not-zip-safe +0 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm.egg-info/requires.txt +0 -0
- {effspm-0.2.2 → effspm-0.2.6}/effspm.egg-info/top_level.txt +0 -0
- {effspm-0.2.2 → effspm-0.2.6}/setup.cfg +0 -0
effspm-0.2.6/MANIFEST.in
ADDED
|
@@ -403,71 +403,103 @@ std::cout << " total patterns = "
|
|
|
403
403
|
// );
|
|
404
404
|
|
|
405
405
|
|
|
406
|
+
|
|
406
407
|
m.def("LargeBTMiner",
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
{
|
|
415
|
-
largebm::time_limit = time_limit;
|
|
416
|
-
largebm::pre_pro = preproc;
|
|
417
|
-
largebm::use_dic = use_dic;
|
|
418
|
-
largebm::use_list = false; // <-- switch into “large” mode
|
|
419
|
-
largebm::b_disp = verbose;
|
|
420
|
-
largebm::b_write = !out_file.empty();
|
|
421
|
-
largebm::out_file = out_file;
|
|
422
|
-
largebm::just_build = false;
|
|
423
|
-
|
|
424
|
-
// ── Build the inverse‐dictionary here ────────────────────────────
|
|
408
|
+
[](py::object data,
|
|
409
|
+
double minsup,
|
|
410
|
+
unsigned int time_limit,
|
|
411
|
+
bool preproc,
|
|
412
|
+
bool use_dic,
|
|
413
|
+
bool verbose,
|
|
414
|
+
const std::string &out_file)
|
|
425
415
|
{
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
largebm::
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
416
|
+
// 0) Set global flags and timers
|
|
417
|
+
largebm::time_limit = time_limit;
|
|
418
|
+
largebm::pre_pro = preproc;
|
|
419
|
+
largebm::use_dic = use_dic;
|
|
420
|
+
largebm::use_list = false; // large‑mode → always MDD
|
|
421
|
+
largebm::b_disp = verbose;
|
|
422
|
+
largebm::b_write = !out_file.empty();
|
|
423
|
+
largebm::out_file = out_file;
|
|
424
|
+
largebm::just_build = false;
|
|
425
|
+
|
|
426
|
+
// 0.1) Clear any leftover data/state from previous runs
|
|
427
|
+
largebm::items.clear();
|
|
428
|
+
largebm::item_dic.clear();
|
|
429
|
+
largebm::inv_item_dic.clear();
|
|
430
|
+
largebm::Tree.clear();
|
|
431
|
+
largebm::DFS.clear();
|
|
432
|
+
largebm::ClearCollected();
|
|
433
|
+
|
|
434
|
+
// 1) Load sequences (either from filename or from Python list)
|
|
435
|
+
if (py::isinstance<py::str>(data)) {
|
|
436
|
+
// ─────────── FILE‑BASED MODE ───────────
|
|
437
|
+
std::string path = data.cast<std::string>();
|
|
438
|
+
if (!largebm::Load_instance(path, minsup))
|
|
439
|
+
throw std::runtime_error("Failed to load file: " + path);
|
|
439
440
|
|
|
440
|
-
|
|
441
|
-
|
|
441
|
+
} else {
|
|
442
|
+
// ────────── IN‑MEMORY MODE ──────────
|
|
443
|
+
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
444
|
+
largebm::items = std::move(seqs);
|
|
445
|
+
largebm::N = largebm::items.size();
|
|
442
446
|
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
447
|
+
// 1.1) Compute basic DB statistics (M, E, L) and absolute support θ
|
|
448
|
+
int max_id = 0;
|
|
449
|
+
largebm::M = 0;
|
|
450
|
+
largebm::E = 0;
|
|
451
|
+
for (auto &seq : largebm::items) {
|
|
452
|
+
largebm::M = std::max<unsigned int>(largebm::M, static_cast<unsigned int>(seq.size()));
|
|
453
|
+
largebm::E += static_cast<unsigned long long>(seq.size());
|
|
454
|
+
for (int x : seq) max_id = std::max(max_id, std::abs(x));
|
|
455
|
+
}
|
|
456
|
+
largebm::L = static_cast<unsigned int>(max_id);
|
|
457
|
+
largebm::theta = (minsup < 1.0)
|
|
458
|
+
? static_cast<unsigned long long>(std::ceil(minsup * largebm::N))
|
|
459
|
+
: static_cast<unsigned long long>(minsup);
|
|
460
|
+
|
|
461
|
+
// 1.2) Initialize DFS buffer (size = L)
|
|
462
|
+
largebm::DFS.reserve(largebm::L);
|
|
463
|
+
for (unsigned int i = 0; i < largebm::L; ++i)
|
|
464
|
+
largebm::DFS.emplace_back(-static_cast<int>(i) - 1);
|
|
465
|
+
|
|
466
|
+
// 1.3) Build the MDD “Tree”
|
|
467
|
+
// Insert one dummy root node (item=0, freq=0, anct=0)
|
|
468
|
+
largebm::Tree.emplace_back(0, 0, 0);
|
|
469
|
+
for (auto &seq : largebm::items)
|
|
470
|
+
largebm::Build_MDD(const_cast<std::vector<int>&>(seq));
|
|
471
|
+
}
|
|
454
472
|
|
|
455
|
-
|
|
473
|
+
// 2) Rebuild inverse‑dictionary from fresh item_dic
|
|
474
|
+
{
|
|
475
|
+
std::vector<int> inv(largebm::item_dic.size() + 1);
|
|
476
|
+
for (int old = 1; old <= static_cast<int>(largebm::item_dic.size()); ++old) {
|
|
477
|
+
int cid = largebm::item_dic[old - 1];
|
|
478
|
+
if (cid > 0) inv[cid] = old;
|
|
479
|
+
}
|
|
480
|
+
largebm::inv_item_dic = std::move(inv);
|
|
481
|
+
}
|
|
456
482
|
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
)
|
|
483
|
+
// 3) Start timing and run the miner
|
|
484
|
+
largebm::start_time = std::clock();
|
|
485
|
+
largebm::Freq_miner();
|
|
486
|
+
|
|
487
|
+
// 4) Collect results and elapsed time
|
|
488
|
+
py::dict out;
|
|
489
|
+
out["patterns"] = largebm::GetCollected();
|
|
490
|
+
out["time"] = largebm::give_time(std::clock() - largebm::start_time);
|
|
491
|
+
return out;
|
|
492
|
+
},
|
|
493
|
+
py::arg("data"),
|
|
494
|
+
py::arg("minsup") = 0.01,
|
|
495
|
+
py::arg("time_limit") = 36000,
|
|
496
|
+
py::arg("preproc") = false,
|
|
497
|
+
py::arg("use_dic") = false,
|
|
498
|
+
py::arg("verbose") = false,
|
|
499
|
+
py::arg("out_file") = ""
|
|
500
|
+
);
|
|
470
501
|
|
|
502
|
+
|
|
471
503
|
m.def("LargeHTMiner",
|
|
472
504
|
[](py::object data,
|
|
473
505
|
double minsup,
|
|
@@ -3,6 +3,8 @@
|
|
|
3
3
|
#include "load_inst.hpp"
|
|
4
4
|
#include <cstdlib>
|
|
5
5
|
#include <cmath>
|
|
6
|
+
#include <cstddef> // for std::size_t
|
|
7
|
+
|
|
6
8
|
using namespace std;
|
|
7
9
|
void Freq_miner();
|
|
8
10
|
void Out_patt(std::vector<int>& seq, unsigned int freq);
|
|
@@ -22,7 +24,8 @@ public:
|
|
|
22
24
|
|
|
23
25
|
Pattern(vector<int>& _seq, int item) {
|
|
24
26
|
seq.reserve(_seq.size());
|
|
25
|
-
for (
|
|
27
|
+
for (std::size_t i = 0; i < _seq.size(); ++i)
|
|
28
|
+
|
|
26
29
|
seq.push_back(_seq[i]);
|
|
27
30
|
seq.push_back(item);
|
|
28
31
|
freq = 0;
|
|
@@ -106,6 +106,8 @@ bool Load_instance(std::string& items_file, double thresh) {
|
|
|
106
106
|
// << " M=" << M << " L=" << L << " E=" << E << std::endl;
|
|
107
107
|
}
|
|
108
108
|
|
|
109
|
+
|
|
110
|
+
|
|
109
111
|
std::cout << "\nMDD Database built in " << give_time(std::clock() - kk) << " seconds\n\n";
|
|
110
112
|
std::cout << "Found " << N << " sequence, with max line len " << M
|
|
111
113
|
<< ", and " << L << " items, and " << E << " enteries\n";
|
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
// File: effspm/largebm/src/freq_miner.cpp
|
|
2
|
-
|
|
3
1
|
#include <vector>
|
|
4
2
|
#include <algorithm>
|
|
5
3
|
#include <iostream>
|
|
@@ -27,16 +25,25 @@ namespace largebm {
|
|
|
27
25
|
Pattern _patt;
|
|
28
26
|
|
|
29
27
|
void Freq_miner() {
|
|
28
|
+
// ─── RESET per‐run state ──────────────────────────────────────
|
|
29
|
+
collected.clear();
|
|
30
|
+
num_patt = 0;
|
|
31
|
+
// Ensure DFS has at least L entries (so DFS[i] is valid for 0..L-1)
|
|
32
|
+
if (static_cast<int>(DFS.size()) < static_cast<int>(L)) {
|
|
33
|
+
DFS.resize(L);
|
|
34
|
+
}
|
|
35
|
+
// ─────────────────────────────────────────────────────────────
|
|
36
|
+
|
|
30
37
|
std::vector<int> list;
|
|
31
38
|
|
|
32
39
|
if (use_list) {
|
|
33
|
-
// List
|
|
40
|
+
// List‐based routine
|
|
34
41
|
std::vector<int> empty_pref;
|
|
35
42
|
Freq_miner_list(items, empty_pref, theta, collected);
|
|
36
43
|
return;
|
|
37
44
|
}
|
|
38
45
|
|
|
39
|
-
// MDD
|
|
46
|
+
// MDD‐based initialization
|
|
40
47
|
for (int i = 0; i < static_cast<int>(L); ++i) {
|
|
41
48
|
if (DFS[i].freq >= theta) {
|
|
42
49
|
list.push_back(-i - 1);
|
|
@@ -301,13 +308,13 @@ namespace largebm {
|
|
|
301
308
|
return std::abs(a.first) < std::abs(b.first);
|
|
302
309
|
});
|
|
303
310
|
|
|
304
|
-
// 4) depth
|
|
311
|
+
// 4) depth‐first enumerate them
|
|
305
312
|
for (auto const& pr : cand) {
|
|
306
313
|
int item = pr.first;
|
|
307
314
|
prefix.push_back(item);
|
|
308
315
|
|
|
309
316
|
if (use_dic) {
|
|
310
|
-
// “un
|
|
317
|
+
// “un‐compress” each pattern back to original IDs
|
|
311
318
|
std::vector<int> unmapped;
|
|
312
319
|
unmapped.reserve(prefix.size());
|
|
313
320
|
for (int cid : prefix) {
|
|
@@ -1,16 +1,17 @@
|
|
|
1
|
-
|
|
1
|
+
|
|
2
2
|
#include <sstream>
|
|
3
3
|
#include <algorithm>
|
|
4
4
|
#include <cmath>
|
|
5
5
|
#include <ctime>
|
|
6
|
+
#include <iostream> // for std::cout, std::endl
|
|
6
7
|
#include <fstream>
|
|
7
8
|
#include <vector>
|
|
8
9
|
#include <string>
|
|
9
10
|
|
|
10
|
-
#include "load_inst.hpp"
|
|
11
|
-
#include "build_mdd.hpp"
|
|
12
|
-
#include "utility.hpp"
|
|
13
|
-
#include "freq_miner.hpp"
|
|
11
|
+
#include "load_inst.hpp"
|
|
12
|
+
#include "build_mdd.hpp"
|
|
13
|
+
#include "utility.hpp"
|
|
14
|
+
#include "freq_miner.hpp"
|
|
14
15
|
|
|
15
16
|
namespace largebm {
|
|
16
17
|
|
|
@@ -29,20 +30,16 @@ std::clock_t start_time = 0;
|
|
|
29
30
|
|
|
30
31
|
std::vector<int> item_dic;
|
|
31
32
|
std::vector<Pattern> DFS;
|
|
32
|
-
std::vector<std::vector<int>> items;
|
|
33
|
-
std::vector<std::vector<int>> collected;
|
|
33
|
+
std::vector<std::vector<int>> items;
|
|
34
|
+
std::vector<std::vector<int>> collected;
|
|
34
35
|
std::vector<int> inv_item_dic;
|
|
36
|
+
|
|
35
37
|
std::string out_file, folder;
|
|
36
|
-
void ClearCollected() { collected.clear(); }
|
|
37
|
-
const std::vector<std::vector<int>>& GetCollected() { return collected; }
|
|
38
38
|
|
|
39
39
|
// ───────────── helper for list‐mode DB build ─────────────────────
|
|
40
|
-
static void Load_items_list(const std::string& fname)
|
|
41
|
-
{
|
|
40
|
+
static void Load_items_list(const std::string& fname) {
|
|
42
41
|
std::ifstream in(fname);
|
|
43
|
-
if (!in.good())
|
|
44
|
-
return;
|
|
45
|
-
}
|
|
42
|
+
if (!in.good()) return;
|
|
46
43
|
std::string line;
|
|
47
44
|
while (std::getline(in, line)) {
|
|
48
45
|
std::istringstream iss(line);
|
|
@@ -50,39 +47,39 @@ static void Load_items_list(const std::string& fname)
|
|
|
50
47
|
int x;
|
|
51
48
|
while (iss >> x) {
|
|
52
49
|
int a = std::abs(x);
|
|
53
|
-
if (a < 1 || a > static_cast<int>(item_dic.size()))
|
|
54
|
-
|
|
55
|
-
}
|
|
56
|
-
if (item_dic[a - 1] == -1) {
|
|
57
|
-
continue;
|
|
58
|
-
}
|
|
50
|
+
if (a < 1 || a > static_cast<int>(item_dic.size())) continue;
|
|
51
|
+
if (item_dic[a - 1] == -1) continue;
|
|
59
52
|
seq.push_back(x);
|
|
60
53
|
}
|
|
61
|
-
if (!seq.empty())
|
|
62
|
-
items.push_back(std::move(seq));
|
|
63
|
-
}
|
|
54
|
+
if (!seq.empty()) items.push_back(std::move(seq));
|
|
64
55
|
}
|
|
65
56
|
}
|
|
66
57
|
|
|
67
58
|
// ─────────────── main loader ─────────────────────────────────────
|
|
68
|
-
bool Load_instance(const std::string& items_file, double minsup)
|
|
69
|
-
|
|
59
|
+
bool Load_instance(const std::string& items_file, double minsup) {
|
|
60
|
+
// reset state
|
|
61
|
+
N = L = num_nodes = theta = M = E = 0;
|
|
70
62
|
start_time = std::clock();
|
|
71
63
|
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
64
|
+
DFS.clear();
|
|
65
|
+
Tree.clear();
|
|
66
|
+
items.clear();
|
|
67
|
+
collected.clear();
|
|
68
|
+
item_dic.clear();
|
|
69
|
+
inv_item_dic.clear();
|
|
70
|
+
itmset_exists = false;
|
|
71
|
+
|
|
72
|
+
std::clock_t kk = start_time;
|
|
73
|
+
Tree.emplace_back(0, 0, 0); // root
|
|
77
74
|
|
|
75
|
+
if (use_list) {
|
|
76
|
+
if (!Preprocess(items_file, minsup)) return false;
|
|
78
77
|
inv_item_dic.assign(L + 1, 0);
|
|
79
78
|
for (int old = 1; old <= static_cast<int>(item_dic.size()); ++old) {
|
|
80
79
|
int cid = item_dic[old - 1];
|
|
81
80
|
if (cid > 0) inv_item_dic[cid] = old;
|
|
82
81
|
}
|
|
83
|
-
|
|
84
82
|
Load_items_list(items_file);
|
|
85
|
-
|
|
86
83
|
N = items.size();
|
|
87
84
|
theta = (minsup < 1.0)
|
|
88
85
|
? static_cast<unsigned long long>(std::ceil(minsup * N))
|
|
@@ -90,29 +87,39 @@ bool Load_instance(const std::string& items_file, double minsup)
|
|
|
90
87
|
return true;
|
|
91
88
|
}
|
|
92
89
|
|
|
93
|
-
|
|
94
|
-
Tree.clear();
|
|
95
|
-
Tree.emplace_back(0, 0, 0); // root node
|
|
96
|
-
|
|
90
|
+
// MDD build mode
|
|
97
91
|
if (pre_pro) {
|
|
98
|
-
if (!Preprocess(items_file, minsup))
|
|
99
|
-
|
|
100
|
-
|
|
92
|
+
if (!Preprocess(items_file, minsup)) return false;
|
|
93
|
+
std::cout << "\nPreprocess done in "
|
|
94
|
+
<< give_time(std::clock() - kk)
|
|
95
|
+
<< " seconds\n\n";
|
|
96
|
+
DFS.clear();
|
|
97
|
+
DFS.reserve(L);
|
|
98
|
+
for (unsigned int i = 0; i < L; ++i)
|
|
99
|
+
DFS.emplace_back(-int(i) - 1);
|
|
100
|
+
kk = std::clock();
|
|
101
101
|
Load_items_pre(items_file);
|
|
102
102
|
} else {
|
|
103
|
+
if (!Preprocess(items_file, 0.0)) return false;
|
|
104
|
+
kk = std::clock();
|
|
103
105
|
Load_items(items_file);
|
|
104
106
|
}
|
|
105
107
|
|
|
108
|
+
std::cout << "\nMDD Database built in "
|
|
109
|
+
<< give_time(std::clock() - kk)
|
|
110
|
+
<< " seconds\n\n";
|
|
111
|
+
std::cout << "Found " << N
|
|
112
|
+
<< " sequences, with max line len " << M
|
|
113
|
+
<< ", and " << L << " items, and " << E << " entries\n";
|
|
114
|
+
std::cout << "Total MDD nodes: " << Tree.size() << std::endl;
|
|
115
|
+
|
|
106
116
|
return true;
|
|
107
117
|
}
|
|
108
118
|
|
|
109
119
|
// ────────────── Preprocess (list mode) ───────────────────────────
|
|
110
|
-
bool Preprocess(const std::string& inst, double thresh)
|
|
111
|
-
{
|
|
120
|
+
bool Preprocess(const std::string& inst, double thresh) {
|
|
112
121
|
std::ifstream file(inst);
|
|
113
|
-
if (!file.good())
|
|
114
|
-
return false;
|
|
115
|
-
}
|
|
122
|
+
if (!file.good()) return false;
|
|
116
123
|
|
|
117
124
|
std::vector<unsigned long long> freq(1000000);
|
|
118
125
|
std::vector<unsigned long long> counted(1000000, 0);
|
|
@@ -154,9 +161,7 @@ bool Preprocess(const std::string& inst, double thresh)
|
|
|
154
161
|
// Load_items_pre: MDD insert from file
|
|
155
162
|
void Load_items_pre(const std::string& inst_name) {
|
|
156
163
|
std::ifstream file(inst_name);
|
|
157
|
-
if (!file.good())
|
|
158
|
-
return;
|
|
159
|
-
}
|
|
164
|
+
if (!file.good()) return;
|
|
160
165
|
|
|
161
166
|
std::string line;
|
|
162
167
|
while (std::getline(file, line)) {
|
|
@@ -166,11 +171,7 @@ void Load_items_pre(const std::string& inst_name) {
|
|
|
166
171
|
bool sgn = false;
|
|
167
172
|
while (word >> itm) {
|
|
168
173
|
int ditem;
|
|
169
|
-
try {
|
|
170
|
-
ditem = std::stoi(itm);
|
|
171
|
-
} catch (...) {
|
|
172
|
-
continue;
|
|
173
|
-
}
|
|
174
|
+
try { ditem = std::stoi(itm); } catch (...) { continue; }
|
|
174
175
|
int absidx = std::abs(ditem) - 1;
|
|
175
176
|
if (absidx < 0 || absidx >= static_cast<int>(item_dic.size())) {
|
|
176
177
|
if (!sgn && ditem < 0) sgn = true;
|
|
@@ -180,23 +181,13 @@ void Load_items_pre(const std::string& inst_name) {
|
|
|
180
181
|
if (!sgn && ditem < 0) sgn = true;
|
|
181
182
|
continue;
|
|
182
183
|
}
|
|
183
|
-
if (ditem > 0) {
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
} else {
|
|
187
|
-
ditem = -item_dic[-ditem - 1];
|
|
188
|
-
}
|
|
189
|
-
if (sgn) {
|
|
190
|
-
if (ditem > 0) ditem = -ditem;
|
|
191
|
-
sgn = false;
|
|
192
|
-
}
|
|
184
|
+
if (ditem > 0) { ditem = item_dic[ditem - 1]; itmset_exists = true; }
|
|
185
|
+
else { ditem = -item_dic[-ditem - 1]; }
|
|
186
|
+
if (sgn) { if (ditem > 0) ditem = -ditem; sgn = false; }
|
|
193
187
|
temp_vec.push_back(ditem);
|
|
194
188
|
}
|
|
195
|
-
if (temp_vec.empty())
|
|
196
|
-
|
|
197
|
-
}
|
|
198
|
-
|
|
199
|
-
N++;
|
|
189
|
+
if (temp_vec.empty()) continue;
|
|
190
|
+
++N;
|
|
200
191
|
M = std::max<unsigned>(M, temp_vec.size());
|
|
201
192
|
Build_MDD(temp_vec);
|
|
202
193
|
}
|
|
@@ -205,9 +196,7 @@ void Load_items_pre(const std::string& inst_name) {
|
|
|
205
196
|
// Load_items: full MDD build
|
|
206
197
|
bool Load_items(const std::string& inst_name) {
|
|
207
198
|
std::ifstream file(inst_name);
|
|
208
|
-
if (!file.good())
|
|
209
|
-
return false;
|
|
210
|
-
}
|
|
199
|
+
if (!file.good()) return false;
|
|
211
200
|
|
|
212
201
|
std::string line;
|
|
213
202
|
while (std::getline(file, line)) {
|
|
@@ -217,19 +206,25 @@ bool Load_items(const std::string& inst_name) {
|
|
|
217
206
|
std::vector<int> temp_vec;
|
|
218
207
|
while (word >> itm) {
|
|
219
208
|
int ditem;
|
|
220
|
-
try {
|
|
221
|
-
ditem = std::stoi(itm);
|
|
222
|
-
} catch (...) {
|
|
223
|
-
continue;
|
|
224
|
-
}
|
|
209
|
+
try { ditem = std::stoi(itm); } catch (...) { continue; }
|
|
225
210
|
if (ditem > 0) itmset_exists = true;
|
|
226
|
-
|
|
211
|
+
unsigned int ad = static_cast<unsigned int>(std::abs(ditem));
|
|
212
|
+
if (L < ad) {
|
|
213
|
+
L = ad;
|
|
214
|
+
DFS.reserve(L);
|
|
215
|
+
while (DFS.size() < L)
|
|
216
|
+
DFS.emplace_back(-int(DFS.size()) - 1);
|
|
217
|
+
}
|
|
227
218
|
temp_vec.push_back(ditem);
|
|
228
219
|
}
|
|
229
|
-
|
|
220
|
+
if (temp_vec.size() > M) M = temp_vec.size();
|
|
230
221
|
Build_MDD(temp_vec);
|
|
231
222
|
}
|
|
232
223
|
return true;
|
|
233
224
|
}
|
|
234
225
|
|
|
226
|
+
void ClearCollected() { collected.clear(); }
|
|
227
|
+
const std::vector<std::vector<int>>& GetCollected() { return collected; }
|
|
228
|
+
|
|
235
229
|
} // namespace largebm
|
|
230
|
+
|
|
@@ -11,6 +11,7 @@ std::vector<largehm::CArc> largehm::CTree;
|
|
|
11
11
|
#include <iostream>
|
|
12
12
|
#include <cmath> // for std::abs
|
|
13
13
|
#include <unordered_map>
|
|
14
|
+
#include <cstdint> // for std::uint64_t
|
|
14
15
|
#include "load_inst.hpp"
|
|
15
16
|
#include "freq_miner.hpp"
|
|
16
17
|
#include "utility.hpp"
|
|
@@ -23,8 +24,8 @@ namespace largehm {
|
|
|
23
24
|
void Build_MDD(std::vector<int>& items, std::vector<int>& items_lim) {
|
|
24
25
|
// SANITY CHECK: show sizes before building
|
|
25
26
|
|
|
26
|
-
std::unordered_map<int,
|
|
27
|
-
|
|
27
|
+
std::unordered_map<int, std::uint64_t> ancest_map;
|
|
28
|
+
std::uint64_t last_arc = 0;
|
|
28
29
|
int itmset = 0;
|
|
29
30
|
|
|
30
31
|
// Insert each prefix item as an arc
|
|
@@ -43,9 +44,9 @@ void Build_MDD(std::vector<int>& items, std::vector<int>& items_lim) {
|
|
|
43
44
|
// ─── Add_arc: insert a single “item” into the MDD under parent last_arc. ──────
|
|
44
45
|
//
|
|
45
46
|
int Add_arc(int item,
|
|
46
|
-
|
|
47
|
+
std::uint64_t last_arc,
|
|
47
48
|
int& itmset,
|
|
48
|
-
std::unordered_map<int,
|
|
49
|
+
std::unordered_map<int, std::uint64_t>& ancest_map)
|
|
49
50
|
{
|
|
50
51
|
// Ensure DFS is at least size |item|
|
|
51
52
|
size_t needed = static_cast<size_t>(std::abs(item));
|
|
@@ -67,7 +68,7 @@ int Add_arc(int item,
|
|
|
67
68
|
++itmset;
|
|
68
69
|
}
|
|
69
70
|
|
|
70
|
-
|
|
71
|
+
std::uint64_t last_sibl = Tree[last_arc].chld;
|
|
71
72
|
if (last_sibl == 0) {
|
|
72
73
|
// No child yet: create a new Arc
|
|
73
74
|
Tree.emplace_back(item, itmset, anct);
|
|
@@ -98,7 +99,7 @@ int Add_arc(int item,
|
|
|
98
99
|
}
|
|
99
100
|
++Tree[last_sibl].freq;
|
|
100
101
|
ancest_map[std::abs(item)] = last_sibl;
|
|
101
|
-
return last_sibl;
|
|
102
|
+
return static_cast<int>(last_sibl);
|
|
102
103
|
}
|
|
103
104
|
|
|
104
105
|
|
|
@@ -106,8 +107,8 @@ int Add_arc(int item,
|
|
|
106
107
|
// ─── Add_vec: attach the “items_lim” vector as children/vertical arcs ─────────
|
|
107
108
|
//
|
|
108
109
|
void Add_vec(std::vector<int>& items_lim,
|
|
109
|
-
std::unordered_map<int,
|
|
110
|
-
|
|
110
|
+
std::unordered_map<int, std::uint64_t>& ancest_map,
|
|
111
|
+
std::uint64_t last_arc,
|
|
111
112
|
int itmset)
|
|
112
113
|
{
|
|
113
114
|
// Ensure VDFS and DFS are at least size L
|
|
@@ -131,7 +132,7 @@ void Add_vec(std::vector<int>& items_lim,
|
|
|
131
132
|
|
|
132
133
|
// If this node has positive itmset (>0) or no CTree child yet, create first child entry
|
|
133
134
|
if (Tree[last_arc].itmset > 0 || Tree[last_arc].chld == 0) {
|
|
134
|
-
std::vector<
|
|
135
|
+
std::vector<std::uint64_t> ancest(L + 1, 0ULL);
|
|
135
136
|
for (auto& kv : ancest_map) {
|
|
136
137
|
ancest[kv.first - 1] = kv.second;
|
|
137
138
|
counted[kv.first - 1] = true;
|
|
@@ -1,12 +1,16 @@
|
|
|
1
|
+
|
|
2
|
+
#include <cstdint>
|
|
3
|
+
#include <vector>
|
|
4
|
+
|
|
1
5
|
#include <iostream>
|
|
2
6
|
#include <time.h>
|
|
3
|
-
|
|
7
|
+
// for std::vector
|
|
4
8
|
#include <cmath> // for std::ceil
|
|
5
9
|
|
|
6
10
|
#include "freq_miner.hpp"
|
|
7
11
|
#include "build_mdd.hpp"
|
|
8
12
|
#include "utility.hpp"
|
|
9
|
-
|
|
13
|
+
std::vector<std::uint64_t> ancest_base;
|
|
10
14
|
namespace largehm {
|
|
11
15
|
|
|
12
16
|
void Out_patt(std::vector<int>& seq, unsigned int freq);
|
|
@@ -14,7 +18,7 @@ void Extend_patt(Pattern& _patt);
|
|
|
14
18
|
void Mine_vec(unsigned long long int seq_ID,
|
|
15
19
|
int pos,
|
|
16
20
|
int num_found,
|
|
17
|
-
|
|
21
|
+
|
|
18
22
|
std::vector<int>& items,
|
|
19
23
|
unsigned long long int inod,
|
|
20
24
|
int sgn);
|
|
@@ -27,7 +31,7 @@ std::vector<bool> slist;
|
|
|
27
31
|
std::vector<Pattern> pot_patt;
|
|
28
32
|
std::vector<VPattern> pot_vpatt;
|
|
29
33
|
std::vector<unsigned long long int> last_strpnt;
|
|
30
|
-
|
|
34
|
+
|
|
31
35
|
std::vector<int> DFS_numfound;
|
|
32
36
|
|
|
33
37
|
Pattern _patt;
|
|
@@ -354,13 +358,13 @@ void Extend_patt(Pattern& _pattern) {
|
|
|
354
358
|
}
|
|
355
359
|
|
|
356
360
|
|
|
357
|
-
void Mine_vec(
|
|
358
|
-
int pos,
|
|
359
|
-
int num_found,
|
|
360
|
-
std::vector<
|
|
361
|
-
std::vector<int>& items,
|
|
362
|
-
|
|
363
|
-
int sgn)
|
|
361
|
+
void Mine_vec(std::uint64_t seq_ID,
|
|
362
|
+
int pos,
|
|
363
|
+
int num_found,
|
|
364
|
+
std::vector<std::uint64_t>& ancest,
|
|
365
|
+
std::vector<int>& items,
|
|
366
|
+
std::uint64_t pnt,
|
|
367
|
+
int sgn)
|
|
364
368
|
{
|
|
365
369
|
std::vector<bool> found(L + L * (ilist_nempty ? 1 : 0), false);
|
|
366
370
|
|