effspm 0.3.0__cp312-cp312-macosx_11_0_arm64.whl → 0.3.3__cp312-cp312-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- effspm/_effspm.cpp +683 -2
- effspm/_effspm.cpython-312-darwin.so +0 -0
- effspm/btminer/src/load_inst.cpp +21 -11
- effspm/btminer/src/main.cpp +83 -0
- effspm/htminer/src/build_mdd.cpp +41 -66
- effspm/htminer/src/build_mdd.hpp +56 -49
- effspm/htminer/src/freq_miner.cpp +341 -307
- effspm/htminer/src/freq_miner.hpp +39 -40
- effspm/htminer/src/load_inst.cpp +287 -336
- effspm/htminer/src/load_inst.hpp +23 -6
- effspm/htminer/src/main.cpp +97 -0
- effspm/htminer/src/utility.cpp +38 -57
- effspm/htminer/src/utility.hpp +9 -64
- effspm/largebm/src/main.cpp +95 -0
- effspm/largehm/src/build_mdd.cpp +75 -110
- effspm/largehm/src/build_mdd.hpp +53 -73
- effspm/largehm/src/freq_miner.cpp +132 -173
- effspm/largehm/src/freq_miner.hpp +37 -60
- effspm/largehm/src/load_inst.cpp +136 -191
- effspm/largehm/src/load_inst.hpp +13 -50
- effspm/largehm/src/main.cpp +95 -0
- effspm/largehm/src/utility.cpp +46 -28
- effspm/largehm/src/utility.hpp +18 -16
- effspm/largepp/src/load_inst.cpp +5 -4
- effspm/largepp/src/main.cpp +108 -0
- effspm/load_inst.cpp +8 -8
- effspm/main.cpp +103 -0
- {effspm-0.3.0.dist-info → effspm-0.3.3.dist-info}/METADATA +1 -1
- effspm-0.3.3.dist-info/RECORD +60 -0
- effspm-0.3.0.dist-info/RECORD +0 -54
- {effspm-0.3.0.dist-info → effspm-0.3.3.dist-info}/WHEEL +0 -0
- {effspm-0.3.0.dist-info → effspm-0.3.3.dist-info}/licenses/LICENSE +0 -0
- {effspm-0.3.0.dist-info → effspm-0.3.3.dist-info}/top_level.txt +0 -0
effspm/htminer/src/load_inst.hpp
CHANGED
|
@@ -6,18 +6,35 @@
|
|
|
6
6
|
#include <map>
|
|
7
7
|
#include <unordered_set>
|
|
8
8
|
#include <unordered_map>
|
|
9
|
+
#include <time.h>
|
|
10
|
+
|
|
9
11
|
namespace htminer {
|
|
10
|
-
|
|
12
|
+
|
|
13
|
+
using std::string;
|
|
14
|
+
using std::vector;
|
|
11
15
|
|
|
12
16
|
bool Load_instance(string& items_file, double thresh);
|
|
13
17
|
|
|
14
18
|
extern string out_file, folder;
|
|
15
19
|
|
|
16
|
-
extern bool b_disp
|
|
20
|
+
extern bool b_disp;
|
|
21
|
+
extern bool b_write;
|
|
22
|
+
extern bool use_dic;
|
|
23
|
+
extern bool just_build;
|
|
24
|
+
extern bool pre_pro;
|
|
25
|
+
extern bool itmset_exists;
|
|
26
|
+
|
|
27
|
+
extern unsigned int M;
|
|
28
|
+
extern unsigned int mlim;
|
|
29
|
+
extern unsigned int time_limit;
|
|
30
|
+
extern unsigned long long N;
|
|
31
|
+
extern unsigned long long L;
|
|
32
|
+
extern unsigned long long theta;
|
|
33
|
+
extern unsigned long long E;
|
|
17
34
|
|
|
18
|
-
|
|
35
|
+
// Declaration of item_dic so that other translation units including this header can use it.
|
|
36
|
+
extern vector<int> item_dic;
|
|
19
37
|
|
|
20
|
-
extern
|
|
38
|
+
extern clock_t start_time;
|
|
21
39
|
|
|
22
|
-
|
|
23
|
-
}
|
|
40
|
+
} // namespace htminer
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
#include <iostream>
|
|
2
|
+
#include <time.h>
|
|
3
|
+
#include <string.h>
|
|
4
|
+
#include <string>
|
|
5
|
+
#include "load_inst.hpp"
|
|
6
|
+
#include "build_mdd.hpp"
|
|
7
|
+
#include "utility.hpp"
|
|
8
|
+
#include "freq_miner.hpp"
|
|
9
|
+
namespace htminer {
|
|
10
|
+
using namespace std;
|
|
11
|
+
|
|
12
|
+
string out_file;
|
|
13
|
+
|
|
14
|
+
bool b_disp = 0, b_write = 0, use_dic = 0, just_build = 0, pre_pro = 1;
|
|
15
|
+
|
|
16
|
+
unsigned int time_limit = 10 * 3600;
|
|
17
|
+
|
|
18
|
+
clock_t start_time;
|
|
19
|
+
|
|
20
|
+
string folder;
|
|
21
|
+
|
|
22
|
+
int main(int argc, char* argv[]) {
|
|
23
|
+
|
|
24
|
+
string VV, attr;
|
|
25
|
+
|
|
26
|
+
double thresh = 0;
|
|
27
|
+
for (int i = 1; i<argc; i++) {
|
|
28
|
+
if (argv[i][0] != '-' || isdigit(argv[i][1]))
|
|
29
|
+
continue;
|
|
30
|
+
else if (strcmp(argv[i], "-thr") == 0)
|
|
31
|
+
thresh = stod(argv[i + 1]);
|
|
32
|
+
else if (strcmp(argv[i], "-file") == 0)
|
|
33
|
+
VV = argv[i + 1];
|
|
34
|
+
else if (strcmp(argv[i], "-time") == 0)
|
|
35
|
+
time_limit = stoi(argv[i + 1]);
|
|
36
|
+
else if (strcmp(argv[i], "-jbuild") == 0)
|
|
37
|
+
just_build = 1;
|
|
38
|
+
else if (strcmp(argv[i], "-folder") == 0)
|
|
39
|
+
folder = argv[i + 1];
|
|
40
|
+
else if (strcmp(argv[i], "-npre") == 0)
|
|
41
|
+
pre_pro = 0;
|
|
42
|
+
else if (strcmp(argv[i], "-dic") == 0)
|
|
43
|
+
use_dic = 1;
|
|
44
|
+
else if (strcmp(argv[i], "-out") == 0) {
|
|
45
|
+
if (i + 1 == argc || argv[i + 1][0] == '-')
|
|
46
|
+
b_disp = 1;
|
|
47
|
+
else if (argv[i + 1][0] == '+') {
|
|
48
|
+
b_disp = 1;
|
|
49
|
+
b_write = 1;
|
|
50
|
+
if (strlen(argv[i + 1]) > 1) {
|
|
51
|
+
out_file = argv[i + 1];
|
|
52
|
+
out_file = out_file.substr(1, out_file.size() - 1);
|
|
53
|
+
}
|
|
54
|
+
else
|
|
55
|
+
out_file = VV;
|
|
56
|
+
}
|
|
57
|
+
else {
|
|
58
|
+
b_write = 1;
|
|
59
|
+
out_file = argv[i + 1];
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
else
|
|
64
|
+
cout << "Command " << argv[i] << " not recognized and skipped.\n";
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
cout << "\n********************** " << VV << "**********************\n";
|
|
70
|
+
|
|
71
|
+
string item_file = folder + VV + ".txt";
|
|
72
|
+
|
|
73
|
+
cout << "loading instances...\n";
|
|
74
|
+
|
|
75
|
+
clock_t start_time_all = clock();
|
|
76
|
+
start_time = clock();
|
|
77
|
+
|
|
78
|
+
if (!Load_instance(item_file, thresh)) {
|
|
79
|
+
cout << "Files invalid, exiting.\n";
|
|
80
|
+
cin.get();
|
|
81
|
+
return 0;
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
start_time = clock();
|
|
85
|
+
|
|
86
|
+
if (!just_build && give_time(clock() - start_time) < time_limit) {
|
|
87
|
+
Freq_miner();
|
|
88
|
+
if (give_time(clock() - start_time) >= time_limit)
|
|
89
|
+
cout << "TIME LIMIT REACHED\n";
|
|
90
|
+
cout << "Mining Complete\n\nFound a total of " << num_patt << " patterns\n";
|
|
91
|
+
cout << "\nTotal CPU time " << give_time(clock() - start_time_all) << " seconds\n\n";
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
return 0;
|
|
96
|
+
}
|
|
97
|
+
}
|
effspm/htminer/src/utility.cpp
CHANGED
|
@@ -1,72 +1,53 @@
|
|
|
1
1
|
#include "utility.hpp"
|
|
2
|
+
#include "build_mdd.hpp"
|
|
2
3
|
#include "load_inst.hpp"
|
|
3
|
-
#include
|
|
4
|
-
#include <vector>
|
|
5
|
-
namespace htminer {
|
|
6
|
-
|
|
7
|
-
// ─── Flag‐like globals ──────────────────────────────────────────────────────
|
|
8
|
-
bool use_list = false;
|
|
9
|
-
bool just_build = false;
|
|
10
|
-
bool b_disp = false;
|
|
11
|
-
bool b_write = false;
|
|
12
|
-
bool use_dic = false;
|
|
13
|
-
bool pre_pro = false;
|
|
4
|
+
#include <iostream>
|
|
14
5
|
|
|
15
|
-
|
|
16
|
-
std::string out_file = "";
|
|
17
|
-
std::clock_t start_time = 0;
|
|
6
|
+
namespace htminer {
|
|
18
7
|
|
|
19
|
-
|
|
20
|
-
std::vector<std::vector<int>> items;
|
|
21
|
-
unsigned long long N = 0;
|
|
22
|
-
unsigned long long L = 0;
|
|
23
|
-
unsigned long long theta = 0;
|
|
24
|
-
unsigned int M = 0;
|
|
25
|
-
unsigned long long E = 0;
|
|
26
|
-
unsigned int mlim = 0;
|
|
27
|
-
// ─── DFS stacks ─────────────────────────────────────────────────────────────
|
|
28
|
-
std::vector<Pattern> DFS;
|
|
29
|
-
std::vector<VPattern> VDFS;
|
|
8
|
+
using std::vector;
|
|
30
9
|
|
|
31
|
-
|
|
32
|
-
std::vector<std::vector<int>> collectedPatterns;
|
|
33
|
-
const std::vector<std::vector<int>>& GetCollected() {
|
|
34
|
-
return collectedPatterns;
|
|
35
|
-
}
|
|
36
|
-
|
|
37
|
-
// ─── give_time and check_parent get their definitions here (as provided) ───
|
|
38
|
-
float give_time(std::clock_t kk) {
|
|
39
|
-
return static_cast<float>(kk) / static_cast<float>(CLOCKS_PER_SEC);
|
|
40
|
-
}
|
|
41
|
-
bool check_parent(unsigned int cur_anct, unsigned int str_pnt, unsigned int start, vector<unsigned int>& strpnt_vec) {
|
|
10
|
+
vector<vector<int>> collectedPatterns;
|
|
42
11
|
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
while (abs(Tree[cur_anct].itmset) > abs(Tree[str_pnt].itmset)) {
|
|
46
|
-
if (Tree[cur_anct].item > 0)
|
|
47
|
-
ancestors.push_back(cur_anct);
|
|
48
|
-
cur_anct = Tree[cur_anct].anct;
|
|
49
|
-
}
|
|
12
|
+
bool check_parent(unsigned int cur_anct, unsigned int str_pnt,
|
|
13
|
+
unsigned int start, vector<unsigned int>& strpnt_vec) {
|
|
50
14
|
|
|
51
|
-
|
|
52
|
-
return 1;
|
|
53
|
-
else {
|
|
54
|
-
for (vector<unsigned int>::reverse_iterator it = ancestors.rbegin(); it != ancestors.rend(); ++it) {
|
|
55
|
-
for (unsigned int i = start; i < strpnt_vec.size(); ++i) {
|
|
56
|
-
if (strpnt_vec[i] == *it)
|
|
57
|
-
return 1;
|
|
58
|
-
}
|
|
59
|
-
}
|
|
60
|
-
}
|
|
15
|
+
vector<unsigned int> ancestors;
|
|
61
16
|
|
|
17
|
+
while (std::abs(Tree[cur_anct].itmset) >
|
|
18
|
+
std::abs(Tree[str_pnt].itmset)) {
|
|
19
|
+
if (Tree[cur_anct].item > 0)
|
|
20
|
+
ancestors.push_back(cur_anct);
|
|
21
|
+
cur_anct = Tree[cur_anct].anct;
|
|
22
|
+
}
|
|
62
23
|
|
|
63
|
-
|
|
24
|
+
if (std::abs(Tree[cur_anct].itmset) ==
|
|
25
|
+
std::abs(Tree[str_pnt].itmset))
|
|
26
|
+
return true;
|
|
27
|
+
else {
|
|
28
|
+
for (vector<unsigned int>::reverse_iterator it = ancestors.rbegin();
|
|
29
|
+
it != ancestors.rend(); ++it) {
|
|
30
|
+
for (unsigned int i = start; i < strpnt_vec.size(); ++i) {
|
|
31
|
+
if (strpnt_vec[i] == *it)
|
|
32
|
+
return true;
|
|
33
|
+
}
|
|
34
|
+
}
|
|
35
|
+
}
|
|
64
36
|
|
|
37
|
+
return false;
|
|
65
38
|
}
|
|
66
39
|
|
|
40
|
+
/// Converts a clock() tick count into seconds.
///
/// @param kk number of clock ticks
/// @return kk divided by CLOCKS_PER_SEC, as a float
float give_time(clock_t kk) {
    return static_cast<float>(kk) / CLOCKS_PER_SEC;
}
|
|
67
44
|
|
|
45
|
+
/// Empties the collectedPatterns vector.
void ClearCollected()
{
    collectedPatterns.clear();
}
|
|
68
48
|
|
|
49
|
+
/// Gives read-only access to the collectedPatterns vector.
///
/// @return const reference to collectedPatterns (no copy is made)
const vector<vector<int>>& GetCollected()
{
    return collectedPatterns;
}
|
|
69
52
|
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
}
|
|
53
|
+
} // namespace htminer
|
effspm/htminer/src/utility.hpp
CHANGED
|
@@ -1,77 +1,22 @@
|
|
|
1
1
|
#pragma once
|
|
2
2
|
|
|
3
3
|
#include <vector>
|
|
4
|
-
#include <
|
|
4
|
+
#include <time.h>
|
|
5
5
|
#include <string>
|
|
6
6
|
#include "build_mdd.hpp"
|
|
7
|
-
#include "freq_miner.hpp"
|
|
8
|
-
#include "load_inst.hpp"
|
|
9
7
|
|
|
10
8
|
namespace htminer {
|
|
11
9
|
|
|
12
|
-
|
|
13
|
-
/// Controls whether to mine in “list” mode (unused for HTMiner, but declared)
|
|
14
|
-
extern bool use_list;
|
|
15
|
-
/// If true, only build MDD and exit (don’t actually mine)
|
|
16
|
-
extern bool just_build;
|
|
17
|
-
/// If true, print each pattern to stdout as it’s found
|
|
18
|
-
extern bool b_disp;
|
|
19
|
-
/// If true, write each pattern to file (see out_file)
|
|
20
|
-
extern bool b_write;
|
|
21
|
-
/// If true, use a dictionary‐file mapping items → new IDs
|
|
22
|
-
extern bool use_dic;
|
|
23
|
-
/// If true, preprocess input (create dictionary) instead of mining
|
|
24
|
-
extern bool pre_pro;
|
|
10
|
+
using std::vector;
|
|
25
11
|
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
extern std::string out_file;
|
|
30
|
-
/// Clock tick when mining started
|
|
31
|
-
extern std::clock_t start_time;
|
|
12
|
+
float give_time(clock_t kk);
|
|
13
|
+
bool check_parent(unsigned int cur_arc, unsigned int str_pnt,
|
|
14
|
+
unsigned int start, vector<unsigned int>& strpnt_vec);
|
|
32
15
|
|
|
33
|
-
//
|
|
34
|
-
|
|
35
|
-
extern std::vector<std::vector<int>> items;
|
|
36
|
-
/// Number of sequences (items.size())
|
|
37
|
-
extern unsigned long long N;
|
|
38
|
-
/// Number of distinct items (max absolute item ID)
|
|
39
|
-
extern unsigned long long L;
|
|
40
|
-
/// Minimum support threshold (absolute count, not fraction)
|
|
41
|
-
extern unsigned long long theta;
|
|
42
|
-
/// Maximum sequence length across all items
|
|
43
|
-
extern unsigned int M;
|
|
44
|
-
/// Total number of “entries” (sum of all sequence lengths)
|
|
45
|
-
extern unsigned long long E;
|
|
16
|
+
// pattern collection for Python
|
|
17
|
+
extern vector<vector<int>> collectedPatterns;
|
|
46
18
|
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
extern std::vector<Pattern> DFS;
|
|
50
|
-
|
|
51
|
-
extern std::vector<std::vector<int>> collectedPatterns;
|
|
52
|
-
// ─── Collected output ───────────────────────────────────────────────────────
|
|
53
|
-
/// Clears any patterns left in DFS (called at the start of each run)
|
|
54
|
-
inline void ClearCollected() {
|
|
55
|
-
DFS.clear();
|
|
56
|
-
collectedPatterns.clear();
|
|
57
|
-
}
|
|
58
|
-
/// Returns a reference to the entire “collected patterns” vector
|
|
59
|
-
/// (each Pattern knows how to output itself as a vector<int>)
|
|
60
|
-
const std::vector<std::vector<int>>& GetCollected();
|
|
61
|
-
|
|
62
|
-
// ─── Helper functions ───────────────────────────────────────────────────────
|
|
63
|
-
/// Given a clock‐tick difference, return elapsed seconds as a float
|
|
64
|
-
float give_time(std::clock_t kk);
|
|
65
|
-
|
|
66
|
-
/// Check whether a candidate can extend its parent pattern:
|
|
67
|
-
/// cur_arc = current Arc node ID in MDD
|
|
68
|
-
/// str_pnt = string‐pointer of the existing pattern
|
|
69
|
-
/// start = starting index within the MDD arc‐list
|
|
70
|
-
/// strpnt_vec = parent’s “string pointers” vector
|
|
71
|
-
/// Returns true if `cur_arc` is a valid child of `str_pnt` from position `start`.
|
|
72
|
-
bool check_parent(unsigned int cur_arc,
|
|
73
|
-
unsigned int str_pnt,
|
|
74
|
-
unsigned int start,
|
|
75
|
-
std::vector<unsigned int>& strpnt_vec);
|
|
19
|
+
void ClearCollected();
|
|
20
|
+
const vector<vector<int>>& GetCollected();
|
|
76
21
|
|
|
77
22
|
} // namespace htminer
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
#include <iostream>
|
|
2
|
+
#include <time.h>
|
|
3
|
+
#include <string.h>
|
|
4
|
+
#include <string>
|
|
5
|
+
#include "load_inst.hpp"
|
|
6
|
+
#include "build_mdd.hpp"
|
|
7
|
+
#include "utility.hpp"
|
|
8
|
+
#include "freq_miner.hpp"
|
|
9
|
+
|
|
10
|
+
namespace largebm{
|
|
11
|
+
using namespace std;
|
|
12
|
+
|
|
13
|
+
string out_file;
|
|
14
|
+
|
|
15
|
+
bool b_disp = 0, b_write = 0, just_build = 0, pre_pro = 1;
|
|
16
|
+
|
|
17
|
+
int time_limit = 30 * 3600;
|
|
18
|
+
|
|
19
|
+
clock_t start_time;
|
|
20
|
+
|
|
21
|
+
string folder;
|
|
22
|
+
|
|
23
|
+
int main(int argc, char* argv[]) {
|
|
24
|
+
|
|
25
|
+
string VV, attr;
|
|
26
|
+
|
|
27
|
+
double thresh = 0;
|
|
28
|
+
for (int i = 1; i<argc; i++) {
|
|
29
|
+
if (argv[i][0] != '-' || isdigit(argv[i][1]))
|
|
30
|
+
continue;
|
|
31
|
+
else if (strcmp(argv[i], "-thr") == 0)
|
|
32
|
+
thresh = stod(argv[i + 1]);
|
|
33
|
+
else if (strcmp(argv[i], "-file") == 0)
|
|
34
|
+
VV = argv[i + 1];
|
|
35
|
+
else if (strcmp(argv[i], "-time") == 0)
|
|
36
|
+
time_limit = stoi(argv[i + 1]);
|
|
37
|
+
else if (strcmp(argv[i], "-jbuild") == 0)
|
|
38
|
+
just_build = 1;
|
|
39
|
+
else if (strcmp(argv[i], "-folder") == 0)
|
|
40
|
+
folder = argv[i + 1];
|
|
41
|
+
else if (strcmp(argv[i], "-npre") == 0)
|
|
42
|
+
pre_pro = 0;
|
|
43
|
+
else if (strcmp(argv[i], "-out") == 0) {
|
|
44
|
+
if (i + 1 == argc || argv[i + 1][0] == '-')
|
|
45
|
+
b_disp = 1;
|
|
46
|
+
else if (argv[i + 1][0] == '+') {
|
|
47
|
+
b_disp = 1;
|
|
48
|
+
b_write = 1;
|
|
49
|
+
if (strlen(argv[i + 1]) > 1) {
|
|
50
|
+
out_file = argv[i + 1];
|
|
51
|
+
out_file = out_file.substr(1, out_file.size() - 1);
|
|
52
|
+
}
|
|
53
|
+
else
|
|
54
|
+
out_file = VV;
|
|
55
|
+
}
|
|
56
|
+
else {
|
|
57
|
+
b_write = 1;
|
|
58
|
+
out_file = argv[i + 1];
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
else
|
|
63
|
+
cout << "Command " << argv[i] << " not recognized and skipped.\n";
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
cout << "\n********************** " << VV << "**********************\n";
|
|
69
|
+
|
|
70
|
+
string item_file = folder + VV + ".txt";
|
|
71
|
+
|
|
72
|
+
cout << "loading instances...\n";
|
|
73
|
+
|
|
74
|
+
start_time = clock();
|
|
75
|
+
|
|
76
|
+
if (!Load_instance(item_file, thresh)) {
|
|
77
|
+
cout << "Files invalid, exiting.\n";
|
|
78
|
+
cin.get();
|
|
79
|
+
return 0;
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
//kk = clock();
|
|
83
|
+
|
|
84
|
+
if (!just_build && give_time(clock() - start_time) < time_limit) {
|
|
85
|
+
Freq_miner();
|
|
86
|
+
if (give_time(clock() - start_time) >= time_limit)
|
|
87
|
+
cout << "TIME LIMIT REACHED\n";
|
|
88
|
+
cout << "Mining Complete\n\nFound a total of " << num_patt << " patterns\n";
|
|
89
|
+
cout << "\nTotal CPU time " << give_time(clock() - start_time) << " seconds\n\n";
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
return 0;
|
|
94
|
+
}
|
|
95
|
+
}
|
effspm/largehm/src/build_mdd.cpp
CHANGED
|
@@ -1,173 +1,138 @@
|
|
|
1
|
-
// ─── effspm/largehm/src/build_mdd.cpp ─────────────────────────────────────────
|
|
2
|
-
|
|
3
|
-
#include "build_mdd.hpp"
|
|
4
|
-
|
|
5
|
-
// ─── Definitions of the extern globals declared in build_mdd.hpp ─────────────
|
|
6
|
-
std::vector<largehm::Arc> largehm::Tree;
|
|
7
|
-
std::vector<largehm::VArc> largehm::VTree;
|
|
8
|
-
std::vector<largehm::CArc> largehm::CTree;
|
|
9
|
-
|
|
10
1
|
#include <vector>
|
|
11
2
|
#include <iostream>
|
|
12
|
-
#include <cmath> // for std::abs
|
|
13
3
|
#include <unordered_map>
|
|
14
|
-
#include <cstdint> // for std::uint64_t
|
|
15
4
|
#include "load_inst.hpp"
|
|
5
|
+
#include "build_mdd.hpp"
|
|
16
6
|
#include "freq_miner.hpp"
|
|
17
7
|
#include "utility.hpp"
|
|
18
8
|
|
|
19
9
|
namespace largehm {
|
|
20
10
|
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
std::unordered_map<int, std::uint64_t> ancest_map;
|
|
28
|
-
std::uint64_t last_arc = 0;
|
|
29
|
-
int itmset = 0;
|
|
11
|
+
using namespace std;
|
|
12
|
+
|
|
13
|
+
int Add_arc(int item,
|
|
14
|
+
unsigned long long int last_arc,
|
|
15
|
+
int& itmset,
|
|
16
|
+
unordered_map<int, unsigned long long int>& ancest_map);
|
|
30
17
|
|
|
31
|
-
|
|
32
|
-
|
|
18
|
+
void Add_vec(vector<int>& items_lim,
|
|
19
|
+
unordered_map<int, unsigned long long int>& ancest_map,
|
|
20
|
+
unsigned long long int last_arc,
|
|
21
|
+
int itmset);
|
|
22
|
+
|
|
23
|
+
vector<Arc> Tree;
|
|
24
|
+
vector<VArc> VTree;
|
|
25
|
+
vector<CArc> CTree;
|
|
26
|
+
|
|
27
|
+
/// Inserts one sequence into the tree structures.
///
/// Every element of `items` is passed to Add_arc in order, each call starting
/// from the arc index returned by the previous call (0 for the first one).
/// When `items_lim` is not empty it is then handed to Add_vec together with
/// the state accumulated by those calls.
///
/// @param items      elements inserted one by one through Add_arc
/// @param items_lim  remaining elements stored through Add_vec; may be empty
void Build_MDD(vector<int>& items, vector<int>& items_lim) {

    unordered_map<int, unsigned long long int> arc_of_item;   // filled by Add_arc
    unsigned long long int tail = 0;                          // arc reached so far
    int itemset_no = 0;                                       // updated by Add_arc

    for (int item : items)
        tail = Add_arc(item, tail, itemset_no, arc_of_item);

    if (items_lim.empty())
        return;

    Add_vec(items_lim, arc_of_item, tail, itemset_no);
}
|
|
41
39
|
|
|
42
|
-
|
|
43
|
-
//
|
|
44
|
-
// ─── Add_arc: insert a single “item” into the MDD under parent last_arc. ──────
|
|
45
|
-
//
|
|
46
40
|
int Add_arc(int item,
|
|
47
|
-
|
|
41
|
+
unsigned long long int last_arc,
|
|
48
42
|
int& itmset,
|
|
49
|
-
|
|
50
|
-
{
|
|
51
|
-
// Ensure DFS is at least size |item|
|
|
52
|
-
size_t needed = static_cast<size_t>(std::abs(item));
|
|
53
|
-
if (DFS.size() < needed) {
|
|
54
|
-
size_t old = DFS.size();
|
|
55
|
-
DFS.resize(needed);
|
|
56
|
-
for (size_t i = old; i < needed; ++i) {
|
|
57
|
-
DFS[i] = Pattern(-static_cast<int>(i) - 1);
|
|
58
|
-
}
|
|
59
|
-
}
|
|
43
|
+
unordered_map<int, unsigned long long int>& ancest_map) {
|
|
60
44
|
|
|
61
|
-
unsigned int anct
|
|
62
|
-
|
|
63
|
-
if (p
|
|
45
|
+
unsigned int anct;
|
|
46
|
+
unordered_map<int, unsigned long long int>::iterator p = ancest_map.find(abs(item));
|
|
47
|
+
if (p == ancest_map.end())
|
|
48
|
+
anct = 0;
|
|
49
|
+
else
|
|
64
50
|
anct = p->second;
|
|
65
|
-
}
|
|
66
51
|
|
|
67
|
-
if (item < 0)
|
|
52
|
+
if (item < 0)
|
|
68
53
|
++itmset;
|
|
69
|
-
}
|
|
70
54
|
|
|
71
|
-
|
|
55
|
+
unsigned long long int last_sibl = Tree[last_arc].chld;
|
|
56
|
+
|
|
72
57
|
if (last_sibl == 0) {
|
|
73
|
-
// No child yet: create a new Arc
|
|
74
58
|
Tree.emplace_back(item, itmset, anct);
|
|
75
59
|
last_sibl = Tree.size() - 1;
|
|
76
60
|
Tree[last_arc].chld = last_sibl;
|
|
77
|
-
if (anct == 0)
|
|
78
|
-
DFS[
|
|
79
|
-
|
|
61
|
+
if (anct == 0)
|
|
62
|
+
DFS[abs(item) - 1].str_pnt.push_back(last_sibl);
|
|
63
|
+
|
|
80
64
|
}
|
|
81
65
|
else {
|
|
82
|
-
// Traverse siblings until we find a match or append
|
|
83
66
|
while (Tree[last_sibl].item != item) {
|
|
84
67
|
if (Tree[last_sibl].sibl == 0) {
|
|
85
68
|
Tree.emplace_back(item, itmset, anct);
|
|
86
69
|
Tree[last_sibl].sibl = Tree.size() - 1;
|
|
87
70
|
last_sibl = Tree.size() - 1;
|
|
88
|
-
if (anct == 0)
|
|
89
|
-
DFS[
|
|
90
|
-
}
|
|
71
|
+
if (anct == 0)
|
|
72
|
+
DFS[abs(item) - 1].str_pnt.push_back(last_sibl);
|
|
91
73
|
break;
|
|
92
74
|
}
|
|
93
75
|
last_sibl = Tree[last_sibl].sibl;
|
|
94
76
|
}
|
|
95
77
|
}
|
|
96
78
|
|
|
97
|
-
if (anct == 0)
|
|
98
|
-
++DFS[
|
|
99
|
-
|
|
79
|
+
if (anct == 0)
|
|
80
|
+
++DFS[abs(item) - 1].freq;
|
|
81
|
+
|
|
100
82
|
++Tree[last_sibl].freq;
|
|
101
|
-
ancest_map[std::abs(item)] = last_sibl;
|
|
102
|
-
return static_cast<int>(last_sibl);
|
|
103
|
-
}
|
|
104
83
|
|
|
84
|
+
ancest_map[abs(item)] = last_sibl;
|
|
105
85
|
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
void Add_vec(
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
int itmset)
|
|
113
|
-
{
|
|
114
|
-
// Ensure VDFS and DFS are at least size L
|
|
115
|
-
if (VDFS.size() < static_cast<size_t>(L)) {
|
|
116
|
-
size_t old = VDFS.size();
|
|
117
|
-
VDFS.resize(static_cast<size_t>(L));
|
|
118
|
-
for (size_t i = old; i < VDFS.size(); ++i) {
|
|
119
|
-
VDFS[i] = VPattern(static_cast<int>(i));
|
|
120
|
-
}
|
|
121
|
-
}
|
|
122
|
-
if (DFS.size() < static_cast<size_t>(L)) {
|
|
123
|
-
size_t old = DFS.size();
|
|
124
|
-
DFS.resize(static_cast<size_t>(L));
|
|
125
|
-
for (size_t i = old; i < DFS.size(); ++i) {
|
|
126
|
-
DFS[i] = Pattern(-static_cast<int>(i) - 1);
|
|
127
|
-
}
|
|
128
|
-
}
|
|
86
|
+
return last_sibl;
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
/// Stores the elements of `items_lim` below the arc `last_arc`.
///
/// If `Tree[last_arc].itmset` is positive, a new CTree entry is created from
/// an ancestor table built out of `ancest_map` plus `items_lim`; the arc's
/// `chld` is pointed at it and its `itmset` is replaced by `-itmset`.
/// Otherwise the arc's `chld` is used as a CTree index and `items_lim` is
/// appended to VTree, chained through the last slot of that entry's ancestor
/// table.
///
/// In both cases each item's first occurrence in `items_lim` that is not
/// already covered by the ancestor table increments the item's DFS `freq` and,
/// unless it is the last element, records a position in the item's VDFS entry.
///
/// @param items_lim   elements to store; shrunk to fit first
/// @param ancest_map  maps item to arc index, as filled by Add_arc
/// @param last_arc    index in Tree of the arc the elements hang from
/// @param itmset      itemset counter stored negated on a new CTree link
void Add_vec(vector<int>& items_lim,
             unordered_map<int, unsigned long long int>& ancest_map,
             unsigned long long int last_arc,
             int itmset) {

    items_lim.shrink_to_fit();

    const int tail_len = static_cast<int>(items_lim.size());
    vector<bool> seen(L, 0);

    if (Tree[last_arc].itmset > 0) {
        vector<unsigned long long int> ancest(L + 1, 0); // last element is CArc child
        for (const auto& entry : ancest_map) {
            ancest[entry.first - 1] = entry.second;
            seen[entry.first - 1] = 1;
        }

        for (int pos = 0; pos < tail_len; ++pos) {
            const int idx = abs(items_lim[pos]) - 1;
            if (seen[idx])
                continue;
            if (pos + 1 < tail_len) {
                VDFS[idx].str_pnt.push_back(-pos - 1); // negative = CTree
                VDFS[idx].seq_ID.push_back(CTree.size());
            }
            ++DFS[idx].freq;
            seen[idx] = 1;
        }

        CTree.emplace_back(ancest, items_lim);
        Tree[last_arc].chld = CTree.size() - 1;
        Tree[last_arc].itmset = -itmset; // negative itmset = connection to CTree
    }
    else {
        vector<unsigned long long int>& ancest = CTree[Tree[last_arc].chld].ancest;

        for (int pos = 0; pos < tail_len; ++pos) {
            const int idx = abs(items_lim[pos]) - 1;
            if (seen[idx] || ancest[idx] != 0)
                continue;
            if (pos + 1 < tail_len) {
                VDFS[idx].str_pnt.push_back(pos + 1);
                VDFS[idx].seq_ID.push_back(VTree.size());
            }
            ++DFS[idx].freq;
            seen[idx] = 1;
        }

        VTree.emplace_back(items_lim, ancest.back());
        // VTree siblings and CTree children are +1 of their actual position
        ancest.back() = VTree.size();
    }
}
|
|
173
138
|
|