effspm 0.1.11__cp313-cp313-macosx_10_13_universal2.whl → 0.2.1__cp313-cp313-macosx_10_13_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of effspm might be problematic. Click here for more details.
- effspm/__init__.py +3 -3
- effspm/_effspm.cpp +437 -13
- effspm/_effspm.cpython-313-darwin.so +0 -0
- effspm/btminer/src/freq_miner.cpp +3 -0
- effspm/btminer/src/load_inst.cpp +10 -4
- effspm/btminer/src/load_inst.hpp +2 -0
- effspm/btminer/src/utility.cpp +31 -33
- effspm/btminer/src/utility.hpp +9 -13
- effspm/htminer/src/build_mdd.cpp +192 -0
- effspm/htminer/src/build_mdd.hpp +64 -0
- effspm/htminer/src/freq_miner.cpp +350 -0
- effspm/htminer/src/freq_miner.hpp +60 -0
- effspm/htminer/src/load_inst.cpp +381 -0
- effspm/htminer/src/load_inst.hpp +23 -0
- effspm/htminer/src/main.cpp +96 -0
- effspm/htminer/src/utility.cpp +72 -0
- effspm/htminer/src/utility.hpp +77 -0
- effspm/largebm/src/build_mdd.cpp +137 -0
- effspm/largebm/src/build_mdd.hpp +47 -0
- effspm/largebm/src/freq_miner.cpp +342 -0
- effspm/largebm/src/freq_miner.hpp +48 -0
- effspm/largebm/src/load_inst.cpp +235 -0
- effspm/largebm/src/load_inst.hpp +45 -0
- effspm/largebm/src/main.cpp +95 -0
- effspm/largebm/src/utility.cpp +45 -0
- effspm/largebm/src/utility.hpp +18 -0
- effspm/largehm/src/build_mdd.cpp +173 -0
- effspm/largehm/src/build_mdd.hpp +93 -0
- effspm/largehm/src/freq_miner.cpp +441 -0
- effspm/largehm/src/freq_miner.hpp +77 -0
- effspm/largehm/src/load_inst.cpp +357 -0
- effspm/largehm/src/load_inst.hpp +64 -0
- effspm/largehm/src/main.cpp +95 -0
- effspm/largehm/src/utility.cpp +38 -0
- effspm/largehm/src/utility.hpp +29 -0
- effspm/largepp/src/freq_miner.cpp +170 -0
- effspm/largepp/src/freq_miner.hpp +43 -0
- effspm/largepp/src/load_inst.cpp +219 -0
- effspm/largepp/src/load_inst.hpp +28 -0
- effspm/largepp/src/main.cpp +108 -0
- effspm/largepp/src/utility.cpp +33 -0
- effspm/largepp/src/utility.hpp +20 -0
- {effspm-0.1.11.dist-info → effspm-0.2.1.dist-info}/METADATA +1 -1
- effspm-0.2.1.dist-info/RECORD +59 -0
- {effspm-0.1.11.dist-info → effspm-0.2.1.dist-info}/WHEEL +1 -1
- effspm-0.1.11.dist-info/RECORD +0 -25
- {effspm-0.1.11.dist-info → effspm-0.2.1.dist-info}/licenses/LICENSE +0 -0
- {effspm-0.1.11.dist-info → effspm-0.2.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
|
|
3
|
+
#include <vector>
|
|
4
|
+
#include <ctime>
|
|
5
|
+
#include <string>
|
|
6
|
+
#include "build_mdd.hpp"
|
|
7
|
+
#include "freq_miner.hpp"
|
|
8
|
+
#include "load_inst.hpp"
|
|
9
|
+
|
|
10
|
+
namespace htminer {
|
|
11
|
+
|
|
12
|
+
// ─── Global flags and counters ─────────────────────────────────────────────
|
|
13
|
+
/// Controls whether to mine in “list” mode (unused for HTMiner, but declared)
|
|
14
|
+
extern bool use_list;
|
|
15
|
+
/// If true, only build MDD and exit (don’t actually mine)
|
|
16
|
+
extern bool just_build;
|
|
17
|
+
/// If true, print each pattern to stdout as it’s found
|
|
18
|
+
extern bool b_disp;
|
|
19
|
+
/// If true, write each pattern to file (see out_file)
|
|
20
|
+
extern bool b_write;
|
|
21
|
+
/// If true, use a dictionary‐file mapping items → new IDs
|
|
22
|
+
extern bool use_dic;
|
|
23
|
+
/// If true, preprocess input (create dictionary) instead of mining
|
|
24
|
+
extern bool pre_pro;
|
|
25
|
+
|
|
26
|
+
/// Time limit (in seconds) for mining before forced exit
|
|
27
|
+
extern unsigned int time_limit;
|
|
28
|
+
/// Output filename (if b_write is true)
|
|
29
|
+
extern std::string out_file;
|
|
30
|
+
/// Clock tick when mining started
|
|
31
|
+
extern std::clock_t start_time;
|
|
32
|
+
|
|
33
|
+
// ─── Data‐set‐level globals ─────────────────────────────────────────────────
|
|
34
|
+
/// The input sequences (each sequence is a vector of integers)
|
|
35
|
+
extern std::vector<std::vector<int>> items;
|
|
36
|
+
/// Number of sequences (items.size())
|
|
37
|
+
extern unsigned long long N;
|
|
38
|
+
/// Number of distinct items (max absolute item ID)
|
|
39
|
+
extern unsigned long long L;
|
|
40
|
+
/// Minimum support threshold (absolute count, not fraction)
|
|
41
|
+
extern unsigned long long theta;
|
|
42
|
+
/// Maximum sequence length across all items
|
|
43
|
+
extern unsigned int M;
|
|
44
|
+
/// Total number of “entries” (sum of all sequence lengths)
|
|
45
|
+
extern unsigned long long E;
|
|
46
|
+
|
|
47
|
+
// ─── Per‐pattern DFS stacks ─────────────────────────────────────────────────
|
|
48
|
+
/// DFS stack of “in‐memory” patterns (each Pattern holds its own ilist/slist, freq, str_pnt, etc.)
|
|
49
|
+
extern std::vector<Pattern> DFS;
|
|
50
|
+
|
|
51
|
+
extern std::vector<std::vector<int>> collectedPatterns;
|
|
52
|
+
// ─── Collected output ───────────────────────────────────────────────────────
|
|
53
|
+
/// Clears any patterns left in DFS (called at the start of each run)
|
|
54
|
+
inline void ClearCollected() {
|
|
55
|
+
DFS.clear();
|
|
56
|
+
collectedPatterns.clear();
|
|
57
|
+
}
|
|
58
|
+
/// Returns a reference to the entire “collected patterns” vector
|
|
59
|
+
/// (each Pattern knows how to output itself as a vector<int>)
|
|
60
|
+
const std::vector<std::vector<int>>& GetCollected();
|
|
61
|
+
|
|
62
|
+
// ─── Helper functions ───────────────────────────────────────────────────────
|
|
63
|
+
/// Given a clock‐tick difference, return elapsed seconds as a float
|
|
64
|
+
float give_time(std::clock_t kk);
|
|
65
|
+
|
|
66
|
+
/// Check whether a candidate can extend its parent pattern:
|
|
67
|
+
/// cur_arc = current Arc node ID in MDD
|
|
68
|
+
/// str_pnt = string‐pointer of the existing pattern
|
|
69
|
+
/// start = starting index within the MDD arc‐list
|
|
70
|
+
/// strpnt_vec = parent’s “string pointers” vector
|
|
71
|
+
/// Returns true if `cur_arc` is a valid child of `str_pnt` from position `start`.
|
|
72
|
+
bool check_parent(unsigned int cur_arc,
|
|
73
|
+
unsigned int str_pnt,
|
|
74
|
+
unsigned int start,
|
|
75
|
+
std::vector<unsigned int>& strpnt_vec);
|
|
76
|
+
|
|
77
|
+
} // namespace htminer
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
// File: effspm/largebm/src/build_mdd.cpp
|
|
2
|
+
|
|
3
|
+
#include <vector>
|
|
4
|
+
#include <iostream>
|
|
5
|
+
#include <unordered_map>
|
|
6
|
+
#include "load_inst.hpp"
|
|
7
|
+
#include "build_mdd.hpp"
|
|
8
|
+
#include "freq_miner.hpp"
|
|
9
|
+
#include "utility.hpp"
|
|
10
|
+
|
|
11
|
+
namespace largebm {
|
|
12
|
+
|
|
13
|
+
// Forward declaration for Add_arc
|
|
14
|
+
int Add_arc(int item, unsigned long long int last_arc, int& itmset,
|
|
15
|
+
std::unordered_map<int, unsigned long long int>& ancest_map);
|
|
16
|
+
|
|
17
|
+
// Global MDD tree (declared extern in build_mdd.hpp)
|
|
18
|
+
std::vector<Arc> Tree;
|
|
19
|
+
|
|
20
|
+
void Build_MDD(std::vector<int>& items) {
|
|
21
|
+
std::unordered_map<int, unsigned long long int> ancest_map;
|
|
22
|
+
unsigned long long int last_arc = 0;
|
|
23
|
+
int itmset = 0;
|
|
24
|
+
|
|
25
|
+
for (auto it = items.begin(); it != items.end(); ++it) {
|
|
26
|
+
last_arc = Add_arc(*it, last_arc, itmset, ancest_map);
|
|
27
|
+
}
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
int Add_arc(int item, unsigned long long int last_arc, int& itmset,
|
|
32
|
+
std::unordered_map<int, unsigned long long int>& ancest_map) {
|
|
33
|
+
|
|
34
|
+
unsigned idx = std::abs(item) - 1;
|
|
35
|
+
|
|
36
|
+
// ─── DEBUG ────────────────────────────────────────────────
|
|
37
|
+
// std::cout << "[Add_arc] item=" << item
|
|
38
|
+
// << " idx=" << idx
|
|
39
|
+
// << " last_arc=" << last_arc
|
|
40
|
+
// << " Tree.size=" << Tree.size()
|
|
41
|
+
// << " DFS.size=" << DFS.size()
|
|
42
|
+
// << std::endl;
|
|
43
|
+
|
|
44
|
+
// Ensure DFS can hold this index
|
|
45
|
+
if (idx >= DFS.size()) {
|
|
46
|
+
// std::cout << "[Add_arc] • resizing DFS to " << (idx + 1) << std::endl;
|
|
47
|
+
DFS.reserve(idx + 1);
|
|
48
|
+
while (DFS.size() <= idx) {
|
|
49
|
+
DFS.emplace_back(-static_cast<int>(DFS.size()) - 1); // Pattern(-id)
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
unsigned long long int anct;
|
|
54
|
+
auto p = ancest_map.find(std::abs(item));
|
|
55
|
+
if (p == ancest_map.end()) {
|
|
56
|
+
anct = 0;
|
|
57
|
+
} else {
|
|
58
|
+
anct = p->second;
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
if (item < 0) {
|
|
62
|
+
++itmset;
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
// Before accessing Tree[last_arc].chld, check bounds
|
|
66
|
+
if (last_arc >= Tree.size()) {
|
|
67
|
+
// std::cout << "[Add_arc] !!! last_arc OOB last_arc="
|
|
68
|
+
// << last_arc << " Tree.size=" << Tree.size()
|
|
69
|
+
// << std::endl;
|
|
70
|
+
// We still proceed so we can see crash context:
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
unsigned long long int last_sibl = 0;
|
|
74
|
+
if (last_arc < Tree.size()) {
|
|
75
|
+
last_sibl = Tree[last_arc].chld;
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
if (last_sibl == 0) {
|
|
79
|
+
// Insert new node as first child
|
|
80
|
+
Tree.emplace_back(item, itmset, anct);
|
|
81
|
+
last_sibl = Tree.size() - 1;
|
|
82
|
+
|
|
83
|
+
if (last_arc < Tree.size()) {
|
|
84
|
+
Tree[last_arc].chld = last_sibl;
|
|
85
|
+
}
|
|
86
|
+
if (anct == 0) {
|
|
87
|
+
// Debug before DFS access
|
|
88
|
+
// std::cout << "[Add_arc] • DFS access at index=" << (std::abs(item) - 1)
|
|
89
|
+
// << " DFS.size=" << DFS.size() << std::endl;
|
|
90
|
+
DFS[std::abs(item) - 1].str_pnt.push_back(last_sibl);
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
} else {
|
|
94
|
+
|
|
95
|
+
// Walk siblings until find matching item or end
|
|
96
|
+
while (true) {
|
|
97
|
+
if (last_sibl >= Tree.size()) {
|
|
98
|
+
// std::cout << "[Add_arc] !!! last_sibl OOB last_sibl="
|
|
99
|
+
// << last_sibl << " Tree.size=" << Tree.size()
|
|
100
|
+
// << std::endl;
|
|
101
|
+
break;
|
|
102
|
+
}
|
|
103
|
+
if (Tree[last_sibl].item == item) {
|
|
104
|
+
break;
|
|
105
|
+
}
|
|
106
|
+
if (Tree[last_sibl].sibl == 0) {
|
|
107
|
+
Tree.emplace_back(item, itmset, anct);
|
|
108
|
+
Tree[last_sibl].sibl = Tree.size() - 1;
|
|
109
|
+
last_sibl = Tree.size() - 1;
|
|
110
|
+
if (anct == 0) {
|
|
111
|
+
// std::cout << "[Add_arc] • DFS access at index=" << (std::abs(item) - 1)
|
|
112
|
+
// << " DFS.size=" << DFS.size() << std::endl;
|
|
113
|
+
DFS[std::abs(item) - 1].str_pnt.push_back(last_sibl);
|
|
114
|
+
}
|
|
115
|
+
break;
|
|
116
|
+
}
|
|
117
|
+
last_sibl = Tree[last_sibl].sibl;
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
if (anct == 0) {
|
|
122
|
+
// std::cout << "[Add_arc] • increment DFS.freq at index=" << (std::abs(item) - 1)
|
|
123
|
+
// << " DFS.size=" << DFS.size() << std::endl;
|
|
124
|
+
DFS[std::abs(item) - 1].freq++;
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
if (last_sibl < Tree.size()) {
|
|
128
|
+
// std::cout << "[Add_arc] • increment Tree.freq at node=" << last_sibl
|
|
129
|
+
// << " Tree.size=" << Tree.size() << std::endl;
|
|
130
|
+
Tree[last_sibl].freq++;
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
ancest_map[std::abs(item)] = last_sibl;
|
|
134
|
+
return last_sibl;
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
} // namespace largebm
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
|
|
3
|
+
#include<vector>
|
|
4
|
+
#include <cmath>
|
|
5
|
+
#include "load_inst.hpp"
|
|
6
|
+
|
|
7
|
+
namespace largebm {
|
|
8
|
+
void Build_MDD(std::vector<int>& items);
|
|
9
|
+
|
|
10
|
+
/// One node ("arc") of the MDD stored in the global `Tree` vector.
///
/// Links are indices into `Tree`; index 0 doubles as "no link".
/// Every member has a default initializer so that no constructor can leave a
/// field indeterminate.
class Arc {
public:

    unsigned long long int chld = 0;  ///< index of the first child arc (0 = none)
    unsigned long long int sibl = 0;  ///< index of the next sibling arc (0 = none)
    unsigned long long int freq = 0;  ///< occurrence counter of this arc
    unsigned long long int anct = 0;  ///< ancestor arc index supplied at construction
    int itmset = 0;                   ///< itemset counter value supplied at construction
    int item = 0;                     ///< signed item code carried by this arc

    /// Builds an arc for `_itm` with itemset counter `_itmset` and ancestor `_anc`.
    Arc(int _itm, int _itmset, unsigned long long int _anc)
        : anct(_anc), itmset(_itmset), item(_itm) {}

    /// Builds an arc for `_itm` with ancestor `_anc`; `itmset` stays 0.
    Arc(int _itm, int _anc)
        : anct(static_cast<unsigned long long int>(_anc)), item(_itm) {}

    /// Builds an empty arc with every field zeroed.
    Arc() = default;

};
|
|
45
|
+
|
|
46
|
+
extern std::vector<Arc> Tree;
|
|
47
|
+
}
|
|
@@ -0,0 +1,342 @@
|
|
|
1
|
+
// File: effspm/largebm/src/freq_miner.cpp
|
|
2
|
+
|
|
3
|
+
#include <vector>
|
|
4
|
+
#include <algorithm>
|
|
5
|
+
#include <iostream>
|
|
6
|
+
#include <fstream>
|
|
7
|
+
#include <ctime>
|
|
8
|
+
#include <unordered_map>
|
|
9
|
+
#include <unordered_set>
|
|
10
|
+
|
|
11
|
+
#include "freq_miner.hpp" // must come before load_inst.hpp
|
|
12
|
+
#include "load_inst.hpp"
|
|
13
|
+
#include "utility.hpp"
|
|
14
|
+
#include "build_mdd.hpp"
|
|
15
|
+
|
|
16
|
+
namespace largebm {
|
|
17
|
+
|
|
18
|
+
// File-local helper declarations (internal linkage; not declared in freq_miner.hpp)
|
|
19
|
+
static void Out_patt(const std::vector<int>& seq, unsigned long long freq);
|
|
20
|
+
static void Extend_patt(Pattern& patt);
|
|
21
|
+
|
|
22
|
+
// Globals (declared once; types must match freq_miner.hpp)
|
|
23
|
+
unsigned long long int num_patt = 0;
|
|
24
|
+
std::vector<bool> ilist;
|
|
25
|
+
std::vector<bool> slist;
|
|
26
|
+
std::vector<int> DFS_numfound;
|
|
27
|
+
Pattern _patt;
|
|
28
|
+
|
|
29
|
+
void Freq_miner() {
|
|
30
|
+
std::vector<int> list;
|
|
31
|
+
|
|
32
|
+
if (use_list) {
|
|
33
|
+
// List-based routine
|
|
34
|
+
std::vector<int> empty_pref;
|
|
35
|
+
Freq_miner_list(items, empty_pref, theta, collected);
|
|
36
|
+
return;
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
// MDD-based initialization
|
|
40
|
+
for (int i = 0; i < static_cast<int>(L); ++i) {
|
|
41
|
+
if (DFS[i].freq >= theta) {
|
|
42
|
+
list.push_back(-i - 1);
|
|
43
|
+
if (itmset_exists) {
|
|
44
|
+
list.push_back(i + 1);
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
for (size_t i = 0; i < DFS.size(); ++i) {
|
|
49
|
+
DFS[i].list = list;
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
while (!DFS.empty() && give_time(clock() - start_time) < time_limit) {
|
|
53
|
+
if (DFS.back().freq >= theta) {
|
|
54
|
+
Extend_patt(DFS.back());
|
|
55
|
+
} else {
|
|
56
|
+
DFS.pop_back();
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
/// Expands one pattern taken from the top of the DFS stack.
///
/// The pattern is swapped into the global `_patt` and removed from `DFS`. Its
/// candidate list is turned into the `slist` / `ilist` flag vectors, the MDD
/// below each of its `str_pnt` arcs is scanned to accumulate support and start
/// arcs for every candidate, and each candidate whose support reaches `theta`
/// is pushed onto `DFS` as a new, longer pattern.
///
/// @param _pattern  Pattern to extend; callers pass `DFS.back()`.
void Extend_patt(Pattern& _pattern) {
    swap(_patt, _pattern);
    DFS.pop_back();

    // ---- candidate flags ------------------------------------------------
    slist.assign(L, false);
    if (itmset_exists) {
        ilist.assign(L, false);
    }
    bool ilist_nempty = false;
    for (const int code : _patt.list) {
        if (itmset_exists && code >= 0) {
            ilist[code - 1] = true;
            ilist_nempty = true;
        } else {
            slist[-code - 1] = true;
        }
    }

    // ---- locate the last non-positive entry of the pattern --------------
    int last_neg = static_cast<int>(_patt.seq.size()) - 1;
    int itmset_size = 1;
    for (; _patt.seq[last_neg] > 0; --last_neg) {
        ++itmset_size;
    }
    const int set_head = -_patt.seq[last_neg];

    // Slots [0, L) are indexed by negative-code candidates, slots [L, 2L) by
    // positive-code candidates (allocated only when any of the latter exist).
    std::vector<Pattern> pot_patt(L + (ilist_nempty ? L : 0));
    std::vector<unsigned long long int> same_set_stack;
    std::vector<unsigned long long int> later_set_stack;
    if (ilist_nempty) {
        DFS_numfound.clear();
    }
    std::vector<unsigned long long int> last_strpnt(L, 0);

    // Link value that stops the traversal from descending below an arc.
    const unsigned long long no_descend = static_cast<unsigned long long>(-1);

    for (unsigned long long int& root : _patt.str_pnt) {
        const int root_itmset = Tree[root].itmset;

        // ---- phase 1: follow positive arcs; negative arcs are queued for phase 2
        same_set_stack.push_back(root);
        while (!same_set_stack.empty()) {
            unsigned long long int cur = Tree[same_set_stack.back()].chld;
            same_set_stack.pop_back();

            for (; cur != 0; cur = Tree[cur].sibl) {
                const auto& node = Tree[cur];
                if (node.item < 0) {
                    const int id = -node.item;
                    if (slist[id - 1]) {
                        Pattern& cand = pot_patt[id - 1];
                        cand.freq += node.freq;
                        if (node.chld != 0) {
                            cand.str_pnt.push_back(cur);
                        }
                    }
                    if (node.chld != no_descend) {
                        later_set_stack.push_back(cur);
                        if (ilist_nempty) {
                            DFS_numfound.push_back(id == set_head ? 1 : 0);
                        }
                    }
                } else {
                    const int id = node.item;
                    if (ilist[id - 1]) {
                        Pattern& cand = pot_patt[id + L - 1];
                        cand.freq += node.freq;
                        if (node.chld != 0) {
                            cand.str_pnt.push_back(cur);
                        }
                    }
                    if (node.chld != no_descend) {
                        same_set_stack.push_back(cur);
                    }
                }
            }
        }

        // Remember how many start arcs each positive-code candidate has so far;
        // check_parent() receives this as its starting offset.
        if (ilist_nempty) {
            for (int i = 0; i < static_cast<int>(L); ++i) {
                if (ilist[i]) {
                    last_strpnt[i] = pot_patt[i + L].str_pnt.size();
                }
            }
        }

        // ---- phase 2: arcs below the queued negative arcs ----------------
        while (!later_set_stack.empty()) {
            unsigned long long int cur = Tree[later_set_stack.back()].chld;
            later_set_stack.pop_back();

            int num_found = 0;
            if (ilist_nempty) {
                num_found = DFS_numfound.back();
                DFS_numfound.pop_back();
            }

            for (; cur != 0; cur = Tree[cur].sibl) {
                const auto& node = Tree[cur];
                const bool positive = node.item > 0;
                const int id = positive ? node.item : -node.item;

                // Positive-code extension (only reachable for positive arcs).
                if (positive &&
                    num_found == itmset_size &&
                    ilist[id - 1] &&
                    (Tree[node.anct].itmset < root_itmset ||
                     !check_parent(cur, root,
                                   last_strpnt[id - 1],
                                   pot_patt[id + L - 1].str_pnt))) {
                    Pattern& cand = pot_patt[id + L - 1];
                    cand.freq += node.freq;
                    if (node.chld != 0) {
                        cand.str_pnt.push_back(cur);
                    }
                }

                // Negative-code extension: same test for arcs of either sign.
                if (slist[id - 1] && Tree[node.anct].itmset <= root_itmset) {
                    Pattern& cand = pot_patt[id - 1];
                    cand.freq += node.freq;
                    if (node.chld != 0) {
                        cand.str_pnt.push_back(cur);
                    }
                }

                if (node.chld != no_descend) {
                    later_set_stack.push_back(cur);
                    if (ilist_nempty) {
                        if (positive) {
                            const bool advances =
                                num_found < itmset_size &&
                                id == std::abs(_patt.seq[last_neg + num_found]);
                            DFS_numfound.push_back(advances ? num_found + 1 : num_found);
                        } else {
                            DFS_numfound.push_back(id == set_head ? 1 : 0);
                        }
                    }
                }
            }
        }
    }

    // ---- keep the candidates that reached the support threshold ---------
    std::vector<int> ilistp;
    std::vector<int> slistp;
    for (const int code : _patt.list) {
        if (code > 0) {
            if (pot_patt[code - 1 + static_cast<int>(L)].freq >= theta) {
                ilistp.push_back(code);
            }
        } else if (code < 0) {
            if (pot_patt[-code - 1].freq >= theta) {
                if (itmset_exists) {
                    slistp.push_back(-code);
                }
                ilistp.push_back(code);
                slistp.push_back(code);
            }
        }
    }

    // ---- push one new pattern per surviving candidate -------------------
    for (const int code : ilistp) {
        const bool negative = code < 0;
        const int slot = negative ? (-code - 1) : (code - 1 + static_cast<int>(L));

        DFS.emplace_back();
        Pattern& child = DFS.back();
        swap(child, pot_patt[slot]);
        child.seq = _patt.seq;
        child.seq.push_back(code);
        child.list = negative ? slistp : ilistp;

        if (b_disp || b_write) {
            Out_patt(child.seq, child.freq);
        }
        ++num_patt;
    }
}
|
|
248
|
+
|
|
249
|
+
void Out_patt(const std::vector<int>& seq, unsigned long long freq) {
|
|
250
|
+
if (b_disp || b_write) {
|
|
251
|
+
std::ofstream file_o;
|
|
252
|
+
if (b_write) {
|
|
253
|
+
file_o.open(out_file, std::ios::app);
|
|
254
|
+
}
|
|
255
|
+
for (int v : seq) {
|
|
256
|
+
if (b_disp) std::cout << v << ' ';
|
|
257
|
+
if (b_write) file_o << v << ' ';
|
|
258
|
+
}
|
|
259
|
+
if (b_disp) std::cout << '\n';
|
|
260
|
+
if (b_write) file_o << '\n';
|
|
261
|
+
|
|
262
|
+
if (b_disp) {
|
|
263
|
+
std::cout << "************** Freq: " << freq << '\n';
|
|
264
|
+
}
|
|
265
|
+
if (b_write) {
|
|
266
|
+
file_o << "************** Freq: " << freq << '\n';
|
|
267
|
+
file_o.close();
|
|
268
|
+
}
|
|
269
|
+
}
|
|
270
|
+
collected.push_back(seq);
|
|
271
|
+
}
|
|
272
|
+
|
|
273
|
+
void Freq_miner_list(const std::vector<std::vector<int>>& db,
|
|
274
|
+
std::vector<int>& prefix,
|
|
275
|
+
unsigned long long minsup,
|
|
276
|
+
std::vector<std::vector<int>>& out) {
|
|
277
|
+
// 1) count single‐item support (one count per sequence)
|
|
278
|
+
std::unordered_map<int, unsigned long long> freq;
|
|
279
|
+
for (auto const& seq : db) {
|
|
280
|
+
std::unordered_set<int> seen;
|
|
281
|
+
for (int x : seq) {
|
|
282
|
+
if (seen.insert(x).second) {
|
|
283
|
+
++freq[x];
|
|
284
|
+
}
|
|
285
|
+
}
|
|
286
|
+
}
|
|
287
|
+
|
|
288
|
+
// 2) collect the frequent candidates
|
|
289
|
+
std::vector<std::pair<int, unsigned long long>> cand;
|
|
290
|
+
cand.reserve(freq.size());
|
|
291
|
+
for (auto& p : freq) {
|
|
292
|
+
if (p.second >= minsup) {
|
|
293
|
+
cand.emplace_back(p.first, p.second);
|
|
294
|
+
}
|
|
295
|
+
}
|
|
296
|
+
|
|
297
|
+
// 3) sort by absolute item ID
|
|
298
|
+
std::sort(cand.begin(), cand.end(),
|
|
299
|
+
[](const std::pair<int, unsigned long long>& a,
|
|
300
|
+
const std::pair<int, unsigned long long>& b) {
|
|
301
|
+
return std::abs(a.first) < std::abs(b.first);
|
|
302
|
+
});
|
|
303
|
+
|
|
304
|
+
// 4) depth-first enumerate them
|
|
305
|
+
for (auto const& pr : cand) {
|
|
306
|
+
int item = pr.first;
|
|
307
|
+
prefix.push_back(item);
|
|
308
|
+
|
|
309
|
+
if (use_dic) {
|
|
310
|
+
// “un-compress” each pattern back to original IDs
|
|
311
|
+
std::vector<int> unmapped;
|
|
312
|
+
unmapped.reserve(prefix.size());
|
|
313
|
+
for (int cid : prefix) {
|
|
314
|
+
int abs_id = std::abs(cid);
|
|
315
|
+
int o = inv_item_dic[abs_id];
|
|
316
|
+
unmapped.push_back(cid < 0 ? -o : o);
|
|
317
|
+
}
|
|
318
|
+
out.push_back(std::move(unmapped));
|
|
319
|
+
} else {
|
|
320
|
+
// just store the raw prefix
|
|
321
|
+
out.push_back(prefix);
|
|
322
|
+
}
|
|
323
|
+
|
|
324
|
+
// 5) project on the *first* occurrence of `item`
|
|
325
|
+
std::vector<std::vector<int>> proj;
|
|
326
|
+
proj.reserve(db.size());
|
|
327
|
+
for (auto const& seq : db) {
|
|
328
|
+
auto it = std::find(seq.begin(), seq.end(), item);
|
|
329
|
+
if (it != seq.end() && ++it != seq.end()) {
|
|
330
|
+
proj.emplace_back(it, seq.end());
|
|
331
|
+
}
|
|
332
|
+
}
|
|
333
|
+
|
|
334
|
+
if (!proj.empty()) {
|
|
335
|
+
Freq_miner_list(proj, prefix, minsup, out);
|
|
336
|
+
}
|
|
337
|
+
|
|
338
|
+
prefix.pop_back();
|
|
339
|
+
}
|
|
340
|
+
}
|
|
341
|
+
|
|
342
|
+
} // namespace largebm
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
|
|
3
|
+
#include "load_inst.hpp"
|
|
4
|
+
#include "build_mdd.hpp"
|
|
5
|
+
|
|
6
|
+
namespace largebm {
|
|
7
|
+
|
|
8
|
+
void Freq_miner();
|
|
9
|
+
// recursive helper for the list‐based mode
|
|
10
|
+
void Freq_miner_list(const std::vector<std::vector<int>>& db,
|
|
11
|
+
std::vector<int>& prefix,
|
|
12
|
+
unsigned long long theta,
|
|
13
|
+
std::vector<std::vector<int>>& out);
|
|
14
|
+
/// A candidate sequential pattern kept on the DFS stack.
///
/// The container types are spelled `std::vector` so the header does not depend
/// on a using-directive from another header.
class Pattern {
public:

    std::vector<int> seq;                         ///< the pattern as signed item codes
    std::vector<unsigned long long int> str_pnt;  ///< indices of arcs in `Tree` for this pattern
    std::vector<int> list;                        ///< candidate item codes for extending it

    unsigned long long int freq = 0;              ///< accumulated support

    /// Builds `_seq + item`. The contents of `_seq` are taken over via swap,
    /// so the caller's vector is empty afterwards.
    Pattern(std::vector<int>& _seq, int item) {
        seq.swap(_seq);
        seq.push_back(item);
    }

    /// Builds the single-item pattern `{item}`. Left non-explicit so existing
    /// implicit conversions from int keep compiling.
    Pattern(int item) {
        seq.push_back(item);
    }

    /// Builds an empty pattern with zero support.
    Pattern() = default;

};
|
|
40
|
+
|
|
41
|
+
extern unsigned long long int num_patt;
|
|
42
|
+
extern std::vector<bool> ilist;
|
|
43
|
+
extern std::vector<bool> slist;
|
|
44
|
+
extern std::vector<int> DFS_numfound;
|
|
45
|
+
extern Pattern _patt;
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
}
|