effspm 0.1.11__tar.gz → 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- effspm-0.2.1/MANIFEST.in +12 -0
- {effspm-0.1.11/effspm.egg-info → effspm-0.2.1}/PKG-INFO +1 -1
- effspm-0.2.1/effspm/__init__.py +3 -0
- effspm-0.2.1/effspm/_effspm.cpp +577 -0
- {effspm-0.1.11 → effspm-0.2.1}/effspm/btminer/src/freq_miner.cpp +3 -0
- {effspm-0.1.11 → effspm-0.2.1}/effspm/btminer/src/load_inst.cpp +10 -4
- {effspm-0.1.11 → effspm-0.2.1}/effspm/btminer/src/load_inst.hpp +2 -0
- effspm-0.2.1/effspm/btminer/src/utility.cpp +65 -0
- {effspm-0.1.11 → effspm-0.2.1}/effspm/btminer/src/utility.hpp +9 -13
- effspm-0.2.1/effspm/htminer/src/build_mdd.cpp +192 -0
- effspm-0.2.1/effspm/htminer/src/build_mdd.hpp +64 -0
- effspm-0.2.1/effspm/htminer/src/freq_miner.cpp +350 -0
- effspm-0.2.1/effspm/htminer/src/freq_miner.hpp +60 -0
- effspm-0.2.1/effspm/htminer/src/load_inst.cpp +381 -0
- effspm-0.2.1/effspm/htminer/src/load_inst.hpp +23 -0
- effspm-0.2.1/effspm/htminer/src/main.cpp +96 -0
- effspm-0.2.1/effspm/htminer/src/utility.cpp +72 -0
- effspm-0.2.1/effspm/htminer/src/utility.hpp +77 -0
- effspm-0.2.1/effspm/largebm/src/build_mdd.cpp +137 -0
- effspm-0.2.1/effspm/largebm/src/build_mdd.hpp +47 -0
- effspm-0.2.1/effspm/largebm/src/freq_miner.cpp +342 -0
- effspm-0.2.1/effspm/largebm/src/freq_miner.hpp +48 -0
- effspm-0.2.1/effspm/largebm/src/load_inst.cpp +235 -0
- effspm-0.2.1/effspm/largebm/src/load_inst.hpp +45 -0
- effspm-0.2.1/effspm/largebm/src/main.cpp +95 -0
- effspm-0.2.1/effspm/largebm/src/utility.cpp +45 -0
- effspm-0.2.1/effspm/largebm/src/utility.hpp +18 -0
- effspm-0.2.1/effspm/largehm/src/build_mdd.cpp +173 -0
- effspm-0.2.1/effspm/largehm/src/build_mdd.hpp +93 -0
- effspm-0.2.1/effspm/largehm/src/freq_miner.cpp +441 -0
- effspm-0.2.1/effspm/largehm/src/freq_miner.hpp +77 -0
- effspm-0.2.1/effspm/largehm/src/load_inst.cpp +357 -0
- effspm-0.2.1/effspm/largehm/src/load_inst.hpp +64 -0
- effspm-0.2.1/effspm/largehm/src/main.cpp +95 -0
- effspm-0.2.1/effspm/largehm/src/utility.cpp +38 -0
- effspm-0.2.1/effspm/largehm/src/utility.hpp +29 -0
- effspm-0.2.1/effspm/largepp/src/freq_miner.cpp +170 -0
- effspm-0.2.1/effspm/largepp/src/freq_miner.hpp +43 -0
- effspm-0.2.1/effspm/largepp/src/load_inst.cpp +219 -0
- effspm-0.2.1/effspm/largepp/src/load_inst.hpp +28 -0
- effspm-0.2.1/effspm/largepp/src/main.cpp +108 -0
- effspm-0.2.1/effspm/largepp/src/utility.cpp +33 -0
- effspm-0.2.1/effspm/largepp/src/utility.hpp +20 -0
- {effspm-0.1.11 → effspm-0.2.1/effspm.egg-info}/PKG-INFO +1 -1
- effspm-0.2.1/effspm.egg-info/SOURCES.txt +66 -0
- {effspm-0.1.11 → effspm-0.2.1}/pyproject.toml +1 -1
- effspm-0.2.1/setup.py +70 -0
- effspm-0.2.1/tests/test.py +31 -0
- effspm-0.2.1/tests/test_basic.py +37 -0
- effspm-0.1.11/MANIFEST.in +0 -7
- effspm-0.1.11/effspm/__init__.py +0 -3
- effspm-0.1.11/effspm/_effspm.cpp +0 -153
- effspm-0.1.11/effspm/btminer/src/utility.cpp +0 -67
- effspm-0.1.11/effspm.egg-info/SOURCES.txt +0 -32
- effspm-0.1.11/setup.py +0 -37
- effspm-0.1.11/tests/test.py +0 -3
- effspm-0.1.11/tests/test_basic.py +0 -11
- {effspm-0.1.11 → effspm-0.2.1}/LICENSE +0 -0
- {effspm-0.1.11 → effspm-0.2.1}/README.md +0 -0
- {effspm-0.1.11 → effspm-0.2.1}/effspm/_core.cpp +0 -0
- {effspm-0.1.11 → effspm-0.2.1}/effspm/btminer/src/build_mdd.cpp +0 -0
- {effspm-0.1.11 → effspm-0.2.1}/effspm/btminer/src/build_mdd.hpp +0 -0
- {effspm-0.1.11 → effspm-0.2.1}/effspm/btminer/src/freq_miner.hpp +0 -0
- {effspm-0.1.11 → effspm-0.2.1}/effspm/btminer/src/main.cpp +0 -0
- {effspm-0.1.11 → effspm-0.2.1}/effspm/freq_miner.cpp +0 -0
- {effspm-0.1.11 → effspm-0.2.1}/effspm/freq_miner.hpp +0 -0
- {effspm-0.1.11 → effspm-0.2.1}/effspm/load_inst.cpp +0 -0
- {effspm-0.1.11 → effspm-0.2.1}/effspm/load_inst.hpp +0 -0
- {effspm-0.1.11 → effspm-0.2.1}/effspm/main.cpp +0 -0
- {effspm-0.1.11 → effspm-0.2.1}/effspm/utility.cpp +0 -0
- {effspm-0.1.11 → effspm-0.2.1}/effspm/utility.hpp +0 -0
- {effspm-0.1.11 → effspm-0.2.1}/effspm.egg-info/dependency_links.txt +0 -0
- {effspm-0.1.11 → effspm-0.2.1}/effspm.egg-info/not-zip-safe +0 -0
- {effspm-0.1.11 → effspm-0.2.1}/effspm.egg-info/requires.txt +0 -0
- {effspm-0.1.11 → effspm-0.2.1}/effspm.egg-info/top_level.txt +0 -0
- {effspm-0.1.11 → effspm-0.2.1}/setup.cfg +0 -0
effspm-0.2.1/MANIFEST.in
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
include LICENSE
|
|
2
|
+
include README.md
|
|
3
|
+
include pyproject.toml
|
|
4
|
+
include setup.py
|
|
5
|
+
recursive-include effspm *.hpp
|
|
6
|
+
recursive-include effspm *.cpp
|
|
7
|
+
recursive-include effspm *.py
|
|
8
|
+
recursive-include effspm/htminer/src *.cpp *.hpp
|
|
9
|
+
recursive-include effspm/btminer/src *.cpp *.hpp
|
|
10
|
+
recursive-include effspm/largepp/src *.cpp *.hpp
|
|
11
|
+
recursive-include effspm/largebm/src *.cpp *.hpp
|
|
12
|
+
recursive-include effspm/largehm/src *.cpp *.hpp
|
|
@@ -0,0 +1,577 @@
|
|
|
1
|
+
// _effspm.cpp
|
|
2
|
+
|
|
3
|
+
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
namespace py = pybind11;

#include <algorithm>
#include <climits>
#include <cmath>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

// PrefixProjection headers
#include "freq_miner.hpp"
#include "load_inst.hpp"
#include "utility.hpp"

// BTMiner (wrapped in its own namespace in source files)
#include "btminer/src/freq_miner.hpp"
#include "btminer/src/load_inst.hpp"
#include "btminer/src/utility.hpp"
#include "btminer/src/build_mdd.hpp"

// HTMiner (wrapped in its own namespace in source files)
#include "htminer/src/build_mdd.hpp" // ← ensure HTMiner MDD builder is available
#include "htminer/src/freq_miner.hpp"
#include "htminer/src/load_inst.hpp"
#include "htminer/src/utility.hpp"

#include "largepp/src/freq_miner.hpp"
#include "largepp/src/load_inst.hpp"
#include "largepp/src/utility.hpp"

#include "largebm/src/freq_miner.hpp"
#include "largebm/src/load_inst.hpp"
#include "largebm/src/utility.hpp"
#include "largebm/src/build_mdd.hpp"

#include "largehm/src/freq_miner.hpp"
#include "largehm/src/load_inst.hpp"
#include "largehm/src/utility.hpp"
#include "largehm/src/build_mdd.hpp"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
PYBIND11_MODULE(_effspm, m) {
|
|
44
|
+
m.doc() = "Unified SPM library: PrefixProjection, BTMiner, HTMiner";
|
|
45
|
+
|
|
46
|
+
// ─────────────────────────────────────────────────────────────
|
|
47
|
+
// PrefixProjection
|
|
48
|
+
// ─────────────────────────────────────────────────────────────
|
|
49
|
+
m.def("PrefixProjection",
|
|
50
|
+
[](py::object data,
|
|
51
|
+
double minsup,
|
|
52
|
+
unsigned int time_limit,
|
|
53
|
+
bool preproc,
|
|
54
|
+
bool use_dic,
|
|
55
|
+
bool verbose,
|
|
56
|
+
const std::string &out_file)
|
|
57
|
+
{
|
|
58
|
+
::time_limit = time_limit;
|
|
59
|
+
::pre_pro = preproc;
|
|
60
|
+
::use_dic = use_dic;
|
|
61
|
+
::use_list = false;
|
|
62
|
+
::b_disp = verbose;
|
|
63
|
+
::b_write = !out_file.empty();
|
|
64
|
+
::out_file = out_file;
|
|
65
|
+
|
|
66
|
+
ClearCollected();
|
|
67
|
+
start_time = std::clock();
|
|
68
|
+
|
|
69
|
+
if (py::isinstance<py::str>(data)) {
|
|
70
|
+
std::string path = data.cast<std::string>();
|
|
71
|
+
if (!Load_instance(path, minsup))
|
|
72
|
+
throw std::runtime_error("Failed to load file: " + path);
|
|
73
|
+
} else {
|
|
74
|
+
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
75
|
+
items = std::move(seqs);
|
|
76
|
+
N = items.size();
|
|
77
|
+
|
|
78
|
+
int max_id = 0;
|
|
79
|
+
for (auto &seq : items)
|
|
80
|
+
for (int x : seq)
|
|
81
|
+
max_id = std::max(max_id, std::abs(x));
|
|
82
|
+
L = max_id;
|
|
83
|
+
|
|
84
|
+
theta = (minsup < 1.0) ? std::ceil(minsup * N) : minsup;
|
|
85
|
+
|
|
86
|
+
DFS.clear();
|
|
87
|
+
DFS.reserve(L);
|
|
88
|
+
for (unsigned int i = 0; i < L; ++i)
|
|
89
|
+
DFS.emplace_back(-static_cast<int>(i) - 1);
|
|
90
|
+
|
|
91
|
+
M = 0;
|
|
92
|
+
E = 0;
|
|
93
|
+
for (auto &seq : items) {
|
|
94
|
+
M = std::max<unsigned int>(M, seq.size());
|
|
95
|
+
E += seq.size();
|
|
96
|
+
}
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
Freq_miner();
|
|
100
|
+
|
|
101
|
+
py::dict out;
|
|
102
|
+
out["patterns"] = GetCollected();
|
|
103
|
+
out["time"] = give_time(std::clock() - start_time);
|
|
104
|
+
return out;
|
|
105
|
+
},
|
|
106
|
+
py::arg("data"),
|
|
107
|
+
py::arg("minsup") = 0.01,
|
|
108
|
+
py::arg("time_limit") = 36000,
|
|
109
|
+
py::arg("preproc") = false,
|
|
110
|
+
py::arg("use_dic") = false,
|
|
111
|
+
py::arg("verbose") = false,
|
|
112
|
+
py::arg("out_file") = ""
|
|
113
|
+
);
|
|
114
|
+
|
|
115
|
+
// ─────────────────────────────────────────────────────────────
|
|
116
|
+
// BTMiner
|
|
117
|
+
// ─────────────────────────────────────────────────────────────
|
|
118
|
+
m.def("BTMiner",
|
|
119
|
+
[](py::object data,
|
|
120
|
+
double minsup,
|
|
121
|
+
unsigned int time_limit,
|
|
122
|
+
bool preproc,
|
|
123
|
+
bool use_dic,
|
|
124
|
+
bool verbose,
|
|
125
|
+
const std::string &out_file)
|
|
126
|
+
{
|
|
127
|
+
btminer::time_limit = time_limit;
|
|
128
|
+
btminer::pre_pro = preproc;
|
|
129
|
+
btminer::use_dic = use_dic;
|
|
130
|
+
btminer::use_list = false;
|
|
131
|
+
btminer::b_disp = verbose;
|
|
132
|
+
btminer::b_write = !out_file.empty();
|
|
133
|
+
btminer::out_file = out_file;
|
|
134
|
+
|
|
135
|
+
btminer::ClearCollected();
|
|
136
|
+
btminer::start_time = std::clock();
|
|
137
|
+
|
|
138
|
+
if (py::isinstance<py::str>(data)) {
|
|
139
|
+
std::string path = data.cast<std::string>();
|
|
140
|
+
if (!btminer::Load_instance(path, minsup))
|
|
141
|
+
throw std::runtime_error("Failed to load file: " + path);
|
|
142
|
+
} else {
|
|
143
|
+
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
144
|
+
btminer::items = std::move(seqs);
|
|
145
|
+
btminer::N = btminer::items.size();
|
|
146
|
+
|
|
147
|
+
int max_id = 0;
|
|
148
|
+
for (auto &seq : btminer::items)
|
|
149
|
+
for (int x : seq)
|
|
150
|
+
max_id = std::max(max_id, std::abs(x));
|
|
151
|
+
btminer::L = max_id;
|
|
152
|
+
|
|
153
|
+
btminer::theta = (minsup < 1.0) ? std::ceil(minsup * btminer::N) : minsup;
|
|
154
|
+
|
|
155
|
+
btminer::DFS.clear();
|
|
156
|
+
btminer::DFS.reserve(btminer::L);
|
|
157
|
+
for (unsigned int i = 0; i < btminer::L; ++i)
|
|
158
|
+
btminer::DFS.emplace_back(-static_cast<int>(i) - 1);
|
|
159
|
+
|
|
160
|
+
btminer::M = 0;
|
|
161
|
+
btminer::E = 0;
|
|
162
|
+
for (auto &seq : btminer::items) {
|
|
163
|
+
btminer::M = std::max<unsigned int>(btminer::M, seq.size());
|
|
164
|
+
btminer::E += seq.size();
|
|
165
|
+
}
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
btminer::Freq_miner();
|
|
169
|
+
|
|
170
|
+
py::dict out;
|
|
171
|
+
out["patterns"] = btminer::GetCollected();
|
|
172
|
+
out["time"] = btminer::give_time(std::clock() - btminer::start_time);
|
|
173
|
+
return out;
|
|
174
|
+
},
|
|
175
|
+
py::arg("data"),
|
|
176
|
+
py::arg("minsup") = 0.01,
|
|
177
|
+
py::arg("time_limit") = 36000,
|
|
178
|
+
py::arg("preproc") = false,
|
|
179
|
+
py::arg("use_dic") = false,
|
|
180
|
+
py::arg("verbose") = false,
|
|
181
|
+
py::arg("out_file") = ""
|
|
182
|
+
);
|
|
183
|
+
|
|
184
|
+
// ─────────────────────────────────────────────────────────────
|
|
185
|
+
// HTMiner
|
|
186
|
+
// ─────────────────────────────────────────────────────────────
|
|
187
|
+
m.def("HTMiner",
|
|
188
|
+
[](py::object data,
|
|
189
|
+
double minsup, unsigned int time_limit,
|
|
190
|
+
bool preproc, bool use_dic,
|
|
191
|
+
bool verbose, const std::string &out_file)
|
|
192
|
+
{
|
|
193
|
+
// 1) set HTMiner globals (declared in htminer/src/utility.hpp)
|
|
194
|
+
htminer::time_limit = time_limit;
|
|
195
|
+
htminer::pre_pro = preproc;
|
|
196
|
+
htminer::use_dic = use_dic;
|
|
197
|
+
htminer::just_build = false; // or true if you want “build only”
|
|
198
|
+
htminer::use_list = false; // HTMiner always uses MDD‐based mode
|
|
199
|
+
htminer::b_disp = verbose;
|
|
200
|
+
htminer::b_write = !out_file.empty();
|
|
201
|
+
htminer::out_file = out_file;
|
|
202
|
+
htminer::ClearCollected(); // clear any leftover patterns
|
|
203
|
+
htminer::start_time = std::clock();
|
|
204
|
+
|
|
205
|
+
// 2) load sequences (either from filename or from Python list)
|
|
206
|
+
if (py::isinstance<py::str>(data)) {
|
|
207
|
+
std::string path = data.cast<std::string>();
|
|
208
|
+
if (!htminer::Load_instance(path, minsup))
|
|
209
|
+
throw std::runtime_error("Failed to load file: " + path);
|
|
210
|
+
} else {
|
|
211
|
+
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
212
|
+
htminer::items = std::move(seqs);
|
|
213
|
+
htminer::N = htminer::items.size();
|
|
214
|
+
|
|
215
|
+
// compute L (max item ID), M (max sequence length), E (total entries)
|
|
216
|
+
int max_id = 0;
|
|
217
|
+
htminer::M = 0;
|
|
218
|
+
htminer::E = 0;
|
|
219
|
+
for (auto &seq : htminer::items) {
|
|
220
|
+
htminer::M = std::max<unsigned int>(htminer::M, seq.size());
|
|
221
|
+
for (int x : seq)
|
|
222
|
+
max_id = std::max(max_id, std::abs(x));
|
|
223
|
+
htminer::E += seq.size();
|
|
224
|
+
}
|
|
225
|
+
htminer::L = max_id;
|
|
226
|
+
htminer::theta = (minsup < 1.0)
|
|
227
|
+
? static_cast<unsigned long long>(std::ceil(minsup * htminer::N))
|
|
228
|
+
: static_cast<unsigned long long>(minsup);
|
|
229
|
+
|
|
230
|
+
// build empty DFS stack (size L) as HTMiner expects
|
|
231
|
+
htminer::DFS.clear();
|
|
232
|
+
htminer::DFS.reserve(htminer::L);
|
|
233
|
+
for (unsigned int i = 0; i < static_cast<unsigned int>(htminer::L); ++i)
|
|
234
|
+
htminer::DFS.emplace_back(-static_cast<int>(i) - 1);
|
|
235
|
+
|
|
236
|
+
// initialize VDFS if HTMiner needs it
|
|
237
|
+
htminer::VDFS.clear();
|
|
238
|
+
htminer::VDFS.resize(htminer::L);
|
|
239
|
+
}
|
|
240
|
+
|
|
241
|
+
// 3) run the mining algorithm
|
|
242
|
+
htminer::Freq_miner();
|
|
243
|
+
|
|
244
|
+
// std::cout << "[HTMiner] dumping all collected patterns:\n";
|
|
245
|
+
// for (size_t i = 0; i < htminer::collectedPatterns.size(); ++i) {
|
|
246
|
+
// const auto &seq = htminer::collectedPatterns[i];
|
|
247
|
+
// std::cout << "Pattern " << i << ": { ";
|
|
248
|
+
// for (int x : seq) {
|
|
249
|
+
// std::cout << x << " ";
|
|
250
|
+
// }
|
|
251
|
+
// std::cout << "}\n";
|
|
252
|
+
//}
|
|
253
|
+
std::cout << " total patterns = "
|
|
254
|
+
<< htminer::collectedPatterns.size() << "\n";
|
|
255
|
+
// ─────────────────────────────────────────────────
|
|
256
|
+
|
|
257
|
+
// 4) return patterns + elapsed time
|
|
258
|
+
py::dict out;
|
|
259
|
+
out["patterns"] = htminer::GetCollected();
|
|
260
|
+
out["time"] = htminer::give_time(std::clock() - htminer::start_time);
|
|
261
|
+
return out;
|
|
262
|
+
},
|
|
263
|
+
py::arg("data"),
|
|
264
|
+
py::arg("minsup") = 0.01,
|
|
265
|
+
py::arg("time_limit") = 36000,
|
|
266
|
+
py::arg("preproc") = false,
|
|
267
|
+
py::arg("use_dic") = false,
|
|
268
|
+
py::arg("verbose") = false,
|
|
269
|
+
py::arg("out_file") = ""
|
|
270
|
+
);
|
|
271
|
+
|
|
272
|
+
m.def("LargePrefixProjection",
|
|
273
|
+
[](py::object data,
|
|
274
|
+
double minsup,
|
|
275
|
+
unsigned int time_limit,
|
|
276
|
+
bool preproc,
|
|
277
|
+
bool use_dic,
|
|
278
|
+
bool verbose,
|
|
279
|
+
const std::string &out_file)
|
|
280
|
+
{
|
|
281
|
+
largepp::time_limit = time_limit;
|
|
282
|
+
largepp::pre_pro = preproc;
|
|
283
|
+
largepp::use_dic = use_dic;
|
|
284
|
+
largepp::use_list = true; // ← key difference
|
|
285
|
+
largepp::b_disp = verbose;
|
|
286
|
+
largepp::b_write = !out_file.empty();
|
|
287
|
+
largepp::out_file = out_file;
|
|
288
|
+
largepp::just_build = false;
|
|
289
|
+
|
|
290
|
+
largepp::ClearCollected();
|
|
291
|
+
largepp::start_time = std::clock();
|
|
292
|
+
std::string fname = data.cast<std::string>();
|
|
293
|
+
/* 1) load instance (py list or filename) */
|
|
294
|
+
if (py::isinstance<py::str>(data))
|
|
295
|
+
|
|
296
|
+
largepp::Load_instance(fname, minsup);
|
|
297
|
+
else
|
|
298
|
+
largepp::Load_py(data, minsup); // helper you’ll expose
|
|
299
|
+
|
|
300
|
+
std::vector<unsigned long long> dbg;
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
largepp::Freq_miner();
|
|
308
|
+
|
|
309
|
+
py::dict out;
|
|
310
|
+
out["patterns"] = largepp::GetCollected();
|
|
311
|
+
out["time"] = largepp::give_time(std::clock() - largepp::start_time);
|
|
312
|
+
return out;
|
|
313
|
+
},
|
|
314
|
+
py::arg("data"),
|
|
315
|
+
py::arg("minsup") = 0.01,
|
|
316
|
+
py::arg("time_limit") = 36000,
|
|
317
|
+
py::arg("preproc") = false,
|
|
318
|
+
py::arg("use_dic") = false,
|
|
319
|
+
py::arg("verbose") = false,
|
|
320
|
+
py::arg("out_file") = ""
|
|
321
|
+
);
|
|
322
|
+
|
|
323
|
+
// ─────────────────────────────────────────────────────────────
|
|
324
|
+
// LargeBTMiner -- Python wrapper for the largebm implementation
|
|
325
|
+
// ─────────────────────────────────────────────────────────────
|
|
326
|
+
// m.def(
|
|
327
|
+
// "LargeBTMiner",
|
|
328
|
+
// [](py::object data,
|
|
329
|
+
// double minsup ,
|
|
330
|
+
// unsigned int time_limit,
|
|
331
|
+
// bool preproc ,
|
|
332
|
+
// bool use_dic,
|
|
333
|
+
// bool verbose,
|
|
334
|
+
// const std::string &out_file )
|
|
335
|
+
// {
|
|
336
|
+
// /* 1) Global flags */
|
|
337
|
+
// largebm::time_limit = time_limit;
|
|
338
|
+
// largebm::pre_pro = preproc;
|
|
339
|
+
// largebm::use_dic = use_dic;
|
|
340
|
+
// largebm::use_list = false; // large-mode → always MDD
|
|
341
|
+
// largebm::just_build = false;
|
|
342
|
+
// largebm::b_disp = verbose;
|
|
343
|
+
// largebm::b_write = !out_file.empty();
|
|
344
|
+
// largebm::out_file = out_file;
|
|
345
|
+
|
|
346
|
+
// /* 2) Reset per-run state */
|
|
347
|
+
// largebm::ClearCollected();
|
|
348
|
+
// largebm::start_time = std::clock();
|
|
349
|
+
|
|
350
|
+
// /* 3) Load the DB (file path or in-memory list<list<int>>) */
|
|
351
|
+
// if (py::isinstance<py::str>(data)) {
|
|
352
|
+
// std::string path = data.cast<std::string>();
|
|
353
|
+
// if (!largebm::Load_instance(path, minsup))
|
|
354
|
+
// throw std::runtime_error("Failed to load file: " + path);
|
|
355
|
+
// } else {
|
|
356
|
+
// // In-memory sequences
|
|
357
|
+
// largebm::items = std::move(data.cast<std::vector<std::vector<int>>>());
|
|
358
|
+
// largebm::N = static_cast<unsigned int>(largebm::items.size());
|
|
359
|
+
|
|
360
|
+
// /* -- basic stats -- */
|
|
361
|
+
// int max_id = 0;
|
|
362
|
+
// largebm::M = 0;
|
|
363
|
+
// largebm::E = 0;
|
|
364
|
+
// for ( auto &seq : largebm::items) {
|
|
365
|
+
// largebm::M = std::max<unsigned int>(largebm::M,
|
|
366
|
+
// static_cast<unsigned int>(seq.size()));
|
|
367
|
+
// largebm::E += static_cast<unsigned long long>(seq.size());
|
|
368
|
+
// for (int x : seq) max_id = std::max(max_id, std::abs(x));
|
|
369
|
+
// }
|
|
370
|
+
// largebm::L = static_cast<unsigned int>(max_id);
|
|
371
|
+
// largebm::theta = (minsup < 1.0)
|
|
372
|
+
// ? static_cast<unsigned long long>(std::ceil(minsup * largebm::N))
|
|
373
|
+
// : static_cast<unsigned long long>(minsup);
|
|
374
|
+
|
|
375
|
+
// /* -- DFS buffer (size = L) -- */
|
|
376
|
+
// largebm::DFS.clear();
|
|
377
|
+
// largebm::DFS.reserve(largebm::L);
|
|
378
|
+
// for (unsigned int i = 0; i < largebm::L; ++i)
|
|
379
|
+
// largebm::DFS.emplace_back(-static_cast<int>(i) - 1);
|
|
380
|
+
|
|
381
|
+
// /* -- Build the MDD -- */
|
|
382
|
+
// largebm::Tree.clear();
|
|
383
|
+
// largebm::Tree.emplace_back(0, 0, 0); // dummy root
|
|
384
|
+
// for ( auto &seq : largebm::items)
|
|
385
|
+
// largebm::Build_MDD(seq);
|
|
386
|
+
// }
|
|
387
|
+
|
|
388
|
+
// /* 4) Mine and return results */
|
|
389
|
+
// largebm::Freq_miner();
|
|
390
|
+
|
|
391
|
+
// py::dict out;
|
|
392
|
+
// out["patterns"] = largebm::GetCollected();
|
|
393
|
+
// out["time"] = largebm::give_time(std::clock() - largebm::start_time);
|
|
394
|
+
// return out;
|
|
395
|
+
// },
|
|
396
|
+
// py::arg("data"),
|
|
397
|
+
// py::arg("minsup") = 0.01,
|
|
398
|
+
// py::arg("time_limit") = 36000,
|
|
399
|
+
// py::arg("preproc") = false,
|
|
400
|
+
// py::arg("use_dic") = false,
|
|
401
|
+
// py::arg("verbose") = false,
|
|
402
|
+
// py::arg("out_file") = ""
|
|
403
|
+
// );
|
|
404
|
+
|
|
405
|
+
|
|
406
|
+
m.def("LargeBTMiner",
|
|
407
|
+
[](py::object data,
|
|
408
|
+
double minsup,
|
|
409
|
+
unsigned int time_limit,
|
|
410
|
+
bool preproc,
|
|
411
|
+
bool use_dic,
|
|
412
|
+
bool verbose,
|
|
413
|
+
const std::string &out_file)
|
|
414
|
+
{
|
|
415
|
+
largebm::time_limit = time_limit;
|
|
416
|
+
largebm::pre_pro = preproc;
|
|
417
|
+
largebm::use_dic = use_dic;
|
|
418
|
+
largebm::use_list = false; // <-- switch into “large” mode
|
|
419
|
+
largebm::b_disp = verbose;
|
|
420
|
+
largebm::b_write = !out_file.empty();
|
|
421
|
+
largebm::out_file = out_file;
|
|
422
|
+
largebm::just_build = false;
|
|
423
|
+
|
|
424
|
+
// ── Build the inverse‐dictionary here ────────────────────────────
|
|
425
|
+
{
|
|
426
|
+
std::vector<int> local_inv( largebm::item_dic.size() + 1 );
|
|
427
|
+
for (int old = 1; old <= (int)largebm::item_dic.size(); ++old) {
|
|
428
|
+
int cid = largebm::item_dic[old - 1];
|
|
429
|
+
if (cid > 0)
|
|
430
|
+
local_inv[cid] = old;
|
|
431
|
+
}
|
|
432
|
+
largebm::inv_item_dic = std::move(local_inv);
|
|
433
|
+
}
|
|
434
|
+
// ─std::cerr << "inv_item_dic size=" << largebm::inv_item_dic.size() << "\n";
|
|
435
|
+
for (size_t i = 0; i < largebm::inv_item_dic.size(); ++i) {
|
|
436
|
+
//std::cerr << i << "→" << largebm::inv_item_dic[i] << " ";
|
|
437
|
+
}
|
|
438
|
+
std::cerr << "\n";
|
|
439
|
+
|
|
440
|
+
largebm::ClearCollected();
|
|
441
|
+
largebm::start_time = std::clock();
|
|
442
|
+
|
|
443
|
+
if (py::isinstance<py::str>(data)) {
|
|
444
|
+
// load from filename
|
|
445
|
+
std::string path = data.cast<std::string>();
|
|
446
|
+
if (!largebm::Load_instance(path, minsup))
|
|
447
|
+
throw std::runtime_error("Failed to load file: " + path);
|
|
448
|
+
}
|
|
449
|
+
else {
|
|
450
|
+
// load from in‐memory sequences
|
|
451
|
+
largebm::items = std::move(data.cast<std::vector<std::vector<int>>>());
|
|
452
|
+
|
|
453
|
+
}
|
|
454
|
+
|
|
455
|
+
largebm::Freq_miner();
|
|
456
|
+
|
|
457
|
+
py::dict out;
|
|
458
|
+
out["patterns"] = largebm::GetCollected();
|
|
459
|
+
out["time"] = largebm::give_time(std::clock() - largebm::start_time);
|
|
460
|
+
return out;
|
|
461
|
+
},
|
|
462
|
+
py::arg("data"),
|
|
463
|
+
py::arg("minsup") = 0.01,
|
|
464
|
+
py::arg("time_limit") = 36000,
|
|
465
|
+
py::arg("preproc") = false,
|
|
466
|
+
py::arg("use_dic") = false,
|
|
467
|
+
py::arg("verbose") = false,
|
|
468
|
+
py::arg("out_file") = ""
|
|
469
|
+
);
|
|
470
|
+
|
|
471
|
+
m.def("LargeHTMiner",
|
|
472
|
+
[](py::object data,
|
|
473
|
+
double minsup,
|
|
474
|
+
unsigned int time_limit,
|
|
475
|
+
bool preproc,
|
|
476
|
+
bool use_dic,
|
|
477
|
+
bool verbose,
|
|
478
|
+
const std::string &out_file)
|
|
479
|
+
{
|
|
480
|
+
// 0) Set global flags and timers:
|
|
481
|
+
largehm::time_limit = time_limit;
|
|
482
|
+
largehm::pre_pro = preproc;
|
|
483
|
+
largehm::use_dic = use_dic;
|
|
484
|
+
largehm::use_list = true; // force in‐memory mode
|
|
485
|
+
largehm::b_disp = verbose;
|
|
486
|
+
largehm::b_write = !out_file.empty();
|
|
487
|
+
largehm::out_file = out_file;
|
|
488
|
+
largehm::just_build = false;
|
|
489
|
+
|
|
490
|
+
largehm::ClearCollected();
|
|
491
|
+
largehm::start_time = std::clock();
|
|
492
|
+
|
|
493
|
+
if (py::isinstance<py::str>(data)) {
|
|
494
|
+
// ───────────── FILE‐BASED MODE ─────────────
|
|
495
|
+
// Force mlim so that every item lands in temp_vec (never temp_lim):
|
|
496
|
+
largehm::mlim = UINT_MAX;
|
|
497
|
+
|
|
498
|
+
std::string path = data.cast<std::string>();
|
|
499
|
+
if (! largehm::Load_instance(path, minsup))
|
|
500
|
+
throw std::runtime_error("Failed to load file: " + path);
|
|
501
|
+
}
|
|
502
|
+
else {
|
|
503
|
+
// ───────────── IN‐MEMORY MODE ─────────────
|
|
504
|
+
auto seqs = data.cast<std::vector<std::vector<int>>>();
|
|
505
|
+
largehm::items = std::move(seqs);
|
|
506
|
+
largehm::N = largehm::items.size();
|
|
507
|
+
|
|
508
|
+
// 1) Compute L = maximum absolute item ID
|
|
509
|
+
int max_id = 0;
|
|
510
|
+
for (auto &seq : largehm::items)
|
|
511
|
+
for (int x : seq)
|
|
512
|
+
max_id = std::max(max_id, std::abs(x));
|
|
513
|
+
largehm::L = static_cast<unsigned int>(max_id);
|
|
514
|
+
|
|
515
|
+
// 2) Compute theta as absolute support threshold
|
|
516
|
+
largehm::theta = (minsup < 1.0)
|
|
517
|
+
? static_cast<unsigned long long>(std::ceil(minsup * largehm::N))
|
|
518
|
+
: static_cast<unsigned long long>(minsup);
|
|
519
|
+
|
|
520
|
+
// 3) Initialize DFS (size = L)
|
|
521
|
+
largehm::DFS.clear();
|
|
522
|
+
largehm::DFS.reserve(largehm::L);
|
|
523
|
+
for (unsigned int i = 0; i < largehm::L; ++i)
|
|
524
|
+
largehm::DFS.emplace_back(-static_cast<int>(i) - 1);
|
|
525
|
+
|
|
526
|
+
// 4) Compute M (max sequence length) and E (total entries)
|
|
527
|
+
largehm::M = 0;
|
|
528
|
+
largehm::E = 0;
|
|
529
|
+
for (auto &seq : largehm::items) {
|
|
530
|
+
largehm::M = std::max<unsigned int>(
|
|
531
|
+
largehm::M, static_cast<unsigned int>(seq.size()));
|
|
532
|
+
largehm::E += seq.size();
|
|
533
|
+
}
|
|
534
|
+
|
|
535
|
+
// 5) ─── Build the MDD “manually” ───
|
|
536
|
+
largehm::Tree.clear();
|
|
537
|
+
largehm::VTree.clear();
|
|
538
|
+
largehm::CTree.clear();
|
|
539
|
+
|
|
540
|
+
// Insert exactly one dummy root node (chld=0, sibl=0, freq=0):
|
|
541
|
+
largehm::Tree.emplace_back(0,0,0);
|
|
542
|
+
|
|
543
|
+
// For each sequence “seq”, insert into MDD by placing a single −1 sentinel:
|
|
544
|
+
for (auto &seq : largehm::items) {
|
|
545
|
+
// Copy the item IDs:
|
|
546
|
+
std::vector<int> temp_vec = seq;
|
|
547
|
+
// Only a single “−1” is needed to force the suffix insertion:
|
|
548
|
+
std::vector<int> temp_lim(1, -1);
|
|
549
|
+
|
|
550
|
+
largehm::Build_MDD(temp_vec, temp_lim);
|
|
551
|
+
}
|
|
552
|
+
|
|
553
|
+
|
|
554
|
+
}
|
|
555
|
+
|
|
556
|
+
// 6) Run the frequency miner (Tree is now properly built):
|
|
557
|
+
largehm::Freq_miner();
|
|
558
|
+
|
|
559
|
+
// 7) Return results to Python:
|
|
560
|
+
py::dict out;
|
|
561
|
+
out["patterns"] = largehm::GetCollected();
|
|
562
|
+
out["time"] = largehm::give_time(std::clock() - largehm::start_time);
|
|
563
|
+
return out;
|
|
564
|
+
},
|
|
565
|
+
py::arg("data"),
|
|
566
|
+
py::arg("minsup") = 0.01,
|
|
567
|
+
py::arg("time_limit") = 36000,
|
|
568
|
+
py::arg("preproc") = false,
|
|
569
|
+
py::arg("use_dic") = false,
|
|
570
|
+
py::arg("verbose") = false,
|
|
571
|
+
py::arg("out_file") = ""
|
|
572
|
+
);
|
|
573
|
+
|
|
574
|
+
|
|
575
|
+
|
|
576
|
+
|
|
577
|
+
}
|
|
@@ -156,6 +156,9 @@ void Extend_patt(Pattern _patt) {
|
|
|
156
156
|
}
|
|
157
157
|
|
|
158
158
|
void Out_patt(std::vector<int>& seq, int freq) {
|
|
159
|
+
|
|
160
|
+
btminer::collected.push_back(seq); // make pattern visible to Python
|
|
161
|
+
|
|
159
162
|
std::ofstream file_o;
|
|
160
163
|
if (b_write) file_o.open(out_file, std::ios::app);
|
|
161
164
|
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
|
|
1
2
|
#include <iostream>
|
|
2
3
|
#include <sstream>
|
|
3
4
|
#include <fstream>
|
|
@@ -15,18 +16,21 @@ namespace btminer {
|
|
|
15
16
|
|
|
16
17
|
using namespace std;
|
|
17
18
|
|
|
18
|
-
extern int num_nodes, cur_node;
|
|
19
|
+
extern int num_nodes, cur_node;
|
|
20
|
+
|
|
19
21
|
|
|
20
22
|
map<string, int> item_map;
|
|
21
23
|
map<int, string> item_map_rev;
|
|
22
24
|
vector<int> freq;
|
|
23
25
|
vector<int> item_dic;
|
|
24
26
|
|
|
25
|
-
|
|
26
27
|
void Load_items_pre(string& inst_name);
|
|
27
28
|
bool Load_items(string& inst_name);
|
|
28
29
|
bool Preprocess(string& inst, double thresh);
|
|
29
30
|
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
|
|
30
34
|
bool Load_instance(string& items_file, double thresh) {
|
|
31
35
|
clock_t kk = clock();
|
|
32
36
|
Tree.emplace_back(0, 0, 0);
|
|
@@ -124,11 +128,11 @@ void Load_items_pre(string& inst_name) {
|
|
|
124
128
|
ditem = stoi(itm);
|
|
125
129
|
}
|
|
126
130
|
|
|
127
|
-
if (freq[abs(ditem) - 1] < theta) {
|
|
131
|
+
if (pre_pro && freq.size() > abs(ditem) - 1 && freq[abs(ditem) - 1] < theta) {
|
|
128
132
|
if (!sgn)
|
|
129
133
|
sgn = ditem < 0;
|
|
130
134
|
continue;
|
|
131
|
-
} else {
|
|
135
|
+
} else if (pre_pro) {
|
|
132
136
|
ditem = (ditem > 0) ? item_dic[ditem - 1] : -item_dic[-ditem - 1];
|
|
133
137
|
}
|
|
134
138
|
|
|
@@ -144,6 +148,7 @@ void Load_items_pre(string& inst_name) {
|
|
|
144
148
|
++N;
|
|
145
149
|
if (temp_vec.size() > M) M = temp_vec.size();
|
|
146
150
|
|
|
151
|
+
E += temp_vec.size(); // <-- make sure E gets incremented
|
|
147
152
|
Build_MDD(temp_vec);
|
|
148
153
|
}
|
|
149
154
|
}
|
|
@@ -186,6 +191,7 @@ bool Load_items(string& inst_name) {
|
|
|
186
191
|
}
|
|
187
192
|
|
|
188
193
|
if (temp_vec.size() > M) M = temp_vec.size();
|
|
194
|
+
E += temp_vec.size(); // <-- make sure E gets incremented
|
|
189
195
|
Build_MDD(temp_vec);
|
|
190
196
|
}
|
|
191
197
|
return true;
|