effspm 0.1.5__cp310-cp310-win_amd64.whl → 0.3.0__cp310-cp310-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- effspm/__init__.py +9 -2
- effspm/_core.cpp +91 -13
- effspm/_effspm.cp310-win_amd64.pyd +0 -0
- effspm/_effspm.cpp +679 -0
- effspm/btminer/src/build_mdd.cpp +88 -0
- effspm/btminer/src/build_mdd.hpp +34 -0
- effspm/btminer/src/freq_miner.cpp +264 -0
- effspm/btminer/src/freq_miner.hpp +55 -0
- effspm/btminer/src/load_inst.cpp +275 -0
- effspm/btminer/src/load_inst.hpp +43 -0
- effspm/btminer/src/utility.cpp +50 -0
- effspm/btminer/src/utility.hpp +16 -0
- effspm/freq_miner.hpp +7 -1
- effspm/htminer/src/build_mdd.cpp +139 -0
- effspm/htminer/src/build_mdd.hpp +64 -0
- effspm/htminer/src/freq_miner.cpp +350 -0
- effspm/htminer/src/freq_miner.hpp +60 -0
- effspm/htminer/src/load_inst.cpp +394 -0
- effspm/htminer/src/load_inst.hpp +23 -0
- effspm/htminer/src/utility.cpp +72 -0
- effspm/htminer/src/utility.hpp +77 -0
- effspm/largebm/src/build_mdd.cpp +96 -0
- effspm/largebm/src/build_mdd.hpp +32 -0
- effspm/largebm/src/freq_miner.cpp +299 -0
- effspm/largebm/src/freq_miner.hpp +37 -0
- effspm/largebm/src/load_inst.cpp +224 -0
- effspm/largebm/src/load_inst.hpp +35 -0
- effspm/largebm/src/utility.cpp +35 -0
- effspm/largebm/src/utility.hpp +15 -0
- effspm/largehm/src/build_mdd.cpp +174 -0
- effspm/largehm/src/build_mdd.hpp +93 -0
- effspm/largehm/src/freq_miner.cpp +429 -0
- effspm/largehm/src/freq_miner.hpp +77 -0
- effspm/largehm/src/load_inst.cpp +375 -0
- effspm/largehm/src/load_inst.hpp +64 -0
- effspm/largehm/src/utility.cpp +38 -0
- effspm/largehm/src/utility.hpp +29 -0
- effspm/largepp/src/freq_miner.cpp +198 -0
- effspm/largepp/src/freq_miner.hpp +18 -0
- effspm/largepp/src/load_inst.cpp +238 -0
- effspm/largepp/src/load_inst.hpp +34 -0
- effspm/largepp/src/pattern.hpp +31 -0
- effspm/largepp/src/utility.cpp +34 -0
- effspm/largepp/src/utility.hpp +21 -0
- effspm/load_inst.hpp +18 -12
- effspm-0.3.0.dist-info/METADATA +237 -0
- effspm-0.3.0.dist-info/RECORD +54 -0
- {effspm-0.1.5.dist-info → effspm-0.3.0.dist-info}/WHEEL +1 -1
- effspm/_core.cp310-win_amd64.pyd +0 -0
- effspm-0.1.5.dist-info/METADATA +0 -38
- effspm-0.1.5.dist-info/RECORD +0 -14
- {effspm-0.1.5.dist-info → effspm-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {effspm-0.1.5.dist-info → effspm-0.3.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
#include <algorithm>
|
|
2
|
+
#include <cstdlib>
|
|
3
|
+
#include <fstream>
|
|
4
|
+
#include <iostream>
|
|
5
|
+
|
|
6
|
+
#include "freq_miner.hpp"
|
|
7
|
+
#include "pattern.hpp"
|
|
8
|
+
#include "load_inst.hpp"
|
|
9
|
+
#include "utility.hpp"
|
|
10
|
+
|
|
11
|
+
namespace largepp {
|
|
12
|
+
|
|
13
|
+
using std::abs;
|
|
14
|
+
using std::cout;
|
|
15
|
+
using std::endl;
|
|
16
|
+
using std::ofstream;
|
|
17
|
+
using std::swap;
|
|
18
|
+
using std::vector;
|
|
19
|
+
|
|
20
|
+
static void Out_patt(vector<int>& seq, unsigned int freq);
|
|
21
|
+
static void Extend_patt(Pattern& _pattern);
|
|
22
|
+
|
|
23
|
+
unsigned long long int num_patt = 0; // counter for emitted patterns
|
|
24
|
+
static Pattern _patt; // scratch pattern (for in-place extend)
|
|
25
|
+
|
|
26
|
+
/* ------------------------------------------------------------------ */
|
|
27
|
+
/* Driver */
|
|
28
|
+
/* ------------------------------------------------------------------ */
|
|
29
|
+
void Freq_miner()
|
|
30
|
+
{
|
|
31
|
+
// Build the candidate item list once (items that pass minsup at length-1)
|
|
32
|
+
vector<int> islist;
|
|
33
|
+
islist.reserve(L);
|
|
34
|
+
for (unsigned int i = 0; i < L; ++i) {
|
|
35
|
+
if (DFS[i].freq >= theta) islist.push_back(static_cast<int>(i));
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
// Seed each 1-length pattern’s extension lists
|
|
39
|
+
for (unsigned int i = 0; i < DFS.size(); ++i) {
|
|
40
|
+
DFS[i].ilist = islist;
|
|
41
|
+
DFS[i].slist = islist;
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
// DFS over the stack, extending only nodes whose current support ≥ theta
|
|
45
|
+
while (!DFS.empty() && give_time(std::clock() - start_time) < time_limit) {
|
|
46
|
+
if (DFS.back().freq >= theta) {
|
|
47
|
+
Extend_patt(DFS.back());
|
|
48
|
+
} else {
|
|
49
|
+
DFS.pop_back();
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
/* ------------------------------------------------------------------ */
|
|
55
|
+
/* Extend_patt: given a frequent pattern, enumerate its i- and s-ext */
|
|
56
|
+
/* ------------------------------------------------------------------ */
|
|
57
|
+
// Expands one frequent pattern into its frequent children.
//
// _pattern must be the element on top of the DFS stack: it is swapped into
// the file-level scratch pattern _patt and the stack top is popped. Every
// recorded occurrence (seq_ID[i], str_pnt[i]) is then scanned forward in
// `items` to count, per candidate item, how many occurrences support
//   * an i-extension (item appended to the pattern's last itemset), stored
//     in slots [0, L), and
//   * an s-extension (item appended as a new itemset), stored in slots
//     [L, 2L).
// Children whose support reaches theta are pushed onto DFS, reported through
// Out_patt and counted in num_patt. Items are stored 1-based in sequences
// and 0-based in the candidate lists; a negative value in a sequence or in
// a pattern marks the first item of an itemset.
static void Extend_patt(Pattern& _pattern)
{
    swap(_patt, _pattern);   // continue on the scratch copy
    DFS.pop_back();          // the expanded pattern leaves the stack

    // Membership tables for the candidate lists of the current pattern.
    vector<bool> s_allowed(L, false);
    vector<bool> i_allowed(L, false);
    for (int cand : _patt.slist) s_allowed[static_cast<size_t>(cand)] = true;
    for (int cand : _patt.ilist) i_allowed[static_cast<size_t>(cand)] = true;

    // Child buffers: [0..L-1] hold i-extensions, [L..2L-1] hold s-extensions.
    vector<Pattern> children(L * 2);

    // Position of the entry that opens the pattern's last itemset.
    int last_neg = static_cast<int>(_patt.seq.size()) - 1;
    while (last_neg >= 0 && _patt.seq[static_cast<size_t>(last_neg)] > 0) {
        --last_neg;
    }
    // Number of entries in that last itemset.
    const int tail_len = static_cast<int>(_patt.seq.size()) - last_neg;

    const size_t occurrences = _patt.str_pnt.size();
    for (size_t occ = 0; occ < occurrences; ++occ) {
        // Slots already credited for this occurrence.
        vector<bool> found(L * 2, false);

        const unsigned long long seq_id = _patt.seq_ID[occ];
        const vector<int>& row = items[seq_id];
        unsigned int pos = _patt.str_pnt[occ] + 1;

        // Phase 1: rest of the itemset the occurrence ends in (positive
        // entries only) -> direct i-extensions.
        for (; pos < row.size() && row[pos] > 0; ++pos) {
            const size_t slot = static_cast<size_t>(row[pos] - 1);
            if (!i_allowed[slot]) continue;

            Pattern& child = children[slot];
            child.seq_ID.push_back(seq_id);
            child.str_pnt.push_back(pos);
            ++child.freq;
            found[slot] = true;
        }

        // Phase 2: everything after that itemset -> s-extensions, plus
        // i-extensions inside later itemsets that repeat the pattern's
        // whole last itemset.
        int matched = 0;   // entries of the last itemset matched so far
        for (size_t k = pos; k < row.size(); ++k) {
            const int raw = row[k];
            const int item = abs(raw);

            if (raw < 0) matched = 0;   // a new itemset starts here

            // s-extension: first sighting of this item in the occurrence.
            const size_t s_slot = static_cast<size_t>(L + item - 1);
            if (s_allowed[static_cast<size_t>(item - 1)] && !found[s_slot]) {
                Pattern& child = children[s_slot];
                child.seq_ID.push_back(seq_id);
                child.str_pnt.push_back(k);
                ++child.freq;
                found[s_slot] = true;
            }

            if (matched == tail_len) {
                // The last itemset has been matched in full inside the
                // current itemset, so this item is a valid i-extension.
                const size_t i_slot = static_cast<size_t>(item - 1);
                if (i_allowed[i_slot] && !found[i_slot]) {
                    Pattern& child = children[i_slot];
                    child.seq_ID.push_back(seq_id);
                    child.str_pnt.push_back(k);
                    ++child.freq;
                    found[i_slot] = true;
                }
            } else if (last_neg + matched >= 0 &&
                       item == abs(_patt.seq[static_cast<size_t>(last_neg + matched)])) {
                ++matched;
            }
        }
    }

    // Keep only the candidates whose child reached the support threshold.
    vector<int> ilistp;
    vector<int> slistp;
    ilistp.reserve(_patt.ilist.size());
    slistp.reserve(_patt.slist.size());

    for (int cand : _patt.ilist) {
        if (children[static_cast<size_t>(cand)].freq >= theta) {
            ilistp.push_back(cand);
        }
    }
    for (int cand : _patt.slist) {
        if (children[static_cast<size_t>(cand + static_cast<int>(L))].freq >= theta) {
            slistp.push_back(cand);
        }
    }

    // Moves one child buffer onto the stack, completes it and reports it.
    // Reporting is unconditional; Out_patt decides about printing/writing.
    auto push_child = [&](size_t slot, int appended, const vector<int>& child_ilist) {
        DFS.emplace_back();
        swap(DFS.back(), children[slot]);

        Pattern& node = DFS.back();
        node.seq = _patt.seq;
        node.seq.push_back(appended);

        node.slist = slistp;
        node.ilist = child_ilist;

        Out_patt(node.seq, node.freq);
        ++num_patt;
    };

    // i-extensions first ...
    for (int cand : ilistp) {
        push_child(static_cast<size_t>(cand), cand + 1, ilistp);
    }

    // ... then s-extensions; the appended item is negative because it opens
    // a new itemset, and the child's i-list is the surviving s-list.
    for (int cand : slistp) {
        push_child(static_cast<size_t>(cand + static_cast<int>(L)), -(cand + 1), slistp);
    }
}
|
|
175
|
+
|
|
176
|
+
/* ------------------------------------------------------------------ */
|
|
177
|
+
/* Out_patt: append to buffer; optionally print/write */
|
|
178
|
+
/* ------------------------------------------------------------------ */
|
|
179
|
+
// Reports one mined pattern.
//
// The sequence is always appended to largepp::collected. When b_disp is set
// it is also printed to stdout, and when b_write is set it is appended to
// out_file (the file is opened in append mode for each pattern). Both
// outputs use the same layout: the items separated by spaces, then a line
// carrying the support `freq`.
static void Out_patt(vector<int>& seq, unsigned int freq)
{
    largepp::collected.push_back(seq);

    ofstream sink;
    if (b_write) {
        sink.open(out_file, std::ios::app);
    }

    if (b_disp) {
        for (size_t i = 0; i < seq.size(); ++i) {
            cout << seq[i] << " ";
        }
        cout << "\n************** Freq: " << freq << endl;
    }

    if (!b_write) {
        return;
    }

    for (size_t i = 0; i < seq.size(); ++i) {
        sink << seq[i] << " ";
    }
    sink << "\n************** Freq: " << freq << "\n";
    sink.close();
}
|
|
197
|
+
|
|
198
|
+
} // namespace largepp
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
|
|
3
|
+
#include <vector>
|
|
4
|
+
#include <string>
|
|
5
|
+
|
|
6
|
+
#include "pattern.hpp" // defines largepp::Pattern
|
|
7
|
+
#include "load_inst.hpp" // declares externs: items, L, theta, DFS, etc.
|
|
8
|
+
#include "utility.hpp" // flags, collected buffer, timers, helpers
|
|
9
|
+
|
|
10
|
+
namespace largepp {
|
|
11
|
+
|
|
12
|
+
// Public entry point
|
|
13
|
+
void Freq_miner();
|
|
14
|
+
|
|
15
|
+
// (defined in the .cpp)
|
|
16
|
+
extern unsigned long long int num_patt;
|
|
17
|
+
|
|
18
|
+
} // namespace largepp
|
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
#include <iostream>
|
|
2
|
+
#include <sstream>
|
|
3
|
+
#include <algorithm>
|
|
4
|
+
#include <cmath>
|
|
5
|
+
#include <fstream>
|
|
6
|
+
#include "load_inst.hpp"
|
|
7
|
+
#include "freq_miner.hpp"
|
|
8
|
+
#include "utility.hpp"
|
|
9
|
+
|
|
10
|
+
namespace largepp { // ─── BEGIN namespace ─────────────────────
|
|
11
|
+
using namespace std;
|
|
12
|
+
|
|
13
|
+
/* ------------------------------------------------------------------
|
|
14
|
+
* Global definitions (match the externs in load_inst.hpp)
|
|
15
|
+
* ---------------------------------------------------------------- */
|
|
16
|
+
unsigned int M = 0, L = 0;
|
|
17
|
+
unsigned long long N = 0, E = 0;
|
|
18
|
+
double theta = 0.01;
|
|
19
|
+
vector<vector<int>> items;
|
|
20
|
+
vector<Pattern> DFS;
|
|
21
|
+
vector<int> item_dic;
|
|
22
|
+
|
|
23
|
+
/* Forward decls for helper routines in this file */
|
|
24
|
+
static bool Load_items(string& inst);
|
|
25
|
+
static void Load_items_pre(string& inst);
|
|
26
|
+
static bool Preprocess(string& inst, double thresh);
|
|
27
|
+
|
|
28
|
+
/* ==================================================================
|
|
29
|
+
* MAIN ENTRY — load from disk
|
|
30
|
+
* ================================================================= */
|
|
31
|
+
// Loads a sequence database from `items_file` into the shared globals.
//
// With pre_pro set, Preprocess() first scans the file (it sets theta and
// shrinks L to the number of items it keeps), the DFS seeds are rebuilt for
// those L items, and Load_items_pre() reads the file again; N is then taken
// from items.size(). Otherwise Load_items() reads the file directly and
// theta is derived afterwards: a `thresh` below 1.0 is treated as a
// fraction of N (rounded up), anything else is used as given.
//
// Returns false when Preprocess() or Load_items() reports failure, true
// otherwise. Timing and size statistics are printed to stdout.
bool Load_instance(string& items_file, double thresh)
{
    clock_t kk = clock();

    if (pre_pro) {
        if (!Preprocess(items_file, thresh)) return false;

        cout << "\nPreprocess done in " << give_time(clock() - kk) << " seconds\n\n";

        // One seed per kept item; seed i holds the single-item pattern -(i+1).
        DFS.clear();
        DFS.reserve(L);
        for (unsigned int i = 0; i < L; ++i)
            DFS.emplace_back(-static_cast<int>(i) - 1);

        kk = clock();   // time the second pass separately
        Load_items_pre(items_file);
        N = items.size();
    }
    else if (!Load_items(items_file))
        return false;
    else
        theta = (thresh < 1.0) ? ceil(thresh * N) : thresh;

    cout << "\nMDD Database built in " << give_time(clock() - kk) << " seconds\n\n";
    cout << "Found " << N << " sequence, with max line len " << M
         << ", and " << L << " items, and " << E << " enteries\n";

    // The former debug pass over DFS only filled counters that were never
    // printed or used, so it has been dropped.
    return true;
}
|
|
77
|
+
|
|
78
|
+
/* ==================================================================
|
|
79
|
+
* ALT ENTRY — load directly from a Python list of lists
|
|
80
|
+
* ================================================================= */
|
|
81
|
+
// Loads a sequence database handed over from Python (a list of lists of
// ints) into the shared globals.
//
// Sets items, N (number of sequences), M (longest sequence), E (total number
// of entries), L (largest absolute item id) and theta (a `thresh` below 1.0
// is a fraction of N, rounded up; anything else is used as given), rebuilds
// the DFS seeds and, unless just_build is set, records for every item the
// first position at which it appears in each sequence, the same way
// Load_items() does for file input. Without that step every seed would keep
// a support of 0.
//
// Throws pybind11::value_error when a sequence contains the item id 0: ids
// are used as 1-based indices, so 0 cannot be represented. The input is
// validated before any shared state is modified.
//
// NOTE(review): this assumes the Python binding does not fill the seeds
// itself after calling Load_py — confirm against _effspm.cpp.
void Load_py(const pybind11::object& data, double thresh)
{
    vector<vector<int>> parsed = data.cast<vector<vector<int>>>();

    int max_id = 0;
    unsigned int longest = 0;
    unsigned long long entries = 0;
    for (const auto& seq : parsed) {
        longest = max<unsigned int>(longest, static_cast<unsigned int>(seq.size()));
        entries += seq.size();
        for (int x : seq) {
            if (x == 0)
                throw pybind11::value_error("item ids must be non-zero");
            max_id = max(max_id, abs(x));
        }
    }

    items.swap(parsed);
    N = items.size();
    M = longest;
    E = entries;
    L = static_cast<unsigned int>(max_id);
    theta = (thresh < 1.0) ? ceil(thresh * N) : thresh;

    // One seed per item id; seed i holds the single-item pattern -(i+1).
    DFS.clear();
    DFS.reserve(L);
    for (unsigned int i = 0; i < L; ++i)
        DFS.emplace_back(-static_cast<int>(i) - 1);

    if (just_build) return;

    // last_seen[i] is the 1-based index of the last sequence that already
    // contributed to seed i (0 = none yet), so each sequence counts once.
    vector<unsigned long long> last_seen(L, 0);
    for (size_t sid = 0; sid < items.size(); ++sid) {
        const vector<int>& seq = items[sid];
        for (size_t pos = 0; pos < seq.size(); ++pos) {
            const size_t slot = static_cast<size_t>(abs(seq[pos])) - 1;
            if (last_seen[slot] == sid + 1) continue;
            last_seen[slot] = sid + 1;

            Pattern& seed = DFS[slot];
            seed.seq_ID.push_back(sid);
            seed.str_pnt.push_back(static_cast<unsigned int>(pos));
            ++seed.freq;
        }
    }
}
|
|
102
|
+
|
|
103
|
+
/* =================================================================
|
|
104
|
+
* The professor’s original helpers — untouched except minor safety
|
|
105
|
+
* ================================================================= */
|
|
106
|
+
// First pass over the input file: counts, for every item id, the number of
// lines it occurs in, derives theta from `thresh` (a value below 1.0 is a
// fraction of the number of lines read, rounded up) and fills item_dic so
// that item_dic[id - 1] is the new 1-based id of an item whose count
// reaches theta and -1 for every other item.
//
// On success L is set to the number of kept items and N is reset to 0 for
// the second pass. Returns false (after printing a message) when the file
// cannot be read. Reading stops early once time_limit is reached.
static bool Preprocess(string& inst, double thresh)
{
    ifstream file(inst);
    if (!file.good()) {
        cout << "!!!!!! No such file exists: " << inst << " !!!!!!\n";
        return false;
    }

    // freq[i]    : number of lines containing item i+1
    // counted[i] : value of N at the last line that counted item i+1
    vector<unsigned long long> freq(1000000), counted(1000000, 0);

    string line;
    while (getline(file, line) && give_time(clock() - start_time) < time_limit) {
        ++N;
        istringstream word(line);
        string itm;
        while (word >> itm) {
            const int ditem = stoi(itm);
            // Ids index the tables as id - 1, so 0 has no slot; ignore it
            // instead of touching element -1.
            if (ditem == 0) continue;

            const unsigned int id = static_cast<unsigned int>(abs(ditem));
            L = max<unsigned int>(L, id);

            if (freq.size() < L) {
                freq.resize(L, 0);
                counted.resize(L, 0);
            }
            if (counted[id - 1] != N) {
                ++freq[id - 1];
                counted[id - 1] = N;
            }
        }
    }

    theta = (thresh < 1.0) ? ceil(thresh * N) : thresh;

    // L is a shared global; make sure the table covers it even if it was
    // already larger than anything seen in this file.
    if (freq.size() < L) freq.resize(L, 0);

    int real_L = 0;
    item_dic.assign(L, -1);
    for (unsigned int i = 0; i < L; ++i)
        if (freq[i] >= theta) item_dic[i] = ++real_L;

    cout << "Original number of items: " << L
         << " Reduced to: " << real_L << '\n';

    L = real_L;
    N = 0;
    return true;
}
|
|
150
|
+
|
|
151
|
+
// Second pass over the input file, used after Preprocess(): reads every
// line, drops the items that item_dic marks with -1, renumbers the rest
// through item_dic and appends the result to `items`. Unless just_build is
// set, the first position of each item in each sequence is recorded in the
// matching DFS seed (seq_ID, str_pnt, freq).
//
// A dropped item that was negative (first item of an itemset) passes its
// sign on to the next kept item. A line that ends up empty does not count
// towards N, and its slot in `items` is reused by the next line. Tokens
// whose id is 0 or lies outside item_dic are treated like dropped items.
// Returns silently when the file cannot be read; reading stops early once
// time_limit is reached.
static void Load_items_pre(string& inst)
{
    ifstream file(inst);
    if (!file.good()) return;

    string line;
    bool empty_seq = false;   // true while the newest slot in `items` is still empty

    while (getline(file, line) && give_time(clock() - start_time) < time_limit) {
        vector<bool> counted(L, 0);   // items already credited for this line
        istringstream word(line);

        if (!empty_seq) items.emplace_back();
        string itm;
        int size_m = 0;
        bool sgn = false;             // a dropped itemset-opening item is pending
        empty_seq = true;

        while (word >> itm) {
            const int raw = stoi(itm);
            const size_t raw_id = static_cast<size_t>(abs(raw));

            // Bounds are checked before item_dic is indexed, so 0 and ids
            // beyond the dictionary can no longer read outside of it.
            const bool kept = raw_id >= 1 && raw_id <= item_dic.size() &&
                              item_dic[raw_id - 1] != -1;
            if (!kept) {
                if (!sgn) sgn = raw < 0;
                continue;
            }

            int ditem = (raw > 0) ? item_dic[raw_id - 1] : -item_dic[raw_id - 1];
            empty_seq = false;

            // Carry the itemset boundary over from a dropped item.
            if (sgn) {
                if (ditem > 0) ditem = -ditem;
                sgn = false;
            }

            items.back().push_back(ditem);

            const size_t slot = static_cast<size_t>(abs(ditem) - 1);
            if (!counted[slot] && !just_build) {
                DFS[slot].seq_ID.push_back(items.size() - 1);
                DFS[slot].str_pnt.push_back(items.back().size() - 1);
                ++DFS[slot].freq;
                counted[slot] = true;
            }
            ++size_m;
        }
        if (empty_seq) continue;

        ++N;
        E += size_m;
        M = max<unsigned int>(M, static_cast<unsigned int>(size_m));
    }
}
|
|
195
|
+
|
|
196
|
+
// Reads the input file without preprocessing: every line becomes one entry
// of `items`, L grows to the largest absolute item id seen, and DFS gets one
// seed per id. Unless just_build is set, the first position of each item in
// each line is recorded in the matching seed (seq_ID, str_pnt, freq).
// N, E and M are updated with the number of lines, the total number of
// entries and the longest line.
//
// Tokens equal to 0 are skipped: ids are used as 1-based indices, so 0 has
// no slot. Returns false (after printing a message) when the file cannot be
// read; reading stops early once time_limit is reached.
static bool Load_items(string& inst)
{
    ifstream file(inst);
    if (!file.good()) {
        cout << "!!!!!! No such file exists: " << inst << " !!!!!!\n";
        return false;
    }

    string line;
    while (getline(file, line) && give_time(clock() - start_time) < time_limit) {
        ++N;
        vector<bool> counted(L, 0);   // items already credited for this line
        istringstream word(line);

        items.emplace_back();
        string itm;
        int size_m = 0;

        while (word >> itm) {
            const int ditem = stoi(itm);
            if (ditem == 0) continue;   // would index element -1 below

            const unsigned int id = static_cast<unsigned int>(abs(ditem));
            if (L < id) {
                L = id;
                while (DFS.size() < L)
                    DFS.emplace_back(-static_cast<int>(DFS.size()) - 1);
            }
            // Sized from L rather than from the number of seeds just added,
            // so it is large enough even when DFS already had those seeds.
            if (counted.size() < L) counted.resize(L, false);

            items.back().push_back(ditem);

            if (!counted[id - 1] && !just_build) {
                DFS[id - 1].seq_ID.push_back(items.size() - 1);
                DFS[id - 1].str_pnt.push_back(items.back().size() - 1);
                ++DFS[id - 1].freq;
                counted[id - 1] = true;
            }
            ++size_m;
        }
        E += size_m;
        M = max<unsigned int>(M, static_cast<unsigned int>(size_m));
    }
    return true;
}
|
|
237
|
+
|
|
238
|
+
} // namespace largepp // ─── END namespace ──────────────────────
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
|
|
3
|
+
#include <vector>
|
|
4
|
+
#include <string>
|
|
5
|
+
#include <fstream>
|
|
6
|
+
#include <map>
|
|
7
|
+
#include <pybind11/pybind11.h>
|
|
8
|
+
|
|
9
|
+
#include "largepp/src/pattern.hpp" // ← ensure Pattern is a complete type here
|
|
10
|
+
|
|
11
|
+
namespace largepp {
|
|
12
|
+
using namespace std;
|
|
13
|
+
|
|
14
|
+
// public entry points
|
|
15
|
+
bool Load_instance(std::string& items_file, double thresh);
|
|
16
|
+
void Load_py(const pybind11::object& py_data, double thresh);
|
|
17
|
+
|
|
18
|
+
// shared state (defined in load_inst.cpp)
|
|
19
|
+
extern std::vector<std::vector<int>> items;
|
|
20
|
+
extern std::string out_file;
|
|
21
|
+
|
|
22
|
+
extern bool b_disp, b_write, use_dic, just_build, ovr_count, pre_pro;
|
|
23
|
+
extern bool use_list;
|
|
24
|
+
|
|
25
|
+
extern unsigned int M, L, time_limit;
|
|
26
|
+
extern unsigned long long N;
|
|
27
|
+
extern double theta;
|
|
28
|
+
extern unsigned long long E;
|
|
29
|
+
extern std::clock_t start_time;
|
|
30
|
+
|
|
31
|
+
// DFS queue of potential patterns to extend
|
|
32
|
+
extern std::vector<largepp::Pattern> DFS;
|
|
33
|
+
|
|
34
|
+
} // namespace largepp
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
#include <vector>
|
|
3
|
+
|
|
4
|
+
namespace largepp {
|
|
5
|
+
|
|
6
|
+
// One node of the pattern search: the pattern itself plus the bookkeeping
// that the code filling and expanding it stores alongside.
class Pattern {
public:
    std::vector<int> seq;                    // the pattern, one entry per item
    std::vector<unsigned int> str_pnt;       // per occurrence: position inside the sequence
    std::vector<unsigned long long> seq_ID;  // per occurrence: index of the sequence

    std::vector<int> slist;                  // candidate items for s-extension
    std::vector<int> ilist;                  // candidate items for i-extension

    unsigned long long freq = 0;             // support counter

    // Empty pattern with zero support.
    Pattern() = default;

    // Single-item pattern.
    explicit Pattern(int item) : seq(1, item) {}

    // Pattern made of `_seq` followed by `item`. The prefix is only read,
    // so const vectors and temporaries are accepted as well.
    Pattern(const std::vector<int>& _seq, int item) {
        seq.reserve(_seq.size() + 1);
        seq.insert(seq.end(), _seq.begin(), _seq.end());
        seq.push_back(item);
    }
};
|
|
30
|
+
|
|
31
|
+
} // namespace largepp
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
#include "utility.hpp"
|
|
2
|
+
#include <string>
|
|
3
|
+
|
|
4
|
+
namespace largepp {

// ─── definitions of the option flags declared in the header ─────
bool b_disp{false};
bool b_write{false};
bool use_dic{false};
bool just_build{false};
bool ovr_count{false};
bool pre_pro{false};
bool use_list{true};                 // large-prefix flag the binder toggles

// ─── run configuration and shared buffers ───────────────────────
unsigned int time_limit{36000};
std::string out_file{};
std::vector<std::vector<int>> collected{};   // mined pattern output
std::clock_t start_time{0};

// ─── helpers ────────────────────────────────────────────────────

// Empties the buffer of mined patterns.
void ClearCollected()
{
    collected.clear();
}

// Read-only access to the buffer of mined patterns.
const std::vector<std::vector<int>>& GetCollected()
{
    return collected;
}

// Converts a clock-tick count into seconds.
double give_time(std::clock_t ticks)
{
    const double as_ticks = static_cast<double>(ticks);
    return as_ticks / CLOCKS_PER_SEC;
}

}  // namespace largepp
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
#include <vector>
|
|
3
|
+
#include <ctime>
|
|
4
|
+
#include <string>
|
|
5
|
+
|
|
6
|
+
namespace largepp {
|
|
7
|
+
|
|
8
|
+
// Flag & option globals (only declare here – actual values in utility.cpp)
|
|
9
|
+
extern bool b_disp, b_write, use_dic, just_build, ovr_count, pre_pro;
|
|
10
|
+
extern bool use_list; // ← NEW (large-prefix needs this)
|
|
11
|
+
extern unsigned int time_limit;
|
|
12
|
+
|
|
13
|
+
// Pattern buffer that _effspm.cpp_ returns to Python
|
|
14
|
+
extern std::vector<std::vector<int>> collected;
|
|
15
|
+
|
|
16
|
+
// Helper functions every source file uses
|
|
17
|
+
void ClearCollected(); // wipe buffer
|
|
18
|
+
const std::vector<std::vector<int>>& GetCollected(); // read buffer
|
|
19
|
+
double give_time(std::clock_t ticks); // secs from clocks
|
|
20
|
+
|
|
21
|
+
} // namespace largepp
|
effspm/load_inst.hpp
CHANGED
|
@@ -1,25 +1,31 @@
|
|
|
1
|
+
// effspm/load_inst.hpp
|
|
1
2
|
#pragma once
|
|
2
3
|
|
|
3
|
-
#include<vector>
|
|
4
|
-
#include<string>
|
|
4
|
+
#include <vector>
|
|
5
|
+
#include <string>
|
|
5
6
|
#include <fstream>
|
|
6
7
|
#include <map>
|
|
7
|
-
//
|
|
8
|
-
|
|
8
|
+
#include <ctime> // for clock_t
|
|
9
9
|
|
|
10
10
|
using namespace std;
|
|
11
11
|
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
12
|
+
// ------------------------------------------------------------
|
|
13
|
+
// forward declare Pattern (defined in freq_miner.hpp)
|
|
14
|
+
struct Pattern;
|
|
15
15
|
|
|
16
|
-
extern string out_file;
|
|
17
16
|
|
|
18
|
-
|
|
17
|
+
// Main entrypoint: load your file on disk into 'items', build DFS, theta, etc.
|
|
18
|
+
bool Load_instance(string &items_file, double thresh);
|
|
19
19
|
|
|
20
|
-
|
|
20
|
+
// storage & globals shared between the C++-CLI & Python bindings
|
|
21
|
+
extern vector<vector<int>> items;
|
|
22
|
+
extern vector<Pattern> DFS; // now Pattern is known
|
|
23
|
+
extern vector<int> item_dic;
|
|
21
24
|
|
|
22
|
-
extern
|
|
25
|
+
extern string out_file;
|
|
26
|
+
extern bool b_disp, b_write, use_dic, use_list, pre_pro;
|
|
23
27
|
|
|
24
|
-
extern
|
|
28
|
+
extern unsigned int M, L, time_limit;
|
|
29
|
+
extern unsigned long long N, E, theta; // E = total number of entries
|
|
25
30
|
|
|
31
|
+
extern clock_t start_time;
|