effspm 0.2.8__cp311-cp311-win_amd64.whl → 0.3.0__cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,12 +1,11 @@
1
-
1
+ // effspm/btminer/src/load_inst.cpp
2
2
  #include <iostream>
3
- #include <sstream>
4
3
  #include <fstream>
5
- #include <cmath>
6
- #include <ctime>
7
- #include <map>
8
- #include <vector>
4
+ #include <sstream>
9
5
  #include <algorithm>
6
+ #include <math.h>
7
+ #include <time.h>
8
+
10
9
  #include "load_inst.hpp"
11
10
  #include "utility.hpp"
12
11
  #include "build_mdd.hpp"
@@ -16,23 +15,53 @@ namespace btminer {
16
15
 
17
16
  using namespace std;
18
17
 
19
- extern int num_nodes, cur_node;
20
-
21
-
22
- map<string, int> item_map;
23
- map<int, string> item_map_rev;
24
- vector<int> freq;
25
- vector<int> item_dic;
26
-
27
- void Load_items_pre(string& inst_name);
28
- bool Load_items(string& inst_name);
29
- bool Preprocess(string& inst, double thresh);
30
-
31
-
32
-
33
-
34
- bool Load_instance(string& items_file, double thresh) {
18
+ // ---------------------------------------------------------------------
19
+ // global definitions (must match load_inst.hpp)
20
+ // ---------------------------------------------------------------------
21
+ int M = 0;
22
+ int N = 0;
23
+ int L = 0;
24
+ unsigned long long E = 0ULL; // matches header: extern unsigned long long E;
25
+ int num_nodes = 0;
26
+ int theta = 0;
27
+ int cur_node = 0;
28
+
29
+ map<string, int> item_map;
30
+ map<int, string> item_map_rev;
31
+
32
+ std::vector<int> freq;
33
+ std::vector<int> item_dic;
34
+
35
+ // ✅ REAL DEFINITION lives here:
36
+ std::vector<Pattern> DFS;
37
+
38
+ string out_file, folder;
39
+ bool b_disp = 0;
40
+ bool b_write = 0;
41
+ bool use_dic = 0;
42
+ bool just_build= 0;
43
+ bool pre_pro = 1;
44
+
45
+ int N_mult = 1;
46
+ int M_mult = 1;
47
+ int time_limit= 30 * 3600; // 30 hours, same as professor
48
+
49
+ clock_t start_time;
50
+
51
+ // ---------------------------------------------------------------------
52
+ // forward decls
53
+ // ---------------------------------------------------------------------
54
+ void Load_items_pre(string &inst_name);
55
+ bool Load_items(string &inst_name);
56
+ bool Preprocess(string &inst, double thresh);
57
+
58
+ // ---------------------------------------------------------------------
59
+ // main loader
60
+ // ---------------------------------------------------------------------
61
+ bool Load_instance(string &items_file, double thresh) {
35
62
  clock_t kk = clock();
63
+
64
+ // root node for MDD
36
65
  Tree.emplace_back(0, 0, 0);
37
66
 
38
67
  if (pre_pro) {
@@ -41,55 +70,77 @@ bool Load_instance(string& items_file, double thresh) {
41
70
 
42
71
  cout << "\nPreprocess done in " << give_time(clock() - kk) << " seconds\n\n";
43
72
 
73
+ // build empty DFS of size L
74
+ DFS.clear();
44
75
  DFS.reserve(L);
45
76
  for (int i = 0; i < L; ++i)
46
77
  DFS.emplace_back(-i - 1);
47
78
 
48
79
  kk = clock();
49
80
  Load_items_pre(items_file);
50
- } else if (!Load_items(items_file))
81
+ }
82
+ else if (!Load_items(items_file)) {
51
83
  return false;
84
+ }
52
85
  else {
53
- theta = (thresh < 1) ? ceil(thresh * N * N_mult) : thresh;
86
+ if (thresh < 1)
87
+ theta = static_cast<int>(ceil(thresh * N * N_mult));
88
+ else
89
+ theta = static_cast<int>(thresh);
54
90
  }
55
91
 
56
92
  cout << "\nMDD Database built in " << give_time(clock() - kk) << " seconds\n\n";
57
- cout << "Found " << N * N_mult << " sequence, with max line len " << M << ", and " << L << " items, and " << E << " enteries\n";
93
+ cout << "Found " << N * N_mult
94
+ << " sequence, with max line len " << M
95
+ << ", and " << L << " items, and " << E << " enteries\n";
58
96
  cout << "Total MDD nodes: " << Tree.size() << endl;
59
97
 
60
98
  return true;
61
99
  }
62
100
 
63
- bool Preprocess(string& inst, double thresh) {
101
+ // ---------------------------------------------------------------------
102
+ // preprocessing pass
103
+ // ---------------------------------------------------------------------
104
+ bool Preprocess(string &inst, double thresh) {
64
105
  ifstream file(inst);
65
- if (!file.good()) {
66
- cout << "!!!!!! No such file exists: " << inst << " !!!!!!\n";
67
- return false;
68
- }
69
106
 
70
- string line;
71
- int size_m, ditem;
72
- while (getline(file, line) && give_time(clock() - start_time) < time_limit) {
73
- ++N;
74
- vector<bool> counted(L, 0);
75
- istringstream word(line);
76
- string itm;
77
- while (word >> itm) {
78
- ditem = stoi(itm);
79
- if (L < abs(ditem)) L = abs(ditem);
80
- while (freq.size() < L) {
81
- freq.push_back(0);
82
- counted.push_back(0);
83
- }
84
- if (!counted[abs(ditem) - 1]) {
85
- ++freq[abs(ditem) - 1];
86
- counted[abs(ditem) - 1] = 1;
107
+ if (file.good()) {
108
+ string line;
109
+ while (getline(file, line) && give_time(clock() - start_time) < time_limit) {
110
+ ++N;
111
+ vector<bool> counted(L, false);
112
+
113
+ istringstream word(line);
114
+ string itm;
115
+ while (word >> itm) {
116
+ int ditem = stoi(itm);
117
+ if (L < abs(ditem))
118
+ L = abs(ditem);
119
+
120
+ // extend freq / counted if L grew
121
+ while (static_cast<int>(freq.size()) < L) {
122
+ freq.push_back(0);
123
+ counted.push_back(false);
124
+ }
125
+
126
+ int idx = abs(ditem) - 1;
127
+ if (!counted[idx]) {
128
+ ++freq[idx];
129
+ counted[idx] = true;
130
+ }
87
131
  }
88
132
  }
133
+ } else {
134
+ cout << "!!!!!! No such file exists: " << inst << " !!!!!!\n";
135
+ return false;
89
136
  }
90
137
 
91
- theta = (thresh < 1) ? ceil(thresh * N * N_mult) : thresh;
138
+ if (thresh < 1)
139
+ theta = static_cast<int>(ceil(thresh * N * N_mult));
140
+ else
141
+ theta = static_cast<int>(thresh);
92
142
 
143
+ // build item_dic with only frequent items
93
144
  int real_L = 0;
94
145
  item_dic = vector<int>(L, -1);
95
146
  for (int i = 0; i < L; ++i) {
@@ -97,103 +148,127 @@ bool Preprocess(string& inst, double thresh) {
97
148
  item_dic[i] = ++real_L;
98
149
  }
99
150
 
100
- cout << "Original number of items: " << L << " Reduced to: " << real_L << endl;
151
+ cout << "Original number of items: " << L
152
+ << " Reduced to: " << real_L << endl;
153
+
101
154
  L = real_L;
102
- N = 0;
155
+ N = 0; // will be recounted in Load_items_pre
156
+
103
157
  return true;
104
158
  }
105
159
 
106
- void Load_items_pre(string& inst_name) {
160
+ // ---------------------------------------------------------------------
161
+ // load after preprocessing
162
+ // ---------------------------------------------------------------------
163
+ void Load_items_pre(string &inst_name) {
107
164
  ifstream file(inst_name);
108
- if (!file.good()) return;
109
-
110
- string line;
111
- int ditem;
112
- while (getline(file, line) && give_time(clock() - start_time) < time_limit) {
113
- istringstream word(line);
114
- string itm;
115
- vector<int> temp_vec;
116
- bool sgn = 0;
117
- while (word >> itm) {
118
- if (use_dic) {
119
- auto it = item_map.find(itm);
120
- if (it == item_map.end()) {
121
- item_map[itm] = ++L;
122
- item_map_rev[L] = itm;
123
- ditem = L;
165
+
166
+ if (file.good()) {
167
+ string line;
168
+ while (getline(file, line) && give_time(clock() - start_time) < time_limit) {
169
+ istringstream word(line);
170
+ string itm;
171
+ vector<int> temp_vec;
172
+ bool sgn = false;
173
+ while (word >> itm) {
174
+ int ditem;
175
+ if (use_dic) {
176
+ auto it = item_map.find(itm);
177
+ if (it == item_map.end()) {
178
+ item_map[itm] = ++L;
179
+ item_map_rev[L] = itm;
180
+ ditem = L;
181
+ } else {
182
+ ditem = it->second;
183
+ }
124
184
  } else {
125
- ditem = it->second;
185
+ ditem = stoi(itm);
126
186
  }
127
- } else {
128
- ditem = stoi(itm);
129
- }
130
187
 
131
- if (pre_pro && freq.size() > abs(ditem) - 1 && freq[abs(ditem) - 1] < theta) {
132
- if (!sgn)
133
- sgn = ditem < 0;
134
- continue;
135
- } else if (pre_pro) {
136
- ditem = (ditem > 0) ? item_dic[ditem - 1] : -item_dic[-ditem - 1];
137
- }
188
+ // drop infrequent items
189
+ if (freq[abs(ditem) - 1] < theta) {
190
+ if (!sgn)
191
+ sgn = (ditem < 0);
192
+ continue;
193
+ } else {
194
+ if (ditem > 0)
195
+ ditem = item_dic[ditem - 1];
196
+ else
197
+ ditem = -item_dic[-ditem - 1];
198
+ }
138
199
 
139
- if (sgn && ditem > 0)
140
- ditem = -ditem;
141
- sgn = 0;
200
+ if (sgn) {
201
+ if (ditem > 0)
202
+ ditem = -ditem;
203
+ sgn = false;
204
+ }
142
205
 
143
- temp_vec.push_back(ditem);
144
- }
206
+ temp_vec.push_back(ditem);
207
+ }
145
208
 
146
- if (temp_vec.empty()) continue;
209
+ if (temp_vec.empty())
210
+ continue;
211
+
212
+ ++N;
147
213
 
148
- ++N;
149
- if (temp_vec.size() > M) M = temp_vec.size();
214
+ if (static_cast<int>(temp_vec.size()) > M)
215
+ M = static_cast<int>(temp_vec.size());
150
216
 
151
- E += temp_vec.size(); // <-- make sure E gets incremented
152
- Build_MDD(temp_vec);
217
+ // this increments E inside Build_MDD
218
+ Build_MDD(temp_vec);
219
+ }
153
220
  }
154
221
  }
155
222
 
156
- bool Load_items(string& inst_name) {
223
+ // ---------------------------------------------------------------------
224
+ // load without preprocessing
225
+ // ---------------------------------------------------------------------
226
+ bool Load_items(string &inst_name) {
157
227
  ifstream file(inst_name);
158
- if (!file.good()) {
159
- cout << "!!!!!! No such file exists: " << inst_name << " !!!!!!\n";
160
- return false;
161
- }
162
228
 
163
- string line;
164
- int ditem;
165
- while (getline(file, line) && give_time(clock() - start_time) < time_limit) {
166
- ++N;
167
- istringstream word(line);
168
- string itm;
169
- vector<int> temp_vec;
170
- while (word >> itm) {
171
- if (use_dic) {
172
- auto it = item_map.find(itm);
173
- if (it == item_map.end()) {
174
- item_map[itm] = ++L;
175
- item_map_rev[L] = itm;
176
- ditem = L;
229
+ if (file.good()) {
230
+ string line;
231
+ while (getline(file, line) && give_time(clock() - start_time) < time_limit) {
232
+ ++N;
233
+ istringstream word(line);
234
+ string itm;
235
+ vector<int> temp_vec;
236
+ while (word >> itm) {
237
+ int ditem;
238
+ if (use_dic) {
239
+ auto it = item_map.find(itm);
240
+ if (it == item_map.end()) {
241
+ item_map[itm] = ++L;
242
+ item_map_rev[L] = itm;
243
+ ditem = L;
244
+ } else {
245
+ ditem = it->second;
246
+ }
177
247
  } else {
178
- ditem = it->second;
179
- }
180
- } else {
181
- ditem = stoi(itm);
182
- if (L < abs(ditem)) {
183
- L = abs(ditem);
184
- while (DFS.size() < L && !just_build) {
185
- DFS.reserve(L);
186
- DFS.emplace_back(-DFS.size() - 1);
248
+ ditem = stoi(itm);
249
+ if (L < abs(ditem)) {
250
+ L = abs(ditem);
251
+ // make sure DFS is large enough (unless just_build)
252
+ while (static_cast<int>(DFS.size()) < L && !just_build) {
253
+ DFS.reserve(L);
254
+ DFS.emplace_back(-((int)DFS.size()) - 1);
255
+ }
187
256
  }
188
257
  }
258
+
259
+ temp_vec.push_back(ditem);
189
260
  }
190
- temp_vec.push_back(ditem);
191
- }
192
261
 
193
- if (temp_vec.size() > M) M = temp_vec.size();
194
- E += temp_vec.size(); // <-- make sure E gets incremented
195
- Build_MDD(temp_vec);
262
+ if (static_cast<int>(temp_vec.size()) > M)
263
+ M = static_cast<int>(temp_vec.size());
264
+
265
+ Build_MDD(temp_vec);
266
+ }
267
+ } else {
268
+ cout << "!!!!!! No such file exists: " << inst_name << " !!!!!!\n";
269
+ return false;
196
270
  }
271
+
197
272
  return true;
198
273
  }
199
274
 
@@ -10,16 +10,34 @@
10
10
 
11
11
  namespace btminer {
12
12
 
13
- bool Load_instance(std::string& items_file, double thresh);
13
+ using std::string;
14
+ using std::vector;
15
+ using std::map;
16
+ using std::unordered_map;
17
+ using std::unordered_set;
14
18
 
15
- extern std::string out_file, folder;
19
+ bool Load_instance(string& items_file, double thresh);
20
+
21
+ extern string out_file, folder;
16
22
 
17
23
  extern bool b_disp, b_write, use_dic, just_build, pre_pro;
18
24
 
19
- extern int N, M, L, theta, num_nodes, M_mult, N_mult, time_limit, cur_node;
25
+ extern int N, M, L, theta, num_nodes, M_mult, N_mult, time_limit, cur_node;
26
+ extern unsigned long long E; // total number of entries (we need this for _effspm.cpp)
27
+
28
+ extern std::clock_t start_time;
29
+
30
+ // these 2 are for dictionary mode
31
+ extern map<string,int> item_map;
32
+ extern map<int,string> item_map_rev;
20
33
 
21
- extern clock_t start_time;
34
+ extern vector<int> freq;
35
+ extern vector<int> item_dic;
22
36
 
37
+ // expose items so _effspm.cpp can fall back to seeding (it expects btminer::items)
38
+ extern vector<vector<int>> items;
23
39
 
40
+ class Pattern;
41
+ extern vector<Pattern> DFS;
24
42
 
25
43
  } // namespace btminer
@@ -5,61 +5,46 @@
5
5
 
6
6
  namespace btminer {
7
7
 
8
- // ─── Global definitions ──────────────────────────────────────────
9
- bool use_dic = false;
10
- std::vector<std::vector<int>> items;
11
- bool use_list = false;
12
- bool just_build = false;
13
- int E = 0, M = 0, N = 0, L = 0, theta = 0;
14
- std::vector<Pattern> DFS;
15
- clock_t start_time = 0;
16
- bool b_disp = false, b_write = false;
17
- std::string out_file;
18
-
19
- bool pre_pro = true;
20
- int N_mult = 1, M_mult = 1;
21
- int time_limit = 30 * 3600;
22
-
23
- // buffer of mined patterns returned to Python
24
- std::vector<std::vector<int>> collected;
25
-
26
- void ClearCollected() { collected.clear(); }
27
- const std::vector<std::vector<int>>& GetCollected() { return collected; }
28
-
29
- // ─── Utility functions ───────────────────────────────────────────
30
- int find_ID(std::vector<int>& vec, int itm)
31
- {
8
+ int find_ID(vector<int>& vec, int itm) {
32
9
  int plc = 0;
33
- while (plc < static_cast<int>(vec.size()) && vec[plc] != itm) ++plc;
34
- return (plc == static_cast<int>(vec.size())) ? -1 : plc;
10
+ while (plc < static_cast<int>(vec.size()) && vec[plc] != itm)
11
+ ++plc;
12
+
13
+ if (plc == static_cast<int>(vec.size()))
14
+ return -1;
15
+ else
16
+ return plc;
35
17
  }
36
18
 
37
- bool check_parent(int cur_arc, int str_pnt, int start,
38
- std::vector<int>& strpnt_vec)
39
- {
19
+ bool check_parent(int cur_arc, int str_pnt, int start, std::vector<int>& strpnt_vec) {
20
+
40
21
  std::vector<int> ancestors;
22
+
41
23
  int cur_anct = Tree[cur_arc].anct;
42
24
 
43
25
  while (Tree[cur_anct].itmset > Tree[str_pnt].itmset) {
44
- if (Tree[cur_anct].item > 0) ancestors.push_back(cur_anct);
26
+ if (Tree[cur_anct].item > 0)
27
+ ancestors.push_back(cur_anct);
45
28
  cur_anct = Tree[cur_anct].anct;
46
29
  }
47
- if (Tree[cur_anct].itmset == Tree[str_pnt].itmset) return true;
48
30
 
49
- for (auto it = ancestors.rbegin(); it != ancestors.rend(); ++it)
50
- for (int i = start; i < static_cast<int>(strpnt_vec.size()); ++i)
51
- if (strpnt_vec[i] == *it) return true;
31
+ if (Tree[cur_anct].itmset == Tree[str_pnt].itmset)
32
+ return true;
33
+ else {
34
+ for (auto it = ancestors.rbegin(); it != ancestors.rend(); ++it) {
35
+ for (int i = start; i < static_cast<int>(strpnt_vec.size()); ++i) {
36
+ if (strpnt_vec[i] == *it)
37
+ return true;
38
+ }
39
+ }
40
+ }
52
41
 
53
42
  return false;
54
43
  }
55
44
 
56
- bool find_pnt(Arc* pnt, std::vector<Arc*>& vec, int pos)
57
- {
58
- for (size_t i = pos; i < vec.size(); ++i)
59
- if (vec[i] == pnt) return true;
60
- return false;
45
+ float give_time(clock_t kk) {
46
+ float ll = static_cast<float>(kk) / CLOCKS_PER_SEC;
47
+ return ll;
61
48
  }
62
49
 
63
- double give_time(clock_t kk) { return double(kk) / CLOCKS_PER_SEC; }
64
-
65
50
  } // namespace btminer
@@ -1,40 +1,16 @@
1
1
  #pragma once
2
2
 
3
3
  #include <vector>
4
- #include <ctime>
4
+ #include <time.h>
5
5
  #include <string>
6
6
  #include "build_mdd.hpp"
7
- #include "freq_miner.hpp"
8
- #include "load_inst.hpp"
9
7
 
10
8
  namespace btminer {
11
9
 
12
- // === Utility function declarations ===
13
- bool find_pnt(Arc* pnt, std::vector<Arc*>& vec, int pos);
14
- int find_ID(std::vector<int>& vec, int itm);
15
- double give_time(clock_t kk);
16
- bool check_parent(int cur_arc, int str_pnt, int start, std::vector<int>& strpnt_vec);
10
+ using std::vector;
17
11
 
18
- // === Global variables (DECLARATIONS ONLY) ===
12
+ int find_ID(vector<int>& vec, int itm);
13
+ float give_time(clock_t kk);
14
+ bool check_parent(int cur_arc, int str_pnt, int start, vector<int>& strpnt_vec);
19
15
 
20
- extern bool use_list;
21
- extern bool just_build;
22
- extern int E, M, N, L, theta;
23
- extern std::vector<Pattern> DFS;
24
- extern clock_t start_time;
25
- extern bool b_disp, b_write;
26
- extern std::string out_file;
27
- extern bool pre_pro;
28
- extern int N_mult, M_mult;
29
- extern int time_limit;
30
- extern std::vector<std::vector<int>> items;
31
-
32
- extern std::vector<std::vector<int>> collected;
33
-
34
- void ClearCollected();
35
- const std::vector<std::vector<int>>& GetCollected();
36
-
37
- }
38
-
39
-
40
- // namespace btminer
16
+ } // namespace btminer
effspm/freq_miner.hpp CHANGED
@@ -24,7 +24,8 @@ public:
24
24
 
25
25
  Pattern(vector<int>& _seq, int item) {
26
26
  seq.reserve(_seq.size());
27
- for (std::size_t i = 0; i < _seq.size(); ++i)
27
+ for (int i = 0; i < _seq.size(); ++i)
28
+
28
29
 
29
30
  seq.push_back(_seq[i]);
30
31
  seq.push_back(item);