effspm 0.1.12__cp313-cp313-macosx_10_13_universal2.whl → 0.2.1__cp313-cp313-macosx_10_13_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of effspm might be problematic. Click here for more details.

Files changed (48) hide show
  1. effspm/__init__.py +3 -3
  2. effspm/_effspm.cpp +437 -13
  3. effspm/_effspm.cpython-313-darwin.so +0 -0
  4. effspm/btminer/src/freq_miner.cpp +3 -0
  5. effspm/btminer/src/load_inst.cpp +4 -0
  6. effspm/btminer/src/load_inst.hpp +2 -0
  7. effspm/btminer/src/utility.cpp +31 -33
  8. effspm/btminer/src/utility.hpp +9 -13
  9. effspm/htminer/src/build_mdd.cpp +192 -0
  10. effspm/htminer/src/build_mdd.hpp +64 -0
  11. effspm/htminer/src/freq_miner.cpp +350 -0
  12. effspm/htminer/src/freq_miner.hpp +60 -0
  13. effspm/htminer/src/load_inst.cpp +381 -0
  14. effspm/htminer/src/load_inst.hpp +23 -0
  15. effspm/htminer/src/main.cpp +96 -0
  16. effspm/htminer/src/utility.cpp +72 -0
  17. effspm/htminer/src/utility.hpp +77 -0
  18. effspm/largebm/src/build_mdd.cpp +137 -0
  19. effspm/largebm/src/build_mdd.hpp +47 -0
  20. effspm/largebm/src/freq_miner.cpp +342 -0
  21. effspm/largebm/src/freq_miner.hpp +48 -0
  22. effspm/largebm/src/load_inst.cpp +235 -0
  23. effspm/largebm/src/load_inst.hpp +45 -0
  24. effspm/largebm/src/main.cpp +95 -0
  25. effspm/largebm/src/utility.cpp +45 -0
  26. effspm/largebm/src/utility.hpp +18 -0
  27. effspm/largehm/src/build_mdd.cpp +173 -0
  28. effspm/largehm/src/build_mdd.hpp +93 -0
  29. effspm/largehm/src/freq_miner.cpp +441 -0
  30. effspm/largehm/src/freq_miner.hpp +77 -0
  31. effspm/largehm/src/load_inst.cpp +357 -0
  32. effspm/largehm/src/load_inst.hpp +64 -0
  33. effspm/largehm/src/main.cpp +95 -0
  34. effspm/largehm/src/utility.cpp +38 -0
  35. effspm/largehm/src/utility.hpp +29 -0
  36. effspm/largepp/src/freq_miner.cpp +170 -0
  37. effspm/largepp/src/freq_miner.hpp +43 -0
  38. effspm/largepp/src/load_inst.cpp +219 -0
  39. effspm/largepp/src/load_inst.hpp +28 -0
  40. effspm/largepp/src/main.cpp +108 -0
  41. effspm/largepp/src/utility.cpp +33 -0
  42. effspm/largepp/src/utility.hpp +20 -0
  43. {effspm-0.1.12.dist-info → effspm-0.2.1.dist-info}/METADATA +1 -1
  44. effspm-0.2.1.dist-info/RECORD +59 -0
  45. {effspm-0.1.12.dist-info → effspm-0.2.1.dist-info}/WHEEL +1 -1
  46. effspm-0.1.12.dist-info/RECORD +0 -25
  47. {effspm-0.1.12.dist-info → effspm-0.2.1.dist-info}/licenses/LICENSE +0 -0
  48. {effspm-0.1.12.dist-info → effspm-0.2.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,357 @@
1
+ #include <iostream>
2
+ #include <sstream>
3
+ #include <algorithm>
4
+ #include <fstream>
5
+ #include <cmath>
6
+ #include <ctime>
7
+
8
+ #include "load_inst.hpp"
9
+ #include "utility.hpp"
10
+ #include "build_mdd.hpp"
11
+ #include "freq_miner.hpp"
12
+
13
+ namespace largehm {
14
+ using namespace std;
15
+
16
+ string out_file;
17
+ string folder;
18
+
19
+ bool b_disp = false;
20
+ bool b_write = false;
21
+ bool use_dic = false;
22
+ bool use_list = false;
23
+ bool just_build = false;
24
+ bool pre_pro = false;
25
+ bool itmset_exists = false;
26
+
27
+ unsigned int M = 0;
28
+ unsigned int L = 0;
29
+ unsigned int mlim = 0;
30
+ unsigned int time_limit = 0;
31
+
32
+ unsigned long long int N = 0;
33
+ unsigned long long int theta = 0;
34
+ unsigned long long int E = 0;
35
+
36
+ clock_t start_time = 0;
37
+
38
+ vector<vector<int>> items;
39
+
40
+ vector<int> item_dic;
41
+ vector<Pattern> DFS;
42
+ vector<VPattern> VDFS;
43
+
44
+
45
+ bool Load_instance(string& items_file, double thresh) {
46
+ // ─── 1) CLEAR ANY leftover state from a previous run ───
47
+ Tree.clear();
48
+ VTree.clear();
49
+ CTree.clear();
50
+ DFS.clear();
51
+ VDFS.clear();
52
+ item_dic.clear();
53
+ items.clear();
54
+
55
+ N = 0;
56
+ M = 0;
57
+ L = 0;
58
+ E = 0;
59
+ theta = 0;
60
+ itmset_exists = false;
61
+ // ────────────────────────────────────────────────────
62
+
63
+ clock_t kk = clock();
64
+
65
+ // Insert fresh dummy root node:
66
+
67
+
68
+
69
+
70
+ Tree.emplace_back(0, 0, 0);
71
+
72
+ if (!pre_pro) {
73
+ if (!Load_items(items_file))
74
+ return false;
75
+ DFS.reserve(L);
76
+ while (DFS.size() < L) {
77
+ DFS.emplace_back(-static_cast<int>(DFS.size()) - 1);
78
+ }
79
+ VDFS.reserve(L);
80
+ while (VDFS.size() < L) {
81
+ VDFS.emplace_back(static_cast<int>(VDFS.size()));
82
+ }
83
+ if (thresh < 1.0) {
84
+ theta = static_cast<unsigned long long>(ceil(thresh * N));
85
+ } else {
86
+ theta = static_cast<unsigned long long>(thresh);
87
+ }
88
+ }
89
+ else {
90
+ if (!Load_items(items_file))
91
+ return false;
92
+ if (thresh < 1.0) {
93
+ theta = static_cast<unsigned long long>(ceil(thresh * N));
94
+ } else {
95
+ theta = static_cast<unsigned long long>(thresh);
96
+ }
97
+ }
98
+
99
+ cout << "\nMDD Database built in " << give_time(clock() - kk) << " seconds\n\n";
100
+ cout << "Found " << N << " sequence, with max line len " << M
101
+ << ", and " << L << " items, and " << E << " enteries\n";
102
+ // cout << "Total Trie nodes: " << Tree.size()
103
+ // << " Total CTree nodes: " << CTree.size()
104
+ // << " Total VTree nodes: " << VTree.size() << endl;
105
+
106
+ return true;
107
+ }
108
+
109
+
110
+ bool Preprocess(string &inst, double thresh) {
111
+ vector<unsigned long long int> MN(100, 0);
112
+ vector<vector<bool>> ML(100, vector<bool>(1000000, false));
113
+
114
+ ifstream file(inst);
115
+ if (!file.good()) {
116
+ cout << "!!!!!! No such file exists: " << inst << " !!!!!!\n";
117
+ return false;
118
+ }
119
+
120
+ vector<unsigned long long int> freq(1000000, 0ULL);
121
+ vector<unsigned long long int> counted(1000000, 0ULL);
122
+
123
+ string line;
124
+ int ditem;
125
+ while (getline(file, line) && give_time(clock() - start_time) < time_limit) {
126
+ ++N;
127
+ if (N % 10000000 == 0)
128
+ cout << "N: " << N << endl;
129
+
130
+ istringstream word(line);
131
+ string itm;
132
+ int size_m = 0;
133
+ while (word >> itm) {
134
+ ++size_m;
135
+ ditem = stoi(itm);
136
+
137
+ if (ditem > 0)
138
+ itmset_exists = true;
139
+ else
140
+ ditem = -ditem;
141
+
142
+ if (size_m < (int)MN.size()) {
143
+ ++MN[size_m - 1];
144
+ if ((int)ML[size_m - 1].size() < ditem) {
145
+ ML[size_m - 1].resize(ditem, false);
146
+ }
147
+ ML[size_m - 1][ditem - 1] = true;
148
+ }
149
+
150
+ if (L < static_cast<unsigned int>(ditem)) {
151
+ L = static_cast<unsigned int>(ditem);
152
+ }
153
+
154
+ if ((int)freq.size() < ditem) {
155
+ freq.resize(ditem, 0ULL);
156
+ counted.resize(ditem, 0ULL);
157
+ }
158
+ if (counted[ditem - 1] != N) {
159
+ ++freq[ditem - 1];
160
+ counted[ditem - 1] = N;
161
+ }
162
+ }
163
+ if (size_m > (int)M)
164
+ M = size_m;
165
+ }
166
+
167
+ if (thresh < 1.0) {
168
+ theta = static_cast<unsigned long long>(ceil(thresh * N));
169
+ } else {
170
+ theta = static_cast<unsigned long long>(thresh);
171
+ }
172
+
173
+ int real_L = 0;
174
+ item_dic.assign(L, -1);
175
+ vector<bool> item_in(L, false);
176
+ for (int i = 0; i < (int)L; ++i) {
177
+ if (freq[i] >= theta) {
178
+ item_dic[i] = ++real_L;
179
+ item_in[i] = true;
180
+ }
181
+ }
182
+
183
+ cout << "Original number of items: " << L << " Reduced to: " << real_L << endl;
184
+
185
+ unsigned long long int LpM = 1;
186
+ mlim = M;
187
+ int orgmlim = 0;
188
+ int ulim = min(1 + real_L / 4, 10);
189
+ unsigned long long int ml;
190
+
191
+ for (int i = 0; i + ulim < (int)MN.size() && i + ulim < (int)M; ++i) {
192
+ ml = 0;
193
+ for (int j = 0; j < (int)L; ++j) {
194
+ if (ML[i][j] && item_in[j])
195
+ ++ml;
196
+ }
197
+ LpM *= ml * (1 + itmset_exists);
198
+ cout << ml << " " << LpM << " " << MN[i] << endl;
199
+ if (LpM * ulim > MN[i]) {
200
+ orgmlim = i;
201
+ while (i + ulim - 1 < (int)MN.size() && i + ulim - 1 < (int)M) {
202
+ cout << (MN[i - 1] - MN[i + ulim - 1]) << " "
203
+ << MN[i + ulim - 1] << endl;
204
+ if ((MN[i - 1] - MN[i + ulim - 1]) < MN[i + ulim - 1]
205
+ && MN[i + ulim - 1] < 600000000) {
206
+ mlim = i - 1;
207
+ break;
208
+ }
209
+ ++i;
210
+ }
211
+ break;
212
+ }
213
+ }
214
+
215
+ cout << "M is: " << M << " Mlim is: " << mlim
216
+ << " ulim is: " << ulim
217
+ << " original mlim is: " << orgmlim
218
+ << " guess is: "
219
+ << round((log(N) - log(6)) / log(real_L)) << endl;
220
+
221
+ if (mlim < (int)M) {
222
+ for (int i = 0; i < real_L; ++i)
223
+ VDFS.emplace_back(i);
224
+ }
225
+
226
+ L = static_cast<unsigned int>(real_L);
227
+ N = 0;
228
+ M = 0;
229
+ return true;
230
+ }
231
+
232
+
233
+ bool Load_items_pre(string &inst_name) {
234
+ ifstream file(inst_name);
235
+ if (!file.good()) {
236
+ cout << "!!!!!! No such file exists: " << inst_name << " !!!!!!\n";
237
+ return false;
238
+ }
239
+
240
+ string line;
241
+ int ditem;
242
+ while (getline(file, line) && give_time(clock() - start_time) < time_limit) {
243
+ istringstream word(line);
244
+ string itm;
245
+ vector<int> temp_vec;
246
+ vector<int> temp_lim;
247
+ bool sgn = false;
248
+
249
+ // L is final from Preprocess
250
+ while (word >> itm) {
251
+ ditem = stoi(itm);
252
+ if (item_dic[std::abs(ditem) - 1] == -1) {
253
+ if (!sgn)
254
+ sgn = (ditem < 0);
255
+ continue;
256
+ } else {
257
+ if (ditem > 0)
258
+ ditem = item_dic[ditem - 1];
259
+ else
260
+ ditem = -item_dic[-ditem - 1];
261
+ }
262
+ if (sgn) {
263
+ if (ditem > 0)
264
+ ditem = -ditem;
265
+ sgn = false;
266
+ }
267
+ if (temp_vec.size() <= (size_t)mlim)
268
+ temp_vec.push_back(ditem);
269
+ else
270
+ temp_lim.push_back(ditem);
271
+ }
272
+
273
+ if (temp_vec.empty())
274
+ continue;
275
+
276
+ ++N;
277
+ if (N % 10000000 == 0)
278
+ cout << N << endl;
279
+
280
+ if (temp_vec.size() + temp_lim.size() > (size_t)M)
281
+ M = static_cast<unsigned int>(temp_vec.size() + temp_lim.size());
282
+
283
+ // ─── Ensure DFS/VDFS size before Build_MDD ───
284
+ while (DFS.size() < L)
285
+ DFS.emplace_back(-static_cast<int>(DFS.size()) - 1);
286
+ while (VDFS.size() < L)
287
+ VDFS.emplace_back(static_cast<int>(VDFS.size()));
288
+ // ──────────────────────────────────────────────
289
+
290
+ Build_MDD(temp_vec, temp_lim);
291
+ }
292
+
293
+ return true;
294
+ }
295
+
296
+
297
+ bool Load_items(string &inst_name) {
298
+ // std::cerr << "[SANITY] In Load_items: inst_name='"
299
+ // << inst_name << "'" << std::endl;
300
+ ifstream file(inst_name);
301
+ if (!file.good()) {
302
+ cout << "!!!!!! No such file exists: " << inst_name << " !!!!!!\n";
303
+ return false;
304
+ }
305
+
306
+ string line;
307
+ int ditem;
308
+ while (getline(file, line) && give_time(clock() - start_time) < time_limit) {
309
+ ++N;
310
+ if (N % 1000000 == 0)
311
+ cout << "Found " << N << " sequence, with max line len "
312
+ << M << ", and " << L << " items, and " << E
313
+ << " enteries\n";
314
+
315
+ istringstream word(line);
316
+ string itm;
317
+ vector<int> temp_vec;
318
+ vector<int> temp_lim;
319
+
320
+ while (word >> itm) {
321
+ ditem = stoi(itm);
322
+
323
+ if (ditem > 0)
324
+ itmset_exists = true;
325
+
326
+ if (L < static_cast<unsigned int>(std::abs(ditem))) {
327
+ L = static_cast<unsigned int>(std::abs(ditem));
328
+ // Immediately grow DFS/VDFS to handle new L
329
+ while (DFS.size() < L)
330
+ DFS.emplace_back(-static_cast<int>(DFS.size()) - 1);
331
+ while (VDFS.size() < L)
332
+ VDFS.emplace_back(static_cast<int>(VDFS.size()));
333
+ }
334
+
335
+ if (temp_vec.size() < (size_t)mlim)
336
+ temp_vec.push_back(ditem);
337
+ else
338
+ temp_lim.push_back(ditem);
339
+ }
340
+ E += static_cast<unsigned long long>(temp_vec.size() + temp_lim.size());
341
+ if (temp_vec.size() + temp_lim.size() > (size_t)M)
342
+ M = static_cast<unsigned int>(temp_vec.size() + temp_lim.size());
343
+
344
+ // ─── Ensure DFS/VDFS size before Build_MDD ───
345
+ while (DFS.size() < L)
346
+ DFS.emplace_back(-static_cast<int>(DFS.size()) - 1);
347
+ while (VDFS.size() < L)
348
+ VDFS.emplace_back(static_cast<int>(VDFS.size()));
349
+ // ──────────────────────────────────────────────
350
+
351
+ Build_MDD(temp_vec, temp_lim);
352
+ }
353
+
354
+ return true;
355
+ }
356
+
357
+ } // namespace largehm
@@ -0,0 +1,64 @@
1
+ #ifndef LARGEHM_LOAD_INST_HPP
2
+ #define LARGEHM_LOAD_INST_HPP
3
+
4
+ #include <string>
5
+ #include <vector>
6
+ #include <fstream>
7
+ #include <ctime> // for clock_t
8
+
9
+ // We need Pattern and VPattern, so include freq_miner.hpp here:
10
+ #include "freq_miner.hpp"
11
+
12
+ namespace largehm {
13
+
14
+ //
15
+ // ─── Globals & Function Prototypes ───────────────────────────────────────────
16
+ //
17
+
18
+ // Output/folder:
19
+ extern std::string out_file;
20
+ extern std::string folder;
21
+
22
+ // Flags:
23
+ extern bool b_disp;
24
+ extern bool b_write;
25
+ extern bool use_dic;
26
+ extern bool use_list;
27
+ extern bool just_build;
28
+ extern bool pre_pro;
29
+ extern bool itmset_exists;
30
+
31
+ // Database statistics:
32
+ extern unsigned int M;
33
+ extern unsigned int L;
34
+ extern unsigned int mlim;
35
+ extern unsigned int time_limit;
36
+
37
+ extern unsigned long long int N;
38
+ extern unsigned long long int theta;
39
+ extern unsigned long long int E;
40
+
41
+ // Timing:
42
+ extern clock_t start_time;
43
+
44
+ // In‐memory sequences (only if “in‐memory” mode):
45
+ extern std::vector<std::vector<int>> items;
46
+
47
+ // Preprocessing dictionary (maps original → compressed IDs):
48
+ extern std::vector<int> item_dic;
49
+
50
+ // DFS stacks used by the miner (Pattern / VPattern):
51
+ extern std::vector<Pattern> DFS;
52
+ extern std::vector<VPattern> VDFS;
53
+
54
+ // Internal loader functions:
55
+ bool Load_items_pre(std::string &inst_name);
56
+ bool Load_items(std::string &inst_name);
57
+ bool Preprocess(std::string &inst, double thresh);
58
+
59
+ // Main entry‐point for loading & building the MDD:
60
+ bool Load_instance(std::string &items_file, double thresh);
61
+
62
+ } // namespace largehm
63
+
64
+ #endif // LARGEHM_LOAD_INST_HPP
@@ -0,0 +1,95 @@
1
+ #include <iostream>
2
+ #include <time.h>
3
+ #include <string.h>
4
+ #include <string>
5
+ #include "load_inst.hpp"
6
+ #include "build_mdd.hpp"
7
+ #include "utility.hpp"
8
+ #include "freq_miner.hpp"
9
+
10
+
11
using namespace std;

// File-scope option state filled in by main() while parsing the command line.
// These are separate objects from the same-named variables in namespace
// largehm.

// Destination for written output.
string out_file;

// Option flags; only pre_pro starts out enabled.
bool b_disp = false;
bool b_write = false;
bool use_dic = false;
bool use_list = false;
bool just_build = false;
bool pre_pro = true;

// Default time budget: ten hours, expressed in seconds.
unsigned int time_limit = 10 * 3600;

// Set from clock() just before loading starts.
clock_t start_time;

// Value given with -folder.
string folder;
27
+
28
+ int main(int argc, char* argv[]) {
29
+
30
+ string VV, attr;
31
+
32
+ double thresh = 0;
33
+ for (int i = 1; i<argc; i++) {
34
+ if (argv[i][0] != '-' || isdigit(argv[i][1]))
35
+ continue;
36
+ else if (strcmp(argv[i], "-thr") == 0)
37
+ thresh = stod(argv[i + 1]);
38
+ else if (strcmp(argv[i], "-file") == 0)
39
+ VV = argv[i + 1];
40
+ else if (strcmp(argv[i], "-time") == 0)
41
+ time_limit = stoi(argv[i + 1]);
42
+ else if (strcmp(argv[i], "-jbuild") == 0)
43
+ just_build = 1;
44
+ else if (strcmp(argv[i], "-folder") == 0)
45
+ folder = argv[i + 1];
46
+ else if (strcmp(argv[i], "-npre") == 0)
47
+ pre_pro = 0;
48
+ else if (strcmp(argv[i], "-dic") == 0)
49
+ use_dic = 1;
50
+ else if (strcmp(argv[i], "-out") == 0) {
51
+ if (i + 1 == argc || argv[i + 1][0] == '-')
52
+ b_disp = 1;
53
+ else if (argv[i + 1][0] == '+') {
54
+ b_disp = 1;
55
+ b_write = 1;
56
+ if (strlen(argv[i + 1]) > 1) {
57
+ out_file = argv[i + 1];
58
+ out_file = out_file.substr(1, out_file.size() - 1);
59
+ }
60
+ else
61
+ out_file = VV;
62
+ }
63
+ else {
64
+ b_write = 1;
65
+ out_file = argv[i + 1];
66
+ }
67
+ }
68
+
69
+ else
70
+ cout << "Command " << argv[i] << " not recognized and skipped.\n";
71
+ }
72
+
73
+
74
+
75
+ cout << "\n********************** " << VV << "**********************\n";
76
+
77
+ string item_file = VV;
78
+
79
+ cout << "loading instances...\n";
80
+
81
+ start_time = clock();
82
+
83
+ if (!largehm::Load_instance(item_file, thresh)) {
84
+ if (!largehm::just_build && largehm::give_time(clock() - largehm::start_time) < largehm::time_limit) {
85
+ largehm::Freq_miner();
86
+ if (largehm::give_time(clock() - largehm::start_time) >= largehm::time_limit)
87
+ std::cout << "TIME LIMIT REACHED\n";
88
+ std::cout << "Mining Complete\n\nFound a total of " << largehm::num_patt << " patterns\n";
89
+ std::cout << "\nTotal CPU time " << largehm::give_time(clock() - largehm::start_time) << " seconds\n\n";
90
+ }
91
+
92
+
93
+ return 0;
94
+ }
95
+ }
@@ -0,0 +1,38 @@
1
+ #include "utility.hpp"
2
+ #include "build_mdd.hpp"
3
+ #include "load_inst.hpp"
4
+ #include <iostream>
5
+ namespace largehm {
6
+ std::vector<std::vector<int>> collected;
7
+ bool check_parent(unsigned long long int cur_anct, unsigned long long int str_pnt, unsigned long long int start, vector<unsigned long long int>& strpnt_vec) {
8
+
9
+ vector<unsigned long long int> ancestors;
10
+
11
+ while (abs(Tree[cur_anct].itmset) > abs(Tree[str_pnt].itmset)) {
12
+ if (Tree[cur_anct].item > 0)
13
+ ancestors.push_back(cur_anct);
14
+ cur_anct = Tree[cur_anct].anct;
15
+ }
16
+
17
+ if (abs(Tree[cur_anct].itmset) == abs(Tree[str_pnt].itmset))
18
+ return 1;
19
+ else {
20
+ for (vector<unsigned long long int>::reverse_iterator it = ancestors.rbegin(); it != ancestors.rend(); ++it) {
21
+ for (unsigned int i = start; i < strpnt_vec.size(); ++i) {
22
+ if (strpnt_vec[i] == *it)
23
+ return 1;
24
+ }
25
+ }
26
+ }
27
+
28
+ return 0;
29
+
30
+ }
31
+
32
+
33
+
34
+ // float give_time(clock_t kk) {
35
+ // float ll = ((float)kk) / CLOCKS_PER_SEC;
36
+ // return ll;
37
+ // }
38
+ }
@@ -0,0 +1,29 @@
1
+ #pragma once
2
+
3
+ #include <vector>
4
+ #include <time.h>
5
+ #include <string>
6
+ #include "build_mdd.hpp"
7
+
8
namespace largehm {
// Kept because other largehm sources use unqualified standard names.
using namespace std;

// Patterns gathered by the miner; defined in utility.cpp.
extern std::vector<std::vector<int>> collected;

// Empties the collected-pattern store.
inline void ClearCollected() { collected.clear(); }

// Read-only access to the collected-pattern store.
inline const std::vector<std::vector<int>>& GetCollected() { return collected; }

// Converts a clock_t tick count into seconds.
inline float give_time(clock_t kk) {
    return static_cast<float>(kk) / CLOCKS_PER_SEC;
}

// Ancestor test on the Tree; implemented in utility.cpp.
bool check_parent(unsigned long long int cur_anct,
                  unsigned long long int str_pnt,
                  unsigned long long int start,
                  vector<unsigned long long int>& strpnt_vec);

}