effspm 0.1.5__cp310-cp310-win_amd64.whl → 0.3.0__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. effspm/__init__.py +9 -2
  2. effspm/_core.cpp +91 -13
  3. effspm/_effspm.cp310-win_amd64.pyd +0 -0
  4. effspm/_effspm.cpp +679 -0
  5. effspm/btminer/src/build_mdd.cpp +88 -0
  6. effspm/btminer/src/build_mdd.hpp +34 -0
  7. effspm/btminer/src/freq_miner.cpp +264 -0
  8. effspm/btminer/src/freq_miner.hpp +55 -0
  9. effspm/btminer/src/load_inst.cpp +275 -0
  10. effspm/btminer/src/load_inst.hpp +43 -0
  11. effspm/btminer/src/utility.cpp +50 -0
  12. effspm/btminer/src/utility.hpp +16 -0
  13. effspm/freq_miner.hpp +7 -1
  14. effspm/htminer/src/build_mdd.cpp +139 -0
  15. effspm/htminer/src/build_mdd.hpp +64 -0
  16. effspm/htminer/src/freq_miner.cpp +350 -0
  17. effspm/htminer/src/freq_miner.hpp +60 -0
  18. effspm/htminer/src/load_inst.cpp +394 -0
  19. effspm/htminer/src/load_inst.hpp +23 -0
  20. effspm/htminer/src/utility.cpp +72 -0
  21. effspm/htminer/src/utility.hpp +77 -0
  22. effspm/largebm/src/build_mdd.cpp +96 -0
  23. effspm/largebm/src/build_mdd.hpp +32 -0
  24. effspm/largebm/src/freq_miner.cpp +299 -0
  25. effspm/largebm/src/freq_miner.hpp +37 -0
  26. effspm/largebm/src/load_inst.cpp +224 -0
  27. effspm/largebm/src/load_inst.hpp +35 -0
  28. effspm/largebm/src/utility.cpp +35 -0
  29. effspm/largebm/src/utility.hpp +15 -0
  30. effspm/largehm/src/build_mdd.cpp +174 -0
  31. effspm/largehm/src/build_mdd.hpp +93 -0
  32. effspm/largehm/src/freq_miner.cpp +429 -0
  33. effspm/largehm/src/freq_miner.hpp +77 -0
  34. effspm/largehm/src/load_inst.cpp +375 -0
  35. effspm/largehm/src/load_inst.hpp +64 -0
  36. effspm/largehm/src/utility.cpp +38 -0
  37. effspm/largehm/src/utility.hpp +29 -0
  38. effspm/largepp/src/freq_miner.cpp +198 -0
  39. effspm/largepp/src/freq_miner.hpp +18 -0
  40. effspm/largepp/src/load_inst.cpp +238 -0
  41. effspm/largepp/src/load_inst.hpp +34 -0
  42. effspm/largepp/src/pattern.hpp +31 -0
  43. effspm/largepp/src/utility.cpp +34 -0
  44. effspm/largepp/src/utility.hpp +21 -0
  45. effspm/load_inst.hpp +18 -12
  46. effspm-0.3.0.dist-info/METADATA +237 -0
  47. effspm-0.3.0.dist-info/RECORD +54 -0
  48. {effspm-0.1.5.dist-info → effspm-0.3.0.dist-info}/WHEEL +1 -1
  49. effspm/_core.cp310-win_amd64.pyd +0 -0
  50. effspm-0.1.5.dist-info/METADATA +0 -38
  51. effspm-0.1.5.dist-info/RECORD +0 -14
  52. {effspm-0.1.5.dist-info → effspm-0.3.0.dist-info}/licenses/LICENSE +0 -0
  53. {effspm-0.1.5.dist-info → effspm-0.3.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,77 @@
1
+ #ifndef LARGEHM_FREQ_MINER_HPP
2
+ #define LARGEHM_FREQ_MINER_HPP
3
+ #include <cstdint>
4
+ #include <vector>
5
+ #include <string>
6
+ #include <fstream>
7
+ #include <ctime> // for clock_t
8
+ extern std::vector<std::uint64_t> ancest_base;
9
+ namespace largehm {
10
+
11
+ //
12
+ // ─── Pattern & VPattern ──────────────────────────────────────────────────────
13
+ //
14
+
15
/// Per-pattern record used by the miner.
///
/// The loaders create one instance per item id, seeded with the code
/// -(index + 1); every other field starts out empty / zero.
class Pattern {
public:
    std::vector<int> seq;
    unsigned int freq = 0;
    std::vector<int> list;
    std::vector<unsigned long long int> str_pnt;

    /// @param start_code  First element of `seq`; the default 0 leaves `seq`
    ///                    empty.
    Pattern(int start_code = 0) {
        if (start_code == 0)
            return;
        seq.push_back(start_code);
    }
};
27
+
28
/// Companion record to Pattern; `ass_patt` holds the integer passed at
/// construction (the loaders pass the record's own index).
class VPattern {
public:
    std::vector<unsigned long long int> str_pnt;
    std::vector<unsigned long long int> seq_ID;
    int ass_patt;

    /// @param assoc  Value stored in `ass_patt`; defaults to -1.
    VPattern(int assoc = -1) : ass_patt{assoc} {}
};
36
+
37
+ //
38
+ // ─── Globals used by Freq_miner ──────────────────────────────────────────────
39
+ //
40
+ extern std::vector<Pattern> DFS;
41
+ extern std::vector<VPattern> VDFS;
42
+
43
+ extern unsigned long long int num_patt;
44
+
45
+ extern std::vector<bool> ilist;
46
+ extern std::vector<bool> slist;
47
+
48
+ extern std::vector<Pattern> pot_patt;
49
+ extern std::vector<VPattern> pot_vpatt;
50
+ extern std::vector<unsigned long long int> last_strpnt;
51
+
52
+ extern std::vector<int> DFS_numfound;
53
+
54
+ extern Pattern _patt;
55
+ extern VPattern _vpatt;
56
+
57
+ extern int itmset_size;
58
+ extern int last_neg;
59
+ extern bool ilist_nempty;
60
+
61
+ //
62
+ // ─── Function Prototypes ─────────────────────────────────────────────────────
63
+ //
64
+ void Freq_miner();
65
+ void Extend_patt(Pattern& _patt);
66
+ void Mine_vec(std::uint64_t seq_ID,
67
+ int pos,
68
+ int num_found,
69
+ std::vector<std::uint64_t>& ancest,
70
+ std::vector<int>& items,
71
+ std::uint64_t pnt,
72
+ int sgn);
73
+ void Out_patt(std::vector<int>& seq, unsigned int freq);
74
+
75
+ } // namespace largehm
76
+
77
+ #endif // LARGEHM_FREQ_MINER_HPP
@@ -0,0 +1,375 @@
1
+ // ─── effspm/largehm/src/load_inst.cpp ────────────────────────────────────────
2
+
3
+ #include <iostream>
4
+ #include <sstream>
5
+ #include <algorithm>
6
+ #include <fstream>
7
+ #include <cmath>
8
+ #include <ctime>
9
+
10
+ #include "load_inst.hpp"
11
+ #include "utility.hpp"
12
+ #include "build_mdd.hpp"
13
+ #include "freq_miner.hpp"
14
+
15
+ namespace largehm {
16
+ using namespace std;
17
+
18
+ string out_file; // presumably the pattern output path; not read in this file - TODO confirm
19
+ string folder; // presumably an output directory; not read in this file - TODO confirm
20
+
21
+ bool b_disp = false; // when true, the loaders print progress, statistics and file errors to cout
22
+ bool b_write = false; // not read in this file
23
+ bool use_dic = false; // not read in this file
24
+ bool use_list = false; // not read in this file
25
+ bool just_build = false; // not read in this file
26
+ bool pre_pro = false; // selects the branch taken in Load_instance()
27
+ bool itmset_exists = false; // set to true as soon as a positive item id is read
28
+
29
+ unsigned int M = 0; // largest token count of any line read; Preprocess() resets it to 0 on exit
30
+ unsigned int L = 0; // largest |item id| read; Preprocess() replaces it with the number of retained items
31
+ unsigned int mlim = 0; // per-line cut-off between temp_vec and temp_lim (Load_items tests <, Load_items_pre tests <=)
32
+ unsigned int time_limit = 0; // seconds; file reading stops once give_time(clock() - start_time) reaches it
33
+
34
+ unsigned long long int N = 0; // lines read; Load_items_pre() counts only lines that keep at least one item
35
+ unsigned long long int theta = 0; // absolute support threshold derived from the thresh argument
36
+ unsigned long long int E = 0; // total number of tokens read by Load_items()
37
+
38
+ clock_t start_time = 0; // clock() reference for the time_limit check; assigned in Load_instance()
39
+
40
+ vector<vector<int>> items; // cleared by Load_instance(); not otherwise used in this file
41
+
42
+ vector<int> item_dic; // filled by Preprocess(): item_dic[id - 1] is the compressed id, or -1 when support < theta
43
+ vector<Pattern> DFS; // one Pattern per item id, entry i seeded with -(i + 1)
44
+ vector<VPattern> VDFS; // one VPattern per item id, entry i seeded with i
45
+
46
+
47
+ // ─────────────────────────────────────────────────────────────────────────────
48
+ // Load_instance
49
+ // ─────────────────────────────────────────────────────────────────────────────
50
+ bool Load_instance(string& items_file, double thresh) {
51
+ // 1) CLEAR leftover state
52
+ Tree.clear();
53
+ VTree.clear();
54
+ CTree.clear();
55
+ DFS.clear();
56
+ VDFS.clear();
57
+ item_dic.clear();
58
+ items.clear();
59
+
60
+ N = 0;
61
+ M = 0;
62
+ L = 0;
63
+ E = 0;
64
+ theta = 0;
65
+ itmset_exists = false;
66
+
67
+ clock_t kk = clock();
68
+
69
+ // root
70
+ Tree.emplace_back(0, 0, 0);
71
+
72
+ if (!pre_pro) {
73
+ if (!Load_items(items_file))
74
+ return false;
75
+
76
+ DFS.reserve(L);
77
+ while (DFS.size() < L)
78
+ DFS.emplace_back(-static_cast<int>(DFS.size()) - 1);
79
+
80
+ VDFS.reserve(L);
81
+ while (VDFS.size() < L)
82
+ VDFS.emplace_back(static_cast<int>(VDFS.size()));
83
+
84
+ if (thresh < 1.0)
85
+ theta = static_cast<unsigned long long>(ceil(thresh * N));
86
+ else
87
+ theta = static_cast<unsigned long long>(thresh);
88
+
89
+ start_time = clock();
90
+ }
91
+ else {
92
+ if (!Load_items(items_file))
93
+ return false;
94
+
95
+ if (thresh < 1.0)
96
+ theta = static_cast<unsigned long long>(ceil(thresh * N));
97
+ else
98
+ theta = static_cast<unsigned long long>(thresh);
99
+
100
+ start_time = clock();
101
+ }
102
+
103
+ // 👇 only print when verbose/b_disp
104
+ if (b_disp) {
105
+ cout << "\nMDD Database built in " << give_time(clock() - kk) << " seconds\n\n";
106
+ cout << "Found " << N << " sequence, with max line len " << M
107
+ << ", and " << L << " items, and " << E << " enteries\n";
108
+ // cout << "Total Trie nodes: " << Tree.size()
109
+ // << " Total CTree nodes: " << CTree.size()
110
+ // << " Total VTree nodes: " << VTree.size() << endl;
111
+ }
112
+
113
+ return true;
114
+ }
115
+
116
+
117
+ // ─────────────────────────────────────────────────────────────────────────────
118
+ // Preprocess
119
+ // ─────────────────────────────────────────────────────────────────────────────
120
+ bool Preprocess(string &inst, double thresh) {
121
+ vector<unsigned long long int> MN(100, 0);
122
+ vector<vector<bool>> ML(100, vector<bool>(1000000, false));
123
+
124
+ ifstream file(inst);
125
+ if (!file.good()) {
126
+ if (b_disp)
127
+ cout << "!!!!!! No such file exists: " << inst << " !!!!!!\n";
128
+ return false;
129
+ }
130
+
131
+ vector<unsigned long long int> freq(1000000, 0ULL);
132
+ vector<unsigned long long int> counted(1000000, 0ULL);
133
+
134
+ string line;
135
+ int ditem;
136
+ while (getline(file, line) && give_time(clock() - start_time) < time_limit) {
137
+ ++N;
138
+ if (b_disp && N % 10000000 == 0)
139
+ cout << "N: " << N << endl;
140
+
141
+ istringstream word(line);
142
+ string itm;
143
+ int size_m = 0;
144
+ while (word >> itm) {
145
+ ++size_m;
146
+ ditem = stoi(itm);
147
+
148
+ if (ditem > 0)
149
+ itmset_exists = true;
150
+ else
151
+ ditem = -ditem;
152
+
153
+ if (size_m < (int)MN.size()) {
154
+ ++MN[size_m - 1];
155
+ if ((int)ML[size_m - 1].size() < ditem) {
156
+ ML[size_m - 1].resize(ditem, false);
157
+ }
158
+ ML[size_m - 1][ditem - 1] = true;
159
+ }
160
+
161
+ if (L < static_cast<unsigned int>(ditem)) {
162
+ L = static_cast<unsigned int>(ditem);
163
+ }
164
+
165
+ if ((int)freq.size() < ditem) {
166
+ freq.resize(ditem, 0ULL);
167
+ counted.resize(ditem, 0ULL);
168
+ }
169
+ if (counted[ditem - 1] != N) {
170
+ ++freq[ditem - 1];
171
+ counted[ditem - 1] = N;
172
+ }
173
+ }
174
+ if (size_m > (int)M)
175
+ M = size_m;
176
+ }
177
+
178
+ if (thresh < 1.0)
179
+ theta = static_cast<unsigned long long>(ceil(thresh * N));
180
+ else
181
+ theta = static_cast<unsigned long long>(thresh);
182
+
183
+ int real_L = 0;
184
+ item_dic.assign(L, -1);
185
+ vector<bool> item_in(L, false);
186
+ for (int i = 0; i < (int)L; ++i) {
187
+ if (freq[i] >= theta) {
188
+ item_dic[i] = ++real_L;
189
+ item_in[i] = true;
190
+ }
191
+ }
192
+
193
+ if (b_disp)
194
+ cout << "Original number of items: " << L << " Reduced to: " << real_L << endl;
195
+
196
+ unsigned long long int LpM = 1;
197
+ mlim = M;
198
+ int orgmlim = 0;
199
+ int ulim = min(1 + real_L / 4, 10);
200
+ unsigned long long int ml;
201
+
202
+ for (int i = 0; i + ulim < (int)MN.size() && i + ulim < (int)M; ++i) {
203
+ ml = 0;
204
+ for (int j = 0; j < (int)L; ++j) {
205
+ if (ML[i][j] && item_in[j])
206
+ ++ml;
207
+ }
208
+ LpM *= ml * (1 + itmset_exists);
209
+
210
+ if (b_disp)
211
+ cout << ml << " " << LpM << " " << MN[i] << endl;
212
+
213
+ if (LpM * ulim > MN[i]) {
214
+ orgmlim = i;
215
+ while (i + ulim - 1 < (int)MN.size() && i + ulim - 1 < (int)M) {
216
+ if (b_disp)
217
+ cout << (MN[i - 1] - MN[i + ulim - 1]) << " "
218
+ << MN[i + ulim - 1] << endl;
219
+
220
+ if ((MN[i - 1] - MN[i + ulim - 1]) < MN[i + ulim - 1]
221
+ && MN[i + ulim - 1] < 600000000) {
222
+ mlim = i - 1;
223
+ break;
224
+ }
225
+ ++i;
226
+ }
227
+ break;
228
+ }
229
+ }
230
+
231
+ if (b_disp)
232
+ cout << "M is: " << M << " Mlim is: " << mlim
233
+ << " ulim is: " << ulim
234
+ << " original mlim is: " << orgmlim
235
+ << " guess is: "
236
+ << round((log(N) - log(6)) / log(real_L)) << endl;
237
+
238
+ if (mlim < (int)M) {
239
+ for (int i = 0; i < real_L; ++i)
240
+ VDFS.emplace_back(i);
241
+ }
242
+
243
+ L = static_cast<unsigned int>(real_L);
244
+ N = 0;
245
+ M = 0;
246
+ return true;
247
+ }
248
+
249
+
250
+ // ─────────────────────────────────────────────────────────────────────────────
251
+ // Load_items_pre
252
+ // ─────────────────────────────────────────────────────────────────────────────
253
+ bool Load_items_pre(string &inst_name) {
254
+ ifstream file(inst_name);
255
+ if (!file.good()) {
256
+ if (b_disp)
257
+ cout << "!!!!!! No such file exists: " << inst_name << " !!!!!!\n";
258
+ return false;
259
+ }
260
+
261
+ string line;
262
+ int ditem;
263
+ while (getline(file, line) && give_time(clock() - start_time) < time_limit) {
264
+ istringstream word(line);
265
+ string itm;
266
+ vector<int> temp_vec;
267
+ vector<int> temp_lim;
268
+ bool sgn = false;
269
+
270
+ while (word >> itm) {
271
+ ditem = stoi(itm);
272
+ if (item_dic[std::abs(ditem) - 1] == -1) {
273
+ if (!sgn)
274
+ sgn = (ditem < 0);
275
+ continue;
276
+ } else {
277
+ if (ditem > 0)
278
+ ditem = item_dic[ditem - 1];
279
+ else
280
+ ditem = -item_dic[-ditem - 1];
281
+ }
282
+ if (sgn) {
283
+ if (ditem > 0)
284
+ ditem = -ditem;
285
+ sgn = false;
286
+ }
287
+ if (temp_vec.size() <= (size_t)mlim)
288
+ temp_vec.push_back(ditem);
289
+ else
290
+ temp_lim.push_back(ditem);
291
+ }
292
+
293
+ if (temp_vec.empty())
294
+ continue;
295
+
296
+ ++N;
297
+ if (b_disp && N % 10000000 == 0)
298
+ cout << N << endl;
299
+
300
+ if (temp_vec.size() + temp_lim.size() > (size_t)M)
301
+ M = static_cast<unsigned int>(temp_vec.size() + temp_lim.size());
302
+
303
+ while (DFS.size() < L)
304
+ DFS.emplace_back(-static_cast<int>(DFS.size()) - 1);
305
+ while (VDFS.size() < L)
306
+ VDFS.emplace_back(static_cast<int>(VDFS.size()));
307
+
308
+ Build_MDD(temp_vec, temp_lim);
309
+ }
310
+
311
+ return true;
312
+ }
313
+
314
+
315
+ // ─────────────────────────────────────────────────────────────────────────────
316
+ // Load_items (no preprocess)
317
+ // ─────────────────────────────────────────────────────────────────────────────
318
+ bool Load_items(string &inst_name) {
319
+ ifstream file(inst_name);
320
+ if (!file.good()) {
321
+ if (b_disp)
322
+ cout << "!!!!!! No such file exists: " << inst_name << " !!!!!!\n";
323
+ return false;
324
+ }
325
+
326
+ string line;
327
+ int ditem;
328
+ while (getline(file, line) && give_time(clock() - start_time) < time_limit) {
329
+ ++N;
330
+ if (b_disp && N % 1000000 == 0)
331
+ cout << "Found " << N << " sequence, with max line len "
332
+ << M << ", and " << L << " items, and " << E
333
+ << " enteries\n";
334
+
335
+ istringstream word(line);
336
+ string itm;
337
+ vector<int> temp_vec;
338
+ vector<int> temp_lim;
339
+
340
+ while (word >> itm) {
341
+ ditem = stoi(itm);
342
+
343
+ if (ditem > 0)
344
+ itmset_exists = true;
345
+
346
+ if (L < static_cast<unsigned int>(std::abs(ditem))) {
347
+ L = static_cast<unsigned int>(std::abs(ditem));
348
+
349
+ while (DFS.size() < L)
350
+ DFS.emplace_back(-static_cast<int>(DFS.size()) - 1);
351
+ while (VDFS.size() < L)
352
+ VDFS.emplace_back(static_cast<int>(VDFS.size()));
353
+ }
354
+
355
+ if (temp_vec.size() < (size_t)mlim)
356
+ temp_vec.push_back(ditem);
357
+ else
358
+ temp_lim.push_back(ditem);
359
+ }
360
+ E += static_cast<unsigned long long>(temp_vec.size() + temp_lim.size());
361
+ if (temp_vec.size() + temp_lim.size() > (size_t)M)
362
+ M = static_cast<unsigned int>(temp_vec.size() + temp_lim.size());
363
+
364
+ while (DFS.size() < L)
365
+ DFS.emplace_back(-static_cast<int>(DFS.size()) - 1);
366
+ while (VDFS.size() < L)
367
+ VDFS.emplace_back(static_cast<int>(VDFS.size()));
368
+
369
+ Build_MDD(temp_vec, temp_lim);
370
+ }
371
+
372
+ return true;
373
+ }
374
+
375
+ } // namespace largehm
@@ -0,0 +1,64 @@
1
+ #ifndef LARGEHM_LOAD_INST_HPP
2
+ #define LARGEHM_LOAD_INST_HPP
3
+
4
+ #include <string>
5
+ #include <vector>
6
+ #include <fstream>
7
+ #include <ctime> // for clock_t
8
+
9
+ // We need Pattern and VPattern, so include freq_miner.hpp here:
10
+ #include "freq_miner.hpp"
11
+
12
+ namespace largehm {
13
+
14
+ //
15
+ // ─── Globals & Function Prototypes ───────────────────────────────────────────
16
+ //
17
+
18
+ // Output file / folder names (defined in load_inst.cpp; not read by the loaders there):
19
+ extern std::string out_file;
20
+ extern std::string folder;
21
+
22
+ // Flags (all default to false in load_inst.cpp):
23
+ extern bool b_disp; // enables the loaders' console output
24
+ extern bool b_write;
25
+ extern bool use_dic;
26
+ extern bool use_list;
27
+ extern bool just_build;
28
+ extern bool pre_pro; // selects the branch taken in Load_instance()
29
+ extern bool itmset_exists; // set once a positive item id has been read
30
+
31
+ // Database statistics maintained by the loaders:
32
+ extern unsigned int M; // largest token count of any line read
33
+ extern unsigned int L; // largest |item id| read, or the retained-item count after Preprocess()
34
+ extern unsigned int mlim; // per-line cut-off between the two vectors passed to Build_MDD()
35
+ extern unsigned int time_limit; // seconds, compared against give_time(clock() - start_time)
36
+
37
+ extern unsigned long long int N; // lines read
38
+ extern unsigned long long int theta; // absolute support threshold
39
+ extern unsigned long long int E; // total tokens read by Load_items()
40
+
41
+ // Timing reference for the time_limit check:
42
+ extern clock_t start_time;
43
+
44
+ // Sequence storage; load_inst.cpp only clears it - NOTE(review): confirm whether anything still fills it:
45
+ extern std::vector<std::vector<int>> items;
46
+
47
+ // Preprocessing dictionary: item_dic[id - 1] is the compressed id, or -1 for a dropped item:
48
+ extern std::vector<int> item_dic;
49
+
50
+ // One Pattern / VPattern per item id (also declared in freq_miner.hpp; defined in load_inst.cpp):
51
+ extern std::vector<Pattern> DFS;
52
+ extern std::vector<VPattern> VDFS;
53
+
54
+ // Internal loader functions; each returns false when the file cannot be opened:
55
+ bool Load_items_pre(std::string &inst_name);
56
+ bool Load_items(std::string &inst_name);
57
+ bool Preprocess(std::string &inst, double thresh);
58
+
59
+ // Main entry point: clears previous state, loads the file and computes theta:
60
+ bool Load_instance(std::string &items_file, double thresh);
61
+
62
+ } // namespace largehm
63
+
64
+ #endif // LARGEHM_LOAD_INST_HPP
@@ -0,0 +1,38 @@
1
+ #include "utility.hpp"
2
+ #include "build_mdd.hpp"
3
+ #include "load_inst.hpp"
4
+ #include <iostream>
5
+ namespace largehm {
6
+ std::vector<std::vector<int>> collected; // storage behind ClearCollected() / GetCollected() in utility.hpp
7
+ bool check_parent(unsigned long long int cur_anct, unsigned long long int str_pnt, unsigned long long int start, vector<unsigned long long int>& strpnt_vec) {
8
+
9
+ vector<unsigned long long int> ancestors;
10
+
11
+ while (abs(Tree[cur_anct].itmset) > abs(Tree[str_pnt].itmset)) {
12
+ if (Tree[cur_anct].item > 0)
13
+ ancestors.push_back(cur_anct);
14
+ cur_anct = Tree[cur_anct].anct;
15
+ }
16
+
17
+ if (abs(Tree[cur_anct].itmset) == abs(Tree[str_pnt].itmset))
18
+ return 1;
19
+ else {
20
+ for (vector<unsigned long long int>::reverse_iterator it = ancestors.rbegin(); it != ancestors.rend(); ++it) {
21
+ for (unsigned int i = start; i < strpnt_vec.size(); ++i) {
22
+ if (strpnt_vec[i] == *it)
23
+ return 1;
24
+ }
25
+ }
26
+ }
27
+
28
+ return 0;
29
+
30
+ }
31
+
32
+
33
+
34
+ // float give_time(clock_t kk) {
35
+ // float ll = ((float)kk) / CLOCKS_PER_SEC;
36
+ // return ll;
37
+ // }
38
+ }
@@ -0,0 +1,29 @@
1
+ #pragma once
2
+
3
+ #include <vector>
4
+ #include <time.h>
5
+ #include <string>
6
+ #include "build_mdd.hpp"
7
+
8
+ namespace largehm {
9
+ using namespace std;
10
+
11
+ extern std::vector<std::vector<int>> collected;
12
+
13
+ // Helpers to clear and fetch collected patterns from Python:
14
+ inline void ClearCollected() {
15
+ collected.clear();
16
+ }
17
+ inline const std::vector<std::vector<int>>& GetCollected() {
18
+ return collected;
19
+ }
20
+
21
+ // A small timer helper:
22
/// Converts a clock() tick count to seconds as a single-precision float.
///
/// @param kk  Tick count, typically the difference of two clock() readings.
/// @return kk / CLOCKS_PER_SEC.
inline float give_time(clock_t kk) {
    return static_cast<float>(kk) / CLOCKS_PER_SEC;
}
26
+ bool check_parent(unsigned long long int cur_anct, unsigned long long int str_pnt, unsigned long long int start, vector<unsigned long long int>& strpnt_vec);
27
+
28
+
29
+ }