effspm 0.3.0__cp310-cp310-macosx_11_0_arm64.whl → 0.3.3__cp310-cp310-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,77 +1,54 @@
1
- #ifndef LARGEHM_FREQ_MINER_HPP
2
- #define LARGEHM_FREQ_MINER_HPP
3
- #include <cstdint>
4
- #include <vector>
5
- #include <string>
6
- #include <fstream>
7
- #include <ctime> // for clock_t
8
- extern std::vector<std::uint64_t> ancest_base;
1
+ #pragma once
2
+
3
+ #include "load_inst.hpp"
4
+ #include "build_mdd.hpp"
5
+
9
6
  namespace largehm {
10
7
 
11
- //
12
- // ─── Pattern & VPattern ──────────────────────────────────────────────────────
13
- //
8
+ using namespace std;
9
+
10
+ void Freq_miner();
14
11
 
15
12
  class Pattern {
16
13
  public:
17
- std::vector<int> seq;
18
- unsigned int freq;
19
- std::vector<int> list;
20
- std::vector<unsigned long long int> str_pnt;
14
+ vector<int> seq;
15
+ vector<unsigned long long int> str_pnt;
16
+ vector<int> list;
17
+ unsigned long long int freq;
18
+
19
+ Pattern(vector<int>& _seq, int item) {
20
+ seq.swap(_seq);
21
+ seq.push_back(item);
22
+ freq = 0;
23
+ }
21
24
 
22
- Pattern(int start_code = 0) : freq(0) {
23
- if (start_code != 0)
24
- seq.push_back(start_code);
25
+ Pattern(int item) {
26
+ seq.push_back(item);
27
+ freq = 0;
28
+ }
29
+
30
+ Pattern() {
31
+ freq = 0;
25
32
  }
26
33
  };
27
34
 
28
35
  class VPattern {
29
36
  public:
30
- std::vector<unsigned long long int> str_pnt;
31
- std::vector<unsigned long long int> seq_ID;
32
- int ass_patt;
37
+ unsigned long long int ass_patt;
38
+ vector<int> str_pnt;
39
+ vector<unsigned long long int> seq_ID;
33
40
 
34
- VPattern(int assoc = -1) : ass_patt(assoc) {}
35
- };
41
+ VPattern(unsigned long long int _patt) {
42
+ ass_patt = _patt;
43
+ }
36
44
 
37
- //
38
- // ─── Globals used by Freq_miner ──────────────────────────────────────────────
39
- //
40
- extern std::vector<Pattern> DFS;
41
- extern std::vector<VPattern> VDFS;
45
+ VPattern() {
46
+ ass_patt = 0;
47
+ }
48
+ };
42
49
 
43
50
  extern unsigned long long int num_patt;
44
-
45
- extern std::vector<bool> ilist;
46
- extern std::vector<bool> slist;
47
-
48
- extern std::vector<Pattern> pot_patt;
49
- extern std::vector<VPattern> pot_vpatt;
50
- extern std::vector<unsigned long long int> last_strpnt;
51
-
52
- extern std::vector<int> DFS_numfound;
53
-
54
- extern Pattern _patt;
55
- extern VPattern _vpatt;
56
-
57
- extern int itmset_size;
58
- extern int last_neg;
59
- extern bool ilist_nempty;
60
-
61
- //
62
- // ─── Function Prototypes ─────────────────────────────────────────────────────
63
- //
64
- void Freq_miner();
65
- void Extend_patt(Pattern& _patt);
66
- void Mine_vec(std::uint64_t seq_ID,
67
- int pos,
68
- int num_found,
69
- std::vector<std::uint64_t>& ancest,
70
- std::vector<int>& items,
71
- std::uint64_t pnt,
72
- int sgn);
73
- void Out_patt(std::vector<int>& seq, unsigned int freq);
51
+ extern vector<Pattern> DFS;
52
+ extern vector<VPattern> VDFS;
74
53
 
75
54
  } // namespace largehm
76
-
77
- #endif // LARGEHM_FREQ_MINER_HPP
@@ -1,142 +1,106 @@
1
- // ─── effspm/largehm/src/load_inst.cpp ────────────────────────────────────────
2
-
3
1
  #include <iostream>
4
2
  #include <sstream>
5
3
  #include <algorithm>
6
- #include <fstream>
7
- #include <cmath>
8
- #include <ctime>
9
-
4
+ #include <math.h>
10
5
  #include "load_inst.hpp"
11
6
  #include "utility.hpp"
12
7
  #include "build_mdd.hpp"
13
8
  #include "freq_miner.hpp"
14
9
 
15
10
  namespace largehm {
16
- using namespace std;
17
11
 
18
- string out_file;
19
- string folder;
12
+ using namespace std;
20
13
 
21
- bool b_disp = false;
22
- bool b_write = false;
23
- bool use_dic = false;
24
- bool use_list = false;
25
- bool just_build = false;
26
- bool pre_pro = false;
27
- bool itmset_exists = false;
14
+ unsigned int M = 0, L = 0, mlim;
15
+ unsigned long long int N = 0, theta, E = 0;
28
16
 
29
- unsigned int M = 0;
30
- unsigned int L = 0;
31
- unsigned int mlim = 0;
32
- unsigned int time_limit = 0;
17
+ bool itmset_exists = 0;
33
18
 
34
- unsigned long long int N = 0;
35
- unsigned long long int theta = 0;
36
- unsigned long long int E = 0;
19
+ vector<int> item_dic;
20
+ vector<Pattern> DFS;
21
+ vector<VPattern> VDFS;
37
22
 
38
- clock_t start_time = 0;
23
+ string out_file, folder;
39
24
 
40
- vector<vector<int>> items;
25
+ bool b_disp = 0;
26
+ bool b_write = 0;
27
+ bool use_dic = 0;
28
+ bool just_build = 0;
29
+ bool pre_pro = 1;
41
30
 
42
- vector<int> item_dic;
43
- vector<Pattern> DFS;
44
- vector<VPattern> VDFS;
31
+ unsigned int time_limit = 10 * 3600;
32
+ clock_t start_time;
45
33
 
34
+ void Load_items_pre(string &inst_name);
35
+ bool Load_items(string &inst_name);
36
+ bool Preprocess(string& inst, double thresh);
46
37
 
47
- // ─────────────────────────────────────────────────────────────────────────────
48
- // Load_instance
49
- // ─────────────────────────────────────────────────────────────────────────────
50
38
  bool Load_instance(string& items_file, double thresh) {
51
- // 1) CLEAR leftover state
52
- Tree.clear();
53
- VTree.clear();
54
- CTree.clear();
55
- DFS.clear();
56
- VDFS.clear();
57
- item_dic.clear();
58
- items.clear();
59
-
60
- N = 0;
61
- M = 0;
62
- L = 0;
63
- E = 0;
64
- theta = 0;
65
- itmset_exists = false;
66
39
 
67
40
  clock_t kk = clock();
68
-
69
- // root
70
41
  Tree.emplace_back(0, 0, 0);
71
42
 
72
- if (!pre_pro) {
73
- if (!Load_items(items_file))
74
- return false;
43
+ if (pre_pro) {
44
+ if (!Preprocess(items_file, thresh))
45
+ return 0;
75
46
 
76
- DFS.reserve(L);
77
- while (DFS.size() < L)
78
- DFS.emplace_back(-static_cast<int>(DFS.size()) - 1);
47
+ // ✅ KEEP THIS: Preprocess timing
48
+ if (b_disp)
49
+ cout << "\nPreprocess done in " << give_time(clock() - kk) << " seconds\n\n";
79
50
 
80
- VDFS.reserve(L);
81
- while (VDFS.size() < L)
82
- VDFS.emplace_back(static_cast<int>(VDFS.size()));
51
+ DFS.reserve(L);
52
+ for (int i = 0; i < (int)L; ++i)
53
+ DFS.emplace_back(-i - 1);
83
54
 
84
- if (thresh < 1.0)
85
- theta = static_cast<unsigned long long>(ceil(thresh * N));
86
- else
87
- theta = static_cast<unsigned long long>(thresh);
55
+ kk = clock();
56
+ Load_items_pre(items_file);
88
57
 
89
- start_time = clock();
90
58
  }
59
+ else if (!Load_items(items_file))
60
+ return 0;
91
61
  else {
92
- if (!Load_items(items_file))
93
- return false;
94
-
95
- if (thresh < 1.0)
96
- theta = static_cast<unsigned long long>(ceil(thresh * N));
62
+ if (thresh < 1)
63
+ theta = ceil(thresh * N);
97
64
  else
98
- theta = static_cast<unsigned long long>(thresh);
99
-
100
- start_time = clock();
65
+ theta = thresh;
101
66
  }
102
67
 
103
- // 👇 only print when verbose/b_disp
104
- if (b_disp) {
105
- cout << "\nMDD Database built in " << give_time(clock() - kk) << " seconds\n\n";
106
- cout << "Found " << N << " sequence, with max line len " << M
107
- << ", and " << L << " items, and " << E << " enteries\n";
108
- // cout << "Total Trie nodes: " << Tree.size()
109
- // << " Total CTree nodes: " << CTree.size()
110
- // << " Total VTree nodes: " << VTree.size() << endl;
111
- }
68
+ // KEEP THIS: MDD build timing
69
+ if (b_disp)
70
+ cout << "\nMDD Database built in " << give_time(clock() - kk) << " seconds\n\n";
112
71
 
113
- return true;
114
- }
72
+ // ✅ KEEP THIS: main summary line
73
+ if (b_disp)
74
+ cout << "Found " << N << " sequence, with max line len " << M
75
+ << ", and " << L << " items, and " << E << " enteries\n";
76
+
77
+ // ❌ COMMENT OUT: extra debug
78
+ // cout << "Total Trie nodes: " << Tree.size()
79
+ // << " Total CTree nodes: " << CTree.size()
80
+ // << " Total VTree nodes: " << VTree.size() << endl;
115
81
 
82
+ return 1;
83
+ }
116
84
 
117
- // ─────────────────────────────────────────────────────────────────────────────
118
- // Preprocess
119
- // ─────────────────────────────────────────────────────────────────────────────
120
85
  bool Preprocess(string &inst, double thresh) {
121
- vector<unsigned long long int> MN(100, 0);
122
- vector<vector<bool>> ML(100, vector<bool>(1000000, false));
123
86
 
87
+ vector<unsigned long long int> MN(100, 0);
88
+ vector<vector<bool>> ML(100, vector<bool>(1000000, 0));
124
89
  ifstream file(inst);
90
+
91
+ vector<unsigned long long int> freq(1000000);
92
+ vector<unsigned long long int> counted(1000000, 0);
93
+
125
94
  if (!file.good()) {
126
- if (b_disp)
127
- cout << "!!!!!! No such file exists: " << inst << " !!!!!!\n";
128
- return false;
95
+ // cout << "!!!!!! No such file exists: " << inst << " !!!!!!\n";
96
+ return 0;
129
97
  }
130
98
 
131
- vector<unsigned long long int> freq(1000000, 0ULL);
132
- vector<unsigned long long int> counted(1000000, 0ULL);
133
-
134
99
  string line;
135
100
  int ditem;
136
101
  while (getline(file, line) && give_time(clock() - start_time) < time_limit) {
137
102
  ++N;
138
- if (b_disp && N % 10000000 == 0)
139
- cout << "N: " << N << endl;
103
+ // if (N % 10000000 == 0) cout << "N: " << N << endl;
140
104
 
141
105
  istringstream word(line);
142
106
  string itm;
@@ -146,117 +110,114 @@ bool Preprocess(string &inst, double thresh) {
146
110
  ditem = stoi(itm);
147
111
 
148
112
  if (ditem > 0)
149
- itmset_exists = true;
113
+ itmset_exists = 1;
150
114
  else
151
- ditem = -ditem;
115
+ ditem *= -1;
152
116
 
153
117
  if (size_m < (int)MN.size()) {
154
118
  ++MN[size_m - 1];
155
- if ((int)ML[size_m - 1].size() < ditem) {
156
- ML[size_m - 1].resize(ditem, false);
119
+ if (ML[size_m - 1].size() < (size_t)ditem) {
120
+ ML[size_m - 1].reserve(ditem);
121
+ while (ML[size_m - 1].size() < (size_t)ditem)
122
+ ML[size_m - 1].push_back(0);
157
123
  }
158
- ML[size_m - 1][ditem - 1] = true;
124
+ ML[size_m - 1][ditem - 1] = 1;
159
125
  }
160
126
 
161
- if (L < static_cast<unsigned int>(ditem)) {
162
- L = static_cast<unsigned int>(ditem);
163
- }
127
+ if (L < (unsigned int)ditem)
128
+ L = ditem;
164
129
 
165
- if ((int)freq.size() < ditem) {
166
- freq.resize(ditem, 0ULL);
167
- counted.resize(ditem, 0ULL);
130
+ if (freq.size() < L) {
131
+ freq.reserve(L);
132
+ counted.reserve(L);
133
+ while (freq.size() < L) {
134
+ freq.push_back(0);
135
+ counted.push_back(0);
136
+ }
168
137
  }
138
+
169
139
  if (counted[ditem - 1] != N) {
170
140
  ++freq[ditem - 1];
171
141
  counted[ditem - 1] = N;
172
142
  }
143
+
144
+ ++E; // count entries
173
145
  }
174
146
  if (size_m > (int)M)
175
147
  M = size_m;
176
148
  }
177
149
 
178
- if (thresh < 1.0)
179
- theta = static_cast<unsigned long long>(ceil(thresh * N));
150
+ if (thresh < 1)
151
+ theta = ceil(thresh * N);
180
152
  else
181
- theta = static_cast<unsigned long long>(thresh);
153
+ theta = thresh;
182
154
 
183
155
  int real_L = 0;
184
- item_dic.assign(L, -1);
185
- vector<bool> item_in(L, false);
156
+ item_dic = vector<int>(L, -1);
157
+ vector<bool> item_in(L, 0);
186
158
  for (int i = 0; i < (int)L; ++i) {
187
159
  if (freq[i] >= theta) {
188
160
  item_dic[i] = ++real_L;
189
- item_in[i] = true;
161
+ item_in[i] = 1;
190
162
  }
191
163
  }
192
164
 
193
- if (b_disp)
194
- cout << "Original number of items: " << L << " Reduced to: " << real_L << endl;
165
+ // ❌ COMMENTED: extra stats
166
+ // cout << "Original number of items: " << L
167
+ // << " Reduced to: " << real_L << endl;
195
168
 
196
169
  unsigned long long int LpM = 1;
197
170
  mlim = M;
198
171
  int orgmlim = 0;
199
172
  int ulim = min(1 + real_L / 4, 10);
200
173
  unsigned long long int ml;
201
-
174
+ int coef = 1 + 1 * itmset_exists;
202
175
  for (int i = 0; i + ulim < (int)MN.size() && i + ulim < (int)M; ++i) {
203
176
  ml = 0;
204
177
  for (int j = 0; j < (int)L; ++j) {
205
178
  if (ML[i][j] && item_in[j])
206
179
  ++ml;
207
180
  }
208
- LpM *= ml * (1 + itmset_exists);
209
-
210
- if (b_disp)
211
- cout << ml << " " << LpM << " " << MN[i] << endl;
212
-
181
+ LpM *= ml * coef;
182
+ // cout << ml << " " << LpM << " " << MN[i] << endl;
213
183
  if (LpM * ulim > MN[i]) {
214
184
  orgmlim = i;
215
185
  while (i + ulim - 1 < (int)MN.size() && i + ulim - 1 < (int)M) {
216
- if (b_disp)
217
- cout << (MN[i - 1] - MN[i + ulim - 1]) << " "
218
- << MN[i + ulim - 1] << endl;
219
-
220
- if ((MN[i - 1] - MN[i + ulim - 1]) < MN[i + ulim - 1]
221
- && MN[i + ulim - 1] < 600000000) {
186
+ // cout << MN[i - 1] - MN[i + ulim - 1]
187
+ // << " " << MN[i + ulim - 1] << endl;
188
+ if (MN[i - 1] - MN[i + ulim - 1] < MN[i + ulim - 1] &&
189
+ MN[i + ulim - 1] < 600000000) {
222
190
  mlim = i - 1;
223
191
  break;
224
192
  }
225
- ++i;
193
+ i += 1;
226
194
  }
227
195
  break;
228
196
  }
229
197
  }
230
198
 
231
- if (b_disp)
232
- cout << "M is: " << M << " Mlim is: " << mlim
233
- << " ulim is: " << ulim
234
- << " original mlim is: " << orgmlim
235
- << " guess is: "
236
- << round((log(N) - log(6)) / log(real_L)) << endl;
199
+ // cout << "M is: " << M << " Mlim is: " << mlim
200
+ // << " ulim is: " << ulim
201
+ // << " original mlim is: " << orgmlim
202
+ // << " guess is: " << round((log(N) - log(6)) / log(real_L)) << endl;
237
203
 
238
- if (mlim < (int)M) {
204
+ if (mlim < M) {
239
205
  for (int i = 0; i < real_L; ++i)
240
206
  VDFS.emplace_back(i);
241
207
  }
242
208
 
243
- L = static_cast<unsigned int>(real_L);
209
+ L = real_L;
244
210
  N = 0;
245
211
  M = 0;
246
- return true;
212
+
213
+ return 1;
247
214
  }
248
215
 
216
+ void Load_items_pre(string &inst_name) {
249
217
 
250
- // ─────────────────────────────────────────────────────────────────────────────
251
- // Load_items_pre
252
- // ─────────────────────────────────────────────────────────────────────────────
253
- bool Load_items_pre(string &inst_name) {
254
218
  ifstream file(inst_name);
255
- if (!file.good()) {
256
- if (b_disp)
257
- cout << "!!!!!! No such file exists: " << inst_name << " !!!!!!\n";
258
- return false;
259
- }
219
+ if (!file.good())
220
+ return;
260
221
 
261
222
  string line;
262
223
  int ditem;
@@ -265,11 +226,11 @@ bool Load_items_pre(string &inst_name) {
265
226
  string itm;
266
227
  vector<int> temp_vec;
267
228
  vector<int> temp_lim;
268
- bool sgn = false;
269
-
229
+ bool sgn = 0;
270
230
  while (word >> itm) {
271
231
  ditem = stoi(itm);
272
- if (item_dic[std::abs(ditem) - 1] == -1) {
232
+
233
+ if (item_dic[abs(ditem) - 1] == -1) {
273
234
  if (!sgn)
274
235
  sgn = (ditem < 0);
275
236
  continue;
@@ -279,97 +240,81 @@ bool Load_items_pre(string &inst_name) {
279
240
  else
280
241
  ditem = -item_dic[-ditem - 1];
281
242
  }
243
+
282
244
  if (sgn) {
283
245
  if (ditem > 0)
284
246
  ditem = -ditem;
285
- sgn = false;
247
+ sgn = 0;
286
248
  }
287
- if (temp_vec.size() <= (size_t)mlim)
249
+
250
+ if (temp_vec.size() <= mlim)
288
251
  temp_vec.push_back(ditem);
289
252
  else
290
253
  temp_lim.push_back(ditem);
254
+
255
+ ++E;
291
256
  }
292
257
 
293
258
  if (temp_vec.empty())
294
259
  continue;
295
260
 
296
261
  ++N;
297
- if (b_disp && N % 10000000 == 0)
298
- cout << N << endl;
299
-
300
- if (temp_vec.size() + temp_lim.size() > (size_t)M)
301
- M = static_cast<unsigned int>(temp_vec.size() + temp_lim.size());
262
+ // if (N % 10000000 == 0) cout << N << endl;
302
263
 
303
- while (DFS.size() < L)
304
- DFS.emplace_back(-static_cast<int>(DFS.size()) - 1);
305
- while (VDFS.size() < L)
306
- VDFS.emplace_back(static_cast<int>(VDFS.size()));
264
+ if (temp_vec.size() + temp_lim.size() > M)
265
+ M = temp_vec.size() + temp_lim.size();
307
266
 
308
267
  Build_MDD(temp_vec, temp_lim);
309
268
  }
310
-
311
- return true;
312
269
  }
313
270
 
314
-
315
- // ─────────────────────────────────────────────────────────────────────────────
316
- // Load_items (no preprocess)
317
- // ─────────────────────────────────────────────────────────────────────────────
318
271
  bool Load_items(string &inst_name) {
272
+
319
273
  ifstream file(inst_name);
320
274
  if (!file.good()) {
321
- if (b_disp)
322
- cout << "!!!!!! No such file exists: " << inst_name << " !!!!!!\n";
323
- return false;
275
+ // cout << "!!!!!! No such file exists: " << inst_name << " !!!!!!\n";
276
+ return 0;
324
277
  }
325
278
 
326
279
  string line;
327
280
  int ditem;
328
281
  while (getline(file, line) && give_time(clock() - start_time) < time_limit) {
329
282
  ++N;
330
- if (b_disp && N % 1000000 == 0)
331
- cout << "Found " << N << " sequence, with max line len "
332
- << M << ", and " << L << " items, and " << E
333
- << " enteries\n";
283
+ // if (N % 1000000 == 0)
284
+ // cout << "Found " << N << " sequence, with max line len " << M
285
+ // << ", and " << L << " items, and " << E << " enteries\n";
334
286
 
335
287
  istringstream word(line);
336
288
  string itm;
337
289
  vector<int> temp_vec;
338
290
  vector<int> temp_lim;
339
-
340
291
  while (word >> itm) {
341
292
  ditem = stoi(itm);
342
-
343
293
  if (ditem > 0)
344
- itmset_exists = true;
345
-
346
- if (L < static_cast<unsigned int>(std::abs(ditem))) {
347
- L = static_cast<unsigned int>(std::abs(ditem));
348
-
349
- while (DFS.size() < L)
350
- DFS.emplace_back(-static_cast<int>(DFS.size()) - 1);
351
- while (VDFS.size() < L)
352
- VDFS.emplace_back(static_cast<int>(VDFS.size()));
294
+ itmset_exists = 1;
295
+ if (L < (unsigned int)abs(ditem)) {
296
+ L = abs(ditem);
297
+ while (DFS.size() < L) {
298
+ DFS.reserve(L);
299
+ DFS.emplace_back(-DFS.size() - 1);
300
+ }
353
301
  }
354
302
 
355
- if (temp_vec.size() < (size_t)mlim)
303
+ if (temp_vec.size() < mlim)
356
304
  temp_vec.push_back(ditem);
357
305
  else
358
306
  temp_lim.push_back(ditem);
307
+
308
+ ++E;
359
309
  }
360
- E += static_cast<unsigned long long>(temp_vec.size() + temp_lim.size());
361
- if (temp_vec.size() + temp_lim.size() > (size_t)M)
362
- M = static_cast<unsigned int>(temp_vec.size() + temp_lim.size());
363
310
 
364
- while (DFS.size() < L)
365
- DFS.emplace_back(-static_cast<int>(DFS.size()) - 1);
366
- while (VDFS.size() < L)
367
- VDFS.emplace_back(static_cast<int>(VDFS.size()));
311
+ if (temp_vec.size() + temp_lim.size() > M)
312
+ M = temp_vec.size();
368
313
 
369
314
  Build_MDD(temp_vec, temp_lim);
370
315
  }
371
316
 
372
- return true;
317
+ return 1;
373
318
  }
374
319
 
375
320
  } // namespace largehm