effspm 0.3.0__cp312-cp312-macosx_10_9_x86_64.whl → 0.3.1__cp312-cp312-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
effspm/_effspm.cpp CHANGED
@@ -2,6 +2,553 @@
2
2
 
3
3
  #include <pybind11/pybind11.h>
4
4
  #include <pybind11/stl.h>
5
+
6
+ #include <iostream>
7
+ #include <fstream>
8
+ #include <cstdio> // std::remove
9
+ #include <vector>
10
+ #include <string>
11
+ #include <ctime>
12
+ #include <cmath>
13
+
14
+ namespace py = pybind11;
15
+
16
+ // PrefixProjection headers (global namespace)
17
+ #include "freq_miner.hpp"
18
+ #include "load_inst.hpp"
19
+ #include "utility.hpp"
20
+
21
+ // BTMiner (namespaced)
22
+ #include "btminer/src/freq_miner.hpp"
23
+ #include "btminer/src/load_inst.hpp"
24
+ #include "btminer/src/utility.hpp"
25
+ #include "btminer/src/build_mdd.hpp"
26
+
27
+ // HTMiner (namespaced)
28
+ #include "htminer/src/build_mdd.hpp"
29
+ #include "htminer/src/freq_miner.hpp"
30
+ #include "htminer/src/load_inst.hpp"
31
+ #include "htminer/src/utility.hpp"
32
+
33
+ // LargePrefixProjection
34
+ #include "largepp/src/freq_miner.hpp"
35
+ #include "largepp/src/load_inst.hpp"
36
+ #include "largepp/src/utility.hpp"
37
+
38
+ // LargeBTMiner
39
+ #include "largebm/src/freq_miner.hpp"
40
+ #include "largebm/src/load_inst.hpp"
41
+ #include "largebm/src/utility.hpp"
42
+ #include "largebm/src/build_mdd.hpp"
43
+
44
+ // LargeHTMiner
45
+ #include "largehm/src/freq_miner.hpp"
46
+ #include "largehm/src/load_inst.hpp"
47
+ #include "largehm/src/utility.hpp"
48
+ #include "largehm/src/build_mdd.hpp"
49
+
50
+ namespace {
51
+
52
// Holds the path of a scratch file and deletes that file when the holder is
// destroyed. An empty `path` means there is nothing to delete.
//
// Copying is disabled on purpose: two holders with the same path would make
// the first one destroyed delete the file while the other still expects it
// to exist. Declaring the copy operations also suppresses the implicit moves.
struct TempFile {
    std::string path;  // file removed by the destructor; empty = no file held

    TempFile() = default;
    TempFile(const TempFile&) = delete;
    TempFile& operator=(const TempFile&) = delete;

    ~TempFile() {
        if (!path.empty()) {
            // Best effort: the result of std::remove is ignored because a
            // destructor must not throw.
            std::remove(path.c_str());
        }
    }
};
61
+
62
// Write in-memory sequences to a newly named temporary text file and return
// its path. Format: one sequence per line, items separated by a single space;
// an empty sequence produces an empty line.
//
// The caller is responsible for deleting the returned file.
//
// Throws std::runtime_error when no temporary name can be generated, when the
// file cannot be opened, or when writing/closing fails (for example on a full
// disk). In the last case the partially written file is removed first, so a
// truncated database is never handed to a miner.
//
// NOTE(review): std::tmpnam only produces a name; another process could
// create the same path before it is opened here. It is kept because it is
// portable standard C++; consider an atomic platform-specific create if this
// ever runs in a hostile shared temp directory.
std::string write_temp_seq_file(const std::vector<std::vector<int>>& seqs) {
    char tmp_name[L_tmpnam];
    if (!std::tmpnam(tmp_name)) {
        throw std::runtime_error("Failed to create temporary file name");
    }
    std::string path = std::string(tmp_name) + ".txt";

    std::ofstream ofs(path);
    if (!ofs) {
        throw std::runtime_error("Failed to open temporary file for writing: " + path);
    }

    for (const auto& seq : seqs) {
        bool first = true;
        for (const int item : seq) {
            if (!first) ofs << ' ';
            ofs << item;
            first = false;
        }
        ofs << '\n';
    }

    // close() flushes the buffer; any earlier write error or a failed flush
    // leaves the stream in a failed state, which is checked once here.
    ofs.close();
    if (!ofs) {
        std::remove(path.c_str());
        throw std::runtime_error("Failed to write temporary file: " + path);
    }
    return path;
}
87
+
88
+ } // anonymous namespace
89
+
90
+
91
+ PYBIND11_MODULE(_effspm, m) {
92
+ m.doc() = "Unified SPM library: PrefixProjection, BTMiner, HTMiner, Large* variants";
93
+
94
+ // ─────────────────────────────────────────────────────────────
95
+ // PrefixProjection (works directly on Python lists or files)
96
+ // ─────────────────────────────────────────────────────────────
97
+ m.def("PrefixProjection",
98
+ [](py::object data,
99
+ double minsup,
100
+ unsigned int time_limit,
101
+ bool preproc,
102
+ bool use_dic,
103
+ bool verbose,
104
+ const std::string &out_file)
105
+ {
106
+ ::time_limit = time_limit;
107
+ ::pre_pro = preproc;
108
+ ::use_dic = use_dic;
109
+ ::use_list = false;
110
+ ::b_disp = verbose; // controls prints in original code
111
+ ::b_write = !out_file.empty();
112
+ ::out_file = out_file;
113
+
114
+ ClearCollected();
115
+ start_time = std::clock();
116
+
117
+ if (py::isinstance<py::str>(data)) {
118
+ std::string path = data.cast<std::string>();
119
+ if (!Load_instance(path, minsup))
120
+ throw std::runtime_error("PrefixProjection: failed to load file: " + path);
121
+ } else {
122
+ auto seqs = data.cast<std::vector<std::vector<int>>>();
123
+ items = std::move(seqs);
124
+ N = items.size();
125
+
126
+ int max_id = 0;
127
+ for (auto &seq : items)
128
+ for (int x : seq)
129
+ max_id = std::max(max_id, std::abs(x));
130
+ L = max_id;
131
+
132
+ theta = (minsup < 1.0) ? std::ceil(minsup * N) : minsup;
133
+
134
+ DFS.clear();
135
+ DFS.reserve(L);
136
+ for (unsigned int i = 0; i < L; ++i)
137
+ DFS.emplace_back(-static_cast<int>(i) - 1);
138
+
139
+ M = 0;
140
+ E = 0;
141
+ for (auto &seq : items) {
142
+ M = std::max<unsigned int>(M, seq.size());
143
+ E += seq.size();
144
+ }
145
+ }
146
+
147
+ Freq_miner();
148
+
149
+ py::dict out;
150
+ out["patterns"] = GetCollected();
151
+ out["time"] = give_time(std::clock() - start_time);
152
+ return out;
153
+ },
154
+ py::arg("data"),
155
+ py::arg("minsup") = 0.01,
156
+ py::arg("time_limit") = 36000,
157
+ py::arg("preproc") = false,
158
+ py::arg("use_dic") = false,
159
+ py::arg("verbose") = false,
160
+ py::arg("out_file") = ""
161
+ );
162
+
163
+ // ─────────────────────────────────────────────────────────────
164
+ // BTMiner (always uses professor's Load_instance)
165
+ // ─────────────────────────────────────────────────────────────
166
+ m.def("BTMiner",
167
+ [](py::object data,
168
+ double minsup,
169
+ unsigned int time_limit,
170
+ bool preproc,
171
+ bool use_dic,
172
+ bool verbose,
173
+ const std::string &out_file)
174
+ {
175
+ // Configure professor globals
176
+ btminer::time_limit = static_cast<int>(time_limit);
177
+ btminer::pre_pro = preproc;
178
+ btminer::use_dic = use_dic;
179
+ btminer::b_disp = verbose;
180
+ btminer::b_write = !out_file.empty();
181
+ btminer::out_file = out_file;
182
+ btminer::N_mult = 1;
183
+ btminer::M_mult = 1;
184
+ btminer::just_build = false;
185
+
186
+ btminer::ClearCollected();
187
+ btminer::start_time = std::clock();
188
+
189
+ TempFile tmp;
190
+ std::string path;
191
+
192
+ if (py::isinstance<py::str>(data)) {
193
+ // File path: use directly
194
+ path = data.cast<std::string>();
195
+ } else {
196
+ // Python list → write to a temp file in the same format
197
+ auto seqs = data.cast<std::vector<std::vector<int>>>();
198
+ tmp.path = write_temp_seq_file(seqs);
199
+ path = tmp.path;
200
+ }
201
+
202
+ if (verbose) {
203
+ std::cerr << "[BTMiner] path=" << path
204
+ << " minsup=" << minsup
205
+ << " preproc=" << preproc
206
+ << " use_dic=" << use_dic
207
+ << std::endl;
208
+ }
209
+
210
+ if (!btminer::Load_instance(path, minsup)) {
211
+ throw std::runtime_error("BTMiner: failed to load instance from: " + path);
212
+ }
213
+
214
+ btminer::Freq_miner();
215
+
216
+ py::dict out;
217
+ out["patterns"] = btminer::GetCollected();
218
+ out["num_patterns"] = btminer::num_patt;
219
+ out["time"] = btminer::give_time(std::clock() - btminer::start_time);
220
+ out["N"] = btminer::N;
221
+ out["L"] = btminer::L;
222
+ out["theta"] = btminer::theta;
223
+ return out;
224
+ },
225
+ py::arg("data"),
226
+ py::arg("minsup") = 0.01,
227
+ py::arg("time_limit") = 36000,
228
+ py::arg("preproc") = false,
229
+ py::arg("use_dic") = false,
230
+ py::arg("verbose") = false,
231
+ py::arg("out_file") = ""
232
+ );
233
+
234
+ // ─────────────────────────────────────────────────────────────
235
+ // HTMiner (works on files; we use a temp file for in-memory data)
236
+ // ─────────────────────────────────────────────────────────────
237
+ // ─────────────────────────────────────────────────────────────
238
+ // HTMiner (always uses professor's Load_instance; pre_pro forced ON)
239
+ // ─────────────────────────────────────────────────────────────
240
+ m.def("HTMiner",
241
+ [](py::object data,
242
+ double minsup,
243
+ unsigned int time_limit,
244
+ bool /*preproc*/, // Python arg is ignored internally
245
+ bool use_dic,
246
+ bool verbose,
247
+ const std::string &out_file)
248
+ {
249
+ using namespace htminer;
250
+
251
+ // ───────── Global parameter setup ─────────
252
+ htminer::time_limit = time_limit;
253
+
254
+ // IMPORTANT: always run with preprocessing ON,
255
+ // regardless of the Python `preproc` flag.
256
+ htminer::pre_pro = true;
257
+ htminer::use_dic = use_dic;
258
+ htminer::just_build = false;
259
+ htminer::b_disp = verbose;
260
+ htminer::b_write = !out_file.empty();
261
+ htminer::out_file = out_file;
262
+
263
+ // ───────── HARD RESET of HTMiner globals ─────────
264
+ htminer::ClearCollected();
265
+ htminer::Tree.clear();
266
+ htminer::VTree.clear();
267
+ htminer::CTree.clear();
268
+ htminer::DFS.clear();
269
+ htminer::VDFS.clear();
270
+ htminer::item_dic.clear();
271
+
272
+ htminer::M = 0;
273
+ htminer::N = 0;
274
+ htminer::L = 0;
275
+ htminer::E = 0;
276
+ htminer::theta = 0;
277
+ htminer::mlim = 0;
278
+ htminer::itmset_exists = false;
279
+
280
+ // NOTE: do NOT add a root arc here;
281
+ // htminer::Load_instance() already does Tree.emplace_back(0,0,0)
282
+ htminer::start_time = std::clock();
283
+
284
+ // ───────── Handle input (path or in-memory sequences) ─────────
285
+ TempFile tmp;
286
+ std::string path;
287
+
288
+ if (py::isinstance<py::str>(data)) {
289
+ // data is a file path
290
+ path = data.cast<std::string>();
291
+ } else {
292
+ // data is a list[list[int]] → write a temp file in the same text format
293
+ auto seqs = data.cast<std::vector<std::vector<int>>>();
294
+ tmp.path = write_temp_seq_file(seqs);
295
+ path = tmp.path;
296
+ }
297
+
298
+ if (verbose) {
299
+ std::cerr << "[HTMiner] path=" << path
300
+ << " minsup=" << minsup
301
+ << " preproc(always)=true"
302
+ << " use_dic=" << use_dic
303
+ << std::endl;
304
+ }
305
+
306
+ // ───────── Build MDD via professor's loader ─────────
307
+ if (!htminer::Load_instance(path, minsup)) {
308
+ throw std::runtime_error("HTMiner: failed to load instance from: " + path);
309
+ }
310
+
311
+ // ───────── Run miner ─────────
312
+ htminer::Freq_miner();
313
+
314
+ // ───────── Return results ─────────
315
+ py::dict out;
316
+ out["patterns"] = htminer::GetCollected();
317
+ out["time"] = htminer::give_time(std::clock() - htminer::start_time);
318
+ return out;
319
+ },
320
+ py::arg("data"),
321
+ py::arg("minsup") = 0.01,
322
+ py::arg("time_limit") = 36000,
323
+ py::arg("preproc") = false, // kept for API symmetry, but IGNORED
324
+ py::arg("use_dic") = false,
325
+ py::arg("verbose") = false,
326
+ py::arg("out_file") = ""
327
+ );
328
+
329
+ // ─────────────────────────────────────────────────────────────
330
+ // LargePrefixProjection (already has its own Load_py)
331
+ // ─────────────────────────────────────────────────────────────
332
+ m.def("LargePrefixProjection",
333
+ [](py::object data,
334
+ double minsup,
335
+ unsigned int time_limit,
336
+ bool preproc,
337
+ bool use_dic,
338
+ bool verbose,
339
+ const std::string &out_file)
340
+ {
341
+ largepp::time_limit = time_limit;
342
+ largepp::pre_pro = preproc;
343
+ largepp::use_dic = use_dic;
344
+ largepp::use_list = true; // large prefix uses list-based mining
345
+ largepp::b_disp = verbose;
346
+ largepp::b_write = !out_file.empty();
347
+ largepp::out_file = out_file;
348
+ largepp::just_build = false;
349
+
350
+ largepp::ClearCollected();
351
+ largepp::start_time = std::clock();
352
+
353
+ if (py::isinstance<py::str>(data)) {
354
+ std::string fname = data.cast<std::string>();
355
+ largepp::Load_instance(fname, minsup);
356
+ } else {
357
+ largepp::Load_py(data, minsup);
358
+ }
359
+
360
+ largepp::Freq_miner();
361
+
362
+ py::dict out;
363
+ out["patterns"] = largepp::GetCollected();
364
+ out["time"] = largepp::give_time(std::clock() - largepp::start_time);
365
+ return out;
366
+ },
367
+ py::arg("data"),
368
+ py::arg("minsup") = 0.01,
369
+ py::arg("time_limit") = 36000,
370
+ py::arg("preproc") = false,
371
+ py::arg("use_dic") = false,
372
+ py::arg("verbose") = false,
373
+ py::arg("out_file") = ""
374
+ );
375
+
376
+ // ─────────────────────────────────────────────────────────────
377
+ // LargeBTMiner (always uses professor's largebm::Load_instance)
378
+ // ─────────────────────────────────────────────────────────────
379
+ m.def("LargeBTMiner",
380
+ [](py::object data,
381
+ double minsup,
382
+ unsigned int time_limit,
383
+ bool preproc,
384
+ bool use_dic,
385
+ bool verbose,
386
+ const std::string &out_file)
387
+ {
388
+ using namespace largebm;
389
+
390
+ largebm::time_limit = time_limit;
391
+ largebm::pre_pro = preproc;
392
+ largebm::use_dic = use_dic;
393
+ largebm::use_list = false; // MDD-based
394
+ largebm::b_disp = verbose;
395
+ largebm::b_write = !out_file.empty();
396
+ largebm::out_file = out_file;
397
+ largebm::just_build = false;
398
+
399
+ largebm::ClearCollected();
400
+ largebm::items.clear();
401
+ largebm::item_dic.clear();
402
+ largebm::inv_item_dic.clear();
403
+ largebm::Tree.clear();
404
+ largebm::DFS.clear();
405
+
406
+ largebm::start_time = std::clock();
407
+
408
+ TempFile tmp;
409
+ std::string path;
410
+
411
+ if (py::isinstance<py::str>(data)) {
412
+ path = data.cast<std::string>();
413
+ } else {
414
+ auto seqs = data.cast<std::vector<std::vector<int>>>();
415
+ tmp.path = write_temp_seq_file(seqs);
416
+ path = tmp.path;
417
+ }
418
+
419
+ if (verbose) {
420
+ std::cerr << "[LargeBTMiner] path=" << path
421
+ << " minsup=" << minsup
422
+ << " preproc=" << preproc
423
+ << " use_dic=" << use_dic
424
+ << std::endl;
425
+ }
426
+
427
+ if (!largebm::Load_instance(path, minsup)) {
428
+ throw std::runtime_error("LargeBTMiner: failed to load instance from: " + path);
429
+ }
430
+
431
+ largebm::Freq_miner();
432
+
433
+ py::dict out;
434
+ out["patterns"] = largebm::GetCollected();
435
+ out["time"] = largebm::give_time(std::clock() - largebm::start_time);
436
+ return out;
437
+ },
438
+ py::arg("data"),
439
+ py::arg("minsup") = 0.01,
440
+ py::arg("time_limit") = 36000,
441
+ py::arg("preproc") = false,
442
+ py::arg("use_dic") = false,
443
+ py::arg("verbose") = false,
444
+ py::arg("out_file") = ""
445
+ );
446
+
447
+ // ─────────────────────────────────────────────────────────────
448
+ // LargeHTMiner (always uses professor's largehm::Load_instance; pre_pro forced ON)
449
+ // ─────────────────────────────────────────────────────────────
450
+ // ─────────────────────────────────────────────────────────────
451
+ // LargeHTMiner (professor's Large HTMiner, namespaced as largehm)
452
+ // ─────────────────────────────────────────────────────────────
453
+ m.def("LargeHTMiner",
454
+ [](py::object data,
455
+ double minsup,
456
+ unsigned int time_limit,
457
+ bool /*preproc*/, // kept for API symmetry; ignored
458
+ bool use_dic,
459
+ bool verbose,
460
+ const std::string &out_file)
461
+ {
462
+ using namespace largehm;
463
+
464
+ // 1) Global configuration (mirror professor's style)
465
+ largehm::time_limit = time_limit;
466
+ largehm::pre_pro = true; // always preprocess
467
+ largehm::use_dic = use_dic;
468
+ largehm::just_build = false;
469
+ largehm::b_disp = verbose;
470
+ largehm::b_write = !out_file.empty();
471
+ largehm::out_file = out_file;
472
+
473
+ // 2) HARD RESET of all global state for a fresh run
474
+ largehm::ClearCollected(); // our helper in largehm::utility.cpp
475
+
476
+ largehm::M = 0;
477
+ largehm::L = 0;
478
+ largehm::mlim = 0;
479
+ largehm::N = 0;
480
+ largehm::theta = 0;
481
+ largehm::E = 0;
482
+ largehm::itmset_exists = false;
483
+
484
+ // containers
485
+ // (item_dic reset is optional and not strictly needed here)
486
+ largehm::DFS.clear();
487
+ largehm::VDFS.clear();
488
+ largehm::Tree.clear();
489
+ largehm::VTree.clear();
490
+ largehm::CTree.clear();
491
+
492
+ largehm::start_time = std::clock();
493
+
494
+ // 3) Handle input (file path or Python list)
495
+ TempFile tmp;
496
+ std::string path;
497
+
498
+ if (py::isinstance<py::str>(data)) {
499
+ path = data.cast<std::string>();
500
+ } else {
501
+ auto seqs = data.cast<std::vector<std::vector<int>>>();
502
+ tmp.path = write_temp_seq_file(seqs);
503
+ path = tmp.path;
504
+ }
505
+
506
+ if (verbose) {
507
+ std::cerr << "[LargeHTMiner] path=" << path
508
+ << " minsup=" << minsup
509
+ << " preproc(always)=true"
510
+ << " use_dic=" << use_dic
511
+ << std::endl;
512
+ }
513
+
514
+ // 4) Build MDD / load instance.
515
+ // NOTE: Load_instance() itself does Tree.emplace_back(0,0,0),
516
+ // so we DO NOT create a root node here.
517
+ if (!largehm::Load_instance(path, minsup)) {
518
+ throw std::runtime_error("LargeHTMiner: failed to load instance from: " + path);
519
+ }
520
+
521
+ // 5) Run miner (same timing logic as original main)
522
+ if (!largehm::just_build &&
523
+ largehm::give_time(std::clock() - largehm::start_time) < largehm::time_limit)
524
+ {
525
+ largehm::Freq_miner();
526
+ if (largehm::give_time(std::clock() - largehm::start_time) >= largehm::time_limit) {
527
+ std::cout << "TIME LIMIT REACHED\n";
528
+ }
529
+ }
530
+
531
+ // 6) Return collected patterns + runtime
532
+ py::dict out;
533
+ out["patterns"] = largehm::GetCollected();
534
+ out["time"] = largehm::give_time(std::clock() - largehm::start_time);
535
+ return out;
536
+ },
537
+ py::arg("data"),
538
+ py::arg("minsup") = 0.01,
539
+ py::arg("time_limit") = 36000,
540
+ py::arg("preproc") = false, // kept for API symmetry
541
+ py::arg("use_dic") = false,
542
+ py::arg("verbose") = false,
543
+ py::arg("out_file") = ""
544
+ );
545
+
546
+
547
+ }
548
+
549
+
550
+ /*#include <pybind11/pybind11.h>
551
+ #include <pybind11/stl.h>
5
552
  namespace py = pybind11;
6
553
  #include <iostream>
7
554
 
@@ -28,6 +575,7 @@ namespace py = pybind11;
28
575
  #include "largepp/src/load_inst.hpp"
29
576
  #include "largepp/src/utility.hpp"
30
577
 
578
+
31
579
  #include "largebm/src/freq_miner.hpp"
32
580
  #include "largebm/src/load_inst.hpp"
33
581
  #include "largebm/src/utility.hpp"
@@ -469,7 +1017,7 @@ m.def("HTMiner",
469
1017
  // ─────────────────────────────────────────────────────────────────────────
470
1018
  // LargeBTMiner (MDD-based)
471
1019
  // ─────────────────────────────────────────────────────────────────────────
472
- m.def("LargeBTMiner",
1020
+ /*m.def("LargeBTMiner",
473
1021
  [](py::object data,
474
1022
  double minsup,
475
1023
  unsigned int time_limit,
@@ -676,4 +1224,4 @@ m.def("LargeHTMiner",
676
1224
 
677
1225
 
678
1226
 
679
- }
1227
+ } */
Binary file
@@ -67,8 +67,8 @@ bool Load_instance(string &items_file, double thresh) {
67
67
  if (pre_pro) {
68
68
  if (!Preprocess(items_file, thresh))
69
69
  return false;
70
-
71
- cout << "\nPreprocess done in " << give_time(clock() - kk) << " seconds\n\n";
70
+ if (b_disp)
71
+ cout << "\nPreprocess done in " << give_time(clock() - kk) << " seconds\n\n";
72
72
 
73
73
  // build empty DFS of size L
74
74
  DFS.clear();
@@ -88,12 +88,13 @@ bool Load_instance(string &items_file, double thresh) {
88
88
  else
89
89
  theta = static_cast<int>(thresh);
90
90
  }
91
-
92
- cout << "\nMDD Database built in " << give_time(clock() - kk) << " seconds\n\n";
93
- cout << "Found " << N * N_mult
94
- << " sequence, with max line len " << M
95
- << ", and " << L << " items, and " << E << " enteries\n";
96
- cout << "Total MDD nodes: " << Tree.size() << endl;
91
+ if (b_disp)
92
+ cout << "\nMDD Database built in " << give_time(clock() - kk) << " seconds\n\n";
93
+ if (b_disp)
94
+ cout << "Found " << N * N_mult
95
+ << " sequence, with max line len " << M
96
+ << ", and " << L << " items, and " << E << " enteries\n";
97
+ //cout << "Total MDD nodes: " << Tree.size() << endl;
97
98
 
98
99
  return true;
99
100
  }
@@ -147,8 +148,8 @@ bool Preprocess(string &inst, double thresh) {
147
148
  if (freq[i] >= theta)
148
149
  item_dic[i] = ++real_L;
149
150
  }
150
-
151
- cout << "Original number of items: " << L
151
+ if (b_disp)
152
+ cout << "Original number of items: " << L
152
153
  << " Reduced to: " << real_L << endl;
153
154
 
154
155
  L = real_L;