StrIdx 0.1.3 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/stridx.hpp CHANGED
@@ -10,6 +10,7 @@
10
10
  #include <array>
11
11
  #include <iostream>
12
12
  #include <unordered_map>
13
+ #include <map>
13
14
  #include <set>
14
15
  #include <algorithm>
15
16
  #include <sstream>
@@ -28,21 +29,22 @@ namespace StrIdx {
28
29
  class Output {
29
30
  private:
30
31
  int verboseLevel;
32
+ // TODO: add mutex?
31
33
 
32
34
  public:
33
35
  Output(int verb) : verboseLevel(verb) {}
34
- Output() : Output(3) {}
36
+ Output() : Output(1) {}
35
37
  ~Output() = default;
36
- void print() {}
38
+ static void print() {}
37
39
 
38
40
  // When calling as print("xxx ",3, " yyy") outputs "xxx 3 yyy"
39
- template <typename T, typename... Types> void print(T var1, Types... var2) {
41
+ template <typename T, typename... Types> static void print(T var1, Types... var2) {
40
42
  std::cout << var1;
41
43
  print(var2...);
42
44
  }
43
45
 
44
46
  // When calling as printl("xxx ",3, " yyy") outputs "xxx 3 yyy\n"
45
- template <typename... Types> void printl(Types... var2) {
47
+ template <typename... Types> static void printl(Types... var2) {
46
48
  print(var2...);
47
49
  print("\n");
48
50
  }
@@ -61,86 +63,259 @@ public:
61
63
  }
62
64
  };
63
65
 
66
+ Output out{1};
67
+
68
+ struct CharNode {
69
+ int *ids;
70
+ int ids_sz;
71
+ char c;
72
+ std::uint8_t size;
73
+ CharNode *children;
74
+ CharNode() : ids(nullptr), ids_sz(0), c(0), size(0), children(nullptr) {}
75
+
76
+ void init() {
77
+ ids = nullptr;
78
+ ids_sz = 0;
79
+ c = 0;
80
+ size = 0;
81
+ children = nullptr;
82
+ }
83
+
84
+ void dealloc() {
85
+ if (children != nullptr) {
86
+ for (CharNode *it = children; it != children + size; it++) {
87
+ it->dealloc();
88
+ }
89
+ free(children);
90
+ }
91
+ delete[] ids;
92
+ }
93
+
94
+ ~CharNode() {}
95
+
96
+ // Gets the IDs stored in this node and in all of its child nodes combined
97
+ std::set<int> getIds() {
98
+ std::set<int> set;
99
+ getIds(set);
100
+ return set;
101
+ }
102
+
103
+ void getIds(std::set<int> &set) {
104
+ for (int j = 0; j < ids_sz; j++) {
105
+ set.insert(ids[j]);
106
+ }
107
+ for (CharNode *it = children; it != children + size; it++) {
108
+ it->getIds(set);
109
+ }
110
+ }
111
+
112
+ // Find if character 'c' is included in children of the node
113
+ CharNode *find(char c) {
114
+ CharNode *ret = nullptr;
115
+ if (size > 0) {
116
+ for (auto it = children; it != children + size; it++) {
117
+ if (it->c == c) {
118
+ ret = it;
119
+ break;
120
+ }
121
+ }
122
+ }
123
+ return ret;
124
+ }
125
+ };
126
+
127
+ /* Tree type data structure consisting of strings of file path segments
128
+ * (somewhat like a trie)
129
+ * For example, adding one input string "abracadabr4" will add the following (size 2..8 char)
130
+ * substrings: abracada bracadab racadabr acadabr4 dabr4 abr4 br4 ra
131
+ * (CharTree::addStr called for each separately)
132
+ *
133
+ * Which forms a tree like structure:
134
+ * [root]-a-b-r-a-c-a-d-a
135
+ * | | ╰-4
136
+ * | ╰─c-a-d-a-b-r-4
137
+ * ╰───b-r-a-c-a-d-a-b
138
+ * | ╰─4
139
+ * ╰───r-a-c-a-d-a-b-r
140
+ * ╰───d-a-b-r-4
141
+ *
142
+ * IDs pointing to path segments are stored in nodes that match the end of the inserted substring
143
+ *
144
+ * This data structure (CharTree/CharNode) is the main bottleneck in terms of memory consumption.
145
+ * For a dataset of 84k files with 3.5 million characters there will be about 2.3 million CharNodes.
146
+ * Therefore, having std::vector's or similar structures with memory overhead is not really an
147
+ * option.
148
+ */
149
+ class CharTree {
150
+ Output out;
151
+ std::mutex mu;
152
+
153
+ public:
154
+ CharNode *root;
155
+
156
+ CharTree() { root = new CharNode; }
157
+ ~CharTree() {
158
+ root->dealloc();
159
+ delete root;
160
+ }
161
+
162
+ void addStr(std::string s, int id) {
163
+ if (s.size() < 2) {
164
+ return;
165
+ }
166
+
167
+ // out.printl("add str:",s);
168
+ CharNode *cn = root;
169
+
170
+ std::lock_guard<std::mutex> mu_lock(mu);
171
+
172
+ for (int i = 0; i < s.size() && i < 8; i++) {
173
+ int c = ((char)s[i]);
174
+ bool found = false;
175
+
176
+ if (cn->size > 0) {
177
+ // out.printl("(1) cn->size > 0");
178
+ for (auto it = cn->children; it != cn->children + cn->size; it++) {
179
+ if (it->c == c) {
180
+ // out.printl("{", c, "}");
181
+ found = true;
182
+ cn = it;
183
+ break;
184
+ }
185
+ }
186
+ }
187
+ if (!found) {
188
+ // auto x = new CharNode[cn->size + 1];
189
+ CharNode *x = (CharNode *)malloc(sizeof(CharNode) * (cn->size + 1));
190
+ if (cn->size > 0) {
191
+ memcpy(x, cn->children, sizeof(CharNode) * (cn->size));
192
+ free(cn->children);
193
+ }
194
+ cn->children = x;
195
+ CharNode *nn = &(cn->children[cn->size]);
196
+ nn->init();
197
+ nn->c = c;
198
+ cn->size++;
199
+ cn = nn;
200
+ }
201
+
202
+ if (i == s.size() - 1 && true) {
203
+ out.printv(4, "i=", i, "s:", s.size(), "|");
204
+ bool found = false;
205
+ if (cn->ids_sz > 0) {
206
+ for (int i = 0; i < cn->ids_sz; i++) {
207
+ if (cn->ids[i] == id) {
208
+ found = true;
209
+ out.printv(3, "found:", id, "\n");
210
+ }
211
+ }
212
+ }
213
+ if (!found) {
214
+ // out.print(".a.");
215
+ auto x = new int[cn->ids_sz + 1];
216
+ if (cn->ids_sz > 0) {
217
+ memcpy(x, cn->ids, sizeof(int) * cn->ids_sz);
218
+ delete[] cn->ids;
219
+ }
220
+ cn->ids = x;
221
+ cn->ids[cn->ids_sz] = id;
222
+ cn->ids_sz++;
223
+ out.printv(3, "sz:", cn->ids_sz, ",");
224
+ }
225
+ }
226
+
227
+ } // END for
228
+ }
229
+
230
+ void debug() { debug("", root); }
231
+ void debug(std::string trail, CharNode *cn) {
232
+
233
+ // if (trail.size() > 6) {
234
+ // out.print("\n");
235
+ // return;
236
+ // }
237
+
238
+ if (cn == nullptr) {
239
+ return;
240
+ }
241
+ for (int i = 0; i < cn->size; i++) {
242
+ CharNode *child = &cn->children[i];
243
+ out.print("[", child->ids_sz, "]");
244
+ if (child->size > 0) {
245
+ debug(trail + child->c, child);
246
+ } else {
247
+ out.printl(trail, child->c);
248
+ // out.printl();
249
+ }
250
+ }
251
+ }
252
+ };
253
+
64
254
  // Transforms input string as follows:
65
255
  // '/foo/bar/file1.txt'
66
- // => vector{"foo", "bar", "file1.txt"}
67
- std::vector<std::string> splitString(const std::string &input, const char &separator) {
256
+ // => vector{"/foo", "/bar", "/file1.txt"}
257
+
258
+ std::vector<std::string> splitString(const std::string &str, char delimiter) {
68
259
  std::vector<std::string> result;
69
- std::stringstream ss(input);
70
- std::string item;
260
+ std::string part;
71
261
 
72
- while (std::getline(ss, item, separator)) {
73
- if (item.size() > 0) {
74
- result.push_back(item);
262
+ for (char ch : str) {
263
+ if (ch == delimiter) {
264
+ if (part.size() > 0) {
265
+ result.push_back(part);
266
+ }
267
+ part.clear(); // Start a new part
268
+ part += ch;
269
+ } else {
270
+ part += ch;
75
271
  }
76
272
  }
77
273
 
78
- return result;
79
- }
80
-
81
- // Convert int64_t to binary string
82
- [[nodiscard]] std::string int64ToBinaryString(int64_t num) {
83
- std::string result;
84
- for (int i = 63; i >= 0; --i) {
85
- result += ((num >> i) & 1) ? '1' : '0';
274
+ // If there's any remaining part after the loop, add it to the result
275
+ if (!part.empty()) {
276
+ result.push_back(part);
86
277
  }
87
- return result;
88
- }
89
278
 
90
- // Debug. Convert a (8 char) string represented as int64_t to std::string
91
- [[nodiscard]] std::string int64ToStr(int64_t key) {
92
- int nchars = 8;
93
- std::string str;
94
- int multip = nchars * 8;
95
- for (int i = 0; i <= nchars; i++) {
96
- char c = (key >> multip) & 255;
97
- str.push_back(c);
98
- multip -= 8;
99
- }
100
- return str;
279
+ // for (const auto &value : result) {
280
+ // std::cout << value << "|";
281
+ // }
282
+ // std::cout << std::endl;
283
+
284
+ return result;
101
285
  }
102
286
 
103
287
  // Debug
104
- void printVector(const std::vector<int> &vec) {
288
+ void printVector(const std::vector<float> &vec) {
105
289
  for (const auto &value : vec) {
106
290
  std::cout << value << " ";
107
291
  }
108
292
  }
109
293
 
110
- // Debug
111
- [[nodiscard]] std::string charToBinaryString(char chr) {
112
- std::string result;
113
- for (int i = 7; i >= 0; --i) {
114
- result += ((chr >> i) & 1) ? '1' : '0';
115
- }
116
- return result;
117
- }
118
-
119
294
  class Candidate;
120
295
  enum class segmentType { Dir, File };
121
296
 
122
297
  // A segment of a file path
123
298
  // e.g. if path is /foo/bar/baz.txt
124
299
  // segments are [{root}, foo, bar, baz.txt]
125
- class PathSegment {
126
- public:
300
+ struct PathSegment {
127
301
  std::string str;
128
302
  int fileId; // (if FILE)
129
303
  Candidate *cand;
130
304
  PathSegment *parent;
131
305
  std::mutex mu;
132
- ankerl::unordered_dense::map<std::string, PathSegment *> children;
306
+ std::map<std::string, PathSegment *> children;
307
+
133
308
  segmentType type = segmentType::Dir;
134
309
  PathSegment() : parent(nullptr) {}
135
310
  PathSegment(std::string _str) : str(_str), parent(nullptr) {}
136
311
  PathSegment(std::string _str, int _fileId)
137
312
  : str(_str), fileId(_fileId), cand(nullptr), parent(nullptr) {}
138
- [[nodiscard]] int size() {
313
+ [[nodiscard]] int size() const {
139
314
  int sz = str.size();
140
315
  PathSegment *cur = parent;
141
- // Sum up length of parent segments (+1 for divisors)
316
+ // Sum up length of parent segments
142
317
  while (cur->parent != nullptr) {
143
- sz += cur->str.size() + 1;
318
+ sz += cur->str.size();
144
319
  cur = cur->parent;
145
320
  }
146
321
  return sz;
@@ -148,8 +323,7 @@ public:
148
323
  };
149
324
 
150
325
  // Candidate for result in string (filename) search
151
- class Candidate {
152
- public:
326
+ struct Candidate {
153
327
  std::vector<float> v_charscore;
154
328
  PathSegment *seg;
155
329
  int fileId;
@@ -161,26 +335,19 @@ public:
161
335
  float maxscore;
162
336
  int candLen; // Length of candidate
163
337
 
338
+ ~Candidate(){};
164
339
  Candidate(){};
165
- Candidate(int _fileId, std::string _str, int _len) : fileId(_fileId), str(_str), len(_len) {
166
- // Initialize v_charscores with zeros
167
- v_charscore.resize(len, 0);
168
- candLen = str.size();
169
- seg = nullptr;
170
- }
171
-
172
340
  Candidate(PathSegment *_seg, int _len) : seg(_seg), len(_len) {
173
341
  // Initialize v_charscores with zeros
174
342
  v_charscore.resize(len, 0);
175
343
  candLen = seg->size();
176
344
  }
177
345
 
178
- [[nodiscard]] float getScore() {
346
+ [[nodiscard]] float getScore() const {
179
347
  int i = 0;
180
348
  float score = 0.0;
181
- candLen = seg->size();
182
349
 
183
- for (float &charscore : v_charscore) {
350
+ for (const float &charscore : v_charscore) {
184
351
  score += charscore;
185
352
  i++;
186
353
  }
@@ -188,20 +355,17 @@ public:
188
355
  float div2 = len * candLen;
189
356
  float score1 = score / div;
190
357
  float score2 = score / div2;
358
+ // out.printl("str:",seg->str," len:",len," candLen:", candLen, " score:", score);
191
359
 
192
360
  score = score1 * 0.97 + score2 * 0.03;
193
361
  return score;
194
362
  }
195
363
 
196
- [[nodiscard]] float operator[](int idx) { return v_charscore[idx]; }
364
+ [[nodiscard]] float operator[](int idx) const { return v_charscore[idx]; }
197
365
  };
198
366
 
199
- // This seems to give 10x speed improvement over std::unordered_map
200
- typedef ankerl::unordered_dense::map<int64_t, std::set<PathSegment *> *> SegMap;
201
- // typedef std::unordered_map<int64_t, std::set<PathSegment *> *> SegMap;
202
-
203
367
  typedef ankerl::unordered_dense::map<int, Candidate *> CandMap;
204
- // typedef std::unordered_map<int, Candidate*> CandMap;
368
+ // typedef std::unordered_map<int, Candidate *> CandMap;
205
369
 
206
370
  class StringIndex {
207
371
  private:
@@ -209,34 +373,28 @@ private:
209
373
  char dirSeparator = '/'; // Usually '/', '\' or '\0' (no separator)
210
374
  int numStrings = 0;
211
375
 
212
- std::vector<SegMap *> dirmaps;
213
- std::vector<SegMap *> filemaps;
214
-
215
376
  std::vector<PathSegment *> segsToClean;
216
377
 
217
- std::unordered_map<int, std::string> strlist;
218
378
  std::unordered_map<int, PathSegment *> seglist;
379
+ std::unordered_map<int, PathSegment *> seglist_dir;
380
+ std::mutex seglist_mu;
381
+
219
382
  PathSegment *root;
220
383
  int dirId = 0;
221
384
  float dirWeight = 0.7; // Give only 70% of score if match is for a directory
222
385
 
223
- std::array<std::mutex, 9> mts_f;
224
- std::array<std::mutex, 9> mts_d;
225
-
226
386
  std::unique_ptr<ThreadPool> pool;
227
387
  Output out{1}; // verbose level = 1
388
+ std::mutex cm_mu;
228
389
 
229
390
  public:
391
+ CharTree cm; // for files
392
+ CharTree cm_dir; // for directories
230
393
  StringIndex(char sep) : dirSeparator(sep) {
231
394
  root = new PathSegment();
232
395
  root->parent = nullptr;
233
396
  root->str = "[ROOT]";
234
397
 
235
- for (int i = 0; i <= 8; i++) {
236
- dirmaps.push_back(new SegMap);
237
- filemaps.push_back(new SegMap);
238
- }
239
-
240
398
  // Threads between 4 and 6
241
399
  // We don't seem to get any benefit from more than 6 threads even if the hardware supports it
242
400
  int num_threads = std::max((int)std::thread::hardware_concurrency(), 4);
@@ -245,43 +403,31 @@ public:
245
403
  pool = std::unique_ptr<ThreadPool>(new ThreadPool(num_threads));
246
404
  }
247
405
 
248
- /* Don't separate path to segments separator=\0.
406
+ /* Don't separate path to segments when separator=\0.
249
407
  This is slower, but can be used for other data than files also. */
250
408
  StringIndex() : StringIndex('\0') {}
251
409
 
252
410
  void setDirSeparator(char sep) { dirSeparator = sep; }
253
411
  void setDirWeight(float val) { dirWeight = val; }
254
412
 
255
- ~StringIndex() {
256
- for (auto x : dirmaps) {
257
- for (auto y : *x) {
258
- y.second->clear();
259
- delete (y.second);
260
- }
261
- x->clear();
262
- delete x;
263
- }
264
- for (auto x : filemaps) {
265
- for (auto y : *x) {
266
- y.second->clear();
267
- delete (y.second);
268
- }
269
- x->clear();
270
- delete x;
271
- }
272
- clearPathSegmentChildren(root);
273
- }
413
+ ~StringIndex() { clearPathSegmentChildren(root); }
274
414
 
275
415
  void addStrToIndex(std::string filePath, int fileId) {
276
416
  addStrToIndex(filePath, fileId, dirSeparator);
277
417
  }
278
418
 
279
419
  void addStrToIndexThreaded(std::string filePath, int fileId) {
280
- pool->enqueue([=] { addStrToIndex(filePath, fileId, dirSeparator); });
420
+ pool->enqueue([filePath, fileId, this] { addStrToIndex(filePath, fileId, dirSeparator); });
421
+ // addStrToIndex(filePath, fileId, dirSeparator);
281
422
  }
282
- void waitUntilReady() { pool->waitUntilDone(); }
423
+ void waitUntilReady() const { pool->waitUntilDone(); }
283
424
 
284
- void waitUntilDone() { pool->waitUntilDone(); }
425
+ void waitUntilDone() const { pool->waitUntilDone(); }
426
+
427
+ int size() {
428
+ std::lock_guard<std::mutex> guard(seglist_mu);
429
+ return seglist.size();
430
+ }
285
431
 
286
432
  /**
287
433
  * Add a string to the index to be searched for afterwards
@@ -291,8 +437,21 @@ public:
291
437
  * @param separator Can be used to split filePath to components (e.g. 'home','user'...). Usually
292
438
  * one of {'\\', '/', '\0' (no separation)}.
293
439
  */
440
+
294
441
  void addStrToIndex(std::string filePath, int fileId, const char &separator) {
295
- out.printv(3, "Add file:", filePath, ",", fileId, ",", separator);
442
+
443
+ std::lock_guard<std::mutex> guard(cm_mu);
444
+
445
+ out.printv(3, "Add file:", filePath, ",", fileId, ",", separator, ",", dirSeparator);
446
+
447
+ // If a string with this id has already been added, skip it
448
+ {
449
+ std::lock_guard<std::mutex> guard(seglist_mu);
450
+ if (seglist.find(fileId) != seglist.end()) {
451
+ out.printl("seglist.find(fileId) != seglist.end()");
452
+ return;
453
+ }
454
+ }
296
455
 
297
456
  std::vector<std::string> segs;
298
457
  numStrings += 1;
@@ -323,28 +482,77 @@ public:
323
482
  if (auto it = prev->children.find(x); it != prev->children.end()) {
324
483
  p = it->second;
325
484
  prev->mu.unlock();
326
- } else {
485
+ } else { // File or dir not included in tree yet
327
486
  p = new PathSegment(x, fileId);
328
487
  p->parent = prev;
329
488
  // If this is last item in segs, then it is a file.
330
489
  if (_x == std::prev(segs.end())) {
331
490
  p->type = segmentType::File;
332
- seglist[fileId] = p;
491
+ {
492
+ std::lock_guard<std::mutex> guard(seglist_mu);
493
+ seglist[fileId] = p;
494
+
495
+ for (int i = 0; i < x.size() + 1; i++) {
496
+ auto s = x.substr(i, std::min(static_cast<size_t>(8), x.size() - i));
497
+ cm.addStr(s, fileId);
498
+ }
499
+ }
333
500
  } else { // otherwise, it is a directory
334
501
  p->type = segmentType::Dir;
335
502
  p->fileId = dirId;
336
- // Files use user input Id. Directories need to have it generated
503
+ /* Add "/" to the end of the string so that
504
+ * /path/to/file will be indexed as:
505
+ * {"/path/", "/to/", "/file"}
506
+ */
507
+ auto dir_str = x + "/";
508
+
509
+ {
510
+ std::lock_guard<std::mutex> guard(seglist_mu);
511
+ seglist_dir[dirId] = p;
512
+ // Files use user input Id. Directories need to have it generated
513
+ }
514
+
515
+ // TODO: Create a function
516
+ for (int i = 0; i < dir_str.size() + 1; i++) {
517
+ auto s = dir_str.substr(i, std::min(static_cast<size_t>(8), dir_str.size() - i));
518
+ cm_dir.addStr(s, dirId);
519
+ }
520
+
337
521
  dirId++;
338
522
  }
339
523
  prev->children[x] = p;
340
524
  prev->mu.unlock();
341
- addPathSegmentKeys(p);
342
- }
525
+ } // END of first if
343
526
 
344
527
  prev = p;
345
528
  }
346
529
  }
347
530
 
531
+ std::string getString(int id) { return getString(id, false); }
532
+
533
+ // Reconstruct original filepath from segments
534
+ std::string getString(int id, bool isDir) {
535
+ std::string s = "";
536
+ std::lock_guard<std::mutex> guard(seglist_mu);
537
+
538
+ PathSegment *seg = nullptr;
539
+
540
+ if (isDir) {
541
+ seg = seglist_dir[id];
542
+ } else {
543
+ seg = seglist[id];
544
+ }
545
+ s += seg->str;
546
+ while (seg->parent->parent != nullptr) {
547
+ seg = seg->parent;
548
+ s = seg->str + s;
549
+ // out.print(seg, "(", seg->str, ")", ",");
550
+ }
551
+ // out.printl(s);
552
+
553
+ return s;
554
+ }
555
+
348
556
  /**
349
557
  The search will find filepaths similar to the input string
350
558
 
@@ -377,33 +585,46 @@ public:
377
585
  @param query String to search for inside the index
378
586
  */
379
587
 
380
- [[nodiscard]] std::vector<std::pair<float, int>> findSimilar(std::string query) {
381
- return findSimilar(query, 2);
382
- }
383
-
384
- [[nodiscard]] std::vector<std::pair<float, int>> findSimilar(std::string query, int minChars) {
385
- CandMap fileCandMap;
386
- CandMap dirCandMap;
387
-
388
- waitUntilDone();
389
-
390
- // Find both files and directories that match the input query
391
- addToCandMap(fileCandMap, query, filemaps);
392
- addToCandMap(dirCandMap, query, dirmaps);
393
-
394
- /* If parent dir of a file matches the input string add the scores of the direcotry to the
395
- scores of the file */
396
- mergeCandidateMaps(fileCandMap, dirCandMap);
397
-
398
- // Set all candidate pointers to nullptr so they won't mess up future searches
399
- for (auto seg : segsToClean) {
400
- seg->cand = nullptr;
588
+ void searchCharTree(const std::string &query, CandMap &candmap, CharTree &chartr) {
589
+
590
+ int last_start = query.size() - 2;
591
+ for (int start = 0; start <= last_start; start++) {
592
+ CharNode *cn = chartr.root;
593
+ int end = std::min(start + 7, ((int)query.size()) - 1);
594
+ int nchars = end - start + 1;
595
+ std::string s = query.substr(start, nchars);
596
+
597
+ for (int i = 0; i < s.size(); i++) {
598
+ char c = s[i];
599
+ CharNode *x = cn->find(c);
600
+ if (x != nullptr) {
601
+ cn = x;
602
+ // Consider scores only for substrings with size >= 2
603
+ if (i > 0) {
604
+ std::set<int> ids = cn->getIds();
605
+ for (const int &y : ids) {
606
+ PathSegment *p = nullptr;
607
+ if (&chartr == &cm) {
608
+ p = seglist[y];
609
+ } else {
610
+ p = seglist_dir[y];
611
+ }
612
+ assert(p != nullptr);
613
+ addToResults(p, query, start, i + 1, candmap);
614
+ }
615
+ }
616
+ } else {
617
+ // assert(cn->ids_sz < 1); // TODO: should not come here?
618
+ break;
619
+ }
620
+ }
401
621
  }
402
- segsToClean.clear();
622
+ }
403
623
 
624
+ std::vector<std::pair<float, int>> candidatesToVec(CandMap &candmap) {
404
625
  // Form return result, 2d array with file id's and scores
405
626
  std::vector<std::pair<float, int>> results;
406
- for (auto &[fid, cand] : fileCandMap) {
627
+ for (auto &[fid, cand] : candmap) {
407
628
  std::pair<float, int> v;
408
629
  float sc = cand->getScore();
409
630
  v.first = sc;
@@ -412,164 +633,134 @@ public:
412
633
  delete cand;
413
634
  }
414
635
 
415
- for (auto &[fid, cand] : dirCandMap) {
416
- delete cand;
417
- }
418
-
419
636
  // Sort highest score first
420
637
  std::sort(results.begin(), results.end(),
421
638
  [](std::pair<float, int> a, std::pair<float, int> b) { return a.first > b.first; });
422
639
  return results;
423
640
  }
424
641
 
425
- // Return int64_t representation of the first nchars in str, starting from index i
426
- [[nodiscard]] int64_t getKeyAtIdx(std::string str, int i, int nchars) {
427
- int64_t key = 0;
428
- for (int i_char = 0; i_char < nchars; i_char++) {
429
- key = key | static_cast<int64_t>(str[i + i_char]);
430
- if (i_char < nchars - 1) {
431
- // Shift 8 bits to the left except on the last iteration
432
- key = key << 8;
433
- }
434
- }
435
- return key;
436
- }
642
+ std::vector<std::pair<float, int>> findDirectories(std::string query) {
643
+ CandMap dirCandMap;
644
+ auto &candmap = dirCandMap;
645
+ waitUntilDone();
437
646
 
438
- void debug() {
647
+ searchCharTree(query, dirCandMap, cm_dir);
648
+ addParentScores(dirCandMap);
649
+ auto results = candidatesToVec(dirCandMap);
439
650
 
440
- int nchars = 3;
441
- for (const auto &[key, value] : (*filemaps[nchars])) {
442
- int64_t x;
443
- x = key;
444
- int multip = nchars * 8;
445
- for (int i = 0; i <= nchars; i++) {
446
- char c = (x >> multip) & 255;
447
- std::cout << c;
448
- multip -= 8;
449
- }
450
- std::cout << "\n";
451
- // for (auto y : *value) {
452
- // std::cout << y << " ";
453
- // }
454
- // std::cout << "\n";
455
- }
651
+ return results;
456
652
  }
457
653
 
458
- private:
459
- void clearPathSegmentChildren(PathSegment *p) {
460
- if (p->children.size() > 0) {
461
- for (auto x : p->children) {
462
- clearPathSegmentChildren(x.second);
463
- }
464
- }
465
- delete p;
654
+ std::vector<std::pair<float, std::string>> findFilesAndDirectories(std::string query) {
655
+ return findFilesAndDirectories(query, true, true);
466
656
  }
467
657
 
468
- void addPathSegmentKeys(PathSegment *p) {
469
- // Input p is part of a path, e.g. 'barxyz' if path is /foo/barxyz/baz.txt
470
- // This function generates int64 representations (keys) of all substrings of size 2..8 in that
471
- // path segment and stores pointer to p in hash tables using these int values as keys.
658
+ std::vector<std::pair<float, std::string>>
659
+ findFilesAndDirectories(std::string query, bool includeFiles, bool includeDirs) {
472
660
 
473
- int maxChars = 8;
474
- int minChars = 2;
661
+ CandMap fileCandMap;
662
+ CandMap dirCandMap;
663
+ waitUntilDone();
664
+ std::vector<std::pair<float, std::string>> results;
475
665
 
476
- std::string str = p->str;
477
- if (p->str.size() < 2) {
478
- return;
479
- }
480
- if (static_cast<int>(p->str.size()) < maxChars) {
481
- maxChars = p->str.size();
666
+ if (includeFiles) {
667
+ searchCharTree(query, fileCandMap, cm);
668
+ // out.printl("size:",fileCandMap.size());
482
669
  }
483
670
 
484
- for (int sublen = minChars; sublen <= maxChars; sublen++) {
671
+ searchCharTree(query, dirCandMap, cm_dir);
485
672
 
486
- std::mutex *mu;
487
- SegMap *map;
488
- if (p->type == segmentType::File) {
489
- map = filemaps[sublen];
490
- mu = &mts_f[sublen];
491
- } else {
492
- map = dirmaps[sublen];
493
- mu = &mts_d[sublen];
494
- }
673
+ if (includeFiles) {
674
+ addParentScores(fileCandMap);
675
+ }
495
676
 
496
- int count = str.size() - sublen + 1;
677
+ if (includeDirs) {
678
+ addParentScores(dirCandMap);
679
+ }
680
+
681
+ for (auto seg : segsToClean) {
682
+ seg->cand = nullptr;
683
+ }
684
+ segsToClean.clear();
497
685
 
498
- int64_t keys[count + 1];
499
- for (int i = 0; i <= count; i++) {
500
- keys[i] = getKeyAtIdx(str, i, sublen);
686
+ // TODO: Need to call this just to delete candidates
687
+ auto res_dir = candidatesToVec(dirCandMap);
688
+ if (includeDirs) {
689
+ for (const auto &[score, id] : res_dir) {
690
+ results.push_back(std::pair<float, std::string>{score, getString(id, true)});
501
691
  }
692
+ }
502
693
 
503
- mu->lock();
504
- for (int i = 0; i <= count; i++) {
505
- // int64_t key = getKeyAtIdx(str, i, sublen);
506
- auto key = keys[i];
694
+ if (includeFiles) {
695
+ auto res_file = candidatesToVec(fileCandMap);
696
+ // out.printl("size2:",fileCandMap.size());
697
+ for (const auto &[score, id] : res_file) {
507
698
 
508
- // Create a new std::set for key if doesn't exist already
509
- auto it = map->find(key);
510
- if (it == map->end()) {
511
- (*map)[key] = new std::set<PathSegment *>;
512
- }
513
- (*map)[key]->insert(p);
699
+ // out.print("|",getString(id),"|");
700
+ results.push_back(std::pair<float, std::string>{score, getString(id)});
514
701
  }
515
- mu->unlock();
516
702
  }
703
+
704
+ // Sort highest score first
705
+ std::sort(results.begin(), results.end(),
706
+ [](std::pair<float, std::string> a, std::pair<float, std::string> b) {
707
+ return a.first > b.first;
708
+ });
709
+ return results;
517
710
  }
518
711
 
519
- // Find pathsegments from <map> that include the substring of <str> which starts at index <i> and
520
- // is of length <nchars>.
521
- [[nodiscard]] std::vector<PathSegment *> findSimilarForNgram(std::string str, int i, int nchars,
522
- SegMap &map) {
712
+ // TODO: delete?
713
+ std::vector<std::pair<float, int>> findSimilar(std::string query) { return findFiles(query); }
523
714
 
524
- assert(i + nchars <= static_cast<int>(str.size()));
525
- std::vector<PathSegment *> res;
715
+ std::vector<std::pair<float, int>> findFiles(std::string query) {
526
716
 
527
- // Take substring of str, starting at i, spanning nchars
528
- // transform that to 64 bit integer
529
- int64_t key = getKeyAtIdx(str, i, nchars);
530
- // Find all path segments in map that have the same substring
531
- if (auto it = map.find(key); it != map.end()) { // key found
532
- auto set = it->second;
533
- for (auto value : *set) {
534
- res.push_back(value);
535
- }
536
- }
537
- return res;
538
- }
717
+ CandMap fileCandMap;
718
+ CandMap dirCandMap;
719
+ auto &candmap = fileCandMap;
720
+ waitUntilDone();
539
721
 
540
- void addToCandMap(CandMap &candmap, std::string query,
541
- std::vector<SegMap *> &map // filemaps or dirmaps
542
- ) {
543
- int maxChars = 8;
544
- int minChars = 2;
545
- if (static_cast<int>(query.size()) < maxChars) {
546
- maxChars = query.size();
722
+ searchCharTree(query, fileCandMap, cm);
723
+ searchCharTree(query, dirCandMap, cm_dir);
724
+ addParentScores(fileCandMap);
725
+
726
+ for (auto seg : segsToClean) {
727
+ seg->cand = nullptr;
547
728
  }
729
+ segsToClean.clear();
548
730
 
549
- // Loop all substring lengths between minChars..maxChars
550
- for (int sublen = minChars; sublen <= maxChars; sublen++) {
551
- int count = query.size() - sublen + 1;
731
+ auto results = candidatesToVec(fileCandMap);
732
+ auto tmp = candidatesToVec(dirCandMap); // TODO: call just to release memory
552
733
 
553
- // Loop all possible start positions
554
- for (int i = 0; i < count; i++) {
555
- std::vector<PathSegment *> res = findSimilarForNgram(query, i, sublen, *(map[sublen]));
734
+ return results;
735
+ }
556
736
 
557
- for (PathSegment *p : res) {
558
- addToResults(p, query, i, sublen, candmap);
559
- }
737
+ private:
738
+ void clearPathSegmentChildren(PathSegment *p) {
739
+ if (p->children.size() > 0) {
740
+ for (auto x : p->children) {
741
+ clearPathSegmentChildren(x.second);
560
742
  }
561
743
  }
744
+ delete p;
562
745
  }
563
746
 
564
747
  // Add parent directories scores to files
565
- void mergeCandidateMaps(CandMap &fileCandMap, CandMap &dirCandMap) {
748
+ void addParentScores(CandMap &fileCandMap) {
566
749
 
567
750
  for (auto &[fid, cand] : fileCandMap) {
568
751
  PathSegment *p = cand->seg->parent;
569
752
  while (p->parent != nullptr) {
570
753
  if (p->cand != nullptr) {
754
+
571
755
  auto &scoreA = cand->v_charscore;
572
756
  auto &scoreB = p->cand->v_charscore;
757
+
758
+ // out.print("[");
759
+ // printVector(scoreA);
760
+ // out.print(",");
761
+ // printVector(scoreB);
762
+ // out.print(",");
763
+ // out.print("]");
573
764
  for (int i = 0; i < cand->len; i++) {
574
765
  if (scoreA[i] < scoreB[i] * dirWeight) {
575
766
  scoreA[i] = scoreB[i] * dirWeight;
@@ -585,6 +776,7 @@ private:
585
776
 
586
777
  if (auto it2 = candmap.find(seg->fileId); it2 == candmap.end()) {
587
778
  Candidate *cand = new Candidate(seg, str.size());
779
+ // out.printl("new cand:", seg->str, ",", seg, ",", seg->parent, ",", seg->parent->parent);
588
780
  segsToClean.push_back(seg);
589
781
  candmap[seg->fileId] = cand;
590
782
  seg->cand = cand;