StrIdx 0.1.4 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/stridx.hpp CHANGED
@@ -10,6 +10,7 @@
10
10
  #include <array>
11
11
  #include <iostream>
12
12
  #include <unordered_map>
13
+ #include <map>
13
14
  #include <set>
14
15
  #include <algorithm>
15
16
  #include <sstream>
@@ -28,10 +29,11 @@ namespace StrIdx {
28
29
  class Output {
29
30
  private:
30
31
  int verboseLevel;
32
+ // TODO: add mutex?
31
33
 
32
34
  public:
33
35
  Output(int verb) : verboseLevel(verb) {}
34
- Output() : Output(3) {}
36
+ Output() : Output(1) {}
35
37
  ~Output() = default;
36
38
  static void print() {}
37
39
 
@@ -61,61 +63,234 @@ public:
61
63
  }
62
64
  };
63
65
 
66
+ Output out{1};
67
+
68
+ struct CharNode {
69
+ int *ids;
70
+ int ids_sz;
71
+ char c;
72
+ std::uint8_t size;
73
+ CharNode *children;
74
+ CharNode() : ids(nullptr), ids_sz(0), c(0), size(0), children(nullptr) {}
75
+
76
+ void init() {
77
+ ids = nullptr;
78
+ ids_sz = 0;
79
+ c = 0;
80
+ size = 0;
81
+ children = nullptr;
82
+ }
83
+
84
+ void dealloc() {
85
+ if (children != nullptr) {
86
+ for (CharNode *it = children; it != children + size; it++) {
87
+ it->dealloc();
88
+ }
89
+ free(children);
90
+ }
91
+ delete[] ids;
92
+ }
93
+
94
+ ~CharNode() {}
95
+
96
+ // Gets Id's stored in this node and all child nodes combined
97
+ std::set<int> getIds() {
98
+ std::set<int> set;
99
+ getIds(set);
100
+ return set;
101
+ }
102
+
103
+ void getIds(std::set<int> &set) {
104
+ for (int j = 0; j < ids_sz; j++) {
105
+ set.insert(ids[j]);
106
+ }
107
+ for (CharNode *it = children; it != children + size; it++) {
108
+ it->getIds(set);
109
+ }
110
+ }
111
+
112
+ // Find if character 'c' is included in children of the node
113
+ CharNode *find(char c) {
114
+ CharNode *ret = nullptr;
115
+ if (size > 0) {
116
+ for (auto it = children; it != children + size; it++) {
117
+ if (it->c == c) {
118
+ ret = it;
119
+ break;
120
+ }
121
+ }
122
+ }
123
+ return ret;
124
+ }
125
+ };
126
+
127
+ /* Tree type data structure consisting of strings of file path segments
128
+ * (somewhat like a trie)
129
+ * For example, Adding one input string "abracadabr4" will add the following (size 2..8 char)
130
+ * substrings: abracada bracadab racadabr acadabr4 dabr4 abr4 br4 r4
131
+ * (CharTree::addStr called for each separately)
132
+ *
133
+ * Which forms a tree like structure:
134
+ * [root]-a-b-r-a-c-a-d-a
135
+ * | | ╰-4
136
+ * | ╰─c-a-d-a-b-r-4
137
+ * ╰───b-r-a-c-a-d-a-b
138
+ * | ╰─4
139
+ * ╰───r-a-c-a-d-a-b-r
140
+ * ╰───d-a-b-r-4
141
+ *
142
+ * Id's pointing to path segments are stored in nodes that match the end of the inserted substring
143
+ *
144
+ * This data structure (CharTree/CharNode) is the main bottleneck in terms of memory consumption.
145
+ * For a dataset of 84k files with 3.5 million characters there will be about 2.3 million CharNodes.
146
+ * Therefore, having std::vector's or similar structures with memory overhead is not really an
147
+ * option.
148
+ */
149
+ class CharTree {
150
+ Output out;
151
+ std::mutex mu;
152
+
153
+ public:
154
+ CharNode *root;
155
+
156
+ CharTree() { root = new CharNode; }
157
+ ~CharTree() {
158
+ root->dealloc();
159
+ delete root;
160
+ }
161
+
162
+ void addStr(std::string s, int id) {
163
+ if (s.size() < 2) {
164
+ return;
165
+ }
166
+
167
+ // out.printl("add str:",s);
168
+ CharNode *cn = root;
169
+
170
+ std::lock_guard<std::mutex> mu_lock(mu);
171
+
172
+ for (int i = 0; i < s.size() && i < 8; i++) {
173
+ int c = ((char)s[i]);
174
+ bool found = false;
175
+
176
+ if (cn->size > 0) {
177
+ // out.printl("(1) cn->size > 0");
178
+ for (auto it = cn->children; it != cn->children + cn->size; it++) {
179
+ if (it->c == c) {
180
+ // out.printl("{", c, "}");
181
+ found = true;
182
+ cn = it;
183
+ break;
184
+ }
185
+ }
186
+ }
187
+ if (!found) {
188
+ // auto x = new CharNode[cn->size + 1];
189
+ CharNode *x = (CharNode *)malloc(sizeof(CharNode) * (cn->size + 1));
190
+ if (cn->size > 0) {
191
+ memcpy(x, cn->children, sizeof(CharNode) * (cn->size));
192
+ free(cn->children);
193
+ }
194
+ cn->children = x;
195
+ CharNode *nn = &(cn->children[cn->size]);
196
+ nn->init();
197
+ nn->c = c;
198
+ cn->size++;
199
+ cn = nn;
200
+ }
201
+
202
+ if (i == s.size() - 1 && true) {
203
+ out.printv(4, "i=", i, "s:", s.size(), "|");
204
+ bool found = false;
205
+ if (cn->ids_sz > 0) {
206
+ for (int i = 0; i < cn->ids_sz; i++) {
207
+ if (cn->ids[i] == id) {
208
+ found = true;
209
+ out.printv(3, "found:", id, "\n");
210
+ }
211
+ }
212
+ }
213
+ if (!found) {
214
+ // out.print(".a.");
215
+ auto x = new int[cn->ids_sz + 1];
216
+ if (cn->ids_sz > 0) {
217
+ memcpy(x, cn->ids, sizeof(int) * cn->ids_sz);
218
+ delete[] cn->ids;
219
+ }
220
+ cn->ids = x;
221
+ cn->ids[cn->ids_sz] = id;
222
+ cn->ids_sz++;
223
+ out.printv(3, "sz:", cn->ids_sz, ",");
224
+ }
225
+ }
226
+
227
+ } // END for
228
+ }
229
+
230
+ void debug() { debug("", root); }
231
+ void debug(std::string trail, CharNode *cn) {
232
+
233
+ // if (trail.size() > 6) {
234
+ // out.print("\n");
235
+ // return;
236
+ // }
237
+
238
+ if (cn == nullptr) {
239
+ return;
240
+ }
241
+ for (int i = 0; i < cn->size; i++) {
242
+ CharNode *child = &cn->children[i];
243
+ out.print("[", child->ids_sz, "]");
244
+ if (child->size > 0) {
245
+ debug(trail + child->c, child);
246
+ } else {
247
+ out.printl(trail, child->c);
248
+ // out.printl();
249
+ }
250
+ }
251
+ }
252
+ };
253
+
64
254
  // Transforms input string as follows:
65
255
  // '/foo/bar/file1.txt'
66
- // => vector{"foo", "bar", "file1.txt"}
67
- std::vector<std::string> splitString(const std::string &input, const char &separator) {
256
+ // => vector{"/foo", "/bar", "/file1.txt"}
257
+
258
+ std::vector<std::string> splitString(const std::string &str, char delimiter) {
68
259
  std::vector<std::string> result;
69
- std::stringstream ss(input);
70
- std::string item;
260
+ std::string part;
71
261
 
72
- while (std::getline(ss, item, separator)) {
73
- if (item.size() > 0) {
74
- result.push_back(item);
262
+ for (char ch : str) {
263
+ if (ch == delimiter) {
264
+ if (part.size() > 0) {
265
+ result.push_back(part);
266
+ }
267
+ part.clear(); // Start a new part
268
+ part += ch;
269
+ } else {
270
+ part += ch;
75
271
  }
76
272
  }
77
273
 
78
- return result;
79
- }
80
-
81
- // Convert int64_t to binary string
82
- [[nodiscard]] std::string int64ToBinaryString(const int64_t &num) {
83
- std::string result;
84
- for (int i = 63; i >= 0; --i) {
85
- result += ((num >> i) & 1) ? '1' : '0';
274
+ // If there's any remaining part after the loop, add it to the result
275
+ if (!part.empty()) {
276
+ result.push_back(part);
86
277
  }
87
- return result;
88
- }
89
278
 
90
- // Debug. Convert a (8 char) string represented as int64_t to std::string
91
- [[nodiscard]] std::string int64ToStr(const int64_t &key) {
92
- int nchars = 8;
93
- std::string str;
94
- int multip = nchars * 8;
95
- for (int i = 0; i <= nchars; i++) {
96
- char c = (key >> multip) & 255;
97
- str.push_back(c);
98
- multip -= 8;
99
- }
100
- return str;
279
+ // for (const auto &value : result) {
280
+ // std::cout << value << "|";
281
+ // }
282
+ // std::cout << std::endl;
283
+
284
+ return result;
101
285
  }
102
286
 
103
287
  // Debug
104
- void printVector(const std::vector<int> &vec) {
288
+ void printVector(const std::vector<float> &vec) {
105
289
  for (const auto &value : vec) {
106
290
  std::cout << value << " ";
107
291
  }
108
292
  }
109
293
 
110
- // Debug
111
- [[nodiscard]] std::string charToBinaryString(const char &chr) {
112
- std::string result;
113
- for (int i = 7; i >= 0; --i) {
114
- result += ((chr >> i) & 1) ? '1' : '0';
115
- }
116
- return result;
117
- }
118
-
119
294
  class Candidate;
120
295
  enum class segmentType { Dir, File };
121
296
 
@@ -128,7 +303,8 @@ struct PathSegment {
128
303
  Candidate *cand;
129
304
  PathSegment *parent;
130
305
  std::mutex mu;
131
- ankerl::unordered_dense::map<std::string, PathSegment *> children;
306
+ std::map<std::string, PathSegment *> children;
307
+
132
308
  segmentType type = segmentType::Dir;
133
309
  PathSegment() : parent(nullptr) {}
134
310
  PathSegment(std::string _str) : str(_str), parent(nullptr) {}
@@ -137,9 +313,9 @@ struct PathSegment {
137
313
  [[nodiscard]] int size() const {
138
314
  int sz = str.size();
139
315
  PathSegment *cur = parent;
140
- // Sum up length of parent segments (+1 for divisors)
316
+ // Sum up length of parent segments
141
317
  while (cur->parent != nullptr) {
142
- sz += cur->str.size() + 1;
318
+ sz += cur->str.size();
143
319
  cur = cur->parent;
144
320
  }
145
321
  return sz;
@@ -148,7 +324,11 @@ struct PathSegment {
148
324
 
149
325
  // Candidate for result in string (filename) search
150
326
  struct Candidate {
327
+
328
+ //This holds the subscores for each character in the query string
151
329
  std::vector<float> v_charscore;
330
+
331
+
152
332
  PathSegment *seg;
153
333
  int fileId;
154
334
  // The string that this candidate represents
@@ -159,6 +339,7 @@ struct Candidate {
159
339
  float maxscore;
160
340
  int candLen; // Length of candidate
161
341
 
342
+ ~Candidate(){};
162
343
  Candidate(){};
163
344
  Candidate(PathSegment *_seg, int _len) : seg(_seg), len(_len) {
164
345
  // Initialize v_charscores with zeros
@@ -166,6 +347,7 @@ struct Candidate {
166
347
  candLen = seg->size();
167
348
  }
168
349
 
350
+ // Sum subscores in v_charscore and normalize to get final score
169
351
  [[nodiscard]] float getScore() const {
170
352
  int i = 0;
171
353
  float score = 0.0;
@@ -178,6 +360,7 @@ struct Candidate {
178
360
  float div2 = len * candLen;
179
361
  float score1 = score / div;
180
362
  float score2 = score / div2;
363
+ // out.printl("str:",seg->str," len:",len," candLen:", candLen, " score:", score);
181
364
 
182
365
  score = score1 * 0.97 + score2 * 0.03;
183
366
  return score;
@@ -186,12 +369,8 @@ struct Candidate {
186
369
  [[nodiscard]] float operator[](int idx) const { return v_charscore[idx]; }
187
370
  };
188
371
 
189
- // This seems to give 10x speed improvement over std::unordered_map
190
- typedef ankerl::unordered_dense::map<int64_t, std::set<PathSegment *> *> SegMap;
191
- // typedef std::unordered_map<int64_t, std::set<PathSegment *> *> SegMap;
192
-
193
372
  typedef ankerl::unordered_dense::map<int, Candidate *> CandMap;
194
- // typedef std::unordered_map<int, Candidate*> CandMap;
373
+ // typedef std::unordered_map<int, Candidate *> CandMap;
195
374
 
196
375
  class StringIndex {
197
376
  private:
@@ -199,32 +378,29 @@ private:
199
378
  char dirSeparator = '/'; // Usually '/', '\' or '\0' (no separator)
200
379
  int numStrings = 0;
201
380
 
202
- std::vector<SegMap *> dirmaps;
203
- std::array<std::mutex, 9> mts_d; // for dirmaps
204
- std::vector<SegMap *> filemaps;
205
- std::array<std::mutex, 9> mts_f; // for filemaps
206
-
207
381
  std::vector<PathSegment *> segsToClean;
208
382
 
383
+ // Maps id's stored in charTree to corresponding PathSegment's
209
384
  std::unordered_map<int, PathSegment *> seglist;
385
+ std::unordered_map<int, PathSegment *> seglist_dir;
386
+ std::mutex seglist_mu;
387
+
210
388
  PathSegment *root;
211
389
  int dirId = 0;
212
- float dirWeight = 0.7; // Give only 70% of score if match is for a directory
390
+ float dirWeight = 1.0; // =0.7: Give only 70% of score if match is for a directory
213
391
 
214
392
  std::unique_ptr<ThreadPool> pool;
215
393
  Output out{1}; // verbose level = 1
394
+ std::mutex cm_mu;
216
395
 
217
396
  public:
397
+ CharTree cm; // for files
398
+ CharTree cm_dir; // for directories
218
399
  StringIndex(char sep) : dirSeparator(sep) {
219
400
  root = new PathSegment();
220
401
  root->parent = nullptr;
221
402
  root->str = "[ROOT]";
222
403
 
223
- for (int i = 0; i <= 8; i++) {
224
- dirmaps.push_back(new SegMap);
225
- filemaps.push_back(new SegMap);
226
- }
227
-
228
404
  // Threads between 4 and 6
229
405
  // We don't seem to get any benefit from more than 6 threads even if the hardware supports it
230
406
  int num_threads = std::max((int)std::thread::hardware_concurrency(), 4);
@@ -233,45 +409,31 @@ public:
233
409
  pool = std::unique_ptr<ThreadPool>(new ThreadPool(num_threads));
234
410
  }
235
411
 
236
- /* Don't separate path to segments separator=\0.
412
+ /* Don't separate path to segments when separator=\0.
237
413
  This is slower, but can be used for other data than files also. */
238
414
  StringIndex() : StringIndex('\0') {}
239
415
 
240
416
  void setDirSeparator(char sep) { dirSeparator = sep; }
241
417
  void setDirWeight(float val) { dirWeight = val; }
242
418
 
243
- ~StringIndex() {
244
- for (auto x : dirmaps) {
245
- for (auto y : *x) {
246
- y.second->clear();
247
- delete (y.second);
248
- }
249
- x->clear();
250
- delete x;
251
- }
252
- for (auto x : filemaps) {
253
- for (auto y : *x) {
254
- y.second->clear();
255
- delete (y.second);
256
- }
257
- x->clear();
258
- delete x;
259
- }
260
- clearPathSegmentChildren(root);
261
- }
419
+ ~StringIndex() { clearPathSegmentChildren(root); }
262
420
 
263
421
  void addStrToIndex(std::string filePath, int fileId) {
264
422
  addStrToIndex(filePath, fileId, dirSeparator);
265
423
  }
266
424
 
267
425
  void addStrToIndexThreaded(std::string filePath, int fileId) {
268
- pool->enqueue([=] { addStrToIndex(filePath, fileId, dirSeparator); });
426
+ pool->enqueue([filePath, fileId, this] { addStrToIndex(filePath, fileId, dirSeparator); });
427
+ // addStrToIndex(filePath, fileId, dirSeparator);
269
428
  }
270
429
  void waitUntilReady() const { pool->waitUntilDone(); }
271
430
 
272
431
  void waitUntilDone() const { pool->waitUntilDone(); }
273
432
 
274
- int size() const { return seglist.size(); }
433
+ int size() {
434
+ std::lock_guard<std::mutex> guard(seglist_mu);
435
+ return seglist.size();
436
+ }
275
437
 
276
438
  /**
277
439
  * Add a string to the index to be searched for afterwards
@@ -283,11 +445,18 @@ public:
283
445
  */
284
446
 
285
447
  void addStrToIndex(std::string filePath, int fileId, const char &separator) {
286
- out.printv(3, "Add file:", filePath, ",", fileId, ",", separator, ",",dirSeparator);
448
+
449
+ std::lock_guard<std::mutex> guard(cm_mu);
450
+
451
+ out.printv(3, "Add file:", filePath, ",", fileId, ",", separator, ",", dirSeparator);
287
452
 
288
453
  // If a string with this index has been added already
289
- if (seglist.find(fileId) != seglist.end()) {
290
- return;
454
+ {
455
+ std::lock_guard<std::mutex> guard(seglist_mu);
456
+ if (seglist.find(fileId) != seglist.end()) {
457
+ out.printl("seglist.find(fileId) != seglist.end()");
458
+ return;
459
+ }
291
460
  }
292
461
 
293
462
  std::vector<std::string> segs;
@@ -319,36 +488,74 @@ public:
319
488
  if (auto it = prev->children.find(x); it != prev->children.end()) {
320
489
  p = it->second;
321
490
  prev->mu.unlock();
322
- } else {
491
+ } else { // File or dir not included in tree yet
323
492
  p = new PathSegment(x, fileId);
324
493
  p->parent = prev;
325
494
  // If this is last item in segs, then it is a file.
326
495
  if (_x == std::prev(segs.end())) {
327
496
  p->type = segmentType::File;
328
- seglist[fileId] = p;
497
+ {
498
+ std::lock_guard<std::mutex> guard(seglist_mu);
499
+ seglist[fileId] = p;
500
+
501
+ for (int i = 0; i < x.size() + 1; i++) {
502
+ auto s = x.substr(i, std::min(static_cast<size_t>(8), x.size() - i));
503
+ cm.addStr(s, fileId);
504
+ }
505
+ }
329
506
  } else { // otherwise, it is a directory
330
507
  p->type = segmentType::Dir;
331
508
  p->fileId = dirId;
332
- // Files use user input Id. Directories need to have it generated
509
+ /* Add "/" to the end of the string so that
510
+ * /path/to/file will be indexed as:
511
+ * {"/path/", "/to/", "/file"}
512
+ */
513
+ auto dir_str = x + "/";
514
+
515
+ {
516
+ std::lock_guard<std::mutex> guard(seglist_mu);
517
+ seglist_dir[dirId] = p;
518
+ // Files use user input Id. Directories need to have it generated
519
+ }
520
+
521
+ // TODO: Create a function
522
+ for (int i = 0; i < dir_str.size() + 1; i++) {
523
+ auto s = dir_str.substr(i, std::min(static_cast<size_t>(8), dir_str.size() - i));
524
+ cm_dir.addStr(s, dirId);
525
+ }
526
+
333
527
  dirId++;
334
528
  }
335
529
  prev->children[x] = p;
336
530
  prev->mu.unlock();
337
- addPathSegmentKeys(p);
338
- }
531
+ } // END of first if
339
532
 
340
533
  prev = p;
341
534
  }
342
535
  }
343
536
 
344
- std::string getString(int id) {
537
+ std::string getString(int id) { return getString(id, false); }
538
+
539
+ // Reconstruct original filepath from segments
540
+ std::string getString(int id, bool isDir) {
345
541
  std::string s = "";
346
- PathSegment *seg = seglist[id];
542
+ std::lock_guard<std::mutex> guard(seglist_mu);
543
+
544
+ PathSegment *seg = nullptr;
545
+
546
+ if (isDir) {
547
+ seg = seglist_dir[id];
548
+ } else {
549
+ seg = seglist[id];
550
+ }
347
551
  s += seg->str;
348
552
  while (seg->parent->parent != nullptr) {
349
553
  seg = seg->parent;
350
- s = seg->str + dirSeparator + s;
554
+ s = seg->str + s;
555
+ // out.print(seg, "(", seg->str, ")", ",");
351
556
  }
557
+ // out.printl(s);
558
+
352
559
  return s;
353
560
  }
354
561
 
@@ -384,33 +591,59 @@ public:
384
591
  @param query String to search for inside the index
385
592
  */
386
593
 
387
- [[nodiscard]] std::vector<std::pair<float, int>> findSimilar(std::string query) {
388
- return findSimilar(query, 2);
389
- }
390
-
391
- [[nodiscard]] std::vector<std::pair<float, int>> findSimilar(std::string query, int minChars) {
392
- CandMap fileCandMap;
393
- CandMap dirCandMap;
394
-
395
- waitUntilDone();
396
-
397
- // Find both files and directories that match the input query
398
- addToCandMap(fileCandMap, query, filemaps);
399
- addToCandMap(dirCandMap, query, dirmaps);
400
-
401
- /* If parent dir of a file matches the input string add the scores of the direcotry to the
402
- scores of the file */
403
- mergeCandidateMaps(fileCandMap, dirCandMap);
404
-
405
- // Set all candidate pointers to nullptr so they won't mess up future searches
406
- for (auto seg : segsToClean) {
407
- seg->cand = nullptr;
594
+ void searchCharTree(const std::string &query, CandMap &candmap, CharTree &chartr) {
595
+
596
+ int last_start = query.size() - 2;
597
+ // Loop all possible start positions in query string. Indexes [0..(n-2)]
598
+ for (int start = 0; start <= last_start; start++) {
599
+ CharNode *cn = chartr.root;
600
+
601
+ // select a suffix (substring) starting from start, but cap length to 8 chars
602
+ int end = std::min(start + 7, ((int)query.size()) - 1);
603
+ int nchars = end - start + 1;
604
+ std::string s = query.substr(start, nchars);
605
+
606
+ // Loop all chars of the query substring
607
+ // Traverse from the root of the tree, following one child node per character
608
+ for (int i = 0; i < s.size(); i++) {
609
+ char c = s[i];
610
+ CharNode *x = cn->find(c);
611
+ if (x != nullptr) {
612
+ cn = x;
613
+ // Consider scores only for substrings with size >= 2
614
+ if (i > 0) {
615
+ // If we've reached here, size of substring is i+1
616
+
617
+ // Get identifiers of files that include substring
618
+ // query[start..(start+i+1)] ??
619
+ std::set<int> ids = cn->getIds();
620
+ for (const int &y : ids) {
621
+ PathSegment *p = nullptr;
622
+
623
+ // Searching in file segments
624
+ // (or no file/dir separation)
625
+ if (&chartr == &cm) {
626
+ p = seglist[y];
627
+ } else {
628
+ // Searching in dir segments
629
+ p = seglist_dir[y];
630
+ }
631
+ assert(p != nullptr);
632
+ addToResults(p, query, start, i + 1, candmap);
633
+ }
634
+ }
635
+ } else {
636
+ // assert(cn->ids_sz < 1); // TODO: should not come here?
637
+ break;
638
+ }
639
+ }
408
640
  }
409
- segsToClean.clear();
641
+ }
410
642
 
643
+ std::vector<std::pair<float, int>> candidatesToVec(CandMap &candmap) {
411
644
  // Form return result, 2d array with file id's and scores
412
645
  std::vector<std::pair<float, int>> results;
413
- for (auto &[fid, cand] : fileCandMap) {
646
+ for (auto &[fid, cand] : candmap) {
414
647
  std::pair<float, int> v;
415
648
  float sc = cand->getScore();
416
649
  v.first = sc;
@@ -419,164 +652,134 @@ public:
419
652
  delete cand;
420
653
  }
421
654
 
422
- for (auto &[fid, cand] : dirCandMap) {
423
- delete cand;
424
- }
425
-
426
655
  // Sort highest score first
427
656
  std::sort(results.begin(), results.end(),
428
657
  [](std::pair<float, int> a, std::pair<float, int> b) { return a.first > b.first; });
429
658
  return results;
430
659
  }
431
660
 
432
- // Return int64_t representation of the first nchars in str, starting from index i
433
- [[nodiscard]] int64_t getKeyAtIdx(const std::string &str, int i, int nchars) const {
434
- int64_t key = 0;
435
- for (int i_char = 0; i_char < nchars; i_char++) {
436
- key = key | static_cast<int64_t>(str[i + i_char]);
437
- if (i_char < nchars - 1) {
438
- // Shift 8 bits to the left except on the last iteration
439
- key = key << 8;
440
- }
441
- }
442
- return key;
443
- }
661
+ std::vector<std::pair<float, int>> findDirectories(std::string query) {
662
+ CandMap dirCandMap;
663
+ auto &candmap = dirCandMap;
664
+ waitUntilDone();
444
665
 
445
- void debug() {
666
+ searchCharTree(query, dirCandMap, cm_dir);
667
+ addParentScores(dirCandMap);
668
+ auto results = candidatesToVec(dirCandMap);
446
669
 
447
- int nchars = 3;
448
- for (const auto &[key, value] : (*filemaps[nchars])) {
449
- int64_t x;
450
- x = key;
451
- int multip = nchars * 8;
452
- for (int i = 0; i <= nchars; i++) {
453
- char c = (x >> multip) & 255;
454
- std::cout << c;
455
- multip -= 8;
456
- }
457
- std::cout << "\n";
458
- // for (auto y : *value) {
459
- // std::cout << y << " ";
460
- // }
461
- // std::cout << "\n";
462
- }
670
+ return results;
463
671
  }
464
672
 
465
- private:
466
- void clearPathSegmentChildren(PathSegment *p) {
467
- if (p->children.size() > 0) {
468
- for (auto x : p->children) {
469
- clearPathSegmentChildren(x.second);
470
- }
471
- }
472
- delete p;
673
+ std::vector<std::pair<float, std::string>> findFilesAndDirectories(std::string query) {
674
+ return findFilesAndDirectories(query, true, true);
473
675
  }
474
676
 
475
- void addPathSegmentKeys(PathSegment *p) {
476
- // Input p is part of a path, e.g. 'barxyz' if path is /foo/barxyz/baz.txt
477
- // This function generates int64 representations (keys) of all substrings of size 2..8 in that
478
- // path segment and stores pointer to p in hash tables using these int values as keys.
677
+ std::vector<std::pair<float, std::string>>
678
+ findFilesAndDirectories(std::string query, bool includeFiles, bool includeDirs) {
479
679
 
480
- int maxChars = 8;
481
- int minChars = 2;
680
+ CandMap fileCandMap;
681
+ CandMap dirCandMap;
682
+ waitUntilDone();
683
+ std::vector<std::pair<float, std::string>> results;
482
684
 
483
- std::string str = p->str;
484
- if (p->str.size() < 2) {
485
- return;
486
- }
487
- if (static_cast<int>(p->str.size()) < maxChars) {
488
- maxChars = p->str.size();
685
+ if (includeFiles) {
686
+ searchCharTree(query, fileCandMap, cm);
687
+ // out.printl("size:",fileCandMap.size());
489
688
  }
490
689
 
491
- for (int sublen = minChars; sublen <= maxChars; sublen++) {
690
+ searchCharTree(query, dirCandMap, cm_dir);
492
691
 
493
- std::mutex *mu;
494
- SegMap *map;
495
- if (p->type == segmentType::File) {
496
- map = filemaps[sublen];
497
- mu = &mts_f[sublen];
498
- } else {
499
- map = dirmaps[sublen];
500
- mu = &mts_d[sublen];
501
- }
692
+ if (includeFiles) {
693
+ addParentScores(fileCandMap);
694
+ }
502
695
 
503
- int count = str.size() - sublen + 1;
696
+ if (includeDirs) {
697
+ addParentScores(dirCandMap);
698
+ }
699
+
700
+ for (auto seg : segsToClean) {
701
+ seg->cand = nullptr;
702
+ }
703
+ segsToClean.clear();
504
704
 
505
- int64_t keys[count + 1];
506
- for (int i = 0; i <= count; i++) {
507
- keys[i] = getKeyAtIdx(str, i, sublen);
705
+ // TODO: Need to call this just to delete candidates
706
+ auto res_dir = candidatesToVec(dirCandMap);
707
+ if (includeDirs) {
708
+ for (const auto &[score, id] : res_dir) {
709
+ results.push_back(std::pair<float, std::string>{score, getString(id, true)});
508
710
  }
711
+ }
509
712
 
510
- mu->lock();
511
- for (int i = 0; i <= count; i++) {
512
- // int64_t key = getKeyAtIdx(str, i, sublen);
513
- auto key = keys[i];
713
+ if (includeFiles) {
714
+ auto res_file = candidatesToVec(fileCandMap);
715
+ // out.printl("size2:",fileCandMap.size());
716
+ for (const auto &[score, id] : res_file) {
514
717
 
515
- // Create a new std::set for key if doesn't exist already
516
- auto it = map->find(key);
517
- if (it == map->end()) {
518
- (*map)[key] = new std::set<PathSegment *>;
519
- }
520
- (*map)[key]->insert(p);
718
+ // out.print("|",getString(id),"|");
719
+ results.push_back(std::pair<float, std::string>{score, getString(id)});
521
720
  }
522
- mu->unlock();
523
721
  }
722
+
723
+ // Sort highest score first
724
+ std::sort(results.begin(), results.end(),
725
+ [](std::pair<float, std::string> a, std::pair<float, std::string> b) {
726
+ return a.first > b.first;
727
+ });
728
+ return results;
524
729
  }
525
730
 
526
- // Find pathsegments from <map> that include the substring of <str> which starts at index <i> and
527
- // is of length <nchars>.
528
- [[nodiscard]] std::vector<PathSegment *> findSimilarForNgram(std::string str, int i, int nchars,
529
- SegMap &map) const {
731
+ // TODO: delete?
732
+ std::vector<std::pair<float, int>> findSimilar(std::string query) { return findFiles(query); }
530
733
 
531
- assert(i + nchars <= static_cast<int>(str.size()));
532
- std::vector<PathSegment *> res;
734
+ std::vector<std::pair<float, int>> findFiles(std::string query) {
533
735
 
534
- // Take substring of str, starting at i, spanning nchars
535
- // transform that to 64 bit integer
536
- int64_t key = getKeyAtIdx(str, i, nchars);
537
- // Find all path segments in map that have the same substring
538
- if (auto it = map.find(key); it != map.end()) { // key found
539
- auto set = it->second;
540
- for (auto value : *set) {
541
- res.push_back(value);
542
- }
543
- }
544
- return res;
545
- }
736
+ CandMap fileCandMap;
737
+ CandMap dirCandMap;
738
+ auto &candmap = fileCandMap;
739
+ waitUntilDone();
740
+
741
+ searchCharTree(query, fileCandMap, cm);
742
+ searchCharTree(query, dirCandMap, cm_dir);
743
+ addParentScores(fileCandMap);
546
744
 
547
- void addToCandMap(CandMap &candmap, std::string query,
548
- std::vector<SegMap *> &map // filemaps or dirmaps
549
- ) {
550
- int maxChars = 8;
551
- int minChars = 2;
552
- if (static_cast<int>(query.size()) < maxChars) {
553
- maxChars = query.size();
745
+ for (auto seg : segsToClean) {
746
+ seg->cand = nullptr;
554
747
  }
748
+ segsToClean.clear();
555
749
 
556
- // Loop all substring lengths between minChars..maxChars
557
- for (int sublen = minChars; sublen <= maxChars; sublen++) {
558
- int count = query.size() - sublen + 1;
750
+ auto results = candidatesToVec(fileCandMap);
751
+ auto tmp = candidatesToVec(dirCandMap); // TODO: call just to release memory
559
752
 
560
- // Loop all possible start positions
561
- for (int i = 0; i < count; i++) {
562
- std::vector<PathSegment *> res = findSimilarForNgram(query, i, sublen, *(map[sublen]));
753
+ return results;
754
+ }
563
755
 
564
- for (PathSegment *p : res) {
565
- addToResults(p, query, i, sublen, candmap);
566
- }
756
+ private:
757
+ void clearPathSegmentChildren(PathSegment *p) {
758
+ if (p->children.size() > 0) {
759
+ for (auto x : p->children) {
760
+ clearPathSegmentChildren(x.second);
567
761
  }
568
762
  }
763
+ delete p;
569
764
  }
570
765
 
571
766
  // Add parent directories scores to files
572
- void mergeCandidateMaps(CandMap &fileCandMap, CandMap &dirCandMap) {
767
+ void addParentScores(CandMap &fileCandMap) {
573
768
 
574
769
  for (auto &[fid, cand] : fileCandMap) {
575
770
  PathSegment *p = cand->seg->parent;
576
771
  while (p->parent != nullptr) {
577
772
  if (p->cand != nullptr) {
773
+
578
774
  auto &scoreA = cand->v_charscore;
579
775
  auto &scoreB = p->cand->v_charscore;
776
+
777
+ // out.print("[");
778
+ // printVector(scoreA);
779
+ // out.print(",");
780
+ // printVector(scoreB);
781
+ // out.print(",");
782
+ // out.print("]");
580
783
  for (int i = 0; i < cand->len; i++) {
581
784
  if (scoreA[i] < scoreB[i] * dirWeight) {
582
785
  scoreA[i] = scoreB[i] * dirWeight;
@@ -592,6 +795,7 @@ private:
592
795
 
593
796
  if (auto it2 = candmap.find(seg->fileId); it2 == candmap.end()) {
594
797
  Candidate *cand = new Candidate(seg, str.size());
798
+ // out.printl("new cand:", seg->str, ",", seg, ",", seg->parent, ",", seg->parent->parent);
595
799
  segsToClean.push_back(seg);
596
800
  candmap[seg->fileId] = cand;
597
801
  seg->cand = cand;