StrIdx 0.1.4 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/stridx.hpp CHANGED
@@ -10,6 +10,7 @@
10
10
  #include <array>
11
11
  #include <iostream>
12
12
  #include <unordered_map>
13
+ #include <map>
13
14
  #include <set>
14
15
  #include <algorithm>
15
16
  #include <sstream>
@@ -28,10 +29,11 @@ namespace StrIdx {
28
29
  class Output {
29
30
  private:
30
31
  int verboseLevel;
32
+ // TODO: add mutex?
31
33
 
32
34
  public:
33
35
  Output(int verb) : verboseLevel(verb) {}
34
- Output() : Output(3) {}
36
+ Output() : Output(1) {}
35
37
  ~Output() = default;
36
38
  static void print() {}
37
39
 
@@ -61,61 +63,234 @@ public:
61
63
  }
62
64
  };
63
65
 
66
+ Output out{1};
67
+
68
+ struct CharNode {
69
+ int *ids;
70
+ int ids_sz;
71
+ char c;
72
+ std::uint8_t size;
73
+ CharNode *children;
74
+ CharNode() : ids(nullptr), ids_sz(0), c(0), size(0), children(nullptr) {}
75
+
76
+ void init() {
77
+ ids = nullptr;
78
+ ids_sz = 0;
79
+ c = 0;
80
+ size = 0;
81
+ children = nullptr;
82
+ }
83
+
84
+ void dealloc() {
85
+ if (children != nullptr) {
86
+ for (CharNode *it = children; it != children + size; it++) {
87
+ it->dealloc();
88
+ }
89
+ free(children);
90
+ }
91
+ delete[] ids;
92
+ }
93
+
94
+ ~CharNode() {}
95
+
96
+ // Gets Id's stored in this node and all child nodes combined
97
+ std::set<int> getIds() {
98
+ std::set<int> set;
99
+ getIds(set);
100
+ return set;
101
+ }
102
+
103
+ void getIds(std::set<int> &set) {
104
+ for (int j = 0; j < ids_sz; j++) {
105
+ set.insert(ids[j]);
106
+ }
107
+ for (CharNode *it = children; it != children + size; it++) {
108
+ it->getIds(set);
109
+ }
110
+ }
111
+
112
+ // Find if character 'c' is included in children of the node
113
+ CharNode *find(char c) {
114
+ CharNode *ret = nullptr;
115
+ if (size > 0) {
116
+ for (auto it = children; it != children + size; it++) {
117
+ if (it->c == c) {
118
+ ret = it;
119
+ break;
120
+ }
121
+ }
122
+ }
123
+ return ret;
124
+ }
125
+ };
126
+
127
+ /* Tree type data structure consisting of strings of file path segments
128
+ * (somewhat like a trie)
129
+ * For example, adding one input string "abracadabr4" will add the following (size 2..8 char)
130
+ * substrings: abracada bracadab racadabr acadabr4 dabr4 abr4 br4 ra
131
+ * (CharTree::addStr called for each separately)
132
+ *
133
+ * Which forms a tree like structure:
134
+ * [root]-a-b-r-a-c-a-d-a
135
+ * | | ╰-4
136
+ * | ╰─c-a-d-a-b-r-4
137
+ * ╰───b-r-a-c-a-d-a-b
138
+ * | ╰─4
139
+ * ╰───r-a-c-a-d-a-b-r
140
+ * ╰───d-a-b-r-4
141
+ *
142
+ * IDs pointing to path segments are stored in nodes that match the end of the inserted substring
143
+ *
144
+ * This data structure (CharTree/CharNode) is the main bottleneck in terms of memory consumption.
145
+ * For a dataset of 84k files with 3.5 million characters there will be about 2.3 million CharNodes.
146
+ * Therefore, having std::vectors or similar structures with memory overhead is not really an
147
+ * option.
148
+ */
149
+ class CharTree {
150
+ Output out;
151
+ std::mutex mu;
152
+
153
+ public:
154
+ CharNode *root;
155
+
156
+ CharTree() { root = new CharNode; }
157
+ ~CharTree() {
158
+ root->dealloc();
159
+ delete root;
160
+ }
161
+
162
+ void addStr(std::string s, int id) {
163
+ if (s.size() < 2) {
164
+ return;
165
+ }
166
+
167
+ // out.printl("add str:",s);
168
+ CharNode *cn = root;
169
+
170
+ std::lock_guard<std::mutex> mu_lock(mu);
171
+
172
+ for (int i = 0; i < s.size() && i < 8; i++) {
173
+ int c = ((char)s[i]);
174
+ bool found = false;
175
+
176
+ if (cn->size > 0) {
177
+ // out.printl("(1) cn->size > 0");
178
+ for (auto it = cn->children; it != cn->children + cn->size; it++) {
179
+ if (it->c == c) {
180
+ // out.printl("{", c, "}");
181
+ found = true;
182
+ cn = it;
183
+ break;
184
+ }
185
+ }
186
+ }
187
+ if (!found) {
188
+ // auto x = new CharNode[cn->size + 1];
189
+ CharNode *x = (CharNode *)malloc(sizeof(CharNode) * (cn->size + 1));
190
+ if (cn->size > 0) {
191
+ memcpy(x, cn->children, sizeof(CharNode) * (cn->size));
192
+ free(cn->children);
193
+ }
194
+ cn->children = x;
195
+ CharNode *nn = &(cn->children[cn->size]);
196
+ nn->init();
197
+ nn->c = c;
198
+ cn->size++;
199
+ cn = nn;
200
+ }
201
+
202
+ if (i == s.size() - 1 && true) {
203
+ out.printv(4, "i=", i, "s:", s.size(), "|");
204
+ bool found = false;
205
+ if (cn->ids_sz > 0) {
206
+ for (int i = 0; i < cn->ids_sz; i++) {
207
+ if (cn->ids[i] == id) {
208
+ found = true;
209
+ out.printv(3, "found:", id, "\n");
210
+ }
211
+ }
212
+ }
213
+ if (!found) {
214
+ // out.print(".a.");
215
+ auto x = new int[cn->ids_sz + 1];
216
+ if (cn->ids_sz > 0) {
217
+ memcpy(x, cn->ids, sizeof(int) * cn->ids_sz);
218
+ delete[] cn->ids;
219
+ }
220
+ cn->ids = x;
221
+ cn->ids[cn->ids_sz] = id;
222
+ cn->ids_sz++;
223
+ out.printv(3, "sz:", cn->ids_sz, ",");
224
+ }
225
+ }
226
+
227
+ } // END for
228
+ }
229
+
230
+ void debug() { debug("", root); }
231
+ void debug(std::string trail, CharNode *cn) {
232
+
233
+ // if (trail.size() > 6) {
234
+ // out.print("\n");
235
+ // return;
236
+ // }
237
+
238
+ if (cn == nullptr) {
239
+ return;
240
+ }
241
+ for (int i = 0; i < cn->size; i++) {
242
+ CharNode *child = &cn->children[i];
243
+ out.print("[", child->ids_sz, "]");
244
+ if (child->size > 0) {
245
+ debug(trail + child->c, child);
246
+ } else {
247
+ out.printl(trail, child->c);
248
+ // out.printl();
249
+ }
250
+ }
251
+ }
252
+ };
253
+
64
254
  // Transforms input string as follows:
65
255
  // '/foo/bar/file1.txt'
66
- // => vector{"foo", "bar", "file1.txt"}
67
- std::vector<std::string> splitString(const std::string &input, const char &separator) {
256
+ // => vector{"/foo", "/bar", "/file1.txt"}
257
+
258
+ std::vector<std::string> splitString(const std::string &str, char delimiter) {
68
259
  std::vector<std::string> result;
69
- std::stringstream ss(input);
70
- std::string item;
260
+ std::string part;
71
261
 
72
- while (std::getline(ss, item, separator)) {
73
- if (item.size() > 0) {
74
- result.push_back(item);
262
+ for (char ch : str) {
263
+ if (ch == delimiter) {
264
+ if (part.size() > 0) {
265
+ result.push_back(part);
266
+ }
267
+ part.clear(); // Start a new part
268
+ part += ch;
269
+ } else {
270
+ part += ch;
75
271
  }
76
272
  }
77
273
 
78
- return result;
79
- }
80
-
81
- // Convert int64_t to binary string
82
- [[nodiscard]] std::string int64ToBinaryString(const int64_t &num) {
83
- std::string result;
84
- for (int i = 63; i >= 0; --i) {
85
- result += ((num >> i) & 1) ? '1' : '0';
274
+ // If there's any remaining part after the loop, add it to the result
275
+ if (!part.empty()) {
276
+ result.push_back(part);
86
277
  }
87
- return result;
88
- }
89
278
 
90
- // Debug. Convert a (8 char) string represented as int64_t to std::string
91
- [[nodiscard]] std::string int64ToStr(const int64_t &key) {
92
- int nchars = 8;
93
- std::string str;
94
- int multip = nchars * 8;
95
- for (int i = 0; i <= nchars; i++) {
96
- char c = (key >> multip) & 255;
97
- str.push_back(c);
98
- multip -= 8;
99
- }
100
- return str;
279
+ // for (const auto &value : result) {
280
+ // std::cout << value << "|";
281
+ // }
282
+ // std::cout << std::endl;
283
+
284
+ return result;
101
285
  }
102
286
 
103
287
  // Debug
104
- void printVector(const std::vector<int> &vec) {
288
+ void printVector(const std::vector<float> &vec) {
105
289
  for (const auto &value : vec) {
106
290
  std::cout << value << " ";
107
291
  }
108
292
  }
109
293
 
110
- // Debug
111
- [[nodiscard]] std::string charToBinaryString(const char &chr) {
112
- std::string result;
113
- for (int i = 7; i >= 0; --i) {
114
- result += ((chr >> i) & 1) ? '1' : '0';
115
- }
116
- return result;
117
- }
118
-
119
294
  class Candidate;
120
295
  enum class segmentType { Dir, File };
121
296
 
@@ -128,7 +303,8 @@ struct PathSegment {
128
303
  Candidate *cand;
129
304
  PathSegment *parent;
130
305
  std::mutex mu;
131
- ankerl::unordered_dense::map<std::string, PathSegment *> children;
306
+ std::map<std::string, PathSegment *> children;
307
+
132
308
  segmentType type = segmentType::Dir;
133
309
  PathSegment() : parent(nullptr) {}
134
310
  PathSegment(std::string _str) : str(_str), parent(nullptr) {}
@@ -137,9 +313,9 @@ struct PathSegment {
137
313
  [[nodiscard]] int size() const {
138
314
  int sz = str.size();
139
315
  PathSegment *cur = parent;
140
- // Sum up length of parent segments (+1 for divisors)
316
+ // Sum up length of parent segments
141
317
  while (cur->parent != nullptr) {
142
- sz += cur->str.size() + 1;
318
+ sz += cur->str.size();
143
319
  cur = cur->parent;
144
320
  }
145
321
  return sz;
@@ -159,6 +335,7 @@ struct Candidate {
159
335
  float maxscore;
160
336
  int candLen; // Length of candidate
161
337
 
338
+ ~Candidate(){};
162
339
  Candidate(){};
163
340
  Candidate(PathSegment *_seg, int _len) : seg(_seg), len(_len) {
164
341
  // Initialize v_charscores with zeros
@@ -178,6 +355,7 @@ struct Candidate {
178
355
  float div2 = len * candLen;
179
356
  float score1 = score / div;
180
357
  float score2 = score / div2;
358
+ // out.printl("str:",seg->str," len:",len," candLen:", candLen, " score:", score);
181
359
 
182
360
  score = score1 * 0.97 + score2 * 0.03;
183
361
  return score;
@@ -186,12 +364,8 @@ struct Candidate {
186
364
  [[nodiscard]] float operator[](int idx) const { return v_charscore[idx]; }
187
365
  };
188
366
 
189
- // This seems to give 10x speed improvement over std::unordered_map
190
- typedef ankerl::unordered_dense::map<int64_t, std::set<PathSegment *> *> SegMap;
191
- // typedef std::unordered_map<int64_t, std::set<PathSegment *> *> SegMap;
192
-
193
367
  typedef ankerl::unordered_dense::map<int, Candidate *> CandMap;
194
- // typedef std::unordered_map<int, Candidate*> CandMap;
368
+ // typedef std::unordered_map<int, Candidate *> CandMap;
195
369
 
196
370
  class StringIndex {
197
371
  private:
@@ -199,32 +373,28 @@ private:
199
373
  char dirSeparator = '/'; // Usually '/', '\' or '\0' (no separator)
200
374
  int numStrings = 0;
201
375
 
202
- std::vector<SegMap *> dirmaps;
203
- std::array<std::mutex, 9> mts_d; // for dirmaps
204
- std::vector<SegMap *> filemaps;
205
- std::array<std::mutex, 9> mts_f; // for filemaps
206
-
207
376
  std::vector<PathSegment *> segsToClean;
208
377
 
209
378
  std::unordered_map<int, PathSegment *> seglist;
379
+ std::unordered_map<int, PathSegment *> seglist_dir;
380
+ std::mutex seglist_mu;
381
+
210
382
  PathSegment *root;
211
383
  int dirId = 0;
212
384
  float dirWeight = 0.7; // Give only 70% of score if match is for a directory
213
385
 
214
386
  std::unique_ptr<ThreadPool> pool;
215
387
  Output out{1}; // verbose level = 1
388
+ std::mutex cm_mu;
216
389
 
217
390
  public:
391
+ CharTree cm; // for files
392
+ CharTree cm_dir; // for directories
218
393
  StringIndex(char sep) : dirSeparator(sep) {
219
394
  root = new PathSegment();
220
395
  root->parent = nullptr;
221
396
  root->str = "[ROOT]";
222
397
 
223
- for (int i = 0; i <= 8; i++) {
224
- dirmaps.push_back(new SegMap);
225
- filemaps.push_back(new SegMap);
226
- }
227
-
228
398
  // Threads between 4 and 6
229
399
  // We don't seem to get any benefit from more than 6 threads even if the hardware supports it
230
400
  int num_threads = std::max((int)std::thread::hardware_concurrency(), 4);
@@ -233,45 +403,31 @@ public:
233
403
  pool = std::unique_ptr<ThreadPool>(new ThreadPool(num_threads));
234
404
  }
235
405
 
236
- /* Don't separate path to segments separator=\0.
406
+ /* Don't separate path to segments when separator=\0.
237
407
  This is slower, but can be used for other data than files also. */
238
408
  StringIndex() : StringIndex('\0') {}
239
409
 
240
410
  void setDirSeparator(char sep) { dirSeparator = sep; }
241
411
  void setDirWeight(float val) { dirWeight = val; }
242
412
 
243
- ~StringIndex() {
244
- for (auto x : dirmaps) {
245
- for (auto y : *x) {
246
- y.second->clear();
247
- delete (y.second);
248
- }
249
- x->clear();
250
- delete x;
251
- }
252
- for (auto x : filemaps) {
253
- for (auto y : *x) {
254
- y.second->clear();
255
- delete (y.second);
256
- }
257
- x->clear();
258
- delete x;
259
- }
260
- clearPathSegmentChildren(root);
261
- }
413
+ ~StringIndex() { clearPathSegmentChildren(root); }
262
414
 
263
415
  void addStrToIndex(std::string filePath, int fileId) {
264
416
  addStrToIndex(filePath, fileId, dirSeparator);
265
417
  }
266
418
 
267
419
  void addStrToIndexThreaded(std::string filePath, int fileId) {
268
- pool->enqueue([=] { addStrToIndex(filePath, fileId, dirSeparator); });
420
+ pool->enqueue([filePath, fileId, this] { addStrToIndex(filePath, fileId, dirSeparator); });
421
+ // addStrToIndex(filePath, fileId, dirSeparator);
269
422
  }
270
423
  void waitUntilReady() const { pool->waitUntilDone(); }
271
424
 
272
425
  void waitUntilDone() const { pool->waitUntilDone(); }
273
426
 
274
- int size() const { return seglist.size(); }
427
+ int size() {
428
+ std::lock_guard<std::mutex> guard(seglist_mu);
429
+ return seglist.size();
430
+ }
275
431
 
276
432
  /**
277
433
  * Add a string to the index to be searched for afterwards
@@ -283,11 +439,18 @@ public:
283
439
  */
284
440
 
285
441
  void addStrToIndex(std::string filePath, int fileId, const char &separator) {
286
- out.printv(3, "Add file:", filePath, ",", fileId, ",", separator, ",",dirSeparator);
442
+
443
+ std::lock_guard<std::mutex> guard(cm_mu);
444
+
445
+ out.printv(3, "Add file:", filePath, ",", fileId, ",", separator, ",", dirSeparator);
287
446
 
288
447
  // If a string with this index has been added already
289
- if (seglist.find(fileId) != seglist.end()) {
290
- return;
448
+ {
449
+ std::lock_guard<std::mutex> guard(seglist_mu);
450
+ if (seglist.find(fileId) != seglist.end()) {
451
+ out.printl("seglist.find(fileId) != seglist.end()");
452
+ return;
453
+ }
291
454
  }
292
455
 
293
456
  std::vector<std::string> segs;
@@ -319,36 +482,74 @@ public:
319
482
  if (auto it = prev->children.find(x); it != prev->children.end()) {
320
483
  p = it->second;
321
484
  prev->mu.unlock();
322
- } else {
485
+ } else { // File or dir not included in tree yet
323
486
  p = new PathSegment(x, fileId);
324
487
  p->parent = prev;
325
488
  // If this is last item in segs, then it is a file.
326
489
  if (_x == std::prev(segs.end())) {
327
490
  p->type = segmentType::File;
328
- seglist[fileId] = p;
491
+ {
492
+ std::lock_guard<std::mutex> guard(seglist_mu);
493
+ seglist[fileId] = p;
494
+
495
+ for (int i = 0; i < x.size() + 1; i++) {
496
+ auto s = x.substr(i, std::min(static_cast<size_t>(8), x.size() - i));
497
+ cm.addStr(s, fileId);
498
+ }
499
+ }
329
500
  } else { // otherwise, it is a directory
330
501
  p->type = segmentType::Dir;
331
502
  p->fileId = dirId;
332
- // Files use user input Id. Directories need to have it generated
503
+ /* Add "/" to the end of the string so that
504
+ * /path/to/file will be indexed as:
505
+ * {"/path/", "/to/", "/file"}
506
+ */
507
+ auto dir_str = x + "/";
508
+
509
+ {
510
+ std::lock_guard<std::mutex> guard(seglist_mu);
511
+ seglist_dir[dirId] = p;
512
+ // Files use user input Id. Directories need to have it generated
513
+ }
514
+
515
+ // TODO: Create a function
516
+ for (int i = 0; i < dir_str.size() + 1; i++) {
517
+ auto s = dir_str.substr(i, std::min(static_cast<size_t>(8), dir_str.size() - i));
518
+ cm_dir.addStr(s, dirId);
519
+ }
520
+
333
521
  dirId++;
334
522
  }
335
523
  prev->children[x] = p;
336
524
  prev->mu.unlock();
337
- addPathSegmentKeys(p);
338
- }
525
+ } // END of first if
339
526
 
340
527
  prev = p;
341
528
  }
342
529
  }
343
530
 
344
- std::string getString(int id) {
531
+ std::string getString(int id) { return getString(id, false); }
532
+
533
+ // Reconstruct original filepath from segments
534
+ std::string getString(int id, bool isDir) {
345
535
  std::string s = "";
346
- PathSegment *seg = seglist[id];
536
+ std::lock_guard<std::mutex> guard(seglist_mu);
537
+
538
+ PathSegment *seg = nullptr;
539
+
540
+ if (isDir) {
541
+ seg = seglist_dir[id];
542
+ } else {
543
+ seg = seglist[id];
544
+ }
347
545
  s += seg->str;
348
546
  while (seg->parent->parent != nullptr) {
349
547
  seg = seg->parent;
350
- s = seg->str + dirSeparator + s;
548
+ s = seg->str + s;
549
+ // out.print(seg, "(", seg->str, ")", ",");
351
550
  }
551
+ // out.printl(s);
552
+
352
553
  return s;
353
554
  }
354
555
 
@@ -384,33 +585,46 @@ public:
384
585
  @param query String to search for inside the index
385
586
  */
386
587
 
387
- [[nodiscard]] std::vector<std::pair<float, int>> findSimilar(std::string query) {
388
- return findSimilar(query, 2);
389
- }
390
-
391
- [[nodiscard]] std::vector<std::pair<float, int>> findSimilar(std::string query, int minChars) {
392
- CandMap fileCandMap;
393
- CandMap dirCandMap;
394
-
395
- waitUntilDone();
396
-
397
- // Find both files and directories that match the input query
398
- addToCandMap(fileCandMap, query, filemaps);
399
- addToCandMap(dirCandMap, query, dirmaps);
400
-
401
- /* If parent dir of a file matches the input string add the scores of the direcotry to the
402
- scores of the file */
403
- mergeCandidateMaps(fileCandMap, dirCandMap);
404
-
405
- // Set all candidate pointers to nullptr so they won't mess up future searches
406
- for (auto seg : segsToClean) {
407
- seg->cand = nullptr;
588
+ void searchCharTree(const std::string &query, CandMap &candmap, CharTree &chartr) {
589
+
590
+ int last_start = query.size() - 2;
591
+ for (int start = 0; start <= last_start; start++) {
592
+ CharNode *cn = chartr.root;
593
+ int end = std::min(start + 7, ((int)query.size()) - 1);
594
+ int nchars = end - start + 1;
595
+ std::string s = query.substr(start, nchars);
596
+
597
+ for (int i = 0; i < s.size(); i++) {
598
+ char c = s[i];
599
+ CharNode *x = cn->find(c);
600
+ if (x != nullptr) {
601
+ cn = x;
602
+ // Consider scores only for substrings with size >= 2
603
+ if (i > 0) {
604
+ std::set<int> ids = cn->getIds();
605
+ for (const int &y : ids) {
606
+ PathSegment *p = nullptr;
607
+ if (&chartr == &cm) {
608
+ p = seglist[y];
609
+ } else {
610
+ p = seglist_dir[y];
611
+ }
612
+ assert(p != nullptr);
613
+ addToResults(p, query, start, i + 1, candmap);
614
+ }
615
+ }
616
+ } else {
617
+ // assert(cn->ids_sz < 1); // TODO: should not come here?
618
+ break;
619
+ }
620
+ }
408
621
  }
409
- segsToClean.clear();
622
+ }
410
623
 
624
+ std::vector<std::pair<float, int>> candidatesToVec(CandMap &candmap) {
411
625
  // Form return result, 2d array with file id's and scores
412
626
  std::vector<std::pair<float, int>> results;
413
- for (auto &[fid, cand] : fileCandMap) {
627
+ for (auto &[fid, cand] : candmap) {
414
628
  std::pair<float, int> v;
415
629
  float sc = cand->getScore();
416
630
  v.first = sc;
@@ -419,164 +633,134 @@ public:
419
633
  delete cand;
420
634
  }
421
635
 
422
- for (auto &[fid, cand] : dirCandMap) {
423
- delete cand;
424
- }
425
-
426
636
  // Sort highest score first
427
637
  std::sort(results.begin(), results.end(),
428
638
  [](std::pair<float, int> a, std::pair<float, int> b) { return a.first > b.first; });
429
639
  return results;
430
640
  }
431
641
 
432
- // Return int64_t representation of the first nchars in str, starting from index i
433
- [[nodiscard]] int64_t getKeyAtIdx(const std::string &str, int i, int nchars) const {
434
- int64_t key = 0;
435
- for (int i_char = 0; i_char < nchars; i_char++) {
436
- key = key | static_cast<int64_t>(str[i + i_char]);
437
- if (i_char < nchars - 1) {
438
- // Shift 8 bits to the left except on the last iteration
439
- key = key << 8;
440
- }
441
- }
442
- return key;
443
- }
642
+ std::vector<std::pair<float, int>> findDirectories(std::string query) {
643
+ CandMap dirCandMap;
644
+ auto &candmap = dirCandMap;
645
+ waitUntilDone();
444
646
 
445
- void debug() {
647
+ searchCharTree(query, dirCandMap, cm_dir);
648
+ addParentScores(dirCandMap);
649
+ auto results = candidatesToVec(dirCandMap);
446
650
 
447
- int nchars = 3;
448
- for (const auto &[key, value] : (*filemaps[nchars])) {
449
- int64_t x;
450
- x = key;
451
- int multip = nchars * 8;
452
- for (int i = 0; i <= nchars; i++) {
453
- char c = (x >> multip) & 255;
454
- std::cout << c;
455
- multip -= 8;
456
- }
457
- std::cout << "\n";
458
- // for (auto y : *value) {
459
- // std::cout << y << " ";
460
- // }
461
- // std::cout << "\n";
462
- }
651
+ return results;
463
652
  }
464
653
 
465
- private:
466
- void clearPathSegmentChildren(PathSegment *p) {
467
- if (p->children.size() > 0) {
468
- for (auto x : p->children) {
469
- clearPathSegmentChildren(x.second);
470
- }
471
- }
472
- delete p;
654
+ std::vector<std::pair<float, std::string>> findFilesAndDirectories(std::string query) {
655
+ return findFilesAndDirectories(query, true, true);
473
656
  }
474
657
 
475
- void addPathSegmentKeys(PathSegment *p) {
476
- // Input p is part of a path, e.g. 'barxyz' if path is /foo/barxyz/baz.txt
477
- // This function generates int64 representations (keys) of all substrings of size 2..8 in that
478
- // path segment and stores pointer to p in hash tables using these int values as keys.
658
+ std::vector<std::pair<float, std::string>>
659
+ findFilesAndDirectories(std::string query, bool includeFiles, bool includeDirs) {
479
660
 
480
- int maxChars = 8;
481
- int minChars = 2;
661
+ CandMap fileCandMap;
662
+ CandMap dirCandMap;
663
+ waitUntilDone();
664
+ std::vector<std::pair<float, std::string>> results;
482
665
 
483
- std::string str = p->str;
484
- if (p->str.size() < 2) {
485
- return;
486
- }
487
- if (static_cast<int>(p->str.size()) < maxChars) {
488
- maxChars = p->str.size();
666
+ if (includeFiles) {
667
+ searchCharTree(query, fileCandMap, cm);
668
+ // out.printl("size:",fileCandMap.size());
489
669
  }
490
670
 
491
- for (int sublen = minChars; sublen <= maxChars; sublen++) {
671
+ searchCharTree(query, dirCandMap, cm_dir);
492
672
 
493
- std::mutex *mu;
494
- SegMap *map;
495
- if (p->type == segmentType::File) {
496
- map = filemaps[sublen];
497
- mu = &mts_f[sublen];
498
- } else {
499
- map = dirmaps[sublen];
500
- mu = &mts_d[sublen];
501
- }
673
+ if (includeFiles) {
674
+ addParentScores(fileCandMap);
675
+ }
502
676
 
503
- int count = str.size() - sublen + 1;
677
+ if (includeDirs) {
678
+ addParentScores(dirCandMap);
679
+ }
680
+
681
+ for (auto seg : segsToClean) {
682
+ seg->cand = nullptr;
683
+ }
684
+ segsToClean.clear();
504
685
 
505
- int64_t keys[count + 1];
506
- for (int i = 0; i <= count; i++) {
507
- keys[i] = getKeyAtIdx(str, i, sublen);
686
+ // TODO: Need to call this just to delete candidates
687
+ auto res_dir = candidatesToVec(dirCandMap);
688
+ if (includeDirs) {
689
+ for (const auto &[score, id] : res_dir) {
690
+ results.push_back(std::pair<float, std::string>{score, getString(id, true)});
508
691
  }
692
+ }
509
693
 
510
- mu->lock();
511
- for (int i = 0; i <= count; i++) {
512
- // int64_t key = getKeyAtIdx(str, i, sublen);
513
- auto key = keys[i];
694
+ if (includeFiles) {
695
+ auto res_file = candidatesToVec(fileCandMap);
696
+ // out.printl("size2:",fileCandMap.size());
697
+ for (const auto &[score, id] : res_file) {
514
698
 
515
- // Create a new std::set for key if doesn't exist already
516
- auto it = map->find(key);
517
- if (it == map->end()) {
518
- (*map)[key] = new std::set<PathSegment *>;
519
- }
520
- (*map)[key]->insert(p);
699
+ // out.print("|",getString(id),"|");
700
+ results.push_back(std::pair<float, std::string>{score, getString(id)});
521
701
  }
522
- mu->unlock();
523
702
  }
703
+
704
+ // Sort highest score first
705
+ std::sort(results.begin(), results.end(),
706
+ [](std::pair<float, std::string> a, std::pair<float, std::string> b) {
707
+ return a.first > b.first;
708
+ });
709
+ return results;
524
710
  }
525
711
 
526
- // Find pathsegments from <map> that include the substring of <str> which starts at index <i> and
527
- // is of length <nchars>.
528
- [[nodiscard]] std::vector<PathSegment *> findSimilarForNgram(std::string str, int i, int nchars,
529
- SegMap &map) const {
712
+ // TODO: delete?
713
+ std::vector<std::pair<float, int>> findSimilar(std::string query) { return findFiles(query); }
530
714
 
531
- assert(i + nchars <= static_cast<int>(str.size()));
532
- std::vector<PathSegment *> res;
715
+ std::vector<std::pair<float, int>> findFiles(std::string query) {
533
716
 
534
- // Take substring of str, starting at i, spanning nchars
535
- // transform that to 64 bit integer
536
- int64_t key = getKeyAtIdx(str, i, nchars);
537
- // Find all path segments in map that have the same substring
538
- if (auto it = map.find(key); it != map.end()) { // key found
539
- auto set = it->second;
540
- for (auto value : *set) {
541
- res.push_back(value);
542
- }
543
- }
544
- return res;
545
- }
717
+ CandMap fileCandMap;
718
+ CandMap dirCandMap;
719
+ auto &candmap = fileCandMap;
720
+ waitUntilDone();
721
+
722
+ searchCharTree(query, fileCandMap, cm);
723
+ searchCharTree(query, dirCandMap, cm_dir);
724
+ addParentScores(fileCandMap);
546
725
 
547
- void addToCandMap(CandMap &candmap, std::string query,
548
- std::vector<SegMap *> &map // filemaps or dirmaps
549
- ) {
550
- int maxChars = 8;
551
- int minChars = 2;
552
- if (static_cast<int>(query.size()) < maxChars) {
553
- maxChars = query.size();
726
+ for (auto seg : segsToClean) {
727
+ seg->cand = nullptr;
554
728
  }
729
+ segsToClean.clear();
555
730
 
556
- // Loop all substring lengths between minChars..maxChars
557
- for (int sublen = minChars; sublen <= maxChars; sublen++) {
558
- int count = query.size() - sublen + 1;
731
+ auto results = candidatesToVec(fileCandMap);
732
+ auto tmp = candidatesToVec(dirCandMap); // TODO: call just to release memory
559
733
 
560
- // Loop all possible start positions
561
- for (int i = 0; i < count; i++) {
562
- std::vector<PathSegment *> res = findSimilarForNgram(query, i, sublen, *(map[sublen]));
734
+ return results;
735
+ }
563
736
 
564
- for (PathSegment *p : res) {
565
- addToResults(p, query, i, sublen, candmap);
566
- }
737
+ private:
738
+ void clearPathSegmentChildren(PathSegment *p) {
739
+ if (p->children.size() > 0) {
740
+ for (auto x : p->children) {
741
+ clearPathSegmentChildren(x.second);
567
742
  }
568
743
  }
744
+ delete p;
569
745
  }
570
746
 
571
747
  // Add parent directories scores to files
572
- void mergeCandidateMaps(CandMap &fileCandMap, CandMap &dirCandMap) {
748
+ void addParentScores(CandMap &fileCandMap) {
573
749
 
574
750
  for (auto &[fid, cand] : fileCandMap) {
575
751
  PathSegment *p = cand->seg->parent;
576
752
  while (p->parent != nullptr) {
577
753
  if (p->cand != nullptr) {
754
+
578
755
  auto &scoreA = cand->v_charscore;
579
756
  auto &scoreB = p->cand->v_charscore;
757
+
758
+ // out.print("[");
759
+ // printVector(scoreA);
760
+ // out.print(",");
761
+ // printVector(scoreB);
762
+ // out.print(",");
763
+ // out.print("]");
580
764
  for (int i = 0; i < cand->len; i++) {
581
765
  if (scoreA[i] < scoreB[i] * dirWeight) {
582
766
  scoreA[i] = scoreB[i] * dirWeight;
@@ -592,6 +776,7 @@ private:
592
776
 
593
777
  if (auto it2 = candmap.find(seg->fileId); it2 == candmap.end()) {
594
778
  Candidate *cand = new Candidate(seg, str.size());
779
+ // out.printl("new cand:", seg->str, ",", seg, ",", seg->parent, ",", seg->parent->parent);
595
780
  segsToClean.push_back(seg);
596
781
  candmap[seg->fileId] = cand;
597
782
  seg->cand = cand;