RubyGems - StrIdx - Versions diffs - 0.1.4 → 0.1.6 - Mend

StrIdx 0.1.4 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

data/stridx.hpp CHANGED Viewed

@@ -10,6 +10,7 @@
 #include <array>
 #include <iostream>
 #include <unordered_map>
+#include <map>
 #include <set>
 #include <algorithm>
 #include <sstream>
@@ -28,10 +29,11 @@ namespace StrIdx {
 class Output {
 private:
   int verboseLevel;
+  // TODO: add mutex?
 public:
   Output(int verb) : verboseLevel(verb) {}
-  Output() : Output(3) {}
+  Output() : Output(1) {}
   ~Output() = default;
   static void print() {}
@@ -61,61 +63,234 @@ public:
   }
 };
+Output out{1};
+struct CharNode {
+  int *ids;
+  int ids_sz;
+  char c;
+  std::uint8_t size;
+  CharNode *children;
+  CharNode() : ids(nullptr), ids_sz(0), c(0), size(0), children(nullptr) {}
+  void init() {
+    ids = nullptr;
+    ids_sz = 0;
+    c = 0;
+    size = 0;
+    children = nullptr;
+  }
+  void dealloc() {
+    if (children != nullptr) {
+      for (CharNode *it = children; it != children + size; it++) {
+        it->dealloc();
+      }
+      free(children);
+    }
+    delete[] ids;
+  }
+  ~CharNode() {}
+  // Gets Id's stored in this node and all child nodes combined
+  std::set<int> getIds() {
+    std::set<int> set;
+    getIds(set);
+    return set;
+  }
+  void getIds(std::set<int> &set) {
+    for (int j = 0; j < ids_sz; j++) {
+      set.insert(ids[j]);
+    }
+    for (CharNode *it = children; it != children + size; it++) {
+      it->getIds(set);
+    }
+  }
+  // Find if character 'c' is included in children of the node
+  CharNode *find(char c) {
+    CharNode *ret = nullptr;
+    if (size > 0) {
+      for (auto it = children; it != children + size; it++) {
+        if (it->c == c) {
+          ret = it;
+          break;
+        }
+      }
+    }
+    return ret;
+  }
+};
+/* Tree type data structure consisting of strings of file path segments
+ *  (somewhat like a trie)
+ * For example, Adding one input string "abracadabr4" will add the following (size 2..8 char)
+ * substrings: abracada bracadab racadabr acadabr4 dabr4 abr4 br4 ra
+ * (CharTree::addStr is called for each separately)
+ *
+ * Which forms a tree like structure:
+ * [root]-a-b-r-a-c-a-d-a
+ *    |   |   ╰-4
+ *    |   ╰─c-a-d-a-b-r-4
+ *    ╰───b-r-a-c-a-d-a-b
+ *    |     ╰─4
+ *    ╰───r-a-c-a-d-a-b-r
+ *    ╰───d-a-b-r-4
+ *
+ * Id's pointing to path segments are stored in nodes that match the end of the inserted substring
+ *
+ * This data structure (CharTree/CharNode) is the main bottleneck in terms of memory consumption.
+ * For a dataset of 84k files with 3.5 million characters there will be about 2.3 million CharNodes.
+ * Therefore, having std::vector's or similar structures with memory overhead is not really an
+ * option.
+ */
+class CharTree {
+  Output out;
+  std::mutex mu;
+public:
+  CharNode *root;
+  CharTree() { root = new CharNode; }
+  ~CharTree() {
+    root->dealloc();
+    delete root;
+  }
+  void addStr(std::string s, int id) {
+    if (s.size() < 2) {
+      return;
+    }
+    // out.printl("add str:",s);
+    CharNode *cn = root;
+    std::lock_guard<std::mutex> mu_lock(mu);
+    for (int i = 0; i < s.size() && i < 8; i++) {
+      int c = ((char)s[i]);
+      bool found = false;
+      if (cn->size > 0) {
+        // out.printl("(1) cn->size > 0");
+        for (auto it = cn->children; it != cn->children + cn->size; it++) {
+          if (it->c == c) {
+            // out.printl("{", c, "}");
+            found = true;
+            cn = it;
+            break;
+          }
+        }
+      }
+      if (!found) {
+        // auto x = new CharNode[cn->size + 1];
+        CharNode *x = (CharNode *)malloc(sizeof(CharNode) * (cn->size + 1));
+        if (cn->size > 0) {
+          memcpy(x, cn->children, sizeof(CharNode) * (cn->size));
+          free(cn->children);
+        }
+        cn->children = x;
+        CharNode *nn = &(cn->children[cn->size]);
+        nn->init();
+        nn->c = c;
+        cn->size++;
+        cn = nn;
+      }
+      if (i == s.size() - 1 && true) {
+        out.printv(4, "i=", i, "s:", s.size(), "|");
+        bool found = false;
+        if (cn->ids_sz > 0) {
+          for (int i = 0; i < cn->ids_sz; i++) {
+            if (cn->ids[i] == id) {
+              found = true;
+              out.printv(3, "found:", id, "\n");
+            }
+          }
+        }
+        if (!found) {
+          // out.print(".a.");
+          auto x = new int[cn->ids_sz + 1];
+          if (cn->ids_sz > 0) {
+            memcpy(x, cn->ids, sizeof(int) * cn->ids_sz);
+            delete[] cn->ids;
+          }
+          cn->ids = x;
+          cn->ids[cn->ids_sz] = id;
+          cn->ids_sz++;
+          out.printv(3, "sz:", cn->ids_sz, ",");
+        }
+      }
+    } // END for
+  }
+  void debug() { debug("", root); }
+  void debug(std::string trail, CharNode *cn) {
+    // if (trail.size() > 6) {
+    // out.print("\n");
+    // return;
+    // }
+    if (cn == nullptr) {
+      return;
+    }
+    for (int i = 0; i < cn->size; i++) {
+      CharNode *child = &cn->children[i];
+      out.print("[", child->ids_sz, "]");
+      if (child->size > 0) {
+        debug(trail + child->c, child);
+      } else {
+        out.printl(trail, child->c);
+        // out.printl();
+      }
+    }
+  }
+};
 // Transforms input string as follows:
 // '/foo/bar/file1.txt'
-// => vector{"foo", "bar", "file1.txt"}
-std::vector<std::string> splitString(const std::string &input, const char &separator) {
+// => vector{"/foo", "/bar", "/file1.txt"}
+std::vector<std::string> splitString(const std::string &str, char delimiter) {
   std::vector<std::string> result;
-  std::stringstream ss(input);
-  std::string item;
+  std::string part;
-  while (std::getline(ss, item, separator)) {
-    if (item.size() > 0) {
-      result.push_back(item);
+  for (char ch : str) {
+    if (ch == delimiter) {
+      if (part.size() > 0) {
+        result.push_back(part);
+      }
+      part.clear(); // Start a new part
+      part += ch;
+    } else {
+      part += ch;
     }
   }
-  return result;
-}
-// Convert int64_t to binary string
-[[nodiscard]] std::string int64ToBinaryString(const int64_t &num) {
-  std::string result;
-  for (int i = 63; i >= 0; --i) {
-    result += ((num >> i) & 1) ? '1' : '0';
+  // If there's any remaining part after the loop, add it to the result
+  if (!part.empty()) {
+    result.push_back(part);
   }
-  return result;
-}
-// Debug. Convert a (8 char) string represented as int64_t to std::string
-[[nodiscard]] std::string int64ToStr(const int64_t &key) {
-  int nchars = 8;
-  std::string str;
-  int multip = nchars * 8;
-  for (int i = 0; i <= nchars; i++) {
-    char c = (key >> multip) & 255;
-    str.push_back(c);
-    multip -= 8;
-  }
-  return str;
+  // for (const auto &value : result) {
+  // std::cout << value << "|";
+  // }
+  // std::cout << std::endl;
+  return result;
 }
 // Debug
-void printVector(const std::vector<int> &vec) {
+void printVector(const std::vector<float> &vec) {
   for (const auto &value : vec) {
     std::cout << value << " ";
   }
 }
-// Debug
-[[nodiscard]] std::string charToBinaryString(const char &chr) {
-  std::string result;
-  for (int i = 7; i >= 0; --i) {
-    result += ((chr >> i) & 1) ? '1' : '0';
-  }
-  return result;
-}
 class Candidate;
 enum class segmentType { Dir, File };
@@ -128,7 +303,8 @@ struct PathSegment {
   Candidate *cand;
   PathSegment *parent;
   std::mutex mu;
-  ankerl::unordered_dense::map<std::string, PathSegment *> children;
+  std::map<std::string, PathSegment *> children;
   segmentType type = segmentType::Dir;
   PathSegment() : parent(nullptr) {}
   PathSegment(std::string _str) : str(_str), parent(nullptr) {}
@@ -137,9 +313,9 @@ struct PathSegment {
   [[nodiscard]] int size() const {
     int sz = str.size();
     PathSegment *cur = parent;
-    // Sum up length of parent segments (+1 for divisors)
+    // Sum up length of parent segments
     while (cur->parent != nullptr) {
-      sz += cur->str.size() + 1;
+      sz += cur->str.size();
       cur = cur->parent;
     }
     return sz;
@@ -148,7 +324,11 @@ struct PathSegment {
 // Candidate for result in string (filename) search
 struct Candidate {
+	//This holds the subscores for each character in the query string
   std::vector<float> v_charscore;
   PathSegment *seg;
   int fileId;
   // The string that this candidate represents
@@ -159,6 +339,7 @@ struct Candidate {
   float maxscore;
   int candLen; // Length of candidate
+  ~Candidate(){};
   Candidate(){};
   Candidate(PathSegment *_seg, int _len) : seg(_seg), len(_len) {
     // Initialize v_charscores with zeros
@@ -166,6 +347,7 @@ struct Candidate {
     candLen = seg->size();
   }
+	// Sum subscores in v_charscore and normalize to get final score
   [[nodiscard]] float getScore() const {
     int i = 0;
     float score = 0.0;
@@ -178,6 +360,7 @@ struct Candidate {
     float div2 = len * candLen;
     float score1 = score / div;
     float score2 = score / div2;
+    // out.printl("str:",seg->str," len:",len," candLen:", candLen, " score:", score);
     score = score1 * 0.97 + score2 * 0.03;
     return score;
@@ -186,12 +369,8 @@ struct Candidate {
   [[nodiscard]] float operator[](int idx) const { return v_charscore[idx]; }
 };
-// This seems to give 10x speed improvement over std::unordered_map
-typedef ankerl::unordered_dense::map<int64_t, std::set<PathSegment *> *> SegMap;
-// typedef std::unordered_map<int64_t, std::set<PathSegment *> *> SegMap;
 typedef ankerl::unordered_dense::map<int, Candidate *> CandMap;
-// typedef std::unordered_map<int, Candidate*> CandMap;
+// typedef std::unordered_map<int, Candidate *> CandMap;
 class StringIndex {
 private:
@@ -199,32 +378,29 @@ private:
   char dirSeparator = '/'; // Usually '/', '\' or '\0' (no separator)
   int numStrings = 0;
-  std::vector<SegMap *> dirmaps;
-  std::array<std::mutex, 9> mts_d; // for dirmaps
-  std::vector<SegMap *> filemaps;
-  std::array<std::mutex, 9> mts_f; // for filemaps
   std::vector<PathSegment *> segsToClean;
+	// Maps id's stored in charTree to corresponding PathSegment's
   std::unordered_map<int, PathSegment *> seglist;
+  std::unordered_map<int, PathSegment *> seglist_dir;
+  std::mutex seglist_mu;
   PathSegment *root;
   int dirId = 0;
-  float dirWeight = 0.7; // Give only 70% of score if match is for a directory
+  float dirWeight = 1.0; // =0.7: Give only 70% of score if match is for a directory
   std::unique_ptr<ThreadPool> pool;
   Output out{1}; // verbose level = 1
+  std::mutex cm_mu;
 public:
+  CharTree cm;     // for files
+  CharTree cm_dir; // for directories
   StringIndex(char sep) : dirSeparator(sep) {
     root = new PathSegment();
     root->parent = nullptr;
     root->str = "[ROOT]";
-    for (int i = 0; i <= 8; i++) {
-      dirmaps.push_back(new SegMap);
-      filemaps.push_back(new SegMap);
-    }
     // Threads between 4 and 6
     // We don't seem to get any benefit from more than 6 threads even if the hardware supports it
     int num_threads = std::max((int)std::thread::hardware_concurrency(), 4);
@@ -233,45 +409,31 @@ public:
     pool = std::unique_ptr<ThreadPool>(new ThreadPool(num_threads));
   }
-  /* Don't separate path to segments separator=\0.
+  /* Don't separate path to segments when separator=\0.
      This is slower, but can be used for other data than files also.  */
   StringIndex() : StringIndex('\0') {}
   void setDirSeparator(char sep) { dirSeparator = sep; }
   void setDirWeight(float val) { dirWeight = val; }
-  ~StringIndex() {
-    for (auto x : dirmaps) {
-      for (auto y : *x) {
-        y.second->clear();
-        delete (y.second);
-      }
-      x->clear();
-      delete x;
-    }
-    for (auto x : filemaps) {
-      for (auto y : *x) {
-        y.second->clear();
-        delete (y.second);
-      }
-      x->clear();
-      delete x;
-    }
-    clearPathSegmentChildren(root);
-  }
+  ~StringIndex() { clearPathSegmentChildren(root); }
   void addStrToIndex(std::string filePath, int fileId) {
     addStrToIndex(filePath, fileId, dirSeparator);
   }
   void addStrToIndexThreaded(std::string filePath, int fileId) {
-    pool->enqueue([=] { addStrToIndex(filePath, fileId, dirSeparator); });
+    pool->enqueue([filePath, fileId, this] { addStrToIndex(filePath, fileId, dirSeparator); });
+    // addStrToIndex(filePath, fileId, dirSeparator);
   }
   void waitUntilReady() const { pool->waitUntilDone(); }
   void waitUntilDone() const { pool->waitUntilDone(); }
-  int size() const { return seglist.size(); }
+  int size() {
+    std::lock_guard<std::mutex> guard(seglist_mu);
+    return seglist.size();
+  }
   /**
    * Add a string to the index to be searched for afterwards
@@ -283,11 +445,18 @@ public:
    */
   void addStrToIndex(std::string filePath, int fileId, const char &separator) {
-    out.printv(3, "Add file:", filePath, ",", fileId, ",", separator, ",",dirSeparator);
+    std::lock_guard<std::mutex> guard(cm_mu);
+    out.printv(3, "Add file:", filePath, ",", fileId, ",", separator, ",", dirSeparator);
     // If a string with this index has been added already
-    if (seglist.find(fileId) != seglist.end()) {
-      return;
+    {
+      std::lock_guard<std::mutex> guard(seglist_mu);
+      if (seglist.find(fileId) != seglist.end()) {
+        out.printl("seglist.find(fileId) != seglist.end()");
+        return;
+      }
     }
     std::vector<std::string> segs;
@@ -319,36 +488,74 @@ public:
       if (auto it = prev->children.find(x); it != prev->children.end()) {
         p = it->second;
         prev->mu.unlock();
-      } else {
+      } else { // File or dir not included in tree yet
         p = new PathSegment(x, fileId);
         p->parent = prev;
         // If this is last item in segs, then it is a file.
         if (_x == std::prev(segs.end())) {
           p->type = segmentType::File;
-          seglist[fileId] = p;
+          {
+            std::lock_guard<std::mutex> guard(seglist_mu);
+            seglist[fileId] = p;
+            for (int i = 0; i < x.size() + 1; i++) {
+              auto s = x.substr(i, std::min(static_cast<size_t>(8), x.size() - i));
+              cm.addStr(s, fileId);
+            }
+          }
         } else { // otherwise, it is a directory
           p->type = segmentType::Dir;
           p->fileId = dirId;
-          // Files use user input Id. Directories need to have it generated
+          /* Add "/" to the end of the string so that
+           * /path/to/file will be indexed as:
+           * {"/path/", "/to/", "/file"}
+           */
+          auto dir_str = x + "/";
+          {
+            std::lock_guard<std::mutex> guard(seglist_mu);
+            seglist_dir[dirId] = p;
+            // Files use user input Id. Directories need to have it generated
+          }
+          // TODO: Create a function
+          for (int i = 0; i < dir_str.size() + 1; i++) {
+            auto s = dir_str.substr(i, std::min(static_cast<size_t>(8), dir_str.size() - i));
+            cm_dir.addStr(s, dirId);
+          }
           dirId++;
         }
         prev->children[x] = p;
         prev->mu.unlock();
-        addPathSegmentKeys(p);
-      }
+      } // END of first if
       prev = p;
     }
   }
-  std::string getString(int id) {
+  std::string getString(int id) { return getString(id, false); }
+  // Reconstruct original filepath from segments
+  std::string getString(int id, bool isDir) {
     std::string s = "";
-    PathSegment *seg = seglist[id];
+    std::lock_guard<std::mutex> guard(seglist_mu);
+    PathSegment *seg = nullptr;
+    if (isDir) {
+      seg = seglist_dir[id];
+    } else {
+      seg = seglist[id];
+    }
     s += seg->str;
     while (seg->parent->parent != nullptr) {
       seg = seg->parent;
-      s = seg->str + dirSeparator + s;
+      s = seg->str + s;
+      // out.print(seg, "(", seg->str, ")", ",");
     }
+    // out.printl(s);
     return s;
   }
@@ -384,33 +591,59 @@ public:
   @param query String to search for inside the index
   */
-  [[nodiscard]] std::vector<std::pair<float, int>> findSimilar(std::string query) {
-    return findSimilar(query, 2);
-  }
-  [[nodiscard]] std::vector<std::pair<float, int>> findSimilar(std::string query, int minChars) {
-    CandMap fileCandMap;
-    CandMap dirCandMap;
-    waitUntilDone();
-    // Find both files and directories that match the input query
-    addToCandMap(fileCandMap, query, filemaps);
-    addToCandMap(dirCandMap, query, dirmaps);
-    /* If parent dir of a file matches the input string add the scores of the direcotry to the
-     scores of the file */
-    mergeCandidateMaps(fileCandMap, dirCandMap);
-    // Set all candidate pointers to nullptr so they won't mess up future searches
-    for (auto seg : segsToClean) {
-      seg->cand = nullptr;
+  void searchCharTree(const std::string &query, CandMap &candmap, CharTree &chartr) {
+    int last_start = query.size() - 2;
+    // Loop all possible start positions in query string. Indexes [0..(n-2)]
+    for (int start = 0; start <= last_start; start++) {
+      CharNode *cn = chartr.root;
+      // select a suffix (substring) starting from start, but cap length to 8 chars
+      int end = std::min(start + 7, ((int)query.size()) - 1);
+      int nchars = end - start + 1;
+      std::string s = query.substr(start, nchars);
+			// Loop all chars of the query substring,
+			// traversing the tree from the root, one matching child node per character
+      for (int i = 0; i < s.size(); i++) {
+        char c = s[i];
+        CharNode *x = cn->find(c);
+        if (x != nullptr) {
+          cn = x;
+          // Consider scores only for substrings with size >= 2
+          if (i > 0) {
+          	// If we've reached here, the matched substring s[0..i] has length i+1 (>= 2)
+          	// Get identifiers of path segments that include the substring
+          	// query[start..(start+i)]
+            std::set<int> ids = cn->getIds();
+            for (const int &y : ids) {
+              PathSegment *p = nullptr;
+							// Searching in file segments
+							// (or no file/dir separation)
+              if (&chartr == &cm) {
+                p = seglist[y];
+              } else {
+							// Searching in dir segments
+                p = seglist_dir[y];
+              }
+              assert(p != nullptr);
+              addToResults(p, query, start, i + 1, candmap);
+            }
+          }
+        } else {
+          // assert(cn->ids_sz < 1); // TODO: should not come here?
+          break;
+        }
+      }
     }
-    segsToClean.clear();
+  }
+  std::vector<std::pair<float, int>> candidatesToVec(CandMap &candmap) {
     // Form return result, 2d array with file id's and scores
     std::vector<std::pair<float, int>> results;
-    for (auto &[fid, cand] : fileCandMap) {
+    for (auto &[fid, cand] : candmap) {
       std::pair<float, int> v;
       float sc = cand->getScore();
       v.first = sc;
@@ -419,164 +652,134 @@ public:
       delete cand;
     }
-    for (auto &[fid, cand] : dirCandMap) {
-      delete cand;
-    }
     // Sort highest score first
     std::sort(results.begin(), results.end(),
               [](std::pair<float, int> a, std::pair<float, int> b) { return a.first > b.first; });
     return results;
   }
-  // Return int64_t representation of the first nchars in str, starting from index i
-  [[nodiscard]] int64_t getKeyAtIdx(const std::string &str, int i, int nchars) const {
-    int64_t key = 0;
-    for (int i_char = 0; i_char < nchars; i_char++) {
-      key = key | static_cast<int64_t>(str[i + i_char]);
-      if (i_char < nchars - 1) {
-        // Shift 8 bits to the left except on the last iteration
-        key = key << 8;
-      }
-    }
-    return key;
-  }
+  std::vector<std::pair<float, int>> findDirectories(std::string query) {
+    CandMap dirCandMap;
+    auto &candmap = dirCandMap;
+    waitUntilDone();
-  void debug() {
+    searchCharTree(query, dirCandMap, cm_dir);
+    addParentScores(dirCandMap);
+    auto results = candidatesToVec(dirCandMap);
-    int nchars = 3;
-    for (const auto &[key, value] : (*filemaps[nchars])) {
-      int64_t x;
-      x = key;
-      int multip = nchars * 8;
-      for (int i = 0; i <= nchars; i++) {
-        char c = (x >> multip) & 255;
-        std::cout << c;
-        multip -= 8;
-      }
-      std::cout << "\n";
-      // for (auto y : *value) {
-      // std::cout << y << " ";
-      // }
-      // std::cout << "\n";
-    }
+    return results;
   }
-private:
-  void clearPathSegmentChildren(PathSegment *p) {
-    if (p->children.size() > 0) {
-      for (auto x : p->children) {
-        clearPathSegmentChildren(x.second);
-      }
-    }
-    delete p;
+  std::vector<std::pair<float, std::string>> findFilesAndDirectories(std::string query) {
+    return findFilesAndDirectories(query, true, true);
   }
-  void addPathSegmentKeys(PathSegment *p) {
-    // Input p is part of a path, e.g. 'barxyz' if path is /foo/barxyz/baz.txt
-    // This function generates int64 representations (keys) of all substrings of size 2..8 in that
-    // path segment and stores pointer to p in hash tables using these int values as keys.
+  std::vector<std::pair<float, std::string>>
+  findFilesAndDirectories(std::string query, bool includeFiles, bool includeDirs) {
-    int maxChars = 8;
-    int minChars = 2;
+    CandMap fileCandMap;
+    CandMap dirCandMap;
+    waitUntilDone();
+    std::vector<std::pair<float, std::string>> results;
-    std::string str = p->str;
-    if (p->str.size() < 2) {
-      return;
-    }
-    if (static_cast<int>(p->str.size()) < maxChars) {
-      maxChars = p->str.size();
+    if (includeFiles) {
+      searchCharTree(query, fileCandMap, cm);
+      // out.printl("size:",fileCandMap.size());
     }
-    for (int sublen = minChars; sublen <= maxChars; sublen++) {
+    searchCharTree(query, dirCandMap, cm_dir);
-      std::mutex *mu;
-      SegMap *map;
-      if (p->type == segmentType::File) {
-        map = filemaps[sublen];
-        mu = &mts_f[sublen];
-      } else {
-        map = dirmaps[sublen];
-        mu = &mts_d[sublen];
-      }
+    if (includeFiles) {
+      addParentScores(fileCandMap);
+    }
-      int count = str.size() - sublen + 1;
+    if (includeDirs) {
+      addParentScores(dirCandMap);
+    }
+    for (auto seg : segsToClean) {
+      seg->cand = nullptr;
+    }
+    segsToClean.clear();
-      int64_t keys[count + 1];
-      for (int i = 0; i <= count; i++) {
-        keys[i] = getKeyAtIdx(str, i, sublen);
+    // TODO: Need to call this just to delete candidates
+    auto res_dir = candidatesToVec(dirCandMap);
+    if (includeDirs) {
+      for (const auto &[score, id] : res_dir) {
+        results.push_back(std::pair<float, std::string>{score, getString(id, true)});
       }
+    }
-      mu->lock();
-      for (int i = 0; i <= count; i++) {
-        // int64_t key = getKeyAtIdx(str, i, sublen);
-        auto key = keys[i];
+    if (includeFiles) {
+      auto res_file = candidatesToVec(fileCandMap);
+      // out.printl("size2:",fileCandMap.size());
+      for (const auto &[score, id] : res_file) {
-        // Create a new std::set for key if doesn't exist already
-        auto it = map->find(key);
-        if (it == map->end()) {
-          (*map)[key] = new std::set<PathSegment *>;
-        }
-        (*map)[key]->insert(p);
+        // out.print("|",getString(id),"|");
+        results.push_back(std::pair<float, std::string>{score, getString(id)});
       }
-      mu->unlock();
     }
+    // Sort highest score first
+    std::sort(results.begin(), results.end(),
+              [](std::pair<float, std::string> a, std::pair<float, std::string> b) {
+                return a.first > b.first;
+              });
+    return results;
   }
-  // Find pathsegments from <map> that include the substring of <str> which starts at index <i> and
-  // is of length <nchars>.
-  [[nodiscard]] std::vector<PathSegment *> findSimilarForNgram(std::string str, int i, int nchars,
-                                                               SegMap &map) const {
+  // TODO: delete?
+  std::vector<std::pair<float, int>> findSimilar(std::string query) { return findFiles(query); }
-    assert(i + nchars <= static_cast<int>(str.size()));
-    std::vector<PathSegment *> res;
+  std::vector<std::pair<float, int>> findFiles(std::string query) {
-    // Take substring of str, starting at i, spanning nchars
-    // transform that to 64 bit integer
-    int64_t key = getKeyAtIdx(str, i, nchars);
-    // Find all path segments in map that have the same substring
-    if (auto it = map.find(key); it != map.end()) { // key found
-      auto set = it->second;
-      for (auto value : *set) {
-        res.push_back(value);
-      }
-    }
-    return res;
-  }
+    CandMap fileCandMap;
+    CandMap dirCandMap;
+    auto &candmap = fileCandMap;
+    waitUntilDone();
+    searchCharTree(query, fileCandMap, cm);
+    searchCharTree(query, dirCandMap, cm_dir);
+    addParentScores(fileCandMap);
-  void addToCandMap(CandMap &candmap, std::string query,
-                    std::vector<SegMap *> &map // filemaps or dirmaps
-  ) {
-    int maxChars = 8;
-    int minChars = 2;
-    if (static_cast<int>(query.size()) < maxChars) {
-      maxChars = query.size();
+    for (auto seg : segsToClean) {
+      seg->cand = nullptr;
     }
+    segsToClean.clear();
-    // Loop all substring lengths between minChars..maxChars
-    for (int sublen = minChars; sublen <= maxChars; sublen++) {
-      int count = query.size() - sublen + 1;
+    auto results = candidatesToVec(fileCandMap);
+    auto tmp = candidatesToVec(dirCandMap); // TODO: call just to release memory
-      // Loop all possible start positions
-      for (int i = 0; i < count; i++) {
-        std::vector<PathSegment *> res = findSimilarForNgram(query, i, sublen, *(map[sublen]));
+    return results;
+  }
-        for (PathSegment *p : res) {
-          addToResults(p, query, i, sublen, candmap);
-        }
+private:
+  void clearPathSegmentChildren(PathSegment *p) {
+    if (p->children.size() > 0) {
+      for (auto x : p->children) {
+        clearPathSegmentChildren(x.second);
       }
     }
+    delete p;
   }
   // Add parent directories scores to files
-  void mergeCandidateMaps(CandMap &fileCandMap, CandMap &dirCandMap) {
+  void addParentScores(CandMap &fileCandMap) {
     for (auto &[fid, cand] : fileCandMap) {
       PathSegment *p = cand->seg->parent;
       while (p->parent != nullptr) {
         if (p->cand != nullptr) {
           auto &scoreA = cand->v_charscore;
           auto &scoreB = p->cand->v_charscore;
+          // out.print("[");
+          // printVector(scoreA);
+          // out.print(",");
+          // printVector(scoreB);
+          // out.print(",");
+          // out.print("]");
           for (int i = 0; i < cand->len; i++) {
             if (scoreA[i] < scoreB[i] * dirWeight) {
               scoreA[i] = scoreB[i] * dirWeight;
@@ -592,6 +795,7 @@ private:
     if (auto it2 = candmap.find(seg->fileId); it2 == candmap.end()) {
       Candidate *cand = new Candidate(seg, str.size());
+      // out.printl("new cand:", seg->str, ",", seg, ",", seg->parent, ",", seg->parent->parent);
       segsToClean.push_back(seg);
       candmap[seg->fileId] = cand;
       seg->cand = cand;