RubyGems - StrIdx - Versions diffs - 0.1.1 → 0.1.3 - Mend

StrIdx 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

data/rubyext/extconf.rb CHANGED Viewed

@@ -1,15 +1,13 @@
 #!/usr/bin/env ruby
-#
 require 'mkmf'
 module_name = "stridx"
 extension_name = 'stridx'
-$CXXFLAGS << " -Wall -Wno-unused-variable -O3 -fopenmp"
+$CXXFLAGS << " -Wall -Wno-unused-variable -O3"
 have_library( 'stdc++');
-have_library( 'gomp' );
 dir_config(extension_name)       # The destination
 create_makefile(extension_name)  # Create Makefile

data/rubyext/ruby_interf.cpp CHANGED Viewed

@@ -7,7 +7,7 @@
 extern "C" {
-void str_idx_free(void *data) { delete (StringIndex *)data; }
+void str_idx_free(void *data) { delete (StrIdx::StringIndex *)data; }
 // Wrap StringIndex class inside a ruby variable
 static const rb_data_type_t str_idx_type = {
@@ -26,7 +26,7 @@ static const rb_data_type_t str_idx_type = {
 };
 VALUE str_idx_alloc(VALUE self) {
-  void *data = new StringIndex();
+  void *data = new StrIdx::StringIndex();
   return TypedData_Wrap_Struct(self, &str_idx_type, data);
 }
@@ -36,18 +36,27 @@ VALUE StringIndexAddSegments(VALUE self, VALUE str, VALUE fileId) {
   void *data;
   TypedData_Get_Struct(self, int, &str_idx_type, data);
-  ((StringIndex *)data)->addStrToIndex(s1, fid, '/');
+  // ((StringIndex *)data)->addStrToIndex(s1, fid);
+  ((StrIdx::StringIndex *)data)->addStrToIndexThreaded(s1, fid);
   return self;
 }
+VALUE StringIndexWaitUntilDone(VALUE self) { // Registered in Init_stridx as Ruby method "waitUntilDone"
+  void *data;
+  TypedData_Get_Struct(self, int, &str_idx_type, data); // Get the C++ object wrapped by self
+  ((StrIdx::StringIndex *)data)->waitUntilDone(); // Forward to StringIndex::waitUntilDone()
+  return self; // Return the receiver itself
+}
 VALUE StringIndexFind(VALUE self, VALUE str) {
   VALUE ret;
   std::string s1 = StringValueCStr(str);
   void *data;
   TypedData_Get_Struct(self, int, &str_idx_type, data);
-  StringIndex *idx = (StringIndex *)data;
+  StrIdx::StringIndex *idx = (StrIdx::StringIndex *)data;
   ret = rb_ary_new();
   const std::vector<std::pair<float, int>> &results = idx->findSimilar(s1, 2);
@@ -66,6 +75,26 @@ VALUE StringIndexFind(VALUE self, VALUE str) {
   return ret;
 }
+VALUE StringIndexSetDirSeparator(VALUE self, VALUE str) { // Registered in Init_stridx as "setDirSeparator"
+  char c = '/'; // Default; also the value kept when str is an empty String
+  if (TYPE(str) == T_STRING) {
+    std::string s = StringValueCStr(str);
+    if (s.size() >= 1) {
+      c = s[0]; // Only the first character of the String is used
+    }
+  } else {
+    c = '\0'; // Any non-String argument (e.g. nil) selects "no separator"
+    // if (TYPE(obj) == T_NIL)
+  }
+  void *data;
+  TypedData_Get_Struct(self, int, &str_idx_type, data); // Get the C++ object wrapped by self
+  StrIdx::StringIndex *idx = (StrIdx::StringIndex *)data;
+  idx->setDirSeparator(c);
+  return self; // Return the receiver itself
+}
 void Init_stridx(void) {
   VALUE mStrIdx = rb_define_module("StrIdx");
@@ -73,7 +102,12 @@ void Init_stridx(void) {
   rb_define_alloc_func(classStringIndex, str_idx_alloc);
   rb_define_method(classStringIndex, "add", StringIndexAddSegments, 2);
+  rb_define_method(classStringIndex, "waitUntilDone", StringIndexWaitUntilDone, 0);
   rb_define_method(classStringIndex, "find", StringIndexFind, 1);
+  rb_define_method(classStringIndex, "setDirSeparator", StringIndexSetDirSeparator, 1);
 }
 } // End extern "C"

data/stridx.hpp CHANGED Viewed

@@ -1,21 +1,66 @@
+#ifndef SSSTRIDX_HPP
+#define SSSTRIDX_HPP
 #include <stdio.h>
 #include <stdlib.h>
 #include <cassert>
 #include <vector>
+#include <array>
 #include <iostream>
 #include <unordered_map>
 #include <set>
 #include <algorithm>
 #include <sstream>
-#ifdef _OPENMP
-#include <omp.h>
-#endif
+#include <vector>
+#include <mutex>
+#include <thread>
+#include "thread_pool.hpp"
 #include "unordered_dense.h"
+namespace StrIdx {
+/* Alternative to using std::cout directly.
+   Messages sent through printv() are filtered by a verbosity level
+   that is set when the object is constructed. */
+class Output {
+private:
+  int verboseLevel; // printv() drops messages whose level is greater than this value
+public:
+  Output(int verb) : verboseLevel(verb) {}
+  Output() : Output(3) {} // Default verbosity level is 3
+  ~Output() = default;
+  void print() {} // No-argument overload: ends the recursion of the variadic print() below
+  // Calling print("xxx ", 3, " yyy") writes "xxx 3 yyy" to std::cout
+  template <typename T, typename... Types> void print(T var1, Types... var2) {
+    std::cout << var1;
+    print(var2...);
+  }
+  // Calling printl("xxx ", 3, " yyy") writes "xxx 3 yyy\n" to std::cout
+  template <typename... Types> void printl(Types... var2) {
+    print(var2...);
+    print("\n");
+  }
+  /* Calling printv(2, "xxx ", 3, " yyy") writes "xxx 3 yyy\n",
+   * but only if verboseLevel >= 2 (the first argument).
+   * When verboseLevel >= 3 the line is prefixed with "[v=<vlevel>] ".
+   */
+  template <typename... Types> void printv(int vlevel, Types... var2) {
+    if (verboseLevel < vlevel) {
+      return;
+    }
+    if (verboseLevel >= 3) {
+      print("[v=", vlevel, "] ");
+    }
+    printl(var2...);
+  }
+};
 // Transforms input string as follows:
 // '/foo/bar/file1.txt'
 // => vector{"foo", "bar", "file1.txt"}
@@ -34,7 +79,7 @@ std::vector<std::string> splitString(const std::string &input, const char &separ
 }
 // Convert int64_t to binary string
-std::string int64ToBinaryString(int64_t num) {
+[[nodiscard]] std::string int64ToBinaryString(int64_t num) {
   std::string result;
   for (int i = 63; i >= 0; --i) {
     result += ((num >> i) & 1) ? '1' : '0';
@@ -42,8 +87,8 @@ std::string int64ToBinaryString(int64_t num) {
   return result;
 }
-// Convert a (8 char) string represented as int64_t to std::string
-std::string int64ToStr(int64_t key) {
+// Debug. Convert a (8 char) string represented as int64_t to std::string
+[[nodiscard]] std::string int64ToStr(int64_t key) {
   int nchars = 8;
   std::string str;
   int multip = nchars * 8;
@@ -55,22 +100,24 @@ std::string int64ToStr(int64_t key) {
   return str;
 }
+// Debug. Writes every element of vec to std::cout, each followed by a single space
 void printVector(const std::vector<int> &vec) {
   for (const auto &value : vec) {
     std::cout << value << " ";
   }
 }
-std::string charToBinaryString(char num) {
+// Debug
+[[nodiscard]] std::string charToBinaryString(char chr) {
   std::string result;
   for (int i = 7; i >= 0; --i) {
-    result += ((num >> i) & 1) ? '1' : '0';
+    result += ((chr >> i) & 1) ? '1' : '0';
   }
   return result;
 }
 class Candidate;
-enum segmentType { Dir, File };
+enum class segmentType { Dir, File };
 // A segment of a file path
 // e.g. if path is /foo/bar/baz.txt
@@ -81,17 +128,18 @@ public:
   int fileId; // (if FILE)
   Candidate *cand;
   PathSegment *parent;
+  std::mutex mu;
   ankerl::unordered_dense::map<std::string, PathSegment *> children;
-  segmentType type = Dir;
-  PathSegment() : parent(NULL) {}
-  PathSegment(std::string _str) : str(_str), parent(NULL) {}
+  segmentType type = segmentType::Dir;
+  PathSegment() : parent(nullptr) {}
+  PathSegment(std::string _str) : str(_str), parent(nullptr) {}
   PathSegment(std::string _str, int _fileId)
-      : str(_str), fileId(_fileId), cand(NULL), parent(NULL) {}
-  int size() {
+      : str(_str), fileId(_fileId), cand(nullptr), parent(nullptr) {}
+  [[nodiscard]] int size() {
     int sz = str.size();
     PathSegment *cur = parent;
     // Sum up length of parent segments (+1 for divisors)
-    while (cur->parent != NULL) {
+    while (cur->parent != nullptr) {
       sz += cur->str.size() + 1;
       cur = cur->parent;
     }
@@ -118,7 +166,7 @@ public:
     // Initialize v_charscores with zeros
     v_charscore.resize(len, 0);
     candLen = str.size();
-    seg = NULL;
+    seg = nullptr;
   }
   Candidate(PathSegment *_seg, int _len) : seg(_seg), len(_len) {
@@ -127,7 +175,7 @@ public:
     candLen = seg->size();
   }
-  float getScore() {
+  [[nodiscard]] float getScore() {
     int i = 0;
     float score = 0.0;
     candLen = seg->size();
@@ -145,19 +193,21 @@ public:
     return score;
   }
-  float operator[](int idx) { return v_charscore[idx]; }
+  [[nodiscard]] float operator[](int idx) { return v_charscore[idx]; }
 };
 // This seems to give 10x speed improvement over std::unordered_map
 typedef ankerl::unordered_dense::map<int64_t, std::set<PathSegment *> *> SegMap;
 // typedef std::unordered_map<int64_t, std::set<PathSegment *> *> SegMap;
-typedef std::unordered_map<float, Candidate> CandMap;
+typedef ankerl::unordered_dense::map<int, Candidate *> CandMap;
+// typedef std::unordered_map<int, Candidate*> CandMap;
 class StringIndex {
 private:
   int tmp;
   char dirSeparator = '/'; // Usually '/', '\' or '\0' (no separator)
+  int numStrings = 0;
   std::vector<SegMap *> dirmaps;
   std::vector<SegMap *> filemaps;
@@ -170,10 +220,16 @@ private:
   int dirId = 0;
   float dirWeight = 0.7; // Give only 70% of score if match is for a directory
+  std::array<std::mutex, 9> mts_f;
+  std::array<std::mutex, 9> mts_d;
+  std::unique_ptr<ThreadPool> pool;
+  Output out{1}; // verbose level = 1
 public:
-  StringIndex() {
+  StringIndex(char sep) : dirSeparator(sep) {
     root = new PathSegment();
-    root->parent = NULL;
+    root->parent = nullptr;
     root->str = "[ROOT]";
     for (int i = 0; i <= 8; i++) {
@@ -181,11 +237,18 @@ public:
       filemaps.push_back(new SegMap);
     }
-#ifdef _OPENMP
-    std::cout << "OPENMP enabled\n";
-#endif
+    // Threads between 4 and 6
+    // We don't seem to get any benefit from more than 6 threads even if the hardware supports it
+    int num_threads = std::max((int)std::thread::hardware_concurrency(), 4);
+    num_threads = std::min(num_threads, 6);
+    out.printv(2, "Number of threads: ", num_threads);
+    pool = std::unique_ptr<ThreadPool>(new ThreadPool(num_threads));
   }
+  /* Default constructor: uses separator '\0', so strings are not split into path segments.
+     This is slower, but can also be used for data other than file paths.  */
+  StringIndex() : StringIndex('\0') {}
   void setDirSeparator(char sep) { dirSeparator = sep; }
   void setDirWeight(float val) { dirWeight = val; }
@@ -213,8 +276,15 @@ public:
     addStrToIndex(filePath, fileId, dirSeparator);
   }
+  /**
+   * Hand a string to the thread pool so that addStrToIndex runs as a pool task.
+   * findSimilar calls waitUntilDone() before searching; call waitUntilDone()
+   * yourself if you need the queued work finished earlier.
+   *
+   * @param filePath String to index (e.g. /home/user/Project/main.cpp).
+   * @param fileId Identifier for filePath, passed on to addStrToIndex.
+   */
+  void addStrToIndexThreaded(std::string filePath, int fileId) {
+    // Copy the separator into the task now. A plain [=] capture only copies `this`, so
+    // dirSeparator would be read later on a pool thread, unsynchronized with
+    // setDirSeparator(), and could differ from the value in effect when this was called.
+    pool->enqueue(
+        [this, filePath, fileId, sep = dirSeparator] { addStrToIndex(filePath, fileId, sep); });
+  }
+  void waitUntilReady() { pool->waitUntilDone(); } // Same call as waitUntilDone() below
+  void waitUntilDone() { pool->waitUntilDone(); } // Forwards to the thread pool's waitUntilDone()
   /**
-   * Add a string to the index to be search for afterwards
+   * Add a string to the index to be searched for afterwards
    *
    * @param filePath String to index (e.g. /home/user/Project/main.cpp).
   * @param fileId Unique identifier for filePath. Will be returned as a result from findSimilar.
@@ -222,8 +292,10 @@ public:
    * one of {'\\', '/', '\0' (no separation)}.
    */
   void addStrToIndex(std::string filePath, int fileId, const char &separator) {
+    out.printv(3, "Add file:", filePath, ",", fileId, ",", separator);
     std::vector<std::string> segs;
+    numStrings += 1;
     if (separator == '\0') {
       // No separation to directories & files
@@ -233,7 +305,7 @@ public:
       segs = splitString(filePath, separator);
     }
-    PathSegment *prev = NULL;
+    PathSegment *prev = nullptr;
     prev = root;
     // Add segments to a tree type data structure
     // e.g. addStrToIndex('/foo/bar/file1.txt' ..)
@@ -245,25 +317,27 @@ public:
       auto x = *_x;
       PathSegment *p;
-      auto it = prev->children.find(x);
+      prev->mu.lock();
       // this part of the path already exists in the tree
-      if (it != prev->children.end()) {
+      if (auto it = prev->children.find(x); it != prev->children.end()) {
         p = it->second;
+        prev->mu.unlock();
       } else {
         p = new PathSegment(x, fileId);
         p->parent = prev;
-        // If this is last item in segs
+        // If this is last item in segs, then it is a file.
         if (_x == std::prev(segs.end())) {
-          // therefore, it is a file.
-          p->type = File;
+          p->type = segmentType::File;
           seglist[fileId] = p;
-        } else {
-          p->type = Dir;
+        } else { // otherwise, it is a directory
+          p->type = segmentType::Dir;
           p->fileId = dirId;
           // Files use user input Id. Directories need to have it generated
           dirId++;
         }
         prev->children[x] = p;
+        prev->mu.unlock();
         addPathSegmentKeys(p);
       }
@@ -272,7 +346,7 @@ public:
   }
   /**
-  * The search will find filepaths similar to the input string
+  The search will find filepaths similar to the input string
   To be considered a candidate path, the file component of the path (e.g. file.txt)
   is required to have at least a substring of two characters in common with the
@@ -286,8 +360,8 @@ public:
       is also included in the PathSegment
     - take the length of that substring as score
   sum up the scores for each character c and divide by (string length)^2
-  For example, if query = "rngnomadriv"
+  For example, if query = "rngnomadriv"
   and candidate is "./drivers/char/hw_random/nomadik-rng.c", then scores are calculated
   as follows:
     rngnomadriv
@@ -296,17 +370,23 @@ public:
     score1=(3+3+3+5+5+5+5+5+(4+4+4)*0.7)
     In final score, give a small penalty for larger candidate filenames:
-    Divide main part of score with (query string length)^2
+    Divide main part of score with (query string length)^2
     and minor part by (query string length)*(candidate string length)
     score = score1/(11*11)*0.97 + score1/(11*38)*0.03 = 0.342944
   @param query String to search for inside the index
   */
-  std::vector<std::pair<float, int>> findSimilar(std::string query, int minChars) {
+  [[nodiscard]] std::vector<std::pair<float, int>> findSimilar(std::string query) {
+    return findSimilar(query, 2); // Convenience overload: delegates with minChars = 2
+  }
+  [[nodiscard]] std::vector<std::pair<float, int>> findSimilar(std::string query, int minChars) {
     CandMap fileCandMap;
     CandMap dirCandMap;
+    waitUntilDone();
     // Find both files and directories that match the input query
     addToCandMap(fileCandMap, query, filemaps);
     addToCandMap(dirCandMap, query, dirmaps);
@@ -315,9 +395,9 @@ public:
      scores of the file */
     mergeCandidateMaps(fileCandMap, dirCandMap);
-    // Set all candidate pointers to NULL so they won't mess up future searches
+    // Set all candidate pointers to nullptr so they won't mess up future searches
     for (auto seg : segsToClean) {
-      seg->cand = NULL;
+      seg->cand = nullptr;
     }
     segsToClean.clear();
@@ -325,11 +405,17 @@ public:
     std::vector<std::pair<float, int>> results;
     for (auto &[fid, cand] : fileCandMap) {
       std::pair<float, int> v;
-      float sc = cand.getScore();
+      float sc = cand->getScore();
       v.first = sc;
       v.second = fid;
       results.push_back(v);
+      delete cand;
     }
+    for (auto &[fid, cand] : dirCandMap) {
+      delete cand;
+    }
     // Sort highest score first
     std::sort(results.begin(), results.end(),
               [](std::pair<float, int> a, std::pair<float, int> b) { return a.first > b.first; });
@@ -337,10 +423,10 @@ public:
   }
   // Return int64_t representation of the first nchars in str, starting from index i
-  int64_t getKeyAtIdx(std::string str, int i, int nchars) {
+  [[nodiscard]] int64_t getKeyAtIdx(std::string str, int i, int nchars) {
     int64_t key = 0;
     for (int i_char = 0; i_char < nchars; i_char++) {
-      key = key | static_cast<int>(str[i + i_char]);
+      key = key | static_cast<int64_t>(str[i + i_char]);
       if (i_char < nchars - 1) {
         // Shift 8 bits to the left except on the last iteration
         key = key << 8;
@@ -395,22 +481,29 @@ private:
       maxChars = p->str.size();
     }
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
     for (int sublen = minChars; sublen <= maxChars; sublen++) {
+      std::mutex *mu;
       SegMap *map;
-      if (p->type == File) {
+      if (p->type == segmentType::File) {
         map = filemaps[sublen];
+        mu = &mts_f[sublen];
       } else {
         map = dirmaps[sublen];
+        mu = &mts_d[sublen];
       }
       int count = str.size() - sublen + 1;
+      int64_t keys[count + 1];
+      for (int i = 0; i <= count; i++) {
+        keys[i] = getKeyAtIdx(str, i, sublen);
+      }
+      mu->lock();
       for (int i = 0; i <= count; i++) {
-        int64_t key = getKeyAtIdx(str, i, sublen);
+        // int64_t key = getKeyAtIdx(str, i, sublen);
+        auto key = keys[i];
         // Create a new std::set for key if doesn't exist already
         auto it = map->find(key);
@@ -419,12 +512,14 @@ private:
         }
         (*map)[key]->insert(p);
       }
+      mu->unlock();
     }
   }
   // Find pathsegments from <map> that include the substring of <str> which starts at index <i> and
   // is of length <nchars>.
-  std::vector<PathSegment *> findSimilarForNgram(std::string str, int i, int nchars, SegMap &map) {
+  [[nodiscard]] std::vector<PathSegment *> findSimilarForNgram(std::string str, int i, int nchars,
+                                                               SegMap &map) {
     assert(i + nchars <= static_cast<int>(str.size()));
     std::vector<PathSegment *> res;
@@ -433,8 +528,7 @@ private:
     // transform that to 64 bit integer
     int64_t key = getKeyAtIdx(str, i, nchars);
     // Find all path segments in map that have the same substring
-    auto it = map.find(key);
-    if (it != map.end()) { // key found
+    if (auto it = map.find(key); it != map.end()) { // key found
       auto set = it->second;
       for (auto value : *set) {
         res.push_back(value);
@@ -471,12 +565,12 @@ private:
   void mergeCandidateMaps(CandMap &fileCandMap, CandMap &dirCandMap) {
     for (auto &[fid, cand] : fileCandMap) {
-      PathSegment *p = cand.seg->parent;
-      while (p->parent != NULL) {
-        if (p->cand != NULL) {
-          auto &scoreA = cand.v_charscore;
+      PathSegment *p = cand->seg->parent;
+      while (p->parent != nullptr) {
+        if (p->cand != nullptr) {
+          auto &scoreA = cand->v_charscore;
           auto &scoreB = p->cand->v_charscore;
-          for (int i = 0; i < cand.len; i++) {
+          for (int i = 0; i < cand->len; i++) {
             if (scoreA[i] < scoreB[i] * dirWeight) {
               scoreA[i] = scoreB[i] * dirWeight;
             }
@@ -489,18 +583,22 @@ private:
   void addToResults(PathSegment *seg, std::string str, int i, int nchars, CandMap &candmap) {
-    auto it2 = candmap.find(seg->fileId);
-    if (it2 == candmap.end()) {
-      Candidate cand(seg, str.size());
-      seg->cand = &(candmap[seg->fileId]);
+    if (auto it2 = candmap.find(seg->fileId); it2 == candmap.end()) {
+      Candidate *cand = new Candidate(seg, str.size());
       segsToClean.push_back(seg);
       candmap[seg->fileId] = cand;
+      seg->cand = cand;
     }
     for (int j = i; j < i + nchars; j++) {
-      if (candmap[seg->fileId][j] < nchars) {
-        candmap[seg->fileId].v_charscore[j] = nchars;
+      Candidate &cand = *(candmap[seg->fileId]);
+      if (cand[j] < nchars) {
+        cand.v_charscore[j] = nchars;
       }
     }
   }
 };
+} // namespace StrIdx
+#endif

data/test.rb CHANGED Viewed

@@ -13,7 +13,13 @@ for x in lines
 end
 idx_time = Time.new
-puts "\nIndexing time (#{lines.size} files}): #{(idx_time - t).round(4)} seconds"
+# Time to start the threadpool to process indexing
+puts "\nIndexing launch time (#{lines.size} files}): #{(idx_time - t).round(4)} seconds"
+idx.waitUntilDone() # Not necessary, will be called by idx.find
+idx_time = Time.new
+# Time when all threads have completed
+puts "\nIndexing completed time (#{lines.size} files}): #{(idx_time - t).round(4)} seconds"
 query = "rngnomadriv"
 res = idx.find(query)