RubyGems - StrIdx - Versions diffs - 0.1.2 → 0.1.3 - Mend

StrIdx 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 2155c54900f595ae954d2074840f425195a66d6bf464e35dc68a1bb10236255c
-  data.tar.gz: fc2b50bcf6083b806643886a275e2a03ce3267cbd66dd81a0d63f5f68f8e4268
+  metadata.gz: 52d40e64a5ace0231828bdbbe6fd94475ab5986d0c1fb7e35e0ce18463a97ae0
+  data.tar.gz: e1cdcc2ed9f377b2acb049a9fb6de22f24acdbd6e3552748b1307342c10b6cf7
 SHA512:
-  metadata.gz: ac27e251a448f1ca17d4672d12ac165412f2c7fbd71359e77d21f8a8c30521797befe7f2555d181be2af0cace345636558b4a8d93d8e141f707e8b28ffe5687b
-  data.tar.gz: f8e65efbed5079c0a2e37504aa955e0855d0ef088ad07d1d2566dee6c5f51b79d9ba71ef102e5aaeef8d1b7d548eefa2fab3930e21b7de05b940da882c9b86f3
+  metadata.gz: f3c27923a568fe5916c17e91766066362a965abf9568b21a4daa269cd16a8a4778248ae935b26502aa482aad4807908d401989e7ebfb88d1fbdb011b0c240b60
+  data.tar.gz: f94dda8d71931c18ae3dc6b58204edda7ffd649bc7452a74fdba4929d6092183e99bf99d7b3632be5bafccfd0be7a877f5513c8c3e72e814dcca08bd79a9b217

data/Makefile CHANGED Viewed

@@ -1,7 +1,7 @@
 all: demo
-demo: *.hpp *.cpp
-	g++  -Wall -O3 -fopenmp -lstdc++ demo.cpp -o demo
+demo: *.hpp *.cpp Makefile
+	g++  -Wall -O3 -lstdc++ demo.cpp -o demo
 clean:
 	rm demo

data/README.md CHANGED Viewed

@@ -43,6 +43,15 @@ Install:
 gem install StrIdx
 ```
+Or, for development version:
+```
+git clone https://github.com/SamiSieranoja/stridx.git
+cd stridx
+cd rubyext;  ruby extconf.rb ; make ; cd ..
+gem build stridx.gemspec
+gem install $(ls -1tr StrIdx*gem | tail -n 1)
+```
 Usage example (see test.rb):
 ```ruby
 require "stridx"
@@ -58,7 +67,7 @@ for x in lines
 end
 idx_time = Time.new
-puts "\nIndexing time (#{lines.size} files}): #{(idx_time - t).round(4)} seconds"
+puts "\nIndexing time (#{lines.size} files): #{(idx_time - t).round(4)} seconds"
 query = "rngnomadriv"
 res = idx.find(query)

data/demo.cpp CHANGED Viewed

@@ -1,3 +1,12 @@
+#include <condition_variable>
+#include <functional>
+#include <iostream>
+#include <mutex>
+#include <queue>
+#include <thread>
+#include <algorithm>
 #include "stridx.hpp"
 #include <iostream>
@@ -28,7 +37,7 @@ std::vector<std::string> readLinesFromFile(const std::string &filename) {
 }
 int main() {
-  StringIndex idx;
+  StrIdx::StringIndex idx('/'); // Separate directories using unix style "/" char
   // idx.addStrToIndex("./gdk/x11/gdkasync.c", 0 /*id*/, '/' /*separator*/);
   // idx.addStrToIndex("./gdk/x11/gdksettings.c", 1, '/');
   // idx.addStrToIndex("./gdk/x11/gdkx11devicemanager-xi2.h", 2, '/');
@@ -37,26 +46,39 @@ int main() {
   std::string fn_filePaths = "flist.txt";
   std::vector<std::string> v_filePaths = readLinesFromFile(fn_filePaths);
+	// Launch indexing to be run on background
+  cout << "File paths: " << v_filePaths.size() << std::endl;
+  cout << "Start indexing in the background" << std::endl;
   auto start = std::chrono::high_resolution_clock::now();
   int id = 0;
   for (const auto &filePath : v_filePaths) {
-    idx.addStrToIndex(filePath, id, '/' /*dir separator*/);
-    // idx.addStrToIndex(filePath, id, '\0' /*dir separator*/);
+    idx.addStrToIndexThreaded(filePath, id);
     id++;
   }
+  auto idx_time_launch = std::chrono::high_resolution_clock::now();
+  std::chrono::duration<double, std::milli> duration_launch = idx_time_launch - start;
+  cout << "Indexing launch time (seconds): " << duration_launch.count() / 1000 << "\n";
+	// Wait until indexing has finished
+  idx.waitUntilDone();
   auto idx_time = std::chrono::high_resolution_clock::now();
   std::chrono::duration<double, std::milli> duration = idx_time - start;
-  cout << "Indexing creation time for " << v_filePaths.size() << " file paths (seconds): " << duration.count() / 1000 << "\n";
+  cout << "Indexing finished time for " << v_filePaths.size()
+       << " file paths (seconds): " << duration.count() / 1000 << "\n";
   // Find matching filepaths from the index for the query string "rngnomadriv"
   start = std::chrono::high_resolution_clock::now();
   std::string query = "rngnomadriv";
+  for (int i = 0; i < 99; i++) {
+    const vector<pair<float, int>> &results = idx.findSimilar(query, 2);
+  }
   const vector<pair<float, int>> &results = idx.findSimilar(query, 2);
   auto search_time = std::chrono::high_resolution_clock::now();
   duration = search_time - start;
-  cout << "Search time (seconds): " << duration.count() / 1000
-       << "\n";
+  cout << "Search time for 100 queries (seconds): " << duration.count() / 1000 << "\n";
   int i = 0;
   std::cout << "query string: " << query << "\n";
@@ -73,4 +95,4 @@ int main() {
 }
 // Compile:
-// g++  -Wall -Wno-unused-variable -O3 -fopenmp -lstdc++ demo.cpp -o demo
+// g++  -Wall -Wno-unused-variable -O3 -lstdc++ demo.cpp -o demo

data/rubyext/extconf.rb CHANGED Viewed

@@ -1,15 +1,13 @@
 #!/usr/bin/env ruby
-#
 require 'mkmf'
 module_name = "stridx"
 extension_name = 'stridx'
-$CXXFLAGS << " -Wall -Wno-unused-variable -O3 -fopenmp"
+$CXXFLAGS << " -Wall -Wno-unused-variable -O3"
 have_library( 'stdc++');
-have_library( 'gomp' );
 dir_config(extension_name)       # The destination
 create_makefile(extension_name)  # Create Makefile

data/rubyext/ruby_interf.cpp CHANGED Viewed

@@ -7,7 +7,7 @@
 extern "C" {
-void str_idx_free(void *data) { delete (StringIndex *)data; }
+void str_idx_free(void *data) { delete (StrIdx::StringIndex *)data; }
 // Wrap StringIndex class inside a ruby variable
 static const rb_data_type_t str_idx_type = {
@@ -26,7 +26,7 @@ static const rb_data_type_t str_idx_type = {
 };
 VALUE str_idx_alloc(VALUE self) {
-  void *data = new StringIndex();
+  void *data = new StrIdx::StringIndex();
   return TypedData_Wrap_Struct(self, &str_idx_type, data);
 }
@@ -36,18 +36,27 @@ VALUE StringIndexAddSegments(VALUE self, VALUE str, VALUE fileId) {
   void *data;
   TypedData_Get_Struct(self, int, &str_idx_type, data);
-  ((StringIndex *)data)->addStrToIndex(s1, fid);
+  // ((StringIndex *)data)->addStrToIndex(s1, fid);
+  ((StrIdx::StringIndex *)data)->addStrToIndexThreaded(s1, fid);
   return self;
 }
+VALUE StringIndexWaitUntilDone(VALUE self) { // Ruby-callable wrapper; registered as "waitUntilDone" (0 arguments) in Init_stridx
+  void *data; // receives the native pointer wrapped inside `self`
+  TypedData_Get_Struct(self, int, &str_idx_type, data);
+  ((StrIdx::StringIndex *)data)->waitUntilDone(); // forwards to StrIdx::StringIndex::waitUntilDone()
+  return self; // hands the receiver back to the Ruby caller
+}
 VALUE StringIndexFind(VALUE self, VALUE str) {
   VALUE ret;
   std::string s1 = StringValueCStr(str);
   void *data;
   TypedData_Get_Struct(self, int, &str_idx_type, data);
-  StringIndex *idx = (StringIndex *)data;
+  StrIdx::StringIndex *idx = (StrIdx::StringIndex *)data;
   ret = rb_ary_new();
   const std::vector<std::pair<float, int>> &results = idx->findSimilar(s1, 2);
@@ -80,7 +89,7 @@ VALUE StringIndexSetDirSeparator(VALUE self, VALUE str) {
   void *data;
   TypedData_Get_Struct(self, int, &str_idx_type, data);
-  StringIndex *idx = (StringIndex *)data;
+  StrIdx::StringIndex *idx = (StrIdx::StringIndex *)data;
   idx->setDirSeparator(c);
   return self;
@@ -93,8 +102,12 @@ void Init_stridx(void) {
   rb_define_alloc_func(classStringIndex, str_idx_alloc);
   rb_define_method(classStringIndex, "add", StringIndexAddSegments, 2);
+  rb_define_method(classStringIndex, "waitUntilDone", StringIndexWaitUntilDone, 0);
   rb_define_method(classStringIndex, "find", StringIndexFind, 1);
   rb_define_method(classStringIndex, "setDirSeparator", StringIndexSetDirSeparator, 1);
 }
 } // End extern "C"

data/stridx.hpp CHANGED Viewed

@@ -1,21 +1,66 @@
+#ifndef SSSTRIDX_HPP
+#define SSSTRIDX_HPP
 #include <stdio.h>
 #include <stdlib.h>
 #include <cassert>
 #include <vector>
+#include <array>
 #include <iostream>
 #include <unordered_map>
 #include <set>
 #include <algorithm>
 #include <sstream>
-#ifdef _OPENMP
-#include <omp.h>
-#endif
+#include <vector>
+#include <mutex>
+#include <thread>
+#include "thread_pool.hpp"
 #include "unordered_dense.h"
+namespace StrIdx {
+/* Minimal console writer used in place of calling std::cout directly.
+   Only printv() honours the verbosity level; print() and printl() always write. */
+class Output {
+private:
+  int verboseLevel; // threshold compared against the level passed to printv()
+public:
+  Output(int verb) : verboseLevel(verb) {}
+  Output() : Output(3) {} // default verbosity level is 3
+  ~Output() = default;
+  void print() {} // zero-argument overload that ends the variadic recursion below
+  // print("xxx ", 3, " yyy") writes "xxx 3 yyy" to std::cout, one argument at a time
+  template <typename T, typename... Types> void print(T var1, Types... var2) {
+    std::cout << var1;
+    print(var2...);
+  }
+  // printl("xxx ", 3, " yyy") writes "xxx 3 yyy\n" (print() followed by a newline)
+  template <typename... Types> void printl(Types... var2) {
+    print(var2...);
+    print("\n");
+  }
+  /* printv(2, "xxx ", 3, " yyy") writes "xxx 3 yyy\n" only when
+   * verboseLevel >= 2 (the first argument); otherwise it writes nothing.
+   * When verboseLevel >= 3 the line is additionally prefixed with "[v=<vlevel>] ".
+   */
+  template <typename... Types> void printv(int vlevel, Types... var2) {
+    if (verboseLevel < vlevel) {
+      return;
+    }
+    if (verboseLevel >= 3) {
+      print("[v=", vlevel, "] ");
+    }
+    printl(var2...);
+  }
+};
 // Transforms input string as follows:
 // '/foo/bar/file1.txt'
 // => vector{"foo", "bar", "file1.txt"}
@@ -34,7 +79,7 @@ std::vector<std::string> splitString(const std::string &input, const char &separ
 }
 // Convert int64_t to binary string
-std::string int64ToBinaryString(int64_t num) {
+[[nodiscard]] std::string int64ToBinaryString(int64_t num) {
   std::string result;
   for (int i = 63; i >= 0; --i) {
     result += ((num >> i) & 1) ? '1' : '0';
@@ -42,8 +87,8 @@ std::string int64ToBinaryString(int64_t num) {
   return result;
 }
-// Convert a (8 char) string represented as int64_t to std::string
-std::string int64ToStr(int64_t key) {
+// Debug. Convert a (8 char) string represented as int64_t to std::string
+[[nodiscard]] std::string int64ToStr(int64_t key) {
   int nchars = 8;
   std::string str;
   int multip = nchars * 8;
@@ -55,22 +100,24 @@ std::string int64ToStr(int64_t key) {
   return str;
 }
+// Debug helper: writes every element of vec to std::cout, each followed by one space (no newline is written).
 void printVector(const std::vector<int> &vec) {
   for (const auto &value : vec) {
     std::cout << value << " ";
   }
 }
-std::string charToBinaryString(char num) {
+// Debug
+[[nodiscard]] std::string charToBinaryString(char chr) {
   std::string result;
   for (int i = 7; i >= 0; --i) {
-    result += ((num >> i) & 1) ? '1' : '0';
+    result += ((chr >> i) & 1) ? '1' : '0';
   }
   return result;
 }
 class Candidate;
-enum segmentType { Dir, File };
+enum class segmentType { Dir, File };
 // A segment of a file path
 // e.g. if path is /foo/bar/baz.txt
@@ -81,17 +128,18 @@ public:
   int fileId; // (if FILE)
   Candidate *cand;
   PathSegment *parent;
+  std::mutex mu;
   ankerl::unordered_dense::map<std::string, PathSegment *> children;
-  segmentType type = Dir;
-  PathSegment() : parent(NULL) {}
-  PathSegment(std::string _str) : str(_str), parent(NULL) {}
+  segmentType type = segmentType::Dir;
+  PathSegment() : parent(nullptr) {}
+  PathSegment(std::string _str) : str(_str), parent(nullptr) {}
   PathSegment(std::string _str, int _fileId)
-      : str(_str), fileId(_fileId), cand(NULL), parent(NULL) {}
-  int size() {
+      : str(_str), fileId(_fileId), cand(nullptr), parent(nullptr) {}
+  [[nodiscard]] int size() {
     int sz = str.size();
     PathSegment *cur = parent;
     // Sum up length of parent segments (+1 for divisors)
-    while (cur->parent != NULL) {
+    while (cur->parent != nullptr) {
       sz += cur->str.size() + 1;
       cur = cur->parent;
     }
@@ -118,7 +166,7 @@ public:
     // Initialize v_charscores with zeros
     v_charscore.resize(len, 0);
     candLen = str.size();
-    seg = NULL;
+    seg = nullptr;
   }
   Candidate(PathSegment *_seg, int _len) : seg(_seg), len(_len) {
@@ -127,7 +175,7 @@ public:
     candLen = seg->size();
   }
-  float getScore() {
+  [[nodiscard]] float getScore() {
     int i = 0;
     float score = 0.0;
     candLen = seg->size();
@@ -145,19 +193,21 @@ public:
     return score;
   }
-  float operator[](int idx) { return v_charscore[idx]; }
+  [[nodiscard]] float operator[](int idx) { return v_charscore[idx]; }
 };
 // This seems to give 10x speed improvement over std::unordered_map
 typedef ankerl::unordered_dense::map<int64_t, std::set<PathSegment *> *> SegMap;
 // typedef std::unordered_map<int64_t, std::set<PathSegment *> *> SegMap;
-typedef std::unordered_map<float, Candidate> CandMap;
+typedef ankerl::unordered_dense::map<int, Candidate *> CandMap;
+// typedef std::unordered_map<int, Candidate*> CandMap;
 class StringIndex {
 private:
   int tmp;
   char dirSeparator = '/'; // Usually '/', '\' or '\0' (no separator)
+  int numStrings = 0;
   std::vector<SegMap *> dirmaps;
   std::vector<SegMap *> filemaps;
@@ -170,10 +220,16 @@ private:
   int dirId = 0;
   float dirWeight = 0.7; // Give only 70% of score if match is for a directory
+  std::array<std::mutex, 9> mts_f;
+  std::array<std::mutex, 9> mts_d;
+  std::unique_ptr<ThreadPool> pool;
+  Output out{1}; // verbose level = 1
 public:
-  StringIndex() {
+  StringIndex(char sep) : dirSeparator(sep) {
     root = new PathSegment();
-    root->parent = NULL;
+    root->parent = nullptr;
     root->str = "[ROOT]";
     for (int i = 0; i <= 8; i++) {
@@ -181,11 +237,18 @@ public:
       filemaps.push_back(new SegMap);
     }
-#ifdef _OPENMP
-    std::cout << "OPENMP enabled\n";
-#endif
+    // Threads between 4 and 6
+    // We don't seem to get any benefit from more than 6 threads even if the hardware supports it
+    int num_threads = std::max((int)std::thread::hardware_concurrency(), 4);
+    num_threads = std::min(num_threads, 6);
+    out.printv(2, "Number of threads: ", num_threads);
+    pool = std::unique_ptr<ThreadPool>(new ThreadPool(num_threads));
   }
+  /* Don't separate path to segments separator=\0.
+     This is slower, but can be used for other data than files also.  */
+  StringIndex() : StringIndex('\0') {}
   void setDirSeparator(char sep) { dirSeparator = sep; }
   void setDirWeight(float val) { dirWeight = val; }
@@ -213,6 +276,13 @@ public:
     addStrToIndex(filePath, fileId, dirSeparator);
   }
+  void addStrToIndexThreaded(std::string filePath, int fileId) { // queues an addStrToIndex() call on the thread pool and returns without waiting for it
+    pool->enqueue([=] { addStrToIndex(filePath, fileId, dirSeparator); }); // [=] copies filePath and fileId; dirSeparator is a member, so it is read through `this` when the task runs
+  }
+  void waitUntilReady() { pool->waitUntilDone(); } // same body as waitUntilDone(): delegates to ThreadPool::waitUntilDone()
+  void waitUntilDone() { pool->waitUntilDone(); } // delegates to ThreadPool::waitUntilDone() on the indexing pool
   /**
    * Add a string to the index to be searched for afterwards
    *
@@ -222,8 +292,10 @@ public:
    * one of {'\\', '/', '\0' (no separation)}.
    */
   void addStrToIndex(std::string filePath, int fileId, const char &separator) {
+    out.printv(3, "Add file:", filePath, ",", fileId, ",", separator);
     std::vector<std::string> segs;
+    numStrings += 1;
     if (separator == '\0') {
       // No separation to directories & files
@@ -233,7 +305,7 @@ public:
       segs = splitString(filePath, separator);
     }
-    PathSegment *prev = NULL;
+    PathSegment *prev = nullptr;
     prev = root;
     // Add segments to a tree type data structure
     // e.g. addStrToIndex('/foo/bar/file1.txt' ..)
@@ -245,25 +317,27 @@ public:
       auto x = *_x;
       PathSegment *p;
-      auto it = prev->children.find(x);
+      prev->mu.lock();
       // this part of the path already exists in the tree
-      if (it != prev->children.end()) {
+      if (auto it = prev->children.find(x); it != prev->children.end()) {
         p = it->second;
+        prev->mu.unlock();
       } else {
         p = new PathSegment(x, fileId);
         p->parent = prev;
-        // If this is last item in segs
+        // If this is last item in segs, then it is a file.
         if (_x == std::prev(segs.end())) {
-          // therefore, it is a file.
-          p->type = File;
+          p->type = segmentType::File;
           seglist[fileId] = p;
-        } else {
-          p->type = Dir;
+        } else { // otherwise, it is a directory
+          p->type = segmentType::Dir;
           p->fileId = dirId;
           // Files use user input Id. Directories need to have it generated
           dirId++;
         }
         prev->children[x] = p;
+        prev->mu.unlock();
         addPathSegmentKeys(p);
       }
@@ -303,14 +377,16 @@ public:
   @param query String to search for inside the index
   */
-  std::vector<std::pair<float, int>> findSimilar(std::string query) {
+  [[nodiscard]] std::vector<std::pair<float, int>> findSimilar(std::string query) {
     return findSimilar(query, 2);
   }
-  std::vector<std::pair<float, int>> findSimilar(std::string query, int minChars) {
+  [[nodiscard]] std::vector<std::pair<float, int>> findSimilar(std::string query, int minChars) {
     CandMap fileCandMap;
     CandMap dirCandMap;
+    waitUntilDone();
     // Find both files and directories that match the input query
     addToCandMap(fileCandMap, query, filemaps);
     addToCandMap(dirCandMap, query, dirmaps);
@@ -319,9 +395,9 @@ public:
      scores of the file */
     mergeCandidateMaps(fileCandMap, dirCandMap);
-    // Set all candidate pointers to NULL so they won't mess up future searches
+    // Set all candidate pointers to nullptr so they won't mess up future searches
     for (auto seg : segsToClean) {
-      seg->cand = NULL;
+      seg->cand = nullptr;
     }
     segsToClean.clear();
@@ -329,11 +405,17 @@ public:
     std::vector<std::pair<float, int>> results;
     for (auto &[fid, cand] : fileCandMap) {
       std::pair<float, int> v;
-      float sc = cand.getScore();
+      float sc = cand->getScore();
       v.first = sc;
       v.second = fid;
       results.push_back(v);
+      delete cand;
     }
+    for (auto &[fid, cand] : dirCandMap) {
+      delete cand;
+    }
     // Sort highest score first
     std::sort(results.begin(), results.end(),
               [](std::pair<float, int> a, std::pair<float, int> b) { return a.first > b.first; });
@@ -341,10 +423,10 @@ public:
   }
   // Return int64_t representation of the first nchars in str, starting from index i
-  int64_t getKeyAtIdx(std::string str, int i, int nchars) {
+  [[nodiscard]] int64_t getKeyAtIdx(std::string str, int i, int nchars) {
     int64_t key = 0;
     for (int i_char = 0; i_char < nchars; i_char++) {
-      key = key | static_cast<int>(str[i + i_char]);
+      key = key | static_cast<int64_t>(str[i + i_char]);
       if (i_char < nchars - 1) {
         // Shift 8 bits to the left except on the last iteration
         key = key << 8;
@@ -399,22 +481,29 @@ private:
       maxChars = p->str.size();
     }
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
     for (int sublen = minChars; sublen <= maxChars; sublen++) {
+      std::mutex *mu;
       SegMap *map;
-      if (p->type == File) {
+      if (p->type == segmentType::File) {
         map = filemaps[sublen];
+        mu = &mts_f[sublen];
       } else {
         map = dirmaps[sublen];
+        mu = &mts_d[sublen];
       }
       int count = str.size() - sublen + 1;
+      int64_t keys[count + 1];
       for (int i = 0; i <= count; i++) {
-        int64_t key = getKeyAtIdx(str, i, sublen);
+        keys[i] = getKeyAtIdx(str, i, sublen);
+      }
+      mu->lock();
+      for (int i = 0; i <= count; i++) {
+        // int64_t key = getKeyAtIdx(str, i, sublen);
+        auto key = keys[i];
         // Create a new std::set for key if doesn't exist already
         auto it = map->find(key);
@@ -423,12 +512,14 @@ private:
         }
         (*map)[key]->insert(p);
       }
+      mu->unlock();
     }
   }
   // Find pathsegments from <map> that include the substring of <str> which starts at index <i> and
   // is of length <nchars>.
-  std::vector<PathSegment *> findSimilarForNgram(std::string str, int i, int nchars, SegMap &map) {
+  [[nodiscard]] std::vector<PathSegment *> findSimilarForNgram(std::string str, int i, int nchars,
+                                                               SegMap &map) {
     assert(i + nchars <= static_cast<int>(str.size()));
     std::vector<PathSegment *> res;
@@ -437,8 +528,7 @@ private:
     // transform that to 64 bit integer
     int64_t key = getKeyAtIdx(str, i, nchars);
     // Find all path segments in map that have the same substring
-    auto it = map.find(key);
-    if (it != map.end()) { // key found
+    if (auto it = map.find(key); it != map.end()) { // key found
       auto set = it->second;
       for (auto value : *set) {
         res.push_back(value);
@@ -475,12 +565,12 @@ private:
   void mergeCandidateMaps(CandMap &fileCandMap, CandMap &dirCandMap) {
     for (auto &[fid, cand] : fileCandMap) {
-      PathSegment *p = cand.seg->parent;
-      while (p->parent != NULL) {
-        if (p->cand != NULL) {
-          auto &scoreA = cand.v_charscore;
+      PathSegment *p = cand->seg->parent;
+      while (p->parent != nullptr) {
+        if (p->cand != nullptr) {
+          auto &scoreA = cand->v_charscore;
           auto &scoreB = p->cand->v_charscore;
-          for (int i = 0; i < cand.len; i++) {
+          for (int i = 0; i < cand->len; i++) {
             if (scoreA[i] < scoreB[i] * dirWeight) {
               scoreA[i] = scoreB[i] * dirWeight;
             }
@@ -493,18 +583,22 @@ private:
   void addToResults(PathSegment *seg, std::string str, int i, int nchars, CandMap &candmap) {
-    auto it2 = candmap.find(seg->fileId);
-    if (it2 == candmap.end()) {
-      Candidate cand(seg, str.size());
-      seg->cand = &(candmap[seg->fileId]);
+    if (auto it2 = candmap.find(seg->fileId); it2 == candmap.end()) {
+      Candidate *cand = new Candidate(seg, str.size());
       segsToClean.push_back(seg);
       candmap[seg->fileId] = cand;
+      seg->cand = cand;
     }
     for (int j = i; j < i + nchars; j++) {
-      if (candmap[seg->fileId][j] < nchars) {
-        candmap[seg->fileId].v_charscore[j] = nchars;
+      Candidate &cand = *(candmap[seg->fileId]);
+      if (cand[j] < nchars) {
+        cand.v_charscore[j] = nchars;
       }
     }
   }
 };
+} // namespace StrIdx
+#endif

data/test.rb CHANGED Viewed

@@ -13,7 +13,13 @@ for x in lines
 end
 idx_time = Time.new
-puts "\nIndexing time (#{lines.size} files}): #{(idx_time - t).round(4)} seconds"
+# Time to start the threadpool to process indexing
+puts "\nIndexing launch time (#{lines.size} files}): #{(idx_time - t).round(4)} seconds"
+idx.waitUntilDone() # Not necessary, will be called by idx.find
+idx_time = Time.new
+# Time when all threads have completed
+puts "\nIndexing completed time (#{lines.size} files}): #{(idx_time - t).round(4)} seconds"
 query = "rngnomadriv"
 res = idx.find(query)

data/thread_pool.hpp ADDED Viewed

@@ -0,0 +1,98 @@
+// Based on example in https://www.geeksforgeeks.org/thread-pool-in-cpp/
+#include <condition_variable>
+#include <functional>
+#include <iostream>
+#include <mutex>
+#include <queue>
+#include <thread>
+#include <algorithm>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <string>
+#include <chrono>
+class ThreadPool { // fixed set of worker threads that take std::function<void()> tasks from a shared queue, oldest first
+public:
+  // Starts num_threads workers; each one loops forever taking tasks from taskQueue until the pool is stopped
+  ThreadPool(size_t num_threads) {
+    // Spawn the workers; every lambda captures `this` to reach the shared queue state
+    for (size_t i = 0; i < num_threads; ++i) {
+      workerThreads.emplace_back([this] {
+        while (true) {
+          std::function<void()> task;
+          {
+            std::unique_lock<std::mutex> lock(mu_queue);
+            // Sleep until a task is available or the destructor has set stop_
+            cv_.wait(lock, [this] { return !taskQueue.empty() || stop_; });
+            // Leave the worker loop only when stopping AND nothing is left to run
+            if (stop_ && taskQueue.empty()) {
+              return;
+            }
+            // Take ownership of the oldest queued task and remove it from the queue
+            task = std::move(taskQueue.front());
+            taskQueue.pop();
+          }
+          task(); // runs after the inner scope closed, so mu_queue is not held while the task executes
+        }
+      });
+    }
+  }
+  // Destructor: sets stop_, wakes every worker and joins them; workers still run any queued tasks before exiting
+  ~ThreadPool() {
+    {
+      std::lock_guard<std::mutex> lock(mu_queue);
+      stop_ = true;
+    }
+    // Wake all workers so each one re-checks the stop flag
+    cv_.notify_all();
+    // Block until every worker thread has returned
+    // from its loop
+    for (auto &thread : workerThreads) {
+      thread.join();
+    }
+  }
+  // Polls every 50 ms until taskQueue is empty. NOTE(review): workers pop a task before running it, so this can return while already-dequeued tasks are still executing.
+  void waitUntilDone() {
+    while (true) {
+      {
+        std::lock_guard<std::mutex> guard(mu_queue);
+        if (taskQueue.empty()) {
+          return;
+        }
+      }
+      std::this_thread::sleep_for(std::chrono::milliseconds(50));
+    }
+  }
+  // Appends task to the queue under the lock, then wakes one waiting worker
+  void enqueue(std::function<void()> task) {
+    {
+      std::lock_guard<std::mutex> lock(mu_queue);
+      taskQueue.emplace(move(task)); // unqualified move(): presumably resolves to std::move via argument-dependent lookup
+    }
+    cv_.notify_one();
+  }
+private:
+  std::vector<std::thread> workerThreads;
+  std::queue<std::function<void()>> taskQueue;
+  std::mutex mu_queue;
+  // Signalled by enqueue() (one waiter) and by the destructor (all waiters); waited on with mu_queue
+  std::condition_variable cv_;
+  // Set to true only by the destructor, under mu_queue; tells workers to exit once the queue is empty
+  bool stop_ = false;
+};

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: StrIdx
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.1.3
 platform: ruby
 authors:
 - Sami Sieranoja
-autorequire:
+autorequire:
 bindir: exe
 cert_chain: []
-date: 2024-05-08 00:00:00.000000000 Z
+date: 2024-05-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -55,6 +55,7 @@ files:
 - rubyext/ruby_interf.cpp
 - stridx.hpp
 - test.rb
+- thread_pool.hpp
 - unordered_dense.h
 homepage: https://github.com/SamiSieranoja/stridx
 licenses:
@@ -62,7 +63,7 @@ licenses:
 metadata:
   source_code_uri: https://github.com/SamiSieranoja/stridx
   homepage_uri: https://github.com/SamiSieranoja/stridx
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -79,7 +80,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubygems_version: 3.3.26
-signing_key:
+signing_key:
 specification_version: 4
 summary: StrIdx
 test_files: []