RubyGems - StrIdx - Versions diffs - 0.1.2 → 0.1.4 - Mend

StrIdx 0.1.2 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

data/stridx-tty.rb ADDED Viewed

@@ -0,0 +1,122 @@
+#!/usr/bin/env ruby
+require "tty-prompt"
+require "tty-cursor"
+require "tty-reader"
+require "pastel"
+require "socket"
+# Interactive terminal front-end for a running StrIdx server.
+#
+# Reads a query from the terminal, sends it to the server through the UNIX
+# socket at ~/.stridx/sock and lets the user pick one of the returned lines
+# with the arrow keys. All UI output is written to STDERR so that STDOUT
+# carries nothing but the final selection.
+class StrIdxTTY
+  # Prompt shown by the line reader. update_search strips this many characters
+  # from event.line (assumes event.line starts with the prompt -- TODO confirm
+  # against tty-reader).
+  PROMPT = ">> "
+  # Run one interactive search and write the selected line to STDOUT.
+  def self.run
+    stty = StrIdxTTY.new
+    selected = stty.search
+    STDOUT.write selected
+  end
+  # Set up terminal helpers and block until the server socket accepts a
+  # connection.
+  def initialize
+    # Result lines of the latest query. Must start as an empty array: the
+    # arrow-key handlers read @list.size before any search has been made.
+    @list = []
+    @selected = ""
+    @idx = 0
+    @reader = TTY::Reader.new(output: STDERR)
+    @pastel = Pastel.new
+    @cursor = TTY::Cursor
+    wait_for_server
+  end
+  # Write x to STDERR (the UI channel).
+  def out(x)
+    STDERR.write x
+  end
+  # Run the interactive prompt until the user finishes the line.
+  #
+  # @return [String] the highlighted result with surrounding whitespace
+  #   removed, or "" when nothing was selected
+  def search
+    # The blank lines push the prompt towards the bottom of the terminal;
+    # draw_list paints results upwards from the prompt line.
+    out "\n" * 20
+    out @cursor.clear_screen
+    out "\n" * 20
+    @reader.on(:keypress) { |event| handle_event(event) }
+    @reader.read_line(PROMPT)
+    out @cursor.clear_screen
+    @selected.to_s.strip
+  end
+  # Send "find:<query>" to the server and return the response split in lines.
+  #
+  # NOTE: a single recv of at most 200 * 200 bytes is read; a longer response
+  # would be cut off.
+  #
+  # @param query [String] text to search for
+  # @return [Array<String>] response lines (line terminators included)
+  def get_res_from_server(query)
+    client = UNIXSocket.new(sock_path)
+    begin
+      client.puts "find:#{query}"
+      client.recv(200 * 200).lines
+    ensure
+      # Close the socket even when the request raises.
+      client.close
+    end
+  end
+  # Paint @list above the prompt, one entry per terminal line, with the entry
+  # at @idx in bold, and remember that entry as the current selection.
+  def draw_list
+    # The list may have shrunk since @idx was last changed; keep it in range.
+    @idx = [[@idx, @list.size - 1].min, 0].max
+    @selected = @list[@idx] || ""
+    @list.each_with_index do |line, i|
+      out @cursor.up(1)
+      out @cursor.clear_line
+      out @pastel.lookup(:bold) if i == @idx
+      out line.strip
+      out @pastel.lookup(:reset)
+    end
+  end
+  # Re-query the server with the text typed after the prompt and redraw.
+  # Queries of two characters or fewer are ignored.
+  def update_search(event)
+    query = event.line[PROMPT.size..-1].to_s
+    if query.size > 2
+      @list = get_res_from_server(query)
+      draw_list
+    end
+  end
+  # Keypress callback: refresh results on letters/backspace, move the
+  # highlight on the arrow keys. The cursor position is saved and restored
+  # around the redraw so typing continues on the prompt line.
+  def handle_event(event)
+    out @cursor.save
+    case event.key.name
+    when :alpha, :backspace
+      update_search(event)
+    when :up
+      # Entries are drawn bottom-up, so "up" on screen is a higher index.
+      @idx += 1 if @idx < @list.size - 1
+      draw_list
+    when :down
+      @idx -= 1 if @idx > 0
+      draw_list
+    end
+    out @cursor.restore
+  end
+  private
+  # Path of the server's UNIX socket.
+  def sock_path
+    File.join(File.expand_path("~/.stridx"), "sock")
+  end
+  # Poll every 2 seconds until a connection to the server socket succeeds.
+  # ENOENT (socket file not created yet) is treated like ECONNREFUSED.
+  def wait_for_server
+    loop do
+      begin
+        UNIXSocket.new(sock_path).close
+        return
+      rescue Errno::ECONNREFUSED, Errno::ENOENT
+        out "Waiting for server to start\n"
+        sleep 2
+      end
+    end
+  end
+end

data/stridx.gemspec ADDED Viewed

@@ -0,0 +1,37 @@
+# Gem specification for StrIdx: identity, packaged files, executables,
+# native extension and dependencies.
+Gem::Specification.new do |s|
+  s.name = "StrIdx"
+  s.version = "0.1.4"
+  s.authors = ["Sami Sieranoja"]
+  s.email = ["sami.sieranoja@gmail.com"]
+  s.summary = %q{StrIdx}
+  s.description = %q{ Fast fuzzy string similarity search and indexing (for filenames)}
+  s.homepage = "https://github.com/SamiSieranoja/stridx"
+  # Both metadata links point at the homepage.
+  %w[source_code_uri homepage_uri].each { |key| s.metadata[key] = s.homepage }
+  # Package every git-tracked file except those under refcode/, spec/ or features/.
+  tracked = `git ls-files -z`.split("\x0")
+  s.files = tracked.reject { |path| path.match(%r{^(refcode|spec|features)/}) }
+  # Disabled explicit additions:
+  # s.files << "thread_pool.hpp"
+  # s.files << "exe/stridx.rb"
+  # s.files << "server.rb"
+  # s.files << "stridx-tty.rb"
+  # Everything under exe/ is installed as an executable.
+  s.bindir = "exe"
+  s.executables = s.files.grep(%r{^exe/}) { |path| File.basename(path) }
+  s.require_paths = ["lib", "ext"]
+  # Needed only while developing the gem.
+  { "bundler" => "~> 2.4.21", "rake" => "~> 13.1.0" }.each do |dep, requirement|
+    s.add_development_dependency dep, requirement
+  end
+  # Needed at run time.
+  {
+    "tty-cursor" => "~> 0.7.1",
+    "tty-prompt" => "~> 0.23.1",
+    "tty-reader" => "~> 0.9.0",
+    "tty-screen" => "~> 0.8.2",
+    "pastel" => "~> 0.8.0",
+    "daemons" => "~> 1.4.1",
+  }.each do |dep, requirement|
+    s.add_runtime_dependency dep, requirement
+  end
+  # Native extension build script.
+  s.extensions = ["rubyext/extconf.rb"]
+  s.licenses = ["LGPL-2.0+"]
+end

data/stridx.hpp CHANGED Viewed

@@ -1,21 +1,66 @@
+#ifndef SSSTRIDX_HPP
+#define SSSTRIDX_HPP
 #include <stdio.h>
 #include <stdlib.h>
 #include <cassert>
 #include <vector>
+#include <array>
 #include <iostream>
 #include <unordered_map>
 #include <set>
 #include <algorithm>
 #include <sstream>
-#ifdef _OPENMP
-#include <omp.h>
-#endif
+#include <vector>
+#include <mutex>
+#include <thread>
+#include "thread_pool.hpp"
 #include "unordered_dense.h"
+namespace StrIdx {
+/* Small replacement for writing to std::cout directly.
+   printv() messages carry a verbosity level and are dropped when that level
+   is higher than the level the Output object was constructed with. */
+class Output {
+private:
+  int verboseLevel;
+public:
+  Output(int verb) : verboseLevel(verb) {}
+  Output() : Output(3) {}
+  ~Output() = default;
+  // Writes all arguments to std::cout in order, with nothing added in between.
+  // print("xxx ", 3, " yyy") outputs "xxx 3 yyy"; print() outputs nothing.
+  // Arguments are taken by const reference so that strings are not copied on
+  // every call.
+  template <typename... Types> static void print(const Types &...args) {
+    (std::cout << ... << args);
+  }
+  // Same as print(), followed by a newline:
+  // printl("xxx ", 3, " yyy") outputs "xxx 3 yyy\n"
+  template <typename... Types> static void printl(const Types &...args) {
+    print(args..., "\n");
+  }
+  /* printv(2, "xxx ", 3, " yyy") outputs "xxx 3 yyy\n"
+   * if verboseLevel >= 2 (first arg), otherwise nothing.
+   * When verboseLevel >= 3 the line is prefixed with "[v=<vlevel>] ".
+   */
+  template <typename... Types> void printv(int vlevel, const Types &...args) {
+    if (verboseLevel < vlevel) {
+      return;
+    }
+    if (verboseLevel >= 3) {
+      print("[v=", vlevel, "] ");
+    }
+    printl(args...);
+  }
+};
 // Transforms input string as follows:
 // '/foo/bar/file1.txt'
 // => vector{"foo", "bar", "file1.txt"}
@@ -34,7 +79,7 @@ std::vector<std::string> splitString(const std::string &input, const char &separ
 }
 // Convert int64_t to binary string
-std::string int64ToBinaryString(int64_t num) {
+[[nodiscard]] std::string int64ToBinaryString(const int64_t &num) {
   std::string result;
   for (int i = 63; i >= 0; --i) {
     result += ((num >> i) & 1) ? '1' : '0';
@@ -42,8 +87,8 @@ std::string int64ToBinaryString(int64_t num) {
   return result;
 }
-// Convert a (8 char) string represented as int64_t to std::string
-std::string int64ToStr(int64_t key) {
+// Debug. Convert a (8 char) string represented as int64_t to std::string
+[[nodiscard]] std::string int64ToStr(const int64_t &key) {
   int nchars = 8;
   std::string str;
   int multip = nchars * 8;
@@ -55,43 +100,45 @@ std::string int64ToStr(int64_t key) {
   return str;
 }
+// Debug
 void printVector(const std::vector<int> &vec) {
   for (const auto &value : vec) {
     std::cout << value << " ";
   }
 }
-std::string charToBinaryString(char num) {
+// Debug
+[[nodiscard]] std::string charToBinaryString(const char &chr) {
   std::string result;
   for (int i = 7; i >= 0; --i) {
-    result += ((num >> i) & 1) ? '1' : '0';
+    result += ((chr >> i) & 1) ? '1' : '0';
   }
   return result;
 }
 class Candidate;
-enum segmentType { Dir, File };
+enum class segmentType { Dir, File };
 // A segment of a file path
 // e.g. if path is /foo/bar/baz.txt
 // segments are [{root}, foo, bar, baz.txt]
-class PathSegment {
-public:
+struct PathSegment {
   std::string str;
   int fileId; // (if FILE)
   Candidate *cand;
   PathSegment *parent;
+  std::mutex mu;
   ankerl::unordered_dense::map<std::string, PathSegment *> children;
-  segmentType type = Dir;
-  PathSegment() : parent(NULL) {}
-  PathSegment(std::string _str) : str(_str), parent(NULL) {}
+  segmentType type = segmentType::Dir;
+  PathSegment() : parent(nullptr) {}
+  PathSegment(std::string _str) : str(_str), parent(nullptr) {}
   PathSegment(std::string _str, int _fileId)
-      : str(_str), fileId(_fileId), cand(NULL), parent(NULL) {}
-  int size() {
+      : str(_str), fileId(_fileId), cand(nullptr), parent(nullptr) {}
+  [[nodiscard]] int size() const {
     int sz = str.size();
     PathSegment *cur = parent;
     // Sum up length of parent segments (+1 for divisors)
-    while (cur->parent != NULL) {
+    while (cur->parent != nullptr) {
       sz += cur->str.size() + 1;
       cur = cur->parent;
     }
@@ -100,8 +147,7 @@ public:
 };
 // Candidate for result in string (filename) search
-class Candidate {
-public:
+struct Candidate {
   std::vector<float> v_charscore;
   PathSegment *seg;
   int fileId;
@@ -114,25 +160,17 @@ public:
   int candLen; // Length of candidate
   Candidate(){};
-  Candidate(int _fileId, std::string _str, int _len) : fileId(_fileId), str(_str), len(_len) {
-    // Initialize v_charscores with zeros
-    v_charscore.resize(len, 0);
-    candLen = str.size();
-    seg = NULL;
-  }
   Candidate(PathSegment *_seg, int _len) : seg(_seg), len(_len) {
     // Initialize v_charscores with zeros
     v_charscore.resize(len, 0);
     candLen = seg->size();
   }
-  float getScore() {
+  [[nodiscard]] float getScore() const {
     int i = 0;
     float score = 0.0;
-    candLen = seg->size();
-    for (float &charscore : v_charscore) {
+    for (const float &charscore : v_charscore) {
       score += charscore;
       i++;
     }
@@ -145,35 +183,41 @@ public:
     return score;
   }
-  float operator[](int idx) { return v_charscore[idx]; }
+  [[nodiscard]] float operator[](int idx) const { return v_charscore[idx]; }
 };
 // This seems to give 10x speed improvement over std::unordered_map
 typedef ankerl::unordered_dense::map<int64_t, std::set<PathSegment *> *> SegMap;
 // typedef std::unordered_map<int64_t, std::set<PathSegment *> *> SegMap;
-typedef std::unordered_map<float, Candidate> CandMap;
+typedef ankerl::unordered_dense::map<int, Candidate *> CandMap;
+// typedef std::unordered_map<int, Candidate*> CandMap;
 class StringIndex {
 private:
   int tmp;
   char dirSeparator = '/'; // Usually '/', '\' or '\0' (no separator)
+  int numStrings = 0;
   std::vector<SegMap *> dirmaps;
+  std::array<std::mutex, 9> mts_d; // for dirmaps
   std::vector<SegMap *> filemaps;
+  std::array<std::mutex, 9> mts_f; // for filemaps
   std::vector<PathSegment *> segsToClean;
-  std::unordered_map<int, std::string> strlist;
   std::unordered_map<int, PathSegment *> seglist;
   PathSegment *root;
   int dirId = 0;
   float dirWeight = 0.7; // Give only 70% of score if match is for a directory
+  std::unique_ptr<ThreadPool> pool;
+  Output out{1}; // verbose level = 1
 public:
-  StringIndex() {
+  StringIndex(char sep) : dirSeparator(sep) {
     root = new PathSegment();
-    root->parent = NULL;
+    root->parent = nullptr;
     root->str = "[ROOT]";
     for (int i = 0; i <= 8; i++) {
@@ -181,11 +225,18 @@ public:
       filemaps.push_back(new SegMap);
     }
-#ifdef _OPENMP
-    std::cout << "OPENMP enabled\n";
-#endif
+    // Threads between 4 and 6
+    // We don't seem to get any benefit from more than 6 threads even if the hardware supports it
+    int num_threads = std::max((int)std::thread::hardware_concurrency(), 4);
+    num_threads = std::min(num_threads, 6);
+    out.printv(2, "Number of threads: ", num_threads);
+    pool = std::unique_ptr<ThreadPool>(new ThreadPool(num_threads));
   }
+  /* Default: separator is '\0', so paths are not split into segments.
+     This is slower, but can also be used for data other than file paths. */
+  StringIndex() : StringIndex('\0') {}
   void setDirSeparator(char sep) { dirSeparator = sep; }
   void setDirWeight(float val) { dirWeight = val; }
@@ -213,6 +264,15 @@ public:
     addStrToIndex(filePath, fileId, dirSeparator);
   }
+  void addStrToIndexThreaded(std::string filePath, int fileId) {
+    pool->enqueue([=] { addStrToIndex(filePath, fileId, dirSeparator); });
+  }
+  void waitUntilReady() const { pool->waitUntilDone(); }
+  void waitUntilDone() const { pool->waitUntilDone(); }
+  int size() const { return seglist.size(); }
   /**
    * Add a string to the index to be searched for afterwards
    *
@@ -221,9 +281,17 @@ public:
    * @param separator Can be used to split filePath to components (e.g. 'home','user'...). Usually
    * one of {'\\', '/', '\0' (no separation)}.
    */
   void addStrToIndex(std::string filePath, int fileId, const char &separator) {
+    out.printv(3, "Add file:", filePath, ",", fileId, ",", separator, ",",dirSeparator);
+    // If a string with this index has been added already
+    if (seglist.find(fileId) != seglist.end()) {
+      return;
+    }
     std::vector<std::string> segs;
+    numStrings += 1;
     if (separator == '\0') {
       // No separation to directories & files
@@ -233,7 +301,7 @@ public:
       segs = splitString(filePath, separator);
     }
-    PathSegment *prev = NULL;
+    PathSegment *prev = nullptr;
     prev = root;
     // Add segments to a tree type data structure
     // e.g. addStrToIndex('/foo/bar/file1.txt' ..)
@@ -245,25 +313,27 @@ public:
       auto x = *_x;
       PathSegment *p;
-      auto it = prev->children.find(x);
+      prev->mu.lock();
       // this part of the path already exists in the tree
-      if (it != prev->children.end()) {
+      if (auto it = prev->children.find(x); it != prev->children.end()) {
         p = it->second;
+        prev->mu.unlock();
       } else {
         p = new PathSegment(x, fileId);
         p->parent = prev;
-        // If this is last item in segs
+        // If this is last item in segs, then it is a file.
         if (_x == std::prev(segs.end())) {
-          // therefore, it is a file.
-          p->type = File;
+          p->type = segmentType::File;
           seglist[fileId] = p;
-        } else {
-          p->type = Dir;
+        } else { // otherwise, it is a directory
+          p->type = segmentType::Dir;
           p->fileId = dirId;
           // Files use user input Id. Directories need to have it generated
           dirId++;
         }
         prev->children[x] = p;
+        prev->mu.unlock();
         addPathSegmentKeys(p);
       }
@@ -271,6 +341,17 @@ public:
     }
   }
+  /**
+   * Rebuild the string that was indexed under <id> by joining its path
+   * segments with dirSeparator.
+   *
+   * @param id Id that was given to addStrToIndex
+   * @return The reconstructed string, or "" if no string with this id is indexed
+   */
+  std::string getString(int id) {
+    // Use find() rather than operator[]: operator[] would insert a null entry
+    // for an unknown id, which is then dereferenced below, counted by size()
+    // and treated as "already added" by addStrToIndex.
+    auto it = seglist.find(id);
+    if (it == seglist.end() || it->second == nullptr) {
+      return "";
+    }
+    const PathSegment *seg = it->second;
+    std::string s = seg->str;
+    // Walk towards the root, prepending each parent segment. The root itself
+    // (the only segment without a parent) is not part of the string.
+    while (seg->parent->parent != nullptr) {
+      seg = seg->parent;
+      s = seg->str + dirSeparator + s;
+    }
+    return s;
+  }
   /**
   The search will find filepaths similar to the input string
@@ -303,14 +384,16 @@ public:
   @param query String to search for inside the index
   */
-  std::vector<std::pair<float, int>> findSimilar(std::string query) {
+  [[nodiscard]] std::vector<std::pair<float, int>> findSimilar(std::string query) {
     return findSimilar(query, 2);
   }
-  std::vector<std::pair<float, int>> findSimilar(std::string query, int minChars) {
+  [[nodiscard]] std::vector<std::pair<float, int>> findSimilar(std::string query, int minChars) {
     CandMap fileCandMap;
     CandMap dirCandMap;
+    waitUntilDone();
     // Find both files and directories that match the input query
     addToCandMap(fileCandMap, query, filemaps);
     addToCandMap(dirCandMap, query, dirmaps);
@@ -319,9 +402,9 @@ public:
      scores of the file */
     mergeCandidateMaps(fileCandMap, dirCandMap);
-    // Set all candidate pointers to NULL so they won't mess up future searches
+    // Set all candidate pointers to nullptr so they won't mess up future searches
     for (auto seg : segsToClean) {
-      seg->cand = NULL;
+      seg->cand = nullptr;
     }
     segsToClean.clear();
@@ -329,11 +412,17 @@ public:
     std::vector<std::pair<float, int>> results;
     for (auto &[fid, cand] : fileCandMap) {
       std::pair<float, int> v;
-      float sc = cand.getScore();
+      float sc = cand->getScore();
       v.first = sc;
       v.second = fid;
       results.push_back(v);
+      delete cand;
     }
+    for (auto &[fid, cand] : dirCandMap) {
+      delete cand;
+    }
     // Sort highest score first
     std::sort(results.begin(), results.end(),
               [](std::pair<float, int> a, std::pair<float, int> b) { return a.first > b.first; });
@@ -341,10 +430,10 @@ public:
   }
   // Return int64_t representation of the first nchars in str, starting from index i
-  int64_t getKeyAtIdx(std::string str, int i, int nchars) {
+  [[nodiscard]] int64_t getKeyAtIdx(const std::string &str, int i, int nchars) const {
     int64_t key = 0;
     for (int i_char = 0; i_char < nchars; i_char++) {
-      key = key | static_cast<int>(str[i + i_char]);
+      key = key | static_cast<int64_t>(str[i + i_char]);
       if (i_char < nchars - 1) {
         // Shift 8 bits to the left except on the last iteration
         key = key << 8;
@@ -399,22 +488,29 @@ private:
       maxChars = p->str.size();
     }
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
     for (int sublen = minChars; sublen <= maxChars; sublen++) {
+      std::mutex *mu;
       SegMap *map;
-      if (p->type == File) {
+      if (p->type == segmentType::File) {
         map = filemaps[sublen];
+        mu = &mts_f[sublen];
       } else {
         map = dirmaps[sublen];
+        mu = &mts_d[sublen];
       }
       int count = str.size() - sublen + 1;
+      int64_t keys[count + 1];
       for (int i = 0; i <= count; i++) {
-        int64_t key = getKeyAtIdx(str, i, sublen);
+        keys[i] = getKeyAtIdx(str, i, sublen);
+      }
+      mu->lock();
+      for (int i = 0; i <= count; i++) {
+        // int64_t key = getKeyAtIdx(str, i, sublen);
+        auto key = keys[i];
         // Create a new std::set for key if doesn't exist already
         auto it = map->find(key);
@@ -423,12 +519,14 @@ private:
         }
         (*map)[key]->insert(p);
       }
+      mu->unlock();
     }
   }
   // Find pathsegments from <map> that include the substring of <str> which starts at index <i> and
   // is of length <nchars>.
-  std::vector<PathSegment *> findSimilarForNgram(std::string str, int i, int nchars, SegMap &map) {
+  [[nodiscard]] std::vector<PathSegment *> findSimilarForNgram(std::string str, int i, int nchars,
+                                                               SegMap &map) const {
     assert(i + nchars <= static_cast<int>(str.size()));
     std::vector<PathSegment *> res;
@@ -437,8 +535,7 @@ private:
     // transform that to 64 bit integer
     int64_t key = getKeyAtIdx(str, i, nchars);
     // Find all path segments in map that have the same substring
-    auto it = map.find(key);
-    if (it != map.end()) { // key found
+    if (auto it = map.find(key); it != map.end()) { // key found
       auto set = it->second;
       for (auto value : *set) {
         res.push_back(value);
@@ -475,12 +572,12 @@ private:
   void mergeCandidateMaps(CandMap &fileCandMap, CandMap &dirCandMap) {
     for (auto &[fid, cand] : fileCandMap) {
-      PathSegment *p = cand.seg->parent;
-      while (p->parent != NULL) {
-        if (p->cand != NULL) {
-          auto &scoreA = cand.v_charscore;
+      PathSegment *p = cand->seg->parent;
+      while (p->parent != nullptr) {
+        if (p->cand != nullptr) {
+          auto &scoreA = cand->v_charscore;
           auto &scoreB = p->cand->v_charscore;
-          for (int i = 0; i < cand.len; i++) {
+          for (int i = 0; i < cand->len; i++) {
             if (scoreA[i] < scoreB[i] * dirWeight) {
               scoreA[i] = scoreB[i] * dirWeight;
             }
@@ -493,18 +590,22 @@ private:
   void addToResults(PathSegment *seg, std::string str, int i, int nchars, CandMap &candmap) {
-    auto it2 = candmap.find(seg->fileId);
-    if (it2 == candmap.end()) {
-      Candidate cand(seg, str.size());
-      seg->cand = &(candmap[seg->fileId]);
+    if (auto it2 = candmap.find(seg->fileId); it2 == candmap.end()) {
+      Candidate *cand = new Candidate(seg, str.size());
       segsToClean.push_back(seg);
       candmap[seg->fileId] = cand;
+      seg->cand = cand;
     }
     for (int j = i; j < i + nchars; j++) {
-      if (candmap[seg->fileId][j] < nchars) {
-        candmap[seg->fileId].v_charscore[j] = nchars;
+      Candidate &cand = *(candmap[seg->fileId]);
+      if (cand[j] < nchars) {
+        cand.v_charscore[j] = nchars;
       }
     }
   }
 };
+} // namespace StrIdx
+#endif

data/test.rb CHANGED Viewed

@@ -1,8 +1,13 @@
 #!/usr/bin/env ruby
+$:.unshift File.dirname(__FILE__)
 require "stridx"
 idx = StrIdx::StringIndex.new
+# "/" for unix-style file paths
+idx.setDirSeparator("/") #(comment out if not file paths)
 t = Time.new
 fn = File.expand_path("flist.txt")
 lines = IO.read(fn).lines.collect { |x| x.strip }
@@ -13,7 +18,13 @@ for x in lines
 end
 idx_time = Time.new
-puts "\nIndexing time (#{lines.size} files}): #{(idx_time - t).round(4)} seconds"
+# Time to start the threadpool to process indexing
+puts "\nIndexing launch time (#{lines.size} files): #{(idx_time - t).round(4)} seconds"
+idx.waitUntilDone() # Not necessary, will be called by idx.find
+idx_time = Time.new
+# Time when all threads have completed
+puts "\nIndexing completed time (#{lines.size} files): #{(idx_time - t).round(4)} seconds"
 query = "rngnomadriv"
 res = idx.find(query)