RubyGems - StrIdx - Versions diffs - 0.1.5 → 0.1.6 - Mend

StrIdx 0.1.5 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 6d559876ad4cbf26be66db682e45701f80359d9b4752fdc61042c665f62c3000
-  data.tar.gz: 6c0aa9cce9cea114ada2d1dfb08a833f2d338a3d9815c2a5dc32ceb15186bfe7
+  metadata.gz: e79f1bebc1e56e0a1966ae5ff69b50e4048953dd338807a6c9dba1c53b1c8f34
+  data.tar.gz: 28eea5841ce96f9460975720a43a2655f4902a9c44458a8aae01ce0b3077e67d
 SHA512:
-  metadata.gz: 6f9bf0a17ee3541b0dfc2e296bbb78dcb2c86c821427ffff5b1d6a86b7f1a7df53d428e5fc447a148c561177a495c0a9013cf1ebe18fd074fbc478bf7ad81fec
-  data.tar.gz: 0cdead8fab1925979337d5fa5ee890c42c654006db3aed2274e7a0e4a41cc50e157a346b28cbb2f5ce2dee35876010d503602932addcf06f5409a75b8cb5c81c
+  metadata.gz: e82430abd644e876dd758ceaffb3f759e19fde2a44cc85872f01721943c42d85d2032cbde91d235d44b7921833ed044241692247c88b56f54c6ea7248f5e584e
+  data.tar.gz: 642e16d0e6291474b007e0341bf6571576299f353bf1f5ff13a5c9966d9b1b2450de052f3ebad6f44c9ab9b341686dd41b4adf35922406e3df358c8cbabc9c31

data/README.md CHANGED Viewed

@@ -66,7 +66,7 @@ eval "$(stridx.rb bash)"
 To autostart the server, add the following line to .bashrc:
 ```
-[ ! -f ~/.stridx/sock ] && stridx.rb start -- ~/Documents/ ~/Pictures/
+stridx.rb start -- ~/Documents/ ~/Pictures/
 ```
@@ -79,7 +79,6 @@ Stop server:
 ```
 stridx.rb stop
 ```
-In case stop doesn't work, try: `kill $(pgrep -f runserver.rb | tail -n 1)`
 Start indexing server (on foreground, to debug):
 ```

data/demo.cpp CHANGED Viewed

@@ -1,5 +1,8 @@
 #include <sys/resource.h>
+       #include <malloc.h>
+#include "mem_info.h"
 #include <condition_variable>
 #include <functional>
@@ -21,7 +24,7 @@ using std::cout;
 using std::pair;
 using std::vector;
-std::vector<std::string> readLinesFromFile(const std::string &filename) {
+std::vector<std::string> readLinesFromFile(const std::string &filename, int limit = 0) {
   std::vector<std::string> lines;
   std::ifstream file(filename);
   if (!file.is_open()) {
@@ -30,8 +33,10 @@ std::vector<std::string> readLinesFromFile(const std::string &filename) {
   }
   std::string line;
-  while (std::getline(file, line)) {
+  int i=0;
+  while (std::getline(file, line) && ( limit == 0 || i < limit) ) {
     lines.push_back(line);
+    i++;
   }
   file.close();
@@ -39,88 +44,134 @@ std::vector<std::string> readLinesFromFile(const std::string &filename) {
 }
 int main() {
-  StrIdx::StringIndex idx('/'); // Separate directories using unix style "/" char
-  // idx.addStrToIndex("./gdk/x11/gdkasync.c", 0 /*id*/, '/' /*separator*/);
-  // idx.addStrToIndex("./gdk/x11/gdksettings.c", 1, '/');
-  // idx.addStrToIndex("./gdk/x11/gdkx11devicemanager-xi2.h", 2, '/');
-  // Add the file paths of 89828 files in linux-6.9-rc6 to the index
-  std::string fn_filePaths = "flist.txt";
-  std::vector<std::string> v_filePaths = readLinesFromFile(fn_filePaths);
-  // int* a = new int[10];
-  // delete(a);
-  // delete(a);
-  // Launch indexing to be run on background
-  cout << "File paths: " << v_filePaths.size() << std::endl;
-  cout << "Start indexing in the background" << std::endl;
-  auto start = std::chrono::high_resolution_clock::now();
-  int id = 0;
-  for (const auto &filePath : v_filePaths) {
-    idx.addStrToIndexThreaded(filePath, id);
-    id++;
-  }
-  auto idx_time_launch = std::chrono::high_resolution_clock::now();
-  std::chrono::duration<double, std::milli> duration_launch = idx_time_launch - start;
-  cout << "Indexing launch time (seconds): " << duration_launch.count() / 1000 << "\n";
-  // Wait until indexing has finished
-  idx.waitUntilDone();
-  auto idx_time = std::chrono::high_resolution_clock::now();
-  std::chrono::duration<double, std::milli> duration = idx_time - start;
-  cout << "Indexing finished time for " << v_filePaths.size()
-       << " file paths (seconds): " << duration.count() / 1000 << "\n";
-  cout << "DEBUG" << std::endl;
-	// idx.cm.debug();
-  cout << "END DEBUG" << std::endl;
-  // Find matching filepaths from the index for the query string "rngnomadriv"
-  start = std::chrono::high_resolution_clock::now();
-  // std::string query = "rngnomadriv";
-  std::string query = "irqbypass.c";
-  for (int i = 0; i < 99; i++) {
-    // const vector<pair<float, int>> &results = idx.findSimilar(query, 2);
-    const vector<pair<float, int>> &results = idx.findSim(query);
-  }
+  {
+    StrIdx::StringIndex idx('/'); // Separate directories using unix style "/" char
+    // idx.addStrToIndex("./gdk/x11/gdkasync.c", 0 /*id*/, '/' /*separator*/);
+    // idx.addStrToIndex("./gdk/x11/gdksettings.c", 1, '/');
+    // idx.addStrToIndex("./gdk/x11/gdkx11devicemanager-xi2.h", 2, '/');
+    // Add the file paths of 89828 files in linux-6.9-rc6 to the index
+    std::string fn_filePaths = "flist2.txt";
+    std::vector<std::string> v_filePaths = readLinesFromFile(fn_filePaths);
+    // std::vector<std::string> v_filePaths = readLinesFromFile(fn_filePaths,10000);
+    // int* a = new int[10];
+    // delete(a);
+    // delete(a);
+    // Launch indexing to be run on background
+    cout << "File paths: " << v_filePaths.size() << std::endl;
+    cout << "Start indexing in the background" << std::endl;
+    auto start = std::chrono::high_resolution_clock::now();
+    int id = 0;
+    for (const auto &filePath : v_filePaths) {
+      // idx.addStrToIndexThreaded(filePath, id);
+      idx.addStrToIndex(filePath, id);
+      id++;
+    }
-  // idx.findSim(query);
+    std::cout << "========\n";
+    for (int i = 0; i < id; i++) {
+      // std::cout << idx.getString(2) << "{}";
+      idx.getString(i);
+    }
+    std::cout << "========\n";
+    auto idx_time_launch = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double, std::milli> duration_launch = idx_time_launch - start;
+    cout << "Indexing launch time (seconds): " << duration_launch.count() / 1000 << "\n";
+    // Wait until indexing has finished
+    idx.waitUntilDone();
+    auto idx_time = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double, std::milli> duration = idx_time - start;
+    cout << "Indexing finished time for " << v_filePaths.size()
+         << " file paths (seconds): " << duration.count() / 1000 << "\n";
+    cout << "DEBUG" << std::endl;
+    // idx.cm.debug();
+    cout << "END DEBUG" << std::endl;
+    // Find matching filepaths from the index for the query string "rngnomadriv"
+    start = std::chrono::high_resolution_clock::now();
+    std::string query = "rngnomadriv";
+    // std::string query = "rngnomaindriv";
+    // std::string query = "time.rs";
+    // std::string query = "irqbypass.c";
+    for (int i = 0; i < 99; i++) {
+      // const vector<pair<float, int>> &results = idx.findSimilar(query);
+    }
-  // const vector<pair<float, int>> &results = idx.findSimilar(query, 2);
-  const vector<pair<float, int>> &results = idx.findSim(query);
-  auto search_time = std::chrono::high_resolution_clock::now();
-  duration = search_time - start;
-  cout << "Search time for 100 queries (seconds): " << duration.count() / 1000 << "\n";
+    // idx.findSim(query);
+    // auto res = idx.findDirectories(query);
+    // const vector<pair<float, int>> &results = idx.findSimilar(query);
+    // const vector<pair<float, int>> &results = idx.findDirectories(query);
+    // const vector<pair<float, std::string>> &results = idx.findFilesAndDirectories(query, true,
+    // false);
+    vector<pair<float, std::string>> results = idx.findFilesAndDirectories(query, true, false);
+    auto search_time = std::chrono::high_resolution_clock::now();
+    duration = search_time - start;
+    cout << "Search time for 100 queries (seconds): " << duration.count() / 1000 << "\n";
+    int i = 0;
+    std::cout << "query string: " << query << "\n";
+    std::cout << "Top 20 matches[1]:\n";
+    bool isDir = true;
+    for (const auto &res : results) {
+      // std::cout << res.second << " " << res.first << " " << v_filePaths[res.second] << "\n";
+      std::cout << res.first << " " << res.second << "\n";
+      i++;
+      if (i > 40) {
+        break;
+      }
+    }
+    {
+      auto results = idx.findFiles(query);
+      int i = 0;
+      std::cout << "query string: " << query << "\n";
+      std::cout << "Top 20 matchesfff:\n";
+      bool isDir = true;
+      for (const auto &res : results) {
+        std::cout << res.second << " " << res.first << " " << v_filePaths[res.second] << "\n";
+        // std::cout << res.first << " " << res.second << "\n";
+        i++;
+        if (i > 40) {
+          break;
+        }
+      }
+    }
-  int i = 0;
-  std::cout << "query string: " << query << "\n";
-  std::cout << "Top 20 matches:\n";
-  for (const auto &res : results) {
-    std::cout << res.second << " " << res.first << " " << v_filePaths[res.second] << "\n";
-    i++;
-    if (i > 20) {
-      break;
+    std::cout << "========\n";
+    for (int i = 0; i < id; i++) {
+      // std::cout << idx.getString(2) << "{}";
+      // idx.getString(i);
     }
-  }
+    std::cout << "========\n";
-    // std::cout << "Size of MyClass: " << sizeof(StrIdx::CharMap) << " bytes" << std::endl;
-    // std::cout << "Size of CharMap3: " << sizeof(StrIdx::CharMap3) << " bytes" << std::endl;
     std::cout << "Size of CharNode: " << sizeof(StrIdx::CharNode) << " bytes" << std::endl;
     std::cout << "Size of int: " << sizeof(int) << " bytes" << std::endl;
+    StrIdx::out.printl("MEMSTAT current:", getCurrentRSS(), " peak:", getPeakRSS());
+  // std::this_thread::sleep_for(std::chrono::milliseconds(7000));
+  }
+  // Force memory dealloc to properly benchmark
+  // https://www.reddit.com/r/C_Programming/comments/13dn8d7/is_malloc_trim_safe_to_use/
+  malloc_trim(0);
+  StrIdx::out.printl("MEMSTAT current:", getCurrentRSS(), " peak:", getPeakRSS());
+  // std::this_thread::sleep_for(std::chrono::milliseconds(7000));
   struct rusage usage;
   getrusage(RUSAGE_SELF, &usage);
   std::cout << "Maximum resident set size: " << usage.ru_maxrss << " kilobytes" << std::endl;
-  std::cout << "Integral shared memory size: " << usage.ru_ixrss << " kilobytes" << std::endl;
-  std::cout << "Integral unshared data size: " << usage.ru_idrss << " kilobytes" << std::endl;
-  std::cout << "Integral unshared stack size: " << usage.ru_isrss << " kilobytes" << std::endl;
   return 0;
 }

data/exe/stridx.rb CHANGED Viewed

@@ -1,21 +1,135 @@
 #!/usr/bin/env ruby
-require 'fileutils'
+require "fileutils"
 $:.unshift File.dirname(__FILE__) + "/.."
+require "server.rb"
-if ARGV[0] == "tty"
+CUR_FILE = File.basename(__FILE__)
+PID_FILE = File.expand_path("~/.config/stridx/index.pid")
+LOCK_FILE = File.expand_path("~/.config/stridx/index.lock")
+# Guards against a race condition when two instances are started at the same time.
+# Takes a non-blocking exclusive flock on LOCK_FILE. If the lock cannot be taken,
+# prints a message and exits with status 1; otherwise the file is overwritten
+# with the current PID.
+def obtain_lock_or_exit
+  # The handle is kept in an instance variable — presumably so it stays open
+  # (and the lock stays held) for the life of the process; confirm.
+  # NOTE(review): File.open raises if the directory of LOCK_FILE is missing, and
+  # the only visible mkdir_p for it is inside start, which the entry point
+  # calls after this method — confirm first-run behavior.
+  @lockfile = File.open(LOCK_FILE, File::RDWR | File::CREAT, 0644)
+  # With LOCK_NB, flock returns false instead of blocking when the lock is already held.
+  unless @lockfile.flock(File::LOCK_NB | File::LOCK_EX)
+    puts "Another instance is already running."
+    exit 1
+  end
+  # Truncate and write our PID into the lock file (informational only; the
+  # visible code never reads it back)
+  @lockfile.truncate(0)
+  @lockfile.write("#{Process.pid}\n")
+  @lockfile.flush
+end
+# Returns true when PID_FILE names a live process whose command line mentions
+# this script (CUR_FILE); returns false when there is no PID file, no such
+# process, or the PID belongs to something else.
+def running?
+  return false unless File.exist?(PID_FILE)
+  pid = File.read(PID_FILE).to_i
+  begin
+    # Check if process exists: signal 0 delivers nothing, the call only probes
+    # the PID (a missing process is handled by the ESRCH rescue below)
+    Process.kill(0, pid)
+    # Guard against a stale PID file: if the daemon was previously killed with "kill -9",
+    # the PID file may remain. A new, unrelated process could later reuse the same PID,
+    # causing a false positive when checking for an existing instance and preventing the daemon from starting.
+    # To reproduce (generic example):
+    #   ./daemon.rb      # Starts daemon
+    #   kill -9 $(cat /tmp/daemon_example.pid)  # Force kill
+    #   echo $$ > /tmp/daemon_example.pid       # Simulate reused PID (use another terminal)
+    #   ./daemon.rb      # Old version would fail here; fixed version should detect mismatch
+    # Check if command line matches this script.
+    # NOTE(review): reads /proc, so presumably Linux-only; where /proc/<pid>/cmdline
+    # is absent, the ENOENT rescue below makes this method return false.
+    cmdline = File.read("/proc/#{pid}/cmdline").split("\0")
+    correct_process = cmdline.any? { |arg| arg.include?(CUR_FILE) }
+    # NOTE(review): prints "true"/"false" on every check — looks like leftover debug output.
+    puts correct_process
+    if correct_process == false
+      puts "Old pidfile points to wrong process"
+      return false
+    end
+    return true
+  rescue Errno::ESRCH, Errno::ENOENT
+    # No such process, or no /proc entry to inspect
+    return false
+  rescue Errno::EACCES
+    # Process exists, but inaccessible — might still be ours
+    # NOTE(review): a permission failure from Process.kill is normally reported as
+    # Errno::EPERM, which is not rescued here — confirm whether that is intended.
+    return true
+  end
+end
+# Old version without the /proc command-line check: reports true whenever
+# signal 0 can be sent to the PID stored in PID_FILE.
+# Unlike running?, a permission error (EPERM) is treated as "not running".
+# NOTE(review): no caller is visible in this file — possibly kept for reference only.
+def running_old?
+  return false unless File.exist?(PID_FILE)
+  pid = File.read(PID_FILE).to_i
+  Process.kill(0, pid)
+  true
+rescue Errno::ESRCH, Errno::EPERM
+  false
+end
+# Starts the index server with the command-line arguments (ARGV).
+# Exits with status 1 if running? reports an existing instance.
+# daemonize: when true, detaches via Process.daemon, records the PID in
+#            PID_FILE and installs a TERM handler that removes that file.
+#            When false, the server runs in the foreground and no PID file
+#            is written.
+def start(daemonize: false)
+  if running?
+    puts "Daemon is already running."
+    exit 1
+  end
+  if daemonize
+    # Daemonize the process
+    Process.daemon(true, true)  # nochdir = true, noclose = true: keep the working directory and leave stdio attached
+    # Save PID (read after Process.daemon, so it is the PID of the detached process)
+    File.write(PID_FILE, Process.pid)
+    puts "Daemon started with PID #{Process.pid}"
+    # Remove the PID file on a regular shutdown request (SIGTERM, as sent by stop)
+    trap("TERM") do
+      puts "Daemon stopping..."
+      File.delete(PID_FILE) if File.exist?(PID_FILE)
+      exit
+    end
+    # NOTE(review): the directory is created only here, after File.write above has
+    # already written into it — looks like this should run earlier; confirm.
+    pid_dir_path = File.expand_path("~/.config/stridx/")
+    FileUtils.mkdir_p(pid_dir_path)
+  end
+  StrIdx::Server.start ARGV
+end
+# Stops the daemon recorded in PID_FILE by sending it SIGTERM, then removes
+# the PID file. Exits with status 1 when no PID file exists.
+# The method does not wait for the process to exit, and it does not check
+# that the PID still belongs to this script (unlike running?).
+def stop
+  unless File.exist?(PID_FILE)
+    puts "No PID file found. Daemon not running?"
+    exit 1
+  end
+  pid = File.read(PID_FILE).to_i
+  puts "Stopping daemon with PID #{pid}..."
+  Process.kill("TERM", pid)
+  # Best-effort cleanup: any error while deleting is ignored
+  File.delete(PID_FILE) rescue nil
+rescue Errno::ESRCH
+  # The recorded process no longer exists: the PID file is stale
+  puts "Process not found. Cleaning up PID file."
+  File.delete(PID_FILE) rescue nil
+end
+# Entry point
+case ARGV.first
+when "stop"
+  stop
+when "tty"
   require "stridx-tty.rb"
   StrIdxTTY.run
-elsif ARGV[0] == "bash"
+when "bash"
   puts %q/
   bind -m emacs-standard '"\er": redraw-current-line';
   bind -m emacs-standard '"\C-t": " \C-b\C-k \C-u`stridx.rb tty`\e\C-e\er\C-a\C-y\C-h\C-e\e \C-y\ey\C-x\C-x\C-f"'
 /
-else
-  require "daemons"
-  pid_dir_path = File.expand_path("~/.config/stridx/")
-  FileUtils.mkdir_p(pid_dir_path)
-  Daemons.run(File.dirname(__FILE__) + "/../runserver.rb",
-  {:dir_mode => :normal, :dir => pid_dir_path })
+when "run"
+  obtain_lock_or_exit
+  start(daemonize: false)
+when "start"
+  obtain_lock_or_exit
+  start(daemonize: true)
 end

data/rubyext/extconf.rb CHANGED Viewed

@@ -5,7 +5,7 @@ require 'mkmf'
 module_name = "stridx"
 extension_name = 'stridx'
-$CXXFLAGS << " -Wall -Wno-unused-variable -O3"
+$CXXFLAGS << " -std=c++17 -Wall -Wno-unused-variable -O3"
 have_library( 'stdc++');

data/rubyext/ruby_interf.cpp CHANGED Viewed

@@ -48,7 +48,32 @@ VALUE StringIndexWaitUntilDone(VALUE self) {
   ((StrIdx::StringIndex *)data)->waitUntilDone();
   return self;
 }
+// Implementation of the Ruby method "findNum" (registered with two arguments
+// in Init_stridx): findNum(str, limit).
+// Runs findSimilar on the query string and returns a Ruby array of
+// [integer, float] pairs built from each result's (second, first) members —
+// presumably [file id, score]; confirm against findSimilar.
+// Iteration stops once `limit` pairs have been collected.
+// NOTE(review): the limit is tested only after a pair has been appended, so a
+// limit of 0 or less still returns one pair when there are results.
+VALUE StringIndexFindNum(VALUE self, VALUE str, VALUE _limit) {
+  VALUE ret;
+  std::string s1 = StringValueCStr(str);
+  void *data;
+  // NOTE(review): the struct type handed to the macro is int, while the pointer
+  // is used as StringIndex* on the next line — confirm this is intentional.
+  TypedData_Get_Struct(self, int, &str_idx_type, data);
+  StrIdx::StringIndex *idx = (StrIdx::StringIndex *)data;
+  int limit = NUM2INT(_limit);
+  ret = rb_ary_new();
+  const std::vector<std::pair<float, int>> &results = idx->findSimilar(s1);
+  int i = 0;
+  for (const auto &res : results) {
+    // One inner array per result: integer member first, float member second
+    VALUE arr = rb_ary_new();
+    rb_ary_push(arr, INT2NUM(res.second));
+    rb_ary_push(arr, DBL2NUM(res.first));
+    rb_ary_push(ret, arr);
+    i++;
+    if (i >= limit) {
+      break;
+    }
+  }
+  return ret;
+}
 VALUE StringIndexFind(VALUE self, VALUE str) {
   VALUE ret;
@@ -109,7 +134,8 @@ VALUE StringIndexFindDirs(VALUE self, VALUE str) {
   StrIdx::StringIndex *idx = (StrIdx::StringIndex *)data;
   ret = rb_ary_new();
-  const std::vector<std::pair<float, std::string>> &results = idx->findFilesAndDirectories(s1,false,true);
+  const std::vector<std::pair<float, std::string>> &results =
+      idx->findFilesAndDirectories(s1, false, true);
   int limit = 40;
   int i = 0;
   for (const auto &res : results) {
@@ -125,10 +151,6 @@ VALUE StringIndexFindDirs(VALUE self, VALUE str) {
   return ret;
 }
 VALUE StringIndexSetDirSeparator(VALUE self, VALUE str) {
   char c = '/';
   if (TYPE(str) == T_STRING) {
@@ -149,6 +171,17 @@ VALUE StringIndexSetDirSeparator(VALUE self, VALUE str) {
   return self;
 }
+// Implementation of the Ruby method "setDirWeight" (registered with one
+// argument in Init_stridx). Forwards a Float argument to
+// StringIndex::setDirWeight and returns self.
+// NOTE(review): any argument that is not a Ruby Float (including an Integer)
+// is silently ignored and the weight is left unchanged — confirm intended.
+VALUE StringIndexSetDirWeight(VALUE self, VALUE d) {
+  if (TYPE(d) == T_FLOAT) {
+    // d is already known to be a Float here, so the to_f call looks redundant
+    double c_float = NUM2DBL(rb_funcall(d, rb_intern("to_f"), 0));
+    void *data;
+    TypedData_Get_Struct(self, int, &str_idx_type, data);
+    StrIdx::StringIndex *idx = (StrIdx::StringIndex *)data;
+    idx->setDirWeight(c_float);
+  }
+  return self;
+}
 void Init_stridx(void) {
   VALUE mStrIdx = rb_define_module("StrIdx");
@@ -158,12 +191,12 @@ void Init_stridx(void) {
   rb_define_method(classStringIndex, "add", StringIndexAddSegments, 2);
   rb_define_method(classStringIndex, "waitUntilDone", StringIndexWaitUntilDone, 0);
   rb_define_method(classStringIndex, "find", StringIndexFind, 1);
+  rb_define_method(classStringIndex, "findNum", StringIndexFindNum, 2);
+  rb_define_method(classStringIndex, "setDirWeight", StringIndexSetDirWeight, 1);
   rb_define_method(classStringIndex, "findFilesAndDirs", StringIndexFindFilesAndDirs, 1);
   rb_define_method(classStringIndex, "findDirs", StringIndexFindDirs, 1);
   rb_define_method(classStringIndex, "setDirSeparator", StringIndexSetDirSeparator, 1);
 }
 } // End extern "C"

data/runserver.rb CHANGED Viewed

@@ -1,4 +1,6 @@
 #!/usr/bin/env ruby
+# Add cur dir to load path
 $:.unshift File.dirname(__FILE__)
 def kill_signal

data/server.rb CHANGED Viewed

@@ -33,6 +33,7 @@ module StrIdx
     def initialize(dir_list, daemonize: false)
       idx = StrIdx::StringIndex.new
       idx.setDirSeparator("/")
+      idx.setDirWeight(0.85) # Lower scores for directory matches
       t = Time.new

data/stridx.gemspec CHANGED Viewed

@@ -1,6 +1,6 @@
 Gem::Specification.new do |spec|
   spec.name = "StrIdx"
-  spec.version = "0.1.5"
+  spec.version = "0.1.6"
   spec.authors = ["Sami Sieranoja"]
   spec.email = ["sami.sieranoja@gmail.com"]

data/stridx.hpp CHANGED Viewed

@@ -324,7 +324,11 @@ struct PathSegment {
 // Candidate for result in string (filename) search
 struct Candidate {
+	//This holds the subscores for each character in the query string
   std::vector<float> v_charscore;
   PathSegment *seg;
   int fileId;
   // The string that this candidate represents
@@ -343,6 +347,7 @@ struct Candidate {
     candLen = seg->size();
   }
+	// Sum subscores in v_charscore and normalize to get final score
   [[nodiscard]] float getScore() const {
     int i = 0;
     float score = 0.0;
@@ -375,13 +380,14 @@ private:
   std::vector<PathSegment *> segsToClean;
+	// Maps ids stored in charTree to the corresponding PathSegments
   std::unordered_map<int, PathSegment *> seglist;
   std::unordered_map<int, PathSegment *> seglist_dir;
   std::mutex seglist_mu;
   PathSegment *root;
   int dirId = 0;
-  float dirWeight = 0.7; // Give only 70% of score if match is for a directory
+  float dirWeight = 1.0; // =0.7: Give only 70% of score if match is for a directory
   std::unique_ptr<ThreadPool> pool;
   Output out{1}; // verbose level = 1
@@ -588,12 +594,17 @@ public:
   void searchCharTree(const std::string &query, CandMap &candmap, CharTree &chartr) {
     int last_start = query.size() - 2;
+    // Loop over all possible start positions in the query string: indexes [0..(n-2)], n = query.size()
     for (int start = 0; start <= last_start; start++) {
       CharNode *cn = chartr.root;
+      // select a suffix (substring) starting from start, but cap length to 8 chars
       int end = std::min(start + 7, ((int)query.size()) - 1);
       int nchars = end - start + 1;
       std::string s = query.substr(start, nchars);
+			// Loop over all chars of the query substring,
+			// traversing the char tree from its root one character at a time
       for (int i = 0; i < s.size(); i++) {
         char c = s[i];
         CharNode *x = cn->find(c);
@@ -601,12 +612,20 @@ public:
           cn = x;
           // Consider scores only for substrings with size >= 2
           if (i > 0) {
+          	// If we've reached here, size of substring is i+2
+          	// Get identifiers of files that include substring
+          	// query[start..(start+i+1)] ??
             std::set<int> ids = cn->getIds();
             for (const int &y : ids) {
               PathSegment *p = nullptr;
+							// Searching in file segments
+							// (or no file/dir separation)
               if (&chartr == &cm) {
                 p = seglist[y];
               } else {
+							// Searching in dir segments
                 p = seglist_dir[y];
               }
               assert(p != nullptr);

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: StrIdx
 version: !ruby/object:Gem::Version
-  version: 0.1.5
+  version: 0.1.6
 platform: ruby
 authors:
 - Sami Sieranoja
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2025-06-05 00:00:00.000000000 Z
+date: 2025-07-11 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler