StrIdx 0.1.4 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f7655b6bd71bca58c86ad607fd197933fc19b97b3ae1c76e322ec0432025dad7
4
- data.tar.gz: 2421892aa6fe750213d08e2254019d87ce7abb10496cdf1635a61815b8b8b0d7
3
+ metadata.gz: e79f1bebc1e56e0a1966ae5ff69b50e4048953dd338807a6c9dba1c53b1c8f34
4
+ data.tar.gz: 28eea5841ce96f9460975720a43a2655f4902a9c44458a8aae01ce0b3077e67d
5
5
  SHA512:
6
- metadata.gz: 0a0ed3f51b95b72a553cf97e1a852f63e8b6d1cbbba56fdab55ed5037ef4d658b8649f2cf92d35b5eea4e6657a18a3a1460a3a57069cda0889a1987f5d1611ee
7
- data.tar.gz: f0d4753ee43cb205fa86468dad92a66644e12cf889d8018283cb421e4a5d670386b8666af64b89d4d475a59f10701de2223c31d03ac5d366f2ee255c77190cf8
6
+ metadata.gz: e82430abd644e876dd758ceaffb3f759e19fde2a44cc85872f01721943c42d85d2032cbde91d235d44b7921833ed044241692247c88b56f54c6ea7248f5e584e
7
+ data.tar.gz: 642e16d0e6291474b007e0341bf6571576299f353bf1f5ff13a5c9966d9b1b2450de052f3ebad6f44c9ab9b341686dd41b4adf35922406e3df358c8cbabc9c31
data/Makefile CHANGED
@@ -1,6 +1,7 @@
1
1
  all: demo
2
2
 
3
3
  demo: *.hpp *.cpp Makefile
4
+ #g++ -Wall -O0 -g -lstdc++ demo.cpp -o demo
4
5
  g++ -Wall -O3 -lstdc++ demo.cpp -o demo
5
6
 
6
7
  clean:
data/README.md CHANGED
@@ -5,8 +5,14 @@ The fuzziness means that candidate filepaths do not need to include exact match
5
5
 
6
6
  The library can also be applied to UTF-8 data, although there is a small bias in scoring for multibyte characters.
7
7
 
8
+ 1. [String similarity calculation](#stringsim)
9
+ 2. [Interfaces](#interfaces)
10
+ 1. [Command line](#comm)
11
+ 2. [Ruby](#ruby)
12
+ 3. [C++](#cpp)
8
13
 
9
- ## String similarity calculation
14
+
15
+ ## String similarity calculation <a name="stringsim"/>
10
16
 
11
17
  Once the index has been created, the contents can be searched to find the best matching strings.
12
18
 
@@ -39,7 +45,7 @@ and candidate is "./drivers/char/hw_random/nomadik-rng.c", then scores are calcu
39
45
 
40
46
  # Interfaces
41
47
 
42
- ## Commandline
48
+ ## Commandline <a name="comm"/>
43
49
  Install instructions (for Ubuntu Linux):
44
50
  ```
45
51
  apt update
@@ -55,7 +61,14 @@ stridx.rb start -- ~/Documents/ ~/Pictures/
55
61
  Add bash keybindings (Ctrl-t):
56
62
  ```
57
63
  eval "$(stridx.rb bash)"
64
+
65
+ ```
66
+
67
+ To autostart the server, add the following line to .bashrc:
58
68
  ```
69
+ stridx.rb start -- ~/Documents/ ~/Pictures/
70
+ ```
71
+
59
72
 
60
73
  Search by pressing <kbd>ctrl</kbd>+<kbd>t</kbd>. Keys: <kbd>up</kbd>, <kbd>down</kbd>, select with <kbd>enter</kbd>
61
74
 
@@ -151,7 +164,7 @@ Search time: 0.0488 seconds
151
164
  ```
152
165
 
153
166
 
154
- ## C++
167
+ ## C++ <a name="cpp"/>
155
168
  See demo.cpp
156
169
  ```cpp
157
170
  #include "stridx.hpp"
data/demo.cpp CHANGED
@@ -1,4 +1,9 @@
1
1
 
2
+ #include <sys/resource.h>
3
+ #include <malloc.h>
4
+
5
+ #include "mem_info.h"
6
+
2
7
  #include <condition_variable>
3
8
  #include <functional>
4
9
  #include <iostream>
@@ -19,7 +24,7 @@ using std::cout;
19
24
  using std::pair;
20
25
  using std::vector;
21
26
 
22
- std::vector<std::string> readLinesFromFile(const std::string &filename) {
27
+ std::vector<std::string> readLinesFromFile(const std::string &filename, int limit = 0) {
23
28
  std::vector<std::string> lines;
24
29
  std::ifstream file(filename);
25
30
  if (!file.is_open()) {
@@ -28,8 +33,10 @@ std::vector<std::string> readLinesFromFile(const std::string &filename) {
28
33
  }
29
34
 
30
35
  std::string line;
31
- while (std::getline(file, line)) {
36
+ int i=0;
37
+ while (std::getline(file, line) && ( limit == 0 || i < limit) ) {
32
38
  lines.push_back(line);
39
+ i++;
33
40
  }
34
41
 
35
42
  file.close();
@@ -37,59 +44,134 @@ std::vector<std::string> readLinesFromFile(const std::string &filename) {
37
44
  }
38
45
 
39
46
  int main() {
40
- StrIdx::StringIndex idx('/'); // Separate directories using unix style "/" char
41
- // idx.addStrToIndex("./gdk/x11/gdkasync.c", 0 /*id*/, '/' /*separator*/);
42
- // idx.addStrToIndex("./gdk/x11/gdksettings.c", 1, '/');
43
- // idx.addStrToIndex("./gdk/x11/gdkx11devicemanager-xi2.h", 2, '/');
44
-
45
- // Add the file paths of 89828 files in linux-6.9-rc6 to the index
46
- std::string fn_filePaths = "flist.txt";
47
- std::vector<std::string> v_filePaths = readLinesFromFile(fn_filePaths);
48
-
49
- // Launch indexing to be run on background
50
- cout << "File paths: " << v_filePaths.size() << std::endl;
51
- cout << "Start indexing in the background" << std::endl;
52
- auto start = std::chrono::high_resolution_clock::now();
53
- int id = 0;
54
- for (const auto &filePath : v_filePaths) {
55
- idx.addStrToIndexThreaded(filePath, id);
56
- id++;
57
- }
47
+ {
48
+ StrIdx::StringIndex idx('/'); // Separate directories using unix style "/" char
49
+ // idx.addStrToIndex("./gdk/x11/gdkasync.c", 0 /*id*/, '/' /*separator*/);
50
+ // idx.addStrToIndex("./gdk/x11/gdksettings.c", 1, '/');
51
+ // idx.addStrToIndex("./gdk/x11/gdkx11devicemanager-xi2.h", 2, '/');
52
+
53
+ // Add the file paths of 89828 files in linux-6.9-rc6 to the index
54
+ std::string fn_filePaths = "flist2.txt";
55
+ std::vector<std::string> v_filePaths = readLinesFromFile(fn_filePaths);
56
+ // std::vector<std::string> v_filePaths = readLinesFromFile(fn_filePaths,10000);
57
+
58
+ // int* a = new int[10];
59
+ // delete(a);
60
+ // delete(a);
61
+
62
+ // Launch indexing to be run on background
63
+ cout << "File paths: " << v_filePaths.size() << std::endl;
64
+ cout << "Start indexing in the background" << std::endl;
65
+ auto start = std::chrono::high_resolution_clock::now();
66
+ int id = 0;
67
+ for (const auto &filePath : v_filePaths) {
68
+ // idx.addStrToIndexThreaded(filePath, id);
69
+ idx.addStrToIndex(filePath, id);
70
+ id++;
71
+ }
72
+
73
+ std::cout << "========\n";
74
+ for (int i = 0; i < id; i++) {
75
+ // std::cout << idx.getString(2) << "{}";
76
+ idx.getString(i);
77
+ }
78
+ std::cout << "========\n";
79
+
80
+ auto idx_time_launch = std::chrono::high_resolution_clock::now();
81
+ std::chrono::duration<double, std::milli> duration_launch = idx_time_launch - start;
82
+ cout << "Indexing launch time (seconds): " << duration_launch.count() / 1000 << "\n";
83
+
84
+ // Wait until indexing has finished
85
+ idx.waitUntilDone();
86
+
87
+ auto idx_time = std::chrono::high_resolution_clock::now();
88
+ std::chrono::duration<double, std::milli> duration = idx_time - start;
89
+ cout << "Indexing finished time for " << v_filePaths.size()
90
+ << " file paths (seconds): " << duration.count() / 1000 << "\n";
91
+
92
+ cout << "DEBUG" << std::endl;
93
+ // idx.cm.debug();
94
+ cout << "END DEBUG" << std::endl;
95
+
96
+ // Find matching filepaths from the index for the query string "rngnomadriv"
97
+ start = std::chrono::high_resolution_clock::now();
98
+ std::string query = "rngnomadriv";
99
+ // std::string query = "rngnomaindriv";
100
+ // std::string query = "time.rs";
101
+ // std::string query = "irqbypass.c";
102
+ for (int i = 0; i < 99; i++) {
103
+ // const vector<pair<float, int>> &results = idx.findSimilar(query);
104
+ }
58
105
 
59
- auto idx_time_launch = std::chrono::high_resolution_clock::now();
60
- std::chrono::duration<double, std::milli> duration_launch = idx_time_launch - start;
61
- cout << "Indexing launch time (seconds): " << duration_launch.count() / 1000 << "\n";
106
+ // idx.findSim(query);
107
+
108
+ // auto res = idx.findDirectories(query);
109
+
110
+ // const vector<pair<float, int>> &results = idx.findSimilar(query);
111
+ // const vector<pair<float, int>> &results = idx.findDirectories(query);
112
+ // const vector<pair<float, std::string>> &results = idx.findFilesAndDirectories(query, true,
113
+ // false);
114
+ vector<pair<float, std::string>> results = idx.findFilesAndDirectories(query, true, false);
115
+
116
+ auto search_time = std::chrono::high_resolution_clock::now();
117
+ duration = search_time - start;
118
+ cout << "Search time for 100 queries (seconds): " << duration.count() / 1000 << "\n";
119
+
120
+ int i = 0;
121
+ std::cout << "query string: " << query << "\n";
122
+ std::cout << "Top 20 matches[1]:\n";
123
+ bool isDir = true;
124
+ for (const auto &res : results) {
125
+ // std::cout << res.second << " " << res.first << " " << v_filePaths[res.second] << "\n";
126
+ std::cout << res.first << " " << res.second << "\n";
127
+ i++;
128
+ if (i > 40) {
129
+ break;
130
+ }
131
+ }
132
+
133
+ {
134
+
135
+ auto results = idx.findFiles(query);
136
+ int i = 0;
137
+ std::cout << "query string: " << query << "\n";
138
+ std::cout << "Top 20 matchesfff:\n";
139
+ bool isDir = true;
140
+ for (const auto &res : results) {
141
+ std::cout << res.second << " " << res.first << " " << v_filePaths[res.second] << "\n";
142
+ // std::cout << res.first << " " << res.second << "\n";
143
+ i++;
144
+ if (i > 40) {
145
+ break;
146
+ }
147
+ }
148
+ }
62
149
 
63
- // Wait until indexing has finished
64
- idx.waitUntilDone();
150
+ std::cout << "========\n";
151
+ for (int i = 0; i < id; i++) {
152
+ // std::cout << idx.getString(2) << "{}";
153
+ // idx.getString(i);
154
+ }
155
+ std::cout << "========\n";
65
156
 
66
- auto idx_time = std::chrono::high_resolution_clock::now();
67
- std::chrono::duration<double, std::milli> duration = idx_time - start;
68
- cout << "Indexing finished time for " << v_filePaths.size()
69
- << " file paths (seconds): " << duration.count() / 1000 << "\n";
157
+ std::cout << "Size of CharNode: " << sizeof(StrIdx::CharNode) << " bytes" << std::endl;
158
+ std::cout << "Size of int: " << sizeof(int) << " bytes" << std::endl;
70
159
 
71
- // Find matching filepaths from the index for the query string "rngnomadriv"
72
- start = std::chrono::high_resolution_clock::now();
73
- std::string query = "rngnomadriv";
74
- for (int i = 0; i < 99; i++) {
75
- const vector<pair<float, int>> &results = idx.findSimilar(query, 2);
160
+ StrIdx::out.printl("MEMSTAT current:", getCurrentRSS(), " peak:", getPeakRSS());
161
+ // std::this_thread::sleep_for(std::chrono::milliseconds(7000));
162
+
76
163
  }
77
164
 
78
- const vector<pair<float, int>> &results = idx.findSimilar(query, 2);
79
- auto search_time = std::chrono::high_resolution_clock::now();
80
- duration = search_time - start;
81
- cout << "Search time for 100 queries (seconds): " << duration.count() / 1000 << "\n";
165
+ // Force memory dealloc to properly benchmark
166
+ // https://www.reddit.com/r/C_Programming/comments/13dn8d7/is_malloc_trim_safe_to_use/
167
+ malloc_trim(0);
168
+
169
+ StrIdx::out.printl("MEMSTAT current:", getCurrentRSS(), " peak:", getPeakRSS());
82
170
 
83
- int i = 0;
84
- std::cout << "query string: " << query << "\n";
85
- std::cout << "Top 20 matches:\n";
86
- for (const auto &res : results) {
87
- std::cout << res.second << " " << res.first << " " << v_filePaths[res.second] << "\n";
88
- i++;
89
- if (i > 20) {
90
- break;
91
- }
92
- }
171
+ // std::this_thread::sleep_for(std::chrono::milliseconds(7000));
172
+ struct rusage usage;
173
+ getrusage(RUSAGE_SELF, &usage);
174
+ std::cout << "Maximum resident set size: " << usage.ru_maxrss << " kilobytes" << std::endl;
93
175
 
94
176
  return 0;
95
177
  }
data/exe/stridx.rb CHANGED
@@ -1,16 +1,135 @@
1
1
  #!/usr/bin/env ruby
2
+ require "fileutils"
2
3
 
3
4
  $:.unshift File.dirname(__FILE__) + "/.."
5
+ require "server.rb"
4
6
 
5
- if ARGV[0] == "tty"
7
+ CUR_FILE = File.basename(__FILE__)
8
+ PID_FILE = File.expand_path("~/.config/stridx/index.pid")
9
+ LOCK_FILE = File.expand_path("~/.config/stridx/index.lock")
10
+
11
+
12
+
13
+ # Guards against a race condition when two processes are started at the same time
14
+ def obtain_lock_or_exit
15
+ @lockfile = File.open(LOCK_FILE, File::RDWR | File::CREAT, 0644)
16
+
17
+ unless @lockfile.flock(File::LOCK_NB | File::LOCK_EX)
18
+ puts "Another instance is already running."
19
+ exit 1
20
+ end
21
+
22
+ # Optionally truncate and write PID for info/logging
23
+ @lockfile.truncate(0)
24
+ @lockfile.write("#{Process.pid}\n")
25
+ @lockfile.flush
26
+ end
27
+
28
+ def running?
29
+ return false unless File.exist?(PID_FILE)
30
+
31
+ pid = File.read(PID_FILE).to_i
32
+
33
+ begin
34
+ # Check if process exists
35
+ Process.kill(0, pid)
36
+
37
+ # Handle race condition: if the daemon was previously killed with "kill -9",
38
+ # the PID file may remain. A new, unrelated process could later reuse the same PID,
39
+ # causing a false positive when checking for an existing instance and preventing the daemon from starting.
40
+
41
+ # ./daemon.rb # Starts daemon
42
+ # kill -9 $(cat /tmp/daemon_example.pid) # Force kill
43
+ # echo $$ > /tmp/daemon_example.pid # Simulate reused PID (use another terminal)
44
+ # ./daemon.rb # Old version would fail here; fixed version should detect mismatch
45
+
46
+ # Check if command line matches this script
47
+ cmdline = File.read("/proc/#{pid}/cmdline").split("\0")
48
+
49
+ correct_process = cmdline.any? { |arg| arg.include?(CUR_FILE) }
50
+ puts correct_process
51
+ if correct_process == false
52
+ puts "Old pidfile points to wrong process"
53
+ return false
54
+ end
55
+
56
+ return true
57
+ rescue Errno::ESRCH, Errno::ENOENT
58
+ return false
59
+ rescue Errno::EACCES
60
+ # Process exists, but inaccessible — might still be ours
61
+ return true
62
+ end
63
+ end
64
+
65
+ # Old version without /proc check
66
+ def running_old?
67
+ return false unless File.exist?(PID_FILE)
68
+ pid = File.read(PID_FILE).to_i
69
+ Process.kill(0, pid)
70
+ true
71
+ rescue Errno::ESRCH, Errno::EPERM
72
+ false
73
+ end
74
+
75
+ def start(daemonize: false)
76
+ if running?
77
+ puts "Daemon is already running."
78
+ exit 1
79
+ end
80
+
81
+ if daemonize
82
+ # Daemonize the process
83
+ Process.daemon(true, true) # nochdir=true, noclose=true: keep the working directory and leave stdio attached
84
+
85
+ # Save PID
86
+ File.write(PID_FILE, Process.pid)
87
+ puts "Daemon started with PID #{Process.pid}"
88
+
89
+ trap("TERM") do
90
+ puts "Daemon stopping..."
91
+ File.delete(PID_FILE) if File.exist?(PID_FILE)
92
+ exit
93
+ end
94
+
95
+ pid_dir_path = File.expand_path("~/.config/stridx/")
96
+ FileUtils.mkdir_p(pid_dir_path)
97
+ end
98
+
99
+ StrIdx::Server.start ARGV
100
+ end
101
+
102
+ def stop
103
+ unless File.exist?(PID_FILE)
104
+ puts "No PID file found. Daemon not running?"
105
+ exit 1
106
+ end
107
+
108
+ pid = File.read(PID_FILE).to_i
109
+ puts "Stopping daemon with PID #{pid}..."
110
+ Process.kill("TERM", pid)
111
+ File.delete(PID_FILE) rescue nil
112
+ rescue Errno::ESRCH
113
+ puts "Process not found. Cleaning up PID file."
114
+ File.delete(PID_FILE) rescue nil
115
+ end
116
+
117
+ # Entry point
118
+ case ARGV.first
119
+ when "stop"
120
+ stop
121
+ when "tty"
6
122
  require "stridx-tty.rb"
7
123
  StrIdxTTY.run
8
- elsif ARGV[0] == "bash"
124
+ when "bash"
9
125
  puts %q/
10
126
  bind -m emacs-standard '"\er": redraw-current-line';
11
127
  bind -m emacs-standard '"\C-t": " \C-b\C-k \C-u`stridx.rb tty`\e\C-e\er\C-a\C-y\C-h\C-e\e \C-y\ey\C-x\C-x\C-f"'
12
128
  /
13
- else
14
- require "daemons"
15
- Daemons.run(File.dirname(__FILE__) + "/../runserver.rb")
129
+ when "run"
130
+ obtain_lock_or_exit
131
+ start(daemonize: false)
132
+ when "start"
133
+ obtain_lock_or_exit
134
+ start(daemonize: true)
16
135
  end