StrIdx 0.1.4 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Makefile +1 -0
- data/README.md +16 -3
- data/demo.cpp +130 -48
- data/exe/stridx.rb +124 -5
- data/flist.txt +0 -5550
- data/rubyext/extconf.rb +1 -1
- data/rubyext/ruby_interf.cpp +95 -6
- data/runserver.rb +22 -0
- data/server.rb +8 -2
- data/stridx.gemspec +1 -5
- data/stridx.hpp +431 -227
- data/thread_pool.hpp +20 -5
- data/unittest.cpp +58 -16
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e79f1bebc1e56e0a1966ae5ff69b50e4048953dd338807a6c9dba1c53b1c8f34
|
4
|
+
data.tar.gz: 28eea5841ce96f9460975720a43a2655f4902a9c44458a8aae01ce0b3077e67d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e82430abd644e876dd758ceaffb3f759e19fde2a44cc85872f01721943c42d85d2032cbde91d235d44b7921833ed044241692247c88b56f54c6ea7248f5e584e
|
7
|
+
data.tar.gz: 642e16d0e6291474b007e0341bf6571576299f353bf1f5ff13a5c9966d9b1b2450de052f3ebad6f44c9ab9b341686dd41b4adf35922406e3df358c8cbabc9c31
|
data/Makefile
CHANGED
data/README.md
CHANGED
@@ -5,8 +5,14 @@ The fuzziness means that candidate filepaths do not need to include exact match
|
|
5
5
|
|
6
6
|
The library can also be applied to UTF-8 data, although there is a small bias in scoring for multibyte characters.
|
7
7
|
|
8
|
+
1. [String similarity calculation](#stringsim)
|
9
|
+
2. [Interfaces](#interfaces)
|
10
|
+
1. [Command line](#comm)
|
11
|
+
2. [Ruby](#ruby)
|
12
|
+
3. [C++](#cpp)
|
8
13
|
|
9
|
-
|
14
|
+
|
15
|
+
## String similarity calculation <a name="stringsim"/>
|
10
16
|
|
11
17
|
Once the index has been created, the contents can be searched to find the best matching strings.
|
12
18
|
|
@@ -39,7 +45,7 @@ and candidate is "./drivers/char/hw_random/nomadik-rng.c", then scores are calcu
|
|
39
45
|
|
40
46
|
# Interfaces
|
41
47
|
|
42
|
-
## Commandline
|
48
|
+
## Commandline <a name="comm"/>
|
43
49
|
Install instructions (for Ubuntu Linux):
|
44
50
|
```
|
45
51
|
apt update
|
@@ -55,7 +61,14 @@ stridx.rb start -- ~/Documents/ ~/Pictures/
|
|
55
61
|
Add bash keybindings (Ctrl-t):
|
56
62
|
```
|
57
63
|
eval "$(stridx.rb bash)"
|
64
|
+
|
65
|
+
```
|
66
|
+
|
67
|
+
To autostart the server, add the following line to .bashrc:
|
58
68
|
```
|
69
|
+
stridx.rb start -- ~/Documents/ ~/Pictures/
|
70
|
+
```
|
71
|
+
|
59
72
|
|
60
73
|
Search by pressing <kbd>ctrl</kbd>+<kbd>t</kbd>. Keys: <kbd>up</kbd>, <kbd>down</kbd>, select with <kbd>enter</kbd>
|
61
74
|
|
@@ -151,7 +164,7 @@ Search time: 0.0488 seconds
|
|
151
164
|
```
|
152
165
|
|
153
166
|
|
154
|
-
## C++
|
167
|
+
## C++ <a name="cpp"/>
|
155
168
|
See demo.cpp
|
156
169
|
```cpp
|
157
170
|
#include "stridx.hpp"
|
data/demo.cpp
CHANGED
@@ -1,4 +1,9 @@
|
|
1
1
|
|
2
|
+
#include <sys/resource.h>
|
3
|
+
#include <malloc.h>
|
4
|
+
|
5
|
+
#include "mem_info.h"
|
6
|
+
|
2
7
|
#include <condition_variable>
|
3
8
|
#include <functional>
|
4
9
|
#include <iostream>
|
@@ -19,7 +24,7 @@ using std::cout;
|
|
19
24
|
using std::pair;
|
20
25
|
using std::vector;
|
21
26
|
|
22
|
-
std::vector<std::string> readLinesFromFile(const std::string &filename) {
|
27
|
+
std::vector<std::string> readLinesFromFile(const std::string &filename, int limit = 0) {
|
23
28
|
std::vector<std::string> lines;
|
24
29
|
std::ifstream file(filename);
|
25
30
|
if (!file.is_open()) {
|
@@ -28,8 +33,10 @@ std::vector<std::string> readLinesFromFile(const std::string &filename) {
|
|
28
33
|
}
|
29
34
|
|
30
35
|
std::string line;
|
31
|
-
|
36
|
+
int i=0;
|
37
|
+
while (std::getline(file, line) && ( limit == 0 || i < limit) ) {
|
32
38
|
lines.push_back(line);
|
39
|
+
i++;
|
33
40
|
}
|
34
41
|
|
35
42
|
file.close();
|
@@ -37,59 +44,134 @@ std::vector<std::string> readLinesFromFile(const std::string &filename) {
|
|
37
44
|
}
|
38
45
|
|
39
46
|
int main() {
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
47
|
+
{
|
48
|
+
StrIdx::StringIndex idx('/'); // Separate directories using unix style "/" char
|
49
|
+
// idx.addStrToIndex("./gdk/x11/gdkasync.c", 0 /*id*/, '/' /*separator*/);
|
50
|
+
// idx.addStrToIndex("./gdk/x11/gdksettings.c", 1, '/');
|
51
|
+
// idx.addStrToIndex("./gdk/x11/gdkx11devicemanager-xi2.h", 2, '/');
|
52
|
+
|
53
|
+
// Add the file paths of 89828 files in linux-6.9-rc6 to the index
|
54
|
+
std::string fn_filePaths = "flist2.txt";
|
55
|
+
std::vector<std::string> v_filePaths = readLinesFromFile(fn_filePaths);
|
56
|
+
// std::vector<std::string> v_filePaths = readLinesFromFile(fn_filePaths,10000);
|
57
|
+
|
58
|
+
// int* a = new int[10];
|
59
|
+
// delete(a);
|
60
|
+
// delete(a);
|
61
|
+
|
62
|
+
// Launch indexing to run in the background
|
63
|
+
cout << "File paths: " << v_filePaths.size() << std::endl;
|
64
|
+
cout << "Start indexing in the background" << std::endl;
|
65
|
+
auto start = std::chrono::high_resolution_clock::now();
|
66
|
+
int id = 0;
|
67
|
+
for (const auto &filePath : v_filePaths) {
|
68
|
+
// idx.addStrToIndexThreaded(filePath, id);
|
69
|
+
idx.addStrToIndex(filePath, id);
|
70
|
+
id++;
|
71
|
+
}
|
72
|
+
|
73
|
+
std::cout << "========\n";
|
74
|
+
for (int i = 0; i < id; i++) {
|
75
|
+
// std::cout << idx.getString(2) << "{}";
|
76
|
+
idx.getString(i);
|
77
|
+
}
|
78
|
+
std::cout << "========\n";
|
79
|
+
|
80
|
+
auto idx_time_launch = std::chrono::high_resolution_clock::now();
|
81
|
+
std::chrono::duration<double, std::milli> duration_launch = idx_time_launch - start;
|
82
|
+
cout << "Indexing launch time (seconds): " << duration_launch.count() / 1000 << "\n";
|
83
|
+
|
84
|
+
// Wait until indexing has finished
|
85
|
+
idx.waitUntilDone();
|
86
|
+
|
87
|
+
auto idx_time = std::chrono::high_resolution_clock::now();
|
88
|
+
std::chrono::duration<double, std::milli> duration = idx_time - start;
|
89
|
+
cout << "Indexing finished time for " << v_filePaths.size()
|
90
|
+
<< " file paths (seconds): " << duration.count() / 1000 << "\n";
|
91
|
+
|
92
|
+
cout << "DEBUG" << std::endl;
|
93
|
+
// idx.cm.debug();
|
94
|
+
cout << "END DEBUG" << std::endl;
|
95
|
+
|
96
|
+
// Find matching filepaths from the index for the query string "rngnomadriv"
|
97
|
+
start = std::chrono::high_resolution_clock::now();
|
98
|
+
std::string query = "rngnomadriv";
|
99
|
+
// std::string query = "rngnomaindriv";
|
100
|
+
// std::string query = "time.rs";
|
101
|
+
// std::string query = "irqbypass.c";
|
102
|
+
for (int i = 0; i < 99; i++) {
|
103
|
+
// const vector<pair<float, int>> &results = idx.findSimilar(query);
|
104
|
+
}
|
58
105
|
|
59
|
-
|
60
|
-
|
61
|
-
|
106
|
+
// idx.findSim(query);
|
107
|
+
|
108
|
+
// auto res = idx.findDirectories(query);
|
109
|
+
|
110
|
+
// const vector<pair<float, int>> &results = idx.findSimilar(query);
|
111
|
+
// const vector<pair<float, int>> &results = idx.findDirectories(query);
|
112
|
+
// const vector<pair<float, std::string>> &results = idx.findFilesAndDirectories(query, true,
|
113
|
+
// false);
|
114
|
+
vector<pair<float, std::string>> results = idx.findFilesAndDirectories(query, true, false);
|
115
|
+
|
116
|
+
auto search_time = std::chrono::high_resolution_clock::now();
|
117
|
+
duration = search_time - start;
|
118
|
+
cout << "Search time for 100 queries (seconds): " << duration.count() / 1000 << "\n";
|
119
|
+
|
120
|
+
int i = 0;
|
121
|
+
std::cout << "query string: " << query << "\n";
|
122
|
+
std::cout << "Top 20 matches[1]:\n";
|
123
|
+
bool isDir = true;
|
124
|
+
for (const auto &res : results) {
|
125
|
+
// std::cout << res.second << " " << res.first << " " << v_filePaths[res.second] << "\n";
|
126
|
+
std::cout << res.first << " " << res.second << "\n";
|
127
|
+
i++;
|
128
|
+
if (i > 40) {
|
129
|
+
break;
|
130
|
+
}
|
131
|
+
}
|
132
|
+
|
133
|
+
{
|
134
|
+
|
135
|
+
auto results = idx.findFiles(query);
|
136
|
+
int i = 0;
|
137
|
+
std::cout << "query string: " << query << "\n";
|
138
|
+
std::cout << "Top 20 matchesfff:\n";
|
139
|
+
bool isDir = true;
|
140
|
+
for (const auto &res : results) {
|
141
|
+
std::cout << res.second << " " << res.first << " " << v_filePaths[res.second] << "\n";
|
142
|
+
// std::cout << res.first << " " << res.second << "\n";
|
143
|
+
i++;
|
144
|
+
if (i > 40) {
|
145
|
+
break;
|
146
|
+
}
|
147
|
+
}
|
148
|
+
}
|
62
149
|
|
63
|
-
|
64
|
-
|
150
|
+
std::cout << "========\n";
|
151
|
+
for (int i = 0; i < id; i++) {
|
152
|
+
// std::cout << idx.getString(2) << "{}";
|
153
|
+
// idx.getString(i);
|
154
|
+
}
|
155
|
+
std::cout << "========\n";
|
65
156
|
|
66
|
-
|
67
|
-
|
68
|
-
cout << "Indexing finished time for " << v_filePaths.size()
|
69
|
-
<< " file paths (seconds): " << duration.count() / 1000 << "\n";
|
157
|
+
std::cout << "Size of CharNode: " << sizeof(StrIdx::CharNode) << " bytes" << std::endl;
|
158
|
+
std::cout << "Size of int: " << sizeof(int) << " bytes" << std::endl;
|
70
159
|
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
for (int i = 0; i < 99; i++) {
|
75
|
-
const vector<pair<float, int>> &results = idx.findSimilar(query, 2);
|
160
|
+
StrIdx::out.printl("MEMSTAT current:", getCurrentRSS(), " peak:", getPeakRSS());
|
161
|
+
// std::this_thread::sleep_for(std::chrono::milliseconds(7000));
|
162
|
+
|
76
163
|
}
|
77
164
|
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
165
|
+
// Force memory dealloc to properly benchmark
|
166
|
+
// https://www.reddit.com/r/C_Programming/comments/13dn8d7/is_malloc_trim_safe_to_use/
|
167
|
+
malloc_trim(0);
|
168
|
+
|
169
|
+
StrIdx::out.printl("MEMSTAT current:", getCurrentRSS(), " peak:", getPeakRSS());
|
82
170
|
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
std::cout << res.second << " " << res.first << " " << v_filePaths[res.second] << "\n";
|
88
|
-
i++;
|
89
|
-
if (i > 20) {
|
90
|
-
break;
|
91
|
-
}
|
92
|
-
}
|
171
|
+
// std::this_thread::sleep_for(std::chrono::milliseconds(7000));
|
172
|
+
struct rusage usage;
|
173
|
+
getrusage(RUSAGE_SELF, &usage);
|
174
|
+
std::cout << "Maximum resident set size: " << usage.ru_maxrss << " kilobytes" << std::endl;
|
93
175
|
|
94
176
|
return 0;
|
95
177
|
}
|
data/exe/stridx.rb
CHANGED
@@ -1,16 +1,135 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
require "fileutils"
|
2
3
|
|
3
4
|
$:.unshift File.dirname(__FILE__) + "/.."
|
5
|
+
require "server.rb"
|
4
6
|
|
5
|
-
|
7
|
+
CUR_FILE = File.basename(__FILE__)
|
8
|
+
PID_FILE = File.expand_path("~/.config/stridx/index.pid")
|
9
|
+
LOCK_FILE = File.expand_path("~/.config/stridx/index.lock")
|
10
|
+
|
11
|
+
|
12
|
+
|
13
|
+
# Guard against a race condition when two processes are started at the same time
|
14
|
+
def obtain_lock_or_exit
|
15
|
+
@lockfile = File.open(LOCK_FILE, File::RDWR | File::CREAT, 0644)
|
16
|
+
|
17
|
+
unless @lockfile.flock(File::LOCK_NB | File::LOCK_EX)
|
18
|
+
puts "Another instance is already running."
|
19
|
+
exit 1
|
20
|
+
end
|
21
|
+
|
22
|
+
# Optionally truncate and write PID for info/logging
|
23
|
+
@lockfile.truncate(0)
|
24
|
+
@lockfile.write("#{Process.pid}\n")
|
25
|
+
@lockfile.flush
|
26
|
+
end
|
27
|
+
|
28
|
+
def running?
|
29
|
+
return false unless File.exist?(PID_FILE)
|
30
|
+
|
31
|
+
pid = File.read(PID_FILE).to_i
|
32
|
+
|
33
|
+
begin
|
34
|
+
# Check if process exists
|
35
|
+
Process.kill(0, pid)
|
36
|
+
|
37
|
+
# Handle race condition: if the daemon was previously killed with "kill -9",
|
38
|
+
# the PID file may remain. A new, unrelated process could later reuse the same PID,
|
39
|
+
# causing a false positive when checking for an existing instance and preventing the daemon from starting.
|
40
|
+
|
41
|
+
# ./daemon.rb # Starts daemon
|
42
|
+
# kill -9 $(cat /tmp/daemon_example.pid) # Force kill
|
43
|
+
# echo $$ > /tmp/daemon_example.pid # Simulate reused PID (use another terminal)
|
44
|
+
# ./daemon.rb # Old version would fail here; fixed version should detect mismatch
|
45
|
+
|
46
|
+
# Check if command line matches this script
|
47
|
+
cmdline = File.read("/proc/#{pid}/cmdline").split("\0")
|
48
|
+
|
49
|
+
correct_process = cmdline.any? { |arg| arg.include?(CUR_FILE) }
|
50
|
+
puts correct_process
|
51
|
+
if correct_process == false
|
52
|
+
puts "Old pidfile points to wrong process"
|
53
|
+
return false
|
54
|
+
end
|
55
|
+
|
56
|
+
return true
|
57
|
+
rescue Errno::ESRCH, Errno::ENOENT
|
58
|
+
return false
|
59
|
+
rescue Errno::EACCES
|
60
|
+
# Process exists, but inaccessible — might still be ours
|
61
|
+
return true
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
# Old version without /proc check
|
66
|
+
def running_old?
|
67
|
+
return false unless File.exist?(PID_FILE)
|
68
|
+
pid = File.read(PID_FILE).to_i
|
69
|
+
Process.kill(0, pid)
|
70
|
+
true
|
71
|
+
rescue Errno::ESRCH, Errno::EPERM
|
72
|
+
false
|
73
|
+
end
|
74
|
+
|
75
|
+
def start(daemonize: false)
|
76
|
+
if running?
|
77
|
+
puts "Daemon is already running."
|
78
|
+
exit 1
|
79
|
+
end
|
80
|
+
|
81
|
+
if daemonize
|
82
|
+
# Daemonize the process
|
83
|
+
Process.daemon(true, true) # nochdir=true, noclose=true: don't change directory, don't close stdio
|
84
|
+
|
85
|
+
# Save PID
|
86
|
+
File.write(PID_FILE, Process.pid)
|
87
|
+
puts "Daemon started with PID #{Process.pid}"
|
88
|
+
|
89
|
+
trap("TERM") do
|
90
|
+
puts "Daemon stopping..."
|
91
|
+
File.delete(PID_FILE) if File.exist?(PID_FILE)
|
92
|
+
exit
|
93
|
+
end
|
94
|
+
|
95
|
+
pid_dir_path = File.expand_path("~/.config/stridx/")
|
96
|
+
FileUtils.mkdir_p(pid_dir_path)
|
97
|
+
end
|
98
|
+
|
99
|
+
StrIdx::Server.start ARGV
|
100
|
+
end
|
101
|
+
|
102
|
+
def stop
|
103
|
+
unless File.exist?(PID_FILE)
|
104
|
+
puts "No PID file found. Daemon not running?"
|
105
|
+
exit 1
|
106
|
+
end
|
107
|
+
|
108
|
+
pid = File.read(PID_FILE).to_i
|
109
|
+
puts "Stopping daemon with PID #{pid}..."
|
110
|
+
Process.kill("TERM", pid)
|
111
|
+
File.delete(PID_FILE) rescue nil
|
112
|
+
rescue Errno::ESRCH
|
113
|
+
puts "Process not found. Cleaning up PID file."
|
114
|
+
File.delete(PID_FILE) rescue nil
|
115
|
+
end
|
116
|
+
|
117
|
+
# Entry point
|
118
|
+
case ARGV.first
|
119
|
+
when "stop"
|
120
|
+
stop
|
121
|
+
when "tty"
|
6
122
|
require "stridx-tty.rb"
|
7
123
|
StrIdxTTY.run
|
8
|
-
|
124
|
+
when "bash"
|
9
125
|
puts %q/
|
10
126
|
bind -m emacs-standard '"\er": redraw-current-line';
|
11
127
|
bind -m emacs-standard '"\C-t": " \C-b\C-k \C-u`stridx.rb tty`\e\C-e\er\C-a\C-y\C-h\C-e\e \C-y\ey\C-x\C-x\C-f"'
|
12
128
|
/
|
13
|
-
|
14
|
-
|
15
|
-
|
129
|
+
when "run"
|
130
|
+
obtain_lock_or_exit
|
131
|
+
start(daemonize: false)
|
132
|
+
when "start"
|
133
|
+
obtain_lock_or_exit
|
134
|
+
start(daemonize: true)
|
16
135
|
end
|