StrIdx 0.1.5 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -2
- data/demo.cpp +124 -73
- data/exe/stridx.rb +124 -10
- data/rubyext/extconf.rb +1 -1
- data/rubyext/ruby_interf.cpp +42 -9
- data/runserver.rb +2 -0
- data/server.rb +1 -0
- data/stridx.gemspec +1 -1
- data/stridx.hpp +20 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e79f1bebc1e56e0a1966ae5ff69b50e4048953dd338807a6c9dba1c53b1c8f34
|
4
|
+
data.tar.gz: 28eea5841ce96f9460975720a43a2655f4902a9c44458a8aae01ce0b3077e67d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e82430abd644e876dd758ceaffb3f759e19fde2a44cc85872f01721943c42d85d2032cbde91d235d44b7921833ed044241692247c88b56f54c6ea7248f5e584e
|
7
|
+
data.tar.gz: 642e16d0e6291474b007e0341bf6571576299f353bf1f5ff13a5c9966d9b1b2450de052f3ebad6f44c9ab9b341686dd41b4adf35922406e3df358c8cbabc9c31
|
data/README.md
CHANGED
@@ -66,7 +66,7 @@ eval "$(stridx.rb bash)"
|
|
66
66
|
|
67
67
|
To autostart the server, add the following line to .bashrc:
|
68
68
|
```
|
69
|
-
|
69
|
+
stridx.rb start -- ~/Documents/ ~/Pictures/
|
70
70
|
```
|
71
71
|
|
72
72
|
|
@@ -79,7 +79,6 @@ Stop server:
|
|
79
79
|
```
|
80
80
|
stridx.rb stop
|
81
81
|
```
|
82
|
-
In case stop doesn't work, try: `kill $(pgrep -f runserver.rb | tail -n 1)`
|
83
82
|
|
84
83
|
Start the indexing server (in the foreground, for debugging):
|
85
84
|
```
|
data/demo.cpp
CHANGED
@@ -1,5 +1,8 @@
|
|
1
1
|
|
2
2
|
#include <sys/resource.h>
|
3
|
+
#include <malloc.h>
|
4
|
+
|
5
|
+
#include "mem_info.h"
|
3
6
|
|
4
7
|
#include <condition_variable>
|
5
8
|
#include <functional>
|
@@ -21,7 +24,7 @@ using std::cout;
|
|
21
24
|
using std::pair;
|
22
25
|
using std::vector;
|
23
26
|
|
24
|
-
std::vector<std::string> readLinesFromFile(const std::string &filename) {
|
27
|
+
std::vector<std::string> readLinesFromFile(const std::string &filename, int limit = 0) {
|
25
28
|
std::vector<std::string> lines;
|
26
29
|
std::ifstream file(filename);
|
27
30
|
if (!file.is_open()) {
|
@@ -30,8 +33,10 @@ std::vector<std::string> readLinesFromFile(const std::string &filename) {
|
|
30
33
|
}
|
31
34
|
|
32
35
|
std::string line;
|
33
|
-
|
36
|
+
int i=0;
|
37
|
+
while (std::getline(file, line) && ( limit == 0 || i < limit) ) {
|
34
38
|
lines.push_back(line);
|
39
|
+
i++;
|
35
40
|
}
|
36
41
|
|
37
42
|
file.close();
|
@@ -39,88 +44,134 @@ std::vector<std::string> readLinesFromFile(const std::string &filename) {
|
|
39
44
|
}
|
40
45
|
|
41
46
|
int main() {
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
cout << "Indexing launch time (seconds): " << duration_launch.count() / 1000 << "\n";
|
68
|
-
|
69
|
-
// Wait until indexing has finished
|
70
|
-
idx.waitUntilDone();
|
71
|
-
|
72
|
-
auto idx_time = std::chrono::high_resolution_clock::now();
|
73
|
-
std::chrono::duration<double, std::milli> duration = idx_time - start;
|
74
|
-
cout << "Indexing finished time for " << v_filePaths.size()
|
75
|
-
<< " file paths (seconds): " << duration.count() / 1000 << "\n";
|
76
|
-
|
77
|
-
|
78
|
-
cout << "DEBUG" << std::endl;
|
79
|
-
// idx.cm.debug();
|
80
|
-
cout << "END DEBUG" << std::endl;
|
81
|
-
|
82
|
-
// Find matching filepaths from the index for the query string "rngnomadriv"
|
83
|
-
start = std::chrono::high_resolution_clock::now();
|
84
|
-
// std::string query = "rngnomadriv";
|
85
|
-
std::string query = "irqbypass.c";
|
86
|
-
for (int i = 0; i < 99; i++) {
|
87
|
-
// const vector<pair<float, int>> &results = idx.findSimilar(query, 2);
|
88
|
-
const vector<pair<float, int>> &results = idx.findSim(query);
|
89
|
-
}
|
90
|
-
|
47
|
+
{
|
48
|
+
StrIdx::StringIndex idx('/'); // Separate directories using unix style "/" char
|
49
|
+
// idx.addStrToIndex("./gdk/x11/gdkasync.c", 0 /*id*/, '/' /*separator*/);
|
50
|
+
// idx.addStrToIndex("./gdk/x11/gdksettings.c", 1, '/');
|
51
|
+
// idx.addStrToIndex("./gdk/x11/gdkx11devicemanager-xi2.h", 2, '/');
|
52
|
+
|
53
|
+
// Add the file paths of 89828 files in linux-6.9-rc6 to the index
|
54
|
+
std::string fn_filePaths = "flist2.txt";
|
55
|
+
std::vector<std::string> v_filePaths = readLinesFromFile(fn_filePaths);
|
56
|
+
// std::vector<std::string> v_filePaths = readLinesFromFile(fn_filePaths,10000);
|
57
|
+
|
58
|
+
// int* a = new int[10];
|
59
|
+
// delete(a);
|
60
|
+
// delete(a);
|
61
|
+
|
62
|
+
// Launch indexing to be run on background
|
63
|
+
cout << "File paths: " << v_filePaths.size() << std::endl;
|
64
|
+
cout << "Start indexing in the background" << std::endl;
|
65
|
+
auto start = std::chrono::high_resolution_clock::now();
|
66
|
+
int id = 0;
|
67
|
+
for (const auto &filePath : v_filePaths) {
|
68
|
+
// idx.addStrToIndexThreaded(filePath, id);
|
69
|
+
idx.addStrToIndex(filePath, id);
|
70
|
+
id++;
|
71
|
+
}
|
91
72
|
|
92
|
-
|
73
|
+
std::cout << "========\n";
|
74
|
+
for (int i = 0; i < id; i++) {
|
75
|
+
// std::cout << idx.getString(2) << "{}";
|
76
|
+
idx.getString(i);
|
77
|
+
}
|
78
|
+
std::cout << "========\n";
|
79
|
+
|
80
|
+
auto idx_time_launch = std::chrono::high_resolution_clock::now();
|
81
|
+
std::chrono::duration<double, std::milli> duration_launch = idx_time_launch - start;
|
82
|
+
cout << "Indexing launch time (seconds): " << duration_launch.count() / 1000 << "\n";
|
83
|
+
|
84
|
+
// Wait until indexing has finished
|
85
|
+
idx.waitUntilDone();
|
86
|
+
|
87
|
+
auto idx_time = std::chrono::high_resolution_clock::now();
|
88
|
+
std::chrono::duration<double, std::milli> duration = idx_time - start;
|
89
|
+
cout << "Indexing finished time for " << v_filePaths.size()
|
90
|
+
<< " file paths (seconds): " << duration.count() / 1000 << "\n";
|
91
|
+
|
92
|
+
cout << "DEBUG" << std::endl;
|
93
|
+
// idx.cm.debug();
|
94
|
+
cout << "END DEBUG" << std::endl;
|
95
|
+
|
96
|
+
// Find matching filepaths from the index for the query string "rngnomadriv"
|
97
|
+
start = std::chrono::high_resolution_clock::now();
|
98
|
+
std::string query = "rngnomadriv";
|
99
|
+
// std::string query = "rngnomaindriv";
|
100
|
+
// std::string query = "time.rs";
|
101
|
+
// std::string query = "irqbypass.c";
|
102
|
+
for (int i = 0; i < 99; i++) {
|
103
|
+
// const vector<pair<float, int>> &results = idx.findSimilar(query);
|
104
|
+
}
|
93
105
|
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
106
|
+
// idx.findSim(query);
|
107
|
+
|
108
|
+
// auto res = idx.findDirectories(query);
|
109
|
+
|
110
|
+
// const vector<pair<float, int>> &results = idx.findSimilar(query);
|
111
|
+
// const vector<pair<float, int>> &results = idx.findDirectories(query);
|
112
|
+
// const vector<pair<float, std::string>> &results = idx.findFilesAndDirectories(query, true,
|
113
|
+
// false);
|
114
|
+
vector<pair<float, std::string>> results = idx.findFilesAndDirectories(query, true, false);
|
115
|
+
|
116
|
+
auto search_time = std::chrono::high_resolution_clock::now();
|
117
|
+
duration = search_time - start;
|
118
|
+
cout << "Search time for 100 queries (seconds): " << duration.count() / 1000 << "\n";
|
119
|
+
|
120
|
+
int i = 0;
|
121
|
+
std::cout << "query string: " << query << "\n";
|
122
|
+
std::cout << "Top 20 matches[1]:\n";
|
123
|
+
bool isDir = true;
|
124
|
+
for (const auto &res : results) {
|
125
|
+
// std::cout << res.second << " " << res.first << " " << v_filePaths[res.second] << "\n";
|
126
|
+
std::cout << res.first << " " << res.second << "\n";
|
127
|
+
i++;
|
128
|
+
if (i > 40) {
|
129
|
+
break;
|
130
|
+
}
|
131
|
+
}
|
132
|
+
|
133
|
+
{
|
134
|
+
|
135
|
+
auto results = idx.findFiles(query);
|
136
|
+
int i = 0;
|
137
|
+
std::cout << "query string: " << query << "\n";
|
138
|
+
std::cout << "Top 20 matchesfff:\n";
|
139
|
+
bool isDir = true;
|
140
|
+
for (const auto &res : results) {
|
141
|
+
std::cout << res.second << " " << res.first << " " << v_filePaths[res.second] << "\n";
|
142
|
+
// std::cout << res.first << " " << res.second << "\n";
|
143
|
+
i++;
|
144
|
+
if (i > 40) {
|
145
|
+
break;
|
146
|
+
}
|
147
|
+
}
|
148
|
+
}
|
99
149
|
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
std::cout << res.second << " " << res.first << " " << v_filePaths[res.second] << "\n";
|
105
|
-
i++;
|
106
|
-
if (i > 20) {
|
107
|
-
break;
|
150
|
+
std::cout << "========\n";
|
151
|
+
for (int i = 0; i < id; i++) {
|
152
|
+
// std::cout << idx.getString(2) << "{}";
|
153
|
+
// idx.getString(i);
|
108
154
|
}
|
109
|
-
|
155
|
+
std::cout << "========\n";
|
110
156
|
|
111
|
-
// std::cout << "Size of MyClass: " << sizeof(StrIdx::CharMap) << " bytes" << std::endl;
|
112
|
-
// std::cout << "Size of CharMap3: " << sizeof(StrIdx::CharMap3) << " bytes" << std::endl;
|
113
157
|
std::cout << "Size of CharNode: " << sizeof(StrIdx::CharNode) << " bytes" << std::endl;
|
114
158
|
std::cout << "Size of int: " << sizeof(int) << " bytes" << std::endl;
|
115
159
|
|
160
|
+
StrIdx::out.printl("MEMSTAT current:", getCurrentRSS(), " peak:", getPeakRSS());
|
161
|
+
// std::this_thread::sleep_for(std::chrono::milliseconds(7000));
|
162
|
+
|
163
|
+
}
|
164
|
+
|
165
|
+
// Force memory dealloc to properly benchmark
|
166
|
+
// https://www.reddit.com/r/C_Programming/comments/13dn8d7/is_malloc_trim_safe_to_use/
|
167
|
+
malloc_trim(0);
|
168
|
+
|
169
|
+
StrIdx::out.printl("MEMSTAT current:", getCurrentRSS(), " peak:", getPeakRSS());
|
170
|
+
|
171
|
+
// std::this_thread::sleep_for(std::chrono::milliseconds(7000));
|
116
172
|
struct rusage usage;
|
117
173
|
getrusage(RUSAGE_SELF, &usage);
|
118
174
|
std::cout << "Maximum resident set size: " << usage.ru_maxrss << " kilobytes" << std::endl;
|
119
|
-
std::cout << "Integral shared memory size: " << usage.ru_ixrss << " kilobytes" << std::endl;
|
120
|
-
std::cout << "Integral unshared data size: " << usage.ru_idrss << " kilobytes" << std::endl;
|
121
|
-
std::cout << "Integral unshared stack size: " << usage.ru_isrss << " kilobytes" << std::endl;
|
122
|
-
|
123
|
-
|
124
175
|
|
125
176
|
return 0;
|
126
177
|
}
|
data/exe/stridx.rb
CHANGED
@@ -1,21 +1,135 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
require
|
2
|
+
require "fileutils"
|
3
3
|
|
4
4
|
$:.unshift File.dirname(__FILE__) + "/.."
|
5
|
+
require "server.rb"
|
5
6
|
|
6
|
-
|
7
|
+
CUR_FILE = File.basename(__FILE__)
|
8
|
+
PID_FILE = File.expand_path("~/.config/stridx/index.pid")
|
9
|
+
LOCK_FILE = File.expand_path("~/.config/stridx/index.lock")
|
10
|
+
|
11
|
+
|
12
|
+
|
13
|
+
# Guards against a race condition when two processes are started at the same time
|
14
|
+
def obtain_lock_or_exit
|
15
|
+
@lockfile = File.open(LOCK_FILE, File::RDWR | File::CREAT, 0644)
|
16
|
+
|
17
|
+
unless @lockfile.flock(File::LOCK_NB | File::LOCK_EX)
|
18
|
+
puts "Another instance is already running."
|
19
|
+
exit 1
|
20
|
+
end
|
21
|
+
|
22
|
+
# Optionally truncate and write PID for info/logging
|
23
|
+
@lockfile.truncate(0)
|
24
|
+
@lockfile.write("#{Process.pid}\n")
|
25
|
+
@lockfile.flush
|
26
|
+
end
|
27
|
+
|
28
|
+
def running?
|
29
|
+
return false unless File.exist?(PID_FILE)
|
30
|
+
|
31
|
+
pid = File.read(PID_FILE).to_i
|
32
|
+
|
33
|
+
begin
|
34
|
+
# Check if process exists
|
35
|
+
Process.kill(0, pid)
|
36
|
+
|
37
|
+
# Handle race condition: if the daemon was previously killed with "kill -9",
|
38
|
+
# the PID file may remain. A new, unrelated process could later reuse the same PID,
|
39
|
+
# causing a false positive when checking for an existing instance and preventing the daemon from starting.
|
40
|
+
|
41
|
+
# ./daemon.rb # Starts daemon
|
42
|
+
# kill -9 $(cat /tmp/daemon_example.pid) # Force kill
|
43
|
+
# echo $$ > /tmp/daemon_example.pid # Simulate reused PID (use another terminal)
|
44
|
+
# ./daemon.rb # Old version would fail here; fixed version should detect mismatch
|
45
|
+
|
46
|
+
# Check if command line matches this script
|
47
|
+
cmdline = File.read("/proc/#{pid}/cmdline").split("\0")
|
48
|
+
|
49
|
+
correct_process = cmdline.any? { |arg| arg.include?(CUR_FILE) }
|
50
|
+
puts correct_process
|
51
|
+
if correct_process == false
|
52
|
+
puts "Old pidfile points to wrong process"
|
53
|
+
return false
|
54
|
+
end
|
55
|
+
|
56
|
+
return true
|
57
|
+
rescue Errno::ESRCH, Errno::ENOENT
|
58
|
+
return false
|
59
|
+
rescue Errno::EACCES
|
60
|
+
# Process exists, but inaccessible — might still be ours
|
61
|
+
return true
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
# Old version without /proc check
|
66
|
+
def running_old?
|
67
|
+
return false unless File.exist?(PID_FILE)
|
68
|
+
pid = File.read(PID_FILE).to_i
|
69
|
+
Process.kill(0, pid)
|
70
|
+
true
|
71
|
+
rescue Errno::ESRCH, Errno::EPERM
|
72
|
+
false
|
73
|
+
end
|
74
|
+
|
75
|
+
def start(daemonize: false)
|
76
|
+
if running?
|
77
|
+
puts "Daemon is already running."
|
78
|
+
exit 1
|
79
|
+
end
|
80
|
+
|
81
|
+
if daemonize
|
82
|
+
# Daemonize the process
|
83
|
+
Process.daemon(true, true) # nochdir = true: don't change directory; noclose = true: keep stdio open (not redirected to /dev/null)
|
84
|
+
|
85
|
+
# Save PID
|
86
|
+
File.write(PID_FILE, Process.pid)
|
87
|
+
puts "Daemon started with PID #{Process.pid}"
|
88
|
+
|
89
|
+
trap("TERM") do
|
90
|
+
puts "Daemon stopping..."
|
91
|
+
File.delete(PID_FILE) if File.exist?(PID_FILE)
|
92
|
+
exit
|
93
|
+
end
|
94
|
+
|
95
|
+
pid_dir_path = File.expand_path("~/.config/stridx/")
|
96
|
+
FileUtils.mkdir_p(pid_dir_path)
|
97
|
+
end
|
98
|
+
|
99
|
+
StrIdx::Server.start ARGV
|
100
|
+
end
|
101
|
+
|
102
|
+
def stop
|
103
|
+
unless File.exist?(PID_FILE)
|
104
|
+
puts "No PID file found. Daemon not running?"
|
105
|
+
exit 1
|
106
|
+
end
|
107
|
+
|
108
|
+
pid = File.read(PID_FILE).to_i
|
109
|
+
puts "Stopping daemon with PID #{pid}..."
|
110
|
+
Process.kill("TERM", pid)
|
111
|
+
File.delete(PID_FILE) rescue nil
|
112
|
+
rescue Errno::ESRCH
|
113
|
+
puts "Process not found. Cleaning up PID file."
|
114
|
+
File.delete(PID_FILE) rescue nil
|
115
|
+
end
|
116
|
+
|
117
|
+
# Entry point
|
118
|
+
case ARGV.first
|
119
|
+
when "stop"
|
120
|
+
stop
|
121
|
+
when "tty"
|
7
122
|
require "stridx-tty.rb"
|
8
123
|
StrIdxTTY.run
|
9
|
-
|
124
|
+
when "bash"
|
10
125
|
puts %q/
|
11
126
|
bind -m emacs-standard '"\er": redraw-current-line';
|
12
127
|
bind -m emacs-standard '"\C-t": " \C-b\C-k \C-u`stridx.rb tty`\e\C-e\er\C-a\C-y\C-h\C-e\e \C-y\ey\C-x\C-x\C-f"'
|
13
128
|
/
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
{:dir_mode => :normal, :dir => pid_dir_path })
|
129
|
+
when "run"
|
130
|
+
obtain_lock_or_exit
|
131
|
+
start(daemonize: false)
|
132
|
+
when "start"
|
133
|
+
obtain_lock_or_exit
|
134
|
+
start(daemonize: true)
|
21
135
|
end
|
data/rubyext/extconf.rb
CHANGED
data/rubyext/ruby_interf.cpp
CHANGED
@@ -48,7 +48,32 @@ VALUE StringIndexWaitUntilDone(VALUE self) {
|
|
48
48
|
((StrIdx::StringIndex *)data)->waitUntilDone();
|
49
49
|
return self;
|
50
50
|
}
|
51
|
-
|
51
|
+
|
52
|
+
VALUE StringIndexFindNum(VALUE self, VALUE str, VALUE _limit) {
|
53
|
+
VALUE ret;
|
54
|
+
std::string s1 = StringValueCStr(str);
|
55
|
+
|
56
|
+
void *data;
|
57
|
+
TypedData_Get_Struct(self, int, &str_idx_type, data);
|
58
|
+
StrIdx::StringIndex *idx = (StrIdx::StringIndex *)data;
|
59
|
+
|
60
|
+
int limit = NUM2INT(_limit);
|
61
|
+
|
62
|
+
ret = rb_ary_new();
|
63
|
+
const std::vector<std::pair<float, int>> &results = idx->findSimilar(s1);
|
64
|
+
int i = 0;
|
65
|
+
for (const auto &res : results) {
|
66
|
+
VALUE arr = rb_ary_new();
|
67
|
+
rb_ary_push(arr, INT2NUM(res.second));
|
68
|
+
rb_ary_push(arr, DBL2NUM(res.first));
|
69
|
+
rb_ary_push(ret, arr);
|
70
|
+
i++;
|
71
|
+
if (i >= limit) {
|
72
|
+
break;
|
73
|
+
}
|
74
|
+
}
|
75
|
+
return ret;
|
76
|
+
}
|
52
77
|
|
53
78
|
VALUE StringIndexFind(VALUE self, VALUE str) {
|
54
79
|
VALUE ret;
|
@@ -109,7 +134,8 @@ VALUE StringIndexFindDirs(VALUE self, VALUE str) {
|
|
109
134
|
StrIdx::StringIndex *idx = (StrIdx::StringIndex *)data;
|
110
135
|
|
111
136
|
ret = rb_ary_new();
|
112
|
-
const std::vector<std::pair<float, std::string>> &results =
|
137
|
+
const std::vector<std::pair<float, std::string>> &results =
|
138
|
+
idx->findFilesAndDirectories(s1, false, true);
|
113
139
|
int limit = 40;
|
114
140
|
int i = 0;
|
115
141
|
for (const auto &res : results) {
|
@@ -125,10 +151,6 @@ VALUE StringIndexFindDirs(VALUE self, VALUE str) {
|
|
125
151
|
return ret;
|
126
152
|
}
|
127
153
|
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
154
|
VALUE StringIndexSetDirSeparator(VALUE self, VALUE str) {
|
133
155
|
char c = '/';
|
134
156
|
if (TYPE(str) == T_STRING) {
|
@@ -149,6 +171,17 @@ VALUE StringIndexSetDirSeparator(VALUE self, VALUE str) {
|
|
149
171
|
return self;
|
150
172
|
}
|
151
173
|
|
174
|
+
VALUE StringIndexSetDirWeight(VALUE self, VALUE d) {
|
175
|
+
if (TYPE(d) == T_FLOAT) {
|
176
|
+
double c_float = NUM2DBL(rb_funcall(d, rb_intern("to_f"), 0));
|
177
|
+
void *data;
|
178
|
+
TypedData_Get_Struct(self, int, &str_idx_type, data);
|
179
|
+
StrIdx::StringIndex *idx = (StrIdx::StringIndex *)data;
|
180
|
+
idx->setDirWeight(c_float);
|
181
|
+
}
|
182
|
+
return self;
|
183
|
+
}
|
184
|
+
|
152
185
|
void Init_stridx(void) {
|
153
186
|
|
154
187
|
VALUE mStrIdx = rb_define_module("StrIdx");
|
@@ -158,12 +191,12 @@ void Init_stridx(void) {
|
|
158
191
|
rb_define_method(classStringIndex, "add", StringIndexAddSegments, 2);
|
159
192
|
rb_define_method(classStringIndex, "waitUntilDone", StringIndexWaitUntilDone, 0);
|
160
193
|
rb_define_method(classStringIndex, "find", StringIndexFind, 1);
|
194
|
+
rb_define_method(classStringIndex, "findNum", StringIndexFindNum, 2);
|
195
|
+
rb_define_method(classStringIndex, "setDirWeight", StringIndexSetDirWeight, 1);
|
161
196
|
rb_define_method(classStringIndex, "findFilesAndDirs", StringIndexFindFilesAndDirs, 1);
|
162
197
|
rb_define_method(classStringIndex, "findDirs", StringIndexFindDirs, 1);
|
163
|
-
|
198
|
+
|
164
199
|
rb_define_method(classStringIndex, "setDirSeparator", StringIndexSetDirSeparator, 1);
|
165
|
-
|
166
|
-
|
167
200
|
}
|
168
201
|
|
169
202
|
} // End extern "C"
|
data/runserver.rb
CHANGED
data/server.rb
CHANGED
data/stridx.gemspec
CHANGED
data/stridx.hpp
CHANGED
@@ -324,7 +324,11 @@ struct PathSegment {
|
|
324
324
|
|
325
325
|
// Candidate for result in string (filename) search
|
326
326
|
struct Candidate {
|
327
|
+
|
328
|
+
//This holds the subscores for each character in the query string
|
327
329
|
std::vector<float> v_charscore;
|
330
|
+
|
331
|
+
|
328
332
|
PathSegment *seg;
|
329
333
|
int fileId;
|
330
334
|
// The string that this candidate represents
|
@@ -343,6 +347,7 @@ struct Candidate {
|
|
343
347
|
candLen = seg->size();
|
344
348
|
}
|
345
349
|
|
350
|
+
// Sum subscores in v_charscore and normalize to get final score
|
346
351
|
[[nodiscard]] float getScore() const {
|
347
352
|
int i = 0;
|
348
353
|
float score = 0.0;
|
@@ -375,13 +380,14 @@ private:
|
|
375
380
|
|
376
381
|
std::vector<PathSegment *> segsToClean;
|
377
382
|
|
383
|
+
// Maps id's stored in charTree to corresponding PathSegment's
|
378
384
|
std::unordered_map<int, PathSegment *> seglist;
|
379
385
|
std::unordered_map<int, PathSegment *> seglist_dir;
|
380
386
|
std::mutex seglist_mu;
|
381
387
|
|
382
388
|
PathSegment *root;
|
383
389
|
int dirId = 0;
|
384
|
-
float dirWeight = 0
|
390
|
+
float dirWeight = 1.0; // =0.7: Give only 70% of score if match is for a directory
|
385
391
|
|
386
392
|
std::unique_ptr<ThreadPool> pool;
|
387
393
|
Output out{1}; // verbose level = 1
|
@@ -588,12 +594,17 @@ public:
|
|
588
594
|
void searchCharTree(const std::string &query, CandMap &candmap, CharTree &chartr) {
|
589
595
|
|
590
596
|
int last_start = query.size() - 2;
|
597
|
+
// Loop over all possible start positions in the query string: indexes [0..(n-2)], where n = query.size()
|
591
598
|
for (int start = 0; start <= last_start; start++) {
|
592
599
|
CharNode *cn = chartr.root;
|
600
|
+
|
601
|
+
// select a suffix (substring) starting from start, but cap length to 8 chars
|
593
602
|
int end = std::min(start + 7, ((int)query.size()) - 1);
|
594
603
|
int nchars = end - start + 1;
|
595
604
|
std::string s = query.substr(start, nchars);
|
596
605
|
|
606
|
+
// Loop all chars of the query substring
|
607
|
+
// Traverse the tree from its root (chartr.root), stepping to the node returned by cn->find(c) for each character
|
597
608
|
for (int i = 0; i < s.size(); i++) {
|
598
609
|
char c = s[i];
|
599
610
|
CharNode *x = cn->find(c);
|
@@ -601,12 +612,20 @@ public:
|
|
601
612
|
cn = x;
|
602
613
|
// Consider scores only for substrings with size >= 2
|
603
614
|
if (i > 0) {
|
615
|
+
// If we've reached here, the current substring s[0..i] has size i+1 (i.e. at least 2, since i > 0)
|
616
|
+
|
617
|
+
// Get identifiers of files that include substring
|
618
|
+
// query[start..(start+i)] (inclusive), since s[i] == query[start+i]
|
604
619
|
std::set<int> ids = cn->getIds();
|
605
620
|
for (const int &y : ids) {
|
606
621
|
PathSegment *p = nullptr;
|
622
|
+
|
623
|
+
// Searching in file segments
|
624
|
+
// (or no file/dir separation)
|
607
625
|
if (&chartr == &cm) {
|
608
626
|
p = seglist[y];
|
609
627
|
} else {
|
628
|
+
// Searching in dir segments
|
610
629
|
p = seglist_dir[y];
|
611
630
|
}
|
612
631
|
assert(p != nullptr);
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: StrIdx
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sami Sieranoja
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-
|
11
|
+
date: 2025-07-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|