StrIdx 0.1.2 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CMakeLists.txt +27 -0
- data/Gemfile +5 -0
- data/Makefile +2 -2
- data/README.md +49 -3
- data/demo.cpp +30 -8
- data/exe/stridx.rb +16 -0
- data/gem_install +4 -0
- data/py_example.py +18 -0
- data/py_interf.cpp +182 -0
- data/rubyext/extconf.rb +1 -3
- data/rubyext/ruby_interf.cpp +18 -5
- data/runserver.rb +7 -0
- data/server.rb +103 -0
- data/setup.py +32 -0
- data/stridx-screencast.mp4 +0 -0
- data/stridx-tty.rb +122 -0
- data/stridx.gemspec +37 -0
- data/stridx.hpp +172 -71
- data/test.rb +12 -1
- data/thread_pool.hpp +98 -0
- data/unit_tests.sh +4 -0
- data/unittest.cpp +147 -0
- metadata +103 -3
data/stridx-tty.rb
ADDED
@@ -0,0 +1,122 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "tty-prompt"
|
4
|
+
require "tty-cursor"
|
5
|
+
require "tty-reader"
|
6
|
+
require "pastel"
|
7
|
+
|
8
|
+
require "socket"
|
9
|
+
|
10
|
+
# Interactive terminal front-end for a StrIdx search server.
#
# Reads a query from the terminal, asks the server (over the UNIX socket at
# ~/.stridx/sock) for matches, draws them above the prompt and lets the user
# move the highlight with the arrow keys. All UI output goes to STDERR, so
# STDOUT carries nothing but the selected entry.
class StrIdxTTY
  # Prompt shown by the line reader; it is part of event.line and has to be
  # cut off before the text is used as a query.
  PROMPT = ">> "

  # Run one interactive search and write the chosen entry to STDOUT.
  def self.run
    stty = StrIdxTTY.new
    selected = stty.search
    STDOUT.write selected
  end

  # Set up the terminal helpers and block until the server socket accepts
  # connections.
  def initialize
    @list = []       # result lines of the latest query
    @selected = ""   # currently highlighted result
    @idx = 0         # index of the highlighted result in @list

    @reader = TTY::Reader.new(output: STDERR)
    @pastel = Pastel.new
    @cursor = TTY::Cursor

    wait_for_server
  end

  # Write UI text to STDERR.
  def out(x)
    STDERR.write x
  end

  # Run the prompt until the user submits the line.
  #
  # @return [String] the highlighted result, stripped ("" if there is none)
  def search
    out "\n" * 20
    out @cursor.clear_screen
    out "\n" * 20
    # The cursor is deliberately not moved to the top of the screen:
    # draw_list paints the results upward, starting from the prompt line.
    @reader.on(:keypress) { |event| handle_event(event) }
    @reader.read_line(PROMPT)

    out @cursor.clear_screen
    @selected.to_s.strip
  end

  # Send "find:<query>" to the server and return the response split into lines.
  #
  # @param query [String] text to search for
  # @return [Array<String>] response lines (empty when nothing was received)
  def get_res_from_server(query)
    client = UNIXSocket.new(socket_path)
    begin
      client.puts "find:#{query}"
      # to_s: recv may hand back nil instead of "" when the peer has closed.
      client.recv(200 * 200).to_s.lines
    ensure
      # Close the socket even when sending or receiving raised.
      client.close
    end
  end

  # Redraw the result list above the prompt, highlighting the selected entry.
  def draw_list
    # A new, shorter result list can leave @idx past the end; pull it back in.
    @idx = @idx.clamp(0, [@list.size - 1, 0].max)
    @selected = @list[@idx] || ""

    @list.each_with_index do |entry, i|
      out @cursor.up(1)
      out @cursor.clear_line
      out @pastel.lookup(:bold) if i == @idx
      out entry.strip
      out @pastel.lookup(:reset)
    end
  end

  # Re-query the server with the text typed after the prompt.
  # Queries of two characters or fewer are ignored.
  def update_search(event)
    query = event.line[PROMPT.size..-1].to_s
    return unless query.size > 2

    @list = get_res_from_server(query)
    draw_list
  end

  # Keypress callback: letters and backspace refresh the results, the arrow
  # keys move the highlight. The cursor position is saved and restored around
  # the redraw so typing continues at the prompt.
  def handle_event(event)
    out @cursor.save

    case event.key.name
    when :alpha, :backspace
      update_search(event)
    when :up
      # The list is drawn bottom-up, so "up" moves to the next entry.
      @idx += 1 if @idx < @list.size - 1
      draw_list
    when :down
      @idx -= 1 if @idx > 0
      draw_list
    end

    out @cursor.restore
  end

  private

  # Path of the UNIX socket shared with the server.
  def socket_path
    "#{File.expand_path("~/.stridx")}/sock"
  end

  # Probe the socket every two seconds until a connection succeeds.
  # A missing socket file is treated like a refused connection.
  def wait_for_server
    loop do
      begin
        UNIXSocket.new(socket_path).close
        break
      rescue Errno::ECONNREFUSED, Errno::ENOENT
        out "Waiting for server to start\n"
        sleep 2
      end
    end
  end
end
|
data/stridx.gemspec
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
Gem::Specification.new do |spec|
|
2
|
+
spec.name = "StrIdx"
|
3
|
+
spec.version = "0.1.4"
|
4
|
+
spec.authors = ["Sami Sieranoja"]
|
5
|
+
spec.email = ["sami.sieranoja@gmail.com"]
|
6
|
+
|
7
|
+
spec.summary = %q{StrIdx}
|
8
|
+
spec.description = %q{ Fast fuzzy string similarity search and indexing (for filenames)}
|
9
|
+
spec.homepage = "https://github.com/SamiSieranoja/stridx"
|
10
|
+
spec.metadata["source_code_uri"] = spec.homepage
|
11
|
+
spec.metadata["homepage_uri"] = spec.homepage
|
12
|
+
|
13
|
+
spec.files = `git ls-files -z`.split("\x0").reject do |f|
|
14
|
+
f.match(%r{^(refcode|spec|features)/})
|
15
|
+
end
|
16
|
+
# spec.files << "thread_pool.hpp"
|
17
|
+
# spec.files << "exe/stridx.rb"
|
18
|
+
# spec.files << "server.rb"
|
19
|
+
# spec.files << "stridx-tty.rb"
|
20
|
+
|
21
|
+
spec.bindir = "exe"
|
22
|
+
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
23
|
+
spec.require_paths = ["lib", "ext"]
|
24
|
+
|
25
|
+
spec.add_development_dependency "bundler", "~> 2.4.21"
|
26
|
+
spec.add_development_dependency "rake", "~> 13.1.0"
|
27
|
+
|
28
|
+
spec.add_runtime_dependency "tty-cursor", "~> 0.7.1"
|
29
|
+
spec.add_runtime_dependency "tty-prompt", "~> 0.23.1"
|
30
|
+
spec.add_runtime_dependency "tty-reader", "~> 0.9.0"
|
31
|
+
spec.add_runtime_dependency "tty-screen", "~> 0.8.2"
|
32
|
+
spec.add_runtime_dependency "pastel", "~> 0.8.0"
|
33
|
+
spec.add_runtime_dependency "daemons", "~> 1.4.1"
|
34
|
+
|
35
|
+
spec.extensions = ["rubyext/extconf.rb"]
|
36
|
+
spec.licenses = ["LGPL-2.0+"]
|
37
|
+
end
|
data/stridx.hpp
CHANGED
@@ -1,21 +1,66 @@
|
|
1
1
|
|
2
|
+
#ifndef SSSTRIDX_HPP
|
3
|
+
#define SSSTRIDX_HPP
|
4
|
+
|
2
5
|
#include <stdio.h>
|
3
6
|
#include <stdlib.h>
|
4
7
|
#include <cassert>
|
5
8
|
|
6
9
|
#include <vector>
|
10
|
+
#include <array>
|
7
11
|
#include <iostream>
|
8
12
|
#include <unordered_map>
|
9
13
|
#include <set>
|
10
14
|
#include <algorithm>
|
11
15
|
#include <sstream>
|
12
16
|
|
13
|
-
#
|
14
|
-
#include <
|
15
|
-
#
|
17
|
+
#include <vector>
|
18
|
+
#include <mutex>
|
19
|
+
#include <thread>
|
16
20
|
|
21
|
+
#include "thread_pool.hpp"
|
17
22
|
#include "unordered_dense.h"
|
18
23
|
|
24
|
+
namespace StrIdx {
|
25
|
+
|
26
|
+
/* Alternative to using std::cout
|
27
|
+
Allows to control verbose level */
|
28
|
+
class Output {
private:
  // Messages whose level is above this value are suppressed by printv().
  int verboseLevel;

public:
  Output(int verb) : verboseLevel(verb) {}
  Output() : Output(3) {}
  ~Output() = default;

  // End of recursion for the variadic print() below.
  static void print() {}

  // When calling as print("xxx ",3, " yyy") outputs "xxx 3 yyy"
  // Arguments are taken by const reference so that strings are not copied
  // just to be streamed.
  template <typename T, typename... Types>
  static void print(const T &var1, const Types &...var2) {
    std::cout << var1;
    print(var2...);
  }

  // When calling as printl("xxx ",3, " yyy") outputs "xxx 3 yyy\n"
  template <typename... Types> static void printl(const Types &...var2) {
    print(var2...);
    print("\n");
  }

  /* When calling as printv(2, "xxx ",3, " yyy") outputs "xxx 3 yyy\n"
   * if verboseLevel >= 2 (first arg).
   * When verboseLevel >= 3 the line is prefixed with "[v=<vlevel>] ".
   */
  template <typename... Types> void printv(int vlevel, const Types &...var2) const {
    if (verboseLevel < vlevel) {
      return;
    }
    if (verboseLevel >= 3) {
      print("[v=", vlevel, "] ");
    }
    printl(var2...);
  }
};
|
63
|
+
|
19
64
|
// Transforms input string as follows:
|
20
65
|
// '/foo/bar/file1.txt'
|
21
66
|
// => vector{"foo", "bar", "file1.txt"}
|
@@ -34,7 +79,7 @@ std::vector<std::string> splitString(const std::string &input, const char &separ
|
|
34
79
|
}
|
35
80
|
|
36
81
|
// Convert int64_t to binary string
|
37
|
-
std::string int64ToBinaryString(int64_t num) {
|
82
|
+
[[nodiscard]] std::string int64ToBinaryString(const int64_t &num) {
|
38
83
|
std::string result;
|
39
84
|
for (int i = 63; i >= 0; --i) {
|
40
85
|
result += ((num >> i) & 1) ? '1' : '0';
|
@@ -42,8 +87,8 @@ std::string int64ToBinaryString(int64_t num) {
|
|
42
87
|
return result;
|
43
88
|
}
|
44
89
|
|
45
|
-
// Convert a (8 char) string represented as int64_t to std::string
|
46
|
-
std::string int64ToStr(int64_t key) {
|
90
|
+
// Debug. Convert a (8 char) string represented as int64_t to std::string
|
91
|
+
[[nodiscard]] std::string int64ToStr(const int64_t &key) {
|
47
92
|
int nchars = 8;
|
48
93
|
std::string str;
|
49
94
|
int multip = nchars * 8;
|
@@ -55,43 +100,45 @@ std::string int64ToStr(int64_t key) {
|
|
55
100
|
return str;
|
56
101
|
}
|
57
102
|
|
103
|
+
// Debug
|
58
104
|
void printVector(const std::vector<int> &vec) {
|
59
105
|
for (const auto &value : vec) {
|
60
106
|
std::cout << value << " ";
|
61
107
|
}
|
62
108
|
}
|
63
109
|
|
64
|
-
|
110
|
+
// Debug
|
111
|
+
[[nodiscard]] std::string charToBinaryString(const char &chr) {
|
65
112
|
std::string result;
|
66
113
|
for (int i = 7; i >= 0; --i) {
|
67
|
-
result += ((
|
114
|
+
result += ((chr >> i) & 1) ? '1' : '0';
|
68
115
|
}
|
69
116
|
return result;
|
70
117
|
}
|
71
118
|
|
72
119
|
class Candidate;
|
73
|
-
enum segmentType { Dir, File };
|
120
|
+
enum class segmentType { Dir, File };
|
74
121
|
|
75
122
|
// A segment of a file path
|
76
123
|
// e.g. if path is /foo/bar/baz.txt
|
77
124
|
// segments are [{root}, foo, bar, baz.txt]
|
78
|
-
|
79
|
-
public:
|
125
|
+
struct PathSegment {
|
80
126
|
std::string str;
|
81
127
|
int fileId; // (if FILE)
|
82
128
|
Candidate *cand;
|
83
129
|
PathSegment *parent;
|
130
|
+
std::mutex mu;
|
84
131
|
ankerl::unordered_dense::map<std::string, PathSegment *> children;
|
85
|
-
segmentType type = Dir;
|
86
|
-
PathSegment() : parent(
|
87
|
-
PathSegment(std::string _str) : str(_str), parent(
|
132
|
+
segmentType type = segmentType::Dir;
|
133
|
+
PathSegment() : parent(nullptr) {}
|
134
|
+
PathSegment(std::string _str) : str(_str), parent(nullptr) {}
|
88
135
|
PathSegment(std::string _str, int _fileId)
|
89
|
-
: str(_str), fileId(_fileId), cand(
|
90
|
-
int size() {
|
136
|
+
: str(_str), fileId(_fileId), cand(nullptr), parent(nullptr) {}
|
137
|
+
[[nodiscard]] int size() const {
|
91
138
|
int sz = str.size();
|
92
139
|
PathSegment *cur = parent;
|
93
140
|
// Sum up length of parent segments (+1 for divisors)
|
94
|
-
while (cur->parent !=
|
141
|
+
while (cur->parent != nullptr) {
|
95
142
|
sz += cur->str.size() + 1;
|
96
143
|
cur = cur->parent;
|
97
144
|
}
|
@@ -100,8 +147,7 @@ public:
|
|
100
147
|
};
|
101
148
|
|
102
149
|
// Candidate for result in string (filename) search
|
103
|
-
|
104
|
-
public:
|
150
|
+
struct Candidate {
|
105
151
|
std::vector<float> v_charscore;
|
106
152
|
PathSegment *seg;
|
107
153
|
int fileId;
|
@@ -114,25 +160,17 @@ public:
|
|
114
160
|
int candLen; // Length of candidate
|
115
161
|
|
116
162
|
Candidate(){};
|
117
|
-
Candidate(int _fileId, std::string _str, int _len) : fileId(_fileId), str(_str), len(_len) {
|
118
|
-
// Initialize v_charscores with zeros
|
119
|
-
v_charscore.resize(len, 0);
|
120
|
-
candLen = str.size();
|
121
|
-
seg = NULL;
|
122
|
-
}
|
123
|
-
|
124
163
|
Candidate(PathSegment *_seg, int _len) : seg(_seg), len(_len) {
|
125
164
|
// Initialize v_charscores with zeros
|
126
165
|
v_charscore.resize(len, 0);
|
127
166
|
candLen = seg->size();
|
128
167
|
}
|
129
168
|
|
130
|
-
float getScore() {
|
169
|
+
[[nodiscard]] float getScore() const {
|
131
170
|
int i = 0;
|
132
171
|
float score = 0.0;
|
133
|
-
candLen = seg->size();
|
134
172
|
|
135
|
-
for (float &charscore : v_charscore) {
|
173
|
+
for (const float &charscore : v_charscore) {
|
136
174
|
score += charscore;
|
137
175
|
i++;
|
138
176
|
}
|
@@ -145,35 +183,41 @@ public:
|
|
145
183
|
return score;
|
146
184
|
}
|
147
185
|
|
148
|
-
float operator[](int idx) { return v_charscore[idx]; }
|
186
|
+
[[nodiscard]] float operator[](int idx) const { return v_charscore[idx]; }
|
149
187
|
};
|
150
188
|
|
151
189
|
// This seems to give 10x speed improvement over std::unordered_map
|
152
190
|
typedef ankerl::unordered_dense::map<int64_t, std::set<PathSegment *> *> SegMap;
|
153
191
|
// typedef std::unordered_map<int64_t, std::set<PathSegment *> *> SegMap;
|
154
192
|
|
155
|
-
typedef
|
193
|
+
typedef ankerl::unordered_dense::map<int, Candidate *> CandMap;
|
194
|
+
// typedef std::unordered_map<int, Candidate*> CandMap;
|
156
195
|
|
157
196
|
class StringIndex {
|
158
197
|
private:
|
159
198
|
int tmp;
|
160
199
|
char dirSeparator = '/'; // Usually '/', '\' or '\0' (no separator)
|
200
|
+
int numStrings = 0;
|
161
201
|
|
162
202
|
std::vector<SegMap *> dirmaps;
|
203
|
+
std::array<std::mutex, 9> mts_d; // for dirmaps
|
163
204
|
std::vector<SegMap *> filemaps;
|
205
|
+
std::array<std::mutex, 9> mts_f; // for filemaps
|
164
206
|
|
165
207
|
std::vector<PathSegment *> segsToClean;
|
166
208
|
|
167
|
-
std::unordered_map<int, std::string> strlist;
|
168
209
|
std::unordered_map<int, PathSegment *> seglist;
|
169
210
|
PathSegment *root;
|
170
211
|
int dirId = 0;
|
171
212
|
float dirWeight = 0.7; // Give only 70% of score if match is for a directory
|
172
213
|
|
214
|
+
std::unique_ptr<ThreadPool> pool;
|
215
|
+
Output out{1}; // verbose level = 1
|
216
|
+
|
173
217
|
public:
|
174
|
-
StringIndex() {
|
218
|
+
StringIndex(char sep) : dirSeparator(sep) {
|
175
219
|
root = new PathSegment();
|
176
|
-
root->parent =
|
220
|
+
root->parent = nullptr;
|
177
221
|
root->str = "[ROOT]";
|
178
222
|
|
179
223
|
for (int i = 0; i <= 8; i++) {
|
@@ -181,11 +225,18 @@ public:
|
|
181
225
|
filemaps.push_back(new SegMap);
|
182
226
|
}
|
183
227
|
|
184
|
-
|
185
|
-
|
186
|
-
|
228
|
+
// Threads between 4 and 6
|
229
|
+
// We don't seem to get any benefit from more than 6 threads even if the hardware supports it
|
230
|
+
int num_threads = std::max((int)std::thread::hardware_concurrency(), 4);
|
231
|
+
num_threads = std::min(num_threads, 6);
|
232
|
+
out.printv(2, "Number of threads: ", num_threads);
|
233
|
+
pool = std::unique_ptr<ThreadPool>(new ThreadPool(num_threads));
|
187
234
|
}
|
188
235
|
|
236
|
+
/* Don't split the path into segments (separator = '\0').
|
237
|
+
This is slower, but can be used for other data than files also. */
|
238
|
+
StringIndex() : StringIndex('\0') {}
|
239
|
+
|
189
240
|
void setDirSeparator(char sep) { dirSeparator = sep; }
|
190
241
|
void setDirWeight(float val) { dirWeight = val; }
|
191
242
|
|
@@ -213,6 +264,15 @@ public:
|
|
213
264
|
addStrToIndex(filePath, fileId, dirSeparator);
|
214
265
|
}
|
215
266
|
|
267
|
+
void addStrToIndexThreaded(std::string filePath, int fileId) {
|
268
|
+
pool->enqueue([=] { addStrToIndex(filePath, fileId, dirSeparator); });
|
269
|
+
}
|
270
|
+
void waitUntilReady() const { pool->waitUntilDone(); }
|
271
|
+
|
272
|
+
void waitUntilDone() const { pool->waitUntilDone(); }
|
273
|
+
|
274
|
+
int size() const { return seglist.size(); }
|
275
|
+
|
216
276
|
/**
|
217
277
|
* Add a string to the index to be searched for afterwards
|
218
278
|
*
|
@@ -221,9 +281,17 @@ public:
|
|
221
281
|
* @param separator Can be used to split filePath to components (e.g. 'home','user'...). Usually
|
222
282
|
* one of {'\\', '/', '\0' (no separation)}.
|
223
283
|
*/
|
284
|
+
|
224
285
|
void addStrToIndex(std::string filePath, int fileId, const char &separator) {
|
286
|
+
out.printv(3, "Add file:", filePath, ",", fileId, ",", separator, ",",dirSeparator);
|
287
|
+
|
288
|
+
// If a string with this index has been added already
|
289
|
+
if (seglist.find(fileId) != seglist.end()) {
|
290
|
+
return;
|
291
|
+
}
|
225
292
|
|
226
293
|
std::vector<std::string> segs;
|
294
|
+
numStrings += 1;
|
227
295
|
|
228
296
|
if (separator == '\0') {
|
229
297
|
// No separation to directories & files
|
@@ -233,7 +301,7 @@ public:
|
|
233
301
|
segs = splitString(filePath, separator);
|
234
302
|
}
|
235
303
|
|
236
|
-
PathSegment *prev =
|
304
|
+
PathSegment *prev = nullptr;
|
237
305
|
prev = root;
|
238
306
|
// Add segments to a tree type data structure
|
239
307
|
// e.g. addStrToIndex('/foo/bar/file1.txt' ..)
|
@@ -245,25 +313,27 @@ public:
|
|
245
313
|
auto x = *_x;
|
246
314
|
PathSegment *p;
|
247
315
|
|
248
|
-
|
316
|
+
prev->mu.lock();
|
317
|
+
|
249
318
|
// this part of the path already exists in the tree
|
250
|
-
if (it != prev->children.end()) {
|
319
|
+
if (auto it = prev->children.find(x); it != prev->children.end()) {
|
251
320
|
p = it->second;
|
321
|
+
prev->mu.unlock();
|
252
322
|
} else {
|
253
323
|
p = new PathSegment(x, fileId);
|
254
324
|
p->parent = prev;
|
255
|
-
// If this is last item in segs
|
325
|
+
// If this is last item in segs, then it is a file.
|
256
326
|
if (_x == std::prev(segs.end())) {
|
257
|
-
|
258
|
-
p->type = File;
|
327
|
+
p->type = segmentType::File;
|
259
328
|
seglist[fileId] = p;
|
260
|
-
} else {
|
261
|
-
p->type = Dir;
|
329
|
+
} else { // otherwise, it is a directory
|
330
|
+
p->type = segmentType::Dir;
|
262
331
|
p->fileId = dirId;
|
263
332
|
// Files use user input Id. Directories need to have it generated
|
264
333
|
dirId++;
|
265
334
|
}
|
266
335
|
prev->children[x] = p;
|
336
|
+
prev->mu.unlock();
|
267
337
|
addPathSegmentKeys(p);
|
268
338
|
}
|
269
339
|
|
@@ -271,6 +341,17 @@ public:
|
|
271
341
|
}
|
272
342
|
}
|
273
343
|
|
344
|
+
std::string getString(int id) {
|
345
|
+
std::string s = "";
|
346
|
+
PathSegment *seg = seglist[id];
|
347
|
+
s += seg->str;
|
348
|
+
while (seg->parent->parent != nullptr) {
|
349
|
+
seg = seg->parent;
|
350
|
+
s = seg->str + dirSeparator + s;
|
351
|
+
}
|
352
|
+
return s;
|
353
|
+
}
|
354
|
+
|
274
355
|
/**
|
275
356
|
The search will find filepaths similar to the input string
|
276
357
|
|
@@ -303,14 +384,16 @@ public:
|
|
303
384
|
@param query String to search for inside the index
|
304
385
|
*/
|
305
386
|
|
306
|
-
std::vector<std::pair<float, int>> findSimilar(std::string query) {
|
387
|
+
[[nodiscard]] std::vector<std::pair<float, int>> findSimilar(std::string query) {
|
307
388
|
return findSimilar(query, 2);
|
308
389
|
}
|
309
390
|
|
310
|
-
std::vector<std::pair<float, int>> findSimilar(std::string query, int minChars) {
|
391
|
+
[[nodiscard]] std::vector<std::pair<float, int>> findSimilar(std::string query, int minChars) {
|
311
392
|
CandMap fileCandMap;
|
312
393
|
CandMap dirCandMap;
|
313
394
|
|
395
|
+
waitUntilDone();
|
396
|
+
|
314
397
|
// Find both files and directories that match the input query
|
315
398
|
addToCandMap(fileCandMap, query, filemaps);
|
316
399
|
addToCandMap(dirCandMap, query, dirmaps);
|
@@ -319,9 +402,9 @@ public:
|
|
319
402
|
scores of the file */
|
320
403
|
mergeCandidateMaps(fileCandMap, dirCandMap);
|
321
404
|
|
322
|
-
// Set all candidate pointers to
|
405
|
+
// Set all candidate pointers to nullptr so they won't mess up future searches
|
323
406
|
for (auto seg : segsToClean) {
|
324
|
-
seg->cand =
|
407
|
+
seg->cand = nullptr;
|
325
408
|
}
|
326
409
|
segsToClean.clear();
|
327
410
|
|
@@ -329,11 +412,17 @@ public:
|
|
329
412
|
std::vector<std::pair<float, int>> results;
|
330
413
|
for (auto &[fid, cand] : fileCandMap) {
|
331
414
|
std::pair<float, int> v;
|
332
|
-
float sc = cand
|
415
|
+
float sc = cand->getScore();
|
333
416
|
v.first = sc;
|
334
417
|
v.second = fid;
|
335
418
|
results.push_back(v);
|
419
|
+
delete cand;
|
336
420
|
}
|
421
|
+
|
422
|
+
for (auto &[fid, cand] : dirCandMap) {
|
423
|
+
delete cand;
|
424
|
+
}
|
425
|
+
|
337
426
|
// Sort highest score first
|
338
427
|
std::sort(results.begin(), results.end(),
|
339
428
|
[](std::pair<float, int> a, std::pair<float, int> b) { return a.first > b.first; });
|
@@ -341,10 +430,10 @@ public:
|
|
341
430
|
}
|
342
431
|
|
343
432
|
// Return int64_t representation of the first nchars in str, starting from index i
|
344
|
-
int64_t getKeyAtIdx(std::string str, int i, int nchars) {
|
433
|
+
[[nodiscard]] int64_t getKeyAtIdx(const std::string &str, int i, int nchars) const {
|
345
434
|
int64_t key = 0;
|
346
435
|
for (int i_char = 0; i_char < nchars; i_char++) {
|
347
|
-
key = key | static_cast<
|
436
|
+
key = key | static_cast<int64_t>(str[i + i_char]);
|
348
437
|
if (i_char < nchars - 1) {
|
349
438
|
// Shift 8 bits to the left except on the last iteration
|
350
439
|
key = key << 8;
|
@@ -399,22 +488,29 @@ private:
|
|
399
488
|
maxChars = p->str.size();
|
400
489
|
}
|
401
490
|
|
402
|
-
#ifdef _OPENMP
|
403
|
-
#pragma omp parallel for
|
404
|
-
#endif
|
405
491
|
for (int sublen = minChars; sublen <= maxChars; sublen++) {
|
406
492
|
|
493
|
+
std::mutex *mu;
|
407
494
|
SegMap *map;
|
408
|
-
if (p->type == File) {
|
495
|
+
if (p->type == segmentType::File) {
|
409
496
|
map = filemaps[sublen];
|
497
|
+
mu = &mts_f[sublen];
|
410
498
|
} else {
|
411
499
|
map = dirmaps[sublen];
|
500
|
+
mu = &mts_d[sublen];
|
412
501
|
}
|
413
502
|
|
414
503
|
int count = str.size() - sublen + 1;
|
415
504
|
|
505
|
+
int64_t keys[count + 1];
|
416
506
|
for (int i = 0; i <= count; i++) {
|
417
|
-
|
507
|
+
keys[i] = getKeyAtIdx(str, i, sublen);
|
508
|
+
}
|
509
|
+
|
510
|
+
mu->lock();
|
511
|
+
for (int i = 0; i <= count; i++) {
|
512
|
+
// int64_t key = getKeyAtIdx(str, i, sublen);
|
513
|
+
auto key = keys[i];
|
418
514
|
|
419
515
|
// Create a new std::set for key if doesn't exist already
|
420
516
|
auto it = map->find(key);
|
@@ -423,12 +519,14 @@ private:
|
|
423
519
|
}
|
424
520
|
(*map)[key]->insert(p);
|
425
521
|
}
|
522
|
+
mu->unlock();
|
426
523
|
}
|
427
524
|
}
|
428
525
|
|
429
526
|
// Find pathsegments from <map> that include the substring of <str> which starts at index <i> and
|
430
527
|
// is of length <nchars>.
|
431
|
-
std::vector<PathSegment *> findSimilarForNgram(std::string str, int i, int nchars,
|
528
|
+
[[nodiscard]] std::vector<PathSegment *> findSimilarForNgram(std::string str, int i, int nchars,
|
529
|
+
SegMap &map) const {
|
432
530
|
|
433
531
|
assert(i + nchars <= static_cast<int>(str.size()));
|
434
532
|
std::vector<PathSegment *> res;
|
@@ -437,8 +535,7 @@ private:
|
|
437
535
|
// transform that to 64 bit integer
|
438
536
|
int64_t key = getKeyAtIdx(str, i, nchars);
|
439
537
|
// Find all path segments in map that have the same substring
|
440
|
-
auto it = map.find(key);
|
441
|
-
if (it != map.end()) { // key found
|
538
|
+
if (auto it = map.find(key); it != map.end()) { // key found
|
442
539
|
auto set = it->second;
|
443
540
|
for (auto value : *set) {
|
444
541
|
res.push_back(value);
|
@@ -475,12 +572,12 @@ private:
|
|
475
572
|
void mergeCandidateMaps(CandMap &fileCandMap, CandMap &dirCandMap) {
|
476
573
|
|
477
574
|
for (auto &[fid, cand] : fileCandMap) {
|
478
|
-
PathSegment *p = cand
|
479
|
-
while (p->parent !=
|
480
|
-
if (p->cand !=
|
481
|
-
auto &scoreA = cand
|
575
|
+
PathSegment *p = cand->seg->parent;
|
576
|
+
while (p->parent != nullptr) {
|
577
|
+
if (p->cand != nullptr) {
|
578
|
+
auto &scoreA = cand->v_charscore;
|
482
579
|
auto &scoreB = p->cand->v_charscore;
|
483
|
-
for (int i = 0; i < cand
|
580
|
+
for (int i = 0; i < cand->len; i++) {
|
484
581
|
if (scoreA[i] < scoreB[i] * dirWeight) {
|
485
582
|
scoreA[i] = scoreB[i] * dirWeight;
|
486
583
|
}
|
@@ -493,18 +590,22 @@ private:
|
|
493
590
|
|
494
591
|
void addToResults(PathSegment *seg, std::string str, int i, int nchars, CandMap &candmap) {
|
495
592
|
|
496
|
-
auto it2 = candmap.find(seg->fileId);
|
497
|
-
|
498
|
-
Candidate cand(seg, str.size());
|
499
|
-
seg->cand = &(candmap[seg->fileId]);
|
593
|
+
if (auto it2 = candmap.find(seg->fileId); it2 == candmap.end()) {
|
594
|
+
Candidate *cand = new Candidate(seg, str.size());
|
500
595
|
segsToClean.push_back(seg);
|
501
596
|
candmap[seg->fileId] = cand;
|
597
|
+
seg->cand = cand;
|
502
598
|
}
|
503
599
|
|
504
600
|
for (int j = i; j < i + nchars; j++) {
|
505
|
-
|
506
|
-
|
601
|
+
Candidate &cand = *(candmap[seg->fileId]);
|
602
|
+
if (cand[j] < nchars) {
|
603
|
+
cand.v_charscore[j] = nchars;
|
507
604
|
}
|
508
605
|
}
|
509
606
|
}
|
510
607
|
};
|
608
|
+
|
609
|
+
} // namespace StrIdx
|
610
|
+
|
611
|
+
#endif
|
data/test.rb
CHANGED
@@ -1,8 +1,13 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
+
$:.unshift File.dirname(__FILE__)
|
4
|
+
|
3
5
|
require "stridx"
|
4
6
|
idx = StrIdx::StringIndex.new
|
5
7
|
|
8
|
+
# "/" for unix-style file paths
|
9
|
+
idx.setDirSeparator("/") #(comment out if not file paths)
|
10
|
+
|
6
11
|
t = Time.new
|
7
12
|
fn = File.expand_path("flist.txt")
|
8
13
|
lines = IO.read(fn).lines.collect { |x| x.strip }
|
@@ -13,7 +18,13 @@ for x in lines
|
|
13
18
|
end
|
14
19
|
|
15
20
|
idx_time = Time.new
|
16
|
-
|
21
|
+
# Time to start the threadpool to process indexing
|
22
|
+
puts "\nIndexing launch time (#{lines.size} files): #{(idx_time - t).round(4)} seconds"
|
23
|
+
|
24
|
+
idx.waitUntilDone() # Not necessary, will be called by idx.find
|
25
|
+
idx_time = Time.new
|
26
|
+
# Time when all threads have completed
|
27
|
+
puts "\nIndexing completed time (#{lines.size} files): #{(idx_time - t).round(4)} seconds"
|
17
28
|
|
18
29
|
query = "rngnomadriv"
|
19
30
|
res = idx.find(query)
|