StrIdx 0.1.2 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CMakeLists.txt +27 -0
- data/Gemfile +5 -0
- data/Makefile +2 -2
- data/README.md +49 -3
- data/demo.cpp +30 -8
- data/exe/stridx.rb +16 -0
- data/gem_install +4 -0
- data/py_example.py +18 -0
- data/py_interf.cpp +182 -0
- data/rubyext/extconf.rb +1 -3
- data/rubyext/ruby_interf.cpp +18 -5
- data/runserver.rb +7 -0
- data/server.rb +103 -0
- data/setup.py +32 -0
- data/stridx-screencast.mp4 +0 -0
- data/stridx-tty.rb +122 -0
- data/stridx.gemspec +37 -0
- data/stridx.hpp +172 -71
- data/test.rb +12 -1
- data/thread_pool.hpp +98 -0
- data/unit_tests.sh +4 -0
- data/unittest.cpp +147 -0
- metadata +103 -3
data/stridx-tty.rb
ADDED
@@ -0,0 +1,122 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "tty-prompt"
|
4
|
+
require "tty-cursor"
|
5
|
+
require "tty-reader"
|
6
|
+
require "pastel"
|
7
|
+
|
8
|
+
require "socket"
|
9
|
+
|
10
|
+
# Interactive terminal front-end for the StrIdx search server.
#
# Talks to the server over the UNIX socket ~/.stridx/sock: every time the
# typed query changes, a "find:<query>" request is sent and the returned
# lines are drawn above the prompt. All UI output goes to STDERR so that
# STDOUT carries nothing but the final selection.
class StrIdxTTY
  # Prompt shown by the line reader. The line reported in key events is
  # assumed to start with it (the original code skipped its 3 characters).
  PROMPT = ">> "

  # Upper bound, in bytes, for a single server response.
  MAX_RESPONSE_BYTES = 200 * 200

  # Run one interactive search session and write the selection to STDOUT.
  def self.run
    stty = StrIdxTTY.new
    selected = stty.search
    STDOUT.write selected
  end

  # Set up the terminal helpers and block until the server accepts
  # connections on its socket.
  def initialize
    @lines = []
    # Result lines of the latest query. Must exist before the first search so
    # that arrow keys pressed on an empty prompt do not hit nil.
    @list = []
    @selected = ""
    @idx = 0

    @reader = TTY::Reader.new(output: STDERR)
    @pastel = Pastel.new
    @cursor = TTY::Cursor

    wait_for_server
  end

  # Write +x+ to STDERR (the UI channel).
  def out(x)
    STDERR.write x
  end

  # Read a query interactively and return the selected entry, stripped.
  # Returns an empty string when nothing was selected.
  def search
    out "\n" * 20
    out @cursor.clear_screen
    out "\n" * 20
    # NOTE: the original called @cursor.move_to(0, 0) here without writing the
    # result. TTY::Cursor methods only return escape strings, so the call had
    # no effect and is omitted; the prompt stays where the newlines left it.
    @reader.on(:keypress) { |event| handle_event(event) }
    @reader.read_line(PROMPT)

    out @cursor.clear_screen
    @selected.to_s.strip
  end

  # Send +query+ to the server and return the response split into lines.
  # Returns an empty array when the server closes the connection without data.
  def get_res_from_server(query)
    client = UNIXSocket.new(socket_path)
    begin
      client.puts "find:#{query}"
      response = client.recv(MAX_RESPONSE_BYTES)
    ensure
      # Close the socket even if sending or receiving raised.
      client.close
    end
    # recv may return nil at end-of-stream (Ruby 3.3+); treat it as "no results".
    response.to_s.lines
  end

  # Redraw the result list upwards from the current cursor position,
  # highlighting the entry at @idx and remembering it as the selection.
  def draw_list
    @list ||= []
    # A new, shorter result list can leave @idx pointing past the end.
    @idx = [[@idx, @list.size - 1].min, 0].max
    @selected = @list[@idx] || ""

    @list.each_with_index do |line, i|
      out @cursor.up(1)
      out @cursor.clear_line
      out @pastel.lookup(:bold) if i == @idx
      out line.strip
      out @pastel.lookup(:reset)
    end
  end

  # Re-run the search for the text currently typed after the prompt.
  # Queries of two characters or fewer are ignored.
  def update_search(event)
    query = event.line.to_s[PROMPT.size..-1].to_s
    return unless query.size > 2

    @list = get_res_from_server(query)
    draw_list
  end

  # Keypress callback: letters and backspace refresh the results, up/down move
  # the highlighted entry (the list is drawn upwards, so :up increases @idx).
  def handle_event(event)
    out @cursor.save
    case event.key.name
    when :alpha, :backspace
      update_search(event)
    when :up
      @idx += 1 if @idx < @list.size - 1
      draw_list
    when :down
      @idx -= 1 if @idx > 0
      draw_list
    end

    out @cursor.restore
  end

  private

  # Path of the server's UNIX socket.
  def socket_path
    File.join(File.expand_path("~/.stridx"), "sock")
  end

  # Poll the socket every two seconds until a connection succeeds.
  # ENOENT (socket file not created yet) is treated like ECONNREFUSED:
  # both mean the server is not up yet.
  def wait_for_server
    loop do
      begin
        UNIXSocket.new(socket_path).close
        break
      rescue Errno::ECONNREFUSED, Errno::ENOENT
        out "Waiting for server to start\n"
        sleep 2
      end
    end
  end
end
|
data/stridx.gemspec
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
# Gem specification for StrIdx: identity, packaged files, executables,
# dependencies and the native extension to build on install.
Gem::Specification.new do |s|
  s.name = "StrIdx"
  s.version = "0.1.4"
  s.authors = ["Sami Sieranoja"]
  s.email = ["sami.sieranoja@gmail.com"]
  s.licenses = ["LGPL-2.0+"]

  s.summary = %q{StrIdx}
  s.description = %q{ Fast fuzzy string similarity search and indexing (for filenames)}
  s.homepage = "https://github.com/SamiSieranoja/stridx"
  s.metadata["source_code_uri"] = s.homepage
  s.metadata["homepage_uri"] = s.homepage

  # Package every git-tracked file except those under refcode/, spec/ or features/.
  tracked = `git ls-files -z`.split("\x0")
  s.files = tracked.reject { |path| path.match(%r{^(refcode|spec|features)/}) }
  # s.files << "thread_pool.hpp"
  # s.files << "exe/stridx.rb"
  # s.files << "server.rb"
  # s.files << "stridx-tty.rb"

  # Everything under exe/ is installed as an executable.
  s.bindir = "exe"
  s.executables = s.files.grep(%r{^exe/}).map { |path| File.basename(path) }
  s.require_paths = ["lib", "ext"]

  # Native extension built at install time.
  s.extensions = ["rubyext/extconf.rb"]

  {
    "bundler" => "~> 2.4.21",
    "rake" => "~> 13.1.0",
  }.each { |gem_name, requirement| s.add_development_dependency gem_name, requirement }

  {
    "tty-cursor" => "~> 0.7.1",
    "tty-prompt" => "~> 0.23.1",
    "tty-reader" => "~> 0.9.0",
    "tty-screen" => "~> 0.8.2",
    "pastel" => "~> 0.8.0",
    "daemons" => "~> 1.4.1",
  }.each { |gem_name, requirement| s.add_runtime_dependency gem_name, requirement }
end
|
data/stridx.hpp
CHANGED
@@ -1,21 +1,66 @@
|
|
1
1
|
|
2
|
+
#ifndef SSSTRIDX_HPP
|
3
|
+
#define SSSTRIDX_HPP
|
4
|
+
|
2
5
|
#include <stdio.h>
|
3
6
|
#include <stdlib.h>
|
4
7
|
#include <cassert>
|
5
8
|
|
6
9
|
#include <vector>
|
10
|
+
#include <array>
|
7
11
|
#include <iostream>
|
8
12
|
#include <unordered_map>
|
9
13
|
#include <set>
|
10
14
|
#include <algorithm>
|
11
15
|
#include <sstream>
|
12
16
|
|
13
|
-
#
|
14
|
-
#include <
|
15
|
-
#
|
17
|
+
#include <vector>
|
18
|
+
#include <mutex>
|
19
|
+
#include <thread>
|
16
20
|
|
21
|
+
#include "thread_pool.hpp"
|
17
22
|
#include "unordered_dense.h"
|
18
23
|
|
24
|
+
namespace StrIdx {
|
25
|
+
|
26
|
+
/* Small helper used instead of writing to std::cout directly.
   Messages can be filtered by a verbosity level. */
class Output {
private:
  int verboseLevel;

public:
  Output(int verb) : verboseLevel(verb) {}
  Output() : Output(3) {}
  ~Output() = default;

  // End of the recursion below: nothing left to print.
  static void print() {}

  // print("xxx ", 3, " yyy") writes "xxx 3 yyy" to std::cout.
  template <typename T, typename... Types> static void print(T head, Types... tail) {
    std::cout << head;
    print(tail...);
  }

  // Same as print(), followed by a newline:
  // printl("xxx ", 3, " yyy") writes "xxx 3 yyy\n".
  template <typename... Types> static void printl(Types... args) {
    print(args..., "\n");
  }

  /* printv(2, "xxx ", 3, " yyy") writes "xxx 3 yyy\n" only when this object's
   * verboseLevel is at least 2 (the first argument). When verboseLevel is 3 or
   * more, the line is prefixed with "[v=<vlevel>] ".
   */
  template <typename... Types> void printv(int vlevel, Types... args) {
    const bool enabled = (vlevel <= verboseLevel);
    if (!enabled) {
      return;
    }
    const bool tagged = (verboseLevel > 2);
    if (tagged) {
      print("[v=", vlevel, "] ");
    }
    printl(args...);
  }
};
|
63
|
+
|
19
64
|
// Transforms input string as follows:
|
20
65
|
// '/foo/bar/file1.txt'
|
21
66
|
// => vector{"foo", "bar", "file1.txt"}
|
@@ -34,7 +79,7 @@ std::vector<std::string> splitString(const std::string &input, const char &separ
|
|
34
79
|
}
|
35
80
|
|
36
81
|
// Convert int64_t to binary string
|
37
|
-
std::string int64ToBinaryString(int64_t num) {
|
82
|
+
[[nodiscard]] std::string int64ToBinaryString(const int64_t &num) {
|
38
83
|
std::string result;
|
39
84
|
for (int i = 63; i >= 0; --i) {
|
40
85
|
result += ((num >> i) & 1) ? '1' : '0';
|
@@ -42,8 +87,8 @@ std::string int64ToBinaryString(int64_t num) {
|
|
42
87
|
return result;
|
43
88
|
}
|
44
89
|
|
45
|
-
// Convert a (8 char) string represented as int64_t to std::string
|
46
|
-
std::string int64ToStr(int64_t key) {
|
90
|
+
// Debug. Convert a (8 char) string represented as int64_t to std::string
|
91
|
+
[[nodiscard]] std::string int64ToStr(const int64_t &key) {
|
47
92
|
int nchars = 8;
|
48
93
|
std::string str;
|
49
94
|
int multip = nchars * 8;
|
@@ -55,43 +100,45 @@ std::string int64ToStr(int64_t key) {
|
|
55
100
|
return str;
|
56
101
|
}
|
57
102
|
|
103
|
+
// Debug
|
58
104
|
void printVector(const std::vector<int> &vec) {
|
59
105
|
for (const auto &value : vec) {
|
60
106
|
std::cout << value << " ";
|
61
107
|
}
|
62
108
|
}
|
63
109
|
|
64
|
-
|
110
|
+
// Debug
|
111
|
+
[[nodiscard]] std::string charToBinaryString(const char &chr) {
|
65
112
|
std::string result;
|
66
113
|
for (int i = 7; i >= 0; --i) {
|
67
|
-
result += ((
|
114
|
+
result += ((chr >> i) & 1) ? '1' : '0';
|
68
115
|
}
|
69
116
|
return result;
|
70
117
|
}
|
71
118
|
|
72
119
|
class Candidate;
|
73
|
-
enum segmentType { Dir, File };
|
120
|
+
enum class segmentType { Dir, File };
|
74
121
|
|
75
122
|
// A segment of a file path
|
76
123
|
// e.g. if path is /foo/bar/baz.txt
|
77
124
|
// segments are [{root}, foo, bar, baz.txt]
|
78
|
-
|
79
|
-
public:
|
125
|
+
struct PathSegment {
|
80
126
|
std::string str;
|
81
127
|
int fileId; // (if FILE)
|
82
128
|
Candidate *cand;
|
83
129
|
PathSegment *parent;
|
130
|
+
std::mutex mu;
|
84
131
|
ankerl::unordered_dense::map<std::string, PathSegment *> children;
|
85
|
-
segmentType type = Dir;
|
86
|
-
PathSegment() : parent(
|
87
|
-
PathSegment(std::string _str) : str(_str), parent(
|
132
|
+
segmentType type = segmentType::Dir;
|
133
|
+
PathSegment() : parent(nullptr) {}
|
134
|
+
PathSegment(std::string _str) : str(_str), parent(nullptr) {}
|
88
135
|
PathSegment(std::string _str, int _fileId)
|
89
|
-
: str(_str), fileId(_fileId), cand(
|
90
|
-
int size() {
|
136
|
+
: str(_str), fileId(_fileId), cand(nullptr), parent(nullptr) {}
|
137
|
+
[[nodiscard]] int size() const {
|
91
138
|
int sz = str.size();
|
92
139
|
PathSegment *cur = parent;
|
93
140
|
// Sum up length of parent segments (+1 for divisors)
|
94
|
-
while (cur->parent !=
|
141
|
+
while (cur->parent != nullptr) {
|
95
142
|
sz += cur->str.size() + 1;
|
96
143
|
cur = cur->parent;
|
97
144
|
}
|
@@ -100,8 +147,7 @@ public:
|
|
100
147
|
};
|
101
148
|
|
102
149
|
// Candidate for result in string (filename) search
|
103
|
-
|
104
|
-
public:
|
150
|
+
struct Candidate {
|
105
151
|
std::vector<float> v_charscore;
|
106
152
|
PathSegment *seg;
|
107
153
|
int fileId;
|
@@ -114,25 +160,17 @@ public:
|
|
114
160
|
int candLen; // Length of candidate
|
115
161
|
|
116
162
|
Candidate(){};
|
117
|
-
Candidate(int _fileId, std::string _str, int _len) : fileId(_fileId), str(_str), len(_len) {
|
118
|
-
// Initialize v_charscores with zeros
|
119
|
-
v_charscore.resize(len, 0);
|
120
|
-
candLen = str.size();
|
121
|
-
seg = NULL;
|
122
|
-
}
|
123
|
-
|
124
163
|
Candidate(PathSegment *_seg, int _len) : seg(_seg), len(_len) {
|
125
164
|
// Initialize v_charscores with zeros
|
126
165
|
v_charscore.resize(len, 0);
|
127
166
|
candLen = seg->size();
|
128
167
|
}
|
129
168
|
|
130
|
-
float getScore() {
|
169
|
+
[[nodiscard]] float getScore() const {
|
131
170
|
int i = 0;
|
132
171
|
float score = 0.0;
|
133
|
-
candLen = seg->size();
|
134
172
|
|
135
|
-
for (float &charscore : v_charscore) {
|
173
|
+
for (const float &charscore : v_charscore) {
|
136
174
|
score += charscore;
|
137
175
|
i++;
|
138
176
|
}
|
@@ -145,35 +183,41 @@ public:
|
|
145
183
|
return score;
|
146
184
|
}
|
147
185
|
|
148
|
-
float operator[](int idx) { return v_charscore[idx]; }
|
186
|
+
[[nodiscard]] float operator[](int idx) const { return v_charscore[idx]; }
|
149
187
|
};
|
150
188
|
|
151
189
|
// This seems to give 10x speed improvement over std::unordered_map
|
152
190
|
typedef ankerl::unordered_dense::map<int64_t, std::set<PathSegment *> *> SegMap;
|
153
191
|
// typedef std::unordered_map<int64_t, std::set<PathSegment *> *> SegMap;
|
154
192
|
|
155
|
-
typedef
|
193
|
+
typedef ankerl::unordered_dense::map<int, Candidate *> CandMap;
|
194
|
+
// typedef std::unordered_map<int, Candidate*> CandMap;
|
156
195
|
|
157
196
|
class StringIndex {
|
158
197
|
private:
|
159
198
|
int tmp;
|
160
199
|
char dirSeparator = '/'; // Usually '/', '\' or '\0' (no separator)
|
200
|
+
int numStrings = 0;
|
161
201
|
|
162
202
|
std::vector<SegMap *> dirmaps;
|
203
|
+
std::array<std::mutex, 9> mts_d; // for dirmaps
|
163
204
|
std::vector<SegMap *> filemaps;
|
205
|
+
std::array<std::mutex, 9> mts_f; // for filemaps
|
164
206
|
|
165
207
|
std::vector<PathSegment *> segsToClean;
|
166
208
|
|
167
|
-
std::unordered_map<int, std::string> strlist;
|
168
209
|
std::unordered_map<int, PathSegment *> seglist;
|
169
210
|
PathSegment *root;
|
170
211
|
int dirId = 0;
|
171
212
|
float dirWeight = 0.7; // Give only 70% of score if match is for a directory
|
172
213
|
|
214
|
+
std::unique_ptr<ThreadPool> pool;
|
215
|
+
Output out{1}; // verbose level = 1
|
216
|
+
|
173
217
|
public:
|
174
|
-
StringIndex() {
|
218
|
+
StringIndex(char sep) : dirSeparator(sep) {
|
175
219
|
root = new PathSegment();
|
176
|
-
root->parent =
|
220
|
+
root->parent = nullptr;
|
177
221
|
root->str = "[ROOT]";
|
178
222
|
|
179
223
|
for (int i = 0; i <= 8; i++) {
|
@@ -181,11 +225,18 @@ public:
|
|
181
225
|
filemaps.push_back(new SegMap);
|
182
226
|
}
|
183
227
|
|
184
|
-
|
185
|
-
|
186
|
-
|
228
|
+
// Threads between 4 and 6
|
229
|
+
// We don't seem to get any benefit from more than 6 threads even if the hardware supports it
|
230
|
+
int num_threads = std::max((int)std::thread::hardware_concurrency(), 4);
|
231
|
+
num_threads = std::min(num_threads, 6);
|
232
|
+
out.printv(2, "Number of threads: ", num_threads);
|
233
|
+
pool = std::unique_ptr<ThreadPool>(new ThreadPool(num_threads));
|
187
234
|
}
|
188
235
|
|
236
|
+
/* Don't split the path into segments (separator = '\0').
|
237
|
+
This is slower, but can be used for other data than files also. */
|
238
|
+
StringIndex() : StringIndex('\0') {}
|
239
|
+
|
189
240
|
void setDirSeparator(char sep) { dirSeparator = sep; }
|
190
241
|
void setDirWeight(float val) { dirWeight = val; }
|
191
242
|
|
@@ -213,6 +264,15 @@ public:
|
|
213
264
|
addStrToIndex(filePath, fileId, dirSeparator);
|
214
265
|
}
|
215
266
|
|
267
|
+
void addStrToIndexThreaded(std::string filePath, int fileId) {
|
268
|
+
pool->enqueue([=] { addStrToIndex(filePath, fileId, dirSeparator); });
|
269
|
+
}
|
270
|
+
void waitUntilReady() const { pool->waitUntilDone(); }
|
271
|
+
|
272
|
+
void waitUntilDone() const { pool->waitUntilDone(); }
|
273
|
+
|
274
|
+
int size() const { return seglist.size(); }
|
275
|
+
|
216
276
|
/**
|
217
277
|
* Add a string to the index to be searched for afterwards
|
218
278
|
*
|
@@ -221,9 +281,17 @@ public:
|
|
221
281
|
* @param separator Can be used to split filePath to components (e.g. 'home','user'...). Usually
|
222
282
|
* one of {'\\', '/', '\0' (no separation)}.
|
223
283
|
*/
|
284
|
+
|
224
285
|
void addStrToIndex(std::string filePath, int fileId, const char &separator) {
|
286
|
+
out.printv(3, "Add file:", filePath, ",", fileId, ",", separator, ",",dirSeparator);
|
287
|
+
|
288
|
+
// If a string with this index has been added already
|
289
|
+
if (seglist.find(fileId) != seglist.end()) {
|
290
|
+
return;
|
291
|
+
}
|
225
292
|
|
226
293
|
std::vector<std::string> segs;
|
294
|
+
numStrings += 1;
|
227
295
|
|
228
296
|
if (separator == '\0') {
|
229
297
|
// No separation to directories & files
|
@@ -233,7 +301,7 @@ public:
|
|
233
301
|
segs = splitString(filePath, separator);
|
234
302
|
}
|
235
303
|
|
236
|
-
PathSegment *prev =
|
304
|
+
PathSegment *prev = nullptr;
|
237
305
|
prev = root;
|
238
306
|
// Add segments to a tree type data structure
|
239
307
|
// e.g. addStrToIndex('/foo/bar/file1.txt' ..)
|
@@ -245,25 +313,27 @@ public:
|
|
245
313
|
auto x = *_x;
|
246
314
|
PathSegment *p;
|
247
315
|
|
248
|
-
|
316
|
+
prev->mu.lock();
|
317
|
+
|
249
318
|
// this part of the path already exists in the tree
|
250
|
-
if (it != prev->children.end()) {
|
319
|
+
if (auto it = prev->children.find(x); it != prev->children.end()) {
|
251
320
|
p = it->second;
|
321
|
+
prev->mu.unlock();
|
252
322
|
} else {
|
253
323
|
p = new PathSegment(x, fileId);
|
254
324
|
p->parent = prev;
|
255
|
-
// If this is last item in segs
|
325
|
+
// If this is last item in segs, then it is a file.
|
256
326
|
if (_x == std::prev(segs.end())) {
|
257
|
-
|
258
|
-
p->type = File;
|
327
|
+
p->type = segmentType::File;
|
259
328
|
seglist[fileId] = p;
|
260
|
-
} else {
|
261
|
-
p->type = Dir;
|
329
|
+
} else { // otherwise, it is a directory
|
330
|
+
p->type = segmentType::Dir;
|
262
331
|
p->fileId = dirId;
|
263
332
|
// Files use user input Id. Directories need to have it generated
|
264
333
|
dirId++;
|
265
334
|
}
|
266
335
|
prev->children[x] = p;
|
336
|
+
prev->mu.unlock();
|
267
337
|
addPathSegmentKeys(p);
|
268
338
|
}
|
269
339
|
|
@@ -271,6 +341,17 @@ public:
|
|
271
341
|
}
|
272
342
|
}
|
273
343
|
|
344
|
+
std::string getString(int id) {
|
345
|
+
std::string s = "";
|
346
|
+
PathSegment *seg = seglist[id];
|
347
|
+
s += seg->str;
|
348
|
+
while (seg->parent->parent != nullptr) {
|
349
|
+
seg = seg->parent;
|
350
|
+
s = seg->str + dirSeparator + s;
|
351
|
+
}
|
352
|
+
return s;
|
353
|
+
}
|
354
|
+
|
274
355
|
/**
|
275
356
|
The search will find filepaths similar to the input string
|
276
357
|
|
@@ -303,14 +384,16 @@ public:
|
|
303
384
|
@param query String to search for inside the index
|
304
385
|
*/
|
305
386
|
|
306
|
-
std::vector<std::pair<float, int>> findSimilar(std::string query) {
|
387
|
+
[[nodiscard]] std::vector<std::pair<float, int>> findSimilar(std::string query) {
|
307
388
|
return findSimilar(query, 2);
|
308
389
|
}
|
309
390
|
|
310
|
-
std::vector<std::pair<float, int>> findSimilar(std::string query, int minChars) {
|
391
|
+
[[nodiscard]] std::vector<std::pair<float, int>> findSimilar(std::string query, int minChars) {
|
311
392
|
CandMap fileCandMap;
|
312
393
|
CandMap dirCandMap;
|
313
394
|
|
395
|
+
waitUntilDone();
|
396
|
+
|
314
397
|
// Find both files and directories that match the input query
|
315
398
|
addToCandMap(fileCandMap, query, filemaps);
|
316
399
|
addToCandMap(dirCandMap, query, dirmaps);
|
@@ -319,9 +402,9 @@ public:
|
|
319
402
|
scores of the file */
|
320
403
|
mergeCandidateMaps(fileCandMap, dirCandMap);
|
321
404
|
|
322
|
-
// Set all candidate pointers to
|
405
|
+
// Set all candidate pointers to nullptr so they won't mess up future searches
|
323
406
|
for (auto seg : segsToClean) {
|
324
|
-
seg->cand =
|
407
|
+
seg->cand = nullptr;
|
325
408
|
}
|
326
409
|
segsToClean.clear();
|
327
410
|
|
@@ -329,11 +412,17 @@ public:
|
|
329
412
|
std::vector<std::pair<float, int>> results;
|
330
413
|
for (auto &[fid, cand] : fileCandMap) {
|
331
414
|
std::pair<float, int> v;
|
332
|
-
float sc = cand
|
415
|
+
float sc = cand->getScore();
|
333
416
|
v.first = sc;
|
334
417
|
v.second = fid;
|
335
418
|
results.push_back(v);
|
419
|
+
delete cand;
|
336
420
|
}
|
421
|
+
|
422
|
+
for (auto &[fid, cand] : dirCandMap) {
|
423
|
+
delete cand;
|
424
|
+
}
|
425
|
+
|
337
426
|
// Sort highest score first
|
338
427
|
std::sort(results.begin(), results.end(),
|
339
428
|
[](std::pair<float, int> a, std::pair<float, int> b) { return a.first > b.first; });
|
@@ -341,10 +430,10 @@ public:
|
|
341
430
|
}
|
342
431
|
|
343
432
|
// Return int64_t representation of the first nchars in str, starting from index i
|
344
|
-
int64_t getKeyAtIdx(std::string str, int i, int nchars) {
|
433
|
+
[[nodiscard]] int64_t getKeyAtIdx(const std::string &str, int i, int nchars) const {
|
345
434
|
int64_t key = 0;
|
346
435
|
for (int i_char = 0; i_char < nchars; i_char++) {
|
347
|
-
key = key | static_cast<
|
436
|
+
key = key | static_cast<int64_t>(str[i + i_char]);
|
348
437
|
if (i_char < nchars - 1) {
|
349
438
|
// Shift 8 bits to the left except on the last iteration
|
350
439
|
key = key << 8;
|
@@ -399,22 +488,29 @@ private:
|
|
399
488
|
maxChars = p->str.size();
|
400
489
|
}
|
401
490
|
|
402
|
-
#ifdef _OPENMP
|
403
|
-
#pragma omp parallel for
|
404
|
-
#endif
|
405
491
|
for (int sublen = minChars; sublen <= maxChars; sublen++) {
|
406
492
|
|
493
|
+
std::mutex *mu;
|
407
494
|
SegMap *map;
|
408
|
-
if (p->type == File) {
|
495
|
+
if (p->type == segmentType::File) {
|
409
496
|
map = filemaps[sublen];
|
497
|
+
mu = &mts_f[sublen];
|
410
498
|
} else {
|
411
499
|
map = dirmaps[sublen];
|
500
|
+
mu = &mts_d[sublen];
|
412
501
|
}
|
413
502
|
|
414
503
|
int count = str.size() - sublen + 1;
|
415
504
|
|
505
|
+
int64_t keys[count + 1];
|
416
506
|
for (int i = 0; i <= count; i++) {
|
417
|
-
|
507
|
+
keys[i] = getKeyAtIdx(str, i, sublen);
|
508
|
+
}
|
509
|
+
|
510
|
+
mu->lock();
|
511
|
+
for (int i = 0; i <= count; i++) {
|
512
|
+
// int64_t key = getKeyAtIdx(str, i, sublen);
|
513
|
+
auto key = keys[i];
|
418
514
|
|
419
515
|
// Create a new std::set for key if doesn't exist already
|
420
516
|
auto it = map->find(key);
|
@@ -423,12 +519,14 @@ private:
|
|
423
519
|
}
|
424
520
|
(*map)[key]->insert(p);
|
425
521
|
}
|
522
|
+
mu->unlock();
|
426
523
|
}
|
427
524
|
}
|
428
525
|
|
429
526
|
// Find pathsegments from <map> that include the substring of <str> which starts at index <i> and
|
430
527
|
// is of length <nchars>.
|
431
|
-
std::vector<PathSegment *> findSimilarForNgram(std::string str, int i, int nchars,
|
528
|
+
[[nodiscard]] std::vector<PathSegment *> findSimilarForNgram(std::string str, int i, int nchars,
|
529
|
+
SegMap &map) const {
|
432
530
|
|
433
531
|
assert(i + nchars <= static_cast<int>(str.size()));
|
434
532
|
std::vector<PathSegment *> res;
|
@@ -437,8 +535,7 @@ private:
|
|
437
535
|
// transform that to 64 bit integer
|
438
536
|
int64_t key = getKeyAtIdx(str, i, nchars);
|
439
537
|
// Find all path segments in map that have the same substring
|
440
|
-
auto it = map.find(key);
|
441
|
-
if (it != map.end()) { // key found
|
538
|
+
if (auto it = map.find(key); it != map.end()) { // key found
|
442
539
|
auto set = it->second;
|
443
540
|
for (auto value : *set) {
|
444
541
|
res.push_back(value);
|
@@ -475,12 +572,12 @@ private:
|
|
475
572
|
void mergeCandidateMaps(CandMap &fileCandMap, CandMap &dirCandMap) {
|
476
573
|
|
477
574
|
for (auto &[fid, cand] : fileCandMap) {
|
478
|
-
PathSegment *p = cand
|
479
|
-
while (p->parent !=
|
480
|
-
if (p->cand !=
|
481
|
-
auto &scoreA = cand
|
575
|
+
PathSegment *p = cand->seg->parent;
|
576
|
+
while (p->parent != nullptr) {
|
577
|
+
if (p->cand != nullptr) {
|
578
|
+
auto &scoreA = cand->v_charscore;
|
482
579
|
auto &scoreB = p->cand->v_charscore;
|
483
|
-
for (int i = 0; i < cand
|
580
|
+
for (int i = 0; i < cand->len; i++) {
|
484
581
|
if (scoreA[i] < scoreB[i] * dirWeight) {
|
485
582
|
scoreA[i] = scoreB[i] * dirWeight;
|
486
583
|
}
|
@@ -493,18 +590,22 @@ private:
|
|
493
590
|
|
494
591
|
void addToResults(PathSegment *seg, std::string str, int i, int nchars, CandMap &candmap) {
|
495
592
|
|
496
|
-
auto it2 = candmap.find(seg->fileId);
|
497
|
-
|
498
|
-
Candidate cand(seg, str.size());
|
499
|
-
seg->cand = &(candmap[seg->fileId]);
|
593
|
+
if (auto it2 = candmap.find(seg->fileId); it2 == candmap.end()) {
|
594
|
+
Candidate *cand = new Candidate(seg, str.size());
|
500
595
|
segsToClean.push_back(seg);
|
501
596
|
candmap[seg->fileId] = cand;
|
597
|
+
seg->cand = cand;
|
502
598
|
}
|
503
599
|
|
504
600
|
for (int j = i; j < i + nchars; j++) {
|
505
|
-
|
506
|
-
|
601
|
+
Candidate &cand = *(candmap[seg->fileId]);
|
602
|
+
if (cand[j] < nchars) {
|
603
|
+
cand.v_charscore[j] = nchars;
|
507
604
|
}
|
508
605
|
}
|
509
606
|
}
|
510
607
|
};
|
608
|
+
|
609
|
+
} // namespace StrIdx
|
610
|
+
|
611
|
+
#endif
|
data/test.rb
CHANGED
@@ -1,8 +1,13 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
+
$:.unshift File.dirname(__FILE__)
|
4
|
+
|
3
5
|
require "stridx"
|
4
6
|
idx = StrIdx::StringIndex.new
|
5
7
|
|
8
|
+
# "/" for unix-style file paths
|
9
|
+
idx.setDirSeparator("/") #(comment out if not file paths)
|
10
|
+
|
6
11
|
t = Time.new
|
7
12
|
fn = File.expand_path("flist.txt")
|
8
13
|
lines = IO.read(fn).lines.collect { |x| x.strip }
|
@@ -13,7 +18,13 @@ for x in lines
|
|
13
18
|
end
|
14
19
|
|
15
20
|
idx_time = Time.new
|
16
|
-
|
21
|
+
# Time to start the threadpool to process indexing
|
22
|
+
puts "\nIndexing launch time (#{lines.size} files): #{(idx_time - t).round(4)} seconds"
|
23
|
+
|
24
|
+
idx.waitUntilDone() # Not necessary, will be called by idx.find
|
25
|
+
idx_time = Time.new
|
26
|
+
# Time when all threads have completed
|
27
|
+
puts "\nIndexing completed time (#{lines.size} files): #{(idx_time - t).round(4)} seconds"
|
17
28
|
|
18
29
|
query = "rngnomadriv"
|
19
30
|
res = idx.find(query)
|