StrIdx 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Makefile +6 -265
- data/README.md +10 -1
- data/demo.cpp +30 -8
- data/flist.txt +89828 -0
- data/rubyext/extconf.rb +1 -3
- data/rubyext/ruby_interf.cpp +38 -4
- data/stridx.hpp +160 -62
- data/test.rb +7 -1
- data/thread_pool.hpp +98 -0
- metadata +7 -3
data/rubyext/extconf.rb
CHANGED
@@ -1,15 +1,13 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
#
|
3
2
|
|
4
3
|
require 'mkmf'
|
5
4
|
|
6
5
|
module_name = "stridx"
|
7
6
|
extension_name = 'stridx'
|
8
7
|
|
9
|
-
$CXXFLAGS << " -Wall -Wno-unused-variable -O3
|
8
|
+
$CXXFLAGS << " -Wall -Wno-unused-variable -O3"
|
10
9
|
|
11
10
|
have_library( 'stdc++');
|
12
|
-
have_library( 'gomp' );
|
13
11
|
|
14
12
|
dir_config(extension_name) # The destination
|
15
13
|
create_makefile(extension_name) # Create Makefile
|
data/rubyext/ruby_interf.cpp
CHANGED
@@ -7,7 +7,7 @@
|
|
7
7
|
|
8
8
|
extern "C" {
|
9
9
|
|
10
|
-
void str_idx_free(void *data) { delete (StringIndex *)data; }
|
10
|
+
void str_idx_free(void *data) { delete (StrIdx::StringIndex *)data; }
|
11
11
|
|
12
12
|
// Wrap StringIndex class inside a ruby variable
|
13
13
|
static const rb_data_type_t str_idx_type = {
|
@@ -26,7 +26,7 @@ static const rb_data_type_t str_idx_type = {
|
|
26
26
|
};
|
27
27
|
|
28
28
|
VALUE str_idx_alloc(VALUE self) {
|
29
|
-
void *data = new StringIndex();
|
29
|
+
void *data = new StrIdx::StringIndex();
|
30
30
|
return TypedData_Wrap_Struct(self, &str_idx_type, data);
|
31
31
|
}
|
32
32
|
|
@@ -36,18 +36,27 @@ VALUE StringIndexAddSegments(VALUE self, VALUE str, VALUE fileId) {
|
|
36
36
|
|
37
37
|
void *data;
|
38
38
|
TypedData_Get_Struct(self, int, &str_idx_type, data);
|
39
|
-
((StringIndex *)data)->addStrToIndex(s1, fid
|
39
|
+
// ((StringIndex *)data)->addStrToIndex(s1, fid);
|
40
|
+
((StrIdx::StringIndex *)data)->addStrToIndexThreaded(s1, fid);
|
40
41
|
|
41
42
|
return self;
|
42
43
|
}
|
43
44
|
|
45
|
+
VALUE StringIndexWaitUntilDone(VALUE self) {
|
46
|
+
void *data;
|
47
|
+
TypedData_Get_Struct(self, int, &str_idx_type, data);
|
48
|
+
((StrIdx::StringIndex *)data)->waitUntilDone();
|
49
|
+
return self;
|
50
|
+
}
|
51
|
+
|
52
|
+
|
44
53
|
VALUE StringIndexFind(VALUE self, VALUE str) {
|
45
54
|
VALUE ret;
|
46
55
|
std::string s1 = StringValueCStr(str);
|
47
56
|
|
48
57
|
void *data;
|
49
58
|
TypedData_Get_Struct(self, int, &str_idx_type, data);
|
50
|
-
StringIndex *idx = (StringIndex *)data;
|
59
|
+
StrIdx::StringIndex *idx = (StrIdx::StringIndex *)data;
|
51
60
|
|
52
61
|
ret = rb_ary_new();
|
53
62
|
const std::vector<std::pair<float, int>> &results = idx->findSimilar(s1, 2);
|
@@ -66,6 +75,26 @@ VALUE StringIndexFind(VALUE self, VALUE str) {
|
|
66
75
|
return ret;
|
67
76
|
}
|
68
77
|
|
78
|
+
VALUE StringIndexSetDirSeparator(VALUE self, VALUE str) {
|
79
|
+
char c = '/';
|
80
|
+
if (TYPE(str) == T_STRING) {
|
81
|
+
std::string s = StringValueCStr(str);
|
82
|
+
if (s.size() >= 1) {
|
83
|
+
c = s[0];
|
84
|
+
}
|
85
|
+
} else {
|
86
|
+
c = '\0'; // No separator
|
87
|
+
// if (TYPE(obj) == T_NIL)
|
88
|
+
}
|
89
|
+
|
90
|
+
void *data;
|
91
|
+
TypedData_Get_Struct(self, int, &str_idx_type, data);
|
92
|
+
StrIdx::StringIndex *idx = (StrIdx::StringIndex *)data;
|
93
|
+
idx->setDirSeparator(c);
|
94
|
+
|
95
|
+
return self;
|
96
|
+
}
|
97
|
+
|
69
98
|
void Init_stridx(void) {
|
70
99
|
|
71
100
|
VALUE mStrIdx = rb_define_module("StrIdx");
|
@@ -73,7 +102,12 @@ void Init_stridx(void) {
|
|
73
102
|
|
74
103
|
rb_define_alloc_func(classStringIndex, str_idx_alloc);
|
75
104
|
rb_define_method(classStringIndex, "add", StringIndexAddSegments, 2);
|
105
|
+
rb_define_method(classStringIndex, "waitUntilDone", StringIndexWaitUntilDone, 0);
|
76
106
|
rb_define_method(classStringIndex, "find", StringIndexFind, 1);
|
107
|
+
|
108
|
+
rb_define_method(classStringIndex, "setDirSeparator", StringIndexSetDirSeparator, 1);
|
109
|
+
|
110
|
+
|
77
111
|
}
|
78
112
|
|
79
113
|
} // End extern "C"
|
data/stridx.hpp
CHANGED
@@ -1,21 +1,66 @@
|
|
1
1
|
|
2
|
+
#ifndef SSSTRIDX_HPP
|
3
|
+
#define SSSTRIDX_HPP
|
4
|
+
|
2
5
|
#include <stdio.h>
|
3
6
|
#include <stdlib.h>
|
4
7
|
#include <cassert>
|
5
8
|
|
6
9
|
#include <vector>
|
10
|
+
#include <array>
|
7
11
|
#include <iostream>
|
8
12
|
#include <unordered_map>
|
9
13
|
#include <set>
|
10
14
|
#include <algorithm>
|
11
15
|
#include <sstream>
|
12
16
|
|
13
|
-
#
|
14
|
-
#include <
|
15
|
-
#
|
17
|
+
#include <vector>
|
18
|
+
#include <mutex>
|
19
|
+
#include <thread>
|
16
20
|
|
21
|
+
#include "thread_pool.hpp"
|
17
22
|
#include "unordered_dense.h"
|
18
23
|
|
24
|
+
namespace StrIdx {
|
25
|
+
|
26
|
+
/* Alternative to using std::cout
|
27
|
+
Allows to control verbose level */
|
28
|
+
class Output {
|
29
|
+
private:
|
30
|
+
int verboseLevel;
|
31
|
+
|
32
|
+
public:
|
33
|
+
Output(int verb) : verboseLevel(verb) {}
|
34
|
+
Output() : Output(3) {}
|
35
|
+
~Output() = default;
|
36
|
+
void print() {}
|
37
|
+
|
38
|
+
// When calling as print("xxx ",3, " yyy") outputs "xxx 3 yyy"
|
39
|
+
template <typename T, typename... Types> void print(T var1, Types... var2) {
|
40
|
+
std::cout << var1;
|
41
|
+
print(var2...);
|
42
|
+
}
|
43
|
+
|
44
|
+
// When calling as printl("xxx ",3, " yyy") outputs "xxx 3 yyy\n"
|
45
|
+
template <typename... Types> void printl(Types... var2) {
|
46
|
+
print(var2...);
|
47
|
+
print("\n");
|
48
|
+
}
|
49
|
+
|
50
|
+
/* When calling as printv(2, "xxx ",3, " yyy") outputs "xxx 3 yyy\n"
|
51
|
+
* if verboseLevel >= 2 (first arg)
|
52
|
+
*/
|
53
|
+
template <typename... Types> void printv(int vlevel, Types... var2) {
|
54
|
+
if (verboseLevel < vlevel) {
|
55
|
+
return;
|
56
|
+
}
|
57
|
+
if (verboseLevel >= 3) {
|
58
|
+
print("[v=", vlevel, "] ");
|
59
|
+
}
|
60
|
+
printl(var2...);
|
61
|
+
}
|
62
|
+
};
|
63
|
+
|
19
64
|
// Transforms input string as follows:
|
20
65
|
// '/foo/bar/file1.txt'
|
21
66
|
// => vector{"foo", "bar", "file1.txt"}
|
@@ -34,7 +79,7 @@ std::vector<std::string> splitString(const std::string &input, const char &separ
|
|
34
79
|
}
|
35
80
|
|
36
81
|
// Convert int64_t to binary string
|
37
|
-
std::string int64ToBinaryString(int64_t num) {
|
82
|
+
[[nodiscard]] std::string int64ToBinaryString(int64_t num) {
|
38
83
|
std::string result;
|
39
84
|
for (int i = 63; i >= 0; --i) {
|
40
85
|
result += ((num >> i) & 1) ? '1' : '0';
|
@@ -42,8 +87,8 @@ std::string int64ToBinaryString(int64_t num) {
|
|
42
87
|
return result;
|
43
88
|
}
|
44
89
|
|
45
|
-
// Convert a (8 char) string represented as int64_t to std::string
|
46
|
-
std::string int64ToStr(int64_t key) {
|
90
|
+
// Debug. Convert a (8 char) string represented as int64_t to std::string
|
91
|
+
[[nodiscard]] std::string int64ToStr(int64_t key) {
|
47
92
|
int nchars = 8;
|
48
93
|
std::string str;
|
49
94
|
int multip = nchars * 8;
|
@@ -55,22 +100,24 @@ std::string int64ToStr(int64_t key) {
|
|
55
100
|
return str;
|
56
101
|
}
|
57
102
|
|
103
|
+
// Debug
|
58
104
|
void printVector(const std::vector<int> &vec) {
|
59
105
|
for (const auto &value : vec) {
|
60
106
|
std::cout << value << " ";
|
61
107
|
}
|
62
108
|
}
|
63
109
|
|
64
|
-
|
110
|
+
// Debug
|
111
|
+
[[nodiscard]] std::string charToBinaryString(char chr) {
|
65
112
|
std::string result;
|
66
113
|
for (int i = 7; i >= 0; --i) {
|
67
|
-
result += ((
|
114
|
+
result += ((chr >> i) & 1) ? '1' : '0';
|
68
115
|
}
|
69
116
|
return result;
|
70
117
|
}
|
71
118
|
|
72
119
|
class Candidate;
|
73
|
-
enum segmentType { Dir, File };
|
120
|
+
enum class segmentType { Dir, File };
|
74
121
|
|
75
122
|
// A segment of a file path
|
76
123
|
// e.g. if path is /foo/bar/baz.txt
|
@@ -81,17 +128,18 @@ public:
|
|
81
128
|
int fileId; // (if FILE)
|
82
129
|
Candidate *cand;
|
83
130
|
PathSegment *parent;
|
131
|
+
std::mutex mu;
|
84
132
|
ankerl::unordered_dense::map<std::string, PathSegment *> children;
|
85
|
-
segmentType type = Dir;
|
86
|
-
PathSegment() : parent(
|
87
|
-
PathSegment(std::string _str) : str(_str), parent(
|
133
|
+
segmentType type = segmentType::Dir;
|
134
|
+
PathSegment() : parent(nullptr) {}
|
135
|
+
PathSegment(std::string _str) : str(_str), parent(nullptr) {}
|
88
136
|
PathSegment(std::string _str, int _fileId)
|
89
|
-
: str(_str), fileId(_fileId), cand(
|
90
|
-
int size() {
|
137
|
+
: str(_str), fileId(_fileId), cand(nullptr), parent(nullptr) {}
|
138
|
+
[[nodiscard]] int size() {
|
91
139
|
int sz = str.size();
|
92
140
|
PathSegment *cur = parent;
|
93
141
|
// Sum up length of parent segments (+1 for divisors)
|
94
|
-
while (cur->parent !=
|
142
|
+
while (cur->parent != nullptr) {
|
95
143
|
sz += cur->str.size() + 1;
|
96
144
|
cur = cur->parent;
|
97
145
|
}
|
@@ -118,7 +166,7 @@ public:
|
|
118
166
|
// Initialize v_charscores with zeros
|
119
167
|
v_charscore.resize(len, 0);
|
120
168
|
candLen = str.size();
|
121
|
-
seg =
|
169
|
+
seg = nullptr;
|
122
170
|
}
|
123
171
|
|
124
172
|
Candidate(PathSegment *_seg, int _len) : seg(_seg), len(_len) {
|
@@ -127,7 +175,7 @@ public:
|
|
127
175
|
candLen = seg->size();
|
128
176
|
}
|
129
177
|
|
130
|
-
float getScore() {
|
178
|
+
[[nodiscard]] float getScore() {
|
131
179
|
int i = 0;
|
132
180
|
float score = 0.0;
|
133
181
|
candLen = seg->size();
|
@@ -145,19 +193,21 @@ public:
|
|
145
193
|
return score;
|
146
194
|
}
|
147
195
|
|
148
|
-
float operator[](int idx) { return v_charscore[idx]; }
|
196
|
+
[[nodiscard]] float operator[](int idx) { return v_charscore[idx]; }
|
149
197
|
};
|
150
198
|
|
151
199
|
// This seems to give 10x speed improvement over std::unordered_map
|
152
200
|
typedef ankerl::unordered_dense::map<int64_t, std::set<PathSegment *> *> SegMap;
|
153
201
|
// typedef std::unordered_map<int64_t, std::set<PathSegment *> *> SegMap;
|
154
202
|
|
155
|
-
typedef
|
203
|
+
typedef ankerl::unordered_dense::map<int, Candidate *> CandMap;
|
204
|
+
// typedef std::unordered_map<int, Candidate*> CandMap;
|
156
205
|
|
157
206
|
class StringIndex {
|
158
207
|
private:
|
159
208
|
int tmp;
|
160
209
|
char dirSeparator = '/'; // Usually '/', '\' or '\0' (no separator)
|
210
|
+
int numStrings = 0;
|
161
211
|
|
162
212
|
std::vector<SegMap *> dirmaps;
|
163
213
|
std::vector<SegMap *> filemaps;
|
@@ -170,10 +220,16 @@ private:
|
|
170
220
|
int dirId = 0;
|
171
221
|
float dirWeight = 0.7; // Give only 70% of score if match is for a directory
|
172
222
|
|
223
|
+
std::array<std::mutex, 9> mts_f;
|
224
|
+
std::array<std::mutex, 9> mts_d;
|
225
|
+
|
226
|
+
std::unique_ptr<ThreadPool> pool;
|
227
|
+
Output out{1}; // verbose level = 1
|
228
|
+
|
173
229
|
public:
|
174
|
-
StringIndex() {
|
230
|
+
StringIndex(char sep) : dirSeparator(sep) {
|
175
231
|
root = new PathSegment();
|
176
|
-
root->parent =
|
232
|
+
root->parent = nullptr;
|
177
233
|
root->str = "[ROOT]";
|
178
234
|
|
179
235
|
for (int i = 0; i <= 8; i++) {
|
@@ -181,11 +237,18 @@ public:
|
|
181
237
|
filemaps.push_back(new SegMap);
|
182
238
|
}
|
183
239
|
|
184
|
-
|
185
|
-
|
186
|
-
|
240
|
+
// Threads between 4 and 6
|
241
|
+
// We don't seem to get any benefit from more than 6 threads even if the hardware supports it
|
242
|
+
int num_threads = std::max((int)std::thread::hardware_concurrency(), 4);
|
243
|
+
num_threads = std::min(num_threads, 6);
|
244
|
+
out.printv(2, "Number of threads: ", num_threads);
|
245
|
+
pool = std::unique_ptr<ThreadPool>(new ThreadPool(num_threads));
|
187
246
|
}
|
188
247
|
|
248
|
+
/* Don't separate path to segments separator=\0.
|
249
|
+
This is slower, but can be used for other data than files also. */
|
250
|
+
StringIndex() : StringIndex('\0') {}
|
251
|
+
|
189
252
|
void setDirSeparator(char sep) { dirSeparator = sep; }
|
190
253
|
void setDirWeight(float val) { dirWeight = val; }
|
191
254
|
|
@@ -213,8 +276,15 @@ public:
|
|
213
276
|
addStrToIndex(filePath, fileId, dirSeparator);
|
214
277
|
}
|
215
278
|
|
279
|
+
void addStrToIndexThreaded(std::string filePath, int fileId) {
|
280
|
+
pool->enqueue([=] { addStrToIndex(filePath, fileId, dirSeparator); });
|
281
|
+
}
|
282
|
+
void waitUntilReady() { pool->waitUntilDone(); }
|
283
|
+
|
284
|
+
void waitUntilDone() { pool->waitUntilDone(); }
|
285
|
+
|
216
286
|
/**
|
217
|
-
* Add a string to the index to be
|
287
|
+
* Add a string to the index to be searched for afterwards
|
218
288
|
*
|
219
289
|
* @param filePath String to index (e.g. /home/user/Project/main.cpp).
|
220
290
|
* @param fileId Unique identifier for filePath. Will be return as result from findSimilar.
|
@@ -222,8 +292,10 @@ public:
|
|
222
292
|
* one of {'\\', '/', '\0' (no separation)}.
|
223
293
|
*/
|
224
294
|
void addStrToIndex(std::string filePath, int fileId, const char &separator) {
|
295
|
+
out.printv(3, "Add file:", filePath, ",", fileId, ",", separator);
|
225
296
|
|
226
297
|
std::vector<std::string> segs;
|
298
|
+
numStrings += 1;
|
227
299
|
|
228
300
|
if (separator == '\0') {
|
229
301
|
// No separation to directories & files
|
@@ -233,7 +305,7 @@ public:
|
|
233
305
|
segs = splitString(filePath, separator);
|
234
306
|
}
|
235
307
|
|
236
|
-
PathSegment *prev =
|
308
|
+
PathSegment *prev = nullptr;
|
237
309
|
prev = root;
|
238
310
|
// Add segments to a tree type data structure
|
239
311
|
// e.g. addStrToIndex('/foo/bar/file1.txt' ..)
|
@@ -245,25 +317,27 @@ public:
|
|
245
317
|
auto x = *_x;
|
246
318
|
PathSegment *p;
|
247
319
|
|
248
|
-
|
320
|
+
prev->mu.lock();
|
321
|
+
|
249
322
|
// this part of the path already exists in the tree
|
250
|
-
if (it != prev->children.end()) {
|
323
|
+
if (auto it = prev->children.find(x); it != prev->children.end()) {
|
251
324
|
p = it->second;
|
325
|
+
prev->mu.unlock();
|
252
326
|
} else {
|
253
327
|
p = new PathSegment(x, fileId);
|
254
328
|
p->parent = prev;
|
255
|
-
// If this is last item in segs
|
329
|
+
// If this is last item in segs, then it is a file.
|
256
330
|
if (_x == std::prev(segs.end())) {
|
257
|
-
|
258
|
-
p->type = File;
|
331
|
+
p->type = segmentType::File;
|
259
332
|
seglist[fileId] = p;
|
260
|
-
} else {
|
261
|
-
p->type = Dir;
|
333
|
+
} else { // otherwise, it is a directory
|
334
|
+
p->type = segmentType::Dir;
|
262
335
|
p->fileId = dirId;
|
263
336
|
// Files use user input Id. Directories need to have it generated
|
264
337
|
dirId++;
|
265
338
|
}
|
266
339
|
prev->children[x] = p;
|
340
|
+
prev->mu.unlock();
|
267
341
|
addPathSegmentKeys(p);
|
268
342
|
}
|
269
343
|
|
@@ -272,7 +346,7 @@ public:
|
|
272
346
|
}
|
273
347
|
|
274
348
|
/**
|
275
|
-
|
349
|
+
The search will find filepaths similar to the input string
|
276
350
|
|
277
351
|
To be considered a candidate path, the file component of the path (e.g. file.txt)
|
278
352
|
is required to have at least a substring of two characters in common with the
|
@@ -286,8 +360,8 @@ public:
|
|
286
360
|
is also included in the PathSegment
|
287
361
|
- take the lenght of that substring as score
|
288
362
|
sum up the scores for each character c and divide by (string length)^2
|
289
|
-
|
290
|
-
For example, if query = "rngnomadriv"
|
363
|
+
|
364
|
+
For example, if query = "rngnomadriv"
|
291
365
|
and candidate is "./drivers/char/hw_random/nomadik-rng.c", then scores are calculated
|
292
366
|
as follows:
|
293
367
|
rngnomadriv
|
@@ -296,17 +370,23 @@ public:
|
|
296
370
|
score1=(3+3+3+5+5+5+5+5+(4+4+4)*0.7)
|
297
371
|
|
298
372
|
In final score, give a small penalty for larger candidate filenames:
|
299
|
-
Divide main part of score with (query string length)^2
|
373
|
+
Divide main part of score with (query string length)^2
|
300
374
|
and minor part by (query string length)*(candidate string length)
|
301
375
|
score = score1/(11*11)*0.97 + score1/(11*38)*0.03 = 0.342944
|
302
376
|
|
303
377
|
@param query String to search for inside the index
|
304
378
|
*/
|
305
379
|
|
306
|
-
std::vector<std::pair<float, int>> findSimilar(std::string query
|
380
|
+
[[nodiscard]] std::vector<std::pair<float, int>> findSimilar(std::string query) {
|
381
|
+
return findSimilar(query, 2);
|
382
|
+
}
|
383
|
+
|
384
|
+
[[nodiscard]] std::vector<std::pair<float, int>> findSimilar(std::string query, int minChars) {
|
307
385
|
CandMap fileCandMap;
|
308
386
|
CandMap dirCandMap;
|
309
387
|
|
388
|
+
waitUntilDone();
|
389
|
+
|
310
390
|
// Find both files and directories that match the input query
|
311
391
|
addToCandMap(fileCandMap, query, filemaps);
|
312
392
|
addToCandMap(dirCandMap, query, dirmaps);
|
@@ -315,9 +395,9 @@ public:
|
|
315
395
|
scores of the file */
|
316
396
|
mergeCandidateMaps(fileCandMap, dirCandMap);
|
317
397
|
|
318
|
-
// Set all candidate pointers to
|
398
|
+
// Set all candidate pointers to nullptr so they won't mess up future searches
|
319
399
|
for (auto seg : segsToClean) {
|
320
|
-
seg->cand =
|
400
|
+
seg->cand = nullptr;
|
321
401
|
}
|
322
402
|
segsToClean.clear();
|
323
403
|
|
@@ -325,11 +405,17 @@ public:
|
|
325
405
|
std::vector<std::pair<float, int>> results;
|
326
406
|
for (auto &[fid, cand] : fileCandMap) {
|
327
407
|
std::pair<float, int> v;
|
328
|
-
float sc = cand
|
408
|
+
float sc = cand->getScore();
|
329
409
|
v.first = sc;
|
330
410
|
v.second = fid;
|
331
411
|
results.push_back(v);
|
412
|
+
delete cand;
|
332
413
|
}
|
414
|
+
|
415
|
+
for (auto &[fid, cand] : dirCandMap) {
|
416
|
+
delete cand;
|
417
|
+
}
|
418
|
+
|
333
419
|
// Sort highest score first
|
334
420
|
std::sort(results.begin(), results.end(),
|
335
421
|
[](std::pair<float, int> a, std::pair<float, int> b) { return a.first > b.first; });
|
@@ -337,10 +423,10 @@ public:
|
|
337
423
|
}
|
338
424
|
|
339
425
|
// Return int64_t representation of the first nchars in str, starting from index i
|
340
|
-
int64_t getKeyAtIdx(std::string str, int i, int nchars) {
|
426
|
+
[[nodiscard]] int64_t getKeyAtIdx(std::string str, int i, int nchars) {
|
341
427
|
int64_t key = 0;
|
342
428
|
for (int i_char = 0; i_char < nchars; i_char++) {
|
343
|
-
key = key | static_cast<
|
429
|
+
key = key | static_cast<int64_t>(str[i + i_char]);
|
344
430
|
if (i_char < nchars - 1) {
|
345
431
|
// Shift 8 bits to the left except on the last iteration
|
346
432
|
key = key << 8;
|
@@ -395,22 +481,29 @@ private:
|
|
395
481
|
maxChars = p->str.size();
|
396
482
|
}
|
397
483
|
|
398
|
-
#ifdef _OPENMP
|
399
|
-
#pragma omp parallel for
|
400
|
-
#endif
|
401
484
|
for (int sublen = minChars; sublen <= maxChars; sublen++) {
|
402
485
|
|
486
|
+
std::mutex *mu;
|
403
487
|
SegMap *map;
|
404
|
-
if (p->type == File) {
|
488
|
+
if (p->type == segmentType::File) {
|
405
489
|
map = filemaps[sublen];
|
490
|
+
mu = &mts_f[sublen];
|
406
491
|
} else {
|
407
492
|
map = dirmaps[sublen];
|
493
|
+
mu = &mts_d[sublen];
|
408
494
|
}
|
409
495
|
|
410
496
|
int count = str.size() - sublen + 1;
|
411
497
|
|
498
|
+
int64_t keys[count + 1];
|
499
|
+
for (int i = 0; i <= count; i++) {
|
500
|
+
keys[i] = getKeyAtIdx(str, i, sublen);
|
501
|
+
}
|
502
|
+
|
503
|
+
mu->lock();
|
412
504
|
for (int i = 0; i <= count; i++) {
|
413
|
-
int64_t key = getKeyAtIdx(str, i, sublen);
|
505
|
+
// int64_t key = getKeyAtIdx(str, i, sublen);
|
506
|
+
auto key = keys[i];
|
414
507
|
|
415
508
|
// Create a new std::set for key if doesn't exist already
|
416
509
|
auto it = map->find(key);
|
@@ -419,12 +512,14 @@ private:
|
|
419
512
|
}
|
420
513
|
(*map)[key]->insert(p);
|
421
514
|
}
|
515
|
+
mu->unlock();
|
422
516
|
}
|
423
517
|
}
|
424
518
|
|
425
519
|
// Find pathsegments from <map> that include the substring of <str> which starts at index <i> and
|
426
520
|
// is of length <nchars>.
|
427
|
-
std::vector<PathSegment *> findSimilarForNgram(std::string str, int i, int nchars,
|
521
|
+
[[nodiscard]] std::vector<PathSegment *> findSimilarForNgram(std::string str, int i, int nchars,
|
522
|
+
SegMap &map) {
|
428
523
|
|
429
524
|
assert(i + nchars <= static_cast<int>(str.size()));
|
430
525
|
std::vector<PathSegment *> res;
|
@@ -433,8 +528,7 @@ private:
|
|
433
528
|
// transform that to 64 bit integer
|
434
529
|
int64_t key = getKeyAtIdx(str, i, nchars);
|
435
530
|
// Find all path segments in map that have the same substring
|
436
|
-
auto it = map.find(key);
|
437
|
-
if (it != map.end()) { // key found
|
531
|
+
if (auto it = map.find(key); it != map.end()) { // key found
|
438
532
|
auto set = it->second;
|
439
533
|
for (auto value : *set) {
|
440
534
|
res.push_back(value);
|
@@ -471,12 +565,12 @@ private:
|
|
471
565
|
void mergeCandidateMaps(CandMap &fileCandMap, CandMap &dirCandMap) {
|
472
566
|
|
473
567
|
for (auto &[fid, cand] : fileCandMap) {
|
474
|
-
PathSegment *p = cand
|
475
|
-
while (p->parent !=
|
476
|
-
if (p->cand !=
|
477
|
-
auto &scoreA = cand
|
568
|
+
PathSegment *p = cand->seg->parent;
|
569
|
+
while (p->parent != nullptr) {
|
570
|
+
if (p->cand != nullptr) {
|
571
|
+
auto &scoreA = cand->v_charscore;
|
478
572
|
auto &scoreB = p->cand->v_charscore;
|
479
|
-
for (int i = 0; i < cand
|
573
|
+
for (int i = 0; i < cand->len; i++) {
|
480
574
|
if (scoreA[i] < scoreB[i] * dirWeight) {
|
481
575
|
scoreA[i] = scoreB[i] * dirWeight;
|
482
576
|
}
|
@@ -489,18 +583,22 @@ private:
|
|
489
583
|
|
490
584
|
void addToResults(PathSegment *seg, std::string str, int i, int nchars, CandMap &candmap) {
|
491
585
|
|
492
|
-
auto it2 = candmap.find(seg->fileId);
|
493
|
-
|
494
|
-
Candidate cand(seg, str.size());
|
495
|
-
seg->cand = &(candmap[seg->fileId]);
|
586
|
+
if (auto it2 = candmap.find(seg->fileId); it2 == candmap.end()) {
|
587
|
+
Candidate *cand = new Candidate(seg, str.size());
|
496
588
|
segsToClean.push_back(seg);
|
497
589
|
candmap[seg->fileId] = cand;
|
590
|
+
seg->cand = cand;
|
498
591
|
}
|
499
592
|
|
500
593
|
for (int j = i; j < i + nchars; j++) {
|
501
|
-
|
502
|
-
|
594
|
+
Candidate &cand = *(candmap[seg->fileId]);
|
595
|
+
if (cand[j] < nchars) {
|
596
|
+
cand.v_charscore[j] = nchars;
|
503
597
|
}
|
504
598
|
}
|
505
599
|
}
|
506
600
|
};
|
601
|
+
|
602
|
+
} // namespace StrIdx
|
603
|
+
|
604
|
+
#endif
|
data/test.rb
CHANGED
@@ -13,7 +13,13 @@ for x in lines
|
|
13
13
|
end
|
14
14
|
|
15
15
|
idx_time = Time.new
|
16
|
-
|
16
|
+
# Time to start the threadpool to process indexing
|
17
|
+
puts "\nIndexing launch time (#{lines.size} files}): #{(idx_time - t).round(4)} seconds"
|
18
|
+
|
19
|
+
idx.waitUntilDone() # Not necessary, will be called by idx.find
|
20
|
+
idx_time = Time.new
|
21
|
+
# Time when all threads have completed
|
22
|
+
puts "\nIndexing completed time (#{lines.size} files}): #{(idx_time - t).round(4)} seconds"
|
17
23
|
|
18
24
|
query = "rngnomadriv"
|
19
25
|
res = idx.find(query)
|