StrIdx 0.1.4 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Makefile +1 -0
- data/README.md +16 -3
- data/demo.cpp +130 -48
- data/exe/stridx.rb +124 -5
- data/flist.txt +0 -5550
- data/rubyext/extconf.rb +1 -1
- data/rubyext/ruby_interf.cpp +95 -6
- data/runserver.rb +22 -0
- data/server.rb +8 -2
- data/stridx.gemspec +1 -5
- data/stridx.hpp +431 -227
- data/thread_pool.hpp +20 -5
- data/unittest.cpp +58 -16
- metadata +3 -3
data/stridx.hpp
CHANGED
@@ -10,6 +10,7 @@
|
|
10
10
|
#include <array>
|
11
11
|
#include <iostream>
|
12
12
|
#include <unordered_map>
|
13
|
+
#include <map>
|
13
14
|
#include <set>
|
14
15
|
#include <algorithm>
|
15
16
|
#include <sstream>
|
@@ -28,10 +29,11 @@ namespace StrIdx {
|
|
28
29
|
class Output {
|
29
30
|
private:
|
30
31
|
int verboseLevel;
|
32
|
+
// TODO: add mutex?
|
31
33
|
|
32
34
|
public:
|
33
35
|
Output(int verb) : verboseLevel(verb) {}
|
34
|
-
Output() : Output(
|
36
|
+
Output() : Output(1) {}
|
35
37
|
~Output() = default;
|
36
38
|
static void print() {}
|
37
39
|
|
@@ -61,61 +63,234 @@ public:
|
|
61
63
|
}
|
62
64
|
};
|
63
65
|
|
66
|
+
// Default logger (verbosity level 1).
// Declared `inline` because this definition lives in a header: without it every
// translation unit that includes the header would define its own `out` symbol.
inline Output out{1};
|
67
|
+
|
68
|
+
/**
 * One node of the character tree.
 *
 * The node is a plain record holding raw pointers. It releases nothing on
 * destruction; memory is freed explicitly through dealloc(), which uses
 * free() for `children` and delete[] for `ids`.
 */
struct CharNode {
  int *ids;           // Ids stored in this node (released with delete[]), or nullptr
  int ids_sz;         // Number of entries in ids
  char c;             // Character this node stands for
  std::uint8_t size;  // Number of entries in children
  CharNode *children; // Child nodes (released with free()), or nullptr

  CharNode() : ids(nullptr), ids_sz(0), c(0), size(0), children(nullptr) {}

  // Defaulted (trivial) destructor: keeps the type trivially copyable, which is
  // required for nodes to be relocated with memcpy/realloc. Use dealloc() to
  // release memory.
  ~CharNode() = default;

  // Reset all fields. Needed for nodes living in raw (malloc'ed) storage, where
  // no constructor has run.
  void init() {
    ids = nullptr;
    ids_sz = 0;
    c = 0;
    size = 0;
    children = nullptr;
  }

  // Recursively release the child array and the id array of this node.
  // Pointers and counters are reset afterwards, so calling it again is harmless.
  void dealloc() {
    if (children != nullptr) {
      for (CharNode *it = children; it != children + size; ++it) {
        it->dealloc();
      }
      free(children);
      children = nullptr;
    }
    size = 0;

    delete[] ids; // delete[] on nullptr is a no-op
    ids = nullptr;
    ids_sz = 0;
  }

  // Gets Id's stored in this node and all child nodes combined
  std::set<int> getIds() const {
    std::set<int> set;
    getIds(set);
    return set;
  }

  // Same as above, but inserts the ids into an existing set
  void getIds(std::set<int> &set) const {
    for (int j = 0; j < ids_sz; j++) {
      set.insert(ids[j]);
    }
    for (const CharNode *it = children; it != children + size; ++it) {
      it->getIds(set);
    }
  }

  // Find if character 'c' is included in children of the node.
  // Returns the matching direct child, or nullptr when there is none.
  CharNode *find(char c) const {
    for (CharNode *it = children; it != children + size; ++it) {
      if (it->c == c) {
        return it;
      }
    }
    return nullptr;
  }
};
|
126
|
+
|
127
|
+
/* Tree type data structure consisting of strings of file path segments
|
128
|
+
* (somewhat like a trie)
|
129
|
+
* For example, Adding one input string "abracadabr4" will add the following (size 2..8 char)
|
130
|
+
* substrings: abracada bracadab racadabr acadabr4 dabr4 abr4 br4 ra
|
131
|
+
* (CharTree::addStr called for each separately)
|
132
|
+
*
|
133
|
+
* Which forms a tree like structure:
|
134
|
+
* [root]-a-b-r-a-c-a-d-a
|
135
|
+
* | | ╰-4
|
136
|
+
* | ╰─c-a-d-a-b-r-4
|
137
|
+
* ╰───b-r-a-c-a-d-a-b
|
138
|
+
* | ╰─4
|
139
|
+
* ╰───r-a-c-a-d-a-b-r
|
140
|
+
* ╰───d-a-b-r-4
|
141
|
+
*
|
142
|
+
* Id's pointing to path segments are stored in nodes that match the end of the inserted substring
|
143
|
+
*
|
144
|
+
* This data structure (CharTree/CharNode) is the main bottleneck in terms of memory consumption.
|
145
|
+
* For a dataset of 84k files with 3.5 million characters there will be about 2.3 million CharNodes.
|
146
|
+
* Therefore, having std::vector's or similar structures with memory overhead is not really an
|
147
|
+
* option.
|
148
|
+
*/
|
149
|
+
class CharTree {
|
150
|
+
Output out;
|
151
|
+
std::mutex mu;
|
152
|
+
|
153
|
+
public:
|
154
|
+
CharNode *root;
|
155
|
+
|
156
|
+
CharTree() { root = new CharNode; }
|
157
|
+
~CharTree() {
|
158
|
+
root->dealloc();
|
159
|
+
delete root;
|
160
|
+
}
|
161
|
+
|
162
|
+
void addStr(std::string s, int id) {
|
163
|
+
if (s.size() < 2) {
|
164
|
+
return;
|
165
|
+
}
|
166
|
+
|
167
|
+
// out.printl("add str:",s);
|
168
|
+
CharNode *cn = root;
|
169
|
+
|
170
|
+
std::lock_guard<std::mutex> mu_lock(mu);
|
171
|
+
|
172
|
+
for (int i = 0; i < s.size() && i < 8; i++) {
|
173
|
+
int c = ((char)s[i]);
|
174
|
+
bool found = false;
|
175
|
+
|
176
|
+
if (cn->size > 0) {
|
177
|
+
// out.printl("(1) cn->size > 0");
|
178
|
+
for (auto it = cn->children; it != cn->children + cn->size; it++) {
|
179
|
+
if (it->c == c) {
|
180
|
+
// out.printl("{", c, "}");
|
181
|
+
found = true;
|
182
|
+
cn = it;
|
183
|
+
break;
|
184
|
+
}
|
185
|
+
}
|
186
|
+
}
|
187
|
+
if (!found) {
|
188
|
+
// auto x = new CharNode[cn->size + 1];
|
189
|
+
CharNode *x = (CharNode *)malloc(sizeof(CharNode) * (cn->size + 1));
|
190
|
+
if (cn->size > 0) {
|
191
|
+
memcpy(x, cn->children, sizeof(CharNode) * (cn->size));
|
192
|
+
free(cn->children);
|
193
|
+
}
|
194
|
+
cn->children = x;
|
195
|
+
CharNode *nn = &(cn->children[cn->size]);
|
196
|
+
nn->init();
|
197
|
+
nn->c = c;
|
198
|
+
cn->size++;
|
199
|
+
cn = nn;
|
200
|
+
}
|
201
|
+
|
202
|
+
if (i == s.size() - 1 && true) {
|
203
|
+
out.printv(4, "i=", i, "s:", s.size(), "|");
|
204
|
+
bool found = false;
|
205
|
+
if (cn->ids_sz > 0) {
|
206
|
+
for (int i = 0; i < cn->ids_sz; i++) {
|
207
|
+
if (cn->ids[i] == id) {
|
208
|
+
found = true;
|
209
|
+
out.printv(3, "found:", id, "\n");
|
210
|
+
}
|
211
|
+
}
|
212
|
+
}
|
213
|
+
if (!found) {
|
214
|
+
// out.print(".a.");
|
215
|
+
auto x = new int[cn->ids_sz + 1];
|
216
|
+
if (cn->ids_sz > 0) {
|
217
|
+
memcpy(x, cn->ids, sizeof(int) * cn->ids_sz);
|
218
|
+
delete[] cn->ids;
|
219
|
+
}
|
220
|
+
cn->ids = x;
|
221
|
+
cn->ids[cn->ids_sz] = id;
|
222
|
+
cn->ids_sz++;
|
223
|
+
out.printv(3, "sz:", cn->ids_sz, ",");
|
224
|
+
}
|
225
|
+
}
|
226
|
+
|
227
|
+
} // END for
|
228
|
+
}
|
229
|
+
|
230
|
+
void debug() { debug("", root); }
|
231
|
+
void debug(std::string trail, CharNode *cn) {
|
232
|
+
|
233
|
+
// if (trail.size() > 6) {
|
234
|
+
// out.print("\n");
|
235
|
+
// return;
|
236
|
+
// }
|
237
|
+
|
238
|
+
if (cn == nullptr) {
|
239
|
+
return;
|
240
|
+
}
|
241
|
+
for (int i = 0; i < cn->size; i++) {
|
242
|
+
CharNode *child = &cn->children[i];
|
243
|
+
out.print("[", child->ids_sz, "]");
|
244
|
+
if (child->size > 0) {
|
245
|
+
debug(trail + child->c, child);
|
246
|
+
} else {
|
247
|
+
out.printl(trail, child->c);
|
248
|
+
// out.printl();
|
249
|
+
}
|
250
|
+
}
|
251
|
+
}
|
252
|
+
};
|
253
|
+
|
64
254
|
// Transforms input string as follows:
|
65
255
|
// '/foo/bar/file1.txt'
|
66
|
-
// => vector{"foo", "bar", "file1.txt"}
|
67
|
-
|
256
|
+
// => vector{"/foo", "/bar", "/file1.txt"}
|
257
|
+
|
258
|
+
// Split `str` into parts so that every occurrence of `delimiter` starts a new
// part and stays attached to the front of that part:
//   splitString("/foo/bar/file1.txt", '/') => {"/foo", "/bar", "/file1.txt"}
// Text before the first delimiter forms a part of its own. Empty input gives
// an empty vector.
// Declared `inline` because the function is defined in a header.
inline std::vector<std::string> splitString(const std::string &str, char delimiter) {
  std::vector<std::string> result;
  std::string part;

  for (char ch : str) {
    // A delimiter closes the part collected so far (if there is one) ...
    if (ch == delimiter && !part.empty()) {
      result.push_back(part);
      part.clear();
    }
    // ... and, like every other character, is appended to the current part
    part += ch;
  }

  // If there's any remaining part after the loop, add it to the result
  if (!part.empty()) {
    result.push_back(part);
  }

  return result;
}
|
102
286
|
|
103
287
|
// Debug
|
104
|
-
void printVector(const std::vector<
|
288
|
+
// Print all values of `vec` to std::cout, each followed by a single space.
// No newline is written. Declared `inline` because the function is defined in
// a header.
inline void printVector(const std::vector<float> &vec) {
  for (const auto &value : vec) {
    std::cout << value << " ";
  }
}
|
109
293
|
|
110
|
-
// Debug
|
111
|
-
[[nodiscard]] std::string charToBinaryString(const char &chr) {
|
112
|
-
std::string result;
|
113
|
-
for (int i = 7; i >= 0; --i) {
|
114
|
-
result += ((chr >> i) & 1) ? '1' : '0';
|
115
|
-
}
|
116
|
-
return result;
|
117
|
-
}
|
118
|
-
|
119
294
|
class Candidate;
|
120
295
|
enum class segmentType { Dir, File };
|
121
296
|
|
@@ -128,7 +303,8 @@ struct PathSegment {
|
|
128
303
|
Candidate *cand;
|
129
304
|
PathSegment *parent;
|
130
305
|
std::mutex mu;
|
131
|
-
|
306
|
+
std::map<std::string, PathSegment *> children;
|
307
|
+
|
132
308
|
segmentType type = segmentType::Dir;
|
133
309
|
PathSegment() : parent(nullptr) {}
|
134
310
|
PathSegment(std::string _str) : str(_str), parent(nullptr) {}
|
@@ -137,9 +313,9 @@ struct PathSegment {
|
|
137
313
|
[[nodiscard]] int size() const {
|
138
314
|
int sz = str.size();
|
139
315
|
PathSegment *cur = parent;
|
140
|
-
// Sum up length of parent segments
|
316
|
+
// Sum up length of parent segments
|
141
317
|
while (cur->parent != nullptr) {
|
142
|
-
sz += cur->str.size()
|
318
|
+
sz += cur->str.size();
|
143
319
|
cur = cur->parent;
|
144
320
|
}
|
145
321
|
return sz;
|
@@ -148,7 +324,11 @@ struct PathSegment {
|
|
148
324
|
|
149
325
|
// Candidate for result in string (filename) search
|
150
326
|
struct Candidate {
|
327
|
+
|
328
|
+
//This holds the subscores for each character in the query string
|
151
329
|
std::vector<float> v_charscore;
|
330
|
+
|
331
|
+
|
152
332
|
PathSegment *seg;
|
153
333
|
int fileId;
|
154
334
|
// The string that this candidate represents
|
@@ -159,6 +339,7 @@ struct Candidate {
|
|
159
339
|
float maxscore;
|
160
340
|
int candLen; // Length of candidate
|
161
341
|
|
342
|
+
~Candidate(){};
|
162
343
|
Candidate(){};
|
163
344
|
Candidate(PathSegment *_seg, int _len) : seg(_seg), len(_len) {
|
164
345
|
// Initialize v_charscores with zeros
|
@@ -166,6 +347,7 @@ struct Candidate {
|
|
166
347
|
candLen = seg->size();
|
167
348
|
}
|
168
349
|
|
350
|
+
// Sum subscores in v_charscore and normalize to get final score
|
169
351
|
[[nodiscard]] float getScore() const {
|
170
352
|
int i = 0;
|
171
353
|
float score = 0.0;
|
@@ -178,6 +360,7 @@ struct Candidate {
|
|
178
360
|
float div2 = len * candLen;
|
179
361
|
float score1 = score / div;
|
180
362
|
float score2 = score / div2;
|
363
|
+
// out.printl("str:",seg->str," len:",len," candLen:", candLen, " score:", score);
|
181
364
|
|
182
365
|
score = score1 * 0.97 + score2 * 0.03;
|
183
366
|
return score;
|
@@ -186,12 +369,8 @@ struct Candidate {
|
|
186
369
|
[[nodiscard]] float operator[](int idx) const { return v_charscore[idx]; }
|
187
370
|
};
|
188
371
|
|
189
|
-
// This seems to give 10x speed improvement over std::unordered_map
|
190
|
-
typedef ankerl::unordered_dense::map<int64_t, std::set<PathSegment *> *> SegMap;
|
191
|
-
// typedef std::unordered_map<int64_t, std::set<PathSegment *> *> SegMap;
|
192
|
-
|
193
372
|
typedef ankerl::unordered_dense::map<int, Candidate *> CandMap;
|
194
|
-
// typedef std::unordered_map<int, Candidate*> CandMap;
|
373
|
+
// typedef std::unordered_map<int, Candidate *> CandMap;
|
195
374
|
|
196
375
|
class StringIndex {
|
197
376
|
private:
|
@@ -199,32 +378,29 @@ private:
|
|
199
378
|
char dirSeparator = '/'; // Usually '/', '\' or '\0' (no separator)
|
200
379
|
int numStrings = 0;
|
201
380
|
|
202
|
-
std::vector<SegMap *> dirmaps;
|
203
|
-
std::array<std::mutex, 9> mts_d; // for dirmaps
|
204
|
-
std::vector<SegMap *> filemaps;
|
205
|
-
std::array<std::mutex, 9> mts_f; // for filemaps
|
206
|
-
|
207
381
|
std::vector<PathSegment *> segsToClean;
|
208
382
|
|
383
|
+
// Maps id's stored in charTree to corresponding PathSegment's
|
209
384
|
std::unordered_map<int, PathSegment *> seglist;
|
385
|
+
std::unordered_map<int, PathSegment *> seglist_dir;
|
386
|
+
std::mutex seglist_mu;
|
387
|
+
|
210
388
|
PathSegment *root;
|
211
389
|
int dirId = 0;
|
212
|
-
float dirWeight = 0
|
390
|
+
float dirWeight = 1.0; // =0.7: Give only 70% of score if match is for a directory
|
213
391
|
|
214
392
|
std::unique_ptr<ThreadPool> pool;
|
215
393
|
Output out{1}; // verbose level = 1
|
394
|
+
std::mutex cm_mu;
|
216
395
|
|
217
396
|
public:
|
397
|
+
CharTree cm; // for files
|
398
|
+
CharTree cm_dir; // for directories
|
218
399
|
StringIndex(char sep) : dirSeparator(sep) {
|
219
400
|
root = new PathSegment();
|
220
401
|
root->parent = nullptr;
|
221
402
|
root->str = "[ROOT]";
|
222
403
|
|
223
|
-
for (int i = 0; i <= 8; i++) {
|
224
|
-
dirmaps.push_back(new SegMap);
|
225
|
-
filemaps.push_back(new SegMap);
|
226
|
-
}
|
227
|
-
|
228
404
|
// Threads between 4 and 6
|
229
405
|
// We don't seem to get any benefit from more than 6 threads even if the hardware supports it
|
230
406
|
int num_threads = std::max((int)std::thread::hardware_concurrency(), 4);
|
@@ -233,45 +409,31 @@ public:
|
|
233
409
|
pool = std::unique_ptr<ThreadPool>(new ThreadPool(num_threads));
|
234
410
|
}
|
235
411
|
|
236
|
-
/* Don't separate path to segments separator=\0.
|
412
|
+
/* Don't separate path to segments when separator=\0.
|
237
413
|
This is slower, but can be used for other data than files also. */
|
238
414
|
StringIndex() : StringIndex('\0') {}
|
239
415
|
|
240
416
|
void setDirSeparator(char sep) { dirSeparator = sep; }
|
241
417
|
void setDirWeight(float val) { dirWeight = val; }
|
242
418
|
|
243
|
-
~StringIndex() {
|
244
|
-
for (auto x : dirmaps) {
|
245
|
-
for (auto y : *x) {
|
246
|
-
y.second->clear();
|
247
|
-
delete (y.second);
|
248
|
-
}
|
249
|
-
x->clear();
|
250
|
-
delete x;
|
251
|
-
}
|
252
|
-
for (auto x : filemaps) {
|
253
|
-
for (auto y : *x) {
|
254
|
-
y.second->clear();
|
255
|
-
delete (y.second);
|
256
|
-
}
|
257
|
-
x->clear();
|
258
|
-
delete x;
|
259
|
-
}
|
260
|
-
clearPathSegmentChildren(root);
|
261
|
-
}
|
419
|
+
// Release the whole PathSegment tree, starting from the root segment.
~StringIndex() {
  clearPathSegmentChildren(root);
}
|
262
420
|
|
263
421
|
void addStrToIndex(std::string filePath, int fileId) {
|
264
422
|
addStrToIndex(filePath, fileId, dirSeparator);
|
265
423
|
}
|
266
424
|
|
267
425
|
void addStrToIndexThreaded(std::string filePath, int fileId) {
|
268
|
-
pool->enqueue([
|
426
|
+
pool->enqueue([filePath, fileId, this] { addStrToIndex(filePath, fileId, dirSeparator); });
|
427
|
+
// addStrToIndex(filePath, fileId, dirSeparator);
|
269
428
|
}
|
270
429
|
// Forwards to the thread pool's waitUntilDone().
void waitUntilReady() const {
  pool->waitUntilDone();
}
|
271
430
|
|
272
431
|
// Forwards to the thread pool's waitUntilDone().
void waitUntilDone() const {
  pool->waitUntilDone();
}
|
273
432
|
|
274
|
-
int size()
|
433
|
+
int size() {
|
434
|
+
std::lock_guard<std::mutex> guard(seglist_mu);
|
435
|
+
return seglist.size();
|
436
|
+
}
|
275
437
|
|
276
438
|
/**
|
277
439
|
* Add a string to the index to be searched for afterwards
|
@@ -283,11 +445,18 @@ public:
|
|
283
445
|
*/
|
284
446
|
|
285
447
|
void addStrToIndex(std::string filePath, int fileId, const char &separator) {
|
286
|
-
|
448
|
+
|
449
|
+
std::lock_guard<std::mutex> guard(cm_mu);
|
450
|
+
|
451
|
+
out.printv(3, "Add file:", filePath, ",", fileId, ",", separator, ",", dirSeparator);
|
287
452
|
|
288
453
|
// If a string with this index has been added already
|
289
|
-
|
290
|
-
|
454
|
+
{
|
455
|
+
std::lock_guard<std::mutex> guard(seglist_mu);
|
456
|
+
if (seglist.find(fileId) != seglist.end()) {
|
457
|
+
out.printl("seglist.find(fileId) != seglist.end()");
|
458
|
+
return;
|
459
|
+
}
|
291
460
|
}
|
292
461
|
|
293
462
|
std::vector<std::string> segs;
|
@@ -319,36 +488,74 @@ public:
|
|
319
488
|
if (auto it = prev->children.find(x); it != prev->children.end()) {
|
320
489
|
p = it->second;
|
321
490
|
prev->mu.unlock();
|
322
|
-
} else {
|
491
|
+
} else { // File or dir not included in tree yet
|
323
492
|
p = new PathSegment(x, fileId);
|
324
493
|
p->parent = prev;
|
325
494
|
// If this is last item in segs, then it is a file.
|
326
495
|
if (_x == std::prev(segs.end())) {
|
327
496
|
p->type = segmentType::File;
|
328
|
-
|
497
|
+
{
|
498
|
+
std::lock_guard<std::mutex> guard(seglist_mu);
|
499
|
+
seglist[fileId] = p;
|
500
|
+
|
501
|
+
for (int i = 0; i < x.size() + 1; i++) {
|
502
|
+
auto s = x.substr(i, std::min(static_cast<size_t>(8), x.size() - i));
|
503
|
+
cm.addStr(s, fileId);
|
504
|
+
}
|
505
|
+
}
|
329
506
|
} else { // otherwise, it is a directory
|
330
507
|
p->type = segmentType::Dir;
|
331
508
|
p->fileId = dirId;
|
332
|
-
|
509
|
+
/* Add "/" to the end of the string so that
|
510
|
+
* /path/to/file will be indexed as:
|
511
|
+
* {"/path/", "/to/", "/file"}
|
512
|
+
*/
|
513
|
+
auto dir_str = x + "/";
|
514
|
+
|
515
|
+
{
|
516
|
+
std::lock_guard<std::mutex> guard(seglist_mu);
|
517
|
+
seglist_dir[dirId] = p;
|
518
|
+
// Files use user input Id. Directories need to have it generated
|
519
|
+
}
|
520
|
+
|
521
|
+
// TODO: Create a function
|
522
|
+
for (int i = 0; i < dir_str.size() + 1; i++) {
|
523
|
+
auto s = dir_str.substr(i, std::min(static_cast<size_t>(8), dir_str.size() - i));
|
524
|
+
cm_dir.addStr(s, dirId);
|
525
|
+
}
|
526
|
+
|
333
527
|
dirId++;
|
334
528
|
}
|
335
529
|
prev->children[x] = p;
|
336
530
|
prev->mu.unlock();
|
337
|
-
|
338
|
-
}
|
531
|
+
} // END of first if
|
339
532
|
|
340
533
|
prev = p;
|
341
534
|
}
|
342
535
|
}
|
343
536
|
|
344
|
-
std::string getString(int id) {
|
537
|
+
// Shorthand for getString(id, false), i.e. the lookup is done in seglist.
std::string getString(int id) {
  const bool isDir = false;
  return getString(id, isDir);
}
|
538
|
+
|
539
|
+
// Reconstruct original filepath from segments
|
540
|
+
std::string getString(int id, bool isDir) {
|
345
541
|
std::string s = "";
|
346
|
-
|
542
|
+
std::lock_guard<std::mutex> guard(seglist_mu);
|
543
|
+
|
544
|
+
PathSegment *seg = nullptr;
|
545
|
+
|
546
|
+
if (isDir) {
|
547
|
+
seg = seglist_dir[id];
|
548
|
+
} else {
|
549
|
+
seg = seglist[id];
|
550
|
+
}
|
347
551
|
s += seg->str;
|
348
552
|
while (seg->parent->parent != nullptr) {
|
349
553
|
seg = seg->parent;
|
350
|
-
s = seg->str +
|
554
|
+
s = seg->str + s;
|
555
|
+
// out.print(seg, "(", seg->str, ")", ",");
|
351
556
|
}
|
557
|
+
// out.printl(s);
|
558
|
+
|
352
559
|
return s;
|
353
560
|
}
|
354
561
|
|
@@ -384,33 +591,59 @@ public:
|
|
384
591
|
@param query String to search for inside the index
|
385
592
|
*/
|
386
593
|
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
|
594
|
+
void searchCharTree(const std::string &query, CandMap &candmap, CharTree &chartr) {
|
595
|
+
|
596
|
+
int last_start = query.size() - 2;
|
597
|
+
// Loop all possible start positions in query string. Indexes [0..(n-3)]
|
598
|
+
for (int start = 0; start <= last_start; start++) {
|
599
|
+
CharNode *cn = chartr.root;
|
600
|
+
|
601
|
+
// select a suffix (substring) starting from start, but cap length to 8 chars
|
602
|
+
int end = std::min(start + 7, ((int)query.size()) - 1);
|
603
|
+
int nchars = end - start + 1;
|
604
|
+
std::string s = query.substr(start, nchars);
|
605
|
+
|
606
|
+
// Loop all chars of the query substring
|
607
|
+
// Traverse from the
|
608
|
+
for (int i = 0; i < s.size(); i++) {
|
609
|
+
char c = s[i];
|
610
|
+
CharNode *x = cn->find(c);
|
611
|
+
if (x != nullptr) {
|
612
|
+
cn = x;
|
613
|
+
// Consider scores only for substrings with size >= 2
|
614
|
+
if (i > 0) {
|
615
|
+
// If we've reached here, size of substring is i+2
|
616
|
+
|
617
|
+
// Get identifiers of files that include substring
|
618
|
+
// query[start..(start+i+1)] ??
|
619
|
+
std::set<int> ids = cn->getIds();
|
620
|
+
for (const int &y : ids) {
|
621
|
+
PathSegment *p = nullptr;
|
622
|
+
|
623
|
+
// Searching in file segments
|
624
|
+
// (or no file/dir separation)
|
625
|
+
if (&chartr == &cm) {
|
626
|
+
p = seglist[y];
|
627
|
+
} else {
|
628
|
+
// Searching in dir segments
|
629
|
+
p = seglist_dir[y];
|
630
|
+
}
|
631
|
+
assert(p != nullptr);
|
632
|
+
addToResults(p, query, start, i + 1, candmap);
|
633
|
+
}
|
634
|
+
}
|
635
|
+
} else {
|
636
|
+
// assert(cn->ids_sz < 1); // TODO: should not come here?
|
637
|
+
break;
|
638
|
+
}
|
639
|
+
}
|
408
640
|
}
|
409
|
-
|
641
|
+
}
|
410
642
|
|
643
|
+
std::vector<std::pair<float, int>> candidatesToVec(CandMap &candmap) {
|
411
644
|
// Form return result, 2d array with file id's and scores
|
412
645
|
std::vector<std::pair<float, int>> results;
|
413
|
-
for (auto &[fid, cand] :
|
646
|
+
for (auto &[fid, cand] : candmap) {
|
414
647
|
std::pair<float, int> v;
|
415
648
|
float sc = cand->getScore();
|
416
649
|
v.first = sc;
|
@@ -419,164 +652,134 @@ public:
|
|
419
652
|
delete cand;
|
420
653
|
}
|
421
654
|
|
422
|
-
for (auto &[fid, cand] : dirCandMap) {
|
423
|
-
delete cand;
|
424
|
-
}
|
425
|
-
|
426
655
|
// Sort highest score first
|
427
656
|
std::sort(results.begin(), results.end(),
|
428
657
|
[](std::pair<float, int> a, std::pair<float, int> b) { return a.first > b.first; });
|
429
658
|
return results;
|
430
659
|
}
|
431
660
|
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
key = key | static_cast<int64_t>(str[i + i_char]);
|
437
|
-
if (i_char < nchars - 1) {
|
438
|
-
// Shift 8 bits to the left except on the last iteration
|
439
|
-
key = key << 8;
|
440
|
-
}
|
441
|
-
}
|
442
|
-
return key;
|
443
|
-
}
|
661
|
+
std::vector<std::pair<float, int>> findDirectories(std::string query) {
|
662
|
+
CandMap dirCandMap;
|
663
|
+
auto &candmap = dirCandMap;
|
664
|
+
waitUntilDone();
|
444
665
|
|
445
|
-
|
666
|
+
searchCharTree(query, dirCandMap, cm_dir);
|
667
|
+
addParentScores(dirCandMap);
|
668
|
+
auto results = candidatesToVec(dirCandMap);
|
446
669
|
|
447
|
-
|
448
|
-
for (const auto &[key, value] : (*filemaps[nchars])) {
|
449
|
-
int64_t x;
|
450
|
-
x = key;
|
451
|
-
int multip = nchars * 8;
|
452
|
-
for (int i = 0; i <= nchars; i++) {
|
453
|
-
char c = (x >> multip) & 255;
|
454
|
-
std::cout << c;
|
455
|
-
multip -= 8;
|
456
|
-
}
|
457
|
-
std::cout << "\n";
|
458
|
-
// for (auto y : *value) {
|
459
|
-
// std::cout << y << " ";
|
460
|
-
// }
|
461
|
-
// std::cout << "\n";
|
462
|
-
}
|
670
|
+
return results;
|
463
671
|
}
|
464
672
|
|
465
|
-
|
466
|
-
|
467
|
-
if (p->children.size() > 0) {
|
468
|
-
for (auto x : p->children) {
|
469
|
-
clearPathSegmentChildren(x.second);
|
470
|
-
}
|
471
|
-
}
|
472
|
-
delete p;
|
673
|
+
std::vector<std::pair<float, std::string>> findFilesAndDirectories(std::string query) {
|
674
|
+
return findFilesAndDirectories(query, true, true);
|
473
675
|
}
|
474
676
|
|
475
|
-
|
476
|
-
|
477
|
-
// This function generates int64 representations (keys) of all substrings of size 2..8 in that
|
478
|
-
// path segment and stores pointer to p in hash tables using these int values as keys.
|
677
|
+
std::vector<std::pair<float, std::string>>
|
678
|
+
findFilesAndDirectories(std::string query, bool includeFiles, bool includeDirs) {
|
479
679
|
|
480
|
-
|
481
|
-
|
680
|
+
CandMap fileCandMap;
|
681
|
+
CandMap dirCandMap;
|
682
|
+
waitUntilDone();
|
683
|
+
std::vector<std::pair<float, std::string>> results;
|
482
684
|
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
}
|
487
|
-
if (static_cast<int>(p->str.size()) < maxChars) {
|
488
|
-
maxChars = p->str.size();
|
685
|
+
if (includeFiles) {
|
686
|
+
searchCharTree(query, fileCandMap, cm);
|
687
|
+
// out.printl("size:",fileCandMap.size());
|
489
688
|
}
|
490
689
|
|
491
|
-
|
690
|
+
searchCharTree(query, dirCandMap, cm_dir);
|
492
691
|
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
map = filemaps[sublen];
|
497
|
-
mu = &mts_f[sublen];
|
498
|
-
} else {
|
499
|
-
map = dirmaps[sublen];
|
500
|
-
mu = &mts_d[sublen];
|
501
|
-
}
|
692
|
+
if (includeFiles) {
|
693
|
+
addParentScores(fileCandMap);
|
694
|
+
}
|
502
695
|
|
503
|
-
|
696
|
+
if (includeDirs) {
|
697
|
+
addParentScores(dirCandMap);
|
698
|
+
}
|
699
|
+
|
700
|
+
for (auto seg : segsToClean) {
|
701
|
+
seg->cand = nullptr;
|
702
|
+
}
|
703
|
+
segsToClean.clear();
|
504
704
|
|
505
|
-
|
506
|
-
|
507
|
-
|
705
|
+
// TODO: Need to call this just to delete candidates
|
706
|
+
auto res_dir = candidatesToVec(dirCandMap);
|
707
|
+
if (includeDirs) {
|
708
|
+
for (const auto &[score, id] : res_dir) {
|
709
|
+
results.push_back(std::pair<float, std::string>{score, getString(id, true)});
|
508
710
|
}
|
711
|
+
}
|
509
712
|
|
510
|
-
|
511
|
-
|
512
|
-
|
513
|
-
|
713
|
+
if (includeFiles) {
|
714
|
+
auto res_file = candidatesToVec(fileCandMap);
|
715
|
+
// out.printl("size2:",fileCandMap.size());
|
716
|
+
for (const auto &[score, id] : res_file) {
|
514
717
|
|
515
|
-
//
|
516
|
-
|
517
|
-
if (it == map->end()) {
|
518
|
-
(*map)[key] = new std::set<PathSegment *>;
|
519
|
-
}
|
520
|
-
(*map)[key]->insert(p);
|
718
|
+
// out.print("|",getString(id),"|");
|
719
|
+
results.push_back(std::pair<float, std::string>{score, getString(id)});
|
521
720
|
}
|
522
|
-
mu->unlock();
|
523
721
|
}
|
722
|
+
|
723
|
+
// Sort highest score first
|
724
|
+
std::sort(results.begin(), results.end(),
|
725
|
+
[](std::pair<float, std::string> a, std::pair<float, std::string> b) {
|
726
|
+
return a.first > b.first;
|
727
|
+
});
|
728
|
+
return results;
|
524
729
|
}
|
525
730
|
|
526
|
-
//
|
527
|
-
|
528
|
-
[[nodiscard]] std::vector<PathSegment *> findSimilarForNgram(std::string str, int i, int nchars,
|
529
|
-
SegMap &map) const {
|
731
|
+
// TODO: delete?
|
732
|
+
// Alias of findFiles().
std::vector<std::pair<float, int>> findSimilar(std::string query) {
  return findFiles(query);
}
|
530
733
|
|
531
|
-
|
532
|
-
std::vector<PathSegment *> res;
|
734
|
+
std::vector<std::pair<float, int>> findFiles(std::string query) {
|
533
735
|
|
534
|
-
|
535
|
-
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
542
|
-
}
|
543
|
-
}
|
544
|
-
return res;
|
545
|
-
}
|
736
|
+
CandMap fileCandMap;
|
737
|
+
CandMap dirCandMap;
|
738
|
+
auto &candmap = fileCandMap;
|
739
|
+
waitUntilDone();
|
740
|
+
|
741
|
+
searchCharTree(query, fileCandMap, cm);
|
742
|
+
searchCharTree(query, dirCandMap, cm_dir);
|
743
|
+
addParentScores(fileCandMap);
|
546
744
|
|
547
|
-
|
548
|
-
|
549
|
-
) {
|
550
|
-
int maxChars = 8;
|
551
|
-
int minChars = 2;
|
552
|
-
if (static_cast<int>(query.size()) < maxChars) {
|
553
|
-
maxChars = query.size();
|
745
|
+
for (auto seg : segsToClean) {
|
746
|
+
seg->cand = nullptr;
|
554
747
|
}
|
748
|
+
segsToClean.clear();
|
555
749
|
|
556
|
-
|
557
|
-
|
558
|
-
int count = query.size() - sublen + 1;
|
750
|
+
auto results = candidatesToVec(fileCandMap);
|
751
|
+
auto tmp = candidatesToVec(dirCandMap); // TODO: call just to release memory
|
559
752
|
|
560
|
-
|
561
|
-
|
562
|
-
std::vector<PathSegment *> res = findSimilarForNgram(query, i, sublen, *(map[sublen]));
|
753
|
+
return results;
|
754
|
+
}
|
563
755
|
|
564
|
-
|
565
|
-
|
566
|
-
|
756
|
+
private:
|
757
|
+
// Recursively delete segment `p` and every segment below it.
// A null `p` is ignored.
void clearPathSegmentChildren(PathSegment *p) {
  if (p == nullptr) {
    return;
  }
  // Iterate by reference: copying each map entry would also copy its string key
  for (const auto &child : p->children) {
    clearPathSegmentChildren(child.second);
  }
  delete p;
}
|
570
765
|
|
571
766
|
// Add parent directories scores to files
|
572
|
-
void
|
767
|
+
void addParentScores(CandMap &fileCandMap) {
|
573
768
|
|
574
769
|
for (auto &[fid, cand] : fileCandMap) {
|
575
770
|
PathSegment *p = cand->seg->parent;
|
576
771
|
while (p->parent != nullptr) {
|
577
772
|
if (p->cand != nullptr) {
|
773
|
+
|
578
774
|
auto &scoreA = cand->v_charscore;
|
579
775
|
auto &scoreB = p->cand->v_charscore;
|
776
|
+
|
777
|
+
// out.print("[");
|
778
|
+
// printVector(scoreA);
|
779
|
+
// out.print(",");
|
780
|
+
// printVector(scoreB);
|
781
|
+
// out.print(",");
|
782
|
+
// out.print("]");
|
580
783
|
for (int i = 0; i < cand->len; i++) {
|
581
784
|
if (scoreA[i] < scoreB[i] * dirWeight) {
|
582
785
|
scoreA[i] = scoreB[i] * dirWeight;
|
@@ -592,6 +795,7 @@ private:
|
|
592
795
|
|
593
796
|
if (auto it2 = candmap.find(seg->fileId); it2 == candmap.end()) {
|
594
797
|
Candidate *cand = new Candidate(seg, str.size());
|
798
|
+
// out.printl("new cand:", seg->str, ",", seg, ",", seg->parent, ",", seg->parent->parent);
|
595
799
|
segsToClean.push_back(seg);
|
596
800
|
candmap[seg->fileId] = cand;
|
597
801
|
seg->cand = cand;
|