StrIdx 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Makefile +1 -0
- data/README.md +17 -3
- data/demo.cpp +36 -5
- data/exe/stridx.rb +6 -1
- data/flist.txt +0 -5550
- data/rubyext/ruby_interf.cpp +58 -2
- data/runserver.rb +20 -0
- data/server.rb +7 -2
- data/stridx.gemspec +1 -5
- data/stridx.hpp +411 -226
- data/thread_pool.hpp +20 -5
- data/unittest.cpp +58 -16
- metadata +3 -3
data/stridx.hpp
CHANGED
@@ -10,6 +10,7 @@
|
|
10
10
|
#include <array>
|
11
11
|
#include <iostream>
|
12
12
|
#include <unordered_map>
|
13
|
+
#include <map>
|
13
14
|
#include <set>
|
14
15
|
#include <algorithm>
|
15
16
|
#include <sstream>
|
@@ -28,10 +29,11 @@ namespace StrIdx {
|
|
28
29
|
class Output {
|
29
30
|
private:
|
30
31
|
int verboseLevel;
|
32
|
+
// TODO: add mutex?
|
31
33
|
|
32
34
|
public:
|
33
35
|
Output(int verb) : verboseLevel(verb) {}
|
34
|
-
Output() : Output(
|
36
|
+
Output() : Output(1) {}
|
35
37
|
~Output() = default;
|
36
38
|
static void print() {}
|
37
39
|
|
@@ -61,61 +63,234 @@ public:
|
|
61
63
|
}
|
62
64
|
};
|
63
65
|
|
66
|
+
Output out{1};
|
67
|
+
|
68
|
+
struct CharNode {
|
69
|
+
int *ids;
|
70
|
+
int ids_sz;
|
71
|
+
char c;
|
72
|
+
std::uint8_t size;
|
73
|
+
CharNode *children;
|
74
|
+
CharNode() : ids(nullptr), ids_sz(0), c(0), size(0), children(nullptr) {}
|
75
|
+
|
76
|
+
void init() {
|
77
|
+
ids = nullptr;
|
78
|
+
ids_sz = 0;
|
79
|
+
c = 0;
|
80
|
+
size = 0;
|
81
|
+
children = nullptr;
|
82
|
+
}
|
83
|
+
|
84
|
+
void dealloc() {
|
85
|
+
if (children != nullptr) {
|
86
|
+
for (CharNode *it = children; it != children + size; it++) {
|
87
|
+
it->dealloc();
|
88
|
+
}
|
89
|
+
free(children);
|
90
|
+
}
|
91
|
+
delete[] ids;
|
92
|
+
}
|
93
|
+
|
94
|
+
~CharNode() {}
|
95
|
+
|
96
|
+
// Gets the IDs stored in this node and in all of its descendant nodes, combined into one set
|
97
|
+
std::set<int> getIds() {
|
98
|
+
std::set<int> set;
|
99
|
+
getIds(set);
|
100
|
+
return set;
|
101
|
+
}
|
102
|
+
|
103
|
+
void getIds(std::set<int> &set) {
|
104
|
+
for (int j = 0; j < ids_sz; j++) {
|
105
|
+
set.insert(ids[j]);
|
106
|
+
}
|
107
|
+
for (CharNode *it = children; it != children + size; it++) {
|
108
|
+
it->getIds(set);
|
109
|
+
}
|
110
|
+
}
|
111
|
+
|
112
|
+
// Find the child of this node whose character is 'c' (returns nullptr if there is none)
|
113
|
+
CharNode *find(char c) {
|
114
|
+
CharNode *ret = nullptr;
|
115
|
+
if (size > 0) {
|
116
|
+
for (auto it = children; it != children + size; it++) {
|
117
|
+
if (it->c == c) {
|
118
|
+
ret = it;
|
119
|
+
break;
|
120
|
+
}
|
121
|
+
}
|
122
|
+
}
|
123
|
+
return ret;
|
124
|
+
}
|
125
|
+
};
|
126
|
+
|
127
|
+
/* Tree type data structure consisting of strings of file path segments
|
128
|
+
* (somewhat like a trie)
|
129
|
+
* For example, adding one input string "abracadabr4" will add the following (size 2..8 char)
|
130
|
+
* substrings: abracada bracadab racadabr acadabr4 dabr4 abr4 br4 ra
|
131
|
+
* (CharTree::addStr is called for each one separately)
|
132
|
+
*
|
133
|
+
* Which forms a tree like structure:
|
134
|
+
* [root]-a-b-r-a-c-a-d-a
|
135
|
+
* | | ╰-4
|
136
|
+
* | ╰─c-a-d-a-b-r-4
|
137
|
+
* ╰───b-r-a-c-a-d-a-b
|
138
|
+
* | ╰─4
|
139
|
+
* ╰───r-a-c-a-d-a-b-r
|
140
|
+
* ╰───d-a-b-r-4
|
141
|
+
*
|
142
|
+
* IDs pointing to path segments are stored in the node that matches the end of each inserted substring
|
143
|
+
*
|
144
|
+
* This data structure (CharTree/CharNode) is the main bottleneck in terms of memory consumption.
|
145
|
+
* For a dataset of 84k files with 3.5 million characters there will be about 2.3 million CharNodes.
|
146
|
+
* Therefore, having std::vector's or similar structures with memory overhead is not really an
|
147
|
+
* option.
|
148
|
+
*/
|
149
|
+
class CharTree {
|
150
|
+
Output out;
|
151
|
+
std::mutex mu;
|
152
|
+
|
153
|
+
public:
|
154
|
+
CharNode *root;
|
155
|
+
|
156
|
+
CharTree() { root = new CharNode; }
|
157
|
+
~CharTree() {
|
158
|
+
root->dealloc();
|
159
|
+
delete root;
|
160
|
+
}
|
161
|
+
|
162
|
+
void addStr(std::string s, int id) {
|
163
|
+
if (s.size() < 2) {
|
164
|
+
return;
|
165
|
+
}
|
166
|
+
|
167
|
+
// out.printl("add str:",s);
|
168
|
+
CharNode *cn = root;
|
169
|
+
|
170
|
+
std::lock_guard<std::mutex> mu_lock(mu);
|
171
|
+
|
172
|
+
for (int i = 0; i < s.size() && i < 8; i++) {
|
173
|
+
int c = ((char)s[i]);
|
174
|
+
bool found = false;
|
175
|
+
|
176
|
+
if (cn->size > 0) {
|
177
|
+
// out.printl("(1) cn->size > 0");
|
178
|
+
for (auto it = cn->children; it != cn->children + cn->size; it++) {
|
179
|
+
if (it->c == c) {
|
180
|
+
// out.printl("{", c, "}");
|
181
|
+
found = true;
|
182
|
+
cn = it;
|
183
|
+
break;
|
184
|
+
}
|
185
|
+
}
|
186
|
+
}
|
187
|
+
if (!found) {
|
188
|
+
// auto x = new CharNode[cn->size + 1];
|
189
|
+
CharNode *x = (CharNode *)malloc(sizeof(CharNode) * (cn->size + 1));
|
190
|
+
if (cn->size > 0) {
|
191
|
+
memcpy(x, cn->children, sizeof(CharNode) * (cn->size));
|
192
|
+
free(cn->children);
|
193
|
+
}
|
194
|
+
cn->children = x;
|
195
|
+
CharNode *nn = &(cn->children[cn->size]);
|
196
|
+
nn->init();
|
197
|
+
nn->c = c;
|
198
|
+
cn->size++;
|
199
|
+
cn = nn;
|
200
|
+
}
|
201
|
+
|
202
|
+
if (i == s.size() - 1 && true) {
|
203
|
+
out.printv(4, "i=", i, "s:", s.size(), "|");
|
204
|
+
bool found = false;
|
205
|
+
if (cn->ids_sz > 0) {
|
206
|
+
for (int i = 0; i < cn->ids_sz; i++) {
|
207
|
+
if (cn->ids[i] == id) {
|
208
|
+
found = true;
|
209
|
+
out.printv(3, "found:", id, "\n");
|
210
|
+
}
|
211
|
+
}
|
212
|
+
}
|
213
|
+
if (!found) {
|
214
|
+
// out.print(".a.");
|
215
|
+
auto x = new int[cn->ids_sz + 1];
|
216
|
+
if (cn->ids_sz > 0) {
|
217
|
+
memcpy(x, cn->ids, sizeof(int) * cn->ids_sz);
|
218
|
+
delete[] cn->ids;
|
219
|
+
}
|
220
|
+
cn->ids = x;
|
221
|
+
cn->ids[cn->ids_sz] = id;
|
222
|
+
cn->ids_sz++;
|
223
|
+
out.printv(3, "sz:", cn->ids_sz, ",");
|
224
|
+
}
|
225
|
+
}
|
226
|
+
|
227
|
+
} // END for
|
228
|
+
}
|
229
|
+
|
230
|
+
void debug() { debug("", root); }
|
231
|
+
void debug(std::string trail, CharNode *cn) {
|
232
|
+
|
233
|
+
// if (trail.size() > 6) {
|
234
|
+
// out.print("\n");
|
235
|
+
// return;
|
236
|
+
// }
|
237
|
+
|
238
|
+
if (cn == nullptr) {
|
239
|
+
return;
|
240
|
+
}
|
241
|
+
for (int i = 0; i < cn->size; i++) {
|
242
|
+
CharNode *child = &cn->children[i];
|
243
|
+
out.print("[", child->ids_sz, "]");
|
244
|
+
if (child->size > 0) {
|
245
|
+
debug(trail + child->c, child);
|
246
|
+
} else {
|
247
|
+
out.printl(trail, child->c);
|
248
|
+
// out.printl();
|
249
|
+
}
|
250
|
+
}
|
251
|
+
}
|
252
|
+
};
|
253
|
+
|
64
254
|
// Transforms input string as follows:
|
65
255
|
// '/foo/bar/file1.txt'
|
66
|
-
// => vector{"foo", "bar", "file1.txt"}
|
67
|
-
|
256
|
+
// => vector{"/foo", "/bar", "/file1.txt"}
|
257
|
+
|
258
|
+
std::vector<std::string> splitString(const std::string &str, char delimiter) {
|
68
259
|
std::vector<std::string> result;
|
69
|
-
std::
|
70
|
-
std::string item;
|
260
|
+
std::string part;
|
71
261
|
|
72
|
-
|
73
|
-
if (
|
74
|
-
|
262
|
+
for (char ch : str) {
|
263
|
+
if (ch == delimiter) {
|
264
|
+
if (part.size() > 0) {
|
265
|
+
result.push_back(part);
|
266
|
+
}
|
267
|
+
part.clear(); // Start a new part
|
268
|
+
part += ch;
|
269
|
+
} else {
|
270
|
+
part += ch;
|
75
271
|
}
|
76
272
|
}
|
77
273
|
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
// Convert int64_t to binary string
|
82
|
-
[[nodiscard]] std::string int64ToBinaryString(const int64_t &num) {
|
83
|
-
std::string result;
|
84
|
-
for (int i = 63; i >= 0; --i) {
|
85
|
-
result += ((num >> i) & 1) ? '1' : '0';
|
274
|
+
// If there's any remaining part after the loop, add it to the result
|
275
|
+
if (!part.empty()) {
|
276
|
+
result.push_back(part);
|
86
277
|
}
|
87
|
-
return result;
|
88
|
-
}
|
89
278
|
|
90
|
-
//
|
91
|
-
|
92
|
-
|
93
|
-
std::
|
94
|
-
|
95
|
-
|
96
|
-
char c = (key >> multip) & 255;
|
97
|
-
str.push_back(c);
|
98
|
-
multip -= 8;
|
99
|
-
}
|
100
|
-
return str;
|
279
|
+
// for (const auto &value : result) {
|
280
|
+
// std::cout << value << "|";
|
281
|
+
// }
|
282
|
+
// std::cout << std::endl;
|
283
|
+
|
284
|
+
return result;
|
101
285
|
}
|
102
286
|
|
103
287
|
// Debug
|
104
|
-
void printVector(const std::vector<
|
288
|
+
void printVector(const std::vector<float> &vec) {
|
105
289
|
for (const auto &value : vec) {
|
106
290
|
std::cout << value << " ";
|
107
291
|
}
|
108
292
|
}
|
109
293
|
|
110
|
-
// Debug
|
111
|
-
[[nodiscard]] std::string charToBinaryString(const char &chr) {
|
112
|
-
std::string result;
|
113
|
-
for (int i = 7; i >= 0; --i) {
|
114
|
-
result += ((chr >> i) & 1) ? '1' : '0';
|
115
|
-
}
|
116
|
-
return result;
|
117
|
-
}
|
118
|
-
|
119
294
|
class Candidate;
|
120
295
|
enum class segmentType { Dir, File };
|
121
296
|
|
@@ -128,7 +303,8 @@ struct PathSegment {
|
|
128
303
|
Candidate *cand;
|
129
304
|
PathSegment *parent;
|
130
305
|
std::mutex mu;
|
131
|
-
|
306
|
+
std::map<std::string, PathSegment *> children;
|
307
|
+
|
132
308
|
segmentType type = segmentType::Dir;
|
133
309
|
PathSegment() : parent(nullptr) {}
|
134
310
|
PathSegment(std::string _str) : str(_str), parent(nullptr) {}
|
@@ -137,9 +313,9 @@ struct PathSegment {
|
|
137
313
|
[[nodiscard]] int size() const {
|
138
314
|
int sz = str.size();
|
139
315
|
PathSegment *cur = parent;
|
140
|
-
// Sum up length of parent segments
|
316
|
+
// Sum up length of parent segments
|
141
317
|
while (cur->parent != nullptr) {
|
142
|
-
sz += cur->str.size()
|
318
|
+
sz += cur->str.size();
|
143
319
|
cur = cur->parent;
|
144
320
|
}
|
145
321
|
return sz;
|
@@ -159,6 +335,7 @@ struct Candidate {
|
|
159
335
|
float maxscore;
|
160
336
|
int candLen; // Length of candidate
|
161
337
|
|
338
|
+
~Candidate(){};
|
162
339
|
Candidate(){};
|
163
340
|
Candidate(PathSegment *_seg, int _len) : seg(_seg), len(_len) {
|
164
341
|
// Initialize v_charscores with zeros
|
@@ -178,6 +355,7 @@ struct Candidate {
|
|
178
355
|
float div2 = len * candLen;
|
179
356
|
float score1 = score / div;
|
180
357
|
float score2 = score / div2;
|
358
|
+
// out.printl("str:",seg->str," len:",len," candLen:", candLen, " score:", score);
|
181
359
|
|
182
360
|
score = score1 * 0.97 + score2 * 0.03;
|
183
361
|
return score;
|
@@ -186,12 +364,8 @@ struct Candidate {
|
|
186
364
|
[[nodiscard]] float operator[](int idx) const { return v_charscore[idx]; }
|
187
365
|
};
|
188
366
|
|
189
|
-
// This seems to give 10x speed improvement over std::unordered_map
|
190
|
-
typedef ankerl::unordered_dense::map<int64_t, std::set<PathSegment *> *> SegMap;
|
191
|
-
// typedef std::unordered_map<int64_t, std::set<PathSegment *> *> SegMap;
|
192
|
-
|
193
367
|
typedef ankerl::unordered_dense::map<int, Candidate *> CandMap;
|
194
|
-
// typedef std::unordered_map<int, Candidate*> CandMap;
|
368
|
+
// typedef std::unordered_map<int, Candidate *> CandMap;
|
195
369
|
|
196
370
|
class StringIndex {
|
197
371
|
private:
|
@@ -199,32 +373,28 @@ private:
|
|
199
373
|
char dirSeparator = '/'; // Usually '/', '\' or '\0' (no separator)
|
200
374
|
int numStrings = 0;
|
201
375
|
|
202
|
-
std::vector<SegMap *> dirmaps;
|
203
|
-
std::array<std::mutex, 9> mts_d; // for dirmaps
|
204
|
-
std::vector<SegMap *> filemaps;
|
205
|
-
std::array<std::mutex, 9> mts_f; // for filemaps
|
206
|
-
|
207
376
|
std::vector<PathSegment *> segsToClean;
|
208
377
|
|
209
378
|
std::unordered_map<int, PathSegment *> seglist;
|
379
|
+
std::unordered_map<int, PathSegment *> seglist_dir;
|
380
|
+
std::mutex seglist_mu;
|
381
|
+
|
210
382
|
PathSegment *root;
|
211
383
|
int dirId = 0;
|
212
384
|
float dirWeight = 0.7; // Give only 70% of score if match is for a directory
|
213
385
|
|
214
386
|
std::unique_ptr<ThreadPool> pool;
|
215
387
|
Output out{1}; // verbose level = 1
|
388
|
+
std::mutex cm_mu;
|
216
389
|
|
217
390
|
public:
|
391
|
+
CharTree cm; // for files
|
392
|
+
CharTree cm_dir; // for directories
|
218
393
|
StringIndex(char sep) : dirSeparator(sep) {
|
219
394
|
root = new PathSegment();
|
220
395
|
root->parent = nullptr;
|
221
396
|
root->str = "[ROOT]";
|
222
397
|
|
223
|
-
for (int i = 0; i <= 8; i++) {
|
224
|
-
dirmaps.push_back(new SegMap);
|
225
|
-
filemaps.push_back(new SegMap);
|
226
|
-
}
|
227
|
-
|
228
398
|
// Threads between 4 and 6
|
229
399
|
// We don't seem to get any benefit from more than 6 threads even if the hardware supports it
|
230
400
|
int num_threads = std::max((int)std::thread::hardware_concurrency(), 4);
|
@@ -233,45 +403,31 @@ public:
|
|
233
403
|
pool = std::unique_ptr<ThreadPool>(new ThreadPool(num_threads));
|
234
404
|
}
|
235
405
|
|
236
|
-
/* Don't separate path to segments separator=\0.
|
406
|
+
/* Don't split the path into segments when separator=\0.
|
237
407
|
This is slower, but can be used for other data than files also. */
|
238
408
|
StringIndex() : StringIndex('\0') {}
|
239
409
|
|
240
410
|
void setDirSeparator(char sep) { dirSeparator = sep; }
|
241
411
|
void setDirWeight(float val) { dirWeight = val; }
|
242
412
|
|
243
|
-
~StringIndex() {
|
244
|
-
for (auto x : dirmaps) {
|
245
|
-
for (auto y : *x) {
|
246
|
-
y.second->clear();
|
247
|
-
delete (y.second);
|
248
|
-
}
|
249
|
-
x->clear();
|
250
|
-
delete x;
|
251
|
-
}
|
252
|
-
for (auto x : filemaps) {
|
253
|
-
for (auto y : *x) {
|
254
|
-
y.second->clear();
|
255
|
-
delete (y.second);
|
256
|
-
}
|
257
|
-
x->clear();
|
258
|
-
delete x;
|
259
|
-
}
|
260
|
-
clearPathSegmentChildren(root);
|
261
|
-
}
|
413
|
+
~StringIndex() { clearPathSegmentChildren(root); }
|
262
414
|
|
263
415
|
void addStrToIndex(std::string filePath, int fileId) {
|
264
416
|
addStrToIndex(filePath, fileId, dirSeparator);
|
265
417
|
}
|
266
418
|
|
267
419
|
void addStrToIndexThreaded(std::string filePath, int fileId) {
|
268
|
-
pool->enqueue([
|
420
|
+
pool->enqueue([filePath, fileId, this] { addStrToIndex(filePath, fileId, dirSeparator); });
|
421
|
+
// addStrToIndex(filePath, fileId, dirSeparator);
|
269
422
|
}
|
270
423
|
void waitUntilReady() const { pool->waitUntilDone(); }
|
271
424
|
|
272
425
|
void waitUntilDone() const { pool->waitUntilDone(); }
|
273
426
|
|
274
|
-
int size()
|
427
|
+
int size() {
|
428
|
+
std::lock_guard<std::mutex> guard(seglist_mu);
|
429
|
+
return seglist.size();
|
430
|
+
}
|
275
431
|
|
276
432
|
/**
|
277
433
|
* Add a string to the index to be searched for afterwards
|
@@ -283,11 +439,18 @@ public:
|
|
283
439
|
*/
|
284
440
|
|
285
441
|
void addStrToIndex(std::string filePath, int fileId, const char &separator) {
|
286
|
-
|
442
|
+
|
443
|
+
std::lock_guard<std::mutex> guard(cm_mu);
|
444
|
+
|
445
|
+
out.printv(3, "Add file:", filePath, ",", fileId, ",", separator, ",", dirSeparator);
|
287
446
|
|
288
447
|
// If a string with this index has been added already
|
289
|
-
|
290
|
-
|
448
|
+
{
|
449
|
+
std::lock_guard<std::mutex> guard(seglist_mu);
|
450
|
+
if (seglist.find(fileId) != seglist.end()) {
|
451
|
+
out.printl("seglist.find(fileId) != seglist.end()");
|
452
|
+
return;
|
453
|
+
}
|
291
454
|
}
|
292
455
|
|
293
456
|
std::vector<std::string> segs;
|
@@ -319,36 +482,74 @@ public:
|
|
319
482
|
if (auto it = prev->children.find(x); it != prev->children.end()) {
|
320
483
|
p = it->second;
|
321
484
|
prev->mu.unlock();
|
322
|
-
} else {
|
485
|
+
} else { // File or dir not included in tree yet
|
323
486
|
p = new PathSegment(x, fileId);
|
324
487
|
p->parent = prev;
|
325
488
|
// If this is last item in segs, then it is a file.
|
326
489
|
if (_x == std::prev(segs.end())) {
|
327
490
|
p->type = segmentType::File;
|
328
|
-
|
491
|
+
{
|
492
|
+
std::lock_guard<std::mutex> guard(seglist_mu);
|
493
|
+
seglist[fileId] = p;
|
494
|
+
|
495
|
+
for (int i = 0; i < x.size() + 1; i++) {
|
496
|
+
auto s = x.substr(i, std::min(static_cast<size_t>(8), x.size() - i));
|
497
|
+
cm.addStr(s, fileId);
|
498
|
+
}
|
499
|
+
}
|
329
500
|
} else { // otherwise, it is a directory
|
330
501
|
p->type = segmentType::Dir;
|
331
502
|
p->fileId = dirId;
|
332
|
-
|
503
|
+
/* Add "/" to the end of the string so that
|
504
|
+
* /path/to/file will be indexed as:
|
505
|
+
* {"/path/", "/to/", "/file"}
|
506
|
+
*/
|
507
|
+
auto dir_str = x + "/";
|
508
|
+
|
509
|
+
{
|
510
|
+
std::lock_guard<std::mutex> guard(seglist_mu);
|
511
|
+
seglist_dir[dirId] = p;
|
512
|
+
// Files use the ID given by the caller. Directories need to have theirs generated (dirId)
|
513
|
+
}
|
514
|
+
|
515
|
+
// TODO: Create a function
|
516
|
+
for (int i = 0; i < dir_str.size() + 1; i++) {
|
517
|
+
auto s = dir_str.substr(i, std::min(static_cast<size_t>(8), dir_str.size() - i));
|
518
|
+
cm_dir.addStr(s, dirId);
|
519
|
+
}
|
520
|
+
|
333
521
|
dirId++;
|
334
522
|
}
|
335
523
|
prev->children[x] = p;
|
336
524
|
prev->mu.unlock();
|
337
|
-
|
338
|
-
}
|
525
|
+
} // END of first if
|
339
526
|
|
340
527
|
prev = p;
|
341
528
|
}
|
342
529
|
}
|
343
530
|
|
344
|
-
std::string getString(int id) {
|
531
|
+
std::string getString(int id) { return getString(id, false); }
|
532
|
+
|
533
|
+
// Reconstruct original filepath from segments
|
534
|
+
std::string getString(int id, bool isDir) {
|
345
535
|
std::string s = "";
|
346
|
-
|
536
|
+
std::lock_guard<std::mutex> guard(seglist_mu);
|
537
|
+
|
538
|
+
PathSegment *seg = nullptr;
|
539
|
+
|
540
|
+
if (isDir) {
|
541
|
+
seg = seglist_dir[id];
|
542
|
+
} else {
|
543
|
+
seg = seglist[id];
|
544
|
+
}
|
347
545
|
s += seg->str;
|
348
546
|
while (seg->parent->parent != nullptr) {
|
349
547
|
seg = seg->parent;
|
350
|
-
s = seg->str +
|
548
|
+
s = seg->str + s;
|
549
|
+
// out.print(seg, "(", seg->str, ")", ",");
|
351
550
|
}
|
551
|
+
// out.printl(s);
|
552
|
+
|
352
553
|
return s;
|
353
554
|
}
|
354
555
|
|
@@ -384,33 +585,46 @@ public:
|
|
384
585
|
@param query String to search for inside the index
|
385
586
|
*/
|
386
587
|
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
|
588
|
+
void searchCharTree(const std::string &query, CandMap &candmap, CharTree &chartr) {
|
589
|
+
|
590
|
+
int last_start = query.size() - 2;
|
591
|
+
for (int start = 0; start <= last_start; start++) {
|
592
|
+
CharNode *cn = chartr.root;
|
593
|
+
int end = std::min(start + 7, ((int)query.size()) - 1);
|
594
|
+
int nchars = end - start + 1;
|
595
|
+
std::string s = query.substr(start, nchars);
|
596
|
+
|
597
|
+
for (int i = 0; i < s.size(); i++) {
|
598
|
+
char c = s[i];
|
599
|
+
CharNode *x = cn->find(c);
|
600
|
+
if (x != nullptr) {
|
601
|
+
cn = x;
|
602
|
+
// Consider scores only for substrings with size >= 2
|
603
|
+
if (i > 0) {
|
604
|
+
std::set<int> ids = cn->getIds();
|
605
|
+
for (const int &y : ids) {
|
606
|
+
PathSegment *p = nullptr;
|
607
|
+
if (&chartr == &cm) {
|
608
|
+
p = seglist[y];
|
609
|
+
} else {
|
610
|
+
p = seglist_dir[y];
|
611
|
+
}
|
612
|
+
assert(p != nullptr);
|
613
|
+
addToResults(p, query, start, i + 1, candmap);
|
614
|
+
}
|
615
|
+
}
|
616
|
+
} else {
|
617
|
+
// assert(cn->ids_sz < 1); // TODO: should not come here?
|
618
|
+
break;
|
619
|
+
}
|
620
|
+
}
|
408
621
|
}
|
409
|
-
|
622
|
+
}
|
410
623
|
|
624
|
+
std::vector<std::pair<float, int>> candidatesToVec(CandMap &candmap) {
|
411
625
|
// Form return result, 2d array with file id's and scores
|
412
626
|
std::vector<std::pair<float, int>> results;
|
413
|
-
for (auto &[fid, cand] :
|
627
|
+
for (auto &[fid, cand] : candmap) {
|
414
628
|
std::pair<float, int> v;
|
415
629
|
float sc = cand->getScore();
|
416
630
|
v.first = sc;
|
@@ -419,164 +633,134 @@ public:
|
|
419
633
|
delete cand;
|
420
634
|
}
|
421
635
|
|
422
|
-
for (auto &[fid, cand] : dirCandMap) {
|
423
|
-
delete cand;
|
424
|
-
}
|
425
|
-
|
426
636
|
// Sort highest score first
|
427
637
|
std::sort(results.begin(), results.end(),
|
428
638
|
[](std::pair<float, int> a, std::pair<float, int> b) { return a.first > b.first; });
|
429
639
|
return results;
|
430
640
|
}
|
431
641
|
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
key = key | static_cast<int64_t>(str[i + i_char]);
|
437
|
-
if (i_char < nchars - 1) {
|
438
|
-
// Shift 8 bits to the left except on the last iteration
|
439
|
-
key = key << 8;
|
440
|
-
}
|
441
|
-
}
|
442
|
-
return key;
|
443
|
-
}
|
642
|
+
std::vector<std::pair<float, int>> findDirectories(std::string query) {
|
643
|
+
CandMap dirCandMap;
|
644
|
+
auto &candmap = dirCandMap;
|
645
|
+
waitUntilDone();
|
444
646
|
|
445
|
-
|
647
|
+
searchCharTree(query, dirCandMap, cm_dir);
|
648
|
+
addParentScores(dirCandMap);
|
649
|
+
auto results = candidatesToVec(dirCandMap);
|
446
650
|
|
447
|
-
|
448
|
-
for (const auto &[key, value] : (*filemaps[nchars])) {
|
449
|
-
int64_t x;
|
450
|
-
x = key;
|
451
|
-
int multip = nchars * 8;
|
452
|
-
for (int i = 0; i <= nchars; i++) {
|
453
|
-
char c = (x >> multip) & 255;
|
454
|
-
std::cout << c;
|
455
|
-
multip -= 8;
|
456
|
-
}
|
457
|
-
std::cout << "\n";
|
458
|
-
// for (auto y : *value) {
|
459
|
-
// std::cout << y << " ";
|
460
|
-
// }
|
461
|
-
// std::cout << "\n";
|
462
|
-
}
|
651
|
+
return results;
|
463
652
|
}
|
464
653
|
|
465
|
-
|
466
|
-
|
467
|
-
if (p->children.size() > 0) {
|
468
|
-
for (auto x : p->children) {
|
469
|
-
clearPathSegmentChildren(x.second);
|
470
|
-
}
|
471
|
-
}
|
472
|
-
delete p;
|
654
|
+
std::vector<std::pair<float, std::string>> findFilesAndDirectories(std::string query) {
|
655
|
+
return findFilesAndDirectories(query, true, true);
|
473
656
|
}
|
474
657
|
|
475
|
-
|
476
|
-
|
477
|
-
// This function generates int64 representations (keys) of all substrings of size 2..8 in that
|
478
|
-
// path segment and stores pointer to p in hash tables using these int values as keys.
|
658
|
+
std::vector<std::pair<float, std::string>>
|
659
|
+
findFilesAndDirectories(std::string query, bool includeFiles, bool includeDirs) {
|
479
660
|
|
480
|
-
|
481
|
-
|
661
|
+
CandMap fileCandMap;
|
662
|
+
CandMap dirCandMap;
|
663
|
+
waitUntilDone();
|
664
|
+
std::vector<std::pair<float, std::string>> results;
|
482
665
|
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
}
|
487
|
-
if (static_cast<int>(p->str.size()) < maxChars) {
|
488
|
-
maxChars = p->str.size();
|
666
|
+
if (includeFiles) {
|
667
|
+
searchCharTree(query, fileCandMap, cm);
|
668
|
+
// out.printl("size:",fileCandMap.size());
|
489
669
|
}
|
490
670
|
|
491
|
-
|
671
|
+
searchCharTree(query, dirCandMap, cm_dir);
|
492
672
|
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
map = filemaps[sublen];
|
497
|
-
mu = &mts_f[sublen];
|
498
|
-
} else {
|
499
|
-
map = dirmaps[sublen];
|
500
|
-
mu = &mts_d[sublen];
|
501
|
-
}
|
673
|
+
if (includeFiles) {
|
674
|
+
addParentScores(fileCandMap);
|
675
|
+
}
|
502
676
|
|
503
|
-
|
677
|
+
if (includeDirs) {
|
678
|
+
addParentScores(dirCandMap);
|
679
|
+
}
|
680
|
+
|
681
|
+
for (auto seg : segsToClean) {
|
682
|
+
seg->cand = nullptr;
|
683
|
+
}
|
684
|
+
segsToClean.clear();
|
504
685
|
|
505
|
-
|
506
|
-
|
507
|
-
|
686
|
+
// TODO: Need to call this just to delete candidates
|
687
|
+
auto res_dir = candidatesToVec(dirCandMap);
|
688
|
+
if (includeDirs) {
|
689
|
+
for (const auto &[score, id] : res_dir) {
|
690
|
+
results.push_back(std::pair<float, std::string>{score, getString(id, true)});
|
508
691
|
}
|
692
|
+
}
|
509
693
|
|
510
|
-
|
511
|
-
|
512
|
-
|
513
|
-
|
694
|
+
if (includeFiles) {
|
695
|
+
auto res_file = candidatesToVec(fileCandMap);
|
696
|
+
// out.printl("size2:",fileCandMap.size());
|
697
|
+
for (const auto &[score, id] : res_file) {
|
514
698
|
|
515
|
-
//
|
516
|
-
|
517
|
-
if (it == map->end()) {
|
518
|
-
(*map)[key] = new std::set<PathSegment *>;
|
519
|
-
}
|
520
|
-
(*map)[key]->insert(p);
|
699
|
+
// out.print("|",getString(id),"|");
|
700
|
+
results.push_back(std::pair<float, std::string>{score, getString(id)});
|
521
701
|
}
|
522
|
-
mu->unlock();
|
523
702
|
}
|
703
|
+
|
704
|
+
// Sort highest score first
|
705
|
+
std::sort(results.begin(), results.end(),
|
706
|
+
[](std::pair<float, std::string> a, std::pair<float, std::string> b) {
|
707
|
+
return a.first > b.first;
|
708
|
+
});
|
709
|
+
return results;
|
524
710
|
}
|
525
711
|
|
526
|
-
//
|
527
|
-
|
528
|
-
[[nodiscard]] std::vector<PathSegment *> findSimilarForNgram(std::string str, int i, int nchars,
|
529
|
-
SegMap &map) const {
|
712
|
+
// TODO: delete?
|
713
|
+
std::vector<std::pair<float, int>> findSimilar(std::string query) { return findFiles(query); }
|
530
714
|
|
531
|
-
|
532
|
-
std::vector<PathSegment *> res;
|
715
|
+
std::vector<std::pair<float, int>> findFiles(std::string query) {
|
533
716
|
|
534
|
-
|
535
|
-
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
542
|
-
}
|
543
|
-
}
|
544
|
-
return res;
|
545
|
-
}
|
717
|
+
CandMap fileCandMap;
|
718
|
+
CandMap dirCandMap;
|
719
|
+
auto &candmap = fileCandMap;
|
720
|
+
waitUntilDone();
|
721
|
+
|
722
|
+
searchCharTree(query, fileCandMap, cm);
|
723
|
+
searchCharTree(query, dirCandMap, cm_dir);
|
724
|
+
addParentScores(fileCandMap);
|
546
725
|
|
547
|
-
|
548
|
-
|
549
|
-
) {
|
550
|
-
int maxChars = 8;
|
551
|
-
int minChars = 2;
|
552
|
-
if (static_cast<int>(query.size()) < maxChars) {
|
553
|
-
maxChars = query.size();
|
726
|
+
for (auto seg : segsToClean) {
|
727
|
+
seg->cand = nullptr;
|
554
728
|
}
|
729
|
+
segsToClean.clear();
|
555
730
|
|
556
|
-
|
557
|
-
|
558
|
-
int count = query.size() - sublen + 1;
|
731
|
+
auto results = candidatesToVec(fileCandMap);
|
732
|
+
auto tmp = candidatesToVec(dirCandMap); // TODO: call just to release memory
|
559
733
|
|
560
|
-
|
561
|
-
|
562
|
-
std::vector<PathSegment *> res = findSimilarForNgram(query, i, sublen, *(map[sublen]));
|
734
|
+
return results;
|
735
|
+
}
|
563
736
|
|
564
|
-
|
565
|
-
|
566
|
-
|
737
|
+
private:
|
738
|
+
void clearPathSegmentChildren(PathSegment *p) {
|
739
|
+
if (p->children.size() > 0) {
|
740
|
+
for (auto x : p->children) {
|
741
|
+
clearPathSegmentChildren(x.second);
|
567
742
|
}
|
568
743
|
}
|
744
|
+
delete p;
|
569
745
|
}
|
570
746
|
|
571
747
|
// Add parent directories' scores to files
|
572
|
-
void
|
748
|
+
void addParentScores(CandMap &fileCandMap) {
|
573
749
|
|
574
750
|
for (auto &[fid, cand] : fileCandMap) {
|
575
751
|
PathSegment *p = cand->seg->parent;
|
576
752
|
while (p->parent != nullptr) {
|
577
753
|
if (p->cand != nullptr) {
|
754
|
+
|
578
755
|
auto &scoreA = cand->v_charscore;
|
579
756
|
auto &scoreB = p->cand->v_charscore;
|
757
|
+
|
758
|
+
// out.print("[");
|
759
|
+
// printVector(scoreA);
|
760
|
+
// out.print(",");
|
761
|
+
// printVector(scoreB);
|
762
|
+
// out.print(",");
|
763
|
+
// out.print("]");
|
580
764
|
for (int i = 0; i < cand->len; i++) {
|
581
765
|
if (scoreA[i] < scoreB[i] * dirWeight) {
|
582
766
|
scoreA[i] = scoreB[i] * dirWeight;
|
@@ -592,6 +776,7 @@ private:
|
|
592
776
|
|
593
777
|
if (auto it2 = candmap.find(seg->fileId); it2 == candmap.end()) {
|
594
778
|
Candidate *cand = new Candidate(seg, str.size());
|
779
|
+
// out.printl("new cand:", seg->str, ",", seg, ",", seg->parent, ",", seg->parent->parent);
|
595
780
|
segsToClean.push_back(seg);
|
596
781
|
candmap[seg->fileId] = cand;
|
597
782
|
seg->cand = cand;
|