StrIdx 0.1.3 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CMakeLists.txt +27 -0
- data/Gemfile +5 -0
- data/Makefile +1 -0
- data/README.md +54 -3
- data/demo.cpp +36 -5
- data/exe/stridx.rb +21 -0
- data/flist.txt +0 -5550
- data/gem_install +4 -0
- data/py_example.py +18 -0
- data/py_interf.cpp +182 -0
- data/rubyext/ruby_interf.cpp +58 -2
- data/runserver.rb +27 -0
- data/server.rb +108 -0
- data/setup.py +32 -0
- data/stridx-screencast.mp4 +0 -0
- data/stridx-tty.rb +122 -0
- data/stridx.gemspec +33 -0
- data/stridx.hpp +435 -243
- data/test.rb +7 -2
- data/thread_pool.hpp +20 -5
- data/unit_tests.sh +4 -0
- data/unittest.cpp +189 -0
- metadata +106 -7
data/stridx.hpp
CHANGED
@@ -10,6 +10,7 @@
|
|
10
10
|
#include <array>
|
11
11
|
#include <iostream>
|
12
12
|
#include <unordered_map>
|
13
|
+
#include <map>
|
13
14
|
#include <set>
|
14
15
|
#include <algorithm>
|
15
16
|
#include <sstream>
|
@@ -28,21 +29,22 @@ namespace StrIdx {
|
|
28
29
|
class Output {
|
29
30
|
private:
|
30
31
|
int verboseLevel;
|
32
|
+
// TODO: add mutex?
|
31
33
|
|
32
34
|
public:
|
33
35
|
Output(int verb) : verboseLevel(verb) {}
|
34
|
-
Output() : Output(
|
36
|
+
Output() : Output(1) {}
|
35
37
|
~Output() = default;
|
36
|
-
void print() {}
|
38
|
+
static void print() {}
|
37
39
|
|
38
40
|
// When calling as print("xxx ",3, " yyy") outputs "xxx 3 yyy"
|
39
|
-
template <typename T, typename... Types> void print(T var1, Types... var2) {
|
41
|
+
template <typename T, typename... Types> static void print(T var1, Types... var2) {
|
40
42
|
std::cout << var1;
|
41
43
|
print(var2...);
|
42
44
|
}
|
43
45
|
|
44
46
|
// When calling as printl("xxx ",3, " yyy") outputs "xxx 3 yyy\n"
|
45
|
-
template <typename... Types> void printl(Types... var2) {
|
47
|
+
template <typename... Types> static void printl(Types... var2) {
|
46
48
|
print(var2...);
|
47
49
|
print("\n");
|
48
50
|
}
|
@@ -61,86 +63,259 @@ public:
|
|
61
63
|
}
|
62
64
|
};
|
63
65
|
|
66
|
+
// Shared logger with verbosity level 1.
// Declared `inline` (C++17) so that every translation unit including this header
// refers to one and the same object instead of each emitting its own definition.
inline Output out{1};
|
67
|
+
|
68
|
+
// One node of a CharTree: a single character plus the ids stored at that node.
//
// Memory is managed by hand to keep the node small:
//  - `children` is a malloc()-allocated array holding `size` nodes (released with free())
//  - `ids` is a new[]-allocated array holding `ids_sz` ints (released with delete[])
// The destructor releases nothing; the owner is expected to call dealloc().
struct CharNode {
  int *ids;           // Ids stored directly in this node
  int ids_sz;         // Number of elements in `ids`
  char c;             // Character this node stands for
  std::uint8_t size;  // Number of elements in `children` (8-bit counter: 255 at most)
  CharNode *children; // Child nodes, or nullptr when there are none
  CharNode() : ids(nullptr), ids_sz(0), c(0), size(0), children(nullptr) {}

  // Put the node into the empty state WITHOUT releasing anything.
  // Needed for nodes living in raw malloc()'d storage, where no constructor has run.
  void init() {
    ids = nullptr;
    ids_sz = 0;
    c = 0;
    size = 0;
    children = nullptr;
  }

  // Release the child array (recursively) and the id array of this node.
  // The node is reset afterwards, so calling dealloc() again is harmless and
  // getIds()/find() on a deallocated node see an empty node instead of freed memory.
  void dealloc() {
    if (children != nullptr) {
      for (CharNode *it = children; it != children + size; it++) {
        it->dealloc();
      }
      free(children);
    }
    delete[] ids;
    init();
  }

  ~CharNode() {}

  // Gets Id's stored in this node and all child nodes combined
  std::set<int> getIds() const {
    std::set<int> collected;
    getIds(collected);
    return collected;
  }

  // Inserts the ids of this node and of every descendant into `set`.
  void getIds(std::set<int> &set) const {
    for (int j = 0; j < ids_sz; j++) {
      set.insert(ids[j]);
    }
    for (const CharNode *it = children; it != children + size; it++) {
      it->getIds(set);
    }
  }

  // Find if character 'c' is included in children of the node.
  // Returns a pointer to the matching direct child, or nullptr if there is none.
  CharNode *find(char c) const {
    for (CharNode *it = children; it != children + size; it++) {
      if (it->c == c) {
        return it;
      }
    }
    return nullptr;
  }
};
|
126
|
+
|
127
|
+
/* Tree type data structure consisting of strings of file path segments
|
128
|
+
* (somewhat like a trie)
|
129
|
+
* For example, Adding one input string "abracadabr4" will add the following (size 2..8 char)
|
130
|
+
* substrings: abracada bracadab racadabr acadabr4 cadabr4 adabr4 dabr4 abr4 br4 r4
|
131
|
+
* (CharTree::addStr called for each separately)
|
132
|
+
*
|
133
|
+
* Which forms a tree like structure:
|
134
|
+
* [root]-a-b-r-a-c-a-d-a
|
135
|
+
* | | ╰-4
|
136
|
+
* | ╰─c-a-d-a-b-r-4
|
137
|
+
* ╰───b-r-a-c-a-d-a-b
|
138
|
+
* | ╰─4
|
139
|
+
* ╰───r-a-c-a-d-a-b-r
|
140
|
+
* ╰───d-a-b-r-4
|
141
|
+
*
|
142
|
+
* Id's pointing to path segments are stored in nodes that match the end of the inserted substring
|
143
|
+
*
|
144
|
+
* This data structure (CharTree/CharNode) is the main bottleneck in terms of memory consumption.
|
145
|
+
* For a dataset of 84k files with 3.5 million characters there will be about 2.3 million CharNodes.
|
146
|
+
* Therefore, having std::vector's or similar structures with memory overhead is not really an
|
147
|
+
* option.
|
148
|
+
*/
|
149
|
+
class CharTree {
|
150
|
+
Output out;
|
151
|
+
std::mutex mu;
|
152
|
+
|
153
|
+
public:
|
154
|
+
CharNode *root;
|
155
|
+
|
156
|
+
CharTree() { root = new CharNode; }
|
157
|
+
~CharTree() {
|
158
|
+
root->dealloc();
|
159
|
+
delete root;
|
160
|
+
}
|
161
|
+
|
162
|
+
void addStr(std::string s, int id) {
|
163
|
+
if (s.size() < 2) {
|
164
|
+
return;
|
165
|
+
}
|
166
|
+
|
167
|
+
// out.printl("add str:",s);
|
168
|
+
CharNode *cn = root;
|
169
|
+
|
170
|
+
std::lock_guard<std::mutex> mu_lock(mu);
|
171
|
+
|
172
|
+
for (int i = 0; i < s.size() && i < 8; i++) {
|
173
|
+
int c = ((char)s[i]);
|
174
|
+
bool found = false;
|
175
|
+
|
176
|
+
if (cn->size > 0) {
|
177
|
+
// out.printl("(1) cn->size > 0");
|
178
|
+
for (auto it = cn->children; it != cn->children + cn->size; it++) {
|
179
|
+
if (it->c == c) {
|
180
|
+
// out.printl("{", c, "}");
|
181
|
+
found = true;
|
182
|
+
cn = it;
|
183
|
+
break;
|
184
|
+
}
|
185
|
+
}
|
186
|
+
}
|
187
|
+
if (!found) {
|
188
|
+
// auto x = new CharNode[cn->size + 1];
|
189
|
+
CharNode *x = (CharNode *)malloc(sizeof(CharNode) * (cn->size + 1));
|
190
|
+
if (cn->size > 0) {
|
191
|
+
memcpy(x, cn->children, sizeof(CharNode) * (cn->size));
|
192
|
+
free(cn->children);
|
193
|
+
}
|
194
|
+
cn->children = x;
|
195
|
+
CharNode *nn = &(cn->children[cn->size]);
|
196
|
+
nn->init();
|
197
|
+
nn->c = c;
|
198
|
+
cn->size++;
|
199
|
+
cn = nn;
|
200
|
+
}
|
201
|
+
|
202
|
+
if (i == s.size() - 1 && true) {
|
203
|
+
out.printv(4, "i=", i, "s:", s.size(), "|");
|
204
|
+
bool found = false;
|
205
|
+
if (cn->ids_sz > 0) {
|
206
|
+
for (int i = 0; i < cn->ids_sz; i++) {
|
207
|
+
if (cn->ids[i] == id) {
|
208
|
+
found = true;
|
209
|
+
out.printv(3, "found:", id, "\n");
|
210
|
+
}
|
211
|
+
}
|
212
|
+
}
|
213
|
+
if (!found) {
|
214
|
+
// out.print(".a.");
|
215
|
+
auto x = new int[cn->ids_sz + 1];
|
216
|
+
if (cn->ids_sz > 0) {
|
217
|
+
memcpy(x, cn->ids, sizeof(int) * cn->ids_sz);
|
218
|
+
delete[] cn->ids;
|
219
|
+
}
|
220
|
+
cn->ids = x;
|
221
|
+
cn->ids[cn->ids_sz] = id;
|
222
|
+
cn->ids_sz++;
|
223
|
+
out.printv(3, "sz:", cn->ids_sz, ",");
|
224
|
+
}
|
225
|
+
}
|
226
|
+
|
227
|
+
} // END for
|
228
|
+
}
|
229
|
+
|
230
|
+
void debug() { debug("", root); }
|
231
|
+
void debug(std::string trail, CharNode *cn) {
|
232
|
+
|
233
|
+
// if (trail.size() > 6) {
|
234
|
+
// out.print("\n");
|
235
|
+
// return;
|
236
|
+
// }
|
237
|
+
|
238
|
+
if (cn == nullptr) {
|
239
|
+
return;
|
240
|
+
}
|
241
|
+
for (int i = 0; i < cn->size; i++) {
|
242
|
+
CharNode *child = &cn->children[i];
|
243
|
+
out.print("[", child->ids_sz, "]");
|
244
|
+
if (child->size > 0) {
|
245
|
+
debug(trail + child->c, child);
|
246
|
+
} else {
|
247
|
+
out.printl(trail, child->c);
|
248
|
+
// out.printl();
|
249
|
+
}
|
250
|
+
}
|
251
|
+
}
|
252
|
+
};
|
253
|
+
|
64
254
|
// Transforms input string as follows:
|
65
255
|
// '/foo/bar/file1.txt'
|
66
|
-
// => vector{"foo", "bar", "file1.txt"}
|
67
|
-
|
256
|
+
// => vector{"/foo", "/bar", "/file1.txt"}
|
257
|
+
|
258
|
+
// Splits `str` at every occurrence of `delimiter`. The delimiter is kept as the
// first character of the part it starts, e.g. ("foo/bar", '/') => {"foo", "/bar"}.
// Empty parts are never emitted; an empty input yields an empty vector.
// `inline` because this function is defined in a header.
inline std::vector<std::string> splitString(const std::string &str, char delimiter) {
  std::vector<std::string> result;
  std::string part;

  for (char ch : str) {
    // A delimiter closes the part collected so far (if any) and opens a new one
    if (ch == delimiter && !part.empty()) {
      result.push_back(part);
      part.clear();
    }
    part += ch;
  }

  // If there's any remaining part after the loop, add it to the result
  if (!part.empty()) {
    result.push_back(part);
  }

  return result;
}
|
102
286
|
|
103
287
|
// Debug
|
104
|
-
void printVector(const std::vector<
|
288
|
+
// Writes every element of `vec` to std::cout, each followed by a single space
// (no trailing newline). `inline` because this function is defined in a header.
inline void printVector(const std::vector<float> &vec) {
  for (const float value : vec) {
    std::cout << value << " ";
  }
}
|
109
293
|
|
110
|
-
// Debug
|
111
|
-
[[nodiscard]] std::string charToBinaryString(char chr) {
|
112
|
-
std::string result;
|
113
|
-
for (int i = 7; i >= 0; --i) {
|
114
|
-
result += ((chr >> i) & 1) ? '1' : '0';
|
115
|
-
}
|
116
|
-
return result;
|
117
|
-
}
|
118
|
-
|
119
294
|
class Candidate;
|
120
295
|
enum class segmentType { Dir, File };
|
121
296
|
|
122
297
|
// A segment of a file path
|
123
298
|
// e.g. if path is /foo/bar/baz.txt
|
124
299
|
// segments are [{root}, foo, bar, baz.txt]
|
125
|
-
|
126
|
-
public:
|
300
|
+
struct PathSegment {
|
127
301
|
std::string str;
|
128
302
|
int fileId; // (if FILE)
|
129
303
|
Candidate *cand;
|
130
304
|
PathSegment *parent;
|
131
305
|
std::mutex mu;
|
132
|
-
|
306
|
+
std::map<std::string, PathSegment *> children;
|
307
|
+
|
133
308
|
segmentType type = segmentType::Dir;
|
134
309
|
PathSegment() : parent(nullptr) {}
|
135
310
|
PathSegment(std::string _str) : str(_str), parent(nullptr) {}
|
136
311
|
PathSegment(std::string _str, int _fileId)
|
137
312
|
: str(_str), fileId(_fileId), cand(nullptr), parent(nullptr) {}
|
138
|
-
[[nodiscard]] int size() {
|
313
|
+
[[nodiscard]] int size() const {
|
139
314
|
int sz = str.size();
|
140
315
|
PathSegment *cur = parent;
|
141
|
-
// Sum up length of parent segments
|
316
|
+
// Sum up length of parent segments
|
142
317
|
while (cur->parent != nullptr) {
|
143
|
-
sz += cur->str.size()
|
318
|
+
sz += cur->str.size();
|
144
319
|
cur = cur->parent;
|
145
320
|
}
|
146
321
|
return sz;
|
@@ -148,8 +323,7 @@ public:
|
|
148
323
|
};
|
149
324
|
|
150
325
|
// Candidate for result in string (filename) search
|
151
|
-
|
152
|
-
public:
|
326
|
+
struct Candidate {
|
153
327
|
std::vector<float> v_charscore;
|
154
328
|
PathSegment *seg;
|
155
329
|
int fileId;
|
@@ -161,26 +335,19 @@ public:
|
|
161
335
|
float maxscore;
|
162
336
|
int candLen; // Length of candidate
|
163
337
|
|
338
|
+
~Candidate(){};
|
164
339
|
Candidate(){};
|
165
|
-
Candidate(int _fileId, std::string _str, int _len) : fileId(_fileId), str(_str), len(_len) {
|
166
|
-
// Initialize v_charscores with zeros
|
167
|
-
v_charscore.resize(len, 0);
|
168
|
-
candLen = str.size();
|
169
|
-
seg = nullptr;
|
170
|
-
}
|
171
|
-
|
172
340
|
Candidate(PathSegment *_seg, int _len) : seg(_seg), len(_len) {
|
173
341
|
// Initialize v_charscores with zeros
|
174
342
|
v_charscore.resize(len, 0);
|
175
343
|
candLen = seg->size();
|
176
344
|
}
|
177
345
|
|
178
|
-
[[nodiscard]] float getScore() {
|
346
|
+
[[nodiscard]] float getScore() const {
|
179
347
|
int i = 0;
|
180
348
|
float score = 0.0;
|
181
|
-
candLen = seg->size();
|
182
349
|
|
183
|
-
for (float &charscore : v_charscore) {
|
350
|
+
for (const float &charscore : v_charscore) {
|
184
351
|
score += charscore;
|
185
352
|
i++;
|
186
353
|
}
|
@@ -188,20 +355,17 @@ public:
|
|
188
355
|
float div2 = len * candLen;
|
189
356
|
float score1 = score / div;
|
190
357
|
float score2 = score / div2;
|
358
|
+
// out.printl("str:",seg->str," len:",len," candLen:", candLen, " score:", score);
|
191
359
|
|
192
360
|
score = score1 * 0.97 + score2 * 0.03;
|
193
361
|
return score;
|
194
362
|
}
|
195
363
|
|
196
|
-
[[nodiscard]] float operator[](int idx) { return v_charscore[idx]; }
|
364
|
+
[[nodiscard]] float operator[](int idx) const { return v_charscore[idx]; }
|
197
365
|
};
|
198
366
|
|
199
|
-
// This seems to give 10x speed improvement over std::unordered_map
|
200
|
-
typedef ankerl::unordered_dense::map<int64_t, std::set<PathSegment *> *> SegMap;
|
201
|
-
// typedef std::unordered_map<int64_t, std::set<PathSegment *> *> SegMap;
|
202
|
-
|
203
367
|
typedef ankerl::unordered_dense::map<int, Candidate *> CandMap;
|
204
|
-
// typedef std::unordered_map<int, Candidate*> CandMap;
|
368
|
+
// typedef std::unordered_map<int, Candidate *> CandMap;
|
205
369
|
|
206
370
|
class StringIndex {
|
207
371
|
private:
|
@@ -209,34 +373,28 @@ private:
|
|
209
373
|
char dirSeparator = '/'; // Usually '/', '\' or '\0' (no separator)
|
210
374
|
int numStrings = 0;
|
211
375
|
|
212
|
-
std::vector<SegMap *> dirmaps;
|
213
|
-
std::vector<SegMap *> filemaps;
|
214
|
-
|
215
376
|
std::vector<PathSegment *> segsToClean;
|
216
377
|
|
217
|
-
std::unordered_map<int, std::string> strlist;
|
218
378
|
std::unordered_map<int, PathSegment *> seglist;
|
379
|
+
std::unordered_map<int, PathSegment *> seglist_dir;
|
380
|
+
std::mutex seglist_mu;
|
381
|
+
|
219
382
|
PathSegment *root;
|
220
383
|
int dirId = 0;
|
221
384
|
float dirWeight = 0.7; // Give only 70% of score if match is for a directory
|
222
385
|
|
223
|
-
std::array<std::mutex, 9> mts_f;
|
224
|
-
std::array<std::mutex, 9> mts_d;
|
225
|
-
|
226
386
|
std::unique_ptr<ThreadPool> pool;
|
227
387
|
Output out{1}; // verbose level = 1
|
388
|
+
std::mutex cm_mu;
|
228
389
|
|
229
390
|
public:
|
391
|
+
CharTree cm; // for files
|
392
|
+
CharTree cm_dir; // for directories
|
230
393
|
StringIndex(char sep) : dirSeparator(sep) {
|
231
394
|
root = new PathSegment();
|
232
395
|
root->parent = nullptr;
|
233
396
|
root->str = "[ROOT]";
|
234
397
|
|
235
|
-
for (int i = 0; i <= 8; i++) {
|
236
|
-
dirmaps.push_back(new SegMap);
|
237
|
-
filemaps.push_back(new SegMap);
|
238
|
-
}
|
239
|
-
|
240
398
|
// Threads between 4 and 6
|
241
399
|
// We don't seem to get any benefit from more than 6 threads even if the hardware supports it
|
242
400
|
int num_threads = std::max((int)std::thread::hardware_concurrency(), 4);
|
@@ -245,43 +403,31 @@ public:
|
|
245
403
|
pool = std::unique_ptr<ThreadPool>(new ThreadPool(num_threads));
|
246
404
|
}
|
247
405
|
|
248
|
-
/* Don't separate path to segments separator=\0.
|
406
|
+
/* Don't separate path to segments when separator=\0.
|
249
407
|
This is slower, but can be used for other data than files also. */
|
250
408
|
StringIndex() : StringIndex('\0') {}
|
251
409
|
|
252
410
|
// Sets the separator character that the two-argument addStrToIndex() passes on
// when a path is added (usually '/', '\' or '\0' for "no separator").
void setDirSeparator(char sep) { dirSeparator = sep; }
|
253
411
|
// Sets the factor by which a parent directory's per-character scores are multiplied
// in addParentScores() (the member's initial value is 0.7).
void setDirWeight(float val) { dirWeight = val; }
|
254
412
|
|
255
|
-
~StringIndex() {
|
256
|
-
for (auto x : dirmaps) {
|
257
|
-
for (auto y : *x) {
|
258
|
-
y.second->clear();
|
259
|
-
delete (y.second);
|
260
|
-
}
|
261
|
-
x->clear();
|
262
|
-
delete x;
|
263
|
-
}
|
264
|
-
for (auto x : filemaps) {
|
265
|
-
for (auto y : *x) {
|
266
|
-
y.second->clear();
|
267
|
-
delete (y.second);
|
268
|
-
}
|
269
|
-
x->clear();
|
270
|
-
delete x;
|
271
|
-
}
|
272
|
-
clearPathSegmentChildren(root);
|
273
|
-
}
|
413
|
+
// Deletes `root` and, recursively, every PathSegment reachable through its children.
~StringIndex() { clearPathSegmentChildren(root); }
|
274
414
|
|
275
415
|
// Adds `filePath` under `fileId`, using the index's current dirSeparator as separator.
// Runs in the calling thread.
void addStrToIndex(std::string filePath, int fileId) {
|
276
416
|
addStrToIndex(filePath, fileId, dirSeparator);
|
277
417
|
}
|
278
418
|
|
279
419
|
// Queues the insertion of `filePath` on the thread pool instead of doing it in the
// calling thread. The task holds its own copy of filePath/fileId and a pointer to this
// index; dirSeparator is read when the task runs, not when it is queued.
// NOTE(review): presumably callers must keep the index alive and call
// waitUntilDone() before relying on the result — confirm against ThreadPool.
void addStrToIndexThreaded(std::string filePath, int fileId) {
|
280
|
-
pool->enqueue([
|
420
|
+
pool->enqueue([filePath, fileId, this] { addStrToIndex(filePath, fileId, dirSeparator); });
|
421
|
+
// addStrToIndex(filePath, fileId, dirSeparator);
|
281
422
|
}
|
282
|
-
void waitUntilReady() { pool->waitUntilDone(); }
|
423
|
+
// Forwards to ThreadPool::waitUntilDone() on the internal pool.
// NOTE(review): presumably blocks until all queued addStrToIndexThreaded() tasks have
// finished; ThreadPool is defined elsewhere — confirm.
void waitUntilReady() const { pool->waitUntilDone(); }
|
283
424
|
|
284
|
-
void waitUntilDone() { pool->waitUntilDone(); }
|
425
|
+
// Forwards to ThreadPool::waitUntilDone() on the internal pool; the find*() methods
// call this before searching.
// NOTE(review): presumably blocks until queued insertions have finished — confirm.
void waitUntilDone() const { pool->waitUntilDone(); }
|
426
|
+
|
427
|
+
// Returns the number of entries in `seglist` (file segments keyed by file id),
// read while holding seglist_mu. Directory segments live in seglist_dir and are
// not counted.
int size() {
|
428
|
+
std::lock_guard<std::mutex> guard(seglist_mu);
|
429
|
+
return seglist.size();
|
430
|
+
}
|
285
431
|
|
286
432
|
/**
|
287
433
|
* Add a string to the index to be searched for afterwards
|
@@ -291,8 +437,21 @@ public:
|
|
291
437
|
* @param separator Can be used to split filePath to components (e.g. 'home','user'...). Usually
|
292
438
|
* one of {'\\', '/', '\0' (no separation)}.
|
293
439
|
*/
|
440
|
+
|
294
441
|
void addStrToIndex(std::string filePath, int fileId, const char &separator) {
|
295
|
-
|
442
|
+
|
443
|
+
std::lock_guard<std::mutex> guard(cm_mu);
|
444
|
+
|
445
|
+
out.printv(3, "Add file:", filePath, ",", fileId, ",", separator, ",", dirSeparator);
|
446
|
+
|
447
|
+
// If a string with this index has been added already
|
448
|
+
{
|
449
|
+
std::lock_guard<std::mutex> guard(seglist_mu);
|
450
|
+
if (seglist.find(fileId) != seglist.end()) {
|
451
|
+
out.printl("seglist.find(fileId) != seglist.end()");
|
452
|
+
return;
|
453
|
+
}
|
454
|
+
}
|
296
455
|
|
297
456
|
std::vector<std::string> segs;
|
298
457
|
numStrings += 1;
|
@@ -323,28 +482,77 @@ public:
|
|
323
482
|
if (auto it = prev->children.find(x); it != prev->children.end()) {
|
324
483
|
p = it->second;
|
325
484
|
prev->mu.unlock();
|
326
|
-
} else {
|
485
|
+
} else { // File or dir not included in tree yet
|
327
486
|
p = new PathSegment(x, fileId);
|
328
487
|
p->parent = prev;
|
329
488
|
// If this is last item in segs, then it is a file.
|
330
489
|
if (_x == std::prev(segs.end())) {
|
331
490
|
p->type = segmentType::File;
|
332
|
-
|
491
|
+
{
|
492
|
+
std::lock_guard<std::mutex> guard(seglist_mu);
|
493
|
+
seglist[fileId] = p;
|
494
|
+
|
495
|
+
for (int i = 0; i < x.size() + 1; i++) {
|
496
|
+
auto s = x.substr(i, std::min(static_cast<size_t>(8), x.size() - i));
|
497
|
+
cm.addStr(s, fileId);
|
498
|
+
}
|
499
|
+
}
|
333
500
|
} else { // otherwise, it is a directory
|
334
501
|
p->type = segmentType::Dir;
|
335
502
|
p->fileId = dirId;
|
336
|
-
|
503
|
+
/* Add "/" to the end of the string so that
|
504
|
+
* /path/to/file will be indexed as:
|
505
|
+
* {"/path/", "/to/", "/file"}
|
506
|
+
*/
|
507
|
+
auto dir_str = x + "/";
|
508
|
+
|
509
|
+
{
|
510
|
+
std::lock_guard<std::mutex> guard(seglist_mu);
|
511
|
+
seglist_dir[dirId] = p;
|
512
|
+
// Files use user input Id. Directories need to have it generated
|
513
|
+
}
|
514
|
+
|
515
|
+
// TODO: Create a function
|
516
|
+
for (int i = 0; i < dir_str.size() + 1; i++) {
|
517
|
+
auto s = dir_str.substr(i, std::min(static_cast<size_t>(8), dir_str.size() - i));
|
518
|
+
cm_dir.addStr(s, dirId);
|
519
|
+
}
|
520
|
+
|
337
521
|
dirId++;
|
338
522
|
}
|
339
523
|
prev->children[x] = p;
|
340
524
|
prev->mu.unlock();
|
341
|
-
|
342
|
-
}
|
525
|
+
} // END of first if
|
343
526
|
|
344
527
|
prev = p;
|
345
528
|
}
|
346
529
|
}
|
347
530
|
|
531
|
+
// Convenience overload: same as getString(id, false), i.e. `id` is looked up as a file id.
std::string getString(int id) { return getString(id, false); }
|
532
|
+
|
533
|
+
// Reconstruct original filepath from segments
|
534
|
+
std::string getString(int id, bool isDir) {
|
535
|
+
std::string s = "";
|
536
|
+
std::lock_guard<std::mutex> guard(seglist_mu);
|
537
|
+
|
538
|
+
PathSegment *seg = nullptr;
|
539
|
+
|
540
|
+
if (isDir) {
|
541
|
+
seg = seglist_dir[id];
|
542
|
+
} else {
|
543
|
+
seg = seglist[id];
|
544
|
+
}
|
545
|
+
s += seg->str;
|
546
|
+
while (seg->parent->parent != nullptr) {
|
547
|
+
seg = seg->parent;
|
548
|
+
s = seg->str + s;
|
549
|
+
// out.print(seg, "(", seg->str, ")", ",");
|
550
|
+
}
|
551
|
+
// out.printl(s);
|
552
|
+
|
553
|
+
return s;
|
554
|
+
}
|
555
|
+
|
348
556
|
/**
|
349
557
|
The search will find filepaths similar to the input string
|
350
558
|
|
@@ -377,33 +585,46 @@ public:
|
|
377
585
|
@param query String to search for inside the index
|
378
586
|
*/
|
379
587
|
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
|
588
|
+
void searchCharTree(const std::string &query, CandMap &candmap, CharTree &chartr) {
|
589
|
+
|
590
|
+
int last_start = query.size() - 2;
|
591
|
+
for (int start = 0; start <= last_start; start++) {
|
592
|
+
CharNode *cn = chartr.root;
|
593
|
+
int end = std::min(start + 7, ((int)query.size()) - 1);
|
594
|
+
int nchars = end - start + 1;
|
595
|
+
std::string s = query.substr(start, nchars);
|
596
|
+
|
597
|
+
for (int i = 0; i < s.size(); i++) {
|
598
|
+
char c = s[i];
|
599
|
+
CharNode *x = cn->find(c);
|
600
|
+
if (x != nullptr) {
|
601
|
+
cn = x;
|
602
|
+
// Consider scores only for substrings with size >= 2
|
603
|
+
if (i > 0) {
|
604
|
+
std::set<int> ids = cn->getIds();
|
605
|
+
for (const int &y : ids) {
|
606
|
+
PathSegment *p = nullptr;
|
607
|
+
if (&chartr == &cm) {
|
608
|
+
p = seglist[y];
|
609
|
+
} else {
|
610
|
+
p = seglist_dir[y];
|
611
|
+
}
|
612
|
+
assert(p != nullptr);
|
613
|
+
addToResults(p, query, start, i + 1, candmap);
|
614
|
+
}
|
615
|
+
}
|
616
|
+
} else {
|
617
|
+
// assert(cn->ids_sz < 1); // TODO: should not come here?
|
618
|
+
break;
|
619
|
+
}
|
620
|
+
}
|
401
621
|
}
|
402
|
-
|
622
|
+
}
|
403
623
|
|
624
|
+
std::vector<std::pair<float, int>> candidatesToVec(CandMap &candmap) {
|
404
625
|
// Form return result, 2d array with file id's and scores
|
405
626
|
std::vector<std::pair<float, int>> results;
|
406
|
-
for (auto &[fid, cand] :
|
627
|
+
for (auto &[fid, cand] : candmap) {
|
407
628
|
std::pair<float, int> v;
|
408
629
|
float sc = cand->getScore();
|
409
630
|
v.first = sc;
|
@@ -412,164 +633,134 @@ public:
|
|
412
633
|
delete cand;
|
413
634
|
}
|
414
635
|
|
415
|
-
for (auto &[fid, cand] : dirCandMap) {
|
416
|
-
delete cand;
|
417
|
-
}
|
418
|
-
|
419
636
|
// Sort highest score first
|
420
637
|
std::sort(results.begin(), results.end(),
|
421
638
|
[](std::pair<float, int> a, std::pair<float, int> b) { return a.first > b.first; });
|
422
639
|
return results;
|
423
640
|
}
|
424
641
|
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
key = key | static_cast<int64_t>(str[i + i_char]);
|
430
|
-
if (i_char < nchars - 1) {
|
431
|
-
// Shift 8 bits to the left except on the last iteration
|
432
|
-
key = key << 8;
|
433
|
-
}
|
434
|
-
}
|
435
|
-
return key;
|
436
|
-
}
|
642
|
+
std::vector<std::pair<float, int>> findDirectories(std::string query) {
|
643
|
+
CandMap dirCandMap;
|
644
|
+
auto &candmap = dirCandMap;
|
645
|
+
waitUntilDone();
|
437
646
|
|
438
|
-
|
647
|
+
searchCharTree(query, dirCandMap, cm_dir);
|
648
|
+
addParentScores(dirCandMap);
|
649
|
+
auto results = candidatesToVec(dirCandMap);
|
439
650
|
|
440
|
-
|
441
|
-
for (const auto &[key, value] : (*filemaps[nchars])) {
|
442
|
-
int64_t x;
|
443
|
-
x = key;
|
444
|
-
int multip = nchars * 8;
|
445
|
-
for (int i = 0; i <= nchars; i++) {
|
446
|
-
char c = (x >> multip) & 255;
|
447
|
-
std::cout << c;
|
448
|
-
multip -= 8;
|
449
|
-
}
|
450
|
-
std::cout << "\n";
|
451
|
-
// for (auto y : *value) {
|
452
|
-
// std::cout << y << " ";
|
453
|
-
// }
|
454
|
-
// std::cout << "\n";
|
455
|
-
}
|
651
|
+
return results;
|
456
652
|
}
|
457
653
|
|
458
|
-
|
459
|
-
|
460
|
-
if (p->children.size() > 0) {
|
461
|
-
for (auto x : p->children) {
|
462
|
-
clearPathSegmentChildren(x.second);
|
463
|
-
}
|
464
|
-
}
|
465
|
-
delete p;
|
654
|
+
// Convenience overload: searches both files and directories
// (includeFiles = true, includeDirs = true).
std::vector<std::pair<float, std::string>> findFilesAndDirectories(std::string query) {
|
655
|
+
return findFilesAndDirectories(query, true, true);
|
466
656
|
}
|
467
657
|
|
468
|
-
|
469
|
-
|
470
|
-
// This function generates int64 representations (keys) of all substrings of size 2..8 in that
|
471
|
-
// path segment and stores pointer to p in hash tables using these int values as keys.
|
658
|
+
std::vector<std::pair<float, std::string>>
|
659
|
+
findFilesAndDirectories(std::string query, bool includeFiles, bool includeDirs) {
|
472
660
|
|
473
|
-
|
474
|
-
|
661
|
+
CandMap fileCandMap;
|
662
|
+
CandMap dirCandMap;
|
663
|
+
waitUntilDone();
|
664
|
+
std::vector<std::pair<float, std::string>> results;
|
475
665
|
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
}
|
480
|
-
if (static_cast<int>(p->str.size()) < maxChars) {
|
481
|
-
maxChars = p->str.size();
|
666
|
+
if (includeFiles) {
|
667
|
+
searchCharTree(query, fileCandMap, cm);
|
668
|
+
// out.printl("size:",fileCandMap.size());
|
482
669
|
}
|
483
670
|
|
484
|
-
|
671
|
+
searchCharTree(query, dirCandMap, cm_dir);
|
485
672
|
|
486
|
-
|
487
|
-
|
488
|
-
|
489
|
-
map = filemaps[sublen];
|
490
|
-
mu = &mts_f[sublen];
|
491
|
-
} else {
|
492
|
-
map = dirmaps[sublen];
|
493
|
-
mu = &mts_d[sublen];
|
494
|
-
}
|
673
|
+
if (includeFiles) {
|
674
|
+
addParentScores(fileCandMap);
|
675
|
+
}
|
495
676
|
|
496
|
-
|
677
|
+
if (includeDirs) {
|
678
|
+
addParentScores(dirCandMap);
|
679
|
+
}
|
680
|
+
|
681
|
+
for (auto seg : segsToClean) {
|
682
|
+
seg->cand = nullptr;
|
683
|
+
}
|
684
|
+
segsToClean.clear();
|
497
685
|
|
498
|
-
|
499
|
-
|
500
|
-
|
686
|
+
// TODO: Need to call this just to delete candidates
|
687
|
+
auto res_dir = candidatesToVec(dirCandMap);
|
688
|
+
if (includeDirs) {
|
689
|
+
for (const auto &[score, id] : res_dir) {
|
690
|
+
results.push_back(std::pair<float, std::string>{score, getString(id, true)});
|
501
691
|
}
|
692
|
+
}
|
502
693
|
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
694
|
+
if (includeFiles) {
|
695
|
+
auto res_file = candidatesToVec(fileCandMap);
|
696
|
+
// out.printl("size2:",fileCandMap.size());
|
697
|
+
for (const auto &[score, id] : res_file) {
|
507
698
|
|
508
|
-
//
|
509
|
-
|
510
|
-
if (it == map->end()) {
|
511
|
-
(*map)[key] = new std::set<PathSegment *>;
|
512
|
-
}
|
513
|
-
(*map)[key]->insert(p);
|
699
|
+
// out.print("|",getString(id),"|");
|
700
|
+
results.push_back(std::pair<float, std::string>{score, getString(id)});
|
514
701
|
}
|
515
|
-
mu->unlock();
|
516
702
|
}
|
703
|
+
|
704
|
+
// Sort highest score first
|
705
|
+
std::sort(results.begin(), results.end(),
|
706
|
+
[](std::pair<float, std::string> a, std::pair<float, std::string> b) {
|
707
|
+
return a.first > b.first;
|
708
|
+
});
|
709
|
+
return results;
|
517
710
|
}
|
518
711
|
|
519
|
-
//
|
520
|
-
|
521
|
-
[[nodiscard]] std::vector<PathSegment *> findSimilarForNgram(std::string str, int i, int nchars,
|
522
|
-
SegMap &map) {
|
712
|
+
// TODO: delete?
|
713
|
+
// Thin wrapper: forwards to findFiles(query) and returns its result unchanged.
std::vector<std::pair<float, int>> findSimilar(std::string query) { return findFiles(query); }
|
523
714
|
|
524
|
-
|
525
|
-
std::vector<PathSegment *> res;
|
715
|
+
std::vector<std::pair<float, int>> findFiles(std::string query) {
|
526
716
|
|
527
|
-
|
528
|
-
|
529
|
-
|
530
|
-
|
531
|
-
if (auto it = map.find(key); it != map.end()) { // key found
|
532
|
-
auto set = it->second;
|
533
|
-
for (auto value : *set) {
|
534
|
-
res.push_back(value);
|
535
|
-
}
|
536
|
-
}
|
537
|
-
return res;
|
538
|
-
}
|
717
|
+
CandMap fileCandMap;
|
718
|
+
CandMap dirCandMap;
|
719
|
+
auto &candmap = fileCandMap;
|
720
|
+
waitUntilDone();
|
539
721
|
|
540
|
-
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
maxChars = query.size();
|
722
|
+
searchCharTree(query, fileCandMap, cm);
|
723
|
+
searchCharTree(query, dirCandMap, cm_dir);
|
724
|
+
addParentScores(fileCandMap);
|
725
|
+
|
726
|
+
for (auto seg : segsToClean) {
|
727
|
+
seg->cand = nullptr;
|
547
728
|
}
|
729
|
+
segsToClean.clear();
|
548
730
|
|
549
|
-
|
550
|
-
|
551
|
-
int count = query.size() - sublen + 1;
|
731
|
+
auto results = candidatesToVec(fileCandMap);
|
732
|
+
auto tmp = candidatesToVec(dirCandMap); // TODO: call just to release memory
|
552
733
|
|
553
|
-
|
554
|
-
|
555
|
-
std::vector<PathSegment *> res = findSimilarForNgram(query, i, sublen, *(map[sublen]));
|
734
|
+
return results;
|
735
|
+
}
|
556
736
|
|
557
|
-
|
558
|
-
|
559
|
-
|
737
|
+
private:
|
738
|
+
// Deletes `p` and, depth-first, every segment reachable through its `children` map.
// A null `p` is ignored.
void clearPathSegmentChildren(PathSegment *p) {
  if (p == nullptr) {
    return;
  }
  // Iterate by reference: copying each map entry would copy its std::string key
  for (const auto &child : p->children) {
    clearPathSegmentChildren(child.second);
  }
  delete p;
}
|
563
746
|
|
564
747
|
// Add parent directories scores to files
|
565
|
-
void
|
748
|
+
void addParentScores(CandMap &fileCandMap) {
|
566
749
|
|
567
750
|
for (auto &[fid, cand] : fileCandMap) {
|
568
751
|
PathSegment *p = cand->seg->parent;
|
569
752
|
while (p->parent != nullptr) {
|
570
753
|
if (p->cand != nullptr) {
|
754
|
+
|
571
755
|
auto &scoreA = cand->v_charscore;
|
572
756
|
auto &scoreB = p->cand->v_charscore;
|
757
|
+
|
758
|
+
// out.print("[");
|
759
|
+
// printVector(scoreA);
|
760
|
+
// out.print(",");
|
761
|
+
// printVector(scoreB);
|
762
|
+
// out.print(",");
|
763
|
+
// out.print("]");
|
573
764
|
for (int i = 0; i < cand->len; i++) {
|
574
765
|
if (scoreA[i] < scoreB[i] * dirWeight) {
|
575
766
|
scoreA[i] = scoreB[i] * dirWeight;
|
@@ -585,6 +776,7 @@ private:
|
|
585
776
|
|
586
777
|
if (auto it2 = candmap.find(seg->fileId); it2 == candmap.end()) {
|
587
778
|
Candidate *cand = new Candidate(seg, str.size());
|
779
|
+
// out.printl("new cand:", seg->str, ",", seg, ",", seg->parent, ",", seg->parent->parent);
|
588
780
|
segsToClean.push_back(seg);
|
589
781
|
candmap[seg->fileId] = cand;
|
590
782
|
seg->cand = cand;
|