StrIdx 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE +504 -0
- data/Makefile +268 -0
- data/README.md +92 -0
- data/demo.cpp +76 -0
- data/rubyext/extconf.rb +16 -0
- data/rubyext/ruby_interf.cpp +79 -0
- data/rubyext/test.rb +34 -0
- data/stridx.hpp +506 -0
- data/unordered_dense.h +2032 -0
- metadata +82 -0
data/stridx.hpp
ADDED
@@ -0,0 +1,506 @@
|
|
1
|
+
|
2
|
+
#include <stdio.h>
|
3
|
+
#include <stdlib.h>
|
4
|
+
#include <cassert>
|
5
|
+
|
6
|
+
#include <vector>
|
7
|
+
#include <iostream>
|
8
|
+
#include <unordered_map>
|
9
|
+
#include <set>
|
10
|
+
#include <algorithm>
|
11
|
+
#include <sstream>
|
12
|
+
|
13
|
+
#ifdef _OPENMP
|
14
|
+
#include <omp.h>
|
15
|
+
#endif
|
16
|
+
|
17
|
+
#include "unordered_dense.h"
|
18
|
+
|
19
|
+
// Breaks `input` into the non-empty pieces delimited by `separator`, e.g.
//   "/foo/bar/file1.txt" with '/'  =>  vector{"foo", "bar", "file1.txt"}
// Leading, trailing and repeated separators never produce empty entries.
std::vector<std::string> splitString(const std::string &input, const char &separator) {
  std::vector<std::string> pieces;
  std::string::size_type begin = 0;

  while (begin < input.size()) {
    // Locate the end of the current piece (or the end of the input)
    std::string::size_type end = input.find(separator, begin);
    if (end == std::string::npos) {
      end = input.size();
    }
    if (end != begin) {
      pieces.push_back(input.substr(begin, end - begin));
    }
    // Continue right after the separator that closed this piece
    begin = end + 1;
  }

  return pieces;
}
|
35
|
+
|
36
|
+
// Renders the 64 bits of num as a string of '0'/'1', most significant bit first
std::string int64ToBinaryString(int64_t num) {
  std::string bits(64, '0');
  for (int pos = 0; pos < 64; ++pos) {
    // pos counts output characters left-to-right; the matching bit is 63 - pos
    if ((num >> (63 - pos)) & 1) {
      bits[pos] = '1';
    }
  }
  return bits;
}
|
44
|
+
|
45
|
+
// Convert a (8 char) string represented as int64_t to std::string.
// The first character of the string is stored in the most significant byte of
// key and the last character in the least significant byte, so the bytes are
// emitted from the top down. The result always has exactly 8 characters; unused
// high bytes come out as '\0'.
std::string int64ToStr(int64_t key) {
  const int nchars = 8;
  std::string str;
  str.reserve(nchars);
  // Start at bits 56..63. Starting at nchars * 8 would shift by the full width
  // of the type, which is undefined behaviour, and would yield a 9th character.
  for (int shift = (nchars - 1) * 8; shift >= 0; shift -= 8) {
    str.push_back(static_cast<char>((key >> shift) & 255));
  }
  return str;
}
|
57
|
+
|
58
|
+
// Writes every element of vec to stdout, each followed by a single space
// (no trailing newline is printed)
void printVector(const std::vector<int> &vec) {
  for (std::vector<int>::const_iterator it = vec.begin(); it != vec.end(); ++it) {
    std::cout << *it << " ";
  }
}
|
63
|
+
|
64
|
+
// Renders the 8 bits of num as a string of '0'/'1', most significant bit first
std::string charToBinaryString(char num) {
  std::string bits(8, '0');
  for (int pos = 0; pos < 8; ++pos) {
    // pos counts output characters left-to-right; the matching bit is 7 - pos
    if ((num >> (7 - pos)) & 1) {
      bits[pos] = '1';
    }
  }
  return bits;
}
|
71
|
+
|
72
|
+
// Forward declaration: PathSegment stores a Candidate* before Candidate is defined
class Candidate;
|
73
|
+
// Kind of path component a PathSegment represents (PathSegment::type defaults to Dir)
enum segmentType { Dir, File };
|
74
|
+
|
75
|
+
// A segment of a file path
|
76
|
+
// e.g. if path is /foo/bar/baz.txt
|
77
|
+
// segments are [{root}, foo, bar, baz.txt]
|
78
|
+
class PathSegment {
|
79
|
+
public:
|
80
|
+
std::string str;
|
81
|
+
int fileId; // (if FILE)
|
82
|
+
Candidate *cand;
|
83
|
+
PathSegment *parent;
|
84
|
+
ankerl::unordered_dense::map<std::string, PathSegment *> children;
|
85
|
+
segmentType type = Dir;
|
86
|
+
PathSegment() : parent(NULL) {}
|
87
|
+
PathSegment(std::string _str) : str(_str), parent(NULL) {}
|
88
|
+
PathSegment(std::string _str, int _fileId)
|
89
|
+
: str(_str), fileId(_fileId), cand(NULL), parent(NULL) {}
|
90
|
+
int size() {
|
91
|
+
int sz = str.size();
|
92
|
+
PathSegment *cur = parent;
|
93
|
+
// Sum up length of parent segments (+1 for divisors)
|
94
|
+
while (cur->parent != NULL) {
|
95
|
+
sz += cur->str.size() + 1;
|
96
|
+
cur = cur->parent;
|
97
|
+
}
|
98
|
+
return sz;
|
99
|
+
}
|
100
|
+
};
|
101
|
+
|
102
|
+
// Candidate for result in string (filename) search
|
103
|
+
class Candidate {
|
104
|
+
public:
|
105
|
+
std::vector<float> v_charscore;
|
106
|
+
PathSegment *seg;
|
107
|
+
int fileId;
|
108
|
+
// The string that this candidate represents
|
109
|
+
std::string str;
|
110
|
+
int len; // Query string length
|
111
|
+
|
112
|
+
float minscore;
|
113
|
+
float maxscore;
|
114
|
+
int candLen; // Length of candidate
|
115
|
+
|
116
|
+
Candidate(){};
|
117
|
+
Candidate(int _fileId, std::string _str, int _len) : fileId(_fileId), str(_str), len(_len) {
|
118
|
+
// Initialize v_charscores with zeros
|
119
|
+
v_charscore.resize(len, 0);
|
120
|
+
candLen = str.size();
|
121
|
+
seg = NULL;
|
122
|
+
}
|
123
|
+
|
124
|
+
Candidate(PathSegment *_seg, int _len) : seg(_seg), len(_len) {
|
125
|
+
// Initialize v_charscores with zeros
|
126
|
+
v_charscore.resize(len, 0);
|
127
|
+
candLen = seg->size();
|
128
|
+
}
|
129
|
+
|
130
|
+
float getScore() {
|
131
|
+
int i = 0;
|
132
|
+
float score = 0.0;
|
133
|
+
candLen = seg->size();
|
134
|
+
|
135
|
+
for (float &charscore : v_charscore) {
|
136
|
+
score += charscore;
|
137
|
+
i++;
|
138
|
+
}
|
139
|
+
float div = len * len;
|
140
|
+
float div2 = len * candLen;
|
141
|
+
float score1 = score / div;
|
142
|
+
float score2 = score / div2;
|
143
|
+
|
144
|
+
score = score1 * 0.97 + score2 * 0.03;
|
145
|
+
return score;
|
146
|
+
}
|
147
|
+
|
148
|
+
float operator[](int idx) { return v_charscore[idx]; }
|
149
|
+
};
|
150
|
+
|
151
|
+
// This seems to give 10x speed improvement over std::unordered_map
|
152
|
+
typedef ankerl::unordered_dense::map<int64_t, std::set<PathSegment *> *> SegMap;
|
153
|
+
// typedef std::unordered_map<int64_t, std::set<PathSegment *> *> SegMap;
|
154
|
+
|
155
|
+
typedef std::unordered_map<float, Candidate> CandMap;
|
156
|
+
|
157
|
+
class StringIndex {
|
158
|
+
private:
|
159
|
+
int tmp;
|
160
|
+
char dirSeparator = '/'; // Usually '/', '\' or '\0' (no separator)
|
161
|
+
|
162
|
+
std::vector<SegMap *> dirmaps;
|
163
|
+
std::vector<SegMap *> filemaps;
|
164
|
+
|
165
|
+
std::vector<PathSegment *> segsToClean;
|
166
|
+
|
167
|
+
std::unordered_map<int, std::string> strlist;
|
168
|
+
std::unordered_map<int, PathSegment *> seglist;
|
169
|
+
PathSegment *root;
|
170
|
+
int dirId = 0;
|
171
|
+
float dirWeight = 0.7; // Give only 70% of score if match is for a directory
|
172
|
+
|
173
|
+
public:
|
174
|
+
StringIndex() {
|
175
|
+
root = new PathSegment();
|
176
|
+
root->parent = NULL;
|
177
|
+
root->str = "[ROOT]";
|
178
|
+
|
179
|
+
for (int i = 0; i <= 8; i++) {
|
180
|
+
dirmaps.push_back(new SegMap);
|
181
|
+
filemaps.push_back(new SegMap);
|
182
|
+
}
|
183
|
+
|
184
|
+
#ifdef _OPENMP
|
185
|
+
std::cout << "OPENMP enabled\n";
|
186
|
+
#endif
|
187
|
+
}
|
188
|
+
|
189
|
+
void setDirSeparator(char sep) { dirSeparator = sep; }
|
190
|
+
void setDirWeight(float val) { dirWeight = val; }
|
191
|
+
|
192
|
+
~StringIndex() {
|
193
|
+
for (auto x : dirmaps) {
|
194
|
+
for (auto y : *x) {
|
195
|
+
y.second->clear();
|
196
|
+
delete (y.second);
|
197
|
+
}
|
198
|
+
x->clear();
|
199
|
+
delete x;
|
200
|
+
}
|
201
|
+
for (auto x : filemaps) {
|
202
|
+
for (auto y : *x) {
|
203
|
+
y.second->clear();
|
204
|
+
delete (y.second);
|
205
|
+
}
|
206
|
+
x->clear();
|
207
|
+
delete x;
|
208
|
+
}
|
209
|
+
clearPathSegmentChildren(root);
|
210
|
+
}
|
211
|
+
|
212
|
+
void addStrToIndex(std::string filePath, int fileId) {
|
213
|
+
addStrToIndex(filePath, fileId, dirSeparator);
|
214
|
+
}
|
215
|
+
|
216
|
+
/**
|
217
|
+
* Add a string to the index to be search for afterwards
|
218
|
+
*
|
219
|
+
* @param filePath String to index (e.g. /home/user/Project/main.cpp).
|
220
|
+
* @param fileId Unique identifier for filePath. Will be return as result from findSimilar.
|
221
|
+
* @param separator Can be used to split filePath to components (e.g. 'home','user'...). Usually
|
222
|
+
* one of {'\\', '/', '\0' (no separation)}.
|
223
|
+
*/
|
224
|
+
void addStrToIndex(std::string filePath, int fileId, const char &separator) {
|
225
|
+
|
226
|
+
std::vector<std::string> segs;
|
227
|
+
|
228
|
+
if (separator == '\0') {
|
229
|
+
// No separation to directories & files
|
230
|
+
segs = {filePath};
|
231
|
+
} else {
|
232
|
+
// Split path to segments
|
233
|
+
segs = splitString(filePath, separator);
|
234
|
+
}
|
235
|
+
|
236
|
+
PathSegment *prev = NULL;
|
237
|
+
prev = root;
|
238
|
+
// Add segments to a tree type data structure
|
239
|
+
// e.g. addStrToIndex('/foo/bar/file1.txt' ..)
|
240
|
+
// addStrToIndex('/foo/faa/file2.txt' ..)
|
241
|
+
// forms structure:
|
242
|
+
// root -> foo |-> bar -> file1,txt
|
243
|
+
// |-> faa -> file2.txt
|
244
|
+
for (auto _x = segs.begin(); _x != segs.end(); ++_x) {
|
245
|
+
auto x = *_x;
|
246
|
+
PathSegment *p;
|
247
|
+
|
248
|
+
auto it = prev->children.find(x);
|
249
|
+
// this part of the path already exists in the tree
|
250
|
+
if (it != prev->children.end()) {
|
251
|
+
p = it->second;
|
252
|
+
} else {
|
253
|
+
p = new PathSegment(x, fileId);
|
254
|
+
p->parent = prev;
|
255
|
+
// If this is last item in segs
|
256
|
+
if (_x == std::prev(segs.end())) {
|
257
|
+
// therefore, it is a file.
|
258
|
+
p->type = File;
|
259
|
+
seglist[fileId] = p;
|
260
|
+
} else {
|
261
|
+
p->type = Dir;
|
262
|
+
p->fileId = dirId;
|
263
|
+
// Files use user input Id. Directories need to have it generated
|
264
|
+
dirId++;
|
265
|
+
}
|
266
|
+
prev->children[x] = p;
|
267
|
+
addPathSegmentKeys(p);
|
268
|
+
}
|
269
|
+
|
270
|
+
prev = p;
|
271
|
+
}
|
272
|
+
}
|
273
|
+
|
274
|
+
/**
|
275
|
+
* The search will find filepaths similar to the input string
|
276
|
+
|
277
|
+
To be considered a candidate path, the file component of the path (e.g. file.txt)
|
278
|
+
is required to have at least a substring of two characters in common with the
|
279
|
+
query string. If that condition is true, then the directories will also add to the
|
280
|
+
score, although with a smaller weight.
|
281
|
+
|
282
|
+
The similarity measure between query and PathSegment in index
|
283
|
+
works as follows:
|
284
|
+
For each character c in the query string:
|
285
|
+
- find the largest substring in the query which includes the character c and
|
286
|
+
is also included in the PathSegment
|
287
|
+
- take the lenght of that substring as score
|
288
|
+
sum up the scores for each character c and divide by (string length)^2
|
289
|
+
|
290
|
+
For example, if query = "rngnomadriv"
|
291
|
+
and candidate is "./drivers/char/hw_random/nomadik-rng.c", then scores are calculated
|
292
|
+
as follows:
|
293
|
+
rngnomadriv
|
294
|
+
33355555444 (subscores)
|
295
|
+
FFFFFFFFDDD (F=file component, D=dir component)
|
296
|
+
score1=(3+3+3+5+5+5+5+5+(4+4+4)*0.7)
|
297
|
+
|
298
|
+
In final score, give a small penalty for larger candidate filenames:
|
299
|
+
Divide main part of score with (query string length)^2
|
300
|
+
and minor part by (query string length)*(candidate string length)
|
301
|
+
score = score1/(11*11)*0.97 + score1/(11*38)*0.03 = 0.342944
|
302
|
+
|
303
|
+
@param query String to search for inside the index
|
304
|
+
*/
|
305
|
+
|
306
|
+
std::vector<std::pair<float, int>> findSimilar(std::string query, int minChars) {
|
307
|
+
CandMap fileCandMap;
|
308
|
+
CandMap dirCandMap;
|
309
|
+
|
310
|
+
// Find both files and directories that match the input query
|
311
|
+
addToCandMap(fileCandMap, query, filemaps);
|
312
|
+
addToCandMap(dirCandMap, query, dirmaps);
|
313
|
+
|
314
|
+
/* If parent dir of a file matches the input string add the scores of the direcotry to the
|
315
|
+
scores of the file */
|
316
|
+
mergeCandidateMaps(fileCandMap, dirCandMap);
|
317
|
+
|
318
|
+
// Set all candidate pointers to NULL so they won't mess up future searches
|
319
|
+
for (auto seg : segsToClean) {
|
320
|
+
seg->cand = NULL;
|
321
|
+
}
|
322
|
+
segsToClean.clear();
|
323
|
+
|
324
|
+
// Form return result, 2d array with file id's and scores
|
325
|
+
std::vector<std::pair<float, int>> results;
|
326
|
+
for (auto &[fid, cand] : fileCandMap) {
|
327
|
+
std::pair<float, int> v;
|
328
|
+
float sc = cand.getScore();
|
329
|
+
v.first = sc;
|
330
|
+
v.second = fid;
|
331
|
+
results.push_back(v);
|
332
|
+
}
|
333
|
+
// Sort highest score first
|
334
|
+
std::sort(results.begin(), results.end(),
|
335
|
+
[](std::pair<float, int> a, std::pair<float, int> b) { return a.first > b.first; });
|
336
|
+
return results;
|
337
|
+
}
|
338
|
+
|
339
|
+
// Return int64_t representation of the first nchars in str, starting from index i
|
340
|
+
int64_t getKeyAtIdx(std::string str, int i, int nchars) {
|
341
|
+
int64_t key = 0;
|
342
|
+
for (int i_char = 0; i_char < nchars; i_char++) {
|
343
|
+
key = key | static_cast<int>(str[i + i_char]);
|
344
|
+
if (i_char < nchars - 1) {
|
345
|
+
// Shift 8 bits to the left except on the last iteration
|
346
|
+
key = key << 8;
|
347
|
+
}
|
348
|
+
}
|
349
|
+
return key;
|
350
|
+
}
|
351
|
+
|
352
|
+
void debug() {
|
353
|
+
|
354
|
+
int nchars = 3;
|
355
|
+
for (const auto &[key, value] : (*filemaps[nchars])) {
|
356
|
+
int64_t x;
|
357
|
+
x = key;
|
358
|
+
int multip = nchars * 8;
|
359
|
+
for (int i = 0; i <= nchars; i++) {
|
360
|
+
char c = (x >> multip) & 255;
|
361
|
+
std::cout << c;
|
362
|
+
multip -= 8;
|
363
|
+
}
|
364
|
+
std::cout << "\n";
|
365
|
+
// for (auto y : *value) {
|
366
|
+
// std::cout << y << " ";
|
367
|
+
// }
|
368
|
+
// std::cout << "\n";
|
369
|
+
}
|
370
|
+
}
|
371
|
+
|
372
|
+
private:
|
373
|
+
void clearPathSegmentChildren(PathSegment *p) {
|
374
|
+
if (p->children.size() > 0) {
|
375
|
+
for (auto x : p->children) {
|
376
|
+
clearPathSegmentChildren(x.second);
|
377
|
+
}
|
378
|
+
}
|
379
|
+
delete p;
|
380
|
+
}
|
381
|
+
|
382
|
+
void addPathSegmentKeys(PathSegment *p) {
|
383
|
+
// Input p is part of a path, e.g. 'barxyz' if path is /foo/barxyz/baz.txt
|
384
|
+
// This function generates int64 representations (keys) of all substrings of size 2..8 in that
|
385
|
+
// path segment and stores pointer to p in hash tables using these int values as keys.
|
386
|
+
|
387
|
+
int maxChars = 8;
|
388
|
+
int minChars = 2;
|
389
|
+
|
390
|
+
std::string str = p->str;
|
391
|
+
if (p->str.size() < 2) {
|
392
|
+
return;
|
393
|
+
}
|
394
|
+
if (static_cast<int>(p->str.size()) < maxChars) {
|
395
|
+
maxChars = p->str.size();
|
396
|
+
}
|
397
|
+
|
398
|
+
#ifdef _OPENMP
|
399
|
+
#pragma omp parallel for
|
400
|
+
#endif
|
401
|
+
for (int sublen = minChars; sublen <= maxChars; sublen++) {
|
402
|
+
|
403
|
+
SegMap *map;
|
404
|
+
if (p->type == File) {
|
405
|
+
map = filemaps[sublen];
|
406
|
+
} else {
|
407
|
+
map = dirmaps[sublen];
|
408
|
+
}
|
409
|
+
|
410
|
+
int count = str.size() - sublen + 1;
|
411
|
+
|
412
|
+
for (int i = 0; i <= count; i++) {
|
413
|
+
int64_t key = getKeyAtIdx(str, i, sublen);
|
414
|
+
|
415
|
+
// Create a new std::set for key if doesn't exist already
|
416
|
+
auto it = map->find(key);
|
417
|
+
if (it == map->end()) {
|
418
|
+
(*map)[key] = new std::set<PathSegment *>;
|
419
|
+
}
|
420
|
+
(*map)[key]->insert(p);
|
421
|
+
}
|
422
|
+
}
|
423
|
+
}
|
424
|
+
|
425
|
+
// Find pathsegments from <map> that include the substring of <str> which starts at index <i> and
|
426
|
+
// is of length <nchars>.
|
427
|
+
std::vector<PathSegment *> findSimilarForNgram(std::string str, int i, int nchars, SegMap &map) {
|
428
|
+
|
429
|
+
assert(i + nchars <= static_cast<int>(str.size()));
|
430
|
+
std::vector<PathSegment *> res;
|
431
|
+
|
432
|
+
// Take substring of str, starting at i, spanning nchars
|
433
|
+
// transform that to 64 bit integer
|
434
|
+
int64_t key = getKeyAtIdx(str, i, nchars);
|
435
|
+
// Find all path segments in map that have the same substring
|
436
|
+
auto it = map.find(key);
|
437
|
+
if (it != map.end()) { // key found
|
438
|
+
auto set = it->second;
|
439
|
+
for (auto value : *set) {
|
440
|
+
res.push_back(value);
|
441
|
+
}
|
442
|
+
}
|
443
|
+
return res;
|
444
|
+
}
|
445
|
+
|
446
|
+
void addToCandMap(CandMap &candmap, std::string query,
|
447
|
+
std::vector<SegMap *> &map // filemaps or dirmaps
|
448
|
+
) {
|
449
|
+
int maxChars = 8;
|
450
|
+
int minChars = 2;
|
451
|
+
if (static_cast<int>(query.size()) < maxChars) {
|
452
|
+
maxChars = query.size();
|
453
|
+
}
|
454
|
+
|
455
|
+
// Loop all substring lengths between minChars..maxChars
|
456
|
+
for (int sublen = minChars; sublen <= maxChars; sublen++) {
|
457
|
+
int count = query.size() - sublen + 1;
|
458
|
+
|
459
|
+
// Loop all possible start positions
|
460
|
+
for (int i = 0; i < count; i++) {
|
461
|
+
std::vector<PathSegment *> res = findSimilarForNgram(query, i, sublen, *(map[sublen]));
|
462
|
+
|
463
|
+
for (PathSegment *p : res) {
|
464
|
+
addToResults(p, query, i, sublen, candmap);
|
465
|
+
}
|
466
|
+
}
|
467
|
+
}
|
468
|
+
}
|
469
|
+
|
470
|
+
// Add parent directories scores to files
|
471
|
+
void mergeCandidateMaps(CandMap &fileCandMap, CandMap &dirCandMap) {
|
472
|
+
|
473
|
+
for (auto &[fid, cand] : fileCandMap) {
|
474
|
+
PathSegment *p = cand.seg->parent;
|
475
|
+
while (p->parent != NULL) {
|
476
|
+
if (p->cand != NULL) {
|
477
|
+
auto &scoreA = cand.v_charscore;
|
478
|
+
auto &scoreB = p->cand->v_charscore;
|
479
|
+
for (int i = 0; i < cand.len; i++) {
|
480
|
+
if (scoreA[i] < scoreB[i] * dirWeight) {
|
481
|
+
scoreA[i] = scoreB[i] * dirWeight;
|
482
|
+
}
|
483
|
+
}
|
484
|
+
}
|
485
|
+
p = p->parent;
|
486
|
+
}
|
487
|
+
}
|
488
|
+
}
|
489
|
+
|
490
|
+
void addToResults(PathSegment *seg, std::string str, int i, int nchars, CandMap &candmap) {
|
491
|
+
|
492
|
+
auto it2 = candmap.find(seg->fileId);
|
493
|
+
if (it2 == candmap.end()) {
|
494
|
+
Candidate cand(seg, str.size());
|
495
|
+
seg->cand = &(candmap[seg->fileId]);
|
496
|
+
segsToClean.push_back(seg);
|
497
|
+
candmap[seg->fileId] = cand;
|
498
|
+
}
|
499
|
+
|
500
|
+
for (int j = i; j < i + nchars; j++) {
|
501
|
+
if (candmap[seg->fileId][j] < nchars) {
|
502
|
+
candmap[seg->fileId].v_charscore[j] = nchars;
|
503
|
+
}
|
504
|
+
}
|
505
|
+
}
|
506
|
+
};
|