StrIdx 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +504 -0
- data/Makefile +268 -0
- data/README.md +92 -0
- data/demo.cpp +76 -0
- data/rubyext/extconf.rb +16 -0
- data/rubyext/ruby_interf.cpp +79 -0
- data/rubyext/test.rb +34 -0
- data/stridx.hpp +506 -0
- data/unordered_dense.h +2032 -0
- metadata +82 -0
data/stridx.hpp
ADDED
@@ -0,0 +1,506 @@
|
|
1
|
+
|
2
|
+
#include <stdio.h>
|
3
|
+
#include <stdlib.h>
|
4
|
+
#include <cassert>
|
5
|
+
|
6
|
+
#include <vector>
|
7
|
+
#include <iostream>
|
8
|
+
#include <unordered_map>
|
9
|
+
#include <set>
|
10
|
+
#include <algorithm>
|
11
|
+
#include <sstream>
|
12
|
+
|
13
|
+
#ifdef _OPENMP
|
14
|
+
#include <omp.h>
|
15
|
+
#endif
|
16
|
+
|
17
|
+
#include "unordered_dense.h"
|
18
|
+
|
19
|
+
// Splits `input` on every occurrence of `separator` and returns the
// non-empty pieces in order of appearance, e.g.
//   splitString("/foo/bar/file1.txt", '/')
//     => vector{"foo", "bar", "file1.txt"}
// Empty pieces (leading, trailing or doubled separators) are dropped.
std::vector<std::string> splitString(const std::string &input, const char &separator) {
  std::vector<std::string> pieces;

  std::string::size_type begin = 0;
  while (begin < input.size()) {
    // End of the current piece: next separator, or end of input if none.
    std::string::size_type end = input.find(separator, begin);
    if (end == std::string::npos) {
      end = input.size();
    }
    if (end != begin) {
      pieces.push_back(input.substr(begin, end - begin));
    }
    begin = end + 1;
  }

  return pieces;
}
|
35
|
+
|
36
|
+
// Renders `num` as exactly 64 characters of '0'/'1', most significant bit
// first (two's-complement bit pattern for negative values).
std::string int64ToBinaryString(int64_t num) {
  const uint64_t bits = static_cast<uint64_t>(num);
  std::string digits(64, '0');
  for (int pos = 0; pos < 64; ++pos) {
    // digits[pos] corresponds to bit (63 - pos).
    const uint64_t mask = static_cast<uint64_t>(1) << (63 - pos);
    if ((bits & mask) != 0) {
      digits[pos] = '1';
    }
  }
  return digits;
}
|
44
|
+
|
45
|
+
// Convert a (8 char) string represented as int64_t back to std::string.
//
// The most significant byte of `key` becomes the first character and the
// least significant byte the last, so the result always has exactly 8
// characters. Keys built from fewer than 8 characters therefore yield
// leading '\0' characters.
//
// Fix: the previous loop ran nine times starting with a shift of 64 bits,
// which is undefined behaviour for a 64-bit operand and produced a spurious
// extra leading character.
std::string int64ToStr(int64_t key) {
  const int nchars = 8;
  // Work on the unsigned bit pattern so right shifts are well defined
  // regardless of the sign of `key`.
  const uint64_t bits = static_cast<uint64_t>(key);

  std::string str;
  str.reserve(nchars);
  for (int shift = (nchars - 1) * 8; shift >= 0; shift -= 8) {
    str.push_back(static_cast<char>((bits >> shift) & 255));
  }
  return str;
}
|
57
|
+
|
58
|
+
// Writes every element of `vec` to std::cout, each followed by one space.
// No newline is emitted.
void printVector(const std::vector<int> &vec) {
  for (std::vector<int>::size_type idx = 0; idx < vec.size(); ++idx) {
    std::cout << vec[idx] << " ";
  }
}
|
63
|
+
|
64
|
+
// Renders the 8 bits of `num` as a string of '0'/'1' characters, most
// significant bit first.
std::string charToBinaryString(char num) {
  const unsigned int bits = static_cast<unsigned char>(num);

  std::string digits;
  unsigned int mask = 0x80u;
  while (mask != 0) {
    digits.push_back((bits & mask) != 0 ? '1' : '0');
    mask >>= 1;
  }
  return digits;
}
|
71
|
+
|
72
|
+
class Candidate;
|
73
|
+
enum segmentType { Dir, File };
|
74
|
+
|
75
|
+
// A segment of a file path
|
76
|
+
// e.g. if path is /foo/bar/baz.txt
|
77
|
+
// segments are [{root}, foo, bar, baz.txt]
|
78
|
+
class PathSegment {
|
79
|
+
public:
|
80
|
+
std::string str;
|
81
|
+
int fileId; // (if FILE)
|
82
|
+
Candidate *cand;
|
83
|
+
PathSegment *parent;
|
84
|
+
ankerl::unordered_dense::map<std::string, PathSegment *> children;
|
85
|
+
segmentType type = Dir;
|
86
|
+
PathSegment() : parent(NULL) {}
|
87
|
+
PathSegment(std::string _str) : str(_str), parent(NULL) {}
|
88
|
+
PathSegment(std::string _str, int _fileId)
|
89
|
+
: str(_str), fileId(_fileId), cand(NULL), parent(NULL) {}
|
90
|
+
int size() {
|
91
|
+
int sz = str.size();
|
92
|
+
PathSegment *cur = parent;
|
93
|
+
// Sum up length of parent segments (+1 for divisors)
|
94
|
+
while (cur->parent != NULL) {
|
95
|
+
sz += cur->str.size() + 1;
|
96
|
+
cur = cur->parent;
|
97
|
+
}
|
98
|
+
return sz;
|
99
|
+
}
|
100
|
+
};
|
101
|
+
|
102
|
+
// Candidate for result in string (filename) search
|
103
|
+
class Candidate {
|
104
|
+
public:
|
105
|
+
std::vector<float> v_charscore;
|
106
|
+
PathSegment *seg;
|
107
|
+
int fileId;
|
108
|
+
// The string that this candidate represents
|
109
|
+
std::string str;
|
110
|
+
int len; // Query string length
|
111
|
+
|
112
|
+
float minscore;
|
113
|
+
float maxscore;
|
114
|
+
int candLen; // Length of candidate
|
115
|
+
|
116
|
+
Candidate(){};
|
117
|
+
Candidate(int _fileId, std::string _str, int _len) : fileId(_fileId), str(_str), len(_len) {
|
118
|
+
// Initialize v_charscores with zeros
|
119
|
+
v_charscore.resize(len, 0);
|
120
|
+
candLen = str.size();
|
121
|
+
seg = NULL;
|
122
|
+
}
|
123
|
+
|
124
|
+
Candidate(PathSegment *_seg, int _len) : seg(_seg), len(_len) {
|
125
|
+
// Initialize v_charscores with zeros
|
126
|
+
v_charscore.resize(len, 0);
|
127
|
+
candLen = seg->size();
|
128
|
+
}
|
129
|
+
|
130
|
+
float getScore() {
|
131
|
+
int i = 0;
|
132
|
+
float score = 0.0;
|
133
|
+
candLen = seg->size();
|
134
|
+
|
135
|
+
for (float &charscore : v_charscore) {
|
136
|
+
score += charscore;
|
137
|
+
i++;
|
138
|
+
}
|
139
|
+
float div = len * len;
|
140
|
+
float div2 = len * candLen;
|
141
|
+
float score1 = score / div;
|
142
|
+
float score2 = score / div2;
|
143
|
+
|
144
|
+
score = score1 * 0.97 + score2 * 0.03;
|
145
|
+
return score;
|
146
|
+
}
|
147
|
+
|
148
|
+
float operator[](int idx) { return v_charscore[idx]; }
|
149
|
+
};
|
150
|
+
|
151
|
+
// This seems to give 10x speed improvement over std::unordered_map
|
152
|
+
typedef ankerl::unordered_dense::map<int64_t, std::set<PathSegment *> *> SegMap;
|
153
|
+
// typedef std::unordered_map<int64_t, std::set<PathSegment *> *> SegMap;
|
154
|
+
|
155
|
+
typedef std::unordered_map<float, Candidate> CandMap;
|
156
|
+
|
157
|
+
class StringIndex {
|
158
|
+
private:
|
159
|
+
int tmp;
|
160
|
+
char dirSeparator = '/'; // Usually '/', '\' or '\0' (no separator)
|
161
|
+
|
162
|
+
std::vector<SegMap *> dirmaps;
|
163
|
+
std::vector<SegMap *> filemaps;
|
164
|
+
|
165
|
+
std::vector<PathSegment *> segsToClean;
|
166
|
+
|
167
|
+
std::unordered_map<int, std::string> strlist;
|
168
|
+
std::unordered_map<int, PathSegment *> seglist;
|
169
|
+
PathSegment *root;
|
170
|
+
int dirId = 0;
|
171
|
+
float dirWeight = 0.7; // Give only 70% of score if match is for a directory
|
172
|
+
|
173
|
+
public:
|
174
|
+
StringIndex() {
|
175
|
+
root = new PathSegment();
|
176
|
+
root->parent = NULL;
|
177
|
+
root->str = "[ROOT]";
|
178
|
+
|
179
|
+
for (int i = 0; i <= 8; i++) {
|
180
|
+
dirmaps.push_back(new SegMap);
|
181
|
+
filemaps.push_back(new SegMap);
|
182
|
+
}
|
183
|
+
|
184
|
+
#ifdef _OPENMP
|
185
|
+
std::cout << "OPENMP enabled\n";
|
186
|
+
#endif
|
187
|
+
}
|
188
|
+
|
189
|
+
void setDirSeparator(char sep) { dirSeparator = sep; }
|
190
|
+
void setDirWeight(float val) { dirWeight = val; }
|
191
|
+
|
192
|
+
~StringIndex() {
|
193
|
+
for (auto x : dirmaps) {
|
194
|
+
for (auto y : *x) {
|
195
|
+
y.second->clear();
|
196
|
+
delete (y.second);
|
197
|
+
}
|
198
|
+
x->clear();
|
199
|
+
delete x;
|
200
|
+
}
|
201
|
+
for (auto x : filemaps) {
|
202
|
+
for (auto y : *x) {
|
203
|
+
y.second->clear();
|
204
|
+
delete (y.second);
|
205
|
+
}
|
206
|
+
x->clear();
|
207
|
+
delete x;
|
208
|
+
}
|
209
|
+
clearPathSegmentChildren(root);
|
210
|
+
}
|
211
|
+
|
212
|
+
void addStrToIndex(std::string filePath, int fileId) {
|
213
|
+
addStrToIndex(filePath, fileId, dirSeparator);
|
214
|
+
}
|
215
|
+
|
216
|
+
/**
|
217
|
+
* Add a string to the index to be search for afterwards
|
218
|
+
*
|
219
|
+
* @param filePath String to index (e.g. /home/user/Project/main.cpp).
|
220
|
+
* @param fileId Unique identifier for filePath. Will be return as result from findSimilar.
|
221
|
+
* @param separator Can be used to split filePath to components (e.g. 'home','user'...). Usually
|
222
|
+
* one of {'\\', '/', '\0' (no separation)}.
|
223
|
+
*/
|
224
|
+
void addStrToIndex(std::string filePath, int fileId, const char &separator) {
|
225
|
+
|
226
|
+
std::vector<std::string> segs;
|
227
|
+
|
228
|
+
if (separator == '\0') {
|
229
|
+
// No separation to directories & files
|
230
|
+
segs = {filePath};
|
231
|
+
} else {
|
232
|
+
// Split path to segments
|
233
|
+
segs = splitString(filePath, separator);
|
234
|
+
}
|
235
|
+
|
236
|
+
PathSegment *prev = NULL;
|
237
|
+
prev = root;
|
238
|
+
// Add segments to a tree type data structure
|
239
|
+
// e.g. addStrToIndex('/foo/bar/file1.txt' ..)
|
240
|
+
// addStrToIndex('/foo/faa/file2.txt' ..)
|
241
|
+
// forms structure:
|
242
|
+
// root -> foo |-> bar -> file1,txt
|
243
|
+
// |-> faa -> file2.txt
|
244
|
+
for (auto _x = segs.begin(); _x != segs.end(); ++_x) {
|
245
|
+
auto x = *_x;
|
246
|
+
PathSegment *p;
|
247
|
+
|
248
|
+
auto it = prev->children.find(x);
|
249
|
+
// this part of the path already exists in the tree
|
250
|
+
if (it != prev->children.end()) {
|
251
|
+
p = it->second;
|
252
|
+
} else {
|
253
|
+
p = new PathSegment(x, fileId);
|
254
|
+
p->parent = prev;
|
255
|
+
// If this is last item in segs
|
256
|
+
if (_x == std::prev(segs.end())) {
|
257
|
+
// therefore, it is a file.
|
258
|
+
p->type = File;
|
259
|
+
seglist[fileId] = p;
|
260
|
+
} else {
|
261
|
+
p->type = Dir;
|
262
|
+
p->fileId = dirId;
|
263
|
+
// Files use user input Id. Directories need to have it generated
|
264
|
+
dirId++;
|
265
|
+
}
|
266
|
+
prev->children[x] = p;
|
267
|
+
addPathSegmentKeys(p);
|
268
|
+
}
|
269
|
+
|
270
|
+
prev = p;
|
271
|
+
}
|
272
|
+
}
|
273
|
+
|
274
|
+
/**
|
275
|
+
* The search will find filepaths similar to the input string
|
276
|
+
|
277
|
+
To be considered a candidate path, the file component of the path (e.g. file.txt)
|
278
|
+
is required to have at least a substring of two characters in common with the
|
279
|
+
query string. If that condition is true, then the directories will also add to the
|
280
|
+
score, although with a smaller weight.
|
281
|
+
|
282
|
+
The similarity measure between query and PathSegment in index
|
283
|
+
works as follows:
|
284
|
+
For each character c in the query string:
|
285
|
+
- find the largest substring in the query which includes the character c and
|
286
|
+
is also included in the PathSegment
|
287
|
+
- take the lenght of that substring as score
|
288
|
+
sum up the scores for each character c and divide by (string length)^2
|
289
|
+
|
290
|
+
For example, if query = "rngnomadriv"
|
291
|
+
and candidate is "./drivers/char/hw_random/nomadik-rng.c", then scores are calculated
|
292
|
+
as follows:
|
293
|
+
rngnomadriv
|
294
|
+
33355555444 (subscores)
|
295
|
+
FFFFFFFFDDD (F=file component, D=dir component)
|
296
|
+
score1=(3+3+3+5+5+5+5+5+(4+4+4)*0.7)
|
297
|
+
|
298
|
+
In final score, give a small penalty for larger candidate filenames:
|
299
|
+
Divide main part of score with (query string length)^2
|
300
|
+
and minor part by (query string length)*(candidate string length)
|
301
|
+
score = score1/(11*11)*0.97 + score1/(11*38)*0.03 = 0.342944
|
302
|
+
|
303
|
+
@param query String to search for inside the index
|
304
|
+
*/
|
305
|
+
|
306
|
+
std::vector<std::pair<float, int>> findSimilar(std::string query, int minChars) {
|
307
|
+
CandMap fileCandMap;
|
308
|
+
CandMap dirCandMap;
|
309
|
+
|
310
|
+
// Find both files and directories that match the input query
|
311
|
+
addToCandMap(fileCandMap, query, filemaps);
|
312
|
+
addToCandMap(dirCandMap, query, dirmaps);
|
313
|
+
|
314
|
+
/* If parent dir of a file matches the input string add the scores of the direcotry to the
|
315
|
+
scores of the file */
|
316
|
+
mergeCandidateMaps(fileCandMap, dirCandMap);
|
317
|
+
|
318
|
+
// Set all candidate pointers to NULL so they won't mess up future searches
|
319
|
+
for (auto seg : segsToClean) {
|
320
|
+
seg->cand = NULL;
|
321
|
+
}
|
322
|
+
segsToClean.clear();
|
323
|
+
|
324
|
+
// Form return result, 2d array with file id's and scores
|
325
|
+
std::vector<std::pair<float, int>> results;
|
326
|
+
for (auto &[fid, cand] : fileCandMap) {
|
327
|
+
std::pair<float, int> v;
|
328
|
+
float sc = cand.getScore();
|
329
|
+
v.first = sc;
|
330
|
+
v.second = fid;
|
331
|
+
results.push_back(v);
|
332
|
+
}
|
333
|
+
// Sort highest score first
|
334
|
+
std::sort(results.begin(), results.end(),
|
335
|
+
[](std::pair<float, int> a, std::pair<float, int> b) { return a.first > b.first; });
|
336
|
+
return results;
|
337
|
+
}
|
338
|
+
|
339
|
+
// Return int64_t representation of the first nchars in str, starting from index i
|
340
|
+
int64_t getKeyAtIdx(std::string str, int i, int nchars) {
|
341
|
+
int64_t key = 0;
|
342
|
+
for (int i_char = 0; i_char < nchars; i_char++) {
|
343
|
+
key = key | static_cast<int>(str[i + i_char]);
|
344
|
+
if (i_char < nchars - 1) {
|
345
|
+
// Shift 8 bits to the left except on the last iteration
|
346
|
+
key = key << 8;
|
347
|
+
}
|
348
|
+
}
|
349
|
+
return key;
|
350
|
+
}
|
351
|
+
|
352
|
+
void debug() {
|
353
|
+
|
354
|
+
int nchars = 3;
|
355
|
+
for (const auto &[key, value] : (*filemaps[nchars])) {
|
356
|
+
int64_t x;
|
357
|
+
x = key;
|
358
|
+
int multip = nchars * 8;
|
359
|
+
for (int i = 0; i <= nchars; i++) {
|
360
|
+
char c = (x >> multip) & 255;
|
361
|
+
std::cout << c;
|
362
|
+
multip -= 8;
|
363
|
+
}
|
364
|
+
std::cout << "\n";
|
365
|
+
// for (auto y : *value) {
|
366
|
+
// std::cout << y << " ";
|
367
|
+
// }
|
368
|
+
// std::cout << "\n";
|
369
|
+
}
|
370
|
+
}
|
371
|
+
|
372
|
+
private:
|
373
|
+
void clearPathSegmentChildren(PathSegment *p) {
|
374
|
+
if (p->children.size() > 0) {
|
375
|
+
for (auto x : p->children) {
|
376
|
+
clearPathSegmentChildren(x.second);
|
377
|
+
}
|
378
|
+
}
|
379
|
+
delete p;
|
380
|
+
}
|
381
|
+
|
382
|
+
void addPathSegmentKeys(PathSegment *p) {
|
383
|
+
// Input p is part of a path, e.g. 'barxyz' if path is /foo/barxyz/baz.txt
|
384
|
+
// This function generates int64 representations (keys) of all substrings of size 2..8 in that
|
385
|
+
// path segment and stores pointer to p in hash tables using these int values as keys.
|
386
|
+
|
387
|
+
int maxChars = 8;
|
388
|
+
int minChars = 2;
|
389
|
+
|
390
|
+
std::string str = p->str;
|
391
|
+
if (p->str.size() < 2) {
|
392
|
+
return;
|
393
|
+
}
|
394
|
+
if (static_cast<int>(p->str.size()) < maxChars) {
|
395
|
+
maxChars = p->str.size();
|
396
|
+
}
|
397
|
+
|
398
|
+
#ifdef _OPENMP
|
399
|
+
#pragma omp parallel for
|
400
|
+
#endif
|
401
|
+
for (int sublen = minChars; sublen <= maxChars; sublen++) {
|
402
|
+
|
403
|
+
SegMap *map;
|
404
|
+
if (p->type == File) {
|
405
|
+
map = filemaps[sublen];
|
406
|
+
} else {
|
407
|
+
map = dirmaps[sublen];
|
408
|
+
}
|
409
|
+
|
410
|
+
int count = str.size() - sublen + 1;
|
411
|
+
|
412
|
+
for (int i = 0; i <= count; i++) {
|
413
|
+
int64_t key = getKeyAtIdx(str, i, sublen);
|
414
|
+
|
415
|
+
// Create a new std::set for key if doesn't exist already
|
416
|
+
auto it = map->find(key);
|
417
|
+
if (it == map->end()) {
|
418
|
+
(*map)[key] = new std::set<PathSegment *>;
|
419
|
+
}
|
420
|
+
(*map)[key]->insert(p);
|
421
|
+
}
|
422
|
+
}
|
423
|
+
}
|
424
|
+
|
425
|
+
// Find pathsegments from <map> that include the substring of <str> which starts at index <i> and
|
426
|
+
// is of length <nchars>.
|
427
|
+
std::vector<PathSegment *> findSimilarForNgram(std::string str, int i, int nchars, SegMap &map) {
|
428
|
+
|
429
|
+
assert(i + nchars <= static_cast<int>(str.size()));
|
430
|
+
std::vector<PathSegment *> res;
|
431
|
+
|
432
|
+
// Take substring of str, starting at i, spanning nchars
|
433
|
+
// transform that to 64 bit integer
|
434
|
+
int64_t key = getKeyAtIdx(str, i, nchars);
|
435
|
+
// Find all path segments in map that have the same substring
|
436
|
+
auto it = map.find(key);
|
437
|
+
if (it != map.end()) { // key found
|
438
|
+
auto set = it->second;
|
439
|
+
for (auto value : *set) {
|
440
|
+
res.push_back(value);
|
441
|
+
}
|
442
|
+
}
|
443
|
+
return res;
|
444
|
+
}
|
445
|
+
|
446
|
+
void addToCandMap(CandMap &candmap, std::string query,
|
447
|
+
std::vector<SegMap *> &map // filemaps or dirmaps
|
448
|
+
) {
|
449
|
+
int maxChars = 8;
|
450
|
+
int minChars = 2;
|
451
|
+
if (static_cast<int>(query.size()) < maxChars) {
|
452
|
+
maxChars = query.size();
|
453
|
+
}
|
454
|
+
|
455
|
+
// Loop all substring lengths between minChars..maxChars
|
456
|
+
for (int sublen = minChars; sublen <= maxChars; sublen++) {
|
457
|
+
int count = query.size() - sublen + 1;
|
458
|
+
|
459
|
+
// Loop all possible start positions
|
460
|
+
for (int i = 0; i < count; i++) {
|
461
|
+
std::vector<PathSegment *> res = findSimilarForNgram(query, i, sublen, *(map[sublen]));
|
462
|
+
|
463
|
+
for (PathSegment *p : res) {
|
464
|
+
addToResults(p, query, i, sublen, candmap);
|
465
|
+
}
|
466
|
+
}
|
467
|
+
}
|
468
|
+
}
|
469
|
+
|
470
|
+
// Add parent directories scores to files
|
471
|
+
void mergeCandidateMaps(CandMap &fileCandMap, CandMap &dirCandMap) {
|
472
|
+
|
473
|
+
for (auto &[fid, cand] : fileCandMap) {
|
474
|
+
PathSegment *p = cand.seg->parent;
|
475
|
+
while (p->parent != NULL) {
|
476
|
+
if (p->cand != NULL) {
|
477
|
+
auto &scoreA = cand.v_charscore;
|
478
|
+
auto &scoreB = p->cand->v_charscore;
|
479
|
+
for (int i = 0; i < cand.len; i++) {
|
480
|
+
if (scoreA[i] < scoreB[i] * dirWeight) {
|
481
|
+
scoreA[i] = scoreB[i] * dirWeight;
|
482
|
+
}
|
483
|
+
}
|
484
|
+
}
|
485
|
+
p = p->parent;
|
486
|
+
}
|
487
|
+
}
|
488
|
+
}
|
489
|
+
|
490
|
+
void addToResults(PathSegment *seg, std::string str, int i, int nchars, CandMap &candmap) {
|
491
|
+
|
492
|
+
auto it2 = candmap.find(seg->fileId);
|
493
|
+
if (it2 == candmap.end()) {
|
494
|
+
Candidate cand(seg, str.size());
|
495
|
+
seg->cand = &(candmap[seg->fileId]);
|
496
|
+
segsToClean.push_back(seg);
|
497
|
+
candmap[seg->fileId] = cand;
|
498
|
+
}
|
499
|
+
|
500
|
+
for (int j = i; j < i + nchars; j++) {
|
501
|
+
if (candmap[seg->fileId][j] < nchars) {
|
502
|
+
candmap[seg->fileId].v_charscore[j] = nchars;
|
503
|
+
}
|
504
|
+
}
|
505
|
+
}
|
506
|
+
};
|