StrIdx 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/stridx.hpp ADDED
@@ -0,0 +1,506 @@
1
+
2
#include <stdio.h>
#include <stdlib.h>

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iostream>
#include <set>
#include <sstream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#ifdef _OPENMP
#include <omp.h>
#endif

#include "unordered_dense.h"
18
+
19
// Splits `input` at every occurrence of `separator` and returns the non-empty
// pieces in their original order. Empty pieces (produced by leading, trailing
// or repeated separators) are dropped:
//   '/foo/bar/file1.txt' with separator '/'
//   => vector{"foo", "bar", "file1.txt"}
//
// Declared inline because the definition lives in a header; a non-inline
// definition breaks the one-definition rule as soon as the header is included
// from more than one translation unit.
inline std::vector<std::string> splitString(const std::string &input, const char &separator) {
  std::vector<std::string> result;

  std::string::size_type start = 0;
  while (start < input.size()) {
    std::string::size_type end = input.find(separator, start);
    if (end == std::string::npos) {
      end = input.size();
    }
    // Skip empty pieces
    if (end > start) {
      result.emplace_back(input, start, end - start);
    }
    start = end + 1;
  }

  return result;
}
35
+
36
// Convert int64_t to a 64 character string of '0'/'1', most significant bit
// first. Negative values are shown in their two's complement bit pattern.
//
// Declared inline because the definition lives in a header.
inline std::string int64ToBinaryString(int64_t num) {
  // Shift an unsigned copy: right-shifting a negative signed value is
  // implementation-defined before C++20.
  const uint64_t bits = static_cast<uint64_t>(num);
  std::string result;
  result.reserve(64);
  for (int i = 63; i >= 0; --i) {
    result += ((bits >> i) & 1u) ? '1' : '0';
  }
  return result;
}
44
+
45
// Convert a (8 char) string represented as int64_t to std::string.
// The most significant byte of `key` becomes the first character, so the
// result always has exactly 8 characters; keys built from shorter strings
// yield leading '\0' characters.
//
// Declared inline because the definition lives in a header.
inline std::string int64ToStr(int64_t key) {
  const int nchars = 8;
  const uint64_t bits = static_cast<uint64_t>(key);
  std::string str;
  str.reserve(nchars);
  // Start at the top byte (shift 56). Starting at nchars * 8 = 64 would shift
  // by the full width of the type, which is undefined behaviour, and would
  // emit one character too many.
  for (int shift = (nchars - 1) * 8; shift >= 0; shift -= 8) {
    str.push_back(static_cast<char>((bits >> shift) & 0xFF));
  }
  return str;
}
57
+
58
// Print every element of `vec` to stdout, each followed by a single space.
// No newline is written.
//
// Declared inline because the definition lives in a header.
inline void printVector(const std::vector<int> &vec) {
  for (const int value : vec) {
    std::cout << value << " ";
  }
}
63
+
64
// Convert a char to an 8 character string of '0'/'1', most significant bit
// first.
//
// Declared inline because the definition lives in a header.
inline std::string charToBinaryString(char num) {
  // Work on the unsigned byte value so the shifts never touch a negative int
  const unsigned int bits = static_cast<unsigned char>(num);
  std::string result;
  result.reserve(8);
  for (int i = 7; i >= 0; --i) {
    result += ((bits >> i) & 1u) ? '1' : '0';
  }
  return result;
}
71
+
72
// Forward declaration; PathSegment keeps a pointer to a Candidate.
class Candidate;

// Kind of a path segment: a directory component or the final file component.
enum segmentType {
  Dir,
  File
};
74
+
75
+ // A segment of a file path
76
+ // e.g. if path is /foo/bar/baz.txt
77
+ // segments are [{root}, foo, bar, baz.txt]
78
+ class PathSegment {
79
+ public:
80
+ std::string str;
81
+ int fileId; // (if FILE)
82
+ Candidate *cand;
83
+ PathSegment *parent;
84
+ ankerl::unordered_dense::map<std::string, PathSegment *> children;
85
+ segmentType type = Dir;
86
+ PathSegment() : parent(NULL) {}
87
+ PathSegment(std::string _str) : str(_str), parent(NULL) {}
88
+ PathSegment(std::string _str, int _fileId)
89
+ : str(_str), fileId(_fileId), cand(NULL), parent(NULL) {}
90
+ int size() {
91
+ int sz = str.size();
92
+ PathSegment *cur = parent;
93
+ // Sum up length of parent segments (+1 for divisors)
94
+ while (cur->parent != NULL) {
95
+ sz += cur->str.size() + 1;
96
+ cur = cur->parent;
97
+ }
98
+ return sz;
99
+ }
100
+ };
101
+
102
+ // Candidate for result in string (filename) search
103
+ class Candidate {
104
+ public:
105
+ std::vector<float> v_charscore;
106
+ PathSegment *seg;
107
+ int fileId;
108
+ // The string that this candidate represents
109
+ std::string str;
110
+ int len; // Query string length
111
+
112
+ float minscore;
113
+ float maxscore;
114
+ int candLen; // Length of candidate
115
+
116
+ Candidate(){};
117
+ Candidate(int _fileId, std::string _str, int _len) : fileId(_fileId), str(_str), len(_len) {
118
+ // Initialize v_charscores with zeros
119
+ v_charscore.resize(len, 0);
120
+ candLen = str.size();
121
+ seg = NULL;
122
+ }
123
+
124
+ Candidate(PathSegment *_seg, int _len) : seg(_seg), len(_len) {
125
+ // Initialize v_charscores with zeros
126
+ v_charscore.resize(len, 0);
127
+ candLen = seg->size();
128
+ }
129
+
130
+ float getScore() {
131
+ int i = 0;
132
+ float score = 0.0;
133
+ candLen = seg->size();
134
+
135
+ for (float &charscore : v_charscore) {
136
+ score += charscore;
137
+ i++;
138
+ }
139
+ float div = len * len;
140
+ float div2 = len * candLen;
141
+ float score1 = score / div;
142
+ float score2 = score / div2;
143
+
144
+ score = score1 * 0.97 + score2 * 0.03;
145
+ return score;
146
+ }
147
+
148
+ float operator[](int idx) { return v_charscore[idx]; }
149
+ };
150
+
151
+ // This seems to give 10x speed improvement over std::unordered_map
152
+ typedef ankerl::unordered_dense::map<int64_t, std::set<PathSegment *> *> SegMap;
153
+ // typedef std::unordered_map<int64_t, std::set<PathSegment *> *> SegMap;
154
+
155
+ typedef std::unordered_map<float, Candidate> CandMap;
156
+
157
+ class StringIndex {
158
+ private:
159
+ int tmp;
160
+ char dirSeparator = '/'; // Usually '/', '\' or '\0' (no separator)
161
+
162
+ std::vector<SegMap *> dirmaps;
163
+ std::vector<SegMap *> filemaps;
164
+
165
+ std::vector<PathSegment *> segsToClean;
166
+
167
+ std::unordered_map<int, std::string> strlist;
168
+ std::unordered_map<int, PathSegment *> seglist;
169
+ PathSegment *root;
170
+ int dirId = 0;
171
+ float dirWeight = 0.7; // Give only 70% of score if match is for a directory
172
+
173
+ public:
174
+ StringIndex() {
175
+ root = new PathSegment();
176
+ root->parent = NULL;
177
+ root->str = "[ROOT]";
178
+
179
+ for (int i = 0; i <= 8; i++) {
180
+ dirmaps.push_back(new SegMap);
181
+ filemaps.push_back(new SegMap);
182
+ }
183
+
184
+ #ifdef _OPENMP
185
+ std::cout << "OPENMP enabled\n";
186
+ #endif
187
+ }
188
+
189
+ void setDirSeparator(char sep) { dirSeparator = sep; }
190
+ void setDirWeight(float val) { dirWeight = val; }
191
+
192
+ ~StringIndex() {
193
+ for (auto x : dirmaps) {
194
+ for (auto y : *x) {
195
+ y.second->clear();
196
+ delete (y.second);
197
+ }
198
+ x->clear();
199
+ delete x;
200
+ }
201
+ for (auto x : filemaps) {
202
+ for (auto y : *x) {
203
+ y.second->clear();
204
+ delete (y.second);
205
+ }
206
+ x->clear();
207
+ delete x;
208
+ }
209
+ clearPathSegmentChildren(root);
210
+ }
211
+
212
+ void addStrToIndex(std::string filePath, int fileId) {
213
+ addStrToIndex(filePath, fileId, dirSeparator);
214
+ }
215
+
216
+ /**
217
+ * Add a string to the index to be search for afterwards
218
+ *
219
+ * @param filePath String to index (e.g. /home/user/Project/main.cpp).
220
+ * @param fileId Unique identifier for filePath. Will be return as result from findSimilar.
221
+ * @param separator Can be used to split filePath to components (e.g. 'home','user'...). Usually
222
+ * one of {'\\', '/', '\0' (no separation)}.
223
+ */
224
+ void addStrToIndex(std::string filePath, int fileId, const char &separator) {
225
+
226
+ std::vector<std::string> segs;
227
+
228
+ if (separator == '\0') {
229
+ // No separation to directories & files
230
+ segs = {filePath};
231
+ } else {
232
+ // Split path to segments
233
+ segs = splitString(filePath, separator);
234
+ }
235
+
236
+ PathSegment *prev = NULL;
237
+ prev = root;
238
+ // Add segments to a tree type data structure
239
+ // e.g. addStrToIndex('/foo/bar/file1.txt' ..)
240
+ // addStrToIndex('/foo/faa/file2.txt' ..)
241
+ // forms structure:
242
+ // root -> foo |-> bar -> file1,txt
243
+ // |-> faa -> file2.txt
244
+ for (auto _x = segs.begin(); _x != segs.end(); ++_x) {
245
+ auto x = *_x;
246
+ PathSegment *p;
247
+
248
+ auto it = prev->children.find(x);
249
+ // this part of the path already exists in the tree
250
+ if (it != prev->children.end()) {
251
+ p = it->second;
252
+ } else {
253
+ p = new PathSegment(x, fileId);
254
+ p->parent = prev;
255
+ // If this is last item in segs
256
+ if (_x == std::prev(segs.end())) {
257
+ // therefore, it is a file.
258
+ p->type = File;
259
+ seglist[fileId] = p;
260
+ } else {
261
+ p->type = Dir;
262
+ p->fileId = dirId;
263
+ // Files use user input Id. Directories need to have it generated
264
+ dirId++;
265
+ }
266
+ prev->children[x] = p;
267
+ addPathSegmentKeys(p);
268
+ }
269
+
270
+ prev = p;
271
+ }
272
+ }
273
+
274
+ /**
275
+ * The search will find filepaths similar to the input string
276
+
277
+ To be considered a candidate path, the file component of the path (e.g. file.txt)
278
+ is required to have at least a substring of two characters in common with the
279
+ query string. If that condition is true, then the directories will also add to the
280
+ score, although with a smaller weight.
281
+
282
+ The similarity measure between query and PathSegment in index
283
+ works as follows:
284
+ For each character c in the query string:
285
+ - find the largest substring in the query which includes the character c and
286
+ is also included in the PathSegment
287
+ - take the lenght of that substring as score
288
+ sum up the scores for each character c and divide by (string length)^2
289
+
290
+ For example, if query = "rngnomadriv"
291
+ and candidate is "./drivers/char/hw_random/nomadik-rng.c", then scores are calculated
292
+ as follows:
293
+ rngnomadriv
294
+ 33355555444 (subscores)
295
+ FFFFFFFFDDD (F=file component, D=dir component)
296
+ score1=(3+3+3+5+5+5+5+5+(4+4+4)*0.7)
297
+
298
+ In final score, give a small penalty for larger candidate filenames:
299
+ Divide main part of score with (query string length)^2
300
+ and minor part by (query string length)*(candidate string length)
301
+ score = score1/(11*11)*0.97 + score1/(11*38)*0.03 = 0.342944
302
+
303
+ @param query String to search for inside the index
304
+ */
305
+
306
+ std::vector<std::pair<float, int>> findSimilar(std::string query, int minChars) {
307
+ CandMap fileCandMap;
308
+ CandMap dirCandMap;
309
+
310
+ // Find both files and directories that match the input query
311
+ addToCandMap(fileCandMap, query, filemaps);
312
+ addToCandMap(dirCandMap, query, dirmaps);
313
+
314
+ /* If parent dir of a file matches the input string add the scores of the direcotry to the
315
+ scores of the file */
316
+ mergeCandidateMaps(fileCandMap, dirCandMap);
317
+
318
+ // Set all candidate pointers to NULL so they won't mess up future searches
319
+ for (auto seg : segsToClean) {
320
+ seg->cand = NULL;
321
+ }
322
+ segsToClean.clear();
323
+
324
+ // Form return result, 2d array with file id's and scores
325
+ std::vector<std::pair<float, int>> results;
326
+ for (auto &[fid, cand] : fileCandMap) {
327
+ std::pair<float, int> v;
328
+ float sc = cand.getScore();
329
+ v.first = sc;
330
+ v.second = fid;
331
+ results.push_back(v);
332
+ }
333
+ // Sort highest score first
334
+ std::sort(results.begin(), results.end(),
335
+ [](std::pair<float, int> a, std::pair<float, int> b) { return a.first > b.first; });
336
+ return results;
337
+ }
338
+
339
+ // Return int64_t representation of the first nchars in str, starting from index i
340
+ int64_t getKeyAtIdx(std::string str, int i, int nchars) {
341
+ int64_t key = 0;
342
+ for (int i_char = 0; i_char < nchars; i_char++) {
343
+ key = key | static_cast<int>(str[i + i_char]);
344
+ if (i_char < nchars - 1) {
345
+ // Shift 8 bits to the left except on the last iteration
346
+ key = key << 8;
347
+ }
348
+ }
349
+ return key;
350
+ }
351
+
352
+ void debug() {
353
+
354
+ int nchars = 3;
355
+ for (const auto &[key, value] : (*filemaps[nchars])) {
356
+ int64_t x;
357
+ x = key;
358
+ int multip = nchars * 8;
359
+ for (int i = 0; i <= nchars; i++) {
360
+ char c = (x >> multip) & 255;
361
+ std::cout << c;
362
+ multip -= 8;
363
+ }
364
+ std::cout << "\n";
365
+ // for (auto y : *value) {
366
+ // std::cout << y << " ";
367
+ // }
368
+ // std::cout << "\n";
369
+ }
370
+ }
371
+
372
+ private:
373
+ void clearPathSegmentChildren(PathSegment *p) {
374
+ if (p->children.size() > 0) {
375
+ for (auto x : p->children) {
376
+ clearPathSegmentChildren(x.second);
377
+ }
378
+ }
379
+ delete p;
380
+ }
381
+
382
+ void addPathSegmentKeys(PathSegment *p) {
383
+ // Input p is part of a path, e.g. 'barxyz' if path is /foo/barxyz/baz.txt
384
+ // This function generates int64 representations (keys) of all substrings of size 2..8 in that
385
+ // path segment and stores pointer to p in hash tables using these int values as keys.
386
+
387
+ int maxChars = 8;
388
+ int minChars = 2;
389
+
390
+ std::string str = p->str;
391
+ if (p->str.size() < 2) {
392
+ return;
393
+ }
394
+ if (static_cast<int>(p->str.size()) < maxChars) {
395
+ maxChars = p->str.size();
396
+ }
397
+
398
+ #ifdef _OPENMP
399
+ #pragma omp parallel for
400
+ #endif
401
+ for (int sublen = minChars; sublen <= maxChars; sublen++) {
402
+
403
+ SegMap *map;
404
+ if (p->type == File) {
405
+ map = filemaps[sublen];
406
+ } else {
407
+ map = dirmaps[sublen];
408
+ }
409
+
410
+ int count = str.size() - sublen + 1;
411
+
412
+ for (int i = 0; i <= count; i++) {
413
+ int64_t key = getKeyAtIdx(str, i, sublen);
414
+
415
+ // Create a new std::set for key if doesn't exist already
416
+ auto it = map->find(key);
417
+ if (it == map->end()) {
418
+ (*map)[key] = new std::set<PathSegment *>;
419
+ }
420
+ (*map)[key]->insert(p);
421
+ }
422
+ }
423
+ }
424
+
425
+ // Find pathsegments from <map> that include the substring of <str> which starts at index <i> and
426
+ // is of length <nchars>.
427
+ std::vector<PathSegment *> findSimilarForNgram(std::string str, int i, int nchars, SegMap &map) {
428
+
429
+ assert(i + nchars <= static_cast<int>(str.size()));
430
+ std::vector<PathSegment *> res;
431
+
432
+ // Take substring of str, starting at i, spanning nchars
433
+ // transform that to 64 bit integer
434
+ int64_t key = getKeyAtIdx(str, i, nchars);
435
+ // Find all path segments in map that have the same substring
436
+ auto it = map.find(key);
437
+ if (it != map.end()) { // key found
438
+ auto set = it->second;
439
+ for (auto value : *set) {
440
+ res.push_back(value);
441
+ }
442
+ }
443
+ return res;
444
+ }
445
+
446
+ void addToCandMap(CandMap &candmap, std::string query,
447
+ std::vector<SegMap *> &map // filemaps or dirmaps
448
+ ) {
449
+ int maxChars = 8;
450
+ int minChars = 2;
451
+ if (static_cast<int>(query.size()) < maxChars) {
452
+ maxChars = query.size();
453
+ }
454
+
455
+ // Loop all substring lengths between minChars..maxChars
456
+ for (int sublen = minChars; sublen <= maxChars; sublen++) {
457
+ int count = query.size() - sublen + 1;
458
+
459
+ // Loop all possible start positions
460
+ for (int i = 0; i < count; i++) {
461
+ std::vector<PathSegment *> res = findSimilarForNgram(query, i, sublen, *(map[sublen]));
462
+
463
+ for (PathSegment *p : res) {
464
+ addToResults(p, query, i, sublen, candmap);
465
+ }
466
+ }
467
+ }
468
+ }
469
+
470
+ // Add parent directories scores to files
471
+ void mergeCandidateMaps(CandMap &fileCandMap, CandMap &dirCandMap) {
472
+
473
+ for (auto &[fid, cand] : fileCandMap) {
474
+ PathSegment *p = cand.seg->parent;
475
+ while (p->parent != NULL) {
476
+ if (p->cand != NULL) {
477
+ auto &scoreA = cand.v_charscore;
478
+ auto &scoreB = p->cand->v_charscore;
479
+ for (int i = 0; i < cand.len; i++) {
480
+ if (scoreA[i] < scoreB[i] * dirWeight) {
481
+ scoreA[i] = scoreB[i] * dirWeight;
482
+ }
483
+ }
484
+ }
485
+ p = p->parent;
486
+ }
487
+ }
488
+ }
489
+
490
+ void addToResults(PathSegment *seg, std::string str, int i, int nchars, CandMap &candmap) {
491
+
492
+ auto it2 = candmap.find(seg->fileId);
493
+ if (it2 == candmap.end()) {
494
+ Candidate cand(seg, str.size());
495
+ seg->cand = &(candmap[seg->fileId]);
496
+ segsToClean.push_back(seg);
497
+ candmap[seg->fileId] = cand;
498
+ }
499
+
500
+ for (int j = i; j < i + nchars; j++) {
501
+ if (candmap[seg->fileId][j] < nchars) {
502
+ candmap[seg->fileId].v_charscore[j] = nchars;
503
+ }
504
+ }
505
+ }
506
+ };