StrIdx 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/stridx.hpp ADDED
@@ -0,0 +1,506 @@
1
+
2
#include <stdio.h>
#include <stdlib.h>
#include <cassert>
#include <cstdint>

#include <algorithm>
#include <iostream>
#include <iterator>
#include <set>
#include <sstream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#ifdef _OPENMP
#include <omp.h>
#endif

#include "unordered_dense.h"
18
+
19
// Splits `input` at every occurrence of `separator` and returns the non-empty
// pieces in their original order, e.g.
//   '/foo/bar/file1.txt'
//   => vector{"foo", "bar", "file1.txt"}
// Empty pieces (caused by leading, trailing or repeated separators) are dropped.
//
// Declared inline: this function is defined in a header, so it must be safe to
// include from more than one translation unit.
inline std::vector<std::string> splitString(const std::string &input, const char &separator) {
  std::vector<std::string> result;

  std::string::size_type start = 0;
  while (start < input.size()) {
    // End of the current piece: next separator, or end of input if there is none
    std::string::size_type end = input.find(separator, start);
    if (end == std::string::npos) {
      end = input.size();
    }
    if (end > start) {
      result.emplace_back(input, start, end - start);
    }
    start = end + 1;
  }

  return result;
}
35
+
36
// Convert int64_t to its 64 character binary representation ('0'/'1'),
// most significant bit first.
//
// Declared inline: this function is defined in a header, so it must be safe to
// include from more than one translation unit.
inline std::string int64ToBinaryString(int64_t num) {
  // Work on the unsigned bit pattern so that shifting never depends on the sign
  const uint64_t bits = static_cast<uint64_t>(num);
  std::string result(64, '0');
  for (int pos = 0; pos < 64; ++pos) {
    if ((bits >> (63 - pos)) & 1u) {
      result[pos] = '1';
    }
  }
  return result;
}
44
+
45
// Convert a (8 char) string represented as int64_t to std::string.
//
// The most significant byte of `key` becomes the first character, the least
// significant byte the last one. The result always has exactly 8 characters;
// keys that encode fewer characters yield leading '\0' characters.
//
// Declared inline: this function is defined in a header, so it must be safe to
// include from more than one translation unit.
inline std::string int64ToStr(int64_t key) {
  const int nchars = 8;
  const uint64_t bits = static_cast<uint64_t>(key);

  std::string str;
  str.reserve(nchars);
  // Start at the top byte (shift 56). Shifting a 64 bit value by 64 would be
  // undefined behaviour and would also produce a ninth character.
  for (int shift = (nchars - 1) * 8; shift >= 0; shift -= 8) {
    str.push_back(static_cast<char>((bits >> shift) & 0xFF));
  }
  return str;
}
57
+
58
// Print every element of `vec` to stdout, each followed by a single space.
// No trailing newline is written.
//
// Declared inline: this function is defined in a header, so it must be safe to
// include from more than one translation unit.
inline void printVector(const std::vector<int> &vec) {
  for (std::vector<int>::size_type idx = 0; idx < vec.size(); ++idx) {
    std::cout << vec[idx] << " ";
  }
}
63
+
64
// Convert a char to its 8 character binary representation ('0'/'1'),
// most significant bit first.
//
// Declared inline: this function is defined in a header, so it must be safe to
// include from more than one translation unit.
inline std::string charToBinaryString(char num) {
  // Use the raw byte value so the result does not depend on char signedness
  const unsigned int bits = static_cast<unsigned char>(num);
  std::string result(8, '0');
  for (int pos = 0; pos < 8; ++pos) {
    if ((bits >> (7 - pos)) & 1u) {
      result[pos] = '1';
    }
  }
  return result;
}
71
+
72
+ class Candidate;
73
// Kind of a path segment
enum segmentType {
  Dir, // Directory component of a path
  File // Last component of a path
};
74
+
75
+ // A segment of a file path
76
+ // e.g. if path is /foo/bar/baz.txt
77
+ // segments are [{root}, foo, bar, baz.txt]
78
+ class PathSegment {
79
+ public:
80
+ std::string str;
81
+ int fileId; // (if FILE)
82
+ Candidate *cand;
83
+ PathSegment *parent;
84
+ ankerl::unordered_dense::map<std::string, PathSegment *> children;
85
+ segmentType type = Dir;
86
+ PathSegment() : parent(NULL) {}
87
+ PathSegment(std::string _str) : str(_str), parent(NULL) {}
88
+ PathSegment(std::string _str, int _fileId)
89
+ : str(_str), fileId(_fileId), cand(NULL), parent(NULL) {}
90
+ int size() {
91
+ int sz = str.size();
92
+ PathSegment *cur = parent;
93
+ // Sum up length of parent segments (+1 for divisors)
94
+ while (cur->parent != NULL) {
95
+ sz += cur->str.size() + 1;
96
+ cur = cur->parent;
97
+ }
98
+ return sz;
99
+ }
100
+ };
101
+
102
+ // Candidate for result in string (filename) search
103
+ class Candidate {
104
+ public:
105
+ std::vector<float> v_charscore;
106
+ PathSegment *seg;
107
+ int fileId;
108
+ // The string that this candidate represents
109
+ std::string str;
110
+ int len; // Query string length
111
+
112
+ float minscore;
113
+ float maxscore;
114
+ int candLen; // Length of candidate
115
+
116
+ Candidate(){};
117
+ Candidate(int _fileId, std::string _str, int _len) : fileId(_fileId), str(_str), len(_len) {
118
+ // Initialize v_charscores with zeros
119
+ v_charscore.resize(len, 0);
120
+ candLen = str.size();
121
+ seg = NULL;
122
+ }
123
+
124
+ Candidate(PathSegment *_seg, int _len) : seg(_seg), len(_len) {
125
+ // Initialize v_charscores with zeros
126
+ v_charscore.resize(len, 0);
127
+ candLen = seg->size();
128
+ }
129
+
130
+ float getScore() {
131
+ int i = 0;
132
+ float score = 0.0;
133
+ candLen = seg->size();
134
+
135
+ for (float &charscore : v_charscore) {
136
+ score += charscore;
137
+ i++;
138
+ }
139
+ float div = len * len;
140
+ float div2 = len * candLen;
141
+ float score1 = score / div;
142
+ float score2 = score / div2;
143
+
144
+ score = score1 * 0.97 + score2 * 0.03;
145
+ return score;
146
+ }
147
+
148
+ float operator[](int idx) { return v_charscore[idx]; }
149
+ };
150
+
151
+ // This seems to give 10x speed improvement over std::unordered_map
152
+ typedef ankerl::unordered_dense::map<int64_t, std::set<PathSegment *> *> SegMap;
153
+ // typedef std::unordered_map<int64_t, std::set<PathSegment *> *> SegMap;
154
+
155
+ typedef std::unordered_map<float, Candidate> CandMap;
156
+
157
+ class StringIndex {
158
+ private:
159
+ int tmp;
160
+ char dirSeparator = '/'; // Usually '/', '\' or '\0' (no separator)
161
+
162
+ std::vector<SegMap *> dirmaps;
163
+ std::vector<SegMap *> filemaps;
164
+
165
+ std::vector<PathSegment *> segsToClean;
166
+
167
+ std::unordered_map<int, std::string> strlist;
168
+ std::unordered_map<int, PathSegment *> seglist;
169
+ PathSegment *root;
170
+ int dirId = 0;
171
+ float dirWeight = 0.7; // Give only 70% of score if match is for a directory
172
+
173
+ public:
174
+ StringIndex() {
175
+ root = new PathSegment();
176
+ root->parent = NULL;
177
+ root->str = "[ROOT]";
178
+
179
+ for (int i = 0; i <= 8; i++) {
180
+ dirmaps.push_back(new SegMap);
181
+ filemaps.push_back(new SegMap);
182
+ }
183
+
184
+ #ifdef _OPENMP
185
+ std::cout << "OPENMP enabled\n";
186
+ #endif
187
+ }
188
+
189
+ void setDirSeparator(char sep) { dirSeparator = sep; }
190
+ void setDirWeight(float val) { dirWeight = val; }
191
+
192
+ ~StringIndex() {
193
+ for (auto x : dirmaps) {
194
+ for (auto y : *x) {
195
+ y.second->clear();
196
+ delete (y.second);
197
+ }
198
+ x->clear();
199
+ delete x;
200
+ }
201
+ for (auto x : filemaps) {
202
+ for (auto y : *x) {
203
+ y.second->clear();
204
+ delete (y.second);
205
+ }
206
+ x->clear();
207
+ delete x;
208
+ }
209
+ clearPathSegmentChildren(root);
210
+ }
211
+
212
+ void addStrToIndex(std::string filePath, int fileId) {
213
+ addStrToIndex(filePath, fileId, dirSeparator);
214
+ }
215
+
216
+ /**
217
+ * Add a string to the index to be search for afterwards
218
+ *
219
+ * @param filePath String to index (e.g. /home/user/Project/main.cpp).
220
+ * @param fileId Unique identifier for filePath. Will be return as result from findSimilar.
221
+ * @param separator Can be used to split filePath to components (e.g. 'home','user'...). Usually
222
+ * one of {'\\', '/', '\0' (no separation)}.
223
+ */
224
+ void addStrToIndex(std::string filePath, int fileId, const char &separator) {
225
+
226
+ std::vector<std::string> segs;
227
+
228
+ if (separator == '\0') {
229
+ // No separation to directories & files
230
+ segs = {filePath};
231
+ } else {
232
+ // Split path to segments
233
+ segs = splitString(filePath, separator);
234
+ }
235
+
236
+ PathSegment *prev = NULL;
237
+ prev = root;
238
+ // Add segments to a tree type data structure
239
+ // e.g. addStrToIndex('/foo/bar/file1.txt' ..)
240
+ // addStrToIndex('/foo/faa/file2.txt' ..)
241
+ // forms structure:
242
+ // root -> foo |-> bar -> file1,txt
243
+ // |-> faa -> file2.txt
244
+ for (auto _x = segs.begin(); _x != segs.end(); ++_x) {
245
+ auto x = *_x;
246
+ PathSegment *p;
247
+
248
+ auto it = prev->children.find(x);
249
+ // this part of the path already exists in the tree
250
+ if (it != prev->children.end()) {
251
+ p = it->second;
252
+ } else {
253
+ p = new PathSegment(x, fileId);
254
+ p->parent = prev;
255
+ // If this is last item in segs
256
+ if (_x == std::prev(segs.end())) {
257
+ // therefore, it is a file.
258
+ p->type = File;
259
+ seglist[fileId] = p;
260
+ } else {
261
+ p->type = Dir;
262
+ p->fileId = dirId;
263
+ // Files use user input Id. Directories need to have it generated
264
+ dirId++;
265
+ }
266
+ prev->children[x] = p;
267
+ addPathSegmentKeys(p);
268
+ }
269
+
270
+ prev = p;
271
+ }
272
+ }
273
+
274
+ /**
275
+ * The search will find filepaths similar to the input string
276
+
277
+ To be considered a candidate path, the file component of the path (e.g. file.txt)
278
+ is required to have at least a substring of two characters in common with the
279
+ query string. If that condition is true, then the directories will also add to the
280
+ score, although with a smaller weight.
281
+
282
+ The similarity measure between query and PathSegment in index
283
+ works as follows:
284
+ For each character c in the query string:
285
+ - find the largest substring in the query which includes the character c and
286
+ is also included in the PathSegment
287
+ - take the lenght of that substring as score
288
+ sum up the scores for each character c and divide by (string length)^2
289
+
290
+ For example, if query = "rngnomadriv"
291
+ and candidate is "./drivers/char/hw_random/nomadik-rng.c", then scores are calculated
292
+ as follows:
293
+ rngnomadriv
294
+ 33355555444 (subscores)
295
+ FFFFFFFFDDD (F=file component, D=dir component)
296
+ score1=(3+3+3+5+5+5+5+5+(4+4+4)*0.7)
297
+
298
+ In final score, give a small penalty for larger candidate filenames:
299
+ Divide main part of score with (query string length)^2
300
+ and minor part by (query string length)*(candidate string length)
301
+ score = score1/(11*11)*0.97 + score1/(11*38)*0.03 = 0.342944
302
+
303
+ @param query String to search for inside the index
304
+ */
305
+
306
+ std::vector<std::pair<float, int>> findSimilar(std::string query, int minChars) {
307
+ CandMap fileCandMap;
308
+ CandMap dirCandMap;
309
+
310
+ // Find both files and directories that match the input query
311
+ addToCandMap(fileCandMap, query, filemaps);
312
+ addToCandMap(dirCandMap, query, dirmaps);
313
+
314
+ /* If parent dir of a file matches the input string add the scores of the direcotry to the
315
+ scores of the file */
316
+ mergeCandidateMaps(fileCandMap, dirCandMap);
317
+
318
+ // Set all candidate pointers to NULL so they won't mess up future searches
319
+ for (auto seg : segsToClean) {
320
+ seg->cand = NULL;
321
+ }
322
+ segsToClean.clear();
323
+
324
+ // Form return result, 2d array with file id's and scores
325
+ std::vector<std::pair<float, int>> results;
326
+ for (auto &[fid, cand] : fileCandMap) {
327
+ std::pair<float, int> v;
328
+ float sc = cand.getScore();
329
+ v.first = sc;
330
+ v.second = fid;
331
+ results.push_back(v);
332
+ }
333
+ // Sort highest score first
334
+ std::sort(results.begin(), results.end(),
335
+ [](std::pair<float, int> a, std::pair<float, int> b) { return a.first > b.first; });
336
+ return results;
337
+ }
338
+
339
+ // Return int64_t representation of the first nchars in str, starting from index i
340
+ int64_t getKeyAtIdx(std::string str, int i, int nchars) {
341
+ int64_t key = 0;
342
+ for (int i_char = 0; i_char < nchars; i_char++) {
343
+ key = key | static_cast<int>(str[i + i_char]);
344
+ if (i_char < nchars - 1) {
345
+ // Shift 8 bits to the left except on the last iteration
346
+ key = key << 8;
347
+ }
348
+ }
349
+ return key;
350
+ }
351
+
352
+ void debug() {
353
+
354
+ int nchars = 3;
355
+ for (const auto &[key, value] : (*filemaps[nchars])) {
356
+ int64_t x;
357
+ x = key;
358
+ int multip = nchars * 8;
359
+ for (int i = 0; i <= nchars; i++) {
360
+ char c = (x >> multip) & 255;
361
+ std::cout << c;
362
+ multip -= 8;
363
+ }
364
+ std::cout << "\n";
365
+ // for (auto y : *value) {
366
+ // std::cout << y << " ";
367
+ // }
368
+ // std::cout << "\n";
369
+ }
370
+ }
371
+
372
+ private:
373
+ void clearPathSegmentChildren(PathSegment *p) {
374
+ if (p->children.size() > 0) {
375
+ for (auto x : p->children) {
376
+ clearPathSegmentChildren(x.second);
377
+ }
378
+ }
379
+ delete p;
380
+ }
381
+
382
+ void addPathSegmentKeys(PathSegment *p) {
383
+ // Input p is part of a path, e.g. 'barxyz' if path is /foo/barxyz/baz.txt
384
+ // This function generates int64 representations (keys) of all substrings of size 2..8 in that
385
+ // path segment and stores pointer to p in hash tables using these int values as keys.
386
+
387
+ int maxChars = 8;
388
+ int minChars = 2;
389
+
390
+ std::string str = p->str;
391
+ if (p->str.size() < 2) {
392
+ return;
393
+ }
394
+ if (static_cast<int>(p->str.size()) < maxChars) {
395
+ maxChars = p->str.size();
396
+ }
397
+
398
+ #ifdef _OPENMP
399
+ #pragma omp parallel for
400
+ #endif
401
+ for (int sublen = minChars; sublen <= maxChars; sublen++) {
402
+
403
+ SegMap *map;
404
+ if (p->type == File) {
405
+ map = filemaps[sublen];
406
+ } else {
407
+ map = dirmaps[sublen];
408
+ }
409
+
410
+ int count = str.size() - sublen + 1;
411
+
412
+ for (int i = 0; i <= count; i++) {
413
+ int64_t key = getKeyAtIdx(str, i, sublen);
414
+
415
+ // Create a new std::set for key if doesn't exist already
416
+ auto it = map->find(key);
417
+ if (it == map->end()) {
418
+ (*map)[key] = new std::set<PathSegment *>;
419
+ }
420
+ (*map)[key]->insert(p);
421
+ }
422
+ }
423
+ }
424
+
425
+ // Find pathsegments from <map> that include the substring of <str> which starts at index <i> and
426
+ // is of length <nchars>.
427
+ std::vector<PathSegment *> findSimilarForNgram(std::string str, int i, int nchars, SegMap &map) {
428
+
429
+ assert(i + nchars <= static_cast<int>(str.size()));
430
+ std::vector<PathSegment *> res;
431
+
432
+ // Take substring of str, starting at i, spanning nchars
433
+ // transform that to 64 bit integer
434
+ int64_t key = getKeyAtIdx(str, i, nchars);
435
+ // Find all path segments in map that have the same substring
436
+ auto it = map.find(key);
437
+ if (it != map.end()) { // key found
438
+ auto set = it->second;
439
+ for (auto value : *set) {
440
+ res.push_back(value);
441
+ }
442
+ }
443
+ return res;
444
+ }
445
+
446
+ void addToCandMap(CandMap &candmap, std::string query,
447
+ std::vector<SegMap *> &map // filemaps or dirmaps
448
+ ) {
449
+ int maxChars = 8;
450
+ int minChars = 2;
451
+ if (static_cast<int>(query.size()) < maxChars) {
452
+ maxChars = query.size();
453
+ }
454
+
455
+ // Loop all substring lengths between minChars..maxChars
456
+ for (int sublen = minChars; sublen <= maxChars; sublen++) {
457
+ int count = query.size() - sublen + 1;
458
+
459
+ // Loop all possible start positions
460
+ for (int i = 0; i < count; i++) {
461
+ std::vector<PathSegment *> res = findSimilarForNgram(query, i, sublen, *(map[sublen]));
462
+
463
+ for (PathSegment *p : res) {
464
+ addToResults(p, query, i, sublen, candmap);
465
+ }
466
+ }
467
+ }
468
+ }
469
+
470
+ // Add parent directories scores to files
471
+ void mergeCandidateMaps(CandMap &fileCandMap, CandMap &dirCandMap) {
472
+
473
+ for (auto &[fid, cand] : fileCandMap) {
474
+ PathSegment *p = cand.seg->parent;
475
+ while (p->parent != NULL) {
476
+ if (p->cand != NULL) {
477
+ auto &scoreA = cand.v_charscore;
478
+ auto &scoreB = p->cand->v_charscore;
479
+ for (int i = 0; i < cand.len; i++) {
480
+ if (scoreA[i] < scoreB[i] * dirWeight) {
481
+ scoreA[i] = scoreB[i] * dirWeight;
482
+ }
483
+ }
484
+ }
485
+ p = p->parent;
486
+ }
487
+ }
488
+ }
489
+
490
+ void addToResults(PathSegment *seg, std::string str, int i, int nchars, CandMap &candmap) {
491
+
492
+ auto it2 = candmap.find(seg->fileId);
493
+ if (it2 == candmap.end()) {
494
+ Candidate cand(seg, str.size());
495
+ seg->cand = &(candmap[seg->fileId]);
496
+ segsToClean.push_back(seg);
497
+ candmap[seg->fileId] = cand;
498
+ }
499
+
500
+ for (int j = i; j < i + nchars; j++) {
501
+ if (candmap[seg->fileId][j] < nchars) {
502
+ candmap[seg->fileId].v_charscore[j] = nchars;
503
+ }
504
+ }
505
+ }
506
+ };