StrIdx 0.1.2 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/stridx-tty.rb ADDED
@@ -0,0 +1,122 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "tty-prompt"
4
+ require "tty-cursor"
5
+ require "tty-reader"
6
+ require "pastel"
7
+
8
+ require "socket"
9
+
10
+ class StrIdxTTY
11
+ def self.run
12
+ stty = StrIdxTTY.new
13
+ selected = stty.search
14
+ STDOUT.write selected
15
+ end
16
+
17
+ def initialize()
18
+ @lines = []
19
+ @selected = ""
20
+ @idx = 0
21
+
22
+ @reader = TTY::Reader.new(output: STDERR)
23
+ @pastel = Pastel.new()
24
+ @cursor = TTY::Cursor
25
+
26
+ sock_dir = File.expand_path("~/.stridx")
27
+ sockfn = "#{sock_dir}/sock"
28
+
29
+ error = true
30
+ while error
31
+ begin
32
+ # Create a new UNIXSocket
33
+ client = UNIXSocket.new(sockfn)
34
+ rescue Errno::ECONNREFUSED => e
35
+ out "Waiting for server to start\n"
36
+ sleep 2
37
+ error = true
38
+ else
39
+ error = false
40
+ client.close
41
+ #... executes when no error
42
+ end
43
+ end
44
+ end
45
+
46
+ def out(x)
47
+ STDERR.write x
48
+ end
49
+
50
+ def search
51
+ out "\n" * 20
52
+ out @cursor.clear_screen
53
+ out "\n" * 20
54
+ @cursor.move_to(0, 0)
55
+ @reader.on(:keypress) { |event|
56
+ handle_event(event)
57
+ }
58
+ @reader.read_line(">> ")
59
+
60
+ out @cursor.clear_screen
61
+ return @selected.strip
62
+ end
63
+
64
+ def get_res_from_server(query)
65
+ # Define the socket file path
66
+ sock_dir = File.expand_path("~/.stridx")
67
+ sockfn = "#{sock_dir}/sock"
68
+
69
+ # Create a new UNIXSocket
70
+ client = UNIXSocket.new(sockfn)
71
+
72
+ # Send data to the server
73
+ client.puts "find:#{query}"
74
+
75
+ # Read response from the server
76
+ response = client.recv(200 * 200)
77
+
78
+ # Close the client connection
79
+ client.close
80
+ return response.lines
81
+ end
82
+
83
+ def draw_list()
84
+ @selected = @list[@idx]
85
+ i = 0
86
+ for x in @list
87
+ out @cursor.up(1)
88
+ out @cursor.clear_line
89
+ if i == @idx
90
+ out @pastel.lookup(:bold)
91
+ end
92
+ out x.strip
93
+ out @pastel.lookup(:reset)
94
+ i += 1
95
+ end
96
+ end
97
+
98
+ def update_search(event)
99
+ query = event.line[3..-1]
100
+ if query.size > 2
101
+ @list = get_res_from_server(query)
102
+ draw_list
103
+ end
104
+ end
105
+
106
+ def handle_event(event)
107
+ out @cursor.save
108
+ if event.key.name == :alpha
109
+ update_search(event)
110
+ elsif event.key.name == :up
111
+ @idx += 1 if @idx < @list.size - 1
112
+ draw_list
113
+ elsif event.key.name == :down
114
+ @idx -= 1 if @idx > 0
115
+ draw_list
116
+ elsif event.key.name == :backspace
117
+ update_search(event)
118
+ end
119
+
120
+ out @cursor.restore
121
+ end
122
+ end
data/stridx.gemspec ADDED
@@ -0,0 +1,37 @@
1
+ Gem::Specification.new do |spec|
2
+ spec.name = "StrIdx"
3
+ spec.version = "0.1.4"
4
+ spec.authors = ["Sami Sieranoja"]
5
+ spec.email = ["sami.sieranoja@gmail.com"]
6
+
7
+ spec.summary = %q{StrIdx}
8
+ spec.description = %q{ Fast fuzzy string similarity search and indexing (for filenames)}
9
+ spec.homepage = "https://github.com/SamiSieranoja/stridx"
10
+ spec.metadata["source_code_uri"] = spec.homepage
11
+ spec.metadata["homepage_uri"] = spec.homepage
12
+
13
+ spec.files = `git ls-files -z`.split("\x0").reject do |f|
14
+ f.match(%r{^(refcode|spec|features)/})
15
+ end
16
+ # spec.files << "thread_pool.hpp"
17
+ # spec.files << "exe/stridx.rb"
18
+ # spec.files << "server.rb"
19
+ # spec.files << "stridx-tty.rb"
20
+
21
+ spec.bindir = "exe"
22
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
23
+ spec.require_paths = ["lib", "ext"]
24
+
25
+ spec.add_development_dependency "bundler", "~> 2.4.21"
26
+ spec.add_development_dependency "rake", "~> 13.1.0"
27
+
28
+ spec.add_runtime_dependency "tty-cursor", "~> 0.7.1"
29
+ spec.add_runtime_dependency "tty-prompt", "~> 0.23.1"
30
+ spec.add_runtime_dependency "tty-reader", "~> 0.9.0"
31
+ spec.add_runtime_dependency "tty-screen", "~> 0.8.2"
32
+ spec.add_runtime_dependency "pastel", "~> 0.8.0"
33
+ spec.add_runtime_dependency "daemons", "~> 1.4.1"
34
+
35
+ spec.extensions = ["rubyext/extconf.rb"]
36
+ spec.licenses = ["LGPL-2.0+"]
37
+ end
data/stridx.hpp CHANGED
@@ -1,21 +1,66 @@
1
1
 
2
+ #ifndef SSSTRIDX_HPP
3
+ #define SSSTRIDX_HPP
4
+
2
5
  #include <stdio.h>
3
6
  #include <stdlib.h>
4
7
  #include <cassert>
5
8
 
6
9
  #include <vector>
10
+ #include <array>
7
11
  #include <iostream>
8
12
  #include <unordered_map>
9
13
  #include <set>
10
14
  #include <algorithm>
11
15
  #include <sstream>
12
16
 
13
- #ifdef _OPENMP
14
- #include <omp.h>
15
- #endif
17
+ #include <vector>
18
+ #include <mutex>
19
+ #include <thread>
16
20
 
21
+ #include "thread_pool.hpp"
17
22
  #include "unordered_dense.h"
18
23
 
24
+ namespace StrIdx {
25
+
26
+ /* Alternative to using std::cout
27
+ Allows controlling the verbosity level */
28
+ class Output {
29
+ private:
30
+ int verboseLevel;
31
+
32
+ public:
33
+ Output(int verb) : verboseLevel(verb) {}
34
+ Output() : Output(3) {}
35
+ ~Output() = default;
36
+ static void print() {}
37
+
38
+ // When calling as print("xxx ",3, " yyy") outputs "xxx 3 yyy"
39
+ template <typename T, typename... Types> static void print(T var1, Types... var2) {
40
+ std::cout << var1;
41
+ print(var2...);
42
+ }
43
+
44
+ // When calling as printl("xxx ",3, " yyy") outputs "xxx 3 yyy\n"
45
+ template <typename... Types> static void printl(Types... var2) {
46
+ print(var2...);
47
+ print("\n");
48
+ }
49
+
50
+ /* When calling as printv(2, "xxx ",3, " yyy") outputs "xxx 3 yyy\n"
51
+ * if verboseLevel >= 2 (first arg)
52
+ */
53
+ template <typename... Types> void printv(int vlevel, Types... var2) {
54
+ if (verboseLevel < vlevel) {
55
+ return;
56
+ }
57
+ if (verboseLevel >= 3) {
58
+ print("[v=", vlevel, "] ");
59
+ }
60
+ printl(var2...);
61
+ }
62
+ };
63
+
19
64
  // Transforms input string as follows:
20
65
  // '/foo/bar/file1.txt'
21
66
  // => vector{"foo", "bar", "file1.txt"}
@@ -34,7 +79,7 @@ std::vector<std::string> splitString(const std::string &input, const char &separ
34
79
  }
35
80
 
36
81
  // Convert int64_t to binary string
37
- std::string int64ToBinaryString(int64_t num) {
82
+ [[nodiscard]] std::string int64ToBinaryString(const int64_t &num) {
38
83
  std::string result;
39
84
  for (int i = 63; i >= 0; --i) {
40
85
  result += ((num >> i) & 1) ? '1' : '0';
@@ -42,8 +87,8 @@ std::string int64ToBinaryString(int64_t num) {
42
87
  return result;
43
88
  }
44
89
 
45
- // Convert a (8 char) string represented as int64_t to std::string
46
- std::string int64ToStr(int64_t key) {
90
+ // Debug. Convert a (8 char) string represented as int64_t to std::string
91
+ [[nodiscard]] std::string int64ToStr(const int64_t &key) {
47
92
  int nchars = 8;
48
93
  std::string str;
49
94
  int multip = nchars * 8;
@@ -55,43 +100,45 @@ std::string int64ToStr(int64_t key) {
55
100
  return str;
56
101
  }
57
102
 
103
+ // Debug
58
104
  void printVector(const std::vector<int> &vec) {
59
105
  for (const auto &value : vec) {
60
106
  std::cout << value << " ";
61
107
  }
62
108
  }
63
109
 
64
- std::string charToBinaryString(char num) {
110
+ // Debug
111
+ [[nodiscard]] std::string charToBinaryString(const char &chr) {
65
112
  std::string result;
66
113
  for (int i = 7; i >= 0; --i) {
67
- result += ((num >> i) & 1) ? '1' : '0';
114
+ result += ((chr >> i) & 1) ? '1' : '0';
68
115
  }
69
116
  return result;
70
117
  }
71
118
 
72
119
  class Candidate;
73
- enum segmentType { Dir, File };
120
+ enum class segmentType { Dir, File };
74
121
 
75
122
  // A segment of a file path
76
123
  // e.g. if path is /foo/bar/baz.txt
77
124
  // segments are [{root}, foo, bar, baz.txt]
78
- class PathSegment {
79
- public:
125
+ struct PathSegment {
80
126
  std::string str;
81
127
  int fileId; // (if FILE)
82
128
  Candidate *cand;
83
129
  PathSegment *parent;
130
+ std::mutex mu;
84
131
  ankerl::unordered_dense::map<std::string, PathSegment *> children;
85
- segmentType type = Dir;
86
- PathSegment() : parent(NULL) {}
87
- PathSegment(std::string _str) : str(_str), parent(NULL) {}
132
+ segmentType type = segmentType::Dir;
133
+ PathSegment() : parent(nullptr) {}
134
+ PathSegment(std::string _str) : str(_str), parent(nullptr) {}
88
135
  PathSegment(std::string _str, int _fileId)
89
- : str(_str), fileId(_fileId), cand(NULL), parent(NULL) {}
90
- int size() {
136
+ : str(_str), fileId(_fileId), cand(nullptr), parent(nullptr) {}
137
+ [[nodiscard]] int size() const {
91
138
  int sz = str.size();
92
139
  PathSegment *cur = parent;
93
140
  // Sum up length of parent segments (+1 for divisors)
94
- while (cur->parent != NULL) {
141
+ while (cur->parent != nullptr) {
95
142
  sz += cur->str.size() + 1;
96
143
  cur = cur->parent;
97
144
  }
@@ -100,8 +147,7 @@ public:
100
147
  };
101
148
 
102
149
  // Candidate for result in string (filename) search
103
- class Candidate {
104
- public:
150
+ struct Candidate {
105
151
  std::vector<float> v_charscore;
106
152
  PathSegment *seg;
107
153
  int fileId;
@@ -114,25 +160,17 @@ public:
114
160
  int candLen; // Length of candidate
115
161
 
116
162
  Candidate(){};
117
- Candidate(int _fileId, std::string _str, int _len) : fileId(_fileId), str(_str), len(_len) {
118
- // Initialize v_charscores with zeros
119
- v_charscore.resize(len, 0);
120
- candLen = str.size();
121
- seg = NULL;
122
- }
123
-
124
163
  Candidate(PathSegment *_seg, int _len) : seg(_seg), len(_len) {
125
164
  // Initialize v_charscores with zeros
126
165
  v_charscore.resize(len, 0);
127
166
  candLen = seg->size();
128
167
  }
129
168
 
130
- float getScore() {
169
+ [[nodiscard]] float getScore() const {
131
170
  int i = 0;
132
171
  float score = 0.0;
133
- candLen = seg->size();
134
172
 
135
- for (float &charscore : v_charscore) {
173
+ for (const float &charscore : v_charscore) {
136
174
  score += charscore;
137
175
  i++;
138
176
  }
@@ -145,35 +183,41 @@ public:
145
183
  return score;
146
184
  }
147
185
 
148
- float operator[](int idx) { return v_charscore[idx]; }
186
+ [[nodiscard]] float operator[](int idx) const { return v_charscore[idx]; }
149
187
  };
150
188
 
151
189
  // This seems to give 10x speed improvement over std::unordered_map
152
190
  typedef ankerl::unordered_dense::map<int64_t, std::set<PathSegment *> *> SegMap;
153
191
  // typedef std::unordered_map<int64_t, std::set<PathSegment *> *> SegMap;
154
192
 
155
- typedef std::unordered_map<float, Candidate> CandMap;
193
+ typedef ankerl::unordered_dense::map<int, Candidate *> CandMap;
194
+ // typedef std::unordered_map<int, Candidate*> CandMap;
156
195
 
157
196
  class StringIndex {
158
197
  private:
159
198
  int tmp;
160
199
  char dirSeparator = '/'; // Usually '/', '\' or '\0' (no separator)
200
+ int numStrings = 0;
161
201
 
162
202
  std::vector<SegMap *> dirmaps;
203
+ std::array<std::mutex, 9> mts_d; // for dirmaps
163
204
  std::vector<SegMap *> filemaps;
205
+ std::array<std::mutex, 9> mts_f; // for filemaps
164
206
 
165
207
  std::vector<PathSegment *> segsToClean;
166
208
 
167
- std::unordered_map<int, std::string> strlist;
168
209
  std::unordered_map<int, PathSegment *> seglist;
169
210
  PathSegment *root;
170
211
  int dirId = 0;
171
212
  float dirWeight = 0.7; // Give only 70% of score if match is for a directory
172
213
 
214
+ std::unique_ptr<ThreadPool> pool;
215
+ Output out{1}; // verbose level = 1
216
+
173
217
  public:
174
- StringIndex() {
218
+ StringIndex(char sep) : dirSeparator(sep) {
175
219
  root = new PathSegment();
176
- root->parent = NULL;
220
+ root->parent = nullptr;
177
221
  root->str = "[ROOT]";
178
222
 
179
223
  for (int i = 0; i <= 8; i++) {
@@ -181,11 +225,18 @@ public:
181
225
  filemaps.push_back(new SegMap);
182
226
  }
183
227
 
184
- #ifdef _OPENMP
185
- std::cout << "OPENMP enabled\n";
186
- #endif
228
+ // Threads between 4 and 6
229
+ // We don't seem to get any benefit from more than 6 threads even if the hardware supports it
230
+ int num_threads = std::max((int)std::thread::hardware_concurrency(), 4);
231
+ num_threads = std::min(num_threads, 6);
232
+ out.printv(2, "Number of threads: ", num_threads);
233
+ pool = std::unique_ptr<ThreadPool>(new ThreadPool(num_threads));
187
234
  }
188
235
 
236
+ /* Don't split the path into segments (separator = '\0').
237
+ This is slower, but can also be used for data other than file paths. */
238
+ StringIndex() : StringIndex('\0') {}
239
+
189
240
  void setDirSeparator(char sep) { dirSeparator = sep; }
190
241
  void setDirWeight(float val) { dirWeight = val; }
191
242
 
@@ -213,6 +264,15 @@ public:
213
264
  addStrToIndex(filePath, fileId, dirSeparator);
214
265
  }
215
266
 
267
+ void addStrToIndexThreaded(std::string filePath, int fileId) {
268
+ pool->enqueue([=] { addStrToIndex(filePath, fileId, dirSeparator); });
269
+ }
270
+ void waitUntilReady() const { pool->waitUntilDone(); }
271
+
272
+ void waitUntilDone() const { pool->waitUntilDone(); }
273
+
274
+ int size() const { return seglist.size(); }
275
+
216
276
  /**
217
277
  * Add a string to the index to be searched for afterwards
218
278
  *
@@ -221,9 +281,17 @@ public:
221
281
  * @param separator Can be used to split filePath to components (e.g. 'home','user'...). Usually
222
282
  * one of {'\\', '/', '\0' (no separation)}.
223
283
  */
284
+
224
285
  void addStrToIndex(std::string filePath, int fileId, const char &separator) {
286
+ out.printv(3, "Add file:", filePath, ",", fileId, ",", separator, ",",dirSeparator);
287
+
288
+ // If a string with this index has been added already
289
+ if (seglist.find(fileId) != seglist.end()) {
290
+ return;
291
+ }
225
292
 
226
293
  std::vector<std::string> segs;
294
+ numStrings += 1;
227
295
 
228
296
  if (separator == '\0') {
229
297
  // No separation to directories & files
@@ -233,7 +301,7 @@ public:
233
301
  segs = splitString(filePath, separator);
234
302
  }
235
303
 
236
- PathSegment *prev = NULL;
304
+ PathSegment *prev = nullptr;
237
305
  prev = root;
238
306
  // Add segments to a tree type data structure
239
307
  // e.g. addStrToIndex('/foo/bar/file1.txt' ..)
@@ -245,25 +313,27 @@ public:
245
313
  auto x = *_x;
246
314
  PathSegment *p;
247
315
 
248
- auto it = prev->children.find(x);
316
+ prev->mu.lock();
317
+
249
318
  // this part of the path already exists in the tree
250
- if (it != prev->children.end()) {
319
+ if (auto it = prev->children.find(x); it != prev->children.end()) {
251
320
  p = it->second;
321
+ prev->mu.unlock();
252
322
  } else {
253
323
  p = new PathSegment(x, fileId);
254
324
  p->parent = prev;
255
- // If this is last item in segs
325
+ // If this is the last item in segs, then it is a file.
256
326
  if (_x == std::prev(segs.end())) {
257
- // therefore, it is a file.
258
- p->type = File;
327
+ p->type = segmentType::File;
259
328
  seglist[fileId] = p;
260
- } else {
261
- p->type = Dir;
329
+ } else { // otherwise, it is a directory
330
+ p->type = segmentType::Dir;
262
331
  p->fileId = dirId;
263
332
  // Files use user input Id. Directories need to have it generated
264
333
  dirId++;
265
334
  }
266
335
  prev->children[x] = p;
336
+ prev->mu.unlock();
267
337
  addPathSegmentKeys(p);
268
338
  }
269
339
 
@@ -271,6 +341,17 @@ public:
271
341
  }
272
342
  }
273
343
 
344
+ std::string getString(int id) {
345
+ std::string s = "";
346
+ PathSegment *seg = seglist[id];
347
+ s += seg->str;
348
+ while (seg->parent->parent != nullptr) {
349
+ seg = seg->parent;
350
+ s = seg->str + dirSeparator + s;
351
+ }
352
+ return s;
353
+ }
354
+
274
355
  /**
275
356
  The search will find filepaths similar to the input string
276
357
 
@@ -303,14 +384,16 @@ public:
303
384
  @param query String to search for inside the index
304
385
  */
305
386
 
306
- std::vector<std::pair<float, int>> findSimilar(std::string query) {
387
+ [[nodiscard]] std::vector<std::pair<float, int>> findSimilar(std::string query) {
307
388
  return findSimilar(query, 2);
308
389
  }
309
390
 
310
- std::vector<std::pair<float, int>> findSimilar(std::string query, int minChars) {
391
+ [[nodiscard]] std::vector<std::pair<float, int>> findSimilar(std::string query, int minChars) {
311
392
  CandMap fileCandMap;
312
393
  CandMap dirCandMap;
313
394
 
395
+ waitUntilDone();
396
+
314
397
  // Find both files and directories that match the input query
315
398
  addToCandMap(fileCandMap, query, filemaps);
316
399
  addToCandMap(dirCandMap, query, dirmaps);
@@ -319,9 +402,9 @@ public:
319
402
  scores of the file */
320
403
  mergeCandidateMaps(fileCandMap, dirCandMap);
321
404
 
322
- // Set all candidate pointers to NULL so they won't mess up future searches
405
+ // Set all candidate pointers to nullptr so they won't mess up future searches
323
406
  for (auto seg : segsToClean) {
324
- seg->cand = NULL;
407
+ seg->cand = nullptr;
325
408
  }
326
409
  segsToClean.clear();
327
410
 
@@ -329,11 +412,17 @@ public:
329
412
  std::vector<std::pair<float, int>> results;
330
413
  for (auto &[fid, cand] : fileCandMap) {
331
414
  std::pair<float, int> v;
332
- float sc = cand.getScore();
415
+ float sc = cand->getScore();
333
416
  v.first = sc;
334
417
  v.second = fid;
335
418
  results.push_back(v);
419
+ delete cand;
336
420
  }
421
+
422
+ for (auto &[fid, cand] : dirCandMap) {
423
+ delete cand;
424
+ }
425
+
337
426
  // Sort highest score first
338
427
  std::sort(results.begin(), results.end(),
339
428
  [](std::pair<float, int> a, std::pair<float, int> b) { return a.first > b.first; });
@@ -341,10 +430,10 @@ public:
341
430
  }
342
431
 
343
432
  // Return int64_t representation of the first nchars in str, starting from index i
344
- int64_t getKeyAtIdx(std::string str, int i, int nchars) {
433
+ [[nodiscard]] int64_t getKeyAtIdx(const std::string &str, int i, int nchars) const {
345
434
  int64_t key = 0;
346
435
  for (int i_char = 0; i_char < nchars; i_char++) {
347
- key = key | static_cast<int>(str[i + i_char]);
436
+ key = key | static_cast<int64_t>(str[i + i_char]);
348
437
  if (i_char < nchars - 1) {
349
438
  // Shift 8 bits to the left except on the last iteration
350
439
  key = key << 8;
@@ -399,22 +488,29 @@ private:
399
488
  maxChars = p->str.size();
400
489
  }
401
490
 
402
- #ifdef _OPENMP
403
- #pragma omp parallel for
404
- #endif
405
491
  for (int sublen = minChars; sublen <= maxChars; sublen++) {
406
492
 
493
+ std::mutex *mu;
407
494
  SegMap *map;
408
- if (p->type == File) {
495
+ if (p->type == segmentType::File) {
409
496
  map = filemaps[sublen];
497
+ mu = &mts_f[sublen];
410
498
  } else {
411
499
  map = dirmaps[sublen];
500
+ mu = &mts_d[sublen];
412
501
  }
413
502
 
414
503
  int count = str.size() - sublen + 1;
415
504
 
505
+ int64_t keys[count + 1];
416
506
  for (int i = 0; i <= count; i++) {
417
- int64_t key = getKeyAtIdx(str, i, sublen);
507
+ keys[i] = getKeyAtIdx(str, i, sublen);
508
+ }
509
+
510
+ mu->lock();
511
+ for (int i = 0; i <= count; i++) {
512
+ // int64_t key = getKeyAtIdx(str, i, sublen);
513
+ auto key = keys[i];
418
514
 
419
515
  // Create a new std::set for key if doesn't exist already
420
516
  auto it = map->find(key);
@@ -423,12 +519,14 @@ private:
423
519
  }
424
520
  (*map)[key]->insert(p);
425
521
  }
522
+ mu->unlock();
426
523
  }
427
524
  }
428
525
 
429
526
  // Find pathsegments from <map> that include the substring of <str> which starts at index <i> and
430
527
  // is of length <nchars>.
431
- std::vector<PathSegment *> findSimilarForNgram(std::string str, int i, int nchars, SegMap &map) {
528
+ [[nodiscard]] std::vector<PathSegment *> findSimilarForNgram(std::string str, int i, int nchars,
529
+ SegMap &map) const {
432
530
 
433
531
  assert(i + nchars <= static_cast<int>(str.size()));
434
532
  std::vector<PathSegment *> res;
@@ -437,8 +535,7 @@ private:
437
535
  // transform that to 64 bit integer
438
536
  int64_t key = getKeyAtIdx(str, i, nchars);
439
537
  // Find all path segments in map that have the same substring
440
- auto it = map.find(key);
441
- if (it != map.end()) { // key found
538
+ if (auto it = map.find(key); it != map.end()) { // key found
442
539
  auto set = it->second;
443
540
  for (auto value : *set) {
444
541
  res.push_back(value);
@@ -475,12 +572,12 @@ private:
475
572
  void mergeCandidateMaps(CandMap &fileCandMap, CandMap &dirCandMap) {
476
573
 
477
574
  for (auto &[fid, cand] : fileCandMap) {
478
- PathSegment *p = cand.seg->parent;
479
- while (p->parent != NULL) {
480
- if (p->cand != NULL) {
481
- auto &scoreA = cand.v_charscore;
575
+ PathSegment *p = cand->seg->parent;
576
+ while (p->parent != nullptr) {
577
+ if (p->cand != nullptr) {
578
+ auto &scoreA = cand->v_charscore;
482
579
  auto &scoreB = p->cand->v_charscore;
483
- for (int i = 0; i < cand.len; i++) {
580
+ for (int i = 0; i < cand->len; i++) {
484
581
  if (scoreA[i] < scoreB[i] * dirWeight) {
485
582
  scoreA[i] = scoreB[i] * dirWeight;
486
583
  }
@@ -493,18 +590,22 @@ private:
493
590
 
494
591
  void addToResults(PathSegment *seg, std::string str, int i, int nchars, CandMap &candmap) {
495
592
 
496
- auto it2 = candmap.find(seg->fileId);
497
- if (it2 == candmap.end()) {
498
- Candidate cand(seg, str.size());
499
- seg->cand = &(candmap[seg->fileId]);
593
+ if (auto it2 = candmap.find(seg->fileId); it2 == candmap.end()) {
594
+ Candidate *cand = new Candidate(seg, str.size());
500
595
  segsToClean.push_back(seg);
501
596
  candmap[seg->fileId] = cand;
597
+ seg->cand = cand;
502
598
  }
503
599
 
504
600
  for (int j = i; j < i + nchars; j++) {
505
- if (candmap[seg->fileId][j] < nchars) {
506
- candmap[seg->fileId].v_charscore[j] = nchars;
601
+ Candidate &cand = *(candmap[seg->fileId]);
602
+ if (cand[j] < nchars) {
603
+ cand.v_charscore[j] = nchars;
507
604
  }
508
605
  }
509
606
  }
510
607
  };
608
+
609
+ } // namespace StrIdx
610
+
611
+ #endif
data/test.rb CHANGED
@@ -1,8 +1,13 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
+ $:.unshift File.dirname(__FILE__)
4
+
3
5
  require "stridx"
4
6
  idx = StrIdx::StringIndex.new
5
7
 
8
+ # "/" for unix-style file paths
9
+ idx.setDirSeparator("/") #(comment out if not file paths)
10
+
6
11
  t = Time.new
7
12
  fn = File.expand_path("flist.txt")
8
13
  lines = IO.read(fn).lines.collect { |x| x.strip }
@@ -13,7 +18,13 @@ for x in lines
13
18
  end
14
19
 
15
20
  idx_time = Time.new
16
- puts "\nIndexing time (#{lines.size} files}): #{(idx_time - t).round(4)} seconds"
21
+ # Time to start the threadpool to process indexing
22
+ puts "\nIndexing launch time (#{lines.size} files): #{(idx_time - t).round(4)} seconds"
23
+
24
+ idx.waitUntilDone() # Not necessary, will be called by idx.find
25
+ idx_time = Time.new
26
+ # Time when all threads have completed
27
+ puts "\nIndexing completed time (#{lines.size} files): #{(idx_time - t).round(4)} seconds"
17
28
 
18
29
  query = "rngnomadriv"
19
30
  res = idx.find(query)