StrIdx 0.1.2 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
data/stridx-tty.rb ADDED
@@ -0,0 +1,122 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "tty-prompt"
4
+ require "tty-cursor"
5
+ require "tty-reader"
6
+ require "pastel"
7
+
8
+ require "socket"
9
+
10
# Interactive terminal front-end for a StrIdx search server.
#
# The UI is written to STDERR so that STDOUT carries only the final selection
# (see StrIdxTTY.run). Queries are sent to the server over the UNIX socket at
# ~/.stridx/sock using the "find:<query>" request line.
class StrIdxTTY
  # Prompt shown by the line reader. event.line includes it, so its length is
  # also the offset at which the typed query starts.
  PROMPT = ">> "

  # Upper bound (in bytes) of a single server response read.
  MAX_RESPONSE_BYTES = 200 * 200

  # Seconds to wait between connection attempts while the server is not up.
  SERVER_RETRY_DELAY = 2

  # Runs one interactive search and writes the selected entry to STDOUT.
  def self.run
    STDOUT.write new.search
  end

  # Sets up the terminal helpers and blocks until the server accepts
  # connections.
  def initialize
    @list = []      # result lines of the latest query
    @selected = ""  # currently highlighted entry
    @idx = 0        # index of the highlighted entry within @list

    @reader = TTY::Reader.new(output: STDERR)
    @pastel = Pastel.new
    @cursor = TTY::Cursor

    wait_for_server
  end

  # Writes +x+ to STDERR (the channel used for all UI output).
  def out(x)
    STDERR.write x
  end

  # Runs the interactive prompt until the line is submitted.
  #
  # @return [String] the highlighted entry with surrounding whitespace
  #   removed, or "" when nothing was selected
  def search
    # The prompt stays below the blank lines so that draw_list has room to
    # paint results upward from it.
    out "\n" * 20
    out @cursor.clear_screen
    out "\n" * 20
    @reader.on(:keypress) { |event| handle_event(event) }
    @reader.read_line(PROMPT)

    out @cursor.clear_screen
    @selected.to_s.strip
  end

  # Sends +query+ to the server and returns the response split into lines.
  #
  # The block form of UNIXSocket.open closes the socket even when the write
  # or read raises.
  #
  # @param query [String] text to search for
  # @return [Array<String>] response lines (at most MAX_RESPONSE_BYTES bytes
  #   are read in total)
  def get_res_from_server(query)
    UNIXSocket.open(socket_path) do |client|
      client.puts "find:#{query}"
      client.recv(MAX_RESPONSE_BYTES).lines
    end
  end

  # Repaints the result list above the prompt, one entry per line moving
  # upward, with the highlighted entry in bold.
  def draw_list
    @list ||= []
    # A new (shorter or empty) result list may leave @idx out of range.
    @idx = @idx.to_i.clamp(0, [@list.size - 1, 0].max)
    @selected = @list[@idx] || ""

    @list.each_with_index do |entry, i|
      out @cursor.up(1)
      out @cursor.clear_line
      out @pastel.lookup(:bold) if i == @idx
      out entry.strip
      out @pastel.lookup(:reset)
    end
  end

  # Re-queries the server with the text typed after the prompt. Queries of
  # two characters or fewer are ignored and leave the current list untouched.
  #
  # @param event [#line] key event whose line holds prompt + typed text
  def update_search(event)
    query = event.line.to_s[PROMPT.size..-1].to_s
    return unless query.size > 2

    @list = get_res_from_server(query)
    draw_list
  end

  # Key handler: letters and backspace refresh the results, Up/Down move the
  # highlight. The cursor position is saved before and restored after drawing
  # so the prompt line is not disturbed.
  #
  # @param event [#key, #line] key event delivered by the reader
  def handle_event(event)
    out @cursor.save
    case event.key.name
    when :alpha, :backspace
      update_search(event)
    when :up
      # The list is painted bottom-up, so "up" on screen is a higher index.
      @idx += 1 if @idx < @list.size - 1
      draw_list
    when :down
      @idx -= 1 if @idx > 0
      draw_list
    end
    out @cursor.restore
  end

  private

  # Path of the server's UNIX socket. Resolved lazily because expanding "~"
  # needs the user's home directory.
  def socket_path
    File.join(File.expand_path("~/.stridx"), "sock")
  end

  # Blocks until a connection to the server socket succeeds, retrying every
  # SERVER_RETRY_DELAY seconds. A missing socket file (ENOENT) is treated the
  # same as a refused connection: the server has not started yet.
  def wait_for_server
    UNIXSocket.new(socket_path).close
  rescue Errno::ECONNREFUSED, Errno::ENOENT
    out "Waiting for server to start\n"
    sleep SERVER_RETRY_DELAY
    retry
  end
end
data/stridx.gemspec ADDED
@@ -0,0 +1,37 @@
1
# Packaging metadata for the StrIdx gem.
Gem::Specification.new do |spec|
  spec.name = "StrIdx"
  spec.version = "0.1.4"
  spec.authors = ["Sami Sieranoja"]
  spec.email = ["sami.sieranoja@gmail.com"]

  spec.summary = "StrIdx"
  spec.description = " Fast fuzzy string similarity search and indexing (for filenames)"
  spec.homepage = "https://github.com/SamiSieranoja/stridx"
  %w[source_code_uri homepage_uri].each { |key| spec.metadata[key] = spec.homepage }

  # Package every git-tracked file except reference code, specs and features.
  tracked = `git ls-files -z`.split("\x0")
  spec.files = tracked.reject { |path| path.match(%r{^(refcode|spec|features)/}) }

  # Everything under exe/ is installed as an executable.
  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{^exe/}) { |path| File.basename(path) }
  spec.require_paths = ["lib", "ext"]

  # Tooling needed only when developing the gem.
  {
    "bundler" => "~> 2.4.21",
    "rake" => "~> 13.1.0",
  }.each { |gem_name, requirement| spec.add_development_dependency gem_name, requirement }

  # Libraries needed at run time.
  {
    "tty-cursor" => "~> 0.7.1",
    "tty-prompt" => "~> 0.23.1",
    "tty-reader" => "~> 0.9.0",
    "tty-screen" => "~> 0.8.2",
    "pastel" => "~> 0.8.0",
    "daemons" => "~> 1.4.1",
  }.each { |gem_name, requirement| spec.add_runtime_dependency gem_name, requirement }

  # Native extension build script.
  spec.extensions = ["rubyext/extconf.rb"]
  spec.licenses = ["LGPL-2.0+"]
end
data/stridx.hpp CHANGED
@@ -1,21 +1,66 @@
1
1
 
2
+ #ifndef SSSTRIDX_HPP
3
+ #define SSSTRIDX_HPP
4
+
2
5
  #include <stdio.h>
3
6
  #include <stdlib.h>
4
7
  #include <cassert>
5
8
 
6
9
  #include <vector>
10
+ #include <array>
7
11
  #include <iostream>
8
12
  #include <unordered_map>
9
13
  #include <set>
10
14
  #include <algorithm>
11
15
  #include <sstream>
12
16
 
13
- #ifdef _OPENMP
14
- #include <omp.h>
15
- #endif
17
+ #include <vector>
18
+ #include <mutex>
19
+ #include <thread>
16
20
 
21
+ #include "thread_pool.hpp"
17
22
  #include "unordered_dense.h"
18
23
 
24
+ namespace StrIdx {
25
+
26
/* Thin wrapper around std::cout whose printv() output can be filtered by a
   verbosity level chosen at construction time.

   All arguments are taken by const reference, so streaming a std::string
   (e.g. a file path) does not copy it. */
class Output {
private:
int verboseLevel;

public:
// verb: messages whose level is above this value are suppressed by printv().
Output(int verb) : verboseLevel(verb) {}
// Default verbosity is 3.
Output() : Output(3) {}
~Output() = default;

// Zero-argument form: prints nothing.
static void print() {}

// Streams every argument to std::cout in order, with no separators.
// print("xxx ", 3, " yyy") outputs "xxx 3 yyy"
template <typename... Types> static void print(const Types &...args) {
(std::cout << ... << args);
}

// Same as print(), followed by a newline.
// printl("xxx ", 3, " yyy") outputs "xxx 3 yyy\n"
template <typename... Types> static void printl(const Types &...args) {
print(args...);
print("\n");
}

/* Prints the arguments followed by a newline, but only when the configured
 * verbosity is at least vlevel.
 * printv(2, "xxx ", 3, " yyy") outputs "xxx 3 yyy\n" if verboseLevel >= 2.
 * At verbosity 3 or higher each line is prefixed with "[v=<vlevel>] ".
 */
template <typename... Types> void printv(int vlevel, const Types &...args) const {
if (verboseLevel < vlevel) {
return;
}
if (verboseLevel >= 3) {
print("[v=", vlevel, "] ");
}
printl(args...);
}
};
63
+
19
64
  // Transforms input string as follows:
20
65
  // '/foo/bar/file1.txt'
21
66
  // => vector{"foo", "bar", "file1.txt"}
@@ -34,7 +79,7 @@ std::vector<std::string> splitString(const std::string &input, const char &separ
34
79
  }
35
80
 
36
81
  // Convert int64_t to binary string
37
- std::string int64ToBinaryString(int64_t num) {
82
+ [[nodiscard]] std::string int64ToBinaryString(const int64_t &num) {
38
83
  std::string result;
39
84
  for (int i = 63; i >= 0; --i) {
40
85
  result += ((num >> i) & 1) ? '1' : '0';
@@ -42,8 +87,8 @@ std::string int64ToBinaryString(int64_t num) {
42
87
  return result;
43
88
  }
44
89
 
45
- // Convert a (8 char) string represented as int64_t to std::string
46
- std::string int64ToStr(int64_t key) {
90
+ // Debug. Convert a (8 char) string represented as int64_t to std::string
91
+ [[nodiscard]] std::string int64ToStr(const int64_t &key) {
47
92
  int nchars = 8;
48
93
  std::string str;
49
94
  int multip = nchars * 8;
@@ -55,43 +100,45 @@ std::string int64ToStr(int64_t key) {
55
100
  return str;
56
101
  }
57
102
 
103
+ // Debug
58
104
  void printVector(const std::vector<int> &vec) {
59
105
  for (const auto &value : vec) {
60
106
  std::cout << value << " ";
61
107
  }
62
108
  }
63
109
 
64
- std::string charToBinaryString(char num) {
110
+ // Debug
111
+ [[nodiscard]] std::string charToBinaryString(const char &chr) {
65
112
  std::string result;
66
113
  for (int i = 7; i >= 0; --i) {
67
- result += ((num >> i) & 1) ? '1' : '0';
114
+ result += ((chr >> i) & 1) ? '1' : '0';
68
115
  }
69
116
  return result;
70
117
  }
71
118
 
72
119
  class Candidate;
73
- enum segmentType { Dir, File };
120
+ enum class segmentType { Dir, File };
74
121
 
75
122
  // A segment of a file path
76
123
  // e.g. if path is /foo/bar/baz.txt
77
124
  // segments are [{root}, foo, bar, baz.txt]
78
- class PathSegment {
79
- public:
125
+ struct PathSegment {
80
126
  std::string str;
81
127
  int fileId; // (if FILE)
82
128
  Candidate *cand;
83
129
  PathSegment *parent;
130
+ std::mutex mu;
84
131
  ankerl::unordered_dense::map<std::string, PathSegment *> children;
85
- segmentType type = Dir;
86
- PathSegment() : parent(NULL) {}
87
- PathSegment(std::string _str) : str(_str), parent(NULL) {}
132
+ segmentType type = segmentType::Dir;
133
+ PathSegment() : parent(nullptr) {}
134
+ PathSegment(std::string _str) : str(_str), parent(nullptr) {}
88
135
  PathSegment(std::string _str, int _fileId)
89
- : str(_str), fileId(_fileId), cand(NULL), parent(NULL) {}
90
- int size() {
136
+ : str(_str), fileId(_fileId), cand(nullptr), parent(nullptr) {}
137
+ [[nodiscard]] int size() const {
91
138
  int sz = str.size();
92
139
  PathSegment *cur = parent;
93
140
  // Sum up length of parent segments (+1 for divisors)
94
- while (cur->parent != NULL) {
141
+ while (cur->parent != nullptr) {
95
142
  sz += cur->str.size() + 1;
96
143
  cur = cur->parent;
97
144
  }
@@ -100,8 +147,7 @@ public:
100
147
  };
101
148
 
102
149
  // Candidate for result in string (filename) search
103
- class Candidate {
104
- public:
150
+ struct Candidate {
105
151
  std::vector<float> v_charscore;
106
152
  PathSegment *seg;
107
153
  int fileId;
@@ -114,25 +160,17 @@ public:
114
160
  int candLen; // Length of candidate
115
161
 
116
162
  Candidate(){};
117
- Candidate(int _fileId, std::string _str, int _len) : fileId(_fileId), str(_str), len(_len) {
118
- // Initialize v_charscores with zeros
119
- v_charscore.resize(len, 0);
120
- candLen = str.size();
121
- seg = NULL;
122
- }
123
-
124
163
  Candidate(PathSegment *_seg, int _len) : seg(_seg), len(_len) {
125
164
  // Initialize v_charscores with zeros
126
165
  v_charscore.resize(len, 0);
127
166
  candLen = seg->size();
128
167
  }
129
168
 
130
- float getScore() {
169
+ [[nodiscard]] float getScore() const {
131
170
  int i = 0;
132
171
  float score = 0.0;
133
- candLen = seg->size();
134
172
 
135
- for (float &charscore : v_charscore) {
173
+ for (const float &charscore : v_charscore) {
136
174
  score += charscore;
137
175
  i++;
138
176
  }
@@ -145,35 +183,41 @@ public:
145
183
  return score;
146
184
  }
147
185
 
148
- float operator[](int idx) { return v_charscore[idx]; }
186
+ [[nodiscard]] float operator[](int idx) const { return v_charscore[idx]; }
149
187
  };
150
188
 
151
189
  // This seems to give 10x speed improvement over std::unordered_map
152
190
  typedef ankerl::unordered_dense::map<int64_t, std::set<PathSegment *> *> SegMap;
153
191
  // typedef std::unordered_map<int64_t, std::set<PathSegment *> *> SegMap;
154
192
 
155
- typedef std::unordered_map<float, Candidate> CandMap;
193
+ typedef ankerl::unordered_dense::map<int, Candidate *> CandMap;
194
+ // typedef std::unordered_map<int, Candidate*> CandMap;
156
195
 
157
196
  class StringIndex {
158
197
  private:
159
198
  int tmp;
160
199
  char dirSeparator = '/'; // Usually '/', '\' or '\0' (no separator)
200
+ int numStrings = 0;
161
201
 
162
202
  std::vector<SegMap *> dirmaps;
203
+ std::array<std::mutex, 9> mts_d; // for dirmaps
163
204
  std::vector<SegMap *> filemaps;
205
+ std::array<std::mutex, 9> mts_f; // for filemaps
164
206
 
165
207
  std::vector<PathSegment *> segsToClean;
166
208
 
167
- std::unordered_map<int, std::string> strlist;
168
209
  std::unordered_map<int, PathSegment *> seglist;
169
210
  PathSegment *root;
170
211
  int dirId = 0;
171
212
  float dirWeight = 0.7; // Give only 70% of score if match is for a directory
172
213
 
214
+ std::unique_ptr<ThreadPool> pool;
215
+ Output out{1}; // verbose level = 1
216
+
173
217
  public:
174
- StringIndex() {
218
+ StringIndex(char sep) : dirSeparator(sep) {
175
219
  root = new PathSegment();
176
- root->parent = NULL;
220
+ root->parent = nullptr;
177
221
  root->str = "[ROOT]";
178
222
 
179
223
  for (int i = 0; i <= 8; i++) {
@@ -181,11 +225,18 @@ public:
181
225
  filemaps.push_back(new SegMap);
182
226
  }
183
227
 
184
- #ifdef _OPENMP
185
- std::cout << "OPENMP enabled\n";
186
- #endif
228
+ // Threads between 4 and 6
229
+ // We don't seem to get any benefit from more than 6 threads even if the hardware supports it
230
+ int num_threads = std::max((int)std::thread::hardware_concurrency(), 4);
231
+ num_threads = std::min(num_threads, 6);
232
+ out.printv(2, "Number of threads: ", num_threads);
233
+ pool = std::unique_ptr<ThreadPool>(new ThreadPool(num_threads));
187
234
  }
188
235
 
236
+ /* Default: do not split paths into segments (separator = '\0').
237
+ This is slower, but can be used for other data than files also. */
238
+ StringIndex() : StringIndex('\0') {}
239
+
189
240
  void setDirSeparator(char sep) { dirSeparator = sep; }
190
241
  void setDirWeight(float val) { dirWeight = val; }
191
242
 
@@ -213,6 +264,15 @@ public:
213
264
  addStrToIndex(filePath, fileId, dirSeparator);
214
265
  }
215
266
 
267
+ void addStrToIndexThreaded(std::string filePath, int fileId) {
268
+ pool->enqueue([=] { addStrToIndex(filePath, fileId, dirSeparator); });
269
+ }
270
+ void waitUntilReady() const { pool->waitUntilDone(); }
271
+
272
+ void waitUntilDone() const { pool->waitUntilDone(); }
273
+
274
+ int size() const { return seglist.size(); }
275
+
216
276
  /**
217
277
  * Add a string to the index to be searched for afterwards
218
278
  *
@@ -221,9 +281,17 @@ public:
221
281
  * @param separator Can be used to split filePath to components (e.g. 'home','user'...). Usually
222
282
  * one of {'\\', '/', '\0' (no separation)}.
223
283
  */
284
+
224
285
  void addStrToIndex(std::string filePath, int fileId, const char &separator) {
286
+ out.printv(3, "Add file:", filePath, ",", fileId, ",", separator, ",",dirSeparator);
287
+
288
+ // If a string with this index has been added already
289
+ if (seglist.find(fileId) != seglist.end()) {
290
+ return;
291
+ }
225
292
 
226
293
  std::vector<std::string> segs;
294
+ numStrings += 1;
227
295
 
228
296
  if (separator == '\0') {
229
297
  // No separation to directories & files
@@ -233,7 +301,7 @@ public:
233
301
  segs = splitString(filePath, separator);
234
302
  }
235
303
 
236
- PathSegment *prev = NULL;
304
+ PathSegment *prev = nullptr;
237
305
  prev = root;
238
306
  // Add segments to a tree type data structure
239
307
  // e.g. addStrToIndex('/foo/bar/file1.txt' ..)
@@ -245,25 +313,27 @@ public:
245
313
  auto x = *_x;
246
314
  PathSegment *p;
247
315
 
248
- auto it = prev->children.find(x);
316
+ prev->mu.lock();
317
+
249
318
  // this part of the path already exists in the tree
250
- if (it != prev->children.end()) {
319
+ if (auto it = prev->children.find(x); it != prev->children.end()) {
251
320
  p = it->second;
321
+ prev->mu.unlock();
252
322
  } else {
253
323
  p = new PathSegment(x, fileId);
254
324
  p->parent = prev;
255
- // If this is last item in segs
325
+ // If this is last item in segs, then it is a file.
256
326
  if (_x == std::prev(segs.end())) {
257
- // therefore, it is a file.
258
- p->type = File;
327
+ p->type = segmentType::File;
259
328
  seglist[fileId] = p;
260
- } else {
261
- p->type = Dir;
329
+ } else { // otherwise, it is a directory
330
+ p->type = segmentType::Dir;
262
331
  p->fileId = dirId;
263
332
  // Files use user input Id. Directories need to have it generated
264
333
  dirId++;
265
334
  }
266
335
  prev->children[x] = p;
336
+ prev->mu.unlock();
267
337
  addPathSegmentKeys(p);
268
338
  }
269
339
 
@@ -271,6 +341,17 @@ public:
271
341
  }
272
342
  }
273
343
 
344
+ std::string getString(int id) {
345
+ std::string s = "";
346
+ PathSegment *seg = seglist[id];
347
+ s += seg->str;
348
+ while (seg->parent->parent != nullptr) {
349
+ seg = seg->parent;
350
+ s = seg->str + dirSeparator + s;
351
+ }
352
+ return s;
353
+ }
354
+
274
355
  /**
275
356
  The search will find filepaths similar to the input string
276
357
 
@@ -303,14 +384,16 @@ public:
303
384
  @param query String to search for inside the index
304
385
  */
305
386
 
306
- std::vector<std::pair<float, int>> findSimilar(std::string query) {
387
+ [[nodiscard]] std::vector<std::pair<float, int>> findSimilar(std::string query) {
307
388
  return findSimilar(query, 2);
308
389
  }
309
390
 
310
- std::vector<std::pair<float, int>> findSimilar(std::string query, int minChars) {
391
+ [[nodiscard]] std::vector<std::pair<float, int>> findSimilar(std::string query, int minChars) {
311
392
  CandMap fileCandMap;
312
393
  CandMap dirCandMap;
313
394
 
395
+ waitUntilDone();
396
+
314
397
  // Find both files and directories that match the input query
315
398
  addToCandMap(fileCandMap, query, filemaps);
316
399
  addToCandMap(dirCandMap, query, dirmaps);
@@ -319,9 +402,9 @@ public:
319
402
  scores of the file */
320
403
  mergeCandidateMaps(fileCandMap, dirCandMap);
321
404
 
322
- // Set all candidate pointers to NULL so they won't mess up future searches
405
+ // Set all candidate pointers to nullptr so they won't mess up future searches
323
406
  for (auto seg : segsToClean) {
324
- seg->cand = NULL;
407
+ seg->cand = nullptr;
325
408
  }
326
409
  segsToClean.clear();
327
410
 
@@ -329,11 +412,17 @@ public:
329
412
  std::vector<std::pair<float, int>> results;
330
413
  for (auto &[fid, cand] : fileCandMap) {
331
414
  std::pair<float, int> v;
332
- float sc = cand.getScore();
415
+ float sc = cand->getScore();
333
416
  v.first = sc;
334
417
  v.second = fid;
335
418
  results.push_back(v);
419
+ delete cand;
336
420
  }
421
+
422
+ for (auto &[fid, cand] : dirCandMap) {
423
+ delete cand;
424
+ }
425
+
337
426
  // Sort highest score first
338
427
  std::sort(results.begin(), results.end(),
339
428
  [](std::pair<float, int> a, std::pair<float, int> b) { return a.first > b.first; });
@@ -341,10 +430,10 @@ public:
341
430
  }
342
431
 
343
432
  // Return int64_t representation of the first nchars in str, starting from index i
344
- int64_t getKeyAtIdx(std::string str, int i, int nchars) {
433
+ [[nodiscard]] int64_t getKeyAtIdx(const std::string &str, int i, int nchars) const {
345
434
  int64_t key = 0;
346
435
  for (int i_char = 0; i_char < nchars; i_char++) {
347
- key = key | static_cast<int>(str[i + i_char]);
436
+ key = key | static_cast<int64_t>(str[i + i_char]);
348
437
  if (i_char < nchars - 1) {
349
438
  // Shift 8 bits to the left except on the last iteration
350
439
  key = key << 8;
@@ -399,22 +488,29 @@ private:
399
488
  maxChars = p->str.size();
400
489
  }
401
490
 
402
- #ifdef _OPENMP
403
- #pragma omp parallel for
404
- #endif
405
491
  for (int sublen = minChars; sublen <= maxChars; sublen++) {
406
492
 
493
+ std::mutex *mu;
407
494
  SegMap *map;
408
- if (p->type == File) {
495
+ if (p->type == segmentType::File) {
409
496
  map = filemaps[sublen];
497
+ mu = &mts_f[sublen];
410
498
  } else {
411
499
  map = dirmaps[sublen];
500
+ mu = &mts_d[sublen];
412
501
  }
413
502
 
414
503
  int count = str.size() - sublen + 1;
415
504
 
505
+ int64_t keys[count + 1];
416
506
  for (int i = 0; i <= count; i++) {
417
- int64_t key = getKeyAtIdx(str, i, sublen);
507
+ keys[i] = getKeyAtIdx(str, i, sublen);
508
+ }
509
+
510
+ mu->lock();
511
+ for (int i = 0; i <= count; i++) {
512
+ // int64_t key = getKeyAtIdx(str, i, sublen);
513
+ auto key = keys[i];
418
514
 
419
515
  // Create a new std::set for key if doesn't exist already
420
516
  auto it = map->find(key);
@@ -423,12 +519,14 @@ private:
423
519
  }
424
520
  (*map)[key]->insert(p);
425
521
  }
522
+ mu->unlock();
426
523
  }
427
524
  }
428
525
 
429
526
  // Find pathsegments from <map> that include the substring of <str> which starts at index <i> and
430
527
  // is of length <nchars>.
431
- std::vector<PathSegment *> findSimilarForNgram(std::string str, int i, int nchars, SegMap &map) {
528
+ [[nodiscard]] std::vector<PathSegment *> findSimilarForNgram(std::string str, int i, int nchars,
529
+ SegMap &map) const {
432
530
 
433
531
  assert(i + nchars <= static_cast<int>(str.size()));
434
532
  std::vector<PathSegment *> res;
@@ -437,8 +535,7 @@ private:
437
535
  // transform that to 64 bit integer
438
536
  int64_t key = getKeyAtIdx(str, i, nchars);
439
537
  // Find all path segments in map that have the same substring
440
- auto it = map.find(key);
441
- if (it != map.end()) { // key found
538
+ if (auto it = map.find(key); it != map.end()) { // key found
442
539
  auto set = it->second;
443
540
  for (auto value : *set) {
444
541
  res.push_back(value);
@@ -475,12 +572,12 @@ private:
475
572
  void mergeCandidateMaps(CandMap &fileCandMap, CandMap &dirCandMap) {
476
573
 
477
574
  for (auto &[fid, cand] : fileCandMap) {
478
- PathSegment *p = cand.seg->parent;
479
- while (p->parent != NULL) {
480
- if (p->cand != NULL) {
481
- auto &scoreA = cand.v_charscore;
575
+ PathSegment *p = cand->seg->parent;
576
+ while (p->parent != nullptr) {
577
+ if (p->cand != nullptr) {
578
+ auto &scoreA = cand->v_charscore;
482
579
  auto &scoreB = p->cand->v_charscore;
483
- for (int i = 0; i < cand.len; i++) {
580
+ for (int i = 0; i < cand->len; i++) {
484
581
  if (scoreA[i] < scoreB[i] * dirWeight) {
485
582
  scoreA[i] = scoreB[i] * dirWeight;
486
583
  }
@@ -493,18 +590,22 @@ private:
493
590
 
494
591
  void addToResults(PathSegment *seg, std::string str, int i, int nchars, CandMap &candmap) {
495
592
 
496
- auto it2 = candmap.find(seg->fileId);
497
- if (it2 == candmap.end()) {
498
- Candidate cand(seg, str.size());
499
- seg->cand = &(candmap[seg->fileId]);
593
+ if (auto it2 = candmap.find(seg->fileId); it2 == candmap.end()) {
594
+ Candidate *cand = new Candidate(seg, str.size());
500
595
  segsToClean.push_back(seg);
501
596
  candmap[seg->fileId] = cand;
597
+ seg->cand = cand;
502
598
  }
503
599
 
504
600
  for (int j = i; j < i + nchars; j++) {
505
- if (candmap[seg->fileId][j] < nchars) {
506
- candmap[seg->fileId].v_charscore[j] = nchars;
601
+ Candidate &cand = *(candmap[seg->fileId]);
602
+ if (cand[j] < nchars) {
603
+ cand.v_charscore[j] = nchars;
507
604
  }
508
605
  }
509
606
  }
510
607
  };
608
+
609
+ } // namespace StrIdx
610
+
611
+ #endif
data/test.rb CHANGED
@@ -1,8 +1,13 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
+ $:.unshift File.dirname(__FILE__)
4
+
3
5
  require "stridx"
4
6
  idx = StrIdx::StringIndex.new
5
7
 
8
+ # "/" for unix-style file paths
9
+ idx.setDirSeparator("/") #(comment out if not file paths)
10
+
6
11
  t = Time.new
7
12
  fn = File.expand_path("flist.txt")
8
13
  lines = IO.read(fn).lines.collect { |x| x.strip }
@@ -13,7 +18,13 @@ for x in lines
13
18
  end
14
19
 
15
20
  idx_time = Time.new
16
- puts "\nIndexing time (#{lines.size} files}): #{(idx_time - t).round(4)} seconds"
21
+ # Time to start the threadpool to process indexing
22
+ puts "\nIndexing launch time (#{lines.size} files): #{(idx_time - t).round(4)} seconds"
23
+
24
+ idx.waitUntilDone() # Not necessary, will be called by idx.find
25
+ idx_time = Time.new
26
+ # Time when all threads have completed
27
+ puts "\nIndexing completed time (#{lines.size} files): #{(idx_time - t).round(4)} seconds"
17
28
 
18
29
  query = "rngnomadriv"
19
30
  res = idx.find(query)