StrIdx 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2155c54900f595ae954d2074840f425195a66d6bf464e35dc68a1bb10236255c
4
- data.tar.gz: fc2b50bcf6083b806643886a275e2a03ce3267cbd66dd81a0d63f5f68f8e4268
3
+ metadata.gz: 52d40e64a5ace0231828bdbbe6fd94475ab5986d0c1fb7e35e0ce18463a97ae0
4
+ data.tar.gz: e1cdcc2ed9f377b2acb049a9fb6de22f24acdbd6e3552748b1307342c10b6cf7
5
5
  SHA512:
6
- metadata.gz: ac27e251a448f1ca17d4672d12ac165412f2c7fbd71359e77d21f8a8c30521797befe7f2555d181be2af0cace345636558b4a8d93d8e141f707e8b28ffe5687b
7
- data.tar.gz: f8e65efbed5079c0a2e37504aa955e0855d0ef088ad07d1d2566dee6c5f51b79d9ba71ef102e5aaeef8d1b7d548eefa2fab3930e21b7de05b940da882c9b86f3
6
+ metadata.gz: f3c27923a568fe5916c17e91766066362a965abf9568b21a4daa269cd16a8a4778248ae935b26502aa482aad4807908d401989e7ebfb88d1fbdb011b0c240b60
7
+ data.tar.gz: f94dda8d71931c18ae3dc6b58204edda7ffd649bc7452a74fdba4929d6092183e99bf99d7b3632be5bafccfd0be7a877f5513c8c3e72e814dcca08bd79a9b217
data/Makefile CHANGED
@@ -1,7 +1,7 @@
1
1
  all: demo
2
2
 
3
- demo: *.hpp *.cpp
4
- g++ -Wall -O3 -fopenmp -lstdc++ demo.cpp -o demo
3
+ demo: *.hpp *.cpp Makefile
4
+ g++ -Wall -O3 -lstdc++ demo.cpp -o demo
5
5
 
6
6
  clean:
7
7
  rm demo
data/README.md CHANGED
@@ -43,6 +43,15 @@ Install:
43
43
  gem install StrIdx
44
44
  ```
45
45
 
46
+ Or, to install the development version:
47
+ ```
48
+ git clone https://github.com/SamiSieranoja/stridx.git
49
+ cd stridx
50
+ cd rubyext; ruby extconf.rb ; make ; cd ..
51
+ gem build stridx.gemspec
52
+ gem install $(ls -1tr StrIdx*gem | tail -n 1)
53
+ ```
54
+
46
55
  Usage example (see test.rb):
47
56
  ```ruby
48
57
  require "stridx"
@@ -58,7 +67,7 @@ for x in lines
58
67
  end
59
68
 
60
69
  idx_time = Time.new
61
- puts "\nIndexing time (#{lines.size} files}): #{(idx_time - t).round(4)} seconds"
70
+ puts "\nIndexing time (#{lines.size} files): #{(idx_time - t).round(4)} seconds"
62
71
 
63
72
  query = "rngnomadriv"
64
73
  res = idx.find(query)
data/demo.cpp CHANGED
@@ -1,3 +1,12 @@
1
+
2
+ #include <condition_variable>
3
+ #include <functional>
4
+ #include <iostream>
5
+ #include <mutex>
6
+ #include <queue>
7
+ #include <thread>
8
+ #include <algorithm>
9
+
1
10
  #include "stridx.hpp"
2
11
 
3
12
  #include <iostream>
@@ -28,7 +37,7 @@ std::vector<std::string> readLinesFromFile(const std::string &filename) {
28
37
  }
29
38
 
30
39
  int main() {
31
- StringIndex idx;
40
+ StrIdx::StringIndex idx('/'); // Separate directories using unix style "/" char
32
41
  // idx.addStrToIndex("./gdk/x11/gdkasync.c", 0 /*id*/, '/' /*separator*/);
33
42
  // idx.addStrToIndex("./gdk/x11/gdksettings.c", 1, '/');
34
43
  // idx.addStrToIndex("./gdk/x11/gdkx11devicemanager-xi2.h", 2, '/');
@@ -37,26 +46,39 @@ int main() {
37
46
  std::string fn_filePaths = "flist.txt";
38
47
  std::vector<std::string> v_filePaths = readLinesFromFile(fn_filePaths);
39
48
 
49
+ // Launch indexing to run in the background
50
+ cout << "File paths: " << v_filePaths.size() << std::endl;
51
+ cout << "Start indexing in the background" << std::endl;
40
52
  auto start = std::chrono::high_resolution_clock::now();
41
53
  int id = 0;
42
54
  for (const auto &filePath : v_filePaths) {
43
- idx.addStrToIndex(filePath, id, '/' /*dir separator*/);
44
- // idx.addStrToIndex(filePath, id, '\0' /*dir separator*/);
55
+ idx.addStrToIndexThreaded(filePath, id);
45
56
  id++;
46
57
  }
47
-
58
+
59
+ auto idx_time_launch = std::chrono::high_resolution_clock::now();
60
+ std::chrono::duration<double, std::milli> duration_launch = idx_time_launch - start;
61
+ cout << "Indexing launch time (seconds): " << duration_launch.count() / 1000 << "\n";
62
+
63
+ // Wait until indexing has finished
64
+ idx.waitUntilDone();
65
+
48
66
  auto idx_time = std::chrono::high_resolution_clock::now();
49
67
  std::chrono::duration<double, std::milli> duration = idx_time - start;
50
- cout << "Indexing creation time for " << v_filePaths.size() << " file paths (seconds): " << duration.count() / 1000 << "\n";
68
+ cout << "Indexing finished time for " << v_filePaths.size()
69
+ << " file paths (seconds): " << duration.count() / 1000 << "\n";
51
70
 
52
71
  // Find matching filepaths from the index for the query string "rngnomadriv"
53
72
  start = std::chrono::high_resolution_clock::now();
54
73
  std::string query = "rngnomadriv";
74
+ for (int i = 0; i < 99; i++) {
75
+ const vector<pair<float, int>> &results = idx.findSimilar(query, 2);
76
+ }
77
+
55
78
  const vector<pair<float, int>> &results = idx.findSimilar(query, 2);
56
79
  auto search_time = std::chrono::high_resolution_clock::now();
57
80
  duration = search_time - start;
58
- cout << "Search time (seconds): " << duration.count() / 1000
59
- << "\n";
81
+ cout << "Search time for 100 queries (seconds): " << duration.count() / 1000 << "\n";
60
82
 
61
83
  int i = 0;
62
84
  std::cout << "query string: " << query << "\n";
@@ -73,4 +95,4 @@ int main() {
73
95
  }
74
96
 
75
97
  // Compile:
76
- // g++ -Wall -Wno-unused-variable -O3 -fopenmp -lstdc++ demo.cpp -o demo
98
+ // g++ -Wall -Wno-unused-variable -O3 -lstdc++ demo.cpp -o demo
data/rubyext/extconf.rb CHANGED
@@ -1,15 +1,13 @@
1
1
  #!/usr/bin/env ruby
2
- #
3
2
 
4
3
  require 'mkmf'
5
4
 
6
5
  module_name = "stridx"
7
6
  extension_name = 'stridx'
8
7
 
9
- $CXXFLAGS << " -Wall -Wno-unused-variable -O3 -fopenmp"
8
+ $CXXFLAGS << " -Wall -Wno-unused-variable -O3"
10
9
 
11
10
  have_library( 'stdc++');
12
- have_library( 'gomp' );
13
11
 
14
12
  dir_config(extension_name) # The destination
15
13
  create_makefile(extension_name) # Create Makefile
@@ -7,7 +7,7 @@
7
7
 
8
8
  extern "C" {
9
9
 
10
- void str_idx_free(void *data) { delete (StringIndex *)data; }
10
+ void str_idx_free(void *data) { delete (StrIdx::StringIndex *)data; }
11
11
 
12
12
  // Wrap StringIndex class inside a ruby variable
13
13
  static const rb_data_type_t str_idx_type = {
@@ -26,7 +26,7 @@ static const rb_data_type_t str_idx_type = {
26
26
  };
27
27
 
28
28
  VALUE str_idx_alloc(VALUE self) {
29
- void *data = new StringIndex();
29
+ void *data = new StrIdx::StringIndex();
30
30
  return TypedData_Wrap_Struct(self, &str_idx_type, data);
31
31
  }
32
32
 
@@ -36,18 +36,27 @@ VALUE StringIndexAddSegments(VALUE self, VALUE str, VALUE fileId) {
36
36
 
37
37
  void *data;
38
38
  TypedData_Get_Struct(self, int, &str_idx_type, data);
39
- ((StringIndex *)data)->addStrToIndex(s1, fid);
39
+ // ((StringIndex *)data)->addStrToIndex(s1, fid);
40
+ ((StrIdx::StringIndex *)data)->addStrToIndexThreaded(s1, fid);
40
41
 
41
42
  return self;
42
43
  }
43
44
 
45
+ VALUE StringIndexWaitUntilDone(VALUE self) {
46
+ void *data;
47
+ TypedData_Get_Struct(self, int, &str_idx_type, data);
48
+ ((StrIdx::StringIndex *)data)->waitUntilDone();
49
+ return self;
50
+ }
51
+
52
+
44
53
  VALUE StringIndexFind(VALUE self, VALUE str) {
45
54
  VALUE ret;
46
55
  std::string s1 = StringValueCStr(str);
47
56
 
48
57
  void *data;
49
58
  TypedData_Get_Struct(self, int, &str_idx_type, data);
50
- StringIndex *idx = (StringIndex *)data;
59
+ StrIdx::StringIndex *idx = (StrIdx::StringIndex *)data;
51
60
 
52
61
  ret = rb_ary_new();
53
62
  const std::vector<std::pair<float, int>> &results = idx->findSimilar(s1, 2);
@@ -80,7 +89,7 @@ VALUE StringIndexSetDirSeparator(VALUE self, VALUE str) {
80
89
 
81
90
  void *data;
82
91
  TypedData_Get_Struct(self, int, &str_idx_type, data);
83
- StringIndex *idx = (StringIndex *)data;
92
+ StrIdx::StringIndex *idx = (StrIdx::StringIndex *)data;
84
93
  idx->setDirSeparator(c);
85
94
 
86
95
  return self;
@@ -93,8 +102,12 @@ void Init_stridx(void) {
93
102
 
94
103
  rb_define_alloc_func(classStringIndex, str_idx_alloc);
95
104
  rb_define_method(classStringIndex, "add", StringIndexAddSegments, 2);
105
+ rb_define_method(classStringIndex, "waitUntilDone", StringIndexWaitUntilDone, 0);
96
106
  rb_define_method(classStringIndex, "find", StringIndexFind, 1);
107
+
97
108
  rb_define_method(classStringIndex, "setDirSeparator", StringIndexSetDirSeparator, 1);
109
+
110
+
98
111
  }
99
112
 
100
113
  } // End extern "C"
data/stridx.hpp CHANGED
@@ -1,21 +1,66 @@
1
1
 
2
+ #ifndef SSSTRIDX_HPP
3
+ #define SSSTRIDX_HPP
4
+
2
5
  #include <stdio.h>
3
6
  #include <stdlib.h>
4
7
  #include <cassert>
5
8
 
6
9
  #include <vector>
10
+ #include <array>
7
11
  #include <iostream>
8
12
  #include <unordered_map>
9
13
  #include <set>
10
14
  #include <algorithm>
11
15
  #include <sstream>
12
16
 
13
- #ifdef _OPENMP
14
- #include <omp.h>
15
- #endif
17
+ #include <vector>
18
+ #include <mutex>
19
+ #include <thread>
16
20
 
21
+ #include "thread_pool.hpp"
17
22
  #include "unordered_dense.h"
18
23
 
24
+ namespace StrIdx {
25
+
26
/* Alternative to using std::cout directly.
   Every message carries a level; an Output instance only prints messages
   whose level does not exceed its own verbosity level. */
class Output {
private:
  int verboseLevel;

public:
  // verb: messages passed to printv() with a level above this value are dropped.
  Output(int verb) : verboseLevel(verb) {}
  // Default verbosity level is 3.
  Output() : Output(3) {}
  ~Output() = default;

  // Streams every argument to std::cout in order, with no separator and no
  // trailing newline. Calling print() with no arguments prints nothing.
  // e.g. print("xxx ", 3, " yyy") outputs "xxx 3 yyy"
  //
  // Arguments are taken by const reference and written with a fold expression,
  // so they are not copied once per argument as a by-value recursion would do.
  template <typename... Types> void print(const Types &...args) {
    (std::cout << ... << args);
  }

  // Same as print(), followed by a newline.
  // e.g. printl("xxx ", 3, " yyy") outputs "xxx 3 yyy\n"
  template <typename... Types> void printl(const Types &...args) {
    print(args...);
    print("\n");
  }

  /* Prints the arguments followed by a newline, but only when
   * verboseLevel >= vlevel (first arg).
   * e.g. printv(2, "xxx ", 3, " yyy") outputs "xxx 3 yyy\n"
   * When verboseLevel >= 3 the line is prefixed with "[v=<vlevel>] ".
   */
  template <typename... Types> void printv(int vlevel, const Types &...args) {
    if (verboseLevel < vlevel) {
      return;
    }
    if (verboseLevel >= 3) {
      print("[v=", vlevel, "] ");
    }
    printl(args...);
  }
};
63
+
19
64
  // Transforms input string as follows:
20
65
  // '/foo/bar/file1.txt'
21
66
  // => vector{"foo", "bar", "file1.txt"}
@@ -34,7 +79,7 @@ std::vector<std::string> splitString(const std::string &input, const char &separ
34
79
  }
35
80
 
36
81
  // Convert int64_t to binary string
37
- std::string int64ToBinaryString(int64_t num) {
82
+ [[nodiscard]] std::string int64ToBinaryString(int64_t num) {
38
83
  std::string result;
39
84
  for (int i = 63; i >= 0; --i) {
40
85
  result += ((num >> i) & 1) ? '1' : '0';
@@ -42,8 +87,8 @@ std::string int64ToBinaryString(int64_t num) {
42
87
  return result;
43
88
  }
44
89
 
45
- // Convert a (8 char) string represented as int64_t to std::string
46
- std::string int64ToStr(int64_t key) {
90
+ // Debug. Convert a (8 char) string represented as int64_t to std::string
91
+ [[nodiscard]] std::string int64ToStr(int64_t key) {
47
92
  int nchars = 8;
48
93
  std::string str;
49
94
  int multip = nchars * 8;
@@ -55,22 +100,24 @@ std::string int64ToStr(int64_t key) {
55
100
  return str;
56
101
  }
57
102
 
103
+ // Debug
58
104
  void printVector(const std::vector<int> &vec) {
59
105
  for (const auto &value : vec) {
60
106
  std::cout << value << " ";
61
107
  }
62
108
  }
63
109
 
64
- std::string charToBinaryString(char num) {
110
+ // Debug
111
+ [[nodiscard]] std::string charToBinaryString(char chr) {
65
112
  std::string result;
66
113
  for (int i = 7; i >= 0; --i) {
67
- result += ((num >> i) & 1) ? '1' : '0';
114
+ result += ((chr >> i) & 1) ? '1' : '0';
68
115
  }
69
116
  return result;
70
117
  }
71
118
 
72
119
  class Candidate;
73
- enum segmentType { Dir, File };
120
+ enum class segmentType { Dir, File };
74
121
 
75
122
  // A segment of a file path
76
123
  // e.g. if path is /foo/bar/baz.txt
@@ -81,17 +128,18 @@ public:
81
128
  int fileId; // (if FILE)
82
129
  Candidate *cand;
83
130
  PathSegment *parent;
131
+ std::mutex mu;
84
132
  ankerl::unordered_dense::map<std::string, PathSegment *> children;
85
- segmentType type = Dir;
86
- PathSegment() : parent(NULL) {}
87
- PathSegment(std::string _str) : str(_str), parent(NULL) {}
133
+ segmentType type = segmentType::Dir;
134
+ PathSegment() : parent(nullptr) {}
135
+ PathSegment(std::string _str) : str(_str), parent(nullptr) {}
88
136
  PathSegment(std::string _str, int _fileId)
89
- : str(_str), fileId(_fileId), cand(NULL), parent(NULL) {}
90
- int size() {
137
+ : str(_str), fileId(_fileId), cand(nullptr), parent(nullptr) {}
138
+ [[nodiscard]] int size() {
91
139
  int sz = str.size();
92
140
  PathSegment *cur = parent;
93
141
  // Sum up length of parent segments (+1 for divisors)
94
- while (cur->parent != NULL) {
142
+ while (cur->parent != nullptr) {
95
143
  sz += cur->str.size() + 1;
96
144
  cur = cur->parent;
97
145
  }
@@ -118,7 +166,7 @@ public:
118
166
  // Initialize v_charscores with zeros
119
167
  v_charscore.resize(len, 0);
120
168
  candLen = str.size();
121
- seg = NULL;
169
+ seg = nullptr;
122
170
  }
123
171
 
124
172
  Candidate(PathSegment *_seg, int _len) : seg(_seg), len(_len) {
@@ -127,7 +175,7 @@ public:
127
175
  candLen = seg->size();
128
176
  }
129
177
 
130
- float getScore() {
178
+ [[nodiscard]] float getScore() {
131
179
  int i = 0;
132
180
  float score = 0.0;
133
181
  candLen = seg->size();
@@ -145,19 +193,21 @@ public:
145
193
  return score;
146
194
  }
147
195
 
148
- float operator[](int idx) { return v_charscore[idx]; }
196
+ [[nodiscard]] float operator[](int idx) { return v_charscore[idx]; }
149
197
  };
150
198
 
151
199
  // This seems to give 10x speed improvement over std::unordered_map
152
200
  typedef ankerl::unordered_dense::map<int64_t, std::set<PathSegment *> *> SegMap;
153
201
  // typedef std::unordered_map<int64_t, std::set<PathSegment *> *> SegMap;
154
202
 
155
- typedef std::unordered_map<float, Candidate> CandMap;
203
+ typedef ankerl::unordered_dense::map<int, Candidate *> CandMap;
204
+ // typedef std::unordered_map<int, Candidate*> CandMap;
156
205
 
157
206
  class StringIndex {
158
207
  private:
159
208
  int tmp;
160
209
  char dirSeparator = '/'; // Usually '/', '\' or '\0' (no separator)
210
+ int numStrings = 0;
161
211
 
162
212
  std::vector<SegMap *> dirmaps;
163
213
  std::vector<SegMap *> filemaps;
@@ -170,10 +220,16 @@ private:
170
220
  int dirId = 0;
171
221
  float dirWeight = 0.7; // Give only 70% of score if match is for a directory
172
222
 
223
+ std::array<std::mutex, 9> mts_f;
224
+ std::array<std::mutex, 9> mts_d;
225
+
226
+ std::unique_ptr<ThreadPool> pool;
227
+ Output out{1}; // verbose level = 1
228
+
173
229
  public:
174
- StringIndex() {
230
+ StringIndex(char sep) : dirSeparator(sep) {
175
231
  root = new PathSegment();
176
- root->parent = NULL;
232
+ root->parent = nullptr;
177
233
  root->str = "[ROOT]";
178
234
 
179
235
  for (int i = 0; i <= 8; i++) {
@@ -181,11 +237,18 @@ public:
181
237
  filemaps.push_back(new SegMap);
182
238
  }
183
239
 
184
- #ifdef _OPENMP
185
- std::cout << "OPENMP enabled\n";
186
- #endif
240
+ // Threads between 4 and 6
241
+ // We don't seem to get any benefit from more than 6 threads even if the hardware supports it
242
+ int num_threads = std::max((int)std::thread::hardware_concurrency(), 4);
243
+ num_threads = std::min(num_threads, 6);
244
+ out.printv(2, "Number of threads: ", num_threads);
245
+ pool = std::unique_ptr<ThreadPool>(new ThreadPool(num_threads));
187
246
  }
188
247
 
248
+ /* Don't split the path into segments (separator = '\0').
249
+ This is slower, but can be used for other data than files also. */
250
+ StringIndex() : StringIndex('\0') {}
251
+
189
252
  void setDirSeparator(char sep) { dirSeparator = sep; }
190
253
  void setDirWeight(float val) { dirWeight = val; }
191
254
 
@@ -213,6 +276,13 @@ public:
213
276
  addStrToIndex(filePath, fileId, dirSeparator);
214
277
  }
215
278
 
279
+ void addStrToIndexThreaded(std::string filePath, int fileId) {
280
+ pool->enqueue([=] { addStrToIndex(filePath, fileId, dirSeparator); });
281
+ }
282
+ void waitUntilReady() { pool->waitUntilDone(); }
283
+
284
+ void waitUntilDone() { pool->waitUntilDone(); }
285
+
216
286
  /**
217
287
  * Add a string to the index to be searched for afterwards
218
288
  *
@@ -222,8 +292,10 @@ public:
222
292
  * one of {'\\', '/', '\0' (no separation)}.
223
293
  */
224
294
  void addStrToIndex(std::string filePath, int fileId, const char &separator) {
295
+ out.printv(3, "Add file:", filePath, ",", fileId, ",", separator);
225
296
 
226
297
  std::vector<std::string> segs;
298
+ numStrings += 1;
227
299
 
228
300
  if (separator == '\0') {
229
301
  // No separation to directories & files
@@ -233,7 +305,7 @@ public:
233
305
  segs = splitString(filePath, separator);
234
306
  }
235
307
 
236
- PathSegment *prev = NULL;
308
+ PathSegment *prev = nullptr;
237
309
  prev = root;
238
310
  // Add segments to a tree type data structure
239
311
  // e.g. addStrToIndex('/foo/bar/file1.txt' ..)
@@ -245,25 +317,27 @@ public:
245
317
  auto x = *_x;
246
318
  PathSegment *p;
247
319
 
248
- auto it = prev->children.find(x);
320
+ prev->mu.lock();
321
+
249
322
  // this part of the path already exists in the tree
250
- if (it != prev->children.end()) {
323
+ if (auto it = prev->children.find(x); it != prev->children.end()) {
251
324
  p = it->second;
325
+ prev->mu.unlock();
252
326
  } else {
253
327
  p = new PathSegment(x, fileId);
254
328
  p->parent = prev;
255
- // If this is last item in segs
329
+ // If this is last item in segs, then it is a file.
256
330
  if (_x == std::prev(segs.end())) {
257
- // therefore, it is a file.
258
- p->type = File;
331
+ p->type = segmentType::File;
259
332
  seglist[fileId] = p;
260
- } else {
261
- p->type = Dir;
333
+ } else { // otherwise, it is a directory
334
+ p->type = segmentType::Dir;
262
335
  p->fileId = dirId;
263
336
  // Files use user input Id. Directories need to have it generated
264
337
  dirId++;
265
338
  }
266
339
  prev->children[x] = p;
340
+ prev->mu.unlock();
267
341
  addPathSegmentKeys(p);
268
342
  }
269
343
 
@@ -303,14 +377,16 @@ public:
303
377
  @param query String to search for inside the index
304
378
  */
305
379
 
306
- std::vector<std::pair<float, int>> findSimilar(std::string query) {
380
+ [[nodiscard]] std::vector<std::pair<float, int>> findSimilar(std::string query) {
307
381
  return findSimilar(query, 2);
308
382
  }
309
383
 
310
- std::vector<std::pair<float, int>> findSimilar(std::string query, int minChars) {
384
+ [[nodiscard]] std::vector<std::pair<float, int>> findSimilar(std::string query, int minChars) {
311
385
  CandMap fileCandMap;
312
386
  CandMap dirCandMap;
313
387
 
388
+ waitUntilDone();
389
+
314
390
  // Find both files and directories that match the input query
315
391
  addToCandMap(fileCandMap, query, filemaps);
316
392
  addToCandMap(dirCandMap, query, dirmaps);
@@ -319,9 +395,9 @@ public:
319
395
  scores of the file */
320
396
  mergeCandidateMaps(fileCandMap, dirCandMap);
321
397
 
322
- // Set all candidate pointers to NULL so they won't mess up future searches
398
+ // Set all candidate pointers to nullptr so they won't mess up future searches
323
399
  for (auto seg : segsToClean) {
324
- seg->cand = NULL;
400
+ seg->cand = nullptr;
325
401
  }
326
402
  segsToClean.clear();
327
403
 
@@ -329,11 +405,17 @@ public:
329
405
  std::vector<std::pair<float, int>> results;
330
406
  for (auto &[fid, cand] : fileCandMap) {
331
407
  std::pair<float, int> v;
332
- float sc = cand.getScore();
408
+ float sc = cand->getScore();
333
409
  v.first = sc;
334
410
  v.second = fid;
335
411
  results.push_back(v);
412
+ delete cand;
336
413
  }
414
+
415
+ for (auto &[fid, cand] : dirCandMap) {
416
+ delete cand;
417
+ }
418
+
337
419
  // Sort highest score first
338
420
  std::sort(results.begin(), results.end(),
339
421
  [](std::pair<float, int> a, std::pair<float, int> b) { return a.first > b.first; });
@@ -341,10 +423,10 @@ public:
341
423
  }
342
424
 
343
425
  // Return int64_t representation of the first nchars in str, starting from index i
344
- int64_t getKeyAtIdx(std::string str, int i, int nchars) {
426
+ [[nodiscard]] int64_t getKeyAtIdx(std::string str, int i, int nchars) {
345
427
  int64_t key = 0;
346
428
  for (int i_char = 0; i_char < nchars; i_char++) {
347
- key = key | static_cast<int>(str[i + i_char]);
429
+ key = key | static_cast<int64_t>(str[i + i_char]);
348
430
  if (i_char < nchars - 1) {
349
431
  // Shift 8 bits to the left except on the last iteration
350
432
  key = key << 8;
@@ -399,22 +481,29 @@ private:
399
481
  maxChars = p->str.size();
400
482
  }
401
483
 
402
- #ifdef _OPENMP
403
- #pragma omp parallel for
404
- #endif
405
484
  for (int sublen = minChars; sublen <= maxChars; sublen++) {
406
485
 
486
+ std::mutex *mu;
407
487
  SegMap *map;
408
- if (p->type == File) {
488
+ if (p->type == segmentType::File) {
409
489
  map = filemaps[sublen];
490
+ mu = &mts_f[sublen];
410
491
  } else {
411
492
  map = dirmaps[sublen];
493
+ mu = &mts_d[sublen];
412
494
  }
413
495
 
414
496
  int count = str.size() - sublen + 1;
415
497
 
498
+ int64_t keys[count + 1];
416
499
  for (int i = 0; i <= count; i++) {
417
- int64_t key = getKeyAtIdx(str, i, sublen);
500
+ keys[i] = getKeyAtIdx(str, i, sublen);
501
+ }
502
+
503
+ mu->lock();
504
+ for (int i = 0; i <= count; i++) {
505
+ // int64_t key = getKeyAtIdx(str, i, sublen);
506
+ auto key = keys[i];
418
507
 
419
508
  // Create a new std::set for key if doesn't exist already
420
509
  auto it = map->find(key);
@@ -423,12 +512,14 @@ private:
423
512
  }
424
513
  (*map)[key]->insert(p);
425
514
  }
515
+ mu->unlock();
426
516
  }
427
517
  }
428
518
 
429
519
  // Find pathsegments from <map> that include the substring of <str> which starts at index <i> and
430
520
  // is of length <nchars>.
431
- std::vector<PathSegment *> findSimilarForNgram(std::string str, int i, int nchars, SegMap &map) {
521
+ [[nodiscard]] std::vector<PathSegment *> findSimilarForNgram(std::string str, int i, int nchars,
522
+ SegMap &map) {
432
523
 
433
524
  assert(i + nchars <= static_cast<int>(str.size()));
434
525
  std::vector<PathSegment *> res;
@@ -437,8 +528,7 @@ private:
437
528
  // transform that to 64 bit integer
438
529
  int64_t key = getKeyAtIdx(str, i, nchars);
439
530
  // Find all path segments in map that have the same substring
440
- auto it = map.find(key);
441
- if (it != map.end()) { // key found
531
+ if (auto it = map.find(key); it != map.end()) { // key found
442
532
  auto set = it->second;
443
533
  for (auto value : *set) {
444
534
  res.push_back(value);
@@ -475,12 +565,12 @@ private:
475
565
  void mergeCandidateMaps(CandMap &fileCandMap, CandMap &dirCandMap) {
476
566
 
477
567
  for (auto &[fid, cand] : fileCandMap) {
478
- PathSegment *p = cand.seg->parent;
479
- while (p->parent != NULL) {
480
- if (p->cand != NULL) {
481
- auto &scoreA = cand.v_charscore;
568
+ PathSegment *p = cand->seg->parent;
569
+ while (p->parent != nullptr) {
570
+ if (p->cand != nullptr) {
571
+ auto &scoreA = cand->v_charscore;
482
572
  auto &scoreB = p->cand->v_charscore;
483
- for (int i = 0; i < cand.len; i++) {
573
+ for (int i = 0; i < cand->len; i++) {
484
574
  if (scoreA[i] < scoreB[i] * dirWeight) {
485
575
  scoreA[i] = scoreB[i] * dirWeight;
486
576
  }
@@ -493,18 +583,22 @@ private:
493
583
 
494
584
  void addToResults(PathSegment *seg, std::string str, int i, int nchars, CandMap &candmap) {
495
585
 
496
- auto it2 = candmap.find(seg->fileId);
497
- if (it2 == candmap.end()) {
498
- Candidate cand(seg, str.size());
499
- seg->cand = &(candmap[seg->fileId]);
586
+ if (auto it2 = candmap.find(seg->fileId); it2 == candmap.end()) {
587
+ Candidate *cand = new Candidate(seg, str.size());
500
588
  segsToClean.push_back(seg);
501
589
  candmap[seg->fileId] = cand;
590
+ seg->cand = cand;
502
591
  }
503
592
 
504
593
  for (int j = i; j < i + nchars; j++) {
505
- if (candmap[seg->fileId][j] < nchars) {
506
- candmap[seg->fileId].v_charscore[j] = nchars;
594
+ Candidate &cand = *(candmap[seg->fileId]);
595
+ if (cand[j] < nchars) {
596
+ cand.v_charscore[j] = nchars;
507
597
  }
508
598
  }
509
599
  }
510
600
  };
601
+
602
+ } // namespace StrIdx
603
+
604
+ #endif
data/test.rb CHANGED
@@ -13,7 +13,13 @@ for x in lines
13
13
  end
14
14
 
15
15
  idx_time = Time.new
16
- puts "\nIndexing time (#{lines.size} files}): #{(idx_time - t).round(4)} seconds"
16
+ # Time taken to hand all paths over to the thread pool for indexing
17
+ puts "\nIndexing launch time (#{lines.size} files}): #{(idx_time - t).round(4)} seconds"
18
+
19
+ idx.waitUntilDone() # Not necessary, will be called by idx.find
20
+ idx_time = Time.new
21
+ # Time when all threads have completed
22
+ puts "\nIndexing completed time (#{lines.size} files}): #{(idx_time - t).round(4)} seconds"
17
23
 
18
24
  query = "rngnomadriv"
19
25
  res = idx.find(query)
data/thread_pool.hpp ADDED
@@ -0,0 +1,98 @@
1
+
2
+ // Based on example in https://www.geeksforgeeks.org/thread-pool-in-cpp/
3
+
4
+ #include <condition_variable>
5
+ #include <functional>
6
+ #include <iostream>
7
+ #include <mutex>
8
+ #include <queue>
9
+ #include <thread>
10
+ #include <algorithm>
11
+ #include <iostream>
12
+ #include <fstream>
13
+ #include <vector>
14
+ #include <string>
15
+ #include <chrono>
16
+
17
// Fixed-size pool of worker threads that run queued tasks in FIFO order.
//
// Tasks are std::function<void()> objects. An exception escaping a task
// escapes the worker thread's entry function, so tasks should not throw.
class ThreadPool {
public:
  // Create a thread pool with the given number of worker threads
  ThreadPool(size_t num_threads) {
    workerThreads.reserve(num_threads);
    for (size_t i = 0; i < num_threads; ++i) {
      workerThreads.emplace_back([this] { workerLoop(); });
    }
  }

  // The workers hold a pointer to this object, so it must not be copied.
  ThreadPool(const ThreadPool &) = delete;
  ThreadPool &operator=(const ThreadPool &) = delete;

  // Destructor to stop the thread pool. Tasks still in the queue are run
  // before the workers exit.
  ~ThreadPool() {
    {
      std::lock_guard<std::mutex> lock(mu_queue);
      stop_ = true;
    }

    // Wake every worker so it can notice the stop flag
    cv_.notify_all();

    // Join all worker threads to ensure they have completed their tasks
    for (auto &thread : workerThreads) {
      thread.join();
    }
  }

  // Block until every task handed to the pool has finished executing.
  //
  // An empty queue alone is not enough: a worker may still be running a task
  // it already removed from the queue. The wait therefore also requires that
  // no task is in flight. Must not be called from inside a pool task, since
  // that task would be waiting for itself.
  void waitUntilDone() {
    std::unique_lock<std::mutex> lock(mu_queue);
    cv_done_.wait(lock, [this] { return taskQueue.empty() && activeTasks == 0; });
  }

  // Enqueue task for execution by the thread pool
  void enqueue(std::function<void()> task) {
    {
      std::lock_guard<std::mutex> lock(mu_queue);
      taskQueue.emplace(std::move(task));
    }
    cv_.notify_one();
  }

private:
  // Body of each worker thread: take tasks off the queue until the pool is
  // stopped and the queue has drained.
  void workerLoop() {
    while (true) {
      std::function<void()> task;
      {
        std::unique_lock<std::mutex> lock(mu_queue);

        // Wait until there is a task to execute or the pool is stopped
        cv_.wait(lock, [this] { return !taskQueue.empty() || stop_; });

        // Exit the thread in case the pool is stopped and there are no tasks
        if (stop_ && taskQueue.empty()) {
          return;
        }

        // Get the next task from the queue and count it as in flight while
        // the lock is still held, so waitUntilDone() never sees a gap between
        // "removed from the queue" and "running".
        task = std::move(taskQueue.front());
        taskQueue.pop();
        ++activeTasks;
      }

      // Run the task without holding the lock
      task();

      {
        std::lock_guard<std::mutex> lock(mu_queue);
        --activeTasks;
        if (taskQueue.empty() && activeTasks == 0) {
          cv_done_.notify_all();
        }
      }
    }
  }

  std::vector<std::thread> workerThreads;
  std::queue<std::function<void()>> taskQueue;
  std::mutex mu_queue; // guards taskQueue, activeTasks and stop_

  // Condition variable to signal changes in the state of the tasks queue
  std::condition_variable cv_;

  // Signalled when the queue is empty and no task is running
  std::condition_variable cv_done_;

  // Number of tasks removed from the queue that have not finished yet
  size_t activeTasks = 0;

  // Flag to indicate whether the thread pool should stop
  bool stop_ = false;
};
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: StrIdx
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sami Sieranoja
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-05-08 00:00:00.000000000 Z
11
+ date: 2024-05-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -55,6 +55,7 @@ files:
55
55
  - rubyext/ruby_interf.cpp
56
56
  - stridx.hpp
57
57
  - test.rb
58
+ - thread_pool.hpp
58
59
  - unordered_dense.h
59
60
  homepage: https://github.com/SamiSieranoja/stridx
60
61
  licenses:
@@ -62,7 +63,7 @@ licenses:
62
63
  metadata:
63
64
  source_code_uri: https://github.com/SamiSieranoja/stridx
64
65
  homepage_uri: https://github.com/SamiSieranoja/stridx
65
- post_install_message:
66
+ post_install_message:
66
67
  rdoc_options: []
67
68
  require_paths:
68
69
  - lib
@@ -79,7 +80,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
79
80
  version: '0'
80
81
  requirements: []
81
82
  rubygems_version: 3.3.26
82
- signing_key:
83
+ signing_key:
83
84
  specification_version: 4
84
85
  summary: StrIdx
85
86
  test_files: []