StrIdx 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Makefile +2 -2
- data/README.md +10 -1
- data/demo.cpp +30 -8
- data/rubyext/extconf.rb +1 -3
- data/rubyext/ruby_interf.cpp +18 -5
- data/stridx.hpp +152 -58
- data/test.rb +7 -1
- data/thread_pool.hpp +98 -0
- metadata +6 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 52d40e64a5ace0231828bdbbe6fd94475ab5986d0c1fb7e35e0ce18463a97ae0
|
4
|
+
data.tar.gz: e1cdcc2ed9f377b2acb049a9fb6de22f24acdbd6e3552748b1307342c10b6cf7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f3c27923a568fe5916c17e91766066362a965abf9568b21a4daa269cd16a8a4778248ae935b26502aa482aad4807908d401989e7ebfb88d1fbdb011b0c240b60
|
7
|
+
data.tar.gz: f94dda8d71931c18ae3dc6b58204edda7ffd649bc7452a74fdba4929d6092183e99bf99d7b3632be5bafccfd0be7a877f5513c8c3e72e814dcca08bd79a9b217
|
data/Makefile
CHANGED
data/README.md
CHANGED
@@ -43,6 +43,15 @@ Install:
|
|
43
43
|
gem install StrIdx
|
44
44
|
```
|
45
45
|
|
46
|
+
Or, to install the development version:
|
47
|
+
```
|
48
|
+
git clone https://github.com/SamiSieranoja/stridx.git
|
49
|
+
cd stridx
|
50
|
+
cd rubyext; ruby extconf.rb ; make ; cd ..
|
51
|
+
gem build stridx.gemspec
|
52
|
+
gem install $(ls -1tr StrIdx*gem | tail -n 1)
|
53
|
+
```
|
54
|
+
|
46
55
|
Usage example (see test.rb):
|
47
56
|
```ruby
|
48
57
|
require "stridx"
|
@@ -58,7 +67,7 @@ for x in lines
|
|
58
67
|
end
|
59
68
|
|
60
69
|
idx_time = Time.new
|
61
|
-
puts "\nIndexing time (#{lines.size} files
|
70
|
+
puts "\nIndexing time (#{lines.size} files): #{(idx_time - t).round(4)} seconds"
|
62
71
|
|
63
72
|
query = "rngnomadriv"
|
64
73
|
res = idx.find(query)
|
data/demo.cpp
CHANGED
@@ -1,3 +1,12 @@
|
|
1
|
+
|
2
|
+
#include <condition_variable>
|
3
|
+
#include <functional>
|
4
|
+
#include <iostream>
|
5
|
+
#include <mutex>
|
6
|
+
#include <queue>
|
7
|
+
#include <thread>
|
8
|
+
#include <algorithm>
|
9
|
+
|
1
10
|
#include "stridx.hpp"
|
2
11
|
|
3
12
|
#include <iostream>
|
@@ -28,7 +37,7 @@ std::vector<std::string> readLinesFromFile(const std::string &filename) {
|
|
28
37
|
}
|
29
38
|
|
30
39
|
int main() {
|
31
|
-
StringIndex idx;
|
40
|
+
StrIdx::StringIndex idx('/'); // Separate directories using unix style "/" char
|
32
41
|
// idx.addStrToIndex("./gdk/x11/gdkasync.c", 0 /*id*/, '/' /*separator*/);
|
33
42
|
// idx.addStrToIndex("./gdk/x11/gdksettings.c", 1, '/');
|
34
43
|
// idx.addStrToIndex("./gdk/x11/gdkx11devicemanager-xi2.h", 2, '/');
|
@@ -37,26 +46,39 @@ int main() {
|
|
37
46
|
std::string fn_filePaths = "flist.txt";
|
38
47
|
std::vector<std::string> v_filePaths = readLinesFromFile(fn_filePaths);
|
39
48
|
|
49
|
+
// Launch indexing to be run on background
|
50
|
+
cout << "File paths: " << v_filePaths.size() << std::endl;
|
51
|
+
cout << "Start indexing in the background" << std::endl;
|
40
52
|
auto start = std::chrono::high_resolution_clock::now();
|
41
53
|
int id = 0;
|
42
54
|
for (const auto &filePath : v_filePaths) {
|
43
|
-
idx.
|
44
|
-
// idx.addStrToIndex(filePath, id, '\0' /*dir separator*/);
|
55
|
+
idx.addStrToIndexThreaded(filePath, id);
|
45
56
|
id++;
|
46
57
|
}
|
47
|
-
|
58
|
+
|
59
|
+
auto idx_time_launch = std::chrono::high_resolution_clock::now();
|
60
|
+
std::chrono::duration<double, std::milli> duration_launch = idx_time_launch - start;
|
61
|
+
cout << "Indexing launch time (seconds): " << duration_launch.count() / 1000 << "\n";
|
62
|
+
|
63
|
+
// Wait until indexing has finished
|
64
|
+
idx.waitUntilDone();
|
65
|
+
|
48
66
|
auto idx_time = std::chrono::high_resolution_clock::now();
|
49
67
|
std::chrono::duration<double, std::milli> duration = idx_time - start;
|
50
|
-
cout << "Indexing
|
68
|
+
cout << "Indexing finished time for " << v_filePaths.size()
|
69
|
+
<< " file paths (seconds): " << duration.count() / 1000 << "\n";
|
51
70
|
|
52
71
|
// Find matching filepaths from the index for the query string "rngnomadriv"
|
53
72
|
start = std::chrono::high_resolution_clock::now();
|
54
73
|
std::string query = "rngnomadriv";
|
74
|
+
for (int i = 0; i < 99; i++) {
|
75
|
+
const vector<pair<float, int>> &results = idx.findSimilar(query, 2);
|
76
|
+
}
|
77
|
+
|
55
78
|
const vector<pair<float, int>> &results = idx.findSimilar(query, 2);
|
56
79
|
auto search_time = std::chrono::high_resolution_clock::now();
|
57
80
|
duration = search_time - start;
|
58
|
-
cout << "Search time (seconds): " << duration.count() / 1000
|
59
|
-
<< "\n";
|
81
|
+
cout << "Search time for 100 queries (seconds): " << duration.count() / 1000 << "\n";
|
60
82
|
|
61
83
|
int i = 0;
|
62
84
|
std::cout << "query string: " << query << "\n";
|
@@ -73,4 +95,4 @@ int main() {
|
|
73
95
|
}
|
74
96
|
|
75
97
|
// Compile:
|
76
|
-
// g++ -Wall -Wno-unused-variable -O3 -
|
98
|
+
// g++ -Wall -Wno-unused-variable -O3 -lstdc++ demo.cpp -o demo
|
data/rubyext/extconf.rb
CHANGED
@@ -1,15 +1,13 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
#
|
3
2
|
|
4
3
|
require 'mkmf'
|
5
4
|
|
6
5
|
module_name = "stridx"
|
7
6
|
extension_name = 'stridx'
|
8
7
|
|
9
|
-
$CXXFLAGS << " -Wall -Wno-unused-variable -O3
|
8
|
+
$CXXFLAGS << " -Wall -Wno-unused-variable -O3"
|
10
9
|
|
11
10
|
have_library( 'stdc++');
|
12
|
-
have_library( 'gomp' );
|
13
11
|
|
14
12
|
dir_config(extension_name) # The destination
|
15
13
|
create_makefile(extension_name) # Create Makefile
|
data/rubyext/ruby_interf.cpp
CHANGED
@@ -7,7 +7,7 @@
|
|
7
7
|
|
8
8
|
extern "C" {
|
9
9
|
|
10
|
-
void str_idx_free(void *data) { delete (StringIndex *)data; }
|
10
|
+
void str_idx_free(void *data) { delete (StrIdx::StringIndex *)data; }
|
11
11
|
|
12
12
|
// Wrap StringIndex class inside a ruby variable
|
13
13
|
static const rb_data_type_t str_idx_type = {
|
@@ -26,7 +26,7 @@ static const rb_data_type_t str_idx_type = {
|
|
26
26
|
};
|
27
27
|
|
28
28
|
VALUE str_idx_alloc(VALUE self) {
|
29
|
-
void *data = new StringIndex();
|
29
|
+
void *data = new StrIdx::StringIndex();
|
30
30
|
return TypedData_Wrap_Struct(self, &str_idx_type, data);
|
31
31
|
}
|
32
32
|
|
@@ -36,18 +36,27 @@ VALUE StringIndexAddSegments(VALUE self, VALUE str, VALUE fileId) {
|
|
36
36
|
|
37
37
|
void *data;
|
38
38
|
TypedData_Get_Struct(self, int, &str_idx_type, data);
|
39
|
-
((StringIndex *)data)->addStrToIndex(s1, fid);
|
39
|
+
// ((StringIndex *)data)->addStrToIndex(s1, fid);
|
40
|
+
((StrIdx::StringIndex *)data)->addStrToIndexThreaded(s1, fid);
|
40
41
|
|
41
42
|
return self;
|
42
43
|
}
|
43
44
|
|
45
|
+
// Ruby method "waitUntilDone": unwraps the native index stored in `self`,
// calls its waitUntilDone(), and hands `self` back to the caller.
VALUE StringIndexWaitUntilDone(VALUE self) {
  void *wrapped;
  TypedData_Get_Struct(self, int, &str_idx_type, wrapped);

  auto *index = static_cast<StrIdx::StringIndex *>(wrapped);
  index->waitUntilDone();

  return self;
}
|
51
|
+
|
52
|
+
|
44
53
|
VALUE StringIndexFind(VALUE self, VALUE str) {
|
45
54
|
VALUE ret;
|
46
55
|
std::string s1 = StringValueCStr(str);
|
47
56
|
|
48
57
|
void *data;
|
49
58
|
TypedData_Get_Struct(self, int, &str_idx_type, data);
|
50
|
-
StringIndex *idx = (StringIndex *)data;
|
59
|
+
StrIdx::StringIndex *idx = (StrIdx::StringIndex *)data;
|
51
60
|
|
52
61
|
ret = rb_ary_new();
|
53
62
|
const std::vector<std::pair<float, int>> &results = idx->findSimilar(s1, 2);
|
@@ -80,7 +89,7 @@ VALUE StringIndexSetDirSeparator(VALUE self, VALUE str) {
|
|
80
89
|
|
81
90
|
void *data;
|
82
91
|
TypedData_Get_Struct(self, int, &str_idx_type, data);
|
83
|
-
StringIndex *idx = (StringIndex *)data;
|
92
|
+
StrIdx::StringIndex *idx = (StrIdx::StringIndex *)data;
|
84
93
|
idx->setDirSeparator(c);
|
85
94
|
|
86
95
|
return self;
|
@@ -93,8 +102,12 @@ void Init_stridx(void) {
|
|
93
102
|
|
94
103
|
rb_define_alloc_func(classStringIndex, str_idx_alloc);
|
95
104
|
rb_define_method(classStringIndex, "add", StringIndexAddSegments, 2);
|
105
|
+
rb_define_method(classStringIndex, "waitUntilDone", StringIndexWaitUntilDone, 0);
|
96
106
|
rb_define_method(classStringIndex, "find", StringIndexFind, 1);
|
107
|
+
|
97
108
|
rb_define_method(classStringIndex, "setDirSeparator", StringIndexSetDirSeparator, 1);
|
109
|
+
|
110
|
+
|
98
111
|
}
|
99
112
|
|
100
113
|
} // End extern "C"
|
data/stridx.hpp
CHANGED
@@ -1,21 +1,66 @@
|
|
1
1
|
|
2
|
+
#ifndef SSSTRIDX_HPP
|
3
|
+
#define SSSTRIDX_HPP
|
4
|
+
|
2
5
|
#include <stdio.h>
|
3
6
|
#include <stdlib.h>
|
4
7
|
#include <cassert>
|
5
8
|
|
6
9
|
#include <vector>
|
10
|
+
#include <array>
|
7
11
|
#include <iostream>
|
8
12
|
#include <unordered_map>
|
9
13
|
#include <set>
|
10
14
|
#include <algorithm>
|
11
15
|
#include <sstream>
|
12
16
|
|
13
|
-
#
|
14
|
-
#include <
|
15
|
-
#
|
17
|
+
#include <vector>
|
18
|
+
#include <mutex>
|
19
|
+
#include <thread>
|
16
20
|
|
21
|
+
#include "thread_pool.hpp"
|
17
22
|
#include "unordered_dense.h"
|
18
23
|
|
24
|
+
namespace StrIdx {
|
25
|
+
|
26
|
+
/**
 * Small logging helper used instead of writing to std::cout directly.
 *
 * Each Output object stores a verbosity level. printv() only emits a message
 * when that stored level is at least the level passed with the message.
 */
class Output {
private:
  int verboseLevel;

public:
  Output(int verb) : verboseLevel(verb) {}
  Output() : Output(3) {}
  ~Output() = default;

  // Zero-argument form; prints nothing.
  void print() {}

  // When calling as print("xxx ",3, " yyy") outputs "xxx 3 yyy"
  // Arguments are taken by const reference so strings are not copied.
  template <typename T, typename... Types> void print(const T &var1, const Types &...var2) {
    std::cout << var1;
    (std::cout << ... << var2);
  }

  // When calling as printl("xxx ",3, " yyy") outputs "xxx 3 yyy\n"
  template <typename... Types> void printl(const Types &...var2) {
    print(var2...);
    print("\n");
  }

  /* When calling as printv(2, "xxx ",3, " yyy") outputs "xxx 3 yyy\n"
   * if verboseLevel >= 2 (first arg).
   * When verboseLevel >= 3 the line is prefixed with "[v=<vlevel>] ".
   */
  template <typename... Types> void printv(int vlevel, const Types &...var2) {
    if (verboseLevel < vlevel) {
      return; // message is more detailed than this object is configured to show
    }
    if (verboseLevel >= 3) {
      print("[v=", vlevel, "] ");
    }
    printl(var2...);
  }
};
|
63
|
+
|
19
64
|
// Transforms input string as follows:
|
20
65
|
// '/foo/bar/file1.txt'
|
21
66
|
// => vector{"foo", "bar", "file1.txt"}
|
@@ -34,7 +79,7 @@ std::vector<std::string> splitString(const std::string &input, const char &separ
|
|
34
79
|
}
|
35
80
|
|
36
81
|
// Convert int64_t to binary string
|
37
|
-
std::string int64ToBinaryString(int64_t num) {
|
82
|
+
[[nodiscard]] std::string int64ToBinaryString(int64_t num) {
|
38
83
|
std::string result;
|
39
84
|
for (int i = 63; i >= 0; --i) {
|
40
85
|
result += ((num >> i) & 1) ? '1' : '0';
|
@@ -42,8 +87,8 @@ std::string int64ToBinaryString(int64_t num) {
|
|
42
87
|
return result;
|
43
88
|
}
|
44
89
|
|
45
|
-
// Convert a (8 char) string represented as int64_t to std::string
|
46
|
-
std::string int64ToStr(int64_t key) {
|
90
|
+
// Debug. Convert a (8 char) string represented as int64_t to std::string
|
91
|
+
[[nodiscard]] std::string int64ToStr(int64_t key) {
|
47
92
|
int nchars = 8;
|
48
93
|
std::string str;
|
49
94
|
int multip = nchars * 8;
|
@@ -55,22 +100,24 @@ std::string int64ToStr(int64_t key) {
|
|
55
100
|
return str;
|
56
101
|
}
|
57
102
|
|
103
|
+
// Debug helper: writes every element of vec to std::cout, each followed by a
// single space. No newline is written.
// Marked inline because it is defined in a header; without it, including the
// header from more than one translation unit yields duplicate definitions.
inline void printVector(const std::vector<int> &vec) {
  for (const int value : vec) {
    std::cout << value << " ";
  }
}
|
63
109
|
|
64
|
-
|
110
|
+
// Debug helper: returns the 8 bits of chr as a string of '0'/'1' characters,
// most significant bit first, e.g. 'A' (0x41) => "01000001".
// Marked inline because it is defined in a header; without it, including the
// header from more than one translation unit yields duplicate definitions.
[[nodiscard]] inline std::string charToBinaryString(char chr) {
  // Shift the unsigned value so the result never depends on how the platform
  // right-shifts a negative char.
  const auto bits = static_cast<unsigned char>(chr);

  std::string result;
  result.reserve(8);
  for (int i = 7; i >= 0; --i) {
    result += ((bits >> i) & 1) ? '1' : '0';
  }
  return result;
}
|
71
118
|
|
72
119
|
class Candidate;
|
73
|
-
enum segmentType { Dir, File };
|
120
|
+
enum class segmentType { Dir, File };
|
74
121
|
|
75
122
|
// A segment of a file path
|
76
123
|
// e.g. if path is /foo/bar/baz.txt
|
@@ -81,17 +128,18 @@ public:
|
|
81
128
|
int fileId; // (if FILE)
|
82
129
|
Candidate *cand;
|
83
130
|
PathSegment *parent;
|
131
|
+
std::mutex mu;
|
84
132
|
ankerl::unordered_dense::map<std::string, PathSegment *> children;
|
85
|
-
segmentType type = Dir;
|
86
|
-
PathSegment() : parent(
|
87
|
-
PathSegment(std::string _str) : str(_str), parent(
|
133
|
+
segmentType type = segmentType::Dir;
|
134
|
+
PathSegment() : parent(nullptr) {}
|
135
|
+
PathSegment(std::string _str) : str(_str), parent(nullptr) {}
|
88
136
|
PathSegment(std::string _str, int _fileId)
|
89
|
-
: str(_str), fileId(_fileId), cand(
|
90
|
-
int size() {
|
137
|
+
: str(_str), fileId(_fileId), cand(nullptr), parent(nullptr) {}
|
138
|
+
[[nodiscard]] int size() {
|
91
139
|
int sz = str.size();
|
92
140
|
PathSegment *cur = parent;
|
93
141
|
// Sum up length of parent segments (+1 for divisors)
|
94
|
-
while (cur->parent !=
|
142
|
+
while (cur->parent != nullptr) {
|
95
143
|
sz += cur->str.size() + 1;
|
96
144
|
cur = cur->parent;
|
97
145
|
}
|
@@ -118,7 +166,7 @@ public:
|
|
118
166
|
// Initialize v_charscores with zeros
|
119
167
|
v_charscore.resize(len, 0);
|
120
168
|
candLen = str.size();
|
121
|
-
seg =
|
169
|
+
seg = nullptr;
|
122
170
|
}
|
123
171
|
|
124
172
|
Candidate(PathSegment *_seg, int _len) : seg(_seg), len(_len) {
|
@@ -127,7 +175,7 @@ public:
|
|
127
175
|
candLen = seg->size();
|
128
176
|
}
|
129
177
|
|
130
|
-
float getScore() {
|
178
|
+
[[nodiscard]] float getScore() {
|
131
179
|
int i = 0;
|
132
180
|
float score = 0.0;
|
133
181
|
candLen = seg->size();
|
@@ -145,19 +193,21 @@ public:
|
|
145
193
|
return score;
|
146
194
|
}
|
147
195
|
|
148
|
-
float operator[](int idx) { return v_charscore[idx]; }
|
196
|
+
[[nodiscard]] float operator[](int idx) { return v_charscore[idx]; }
|
149
197
|
};
|
150
198
|
|
151
199
|
// This seems to give 10x speed improvement over std::unordered_map
|
152
200
|
typedef ankerl::unordered_dense::map<int64_t, std::set<PathSegment *> *> SegMap;
|
153
201
|
// typedef std::unordered_map<int64_t, std::set<PathSegment *> *> SegMap;
|
154
202
|
|
155
|
-
typedef
|
203
|
+
typedef ankerl::unordered_dense::map<int, Candidate *> CandMap;
|
204
|
+
// typedef std::unordered_map<int, Candidate*> CandMap;
|
156
205
|
|
157
206
|
class StringIndex {
|
158
207
|
private:
|
159
208
|
int tmp;
|
160
209
|
char dirSeparator = '/'; // Usually '/', '\' or '\0' (no separator)
|
210
|
+
int numStrings = 0;
|
161
211
|
|
162
212
|
std::vector<SegMap *> dirmaps;
|
163
213
|
std::vector<SegMap *> filemaps;
|
@@ -170,10 +220,16 @@ private:
|
|
170
220
|
int dirId = 0;
|
171
221
|
float dirWeight = 0.7; // Give only 70% of score if match is for a directory
|
172
222
|
|
223
|
+
std::array<std::mutex, 9> mts_f;
|
224
|
+
std::array<std::mutex, 9> mts_d;
|
225
|
+
|
226
|
+
std::unique_ptr<ThreadPool> pool;
|
227
|
+
Output out{1}; // verbose level = 1
|
228
|
+
|
173
229
|
public:
|
174
|
-
StringIndex() {
|
230
|
+
StringIndex(char sep) : dirSeparator(sep) {
|
175
231
|
root = new PathSegment();
|
176
|
-
root->parent =
|
232
|
+
root->parent = nullptr;
|
177
233
|
root->str = "[ROOT]";
|
178
234
|
|
179
235
|
for (int i = 0; i <= 8; i++) {
|
@@ -181,11 +237,18 @@ public:
|
|
181
237
|
filemaps.push_back(new SegMap);
|
182
238
|
}
|
183
239
|
|
184
|
-
|
185
|
-
|
186
|
-
|
240
|
+
// Threads between 4 and 6
|
241
|
+
// We don't seem to get any benefit from more than 6 threads even if the hardware supports it
|
242
|
+
int num_threads = std::max((int)std::thread::hardware_concurrency(), 4);
|
243
|
+
num_threads = std::min(num_threads, 6);
|
244
|
+
out.printv(2, "Number of threads: ", num_threads);
|
245
|
+
pool = std::unique_ptr<ThreadPool>(new ThreadPool(num_threads));
|
187
246
|
}
|
188
247
|
|
248
|
+
/* Don't separate path to segments separator=\0.
|
249
|
+
This is slower, but can be used for other data than files also. */
|
250
|
+
StringIndex() : StringIndex('\0') {}
|
251
|
+
|
189
252
|
void setDirSeparator(char sep) { dirSeparator = sep; }
|
190
253
|
void setDirWeight(float val) { dirWeight = val; }
|
191
254
|
|
@@ -213,6 +276,13 @@ public:
|
|
213
276
|
addStrToIndex(filePath, fileId, dirSeparator);
|
214
277
|
}
|
215
278
|
|
279
|
+
void addStrToIndexThreaded(std::string filePath, int fileId) {
|
280
|
+
pool->enqueue([=] { addStrToIndex(filePath, fileId, dirSeparator); });
|
281
|
+
}
|
282
|
+
void waitUntilReady() { pool->waitUntilDone(); }
|
283
|
+
|
284
|
+
void waitUntilDone() { pool->waitUntilDone(); }
|
285
|
+
|
216
286
|
/**
|
217
287
|
* Add a string to the index to be searched for afterwards
|
218
288
|
*
|
@@ -222,8 +292,10 @@ public:
|
|
222
292
|
* one of {'\\', '/', '\0' (no separation)}.
|
223
293
|
*/
|
224
294
|
void addStrToIndex(std::string filePath, int fileId, const char &separator) {
|
295
|
+
out.printv(3, "Add file:", filePath, ",", fileId, ",", separator);
|
225
296
|
|
226
297
|
std::vector<std::string> segs;
|
298
|
+
numStrings += 1;
|
227
299
|
|
228
300
|
if (separator == '\0') {
|
229
301
|
// No separation to directories & files
|
@@ -233,7 +305,7 @@ public:
|
|
233
305
|
segs = splitString(filePath, separator);
|
234
306
|
}
|
235
307
|
|
236
|
-
PathSegment *prev =
|
308
|
+
PathSegment *prev = nullptr;
|
237
309
|
prev = root;
|
238
310
|
// Add segments to a tree type data structure
|
239
311
|
// e.g. addStrToIndex('/foo/bar/file1.txt' ..)
|
@@ -245,25 +317,27 @@ public:
|
|
245
317
|
auto x = *_x;
|
246
318
|
PathSegment *p;
|
247
319
|
|
248
|
-
|
320
|
+
prev->mu.lock();
|
321
|
+
|
249
322
|
// this part of the path already exists in the tree
|
250
|
-
if (it != prev->children.end()) {
|
323
|
+
if (auto it = prev->children.find(x); it != prev->children.end()) {
|
251
324
|
p = it->second;
|
325
|
+
prev->mu.unlock();
|
252
326
|
} else {
|
253
327
|
p = new PathSegment(x, fileId);
|
254
328
|
p->parent = prev;
|
255
|
-
// If this is last item in segs
|
329
|
+
// If this is last item in segs, then it is a file.
|
256
330
|
if (_x == std::prev(segs.end())) {
|
257
|
-
|
258
|
-
p->type = File;
|
331
|
+
p->type = segmentType::File;
|
259
332
|
seglist[fileId] = p;
|
260
|
-
} else {
|
261
|
-
p->type = Dir;
|
333
|
+
} else { // otherwise, it is a directory
|
334
|
+
p->type = segmentType::Dir;
|
262
335
|
p->fileId = dirId;
|
263
336
|
// Files use user input Id. Directories need to have it generated
|
264
337
|
dirId++;
|
265
338
|
}
|
266
339
|
prev->children[x] = p;
|
340
|
+
prev->mu.unlock();
|
267
341
|
addPathSegmentKeys(p);
|
268
342
|
}
|
269
343
|
|
@@ -303,14 +377,16 @@ public:
|
|
303
377
|
@param query String to search for inside the index
|
304
378
|
*/
|
305
379
|
|
306
|
-
std::vector<std::pair<float, int>> findSimilar(std::string query) {
|
380
|
+
[[nodiscard]] std::vector<std::pair<float, int>> findSimilar(std::string query) {
|
307
381
|
return findSimilar(query, 2);
|
308
382
|
}
|
309
383
|
|
310
|
-
std::vector<std::pair<float, int>> findSimilar(std::string query, int minChars) {
|
384
|
+
[[nodiscard]] std::vector<std::pair<float, int>> findSimilar(std::string query, int minChars) {
|
311
385
|
CandMap fileCandMap;
|
312
386
|
CandMap dirCandMap;
|
313
387
|
|
388
|
+
waitUntilDone();
|
389
|
+
|
314
390
|
// Find both files and directories that match the input query
|
315
391
|
addToCandMap(fileCandMap, query, filemaps);
|
316
392
|
addToCandMap(dirCandMap, query, dirmaps);
|
@@ -319,9 +395,9 @@ public:
|
|
319
395
|
scores of the file */
|
320
396
|
mergeCandidateMaps(fileCandMap, dirCandMap);
|
321
397
|
|
322
|
-
// Set all candidate pointers to
|
398
|
+
// Set all candidate pointers to nullptr so they won't mess up future searches
|
323
399
|
for (auto seg : segsToClean) {
|
324
|
-
seg->cand =
|
400
|
+
seg->cand = nullptr;
|
325
401
|
}
|
326
402
|
segsToClean.clear();
|
327
403
|
|
@@ -329,11 +405,17 @@ public:
|
|
329
405
|
std::vector<std::pair<float, int>> results;
|
330
406
|
for (auto &[fid, cand] : fileCandMap) {
|
331
407
|
std::pair<float, int> v;
|
332
|
-
float sc = cand
|
408
|
+
float sc = cand->getScore();
|
333
409
|
v.first = sc;
|
334
410
|
v.second = fid;
|
335
411
|
results.push_back(v);
|
412
|
+
delete cand;
|
336
413
|
}
|
414
|
+
|
415
|
+
for (auto &[fid, cand] : dirCandMap) {
|
416
|
+
delete cand;
|
417
|
+
}
|
418
|
+
|
337
419
|
// Sort highest score first
|
338
420
|
std::sort(results.begin(), results.end(),
|
339
421
|
[](std::pair<float, int> a, std::pair<float, int> b) { return a.first > b.first; });
|
@@ -341,10 +423,10 @@ public:
|
|
341
423
|
}
|
342
424
|
|
343
425
|
// Return int64_t representation of the first nchars in str, starting from index i
|
344
|
-
int64_t getKeyAtIdx(std::string str, int i, int nchars) {
|
426
|
+
[[nodiscard]] int64_t getKeyAtIdx(std::string str, int i, int nchars) {
|
345
427
|
int64_t key = 0;
|
346
428
|
for (int i_char = 0; i_char < nchars; i_char++) {
|
347
|
-
key = key | static_cast<
|
429
|
+
key = key | static_cast<int64_t>(str[i + i_char]);
|
348
430
|
if (i_char < nchars - 1) {
|
349
431
|
// Shift 8 bits to the left except on the last iteration
|
350
432
|
key = key << 8;
|
@@ -399,22 +481,29 @@ private:
|
|
399
481
|
maxChars = p->str.size();
|
400
482
|
}
|
401
483
|
|
402
|
-
#ifdef _OPENMP
|
403
|
-
#pragma omp parallel for
|
404
|
-
#endif
|
405
484
|
for (int sublen = minChars; sublen <= maxChars; sublen++) {
|
406
485
|
|
486
|
+
std::mutex *mu;
|
407
487
|
SegMap *map;
|
408
|
-
if (p->type == File) {
|
488
|
+
if (p->type == segmentType::File) {
|
409
489
|
map = filemaps[sublen];
|
490
|
+
mu = &mts_f[sublen];
|
410
491
|
} else {
|
411
492
|
map = dirmaps[sublen];
|
493
|
+
mu = &mts_d[sublen];
|
412
494
|
}
|
413
495
|
|
414
496
|
int count = str.size() - sublen + 1;
|
415
497
|
|
498
|
+
int64_t keys[count + 1];
|
416
499
|
for (int i = 0; i <= count; i++) {
|
417
|
-
|
500
|
+
keys[i] = getKeyAtIdx(str, i, sublen);
|
501
|
+
}
|
502
|
+
|
503
|
+
mu->lock();
|
504
|
+
for (int i = 0; i <= count; i++) {
|
505
|
+
// int64_t key = getKeyAtIdx(str, i, sublen);
|
506
|
+
auto key = keys[i];
|
418
507
|
|
419
508
|
// Create a new std::set for key if doesn't exist already
|
420
509
|
auto it = map->find(key);
|
@@ -423,12 +512,14 @@ private:
|
|
423
512
|
}
|
424
513
|
(*map)[key]->insert(p);
|
425
514
|
}
|
515
|
+
mu->unlock();
|
426
516
|
}
|
427
517
|
}
|
428
518
|
|
429
519
|
// Find pathsegments from <map> that include the substring of <str> which starts at index <i> and
|
430
520
|
// is of length <nchars>.
|
431
|
-
std::vector<PathSegment *> findSimilarForNgram(std::string str, int i, int nchars,
|
521
|
+
[[nodiscard]] std::vector<PathSegment *> findSimilarForNgram(std::string str, int i, int nchars,
|
522
|
+
SegMap &map) {
|
432
523
|
|
433
524
|
assert(i + nchars <= static_cast<int>(str.size()));
|
434
525
|
std::vector<PathSegment *> res;
|
@@ -437,8 +528,7 @@ private:
|
|
437
528
|
// transform that to 64 bit integer
|
438
529
|
int64_t key = getKeyAtIdx(str, i, nchars);
|
439
530
|
// Find all path segments in map that have the same substring
|
440
|
-
auto it = map.find(key);
|
441
|
-
if (it != map.end()) { // key found
|
531
|
+
if (auto it = map.find(key); it != map.end()) { // key found
|
442
532
|
auto set = it->second;
|
443
533
|
for (auto value : *set) {
|
444
534
|
res.push_back(value);
|
@@ -475,12 +565,12 @@ private:
|
|
475
565
|
void mergeCandidateMaps(CandMap &fileCandMap, CandMap &dirCandMap) {
|
476
566
|
|
477
567
|
for (auto &[fid, cand] : fileCandMap) {
|
478
|
-
PathSegment *p = cand
|
479
|
-
while (p->parent !=
|
480
|
-
if (p->cand !=
|
481
|
-
auto &scoreA = cand
|
568
|
+
PathSegment *p = cand->seg->parent;
|
569
|
+
while (p->parent != nullptr) {
|
570
|
+
if (p->cand != nullptr) {
|
571
|
+
auto &scoreA = cand->v_charscore;
|
482
572
|
auto &scoreB = p->cand->v_charscore;
|
483
|
-
for (int i = 0; i < cand
|
573
|
+
for (int i = 0; i < cand->len; i++) {
|
484
574
|
if (scoreA[i] < scoreB[i] * dirWeight) {
|
485
575
|
scoreA[i] = scoreB[i] * dirWeight;
|
486
576
|
}
|
@@ -493,18 +583,22 @@ private:
|
|
493
583
|
|
494
584
|
void addToResults(PathSegment *seg, std::string str, int i, int nchars, CandMap &candmap) {
|
495
585
|
|
496
|
-
auto it2 = candmap.find(seg->fileId);
|
497
|
-
|
498
|
-
Candidate cand(seg, str.size());
|
499
|
-
seg->cand = &(candmap[seg->fileId]);
|
586
|
+
if (auto it2 = candmap.find(seg->fileId); it2 == candmap.end()) {
|
587
|
+
Candidate *cand = new Candidate(seg, str.size());
|
500
588
|
segsToClean.push_back(seg);
|
501
589
|
candmap[seg->fileId] = cand;
|
590
|
+
seg->cand = cand;
|
502
591
|
}
|
503
592
|
|
504
593
|
for (int j = i; j < i + nchars; j++) {
|
505
|
-
|
506
|
-
|
594
|
+
Candidate &cand = *(candmap[seg->fileId]);
|
595
|
+
if (cand[j] < nchars) {
|
596
|
+
cand.v_charscore[j] = nchars;
|
507
597
|
}
|
508
598
|
}
|
509
599
|
}
|
510
600
|
};
|
601
|
+
|
602
|
+
} // namespace StrIdx
|
603
|
+
|
604
|
+
#endif
|
data/test.rb
CHANGED
@@ -13,7 +13,13 @@ for x in lines
|
|
13
13
|
end
|
14
14
|
|
15
15
|
idx_time = Time.new
|
16
|
-
|
16
|
+
# Time to start the threadpool to process indexing
|
17
|
+
puts "\nIndexing launch time (#{lines.size} files}): #{(idx_time - t).round(4)} seconds"
|
18
|
+
|
19
|
+
idx.waitUntilDone() # Not necessary, will be called by idx.find
|
20
|
+
idx_time = Time.new
|
21
|
+
# Time when all threads have completed
|
22
|
+
puts "\nIndexing completed time (#{lines.size} files}): #{(idx_time - t).round(4)} seconds"
|
17
23
|
|
18
24
|
query = "rngnomadriv"
|
19
25
|
res = idx.find(query)
|
data/thread_pool.hpp
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
|
2
|
+
// Based on example in https://www.geeksforgeeks.org/thread-pool-in-cpp/
|
3
|
+
|
4
|
+
#include <condition_variable>
|
5
|
+
#include <functional>
|
6
|
+
#include <iostream>
|
7
|
+
#include <mutex>
|
8
|
+
#include <queue>
|
9
|
+
#include <thread>
|
10
|
+
#include <algorithm>
|
11
|
+
#include <iostream>
|
12
|
+
#include <fstream>
|
13
|
+
#include <vector>
|
14
|
+
#include <string>
|
15
|
+
#include <chrono>
|
16
|
+
|
17
|
+
/**
 * Fixed-size pool of worker threads that run queued tasks in FIFO order.
 *
 * Tasks are std::function<void()> objects added with enqueue().
 * waitUntilDone() blocks until the queue is empty AND no worker is still
 * executing a task. The destructor lets the workers drain the queue and then
 * joins them.
 */
class ThreadPool {
public:
  // Create a thread pool with given number of threads
  ThreadPool(size_t num_threads) {
    workerThreads.reserve(num_threads);
    for (size_t i = 0; i < num_threads; ++i) {
      workerThreads.emplace_back([this] { workerLoop(); });
    }
  }

  // Worker threads hold a pointer to this object, so it is never copied.
  ThreadPool(const ThreadPool &) = delete;
  ThreadPool &operator=(const ThreadPool &) = delete;

  // Destructor to stop the thread pool
  ~ThreadPool() {
    {
      std::lock_guard<std::mutex> lock(mu_queue);
      stop_ = true;
    }

    // Wake every worker so each one can observe stop_
    cv_.notify_all();

    // Workers finish the remaining queued tasks before exiting
    for (auto &thread : workerThreads) {
      thread.join();
    }
  }

  // Wait until all tasks assigned to the threads have been finished.
  // Checking only for an empty queue is not enough: a task that has been
  // popped may still be running, so in-flight tasks are counted as well.
  void waitUntilDone() {
    std::unique_lock<std::mutex> lock(mu_queue);
    cv_idle_.wait(lock, [this] { return taskQueue.empty() && activeTasks == 0; });
  }

  // Enqueue task for execution by the thread pool
  void enqueue(std::function<void()> task) {
    {
      std::lock_guard<std::mutex> lock(mu_queue);
      taskQueue.emplace(std::move(task));
    }
    cv_.notify_one();
  }

private:
  // Body of every worker thread: take the next task, run it outside the
  // lock, then report completion.
  void workerLoop() {
    while (true) {
      std::function<void()> task;
      {
        std::unique_lock<std::mutex> lock(mu_queue);

        // Waiting until there is a task to execute or the pool is stopped
        cv_.wait(lock, [this] { return !taskQueue.empty() || stop_; });

        // Exit the thread in case the pool is stopped and there are no tasks
        if (stop_ && taskQueue.empty()) {
          return;
        }

        // Get the next task from the queue and mark it as in flight while
        // still holding the lock, so waiters never see "empty and idle"
        // between the pop and the start of execution.
        task = std::move(taskQueue.front());
        taskQueue.pop();
        ++activeTasks;
      }

      task();

      {
        std::lock_guard<std::mutex> lock(mu_queue);
        --activeTasks;
        if (taskQueue.empty() && activeTasks == 0) {
          cv_idle_.notify_all();
        }
      }
    }
  }

  std::vector<std::thread> workerThreads;
  std::queue<std::function<void()>> taskQueue;

  // Guards taskQueue, activeTasks and stop_
  std::mutex mu_queue;

  // Condition variable to signal changes in the state of the tasks queue
  std::condition_variable cv_;

  // Signalled when the queue is empty and no task is being executed
  std::condition_variable cv_idle_;

  // Number of tasks that have been popped but have not finished yet
  size_t activeTasks = 0;

  // Flag to indicate whether the thread pool should stop
  bool stop_ = false;
};
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: StrIdx
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sami Sieranoja
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-05-
|
11
|
+
date: 2024-05-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -55,6 +55,7 @@ files:
|
|
55
55
|
- rubyext/ruby_interf.cpp
|
56
56
|
- stridx.hpp
|
57
57
|
- test.rb
|
58
|
+
- thread_pool.hpp
|
58
59
|
- unordered_dense.h
|
59
60
|
homepage: https://github.com/SamiSieranoja/stridx
|
60
61
|
licenses:
|
@@ -62,7 +63,7 @@ licenses:
|
|
62
63
|
metadata:
|
63
64
|
source_code_uri: https://github.com/SamiSieranoja/stridx
|
64
65
|
homepage_uri: https://github.com/SamiSieranoja/stridx
|
65
|
-
post_install_message:
|
66
|
+
post_install_message:
|
66
67
|
rdoc_options: []
|
67
68
|
require_paths:
|
68
69
|
- lib
|
@@ -79,7 +80,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
79
80
|
version: '0'
|
80
81
|
requirements: []
|
81
82
|
rubygems_version: 3.3.26
|
82
|
-
signing_key:
|
83
|
+
signing_key:
|
83
84
|
specification_version: 4
|
84
85
|
summary: StrIdx
|
85
86
|
test_files: []
|