StrIdx 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Makefile +2 -2
- data/README.md +10 -1
- data/demo.cpp +30 -8
- data/rubyext/extconf.rb +1 -3
- data/rubyext/ruby_interf.cpp +18 -5
- data/stridx.hpp +152 -58
- data/test.rb +7 -1
- data/thread_pool.hpp +98 -0
- metadata +6 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 52d40e64a5ace0231828bdbbe6fd94475ab5986d0c1fb7e35e0ce18463a97ae0
|
4
|
+
data.tar.gz: e1cdcc2ed9f377b2acb049a9fb6de22f24acdbd6e3552748b1307342c10b6cf7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f3c27923a568fe5916c17e91766066362a965abf9568b21a4daa269cd16a8a4778248ae935b26502aa482aad4807908d401989e7ebfb88d1fbdb011b0c240b60
|
7
|
+
data.tar.gz: f94dda8d71931c18ae3dc6b58204edda7ffd649bc7452a74fdba4929d6092183e99bf99d7b3632be5bafccfd0be7a877f5513c8c3e72e814dcca08bd79a9b217
|
data/Makefile
CHANGED
data/README.md
CHANGED
@@ -43,6 +43,15 @@ Install:
|
|
43
43
|
gem install StrIdx
|
44
44
|
```
|
45
45
|
|
46
|
+
Or, for the development version:
|
47
|
+
```
|
48
|
+
git clone https://github.com/SamiSieranoja/stridx.git
|
49
|
+
cd stridx
|
50
|
+
cd rubyext; ruby extconf.rb ; make ; cd ..
|
51
|
+
gem build stridx.gemspec
|
52
|
+
gem install $(ls -1tr StrIdx*gem | tail -n 1)
|
53
|
+
```
|
54
|
+
|
46
55
|
Usage example (see test.rb):
|
47
56
|
```ruby
|
48
57
|
require "stridx"
|
@@ -58,7 +67,7 @@ for x in lines
|
|
58
67
|
end
|
59
68
|
|
60
69
|
idx_time = Time.new
|
61
|
-
puts "\nIndexing time (#{lines.size} files
|
70
|
+
puts "\nIndexing time (#{lines.size} files): #{(idx_time - t).round(4)} seconds"
|
62
71
|
|
63
72
|
query = "rngnomadriv"
|
64
73
|
res = idx.find(query)
|
data/demo.cpp
CHANGED
@@ -1,3 +1,12 @@
|
|
1
|
+
|
2
|
+
#include <condition_variable>
|
3
|
+
#include <functional>
|
4
|
+
#include <iostream>
|
5
|
+
#include <mutex>
|
6
|
+
#include <queue>
|
7
|
+
#include <thread>
|
8
|
+
#include <algorithm>
|
9
|
+
|
1
10
|
#include "stridx.hpp"
|
2
11
|
|
3
12
|
#include <iostream>
|
@@ -28,7 +37,7 @@ std::vector<std::string> readLinesFromFile(const std::string &filename) {
|
|
28
37
|
}
|
29
38
|
|
30
39
|
int main() {
|
31
|
-
StringIndex idx;
|
40
|
+
StrIdx::StringIndex idx('/'); // Separate directories using unix style "/" char
|
32
41
|
// idx.addStrToIndex("./gdk/x11/gdkasync.c", 0 /*id*/, '/' /*separator*/);
|
33
42
|
// idx.addStrToIndex("./gdk/x11/gdksettings.c", 1, '/');
|
34
43
|
// idx.addStrToIndex("./gdk/x11/gdkx11devicemanager-xi2.h", 2, '/');
|
@@ -37,26 +46,39 @@ int main() {
|
|
37
46
|
std::string fn_filePaths = "flist.txt";
|
38
47
|
std::vector<std::string> v_filePaths = readLinesFromFile(fn_filePaths);
|
39
48
|
|
49
|
+
// Launch indexing to run in the background
|
50
|
+
cout << "File paths: " << v_filePaths.size() << std::endl;
|
51
|
+
cout << "Start indexing in the background" << std::endl;
|
40
52
|
auto start = std::chrono::high_resolution_clock::now();
|
41
53
|
int id = 0;
|
42
54
|
for (const auto &filePath : v_filePaths) {
|
43
|
-
idx.
|
44
|
-
// idx.addStrToIndex(filePath, id, '\0' /*dir separator*/);
|
55
|
+
idx.addStrToIndexThreaded(filePath, id);
|
45
56
|
id++;
|
46
57
|
}
|
47
|
-
|
58
|
+
|
59
|
+
auto idx_time_launch = std::chrono::high_resolution_clock::now();
|
60
|
+
std::chrono::duration<double, std::milli> duration_launch = idx_time_launch - start;
|
61
|
+
cout << "Indexing launch time (seconds): " << duration_launch.count() / 1000 << "\n";
|
62
|
+
|
63
|
+
// Wait until indexing has finished
|
64
|
+
idx.waitUntilDone();
|
65
|
+
|
48
66
|
auto idx_time = std::chrono::high_resolution_clock::now();
|
49
67
|
std::chrono::duration<double, std::milli> duration = idx_time - start;
|
50
|
-
cout << "Indexing
|
68
|
+
cout << "Indexing finished time for " << v_filePaths.size()
|
69
|
+
<< " file paths (seconds): " << duration.count() / 1000 << "\n";
|
51
70
|
|
52
71
|
// Find matching filepaths from the index for the query string "rngnomadriv"
|
53
72
|
start = std::chrono::high_resolution_clock::now();
|
54
73
|
std::string query = "rngnomadriv";
|
74
|
+
for (int i = 0; i < 99; i++) {
|
75
|
+
const vector<pair<float, int>> &results = idx.findSimilar(query, 2);
|
76
|
+
}
|
77
|
+
|
55
78
|
const vector<pair<float, int>> &results = idx.findSimilar(query, 2);
|
56
79
|
auto search_time = std::chrono::high_resolution_clock::now();
|
57
80
|
duration = search_time - start;
|
58
|
-
cout << "Search time (seconds): " << duration.count() / 1000
|
59
|
-
<< "\n";
|
81
|
+
cout << "Search time for 100 queries (seconds): " << duration.count() / 1000 << "\n";
|
60
82
|
|
61
83
|
int i = 0;
|
62
84
|
std::cout << "query string: " << query << "\n";
|
@@ -73,4 +95,4 @@ int main() {
|
|
73
95
|
}
|
74
96
|
|
75
97
|
// Compile:
|
76
|
-
// g++ -Wall -Wno-unused-variable -O3 -
|
98
|
+
// g++ -Wall -Wno-unused-variable -O3 -lstdc++ demo.cpp -o demo
|
data/rubyext/extconf.rb
CHANGED
@@ -1,15 +1,13 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
#
|
3
2
|
|
4
3
|
require 'mkmf'
|
5
4
|
|
6
5
|
module_name = "stridx"
|
7
6
|
extension_name = 'stridx'
|
8
7
|
|
9
|
-
$CXXFLAGS << " -Wall -Wno-unused-variable -O3
|
8
|
+
$CXXFLAGS << " -Wall -Wno-unused-variable -O3"
|
10
9
|
|
11
10
|
have_library( 'stdc++');
|
12
|
-
have_library( 'gomp' );
|
13
11
|
|
14
12
|
dir_config(extension_name) # The destination
|
15
13
|
create_makefile(extension_name) # Create Makefile
|
data/rubyext/ruby_interf.cpp
CHANGED
@@ -7,7 +7,7 @@
|
|
7
7
|
|
8
8
|
extern "C" {
|
9
9
|
|
10
|
-
void str_idx_free(void *data) { delete (StringIndex *)data; }
|
10
|
+
void str_idx_free(void *data) { delete (StrIdx::StringIndex *)data; }
|
11
11
|
|
12
12
|
// Wrap StringIndex class inside a ruby variable
|
13
13
|
static const rb_data_type_t str_idx_type = {
|
@@ -26,7 +26,7 @@ static const rb_data_type_t str_idx_type = {
|
|
26
26
|
};
|
27
27
|
|
28
28
|
VALUE str_idx_alloc(VALUE self) {
|
29
|
-
void *data = new StringIndex();
|
29
|
+
void *data = new StrIdx::StringIndex();
|
30
30
|
return TypedData_Wrap_Struct(self, &str_idx_type, data);
|
31
31
|
}
|
32
32
|
|
@@ -36,18 +36,27 @@ VALUE StringIndexAddSegments(VALUE self, VALUE str, VALUE fileId) {
|
|
36
36
|
|
37
37
|
void *data;
|
38
38
|
TypedData_Get_Struct(self, int, &str_idx_type, data);
|
39
|
-
((StringIndex *)data)->addStrToIndex(s1, fid);
|
39
|
+
// ((StringIndex *)data)->addStrToIndex(s1, fid);
|
40
|
+
((StrIdx::StringIndex *)data)->addStrToIndexThreaded(s1, fid);
|
40
41
|
|
41
42
|
return self;
|
42
43
|
}
|
43
44
|
|
45
|
+
VALUE StringIndexWaitUntilDone(VALUE self) {
|
46
|
+
void *data;
|
47
|
+
TypedData_Get_Struct(self, int, &str_idx_type, data);
|
48
|
+
((StrIdx::StringIndex *)data)->waitUntilDone();
|
49
|
+
return self;
|
50
|
+
}
|
51
|
+
|
52
|
+
|
44
53
|
VALUE StringIndexFind(VALUE self, VALUE str) {
|
45
54
|
VALUE ret;
|
46
55
|
std::string s1 = StringValueCStr(str);
|
47
56
|
|
48
57
|
void *data;
|
49
58
|
TypedData_Get_Struct(self, int, &str_idx_type, data);
|
50
|
-
StringIndex *idx = (StringIndex *)data;
|
59
|
+
StrIdx::StringIndex *idx = (StrIdx::StringIndex *)data;
|
51
60
|
|
52
61
|
ret = rb_ary_new();
|
53
62
|
const std::vector<std::pair<float, int>> &results = idx->findSimilar(s1, 2);
|
@@ -80,7 +89,7 @@ VALUE StringIndexSetDirSeparator(VALUE self, VALUE str) {
|
|
80
89
|
|
81
90
|
void *data;
|
82
91
|
TypedData_Get_Struct(self, int, &str_idx_type, data);
|
83
|
-
StringIndex *idx = (StringIndex *)data;
|
92
|
+
StrIdx::StringIndex *idx = (StrIdx::StringIndex *)data;
|
84
93
|
idx->setDirSeparator(c);
|
85
94
|
|
86
95
|
return self;
|
@@ -93,8 +102,12 @@ void Init_stridx(void) {
|
|
93
102
|
|
94
103
|
rb_define_alloc_func(classStringIndex, str_idx_alloc);
|
95
104
|
rb_define_method(classStringIndex, "add", StringIndexAddSegments, 2);
|
105
|
+
rb_define_method(classStringIndex, "waitUntilDone", StringIndexWaitUntilDone, 0);
|
96
106
|
rb_define_method(classStringIndex, "find", StringIndexFind, 1);
|
107
|
+
|
97
108
|
rb_define_method(classStringIndex, "setDirSeparator", StringIndexSetDirSeparator, 1);
|
109
|
+
|
110
|
+
|
98
111
|
}
|
99
112
|
|
100
113
|
} // End extern "C"
|
data/stridx.hpp
CHANGED
@@ -1,21 +1,66 @@
|
|
1
1
|
|
2
|
+
#ifndef SSSTRIDX_HPP
|
3
|
+
#define SSSTRIDX_HPP
|
4
|
+
|
2
5
|
#include <stdio.h>
|
3
6
|
#include <stdlib.h>
|
4
7
|
#include <cassert>
|
5
8
|
|
6
9
|
#include <vector>
|
10
|
+
#include <array>
|
7
11
|
#include <iostream>
|
8
12
|
#include <unordered_map>
|
9
13
|
#include <set>
|
10
14
|
#include <algorithm>
|
11
15
|
#include <sstream>
|
12
16
|
|
13
|
-
#
|
14
|
-
#include <
|
15
|
-
#
|
17
|
+
#include <vector>
|
18
|
+
#include <mutex>
|
19
|
+
#include <thread>
|
16
20
|
|
21
|
+
#include "thread_pool.hpp"
|
17
22
|
#include "unordered_dense.h"
|
18
23
|
|
24
|
+
namespace StrIdx {
|
25
|
+
|
26
|
+
/* Alternative to using std::cout
|
27
|
+
Allows controlling the verbosity level */
|
28
|
+
/* Small console logger that writes to std::cout and filters messages by a
   verbosity level chosen at construction time.
   The class does no locking of its own, so output written by several threads
   at once may interleave. */
class Output {
private:
  int verboseLevel; // Messages passed to printv() with a higher level are dropped

public:
  // verb: highest message level that printv() will still output
  Output(int verb) : verboseLevel(verb) {}
  // Default verbosity is 3
  Output() : Output(3) {}
  ~Output() = default;

  // Printing nothing is a no-op (also used by printl() for an empty argument list)
  void print() const {}

  // When calling as print("xxx ",3, " yyy") outputs "xxx 3 yyy"
  // Arguments are taken by const reference so that large values (e.g. a
  // std::string path) are streamed without being copied.
  template <typename T, typename... Types>
  void print(const T &var1, const Types &...var2) const {
    std::cout << var1;
    (std::cout << ... << var2); // stream the remaining arguments left to right
  }

  // When calling as printl("xxx ",3, " yyy") outputs "xxx 3 yyy\n"
  template <typename... Types> void printl(const Types &...var2) const {
    print(var2...);
    print("\n");
  }

  /* When calling as printv(2, "xxx ",3, " yyy") outputs "xxx 3 yyy\n"
   * if verboseLevel >= 2 (first arg).
   * When verboseLevel >= 3 the line is prefixed with "[v=<vlevel>] ".
   */
  template <typename... Types> void printv(int vlevel, const Types &...var2) const {
    if (verboseLevel < vlevel) {
      return; // message is more detailed than the configured verbosity
    }
    if (verboseLevel >= 3) {
      print("[v=", vlevel, "] ");
    }
    printl(var2...);
  }
};
|
63
|
+
|
19
64
|
// Transforms input string as follows:
|
20
65
|
// '/foo/bar/file1.txt'
|
21
66
|
// => vector{"foo", "bar", "file1.txt"}
|
@@ -34,7 +79,7 @@ std::vector<std::string> splitString(const std::string &input, const char &separ
|
|
34
79
|
}
|
35
80
|
|
36
81
|
// Convert int64_t to binary string
|
37
|
-
std::string int64ToBinaryString(int64_t num) {
|
82
|
+
[[nodiscard]] std::string int64ToBinaryString(int64_t num) {
|
38
83
|
std::string result;
|
39
84
|
for (int i = 63; i >= 0; --i) {
|
40
85
|
result += ((num >> i) & 1) ? '1' : '0';
|
@@ -42,8 +87,8 @@ std::string int64ToBinaryString(int64_t num) {
|
|
42
87
|
return result;
|
43
88
|
}
|
44
89
|
|
45
|
-
// Convert a (8 char) string represented as int64_t to std::string
|
46
|
-
std::string int64ToStr(int64_t key) {
|
90
|
+
// Debug. Convert a (8 char) string represented as int64_t to std::string
|
91
|
+
[[nodiscard]] std::string int64ToStr(int64_t key) {
|
47
92
|
int nchars = 8;
|
48
93
|
std::string str;
|
49
94
|
int multip = nchars * 8;
|
@@ -55,22 +100,24 @@ std::string int64ToStr(int64_t key) {
|
|
55
100
|
return str;
|
56
101
|
}
|
57
102
|
|
103
|
+
// Debug
|
58
104
|
void printVector(const std::vector<int> &vec) {
|
59
105
|
for (const auto &value : vec) {
|
60
106
|
std::cout << value << " ";
|
61
107
|
}
|
62
108
|
}
|
63
109
|
|
64
|
-
|
110
|
+
// Debug
|
111
|
+
[[nodiscard]] std::string charToBinaryString(char chr) {
|
65
112
|
std::string result;
|
66
113
|
for (int i = 7; i >= 0; --i) {
|
67
|
-
result += ((
|
114
|
+
result += ((chr >> i) & 1) ? '1' : '0';
|
68
115
|
}
|
69
116
|
return result;
|
70
117
|
}
|
71
118
|
|
72
119
|
class Candidate;
|
73
|
-
enum segmentType { Dir, File };
|
120
|
+
enum class segmentType { Dir, File };
|
74
121
|
|
75
122
|
// A segment of a file path
|
76
123
|
// e.g. if path is /foo/bar/baz.txt
|
@@ -81,17 +128,18 @@ public:
|
|
81
128
|
int fileId; // (if FILE)
|
82
129
|
Candidate *cand;
|
83
130
|
PathSegment *parent;
|
131
|
+
std::mutex mu;
|
84
132
|
ankerl::unordered_dense::map<std::string, PathSegment *> children;
|
85
|
-
segmentType type = Dir;
|
86
|
-
PathSegment() : parent(
|
87
|
-
PathSegment(std::string _str) : str(_str), parent(
|
133
|
+
segmentType type = segmentType::Dir;
|
134
|
+
PathSegment() : parent(nullptr) {}
|
135
|
+
PathSegment(std::string _str) : str(_str), parent(nullptr) {}
|
88
136
|
PathSegment(std::string _str, int _fileId)
|
89
|
-
: str(_str), fileId(_fileId), cand(
|
90
|
-
int size() {
|
137
|
+
: str(_str), fileId(_fileId), cand(nullptr), parent(nullptr) {}
|
138
|
+
[[nodiscard]] int size() {
|
91
139
|
int sz = str.size();
|
92
140
|
PathSegment *cur = parent;
|
93
141
|
// Sum up length of parent segments (+1 for divisors)
|
94
|
-
while (cur->parent !=
|
142
|
+
while (cur->parent != nullptr) {
|
95
143
|
sz += cur->str.size() + 1;
|
96
144
|
cur = cur->parent;
|
97
145
|
}
|
@@ -118,7 +166,7 @@ public:
|
|
118
166
|
// Initialize v_charscores with zeros
|
119
167
|
v_charscore.resize(len, 0);
|
120
168
|
candLen = str.size();
|
121
|
-
seg =
|
169
|
+
seg = nullptr;
|
122
170
|
}
|
123
171
|
|
124
172
|
Candidate(PathSegment *_seg, int _len) : seg(_seg), len(_len) {
|
@@ -127,7 +175,7 @@ public:
|
|
127
175
|
candLen = seg->size();
|
128
176
|
}
|
129
177
|
|
130
|
-
float getScore() {
|
178
|
+
[[nodiscard]] float getScore() {
|
131
179
|
int i = 0;
|
132
180
|
float score = 0.0;
|
133
181
|
candLen = seg->size();
|
@@ -145,19 +193,21 @@ public:
|
|
145
193
|
return score;
|
146
194
|
}
|
147
195
|
|
148
|
-
float operator[](int idx) { return v_charscore[idx]; }
|
196
|
+
[[nodiscard]] float operator[](int idx) { return v_charscore[idx]; }
|
149
197
|
};
|
150
198
|
|
151
199
|
// This seems to give 10x speed improvement over std::unordered_map
|
152
200
|
typedef ankerl::unordered_dense::map<int64_t, std::set<PathSegment *> *> SegMap;
|
153
201
|
// typedef std::unordered_map<int64_t, std::set<PathSegment *> *> SegMap;
|
154
202
|
|
155
|
-
typedef
|
203
|
+
typedef ankerl::unordered_dense::map<int, Candidate *> CandMap;
|
204
|
+
// typedef std::unordered_map<int, Candidate*> CandMap;
|
156
205
|
|
157
206
|
class StringIndex {
|
158
207
|
private:
|
159
208
|
int tmp;
|
160
209
|
char dirSeparator = '/'; // Usually '/', '\' or '\0' (no separator)
|
210
|
+
int numStrings = 0;
|
161
211
|
|
162
212
|
std::vector<SegMap *> dirmaps;
|
163
213
|
std::vector<SegMap *> filemaps;
|
@@ -170,10 +220,16 @@ private:
|
|
170
220
|
int dirId = 0;
|
171
221
|
float dirWeight = 0.7; // Give only 70% of score if match is for a directory
|
172
222
|
|
223
|
+
std::array<std::mutex, 9> mts_f;
|
224
|
+
std::array<std::mutex, 9> mts_d;
|
225
|
+
|
226
|
+
std::unique_ptr<ThreadPool> pool;
|
227
|
+
Output out{1}; // verbose level = 1
|
228
|
+
|
173
229
|
public:
|
174
|
-
StringIndex() {
|
230
|
+
StringIndex(char sep) : dirSeparator(sep) {
|
175
231
|
root = new PathSegment();
|
176
|
-
root->parent =
|
232
|
+
root->parent = nullptr;
|
177
233
|
root->str = "[ROOT]";
|
178
234
|
|
179
235
|
for (int i = 0; i <= 8; i++) {
|
@@ -181,11 +237,18 @@ public:
|
|
181
237
|
filemaps.push_back(new SegMap);
|
182
238
|
}
|
183
239
|
|
184
|
-
|
185
|
-
|
186
|
-
|
240
|
+
// Threads between 4 and 6
|
241
|
+
// We don't seem to get any benefit from more than 6 threads even if the hardware supports it
|
242
|
+
int num_threads = std::max((int)std::thread::hardware_concurrency(), 4);
|
243
|
+
num_threads = std::min(num_threads, 6);
|
244
|
+
out.printv(2, "Number of threads: ", num_threads);
|
245
|
+
pool = std::unique_ptr<ThreadPool>(new ThreadPool(num_threads));
|
187
246
|
}
|
188
247
|
|
248
|
+
/* With separator '\0' the path is not split into segments.
|
249
|
+
This is slower, but also works for data other than file paths. */
|
250
|
+
StringIndex() : StringIndex('\0') {}
|
251
|
+
|
189
252
|
void setDirSeparator(char sep) { dirSeparator = sep; }
|
190
253
|
void setDirWeight(float val) { dirWeight = val; }
|
191
254
|
|
@@ -213,6 +276,13 @@ public:
|
|
213
276
|
addStrToIndex(filePath, fileId, dirSeparator);
|
214
277
|
}
|
215
278
|
|
279
|
+
void addStrToIndexThreaded(std::string filePath, int fileId) {
|
280
|
+
pool->enqueue([=] { addStrToIndex(filePath, fileId, dirSeparator); });
|
281
|
+
}
|
282
|
+
void waitUntilReady() { pool->waitUntilDone(); }
|
283
|
+
|
284
|
+
void waitUntilDone() { pool->waitUntilDone(); }
|
285
|
+
|
216
286
|
/**
|
217
287
|
* Add a string to the index to be searched for afterwards
|
218
288
|
*
|
@@ -222,8 +292,10 @@ public:
|
|
222
292
|
* one of {'\\', '/', '\0' (no separation)}.
|
223
293
|
*/
|
224
294
|
void addStrToIndex(std::string filePath, int fileId, const char &separator) {
|
295
|
+
out.printv(3, "Add file:", filePath, ",", fileId, ",", separator);
|
225
296
|
|
226
297
|
std::vector<std::string> segs;
|
298
|
+
numStrings += 1;
|
227
299
|
|
228
300
|
if (separator == '\0') {
|
229
301
|
// No separation to directories & files
|
@@ -233,7 +305,7 @@ public:
|
|
233
305
|
segs = splitString(filePath, separator);
|
234
306
|
}
|
235
307
|
|
236
|
-
PathSegment *prev =
|
308
|
+
PathSegment *prev = nullptr;
|
237
309
|
prev = root;
|
238
310
|
// Add segments to a tree type data structure
|
239
311
|
// e.g. addStrToIndex('/foo/bar/file1.txt' ..)
|
@@ -245,25 +317,27 @@ public:
|
|
245
317
|
auto x = *_x;
|
246
318
|
PathSegment *p;
|
247
319
|
|
248
|
-
|
320
|
+
prev->mu.lock();
|
321
|
+
|
249
322
|
// this part of the path already exists in the tree
|
250
|
-
if (it != prev->children.end()) {
|
323
|
+
if (auto it = prev->children.find(x); it != prev->children.end()) {
|
251
324
|
p = it->second;
|
325
|
+
prev->mu.unlock();
|
252
326
|
} else {
|
253
327
|
p = new PathSegment(x, fileId);
|
254
328
|
p->parent = prev;
|
255
|
-
// If this is last item in segs
|
329
|
+
// If this is last item in segs, then it is a file.
|
256
330
|
if (_x == std::prev(segs.end())) {
|
257
|
-
|
258
|
-
p->type = File;
|
331
|
+
p->type = segmentType::File;
|
259
332
|
seglist[fileId] = p;
|
260
|
-
} else {
|
261
|
-
p->type = Dir;
|
333
|
+
} else { // otherwise, it is a directory
|
334
|
+
p->type = segmentType::Dir;
|
262
335
|
p->fileId = dirId;
|
263
336
|
// Files use user input Id. Directories need to have it generated
|
264
337
|
dirId++;
|
265
338
|
}
|
266
339
|
prev->children[x] = p;
|
340
|
+
prev->mu.unlock();
|
267
341
|
addPathSegmentKeys(p);
|
268
342
|
}
|
269
343
|
|
@@ -303,14 +377,16 @@ public:
|
|
303
377
|
@param query String to search for inside the index
|
304
378
|
*/
|
305
379
|
|
306
|
-
std::vector<std::pair<float, int>> findSimilar(std::string query) {
|
380
|
+
[[nodiscard]] std::vector<std::pair<float, int>> findSimilar(std::string query) {
|
307
381
|
return findSimilar(query, 2);
|
308
382
|
}
|
309
383
|
|
310
|
-
std::vector<std::pair<float, int>> findSimilar(std::string query, int minChars) {
|
384
|
+
[[nodiscard]] std::vector<std::pair<float, int>> findSimilar(std::string query, int minChars) {
|
311
385
|
CandMap fileCandMap;
|
312
386
|
CandMap dirCandMap;
|
313
387
|
|
388
|
+
waitUntilDone();
|
389
|
+
|
314
390
|
// Find both files and directories that match the input query
|
315
391
|
addToCandMap(fileCandMap, query, filemaps);
|
316
392
|
addToCandMap(dirCandMap, query, dirmaps);
|
@@ -319,9 +395,9 @@ public:
|
|
319
395
|
scores of the file */
|
320
396
|
mergeCandidateMaps(fileCandMap, dirCandMap);
|
321
397
|
|
322
|
-
// Set all candidate pointers to
|
398
|
+
// Set all candidate pointers to nullptr so they won't mess up future searches
|
323
399
|
for (auto seg : segsToClean) {
|
324
|
-
seg->cand =
|
400
|
+
seg->cand = nullptr;
|
325
401
|
}
|
326
402
|
segsToClean.clear();
|
327
403
|
|
@@ -329,11 +405,17 @@ public:
|
|
329
405
|
std::vector<std::pair<float, int>> results;
|
330
406
|
for (auto &[fid, cand] : fileCandMap) {
|
331
407
|
std::pair<float, int> v;
|
332
|
-
float sc = cand
|
408
|
+
float sc = cand->getScore();
|
333
409
|
v.first = sc;
|
334
410
|
v.second = fid;
|
335
411
|
results.push_back(v);
|
412
|
+
delete cand;
|
336
413
|
}
|
414
|
+
|
415
|
+
for (auto &[fid, cand] : dirCandMap) {
|
416
|
+
delete cand;
|
417
|
+
}
|
418
|
+
|
337
419
|
// Sort highest score first
|
338
420
|
std::sort(results.begin(), results.end(),
|
339
421
|
[](std::pair<float, int> a, std::pair<float, int> b) { return a.first > b.first; });
|
@@ -341,10 +423,10 @@ public:
|
|
341
423
|
}
|
342
424
|
|
343
425
|
// Return int64_t representation of the first nchars in str, starting from index i
|
344
|
-
int64_t getKeyAtIdx(std::string str, int i, int nchars) {
|
426
|
+
[[nodiscard]] int64_t getKeyAtIdx(std::string str, int i, int nchars) {
|
345
427
|
int64_t key = 0;
|
346
428
|
for (int i_char = 0; i_char < nchars; i_char++) {
|
347
|
-
key = key | static_cast<
|
429
|
+
key = key | static_cast<int64_t>(str[i + i_char]);
|
348
430
|
if (i_char < nchars - 1) {
|
349
431
|
// Shift 8 bits to the left except on the last iteration
|
350
432
|
key = key << 8;
|
@@ -399,22 +481,29 @@ private:
|
|
399
481
|
maxChars = p->str.size();
|
400
482
|
}
|
401
483
|
|
402
|
-
#ifdef _OPENMP
|
403
|
-
#pragma omp parallel for
|
404
|
-
#endif
|
405
484
|
for (int sublen = minChars; sublen <= maxChars; sublen++) {
|
406
485
|
|
486
|
+
std::mutex *mu;
|
407
487
|
SegMap *map;
|
408
|
-
if (p->type == File) {
|
488
|
+
if (p->type == segmentType::File) {
|
409
489
|
map = filemaps[sublen];
|
490
|
+
mu = &mts_f[sublen];
|
410
491
|
} else {
|
411
492
|
map = dirmaps[sublen];
|
493
|
+
mu = &mts_d[sublen];
|
412
494
|
}
|
413
495
|
|
414
496
|
int count = str.size() - sublen + 1;
|
415
497
|
|
498
|
+
int64_t keys[count + 1];
|
416
499
|
for (int i = 0; i <= count; i++) {
|
417
|
-
|
500
|
+
keys[i] = getKeyAtIdx(str, i, sublen);
|
501
|
+
}
|
502
|
+
|
503
|
+
mu->lock();
|
504
|
+
for (int i = 0; i <= count; i++) {
|
505
|
+
// int64_t key = getKeyAtIdx(str, i, sublen);
|
506
|
+
auto key = keys[i];
|
418
507
|
|
419
508
|
// Create a new std::set for key if doesn't exist already
|
420
509
|
auto it = map->find(key);
|
@@ -423,12 +512,14 @@ private:
|
|
423
512
|
}
|
424
513
|
(*map)[key]->insert(p);
|
425
514
|
}
|
515
|
+
mu->unlock();
|
426
516
|
}
|
427
517
|
}
|
428
518
|
|
429
519
|
// Find pathsegments from <map> that include the substring of <str> which starts at index <i> and
|
430
520
|
// is of length <nchars>.
|
431
|
-
std::vector<PathSegment *> findSimilarForNgram(std::string str, int i, int nchars,
|
521
|
+
[[nodiscard]] std::vector<PathSegment *> findSimilarForNgram(std::string str, int i, int nchars,
|
522
|
+
SegMap &map) {
|
432
523
|
|
433
524
|
assert(i + nchars <= static_cast<int>(str.size()));
|
434
525
|
std::vector<PathSegment *> res;
|
@@ -437,8 +528,7 @@ private:
|
|
437
528
|
// transform that to 64 bit integer
|
438
529
|
int64_t key = getKeyAtIdx(str, i, nchars);
|
439
530
|
// Find all path segments in map that have the same substring
|
440
|
-
auto it = map.find(key);
|
441
|
-
if (it != map.end()) { // key found
|
531
|
+
if (auto it = map.find(key); it != map.end()) { // key found
|
442
532
|
auto set = it->second;
|
443
533
|
for (auto value : *set) {
|
444
534
|
res.push_back(value);
|
@@ -475,12 +565,12 @@ private:
|
|
475
565
|
void mergeCandidateMaps(CandMap &fileCandMap, CandMap &dirCandMap) {
|
476
566
|
|
477
567
|
for (auto &[fid, cand] : fileCandMap) {
|
478
|
-
PathSegment *p = cand
|
479
|
-
while (p->parent !=
|
480
|
-
if (p->cand !=
|
481
|
-
auto &scoreA = cand
|
568
|
+
PathSegment *p = cand->seg->parent;
|
569
|
+
while (p->parent != nullptr) {
|
570
|
+
if (p->cand != nullptr) {
|
571
|
+
auto &scoreA = cand->v_charscore;
|
482
572
|
auto &scoreB = p->cand->v_charscore;
|
483
|
-
for (int i = 0; i < cand
|
573
|
+
for (int i = 0; i < cand->len; i++) {
|
484
574
|
if (scoreA[i] < scoreB[i] * dirWeight) {
|
485
575
|
scoreA[i] = scoreB[i] * dirWeight;
|
486
576
|
}
|
@@ -493,18 +583,22 @@ private:
|
|
493
583
|
|
494
584
|
void addToResults(PathSegment *seg, std::string str, int i, int nchars, CandMap &candmap) {
|
495
585
|
|
496
|
-
auto it2 = candmap.find(seg->fileId);
|
497
|
-
|
498
|
-
Candidate cand(seg, str.size());
|
499
|
-
seg->cand = &(candmap[seg->fileId]);
|
586
|
+
if (auto it2 = candmap.find(seg->fileId); it2 == candmap.end()) {
|
587
|
+
Candidate *cand = new Candidate(seg, str.size());
|
500
588
|
segsToClean.push_back(seg);
|
501
589
|
candmap[seg->fileId] = cand;
|
590
|
+
seg->cand = cand;
|
502
591
|
}
|
503
592
|
|
504
593
|
for (int j = i; j < i + nchars; j++) {
|
505
|
-
|
506
|
-
|
594
|
+
Candidate &cand = *(candmap[seg->fileId]);
|
595
|
+
if (cand[j] < nchars) {
|
596
|
+
cand.v_charscore[j] = nchars;
|
507
597
|
}
|
508
598
|
}
|
509
599
|
}
|
510
600
|
};
|
601
|
+
|
602
|
+
} // namespace StrIdx
|
603
|
+
|
604
|
+
#endif
|
data/test.rb
CHANGED
@@ -13,7 +13,13 @@ for x in lines
|
|
13
13
|
end
|
14
14
|
|
15
15
|
idx_time = Time.new
|
16
|
-
|
16
|
+
# Time to start the threadpool to process indexing
|
17
|
+
# Elapsed time since t, rounded to 4 decimals
puts "\nIndexing launch time (#{lines.size} files): #{(idx_time - t).round(4)} seconds"
|
18
|
+
|
19
|
+
idx.waitUntilDone() # Not necessary, will be called by idx.find
|
20
|
+
idx_time = Time.new
|
21
|
+
# Time when all threads have completed
|
22
|
+
# Elapsed time since t, rounded to 4 decimals
puts "\nIndexing completed time (#{lines.size} files): #{(idx_time - t).round(4)} seconds"
|
17
23
|
|
18
24
|
query = "rngnomadriv"
|
19
25
|
res = idx.find(query)
|
data/thread_pool.hpp
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
|
2
|
+
// Based on example in https://www.geeksforgeeks.org/thread-pool-in-cpp/
|
3
|
+
|
4
|
+
#include <condition_variable>
|
5
|
+
#include <functional>
|
6
|
+
#include <iostream>
|
7
|
+
#include <mutex>
|
8
|
+
#include <queue>
|
9
|
+
#include <thread>
|
10
|
+
#include <algorithm>
|
11
|
+
#include <iostream>
|
12
|
+
#include <fstream>
|
13
|
+
#include <vector>
|
14
|
+
#include <string>
|
15
|
+
#include <chrono>
|
16
|
+
|
17
|
+
// Fixed-size pool of worker threads that execute queued std::function<void()>
// tasks in FIFO order.
class ThreadPool {
public:
  // Create a thread pool with given number of threads
  ThreadPool(size_t num_threads) {
    workerThreads.reserve(num_threads);
    for (size_t i = 0; i < num_threads; ++i) {
      workerThreads.emplace_back([this] { workerLoop(); });
    }
  }

  // Destructor to stop the thread pool.
  // Tasks still in the queue are executed before the workers exit.
  ~ThreadPool() {
    {
      std::lock_guard<std::mutex> lock(mu_queue);
      stop_ = true;
    }

    // Notify all threads
    cv_.notify_all();

    // Joining all worker threads to ensure they have
    // completed their tasks
    for (auto &thread : workerThreads) {
      thread.join();
    }
  }

  // Wait until all tasks assigned to the threads have been finished.
  // Returns only when the queue is empty AND no worker is still executing a
  // task it has already taken from the queue.
  void waitUntilDone() {
    std::unique_lock<std::mutex> lock(mu_queue);
    cv_idle_.wait(lock, [this] { return taskQueue.empty() && numActive_ == 0; });
  }

  // Enqueue task for execution by the thread pool
  void enqueue(std::function<void()> task) {
    {
      std::lock_guard<std::mutex> lock(mu_queue);
      taskQueue.emplace(std::move(task));
    }
    cv_.notify_one();
  }

private:
  // Body of every worker thread: take tasks from the queue until the pool is
  // stopped and the queue has been drained.
  void workerLoop() {
    while (true) {
      std::function<void()> task;
      {
        std::unique_lock<std::mutex> lock(mu_queue);

        // Waiting until there is a task to execute or the pool is stopped
        cv_.wait(lock, [this] { return !taskQueue.empty() || stop_; });

        // Exit the thread in case the pool is stopped and there are no tasks
        if (stop_ && taskQueue.empty()) {
          return;
        }

        // Get the next task from the queue
        task = std::move(taskQueue.front());
        taskQueue.pop();
        // Counted under the same lock as the pop, so waitUntilDone() never
        // sees "queue empty" while this task is still pending execution.
        ++numActive_;
      }

      task();

      {
        std::lock_guard<std::mutex> lock(mu_queue);
        --numActive_;
        if (taskQueue.empty() && numActive_ == 0) {
          cv_idle_.notify_all(); // wake callers blocked in waitUntilDone()
        }
      }
    }
  }

  std::vector<std::thread> workerThreads;
  std::queue<std::function<void()>> taskQueue;
  std::mutex mu_queue; // guards taskQueue, stop_ and numActive_

  // Condition variable to signal changes in the state of the tasks queue
  std::condition_variable cv_;

  // Signalled when the queue is empty and no task is being executed
  std::condition_variable cv_idle_;

  // Number of tasks taken from the queue that have not finished yet
  size_t numActive_ = 0;

  // Flag to indicate whether the thread pool should stop
  bool stop_ = false;
};
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: StrIdx
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sami Sieranoja
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-05-
|
11
|
+
date: 2024-05-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -55,6 +55,7 @@ files:
|
|
55
55
|
- rubyext/ruby_interf.cpp
|
56
56
|
- stridx.hpp
|
57
57
|
- test.rb
|
58
|
+
- thread_pool.hpp
|
58
59
|
- unordered_dense.h
|
59
60
|
homepage: https://github.com/SamiSieranoja/stridx
|
60
61
|
licenses:
|
@@ -62,7 +63,7 @@ licenses:
|
|
62
63
|
metadata:
|
63
64
|
source_code_uri: https://github.com/SamiSieranoja/stridx
|
64
65
|
homepage_uri: https://github.com/SamiSieranoja/stridx
|
65
|
-
post_install_message:
|
66
|
+
post_install_message:
|
66
67
|
rdoc_options: []
|
67
68
|
require_paths:
|
68
69
|
- lib
|
@@ -79,7 +80,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
79
80
|
version: '0'
|
80
81
|
requirements: []
|
81
82
|
rubygems_version: 3.3.26
|
82
|
-
signing_key:
|
83
|
+
signing_key:
|
83
84
|
specification_version: 4
|
84
85
|
summary: StrIdx
|
85
86
|
test_files: []
|