StrIdx 0.1.0 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Makefile +6 -265
- data/README.md +71 -2
- data/flist.txt +89828 -0
- data/rubyext/ruby_interf.cpp +29 -8
- data/stridx.hpp +9 -5
- data/{rubyext/test.rb → test.rb} +10 -13
- metadata +10 -7
data/rubyext/ruby_interf.cpp
CHANGED
@@ -36,12 +36,12 @@ VALUE StringIndexAddSegments(VALUE self, VALUE str, VALUE fileId) {
|
|
36
36
|
|
37
37
|
void *data;
|
38
38
|
TypedData_Get_Struct(self, int, &str_idx_type, data);
|
39
|
-
((StringIndex *)data)->addStrToIndex(s1, fid
|
39
|
+
((StringIndex *)data)->addStrToIndex(s1, fid);
|
40
40
|
|
41
41
|
return self;
|
42
42
|
}
|
43
43
|
|
44
|
-
VALUE StringIndexFind(VALUE self, VALUE str
|
44
|
+
VALUE StringIndexFind(VALUE self, VALUE str) {
|
45
45
|
VALUE ret;
|
46
46
|
std::string s1 = StringValueCStr(str);
|
47
47
|
|
@@ -50,7 +50,7 @@ VALUE StringIndexFind(VALUE self, VALUE str, VALUE minChars) {
|
|
50
50
|
StringIndex *idx = (StringIndex *)data;
|
51
51
|
|
52
52
|
ret = rb_ary_new();
|
53
|
-
const std::vector<std::pair<float, int>> &results = idx->findSimilar(s1,
|
53
|
+
const std::vector<std::pair<float, int>> &results = idx->findSimilar(s1, 2);
|
54
54
|
int limit = 15;
|
55
55
|
int i = 0;
|
56
56
|
for (const auto &res : results) {
|
@@ -66,14 +66,35 @@ VALUE StringIndexFind(VALUE self, VALUE str, VALUE minChars) {
|
|
66
66
|
return ret;
|
67
67
|
}
|
68
68
|
|
69
|
+
VALUE StringIndexSetDirSeparator(VALUE self, VALUE str) {
|
70
|
+
char c = '/';
|
71
|
+
if (TYPE(str) == T_STRING) {
|
72
|
+
std::string s = StringValueCStr(str);
|
73
|
+
if (s.size() >= 1) {
|
74
|
+
c = s[0];
|
75
|
+
}
|
76
|
+
} else {
|
77
|
+
c = '\0'; // No separator
|
78
|
+
// if (TYPE(obj) == T_NIL)
|
79
|
+
}
|
80
|
+
|
81
|
+
void *data;
|
82
|
+
TypedData_Get_Struct(self, int, &str_idx_type, data);
|
83
|
+
StringIndex *idx = (StringIndex *)data;
|
84
|
+
idx->setDirSeparator(c);
|
85
|
+
|
86
|
+
return self;
|
87
|
+
}
|
88
|
+
|
69
89
|
void Init_stridx(void) {
|
70
90
|
|
71
|
-
VALUE
|
91
|
+
VALUE mStrIdx = rb_define_module("StrIdx");
|
92
|
+
VALUE classStringIndex = rb_define_class_under(mStrIdx, "StringIndex", rb_cObject);
|
72
93
|
|
73
|
-
rb_define_alloc_func(
|
74
|
-
rb_define_method(
|
75
|
-
rb_define_method(
|
94
|
+
rb_define_alloc_func(classStringIndex, str_idx_alloc);
|
95
|
+
rb_define_method(classStringIndex, "add", StringIndexAddSegments, 2);
|
96
|
+
rb_define_method(classStringIndex, "find", StringIndexFind, 1);
|
97
|
+
rb_define_method(classStringIndex, "setDirSeparator", StringIndexSetDirSeparator, 1);
|
76
98
|
}
|
77
99
|
|
78
100
|
} // End extern "C"
|
79
|
-
|
data/stridx.hpp
CHANGED
@@ -214,7 +214,7 @@ public:
|
|
214
214
|
}
|
215
215
|
|
216
216
|
/**
|
217
|
-
* Add a string to the index to be
|
217
|
+
* Add a string to the index to be searched for afterwards
|
218
218
|
*
|
219
219
|
* @param filePath String to index (e.g. /home/user/Project/main.cpp).
|
220
220
|
* @param fileId Unique identifier for filePath. Will be returned as a result from findSimilar.
|
@@ -272,7 +272,7 @@ public:
|
|
272
272
|
}
|
273
273
|
|
274
274
|
/**
|
275
|
-
|
275
|
+
The search will find filepaths similar to the input string
|
276
276
|
|
277
277
|
To be considered a candidate path, the file component of the path (e.g. file.txt)
|
278
278
|
is required to have at least a substring of two characters in common with the
|
@@ -286,8 +286,8 @@ public:
|
|
286
286
|
is also included in the PathSegment
|
287
287
|
- take the length of that substring as score
|
288
288
|
sum up the scores for each character c and divide by (string length)^2
|
289
|
-
|
290
|
-
For example, if query = "rngnomadriv"
|
289
|
+
|
290
|
+
For example, if query = "rngnomadriv"
|
291
291
|
and candidate is "./drivers/char/hw_random/nomadik-rng.c", then scores are calculated
|
292
292
|
as follows:
|
293
293
|
rngnomadriv
|
@@ -296,13 +296,17 @@ public:
|
|
296
296
|
score1=(3+3+3+5+5+5+5+5+(4+4+4)*0.7)
|
297
297
|
|
298
298
|
In final score, give a small penalty for larger candidate filenames:
|
299
|
-
Divide main part of score with (query string length)^2
|
299
|
+
Divide main part of score with (query string length)^2
|
300
300
|
and minor part by (query string length)*(candidate string length)
|
301
301
|
score = score1/(11*11)*0.97 + score1/(11*38)*0.03 = 0.342944
|
302
302
|
|
303
303
|
@param query String to search for inside the index
|
304
304
|
*/
|
305
305
|
|
306
|
+
std::vector<std::pair<float, int>> findSimilar(std::string query) {
|
307
|
+
return findSimilar(query, 2);
|
308
|
+
}
|
309
|
+
|
306
310
|
std::vector<std::pair<float, int>> findSimilar(std::string query, int minChars) {
|
307
311
|
CandMap fileCandMap;
|
308
312
|
CandMap dirCandMap;
|
data/{rubyext/test.rb → test.rb}
RENAMED
@@ -1,34 +1,31 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
$:.unshift File.dirname(__FILE__)
|
4
3
|
require "stridx"
|
5
|
-
idx =
|
4
|
+
idx = StrIdx::StringIndex.new
|
6
5
|
|
7
6
|
t = Time.new
|
8
|
-
fn = File.expand_path("
|
7
|
+
fn = File.expand_path("flist.txt")
|
9
8
|
lines = IO.read(fn).lines.collect { |x| x.strip }
|
10
|
-
i =
|
9
|
+
i = 0
|
11
10
|
for x in lines
|
12
11
|
idx.add(x, i)
|
13
12
|
i += 1
|
14
13
|
end
|
15
14
|
|
16
15
|
idx_time = Time.new
|
17
|
-
puts "\nIndexing time: #{idx_time - t}"
|
18
|
-
|
19
|
-
|
16
|
+
puts "\nIndexing time (#{lines.size} files}): #{(idx_time - t).round(4)} seconds"
|
17
|
+
|
18
|
+
query = "rngnomadriv"
|
19
|
+
res = idx.find(query)
|
20
20
|
puts "query: #{query}"
|
21
21
|
puts "\nResults:"
|
22
22
|
puts "Filename, score"
|
23
23
|
puts "==============="
|
24
|
-
for
|
25
|
-
fn = lines[
|
26
|
-
score = x[1]
|
24
|
+
for id, score in res
|
25
|
+
fn = lines[id]
|
27
26
|
puts "#{fn}, #{score.round(4)}"
|
28
|
-
# pp [lines[x[0] - 1], x[1]]
|
29
27
|
end
|
30
28
|
|
31
|
-
|
32
29
|
query_time = Time.new
|
33
30
|
|
34
|
-
puts "\nSearch time: #{query_time - idx_time}"
|
31
|
+
puts "\nSearch time: #{(query_time - idx_time).round(4)} seconds"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: StrIdx
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sami Sieranoja
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-05-
|
11
|
+
date: 2024-05-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -50,16 +50,19 @@ files:
|
|
50
50
|
- Makefile
|
51
51
|
- README.md
|
52
52
|
- demo.cpp
|
53
|
+
- flist.txt
|
53
54
|
- rubyext/extconf.rb
|
54
55
|
- rubyext/ruby_interf.cpp
|
55
|
-
- rubyext/test.rb
|
56
56
|
- stridx.hpp
|
57
|
+
- test.rb
|
57
58
|
- unordered_dense.h
|
58
59
|
homepage: https://github.com/SamiSieranoja/stridx
|
59
60
|
licenses:
|
60
61
|
- LGPL-2.0+
|
61
|
-
metadata:
|
62
|
-
|
62
|
+
metadata:
|
63
|
+
source_code_uri: https://github.com/SamiSieranoja/stridx
|
64
|
+
homepage_uri: https://github.com/SamiSieranoja/stridx
|
65
|
+
post_install_message:
|
63
66
|
rdoc_options: []
|
64
67
|
require_paths:
|
65
68
|
- lib
|
@@ -76,7 +79,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
76
79
|
version: '0'
|
77
80
|
requirements: []
|
78
81
|
rubygems_version: 3.3.26
|
79
|
-
signing_key:
|
82
|
+
signing_key:
|
80
83
|
specification_version: 4
|
81
84
|
summary: StrIdx
|
82
85
|
test_files: []
|