StrIdx 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +71 -2
- data/rubyext/ruby_interf.cpp +7 -7
- data/{rubyext/test.rb → test.rb} +10 -13
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c3e062e9a4d99367ff312ab222e861362e05381a034f513d1581cc2464df53df
|
4
|
+
data.tar.gz: 6088bf73c44fa3487757f2a0d3a60fcc6b00ee0514216a76278f027d83cc7807
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ece52c12cef6f1460e995e296fbaed2911e45a15d46cbe1fd1b0ff21f8ae546198078a83951cb9430bf82ff81d866a22e0244c7472f566efa2376285b8e805b2
|
7
|
+
data.tar.gz: 8b6751d914e41ffbe9c69274054ae398679cabdd159f0ab1c45ca5dc899a9a34c626f782ca4518c68b7d599efe71ea61489e277061b42284bc22345d793b116e
|
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
#
|
1
|
+
# StrIdx
|
2
2
|
This library provides fast fuzzy string similarity search and indexing. It has been mainly developed for indexing filepaths, but can be used for other types of strings as well. It can easily handle fuzzy searches for more than 100,000 filepaths.
|
3
3
|
|
4
4
|
The fuzziness means that candidate filepaths do not need to include an exact match of the query string. They are considered a good match if they include parts of the query string, and even if those parts are in the wrong order.
|
@@ -26,7 +26,7 @@ Sum up the scores for each character c and divide by (string length)^2
|
|
26
26
|
For example, if query = "rngnomadriv"
|
27
27
|
and candidate is "./drivers/char/hw_random/nomadik-rng.c", then scores are calculated as follows:
|
28
28
|
```
|
29
|
-
rngnomadriv
|
29
|
+
rngnomadriv (substrings rng=3, nomad=5 and driv=4)
|
30
30
|
33355555444 (subscores)
|
31
31
|
FFFFFFFFDDD (F=file component, D=dir component)
|
32
32
|
score1=(3+3+3+5+5+5+5+5+(4+4+4)*0.7)
|
@@ -36,6 +36,75 @@ and candidate is "./drivers/char/hw_random/nomadik-rng.c", then scores are calcu
|
|
36
36
|
and minor part by (query string length)*(candidate string length)
|
37
37
|
score = score1/(11*11)*0.97 + score1/(11*38)*0.03 = 0.342944
|
38
38
|
```
|
39
|
+
|
40
|
+
# Ruby interface
|
41
|
+
Install:
|
42
|
+
```
|
43
|
+
gem install StrIdx
|
44
|
+
```
|
45
|
+
|
46
|
+
Usage example (see test.rb):
|
47
|
+
```ruby
|
48
|
+
require "stridx"
|
49
|
+
idx = StrIdx::StringIndex.new
|
50
|
+
|
51
|
+
t = Time.new
|
52
|
+
fn = File.expand_path("flist.txt")
|
53
|
+
lines = IO.read(fn).lines.collect { |x| x.strip }
|
54
|
+
i = 0
|
55
|
+
for x in lines
|
56
|
+
idx.add(x, i)
|
57
|
+
i += 1
|
58
|
+
end
|
59
|
+
|
60
|
+
idx_time = Time.new
|
61
|
+
puts "\nIndexing time (#{lines.size} files}): #{(idx_time - t).round(4)} seconds"
|
62
|
+
|
63
|
+
query = "rngnomadriv"
|
64
|
+
res = idx.find(query)
|
65
|
+
puts "query: #{query}"
|
66
|
+
puts "\nResults:"
|
67
|
+
puts "Filename, score"
|
68
|
+
puts "==============="
|
69
|
+
for id, score in res
|
70
|
+
fn = lines[id]
|
71
|
+
puts "#{fn}, #{score.round(4)}"
|
72
|
+
end
|
73
|
+
|
74
|
+
query_time = Time.new
|
75
|
+
|
76
|
+
puts "\nSearch time: #{(query_time - idx_time).round(4)} seconds"
|
77
|
+
|
78
|
+
```
|
79
|
+
|
80
|
+
Output:
|
81
|
+
```
|
82
|
+
Indexing time (89828 files}): 2.813722207
|
83
|
+
query: rngnomadriv
|
84
|
+
|
85
|
+
Results:
|
86
|
+
Filename, score
|
87
|
+
===============
|
88
|
+
./drivers/char/hw_random/nomadik-rng.c, 0.3429
|
89
|
+
./drivers/pinctrl/nomadik, 0.2714
|
90
|
+
./drivers/clk/clk-nomadik.c, 0.2711
|
91
|
+
./drivers/gpio/gpio-nomadik.c, 0.2709
|
92
|
+
./drivers/i2c/busses/i2c-nomadik.c, 0.2704
|
93
|
+
./drivers/clocksource/nomadik-mtu.c, 0.2704
|
94
|
+
./drivers/gpu/drm/pl111/pl111_nomadik.h, 0.2701
|
95
|
+
./drivers/gpu/drm/pl111/pl111_nomadik.c, 0.2701
|
96
|
+
./drivers/pinctrl/nomadik/pinctrl-nomadik.c, 0.2699
|
97
|
+
./drivers/input/keyboard/nomadik-ske-keypad.c, 0.2698
|
98
|
+
./drivers/pinctrl/nomadik/pinctrl-nomadik-db8500.c, 0.2696
|
99
|
+
./drivers/pinctrl/nomadik/pinctrl-nomadik-stn8815.c, 0.2695
|
100
|
+
./drivers/char/hw_random/omap-rng.c, 0.2364
|
101
|
+
./drivers/char/hw_random/omap3-rom-rng.c, 0.2361
|
102
|
+
./include/dt-bindings/pinctrl/nomadik.h, 0.2248
|
103
|
+
|
104
|
+
Search time: 0.0488 seconds
|
105
|
+
```
|
106
|
+
|
107
|
+
|
39
108
|
# C++ API
|
40
109
|
See demo.cpp
|
41
110
|
```cpp
|
data/rubyext/ruby_interf.cpp
CHANGED
@@ -41,7 +41,7 @@ VALUE StringIndexAddSegments(VALUE self, VALUE str, VALUE fileId) {
|
|
41
41
|
return self;
|
42
42
|
}
|
43
43
|
|
44
|
-
VALUE StringIndexFind(VALUE self, VALUE str
|
44
|
+
VALUE StringIndexFind(VALUE self, VALUE str) {
|
45
45
|
VALUE ret;
|
46
46
|
std::string s1 = StringValueCStr(str);
|
47
47
|
|
@@ -50,7 +50,7 @@ VALUE StringIndexFind(VALUE self, VALUE str, VALUE minChars) {
|
|
50
50
|
StringIndex *idx = (StringIndex *)data;
|
51
51
|
|
52
52
|
ret = rb_ary_new();
|
53
|
-
const std::vector<std::pair<float, int>> &results = idx->findSimilar(s1,
|
53
|
+
const std::vector<std::pair<float, int>> &results = idx->findSimilar(s1, 2);
|
54
54
|
int limit = 15;
|
55
55
|
int i = 0;
|
56
56
|
for (const auto &res : results) {
|
@@ -68,12 +68,12 @@ VALUE StringIndexFind(VALUE self, VALUE str, VALUE minChars) {
|
|
68
68
|
|
69
69
|
void Init_stridx(void) {
|
70
70
|
|
71
|
-
VALUE
|
71
|
+
VALUE mStrIdx = rb_define_module("StrIdx");
|
72
|
+
VALUE classStringIndex = rb_define_class_under(mStrIdx, "StringIndex", rb_cObject);
|
72
73
|
|
73
|
-
rb_define_alloc_func(
|
74
|
-
rb_define_method(
|
75
|
-
rb_define_method(
|
74
|
+
rb_define_alloc_func(classStringIndex, str_idx_alloc);
|
75
|
+
rb_define_method(classStringIndex, "add", StringIndexAddSegments, 2);
|
76
|
+
rb_define_method(classStringIndex, "find", StringIndexFind, 1);
|
76
77
|
}
|
77
78
|
|
78
79
|
} // End extern "C"
|
79
|
-
|
data/{rubyext/test.rb → test.rb}
RENAMED
@@ -1,34 +1,31 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
$:.unshift File.dirname(__FILE__)
|
4
3
|
require "stridx"
|
5
|
-
idx =
|
4
|
+
idx = StrIdx::StringIndex.new
|
6
5
|
|
7
6
|
t = Time.new
|
8
|
-
fn = File.expand_path("
|
7
|
+
fn = File.expand_path("flist.txt")
|
9
8
|
lines = IO.read(fn).lines.collect { |x| x.strip }
|
10
|
-
i =
|
9
|
+
i = 0
|
11
10
|
for x in lines
|
12
11
|
idx.add(x, i)
|
13
12
|
i += 1
|
14
13
|
end
|
15
14
|
|
16
15
|
idx_time = Time.new
|
17
|
-
puts "\nIndexing time: #{idx_time - t}"
|
18
|
-
|
19
|
-
|
16
|
+
puts "\nIndexing time (#{lines.size} files}): #{(idx_time - t).round(4)} seconds"
|
17
|
+
|
18
|
+
query = "rngnomadriv"
|
19
|
+
res = idx.find(query)
|
20
20
|
puts "query: #{query}"
|
21
21
|
puts "\nResults:"
|
22
22
|
puts "Filename, score"
|
23
23
|
puts "==============="
|
24
|
-
for
|
25
|
-
fn = lines[
|
26
|
-
score = x[1]
|
24
|
+
for id, score in res
|
25
|
+
fn = lines[id]
|
27
26
|
puts "#{fn}, #{score.round(4)}"
|
28
|
-
# pp [lines[x[0] - 1], x[1]]
|
29
27
|
end
|
30
28
|
|
31
|
-
|
32
29
|
query_time = Time.new
|
33
30
|
|
34
|
-
puts "\nSearch time: #{query_time - idx_time}"
|
31
|
+
puts "\nSearch time: #{(query_time - idx_time).round(4)} seconds"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: StrIdx
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sami Sieranoja
|
@@ -52,8 +52,8 @@ files:
|
|
52
52
|
- demo.cpp
|
53
53
|
- rubyext/extconf.rb
|
54
54
|
- rubyext/ruby_interf.cpp
|
55
|
-
- rubyext/test.rb
|
56
55
|
- stridx.hpp
|
56
|
+
- test.rb
|
57
57
|
- unordered_dense.h
|
58
58
|
homepage: https://github.com/SamiSieranoja/stridx
|
59
59
|
licenses:
|