StrIdx 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +71 -2
- data/rubyext/ruby_interf.cpp +7 -7
- data/{rubyext/test.rb → test.rb} +10 -13
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c3e062e9a4d99367ff312ab222e861362e05381a034f513d1581cc2464df53df
|
4
|
+
data.tar.gz: 6088bf73c44fa3487757f2a0d3a60fcc6b00ee0514216a76278f027d83cc7807
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ece52c12cef6f1460e995e296fbaed2911e45a15d46cbe1fd1b0ff21f8ae546198078a83951cb9430bf82ff81d866a22e0244c7472f566efa2376285b8e805b2
|
7
|
+
data.tar.gz: 8b6751d914e41ffbe9c69274054ae398679cabdd159f0ab1c45ca5dc899a9a34c626f782ca4518c68b7d599efe71ea61489e277061b42284bc22345d793b116e
|
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
#
|
1
|
+
# StrIdx
|
2
2
|
This library provides fast fuzzy string similarity search and indexing. It has been mainly developed for indexing filepaths, but can be used for other types of strings as well. It can easily handle fuzzy searches for more than 100,000 filepaths.
|
3
3
|
|
4
4
|
The fuzziness means that candidate filepaths do not need to include an exact match of the query string. They are considered a good match if they include parts of the query string, and even if those parts are in the wrong order.
|
@@ -26,7 +26,7 @@ Sum up the scores for each character c and divide by (string length)^2
|
|
26
26
|
For example, if query = "rngnomadriv"
|
27
27
|
and candidate is "./drivers/char/hw_random/nomadik-rng.c", then scores are calculated as follows:
|
28
28
|
```
|
29
|
-
rngnomadriv
|
29
|
+
rngnomadriv (substrings rng=3, nomad=5 and driv=4)
|
30
30
|
33355555444 (subscores)
|
31
31
|
FFFFFFFFDDD (F=file component, D=dir component)
|
32
32
|
score1=(3+3+3+5+5+5+5+5+(4+4+4)*0.7)
|
@@ -36,6 +36,75 @@ and candidate is "./drivers/char/hw_random/nomadik-rng.c", then scores are calcu
|
|
36
36
|
and minor part by (query string length)*(candidate string length)
|
37
37
|
score = score1/(11*11)*0.97 + score1/(11*38)*0.03 = 0.342944
|
38
38
|
```
|
39
|
+
|
40
|
+
# Ruby interface
|
41
|
+
Install:
|
42
|
+
```
|
43
|
+
gem install StrIdx
|
44
|
+
```
|
45
|
+
|
46
|
+
Usage example (see test.rb):
|
47
|
+
```ruby
|
48
|
+
require "stridx"
|
49
|
+
idx = StrIdx::StringIndex.new
|
50
|
+
|
51
|
+
t = Time.new
|
52
|
+
fn = File.expand_path("flist.txt")
|
53
|
+
lines = IO.read(fn).lines.collect { |x| x.strip }
|
54
|
+
i = 0
|
55
|
+
for x in lines
|
56
|
+
idx.add(x, i)
|
57
|
+
i += 1
|
58
|
+
end
|
59
|
+
|
60
|
+
idx_time = Time.new
|
61
|
+
puts "\nIndexing time (#{lines.size} files}): #{(idx_time - t).round(4)} seconds"
|
62
|
+
|
63
|
+
query = "rngnomadriv"
|
64
|
+
res = idx.find(query)
|
65
|
+
puts "query: #{query}"
|
66
|
+
puts "\nResults:"
|
67
|
+
puts "Filename, score"
|
68
|
+
puts "==============="
|
69
|
+
for id, score in res
|
70
|
+
fn = lines[id]
|
71
|
+
puts "#{fn}, #{score.round(4)}"
|
72
|
+
end
|
73
|
+
|
74
|
+
query_time = Time.new
|
75
|
+
|
76
|
+
puts "\nSearch time: #{(query_time - idx_time).round(4)} seconds"
|
77
|
+
|
78
|
+
```
|
79
|
+
|
80
|
+
Output:
|
81
|
+
```
|
82
|
+
Indexing time (89828 files}): 2.813722207
|
83
|
+
query: rngnomadriv
|
84
|
+
|
85
|
+
Results:
|
86
|
+
Filename, score
|
87
|
+
===============
|
88
|
+
./drivers/char/hw_random/nomadik-rng.c, 0.3429
|
89
|
+
./drivers/pinctrl/nomadik, 0.2714
|
90
|
+
./drivers/clk/clk-nomadik.c, 0.2711
|
91
|
+
./drivers/gpio/gpio-nomadik.c, 0.2709
|
92
|
+
./drivers/i2c/busses/i2c-nomadik.c, 0.2704
|
93
|
+
./drivers/clocksource/nomadik-mtu.c, 0.2704
|
94
|
+
./drivers/gpu/drm/pl111/pl111_nomadik.h, 0.2701
|
95
|
+
./drivers/gpu/drm/pl111/pl111_nomadik.c, 0.2701
|
96
|
+
./drivers/pinctrl/nomadik/pinctrl-nomadik.c, 0.2699
|
97
|
+
./drivers/input/keyboard/nomadik-ske-keypad.c, 0.2698
|
98
|
+
./drivers/pinctrl/nomadik/pinctrl-nomadik-db8500.c, 0.2696
|
99
|
+
./drivers/pinctrl/nomadik/pinctrl-nomadik-stn8815.c, 0.2695
|
100
|
+
./drivers/char/hw_random/omap-rng.c, 0.2364
|
101
|
+
./drivers/char/hw_random/omap3-rom-rng.c, 0.2361
|
102
|
+
./include/dt-bindings/pinctrl/nomadik.h, 0.2248
|
103
|
+
|
104
|
+
Search time: 0.0488 seconds
|
105
|
+
```
|
106
|
+
|
107
|
+
|
39
108
|
# C++ API
|
40
109
|
See demo.cpp
|
41
110
|
```cpp
|
data/rubyext/ruby_interf.cpp
CHANGED
@@ -41,7 +41,7 @@ VALUE StringIndexAddSegments(VALUE self, VALUE str, VALUE fileId) {
|
|
41
41
|
return self;
|
42
42
|
}
|
43
43
|
|
44
|
-
VALUE StringIndexFind(VALUE self, VALUE str
|
44
|
+
VALUE StringIndexFind(VALUE self, VALUE str) {
|
45
45
|
VALUE ret;
|
46
46
|
std::string s1 = StringValueCStr(str);
|
47
47
|
|
@@ -50,7 +50,7 @@ VALUE StringIndexFind(VALUE self, VALUE str, VALUE minChars) {
|
|
50
50
|
StringIndex *idx = (StringIndex *)data;
|
51
51
|
|
52
52
|
ret = rb_ary_new();
|
53
|
-
const std::vector<std::pair<float, int>> &results = idx->findSimilar(s1,
|
53
|
+
const std::vector<std::pair<float, int>> &results = idx->findSimilar(s1, 2);
|
54
54
|
int limit = 15;
|
55
55
|
int i = 0;
|
56
56
|
for (const auto &res : results) {
|
@@ -68,12 +68,12 @@ VALUE StringIndexFind(VALUE self, VALUE str, VALUE minChars) {
|
|
68
68
|
|
69
69
|
void Init_stridx(void) {
|
70
70
|
|
71
|
-
VALUE
|
71
|
+
VALUE mStrIdx = rb_define_module("StrIdx");
|
72
|
+
VALUE classStringIndex = rb_define_class_under(mStrIdx, "StringIndex", rb_cObject);
|
72
73
|
|
73
|
-
rb_define_alloc_func(
|
74
|
-
rb_define_method(
|
75
|
-
rb_define_method(
|
74
|
+
rb_define_alloc_func(classStringIndex, str_idx_alloc);
|
75
|
+
rb_define_method(classStringIndex, "add", StringIndexAddSegments, 2);
|
76
|
+
rb_define_method(classStringIndex, "find", StringIndexFind, 1);
|
76
77
|
}
|
77
78
|
|
78
79
|
} // End extern "C"
|
79
|
-
|
data/{rubyext/test.rb → test.rb}
RENAMED
@@ -1,34 +1,31 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
$:.unshift File.dirname(__FILE__)
|
4
3
|
require "stridx"
|
5
|
-
idx =
|
4
|
+
idx = StrIdx::StringIndex.new
|
6
5
|
|
7
6
|
t = Time.new
|
8
|
-
fn = File.expand_path("
|
7
|
+
fn = File.expand_path("flist.txt")
|
9
8
|
lines = IO.read(fn).lines.collect { |x| x.strip }
|
10
|
-
i =
|
9
|
+
i = 0
|
11
10
|
for x in lines
|
12
11
|
idx.add(x, i)
|
13
12
|
i += 1
|
14
13
|
end
|
15
14
|
|
16
15
|
idx_time = Time.new
|
17
|
-
puts "\nIndexing time: #{idx_time - t}"
|
18
|
-
|
19
|
-
|
16
|
+
puts "\nIndexing time (#{lines.size} files}): #{(idx_time - t).round(4)} seconds"
|
17
|
+
|
18
|
+
query = "rngnomadriv"
|
19
|
+
res = idx.find(query)
|
20
20
|
puts "query: #{query}"
|
21
21
|
puts "\nResults:"
|
22
22
|
puts "Filename, score"
|
23
23
|
puts "==============="
|
24
|
-
for
|
25
|
-
fn = lines[
|
26
|
-
score = x[1]
|
24
|
+
for id, score in res
|
25
|
+
fn = lines[id]
|
27
26
|
puts "#{fn}, #{score.round(4)}"
|
28
|
-
# pp [lines[x[0] - 1], x[1]]
|
29
27
|
end
|
30
28
|
|
31
|
-
|
32
29
|
query_time = Time.new
|
33
30
|
|
34
|
-
puts "\nSearch time: #{query_time - idx_time}"
|
31
|
+
puts "\nSearch time: #{(query_time - idx_time).round(4)} seconds"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: StrIdx
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sami Sieranoja
|
@@ -52,8 +52,8 @@ files:
|
|
52
52
|
- demo.cpp
|
53
53
|
- rubyext/extconf.rb
|
54
54
|
- rubyext/ruby_interf.cpp
|
55
|
-
- rubyext/test.rb
|
56
55
|
- stridx.hpp
|
56
|
+
- test.rb
|
57
57
|
- unordered_dense.h
|
58
58
|
homepage: https://github.com/SamiSieranoja/stridx
|
59
59
|
licenses:
|