StrIdx 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1079ebe4265ed15d2307d260395e33471ab149bc6cd33ca6e469ff93c960dba8
4
- data.tar.gz: 0bdba2496690ad9e1794d653bb5ca047a15fcfee6d2988ff06098c69920ae269
3
+ metadata.gz: c3e062e9a4d99367ff312ab222e861362e05381a034f513d1581cc2464df53df
4
+ data.tar.gz: 6088bf73c44fa3487757f2a0d3a60fcc6b00ee0514216a76278f027d83cc7807
5
5
  SHA512:
6
- metadata.gz: 8a1b2fe9ce6f87a8585f5b9ca86092f21c600575b704fc3bf4ebaa8ae3bd0897f7e272fb580fb98c38e8b65a5b95d0fd8cc1c3baeafdb235fe1d48e51153ae6c
7
- data.tar.gz: b4a62b6251a38637905653f2aaa919653465fbc453844a2f02466ce413a8a1de0946de09973535472a70ccbc17151d426e76fe92b3338ccc0af9f83f6821b201
6
+ metadata.gz: ece52c12cef6f1460e995e296fbaed2911e45a15d46cbe1fd1b0ff21f8ae546198078a83951cb9430bf82ff81d866a22e0244c7472f566efa2376285b8e805b2
7
+ data.tar.gz: 8b6751d914e41ffbe9c69274054ae398679cabdd159f0ab1c45ca5dc899a9a34c626f782ca4518c68b7d599efe71ea61489e277061b42284bc22345d793b116e
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- # stridx
1
+ # StrIdx
2
2
  This library provides fast fuzzy string similarity search and indexing. It has been mainly developed for indexing filepaths, but can be used for other types of strings as well. It can easily handle fuzzy searches for more than 100,000 filepaths.
3
3
 
4
4
  The fuzziness means that candidate filepaths do not need to include an exact match of the query string. They are considered a good match if they include parts of the query string, even if those parts are in the wrong order.
@@ -26,7 +26,7 @@ Sum up the scores for each character c and divide by (string length)^2
26
26
  For example, if query = "rngnomadriv"
27
27
  and candidate is "./drivers/char/hw_random/nomadik-rng.c", then scores are calculated as follows:
28
28
  ```
29
- rngnomadriv
29
+ rngnomadriv (substrings rng=3, nomad=5 and driv=4)
30
30
  33355555444 (subscores)
31
31
  FFFFFFFFDDD (F=file component, D=dir component)
32
32
  score1=(3+3+3+5+5+5+5+5+(4+4+4)*0.7)
@@ -36,6 +36,75 @@ and candidate is "./drivers/char/hw_random/nomadik-rng.c", then scores are calcu
36
36
  and minor part by (query string length)*(candidate string length)
37
37
  score = score1/(11*11)*0.97 + score1/(11*38)*0.03 = 0.342944
38
38
  ```
39
+
40
+ # Ruby interface
41
+ Install:
42
+ ```
43
+ gem install StrIdx
44
+ ```
45
+
46
+ Usage example (see test.rb):
47
+ ```ruby
48
+ require "stridx"
49
+ idx = StrIdx::StringIndex.new
50
+
51
+ t = Time.new
52
+ fn = File.expand_path("flist.txt")
53
+ lines = IO.read(fn).lines.collect { |x| x.strip }
54
+ i = 0
55
+ for x in lines
56
+ idx.add(x, i)
57
+ i += 1
58
+ end
59
+
60
+ idx_time = Time.new
61
+ puts "\nIndexing time (#{lines.size} files}): #{(idx_time - t).round(4)} seconds"
62
+
63
+ query = "rngnomadriv"
64
+ res = idx.find(query)
65
+ puts "query: #{query}"
66
+ puts "\nResults:"
67
+ puts "Filename, score"
68
+ puts "==============="
69
+ for id, score in res
70
+ fn = lines[id]
71
+ puts "#{fn}, #{score.round(4)}"
72
+ end
73
+
74
+ query_time = Time.new
75
+
76
+ puts "\nSearch time: #{(query_time - idx_time).round(4)} seconds"
77
+
78
+ ```
79
+
80
+ Output:
81
+ ```
82
+ Indexing time (89828 files}): 2.813722207
83
+ query: rngnomadriv
84
+
85
+ Results:
86
+ Filename, score
87
+ ===============
88
+ ./drivers/char/hw_random/nomadik-rng.c, 0.3429
89
+ ./drivers/pinctrl/nomadik, 0.2714
90
+ ./drivers/clk/clk-nomadik.c, 0.2711
91
+ ./drivers/gpio/gpio-nomadik.c, 0.2709
92
+ ./drivers/i2c/busses/i2c-nomadik.c, 0.2704
93
+ ./drivers/clocksource/nomadik-mtu.c, 0.2704
94
+ ./drivers/gpu/drm/pl111/pl111_nomadik.h, 0.2701
95
+ ./drivers/gpu/drm/pl111/pl111_nomadik.c, 0.2701
96
+ ./drivers/pinctrl/nomadik/pinctrl-nomadik.c, 0.2699
97
+ ./drivers/input/keyboard/nomadik-ske-keypad.c, 0.2698
98
+ ./drivers/pinctrl/nomadik/pinctrl-nomadik-db8500.c, 0.2696
99
+ ./drivers/pinctrl/nomadik/pinctrl-nomadik-stn8815.c, 0.2695
100
+ ./drivers/char/hw_random/omap-rng.c, 0.2364
101
+ ./drivers/char/hw_random/omap3-rom-rng.c, 0.2361
102
+ ./include/dt-bindings/pinctrl/nomadik.h, 0.2248
103
+
104
+ Search time: 0.0488 seconds
105
+ ```
106
+
107
+
39
108
  # C++ API
40
109
  See demo.cpp
41
110
  ```cpp
@@ -41,7 +41,7 @@ VALUE StringIndexAddSegments(VALUE self, VALUE str, VALUE fileId) {
41
41
  return self;
42
42
  }
43
43
 
44
- VALUE StringIndexFind(VALUE self, VALUE str, VALUE minChars) {
44
+ VALUE StringIndexFind(VALUE self, VALUE str) {
45
45
  VALUE ret;
46
46
  std::string s1 = StringValueCStr(str);
47
47
 
@@ -50,7 +50,7 @@ VALUE StringIndexFind(VALUE self, VALUE str, VALUE minChars) {
50
50
  StringIndex *idx = (StringIndex *)data;
51
51
 
52
52
  ret = rb_ary_new();
53
- const std::vector<std::pair<float, int>> &results = idx->findSimilar(s1, NUM2INT(minChars));
53
+ const std::vector<std::pair<float, int>> &results = idx->findSimilar(s1, 2);
54
54
  int limit = 15;
55
55
  int i = 0;
56
56
  for (const auto &res : results) {
@@ -68,12 +68,12 @@ VALUE StringIndexFind(VALUE self, VALUE str, VALUE minChars) {
68
68
 
69
69
  void Init_stridx(void) {
70
70
 
71
- VALUE cFoo = rb_define_class("CppStringIndex", rb_cObject);
71
+ VALUE mStrIdx = rb_define_module("StrIdx");
72
+ VALUE classStringIndex = rb_define_class_under(mStrIdx, "StringIndex", rb_cObject);
72
73
 
73
- rb_define_alloc_func(cFoo, str_idx_alloc);
74
- rb_define_method(cFoo, "add", StringIndexAddSegments, 2);
75
- rb_define_method(cFoo, "find", StringIndexFind, 2);
74
+ rb_define_alloc_func(classStringIndex, str_idx_alloc);
75
+ rb_define_method(classStringIndex, "add", StringIndexAddSegments, 2);
76
+ rb_define_method(classStringIndex, "find", StringIndexFind, 1);
76
77
  }
77
78
 
78
79
  } // End extern "C"
79
-
@@ -1,34 +1,31 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- $:.unshift File.dirname(__FILE__)
4
3
  require "stridx"
5
- idx = CppStringIndex.new
4
+ idx = StrIdx::StringIndex.new
6
5
 
7
6
  t = Time.new
8
- fn = File.expand_path("../flist.txt")
7
+ fn = File.expand_path("flist.txt")
9
8
  lines = IO.read(fn).lines.collect { |x| x.strip }
10
- i = 1
9
+ i = 0
11
10
  for x in lines
12
11
  idx.add(x, i)
13
12
  i += 1
14
13
  end
15
14
 
16
15
  idx_time = Time.new
17
- puts "\nIndexing time: #{idx_time - t}"
18
- query = "helbind.h"
19
- res = idx.find(query, 2)
16
+ puts "\nIndexing time (#{lines.size} files}): #{(idx_time - t).round(4)} seconds"
17
+
18
+ query = "rngnomadriv"
19
+ res = idx.find(query)
20
20
  puts "query: #{query}"
21
21
  puts "\nResults:"
22
22
  puts "Filename, score"
23
23
  puts "==============="
24
- for x in res
25
- fn = lines[x[0] - 1]
26
- score = x[1]
24
+ for id, score in res
25
+ fn = lines[id]
27
26
  puts "#{fn}, #{score.round(4)}"
28
- # pp [lines[x[0] - 1], x[1]]
29
27
  end
30
28
 
31
-
32
29
  query_time = Time.new
33
30
 
34
- puts "\nSearch time: #{query_time - idx_time}"
31
+ puts "\nSearch time: #{(query_time - idx_time).round(4)} seconds"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: StrIdx
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sami Sieranoja
@@ -52,8 +52,8 @@ files:
52
52
  - demo.cpp
53
53
  - rubyext/extconf.rb
54
54
  - rubyext/ruby_interf.cpp
55
- - rubyext/test.rb
56
55
  - stridx.hpp
56
+ - test.rb
57
57
  - unordered_dense.h
58
58
  homepage: https://github.com/SamiSieranoja/stridx
59
59
  licenses: