StrIdx 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/Makefile ADDED
@@ -0,0 +1,268 @@
1
+
2
+ SHELL = /bin/sh
3
+
4
+ # V=0 quiet, V=1 verbose. other values don't work.
5
+ V = 0
6
+ V0 = $(V:0=)
7
+ Q1 = $(V:1=)
8
+ Q = $(Q1:0=@)
9
+ ECHO1 = $(V:1=@ :)
10
+ ECHO = $(ECHO1:0=@ echo)
11
+ NULLCMD = :
12
+
13
+ #### Start of system configuration section. ####
14
+
15
+ srcdir = .
16
+ topdir = /home/samsam/.rbenv/versions/3.1.4/include/ruby-3.1.0
17
+ hdrdir = $(topdir)
18
+ arch_hdrdir = /home/samsam/.rbenv/versions/3.1.4/include/ruby-3.1.0/x86_64-linux
19
+ PATH_SEPARATOR = :
20
+ VPATH = $(srcdir):$(arch_hdrdir)/ruby:$(hdrdir)/ruby
21
+ prefix = $(DESTDIR)/home/samsam/.rbenv/versions/3.1.4
22
+ rubysitearchprefix = $(rubylibprefix)/$(sitearch)
23
+ rubyarchprefix = $(rubylibprefix)/$(arch)
24
+ rubylibprefix = $(libdir)/$(RUBY_BASE_NAME)
25
+ exec_prefix = $(prefix)
26
+ vendorarchhdrdir = $(vendorhdrdir)/$(sitearch)
27
+ sitearchhdrdir = $(sitehdrdir)/$(sitearch)
28
+ rubyarchhdrdir = $(rubyhdrdir)/$(arch)
29
+ vendorhdrdir = $(rubyhdrdir)/vendor_ruby
30
+ sitehdrdir = $(rubyhdrdir)/site_ruby
31
+ rubyhdrdir = $(includedir)/$(RUBY_VERSION_NAME)
32
+ vendorarchdir = $(vendorlibdir)/$(sitearch)
33
+ vendorlibdir = $(vendordir)/$(ruby_version)
34
+ vendordir = $(rubylibprefix)/vendor_ruby
35
+ sitearchdir = $(sitelibdir)/$(sitearch)
36
+ sitelibdir = $(sitedir)/$(ruby_version)
37
+ sitedir = $(rubylibprefix)/site_ruby
38
+ rubyarchdir = $(rubylibdir)/$(arch)
39
+ rubylibdir = $(rubylibprefix)/$(ruby_version)
40
+ sitearchincludedir = $(includedir)/$(sitearch)
41
+ archincludedir = $(includedir)/$(arch)
42
+ sitearchlibdir = $(libdir)/$(sitearch)
43
+ archlibdir = $(libdir)/$(arch)
44
+ ridir = $(datarootdir)/$(RI_BASE_NAME)
45
+ mandir = $(datarootdir)/man
46
+ localedir = $(datarootdir)/locale
47
+ libdir = $(exec_prefix)/lib
48
+ psdir = $(docdir)
49
+ pdfdir = $(docdir)
50
+ dvidir = $(docdir)
51
+ htmldir = $(docdir)
52
+ infodir = $(datarootdir)/info
53
+ docdir = $(datarootdir)/doc/$(PACKAGE)
54
+ oldincludedir = $(DESTDIR)/usr/include
55
+ includedir = $(prefix)/include
56
+ runstatedir = $(localstatedir)/run
57
+ localstatedir = $(prefix)/var
58
+ sharedstatedir = $(prefix)/com
59
+ sysconfdir = $(prefix)/etc
60
+ datadir = $(datarootdir)
61
+ datarootdir = $(prefix)/share
62
+ libexecdir = $(exec_prefix)/libexec
63
+ sbindir = $(exec_prefix)/sbin
64
+ bindir = $(exec_prefix)/bin
65
+ archdir = $(rubyarchdir)
66
+
67
+
68
+ CC_WRAPPER =
69
+ CC = gcc
70
+ CXX = g++
71
+ LIBRUBY = $(LIBRUBY_SO)
72
+ LIBRUBY_A = lib$(RUBY_SO_NAME)-static.a
73
+ LIBRUBYARG_SHARED = -Wl,-rpath,$(libdir) -L$(libdir) -l$(RUBY_SO_NAME)
74
+ LIBRUBYARG_STATIC = -Wl,-rpath,$(libdir) -L$(libdir) -l$(RUBY_SO_NAME)-static $(MAINLIBS)
75
+ empty =
76
+ OUTFLAG = -o $(empty)
77
+ COUTFLAG = -o $(empty)
78
+ CSRCFLAG = $(empty)
79
+
80
+ RUBY_EXTCONF_H =
81
+ cflags = $(optflags) $(debugflags) $(warnflags)
82
+ cxxflags =
83
+ optflags = -O3 -fno-fast-math
84
+ debugflags = -ggdb3
85
+ warnflags = -Wall -Wextra -Wdeprecated-declarations -Wduplicated-cond -Wimplicit-function-declaration -Wimplicit-int -Wmisleading-indentation -Wpointer-arith -Wwrite-strings -Wold-style-definition -Wimplicit-fallthrough=0 -Wmissing-noreturn -Wno-cast-function-type -Wno-constant-logical-operand -Wno-long-long -Wno-missing-field-initializers -Wno-overlength-strings -Wno-packed-bitfield-compat -Wno-parentheses-equality -Wno-self-assign -Wno-tautological-compare -Wno-unused-parameter -Wno-unused-value -Wsuggest-attribute=format -Wsuggest-attribute=noreturn -Wunused-variable -Wundef
86
+ cppflags =
87
+ CCDLFLAGS = -fPIC
88
+ CFLAGS = $(CCDLFLAGS) $(cflags) -fPIC $(ARCH_FLAG)
89
+ INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
90
+ DEFS =
91
+ CPPFLAGS = -I/home/samsam/.rbenv/versions/3.1.4/include $(DEFS) $(cppflags)
92
+ CXXFLAGS = $(CCDLFLAGS) -Wall -Wno-unused-variable -O3 -fopenmp $(ARCH_FLAG)
93
+ ldflags = -L. -L/home/samsam/.rbenv/versions/3.1.4/lib -fstack-protector-strong -rdynamic -Wl,-export-dynamic -Wl,--no-as-needed
94
+ dldflags = -L/home/samsam/.rbenv/versions/3.1.4/lib -Wl,--compress-debug-sections=zlib
95
+ ARCH_FLAG =
96
+ DLDFLAGS = $(ldflags) $(dldflags) $(ARCH_FLAG)
97
+ LDSHARED = $(CC) -shared
98
+ LDSHAREDXX = $(CXX) -shared
99
+ AR = gcc-ar
100
+ EXEEXT =
101
+
102
+ RUBY_INSTALL_NAME = $(RUBY_BASE_NAME)
103
+ RUBY_SO_NAME = ruby
104
+ RUBYW_INSTALL_NAME =
105
+ RUBY_VERSION_NAME = $(RUBY_BASE_NAME)-$(ruby_version)
106
+ RUBYW_BASE_NAME = rubyw
107
+ RUBY_BASE_NAME = ruby
108
+
109
+ arch = x86_64-linux
110
+ sitearch = $(arch)
111
+ ruby_version = 3.1.0
112
+ ruby = $(bindir)/$(RUBY_BASE_NAME)
113
+ RUBY = $(ruby)
114
+ BUILTRUBY = $(bindir)/$(RUBY_BASE_NAME)
115
+ ruby_headers = $(hdrdir)/ruby.h $(hdrdir)/ruby/backward.h $(hdrdir)/ruby/ruby.h $(hdrdir)/ruby/defines.h $(hdrdir)/ruby/missing.h $(hdrdir)/ruby/intern.h $(hdrdir)/ruby/st.h $(hdrdir)/ruby/subst.h $(arch_hdrdir)/ruby/config.h
116
+
117
+ RM = rm -f
118
+ RM_RF = rm -fr
119
+ RMDIRS = rmdir --ignore-fail-on-non-empty -p
120
+ MAKEDIRS = /usr/bin/mkdir -p
121
+ INSTALL = /usr/bin/install -c
122
+ INSTALL_PROG = $(INSTALL) -m 0755
123
+ INSTALL_DATA = $(INSTALL) -m 644
124
+ COPY = cp
125
+ TOUCH = exit >
126
+
127
+ #### End of system configuration section. ####
128
+
129
+ preload =
130
+ libpath = . $(libdir)
131
+ LIBPATH = -L. -L$(libdir) -Wl,-rpath,$(libdir)
132
+ DEFFILE =
133
+
134
+ CLEANFILES = mkmf.log
135
+ DISTCLEANFILES =
136
+ DISTCLEANDIRS =
137
+
138
+ extout =
139
+ extout_prefix =
140
+ target_prefix =
141
+ LOCAL_LIBS =
142
+ LIBS = $(LIBRUBYARG_SHARED) -lgomp -lstdc++ -lm -lc
143
+ ORIG_SRCS = demo.cpp ruby_interf.cpp
144
+ SRCS = $(ORIG_SRCS)
145
+ OBJS = demo.o ruby_interf.o
146
+ HDRS = $(srcdir)/unordered_dense.h $(srcdir)/stridx.hpp
147
+ LOCAL_HDRS =
148
+ TARGET = stridx
149
+ TARGET_NAME = stridx
150
+ TARGET_ENTRY = Init_$(TARGET_NAME)
151
+ DLLIB = $(TARGET).so
152
+ EXTSTATIC =
153
+ STATIC_LIB =
154
+
155
+ TIMESTAMP_DIR = .
156
+ BINDIR = $(bindir)
157
+ RUBYCOMMONDIR = $(sitedir)$(target_prefix)
158
+ RUBYLIBDIR = $(sitelibdir)$(target_prefix)
159
+ RUBYARCHDIR = $(sitearchdir)$(target_prefix)
160
+ HDRDIR = $(sitehdrdir)$(target_prefix)
161
+ ARCHHDRDIR = $(sitearchhdrdir)$(target_prefix)
162
+ TARGET_SO_DIR =
163
+ TARGET_SO = $(TARGET_SO_DIR)$(DLLIB)
164
+ CLEANLIBS = $(TARGET_SO) false
165
+ CLEANOBJS = *.o *.bak
166
+
167
+ all: $(DLLIB)
168
+ static: $(STATIC_LIB)
169
+ .PHONY: all install static install-so install-rb
170
+ .PHONY: clean clean-so clean-static clean-rb
171
+
172
+ clean-static::
173
+ clean-rb-default::
174
+ clean-rb::
175
+ clean-so::
176
+ clean: clean-so clean-static clean-rb-default clean-rb
177
+ -$(Q)$(RM_RF) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES) .*.time
178
+
179
+ distclean-rb-default::
180
+ distclean-rb::
181
+ distclean-so::
182
+ distclean-static::
183
+ distclean: clean distclean-so distclean-static distclean-rb-default distclean-rb
184
+ -$(Q)$(RM) Makefile $(RUBY_EXTCONF_H) conftest.* mkmf.log
185
+ -$(Q)$(RM) core ruby$(EXEEXT) *~ $(DISTCLEANFILES)
186
+ -$(Q)$(RMDIRS) $(DISTCLEANDIRS) 2> /dev/null || true
187
+
188
+ realclean: distclean
189
+ install: install-so install-rb
190
+
191
+ install-so: $(DLLIB) $(TIMESTAMP_DIR)/.sitearchdir.time
192
+ $(INSTALL_PROG) $(DLLIB) $(RUBYARCHDIR)
193
+ clean-static::
194
+ -$(Q)$(RM) $(STATIC_LIB)
195
+ install-rb: pre-install-rb do-install-rb install-rb-default
196
+ install-rb-default: pre-install-rb-default do-install-rb-default
197
+ pre-install-rb: Makefile
198
+ pre-install-rb-default: Makefile
199
+ do-install-rb:
200
+ do-install-rb-default:
201
+ pre-install-rb-default:
202
+ @$(NULLCMD)
203
+ $(TIMESTAMP_DIR)/.sitearchdir.time:
204
+ $(Q) $(MAKEDIRS) $(@D) $(RUBYARCHDIR)
205
+ $(Q) $(TOUCH) $@
206
+
207
+ site-install: site-install-so site-install-rb
208
+ site-install-so: install-so
209
+ site-install-rb: install-rb
210
+
211
+ .SUFFIXES: .c .m .cc .mm .cxx .cpp .o .S
212
+
213
+ .cc.o:
214
+ $(ECHO) compiling $(<)
215
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
216
+
217
+ .cc.S:
218
+ $(ECHO) translating $(<)
219
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
220
+
221
+ .mm.o:
222
+ $(ECHO) compiling $(<)
223
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
224
+
225
+ .mm.S:
226
+ $(ECHO) translating $(<)
227
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
228
+
229
+ .cxx.o:
230
+ $(ECHO) compiling $(<)
231
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
232
+
233
+ .cxx.S:
234
+ $(ECHO) translating $(<)
235
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
236
+
237
+ .cpp.o:
238
+ $(ECHO) compiling $(<)
239
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
240
+
241
+ .cpp.S:
242
+ $(ECHO) translating $(<)
243
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
244
+
245
+ .c.o:
246
+ $(ECHO) compiling $(<)
247
+ $(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
248
+
249
+ .c.S:
250
+ $(ECHO) translating $(<)
251
+ $(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
252
+
253
+ .m.o:
254
+ $(ECHO) compiling $(<)
255
+ $(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
256
+
257
+ .m.S:
258
+ $(ECHO) translating $(<)
259
+ $(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
260
+
261
+ $(TARGET_SO): $(OBJS) Makefile
262
+ $(ECHO) linking shared-object $(DLLIB)
263
+ -$(Q)$(RM) $(@)
264
+ $(Q) $(LDSHAREDXX) -o $@ $(OBJS) $(LIBPATH) $(DLDFLAGS) $(LOCAL_LIBS) $(LIBS)
265
+
266
+
267
+
268
+ $(OBJS): $(HDRS) $(ruby_headers)
data/README.md ADDED
@@ -0,0 +1,92 @@
1
+ # stridx
2
+ This library provides fast fuzzy string similarity search and indexing. It has been mainly developed for indexing filepaths, but can be used for other types of strings as well. It can easily handle fuzzy searches for more than 100,000 filepaths.
3
+
4
+ The fuzziness means that candidate filepaths do not need to include an exact match of the query string. They are considered a good match if they include parts of the query string, even if those parts are in the wrong order.
5
+
6
+ The library can also be applied to UTF-8 data, although there is a small bias in scoring for multibyte characters.
7
+
8
+
9
+ ## String similarity calculation
10
+
11
+ Once the index has been created, the contents can be searched to find the best matching strings.
12
+
13
+ To be considered a candidate path, the file component of the path (e.g. file.txt)
14
+ is required to have at least a substring of two characters in common with the
15
+ query string. If that condition is true, then the directories will also add to the
16
+ score, although with a smaller weight.
17
+
18
+ The scores that measure how good a candidate is, are calculated as follows (somewhat simplified).
19
+ For each single character substring c in the query string:
20
+
21
+ - find the largest substring in the query which includes the substring c and is also included in the candidate path
22
+ - take the length of that substring as score
23
+
24
+ Sum up the scores for each character c and divide by (string length)^2
25
+
26
+ For example, if query = "rngnomadriv"
27
+ and candidate is "./drivers/char/hw_random/nomadik-rng.c", then scores are calculated as follows:
28
+ ```
29
+ rngnomadriv
30
+ 33355555444 (subscores)
31
+ FFFFFFFFDDD (F=file component, D=dir component)
32
+ score1=(3+3+3+5+5+5+5+5+(4+4+4)*0.7)
33
+
34
+ In final score, we give a small penalty for larger candidate filenames:
35
+ Divide main part of score with (query string length)^2
36
+ and minor part by (query string length)*(candidate string length)
37
+ score = score1/(11*11)*0.97 + score1/(11*38)*0.03 = 0.342944
38
+ ```
39
+ # C++ API
40
+ See demo.cpp
41
+ ```cpp
42
+ #include "stridx.hpp"
43
+
44
+ int main() {
45
+ StringIndex idx;
46
+ ...
47
+
48
+ // Add the file paths of 89828 files in linux-6.9-rc6 to the index
49
+ std::string fn_filePaths = "flist.txt";
50
+ std::vector<std::string> v_filePaths = readLinesFromFile(fn_filePaths);
51
+ int id = 0;
52
+ for (const auto &filePath : v_filePaths) {
53
+ idx.addStrToIndex(filePath, id, '/' /*dir separator*/);
54
+ id++;
55
+ }
56
+ ...
57
+
58
+ // Find matching filepaths from the index for the query string "rngnomadriv"
59
+ std::string query = "rngnomadriv";
60
+ const vector<pair<float, int>> &results = idx.findSimilar(query, 2);
61
+ ...
62
+ int i = 0;
63
+ std::cout << "query string: " << query << "\n";
64
+ std::cout << "Top 20 matches:\n";
65
+ for (const auto &res : results) {
66
+ std::cout << res.second << " " << res.first << " " << v_filePaths[res.second] << "\n";
67
+ i++;
68
+ if (i > 20) {
69
+ break;
70
+ }
71
+ }
72
+ }
73
+
74
+
75
+ ```
76
+
77
+ Output:
78
+ ```
79
+ Indexing creation time for 89828 file paths (seconds): 2.89097
80
+ Search time (seconds): 0.0346287
81
+ query string: rngnomadriv
82
+ Top 20 matches:
83
+ 56383 0.342944 ./drivers/char/hw_random/nomadik-rng.c
84
+ 65420 0.271396 ./drivers/pinctrl/nomadik
85
+ 58689 0.271126 ./drivers/clk/clk-nomadik.c
86
+ 55819 0.270893 ./drivers/gpio/gpio-nomadik.c
87
+ 47837 0.270431 ./drivers/i2c/busses/i2c-nomadik.c
88
+ 59594 0.270355 ./drivers/clocksource/nomadik-mtu.c
89
+ 51950 0.270088 ./drivers/gpu/drm/pl111/pl111_nomadik.c
90
+ ...
91
+
92
+ ```
data/demo.cpp ADDED
@@ -0,0 +1,76 @@
1
+ #include "stridx.hpp"
2
+
3
+ #include <iostream>
4
+ #include <fstream>
5
+ #include <vector>
6
+ #include <string>
7
+ #include <chrono>
8
+
9
+ using std::cout;
10
+ using std::pair;
11
+ using std::vector;
12
+
13
// Reads the text file `filename` and returns its lines in order, without the
// trailing newline characters.
//
// If the file cannot be opened, an error message is written to std::cerr and
// an empty vector is returned.
std::vector<std::string> readLinesFromFile(const std::string &filename) {
  std::vector<std::string> result;
  std::ifstream input(filename);

  if (input.is_open()) {
    // std::getline yields false once the stream is exhausted.
    for (std::string current; std::getline(input, current);) {
      result.push_back(current);
    }
    input.close();
  } else {
    std::cerr << "Error opening file: " << filename << std::endl;
  }

  return result;
}
29
+
30
+ int main() {
31
+ StringIndex idx;
32
+ // idx.addStrToIndex("./gdk/x11/gdkasync.c", 0 /*id*/, '/' /*separator*/);
33
+ // idx.addStrToIndex("./gdk/x11/gdksettings.c", 1, '/');
34
+ // idx.addStrToIndex("./gdk/x11/gdkx11devicemanager-xi2.h", 2, '/');
35
+
36
+ // Add the file paths of 89828 files in linux-6.9-rc6 to the index
37
+ std::string fn_filePaths = "flist.txt";
38
+ std::vector<std::string> v_filePaths = readLinesFromFile(fn_filePaths);
39
+
40
+ auto start = std::chrono::high_resolution_clock::now();
41
+ int id = 0;
42
+ for (const auto &filePath : v_filePaths) {
43
+ idx.addStrToIndex(filePath, id, '/' /*dir separator*/);
44
+ // idx.addStrToIndex(filePath, id, '\0' /*dir separator*/);
45
+ id++;
46
+ }
47
+
48
+ auto idx_time = std::chrono::high_resolution_clock::now();
49
+ std::chrono::duration<double, std::milli> duration = idx_time - start;
50
+ cout << "Indexing creation time for " << v_filePaths.size() << " file paths (seconds): " << duration.count() / 1000 << "\n";
51
+
52
+ // Find matching filepaths from the index for the query string "rngnomadriv"
53
+ start = std::chrono::high_resolution_clock::now();
54
+ std::string query = "rngnomadriv";
55
+ const vector<pair<float, int>> &results = idx.findSimilar(query, 2);
56
+ auto search_time = std::chrono::high_resolution_clock::now();
57
+ duration = search_time - start;
58
+ cout << "Search time (seconds): " << duration.count() / 1000
59
+ << "\n";
60
+
61
+ int i = 0;
62
+ std::cout << "query string: " << query << "\n";
63
+ std::cout << "Top 20 matches:\n";
64
+ for (const auto &res : results) {
65
+ std::cout << res.second << " " << res.first << " " << v_filePaths[res.second] << "\n";
66
+ i++;
67
+ if (i > 20) {
68
+ break;
69
+ }
70
+ }
71
+
72
+ return 0;
73
+ }
74
+
75
+ // Compile:
76
+ // g++ -Wall -Wno-unused-variable -O3 -fopenmp -lstdc++ demo.cpp -o demo
@@ -0,0 +1,16 @@
1
#!/usr/bin/env ruby
#
# mkmf build script: generates the Makefile for the "stridx" extension.

require 'mkmf'

extension_name = 'stridx'

# Extra C++ compiler flags appended to whatever mkmf already configured.
$CXXFLAGS << " -Wall -Wno-unused-variable -O3 -fopenmp"

# Probe for the libraries the extension links against. The results are not
# checked here, so a missing library does not stop Makefile generation.
%w[stdc++ gomp].each { |lib| have_library(lib) }

dir_config(extension_name)      # The destination
create_makefile(extension_name) # Create Makefile
16
+
@@ -0,0 +1,79 @@
1
+
2
+ #include <ruby.h>
3
+ #include "ruby/ruby.h"
4
+ #include "ruby/thread.h"
5
+
6
+ #include "../stridx.hpp"
7
+
8
+ extern "C" {
9
+
10
+ void str_idx_free(void *data) { delete (StringIndex *)data; }
11
+
12
+ // Wrap StringIndex class inside a ruby variable
13
+ static const rb_data_type_t str_idx_type = {
14
+ // .wrap_struct_name: "doesn’t really matter what it is as long as it’s sensible and unique"
15
+ .wrap_struct_name = "StringIndexW9q4We",
16
+
17
+ // Used by Carbage Collector:
18
+ .function =
19
+ {
20
+ .dmark = NULL,
21
+ .dfree = str_idx_free,
22
+ .dsize = NULL, // TODO
23
+ },
24
+ .data = NULL,
25
+ .flags = RUBY_TYPED_FREE_IMMEDIATELY,
26
+ };
27
+
28
+ VALUE str_idx_alloc(VALUE self) {
29
+ void *data = new StringIndex();
30
+ return TypedData_Wrap_Struct(self, &str_idx_type, data);
31
+ }
32
+
33
+ VALUE StringIndexAddSegments(VALUE self, VALUE str, VALUE fileId) {
34
+ std::string s1 = StringValueCStr(str);
35
+ int fid = NUM2INT(fileId);
36
+
37
+ void *data;
38
+ TypedData_Get_Struct(self, int, &str_idx_type, data);
39
+ ((StringIndex *)data)->addStrToIndex(s1, fid, '/');
40
+
41
+ return self;
42
+ }
43
+
44
+ VALUE StringIndexFind(VALUE self, VALUE str, VALUE minChars) {
45
+ VALUE ret;
46
+ std::string s1 = StringValueCStr(str);
47
+
48
+ void *data;
49
+ TypedData_Get_Struct(self, int, &str_idx_type, data);
50
+ StringIndex *idx = (StringIndex *)data;
51
+
52
+ ret = rb_ary_new();
53
+ const std::vector<std::pair<float, int>> &results = idx->findSimilar(s1, NUM2INT(minChars));
54
+ int limit = 15;
55
+ int i = 0;
56
+ for (const auto &res : results) {
57
+ VALUE arr = rb_ary_new();
58
+ rb_ary_push(arr, INT2NUM(res.second));
59
+ rb_ary_push(arr, DBL2NUM(res.first));
60
+ rb_ary_push(ret, arr);
61
+ i++;
62
+ if (i >= limit) {
63
+ break;
64
+ }
65
+ }
66
+ return ret;
67
+ }
68
+
69
+ void Init_stridx(void) {
70
+
71
+ VALUE cFoo = rb_define_class("CppStringIndex", rb_cObject);
72
+
73
+ rb_define_alloc_func(cFoo, str_idx_alloc);
74
+ rb_define_method(cFoo, "add", StringIndexAddSegments, 2);
75
+ rb_define_method(cFoo, "find", StringIndexFind, 2);
76
+ }
77
+
78
+ } // End extern "C"
79
+
data/rubyext/test.rb ADDED
@@ -0,0 +1,34 @@
1
#!/usr/bin/env ruby

# Smoke test and micro-benchmark for the stridx extension: indexes every line
# of ../flist.txt (resolved against the current working directory), runs one
# query and prints the matches together with the indexing and search times.

$:.unshift File.dirname(__FILE__)
require "stridx"
idx = CppStringIndex.new

t = Time.new
fn = File.expand_path("../flist.txt")
lines = IO.read(fn).lines.collect { |x| x.strip }

# Ids are 1-based here, so result ids are mapped back with `id - 1` below.
lines.each_with_index { |path, pos| idx.add(path, pos + 1) }

idx_time = Time.new
puts "\nIndexing time: #{idx_time - t}"

query = "helbind.h"
# Time only the find call itself. The earlier version took the end timestamp
# after printing all results, so "Search time" also included the output.
search_start = Time.new
res = idx.find(query, 2)
query_time = Time.new

puts "query: #{query}"
puts "\nResults:"
puts "Filename, score"
puts "==============="
res.each do |id, score|
  puts "#{lines[id - 1]}, #{score.round(4)}"
end

puts "\nSearch time: #{query_time - search_start}"