StrIdx 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE +504 -0
- data/Makefile +268 -0
- data/README.md +92 -0
- data/demo.cpp +76 -0
- data/rubyext/extconf.rb +16 -0
- data/rubyext/ruby_interf.cpp +79 -0
- data/rubyext/test.rb +34 -0
- data/stridx.hpp +506 -0
- data/unordered_dense.h +2032 -0
- metadata +82 -0
data/Makefile
ADDED
@@ -0,0 +1,268 @@
|
|
1
|
+
|
2
|
+
SHELL = /bin/sh
|
3
|
+
|
4
|
+
# V=0 quiet, V=1 verbose. other values don't work.
|
5
|
+
V = 0
|
6
|
+
V0 = $(V:0=)
|
7
|
+
Q1 = $(V:1=)
|
8
|
+
Q = $(Q1:0=@)
|
9
|
+
ECHO1 = $(V:1=@ :)
|
10
|
+
ECHO = $(ECHO1:0=@ echo)
|
11
|
+
NULLCMD = :
|
12
|
+
|
13
|
+
#### Start of system configuration section. ####
|
14
|
+
|
15
|
+
srcdir = .
|
16
|
+
topdir = /home/samsam/.rbenv/versions/3.1.4/include/ruby-3.1.0
|
17
|
+
hdrdir = $(topdir)
|
18
|
+
arch_hdrdir = /home/samsam/.rbenv/versions/3.1.4/include/ruby-3.1.0/x86_64-linux
|
19
|
+
PATH_SEPARATOR = :
|
20
|
+
VPATH = $(srcdir):$(arch_hdrdir)/ruby:$(hdrdir)/ruby
|
21
|
+
prefix = $(DESTDIR)/home/samsam/.rbenv/versions/3.1.4
|
22
|
+
rubysitearchprefix = $(rubylibprefix)/$(sitearch)
|
23
|
+
rubyarchprefix = $(rubylibprefix)/$(arch)
|
24
|
+
rubylibprefix = $(libdir)/$(RUBY_BASE_NAME)
|
25
|
+
exec_prefix = $(prefix)
|
26
|
+
vendorarchhdrdir = $(vendorhdrdir)/$(sitearch)
|
27
|
+
sitearchhdrdir = $(sitehdrdir)/$(sitearch)
|
28
|
+
rubyarchhdrdir = $(rubyhdrdir)/$(arch)
|
29
|
+
vendorhdrdir = $(rubyhdrdir)/vendor_ruby
|
30
|
+
sitehdrdir = $(rubyhdrdir)/site_ruby
|
31
|
+
rubyhdrdir = $(includedir)/$(RUBY_VERSION_NAME)
|
32
|
+
vendorarchdir = $(vendorlibdir)/$(sitearch)
|
33
|
+
vendorlibdir = $(vendordir)/$(ruby_version)
|
34
|
+
vendordir = $(rubylibprefix)/vendor_ruby
|
35
|
+
sitearchdir = $(sitelibdir)/$(sitearch)
|
36
|
+
sitelibdir = $(sitedir)/$(ruby_version)
|
37
|
+
sitedir = $(rubylibprefix)/site_ruby
|
38
|
+
rubyarchdir = $(rubylibdir)/$(arch)
|
39
|
+
rubylibdir = $(rubylibprefix)/$(ruby_version)
|
40
|
+
sitearchincludedir = $(includedir)/$(sitearch)
|
41
|
+
archincludedir = $(includedir)/$(arch)
|
42
|
+
sitearchlibdir = $(libdir)/$(sitearch)
|
43
|
+
archlibdir = $(libdir)/$(arch)
|
44
|
+
ridir = $(datarootdir)/$(RI_BASE_NAME)
|
45
|
+
mandir = $(datarootdir)/man
|
46
|
+
localedir = $(datarootdir)/locale
|
47
|
+
libdir = $(exec_prefix)/lib
|
48
|
+
psdir = $(docdir)
|
49
|
+
pdfdir = $(docdir)
|
50
|
+
dvidir = $(docdir)
|
51
|
+
htmldir = $(docdir)
|
52
|
+
infodir = $(datarootdir)/info
|
53
|
+
docdir = $(datarootdir)/doc/$(PACKAGE)
|
54
|
+
oldincludedir = $(DESTDIR)/usr/include
|
55
|
+
includedir = $(prefix)/include
|
56
|
+
runstatedir = $(localstatedir)/run
|
57
|
+
localstatedir = $(prefix)/var
|
58
|
+
sharedstatedir = $(prefix)/com
|
59
|
+
sysconfdir = $(prefix)/etc
|
60
|
+
datadir = $(datarootdir)
|
61
|
+
datarootdir = $(prefix)/share
|
62
|
+
libexecdir = $(exec_prefix)/libexec
|
63
|
+
sbindir = $(exec_prefix)/sbin
|
64
|
+
bindir = $(exec_prefix)/bin
|
65
|
+
archdir = $(rubyarchdir)
|
66
|
+
|
67
|
+
|
68
|
+
CC_WRAPPER =
|
69
|
+
CC = gcc
|
70
|
+
CXX = g++
|
71
|
+
LIBRUBY = $(LIBRUBY_SO)
|
72
|
+
LIBRUBY_A = lib$(RUBY_SO_NAME)-static.a
|
73
|
+
LIBRUBYARG_SHARED = -Wl,-rpath,$(libdir) -L$(libdir) -l$(RUBY_SO_NAME)
|
74
|
+
LIBRUBYARG_STATIC = -Wl,-rpath,$(libdir) -L$(libdir) -l$(RUBY_SO_NAME)-static $(MAINLIBS)
|
75
|
+
empty =
|
76
|
+
OUTFLAG = -o $(empty)
|
77
|
+
COUTFLAG = -o $(empty)
|
78
|
+
CSRCFLAG = $(empty)
|
79
|
+
|
80
|
+
RUBY_EXTCONF_H =
|
81
|
+
cflags = $(optflags) $(debugflags) $(warnflags)
|
82
|
+
cxxflags =
|
83
|
+
optflags = -O3 -fno-fast-math
|
84
|
+
debugflags = -ggdb3
|
85
|
+
warnflags = -Wall -Wextra -Wdeprecated-declarations -Wduplicated-cond -Wimplicit-function-declaration -Wimplicit-int -Wmisleading-indentation -Wpointer-arith -Wwrite-strings -Wold-style-definition -Wimplicit-fallthrough=0 -Wmissing-noreturn -Wno-cast-function-type -Wno-constant-logical-operand -Wno-long-long -Wno-missing-field-initializers -Wno-overlength-strings -Wno-packed-bitfield-compat -Wno-parentheses-equality -Wno-self-assign -Wno-tautological-compare -Wno-unused-parameter -Wno-unused-value -Wsuggest-attribute=format -Wsuggest-attribute=noreturn -Wunused-variable -Wundef
|
86
|
+
cppflags =
|
87
|
+
CCDLFLAGS = -fPIC
|
88
|
+
CFLAGS = $(CCDLFLAGS) $(cflags) -fPIC $(ARCH_FLAG)
|
89
|
+
INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
|
90
|
+
DEFS =
|
91
|
+
CPPFLAGS = -I/home/samsam/.rbenv/versions/3.1.4/include $(DEFS) $(cppflags)
|
92
|
+
CXXFLAGS = $(CCDLFLAGS) -Wall -Wno-unused-variable -O3 -fopenmp $(ARCH_FLAG)
|
93
|
+
ldflags = -L. -L/home/samsam/.rbenv/versions/3.1.4/lib -fstack-protector-strong -rdynamic -Wl,-export-dynamic -Wl,--no-as-needed
|
94
|
+
dldflags = -L/home/samsam/.rbenv/versions/3.1.4/lib -Wl,--compress-debug-sections=zlib
|
95
|
+
ARCH_FLAG =
|
96
|
+
DLDFLAGS = $(ldflags) $(dldflags) $(ARCH_FLAG)
|
97
|
+
LDSHARED = $(CC) -shared
|
98
|
+
LDSHAREDXX = $(CXX) -shared
|
99
|
+
AR = gcc-ar
|
100
|
+
EXEEXT =
|
101
|
+
|
102
|
+
RUBY_INSTALL_NAME = $(RUBY_BASE_NAME)
|
103
|
+
RUBY_SO_NAME = ruby
|
104
|
+
RUBYW_INSTALL_NAME =
|
105
|
+
RUBY_VERSION_NAME = $(RUBY_BASE_NAME)-$(ruby_version)
|
106
|
+
RUBYW_BASE_NAME = rubyw
|
107
|
+
RUBY_BASE_NAME = ruby
|
108
|
+
|
109
|
+
arch = x86_64-linux
|
110
|
+
sitearch = $(arch)
|
111
|
+
ruby_version = 3.1.0
|
112
|
+
ruby = $(bindir)/$(RUBY_BASE_NAME)
|
113
|
+
RUBY = $(ruby)
|
114
|
+
BUILTRUBY = $(bindir)/$(RUBY_BASE_NAME)
|
115
|
+
ruby_headers = $(hdrdir)/ruby.h $(hdrdir)/ruby/backward.h $(hdrdir)/ruby/ruby.h $(hdrdir)/ruby/defines.h $(hdrdir)/ruby/missing.h $(hdrdir)/ruby/intern.h $(hdrdir)/ruby/st.h $(hdrdir)/ruby/subst.h $(arch_hdrdir)/ruby/config.h
|
116
|
+
|
117
|
+
RM = rm -f
|
118
|
+
RM_RF = rm -fr
|
119
|
+
RMDIRS = rmdir --ignore-fail-on-non-empty -p
|
120
|
+
MAKEDIRS = /usr/bin/mkdir -p
|
121
|
+
INSTALL = /usr/bin/install -c
|
122
|
+
INSTALL_PROG = $(INSTALL) -m 0755
|
123
|
+
INSTALL_DATA = $(INSTALL) -m 644
|
124
|
+
COPY = cp
|
125
|
+
TOUCH = exit >
|
126
|
+
|
127
|
+
#### End of system configuration section. ####
|
128
|
+
|
129
|
+
preload =
|
130
|
+
libpath = . $(libdir)
|
131
|
+
LIBPATH = -L. -L$(libdir) -Wl,-rpath,$(libdir)
|
132
|
+
DEFFILE =
|
133
|
+
|
134
|
+
CLEANFILES = mkmf.log
|
135
|
+
DISTCLEANFILES =
|
136
|
+
DISTCLEANDIRS =
|
137
|
+
|
138
|
+
extout =
|
139
|
+
extout_prefix =
|
140
|
+
target_prefix =
|
141
|
+
LOCAL_LIBS =
|
142
|
+
LIBS = $(LIBRUBYARG_SHARED) -lgomp -lstdc++ -lm -lc
|
143
|
+
ORIG_SRCS = demo.cpp ruby_interf.cpp
|
144
|
+
SRCS = $(ORIG_SRCS)
|
145
|
+
OBJS = demo.o ruby_interf.o
|
146
|
+
HDRS = $(srcdir)/unordered_dense.h $(srcdir)/stridx.hpp
|
147
|
+
LOCAL_HDRS =
|
148
|
+
TARGET = stridx
|
149
|
+
TARGET_NAME = stridx
|
150
|
+
TARGET_ENTRY = Init_$(TARGET_NAME)
|
151
|
+
DLLIB = $(TARGET).so
|
152
|
+
EXTSTATIC =
|
153
|
+
STATIC_LIB =
|
154
|
+
|
155
|
+
TIMESTAMP_DIR = .
|
156
|
+
BINDIR = $(bindir)
|
157
|
+
RUBYCOMMONDIR = $(sitedir)$(target_prefix)
|
158
|
+
RUBYLIBDIR = $(sitelibdir)$(target_prefix)
|
159
|
+
RUBYARCHDIR = $(sitearchdir)$(target_prefix)
|
160
|
+
HDRDIR = $(sitehdrdir)$(target_prefix)
|
161
|
+
ARCHHDRDIR = $(sitearchhdrdir)$(target_prefix)
|
162
|
+
TARGET_SO_DIR =
|
163
|
+
TARGET_SO = $(TARGET_SO_DIR)$(DLLIB)
|
164
|
+
CLEANLIBS = $(TARGET_SO) false
|
165
|
+
CLEANOBJS = *.o *.bak
|
166
|
+
|
167
|
+
all: $(DLLIB)
|
168
|
+
static: $(STATIC_LIB)
|
169
|
+
.PHONY: all install static install-so install-rb
|
170
|
+
.PHONY: clean clean-so clean-static clean-rb
|
171
|
+
|
172
|
+
clean-static::
|
173
|
+
clean-rb-default::
|
174
|
+
clean-rb::
|
175
|
+
clean-so::
|
176
|
+
clean: clean-so clean-static clean-rb-default clean-rb
|
177
|
+
-$(Q)$(RM_RF) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES) .*.time
|
178
|
+
|
179
|
+
distclean-rb-default::
|
180
|
+
distclean-rb::
|
181
|
+
distclean-so::
|
182
|
+
distclean-static::
|
183
|
+
distclean: clean distclean-so distclean-static distclean-rb-default distclean-rb
|
184
|
+
-$(Q)$(RM) Makefile $(RUBY_EXTCONF_H) conftest.* mkmf.log
|
185
|
+
-$(Q)$(RM) core ruby$(EXEEXT) *~ $(DISTCLEANFILES)
|
186
|
+
-$(Q)$(RMDIRS) $(DISTCLEANDIRS) 2> /dev/null || true
|
187
|
+
|
188
|
+
realclean: distclean
|
189
|
+
install: install-so install-rb
|
190
|
+
|
191
|
+
install-so: $(DLLIB) $(TIMESTAMP_DIR)/.sitearchdir.time
|
192
|
+
$(INSTALL_PROG) $(DLLIB) $(RUBYARCHDIR)
|
193
|
+
clean-static::
|
194
|
+
-$(Q)$(RM) $(STATIC_LIB)
|
195
|
+
install-rb: pre-install-rb do-install-rb install-rb-default
|
196
|
+
install-rb-default: pre-install-rb-default do-install-rb-default
|
197
|
+
pre-install-rb: Makefile
|
198
|
+
pre-install-rb-default: Makefile
|
199
|
+
do-install-rb:
|
200
|
+
do-install-rb-default:
|
201
|
+
pre-install-rb-default:
|
202
|
+
@$(NULLCMD)
|
203
|
+
$(TIMESTAMP_DIR)/.sitearchdir.time:
|
204
|
+
$(Q) $(MAKEDIRS) $(@D) $(RUBYARCHDIR)
|
205
|
+
$(Q) $(TOUCH) $@
|
206
|
+
|
207
|
+
site-install: site-install-so site-install-rb
|
208
|
+
site-install-so: install-so
|
209
|
+
site-install-rb: install-rb
|
210
|
+
|
211
|
+
.SUFFIXES: .c .m .cc .mm .cxx .cpp .o .S
|
212
|
+
|
213
|
+
.cc.o:
|
214
|
+
$(ECHO) compiling $(<)
|
215
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
216
|
+
|
217
|
+
.cc.S:
|
218
|
+
$(ECHO) translating $(<)
|
219
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
220
|
+
|
221
|
+
.mm.o:
|
222
|
+
$(ECHO) compiling $(<)
|
223
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
224
|
+
|
225
|
+
.mm.S:
|
226
|
+
$(ECHO) translating $(<)
|
227
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
228
|
+
|
229
|
+
.cxx.o:
|
230
|
+
$(ECHO) compiling $(<)
|
231
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
232
|
+
|
233
|
+
.cxx.S:
|
234
|
+
$(ECHO) translating $(<)
|
235
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
236
|
+
|
237
|
+
.cpp.o:
|
238
|
+
$(ECHO) compiling $(<)
|
239
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
240
|
+
|
241
|
+
.cpp.S:
|
242
|
+
$(ECHO) translating $(<)
|
243
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
244
|
+
|
245
|
+
.c.o:
|
246
|
+
$(ECHO) compiling $(<)
|
247
|
+
$(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
248
|
+
|
249
|
+
.c.S:
|
250
|
+
$(ECHO) translating $(<)
|
251
|
+
$(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
252
|
+
|
253
|
+
.m.o:
|
254
|
+
$(ECHO) compiling $(<)
|
255
|
+
$(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
256
|
+
|
257
|
+
.m.S:
|
258
|
+
$(ECHO) translating $(<)
|
259
|
+
$(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
260
|
+
|
261
|
+
$(TARGET_SO): $(OBJS) Makefile
|
262
|
+
$(ECHO) linking shared-object $(DLLIB)
|
263
|
+
-$(Q)$(RM) $(@)
|
264
|
+
$(Q) $(LDSHAREDXX) -o $@ $(OBJS) $(LIBPATH) $(DLDFLAGS) $(LOCAL_LIBS) $(LIBS)
|
265
|
+
|
266
|
+
|
267
|
+
|
268
|
+
$(OBJS): $(HDRS) $(ruby_headers)
|
data/README.md
ADDED
@@ -0,0 +1,92 @@
|
|
1
|
+
# stridx
|
2
|
+
This library provides fast fuzzy string similarity search and indexing. It has been mainly developed for indexing filepaths, but can be used for other types of strings as well. It can easily handle fuzzy searches for more than 100,000 filepaths.
|
3
|
+
|
4
|
+
The fuzziness means that candidate filepaths do not need to include an exact match of the query string. They are considered a good match if they include parts of the query string, even if those parts are in the wrong order.
|
5
|
+
|
6
|
+
The library can also be applied to UTF-8 data, although there is a small bias in scoring for multibyte characters.
|
7
|
+
|
8
|
+
|
9
|
+
## String similarity calculation
|
10
|
+
|
11
|
+
Once the index has been created, the contents can be searched to find the best matching strings.
|
12
|
+
|
13
|
+
To be considered a candidate path, the file component of the path (e.g. file.txt)
|
14
|
+
is required to have at least a substring of two characters in common with the
|
15
|
+
query string. If that condition is true, then the directories will also add to the
|
16
|
+
score, although with a smaller weight.
|
17
|
+
|
18
|
+
The scores that measure how good a candidate is are calculated as follows (somewhat simplified).
|
19
|
+
For each single character substring c in the query string:
|
20
|
+
|
21
|
+
- find the largest substring in the query which includes the substring c and is also included in the candidate path
|
22
|
+
- take the length of that substring as the score
|
23
|
+
|
24
|
+
Sum up the scores for each character c and divide by (query string length)^2
|
25
|
+
|
26
|
+
For example, if query = "rngnomadriv"
|
27
|
+
and candidate is "./drivers/char/hw_random/nomadik-rng.c", then scores are calculated as follows:
|
28
|
+
```
|
29
|
+
rngnomadriv
|
30
|
+
33355555444 (subscores)
|
31
|
+
FFFFFFFFDDD (F=file component, D=dir component)
|
32
|
+
score1=(3+3+3+5+5+5+5+5+(4+4+4)*0.7)
|
33
|
+
|
34
|
+
In final score, we give a small penalty for larger candidate filenames:
|
35
|
+
Divide the main part of the score by (query string length)^2
|
36
|
+
and the minor part by (query string length)*(candidate string length)
|
37
|
+
score = score1/(11*11)*0.97 + score1/(11*38)*0.03 = 0.342944
|
38
|
+
```
|
39
|
+
# C++ API
|
40
|
+
See demo.cpp
|
41
|
+
```cpp
|
42
|
+
#include "stridx.hpp"
|
43
|
+
|
44
|
+
int main() {
|
45
|
+
StringIndex idx;
|
46
|
+
...
|
47
|
+
|
48
|
+
// Add the file paths of 89828 files in linux-6.9-rc6 to the index
|
49
|
+
std::string fn_filePaths = "flist.txt";
|
50
|
+
std::vector<std::string> v_filePaths = readLinesFromFile(fn_filePaths);
|
51
|
+
int id = 0;
|
52
|
+
for (const auto &filePath : v_filePaths) {
|
53
|
+
idx.addStrToIndex(filePath, id, '/' /*dir separator*/);
|
54
|
+
id++;
|
55
|
+
}
|
56
|
+
...
|
57
|
+
|
58
|
+
// Find matching filepaths from the index for the query string "rngnomadriv"
|
59
|
+
std::string query = "rngnomadriv";
|
60
|
+
const vector<pair<float, int>> &results = idx.findSimilar(query, 2);
|
61
|
+
...
|
62
|
+
int i = 0;
|
63
|
+
std::cout << "query string: " << query << "\n";
|
64
|
+
std::cout << "Top 20 matches:\n";
|
65
|
+
for (const auto &res : results) {
|
66
|
+
std::cout << res.second << " " << res.first << " " << v_filePaths[res.second] << "\n";
|
67
|
+
i++;
|
68
|
+
if (i > 20) {
|
69
|
+
break;
|
70
|
+
}
|
71
|
+
}
|
72
|
+
}
|
73
|
+
|
74
|
+
|
75
|
+
```
|
76
|
+
|
77
|
+
Output:
|
78
|
+
```
|
79
|
+
Indexing creation time for 89828 file paths (seconds): 2.89097
|
80
|
+
Search time (seconds): 0.0346287
|
81
|
+
query string: rngnomadriv
|
82
|
+
Top 20 matches:
|
83
|
+
56383 0.342944 ./drivers/char/hw_random/nomadik-rng.c
|
84
|
+
65420 0.271396 ./drivers/pinctrl/nomadik
|
85
|
+
58689 0.271126 ./drivers/clk/clk-nomadik.c
|
86
|
+
55819 0.270893 ./drivers/gpio/gpio-nomadik.c
|
87
|
+
47837 0.270431 ./drivers/i2c/busses/i2c-nomadik.c
|
88
|
+
59594 0.270355 ./drivers/clocksource/nomadik-mtu.c
|
89
|
+
51950 0.270088 ./drivers/gpu/drm/pl111/pl111_nomadik.c
|
90
|
+
...
|
91
|
+
|
92
|
+
```
|
data/demo.cpp
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
#include "stridx.hpp"
|
2
|
+
|
3
|
+
#include <iostream>
|
4
|
+
#include <fstream>
|
5
|
+
#include <vector>
|
6
|
+
#include <string>
|
7
|
+
#include <chrono>
|
8
|
+
|
9
|
+
using std::cout;
|
10
|
+
using std::pair;
|
11
|
+
using std::vector;
|
12
|
+
|
13
|
+
// Reads the text file `filename` and returns its contents as one string per
// line (line terminators are dropped by std::getline).
//
// If the file cannot be opened, an error message is written to std::cerr and
// an empty vector is returned.
std::vector<std::string> readLinesFromFile(const std::string &filename) {
  std::vector<std::string> result;
  std::ifstream input(filename);

  if (input.is_open()) {
    // Collect every line until getline reports end-of-file or a read error.
    for (std::string current; std::getline(input, current);) {
      result.push_back(current);
    }
    input.close();
  } else {
    std::cerr << "Error opening file: " << filename << std::endl;
  }

  return result;
}
|
29
|
+
|
30
|
+
int main() {
|
31
|
+
StringIndex idx;
|
32
|
+
// idx.addStrToIndex("./gdk/x11/gdkasync.c", 0 /*id*/, '/' /*separator*/);
|
33
|
+
// idx.addStrToIndex("./gdk/x11/gdksettings.c", 1, '/');
|
34
|
+
// idx.addStrToIndex("./gdk/x11/gdkx11devicemanager-xi2.h", 2, '/');
|
35
|
+
|
36
|
+
// Add the file paths of 89828 files in linux-6.9-rc6 to the index
|
37
|
+
std::string fn_filePaths = "flist.txt";
|
38
|
+
std::vector<std::string> v_filePaths = readLinesFromFile(fn_filePaths);
|
39
|
+
|
40
|
+
auto start = std::chrono::high_resolution_clock::now();
|
41
|
+
int id = 0;
|
42
|
+
for (const auto &filePath : v_filePaths) {
|
43
|
+
idx.addStrToIndex(filePath, id, '/' /*dir separator*/);
|
44
|
+
// idx.addStrToIndex(filePath, id, '\0' /*dir separator*/);
|
45
|
+
id++;
|
46
|
+
}
|
47
|
+
|
48
|
+
auto idx_time = std::chrono::high_resolution_clock::now();
|
49
|
+
std::chrono::duration<double, std::milli> duration = idx_time - start;
|
50
|
+
cout << "Indexing creation time for " << v_filePaths.size() << " file paths (seconds): " << duration.count() / 1000 << "\n";
|
51
|
+
|
52
|
+
// Find matching filepaths from the index for the query string "rngnomadriv"
|
53
|
+
start = std::chrono::high_resolution_clock::now();
|
54
|
+
std::string query = "rngnomadriv";
|
55
|
+
const vector<pair<float, int>> &results = idx.findSimilar(query, 2);
|
56
|
+
auto search_time = std::chrono::high_resolution_clock::now();
|
57
|
+
duration = search_time - start;
|
58
|
+
cout << "Search time (seconds): " << duration.count() / 1000
|
59
|
+
<< "\n";
|
60
|
+
|
61
|
+
int i = 0;
|
62
|
+
std::cout << "query string: " << query << "\n";
|
63
|
+
std::cout << "Top 20 matches:\n";
|
64
|
+
for (const auto &res : results) {
|
65
|
+
std::cout << res.second << " " << res.first << " " << v_filePaths[res.second] << "\n";
|
66
|
+
i++;
|
67
|
+
if (i > 20) {
|
68
|
+
break;
|
69
|
+
}
|
70
|
+
}
|
71
|
+
|
72
|
+
return 0;
|
73
|
+
}
|
74
|
+
|
75
|
+
// Compile:
|
76
|
+
// g++ -Wall -Wno-unused-variable -O3 -fopenmp -lstdc++ demo.cpp -o demo
|
data/rubyext/extconf.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
#
# mkmf build script: generates the Makefile used to compile the "stridx"
# native extension.

require 'mkmf'

# Name of the extension; the compiled library's entry point is Init_stridx.
extension_name = 'stridx'

# The C++ sources are built with optimisation and OpenMP enabled.
$CXXFLAGS << " -Wall -Wno-unused-variable -O3 -fopenmp"

# Link against the C++ runtime and the GNU OpenMP runtime when available.
have_library('stdc++')
have_library('gomp')

dir_config(extension_name) # The destination
create_makefile(extension_name) # Create Makefile
|
16
|
+
|
@@ -0,0 +1,79 @@
|
|
1
|
+
|
2
|
+
#include <ruby.h>
|
3
|
+
#include "ruby/ruby.h"
|
4
|
+
#include "ruby/thread.h"
|
5
|
+
|
6
|
+
#include "../stridx.hpp"
|
7
|
+
|
8
|
+
extern "C" {
|
9
|
+
|
10
|
+
void str_idx_free(void *data) { delete (StringIndex *)data; }
|
11
|
+
|
12
|
+
// Wrap StringIndex class inside a ruby variable
|
13
|
+
static const rb_data_type_t str_idx_type = {
|
14
|
+
// .wrap_struct_name: "doesn’t really matter what it is as long as it’s sensible and unique"
|
15
|
+
.wrap_struct_name = "StringIndexW9q4We",
|
16
|
+
|
17
|
+
// Used by Carbage Collector:
|
18
|
+
.function =
|
19
|
+
{
|
20
|
+
.dmark = NULL,
|
21
|
+
.dfree = str_idx_free,
|
22
|
+
.dsize = NULL, // TODO
|
23
|
+
},
|
24
|
+
.data = NULL,
|
25
|
+
.flags = RUBY_TYPED_FREE_IMMEDIATELY,
|
26
|
+
};
|
27
|
+
|
28
|
+
VALUE str_idx_alloc(VALUE self) {
|
29
|
+
void *data = new StringIndex();
|
30
|
+
return TypedData_Wrap_Struct(self, &str_idx_type, data);
|
31
|
+
}
|
32
|
+
|
33
|
+
VALUE StringIndexAddSegments(VALUE self, VALUE str, VALUE fileId) {
|
34
|
+
std::string s1 = StringValueCStr(str);
|
35
|
+
int fid = NUM2INT(fileId);
|
36
|
+
|
37
|
+
void *data;
|
38
|
+
TypedData_Get_Struct(self, int, &str_idx_type, data);
|
39
|
+
((StringIndex *)data)->addStrToIndex(s1, fid, '/');
|
40
|
+
|
41
|
+
return self;
|
42
|
+
}
|
43
|
+
|
44
|
+
VALUE StringIndexFind(VALUE self, VALUE str, VALUE minChars) {
|
45
|
+
VALUE ret;
|
46
|
+
std::string s1 = StringValueCStr(str);
|
47
|
+
|
48
|
+
void *data;
|
49
|
+
TypedData_Get_Struct(self, int, &str_idx_type, data);
|
50
|
+
StringIndex *idx = (StringIndex *)data;
|
51
|
+
|
52
|
+
ret = rb_ary_new();
|
53
|
+
const std::vector<std::pair<float, int>> &results = idx->findSimilar(s1, NUM2INT(minChars));
|
54
|
+
int limit = 15;
|
55
|
+
int i = 0;
|
56
|
+
for (const auto &res : results) {
|
57
|
+
VALUE arr = rb_ary_new();
|
58
|
+
rb_ary_push(arr, INT2NUM(res.second));
|
59
|
+
rb_ary_push(arr, DBL2NUM(res.first));
|
60
|
+
rb_ary_push(ret, arr);
|
61
|
+
i++;
|
62
|
+
if (i >= limit) {
|
63
|
+
break;
|
64
|
+
}
|
65
|
+
}
|
66
|
+
return ret;
|
67
|
+
}
|
68
|
+
|
69
|
+
void Init_stridx(void) {
|
70
|
+
|
71
|
+
VALUE cFoo = rb_define_class("CppStringIndex", rb_cObject);
|
72
|
+
|
73
|
+
rb_define_alloc_func(cFoo, str_idx_alloc);
|
74
|
+
rb_define_method(cFoo, "add", StringIndexAddSegments, 2);
|
75
|
+
rb_define_method(cFoo, "find", StringIndexFind, 2);
|
76
|
+
}
|
77
|
+
|
78
|
+
} // End extern "C"
|
79
|
+
|
data/rubyext/test.rb
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Smoke test / benchmark for the stridx extension: indexes every path listed
# in ../flist.txt, runs one fuzzy query and prints the results with timings.

$:.unshift File.dirname(__FILE__)
require "stridx"
idx = CppStringIndex.new

t = Time.new
# Resolve the file list relative to this script rather than the current
# working directory, so the test can be started from anywhere.
list_path = File.expand_path("../flist.txt", __dir__)
lines = IO.read(list_path).lines.collect { |x| x.strip }

# Ids are 1-based: line N of the list is stored under id N.
lines.each.with_index(1) do |path, id|
  idx.add(path, id)
end

idx_time = Time.new
puts "\nIndexing time: #{idx_time - t}"
query = "helbind.h"
res = idx.find(query, 2)
puts "query: #{query}"
puts "\nResults:"
puts "Filename, score"
puts "==============="
res.each do |id, score|
  # Convert the 1-based id back to an index into `lines`.
  fn = lines[id - 1]
  puts "#{fn}, #{score.round(4)}"
end


query_time = Time.new

puts "\nSearch time: #{query_time - idx_time}"
|