StrIdx 0.1.2 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CMakeLists.txt +27 -0
- data/Gemfile +5 -0
- data/Makefile +2 -2
- data/README.md +49 -3
- data/demo.cpp +30 -8
- data/exe/stridx.rb +16 -0
- data/gem_install +4 -0
- data/py_example.py +18 -0
- data/py_interf.cpp +182 -0
- data/rubyext/extconf.rb +1 -3
- data/rubyext/ruby_interf.cpp +18 -5
- data/runserver.rb +7 -0
- data/server.rb +103 -0
- data/setup.py +32 -0
- data/stridx-screencast.mp4 +0 -0
- data/stridx-tty.rb +122 -0
- data/stridx.gemspec +37 -0
- data/stridx.hpp +172 -71
- data/test.rb +12 -1
- data/thread_pool.hpp +98 -0
- data/unit_tests.sh +4 -0
- data/unittest.cpp +147 -0
- metadata +103 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f7655b6bd71bca58c86ad607fd197933fc19b97b3ae1c76e322ec0432025dad7
|
4
|
+
data.tar.gz: 2421892aa6fe750213d08e2254019d87ce7abb10496cdf1635a61815b8b8b0d7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0a0ed3f51b95b72a553cf97e1a852f63e8b6d1cbbba56fdab55ed5037ef4d658b8649f2cf92d35b5eea4e6657a18a3a1460a3a57069cda0889a1987f5d1611ee
|
7
|
+
data.tar.gz: f0d4753ee43cb205fa86468dad92a66644e12cf889d8018283cb421e4a5d670386b8666af64b89d4d475a59f10701de2223c31d03ac5d366f2ee255c77190cf8
|
data/CMakeLists.txt
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
cmake_minimum_required(VERSION 3.14)

project(my_project)

# GoogleTest is fetched at configure time from a pinned commit archive.
# https://github.com/google/googletest/issues/4000
include(FetchContent)
FetchContent_Declare(
  googletest
  URL https://github.com/google/googletest/archive/58d77fa8070e8cec2dc1ed015d66b454c8d78850.zip # release-1.12.1
)

# NOTE(review): GoogleTest build option; presumably only affects the MSVC runtime choice - confirm.
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(googletest)

enable_testing()

# Unit test binary, linked against the GoogleTest-provided main().
add_executable(stridx_test unittest.cpp)
target_link_libraries(stridx_test GTest::gtest_main)

# Register every test case found in the binary with CTest.
include(GoogleTest)
gtest_discover_tests(stridx_test)
|
27
|
+
|
data/Makefile
CHANGED
data/README.md
CHANGED
@@ -37,12 +37,58 @@ and candidate is "./drivers/char/hw_random/nomadik-rng.c", then scores are calcu
|
|
37
37
|
score = score1/(11*11)*0.97 + score1/(11*38)*0.03 = 0.342944
|
38
38
|
```
|
39
39
|
|
40
|
-
#
|
40
|
+
# Interfaces
|
41
|
+
|
42
|
+
## Command line
|
43
|
+
Install instructions (for Ubuntu Linux):
|
44
|
+
```
|
45
|
+
apt update
|
46
|
+
apt install ruby ruby-dev build-essential
|
47
|
+
gem install StrIdx
|
48
|
+
```
|
49
|
+
|
50
|
+
Start the indexing server (in the background):
|
51
|
+
```
|
52
|
+
stridx.rb start -- ~/Documents/ ~/Pictures/
|
53
|
+
```
|
54
|
+
|
55
|
+
Add bash keybindings (Ctrl-t):
|
56
|
+
```
|
57
|
+
eval "$(stridx.rb bash)"
|
58
|
+
```
|
59
|
+
|
60
|
+
Search by pressing <kbd>ctrl</kbd>+<kbd>t</kbd>. Keys: <kbd>up</kbd>, <kbd>down</kbd>, select with <kbd>enter</kbd>
|
61
|
+
|
62
|
+

|
63
|
+
|
64
|
+
|
65
|
+
Stop server:
|
66
|
+
```
|
67
|
+
stridx.rb stop
|
68
|
+
```
|
69
|
+
|
70
|
+
Start the indexing server (in the foreground, for debugging):
|
71
|
+
```
|
72
|
+
stridx.rb run -- ~/Documents/ ~/Pictures/
|
73
|
+
```
|
74
|
+
|
75
|
+
|
76
|
+
## Ruby
|
41
77
|
Install:
|
42
78
|
```
|
79
|
+
apt install ruby ruby-dev build-essential
|
43
80
|
gem install StrIdx
|
44
81
|
```
|
45
82
|
|
83
|
+
Or, for the development version:
|
84
|
+
```
|
85
|
+
git clone https://github.com/SamiSieranoja/stridx.git
|
86
|
+
cd stridx
|
87
|
+
cd rubyext; ruby extconf.rb ; make ; cd ..
|
88
|
+
gem build stridx.gemspec
|
89
|
+
gem install $(ls -1tr StrIdx*gem | tail -n 1)
|
90
|
+
```
|
91
|
+
|
46
92
|
Usage example (see test.rb):
|
47
93
|
```ruby
|
48
94
|
require "stridx"
|
@@ -58,7 +104,7 @@ for x in lines
|
|
58
104
|
end
|
59
105
|
|
60
106
|
idx_time = Time.new
|
61
|
-
puts "\nIndexing time (#{lines.size} files
|
107
|
+
puts "\nIndexing time (#{lines.size} files): #{(idx_time - t).round(4)} seconds"
|
62
108
|
|
63
109
|
query = "rngnomadriv"
|
64
110
|
res = idx.find(query)
|
@@ -105,7 +151,7 @@ Search time: 0.0488 seconds
|
|
105
151
|
```
|
106
152
|
|
107
153
|
|
108
|
-
|
154
|
+
## C++
|
109
155
|
See demo.cpp
|
110
156
|
```cpp
|
111
157
|
#include "stridx.hpp"
|
data/demo.cpp
CHANGED
@@ -1,3 +1,12 @@
|
|
1
|
+
|
2
|
+
#include <condition_variable>
|
3
|
+
#include <functional>
|
4
|
+
#include <iostream>
|
5
|
+
#include <mutex>
|
6
|
+
#include <queue>
|
7
|
+
#include <thread>
|
8
|
+
#include <algorithm>
|
9
|
+
|
1
10
|
#include "stridx.hpp"
|
2
11
|
|
3
12
|
#include <iostream>
|
@@ -28,7 +37,7 @@ std::vector<std::string> readLinesFromFile(const std::string &filename) {
|
|
28
37
|
}
|
29
38
|
|
30
39
|
int main() {
|
31
|
-
StringIndex idx;
|
40
|
+
StrIdx::StringIndex idx('/'); // Separate directories using unix style "/" char
|
32
41
|
// idx.addStrToIndex("./gdk/x11/gdkasync.c", 0 /*id*/, '/' /*separator*/);
|
33
42
|
// idx.addStrToIndex("./gdk/x11/gdksettings.c", 1, '/');
|
34
43
|
// idx.addStrToIndex("./gdk/x11/gdkx11devicemanager-xi2.h", 2, '/');
|
@@ -37,26 +46,39 @@ int main() {
|
|
37
46
|
std::string fn_filePaths = "flist.txt";
|
38
47
|
std::vector<std::string> v_filePaths = readLinesFromFile(fn_filePaths);
|
39
48
|
|
49
|
+
// Launch indexing to be run on background
|
50
|
+
cout << "File paths: " << v_filePaths.size() << std::endl;
|
51
|
+
cout << "Start indexing in the background" << std::endl;
|
40
52
|
auto start = std::chrono::high_resolution_clock::now();
|
41
53
|
int id = 0;
|
42
54
|
for (const auto &filePath : v_filePaths) {
|
43
|
-
idx.
|
44
|
-
// idx.addStrToIndex(filePath, id, '\0' /*dir separator*/);
|
55
|
+
idx.addStrToIndexThreaded(filePath, id);
|
45
56
|
id++;
|
46
57
|
}
|
47
|
-
|
58
|
+
|
59
|
+
auto idx_time_launch = std::chrono::high_resolution_clock::now();
|
60
|
+
std::chrono::duration<double, std::milli> duration_launch = idx_time_launch - start;
|
61
|
+
cout << "Indexing launch time (seconds): " << duration_launch.count() / 1000 << "\n";
|
62
|
+
|
63
|
+
// Wait until indexing has finished
|
64
|
+
idx.waitUntilDone();
|
65
|
+
|
48
66
|
auto idx_time = std::chrono::high_resolution_clock::now();
|
49
67
|
std::chrono::duration<double, std::milli> duration = idx_time - start;
|
50
|
-
cout << "Indexing
|
68
|
+
cout << "Indexing finished time for " << v_filePaths.size()
|
69
|
+
<< " file paths (seconds): " << duration.count() / 1000 << "\n";
|
51
70
|
|
52
71
|
// Find matching filepaths from the index for the query string "rngnomadriv"
|
53
72
|
start = std::chrono::high_resolution_clock::now();
|
54
73
|
std::string query = "rngnomadriv";
|
74
|
+
for (int i = 0; i < 99; i++) {
|
75
|
+
const vector<pair<float, int>> &results = idx.findSimilar(query, 2);
|
76
|
+
}
|
77
|
+
|
55
78
|
const vector<pair<float, int>> &results = idx.findSimilar(query, 2);
|
56
79
|
auto search_time = std::chrono::high_resolution_clock::now();
|
57
80
|
duration = search_time - start;
|
58
|
-
cout << "Search time (seconds): " << duration.count() / 1000
|
59
|
-
<< "\n";
|
81
|
+
cout << "Search time for 100 queries (seconds): " << duration.count() / 1000 << "\n";
|
60
82
|
|
61
83
|
int i = 0;
|
62
84
|
std::cout << "query string: " << query << "\n";
|
@@ -73,4 +95,4 @@ int main() {
|
|
73
95
|
}
|
74
96
|
|
75
97
|
// Compile:
|
76
|
-
// g++ -Wall -Wno-unused-variable -O3 -
|
98
|
+
// g++ -Wall -Wno-unused-variable -O3 -lstdc++ demo.cpp -o demo
|
data/exe/stridx.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# Make the gem's top-level directory loadable.
$:.unshift File.dirname(__FILE__) + "/.."

# The first argument selects the mode: interactive search UI, bash key
# binding snippet, or (anything else) daemon control handled by Daemons.
case ARGV[0]
when "tty"
  require "stridx-tty.rb"
  StrIdxTTY.run
when "bash"
  puts %q/
bind -m emacs-standard '"\er": redraw-current-line';
bind -m emacs-standard '"\C-t": " \C-b\C-k \C-u`stridx.rb tty`\e\C-e\er\C-a\C-y\C-h\C-e\e \C-y\ey\C-x\C-x\C-f"'
/
else
  require "daemons"
  Daemons.run(File.dirname(__FILE__) + "/../runserver.rb")
end
|
data/gem_install
ADDED
data/py_example.py
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
|
3
|
+
from stridx import StringIndex
|
4
|
+
# Build a small index; each path is stored under its position in the list.
index = StringIndex()
index.set_value(3)
paths = [
    "./rust/alloc/vec/spec_extend.rs",
    "./virt/kvm/dirty_ring.c",
    "./Documentation/staging/static-keys.rst",
    "./Documentation/staging/lzo.rst",
]
for file_id, path in enumerate(paths):
    index.add(path, file_id)

# Print every hit returned for the query.
for hit in index.find("rstalloc"):
    print(hit)

# print(index.get_value())
|
18
|
+
|
data/py_interf.cpp
ADDED
@@ -0,0 +1,182 @@
|
|
1
|
+
|
2
|
+
#include <stdio.h>
|
3
|
+
#include <limits.h>
|
4
|
+
#include <cstring>
|
5
|
+
#include <pthread.h>
|
6
|
+
|
7
|
+
#include <stdio.h>
|
8
|
+
#include <iostream>
|
9
|
+
#include <string>
|
10
|
+
#include <vector>
|
11
|
+
#include <cfloat>
|
12
|
+
#include <cmath>
|
13
|
+
#include <bits/stdc++.h>
|
14
|
+
|
15
|
+
using std::ios;
|
16
|
+
using std::sort;
|
17
|
+
using std::string;
|
18
|
+
using std::vector;
|
19
|
+
|
20
|
+
#include <Python.h>
|
21
|
+
#include <cstring>
|
22
|
+
|
23
|
+
#include "stridx.hpp"
|
24
|
+
|
25
|
+
extern "C" {
|
26
|
+
|
27
|
+
// Define a structure for the custom object
|
28
|
+
typedef struct {
|
29
|
+
PyObject_HEAD int value;
|
30
|
+
StrIdx::StringIndex *idx;
|
31
|
+
} StrIdxObject;
|
32
|
+
|
33
|
+
// Method to allocate memory for the object
|
34
|
+
// tp_new slot: allocates the wrapper through the type's allocator, zeroes the
// integer payload and creates the native index. Returns NULL if allocation of
// the wrapper fails.
static PyObject *StrIdxObject_new(PyTypeObject *type, PyObject *args, PyObject *kwds) {
  StrIdxObject *obj = (StrIdxObject *)type->tp_alloc(type, 0);
  if (obj == NULL) {
    return NULL;
  }

  obj->value = 0;
  obj->idx = new StrIdx::StringIndex();
  return (PyObject *)obj;
}
|
45
|
+
|
46
|
+
// Method to deallocate memory for the object
|
47
|
+
// tp_dealloc slot: destroys the native index owned by the wrapper, then
// releases the Python object itself through the type's tp_free slot.
static void StrIdxObject_dealloc(StrIdxObject *self) {
  // The index is created with `new` in StrIdxObject_new and owned by this
  // object; without this delete every StringIndex instance would leak it.
  // Deleting a null pointer is a no-op.
  delete self->idx;
  self->idx = NULL;
  Py_TYPE(self)->tp_free((PyObject *)self);
}
|
48
|
+
|
49
|
+
// Method to set the value of the object
|
50
|
+
// Python method set_value(int): stores the integer on the object and returns
// None. Returns NULL (with the parser's exception set) on bad arguments.
static PyObject *StrIdxObject_set_value(StrIdxObject *self, PyObject *args) {
  int parsed = 0;
  if (PyArg_ParseTuple(args, "i", &parsed)) {
    self->value = parsed;
    Py_RETURN_NONE;
  }
  return NULL;
}
|
62
|
+
|
63
|
+
// Python method add(str, int): passes the string and the integer id to
// addStrToIndex() on the native index and returns None.
static PyObject *StrIdxObject_add(StrIdxObject *self, PyObject *args) {
  char *text = NULL;
  int file_id = 0;
  if (!PyArg_ParseTuple(args, "si", &text, &file_id)) {
    return NULL;
  }

  printf("char[]*: %s %i\n", text, file_id);

  std::string str(text);
  self->idx->addStrToIndex(str, file_id);
  // self->idx->addStrToIndexThreaded(str, file_id);
  Py_RETURN_NONE;
}
|
78
|
+
|
79
|
+
// Python method find(str): queries the native index with findSimilar() and
// returns a list of [file_id, score] lists, capped at 15 entries and kept in
// the order findSimilar() produced them. Returns NULL with a Python exception
// set if argument parsing or any allocation fails.
static PyObject *StrIdxObject_find(StrIdxObject *self, PyObject *args) {
  char *value;
  if (!PyArg_ParseTuple(args, "s", &value)) {
    return NULL;
  }
  std::string str = value;

  printf("char*: %s\n", value);
  const std::vector<std::pair<float, int>> &results = self->idx->findSimilar(str, 2);

  const size_t max_results = 15;
  const size_t count = results.size() < max_results ? results.size() : max_results;

  // size() is a size_t; printing it with %d is undefined behaviour.
  printf("res=%zu\n", results.size());

  PyObject *pyarr = PyList_New((Py_ssize_t)count);
  if (pyarr == NULL) {
    return NULL;
  }

  for (size_t i = 0; i < count; i++) {
    // "[id]" builds the two-element list [file_id, score] in one call.
    PyObject *entry = Py_BuildValue("[id]", results[i].second, (double)results[i].first);
    if (entry == NULL) {
      Py_DECREF(pyarr);  // slots not yet filled are NULL and are skipped on release
      return NULL;
    }
    PyList_SetItem(pyarr, (Py_ssize_t)i, entry);  // steals the reference to entry
  }

  return pyarr;
}
|
115
|
+
|
116
|
+
// Method to get the value of the object
|
117
|
+
// Python method get_value(): returns the stored integer as a Python int.
// The function is registered with METH_NOARGS, for which CPython still passes
// a second (always NULL) argument, so the signature declares it explicitly
// instead of being called through a mismatched function pointer type.
static PyObject *StrIdxObject_get_value(StrIdxObject *self, PyObject *Py_UNUSED(ignored)) {
  return PyLong_FromLong(self->value);
}
|
120
|
+
|
121
|
+
// Define methods of the class
|
122
|
+
// Method table of the Python type: name, C implementation, calling
// convention, docstring. The docstring of "add" previously duplicated the
// one of "set_value".
static PyMethodDef StrIdxObject_methods[] = {
    {"set_value", (PyCFunction)StrIdxObject_set_value, METH_VARARGS, "Set the value of the object"},
    {"add", (PyCFunction)StrIdxObject_add, METH_VARARGS,
     "Add a string to the index under the given integer id"},
    {"find", (PyCFunction)StrIdxObject_find, METH_VARARGS, "Find similar strings"},
    {"get_value", (PyCFunction)StrIdxObject_get_value, METH_NOARGS, "Get the value of the object"},
    {NULL, NULL, 0, NULL} /* Sentinel */
};
|
130
|
+
|
131
|
+
// Define the type object for the class
|
132
|
+
// Type object backing the Python class. Fields appear in PyTypeObject
// declaration order; tp_flags is set to Py_TPFLAGS_DEFAULT (it was previously
// left at 0), as the CPython extension-type documentation prescribes.
static PyTypeObject StrIdxType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    .tp_name = "stridx.StrIdx",
    .tp_basicsize = sizeof(StrIdxObject),
    .tp_itemsize = 0,
    .tp_dealloc = (destructor)StrIdxObject_dealloc,
    .tp_flags = Py_TPFLAGS_DEFAULT,
    .tp_doc = PyDoc_STR("Fuzzy string index"),
    .tp_methods = StrIdxObject_methods,
    .tp_new = StrIdxObject_new,
    // .tp_repr = (reprfunc)myobj_repr,
};
|
141
|
+
|
142
|
+
// PyVarObject_HEAD_INIT(NULL, 0)
|
143
|
+
// .tp_name = "stridx.StrIdx",
|
144
|
+
// .tp_doc = "StrIdx class",
|
145
|
+
// .tp_basicsize = sizeof(StrIdxObject),
|
146
|
+
// .tp_itemsize = 0,
|
147
|
+
// .tp_flags = Py_TPFLAGS_DEFAULT,
|
148
|
+
// .tp_new = StrIdxObject_new,
|
149
|
+
// .tp_dealloc = (destructor)StrIdxObject_dealloc,
|
150
|
+
// .tp_methods = StrIdxObject_methods,
|
151
|
+
// };
|
152
|
+
|
153
|
+
|
154
|
+
// Define python accessible methods
|
155
|
+
static PyMethodDef StrIdxMethods[] = {
|
156
|
+
{NULL, NULL, 0, NULL}};
|
157
|
+
|
158
|
+
static struct PyModuleDef moduledef = {
|
159
|
+
PyModuleDef_HEAD_INIT, "stridx", NULL, -1, StrIdxMethods, NULL, NULL, NULL, NULL};
|
160
|
+
|
161
|
+
// Module entry point: finalizes the type object, creates the module and
// publishes the type under the name "StringIndex". Returns the new module,
// or NULL with a Python exception set on any failure.
PyMODINIT_FUNC PyInit_stridx(void) {
  // Ready the type first so a failure here cannot leak a created module.
  if (PyType_Ready(&StrIdxType) < 0) {
    return NULL;
  }

  // The module pointer must be validated before it is handed to
  // PyModule_AddObject; previously the NULL check came after that call.
  PyObject *m = PyModule_Create(&moduledef);
  if (m == NULL) {
    return NULL;
  }

  // PyModule_AddObject steals a reference only on success, so take one up
  // front and give it back (together with the module) on failure.
  Py_INCREF(&StrIdxType);
  if (PyModule_AddObject(m, "StringIndex", (PyObject *)&StrIdxType) < 0) {
    Py_DECREF(&StrIdxType);
    Py_DECREF(m);
    return NULL;
  }

  return m;
}
|
182
|
+
} // END extern "C"
|
data/rubyext/extconf.rb
CHANGED
@@ -1,15 +1,13 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
#
|
3
2
|
|
4
3
|
require 'mkmf'
|
5
4
|
|
6
5
|
module_name = "stridx"
|
7
6
|
extension_name = 'stridx'
|
8
7
|
|
9
|
-
$CXXFLAGS << " -Wall -Wno-unused-variable -O3
|
8
|
+
$CXXFLAGS << " -Wall -Wno-unused-variable -O3"
|
10
9
|
|
11
10
|
have_library( 'stdc++');
|
12
|
-
have_library( 'gomp' );
|
13
11
|
|
14
12
|
dir_config(extension_name) # The destination
|
15
13
|
create_makefile(extension_name) # Create Makefile
|
data/rubyext/ruby_interf.cpp
CHANGED
@@ -7,7 +7,7 @@
|
|
7
7
|
|
8
8
|
extern "C" {
|
9
9
|
|
10
|
-
void str_idx_free(void *data) { delete (StringIndex *)data; }
|
10
|
+
void str_idx_free(void *data) { delete (StrIdx::StringIndex *)data; }
|
11
11
|
|
12
12
|
// Wrap StringIndex class inside a ruby variable
|
13
13
|
static const rb_data_type_t str_idx_type = {
|
@@ -26,7 +26,7 @@ static const rb_data_type_t str_idx_type = {
|
|
26
26
|
};
|
27
27
|
|
28
28
|
VALUE str_idx_alloc(VALUE self) {
|
29
|
-
void *data = new StringIndex();
|
29
|
+
void *data = new StrIdx::StringIndex();
|
30
30
|
return TypedData_Wrap_Struct(self, &str_idx_type, data);
|
31
31
|
}
|
32
32
|
|
@@ -36,18 +36,27 @@ VALUE StringIndexAddSegments(VALUE self, VALUE str, VALUE fileId) {
|
|
36
36
|
|
37
37
|
void *data;
|
38
38
|
TypedData_Get_Struct(self, int, &str_idx_type, data);
|
39
|
-
((StringIndex *)data)->addStrToIndex(s1, fid);
|
39
|
+
// ((StringIndex *)data)->addStrToIndex(s1, fid);
|
40
|
+
((StrIdx::StringIndex *)data)->addStrToIndexThreaded(s1, fid);
|
40
41
|
|
41
42
|
return self;
|
42
43
|
}
|
43
44
|
|
45
|
+
// Ruby method StringIndex#waitUntilDone: forwards to waitUntilDone() on the
// wrapped native index and returns the receiver.
VALUE StringIndexWaitUntilDone(VALUE self) {
  void *wrapped;
  TypedData_Get_Struct(self, int, &str_idx_type, wrapped);

  StrIdx::StringIndex *idx = static_cast<StrIdx::StringIndex *>(wrapped);
  idx->waitUntilDone();
  return self;
}
|
51
|
+
|
52
|
+
|
44
53
|
VALUE StringIndexFind(VALUE self, VALUE str) {
|
45
54
|
VALUE ret;
|
46
55
|
std::string s1 = StringValueCStr(str);
|
47
56
|
|
48
57
|
void *data;
|
49
58
|
TypedData_Get_Struct(self, int, &str_idx_type, data);
|
50
|
-
StringIndex *idx = (StringIndex *)data;
|
59
|
+
StrIdx::StringIndex *idx = (StrIdx::StringIndex *)data;
|
51
60
|
|
52
61
|
ret = rb_ary_new();
|
53
62
|
const std::vector<std::pair<float, int>> &results = idx->findSimilar(s1, 2);
|
@@ -80,7 +89,7 @@ VALUE StringIndexSetDirSeparator(VALUE self, VALUE str) {
|
|
80
89
|
|
81
90
|
void *data;
|
82
91
|
TypedData_Get_Struct(self, int, &str_idx_type, data);
|
83
|
-
StringIndex *idx = (StringIndex *)data;
|
92
|
+
StrIdx::StringIndex *idx = (StrIdx::StringIndex *)data;
|
84
93
|
idx->setDirSeparator(c);
|
85
94
|
|
86
95
|
return self;
|
@@ -93,8 +102,12 @@ void Init_stridx(void) {
|
|
93
102
|
|
94
103
|
rb_define_alloc_func(classStringIndex, str_idx_alloc);
|
95
104
|
rb_define_method(classStringIndex, "add", StringIndexAddSegments, 2);
|
105
|
+
rb_define_method(classStringIndex, "waitUntilDone", StringIndexWaitUntilDone, 0);
|
96
106
|
rb_define_method(classStringIndex, "find", StringIndexFind, 1);
|
107
|
+
|
97
108
|
rb_define_method(classStringIndex, "setDirSeparator", StringIndexSetDirSeparator, 1);
|
109
|
+
|
110
|
+
|
98
111
|
}
|
99
112
|
|
100
113
|
} // End extern "C"
|
data/runserver.rb
ADDED
data/server.rb
ADDED
@@ -0,0 +1,103 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "socket"
|
4
|
+
require "stridx"
|
5
|
+
|
6
|
+
module StrIdx
  # Indexes every regular file below a set of directories and answers
  # filename queries over a Unix domain socket at ~/.stridx/sock.
  #
  # Protocol (one request per connection):
  #   "find:<query>" -> newline-separated list of matching file paths
  #   "stop"         -> shuts the server down
  class Server
    # Returns the paths of all regular files found recursively below each
    # entry of +directories+.
    def recursively_find_files(directories)
      directories.flat_map do |d|
        Dir.glob("#{d}/**/*").select { |e| File.file?(e) }
      end
    end

    # Builds the index for +dir_list+ and serves queries until stopped.
    def self.start(dir_list, daemonize: false)
      Server.new(dir_list, daemonize: daemonize)
    end

    # Asks a running server to shut down by sending "stop" over its socket.
    def self.stop
      sock_dir = File.expand_path("~/.stridx")
      sockfn = "#{sock_dir}/sock"
      client = UNIXSocket.new(sockfn)
      begin
        client.puts "stop"
        client.recv(200 * 200)
      ensure
        client.close
      end
      nil
    end

    # Scans the directories in +dir_list+, indexes the files found, then
    # listens on the socket. Does not return until a "stop" request arrives.
    # With +daemonize+ the process detaches after indexing has finished.
    def initialize(dir_list, daemonize: false)
      idx = StrIdx::StringIndex.new
      idx.setDirSeparator("/")

      started_at = Time.new

      dirs = dir_list.select { |x| File.directory?(x) }
      puts "Scanning files in directories:#{dirs.join(",")}"
      flist = recursively_find_files(dirs)

      # A file's position in flist doubles as its id in the index, so a
      # search hit can be mapped back to its path.
      flist.each_with_index { |path, id| idx.add(path, id) }

      idx.waitUntilDone()
      idx_time = Time.new
      puts "\nIndexing time (#{flist.size} files): #{(idx_time - started_at).round(4)} seconds"

      sock_dir = File.expand_path("~/.stridx")
      Dir.mkdir(sock_dir) if !Dir.exist?(sock_dir)
      sockfn = "#{sock_dir}/sock"
      # Remove a socket file left behind by a previous run.
      File.unlink(sockfn) if File.exist?(sockfn)

      puts "Indexing done, starting server"
      if daemonize
        require "daemons"
        Daemons.daemonize
      end

      server_thread = Thread.new {
        serv = UNIXServer.new(sockfn)
        begin
          loop do
            # Accept a new client connection
            client = serv.accept
            begin
              # Read data from the client
              data = client.recv(1024)

              if data.match(/^stop$/)
                puts "Got stop signal. Shutting down server."
                break
              end

              if data.match(/^find:(.*)/)
                query = Regexp.last_match(1)
                res = idx.find(query)
                response = res.collect { |x| flist[x[0]] }.join("\n")

                # Send a response back to the client
                client.puts response
              end
            ensure
              # Always release the connection, even if handling raised.
              client.close
            end
          end
        ensure
          # Release the listening socket and its file on shutdown.
          serv.close
          File.unlink(sockfn) if File.exist?(sockfn)
        end
      }

      server_thread.join
    end
  end
end
|
data/setup.py
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
import numpy
|
3
|
+
|
4
|
+
import setuptools
|
5
|
+
from setuptools import setup, Extension
|
6
|
+
|
7
|
+
__version__ = "0.1"

# The long description shown for the package is the README, verbatim.
with open('README.md', 'r', encoding='utf-8') as readme:
    long_description = readme.read()

# Native extension built from the single C++ source file.
cargs = ['-fpermissive']
module1 = Extension(
    'stridx',
    sources=['py_interf.cpp'],
    include_dirs=['.'],
    extra_compile_args=cargs,
    language="c++",
)

ext_modules = [module1]

setup(
    name='stridx',
    version='1.0',
    setup_requires=['wheel'],
    python_requires='>=3',
    provides=['stridx'],
    description='Fast fuzzy string similarity search and indexing (for filenames) ',
    long_description=long_description,
    long_description_content_type='text/markdown',
    ext_modules=ext_modules,
)
|
32
|
+
|
Binary file
|