StrIdx 0.1.3 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 52d40e64a5ace0231828bdbbe6fd94475ab5986d0c1fb7e35e0ce18463a97ae0
4
- data.tar.gz: e1cdcc2ed9f377b2acb049a9fb6de22f24acdbd6e3552748b1307342c10b6cf7
3
+ metadata.gz: f7655b6bd71bca58c86ad607fd197933fc19b97b3ae1c76e322ec0432025dad7
4
+ data.tar.gz: 2421892aa6fe750213d08e2254019d87ce7abb10496cdf1635a61815b8b8b0d7
5
5
  SHA512:
6
- metadata.gz: f3c27923a568fe5916c17e91766066362a965abf9568b21a4daa269cd16a8a4778248ae935b26502aa482aad4807908d401989e7ebfb88d1fbdb011b0c240b60
7
- data.tar.gz: f94dda8d71931c18ae3dc6b58204edda7ffd649bc7452a74fdba4929d6092183e99bf99d7b3632be5bafccfd0be7a877f5513c8c3e72e814dcca08bd79a9b217
6
+ metadata.gz: 0a0ed3f51b95b72a553cf97e1a852f63e8b6d1cbbba56fdab55ed5037ef4d658b8649f2cf92d35b5eea4e6657a18a3a1460a3a57069cda0889a1987f5d1611ee
7
+ data.tar.gz: f0d4753ee43cb205fa86468dad92a66644e12cf889d8018283cb421e4a5d670386b8666af64b89d4d475a59f10701de2223c31d03ac5d366f2ee255c77190cf8
data/CMakeLists.txt ADDED
@@ -0,0 +1,27 @@
1
+ cmake_minimum_required(VERSION 3.14)
2
+
3
+ project(my_project)
4
+ # https://github.com/google/googletest/issues/4000
5
+ include(FetchContent)
6
+ FetchContent_Declare(
7
+ googletest
8
+ URL https://github.com/google/googletest/archive/58d77fa8070e8cec2dc1ed015d66b454c8d78850.zip # release-1.12.1
9
+ )
10
+
11
+ set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
12
+ FetchContent_MakeAvailable(googletest)
13
+
14
+ enable_testing()
15
+
16
+ add_executable(
17
+ stridx_test
18
+ unittest.cpp
19
+ )
20
+ target_link_libraries(
21
+ stridx_test
22
+ GTest::gtest_main
23
+ )
24
+
25
+ include(GoogleTest)
26
+ gtest_discover_tests(stridx_test)
27
+
data/Gemfile ADDED
@@ -0,0 +1,5 @@
1
+ source "https://rubygems.org"
2
+
3
+ gemspec
4
+
5
+
data/README.md CHANGED
@@ -37,9 +37,46 @@ and candidate is "./drivers/char/hw_random/nomadik-rng.c", then scores are calcu
37
37
  score = score1/(11*11)*0.97 + score1/(11*38)*0.03 = 0.342944
38
38
  ```
39
39
 
40
- # Ruby interface
40
+ # Interfaces
41
+
42
+ ## Command line
43
+ Install instructions (for Ubuntu Linux):
44
+ ```
45
+ apt update
46
+ apt install ruby ruby-dev build-essential
47
+ gem install StrIdx
48
+ ```
49
+
50
+ Start the indexing server (in the background):
51
+ ```
52
+ stridx.rb start -- ~/Documents/ ~/Pictures/
53
+ ```
54
+
55
+ Add bash keybindings (Ctrl-t):
56
+ ```
57
+ eval "$(stridx.rb bash)"
58
+ ```
59
+
60
+ Search by pressing <kbd>ctrl</kbd>+<kbd>t</kbd>. Move through the results with <kbd>up</kbd> and <kbd>down</kbd>, and select with <kbd>enter</kbd>.
61
+
62
+ ![screencast](https://github.com/SamiSieranoja/stridx/assets/46612258/b2fd4fa2-37ad-4423-bd5f-d54b24ff6df5)
63
+
64
+
65
+ Stop server:
66
+ ```
67
+ stridx.rb stop
68
+ ```
69
+
70
+ Start the indexing server (in the foreground, for debugging):
71
+ ```
72
+ stridx.rb run -- ~/Documents/ ~/Pictures/
73
+ ```
74
+
75
+
76
+ ## Ruby
41
77
  Install:
42
78
  ```
79
+ apt install ruby ruby-dev build-essential
43
80
  gem install StrIdx
44
81
  ```
45
82
 
@@ -114,7 +151,7 @@ Search time: 0.0488 seconds
114
151
  ```
115
152
 
116
153
 
117
- # C++ API
154
+ ## C++
118
155
  See demo.cpp
119
156
  ```cpp
120
157
  #include "stridx.hpp"
data/exe/stridx.rb ADDED
@@ -0,0 +1,16 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ $:.unshift File.dirname(__FILE__) + "/.."
4
+
5
+ if ARGV[0] == "tty"
6
+ require "stridx-tty.rb"
7
+ StrIdxTTY.run
8
+ elsif ARGV[0] == "bash"
9
+ puts %q/
10
+ bind -m emacs-standard '"\er": redraw-current-line';
11
+ bind -m emacs-standard '"\C-t": " \C-b\C-k \C-u`stridx.rb tty`\e\C-e\er\C-a\C-y\C-h\C-e\e \C-y\ey\C-x\C-x\C-f"'
12
+ /
13
+ else
14
+ require "daemons"
15
+ Daemons.run(File.dirname(__FILE__) + "/../runserver.rb")
16
+ end
data/gem_install ADDED
@@ -0,0 +1,4 @@
1
+ gem uninstall --force -x StrIdx
2
+ gem build stridx.gemspec
3
+ gem install $(ls -1tr StrIdx*gem | tail -n 1)
4
+
data/py_example.py ADDED
@@ -0,0 +1,18 @@
1
+ #!/usr/bin/env python
2
+
3
+ from stridx import StringIndex
4
+ e=StringIndex()
5
+ e.set_value(3)
6
+ e.add("./rust/alloc/vec/spec_extend.rs",0)
7
+ e.add("./virt/kvm/dirty_ring.c",1)
8
+ e.add("./Documentation/staging/static-keys.rst",2)
9
+ e.add("./Documentation/staging/lzo.rst",3)
10
+
11
+
12
+
13
+ results = e.find("rstalloc")
14
+ for x in results:
15
+ print(x)
16
+
17
+ # print(e.get_value())
18
+
data/py_interf.cpp ADDED
@@ -0,0 +1,182 @@
1
+
2
+ #include <stdio.h>
3
+ #include <limits.h>
4
+ #include <cstring>
5
+ #include <pthread.h>
6
+
7
+ #include <stdio.h>
8
+ #include <iostream>
9
+ #include <string>
10
+ #include <vector>
11
+ #include <cfloat>
12
+ #include <cmath>
13
+ #include <bits/stdc++.h>
14
+
15
+ using std::ios;
16
+ using std::sort;
17
+ using std::string;
18
+ using std::vector;
19
+
20
+ #include <Python.h>
21
+ #include <cstring>
22
+
23
+ #include "stridx.hpp"
24
+
25
+ extern "C" {
26
+
27
+ // Define a structure for the custom object
28
+ typedef struct {
29
+ PyObject_HEAD int value;
30
+ StrIdx::StringIndex *idx;
31
+ } StrIdxObject;
32
+
33
+ // Method to allocate memory for the object
34
+ static PyObject *StrIdxObject_new(PyTypeObject *type, PyObject *args, PyObject *kwds) {
35
+ StrIdxObject *self;
36
+
37
+ self = (StrIdxObject *)type->tp_alloc(type, 0);
38
+ if (self != NULL) {
39
+ self->value = 0;
40
+ self->idx = new StrIdx::StringIndex();
41
+ }
42
+
43
+ return (PyObject *)self;
44
+ }
45
+
46
+ // Method to deallocate memory for the object
47
+ static void StrIdxObject_dealloc(StrIdxObject *self) { Py_TYPE(self)->tp_free((PyObject *)self); }
48
+
49
+ // Method to set the value of the object
50
+ static PyObject *StrIdxObject_set_value(StrIdxObject *self, PyObject *args) {
51
+ int value;
52
+
53
+ if (!PyArg_ParseTuple(args, "i", &value)) {
54
+ return NULL;
55
+ }
56
+
57
+ self->value = value;
58
+
59
+ Py_INCREF(Py_None);
60
+ return Py_None;
61
+ }
62
+
63
+ static PyObject *StrIdxObject_add(StrIdxObject *self, PyObject *args) {
64
+ char *value;
65
+ int file_id;
66
+ std::string str;
67
+ if (!PyArg_ParseTuple(args, "si", &value, &file_id)) {
68
+ return NULL;
69
+ }
70
+ str = value;
71
+
72
+ printf("char[]*: %s %i\n", value, file_id);
73
+ self->idx->addStrToIndex(str, file_id);
74
+ // self->idx->addStrToIndexThreaded(str, file_id);
75
+ Py_INCREF(Py_None);
76
+ return Py_None;
77
+ }
78
+
79
+ static PyObject *StrIdxObject_find(StrIdxObject *self, PyObject *args) {
80
+ char *value;
81
+ std::string str;
82
+ if (!PyArg_ParseTuple(args, "s", &value)) {
83
+ return NULL;
84
+ }
85
+ str = value;
86
+
87
+ printf("char*: %s\n", value);
88
+ const std::vector<std::pair<float, int>> &results = self->idx->findSimilar(str, 2);
89
+
90
+ int limit = 15;
91
+ int i = 0;
92
+
93
+ printf("res=%d\n", results.size());
94
+ if (results.size() < limit) {
95
+ limit = results.size();
96
+ }
97
+ PyObject *pyarr = PyList_New(limit);
98
+
99
+ for (const auto &[score,fileId] : results) {
100
+ PyObject *arr2 = PyList_New(2);
101
+ // PyList_SetItem(arr2, 0, Py_BuildValue("i", res.second));
102
+ // PyList_SetItem(arr2, 1, Py_BuildValue("d", res.first));
103
+ PyList_SetItem(arr2, 0, Py_BuildValue("i", fileId));
104
+ PyList_SetItem(arr2, 1, Py_BuildValue("d", score));
105
+ PyList_SetItem(pyarr, i, arr2);
106
+ i++;
107
+ if (i >= limit) {
108
+ break;
109
+ }
110
+ }
111
+
112
+ // Py_INCREF(Py_None);
113
+ return pyarr;
114
+ }
115
+
116
+ // Method to get the value of the object
117
+ static PyObject *StrIdxObject_get_value(StrIdxObject *self) {
118
+ return PyLong_FromLong(self->value);
119
+ }
120
+
121
+ // Define methods of the class
122
+ static PyMethodDef StrIdxObject_methods[] = {
123
+ {"set_value", (PyCFunction)StrIdxObject_set_value, METH_VARARGS,
124
+ "Set the value of the object"},
125
+ {"add", (PyCFunction)StrIdxObject_add, METH_VARARGS, "Set the value of the object"},
126
+ {"find", (PyCFunction)StrIdxObject_find, METH_VARARGS, "Find similar strings"},
127
+ {"get_value", (PyCFunction)StrIdxObject_get_value, METH_NOARGS, "Get the value of the object"},
128
+ {NULL} /* Sentinel */
129
+ };
130
+
131
+ // Define the type object for the class
132
+ static PyTypeObject StrIdxType = {
133
+ PyVarObject_HEAD_INIT(NULL, 0).tp_name = "stridx.StrIdx",
134
+ .tp_basicsize = sizeof(StrIdxObject),
135
+ .tp_dealloc = (destructor)StrIdxObject_dealloc,
136
+ .tp_doc = PyDoc_STR("Fuzzy string index"),
137
+ .tp_methods = StrIdxObject_methods,
138
+ .tp_new = StrIdxObject_new,
139
+ // .tp_repr = (reprfunc)myobj_repr,
140
+ };
141
+
142
+ // PyVarObject_HEAD_INIT(NULL, 0)
143
+ // .tp_name = "stridx.StrIdx",
144
+ // .tp_doc = "StrIdx class",
145
+ // .tp_basicsize = sizeof(StrIdxObject),
146
+ // .tp_itemsize = 0,
147
+ // .tp_flags = Py_TPFLAGS_DEFAULT,
148
+ // .tp_new = StrIdxObject_new,
149
+ // .tp_dealloc = (destructor)StrIdxObject_dealloc,
150
+ // .tp_methods = StrIdxObject_methods,
151
+ // };
152
+
153
+
154
+ // Define python accessible methods
155
+ static PyMethodDef StrIdxMethods[] = {
156
+ {NULL, NULL, 0, NULL}};
157
+
158
+ static struct PyModuleDef moduledef = {
159
+ PyModuleDef_HEAD_INIT, "stridx", NULL, -1, StrIdxMethods, NULL, NULL, NULL, NULL};
160
+
161
+ PyMODINIT_FUNC PyInit_stridx(void) {
162
+ PyObject *m;
163
+ m = PyModule_Create(&moduledef);
164
+
165
+ // Initialize the type object
166
+ if (PyType_Ready(&StrIdxType) < 0) {
167
+ return NULL;
168
+ }
169
+
170
+ Py_INCREF(&StrIdxType);
171
+ if (PyModule_AddObject(m, "StringIndex", (PyObject *)&StrIdxType) < 0) {
172
+ Py_DECREF(&StrIdxType);
173
+ Py_DECREF(m);
174
+ return NULL;
175
+ }
176
+
177
+ if (!m) {
178
+ return NULL;
179
+ }
180
+ return m;
181
+ }
182
+ } // END extern "C"
data/runserver.rb ADDED
@@ -0,0 +1,7 @@
1
+ #!/usr/bin/env ruby
2
+ $:.unshift File.dirname(__FILE__)
3
+
4
+ require "server.rb"
5
+ # StrIdx::Server.start ARGV, daemonize: true
6
+ StrIdx::Server.start ARGV
7
+
data/server.rb ADDED
@@ -0,0 +1,103 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "socket"
4
+ require "stridx"
5
+
6
+ module StrIdx
7
+ class Server
8
+ def recursively_find_files(directories)
9
+ filelist = []
10
+
11
+ for d in directories
12
+ filelist = filelist + Dir.glob("#{d}/**/*").select { |e|
13
+ File.file?(e)
14
+ # File.file?(e) or File.directory?(e)
15
+ }
16
+ end
17
+ return filelist
18
+ end
19
+
20
+ def self.start(dir_list, daemonize: false)
21
+ Server.new(dir_list, daemonize: daemonize)
22
+ end
23
+
24
+ def self.stop
25
+ sock_dir = File.expand_path("~/.stridx")
26
+ sockfn = "#{sock_dir}/sock"
27
+ client = UNIXSocket.new(sockfn)
28
+ client.puts "stop"
29
+ response = client.recv(200 * 200)
30
+ client.close
31
+ end
32
+
33
+ def initialize(dir_list, daemonize: false)
34
+ idx = StrIdx::StringIndex.new
35
+ idx.setDirSeparator("/")
36
+
37
+ t = Time.new
38
+
39
+ dirs = dir_list.select { |x| File.directory?(x) }
40
+ puts "Scanning files in directories:#{dirs.join(",")}"
41
+ flist = recursively_find_files(dirs)
42
+
43
+ i = 0
44
+ for x in flist
45
+ idx.add(x, i)
46
+ i += 1
47
+ end
48
+
49
+ idx.waitUntilDone()
50
+ idx_time = Time.new
51
+ puts "\nIndexing time (#{flist.size} files): #{(idx_time - t).round(4)} seconds"
52
+
53
+ sock_dir = File.expand_path("~/.stridx")
54
+ Dir.mkdir(sock_dir) if !Dir.exist?(sock_dir)
55
+ sockfn = "#{sock_dir}/sock"
56
+ File.unlink(sockfn) if File.exist?(sockfn)
57
+
58
+ puts "Indexing done, starting server"
59
+ if (daemonize)
60
+ require "daemons"
61
+ Daemons.daemonize
62
+ # exit if fork() # Daemonize
63
+ end
64
+
65
+ # exit if fork() # Daemonize
66
+ # $PROGRAM_NAME = "stridx-daemon"
67
+
68
+ t = Thread.new {
69
+ serv = UNIXServer.new(sockfn)
70
+
71
+ loop do
72
+ # Accept a new client connection
73
+ client = serv.accept
74
+
75
+ # puts "Client connected!"
76
+
77
+ # Read data from the client
78
+ data = client.recv(1024)
79
+
80
+ if data.match(/^stop$/)
81
+ puts "Got stop signal. Shutting down server."
82
+ client.close
83
+ break
84
+ end
85
+
86
+ # puts "Received from client: #{data}"
87
+ if data.match(/^find:(.*)/)
88
+ query = Regexp.last_match(1)
89
+ res = idx.find(query)
90
+ response = res.collect { |x| flist[x[0]] }.join("\n")
91
+
92
+ # Send a response back to the client
93
+ client.puts response
94
+ end
95
+ # Close the client connection
96
+ client.close
97
+ end
98
+ }
99
+
100
+ t.join
101
+ end
102
+ end
103
+ end
data/setup.py ADDED
@@ -0,0 +1,32 @@
1
+ #!/usr/bin/env python
2
+ import numpy
3
+
4
+ import setuptools
5
+ from setuptools import setup, Extension
6
+
7
+ __version__ = "0.1"
8
+
9
+ cargs = ['-fpermissive']
10
+
11
+
12
+ with open('README.md', 'r', encoding='utf-8') as f:
13
+ long_description = f.read()
14
+
15
+ module1 = Extension('stridx', sources=['py_interf.cpp'], include_dirs=['.'], extra_compile_args=cargs,
16
+ language="c++",
17
+ )
18
+
19
+ ext_modules = [module1]
20
+
21
+ setup(
22
+ name='stridx',
23
+ version='1.0',
24
+ setup_requires=['wheel'],
25
+ python_requires='>=3',
26
+ provides=['stridx'],
27
+ description='Fast fuzzy string similarity search and indexing (for filenames) ',
28
+ long_description=long_description,
29
+ long_description_content_type='text/markdown',
30
+ ext_modules=[module1]
31
+ )
32
+
Binary file
data/stridx-tty.rb ADDED
@@ -0,0 +1,122 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "tty-prompt"
4
+ require "tty-cursor"
5
+ require "tty-reader"
6
+ require "pastel"
7
+
8
+ require "socket"
9
+
10
+ class StrIdxTTY
11
+ def self.run
12
+ stty = StrIdxTTY.new
13
+ selected = stty.search
14
+ STDOUT.write selected
15
+ end
16
+
17
+ def initialize()
18
+ @lines = []
19
+ @selected = ""
20
+ @idx = 0
21
+
22
+ @reader = TTY::Reader.new(output: STDERR)
23
+ @pastel = Pastel.new()
24
+ @cursor = TTY::Cursor
25
+
26
+ sock_dir = File.expand_path("~/.stridx")
27
+ sockfn = "#{sock_dir}/sock"
28
+
29
+ error = true
30
+ while error
31
+ begin
32
+ # Create a new UNIXSocket
33
+ client = UNIXSocket.new(sockfn)
34
+ rescue Errno::ECONNREFUSED => e
35
+ out "Waiting for server to start\n"
36
+ sleep 2
37
+ error = true
38
+ else
39
+ error = false
40
+ client.close
41
+ #... executes when no error
42
+ end
43
+ end
44
+ end
45
+
46
+ def out(x)
47
+ STDERR.write x
48
+ end
49
+
50
+ def search
51
+ out "\n" * 20
52
+ out @cursor.clear_screen
53
+ out "\n" * 20
54
+ @cursor.move_to(0, 0)
55
+ @reader.on(:keypress) { |event|
56
+ handle_event(event)
57
+ }
58
+ @reader.read_line(">> ")
59
+
60
+ out @cursor.clear_screen
61
+ return @selected.strip
62
+ end
63
+
64
+ def get_res_from_server(query)
65
+ # Define the socket file path
66
+ sock_dir = File.expand_path("~/.stridx")
67
+ sockfn = "#{sock_dir}/sock"
68
+
69
+ # Create a new UNIXSocket
70
+ client = UNIXSocket.new(sockfn)
71
+
72
+ # Send data to the server
73
+ client.puts "find:#{query}"
74
+
75
+ # Read response from the server
76
+ response = client.recv(200 * 200)
77
+
78
+ # Close the client connection
79
+ client.close
80
+ return response.lines
81
+ end
82
+
83
+ def draw_list()
84
+ @selected = @list[@idx]
85
+ i = 0
86
+ for x in @list
87
+ out @cursor.up(1)
88
+ out @cursor.clear_line
89
+ if i == @idx
90
+ out @pastel.lookup(:bold)
91
+ end
92
+ out x.strip
93
+ out @pastel.lookup(:reset)
94
+ i += 1
95
+ end
96
+ end
97
+
98
+ def update_search(event)
99
+ query = event.line[3..-1]
100
+ if query.size > 2
101
+ @list = get_res_from_server(query)
102
+ draw_list
103
+ end
104
+ end
105
+
106
+ def handle_event(event)
107
+ out @cursor.save
108
+ if event.key.name == :alpha
109
+ update_search(event)
110
+ elsif event.key.name == :up
111
+ @idx += 1 if @idx < @list.size - 1
112
+ draw_list
113
+ elsif event.key.name == :down
114
+ @idx -= 1 if @idx > 0
115
+ draw_list
116
+ elsif event.key.name == :backspace
117
+ update_search(event)
118
+ end
119
+
120
+ out @cursor.restore
121
+ end
122
+ end
data/stridx.gemspec ADDED
@@ -0,0 +1,37 @@
1
+ Gem::Specification.new do |spec|
2
+ spec.name = "StrIdx"
3
+ spec.version = "0.1.4"
4
+ spec.authors = ["Sami Sieranoja"]
5
+ spec.email = ["sami.sieranoja@gmail.com"]
6
+
7
+ spec.summary = %q{StrIdx}
8
+ spec.description = %q{ Fast fuzzy string similarity search and indexing (for filenames)}
9
+ spec.homepage = "https://github.com/SamiSieranoja/stridx"
10
+ spec.metadata["source_code_uri"] = spec.homepage
11
+ spec.metadata["homepage_uri"] = spec.homepage
12
+
13
+ spec.files = `git ls-files -z`.split("\x0").reject do |f|
14
+ f.match(%r{^(refcode|spec|features)/})
15
+ end
16
+ # spec.files << "thread_pool.hpp"
17
+ # spec.files << "exe/stridx.rb"
18
+ # spec.files << "server.rb"
19
+ # spec.files << "stridx-tty.rb"
20
+
21
+ spec.bindir = "exe"
22
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
23
+ spec.require_paths = ["lib", "ext"]
24
+
25
+ spec.add_development_dependency "bundler", "~> 2.4.21"
26
+ spec.add_development_dependency "rake", "~> 13.1.0"
27
+
28
+ spec.add_runtime_dependency "tty-cursor", "~> 0.7.1"
29
+ spec.add_runtime_dependency "tty-prompt", "~> 0.23.1"
30
+ spec.add_runtime_dependency "tty-reader", "~> 0.9.0"
31
+ spec.add_runtime_dependency "tty-screen", "~> 0.8.2"
32
+ spec.add_runtime_dependency "pastel", "~> 0.8.0"
33
+ spec.add_runtime_dependency "daemons", "~> 1.4.1"
34
+
35
+ spec.extensions = ["rubyext/extconf.rb"]
36
+ spec.licenses = ["LGPL-2.0+"]
37
+ end
data/stridx.hpp CHANGED
@@ -33,16 +33,16 @@ public:
33
33
  Output(int verb) : verboseLevel(verb) {}
34
34
  Output() : Output(3) {}
35
35
  ~Output() = default;
36
- void print() {}
36
+ static void print() {}
37
37
 
38
38
  // When calling as print("xxx ",3, " yyy") outputs "xxx 3 yyy"
39
- template <typename T, typename... Types> void print(T var1, Types... var2) {
39
+ template <typename T, typename... Types> static void print(T var1, Types... var2) {
40
40
  std::cout << var1;
41
41
  print(var2...);
42
42
  }
43
43
 
44
44
  // When calling as printl("xxx ",3, " yyy") outputs "xxx 3 yyy\n"
45
- template <typename... Types> void printl(Types... var2) {
45
+ template <typename... Types> static void printl(Types... var2) {
46
46
  print(var2...);
47
47
  print("\n");
48
48
  }
@@ -79,7 +79,7 @@ std::vector<std::string> splitString(const std::string &input, const char &separ
79
79
  }
80
80
 
81
81
  // Convert int64_t to binary string
82
- [[nodiscard]] std::string int64ToBinaryString(int64_t num) {
82
+ [[nodiscard]] std::string int64ToBinaryString(const int64_t &num) {
83
83
  std::string result;
84
84
  for (int i = 63; i >= 0; --i) {
85
85
  result += ((num >> i) & 1) ? '1' : '0';
@@ -88,7 +88,7 @@ std::vector<std::string> splitString(const std::string &input, const char &separ
88
88
  }
89
89
 
90
90
  // Debug. Convert a (8 char) string represented as int64_t to std::string
91
- [[nodiscard]] std::string int64ToStr(int64_t key) {
91
+ [[nodiscard]] std::string int64ToStr(const int64_t &key) {
92
92
  int nchars = 8;
93
93
  std::string str;
94
94
  int multip = nchars * 8;
@@ -108,7 +108,7 @@ void printVector(const std::vector<int> &vec) {
108
108
  }
109
109
 
110
110
  // Debug
111
- [[nodiscard]] std::string charToBinaryString(char chr) {
111
+ [[nodiscard]] std::string charToBinaryString(const char &chr) {
112
112
  std::string result;
113
113
  for (int i = 7; i >= 0; --i) {
114
114
  result += ((chr >> i) & 1) ? '1' : '0';
@@ -122,8 +122,7 @@ enum class segmentType { Dir, File };
122
122
  // A segment of a file path
123
123
  // e.g. if path is /foo/bar/baz.txt
124
124
  // segments are [{root}, foo, bar, baz.txt]
125
- class PathSegment {
126
- public:
125
+ struct PathSegment {
127
126
  std::string str;
128
127
  int fileId; // (if FILE)
129
128
  Candidate *cand;
@@ -135,7 +134,7 @@ public:
135
134
  PathSegment(std::string _str) : str(_str), parent(nullptr) {}
136
135
  PathSegment(std::string _str, int _fileId)
137
136
  : str(_str), fileId(_fileId), cand(nullptr), parent(nullptr) {}
138
- [[nodiscard]] int size() {
137
+ [[nodiscard]] int size() const {
139
138
  int sz = str.size();
140
139
  PathSegment *cur = parent;
141
140
  // Sum up length of parent segments (+1 for divisors)
@@ -148,8 +147,7 @@ public:
148
147
  };
149
148
 
150
149
  // Candidate for result in string (filename) search
151
- class Candidate {
152
- public:
150
+ struct Candidate {
153
151
  std::vector<float> v_charscore;
154
152
  PathSegment *seg;
155
153
  int fileId;
@@ -162,25 +160,17 @@ public:
162
160
  int candLen; // Length of candidate
163
161
 
164
162
  Candidate(){};
165
- Candidate(int _fileId, std::string _str, int _len) : fileId(_fileId), str(_str), len(_len) {
166
- // Initialize v_charscores with zeros
167
- v_charscore.resize(len, 0);
168
- candLen = str.size();
169
- seg = nullptr;
170
- }
171
-
172
163
  Candidate(PathSegment *_seg, int _len) : seg(_seg), len(_len) {
173
164
  // Initialize v_charscores with zeros
174
165
  v_charscore.resize(len, 0);
175
166
  candLen = seg->size();
176
167
  }
177
168
 
178
- [[nodiscard]] float getScore() {
169
+ [[nodiscard]] float getScore() const {
179
170
  int i = 0;
180
171
  float score = 0.0;
181
- candLen = seg->size();
182
172
 
183
- for (float &charscore : v_charscore) {
173
+ for (const float &charscore : v_charscore) {
184
174
  score += charscore;
185
175
  i++;
186
176
  }
@@ -193,7 +183,7 @@ public:
193
183
  return score;
194
184
  }
195
185
 
196
- [[nodiscard]] float operator[](int idx) { return v_charscore[idx]; }
186
+ [[nodiscard]] float operator[](int idx) const { return v_charscore[idx]; }
197
187
  };
198
188
 
199
189
  // This seems to give 10x speed improvement over std::unordered_map
@@ -210,19 +200,17 @@ private:
210
200
  int numStrings = 0;
211
201
 
212
202
  std::vector<SegMap *> dirmaps;
203
+ std::array<std::mutex, 9> mts_d; // for dirmaps
213
204
  std::vector<SegMap *> filemaps;
205
+ std::array<std::mutex, 9> mts_f; // for filemaps
214
206
 
215
207
  std::vector<PathSegment *> segsToClean;
216
208
 
217
- std::unordered_map<int, std::string> strlist;
218
209
  std::unordered_map<int, PathSegment *> seglist;
219
210
  PathSegment *root;
220
211
  int dirId = 0;
221
212
  float dirWeight = 0.7; // Give only 70% of score if match is for a directory
222
213
 
223
- std::array<std::mutex, 9> mts_f;
224
- std::array<std::mutex, 9> mts_d;
225
-
226
214
  std::unique_ptr<ThreadPool> pool;
227
215
  Output out{1}; // verbose level = 1
228
216
 
@@ -279,9 +267,11 @@ public:
279
267
  void addStrToIndexThreaded(std::string filePath, int fileId) {
280
268
  pool->enqueue([=] { addStrToIndex(filePath, fileId, dirSeparator); });
281
269
  }
282
- void waitUntilReady() { pool->waitUntilDone(); }
270
+ void waitUntilReady() const { pool->waitUntilDone(); }
283
271
 
284
- void waitUntilDone() { pool->waitUntilDone(); }
272
+ void waitUntilDone() const { pool->waitUntilDone(); }
273
+
274
+ int size() const { return seglist.size(); }
285
275
 
286
276
  /**
287
277
  * Add a string to the index to be searched for afterwards
@@ -291,8 +281,14 @@ public:
291
281
  * @param separator Can be used to split filePath to components (e.g. 'home','user'...). Usually
292
282
  * one of {'\\', '/', '\0' (no separation)}.
293
283
  */
284
+
294
285
  void addStrToIndex(std::string filePath, int fileId, const char &separator) {
295
- out.printv(3, "Add file:", filePath, ",", fileId, ",", separator);
286
+ out.printv(3, "Add file:", filePath, ",", fileId, ",", separator, ",",dirSeparator);
287
+
288
+ // If a string with this index has been added already
289
+ if (seglist.find(fileId) != seglist.end()) {
290
+ return;
291
+ }
296
292
 
297
293
  std::vector<std::string> segs;
298
294
  numStrings += 1;
@@ -345,6 +341,17 @@ public:
345
341
  }
346
342
  }
347
343
 
344
+ std::string getString(int id) {
345
+ std::string s = "";
346
+ PathSegment *seg = seglist[id];
347
+ s += seg->str;
348
+ while (seg->parent->parent != nullptr) {
349
+ seg = seg->parent;
350
+ s = seg->str + dirSeparator + s;
351
+ }
352
+ return s;
353
+ }
354
+
348
355
  /**
349
356
  The search will find filepaths similar to the input string
350
357
 
@@ -423,7 +430,7 @@ public:
423
430
  }
424
431
 
425
432
  // Return int64_t representation of the first nchars in str, starting from index i
426
- [[nodiscard]] int64_t getKeyAtIdx(std::string str, int i, int nchars) {
433
+ [[nodiscard]] int64_t getKeyAtIdx(const std::string &str, int i, int nchars) const {
427
434
  int64_t key = 0;
428
435
  for (int i_char = 0; i_char < nchars; i_char++) {
429
436
  key = key | static_cast<int64_t>(str[i + i_char]);
@@ -519,7 +526,7 @@ private:
519
526
  // Find pathsegments from <map> that include the substring of <str> which starts at index <i> and
520
527
  // is of length <nchars>.
521
528
  [[nodiscard]] std::vector<PathSegment *> findSimilarForNgram(std::string str, int i, int nchars,
522
- SegMap &map) {
529
+ SegMap &map) const {
523
530
 
524
531
  assert(i + nchars <= static_cast<int>(str.size()));
525
532
  std::vector<PathSegment *> res;
data/test.rb CHANGED
@@ -1,8 +1,13 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
+ $:.unshift File.dirname(__FILE__)
4
+
3
5
  require "stridx"
4
6
  idx = StrIdx::StringIndex.new
5
7
 
8
+ # "/" for unix-style file paths
9
+ idx.setDirSeparator("/") #(comment out if not file paths)
10
+
6
11
  t = Time.new
7
12
  fn = File.expand_path("flist.txt")
8
13
  lines = IO.read(fn).lines.collect { |x| x.strip }
@@ -14,12 +19,12 @@ end
14
19
 
15
20
  idx_time = Time.new
16
21
  # Time to start the threadpool to process indexing
17
- puts "\nIndexing launch time (#{lines.size} files}): #{(idx_time - t).round(4)} seconds"
22
+ puts "\nIndexing launch time (#{lines.size} files): #{(idx_time - t).round(4)} seconds"
18
23
 
19
24
  idx.waitUntilDone() # Not necessary, will be called by idx.find
20
25
  idx_time = Time.new
21
26
  # Time when all threads have completed
22
- puts "\nIndexing completed time (#{lines.size} files}): #{(idx_time - t).round(4)} seconds"
27
+ puts "\nIndexing completed time (#{lines.size} files): #{(idx_time - t).round(4)} seconds"
23
28
 
24
29
  query = "rngnomadriv"
25
30
  res = idx.find(query)
data/unit_tests.sh ADDED
@@ -0,0 +1,4 @@
1
+ #!/bin/bash
2
+ cmake -S . -B build
3
+ cmake --build build
4
+ cd build && ctest
data/unittest.cpp ADDED
@@ -0,0 +1,147 @@
1
+
2
+ #include <gtest/gtest.h>
3
+ #include "stridx.hpp"
4
+ #include <cmath>
5
+ #include <memory>
6
+
7
+ TEST(SplitString, MatchSize) {
8
+ std::vector<std::string> svec = StrIdx::splitString("foo/bar/test1.txt", '/');
9
+ EXPECT_EQ(svec.size(), 3);
10
+ if (svec.size() == 3) {
11
+ EXPECT_EQ(svec[0].size(), 3);
12
+ EXPECT_EQ(svec[2].size(), 9);
13
+ }
14
+ }
15
+
16
+ std::vector<std::string> flist{"./drivers/char/hw_random/nomadik-rng.c",
17
+ "./drivers/pinctrl/nomadik",
18
+ "./drivers/clk/clk-nomadik.c",
19
+ "./drivers/gpio/gpio-nomadik.c",
20
+ "./drivers/i2c/busses/i2c-nomadik.c",
21
+ "./drivers/clocksource/nomadik-mtu.c",
22
+ "./drivers/gpu/drm/pl111/pl111_nomadik.h",
23
+ "./drivers/gpu/drm/pl111/pl111_nomadik.c",
24
+ "./drivers/pinctrl/nomadik/pinctrl-nomadik.c",
25
+ "./drivers/input/keyboard/nomadik-ske-keypad.c",
26
+ "./drivers/pinctrl/nomadik/pinctrl-nomadik-db8500.c",
27
+ "./drivers/pinctrl/nomadik/pinctrl-nomadik-stn8815.c",
28
+ "./drivers/char/hw_random/omap-rng.c",
29
+ "./drivers/char/hw_random/omap3-rom-rng.c",
30
+ "./include/dt-bindings/pinctrl/nomadik.h",
31
+ "./Documentation/devicetree/bindings/arm/ste-nomadik.txt"};
32
+
33
+ std::vector<float> target_scores{0.342944, 0.271396, 0.271126, 0.270893, 0.270431, 0.270355,
34
+ 0.270088, 0.270088, 0.26987, 0.269776, 0.269574, 0.269538,
35
+ 0.236358, 0.236074, 0.224804, 0.224238};
36
+
37
+ void scoreTest(bool threaded) {
38
+
39
+ StrIdx::StringIndex idx('/'); // Separate directories using unix style "/" char
40
+ std::string query = "rngnomadriv";
41
+
42
+ int i = 1;
43
+ for (const auto &str : flist) {
44
+ if (threaded) {
45
+ idx.addStrToIndexThreaded(str, i);
46
+ } else {
47
+ idx.addStrToIndex(str, i);
48
+ }
49
+ i++;
50
+ }
51
+ const std::vector<std::pair<float, int>> &results = idx.findSimilar(query);
52
+
53
+ std::cout << results[0].first;
54
+ EXPECT_EQ(results[0].second, 1);
55
+ if (results.size() == 16) {
56
+ int i = 0;
57
+ for (const auto &res : results) {
58
+ // Check if first five digits of the scores match
59
+ EXPECT_EQ(std::floor(res.first * 1e5), std::floor(1e5 * target_scores[i]));
60
+ i++;
61
+ }
62
+ }
63
+ }
64
+
65
+ TEST(IndexSearch, MatchingScoresSingleThread) { scoreTest(false); }
66
+ TEST(IndexSearch, MatchingScoresThreaded) { scoreTest(true); }
67
+
68
+ class IndexTest : public testing::Test {
69
+ protected:
70
+ std::unique_ptr<StrIdx::StringIndex> idx = std::make_unique<StrIdx::StringIndex>('/');
71
+
72
+ IndexTest() {}
73
+
74
+ void SetUp() override {
75
+ // Code here will be called immediately after the constructor (right
76
+ // before each test).
77
+ idx = std::make_unique<StrIdx::StringIndex>('/');
78
+ }
79
+
80
+ void TearDown() override {
81
+ // Code here will be called immediately after each test (right
82
+ // before the destructor).
83
+ }
84
+ };
85
+
86
+ TEST_F(IndexTest, BinaryRepresentation1) {
87
+ int64_t num = idx->getKeyAtIdx("abcdefgh", 0, 8);
88
+ std::string s = StrIdx::int64ToBinaryString(num);
89
+ // a b c d ...
90
+ EXPECT_TRUE(s == "0110000101100010011000110110010001100101011001100110011101101000");
91
+ }
92
+
93
+ TEST_F(IndexTest, BinaryRepresentation2) {
94
+ int64_t num = idx->getKeyAtIdx("abcdefgh", 0, 1);
95
+ std::string s = StrIdx::int64ToBinaryString(num);
96
+ EXPECT_TRUE(
97
+ s == "0000000000000000000000000000000000000000000000000000000001100001"); // 01100001 == "a"
98
+ }
99
+ TEST_F(IndexTest, BinaryRepresentation3) {
100
+ int64_t num = idx->getKeyAtIdx("abcdefgh", 7, 1);
101
+ std::string s = StrIdx::int64ToBinaryString(num);
102
+ EXPECT_TRUE(
103
+ s == "0000000000000000000000000000000000000000000000000000000001101000"); // 01101000 == "h"
104
+ }
105
+
106
+ TEST_F(IndexTest, AccessString) {
107
+ idx->addStrToIndex("./drivers/i2c/busses/i2c-nomadik.c", 0);
108
+ idx->addStrToIndex("./drivers/i2c/busses/i2c-nomadiksdf.c", 2);
109
+ idx->addStrToIndex("./drivers//i2c///busses////aa-i2c-nomadiksdf.c", 3);
110
+ idx->addStrToIndex("/test/foo/bar.txt", 4);
111
+ idx->addStrToIndex("bar.txt", 5);
112
+
113
+ EXPECT_EQ(idx->size(), 5);
114
+ EXPECT_STREQ(idx->getString(0).c_str(), "./drivers/i2c/busses/i2c-nomadik.c");
115
+
116
+ // TODO: does not work yet
117
+ // EXPECT_STREQ(idx->getString(3).c_str(), "./drivers//i2c///busses////aa-i2c-nomadiksdf.c");
118
+
119
+ // TODO: does not work yet
120
+ // EXPECT_STREQ(idx->getString(4).c_str(), "/test/foo/bar.txt");
121
+ EXPECT_STREQ(idx->getString(5).c_str(), "bar.txt");
122
+ }
123
+
124
+ TEST_F(IndexTest, Size1) {
125
+ // Should not add different files with same id
126
+ idx->addStrToIndex("./drivers/i2c/busses/i2c-nomadik.c", 0);
127
+ idx->addStrToIndex("./drivers/i2c/busses/i2c-nomadiksdf.c", 0);
128
+
129
+ // Should not be added because essentially same as 0:
130
+ idx->addStrToIndex("./drivers//i2c///busses////i2c-nomadik.c", 44);
131
+
132
+ // Should not add same file with different id
133
+ idx->addStrToIndex("./drivers/i2c/busses/i2c-nomadik.c", 1);
134
+ idx->addStrToIndex("./drivers/i2c/busses/i2c-nomadik.c", 2);
135
+
136
+ EXPECT_EQ(idx->size(), 1);
137
+
138
+ // Test no-overwriting
139
+ EXPECT_STREQ(idx->getString(0).c_str(), "./drivers/i2c/busses/i2c-nomadik.c");
140
+ }
141
+
142
+ TEST_F(IndexTest, Size2) {
143
+ idx->addStrToIndex("./drivers/i2c/busses/i2c-nomadik.c", 22);
144
+ idx->addStrToIndex("./Documentation/devicetree/bindings/arm/ste-nomadik.txt", 1);
145
+ idx->addStrToIndex("./Documentation/devicetree/bindings/arm/ste-nomadik33.txt", 3);
146
+ EXPECT_EQ(idx->size(), 3);
147
+ }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: StrIdx
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sami Sieranoja
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-05-24 00:00:00.000000000 Z
11
+ date: 2024-06-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -38,24 +38,123 @@ dependencies:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
40
  version: 13.1.0
41
+ - !ruby/object:Gem::Dependency
42
+ name: tty-cursor
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: 0.7.1
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: 0.7.1
55
+ - !ruby/object:Gem::Dependency
56
+ name: tty-prompt
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: 0.23.1
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: 0.23.1
69
+ - !ruby/object:Gem::Dependency
70
+ name: tty-reader
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: 0.9.0
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: 0.9.0
83
+ - !ruby/object:Gem::Dependency
84
+ name: tty-screen
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: 0.8.2
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: 0.8.2
97
+ - !ruby/object:Gem::Dependency
98
+ name: pastel
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: 0.8.0
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: 0.8.0
111
+ - !ruby/object:Gem::Dependency
112
+ name: daemons
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: 1.4.1
118
+ type: :runtime
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: 1.4.1
41
125
  description: " Fast fuzzy string similarity search and indexing (for filenames)"
42
126
  email:
43
127
  - sami.sieranoja@gmail.com
44
- executables: []
128
+ executables:
129
+ - stridx.rb
45
130
  extensions:
46
131
  - rubyext/extconf.rb
47
132
  extra_rdoc_files: []
48
133
  files:
134
+ - CMakeLists.txt
135
+ - Gemfile
49
136
  - LICENSE
50
137
  - Makefile
51
138
  - README.md
52
139
  - demo.cpp
140
+ - exe/stridx.rb
53
141
  - flist.txt
142
+ - gem_install
143
+ - py_example.py
144
+ - py_interf.cpp
54
145
  - rubyext/extconf.rb
55
146
  - rubyext/ruby_interf.cpp
147
+ - runserver.rb
148
+ - server.rb
149
+ - setup.py
150
+ - stridx-screencast.mp4
151
+ - stridx-tty.rb
152
+ - stridx.gemspec
56
153
  - stridx.hpp
57
154
  - test.rb
58
155
  - thread_pool.hpp
156
+ - unit_tests.sh
157
+ - unittest.cpp
59
158
  - unordered_dense.h
60
159
  homepage: https://github.com/SamiSieranoja/stridx
61
160
  licenses:
@@ -63,7 +162,7 @@ licenses:
63
162
  metadata:
64
163
  source_code_uri: https://github.com/SamiSieranoja/stridx
65
164
  homepage_uri: https://github.com/SamiSieranoja/stridx
66
- post_install_message:
165
+ post_install_message:
67
166
  rdoc_options: []
68
167
  require_paths:
69
168
  - lib
@@ -80,7 +179,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
80
179
  version: '0'
81
180
  requirements: []
82
181
  rubygems_version: 3.3.26
83
- signing_key:
182
+ signing_key:
84
183
  specification_version: 4
85
184
  summary: StrIdx
86
185
  test_files: []