StrIdx 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 52d40e64a5ace0231828bdbbe6fd94475ab5986d0c1fb7e35e0ce18463a97ae0
4
- data.tar.gz: e1cdcc2ed9f377b2acb049a9fb6de22f24acdbd6e3552748b1307342c10b6cf7
3
+ metadata.gz: f7655b6bd71bca58c86ad607fd197933fc19b97b3ae1c76e322ec0432025dad7
4
+ data.tar.gz: 2421892aa6fe750213d08e2254019d87ce7abb10496cdf1635a61815b8b8b0d7
5
5
  SHA512:
6
- metadata.gz: f3c27923a568fe5916c17e91766066362a965abf9568b21a4daa269cd16a8a4778248ae935b26502aa482aad4807908d401989e7ebfb88d1fbdb011b0c240b60
7
- data.tar.gz: f94dda8d71931c18ae3dc6b58204edda7ffd649bc7452a74fdba4929d6092183e99bf99d7b3632be5bafccfd0be7a877f5513c8c3e72e814dcca08bd79a9b217
6
+ metadata.gz: 0a0ed3f51b95b72a553cf97e1a852f63e8b6d1cbbba56fdab55ed5037ef4d658b8649f2cf92d35b5eea4e6657a18a3a1460a3a57069cda0889a1987f5d1611ee
7
+ data.tar.gz: f0d4753ee43cb205fa86468dad92a66644e12cf889d8018283cb421e4a5d670386b8666af64b89d4d475a59f10701de2223c31d03ac5d366f2ee255c77190cf8
data/CMakeLists.txt ADDED
@@ -0,0 +1,27 @@
1
+ cmake_minimum_required(VERSION 3.14)
2
+
3
+ project(my_project)
4
+ # https://github.com/google/googletest/issues/4000
5
+ include(FetchContent)
6
+ FetchContent_Declare(
7
+ googletest
8
+ URL https://github.com/google/googletest/archive/58d77fa8070e8cec2dc1ed015d66b454c8d78850.zip # release-1.12.1
9
+ )
10
+
11
+ set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
12
+ FetchContent_MakeAvailable(googletest)
13
+
14
+ enable_testing()
15
+
16
+ add_executable(
17
+ stridx_test
18
+ unittest.cpp
19
+ )
20
+ target_link_libraries(
21
+ stridx_test
22
+ GTest::gtest_main
23
+ )
24
+
25
+ include(GoogleTest)
26
+ gtest_discover_tests(stridx_test)
27
+
data/Gemfile ADDED
@@ -0,0 +1,5 @@
1
+ source "https://rubygems.org"
2
+
3
+ gemspec
4
+
5
+
data/README.md CHANGED
@@ -37,9 +37,46 @@ and candidate is "./drivers/char/hw_random/nomadik-rng.c", then scores are calcu
37
37
  score = score1/(11*11)*0.97 + score1/(11*38)*0.03 = 0.342944
38
38
  ```
39
39
 
40
- # Ruby interface
40
+ # Interfaces
41
+
42
+ ## Commandline
43
+ Install instructions (for Ubuntu Linux):
44
+ ```
45
+ apt update
46
+ apt install ruby ruby-dev build-essential
47
+ gem install StrIdx
48
+ ```
49
+
50
+ Start the indexing server (in the background):
51
+ ```
52
+ stridx.rb start -- ~/Documents/ ~/Pictures/
53
+ ```
54
+
55
+ Add bash keybindings (Ctrl-t):
56
+ ```
57
+ eval "$(stridx.rb bash)"
58
+ ```
59
+
60
+ Search by pressing <kbd>ctrl</kbd>+<kbd>t</kbd>. Keys: <kbd>up</kbd>, <kbd>down</kbd>, select with <kbd>enter</kbd>
61
+
62
+ ![screencast](https://github.com/SamiSieranoja/stridx/assets/46612258/b2fd4fa2-37ad-4423-bd5f-d54b24ff6df5)
63
+
64
+
65
+ Stop server:
66
+ ```
67
+ stridx.rb stop
68
+ ```
69
+
70
+ Start the indexing server in the foreground (for debugging):
71
+ ```
72
+ stridx.rb run -- ~/Documents/ ~/Pictures/
73
+ ```
74
+
75
+
76
+ ## Ruby
41
77
  Install:
42
78
  ```
79
+ apt install ruby ruby-dev build-essential
43
80
  gem install StrIdx
44
81
  ```
45
82
 
@@ -114,7 +151,7 @@ Search time: 0.0488 seconds
114
151
  ```
115
152
 
116
153
 
117
- # C++ API
154
+ ## C++
118
155
  See demo.cpp
119
156
  ```cpp
120
157
  #include "stridx.hpp"
data/exe/stridx.rb ADDED
@@ -0,0 +1,16 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ $:.unshift File.dirname(__FILE__) + "/.."
4
+
5
+ if ARGV[0] == "tty"
6
+ require "stridx-tty.rb"
7
+ StrIdxTTY.run
8
+ elsif ARGV[0] == "bash"
9
+ puts %q/
10
+ bind -m emacs-standard '"\er": redraw-current-line';
11
+ bind -m emacs-standard '"\C-t": " \C-b\C-k \C-u`stridx.rb tty`\e\C-e\er\C-a\C-y\C-h\C-e\e \C-y\ey\C-x\C-x\C-f"'
12
+ /
13
+ else
14
+ require "daemons"
15
+ Daemons.run(File.dirname(__FILE__) + "/../runserver.rb")
16
+ end
data/gem_install ADDED
@@ -0,0 +1,4 @@
1
+ gem uninstall --force -x StrIdx
2
+ gem build stridx.gemspec
3
+ gem install $(ls -1tr StrIdx*gem | tail -n 1)
4
+
data/py_example.py ADDED
@@ -0,0 +1,18 @@
1
+ #!/usr/bin/env python
2
+
3
+ from stridx import StringIndex
4
+ e=StringIndex()
5
+ e.set_value(3)
6
+ e.add("./rust/alloc/vec/spec_extend.rs",0)
7
+ e.add("./virt/kvm/dirty_ring.c",1)
8
+ e.add("./Documentation/staging/static-keys.rst",2)
9
+ e.add("./Documentation/staging/lzo.rst",3)
10
+
11
+
12
+
13
+ results = e.find("rstalloc")
14
+ for x in results:
15
+ print(x)
16
+
17
+ # print(e.get_value())
18
+
data/py_interf.cpp ADDED
@@ -0,0 +1,182 @@
1
+
2
+ #include <stdio.h>
3
+ #include <limits.h>
4
+ #include <cstring>
5
+ #include <pthread.h>
6
+
7
+ #include <stdio.h>
8
+ #include <iostream>
9
+ #include <string>
10
+ #include <vector>
11
+ #include <cfloat>
12
+ #include <cmath>
13
+ #include <bits/stdc++.h>
14
+
15
+ using std::ios;
16
+ using std::sort;
17
+ using std::string;
18
+ using std::vector;
19
+
20
+ #include <Python.h>
21
+ #include <cstring>
22
+
23
+ #include "stridx.hpp"
24
+
25
+ extern "C" {
26
+
27
+ // Define a structure for the custom object
28
+ typedef struct {
29
+ PyObject_HEAD int value;
30
+ StrIdx::StringIndex *idx;
31
+ } StrIdxObject;
32
+
33
+ // Method to allocate memory for the object
34
+ static PyObject *StrIdxObject_new(PyTypeObject *type, PyObject *args, PyObject *kwds) {
35
+ StrIdxObject *self;
36
+
37
+ self = (StrIdxObject *)type->tp_alloc(type, 0);
38
+ if (self != NULL) {
39
+ self->value = 0;
40
+ self->idx = new StrIdx::StringIndex();
41
+ }
42
+
43
+ return (PyObject *)self;
44
+ }
45
+
46
+ // Deallocate the object: delete the StringIndex created in StrIdxObject_new (it was leaked before), then free the Python object
47
+ static void StrIdxObject_dealloc(StrIdxObject *self) { delete self->idx; Py_TYPE(self)->tp_free((PyObject *)self); }
48
+
49
+ // Method to set the value of the object
50
+ static PyObject *StrIdxObject_set_value(StrIdxObject *self, PyObject *args) {
51
+ int value;
52
+
53
+ if (!PyArg_ParseTuple(args, "i", &value)) {
54
+ return NULL;
55
+ }
56
+
57
+ self->value = value;
58
+
59
+ Py_INCREF(Py_None);
60
+ return Py_None;
61
+ }
62
+
63
+ static PyObject *StrIdxObject_add(StrIdxObject *self, PyObject *args) {
64
+ char *value;
65
+ int file_id;
66
+ std::string str;
67
+ if (!PyArg_ParseTuple(args, "si", &value, &file_id)) {
68
+ return NULL;
69
+ }
70
+ str = value;
71
+
72
+ printf("char[]*: %s %i\n", value, file_id);
73
+ self->idx->addStrToIndex(str, file_id);
74
+ // self->idx->addStrToIndexThreaded(str, file_id);
75
+ Py_INCREF(Py_None);
76
+ return Py_None;
77
+ }
78
+
79
+ static PyObject *StrIdxObject_find(StrIdxObject *self, PyObject *args) {
80
+   // Returns a Python list of at most 15 [file_id, score] lists for the query string,
81
+   // or NULL (with a Python exception set) on failure.
82
+   char *value;
83
+   if (!PyArg_ParseTuple(args, "s", &value)) {
84
+     return NULL;
85
+   }
86
+   std::string str = value;
87
+
88
+   printf("char*: %s\n", value);
89
+   const std::vector<std::pair<float, int>> &results = self->idx->findSimilar(str, 2);
90
+
91
+   // size() is a size_t: print it with %zu (%d is undefined behaviour here)
92
+   printf("res=%zu\n", results.size());
93
+
94
+   // Cap the number of returned results, comparing as size_t to avoid signed/unsigned mixing
95
+   const size_t maxResults = 15;
96
+   const size_t limit = results.size() < maxResults ? results.size() : maxResults;
97
+
98
+   PyObject *pyarr = PyList_New((Py_ssize_t)limit);
99
+   if (pyarr == NULL) {
100
+     return NULL;
101
+   }
102
+
103
+   for (size_t i = 0; i < limit; i++) {
104
+     // "[id]" builds the inner [file_id, score] list in a single call
105
+     PyObject *pair = Py_BuildValue("[id]", results[i].second, (double)results[i].first);
106
+     if (pair == NULL) {
107
+       Py_DECREF(pyarr); // do not leak the partially filled outer list
108
+       return NULL;
109
+     }
110
+     PyList_SetItem(pyarr, (Py_ssize_t)i, pair); // steals the reference to pair
111
+   }
112
+
113
+   return pyarr;
114
+ }
115
+
116
+ // Method to get the value of the object (METH_NOARGS handlers still receive a second, always-NULL argument)
117
+ static PyObject *StrIdxObject_get_value(StrIdxObject *self, PyObject *Py_UNUSED(ignored)) {
118
+   return PyLong_FromLong(self->value);
119
+ }
120
+
121
+ // Define methods of the class: {python name, C handler, calling convention, docstring}
122
+ static PyMethodDef StrIdxObject_methods[] = {
123
+     {"set_value", (PyCFunction)StrIdxObject_set_value, METH_VARARGS,
124
+      "Set the value of the object"},
125
+     {"add", (PyCFunction)StrIdxObject_add, METH_VARARGS, "Add a string to the index under the given integer id"},
126
+     {"find", (PyCFunction)StrIdxObject_find, METH_VARARGS, "Find similar strings"},
127
+     {"get_value", (PyCFunction)StrIdxObject_get_value, METH_NOARGS, "Get the value of the object"},
128
+     {NULL, NULL, 0, NULL} /* Sentinel */
129
+ };
130
+
131
+ // Define the type object for the class (designated initializers are listed in PyTypeObject field order)
132
+ static PyTypeObject StrIdxType = {
133
+     PyVarObject_HEAD_INIT(NULL, 0).tp_name = "stridx.StrIdx",
134
+     .tp_basicsize = sizeof(StrIdxObject),
135
+     .tp_dealloc = (destructor)StrIdxObject_dealloc,
136
+     .tp_flags = Py_TPFLAGS_DEFAULT, // the CPython docs ask every type to include the default flags
137
+     .tp_doc = PyDoc_STR("Fuzzy string index"),
138
+     .tp_methods = StrIdxObject_methods,
139
+     .tp_new = StrIdxObject_new,
140
+ };
141
+
142
+ // PyVarObject_HEAD_INIT(NULL, 0)
143
+ // .tp_name = "stridx.StrIdx",
144
+ // .tp_doc = "StrIdx class",
145
+ // .tp_basicsize = sizeof(StrIdxObject),
146
+ // .tp_itemsize = 0,
147
+ // .tp_flags = Py_TPFLAGS_DEFAULT,
148
+ // .tp_new = StrIdxObject_new,
149
+ // .tp_dealloc = (destructor)StrIdxObject_dealloc,
150
+ // .tp_methods = StrIdxObject_methods,
151
+ // };
152
+
153
+
154
+ // Define python accessible methods
155
+ static PyMethodDef StrIdxMethods[] = {
156
+ {NULL, NULL, 0, NULL}};
157
+
158
+ static struct PyModuleDef moduledef = {
159
+ PyModuleDef_HEAD_INIT, "stridx", NULL, -1, StrIdxMethods, NULL, NULL, NULL, NULL};
160
+
161
+ PyMODINIT_FUNC PyInit_stridx(void) {
162
+   // Module entry point. Ready the type first so that a failure here cannot leak a module object.
163
+   if (PyType_Ready(&StrIdxType) < 0) {
164
+     return NULL;
165
+   }
166
+
167
+   PyObject *m = PyModule_Create(&moduledef);
168
+   if (!m) { // must be checked before m is used below
169
+     return NULL;
170
+   }
171
+
172
+   // PyModule_AddObject steals the reference only on success, hence the DECREF on failure
173
+   Py_INCREF(&StrIdxType);
174
+   if (PyModule_AddObject(m, "StringIndex", (PyObject *)&StrIdxType) < 0) {
175
+     Py_DECREF(&StrIdxType);
176
+     Py_DECREF(m);
177
+     return NULL;
178
+   }
179
+
180
+   return m;
181
+ }
182
+ } // END extern "C"
data/runserver.rb ADDED
@@ -0,0 +1,7 @@
1
+ #!/usr/bin/env ruby
2
+ $:.unshift File.dirname(__FILE__)
3
+
4
+ require "server.rb"
5
+ # StrIdx::Server.start ARGV, daemonize: true
6
+ StrIdx::Server.start ARGV
7
+
data/server.rb ADDED
@@ -0,0 +1,103 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "socket"
4
+ require "stridx"
5
+
6
+ module StrIdx
7
+ class Server
8
+ def recursively_find_files(directories)
9
+ filelist = []
10
+
11
+ for d in directories
12
+ filelist = filelist + Dir.glob("#{d}/**/*").select { |e|
13
+ File.file?(e)
14
+ # File.file?(e) or File.directory?(e)
15
+ }
16
+ end
17
+ return filelist
18
+ end
19
+
20
+ def self.start(dir_list, daemonize: false)
21
+ Server.new(dir_list, daemonize: daemonize)
22
+ end
23
+
24
+ def self.stop
25
+ sock_dir = File.expand_path("~/.stridx")
26
+ sockfn = "#{sock_dir}/sock"
27
+ client = UNIXSocket.new(sockfn)
28
+ client.puts "stop"
29
+ response = client.recv(200 * 200)
30
+ client.close
31
+ end
32
+
33
+ def initialize(dir_list, daemonize: false)
34
+ idx = StrIdx::StringIndex.new
35
+ idx.setDirSeparator("/")
36
+
37
+ t = Time.new
38
+
39
+ dirs = dir_list.select { |x| File.directory?(x) }
40
+ puts "Scanning files in directories:#{dirs.join(",")}"
41
+ flist = recursively_find_files(dirs)
42
+
43
+ i = 0
44
+ for x in flist
45
+ idx.add(x, i)
46
+ i += 1
47
+ end
48
+
49
+ idx.waitUntilDone()
50
+ idx_time = Time.new
51
+ puts "\nIndexing time (#{flist.size} files): #{(idx_time - t).round(4)} seconds"
52
+
53
+ sock_dir = File.expand_path("~/.stridx")
54
+ Dir.mkdir(sock_dir) if !Dir.exist?(sock_dir)
55
+ sockfn = "#{sock_dir}/sock"
56
+ File.unlink(sockfn) if File.exist?(sockfn)
57
+
58
+ puts "Indexing done, starting server"
59
+ if (daemonize)
60
+ require "daemons"
61
+ Daemons.daemonize
62
+ # exit if fork() # Daemonize
63
+ end
64
+
65
+ # exit if fork() # Daemonize
66
+ # $PROGRAM_NAME = "stridx-daemon"
67
+
68
+ t = Thread.new {
69
+ serv = UNIXServer.new(sockfn)
70
+
71
+ loop do
72
+ # Accept a new client connection
73
+ client = serv.accept
74
+
75
+ # puts "Client connected!"
76
+
77
+ # Read data from the client
78
+ data = client.recv(1024)
79
+
80
+ if data.match(/^stop$/)
81
+ puts "Got stop signal. Shutting down server."
82
+ client.close
83
+ break
84
+ end
85
+
86
+ # puts "Received from client: #{data}"
87
+ if data.match(/^find:(.*)/)
88
+ query = Regexp.last_match(1)
89
+ res = idx.find(query)
90
+ response = res.collect { |x| flist[x[0]] }.join("\n")
91
+
92
+ # Send a response back to the client
93
+ client.puts response
94
+ end
95
+ # Close the client connection
96
+ client.close
97
+ end
98
+ }
99
+
100
+ t.join
101
+ end
102
+ end
103
+ end
data/setup.py ADDED
@@ -0,0 +1,32 @@
1
+ #!/usr/bin/env python
2
+ import numpy
3
+
4
+ import setuptools
5
+ from setuptools import setup, Extension
6
+
7
+ __version__ = "0.1"
8
+
9
+ cargs = ['-fpermissive']
10
+
11
+
12
+ with open('README.md', 'r', encoding='utf-8') as f:
13
+ long_description = f.read()
14
+
15
+ module1 = Extension('stridx', sources=['py_interf.cpp'], include_dirs=['.'], extra_compile_args=cargs,
16
+ language="c++",
17
+ )
18
+
19
+ ext_modules = [module1]
20
+
21
+ setup(
22
+ name='stridx',
23
+ version='1.0',
24
+ setup_requires=['wheel'],
25
+ python_requires='>=3',
26
+ provides=['stridx'],
27
+ description='Fast fuzzy string similarity search and indexing (for filenames) ',
28
+ long_description=long_description,
29
+ long_description_content_type='text/markdown',
30
+ ext_modules=[module1]
31
+ )
32
+
Binary file
data/stridx-tty.rb ADDED
@@ -0,0 +1,122 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "tty-prompt"
4
+ require "tty-cursor"
5
+ require "tty-reader"
6
+ require "pastel"
7
+
8
+ require "socket"
9
+
10
+ class StrIdxTTY
11
+ def self.run
12
+ stty = StrIdxTTY.new
13
+ selected = stty.search
14
+ STDOUT.write selected
15
+ end
16
+
17
+ def initialize()
18
+   # Result list read by draw_list and handle_event. It has to exist before
19
+   # the first key event arrives: previously only @lines was set here, which
20
+   # left @list nil until the first search had completed.
21
+   @list = []
22
+   @lines = []
23
+   @selected = ""
24
+   @idx = 0
25
+
26
+   @reader = TTY::Reader.new(output: STDERR)
27
+   @pastel = Pastel.new()
28
+   @cursor = TTY::Cursor
29
+
30
+   sockfn = File.expand_path("~/.stridx/sock")
31
+
32
+   # Wait until the server accepts connections. ENOENT (socket file not
33
+   # created yet) is treated like ECONNREFUSED: the server is not up yet.
34
+   loop do
35
+     begin
36
+       # Probe connection only; it is closed again right away
37
+       UNIXSocket.new(sockfn).close
38
+       break
39
+     rescue Errno::ECONNREFUSED, Errno::ENOENT
40
+       out "Waiting for server to start\n"
41
+       sleep 2
42
+     end
43
+   end
44
+ end
45
+
46
+ def out(x)
47
+ STDERR.write x
48
+ end
49
+
50
+ def search
51
+ out "\n" * 20
52
+ out @cursor.clear_screen
53
+ out "\n" * 20
54
+ @cursor.move_to(0, 0)
55
+ @reader.on(:keypress) { |event|
56
+ handle_event(event)
57
+ }
58
+ @reader.read_line(">> ")
59
+
60
+ out @cursor.clear_screen
61
+ return @selected.strip
62
+ end
63
+
64
+ def get_res_from_server(query)
65
+ # Define the socket file path
66
+ sock_dir = File.expand_path("~/.stridx")
67
+ sockfn = "#{sock_dir}/sock"
68
+
69
+ # Create a new UNIXSocket
70
+ client = UNIXSocket.new(sockfn)
71
+
72
+ # Send data to the server
73
+ client.puts "find:#{query}"
74
+
75
+ # Read response from the server
76
+ response = client.recv(200 * 200)
77
+
78
+ # Close the client connection
79
+ client.close
80
+ return response.lines
81
+ end
82
+
83
+ def draw_list()
84
+ @selected = @list[@idx]
85
+ i = 0
86
+ for x in @list
87
+ out @cursor.up(1)
88
+ out @cursor.clear_line
89
+ if i == @idx
90
+ out @pastel.lookup(:bold)
91
+ end
92
+ out x.strip
93
+ out @pastel.lookup(:reset)
94
+ i += 1
95
+ end
96
+ end
97
+
98
+ def update_search(event)
99
+ query = event.line[3..-1]
100
+ if query.size > 2
101
+ @list = get_res_from_server(query)
102
+ draw_list
103
+ end
104
+ end
105
+
106
+ def handle_event(event)
107
+ out @cursor.save
108
+ if event.key.name == :alpha
109
+ update_search(event)
110
+ elsif event.key.name == :up
111
+ @idx += 1 if @idx < @list.size - 1
112
+ draw_list
113
+ elsif event.key.name == :down
114
+ @idx -= 1 if @idx > 0
115
+ draw_list
116
+ elsif event.key.name == :backspace
117
+ update_search(event)
118
+ end
119
+
120
+ out @cursor.restore
121
+ end
122
+ end
data/stridx.gemspec ADDED
@@ -0,0 +1,37 @@
1
+ Gem::Specification.new do |spec|
2
+ spec.name = "StrIdx"
3
+ spec.version = "0.1.4"
4
+ spec.authors = ["Sami Sieranoja"]
5
+ spec.email = ["sami.sieranoja@gmail.com"]
6
+
7
+ spec.summary = %q{StrIdx}
8
+ spec.description = %q{ Fast fuzzy string similarity search and indexing (for filenames)}
9
+ spec.homepage = "https://github.com/SamiSieranoja/stridx"
10
+ spec.metadata["source_code_uri"] = spec.homepage
11
+ spec.metadata["homepage_uri"] = spec.homepage
12
+
13
+ spec.files = `git ls-files -z`.split("\x0").reject do |f|
14
+ f.match(%r{^(refcode|spec|features)/})
15
+ end
16
+ # spec.files << "thread_pool.hpp"
17
+ # spec.files << "exe/stridx.rb"
18
+ # spec.files << "server.rb"
19
+ # spec.files << "stridx-tty.rb"
20
+
21
+ spec.bindir = "exe"
22
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
23
+ spec.require_paths = ["lib", "ext"]
24
+
25
+ spec.add_development_dependency "bundler", "~> 2.4.21"
26
+ spec.add_development_dependency "rake", "~> 13.1.0"
27
+
28
+ spec.add_runtime_dependency "tty-cursor", "~> 0.7.1"
29
+ spec.add_runtime_dependency "tty-prompt", "~> 0.23.1"
30
+ spec.add_runtime_dependency "tty-reader", "~> 0.9.0"
31
+ spec.add_runtime_dependency "tty-screen", "~> 0.8.2"
32
+ spec.add_runtime_dependency "pastel", "~> 0.8.0"
33
+ spec.add_runtime_dependency "daemons", "~> 1.4.1"
34
+
35
+ spec.extensions = ["rubyext/extconf.rb"]
36
+ spec.licenses = ["LGPL-2.0+"]
37
+ end
data/stridx.hpp CHANGED
@@ -33,16 +33,16 @@ public:
33
33
  Output(int verb) : verboseLevel(verb) {}
34
34
  Output() : Output(3) {}
35
35
  ~Output() = default;
36
- void print() {}
36
+ static void print() {}
37
37
 
38
38
  // When calling as print("xxx ",3, " yyy") outputs "xxx 3 yyy"
39
- template <typename T, typename... Types> void print(T var1, Types... var2) {
39
+ template <typename T, typename... Types> static void print(T var1, Types... var2) {
40
40
  std::cout << var1;
41
41
  print(var2...);
42
42
  }
43
43
 
44
44
  // When calling as printl("xxx ",3, " yyy") outputs "xxx 3 yyy\n"
45
- template <typename... Types> void printl(Types... var2) {
45
+ template <typename... Types> static void printl(Types... var2) {
46
46
  print(var2...);
47
47
  print("\n");
48
48
  }
@@ -79,7 +79,7 @@ std::vector<std::string> splitString(const std::string &input, const char &separ
79
79
  }
80
80
 
81
81
  // Convert int64_t to binary string
82
- [[nodiscard]] std::string int64ToBinaryString(int64_t num) {
82
+ [[nodiscard]] std::string int64ToBinaryString(const int64_t &num) {
83
83
  std::string result;
84
84
  for (int i = 63; i >= 0; --i) {
85
85
  result += ((num >> i) & 1) ? '1' : '0';
@@ -88,7 +88,7 @@ std::vector<std::string> splitString(const std::string &input, const char &separ
88
88
  }
89
89
 
90
90
  // Debug. Convert a (8 char) string represented as int64_t to std::string
91
- [[nodiscard]] std::string int64ToStr(int64_t key) {
91
+ [[nodiscard]] std::string int64ToStr(const int64_t &key) {
92
92
  int nchars = 8;
93
93
  std::string str;
94
94
  int multip = nchars * 8;
@@ -108,7 +108,7 @@ void printVector(const std::vector<int> &vec) {
108
108
  }
109
109
 
110
110
  // Debug
111
- [[nodiscard]] std::string charToBinaryString(char chr) {
111
+ [[nodiscard]] std::string charToBinaryString(const char &chr) {
112
112
  std::string result;
113
113
  for (int i = 7; i >= 0; --i) {
114
114
  result += ((chr >> i) & 1) ? '1' : '0';
@@ -122,8 +122,7 @@ enum class segmentType { Dir, File };
122
122
  // A segment of a file path
123
123
  // e.g. if path is /foo/bar/baz.txt
124
124
  // segments are [{root}, foo, bar, baz.txt]
125
- class PathSegment {
126
- public:
125
+ struct PathSegment {
127
126
  std::string str;
128
127
  int fileId; // (if FILE)
129
128
  Candidate *cand;
@@ -135,7 +134,7 @@ public:
135
134
  PathSegment(std::string _str) : str(_str), parent(nullptr) {}
136
135
  PathSegment(std::string _str, int _fileId)
137
136
  : str(_str), fileId(_fileId), cand(nullptr), parent(nullptr) {}
138
- [[nodiscard]] int size() {
137
+ [[nodiscard]] int size() const {
139
138
  int sz = str.size();
140
139
  PathSegment *cur = parent;
141
140
  // Sum up length of parent segments (+1 for divisors)
@@ -148,8 +147,7 @@ public:
148
147
  };
149
148
 
150
149
  // Candidate for result in string (filename) search
151
- class Candidate {
152
- public:
150
+ struct Candidate {
153
151
  std::vector<float> v_charscore;
154
152
  PathSegment *seg;
155
153
  int fileId;
@@ -162,25 +160,17 @@ public:
162
160
  int candLen; // Length of candidate
163
161
 
164
162
  Candidate(){};
165
- Candidate(int _fileId, std::string _str, int _len) : fileId(_fileId), str(_str), len(_len) {
166
- // Initialize v_charscores with zeros
167
- v_charscore.resize(len, 0);
168
- candLen = str.size();
169
- seg = nullptr;
170
- }
171
-
172
163
  Candidate(PathSegment *_seg, int _len) : seg(_seg), len(_len) {
173
164
  // Initialize v_charscores with zeros
174
165
  v_charscore.resize(len, 0);
175
166
  candLen = seg->size();
176
167
  }
177
168
 
178
- [[nodiscard]] float getScore() {
169
+ [[nodiscard]] float getScore() const {
179
170
  int i = 0;
180
171
  float score = 0.0;
181
- candLen = seg->size();
182
172
 
183
- for (float &charscore : v_charscore) {
173
+ for (const float &charscore : v_charscore) {
184
174
  score += charscore;
185
175
  i++;
186
176
  }
@@ -193,7 +183,7 @@ public:
193
183
  return score;
194
184
  }
195
185
 
196
- [[nodiscard]] float operator[](int idx) { return v_charscore[idx]; }
186
+ [[nodiscard]] float operator[](int idx) const { return v_charscore[idx]; }
197
187
  };
198
188
 
199
189
  // This seems to give 10x speed improvement over std::unordered_map
@@ -210,19 +200,17 @@ private:
210
200
  int numStrings = 0;
211
201
 
212
202
  std::vector<SegMap *> dirmaps;
203
+ std::array<std::mutex, 9> mts_d; // for dirmaps
213
204
  std::vector<SegMap *> filemaps;
205
+ std::array<std::mutex, 9> mts_f; // for filemaps
214
206
 
215
207
  std::vector<PathSegment *> segsToClean;
216
208
 
217
- std::unordered_map<int, std::string> strlist;
218
209
  std::unordered_map<int, PathSegment *> seglist;
219
210
  PathSegment *root;
220
211
  int dirId = 0;
221
212
  float dirWeight = 0.7; // Give only 70% of score if match is for a directory
222
213
 
223
- std::array<std::mutex, 9> mts_f;
224
- std::array<std::mutex, 9> mts_d;
225
-
226
214
  std::unique_ptr<ThreadPool> pool;
227
215
  Output out{1}; // verbose level = 1
228
216
 
@@ -279,9 +267,11 @@ public:
279
267
  void addStrToIndexThreaded(std::string filePath, int fileId) {
280
268
  pool->enqueue([=] { addStrToIndex(filePath, fileId, dirSeparator); });
281
269
  }
282
- void waitUntilReady() { pool->waitUntilDone(); }
270
+ void waitUntilReady() const { pool->waitUntilDone(); }
283
271
 
284
- void waitUntilDone() { pool->waitUntilDone(); }
272
+ void waitUntilDone() const { pool->waitUntilDone(); }
273
+
274
+ int size() const { return seglist.size(); }
285
275
 
286
276
  /**
287
277
  * Add a string to the index to be searched for afterwards
@@ -291,8 +281,14 @@ public:
291
281
  * @param separator Can be used to split filePath to components (e.g. 'home','user'...). Usually
292
282
  * one of {'\\', '/', '\0' (no separation)}.
293
283
  */
284
+
294
285
  void addStrToIndex(std::string filePath, int fileId, const char &separator) {
295
- out.printv(3, "Add file:", filePath, ",", fileId, ",", separator);
286
+ out.printv(3, "Add file:", filePath, ",", fileId, ",", separator, ",",dirSeparator);
287
+
288
+ // If a string with this index has already been added
289
+ if (seglist.find(fileId) != seglist.end()) {
290
+ return;
291
+ }
296
292
 
297
293
  std::vector<std::string> segs;
298
294
  numStrings += 1;
@@ -345,6 +341,17 @@ public:
345
341
  }
346
342
  }
347
343
 
344
+ std::string getString(int id) {
345
+   // Rebuild the stored path for id; returns "" for an unknown id (operator[] would insert a null entry)
346
+   auto it = seglist.find(id);
347
+   if (it == seglist.end() || it->second == nullptr) return "";
348
+   std::string s = it->second->str;
349
+   for (PathSegment *p = it->second->parent; p != nullptr && p->parent != nullptr; p = p->parent) {
350
+     s = p->str + dirSeparator + s; // prepend each ancestor, stopping before the parentless top segment
351
+   }
352
+   return s;
353
+ }
354
+
348
355
  /**
349
356
  The search will find filepaths similar to the input string
350
357
 
@@ -423,7 +430,7 @@ public:
423
430
  }
424
431
 
425
432
  // Return int64_t representation of the first nchars in str, starting from index i
426
- [[nodiscard]] int64_t getKeyAtIdx(std::string str, int i, int nchars) {
433
+ [[nodiscard]] int64_t getKeyAtIdx(const std::string &str, int i, int nchars) const {
427
434
  int64_t key = 0;
428
435
  for (int i_char = 0; i_char < nchars; i_char++) {
429
436
  key = key | static_cast<int64_t>(str[i + i_char]);
@@ -519,7 +526,7 @@ private:
519
526
  // Find pathsegments from <map> that include the substring of <str> which starts at index <i> and
520
527
  // is of length <nchars>.
521
528
  [[nodiscard]] std::vector<PathSegment *> findSimilarForNgram(std::string str, int i, int nchars,
522
- SegMap &map) {
529
+ SegMap &map) const {
523
530
 
524
531
  assert(i + nchars <= static_cast<int>(str.size()));
525
532
  std::vector<PathSegment *> res;
data/test.rb CHANGED
@@ -1,8 +1,13 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
+ $:.unshift File.dirname(__FILE__)
4
+
3
5
  require "stridx"
4
6
  idx = StrIdx::StringIndex.new
5
7
 
8
+ # "/" for unix-style file paths
9
+ idx.setDirSeparator("/") #(comment out if not file paths)
10
+
6
11
  t = Time.new
7
12
  fn = File.expand_path("flist.txt")
8
13
  lines = IO.read(fn).lines.collect { |x| x.strip }
@@ -14,12 +19,12 @@ end
14
19
 
15
20
  idx_time = Time.new
16
21
  # Time to start the threadpool to process indexing
17
- puts "\nIndexing launch time (#{lines.size} files}): #{(idx_time - t).round(4)} seconds"
22
+ puts "\nIndexing launch time (#{lines.size} files): #{(idx_time - t).round(4)} seconds"
18
23
 
19
24
  idx.waitUntilDone() # Not necessary, will be called by idx.find
20
25
  idx_time = Time.new
21
26
  # Time when all threads have completed
22
- puts "\nIndexing completed time (#{lines.size} files}): #{(idx_time - t).round(4)} seconds"
27
+ puts "\nIndexing completed time (#{lines.size} files): #{(idx_time - t).round(4)} seconds"
23
28
 
24
29
  query = "rngnomadriv"
25
30
  res = idx.find(query)
data/unit_tests.sh ADDED
@@ -0,0 +1,4 @@
1
+ #!/bin/bash
2
+ cmake -S . -B build
3
+ cmake --build build
4
+ cd build && ctest
data/unittest.cpp ADDED
@@ -0,0 +1,147 @@
1
+
2
+ #include <gtest/gtest.h>
3
+ #include "stridx.hpp"
4
+ #include <cmath>
5
+ #include <memory>
6
+
7
+ TEST(SplitString, MatchSize) {
8
+ std::vector<std::string> svec = StrIdx::splitString("foo/bar/test1.txt", '/');
9
+ EXPECT_EQ(svec.size(), 3);
10
+ if (svec.size() == 3) {
11
+ EXPECT_EQ(svec[0].size(), 3);
12
+ EXPECT_EQ(svec[2].size(), 9);
13
+ }
14
+ }
15
+
16
+ std::vector<std::string> flist{"./drivers/char/hw_random/nomadik-rng.c",
17
+ "./drivers/pinctrl/nomadik",
18
+ "./drivers/clk/clk-nomadik.c",
19
+ "./drivers/gpio/gpio-nomadik.c",
20
+ "./drivers/i2c/busses/i2c-nomadik.c",
21
+ "./drivers/clocksource/nomadik-mtu.c",
22
+ "./drivers/gpu/drm/pl111/pl111_nomadik.h",
23
+ "./drivers/gpu/drm/pl111/pl111_nomadik.c",
24
+ "./drivers/pinctrl/nomadik/pinctrl-nomadik.c",
25
+ "./drivers/input/keyboard/nomadik-ske-keypad.c",
26
+ "./drivers/pinctrl/nomadik/pinctrl-nomadik-db8500.c",
27
+ "./drivers/pinctrl/nomadik/pinctrl-nomadik-stn8815.c",
28
+ "./drivers/char/hw_random/omap-rng.c",
29
+ "./drivers/char/hw_random/omap3-rom-rng.c",
30
+ "./include/dt-bindings/pinctrl/nomadik.h",
31
+ "./Documentation/devicetree/bindings/arm/ste-nomadik.txt"};
32
+
33
+ std::vector<float> target_scores{0.342944, 0.271396, 0.271126, 0.270893, 0.270431, 0.270355,
34
+ 0.270088, 0.270088, 0.26987, 0.269776, 0.269574, 0.269538,
35
+ 0.236358, 0.236074, 0.224804, 0.224238};
36
+
37
+ void scoreTest(bool threaded) {
38
+   // Index flist (optionally through the threaded add), search a fixed query, compare with target_scores
39
+   StrIdx::StringIndex idx('/'); // Separate directories using unix style "/" char
40
+   std::string query = "rngnomadriv";
41
+
42
+   // File ids start at 1, so flist[0] (the expected best match) has id 1
43
+   int fileId = 1;
44
+   for (const auto &str : flist) {
45
+     if (threaded) {
46
+       idx.addStrToIndexThreaded(str, fileId);
47
+     } else {
48
+       idx.addStrToIndex(str, fileId);
49
+     }
50
+     fileId++;
51
+   }
52
+   const std::vector<std::pair<float, int>> &results = idx.findSimilar(query);
53
+
54
+   // Fail here instead of indexing into an empty vector (undefined behaviour)
55
+   ASSERT_FALSE(results.empty());
56
+   EXPECT_EQ(results[0].second, 1);
57
+   if (results.size() == target_scores.size()) {
58
+     for (size_t i = 0; i < results.size(); i++) {
59
+       // Check if first five digits of the scores match
60
+       EXPECT_EQ(std::floor(results[i].first * 1e5), std::floor(1e5 * target_scores[i]));
61
+     }
62
+   }
63
+ }
64
+
65
+ TEST(IndexSearch, MatchingScoresSingleThread) { scoreTest(false); }
66
+ TEST(IndexSearch, MatchingScoresThreaded) { scoreTest(true); }
67
+
68
+ class IndexTest : public testing::Test {
69
+ protected:
70
+ std::unique_ptr<StrIdx::StringIndex> idx = std::make_unique<StrIdx::StringIndex>('/');
71
+
72
+ IndexTest() {}
73
+
74
+ void SetUp() override {
75
+ // Code here will be called immediately after the constructor (right
76
+ // before each test).
77
+ idx = std::make_unique<StrIdx::StringIndex>('/');
78
+ }
79
+
80
+ void TearDown() override {
81
+ // Code here will be called immediately after each test (right
82
+ // before the destructor).
83
+ }
84
+ };
85
+
86
+ TEST_F(IndexTest, BinaryRepresentation1) {
87
+ int64_t num = idx->getKeyAtIdx("abcdefgh", 0, 8);
88
+ std::string s = StrIdx::int64ToBinaryString(num);
89
+ // a b c d ...
90
+ EXPECT_TRUE(s == "0110000101100010011000110110010001100101011001100110011101101000");
91
+ }
92
+
93
+ TEST_F(IndexTest, BinaryRepresentation2) {
94
+ int64_t num = idx->getKeyAtIdx("abcdefgh", 0, 1);
95
+ std::string s = StrIdx::int64ToBinaryString(num);
96
+ EXPECT_TRUE(
97
+ s == "0000000000000000000000000000000000000000000000000000000001100001"); // 01100001 == "a"
98
+ }
99
+ TEST_F(IndexTest, BinaryRepresentation3) {
100
+ int64_t num = idx->getKeyAtIdx("abcdefgh", 7, 1);
101
+ std::string s = StrIdx::int64ToBinaryString(num);
102
+ EXPECT_TRUE(
103
+ s == "0000000000000000000000000000000000000000000000000000000001101000"); // 01101000 == "h"
104
+ }
105
+
106
+ TEST_F(IndexTest, AccessString) {
107
+ idx->addStrToIndex("./drivers/i2c/busses/i2c-nomadik.c", 0);
108
+ idx->addStrToIndex("./drivers/i2c/busses/i2c-nomadiksdf.c", 2);
109
+ idx->addStrToIndex("./drivers//i2c///busses////aa-i2c-nomadiksdf.c", 3);
110
+ idx->addStrToIndex("/test/foo/bar.txt", 4);
111
+ idx->addStrToIndex("bar.txt", 5);
112
+
113
+ EXPECT_EQ(idx->size(), 5);
114
+ EXPECT_STREQ(idx->getString(0).c_str(), "./drivers/i2c/busses/i2c-nomadik.c");
115
+
116
+ // TODO: does not work yet
117
+ // EXPECT_STREQ(idx->getString(3).c_str(), "./drivers//i2c///busses////aa-i2c-nomadiksdf.c");
118
+
119
+ // TODO: does not work yet
120
+ // EXPECT_STREQ(idx->getString(4).c_str(), "/test/foo/bar.txt");
121
+ EXPECT_STREQ(idx->getString(5).c_str(), "bar.txt");
122
+ }
123
+
124
+ TEST_F(IndexTest, Size1) {
125
+ // Should not add different files with same id
126
+ idx->addStrToIndex("./drivers/i2c/busses/i2c-nomadik.c", 0);
127
+ idx->addStrToIndex("./drivers/i2c/busses/i2c-nomadiksdf.c", 0);
128
+
129
+ // Should not be added because essentially same as 0:
130
+ idx->addStrToIndex("./drivers//i2c///busses////i2c-nomadik.c", 44);
131
+
132
+ // Should not add same file with different id
133
+ idx->addStrToIndex("./drivers/i2c/busses/i2c-nomadik.c", 1);
134
+ idx->addStrToIndex("./drivers/i2c/busses/i2c-nomadik.c", 2);
135
+
136
+ EXPECT_EQ(idx->size(), 1);
137
+
138
+ // Test no-overwriting
139
+ EXPECT_STREQ(idx->getString(0).c_str(), "./drivers/i2c/busses/i2c-nomadik.c");
140
+ }
141
+
142
+ TEST_F(IndexTest, Size2) {
143
+ idx->addStrToIndex("./drivers/i2c/busses/i2c-nomadik.c", 22);
144
+ idx->addStrToIndex("./Documentation/devicetree/bindings/arm/ste-nomadik.txt", 1);
145
+ idx->addStrToIndex("./Documentation/devicetree/bindings/arm/ste-nomadik33.txt", 3);
146
+ EXPECT_EQ(idx->size(), 3);
147
+ }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: StrIdx
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sami Sieranoja
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-05-24 00:00:00.000000000 Z
11
+ date: 2024-06-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -38,24 +38,123 @@ dependencies:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
40
  version: 13.1.0
41
+ - !ruby/object:Gem::Dependency
42
+ name: tty-cursor
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: 0.7.1
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: 0.7.1
55
+ - !ruby/object:Gem::Dependency
56
+ name: tty-prompt
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: 0.23.1
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: 0.23.1
69
+ - !ruby/object:Gem::Dependency
70
+ name: tty-reader
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: 0.9.0
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: 0.9.0
83
+ - !ruby/object:Gem::Dependency
84
+ name: tty-screen
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: 0.8.2
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: 0.8.2
97
+ - !ruby/object:Gem::Dependency
98
+ name: pastel
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: 0.8.0
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: 0.8.0
111
+ - !ruby/object:Gem::Dependency
112
+ name: daemons
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: 1.4.1
118
+ type: :runtime
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: 1.4.1
41
125
  description: " Fast fuzzy string similarity search and indexing (for filenames)"
42
126
  email:
43
127
  - sami.sieranoja@gmail.com
44
- executables: []
128
+ executables:
129
+ - stridx.rb
45
130
  extensions:
46
131
  - rubyext/extconf.rb
47
132
  extra_rdoc_files: []
48
133
  files:
134
+ - CMakeLists.txt
135
+ - Gemfile
49
136
  - LICENSE
50
137
  - Makefile
51
138
  - README.md
52
139
  - demo.cpp
140
+ - exe/stridx.rb
53
141
  - flist.txt
142
+ - gem_install
143
+ - py_example.py
144
+ - py_interf.cpp
54
145
  - rubyext/extconf.rb
55
146
  - rubyext/ruby_interf.cpp
147
+ - runserver.rb
148
+ - server.rb
149
+ - setup.py
150
+ - stridx-screencast.mp4
151
+ - stridx-tty.rb
152
+ - stridx.gemspec
56
153
  - stridx.hpp
57
154
  - test.rb
58
155
  - thread_pool.hpp
156
+ - unit_tests.sh
157
+ - unittest.cpp
59
158
  - unordered_dense.h
60
159
  homepage: https://github.com/SamiSieranoja/stridx
61
160
  licenses:
@@ -63,7 +162,7 @@ licenses:
63
162
  metadata:
64
163
  source_code_uri: https://github.com/SamiSieranoja/stridx
65
164
  homepage_uri: https://github.com/SamiSieranoja/stridx
66
- post_install_message:
165
+ post_install_message:
67
166
  rdoc_options: []
68
167
  require_paths:
69
168
  - lib
@@ -80,7 +179,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
80
179
  version: '0'
81
180
  requirements: []
82
181
  rubygems_version: 3.3.26
83
- signing_key:
182
+ signing_key:
84
183
  specification_version: 4
85
184
  summary: StrIdx
86
185
  test_files: []