vinted-blurrily 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +22 -0
- data/README.md +213 -0
- data/bin/blurrily +43 -0
- data/ext/blurrily/blurrily.h +21 -0
- data/ext/blurrily/extconf.rb +21 -0
- data/ext/blurrily/map_ext.c +230 -0
- data/ext/blurrily/search_tree.c +66 -0
- data/ext/blurrily/search_tree.h +30 -0
- data/ext/blurrily/storage.c +629 -0
- data/ext/blurrily/storage.h +119 -0
- data/ext/blurrily/tokeniser.c +126 -0
- data/ext/blurrily/tokeniser.h +46 -0
- data/lib/blurrily.rb +1 -0
- data/lib/blurrily/client.rb +136 -0
- data/lib/blurrily/command_processor.rb +53 -0
- data/lib/blurrily/defaults.rb +10 -0
- data/lib/blurrily/map.rb +49 -0
- data/lib/blurrily/map_group.rb +39 -0
- data/lib/blurrily/server.rb +49 -0
- data/lib/blurrily/version.rb +3 -0
- metadata +280 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: b9c8978b87d66ef94646d643c471b745c84b2489
|
4
|
+
data.tar.gz: 73c63bbd8614d452ee27c16ff3fa5371b8a48a21
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: bccb29b9b6665b9e8606ee483b434e2873815a804faf2d0bd800714c9af2b02f3156694b29cd5f66e9e9173b28adc6119703ee14503e5843c18849b0c47c78cb
|
7
|
+
data.tar.gz: 59bcd0faf278887485033c26a19757ad5d15c01de468e94c27b2115cc04f4fcb65280b0fba0057cb8a28bc2f902dbc2a9f879a50e9101464e07f133c1eed0981
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 HouseTrip Ltd.
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,213 @@
|
|
1
|
+
# Blurrily — Millisecond fuzzy string matching
|
2
|
+
|
3
|
+
This is a fork made solely for the purpose of adding Ruby 2.4 support and making it available in RubyGems.
|
4
|
+
|
5
|
+
[](http://badge.fury.io/rb/blurrily)
|
6
|
+
[](https://travis-ci.org/mezis/blurrily)
|
7
|
+
[](https://gemnasium.com/mezis/blurrily)
|
8
|
+
[](https://codeclimate.com/github/mezis/blurrily)
|
9
|
+
[](https://coveralls.io/r/mezis/blurrily)
|
10
|
+
|
11
|
+
> Show me photos of **Marakech** !
|
12
|
+
>
|
13
|
+
> Here are some photos of **Marrakesh**, Morocco.
|
14
|
+
> Did you mean **Martanesh**, Albania, **Marakkanam**, India, or **Marasheshty**, Romania?
|
15
|
+
|
16
|
+
Blurrily finds misspelled, prefix, or partial needles in a haystack of
|
17
|
+
strings, quickly. It scales well: its response time is typically 1-2ms on
|
18
|
+
user-input datasets and 75-100ms on pathological datasets
|
19
|
+
([more](#benchmarks)).
|
20
|
+
|
21
|
+
Blurrily is compatible and tested with all MRI Rubies from 1.9.3 to 2.2.0.
|
22
|
+
It is tested on Linux 2.6 (32bit and 64bit) and MacOS X 10.8.
|
23
|
+
|
24
|
+
Blurrily uses a tweaked [trigram](http://en.wikipedia.org/wiki/N-gram)-based
|
25
|
+
approach to find good matches. If you're using ActiveRecord and looking for
|
26
|
+
a lightweight (albeit much slower), in-process, Rails-friendly version of
|
27
|
+
this, check out [fuzzily](http://github.com/mezis/fuzzily), a Ruby gem to
|
28
|
+
perform fuzzy text searching in ActiveRecord.
|
29
|
+
|
30
|
+
|
31
|
+
## Installation
|
32
|
+
|
33
|
+
Add this line to your application's Gemfile:
|
34
|
+
|
35
|
+
gem 'blurrily'
|
36
|
+
|
37
|
+
Or install it yourself as:
|
38
|
+
|
39
|
+
$ gem install blurrily
|
40
|
+
|
41
|
+
## Docker
|
42
|
+
|
43
|
+
You can optionally run [Blurrily as a Docker Container](https://github.com/mrmattwright/docker-blurrily). Maintained by [MrMattWright](https://github.com/mrmattwright).
|
44
|
+
|
45
|
+
## Usage
|
46
|
+
|
47
|
+
You can use blurrily as a client/server combination (recommended in
|
48
|
+
production), or use the internals standalone.
|
49
|
+
|
50
|
+
See the [API Documentation](http://rubydoc.info/github/mezis/blurrily/frames)
|
51
|
+
for more details.
|
52
|
+
|
53
|
+
### Client/server
|
54
|
+
|
55
|
+
Fire up a blurrily server:
|
56
|
+
|
57
|
+
$ blurrily
|
58
|
+
|
59
|
+
Open up a console and connect:
|
60
|
+
|
61
|
+
$ irb -rubygems
|
62
|
+
> require 'blurrily/client'
|
63
|
+
> client = Blurrily::Client.new
|
64
|
+
|
65
|
+
Store a needle with a reference:
|
66
|
+
|
67
|
+
> client.put('London', 1337)
|
68
|
+
|
69
|
+
Recover a reference from the haystack:
|
70
|
+
|
71
|
+
> client.find('lonndon')
|
72
|
+
#=> [1337]
|
73
|
+
|
74
|
+
### Standalone
|
75
|
+
|
76
|
+
Create the in-memory database:
|
77
|
+
|
78
|
+
> map = Blurrily::Map.new
|
79
|
+
|
80
|
+
Store a needle with a reference:
|
81
|
+
|
82
|
+
> map.put('London', 1337)
|
83
|
+
|
84
|
+
Recover a reference from the haystack:
|
85
|
+
|
86
|
+
> map.find('lonndon')
|
87
|
+
#=> [1337]
|
88
|
+
|
89
|
+
Save the database to disk:
|
90
|
+
|
91
|
+
> map.save('/var/db/data.trigrams')
|
92
|
+
|
93
|
+
Load a previously saved database:
|
94
|
+
|
95
|
+
> map = Blurrily::Map.load('/var/db/data.trigrams')
|
96
|
+
|
97
|
+
|
98
|
+
## Caveats
|
99
|
+
|
100
|
+
### Diacritics, non-latin languages
|
101
|
+
|
102
|
+
Blurrily forms trigrams from the 26 latin letters and a stop character (used
|
103
|
+
to model start-of-string and separation between words in multi-word
|
104
|
+
strings).
|
105
|
+
|
106
|
+
This means that case and diacritics are completely ignored by Blurrily. For
|
107
|
+
instance, *Puy-de-Dôme* is strictly equivalent to *puy de dome*.
|
108
|
+
|
109
|
+
It also means that any non-latin input will probably result in garbage data
|
110
|
+
and garbage results (although it won't crash).
|
111
|
+
|
112
|
+
### Multi-word needles and edge stickiness.
|
113
|
+
|
114
|
+
Multi-word needles (say, *New York*) are supported.
|
115
|
+
|
116
|
+
The engine always favours matches that begin and end similarly to the
|
117
|
+
needle, with a bias to the beginning of the strings.
|
118
|
+
|
119
|
+
This is because internally, the string *New York* is turned into this
|
120
|
+
sequence of trigrams: `**n`, `*ne`, `new`, `ew*`, `w*y`, `*yo`, `yor`,
|
121
|
+
`ork`, `rk*`.
|
122
|
+
|
123
|
+
## Production notes
|
124
|
+
|
125
|
+
### Memory usage
|
126
|
+
|
127
|
+
Blurrily does not store your original strings but rather a flat map of
|
128
|
+
references and weights for each trigram in your input strings.
|
129
|
+
|
130
|
+
In practice any database will use up a base 560KB for the index header, plus
|
131
|
+
128 bits per trigram.
|
132
|
+
|
133
|
+
As a rule of thumb, ideal memory usage is 40MB + 8 times the size of your
|
134
|
+
input data, and 50% extra on top during bulk imports (lots of writes to the
|
135
|
+
database).
|
136
|
+
|
137
|
+
For instance, `/usr/share/dict/words` is a list of 235k English words, and
|
138
|
+
weighs 2.5MB. Importing the whole list uses up 75MB of memory, 51MB of which
|
139
|
+
are the database.
|
140
|
+
|
141
|
+
Note that once a database has been written to disk and loaded from disk,
|
142
|
+
memory usage is minimal (560KB per database) as the database file is memory
|
143
|
+
mapped. For performance you do need as much free memory as the database
|
144
|
+
size.
|
145
|
+
|
146
|
+
### Disk usage
|
147
|
+
|
148
|
+
Disk usage is almost exactly like memory usage, since database files are
|
149
|
+
nothing more than a memory dump.
|
150
|
+
|
151
|
+
In the `/usr/share/dict/words` example, on-disk size is 51MB.
|
152
|
+
For the whole list of Geonames places, on-disk size is 1.1GB.
|
153
|
+
|
154
|
+
### Read v write
|
155
|
+
|
156
|
+
Writing to blurrily (with `#put`) is fairly expensive—it's a search engine
|
157
|
+
after all, optimized for intensive reads.
|
158
|
+
|
159
|
+
Supporting writes means the engine needs to keep a hash table of all
|
160
|
+
references around, typically weighing 50% of your total input. This is built
|
161
|
+
lazily while writing however; so if you load a database from disk and only
|
162
|
+
ever read, you will not incur the memory penalty.
|
163
|
+
|
164
|
+
### Saving & backing up
|
165
|
+
|
166
|
+
Blurrily saves atomically (writing to a separate file, then using rename(2)
|
167
|
+
to overwrite the old file), meaning you should never lose data.
|
168
|
+
|
169
|
+
The server does this for you every 60 seconds and when quitting. If using
|
170
|
+
`Blurrily::Map` directly, remember that a map loaded from disk is more
|
171
|
+
memory efficient than a map in memory, so if your workload is read-heavy,
|
172
|
+
you should `.load` after each `#save`.
|
173
|
+
|
174
|
+
Backing up comes with a caveat: database files are only portable across
|
175
|
+
architectures if endianness and pointer size are the same (tested between
|
176
|
+
darwin-x86_64 and linux-amd64).
|
177
|
+
|
178
|
+
Database files are very compressible; `bzip2` typically shrinks them to 20%
|
179
|
+
of their original size.
|
180
|
+
|
181
|
+
|
182
|
+
## Benchmarks
|
183
|
+
|
184
|
+
Blurrily is wicked fast, often 100x faster than its ancestor,
|
185
|
+
[fuzzily](http://github.com/mezis/fuzzily). This is because it's a close-to-
|
186
|
+
the-metal, single-purpose index using almost exclusively libc primitives. On
|
187
|
+
the inside the only expensive operations it performs are
|
188
|
+
|
189
|
+
- memcpy(2) lots of data around (selection);
|
190
|
+
- mergesort(3) to aggregate/count similar entries (reduction);
|
191
|
+
- qsort(3) to order by counts (sort).
|
192
|
+
|
193
|
+
It tends to be faster with large datasets on BSD than on Linux because the
|
194
|
+
former has fast quicksort and mergesort, whereas the latter only has `qsort`,
|
195
|
+
a slower, catch-all sorter. In complexity terms this is because FIND tends
|
196
|
+
to be *O(n)* on BSD and *O(n ln n)* on Linux.
|
197
|
+
|
198
|
+
Enough talk, here are the graphs. The `LOAD` and `PUT` operations are O(1)
|
199
|
+
and take respectively ~10ms and ~100µs on any platform, so they aren't
|
200
|
+
graphed here.
|
201
|
+
|
202
|
+
- [FIND latency](/doc/bench-find.png)
|
203
|
+
- [SAVE latency](/doc/bench-save.png)
|
204
|
+
- [DELETE latency](/doc/bench-delete.png)
|
205
|
+
|
206
|
+
|
207
|
+
## Contributing
|
208
|
+
|
209
|
+
1. Fork it
|
210
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
211
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
212
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
213
|
+
5. Create new Pull Request
|
data/bin/blurrily
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Command-line launcher: parses the options below and starts a
# Blurrily::Server bound to the requested host, port and working directory.
$PROGRAM_NAME = 'blurrily'

require "blurrily/server"
require 'optparse'
require 'ostruct'

options = OpenStruct.new

# Defaults
options.port      = 12021
options.directory = '.'
options.host      = '0.0.0.0'

parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{$PROGRAM_NAME} [options]"

  opts.on("-p", "--port <PORT>", "Bind to PORT, defaults to 12021") do |port|
    # The whole argument must be digits. The previous
    # `puts ... and exit unless port =~ /\d+/` never exited (puts returns nil,
    # short-circuiting `and`) and accepted values such as "12ab".
    unless port =~ /\A\d+\z/
      puts 'Port has to be numeric value'
      exit 1
    end
    options.port = port.to_i
  end

  opts.on("-d", "--directory <DIRECTORY>", "Work in DIRECTORY, defaults to .") do |directory|
    options.directory = directory
  end

  opts.on("-b", "--bind <ADDRESS>", "Bind to ADDRESS, defaults to 0.0.0.0") do |address|
    options.host = address || '0.0.0.0'
  end

  opts.on("-V", "--version", "Output version") do
    puts Blurrily::VERSION
    exit
  end

  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end

parser.parse!(ARGV)
Blurrily::Server.new(:host => options.host, :port => options.port, :directory => options.directory).start
|
@@ -0,0 +1,21 @@
|
|
1
|
+
/*

  blurrily.h --

  Helper macros

*/

#ifndef __BLURRILY_H__
#define __BLURRILY_H__ 1

/* Attribute for struct declarations: lay the members out without padding. */
#define BR_PACKED_STRUCT __attribute__ ((__packed__))

/* Wraps an identifier so the compiler does not warn when it is never used. */
#define UNUSED(_IDENT) _IDENT __attribute__ ((unused))

/*
  LOG(...) prints printf-style diagnostics to stderr when DEBUG is defined,
  and expands to nothing otherwise.
*/
#ifdef DEBUG
  /* LOG expands to fprintf/stderr, so make the header self-contained. */
  #include <stdio.h>
  #define LOG(...) fprintf(stderr, __VA_ARGS__)
#else
  #define LOG(...)
#endif

#endif
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'mkmf'

# Upper-cased output of `uname`; it is embedded in a -DPLATFORM_<NAME> define.
PLATFORM = `uname`.strip.upcase

# Compiler flags shared by the production and development configurations.
SHARED_FLAGS = "-DPLATFORM_#{PLATFORM} --std=c99 -Wall -Wextra"

if PLATFORM == 'LINUX'
  [
    ' -D_XOPEN_SOURCE=700',    # make sure ftruncate is available
    ' -D_GNU_SOURCE=1',
    ' -D_FILE_OFFSET_BITS=64'  # make sure off_t is 64 bit long
  ].each { |flag| SHARED_FLAGS << flag }
end

# production
$CFLAGS += " #{SHARED_FLAGS} -Os"

# development
# $CFLAGS += " #{SHARED_FLAGS} -O0 -g"

create_makefile('blurrily/map_ext')
|
@@ -0,0 +1,230 @@
|
|
1
|
+
#include <ruby.h>
|
2
|
+
#include <assert.h>
|
3
|
+
#include "storage.h"
|
4
|
+
#include "blurrily.h"
|
5
|
+
|
6
|
+
static VALUE eClosedError = Qnil;
|
7
|
+
static VALUE eBlurrilyModule = Qnil;
|
8
|
+
|
9
|
+
/******************************************************************************/
|
10
|
+
|
11
|
+
/*
 * Guard called at the top of the instance methods.
 *
 * Reads the receiver's @closed instance variable; when it is Qtrue a
 * ClosedError ("Map was freed") is raised. Returns 0 when the map is not
 * marked as closed; the value 1 follows the raise on the closed path.
 */
static int raise_if_closed(VALUE self)
{
  const VALUE closed = rb_ivar_get(self, rb_intern("@closed"));

  if (closed == Qtrue) {
    rb_raise(eClosedError, "Map was freed");
    return 1;
  }
  return 0;
}
|
17
|
+
|
18
|
+
/* Sets the receiver's @closed instance variable to true. */
static void mark_as_closed(VALUE self)
{
  const ID closed_ivar = rb_intern("@closed");

  rb_ivar_set(self, closed_ivar, Qtrue);
}
|
22
|
+
|
23
|
+
/******************************************************************************/
|
24
|
+
|
25
|
+
static void blurrily_free(void* haystack)
|
26
|
+
{
|
27
|
+
int res = -1;
|
28
|
+
|
29
|
+
if (haystack == NULL) return;
|
30
|
+
res = blurrily_storage_close((trigram_map*) &haystack);
|
31
|
+
assert(res >= 0);
|
32
|
+
}
|
33
|
+
|
34
|
+
/******************************************************************************/
|
35
|
+
|
36
|
+
static void blurrily_mark(void* haystack)
|
37
|
+
{
|
38
|
+
if (haystack == NULL) return;
|
39
|
+
blurrily_storage_mark((trigram_map) haystack);
|
40
|
+
}
|
41
|
+
|
42
|
+
/******************************************************************************/
|
43
|
+
|
44
|
+
/*
 * RawMap.new -- creates an empty map with blurrily_storage_new, wraps it in a
 * Ruby object of the given class and runs #initialize on it.
 *
 * Calls rb_sys_fail when the storage layer reports an error (negative result).
 */
static VALUE blurrily_new(VALUE class) {
  trigram_map haystack = (trigram_map)NULL;
  VALUE       wrapper  = Qnil;

  if (blurrily_storage_new(&haystack) < 0) {
    rb_sys_fail(NULL);
    return Qnil;
  }

  wrapper = Data_Wrap_Struct(class, blurrily_mark, blurrily_free, (void*)haystack);
  rb_obj_call_init(wrapper, 0, NULL);
  return wrapper;
}
|
56
|
+
|
57
|
+
/******************************************************************************/
|
58
|
+
|
59
|
+
/*
 * RawMap.load(path) -- loads a map from the file at `path` with
 * blurrily_storage_load, wraps it in a Ruby object of the given class and
 * runs #initialize on it.
 *
 * The path is converted with StringValueCStr, so a path containing an
 * embedded NUL byte raises ArgumentError instead of being silently truncated
 * when it is handed to the C storage layer.
 *
 * Calls rb_sys_fail when the storage layer reports an error (negative result).
 */
static VALUE blurrily_load(VALUE class, VALUE rb_path) {
  char*       path     = StringValueCStr(rb_path);
  VALUE       wrapper  = Qnil;
  trigram_map haystack = (trigram_map)NULL;
  int         res      = -1;

  res = blurrily_storage_load(&haystack, path);
  if (res < 0) { rb_sys_fail(NULL); return Qnil; }

  wrapper = Data_Wrap_Struct(class, blurrily_mark, blurrily_free, (void*)haystack);
  rb_obj_call_init(wrapper, 0, NULL);
  return wrapper;
}
|
72
|
+
|
73
|
+
/******************************************************************************/
|
74
|
+
|
75
|
+
/* RawMap#initialize -- takes no arguments, does nothing and returns true. */
static VALUE blurrily_initialize(VALUE UNUSED(self)) {
  const VALUE result = Qtrue;

  return result;
}
|
78
|
+
|
79
|
+
/******************************************************************************/
|
80
|
+
|
81
|
+
/*
 * RawMap#put(needle, reference, weight) -- stores `needle` under the unsigned
 * integer `reference` with the unsigned integer `weight`.
 *
 * Raises ClosedError when the map has been closed. Returns the (non-negative)
 * result of blurrily_storage_put as a Ruby integer.
 */
static VALUE blurrily_put(VALUE self, VALUE rb_needle, VALUE rb_reference, VALUE rb_weight) {
  trigram_map haystack  = (trigram_map)NULL;
  /* Arguments are converted before the closed check, in declaration order. */
  char*       needle    = StringValuePtr(rb_needle);
  uint32_t    reference = NUM2UINT(rb_reference);
  uint32_t    weight    = NUM2UINT(rb_weight);
  int         status    = -1;

  if (raise_if_closed(self)) {
    return Qnil;
  }
  Data_Get_Struct(self, struct trigram_map_t, haystack);

  status = blurrily_storage_put(haystack, needle, reference, weight);
  assert(status >= 0);

  return INT2NUM(status);
}
|
96
|
+
|
97
|
+
/******************************************************************************/
|
98
|
+
|
99
|
+
/*
 * RawMap#delete(reference) -- removes the entries stored under the unsigned
 * integer `reference`.
 *
 * Raises ClosedError when the map has been closed. Returns the (non-negative)
 * result of blurrily_storage_delete as a Ruby integer.
 */
static VALUE blurrily_delete(VALUE self, VALUE rb_reference) {
  trigram_map haystack  = (trigram_map)NULL;
  uint32_t    reference = NUM2UINT(rb_reference);
  int         status    = -1;

  if (raise_if_closed(self)) {
    return Qnil;
  }
  Data_Get_Struct(self, struct trigram_map_t, haystack);

  status = blurrily_storage_delete(haystack, reference);
  assert(status >= 0);

  return INT2NUM(status);
}
|
112
|
+
|
113
|
+
/******************************************************************************/
|
114
|
+
|
115
|
+
/*
 * RawMap#save(path) -- writes the map to the file at `path` with
 * blurrily_storage_save.
 *
 * The path is converted with StringValueCStr, so a path containing an
 * embedded NUL byte raises ArgumentError instead of being silently truncated
 * when it is handed to the C storage layer.
 *
 * Raises ClosedError when the map has been closed, and calls rb_sys_fail when
 * the storage layer reports an error (negative result). Returns nil.
 */
static VALUE blurrily_save(VALUE self, VALUE rb_path) {
  trigram_map haystack = (trigram_map)NULL;
  int         res      = -1;
  const char* path     = StringValueCStr(rb_path);

  if (raise_if_closed(self)) return Qnil;
  Data_Get_Struct(self, struct trigram_map_t, haystack);

  res = blurrily_storage_save(haystack, path);
  if (res < 0) rb_sys_fail(NULL);

  return Qnil;
}
|
128
|
+
|
129
|
+
/******************************************************************************/
|
130
|
+
|
131
|
+
/*
 * RawMap#find(needle, limit) -- searches the map for `needle`.
 *
 * A `limit` that converts to zero or less is replaced by
 * Blurrily::LIMIT_DEFAULT. Returns a Ruby array with one
 * [reference, matches, weight] triple per result reported by
 * blurrily_storage_find.
 *
 * Raises ClosedError when the map has been closed, and NoMemoryError when the
 * temporary result buffer cannot be allocated.
 */
static VALUE blurrily_find(VALUE self, VALUE rb_needle, VALUE rb_limit) {
  trigram_map   haystack   = (trigram_map)NULL;
  int           res        = -1;
  const char*   needle     = StringValuePtr(rb_needle);
  int           limit      = NUM2UINT(rb_limit);
  trigram_match matches    = NULL;
  VALUE         rb_matches = Qnil;

  if (raise_if_closed(self)) return Qnil;
  Data_Get_Struct(self, struct trigram_map_t, haystack);

  if (limit <= 0) {
    rb_limit = rb_const_get(eBlurrilyModule, rb_intern("LIMIT_DEFAULT"));
    limit = NUM2UINT(rb_limit);
  }
  /* Still nothing to return: avoid a zero-sized or wrapped allocation. */
  if (limit <= 0) return rb_ary_new();

  matches = (trigram_match) malloc((size_t)limit * sizeof(trigram_match_t));
  if (matches == NULL) rb_memerror();

  res = blurrily_storage_find(haystack, needle, limit, matches);
  assert(res >= 0);

  /* wrap the matches into a Ruby array */
  rb_matches = rb_ary_new();
  for (int k = 0; k < res; ++k) {
    VALUE rb_match = rb_ary_new();
    rb_ary_push(rb_match, rb_uint_new(matches[k].reference));
    rb_ary_push(rb_match, rb_uint_new(matches[k].matches));
    rb_ary_push(rb_match, rb_uint_new(matches[k].weight));
    rb_ary_push(rb_matches, rb_match);
  }

  /* The results now live in Ruby objects; release the scratch buffer
     (it was previously leaked on every call). */
  free(matches);
  return rb_matches;
}
|
163
|
+
|
164
|
+
|
165
|
+
/******************************************************************************/
|
166
|
+
|
167
|
+
/*
 * RawMap#stats -- returns a hash with the :references and :trigrams counters
 * filled in by blurrily_storage_stats.
 *
 * Raises ClosedError when the map has been closed.
 */
static VALUE blurrily_stats(VALUE self)
{
  trigram_map    haystack = (trigram_map)NULL;
  trigram_stat_t counters;
  VALUE          summary  = rb_hash_new();
  int            status   = -1;

  if (raise_if_closed(self)) {
    return Qnil;
  }
  Data_Get_Struct(self, struct trigram_map_t, haystack);

  status = blurrily_storage_stats(haystack, &counters);
  assert(status >= 0);

  {
    const VALUE references_key = ID2SYM(rb_intern("references"));
    (void) rb_hash_aset(summary, references_key, UINT2NUM(counters.references));
  }
  {
    const VALUE trigrams_key = ID2SYM(rb_intern("trigrams"));
    (void) rb_hash_aset(summary, trigrams_key, UINT2NUM(counters.trigrams));
  }

  return summary;
}
|
185
|
+
|
186
|
+
/******************************************************************************/
|
187
|
+
|
188
|
+
/*
 * RawMap#close -- closes the underlying storage, clears the wrapped data
 * pointer and marks the object as closed so later calls raise ClosedError.
 *
 * Raises ClosedError when the map is already closed, and calls rb_sys_fail
 * when blurrily_storage_close reports an error (negative result). Returns nil.
 */
static VALUE blurrily_close(VALUE self)
{
  trigram_map haystack = (trigram_map)NULL;

  if (raise_if_closed(self)) {
    return Qnil;
  }
  Data_Get_Struct(self, struct trigram_map_t, haystack);

  if (blurrily_storage_close(&haystack) < 0) {
    rb_sys_fail(NULL);
  }

  DATA_PTR(self) = NULL;
  mark_as_closed(self);
  return Qnil;
}
|
203
|
+
|
204
|
+
/******************************************************************************/
|
205
|
+
|
206
|
+
void Init_map_ext(void) {
|
207
|
+
VALUE klass = Qnil;
|
208
|
+
|
209
|
+
/* assume we haven't yet defined blurrily */
|
210
|
+
eBlurrilyModule = rb_define_module("Blurrily");
|
211
|
+
assert(eBlurrilyModule != Qnil);
|
212
|
+
|
213
|
+
klass = rb_define_class_under(eBlurrilyModule, "RawMap", rb_cObject);
|
214
|
+
assert(klass != Qnil);
|
215
|
+
|
216
|
+
eClosedError = rb_define_class_under(klass, "ClosedError", rb_eRuntimeError);
|
217
|
+
assert(klass != Qnil);
|
218
|
+
|
219
|
+
rb_define_singleton_method(klass, "new", blurrily_new, 0);
|
220
|
+
rb_define_singleton_method(klass, "load", blurrily_load, 1);
|
221
|
+
|
222
|
+
rb_define_method(klass, "initialize", blurrily_initialize, 0);
|
223
|
+
rb_define_method(klass, "put", blurrily_put, 3);
|
224
|
+
rb_define_method(klass, "delete", blurrily_delete, 1);
|
225
|
+
rb_define_method(klass, "save", blurrily_save, 1);
|
226
|
+
rb_define_method(klass, "find", blurrily_find, 2);
|
227
|
+
rb_define_method(klass, "stats", blurrily_stats, 0);
|
228
|
+
rb_define_method(klass, "close", blurrily_close, 0);
|
229
|
+
return;
|
230
|
+
}
|