vinted-blurrily 1.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE.txt +22 -0
- data/README.md +213 -0
- data/bin/blurrily +43 -0
- data/ext/blurrily/blurrily.h +21 -0
- data/ext/blurrily/extconf.rb +21 -0
- data/ext/blurrily/map_ext.c +230 -0
- data/ext/blurrily/search_tree.c +66 -0
- data/ext/blurrily/search_tree.h +30 -0
- data/ext/blurrily/storage.c +629 -0
- data/ext/blurrily/storage.h +119 -0
- data/ext/blurrily/tokeniser.c +126 -0
- data/ext/blurrily/tokeniser.h +46 -0
- data/lib/blurrily.rb +1 -0
- data/lib/blurrily/client.rb +136 -0
- data/lib/blurrily/command_processor.rb +53 -0
- data/lib/blurrily/defaults.rb +10 -0
- data/lib/blurrily/map.rb +49 -0
- data/lib/blurrily/map_group.rb +39 -0
- data/lib/blurrily/server.rb +49 -0
- data/lib/blurrily/version.rb +3 -0
- metadata +280 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: b9c8978b87d66ef94646d643c471b745c84b2489
|
4
|
+
data.tar.gz: 73c63bbd8614d452ee27c16ff3fa5371b8a48a21
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: bccb29b9b6665b9e8606ee483b434e2873815a804faf2d0bd800714c9af2b02f3156694b29cd5f66e9e9173b28adc6119703ee14503e5843c18849b0c47c78cb
|
7
|
+
data.tar.gz: 59bcd0faf278887485033c26a19757ad5d15c01de468e94c27b2115cc04f4fcb65280b0fba0057cb8a28bc2f902dbc2a9f879a50e9101464e07f133c1eed0981
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 HouseTrip Ltd.
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,213 @@
|
|
1
|
+
# Blurrily — Millisecond fuzzy string matching
|
2
|
+
|
3
|
+
This is a fork made solely for the purpose of adding Ruby 2.4 support and making it available in RubyGems.
|
4
|
+
|
5
|
+
[![Gem Version](https://badge.fury.io/rb/blurrily.svg)](http://badge.fury.io/rb/blurrily)
|
6
|
+
[![Build Status](https://travis-ci.org/mezis/blurrily.svg?branch=master)](https://travis-ci.org/mezis/blurrily)
|
7
|
+
[![Dependency Status](https://gemnasium.com/mezis/blurrily.svg)](https://gemnasium.com/mezis/blurrily)
|
8
|
+
[![Code Climate](https://codeclimate.com/github/mezis/blurrily.svg)](https://codeclimate.com/github/mezis/blurrily)
|
9
|
+
[![Coverage Status](https://coveralls.io/repos/mezis/blurrily/badge.png)](https://coveralls.io/r/mezis/blurrily)
|
10
|
+
|
11
|
+
> Show me photos of **Marakech** !
|
12
|
+
>
|
13
|
+
> Here are some photos of **Marrakesh**, Morocco.
|
14
|
+
> Did you mean **Martanesh**, Albania, **Marakkanam**, India, or **Marasheshty**, Romania?
|
15
|
+
|
16
|
+
Blurrily finds misspelled, prefix, or partial needles in a haystack of
|
17
|
+
strings, quickly. It scales well: its response time is typically 1-2ms on
|
18
|
+
user-input datasets and 75-100ms on pathological datasets
|
19
|
+
([more](#benchmarks)).
|
20
|
+
|
21
|
+
Blurrily is compatible and tested with all MRI Rubies from 1.9.3 to 2.2.0.
|
22
|
+
It is tested on Linux 2.6 (32bit and 64bit) and MacOS X 10.8.
|
23
|
+
|
24
|
+
Blurrily uses a tweaked [trigram](http://en.wikipedia.org/wiki/N-gram)-based
|
25
|
+
approach to find good matches. If you're using ActiveRecord and looking for
|
26
|
+
a lightweight (albeit much slower), in-process, Rails-friendly version of
|
27
|
+
this, check out [fuzzily](http://github.com/mezis/fuzzily), a Ruby gem to
|
28
|
+
perform fuzzy text searching in ActiveRecord.
|
29
|
+
|
30
|
+
|
31
|
+
## Installation
|
32
|
+
|
33
|
+
Add this line to your application's Gemfile:
|
34
|
+
|
35
|
+
gem 'blurrily'
|
36
|
+
|
37
|
+
Or install it yourself as:
|
38
|
+
|
39
|
+
$ gem install blurrily
|
40
|
+
|
41
|
+
## Docker
|
42
|
+
|
43
|
+
You can optionally run [Blurrily as a Docker Container](https://github.com/mrmattwright/docker-blurrily). Maintained by [MrMattWright](https://github.com/mrmattwright).
|
44
|
+
|
45
|
+
## Usage
|
46
|
+
|
47
|
+
You can use blurrily as a client/server combination (recommended in
|
48
|
+
production), or use the internals standalone.
|
49
|
+
|
50
|
+
See the [API Documentation](http://rubydoc.info/github/mezis/blurrily/frames)
|
51
|
+
for more details.
|
52
|
+
|
53
|
+
### Client/server
|
54
|
+
|
55
|
+
Fire up a blurrily server:
|
56
|
+
|
57
|
+
$ blurrily
|
58
|
+
|
59
|
+
Open up a console and connect:
|
60
|
+
|
61
|
+
$ irb -rubygems
|
62
|
+
> require 'blurrily/client'
|
63
|
+
> client = Blurrily::Client.new
|
64
|
+
|
65
|
+
Store a needle with a reference:
|
66
|
+
|
67
|
+
> client.put('London', 1337)
|
68
|
+
|
69
|
+
Recover a reference from the haystack:
|
70
|
+
|
71
|
+
> client.find('lonndon')
|
72
|
+
#=> [1337]
|
73
|
+
|
74
|
+
### Standalone
|
75
|
+
|
76
|
+
Create the in-memory database:
|
77
|
+
|
78
|
+
> map = Blurrily::Map.new
|
79
|
+
|
80
|
+
Store a needle with a reference:
|
81
|
+
|
82
|
+
> map.put('London', 1337)
|
83
|
+
|
84
|
+
Recover a reference from the haystack:
|
85
|
+
|
86
|
+
> map.find('lonndon')
|
87
|
+
#=> [1337]
|
88
|
+
|
89
|
+
Save the database to disk:
|
90
|
+
|
91
|
+
> map.save('/var/db/data.trigrams')
|
92
|
+
|
93
|
+
Load a previously saved database:
|
94
|
+
|
95
|
+
> map = Blurrily::Map.load('/var/db/data.trigrams')
|
96
|
+
|
97
|
+
|
98
|
+
## Caveats
|
99
|
+
|
100
|
+
### Diacritics, non-latin languages
|
101
|
+
|
102
|
+
Blurrily forms trigrams from the 26 latin letters and a stop character (used
|
103
|
+
to model start-of-string and separation between words in multi-word
|
104
|
+
strings).
|
105
|
+
|
106
|
+
This means that case and diacritics are completely ignored by Blurrily. For
|
107
|
+
instance, *Puy-de-Dôme* is strictly equivalent to *puy de dome*.
|
108
|
+
|
109
|
+
It also means that any non-latin input will probably result in garbage data
|
110
|
+
and garbage results (although it won't crash).
|
111
|
+
|
112
|
+
### Multi-word needles and edge stickiness.
|
113
|
+
|
114
|
+
Multi-word needles (say, *New York*) are supported.
|
115
|
+
|
116
|
+
The engine always favours matches that begin and end similarly to the
|
117
|
+
needle, with a bias to the beginning of the strings.
|
118
|
+
|
119
|
+
This is because internally, the string *New York* is turned into this
|
120
|
+
sequence of trigrams: `**n`, `*ne`, `new`, `ew*`, `w*y`, `*yo`, `yor`,
|
121
|
+
`ork`, `rk*`.
|
122
|
+
|
123
|
+
## Production notes
|
124
|
+
|
125
|
+
### Memory usage
|
126
|
+
|
127
|
+
Blurrily does not store your original strings but rather a flat map of
|
128
|
+
references and weights for each trigram in your input strings.
|
129
|
+
|
130
|
+
In practice any database will use up a base 560KB for the index header, plus
|
131
|
+
128 bits per trigram.
|
132
|
+
|
133
|
+
As a rule of thumb, ideal memory usage is 40MB + 8 times the size of your
|
134
|
+
input data, and 50% extra on top during bulk imports (lots of writes to the
|
135
|
+
database).
|
136
|
+
|
137
|
+
For instance, `/usr/share/dict/words` is a list of 235k English words, and
|
138
|
+
weighs 2.5MB. Importing the whole list uses up 75MB of memory, 51MB of which
|
139
|
+
are the database.
|
140
|
+
|
141
|
+
Note that once a database has been written to disk and loaded from disk,
|
142
|
+
memory usage is minimal (560KB per database) as the database file is memory
|
143
|
+
mapped. For performance you do need as much free memory as the database
|
144
|
+
size.
|
145
|
+
|
146
|
+
### Disk usage
|
147
|
+
|
148
|
+
Disk usage is almost exactly like memory usage, since database files are
|
149
|
+
nothing more than a memory dump.
|
150
|
+
|
151
|
+
In the `/usr/share/dict/words` example, on-disk size is 51MB.
|
152
|
+
For the whole list of Geonames places, on-disk size is 1.1GB.
|
153
|
+
|
154
|
+
### Read v write
|
155
|
+
|
156
|
+
Writing to blurrily (with `#put`) is fairly expensive—it's a search engine
|
157
|
+
after all, optimized for intensive reads.
|
158
|
+
|
159
|
+
Supporting writes means the engine needs to keep a hash table of all
|
160
|
+
references around, typically weighing 50% of your total input. This is built
|
161
|
+
lazily while writing however; so if you load a database from disk and only
|
162
|
+
ever read, you will not incur the memory penalty.
|
163
|
+
|
164
|
+
### Saving & backing up
|
165
|
+
|
166
|
+
Blurrily saves atomically (writing to a separate file, then using rename(2)
|
167
|
+
to overwrite the old file), meaning you should never lose data.
|
168
|
+
|
169
|
+
The server does this for you every 60 seconds and when quitting. If using
|
170
|
+
`Blurrily::Map` directly, remember that a map loaded from disk is more
|
171
|
+
memory efficient than a map in memory, so if your workload is read-heavy,
|
172
|
+
you should `.load` after each `#save`.
|
173
|
+
|
174
|
+
Backing up comes with a caveat: database files are only portable across
|
175
|
+
architectures if endianness and pointer size are the same (tested between
|
176
|
+
darwin-x86_64 and linux-amd64).
|
177
|
+
|
178
|
+
Database files are very compressible; `bzip2` typically shrinks them to 20%
|
179
|
+
of their original size.
|
180
|
+
|
181
|
+
|
182
|
+
## Benchmarks
|
183
|
+
|
184
|
+
Blurrily is wicked fast, often 100x faster than its ancestor,
|
185
|
+
[fuzzily](http://github.com/mezis/fuzzily). This is because it's a close-to-
|
186
|
+
the-metal, single-purpose index using almost exclusively libc primitives. On
|
187
|
+
the inside the only expensive operations it performs are
|
188
|
+
|
189
|
+
- memcpy(2) lots of data around (selection);
|
190
|
+
- mergesort(3) to aggregate/count similar entries (reduction);
|
191
|
+
- qsort(3) to order by counts (sort).
|
192
|
+
|
193
|
+
It tends to be faster with large datasets on BSD than on Linux because the
|
194
|
+
former has fast quicksort and mergesort, whereas the latter only has `qsort`,
|
195
|
+
a slower, catch-all sorter. In complexity terms this is because FIND tends
|
196
|
+
to be *O(n)* on BSD and *O(n ln n)* on Linux.
|
197
|
+
|
198
|
+
Enough talk, here are the graphs. The `LOAD` and `PUT` operations are O(1)
|
199
|
+
and take respectively ~10ms and ~100µs on any platform, so they aren't
|
200
|
+
graphed here.
|
201
|
+
|
202
|
+
- [FIND latency](/doc/bench-find.png)
|
203
|
+
- [SAVE latency](/doc/bench-save.png)
|
204
|
+
- [DELETE latency](/doc/bench-delete.png)
|
205
|
+
|
206
|
+
|
207
|
+
## Contributing
|
208
|
+
|
209
|
+
1. Fork it
|
210
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
211
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
212
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
213
|
+
5. Create new Pull Request
|
data/bin/blurrily
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Command-line launcher for the Blurrily server: parses the port, working
# directory and bind address from ARGV, then starts Blurrily::Server.
$PROGRAM_NAME = 'blurrily'

require "blurrily/server"
require 'optparse'
require 'ostruct'

options = OpenStruct.new

# Defaults
options.port      = 12021
options.directory = '.'
options.host      = '0.0.0.0'

parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{$PROGRAM_NAME} [options]"

  opts.on("-p", "--port <PORT>", "Bind to PORT, defaults to 12021") do |port|
    # The whole argument must be digits. `puts` returns nil, so the previous
    # `puts ... and exit` form never reached `exit`, and the unanchored
    # regexp accepted values such as "abc1".
    unless port =~ /\A\d+\z/
      puts 'Port has to be numeric value'
      exit 1
    end
    options.port = port.to_i
  end

  opts.on("-d", "--directory <DIRECTORY>", "Work in DIRECTORY, defaults to .") do |directory|
    options.directory = directory
  end

  opts.on("-b", "--bind <ADDRESS>", "Bind to ADDRESS, defaults to 0.0.0.0") do |address|
    options.host = address || '0.0.0.0'
  end

  opts.on("-V", "--version", "Output version") do
    puts Blurrily::VERSION
    exit
  end

  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end

parser.parse!(ARGV)
Blurrily::Server.new(:host => options.host, :port => options.port, :directory => options.directory).start
|
@@ -0,0 +1,21 @@
|
|
1
|
+
/*

  blurrily.h --

  Helper macros

*/

#ifndef __BLURRILY_H__
#define __BLURRILY_H__ 1

/* Struct attribute requesting a packed layout (GCC/Clang attribute syntax). */
#define BR_PACKED_STRUCT __attribute__ ((__packed__))

/* Wraps an identifier in a declaration to mark it as intentionally unused. */
#define UNUSED(_IDENT) _IDENT __attribute__ ((unused))

/*
  LOG(...) takes printf-style arguments. It prints to stderr when DEBUG is
  defined and expands to nothing otherwise.
*/
#ifdef DEBUG
  /* LOG expands to fprintf, so pull in its declaration here rather than
     relying on every includer having done so. */
  #include <stdio.h>
  #define LOG(...) fprintf(stderr, __VA_ARGS__)
#else
  #define LOG(...)
#endif

#endif
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'mkmf'

# Build configuration for the native extension: derives compiler flags from
# the output of `uname` and generates the Makefile for 'blurrily/map_ext'.
PLATFORM = `uname`.strip.upcase
SHARED_FLAGS = "-DPLATFORM_#{PLATFORM} --std=c99 -Wall -Wextra"

if PLATFORM == 'LINUX'
  # make sure ftruncate is available
  SHARED_FLAGS << ' -D_XOPEN_SOURCE=700' << ' -D_GNU_SOURCE=1'
  # make sure off_t is 64 bit long
  SHARED_FLAGS << ' -D_FILE_OFFSET_BITS=64'
end

# production
$CFLAGS += " #{SHARED_FLAGS} -Os"

# development
# $CFLAGS += " #{SHARED_FLAGS} -O0 -g"

create_makefile('blurrily/map_ext')
|
@@ -0,0 +1,230 @@
|
|
1
|
+
#include <ruby.h>
#include <assert.h>
#include <stdlib.h>
#include "storage.h"
#include "blurrily.h"
|
5
|
+
|
6
|
+
static VALUE eClosedError = Qnil;
|
7
|
+
static VALUE eBlurrilyModule = Qnil;
|
8
|
+
|
9
|
+
/******************************************************************************/
|
10
|
+
|
11
|
+
/*
  Guard used by the instance methods: raises eClosedError ("Map was freed")
  when the receiver's @closed instance variable is true.

  Returns 0 when the map is still usable. rb_raise does not return normally,
  so the `return 1` only exists to give the function a value on every path.
*/
static int raise_if_closed(VALUE self)
{
  const VALUE closed_flag = rb_ivar_get(self, rb_intern("@closed"));

  if (closed_flag == Qtrue) {
    rb_raise(eClosedError, "Map was freed");
    return 1;
  }
  return 0;
}
|
17
|
+
|
18
|
+
/*
  Flags the receiver as closed by setting its @closed instance variable to
  true; raise_if_closed() reads the same variable.
*/
static void mark_as_closed(VALUE self)
{
  (void) rb_ivar_set(self, rb_intern("@closed"), Qtrue);
}
|
22
|
+
|
23
|
+
/******************************************************************************/
|
24
|
+
|
25
|
+
static void blurrily_free(void* haystack)
|
26
|
+
{
|
27
|
+
int res = -1;
|
28
|
+
|
29
|
+
if (haystack == NULL) return;
|
30
|
+
res = blurrily_storage_close((trigram_map*) &haystack);
|
31
|
+
assert(res >= 0);
|
32
|
+
}
|
33
|
+
|
34
|
+
/******************************************************************************/
|
35
|
+
|
36
|
+
static void blurrily_mark(void* haystack)
|
37
|
+
{
|
38
|
+
if (haystack == NULL) return;
|
39
|
+
blurrily_storage_mark((trigram_map) haystack);
|
40
|
+
}
|
41
|
+
|
42
|
+
/******************************************************************************/
|
43
|
+
|
44
|
+
/*
  RawMap.new -- creates an empty map with blurrily_storage_new and wraps it
  in a Ruby object of the given class, then runs #initialize on it.

  Raises a system-call error (via rb_sys_fail) if the storage layer reports
  a failure.
*/
static VALUE blurrily_new(VALUE class) {
  trigram_map haystack = (trigram_map)NULL;
  VALUE       wrapper  = Qnil;

  if (blurrily_storage_new(&haystack) < 0) {
    rb_sys_fail(NULL);
    return Qnil;
  }

  wrapper = Data_Wrap_Struct(class, blurrily_mark, blurrily_free, (void*)haystack);
  rb_obj_call_init(wrapper, 0, NULL);
  return wrapper;
}
|
56
|
+
|
57
|
+
/******************************************************************************/
|
58
|
+
|
59
|
+
/*
  RawMap.load(path) -- loads a map from the file at path with
  blurrily_storage_load and wraps it in a Ruby object of the given class,
  then runs #initialize on it.

  Raises a system-call error (via rb_sys_fail) if loading fails.
*/
static VALUE blurrily_load(VALUE class, VALUE rb_path) {
  /* StringValueCStr guarantees a NUL-terminated buffer and rejects strings
     with embedded NUL bytes, which would silently truncate the path. */
  char*       path     = StringValueCStr(rb_path);
  VALUE       wrapper  = Qnil;
  trigram_map haystack = (trigram_map)NULL;
  int         res      = -1;

  res = blurrily_storage_load(&haystack, path);
  if (res < 0) { rb_sys_fail(NULL); return Qnil; }

  wrapper = Data_Wrap_Struct(class, blurrily_mark, blurrily_free, (void*)haystack);
  rb_obj_call_init(wrapper, 0, NULL);
  return wrapper;
}
|
72
|
+
|
73
|
+
/******************************************************************************/
|
74
|
+
|
75
|
+
/*
  RawMap#initialize -- no-op; the wrapped map is already set up by the
  singleton constructors (new / load) before this is called.
*/
static VALUE blurrily_initialize(VALUE UNUSED(self))
{
  return Qtrue;
}
|
78
|
+
|
79
|
+
/******************************************************************************/
|
80
|
+
|
81
|
+
/*
  RawMap#put(needle, reference, weight) -- stores needle in the map under
  the given reference and weight.

  Returns the (non-negative) result of blurrily_storage_put as an Integer.
  Raises ClosedError if the map has been closed.
*/
static VALUE blurrily_put(VALUE self, VALUE rb_needle, VALUE rb_reference, VALUE rb_weight) {
  trigram_map haystack  = (trigram_map)NULL;
  int         res       = -1;
  /* Convert the numbers first: NUM2UINT may run Ruby code, so the string
     pointer is taken only after that. */
  uint32_t    reference = NUM2UINT(rb_reference);
  uint32_t    weight    = NUM2UINT(rb_weight);
  /* StringValueCStr guarantees NUL termination and rejects embedded NULs. */
  char*       needle    = StringValueCStr(rb_needle);

  if (raise_if_closed(self)) return Qnil;
  Data_Get_Struct(self, struct trigram_map_t, haystack);

  res = blurrily_storage_put(haystack, needle, reference, weight);
  assert(res >= 0);

  return INT2NUM(res);
}
|
96
|
+
|
97
|
+
/******************************************************************************/
|
98
|
+
|
99
|
+
/*
  RawMap#delete(reference) -- removes the given reference from the map.

  Returns the (non-negative) result of blurrily_storage_delete as an
  Integer. Raises ClosedError if the map has been closed.
*/
static VALUE blurrily_delete(VALUE self, VALUE rb_reference) {
  trigram_map map    = (trigram_map)NULL;
  uint32_t    ref_id = NUM2UINT(rb_reference);
  int         status = -1;

  if (raise_if_closed(self)) {
    return Qnil;
  }
  Data_Get_Struct(self, struct trigram_map_t, map);

  status = blurrily_storage_delete(map, ref_id);
  assert(status >= 0);

  return INT2NUM(status);
}
|
112
|
+
|
113
|
+
/******************************************************************************/
|
114
|
+
|
115
|
+
/*
  RawMap#save(path) -- writes the map to the file at path with
  blurrily_storage_save.

  Returns nil. Raises ClosedError if the map has been closed, or a
  system-call error (via rb_sys_fail) if saving fails.
*/
static VALUE blurrily_save(VALUE self, VALUE rb_path) {
  trigram_map haystack = (trigram_map)NULL;
  int         res      = -1;
  /* StringValueCStr guarantees a NUL-terminated buffer and rejects strings
     with embedded NUL bytes, which would silently truncate the path. */
  const char* path     = StringValueCStr(rb_path);

  if (raise_if_closed(self)) return Qnil;
  Data_Get_Struct(self, struct trigram_map_t, haystack);

  res = blurrily_storage_save(haystack, path);
  if (res < 0) rb_sys_fail(NULL);

  return Qnil;
}
|
128
|
+
|
129
|
+
/******************************************************************************/
|
130
|
+
|
131
|
+
/*
  RawMap#find(needle, limit) -- looks needle up in the map.

  A limit <= 0 is replaced by the Blurrily::LIMIT_DEFAULT constant.

  Returns an Array with one [reference, matches, weight] triple per result
  reported by blurrily_storage_find. Raises ClosedError if the map has been
  closed and NoMemoryError if the result buffer cannot be allocated.
*/
static VALUE blurrily_find(VALUE self, VALUE rb_needle, VALUE rb_limit) {
  trigram_map   haystack   = (trigram_map)NULL;
  int           res        = -1;
  int           limit      = NUM2UINT(rb_limit);
  /* StringValueCStr guarantees NUL termination and rejects embedded NULs. */
  const char*   needle     = StringValueCStr(rb_needle);
  trigram_match matches    = NULL;
  VALUE         rb_matches = Qnil;

  if (raise_if_closed(self)) return Qnil;
  Data_Get_Struct(self, struct trigram_map_t, haystack);

  if (limit <= 0) {
    rb_limit = rb_const_get(eBlurrilyModule, rb_intern("LIMIT_DEFAULT"));
    limit = NUM2UINT(rb_limit);
  }

  /* calloc checks the count * size multiplication for overflow. */
  matches = calloc((size_t) limit, sizeof(*matches));
  if (matches == NULL) { rb_memerror(); return Qnil; }

  res = blurrily_storage_find(haystack, needle, limit, matches);
  assert(res >= 0);

  /* wrap the matches into a Ruby array */
  rb_matches = rb_ary_new();
  for (int k = 0; k < res; ++k) {
    VALUE rb_match = rb_ary_new();
    rb_ary_push(rb_match, rb_uint_new(matches[k].reference));
    rb_ary_push(rb_match, rb_uint_new(matches[k].matches));
    rb_ary_push(rb_match, rb_uint_new(matches[k].weight));
    rb_ary_push(rb_matches, rb_match);
  }

  /* The buffer is owned by this function and was previously leaked on every
     call. NOTE(review): assumes blurrily_storage_find only fills the buffer
     and does not keep or free it -- confirm against storage.c. */
  free(matches);
  return rb_matches;
}
|
163
|
+
|
164
|
+
|
165
|
+
/******************************************************************************/
|
166
|
+
|
167
|
+
/*
  RawMap#stats -- returns a Hash with the :references and :trigrams counters
  reported by blurrily_storage_stats.

  Raises ClosedError if the map has been closed.
*/
static VALUE blurrily_stats(VALUE self)
{
  trigram_map    map     = (trigram_map)NULL;
  trigram_stat_t counters;
  VALUE          summary = rb_hash_new();
  int            status  = -1;

  if (raise_if_closed(self)) {
    return Qnil;
  }
  Data_Get_Struct(self, struct trigram_map_t, map);

  status = blurrily_storage_stats(map, &counters);
  assert(status >= 0);

  rb_hash_aset(summary, ID2SYM(rb_intern("references")), UINT2NUM(counters.references));
  rb_hash_aset(summary, ID2SYM(rb_intern("trigrams")),   UINT2NUM(counters.trigrams));

  return summary;
}
|
185
|
+
|
186
|
+
/******************************************************************************/
|
187
|
+
|
188
|
+
/*
  RawMap#close -- closes the wrapped map with blurrily_storage_close, clears
  the object's data pointer and flags the object as closed.

  Returns nil. Raises ClosedError if the map was already closed, or a
  system-call error (via rb_sys_fail) if closing fails.
*/
static VALUE blurrily_close(VALUE self)
{
  trigram_map map = (trigram_map)NULL;

  if (raise_if_closed(self)) {
    return Qnil;
  }
  Data_Get_Struct(self, struct trigram_map_t, map);

  if (blurrily_storage_close(&map) < 0) {
    rb_sys_fail(NULL);
  }

  /* With the data pointer cleared, blurrily_free and blurrily_mark return
     early for this object. */
  DATA_PTR(self) = NULL;
  mark_as_closed(self);
  return Qnil;
}
|
203
|
+
|
204
|
+
/******************************************************************************/
|
205
|
+
|
206
|
+
void Init_map_ext(void) {
|
207
|
+
VALUE klass = Qnil;
|
208
|
+
|
209
|
+
/* assume we haven't yet defined blurrily */
|
210
|
+
eBlurrilyModule = rb_define_module("Blurrily");
|
211
|
+
assert(eBlurrilyModule != Qnil);
|
212
|
+
|
213
|
+
klass = rb_define_class_under(eBlurrilyModule, "RawMap", rb_cObject);
|
214
|
+
assert(klass != Qnil);
|
215
|
+
|
216
|
+
eClosedError = rb_define_class_under(klass, "ClosedError", rb_eRuntimeError);
|
217
|
+
assert(klass != Qnil);
|
218
|
+
|
219
|
+
rb_define_singleton_method(klass, "new", blurrily_new, 0);
|
220
|
+
rb_define_singleton_method(klass, "load", blurrily_load, 1);
|
221
|
+
|
222
|
+
rb_define_method(klass, "initialize", blurrily_initialize, 0);
|
223
|
+
rb_define_method(klass, "put", blurrily_put, 3);
|
224
|
+
rb_define_method(klass, "delete", blurrily_delete, 1);
|
225
|
+
rb_define_method(klass, "save", blurrily_save, 1);
|
226
|
+
rb_define_method(klass, "find", blurrily_find, 2);
|
227
|
+
rb_define_method(klass, "stats", blurrily_stats, 0);
|
228
|
+
rb_define_method(klass, "close", blurrily_close, 0);
|
229
|
+
return;
|
230
|
+
}
|