vinted-blurrily 1.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: b9c8978b87d66ef94646d643c471b745c84b2489
4
+ data.tar.gz: 73c63bbd8614d452ee27c16ff3fa5371b8a48a21
5
+ SHA512:
6
+ metadata.gz: bccb29b9b6665b9e8606ee483b434e2873815a804faf2d0bd800714c9af2b02f3156694b29cd5f66e9e9173b28adc6119703ee14503e5843c18849b0c47c78cb
7
+ data.tar.gz: 59bcd0faf278887485033c26a19757ad5d15c01de468e94c27b2115cc04f4fcb65280b0fba0057cb8a28bc2f902dbc2a9f879a50e9101464e07f133c1eed0981
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 HouseTrip Ltd.
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,213 @@
1
+ # Blurrily — Millisecond fuzzy string matching
2
+
3
+ This is a fork made solely for the purpose of adding Ruby 2.4 support and making it available in RubyGems.
4
+
5
+ [![Gem Version](https://badge.fury.io/rb/blurrily.svg)](http://badge.fury.io/rb/blurrily)
6
+ [![Build Status](https://travis-ci.org/mezis/blurrily.svg?branch=master)](https://travis-ci.org/mezis/blurrily)
7
+ [![Dependency Status](https://gemnasium.com/mezis/blurrily.svg)](https://gemnasium.com/mezis/blurrily)
8
+ [![Code Climate](https://codeclimate.com/github/mezis/blurrily.svg)](https://codeclimate.com/github/mezis/blurrily)
9
+ [![Coverage Status](https://coveralls.io/repos/mezis/blurrily/badge.png)](https://coveralls.io/r/mezis/blurrily)
10
+
11
+ > Show me photos of **Marakech** !
12
+ >
13
+ > Here are some photos of **Marrakesh**, Morocco.
14
+ > Did you mean **Martanesh**, Albania, **Marakkanam**, India, or **Marasheshty**, Romania?
15
+
16
+ Blurrily finds misspelled, prefix, or partial needles in a haystack of
17
+ strings, quickly. It scales well: its response time is typically 1-2ms on
18
+ user-input datasets and 75-100ms on pathological datasets
19
+ ([more](#benchmarks)).
20
+
21
+ Blurrily is compatible and tested with all MRI Rubies from 1.9.3 to 2.2.0.
22
+ It is tested on Linux 2.6 (32bit and 64bit) and MacOS X 10.8.
23
+
24
+ Blurrily uses a tweaked [trigram](http://en.wikipedia.org/wiki/N-gram)-based
25
+ approach to find good matches. If you're using ActiveRecord and looking for
26
+ a lightweight (albeit much slower), in-process, Rails-friendly version of
27
+ this, check out [fuzzily](http://github.com/mezis/fuzzily), a Ruby gem to
28
+ perform fuzzy text searching in ActiveRecord.
29
+
30
+
31
+ ## Installation
32
+
33
+ Add this line to your application's Gemfile:
34
+
35
+ gem 'blurrily'
36
+
37
+ Or install it yourself as:
38
+
39
+ $ gem install blurrily
40
+
41
+ ## Docker
42
+
43
+ You can optionally run [Blurrily as a Docker Container](https://github.com/mrmattwright/docker-blurrily). Maintained by [MrMattWright](https://github.com/mrmattwright).
44
+
45
+ ## Usage
46
+
47
+ You can use blurrily as a client/server combination (recommended in
48
+ production), or use the internals standalone.
49
+
50
+ See the [API Documentation](http://rubydoc.info/github/mezis/blurrily/frames)
51
+ for more details.
52
+
53
+ ### Client/server
54
+
55
+ Fire up a blurrily server:
56
+
57
+ $ blurrily
58
+
59
+ Open up a console and connect:
60
+
61
+ $ irb -rubygems
62
+ > require 'blurrily/client'
63
+ > client = Blurrily::Client.new
64
+
65
+ Store a needle with a reference:
66
+
67
+ > client.put('London', 1337)
68
+
69
+ Recover a reference from the haystack:
70
+
71
+ > client.find('lonndon')
72
+ #=> [1337]
73
+
74
+ ### Standalone
75
+
76
+ Create the in-memory database:
77
+
78
+ > map = Blurrily::Map.new
79
+
80
+ Store a needle with a reference:
81
+
82
+ > map.put('London', 1337)
83
+
84
+ Recover a reference from the haystack:
85
+
86
+ > map.find('lonndon')
87
+ #=> [1337]
88
+
89
+ Save the database to disk:
90
+
91
+ > map.save('/var/db/data.trigrams')
92
+
93
+ Load a previously saved database:
94
+
95
+ > map = Blurrily::Map.load('/var/db/data.trigrams')
96
+
97
+
98
+ ## Caveats
99
+
100
+ ### Diacritics, non-latin languages
101
+
102
+ Blurrily forms trigrams from the 26 latin letters and a stop character (used
103
+ to model start-of-string and separation between words in multi-word
104
+ strings).
105
+
106
+ This means that case and diacritics are completely ignored by Blurrily. For
107
+ instance, *Puy-de-Dôme* is strictly equivalent to *puy de dome*.
108
+
109
+ It also means that any non-latin input will probably result in garbage data
110
+ and garbage results (although it won't crash).
111
+
112
+ ### Multi-word needles and edge stickiness.
113
+
114
+ Multi-word needles (say, *New York*) are supported.
115
+
116
+ The engine always favours matches that begin and end similarly to the
117
+ needle, with a bias to the beginning of the strings.
118
+
119
+ This is because internally, the string *New York* is turned into this
120
+ sequence of trigrams: `**n`, `*ne`, `new`, `ew*`, `w*y`, `*yo`, `yor`,
121
+ `ork`, `rk*`.
122
+
123
+ ## Production notes
124
+
125
+ ### Memory usage
126
+
127
+ Blurrily does not store your original strings but rather a flat map of
128
+ references and weights for each trigram in your input strings.
129
+
130
+ In practice any database will use up a base 560KB for the index header, plus
131
+ 128 bits per trigram.
132
+
133
+ As a rule of thumb, ideal memory usage is 40MB + 8 times the size of your
134
+ input data, and 50% extra on top during bulk imports (lots of writes to the
135
+ database).
136
+
137
+ For instance, `/usr/share/dict/words` is a list of 235k English words, and
138
+ weighs 2.5MB. Importing the whole list uses up 75MB of memory, 51MB of which
139
+ are the database.
140
+
141
+ Note that once a database has been written to disk and loaded from disk,
142
+ memory usage is minimal (560KB per database) as the database file is memory
143
+ mapped. For performance you do need as much free memory as the database
144
+ size.
145
+
146
+ ### Disk usage
147
+
148
+ Disk usage is almost exactly like memory usage, since database files are
149
+ nothing more than a memory dump.
150
+
151
+ In the `/usr/share/dict/words` example, on-disk size is 51MB.
152
+ For the whole list of Geonames places, on-disk size is 1.1GB.
153
+
154
+ ### Read v write
155
+
156
+ Writing to blurrily (with `#put`) is fairly expensive—it's a search engine
157
+ after all, optimized for intensive reads.
158
+
159
+ Supporting writes means the engine needs to keep a hash table of all
160
+ references around, typically weighing 50% of your total input. This is built
161
+ lazily while writing however; so if you load a database from disk and only
162
+ ever read, you will not incur the memory penalty.
163
+
164
+ ### Saving & backing up
165
+
166
+ Blurrily saves atomically (writing to a separate file, then using rename(2)
167
+ to overwrite the old file), meaning you should never lose data.
168
+
169
+ The server does this for you every 60 seconds and when quitting. If using
170
+ `Blurrily::Map` directly, remember that a map loaded from disk is more
171
+ memory efficient than a map in memory, so if your workload is read-heavy,
172
+ you should `.load` after each `#save`.
173
+
174
+ Backing up comes with a caveat: database files are only portable across
175
+ architectures if endianness and pointer size are the same (tested between
176
+ darwin-x86_64 and linux-amd64).
177
+
178
+ Database files are very compressible; `bzip2` typically shrinks them to 20%
179
+ of their original size.
180
+
181
+
182
+ ## Benchmarks
183
+
184
+ Blurrily is wicked fast, often 100x faster than its ancestor,
185
+ [fuzzily](http://github.com/mezis/fuzzily). This is because it's a close-to-
186
+ the-metal, single-purpose index using almost exclusively libc primitives. On
187
+ the inside the only expensive operations it performs are
188
+
189
+ - memcpy(3) lots of data around (selection);
190
+ - mergesort(3) to aggregate/count similar entries (reduction);
191
+ - qsort(3) to order by counts (sort).
192
+
193
+ It tends to be faster with large datasets on BSD than on Linux because the
194
+ former has fast quicksort and mergesort, whereas the latter only has `qsort`,
195
+ a slower, catch-all sorter. In complexity terms this is because FIND tends
196
+ to be *O(n)* on BSD and *O(n ln n)* on Linux.
197
+
198
+ Enough talk, here are the graphs. The `LOAD` and `PUT` operations are O(1)
199
+ and take respectively ~10ms and ~100µs on any platform, so they aren't
200
+ graphed here.
201
+
202
+ - [FIND latency](/doc/bench-find.png)
203
+ - [SAVE latency](/doc/bench-save.png)
204
+ - [DELETE latency](/doc/bench-delete.png)
205
+
206
+
207
+ ## Contributing
208
+
209
+ 1. Fork it
210
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
211
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
212
+ 4. Push to the branch (`git push origin my-new-feature`)
213
+ 5. Create new Pull Request
data/bin/blurrily ADDED
@@ -0,0 +1,43 @@
1
#!/usr/bin/env ruby
# Command-line launcher for the Blurrily server.
#
# Reads the listening port, bind address and working directory from ARGV
# (falling back to the defaults below), then builds a Blurrily::Server with
# those settings and starts it.
$PROGRAM_NAME = 'blurrily'

require "blurrily/server"
require 'optparse'
require 'ostruct'

options = OpenStruct.new

# Defaults
options.port      = 12021
options.directory = '.'
options.host      = '0.0.0.0'

parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{$PROGRAM_NAME} [options]"

  opts.on("-p", "--port <PORT>", "Bind to PORT, defaults to 12021") do |port|
    # The whole argument must consist of digits. The previous check used
    # `puts ... and exit`, which never exited because `puts` returns nil, and
    # an unanchored pattern that accepted values such as "abc1".
    abort 'Port has to be numeric value' unless port =~ /\A\d+\z/
    options.port = port.to_i
  end

  opts.on("-d", "--directory <DIRECTORY>", "Work in DIRECTORY, defaults to .") do |directory|
    options.directory = directory
  end

  opts.on("-b", "--bind <ADDRESS>", "Bind to ADDRESS, defaults to 0.0.0.0") do |address|
    options.host = address || '0.0.0.0'
  end

  opts.on("-V", "--version", "Output version") do
    puts Blurrily::VERSION
    exit
  end

  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end

parser.parse!(ARGV)
Blurrily::Server.new(:host => options.host, :port => options.port, :directory => options.directory).start
@@ -0,0 +1,21 @@
1
/*

  blurrily.h --

  Helper macros

*/

#ifndef __BLURRILY_H__
#define __BLURRILY_H__ 1

/* GCC/Clang attribute asking for a struct layout without padding. */
#define BR_PACKED_STRUCT __attribute__ ((__packed__))

/* Wraps an identifier so the compiler does not warn when it is never used. */
#define UNUSED(_IDENT) _IDENT __attribute__ ((unused))

/*
 * LOG(...) prints printf-style diagnostics to stderr in DEBUG builds and
 * expands to nothing otherwise.
 */
#ifdef DEBUG
/* LOG relies on fprintf/stderr; include their declarations so this header
   works even when the including file has not pulled in <stdio.h> itself. */
#include <stdio.h>
#define LOG(...) fprintf(stderr, __VA_ARGS__)
#else
#define LOG(...)
#endif

#endif
@@ -0,0 +1,21 @@
1
# Build configuration for the native extension: picks compiler flags for the
# current platform and generates the Makefile for `blurrily/map_ext`.
require 'mkmf'

# Upper-cased output of `uname`, exposed to the C code as -DPLATFORM_<NAME>.
PLATFORM = `uname`.strip.upcase
SHARED_FLAGS = "-DPLATFORM_#{PLATFORM} --std=c99 -Wall -Wextra"

if PLATFORM == 'LINUX'
  [
    ' -D_XOPEN_SOURCE=700',     # make sure ftruncate is available
    ' -D_GNU_SOURCE=1',
    ' -D_FILE_OFFSET_BITS=64'   # make sure off_t is 64 bit long
  ].each { |flag| SHARED_FLAGS << flag }
end

# production
$CFLAGS += " #{SHARED_FLAGS} -Os"

# development
# $CFLAGS += " #{SHARED_FLAGS} -O0 -g"

create_makefile('blurrily/map_ext')
@@ -0,0 +1,230 @@
1
+ #include <ruby.h>
2
+ #include <assert.h>
3
+ #include "storage.h"
4
+ #include "blurrily.h"
5
+
6
+ static VALUE eClosedError = Qnil;
7
+ static VALUE eBlurrilyModule = Qnil;
8
+
9
+ /******************************************************************************/
10
+
11
+ static int raise_if_closed(VALUE self)
12
+ {
13
+ if (rb_ivar_get(self, rb_intern("@closed")) != Qtrue) return 0;
14
+ rb_raise(eClosedError, "Map was freed");
15
+ return 1;
16
+ }
17
+
18
+ static void mark_as_closed(VALUE self)
19
+ {
20
+ rb_ivar_set(self, rb_intern("@closed"), Qtrue);
21
+ }
22
+
23
+ /******************************************************************************/
24
+
25
+ static void blurrily_free(void* haystack)
26
+ {
27
+ int res = -1;
28
+
29
+ if (haystack == NULL) return;
30
+ res = blurrily_storage_close((trigram_map*) &haystack);
31
+ assert(res >= 0);
32
+ }
33
+
34
+ /******************************************************************************/
35
+
36
+ static void blurrily_mark(void* haystack)
37
+ {
38
+ if (haystack == NULL) return;
39
+ blurrily_storage_mark((trigram_map) haystack);
40
+ }
41
+
42
+ /******************************************************************************/
43
+
44
+ static VALUE blurrily_new(VALUE class) {
45
+ VALUE wrapper = Qnil;
46
+ trigram_map haystack = (trigram_map)NULL;
47
+ int res = -1;
48
+
49
+ res = blurrily_storage_new(&haystack);
50
+ if (res < 0) { rb_sys_fail(NULL); return Qnil; }
51
+
52
+ wrapper = Data_Wrap_Struct(class, blurrily_mark, blurrily_free, (void*)haystack);
53
+ rb_obj_call_init(wrapper, 0, NULL);
54
+ return wrapper;
55
+ }
56
+
57
+ /******************************************************************************/
58
+
59
+ static VALUE blurrily_load(VALUE class, VALUE rb_path) {
60
+ char* path = StringValuePtr(rb_path);
61
+ VALUE wrapper = Qnil;
62
+ trigram_map haystack = (trigram_map)NULL;
63
+ int res = -1;
64
+
65
+ res = blurrily_storage_load(&haystack, path);
66
+ if (res < 0) { rb_sys_fail(NULL); return Qnil; }
67
+
68
+ wrapper = Data_Wrap_Struct(class, blurrily_mark, blurrily_free, (void*)haystack);
69
+ rb_obj_call_init(wrapper, 0, NULL);
70
+ return wrapper;
71
+ }
72
+
73
+ /******************************************************************************/
74
+
75
+ static VALUE blurrily_initialize(VALUE UNUSED(self)) {
76
+ return Qtrue;
77
+ }
78
+
79
+ /******************************************************************************/
80
+
81
+ static VALUE blurrily_put(VALUE self, VALUE rb_needle, VALUE rb_reference, VALUE rb_weight) {
82
+ trigram_map haystack = (trigram_map)NULL;
83
+ int res = -1;
84
+ char* needle = StringValuePtr(rb_needle);
85
+ uint32_t reference = NUM2UINT(rb_reference);
86
+ uint32_t weight = NUM2UINT(rb_weight);
87
+
88
+ if (raise_if_closed(self)) return Qnil;
89
+ Data_Get_Struct(self, struct trigram_map_t, haystack);
90
+
91
+ res = blurrily_storage_put(haystack, needle, reference, weight);
92
+ assert(res >= 0);
93
+
94
+ return INT2NUM(res);
95
+ }
96
+
97
+ /******************************************************************************/
98
+
99
+ static VALUE blurrily_delete(VALUE self, VALUE rb_reference) {
100
+ trigram_map haystack = (trigram_map)NULL;
101
+ uint32_t reference = NUM2UINT(rb_reference);
102
+ int res = -1;
103
+
104
+ if (raise_if_closed(self)) return Qnil;
105
+ Data_Get_Struct(self, struct trigram_map_t, haystack);
106
+
107
+ res = blurrily_storage_delete(haystack, reference);
108
+ assert(res >= 0);
109
+
110
+ return INT2NUM(res);
111
+ }
112
+
113
+ /******************************************************************************/
114
+
115
+ static VALUE blurrily_save(VALUE self, VALUE rb_path) {
116
+ trigram_map haystack = (trigram_map)NULL;
117
+ int res = -1;
118
+ const char* path = StringValuePtr(rb_path);
119
+
120
+ if (raise_if_closed(self)) return Qnil;
121
+ Data_Get_Struct(self, struct trigram_map_t, haystack);
122
+
123
+ res = blurrily_storage_save(haystack, path);
124
+ if (res < 0) rb_sys_fail(NULL);
125
+
126
+ return Qnil;
127
+ }
128
+
129
+ /******************************************************************************/
130
+
131
+ static VALUE blurrily_find(VALUE self, VALUE rb_needle, VALUE rb_limit) {
132
+ trigram_map haystack = (trigram_map)NULL;
133
+ int res = -1;
134
+ const char* needle = StringValuePtr(rb_needle);
135
+ int limit = NUM2UINT(rb_limit);
136
+ trigram_match matches = NULL;
137
+ VALUE rb_matches = Qnil;
138
+
139
+ if (raise_if_closed(self)) return Qnil;
140
+ Data_Get_Struct(self, struct trigram_map_t, haystack);
141
+
142
+ if (limit <= 0) {
143
+ // rb_limit = rb_const_get(eBlurrilyModule, rb_intern('LIMIT_DEFAULT'));
144
+ rb_limit = rb_const_get(eBlurrilyModule, rb_intern("LIMIT_DEFAULT"));
145
+ limit = NUM2UINT(rb_limit);
146
+ }
147
+ matches = (trigram_match) malloc(limit * sizeof(trigram_match_t));
148
+
149
+ res = blurrily_storage_find(haystack, needle, limit, matches);
150
+ assert(res >= 0);
151
+
152
+ /* wrap the matches into a Ruby array */
153
+ rb_matches = rb_ary_new();
154
+ for (int k = 0; k < res; ++k) {
155
+ VALUE rb_match = rb_ary_new();
156
+ rb_ary_push(rb_match, rb_uint_new(matches[k].reference));
157
+ rb_ary_push(rb_match, rb_uint_new(matches[k].matches));
158
+ rb_ary_push(rb_match, rb_uint_new(matches[k].weight));
159
+ rb_ary_push(rb_matches, rb_match);
160
+ }
161
+ return rb_matches;
162
+ }
163
+
164
+
165
+ /******************************************************************************/
166
+
167
+ static VALUE blurrily_stats(VALUE self)
168
+ {
169
+ trigram_map haystack = (trigram_map)NULL;
170
+ trigram_stat_t stats;
171
+ VALUE result = rb_hash_new();
172
+ int res = -1;
173
+
174
+ if (raise_if_closed(self)) return Qnil;
175
+ Data_Get_Struct(self, struct trigram_map_t, haystack);
176
+
177
+ res = blurrily_storage_stats(haystack, &stats);
178
+ assert(res >= 0);
179
+
180
+ (void) rb_hash_aset(result, ID2SYM(rb_intern("references")), UINT2NUM(stats.references));
181
+ (void) rb_hash_aset(result, ID2SYM(rb_intern("trigrams")), UINT2NUM(stats.trigrams));
182
+
183
+ return result;
184
+ }
185
+
186
+ /******************************************************************************/
187
+
188
+ static VALUE blurrily_close(VALUE self)
189
+ {
190
+ trigram_map haystack = (trigram_map)NULL;
191
+ int res = -1;
192
+
193
+ if (raise_if_closed(self)) return Qnil;
194
+ Data_Get_Struct(self, struct trigram_map_t, haystack);
195
+
196
+ res = blurrily_storage_close(&haystack);
197
+ if (res < 0) rb_sys_fail(NULL);
198
+
199
+ DATA_PTR(self) = NULL;
200
+ mark_as_closed(self);
201
+ return Qnil;
202
+ }
203
+
204
+ /******************************************************************************/
205
+
206
+ void Init_map_ext(void) {
207
+ VALUE klass = Qnil;
208
+
209
+ /* assume we haven't yet defined blurrily */
210
+ eBlurrilyModule = rb_define_module("Blurrily");
211
+ assert(eBlurrilyModule != Qnil);
212
+
213
+ klass = rb_define_class_under(eBlurrilyModule, "RawMap", rb_cObject);
214
+ assert(klass != Qnil);
215
+
216
+ eClosedError = rb_define_class_under(klass, "ClosedError", rb_eRuntimeError);
217
+ assert(klass != Qnil);
218
+
219
+ rb_define_singleton_method(klass, "new", blurrily_new, 0);
220
+ rb_define_singleton_method(klass, "load", blurrily_load, 1);
221
+
222
+ rb_define_method(klass, "initialize", blurrily_initialize, 0);
223
+ rb_define_method(klass, "put", blurrily_put, 3);
224
+ rb_define_method(klass, "delete", blurrily_delete, 1);
225
+ rb_define_method(klass, "save", blurrily_save, 1);
226
+ rb_define_method(klass, "find", blurrily_find, 2);
227
+ rb_define_method(klass, "stats", blurrily_stats, 0);
228
+ rb_define_method(klass, "close", blurrily_close, 0);
229
+ return;
230
+ }