vinted-blurrily 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: b9c8978b87d66ef94646d643c471b745c84b2489
4
+ data.tar.gz: 73c63bbd8614d452ee27c16ff3fa5371b8a48a21
5
+ SHA512:
6
+ metadata.gz: bccb29b9b6665b9e8606ee483b434e2873815a804faf2d0bd800714c9af2b02f3156694b29cd5f66e9e9173b28adc6119703ee14503e5843c18849b0c47c78cb
7
+ data.tar.gz: 59bcd0faf278887485033c26a19757ad5d15c01de468e94c27b2115cc04f4fcb65280b0fba0057cb8a28bc2f902dbc2a9f879a50e9101464e07f133c1eed0981
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 HouseTrip Ltd.
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,213 @@
1
+ # Blurrily — Millisecond fuzzy string matching
2
+
3
+ This is a fork made solely for the purpose of adding Ruby 2.4 support and making it available in RubyGems.
4
+
5
+ [![Gem Version](https://badge.fury.io/rb/blurrily.svg)](http://badge.fury.io/rb/blurrily)
6
+ [![Build Status](https://travis-ci.org/mezis/blurrily.svg?branch=master)](https://travis-ci.org/mezis/blurrily)
7
+ [![Dependency Status](https://gemnasium.com/mezis/blurrily.svg)](https://gemnasium.com/mezis/blurrily)
8
+ [![Code Climate](https://codeclimate.com/github/mezis/blurrily.svg)](https://codeclimate.com/github/mezis/blurrily)
9
+ [![Coverage Status](https://coveralls.io/repos/mezis/blurrily/badge.png)](https://coveralls.io/r/mezis/blurrily)
10
+
11
+ > Show me photos of **Marakech** !
12
+ >
13
+ > Here are some photos of **Marrakesh**, Morocco.
14
+ > Did you mean **Martanesh**, Albania, **Marakkanam**, India, or **Marasheshty**, Romania?
15
+
16
+ Blurrily finds misspelled, prefix, or partial needles in a haystack of
17
+ strings, quickly. It scales well: its response time is typically 1-2ms on
18
+ user-input datasets and 75-100ms on pathological datasets
19
+ ([more](#benchmarks)).
20
+
21
+ Blurrily is compatible and tested with all MRI Rubies from 1.9.3 to 2.2.0.
22
+ It is tested on Linux 2.6 (32bit and 64bit) and MacOS X 10.8.
23
+
24
+ Blurrily uses a tweaked [trigram](http://en.wikipedia.org/wiki/N-gram)-based
25
+ approach to find good matches. If you're using ActiveRecord and looking for
26
+ a lightweight (albeit much slower), in-process, Rails-friendly version of
27
+ this, check out [fuzzily](http://github.com/mezis/fuzzily), a Ruby gem to
28
+ perform fuzzy text searching in ActiveRecord.
29
+
30
+
31
+ ## Installation
32
+
33
+ Add this line to your application's Gemfile:
34
+
35
+ gem 'blurrily'
36
+
37
+ Or install it yourself as:
38
+
39
+ $ gem install blurrily
40
+
41
+ ## Docker
42
+
43
+ You can optionally run [Blurrily as a Docker Container](https://github.com/mrmattwright/docker-blurrily). Maintained by [MrMattWright](https://github.com/mrmattwright).
44
+
45
+ ## Usage
46
+
47
+ You can use blurrily as a client/server combination (recommended in
48
+ production), or use the internals standalone.
49
+
50
+ See the [API Documentation](http://rubydoc.info/github/mezis/blurrily/frames)
51
+ for more details.
52
+
53
+ ### Client/server
54
+
55
+ Fire up a blurrily server:
56
+
57
+ $ blurrily
58
+
59
+ Open up a console and connect:
60
+
61
+ $ irb -rubygems
62
+ > require 'blurrily/client'
63
+ > client = Blurrily::Client.new
64
+
65
+ Store a needle with a reference:
66
+
67
+ > client.put('London', 1337)
68
+
69
+ Recover a reference from the haystack:
70
+
71
+ > client.find('lonndon')
72
+ #=> [1337]
73
+
74
+ ### Standalone
75
+
76
+ Create the in-memory database:
77
+
78
+ > map = Blurrily::Map.new
79
+
80
+ Store a needle with a reference:
81
+
82
+ > map.put('London', 1337)
83
+
84
+ Recover a reference from the haystack:
85
+
86
+ > map.find('lonndon')
87
+ #=> [1337]
88
+
89
+ Save the database to disk:
90
+
91
+ > map.save('/var/db/data.trigrams')
92
+
93
+ Load a previously saved database:
94
+
95
+ > map = Blurrily::Map.load('/var/db/data.trigrams')
96
+
97
+
98
+ ## Caveats
99
+
100
+ ### Diacritics, non-latin languages
101
+
102
+ Blurrily forms trigrams from the 26 latin letters and a stop character (used
103
+ to model start-of-string and separation between words in multi-word
104
+ strings).
105
+
106
+ This means that case and diacritics are completely ignored by Blurrily. For
107
+ instance, *Puy-de-Dôme* is strictly equivalent to *puy de dome*.
108
+
109
+ It also means that any non-latin input will probably result in garbage data
110
+ and garbage results (although it won't crash).
111
+
112
+ ### Multi-word needles and edge stickiness.
113
+
114
+ Multi-word needles (say, *New York*) are supported.
115
+
116
+ The engine always favours matches that begin and end similarly to the
117
+ needle, with a bias to the beginning of the strings.
118
+
119
+ This is because internally, the string *New York* is turned into this
120
+ sequence of trigrams: `**n`, `*ne`, `new`, `ew*`, `w*y`, `*yo`, `yor`,
121
+ `ork`, `rk*`.
122
+
123
+ ## Production notes
124
+
125
+ ### Memory usage
126
+
127
+ Blurrily does not store your original strings but rather a flat map of
128
+ references and weights for each trigram in your input strings.
129
+
130
+ In practice any database will use up a base 560KB for the index header, plus
131
+ 128 bits per trigram.
132
+
133
+ As a rule of thumb, ideal memory usage is 40MB + 8 times the size of your
134
+ input data, and 50% extra on top during bulk imports (lots of writes to the
135
+ database).
136
+
137
+ For instance, `/usr/share/dict/words` is a list of 235k English words, and
138
+ weighs 2.5MB. Importing the whole list uses up 75MB of memory, 51MB of which
139
+ are the database.
140
+
141
+ Note that once a database has been written to disk and loaded from disk,
142
+ memory usage is minimal (560KB per database) as the database file is memory
143
+ mapped. For performance you do need as much free memory as the database
144
+ size.
145
+
146
+ ### Disk usage
147
+
148
+ Disk usage is almost exactly like memory usage, since database files are
149
+ nothing more than a memory dump.
150
+
151
+ In the `/usr/share/dict/words` example, on-disk size is 51MB.
152
+ For the whole list of Geonames places, on-disk size is 1.1GB.
153
+
154
+ ### Read v write
155
+
156
+ Writing to blurrily (with `#put`) is fairly expensive—it's a search engine
157
+ after all, optimized for intensive reads.
158
+
159
+ Supporting writes means the engine needs to keep a hash table of all
160
+ references around, typically weighing 50% of your total input. This is built
161
+ lazily while writing however; so if you load a database from disk and only
162
+ ever read, you will not incur the memory penalty.
163
+
164
+ ### Saving & backing up
165
+
166
+ Blurrily saves atomically (writing to a separate file, then using rename(2)
167
+ to overwrite the old file), meaning you should never lose data.
168
+
169
+ The server does this for you every 60 seconds and when quitting. If using
170
+ `Blurrily::Map` directly, remember that a map loaded from disk is more
171
+ memory efficient than a map in memory, so if your workload is read-heavy,
172
+ you should `.load` after each `#save`.
173
+
174
+ Backing up comes with a caveat: database files are only portable across
175
+ architectures if endianness and pointer size are the same (tested between
176
+ darwin-x86_64 and linux-amd64).
177
+
178
+ Database files are very compressible; `bzip2` typically shrinks them to 20%
179
+ of their original size.
180
+
181
+
182
+ ## Benchmarks
183
+
184
+ Blurrily is wicked fast, often 100x faster than its ancestor,
185
+ [fuzzily](http://github.com/mezis/fuzzily). This is because it's a close-to-
186
+ the-metal, single-purpose index using almost exclusively libc primitives. On
187
+ the inside the only expensive operations it performs are
188
+
189
+ - memcpy(3) lots of data around (selection);
190
+ - mergesort(3) to aggregate/count similar entries (reduction);
191
+ - qsort(3) to order by counts (sort).
192
+
193
+ It tends to be faster with large datasets on BSD than on Linux because the
194
+ former has fast quicksort and mergesort, whereas the latter only has `qsort`,
195
+ a slower, catch-all sorter. In complexity terms this is because FIND tends
196
+ to be *O(n)* on BSD and *O(n ln n)* on Linux.
197
+
198
+ Enough talk, here are the graphs. The `LOAD` and `PUT` operations are O(1)
199
+ and take respectively ~10ms and ~100µs on any platform, so they aren't
200
+ graphed here.
201
+
202
+ - [FIND latency](/doc/bench-find.png)
203
+ - [SAVE latency](/doc/bench-save.png)
204
+ - [DELETE latency](/doc/bench-delete.png)
205
+
206
+
207
+ ## Contributing
208
+
209
+ 1. Fork it
210
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
211
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
212
+ 4. Push to the branch (`git push origin my-new-feature`)
213
+ 5. Create new Pull Request
data/bin/blurrily ADDED
@@ -0,0 +1,43 @@
1
#!/usr/bin/env ruby
# Command-line launcher for the Blurrily server: parses the options below and
# starts a Blurrily::Server bound to the requested host, port and directory.
$PROGRAM_NAME = 'blurrily'

require "blurrily/server"
require 'optparse'
require 'ostruct'

options = OpenStruct.new

# Defaults
options.port      = 12021
options.directory = '.'
options.host      = '0.0.0.0'

parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{$PROGRAM_NAME} [options]"

  opts.on("-p", "--port <PORT>", "Bind to PORT, defaults to 12021") do |port|
    # The pattern is anchored so that values such as "80abc" are rejected.
    # `puts` returns nil, so the message and the exit must be separate
    # statements (chaining them with `and` never reaches `exit`).
    unless port =~ /\A\d+\z/
      puts 'Port has to be numeric value'
      exit 1
    end
    options.port = port.to_i
  end

  opts.on("-d", "--directory <DIRECTORY>", "Work in DIRECTORY, defaults to .") do |directory|
    options.directory = directory
  end

  opts.on("-b", "--bind <ADDRESS>", "Bind to ADDRESS, defaults to 0.0.0.0") do |address|
    options.host = address || '0.0.0.0'
  end

  opts.on("-V", "--version", "Output version") do
    puts Blurrily::VERSION
    exit
  end

  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end

parser.parse!(ARGV)
Blurrily::Server.new(:host => options.host, :port => options.port, :directory => options.directory).start
@@ -0,0 +1,21 @@
1
/*

  blurrily.h --

  Helper macros

*/

#ifndef __BLURRILY_H__
#define __BLURRILY_H__ 1

/* Attribute requesting a packed layout for the struct it is applied to. */
#define BR_PACKED_STRUCT __attribute__ ((__packed__))

/* Declares _IDENT while marking it as intentionally unused, so that
   unused-parameter / unused-variable warnings are not emitted for it. */
#define UNUSED(_IDENT) _IDENT __attribute__ ((unused))

/* LOG(fmt, ...) prints to stderr when DEBUG is defined and expands to
   nothing otherwise. */
#ifdef DEBUG
  /* fprintf and stderr live in <stdio.h>; include it here so the header does
     not depend on what the including file happened to pull in. */
  #include <stdio.h>
  #define LOG(...) fprintf(stderr, __VA_ARGS__)
#else
  #define LOG(...)
#endif

#endif
@@ -0,0 +1,21 @@
1
require 'mkmf'

# Upper-cased `uname` output, passed to the compiler as -DPLATFORM_<NAME>.
PLATFORM = `uname`.strip.upcase

# Extra feature-test macros, only needed when building on Linux.
platform_defines =
  if PLATFORM == 'LINUX'
    [
      # make sure ftruncate is available
      '-D_XOPEN_SOURCE=700',
      '-D_GNU_SOURCE=1',
      # make sure off_t is 64 bit long
      '-D_FILE_OFFSET_BITS=64'
    ]
  else
    []
  end

# Flags common to the production and development configurations.
SHARED_FLAGS = (["-DPLATFORM_#{PLATFORM}", '--std=c99', '-Wall', '-Wextra'] + platform_defines).join(' ')

# production
$CFLAGS += " #{SHARED_FLAGS} -Os"

# development
# $CFLAGS += " #{SHARED_FLAGS} -O0 -g"

create_makefile('blurrily/map_ext')
@@ -0,0 +1,230 @@
1
+ #include <ruby.h>
2
+ #include <assert.h>
3
+ #include "storage.h"
4
+ #include "blurrily.h"
5
+
6
+ static VALUE eClosedError = Qnil;
7
+ static VALUE eBlurrilyModule = Qnil;
8
+
9
+ /******************************************************************************/
10
+
11
+ static int raise_if_closed(VALUE self)
12
+ {
13
+ if (rb_ivar_get(self, rb_intern("@closed")) != Qtrue) return 0;
14
+ rb_raise(eClosedError, "Map was freed");
15
+ return 1;
16
+ }
17
+
18
+ static void mark_as_closed(VALUE self)
19
+ {
20
+ rb_ivar_set(self, rb_intern("@closed"), Qtrue);
21
+ }
22
+
23
+ /******************************************************************************/
24
+
25
+ static void blurrily_free(void* haystack)
26
+ {
27
+ int res = -1;
28
+
29
+ if (haystack == NULL) return;
30
+ res = blurrily_storage_close((trigram_map*) &haystack);
31
+ assert(res >= 0);
32
+ }
33
+
34
+ /******************************************************************************/
35
+
36
+ static void blurrily_mark(void* haystack)
37
+ {
38
+ if (haystack == NULL) return;
39
+ blurrily_storage_mark((trigram_map) haystack);
40
+ }
41
+
42
+ /******************************************************************************/
43
+
44
+ static VALUE blurrily_new(VALUE class) {
45
+ VALUE wrapper = Qnil;
46
+ trigram_map haystack = (trigram_map)NULL;
47
+ int res = -1;
48
+
49
+ res = blurrily_storage_new(&haystack);
50
+ if (res < 0) { rb_sys_fail(NULL); return Qnil; }
51
+
52
+ wrapper = Data_Wrap_Struct(class, blurrily_mark, blurrily_free, (void*)haystack);
53
+ rb_obj_call_init(wrapper, 0, NULL);
54
+ return wrapper;
55
+ }
56
+
57
+ /******************************************************************************/
58
+
59
+ static VALUE blurrily_load(VALUE class, VALUE rb_path) {
60
+ char* path = StringValuePtr(rb_path);
61
+ VALUE wrapper = Qnil;
62
+ trigram_map haystack = (trigram_map)NULL;
63
+ int res = -1;
64
+
65
+ res = blurrily_storage_load(&haystack, path);
66
+ if (res < 0) { rb_sys_fail(NULL); return Qnil; }
67
+
68
+ wrapper = Data_Wrap_Struct(class, blurrily_mark, blurrily_free, (void*)haystack);
69
+ rb_obj_call_init(wrapper, 0, NULL);
70
+ return wrapper;
71
+ }
72
+
73
+ /******************************************************************************/
74
+
75
+ static VALUE blurrily_initialize(VALUE UNUSED(self)) {
76
+ return Qtrue;
77
+ }
78
+
79
+ /******************************************************************************/
80
+
81
+ static VALUE blurrily_put(VALUE self, VALUE rb_needle, VALUE rb_reference, VALUE rb_weight) {
82
+ trigram_map haystack = (trigram_map)NULL;
83
+ int res = -1;
84
+ char* needle = StringValuePtr(rb_needle);
85
+ uint32_t reference = NUM2UINT(rb_reference);
86
+ uint32_t weight = NUM2UINT(rb_weight);
87
+
88
+ if (raise_if_closed(self)) return Qnil;
89
+ Data_Get_Struct(self, struct trigram_map_t, haystack);
90
+
91
+ res = blurrily_storage_put(haystack, needle, reference, weight);
92
+ assert(res >= 0);
93
+
94
+ return INT2NUM(res);
95
+ }
96
+
97
+ /******************************************************************************/
98
+
99
+ static VALUE blurrily_delete(VALUE self, VALUE rb_reference) {
100
+ trigram_map haystack = (trigram_map)NULL;
101
+ uint32_t reference = NUM2UINT(rb_reference);
102
+ int res = -1;
103
+
104
+ if (raise_if_closed(self)) return Qnil;
105
+ Data_Get_Struct(self, struct trigram_map_t, haystack);
106
+
107
+ res = blurrily_storage_delete(haystack, reference);
108
+ assert(res >= 0);
109
+
110
+ return INT2NUM(res);
111
+ }
112
+
113
+ /******************************************************************************/
114
+
115
+ static VALUE blurrily_save(VALUE self, VALUE rb_path) {
116
+ trigram_map haystack = (trigram_map)NULL;
117
+ int res = -1;
118
+ const char* path = StringValuePtr(rb_path);
119
+
120
+ if (raise_if_closed(self)) return Qnil;
121
+ Data_Get_Struct(self, struct trigram_map_t, haystack);
122
+
123
+ res = blurrily_storage_save(haystack, path);
124
+ if (res < 0) rb_sys_fail(NULL);
125
+
126
+ return Qnil;
127
+ }
128
+
129
+ /******************************************************************************/
130
+
131
+ static VALUE blurrily_find(VALUE self, VALUE rb_needle, VALUE rb_limit) {
132
+ trigram_map haystack = (trigram_map)NULL;
133
+ int res = -1;
134
+ const char* needle = StringValuePtr(rb_needle);
135
+ int limit = NUM2UINT(rb_limit);
136
+ trigram_match matches = NULL;
137
+ VALUE rb_matches = Qnil;
138
+
139
+ if (raise_if_closed(self)) return Qnil;
140
+ Data_Get_Struct(self, struct trigram_map_t, haystack);
141
+
142
+ if (limit <= 0) {
143
+ // rb_limit = rb_const_get(eBlurrilyModule, rb_intern('LIMIT_DEFAULT'));
144
+ rb_limit = rb_const_get(eBlurrilyModule, rb_intern("LIMIT_DEFAULT"));
145
+ limit = NUM2UINT(rb_limit);
146
+ }
147
+ matches = (trigram_match) malloc(limit * sizeof(trigram_match_t));
148
+
149
+ res = blurrily_storage_find(haystack, needle, limit, matches);
150
+ assert(res >= 0);
151
+
152
+ /* wrap the matches into a Ruby array */
153
+ rb_matches = rb_ary_new();
154
+ for (int k = 0; k < res; ++k) {
155
+ VALUE rb_match = rb_ary_new();
156
+ rb_ary_push(rb_match, rb_uint_new(matches[k].reference));
157
+ rb_ary_push(rb_match, rb_uint_new(matches[k].matches));
158
+ rb_ary_push(rb_match, rb_uint_new(matches[k].weight));
159
+ rb_ary_push(rb_matches, rb_match);
160
+ }
161
+ return rb_matches;
162
+ }
163
+
164
+
165
+ /******************************************************************************/
166
+
167
+ static VALUE blurrily_stats(VALUE self)
168
+ {
169
+ trigram_map haystack = (trigram_map)NULL;
170
+ trigram_stat_t stats;
171
+ VALUE result = rb_hash_new();
172
+ int res = -1;
173
+
174
+ if (raise_if_closed(self)) return Qnil;
175
+ Data_Get_Struct(self, struct trigram_map_t, haystack);
176
+
177
+ res = blurrily_storage_stats(haystack, &stats);
178
+ assert(res >= 0);
179
+
180
+ (void) rb_hash_aset(result, ID2SYM(rb_intern("references")), UINT2NUM(stats.references));
181
+ (void) rb_hash_aset(result, ID2SYM(rb_intern("trigrams")), UINT2NUM(stats.trigrams));
182
+
183
+ return result;
184
+ }
185
+
186
+ /******************************************************************************/
187
+
188
+ static VALUE blurrily_close(VALUE self)
189
+ {
190
+ trigram_map haystack = (trigram_map)NULL;
191
+ int res = -1;
192
+
193
+ if (raise_if_closed(self)) return Qnil;
194
+ Data_Get_Struct(self, struct trigram_map_t, haystack);
195
+
196
+ res = blurrily_storage_close(&haystack);
197
+ if (res < 0) rb_sys_fail(NULL);
198
+
199
+ DATA_PTR(self) = NULL;
200
+ mark_as_closed(self);
201
+ return Qnil;
202
+ }
203
+
204
+ /******************************************************************************/
205
+
206
+ void Init_map_ext(void) {
207
+ VALUE klass = Qnil;
208
+
209
+ /* assume we haven't yet defined blurrily */
210
+ eBlurrilyModule = rb_define_module("Blurrily");
211
+ assert(eBlurrilyModule != Qnil);
212
+
213
+ klass = rb_define_class_under(eBlurrilyModule, "RawMap", rb_cObject);
214
+ assert(klass != Qnil);
215
+
216
+ eClosedError = rb_define_class_under(klass, "ClosedError", rb_eRuntimeError);
217
+ assert(klass != Qnil);
218
+
219
+ rb_define_singleton_method(klass, "new", blurrily_new, 0);
220
+ rb_define_singleton_method(klass, "load", blurrily_load, 1);
221
+
222
+ rb_define_method(klass, "initialize", blurrily_initialize, 0);
223
+ rb_define_method(klass, "put", blurrily_put, 3);
224
+ rb_define_method(klass, "delete", blurrily_delete, 1);
225
+ rb_define_method(klass, "save", blurrily_save, 1);
226
+ rb_define_method(klass, "find", blurrily_find, 2);
227
+ rb_define_method(klass, "stats", blurrily_stats, 0);
228
+ rb_define_method(klass, "close", blurrily_close, 0);
229
+ return;
230
+ }