blurrily 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ed39eb955b4d71f3b924a16be4430046ba1d02ab
4
- data.tar.gz: 1c5a5b42b6877ad3d66928a0fe0520ea73defa9b
3
+ metadata.gz: 5278062ebce0b77e8f18ffebdefa1639cafc8b59
4
+ data.tar.gz: 5ce39cb3bc008428947ff9ab234d8aba07455241
5
5
  SHA512:
6
- metadata.gz: 54fdb049c894470cf18afdafe18053607e1b4336b6f7353866ae8d81115e87a97ed6f5273270d930a88c292bf02a361868280997b6dbe5668c894aa456745950
7
- data.tar.gz: b8c280aa93d062a9a89fbda80cdf3365efcb34ed3e3c28d8dadf6c9b9ee5deba389a3a0233c788e8e182894b1067254ec9a9ef4ae80e7c1676a60edd6cd50e83
6
+ metadata.gz: 28fdbf9009c005523e30c450fabb461fff877a4c86d37fe83096998f5ab1addd2d0d032c378f5333e553e27fe28aac9a500508967e64a7f0c33ecdc9c960fe06
7
+ data.tar.gz: 5e9894f29ce68dfc5ceb222e38f8a05c138f5c7cdc770b8a6c0940b765d9ae4d983cc7ee3e74449b94136b696e65dec666de85cfa1aa2f8b2d38970b6add1fad
data/README.md CHANGED
@@ -1,13 +1,27 @@
1
- # Blurrily — Fast fuzzy text search
1
+ # Blurrily — Millisecond fuzzy string matching
2
2
 
3
3
  [![Build Status](https://travis-ci.org/mezis/blurrily.png?branch=master)](https://travis-ci.org/mezis/blurrily)
4
4
  [![Dependency Status](https://gemnasium.com/mezis/blurrily.png)](https://gemnasium.com/mezis/blurrily)
5
5
  [![Code Climate](https://codeclimate.com/github/mezis/blurrily.png)](https://codeclimate.com/github/mezis/blurrily)
6
6
 
7
- This will be a C version of [fuzzily](http://github.com/mezis/fuzzily), a
8
- Ruby gem to perform fuzzy text searching.
7
+ > Show me photos of **Marakech** !
8
+ >
9
+ > Here are some photos of **Marrakesh**, Morocco.
10
+ > Did you mean **Martanesh**, Albania, **Marakkanam**, India, or **Marasheshty**, Romania?
11
+
12
+ Blurrily finds misspelt or partial needles in a haystack of strings, quickly.
13
+ It scales well: its response time is typically 1-2ms on user-input datasets
14
+ and 75-100ms on pathological datasets ([more](#benchmarks)).
15
+
16
+ Blurrily is compatible and tested with all MRI Rubies from 1.8.7 to 2.0.0.
17
+ It is tested on Linux 2.6 (32bit and 64bit) and MacOS X 10.8.
18
+
19
+ Blurrily uses a tweaked [trigram](http://en.wikipedia.org/wiki/N-gram)-based
20
+ approach to find good matches. If you're using ActiveRecord and looking for
21
+ a lightweight (albeit much slower), in-process, Rails-friendly version of
22
+ this, check out [fuzzily](http://github.com/mezis/fuzzily), a Ruby gem to
23
+ perform fuzzy text searching in ActiveRecord.
9
24
 
10
- WORK IN PROGRESS.
11
25
 
12
26
  ## Installation
13
27
 
@@ -15,17 +29,171 @@ Add this line to your application's Gemfile:
15
29
 
16
30
  gem 'blurrily'
17
31
 
18
- And then execute:
19
-
20
- $ bundle
21
-
22
32
  Or install it yourself as:
23
33
 
24
34
  $ gem install blurrily
25
35
 
26
36
  ## Usage
27
37
 
28
- TODO: Write usage instructions here
38
+ You can use blurrily as a client/server combination (recommended in
39
+ production), or use the internals standalone.
40
+
41
+ See the [API Documentation](http://rubydoc.info/github/mezis/blurrily/frames)
42
+ for more details.
43
+
44
+ ### Client/server
45
+
46
+ Fire up a blurrily server:
47
+
48
+ $ blurrily
49
+
50
+ Open up a console and connect:
51
+
52
+ $ irb -rubygems
53
+ > require 'blurrily/client'
54
+ > client = Blurrily::Client.new
55
+
56
+ Store a needle with a reference:
57
+
58
+ > client.put('London', 1337)
59
+
60
+ Recover a reference from the haystack:
61
+
62
+ > client.find('lonndon')
63
+ #=> [1337]
64
+
65
+ ### Standalone
66
+
67
+ Create the in-memory database:
68
+
69
+ > map = Blurrily::Map.new
70
+
71
+ Store a needle with a reference:
72
+
73
+ > map.put('London', 1337)
74
+
75
+ Recover a reference from the haystack:
76
+
77
+ > map.find('lonndon')
78
+ #=> [1337]
79
+
80
+ Save the database to disk:
81
+
82
+ > map.save('/var/db/data.trigrams')
83
+
84
+ Load a previously saved database:
85
+
86
+ > map = Blurrily::Map.load('/var/db/data.trigrams')
87
+
88
+
89
+ ## Caveats
90
+
91
+ ### Diacritics, non-latin languages
92
+
93
+ Blurrily forms trigrams from the 26 latin letters and a stop character (used
94
+ to model start-of-string and separation between words in multi-word
95
+ strings).
96
+
97
+ This means that case and diacritics are completely ignored by Blurrily. For
98
+ instance, *Puy-de-Dôme* is strictly equivalent to *puy de dome*.
99
+
100
+ It also means that any non-latin input will probably result in garbage data
101
+ and garbage results (although it won't crash).
102
+
103
+ ### Multi-word needles and edge stickiness
104
+
105
+ Multi-word needles (say, *New York*) are supported.
106
+
107
+ The engine always favours matches that begin and end similarly to the
108
+ needle, with a bias to the beginning of the strings.
109
+
110
+ This is because internally, the string *New York* is turned into this
111
+ sequence of trigrams: `**n`, `*ne`, `new`, `ew*`, `w*y`, `*yo`, `yor`,
112
+ `ork`, `rk*`.
113
+
114
+ ## Production notes
115
+
116
+ ### Memory usage
117
+
118
+ Blurrily does not store your original strings but rather a flat map of
119
+ references and weights for each trigram in your input strings.
120
+
121
+ In practice any database will use up a base 560KB for the index header, plus
122
+ 128 bits per trigram.
123
+
124
+ As a rule of thumb, ideal memory usage is 40MB + 8 times the size of your
125
+ input data, and 50% extra on top during bulk imports (lots of writes to the
126
+ database).
127
+
128
+ For instance, `/usr/share/dict/words` is a list of 235k English words, and
129
+ weighs 2.5MB. Importing the whole list uses up 75MB of memory, 51MB of which
130
+ are the database.
131
+
132
+ Note that once a database has been written to disk and loaded from disk,
133
+ memory usage is minimal (560KB per database) as the database file is memory
134
+ mapped. For performance you do need as much free memory as the database
135
+ size.
136
+
137
+ ### Disk usage
138
+
139
+ Disk usage is almost exactly like memory usage, since database files are
140
+ nothing more than a memory dump.
141
+
142
+ In the `/usr/share/dict/words` example, on-disk size is 51MB.
143
+ For the whole list of Geonames places, on-disk size is 1.1GB.
144
+
145
+ ### Read v write
146
+
147
+ Writing to blurrily (with `#put`) is fairly expensive—it's a search engine
148
+ after all, optimized for intensive reads.
149
+
150
+ Supporting writes means the engine needs to keep a hash table of all
151
+ references around, typically weighing 50% of your total input. This is built
152
+ lazily while writing however; so if you load a database from disk and only
153
+ ever read, you will not incur the memory penalty.
154
+
155
+ ### Saving & backing up
156
+
157
+ Blurrily saves atomically (writing to a separate file, then using rename(2)
158
+ to overwrite the old file), meaning you should never lose data.
159
+
160
+ The server does this for you every 60 seconds and when quitting. If using
161
+ `Blurrily::Map` directly, remember that a map loaded from disk is more
162
+ memory efficient than a map in memory, so if your workload is read-heavy,
163
+ you should `.load` after each `#save`.
164
+
165
+ Backing up comes with a caveat: database files are only portable across
166
+ architectures if endianness and pointer size are the same (tested between
167
+ darwin-x86_64 and linux-amd64).
168
+
169
+ Database files are very compressible; `bzip2` typically shrinks them to 20%
170
+ of their original size.
171
+
172
+
173
+ ## Benchmarks
174
+
175
+ Blurrily is wicked fast, often 100x faster than its ancestor,
176
+ [fuzzily](http://github.com/mezis/fuzzily). This is because it's a close-to-
177
+ the-metal, single-purpose index using almost exclusively libc primitives. On
178
+ the inside the only expensive operations it performs are
179
+
180
+ - memcpy(3) lots of data around (selection);
181
+ - mergesort(3) to aggregate/count similar entries (reduction);
182
+ - qsort(3) to order by counts (sort).
183
+
184
+ It tends to be faster with large datasets on BSD than on Linux because the
185
+ former has fast quicksort and mergesort, whereas the latter only has `qsort`,
186
+ a slower, catch-all sorter. In complexity terms this is because FIND tends
187
+ to be *O(n)* on BSD and *O(n ln n)* on Linux.
188
+
189
+ Enough talk, here are the graphs. The `LOAD` and `PUT` operations are O(1)
190
+ and take respectively ~10ms and ~100µs on any platform, so they aren't
191
+ graphed here.
192
+
193
+ - [FIND latency](/doc/bench-find.png)
194
+ - [SAVE latency](/doc/bench-save.png)
195
+ - [DELETE latency](/doc/bench-delete.png)
196
+
29
197
 
30
198
  ## Contributing
31
199
 
@@ -0,0 +1,43 @@
1
+ #!/usr/bin/env ruby
2
+ $PROGRAM_NAME = 'blurrily'
3
+
4
+ require "blurrily/server"
5
+ require 'optparse'
6
+ require 'ostruct'
7
+
8
+ options = OpenStruct.new
9
+
10
+ # Defaults
11
+ options.port = 12021
12
+ options.directory = '.'
13
+ options.host = '0.0.0.0'
14
+
15
+ parser = OptionParser.new do |opts|
16
+ opts.banner = "Usage: #{$PROGRAM_NAME} [options]"
17
+
18
+ opts.on("-p", "--port <PORT>", "Bind to PORT, defaults to 12021") do |port|
19
+ puts 'Port has to be numeric value' and exit unless port =~ /\d+/
20
+ options.port = port.to_i
21
+ end
22
+
23
+ opts.on("-d", "--directory <DIRECTORY>", "Work in DIRECTORY, defaults to .") do |directory|
24
+ options.directory = directory
25
+ end
26
+
27
+ opts.on("-b", "--bind <ADDRESS>", "Bind to ADDRESS, defaults to 0.0.0.0") do |address|
28
+ options.host = address || '0.0.0.0'
29
+ end
30
+
31
+ opts.on("-V", "--version", "Output version") do |address|
32
+ puts Blurrily::VERSION
33
+ exit
34
+ end
35
+
36
+ opts.on_tail("-h", "--help", "Show this message") do
37
+ puts opts
38
+ exit
39
+ end
40
+ end
41
+
42
+ parser.parse!(ARGV)
43
+ Blurrily::Server.new(:host => options.host, :port => options.port, :directory => options.directory).start
@@ -1,2 +1,16 @@
1
+ /*
2
+
3
+ blurrily.h --
4
+
5
+ Helper macros
6
+
7
+ */
8
+
1
9
  #define PACKED_STRUCT __attribute__ ((__packed__))
2
10
  #define UNUSED(_IDENT) _IDENT __attribute__ ((unused))
11
+
12
+ #ifdef DEBUG
13
+ #define LOG(...) fprintf(stderr, __VA_ARGS__)
14
+ #else
15
+ #define LOG(...)
16
+ #endif
@@ -5,7 +5,11 @@ SHARED_FLAGS = "-DPLATFORM_#{PLATFORM} --std=c99 -Wall -Wextra -Werror"
5
5
 
6
6
  case PLATFORM
7
7
  when 'LINUX'
8
- SHARED_FLAGS += ' -D_XOPEN_SOURCE=500' # for ftruncate to be present
8
+ # make sure ftruncate is available
9
+ SHARED_FLAGS << ' -D_XOPEN_SOURCE=700'
10
+ SHARED_FLAGS << ' -D_GNU_SOURCE=1'
11
+ # make sure off_t is 64 bit long
12
+ SHARED_FLAGS << ' -D_FILE_OFFSET_BITS=64'
9
13
  end
10
14
 
11
15
  # production
@@ -3,18 +3,44 @@
3
3
  #include "storage.h"
4
4
  #include "blurrily.h"
5
5
 
6
+ static VALUE eClosedError = Qnil;
7
+ static VALUE eBlurrilyModule = Qnil;
8
+
9
+ /******************************************************************************/
10
+
11
+ static int raise_if_closed(VALUE self) /* Guard called at the top of the Map methods: raises eClosedError ("Map was freed") when self's @closed ivar is Qtrue, otherwise returns 0. */
12
+ {
13
+ if (rb_ivar_get(self, rb_intern("@closed")) != Qtrue) return 0; /* @closed is not Qtrue: the map is still usable */
14
+ rb_raise(eClosedError, "Map was freed");
15
+ return 1; /* not reached in practice: rb_raise does not return to its caller */
16
+ }
17
+
18
+ static void mark_as_closed(VALUE self) /* Sets self's @closed ivar to Qtrue; raise_if_closed() tests this flag. */
19
+ {
20
+ rb_ivar_set(self, rb_intern("@closed"), Qtrue);
21
+ }
22
+
6
23
  /******************************************************************************/
7
24
 
8
25
  static void blurrily_free(void* haystack) /* Free callback handed to Data_Wrap_Struct: closes the storage behind the wrapped pointer. */
9
26
  {
10
27
  int res = -1;
11
28
 
29
+ if (haystack == NULL) return; /* blurrily_close() resets DATA_PTR to NULL, so an already-closed map has nothing to release */
12
30
  res = blurrily_storage_close((trigram_map*) &haystack); /* passes the address of the local parameter, cast to trigram_map* */
13
31
  assert(res >= 0); /* compiled out when NDEBUG is defined; a failed close is then ignored */
14
32
  }
15
33
 
16
34
  /******************************************************************************/
17
35
 
36
+ static void blurrily_mark(void* haystack) /* Mark callback handed to Data_Wrap_Struct: forwards the wrapped pointer to blurrily_storage_mark(). */
37
+ {
38
+ if (haystack == NULL) return; /* blurrily_close() resets DATA_PTR to NULL; nothing to mark then */
39
+ blurrily_storage_mark((trigram_map) haystack);
40
+ }
41
+
42
+ /******************************************************************************/
43
+
18
44
  static VALUE blurrily_new(VALUE class) {
19
45
  VALUE wrapper = Qnil;
20
46
  trigram_map haystack = (trigram_map)NULL;
@@ -23,7 +49,7 @@ static VALUE blurrily_new(VALUE class) {
23
49
  res = blurrily_storage_new(&haystack);
24
50
  if (res < 0) { rb_sys_fail(NULL); return Qnil; }
25
51
 
26
- wrapper = Data_Wrap_Struct(class, 0, blurrily_free, (void*)haystack);
52
+ wrapper = Data_Wrap_Struct(class, blurrily_mark, blurrily_free, (void*)haystack);
27
53
  rb_obj_call_init(wrapper, 0, NULL);
28
54
  return wrapper;
29
55
  }
@@ -39,7 +65,7 @@ static VALUE blurrily_load(VALUE class, VALUE rb_path) {
39
65
  res = blurrily_storage_load(&haystack, path);
40
66
  if (res < 0) { rb_sys_fail(NULL); return Qnil; }
41
67
 
42
- wrapper = Data_Wrap_Struct(class, 0, blurrily_free, (void*)haystack);
68
+ wrapper = Data_Wrap_Struct(class, blurrily_mark, blurrily_free, (void*)haystack);
43
69
  rb_obj_call_init(wrapper, 0, NULL);
44
70
  return wrapper;
45
71
  }
@@ -59,12 +85,13 @@ static VALUE blurrily_put(VALUE self, VALUE rb_needle, VALUE rb_reference, VALUE
59
85
  uint32_t reference = NUM2UINT(rb_reference);
60
86
  uint32_t weight = NUM2UINT(rb_weight);
61
87
 
88
+ if (raise_if_closed(self)) return Qnil;
62
89
  Data_Get_Struct(self, struct trigram_map_t, haystack);
63
90
 
64
91
  res = blurrily_storage_put(haystack, needle, reference, weight);
65
92
  assert(res >= 0);
66
93
 
67
- return Qnil;
94
+ return INT2NUM(res);
68
95
  }
69
96
 
70
97
  /******************************************************************************/
@@ -74,6 +101,7 @@ static VALUE blurrily_delete(VALUE self, VALUE rb_reference) {
74
101
  uint32_t reference = NUM2UINT(rb_reference);
75
102
  int res = -1;
76
103
 
104
+ if (raise_if_closed(self)) return Qnil;
77
105
  Data_Get_Struct(self, struct trigram_map_t, haystack);
78
106
 
79
107
  res = blurrily_storage_delete(haystack, reference);
@@ -89,10 +117,11 @@ static VALUE blurrily_save(VALUE self, VALUE rb_path) {
89
117
  int res = -1;
90
118
  const char* path = StringValuePtr(rb_path);
91
119
 
120
+ if (raise_if_closed(self)) return Qnil;
92
121
  Data_Get_Struct(self, struct trigram_map_t, haystack);
93
122
 
94
123
  res = blurrily_storage_save(haystack, path);
95
- assert(res >= 0);
124
+ if (res < 0) rb_sys_fail(NULL);
96
125
 
97
126
  return Qnil;
98
127
  }
@@ -107,11 +136,16 @@ static VALUE blurrily_find(VALUE self, VALUE rb_needle, VALUE rb_limit) {
107
136
  trigram_match matches = NULL;
108
137
  VALUE rb_matches = Qnil;
109
138
 
110
- if (limit <= 0) { limit = 10 ; }
111
- matches = (trigram_match) malloc(limit * sizeof(trigram_match_t));
112
-
139
+ if (raise_if_closed(self)) return Qnil;
113
140
  Data_Get_Struct(self, struct trigram_map_t, haystack);
114
141
 
142
+ if (limit <= 0) {
143
+ // rb_limit = rb_const_get(eBlurrilyModule, rb_intern('LIMIT_DEFAULT'));
144
+ rb_limit = rb_const_get(eBlurrilyModule, rb_intern("LIMIT_DEFAULT"));
145
+ limit = NUM2UINT(rb_limit);
146
+ }
147
+ matches = (trigram_match) malloc(limit * sizeof(trigram_match_t));
148
+
115
149
  res = blurrily_storage_find(haystack, needle, limit, matches);
116
150
  assert(res >= 0);
117
151
 
@@ -137,6 +171,7 @@ static VALUE blurrily_stats(VALUE self)
137
171
  VALUE result = rb_hash_new();
138
172
  int res = -1;
139
173
 
174
+ if (raise_if_closed(self)) return Qnil;
140
175
  Data_Get_Struct(self, struct trigram_map_t, haystack);
141
176
 
142
177
  res = blurrily_storage_stats(haystack, &stats);
@@ -150,15 +185,35 @@ static VALUE blurrily_stats(VALUE self)
150
185
 
151
186
  /******************************************************************************/
152
187
 
188
+ static VALUE blurrily_close(VALUE self) /* Map#close: closes the underlying storage and flags the object as closed. Returns Qnil; raises eClosedError if already closed, or a system error when the storage close fails. */
189
+ {
190
+ trigram_map haystack = (trigram_map)NULL;
191
+ int res = -1;
192
+
193
+ if (raise_if_closed(self)) return Qnil; /* closing twice raises instead of closing the storage again */
194
+ Data_Get_Struct(self, struct trigram_map_t, haystack);
195
+
196
+ res = blurrily_storage_close(&haystack);
197
+ if (res < 0) rb_sys_fail(NULL); /* raises; in that case DATA_PTR and @closed below are left untouched */
198
+
199
+ DATA_PTR(self) = NULL; /* blurrily_free() and blurrily_mark() return early on a NULL pointer */
200
+ mark_as_closed(self);
201
+ return Qnil;
202
+ }
203
+
204
+ /******************************************************************************/
205
+
153
206
  void Init_map_ext(void) {
154
- VALUE module = Qnil;
155
207
  VALUE klass = Qnil;
156
208
 
157
209
  /* assume we haven't yet defined blurrily */
158
- module = rb_define_module("Blurrily");
159
- assert(module != Qnil);
210
+ eBlurrilyModule = rb_define_module("Blurrily");
211
+ assert(eBlurrilyModule != Qnil);
212
+
213
+ klass = rb_define_class_under(eBlurrilyModule, "Map", rb_cObject);
214
+ assert(klass != Qnil);
160
215
 
161
- klass = rb_define_class_under(module, "Map", rb_cObject);
216
+ eClosedError = rb_define_class_under(klass, "ClosedError", rb_eRuntimeError);
162
217
  assert(klass != Qnil);
163
218
 
164
219
  rb_define_singleton_method(klass, "new", blurrily_new, 0);
@@ -170,5 +225,6 @@ void Init_map_ext(void) {
170
225
  rb_define_method(klass, "save", blurrily_save, 1);
171
226
  rb_define_method(klass, "find", blurrily_find, 2);
172
227
  rb_define_method(klass, "stats", blurrily_stats, 0);
228
+ rb_define_method(klass, "close", blurrily_close, 0);
173
229
  return;
174
230
  }