blurrily 0.0.1 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ed39eb955b4d71f3b924a16be4430046ba1d02ab
4
- data.tar.gz: 1c5a5b42b6877ad3d66928a0fe0520ea73defa9b
3
+ metadata.gz: 5278062ebce0b77e8f18ffebdefa1639cafc8b59
4
+ data.tar.gz: 5ce39cb3bc008428947ff9ab234d8aba07455241
5
5
  SHA512:
6
- metadata.gz: 54fdb049c894470cf18afdafe18053607e1b4336b6f7353866ae8d81115e87a97ed6f5273270d930a88c292bf02a361868280997b6dbe5668c894aa456745950
7
- data.tar.gz: b8c280aa93d062a9a89fbda80cdf3365efcb34ed3e3c28d8dadf6c9b9ee5deba389a3a0233c788e8e182894b1067254ec9a9ef4ae80e7c1676a60edd6cd50e83
6
+ metadata.gz: 28fdbf9009c005523e30c450fabb461fff877a4c86d37fe83096998f5ab1addd2d0d032c378f5333e553e27fe28aac9a500508967e64a7f0c33ecdc9c960fe06
7
+ data.tar.gz: 5e9894f29ce68dfc5ceb222e38f8a05c138f5c7cdc770b8a6c0940b765d9ae4d983cc7ee3e74449b94136b696e65dec666de85cfa1aa2f8b2d38970b6add1fad
data/README.md CHANGED
@@ -1,13 +1,27 @@
1
- # Blurrily — Fast fuzzy text search
1
+ # Blurrily — Millisecond fuzzy string matching
2
2
 
3
3
  [![Build Status](https://travis-ci.org/mezis/blurrily.png?branch=master)](https://travis-ci.org/mezis/blurrily)
4
4
  [![Dependency Status](https://gemnasium.com/mezis/blurrily.png)](https://gemnasium.com/mezis/blurrily)
5
5
  [![Code Climate](https://codeclimate.com/github/mezis/blurrily.png)](https://codeclimate.com/github/mezis/blurrily)
6
6
 
7
- This will be a C version of [fuzzily](http://github.com/mezis/fuzzily), a
8
- Ruby gem to perform fuzzy text searching.
7
+ > Show me photos of **Marakech** !
8
+ >
9
+ > Here are some photos of **Marrakesh**, Morocco.
10
+ > Did you mean **Martanesh**, Albania, **Marakkanam**, India, or **Marasheshty**, Romania?
11
+
12
+ Blurrily finds misspelt or partial needles in a haystack of strings, quickly.
13
+ It scales well: its response time is typically 1-2ms on user-input datasets
14
+ and 75-100ms on pathological datasets ([more](#Benchmarks)).
15
+
16
+ Blurrily is compatible and tested with all MRI Rubies from 1.8.7 to 2.0.0.
17
+ It is tested on Linux 2.6 (32bit and 64bit) and MacOS X 10.8.
18
+
19
+ Blurrily uses a tweaked [trigram](http://en.wikipedia.org/wiki/N-gram)-based
20
+ approach to find good matches. If you're using ActiveRecord and looking for
21
+ a lightweight (albeit much slower), in-process, Rails-friendly version of
22
+ this, check out [fuzzily](http://github.com/mezis/fuzzily), a Ruby gem to
23
+ perform fuzzy text searching in ActiveRecord.
9
24
 
10
- WORK IN PROGRESS.
11
25
 
12
26
  ## Installation
13
27
 
@@ -15,17 +29,171 @@ Add this line to your application's Gemfile:
15
29
 
16
30
  gem 'blurrily'
17
31
 
18
- And then execute:
19
-
20
- $ bundle
21
-
22
32
  Or install it yourself as:
23
33
 
24
34
  $ gem install blurrily
25
35
 
26
36
  ## Usage
27
37
 
28
- TODO: Write usage instructions here
38
+ You can use blurrily as a client/server combination (recommended in
39
+ production), or use the internals standalone.
40
+
41
+ See the [API Documentation](http://rubydoc.info/github/mezis/blurrily/frames)
42
+ for more details.
43
+
44
+ ### Client/server
45
+
46
+ Fire up a blurrily server:
47
+
48
+ $ blurrily
49
+
50
+ Open up a console and connect:
51
+
52
+ $ irb -rubygems
53
+ > require 'blurrily/client'
54
+ > client = Blurrily::Client.new
55
+
56
+ Store a needle with a reference:
57
+
58
+ > client.put('London', 1337)
59
+
60
+ Recover a reference from the haystack:
61
+
62
+ > client.find('lonndon')
63
+ #=> [1337]
64
+
65
+ ### Standalone
66
+
67
+ Create the in-memory database:
68
+
69
+ > map = Blurrily::Map.new
70
+
71
+ Store a needle with a reference:
72
+
73
+ > map.put('London', 1337)
74
+
75
+ Recover a reference from the haystack:
76
+
77
+ > map.find('lonndon')
78
+ #=> [1337]
79
+
80
+ Save the database to disk:
81
+
82
+ > map.save('/var/db/data.trigrams')
83
+
84
+ Load a previously saved database:
85
+
86
+ > map = Blurrily::Map.load('/var/db/data.trigrams')
87
+
88
+
89
+ ## Caveats
90
+
91
+ ### Diacritics, non-latin languages
92
+
93
+ Blurrily forms trigrams from the 26 latin letters and a stop character (used
94
+ to model start-of-string and separation between words in multi-word
95
+ strings).
96
+
97
+ This means that case and diacritics are completely ignored by Blurrily. For
98
+ instance, *Puy-de-Dôme* is strictly equivalent to *puy de dome*.
99
+
100
+ It also means that any non-latin input will probably result in garbage data
101
+ and garbage results (although it won't crash).
102
+
103
+ ### Multi-word needles and edge stickiness.
104
+
105
+ Multi-word needles (say, *New York*) are supported.
106
+
107
+ The engine always favours matches that begin and end similarly to the
108
+ needle, with a bias to the beginning of the strings.
109
+
110
+ This is because internally, the string *New York* is turned into this
111
+ sequence of trigrams: `**n`, `*ne`, `new`, `ew*`, `w*y`, `*yo`, `yor`,
112
+ `ork`, `rk*`.
113
+
114
+ ## Production notes
115
+
116
+ ### Memory usage
117
+
118
+ Blurrily does not store your original strings but rather a flat map of
119
+ references and weights for each trigram in your input strings.
120
+
121
+ In practice any database will use up a base 560KB for the index header, plus
122
+ 128 bits per trigram.
123
+
124
+ As a rule of thumb, memory usage is 40MB + 8 times the size of your
125
+ input data, and 50% extra on top during bulk imports (lots of writes to the
126
+ database).
127
+
128
+ For instance, `/usr/share/dict/words` is a list of 235k English words, and
129
+ weighs 2.5MB. Importing the whole list uses up 75MB of memory, 51MB of which
130
+ are the database.
131
+
132
+ Note that once a database has been written to disk and loaded from disk,
133
+ memory usage is minimal (560KB per database) as the database file is memory
134
+ mapped. For performance you do need as much free memory as the database
135
+ size.
136
+
137
+ ### Disk usage
138
+
139
+ Disk usage is almost exactly like memory usage, since database files are
140
+ nothing more than a memory dump.
141
+
142
+ In the `/usr/share/dict/words` example, on-disk size is 51MB.
143
+ For the whole list of Geonames places, on-disk size is 1.1GB.
144
+
145
+ ### Read v write
146
+
147
+ Writing to blurrily (with `#put`) is fairly expensive—it's a search engine
148
+ after all, optimized for intensive reads.
149
+
150
+ Supporting writes means the engine needs to keep a hash table of all
151
+ references around, typically weighing 50% of your total input. This is built
152
+ lazily while writing however; so if you load a database from disk and only
153
+ ever read, you will not incur the memory penalty.
154
+
155
+ ### Saving & backing up
156
+
157
+ Blurrily saves atomically (writing to a separate file, then using rename(2)
158
+ to overwrite the old file), meaning you should never lose data.
159
+
160
+ The server does this for you every 60 seconds and when quitting. If using
161
+ `Blurrily::Map` directly, remember that a map loaded from disk is more
162
+ memory efficient than a map in memory, so if your workload is read-heavy,
163
+ you should `.load` after each `#save`.
164
+
165
+ Backing up comes with a caveat: database files are only portable across
166
+ architectures if endianness and pointer size are the same (tested between
167
+ darwin-x86_64 and linux-amd64).
168
+
169
+ Database files are very compressible; `bzip2` typically shrinks them to 20%
170
+ of their original size.
171
+
172
+
173
+ ## Benchmarks
174
+
175
+ Blurrily is wicked fast, often 100x faster than its ancestor,
176
+ [fuzzily](http://github.com/mezis/fuzzily). This is because it's a close-to-
177
+ the-metal, single-purpose index using almost exclusively libc primitives. On
178
+ the inside the only expensive operations it performs are
179
+
180
+ - memcpy(3) lots of data around (selection);
181
+ - mergesort(3) to aggregate/count similar entries (reduction);
182
+ - qsort(3) to order by counts (sort).
183
+
184
+ It tends to be faster with large datasets on BSD than on Linux because the
185
+ former has fast quicksort and mergesort, whereas the latter only has `qsort`,
186
+ a slower, catch-all sorter. In complexity terms this is because FIND tends
187
+ to be *O(n)* on BSD and *O(n ln n)* on Linux.
188
+
189
+ Enough talk, here are the graphs. The `LOAD` and `PUT` operations are O(1)
190
+ and take respectively ~10ms and ~100µs on any platform, so they aren't
191
+ graphed here.
192
+
193
+ - [FIND latency](/doc/bench-find.png)
194
+ - [SAVE latency](/doc/bench-save.png)
195
+ - [DELETE latency](/doc/bench-delete.png)
196
+
29
197
 
30
198
  ## Contributing
31
199
 
@@ -0,0 +1,43 @@
1
+ #!/usr/bin/env ruby
2
+ $PROGRAM_NAME = 'blurrily'
3
+
4
+ require "blurrily/server"
5
+ require 'optparse'
6
+ require 'ostruct'
7
+
8
+ options = OpenStruct.new
9
+
10
+ # Defaults
11
+ options.port = 12021
12
+ options.directory = '.'
13
+ options.host = '0.0.0.0'
14
+
15
+ parser = OptionParser.new do |opts|
16
+ opts.banner = "Usage: #{$PROGRAM_NAME} [options]"
17
+
18
+ opts.on("-p", "--port <PORT>", "Bind to PORT, defaults to 12021") do |port|
19
+ puts 'Port has to be numeric value' and exit unless port =~ /\d+/
20
+ options.port = port.to_i
21
+ end
22
+
23
+ opts.on("-d", "--directory <DIRECTORY>", "Work in DIRECTORY, defaults to .") do |directory|
24
+ options.directory = directory
25
+ end
26
+
27
+ opts.on("-b", "--bind <ADDRESS>", "Bind to ADDRESS, defaults to 0.0.0.0") do |address|
28
+ options.host = address || '0.0.0.0'
29
+ end
30
+
31
+ opts.on("-V", "--version", "Output version") do |address|
32
+ puts Blurrily::VERSION
33
+ exit
34
+ end
35
+
36
+ opts.on_tail("-h", "--help", "Show this message") do
37
+ puts opts
38
+ exit
39
+ end
40
+ end
41
+
42
+ parser.parse!(ARGV)
43
+ Blurrily::Server.new(:host => options.host, :port => options.port, :directory => options.directory).start
@@ -1,2 +1,16 @@
1
+ /*
2
+
3
+ blurrily.h --
4
+
5
+ Helper macros
6
+
7
+ */
8
+
1
9
  #define PACKED_STRUCT __attribute__ ((__packed__))
2
10
  #define UNUSED(_IDENT) _IDENT __attribute__ ((unused))
11
+
12
+ #ifdef DEBUG
13
+ #define LOG(...) fprintf(stderr, __VA_ARGS__)
14
+ #else
15
+ #define LOG(...)
16
+ #endif
@@ -5,7 +5,11 @@ SHARED_FLAGS = "-DPLATFORM_#{PLATFORM} --std=c99 -Wall -Wextra -Werror"
5
5
 
6
6
  case PLATFORM
7
7
  when 'LINUX'
8
- SHARED_FLAGS += ' -D_XOPEN_SOURCE=500' # for ftruncate to be present
8
+ # make sure ftruncate is available
9
+ SHARED_FLAGS << ' -D_XOPEN_SOURCE=700'
10
+ SHARED_FLAGS << ' -D_GNU_SOURCE=1'
11
+ # make sure off_t is 64 bit long
12
+ SHARED_FLAGS << ' -D_FILE_OFFSET_BITS=64'
9
13
  end
10
14
 
11
15
  # production
@@ -3,18 +3,44 @@
3
3
  #include "storage.h"
4
4
  #include "blurrily.h"
5
5
 
6
+ static VALUE eClosedError = Qnil;
7
+ static VALUE eBlurrilyModule = Qnil;
8
+
9
+ /******************************************************************************/
10
+
11
+ static int raise_if_closed(VALUE self)
12
+ {
13
+ if (rb_ivar_get(self, rb_intern("@closed")) != Qtrue) return 0;
14
+ rb_raise(eClosedError, "Map was freed");
15
+ return 1;
16
+ }
17
+
18
+ static void mark_as_closed(VALUE self)
19
+ {
20
+ rb_ivar_set(self, rb_intern("@closed"), Qtrue);
21
+ }
22
+
6
23
  /******************************************************************************/
7
24
 
8
25
  static void blurrily_free(void* haystack)
9
26
  {
10
27
  int res = -1;
11
28
 
29
+ if (haystack == NULL) return;
12
30
  res = blurrily_storage_close((trigram_map*) &haystack);
13
31
  assert(res >= 0);
14
32
  }
15
33
 
16
34
  /******************************************************************************/
17
35
 
36
+ static void blurrily_mark(void* haystack)
37
+ {
38
+ if (haystack == NULL) return;
39
+ blurrily_storage_mark((trigram_map) haystack);
40
+ }
41
+
42
+ /******************************************************************************/
43
+
18
44
  static VALUE blurrily_new(VALUE class) {
19
45
  VALUE wrapper = Qnil;
20
46
  trigram_map haystack = (trigram_map)NULL;
@@ -23,7 +49,7 @@ static VALUE blurrily_new(VALUE class) {
23
49
  res = blurrily_storage_new(&haystack);
24
50
  if (res < 0) { rb_sys_fail(NULL); return Qnil; }
25
51
 
26
- wrapper = Data_Wrap_Struct(class, 0, blurrily_free, (void*)haystack);
52
+ wrapper = Data_Wrap_Struct(class, blurrily_mark, blurrily_free, (void*)haystack);
27
53
  rb_obj_call_init(wrapper, 0, NULL);
28
54
  return wrapper;
29
55
  }
@@ -39,7 +65,7 @@ static VALUE blurrily_load(VALUE class, VALUE rb_path) {
39
65
  res = blurrily_storage_load(&haystack, path);
40
66
  if (res < 0) { rb_sys_fail(NULL); return Qnil; }
41
67
 
42
- wrapper = Data_Wrap_Struct(class, 0, blurrily_free, (void*)haystack);
68
+ wrapper = Data_Wrap_Struct(class, blurrily_mark, blurrily_free, (void*)haystack);
43
69
  rb_obj_call_init(wrapper, 0, NULL);
44
70
  return wrapper;
45
71
  }
@@ -59,12 +85,13 @@ static VALUE blurrily_put(VALUE self, VALUE rb_needle, VALUE rb_reference, VALUE
59
85
  uint32_t reference = NUM2UINT(rb_reference);
60
86
  uint32_t weight = NUM2UINT(rb_weight);
61
87
 
88
+ if (raise_if_closed(self)) return Qnil;
62
89
  Data_Get_Struct(self, struct trigram_map_t, haystack);
63
90
 
64
91
  res = blurrily_storage_put(haystack, needle, reference, weight);
65
92
  assert(res >= 0);
66
93
 
67
- return Qnil;
94
+ return INT2NUM(res);
68
95
  }
69
96
 
70
97
  /******************************************************************************/
@@ -74,6 +101,7 @@ static VALUE blurrily_delete(VALUE self, VALUE rb_reference) {
74
101
  uint32_t reference = NUM2UINT(rb_reference);
75
102
  int res = -1;
76
103
 
104
+ if (raise_if_closed(self)) return Qnil;
77
105
  Data_Get_Struct(self, struct trigram_map_t, haystack);
78
106
 
79
107
  res = blurrily_storage_delete(haystack, reference);
@@ -89,10 +117,11 @@ static VALUE blurrily_save(VALUE self, VALUE rb_path) {
89
117
  int res = -1;
90
118
  const char* path = StringValuePtr(rb_path);
91
119
 
120
+ if (raise_if_closed(self)) return Qnil;
92
121
  Data_Get_Struct(self, struct trigram_map_t, haystack);
93
122
 
94
123
  res = blurrily_storage_save(haystack, path);
95
- assert(res >= 0);
124
+ if (res < 0) rb_sys_fail(NULL);
96
125
 
97
126
  return Qnil;
98
127
  }
@@ -107,11 +136,16 @@ static VALUE blurrily_find(VALUE self, VALUE rb_needle, VALUE rb_limit) {
107
136
  trigram_match matches = NULL;
108
137
  VALUE rb_matches = Qnil;
109
138
 
110
- if (limit <= 0) { limit = 10 ; }
111
- matches = (trigram_match) malloc(limit * sizeof(trigram_match_t));
112
-
139
+ if (raise_if_closed(self)) return Qnil;
113
140
  Data_Get_Struct(self, struct trigram_map_t, haystack);
114
141
 
142
+ if (limit <= 0) {
143
+ // rb_limit = rb_const_get(eBlurrilyModule, rb_intern('LIMIT_DEFAULT'));
144
+ rb_limit = rb_const_get(eBlurrilyModule, rb_intern("LIMIT_DEFAULT"));
145
+ limit = NUM2UINT(rb_limit);
146
+ }
147
+ matches = (trigram_match) malloc(limit * sizeof(trigram_match_t));
148
+
115
149
  res = blurrily_storage_find(haystack, needle, limit, matches);
116
150
  assert(res >= 0);
117
151
 
@@ -137,6 +171,7 @@ static VALUE blurrily_stats(VALUE self)
137
171
  VALUE result = rb_hash_new();
138
172
  int res = -1;
139
173
 
174
+ if (raise_if_closed(self)) return Qnil;
140
175
  Data_Get_Struct(self, struct trigram_map_t, haystack);
141
176
 
142
177
  res = blurrily_storage_stats(haystack, &stats);
@@ -150,15 +185,35 @@ static VALUE blurrily_stats(VALUE self)
150
185
 
151
186
  /******************************************************************************/
152
187
 
188
+ static VALUE blurrily_close(VALUE self)
189
+ {
190
+ trigram_map haystack = (trigram_map)NULL;
191
+ int res = -1;
192
+
193
+ if (raise_if_closed(self)) return Qnil;
194
+ Data_Get_Struct(self, struct trigram_map_t, haystack);
195
+
196
+ res = blurrily_storage_close(&haystack);
197
+ if (res < 0) rb_sys_fail(NULL);
198
+
199
+ DATA_PTR(self) = NULL;
200
+ mark_as_closed(self);
201
+ return Qnil;
202
+ }
203
+
204
+ /******************************************************************************/
205
+
153
206
  void Init_map_ext(void) {
154
- VALUE module = Qnil;
155
207
  VALUE klass = Qnil;
156
208
 
157
209
  /* assume we haven't yet defined blurrily */
158
- module = rb_define_module("Blurrily");
159
- assert(module != Qnil);
210
+ eBlurrilyModule = rb_define_module("Blurrily");
211
+ assert(eBlurrilyModule != Qnil);
212
+
213
+ klass = rb_define_class_under(eBlurrilyModule, "Map", rb_cObject);
214
+ assert(klass != Qnil);
160
215
 
161
- klass = rb_define_class_under(module, "Map", rb_cObject);
216
+ eClosedError = rb_define_class_under(klass, "ClosedError", rb_eRuntimeError);
162
217
  assert(klass != Qnil);
163
218
 
164
219
  rb_define_singleton_method(klass, "new", blurrily_new, 0);
@@ -170,5 +225,6 @@ void Init_map_ext(void) {
170
225
  rb_define_method(klass, "save", blurrily_save, 1);
171
226
  rb_define_method(klass, "find", blurrily_find, 2);
172
227
  rb_define_method(klass, "stats", blurrily_stats, 0);
228
+ rb_define_method(klass, "close", blurrily_close, 0);
173
229
  return;
174
230
  }