blurrily 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +22 -0
- data/README.md +36 -0
- data/ext/blurrily/blurrily.h +2 -0
- data/ext/blurrily/extconf.rb +17 -0
- data/ext/blurrily/log.h +5 -0
- data/ext/blurrily/map_ext.c +174 -0
- data/ext/blurrily/storage.c +541 -0
- data/ext/blurrily/storage.h +109 -0
- data/ext/blurrily/tokeniser.c +127 -0
- data/ext/blurrily/tokeniser.h +41 -0
- data/lib/blurrily.rb +3 -0
- data/lib/blurrily/map.rb +34 -0
- data/lib/blurrily/server.rb +0 -0
- data/lib/blurrily/version.rb +3 -0
- metadata +199 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: ed39eb955b4d71f3b924a16be4430046ba1d02ab
|
4
|
+
data.tar.gz: 1c5a5b42b6877ad3d66928a0fe0520ea73defa9b
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 54fdb049c894470cf18afdafe18053607e1b4336b6f7353866ae8d81115e87a97ed6f5273270d930a88c292bf02a361868280997b6dbe5668c894aa456745950
|
7
|
+
data.tar.gz: b8c280aa93d062a9a89fbda80cdf3365efcb34ed3e3c28d8dadf6c9b9ee5deba389a3a0233c788e8e182894b1067254ec9a9ef4ae80e7c1676a60edd6cd50e83
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 HouseTrip Ltd.
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
# Blurrily — Fast fuzzy text search
|
2
|
+
|
3
|
+
[Build status](https://travis-ci.org/mezis/blurrily)
|
4
|
+
[Dependency status](https://gemnasium.com/mezis/blurrily)
|
5
|
+
[Code Climate](https://codeclimate.com/github/mezis/blurrily)
|
6
|
+
|
7
|
+
This will be a C version of [fuzzily](http://github.com/mezis/fuzzily), a
|
8
|
+
Ruby gem to perform fuzzy text searching.
|
9
|
+
|
10
|
+
WORK IN PROGRESS.
|
11
|
+
|
12
|
+
## Installation
|
13
|
+
|
14
|
+
Add this line to your application's Gemfile:
|
15
|
+
|
16
|
+
gem 'blurrily'
|
17
|
+
|
18
|
+
And then execute:
|
19
|
+
|
20
|
+
$ bundle
|
21
|
+
|
22
|
+
Or install it yourself as:
|
23
|
+
|
24
|
+
$ gem install blurrily
|
25
|
+
|
26
|
+
## Usage
|
27
|
+
|
28
|
+
TODO: Write usage instructions here
|
29
|
+
|
30
|
+
## Contributing
|
31
|
+
|
32
|
+
1. Fork it
|
33
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
34
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
35
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
36
|
+
5. Create new Pull Request
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'mkmf'

# Kernel name reported by uname(1), upper-cased (e.g. "LINUX").
# It is exposed to the C sources as the -DPLATFORM_<NAME> define.
PLATFORM = `uname`.strip.upcase

# Compiler flags shared by every build flavour. The list is assembled before
# the constant is assigned, so the constant is never reassigned (the previous
# `SHARED_FLAGS +=` triggered an "already initialized constant" warning).
shared_flags = ["-DPLATFORM_#{PLATFORM}", '--std=c99', '-Wall', '-Wextra', '-Werror']
shared_flags << '-D_XOPEN_SOURCE=500' if PLATFORM == 'LINUX' # for ftruncate to be present
SHARED_FLAGS = shared_flags.join(' ')

# production
$CFLAGS += " #{SHARED_FLAGS} -O3 -fno-fast-math"

# development
# $CFLAGS += " #{SHARED_FLAGS} -O0 -g"

create_makefile('blurrily/map_ext')
|
data/ext/blurrily/log.h
ADDED
@@ -0,0 +1,174 @@
|
|
1
|
+
#include <ruby.h>
|
2
|
+
#include <assert.h>
|
3
|
+
#include "storage.h"
|
4
|
+
#include "blurrily.h"
|
5
|
+
|
6
|
+
/******************************************************************************/
|
7
|
+
|
8
|
+
static void blurrily_free(void* haystack)
|
9
|
+
{
|
10
|
+
int res = -1;
|
11
|
+
|
12
|
+
res = blurrily_storage_close((trigram_map*) &haystack);
|
13
|
+
assert(res >= 0);
|
14
|
+
}
|
15
|
+
|
16
|
+
/******************************************************************************/
|
17
|
+
|
18
|
+
/* Blurrily::Map.new -- builds an empty in-memory map and wraps it in a Ruby */
/* object of <class>. Raises a system error when the storage layer reports */
/* failure. */
static VALUE blurrily_new(VALUE class) {
  trigram_map haystack = (trigram_map)NULL;
  VALUE       wrapper  = Qnil;

  if (blurrily_storage_new(&haystack) < 0) {
    rb_sys_fail(NULL);
    return Qnil;
  }

  /* no mark function; blurrily_free releases the map on garbage collection */
  wrapper = Data_Wrap_Struct(class, 0, blurrily_free, (void*)haystack);
  rb_obj_call_init(wrapper, 0, NULL);
  return wrapper;
}
|
30
|
+
|
31
|
+
/******************************************************************************/
|
32
|
+
|
33
|
+
/* Blurrily::Map.load(path) -- maps a previously saved file and wraps it in a */
/* Ruby object of <class>. Raises a system error (built from errno) when the */
/* storage layer cannot load the file. */
static VALUE blurrily_load(VALUE class, VALUE rb_path) {
  /* StringValueCStr guarantees a NUL-terminated string and rejects embedded */
  /* NUL bytes, which StringValuePtr does not; both matter for a filesystem path. */
  const char* path     = StringValueCStr(rb_path);
  VALUE       wrapper  = Qnil;
  trigram_map haystack = (trigram_map)NULL;
  int         res      = -1;

  res = blurrily_storage_load(&haystack, path);
  if (res < 0) { rb_sys_fail(path); return Qnil; }

  wrapper = Data_Wrap_Struct(class, 0, blurrily_free, (void*)haystack);
  rb_obj_call_init(wrapper, 0, NULL);
  return wrapper;
}
|
46
|
+
|
47
|
+
/******************************************************************************/
|
48
|
+
|
49
|
+
/* Map#initialize -- takes no arguments and does no work; the native map is */
/* created by blurrily_new / blurrily_load before this is invoked. */
static VALUE blurrily_initialize(VALUE UNUSED(self))
{
  return Qtrue;
}
|
52
|
+
|
53
|
+
/******************************************************************************/
|
54
|
+
|
55
|
+
/* Map#put(needle, reference, weight) -- indexes <needle> under <reference>. */
/* Always returns nil; raises RuntimeError if the storage layer reports failure. */
static VALUE blurrily_put(VALUE self, VALUE rb_needle, VALUE rb_reference, VALUE rb_weight) {
  trigram_map haystack  = (trigram_map)NULL;
  int         res       = -1;
  /* Convert the integers first: the conversions may call back into Ruby, */
  /* and the C string pointer should be fetched after that. */
  uint32_t    reference = NUM2UINT(rb_reference);
  uint32_t    weight    = NUM2UINT(rb_weight);
  /* The storage layer uses strlen(), so the string must be NUL-terminated */
  /* and free of embedded NULs; StringValueCStr enforces both. */
  const char* needle    = StringValueCStr(rb_needle);

  Data_Get_Struct(self, struct trigram_map_t, haystack);

  res = blurrily_storage_put(haystack, needle, reference, weight);
  /* report failure as an exception instead of aborting the interpreter */
  if (res < 0) { rb_raise(rb_eRuntimeError, "blurrily: could not store needle"); }

  return Qnil;
}
|
69
|
+
|
70
|
+
/******************************************************************************/
|
71
|
+
|
72
|
+
/* Map#delete(reference) -- removes every entry stored under <reference> and */
/* returns the storage layer's result (the number of trigram entries removed) */
/* as a Ruby integer. */
static VALUE blurrily_delete(VALUE self, VALUE rb_reference) {
  trigram_map haystack  = (trigram_map)NULL;
  uint32_t    reference = NUM2UINT(rb_reference);
  int         removed   = -1;

  Data_Get_Struct(self, struct trigram_map_t, haystack);

  removed = blurrily_storage_delete(haystack, reference);
  assert(removed >= 0);

  return INT2NUM(removed);
}
|
84
|
+
|
85
|
+
/******************************************************************************/
|
86
|
+
|
87
|
+
/* Map#save(path) -- persists the map to <path>. Always returns nil; raises a */
/* system error (built from errno) if the storage layer reports failure. */
static VALUE blurrily_save(VALUE self, VALUE rb_path) {
  trigram_map haystack = (trigram_map)NULL;
  int         res      = -1;
  /* a path must be NUL-terminated and must not contain embedded NULs */
  const char* path     = StringValueCStr(rb_path);

  Data_Get_Struct(self, struct trigram_map_t, haystack);

  res = blurrily_storage_save(haystack, path);
  /* surface I/O failures to Ruby rather than aborting the process */
  if (res < 0) { rb_sys_fail(path); }

  return Qnil;
}
|
99
|
+
|
100
|
+
/******************************************************************************/
|
101
|
+
|
102
|
+
/* Map#find(needle, limit) -- returns an array of [reference, matches, weight] */
/* triples for the best matches of <needle>, at most <limit> of them. */
/* A non-positive <limit> falls back to 10. */
static VALUE blurrily_find(VALUE self, VALUE rb_needle, VALUE rb_limit) {
  trigram_map   haystack   = (trigram_map)NULL;
  int           res        = -1;
  int           limit      = NUM2UINT(rb_limit);
  const char*   needle     = StringValueCStr(rb_needle);
  trigram_match matches    = NULL;
  VALUE         rb_matches = Qnil;

  if (limit <= 0) { limit = 10 ; }
  /* blurrily_storage_find takes a uint16_t limit: clamp explicitly rather */
  /* than letting the implicit conversion wrap (65536 would become 0). */
  if (limit > UINT16_MAX) { limit = UINT16_MAX; }

  /* may raise, so resolve the receiver before anything is allocated */
  Data_Get_Struct(self, struct trigram_map_t, haystack);

  /* ALLOC_N raises NoMemoryError on failure instead of returning NULL */
  matches = ALLOC_N(trigram_match_t, limit);

  res = blurrily_storage_find(haystack, needle, (uint16_t) limit, matches);
  if (res < 0) {
    xfree(matches);
    rb_raise(rb_eRuntimeError, "blurrily: search failed");
  }

  /* wrap the matches into a Ruby array */
  rb_matches = rb_ary_new();
  for (int k = 0; k < res; ++k) {
    VALUE rb_match = rb_ary_new();
    rb_ary_push(rb_match, rb_uint_new(matches[k].reference));
    rb_ary_push(rb_match, rb_uint_new(matches[k].matches));
    rb_ary_push(rb_match, rb_uint_new(matches[k].weight));
    rb_ary_push(rb_matches, rb_match);
  }

  /* the scratch buffer was previously leaked on every call */
  xfree(matches);
  return rb_matches;
}
|
129
|
+
|
130
|
+
|
131
|
+
/******************************************************************************/
|
132
|
+
|
133
|
+
/* Map#stats -- returns a hash with the :references and :trigrams counters */
/* reported by the storage layer. */
static VALUE blurrily_stats(VALUE self)
{
  trigram_map    haystack = (trigram_map)NULL;
  trigram_stat_t stats;
  VALUE          result   = rb_hash_new();
  int            res      = -1;
  VALUE          key      = Qnil;

  Data_Get_Struct(self, struct trigram_map_t, haystack);

  res = blurrily_storage_stats(haystack, &stats);
  assert(res >= 0);

  key = ID2SYM(rb_intern("references"));
  (void) rb_hash_aset(result, key, UINT2NUM(stats.references));

  key = ID2SYM(rb_intern("trigrams"));
  (void) rb_hash_aset(result, key, UINT2NUM(stats.trigrams));

  return result;
}
|
150
|
+
|
151
|
+
/******************************************************************************/
|
152
|
+
|
153
|
+
void Init_map_ext(void) {
|
154
|
+
VALUE module = Qnil;
|
155
|
+
VALUE klass = Qnil;
|
156
|
+
|
157
|
+
/* assume we haven't yet defined blurrily */
|
158
|
+
module = rb_define_module("Blurrily");
|
159
|
+
assert(module != Qnil);
|
160
|
+
|
161
|
+
klass = rb_define_class_under(module, "Map", rb_cObject);
|
162
|
+
assert(klass != Qnil);
|
163
|
+
|
164
|
+
rb_define_singleton_method(klass, "new", blurrily_new, 0);
|
165
|
+
rb_define_singleton_method(klass, "load", blurrily_load, 1);
|
166
|
+
|
167
|
+
rb_define_method(klass, "initialize", blurrily_initialize, 0);
|
168
|
+
rb_define_method(klass, "put", blurrily_put, 3);
|
169
|
+
rb_define_method(klass, "delete", blurrily_delete, 1);
|
170
|
+
rb_define_method(klass, "save", blurrily_save, 1);
|
171
|
+
rb_define_method(klass, "find", blurrily_find, 2);
|
172
|
+
rb_define_method(klass, "stats", blurrily_stats, 0);
|
173
|
+
return;
|
174
|
+
}
|
@@ -0,0 +1,541 @@
|
|
1
|
+
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <sys/stat.h>
|
9
|
+
|
10
|
+
#ifdef PLATFORM_LINUX
|
11
|
+
#include <linux/limits.h>
|
12
|
+
#define MERGESORT fake_mergesort
|
13
|
+
#else
|
14
|
+
#include <limits.h>
|
15
|
+
#define MERGESORT mergesort
|
16
|
+
#endif
|
17
|
+
|
18
|
+
#ifndef PATH_MAX
|
19
|
+
/* safe default ... */
|
20
|
+
#define PATH_MAX 1024
|
21
|
+
#endif
|
22
|
+
|
23
|
+
#include "storage.h"
|
24
|
+
|
25
|
+
#include "log.h"
|
26
|
+
|
27
|
+
/******************************************************************************/
|
28
|
+
|
29
|
+
#define PAGE_SIZE 4096
|
30
|
+
#define TRIGRAM_COUNT (TRIGRAM_BASE * TRIGRAM_BASE * TRIGRAM_BASE)
|
31
|
+
#define TRIGRAM_ENTRIES_START_SIZE PAGE_SIZE/8
|
32
|
+
|
33
|
+
/******************************************************************************/
|
34
|
+
|
35
|
+
/* one trigram entry -- client reference and sorting weight */
|
36
|
+
struct PACKED_STRUCT trigram_entry_t
|
37
|
+
{
|
38
|
+
uint32_t reference;
|
39
|
+
uint32_t weight;
|
40
|
+
};
|
41
|
+
typedef struct trigram_entry_t trigram_entry_t;
|
42
|
+
|
43
|
+
|
44
|
+
/* collection of entries for a given trigram */
|
45
|
+
/* <entries> points to an array of <buckets> entries */
|
46
|
+
/* of which <used> are filled */
|
47
|
+
struct PACKED_STRUCT trigram_entries_t
|
48
|
+
{
|
49
|
+
uint32_t buckets;
|
50
|
+
uint32_t used;
|
51
|
+
|
52
|
+
trigram_entry_t* entries; /* set when the structure is in memory */
|
53
|
+
size_t entries_offset; /* set when the structure is on disk */
|
54
|
+
|
55
|
+
uint8_t dirty; /* not optimised (presorted) yet */
|
56
|
+
};
|
57
|
+
typedef struct trigram_entries_t trigram_entries_t;
|
58
|
+
|
59
|
+
|
60
|
+
/* hash map of all possible trigrams to collection of entries */
|
61
|
+
/* there are 28^3 = 19,683 possible trigrams */
|
62
|
+
struct PACKED_STRUCT trigram_map_t
|
63
|
+
{
|
64
|
+
char magic[6]; /* the string "trigra" */
|
65
|
+
uint8_t big_endian;
|
66
|
+
uint8_t pointer_size;
|
67
|
+
|
68
|
+
uint32_t total_references;
|
69
|
+
uint32_t total_trigrams;
|
70
|
+
size_t mapped_size; /* when mapped from disk, the number of bytes mapped */
|
71
|
+
int mapped_fd; /* when mapped from disk, the file descriptor */
|
72
|
+
|
73
|
+
trigram_entries_t map[TRIGRAM_COUNT]; /* this whole structure is ~500KB */
|
74
|
+
};
|
75
|
+
typedef struct trigram_map_t trigram_map_t;
|
76
|
+
|
77
|
+
/******************************************************************************/
|
78
|
+
|
79
|
+
#ifdef PLATFORM_LINUX
|
80
|
+
/* fake version of mergesort(3) implemented with qsort(3) as Linux lacks */
|
81
|
+
/* the specific variants */
|
82
|
+
/* Drop-in replacement for mergesort(3) with the same signature; it simply */
/* delegates to qsort(3) and always reports success (0). */
static int fake_mergesort(void *base, size_t nel, size_t width, int (*compar)(const void *, const void *))
{
  qsort(base, nel, width, compar);

  return 0;
}
|
87
|
+
#endif
|
88
|
+
|
89
|
+
/******************************************************************************/
|
90
|
+
|
91
|
+
/* 1 -> little endian, 2 -> big endian */
|
92
|
+
/* Byte-order marker stored in the map header. */
/* Returns 1 when the least significant byte of a 32-bit integer is stored */
/* first (little endian), 2 otherwise. */
static uint8_t get_big_endian(void)
{
  const uint32_t       probe = 0xAA0000BB;
  /* inspect the first byte in memory through a character-typed pointer */
  const unsigned char* first = (const unsigned char*) &probe;

  return (*first == 0xBB) ? 1 : 2;
}
|
99
|
+
|
100
|
+
/******************************************************************************/
|
101
|
+
|
102
|
+
/* 4 or 8 (bytes) */
|
103
|
+
/* Size of a data pointer on this platform, in bytes, narrowed to one byte */
/* so it fits the map header field. */
static uint8_t get_pointer_size(void)
{
  return (uint8_t) sizeof(void*);
}
|
107
|
+
|
108
|
+
/******************************************************************************/
|
109
|
+
|
110
|
+
static int compare_entries(const void* left_p, const void* right_p)
|
111
|
+
{
|
112
|
+
trigram_entry_t* left = (trigram_entry_t*)left_p;
|
113
|
+
trigram_entry_t* right = (trigram_entry_t*)right_p;
|
114
|
+
return (int)left->reference - (int)right->reference;
|
115
|
+
}
|
116
|
+
|
117
|
+
/* compares matches on #matches (descending) then weight (ascending) */
|
118
|
+
static int compare_matches(const void* left_p, const void* right_p)
|
119
|
+
{
|
120
|
+
trigram_match_t* left = (trigram_match_t*)left_p;
|
121
|
+
trigram_match_t* right = (trigram_match_t*)right_p;
|
122
|
+
/* int delta = (int)left->matches - (int)right->matches; */
|
123
|
+
int delta = (int)right->matches - (int)left->matches;
|
124
|
+
|
125
|
+
return (delta != 0) ? delta : ((int)left->weight - (int)right->weight);
|
126
|
+
|
127
|
+
}
|
128
|
+
|
129
|
+
/******************************************************************************/
|
130
|
+
|
131
|
+
/* Sorts the used entries of <map> by reference, but only when the dirty */
/* flag is set; clears the flag afterwards. */
static void sort_map_if_dirty(trigram_entries_t* map)
{
  if (map->dirty) {
    int res = MERGESORT(map->entries, map->used, sizeof(trigram_entry_t), &compare_entries);

    assert(res >= 0);
    map->dirty = 0;
  }
}
|
140
|
+
|
141
|
+
/******************************************************************************/
|
142
|
+
|
143
|
+
static size_t round_to_page(size_t value)
|
144
|
+
{
|
145
|
+
if (value % PAGE_SIZE == 0) return value;
|
146
|
+
return (value / PAGE_SIZE + 1) * PAGE_SIZE;
|
147
|
+
}
|
148
|
+
|
149
|
+
/******************************************************************************/
|
150
|
+
|
151
|
+
/* Number of bytes occupied by the entry array of trigram <index>, */
/* computed from its allocated capacity (<buckets>), not from <used>. */
static size_t get_map_size(trigram_map haystack, int index)
{
  const trigram_entries_t* slot = &haystack->map[index];

  return slot->buckets * sizeof(trigram_entry_t);
}
|
155
|
+
|
156
|
+
/******************************************************************************/
|
157
|
+
|
158
|
+
/* Releases <ptr>. free(NULL) is defined to do nothing, so no explicit NULL */
/* guard is required. */
static void free_if(void* ptr)
{
  free(ptr);
}
|
164
|
+
|
165
|
+
/******************************************************************************/
|
166
|
+
|
167
|
+
/* Allocates an empty, memory-resident map and stores it in <*haystack_ptr>. */
/* Returns 0 on success, -1 when the allocation fails (<*haystack_ptr> is */
/* left untouched in that case). */
int blurrily_storage_new(trigram_map* haystack_ptr)
{
  trigram_map created = (trigram_map)NULL;

  LOG("blurrily_storage_new\n");

  /* calloc returns zero-filled storage, like the former malloc + memset pair */
  created = (trigram_map) calloc(1, sizeof(trigram_map_t));
  if (created == NULL) return -1;

  /* header */
  memcpy(created->magic, "trigra", 6);
  created->big_endian   = get_big_endian();
  created->pointer_size = get_pointer_size();

  /* not mapped, as we just created it in memory */
  created->mapped_size      = 0;
  created->mapped_fd        = 0;
  created->total_references = 0;
  created->total_trigrams   = 0;

  /* every trigram starts without any storage */
  for (int slot = 0; slot < TRIGRAM_COUNT; ++slot) {
    trigram_entries_t* bucket = &created->map[slot];

    bucket->buckets = 0;
    bucket->used    = 0;
    bucket->dirty   = 0;
    bucket->entries = (trigram_entry_t*)NULL;
  }

  *haystack_ptr = created;
  return 0;
}
|
197
|
+
|
198
|
+
/******************************************************************************/
|
199
|
+
|
200
|
+
/* Maps the file at <path> (as written by blurrily_storage_save) privately */
/* into memory and stores the resulting map in <*haystack>. */
/* */
/* Returns 0 on success. On failure returns -1 with errno set, leaves */
/* <*haystack> untouched, and releases the mapping and the descriptor. */
/* On success the descriptor stays open and is recorded in the header. */
int blurrily_storage_load(trigram_map* haystack, const char* path)
{
  int         fd          = -1;
  int         saved_errno = 0;
  size_t      size        = 0;
  void*       mapping     = MAP_FAILED;
  trigram_map header      = NULL;
  uint8_t*    origin      = NULL;
  struct stat metadata;

  /* open and map file */
  fd = open(path, O_RDONLY);
  if (fd < 0) return -1;

  if (fstat(fd, &metadata) < 0) goto failure;

  /* a file shorter than the header cannot be a map; reading it would fault */
  if (metadata.st_size < 0 || (size_t) metadata.st_size < sizeof(trigram_map_t)) {
    errno = EINVAL;
    goto failure;
  }
  size = (size_t) metadata.st_size;

  /* mmap reports failure with MAP_FAILED, not NULL */
  mapping = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
  if (mapping == MAP_FAILED) goto failure;
  header = (trigram_map) mapping;
  origin = (uint8_t*) mapping;

  /* check magic, and that the file was written with the same byte order */
  /* and pointer width (the header embeds raw pointers and size_t fields) */
  if (memcmp(header->magic, "trigra", 6) != 0
      || header->big_endian   != get_big_endian()
      || header->pointer_size != get_pointer_size()) {
    errno = EINVAL;
    goto failure;
  }

  /* turn on-disk offsets into in-memory pointers */
  for (int k = 0; k < TRIGRAM_COUNT; ++k) {
    trigram_entries_t* map = header->map + k;

    if (map->entries_offset == 0) continue;

    /* reject entry arrays that would extend past the end of the file */
    if (map->entries_offset > size
        || map->buckets > (size - map->entries_offset) / sizeof(trigram_entry_t)
        || map->used > map->buckets) {
      errno = EINVAL;
      goto failure;
    }

    map->entries        = (trigram_entry_t*) (origin + map->entries_offset);
    map->entries_offset = 0;
  }

  /* fix header data */
  header->mapped_size = size;
  header->mapped_fd   = fd;

  *haystack = header;
  return 0;

failure:
  saved_errno = errno; /* munmap/close below may overwrite it */
  if (mapping != MAP_FAILED) (void) munmap(mapping, size);
  (void) close(fd);
  errno = saved_errno;
  return -1;
}
|
236
|
+
|
237
|
+
/******************************************************************************/
|
238
|
+
|
239
|
+
/* Releases everything owned by <*haystack_ptr> and resets it to NULL. */
/* Handles both memory-resident maps and maps loaded from disk. */
/* Returns 0; closing an already-NULL map is a no-op. */
int blurrily_storage_close(trigram_map* haystack_ptr)
{
  trigram_map haystack = *haystack_ptr;
  int         res      = -1;

  LOG("blurrily_storage_close\n");

  if (haystack == NULL) return 0;

  if (haystack->mapped_size) {
    /* copy what we need out of the header before it is unmapped */
    int       fd    = haystack->mapped_fd;
    size_t    size  = haystack->mapped_size;
    uintptr_t begin = (uintptr_t) haystack;
    uintptr_t end   = begin + size;

    /* Entry arrays added after loading (see blurrily_storage_put) live on */
    /* the heap, outside the mapping; they were previously leaked here. */
    for (int k = 0; k < TRIGRAM_COUNT; ++k) {
      uintptr_t addr = (uintptr_t) haystack->map[k].entries;

      if (addr < begin || addr >= end) free(haystack->map[k].entries);
    }

    res = munmap(haystack, size);
    assert(res >= 0);

    res = close(fd);
    assert(res >= 0);
  } else {
    for (int k = 0 ; k < TRIGRAM_COUNT ; ++k) {
      free(haystack->map[k].entries);
    }
    free(haystack);
  }
  (void) res; /* only inspected by the assertions above */

  *haystack_ptr = NULL;
  return 0;
}
|
266
|
+
|
267
|
+
/******************************************************************************/
|
268
|
+
|
269
|
+
/* Writes <haystack> to "<path>.tmp" through a shared mapping, then renames */
/* it over <path> so that readers never observe a partially written file. */
/* */
/* Layout: the header rounded up to a page, then one page-rounded block per */
/* non-empty trigram; each header slot records its block's offset. */
/* */
/* Returns 0 on success. On failure returns -1 with errno set and removes */
/* the temporary file. */
int blurrily_storage_save(trigram_map haystack, const char* path)
{
  int         fd          = -1;
  int         written     = 0;
  int         saved_errno = 0;
  void*       mapping     = MAP_FAILED;
  uint8_t*    ptr         = (uint8_t*)NULL;
  size_t      total_size  = 0;
  size_t      offset      = 0;
  trigram_map header      = NULL;
  char        path_tmp[PATH_MAX];

  /* cleanup maps in memory */
  for (int k = 0; k < TRIGRAM_COUNT; ++k) {
    sort_map_if_dirty(haystack->map + k);
  }

  /* path for temporary file; refuse to continue with a truncated name */
  written = snprintf(path_tmp, sizeof path_tmp, "%s.tmp", path);
  if (written < 0 || (size_t) written >= sizeof path_tmp) {
    errno = ENAMETOOLONG;
    return -1;
  }

  /* compute storage space required */
  total_size = round_to_page(sizeof(trigram_map_t));
  for (int k = 0; k < TRIGRAM_COUNT; ++k) {
    total_size += round_to_page(get_map_size(haystack, k));
  }

  /* open and map file */
  fd = open(path_tmp, O_RDWR | O_CREAT | O_TRUNC, 0644);
  if (fd < 0) return -1;

  if (ftruncate(fd, (off_t) total_size) < 0) goto failure;

  /* mmap reports failure with MAP_FAILED, not NULL */
  mapping = mmap(NULL, total_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
  if (mapping == MAP_FAILED) goto failure;
  ptr = (uint8_t*) mapping;

  /* flush data */
  memset(ptr, 0x00, total_size);

  /* copy header & clean copy */
  memcpy(ptr, (void*)haystack, sizeof(trigram_map_t));
  offset = round_to_page(sizeof(trigram_map_t));
  header = (trigram_map)ptr;

  /* the on-disk header never describes a live mapping */
  header->mapped_size = 0;
  header->mapped_fd   = 0;

  /* copy each map, set offset in header */
  for (int k = 0; k < TRIGRAM_COUNT; ++k) {
    size_t block_size = get_map_size(haystack, k);

    /* in-memory pointers are meaningless on disk */
    header->map[k].entries = NULL;

    if (block_size > 0) {
      memcpy(ptr+offset, haystack->map[k].entries, block_size);
      header->map[k].entries_offset = offset;
      offset += round_to_page(block_size);
    } else {
      header->map[k].entries_offset = 0;
    }
  }
  assert(offset == total_size);

  if (munmap(mapping, total_size) < 0) goto failure;
  mapping = MAP_FAILED;

  if (close(fd) < 0) { fd = -1; goto failure; }
  fd = -1;

  /* commit by renaming the file */
  if (rename(path_tmp, path) < 0) goto failure;

  return 0;

failure:
  saved_errno = errno; /* the cleanup calls below may overwrite it */
  if (mapping != MAP_FAILED) (void) munmap(mapping, total_size);
  if (fd >= 0) (void) close(fd);
  (void) unlink(path_tmp);
  errno = saved_errno;
  return -1;
}
|
345
|
+
|
346
|
+
/******************************************************************************/
|
347
|
+
|
348
|
+
/* Indexes <needle> under <reference>: one entry is appended to the list of */
/* every trigram the tokeniser extracts from <needle>. */
/* A <weight> of zero is replaced by the length of <needle>. */
/* */
/* Returns 0 on success, -1 when memory runs out; entries appended before */
/* the failure stay in the map and are reflected in the totals. */
int blurrily_storage_put(trigram_map haystack, const char* needle, uint32_t reference, uint32_t weight)
{
  int        nb_trigrams = -1;
  int        inserted    = 0;
  int        res         = 0;
  int        length      = strlen(needle);
  trigram_t* trigrams    = (trigram_t*)NULL;
  /* address range of the file mapping; empty for memory-resident maps */
  uintptr_t  mapped_from = (uintptr_t) haystack;
  uintptr_t  mapped_to   = mapped_from + haystack->mapped_size;

  trigrams = (trigram_t*)malloc((length+1) * sizeof(trigram_t));
  if (trigrams == NULL) return -1;
  nb_trigrams = blurrily_tokeniser_parse_string(needle, trigrams);

  if (weight == 0) weight = length;

  for (int k = 0; k < nb_trigrams; ++k) {
    trigram_t          t     = trigrams[k];
    trigram_entries_t* map   = &haystack->map[t];
    trigram_entry_t    entry = { reference, weight };

    assert(t < TRIGRAM_COUNT);
    assert(map->used <= map->buckets);

    /* allocate more space as needed (exponential growth) */
    if (map->buckets == 0) {
      LOG("- alloc for %d\n", t);

      map->entries = (trigram_entry_t*) calloc(TRIGRAM_ENTRIES_START_SIZE, sizeof(trigram_entry_t));
      if (map->entries == NULL) { res = -1; break; }
      map->buckets = TRIGRAM_ENTRIES_START_SIZE;
    }
    if (map->used == map->buckets) {
      /* same value as buckets * 4/3, without the intermediate overflow */
      uint32_t         new_buckets = map->buckets + map->buckets / 3;
      uintptr_t        old_addr    = (uintptr_t) map->entries;
      trigram_entry_t* new_entries = NULL;
      LOG("- realloc for %d\n", t);

      if (new_buckets <= map->buckets) new_buckets = map->buckets + 1;

      /* zeroed allocation, then copy the old data across */
      new_entries = (trigram_entry_t*) calloc(new_buckets, sizeof(trigram_entry_t));
      if (new_entries == NULL) { res = -1; break; }
      memcpy(new_entries, map->entries, map->buckets * sizeof(trigram_entry_t));

      /* Arrays of a map loaded from disk point into the file mapping and */
      /* must not be handed to free(); only heap arrays are released. */
      if (haystack->mapped_size == 0 || old_addr < mapped_from || old_addr >= mapped_to) {
        free(map->entries);
      }

      /* swap fields */
      map->buckets = new_buckets;
      map->entries = new_entries;
    }
    map->entries[map->used] = entry;

    map->used += 1;
    map->dirty = 1;
    ++inserted;
  }
  haystack->total_trigrams += inserted;
  if (res == 0 || inserted > 0) haystack->total_references += 1;

  free((void*)trigrams);
  return res;
}
|
400
|
+
|
401
|
+
/******************************************************************************/
|
402
|
+
|
403
|
+
/* Looks up <needle> and writes at most <limit> matches into <results>, */
/* which the caller must have sized for <limit> elements. */
/* */
/* Steps: tokenise the needle, gather the entries of each of its trigrams, */
/* sort them by reference, collapse runs of equal references into one match */
/* (counting the run length), rank the matches, and copy out the best ones. */
/* */
/* Returns the number of results written, or -1 when memory runs out. */
int blurrily_storage_find(trigram_map haystack, const char* needle, uint16_t limit, trigram_match results)
{
  int              nb_trigrams = -1;
  int              length      = strlen(needle);
  trigram_t*       trigrams    = (trigram_t*)NULL;
  int              nb_entries  = 0;
  trigram_entry_t* entries     = NULL;
  trigram_entry_t* entry_ptr   = NULL;
  int              nb_matches  = 0;
  trigram_match_t* matches     = NULL;
  trigram_match_t* match_ptr   = NULL;
  int              nb_results  = 0;

  trigrams = (trigram_t*)malloc((length+1) * sizeof(trigram_t));
  if (trigrams == NULL) return -1;
  nb_trigrams = blurrily_tokeniser_parse_string(needle, trigrams);
  if (nb_trigrams <= 0) goto cleanup;

  LOG("%d trigrams in '%s'\n", nb_trigrams, needle);

  /* measure size required for sorting */
  for (int k = 0; k < nb_trigrams; ++k) {
    trigram_t t = trigrams[k];
    nb_entries += haystack->map[t].used;
  }
  if (nb_entries == 0) goto cleanup;

  /* allocate sorting memory */
  entries = (trigram_entry_t*) malloc(nb_entries * sizeof(trigram_entry_t));
  if (entries == NULL) { nb_results = -1; goto cleanup; }
  LOG("allocated space for %d trigrams entries\n", nb_entries);

  /* copy data for sorting */
  entry_ptr = entries;
  for (int k = 0; k < nb_trigrams; ++k) {
    trigram_t t       = trigrams[k];
    size_t    buckets = haystack->map[t].used;

    sort_map_if_dirty(haystack->map + t);
    memcpy(entry_ptr, haystack->map[t].entries, buckets * sizeof(trigram_entry_t));
    entry_ptr += buckets;
  }
  assert(entry_ptr == entries + nb_entries);

  /* sort data */
  if (MERGESORT(entries, nb_entries, sizeof(trigram_entry_t), &compare_entries) < 0) {
    nb_results = -1;
    goto cleanup;
  }
  LOG("sorting entries\n");

  /* Count distinct references by comparing each entry with its predecessor. */
  /* The former (uint32_t)-1 sentinel miscounted when the first reference */
  /* was itself UINT32_MAX, which under-sized the matches array. */
  for (int k = 0; k < nb_entries; ++k) {
    if (k == 0 || entries[k].reference != entries[k - 1].reference) {
      ++nb_matches;
    }
  }
  LOG("total %d distinct matches\n", nb_matches);

  /* allocate matches result */
  matches = (trigram_match_t*) calloc(nb_matches, sizeof(trigram_match_t));
  if (matches == NULL) { nb_results = -1; goto cleanup; }

  /* reduction, counting matches per reference */
  entry_ptr = entries;
  match_ptr = matches;
  match_ptr->matches   = 0;
  match_ptr->reference = entry_ptr->reference; /* setup the first match to */
  match_ptr->weight    = entry_ptr->weight;    /* simplify the loop */
  for (int k = 0; k < nb_entries; ++k) {
    if (entry_ptr->reference != match_ptr->reference) {
      ++match_ptr;
      match_ptr->reference = entry_ptr->reference;
      match_ptr->weight    = entry_ptr->weight;
      match_ptr->matches   = 1;
    } else {
      match_ptr->matches += 1;
    }
    assert((int) match_ptr->matches <= nb_trigrams);
    ++entry_ptr;
  }
  assert(match_ptr == matches + nb_matches - 1);
  assert(entry_ptr == entries + nb_entries);

  /* rank: most matches first, lightest weight first among equals */
  qsort(matches, nb_matches, sizeof(trigram_match_t), &compare_matches);

  /* output results */
  nb_results = (limit < nb_matches) ? limit : nb_matches;
  for (int k = 0; k < nb_results; ++k) {
    results[k] = matches[k];
    LOG("match %d: reference %d, matchiness %d, weight %d\n", k, matches[k].reference, matches[k].matches, matches[k].weight);
  }

cleanup:
  /* free(NULL) is a no-op, so unallocated buffers are safe to release */
  free(entries);
  free(matches);
  free(trigrams);
  return nb_results;
}
|
507
|
+
|
508
|
+
/******************************************************************************/
|
509
|
+
|
510
|
+
/* Removes every entry stored under <reference>, scanning all trigrams. */
/* Returns the number of entries removed (0 when the reference is unknown). */
int blurrily_storage_delete(trigram_map haystack, uint32_t reference)
{
  int trigrams_deleted = 0;

  for (int k = 0; k < TRIGRAM_COUNT; ++k) {
    trigram_entries_t* map = haystack->map + k;
    uint32_t           j   = 0;

    while (j < map->used) {
      if (map->entries[j].reference != reference) {
        ++j;
        continue;
      }

      /* Swap-remove: overwrite with the last entry and shrink. The slot is */
      /* re-examined on the next pass because the moved entry may match too. */
      map->entries[j] = map->entries[map->used - 1];
      map->used -= 1;

      /* moving the tail entry breaks the sorted order */
      map->dirty = 1;

      ++trigrams_deleted;
    }
  }

  /* Only adjust the totals when something was removed; otherwise deleting */
  /* an unknown reference would wrap the unsigned reference counter. */
  if (trigrams_deleted > 0) {
    haystack->total_trigrams   -= trigrams_deleted;
    haystack->total_references -= 1;
  }
  return trigrams_deleted;
}
|
533
|
+
|
534
|
+
/******************************************************************************/
|
535
|
+
|
536
|
+
/* Copies the running totals of <haystack> into <stats>. Always returns 0. */
int blurrily_storage_stats(trigram_map haystack, trigram_stat_t* stats)
{
  const uint32_t reference_total = haystack->total_references;
  const uint32_t trigram_total   = haystack->total_trigrams;

  stats->references = reference_total;
  stats->trigrams   = trigram_total;

  return 0;
}
|
@@ -0,0 +1,109 @@
|
|
1
|
+
/*
|
2
|
+
|
3
|
+
storage.h --
|
4
|
+
|
5
|
+
Trigram map creation, persistence, and querying.
|
6
|
+
|
7
|
+
*/
|
8
|
+
#include <inttypes.h>
|
9
|
+
#include "tokeniser.h"
|
10
|
+
#include "blurrily.h"
|
11
|
+
|
12
|
+
struct trigram_map_t;
|
13
|
+
typedef struct trigram_map_t* trigram_map;
|
14
|
+
|
15
|
+
/* One search result, as written into the <results> array by */
/* blurrily_storage_find. */
struct PACKED_STRUCT trigram_match_t
{
  uint32_t reference; /* identifier given when the string was added */
  uint32_t matches;   /* number of trigrams shared with the needle */
  uint32_t weight;    /* weight given when the string was added */
};
typedef struct trigram_match_t  trigram_match_t;
typedef struct trigram_match_t* trigram_match;
|
22
|
+
|
23
|
+
/* Counters filled in by blurrily_storage_stats. */
typedef struct trigram_stat_t
{
  uint32_t references; /* total references recorded in the map */
  uint32_t trigrams;   /* total trigram entries recorded in the map */
} trigram_stat_t;
|
28
|
+
|
29
|
+
|
30
|
+
/*
|
31
|
+
Create a new trigram map, resident in memory.
|
32
|
+
*/
|
33
|
+
int blurrily_storage_new(trigram_map* haystack);
|
34
|
+
|
35
|
+
/*
|
36
|
+
Load an existing trigram map from disk.
|
37
|
+
*/
|
38
|
+
int blurrily_storage_load(trigram_map* haystack, const char* path);
|
39
|
+
|
40
|
+
/*
|
41
|
+
Release resources claimed by <blurrily_storage_new> or <blurrily_storage_load>.
|
42
|
+
*/
|
43
|
+
int blurrily_storage_close(trigram_map* haystack);
|
44
|
+
|
45
|
+
/*
|
46
|
+
Persist to disk what <blurrily_storage_new> or <blurrily_storage_load>
|
47
|
+
gave you.
|
48
|
+
*/
|
49
|
+
int blurrily_storage_save(trigram_map haystack, const char* path);
|
50
|
+
|
51
|
+
/*
|
52
|
+
Add a new string to the map. <reference> is your identifier for that
|
53
|
+
string, <weight> will be used to discriminate entries that match "as
|
54
|
+
well" when searching.
|
55
|
+
|
56
|
+
If <weight> is zero, it will be replaced by the number of characters in
|
57
|
+
the <needle>.
|
58
|
+
|
59
|
+
Returns positive on success, negative on failure.
|
60
|
+
*/
|
61
|
+
int blurrily_storage_put(trigram_map haystack, const char* needle, uint32_t reference, uint32_t weight);
|
62
|
+
|
63
|
+
/*
|
64
|
+
Check the map for an existing <reference>.
|
65
|
+
|
66
|
+
Returns < 0 on error, 0 if the reference is not found, the number of trigrams
|
67
|
+
for that reference otherwise.
|
68
|
+
|
69
|
+
If <weight> is not NULL, will be set to the weight value passed to the put
|
70
|
+
method on return (if the reference is found).
|
71
|
+
|
72
|
+
If <trigrams> is not NULL, it should point to an array <nb_trigrams> long,
|
73
|
+
and up to <nb_trigrams> will be copied into it matching the <needle>
|
74
|
+
originally passed to the put method.
|
75
|
+
|
76
|
+
Note that this is an O(n) method: the whole map will be read.
|
77
|
+
*/
|
78
|
+
// int blurrily_storage_get(trigram_map haystack, uint32_t reference, uint32_t* weight, int nb_trigrams, trigram_t* trigrams);
|
79
|
+
|
80
|
+
/*
|
81
|
+
Remove a <reference> from the map.
|
82
|
+
|
83
|
+
Note that this is very inefficient.
|
84
|
+
|
85
|
+
Returns positive on success, negative on failure.
|
86
|
+
*/
|
87
|
+
int blurrily_storage_delete(trigram_map haystack, uint32_t reference);
|
88
|
+
|
89
|
+
/*
|
90
|
+
Return at most <limit> entries matching <needle> from the <haystack>.
|
91
|
+
|
92
|
+
Results are written to <results>. The first results are the entries
|
93
|
+
sharing the most trigrams with the <needle>. Amongst entries with the same
|
94
|
+
number of matches, the lightest ones (lowest <weight>) will be returned
|
95
|
+
first.
|
96
|
+
|
97
|
+
<results> should be allocated by the caller.
|
98
|
+
|
99
|
+
Returns number of matches on success, negative on failure.
|
100
|
+
*/
|
101
|
+
int blurrily_storage_find(trigram_map haystack, const char* needle, uint16_t limit, trigram_match results);
|
102
|
+
|
103
|
+
/*
|
104
|
+
Copies metadata into <stats>
|
105
|
+
|
106
|
+
Returns positive on success, negative on failure.
|
107
|
+
*/
|
108
|
+
int blurrily_storage_stats(trigram_map haystack, trigram_stat_t* stats);
|
109
|
+
|
@@ -0,0 +1,127 @@
|
|
1
|
+
#include <stdlib.h>
|
2
|
+
#include <string.h>
|
3
|
+
#include <stdio.h>
|
4
|
+
#include <math.h>
|
5
|
+
#include "tokeniser.h"
|
6
|
+
#include "log.h"
|
7
|
+
#include "blurrily.h"
|
8
|
+
|
9
|
+
|
10
|
+
/******************************************************************************/
|
11
|
+
|
12
|
+
/*
  Integer power: multiplies 1 by <a>, <b> times.
  A zero or negative <b> therefore yields 1.
*/
static int ipow(int a, int b)
{
  int product = 1;

  for (int step = 0; step < b; ++step) {
    product *= a;
  }
  return product;
}
|
19
|
+
|
20
|
+
/******************************************************************************/
|
21
|
+
|
22
|
+
/*
  Encode the first three characters of <input> as a base-TRIGRAM_BASE number
  and store it in <output>.

  The character at position k contributes (letter index, 'a' = 1) times
  TRIGRAM_BASE^k. Any character outside 'a'..'z' (including the '*' anchor)
  contributes nothing, i.e. it is encoded as digit 0.
*/
static void string_to_code(const char* input, trigram_t *output)
{
  trigram_t code     = 0;
  int       position = 0;

  while (position < 3) {
    const char symbol = input[position];

    if (symbol >= 'a' && symbol <= 'z') {
      code += ipow(TRIGRAM_BASE, position) * (symbol - 'a' + 1);
    }
    ++position;
  }

  *output = code;
}
|
33
|
+
|
34
|
+
/******************************************************************************/
|
35
|
+
|
36
|
+
/*
  Decode the trigram code <input> into <output>.

  Three characters are written, followed by a terminating zero byte, so
  <output> must have room for 4 bytes. Base-TRIGRAM_BASE digit 0 is rendered
  as '*'; any other digit d is rendered as 'a' + d - 1.
*/
static void code_to_string(trigram_t input, char* output)
{
  int position = 0;

  while (position < 3) {
    const uint16_t digit = input / ipow(TRIGRAM_BASE, position) % TRIGRAM_BASE;

    output[position] = (digit == 0) ? '*' : ('a' + digit - 1);
    ++position;
  }
  output[3] = 0;
}
|
48
|
+
|
49
|
+
/******************************************************************************/
|
50
|
+
|
51
|
+
static int blurrily_compare_trigrams(const void* left_p, const void* right_p)
|
52
|
+
{
|
53
|
+
trigram_t* left = (trigram_t*)left_p;
|
54
|
+
trigram_t* right = (trigram_t*)right_p;
|
55
|
+
return (int)*left - (int)*right;
|
56
|
+
}
|
57
|
+
|
58
|
+
/******************************************************************************/
|
59
|
+
|
60
|
+
/*
  Split <input> into trigram codes and store them, sorted ascending and
  without duplicates, at the start of <output>.

  The input is wrapped as "**<input>*" and spaces are turned into '*', so a
  string of N characters yields N+1 raw trigrams. <output> must therefore
  provide at least N+1 slots. Slots past the returned count are left holding
  a sentinel value, not a valid trigram.

  Returns the number of distinct trigrams on success, or -1 if <input> or
  <output> is NULL or the scratch buffer cannot be allocated.
*/
int blurrily_tokeniser_parse_string(const char* input, trigram_t* output)
{
  /* Marks a slot as "duplicate, to be discarded". It sorts after every real
     code: with base 28 and letter digits 1..26 the largest code produced by
     string_to_code is 26 * (1 + 28 + 784) = 21138. */
  const trigram_t duplicate_marker = 32768;

  int   length;
  char* normalized;
  int   duplicates = 0;

  if (input == NULL || output == NULL) return -1;

  length = (int) strlen(input);

  /* "**" + input + "*" + terminator needs length+4 bytes. */
  normalized = malloc((size_t) length + 5);
  if (normalized == NULL) return -1;

  snprintf(normalized, (size_t) length + 4, "**%s*", input);

  /* replace spaces with '*' */
  for (int k = 0; k < length+3; ++k) {
    if (normalized[k] == ' ') normalized[k] = '*';
  }

  /* compute trigrams: one per starting offset 0..length */
  for (int k = 0; k <= length; ++k) {
    string_to_code(normalized+k, output+k);
  }

  /* print results */
  LOG("-- normalization\n");
  LOG("%s -> %s\n", input, normalized);
  LOG("-- tokenisation\n");
  for (int k = 0; k <= length; ++k) {
    char res[4];

    code_to_string(output[k], res);

    LOG("%c%c%c -> %d -> %s\n",
      normalized[k], normalized[k+1], normalized[k+2],
      output[k], res
    );
  }

  /* sort so that equal codes become adjacent */
  qsort((void*)output, length+1, sizeof(trigram_t), &blurrily_compare_trigrams);

  /* remove duplicates: in each run of equal codes, keep only the last one
     and overwrite the others with the sentinel */
  for (int k = 1; k <= length; ++k) {
    trigram_t* previous = output + k - 1;
    trigram_t* current  = output + k;

    if (*previous == *current) {
      *previous = duplicate_marker;
      ++duplicates;
    }
  }

  /* compact: sorting again pushes the sentinels to the end */
  qsort((void*)output, length+1, sizeof(trigram_t), &blurrily_compare_trigrams);

  /* print again */
  LOG("-- after sort/compact\n");
  for (int k = 0; k <= length-duplicates; ++k) {
    char res[4];
    code_to_string(output[k], res);
    LOG("%d -> %s\n", output[k], res);
  }

  free(normalized);
  return length+1 - duplicates;
}
|
121
|
+
|
122
|
+
/******************************************************************************/
|
123
|
+
|
124
|
+
/*
  Write the textual form of the trigram code <input> into <output>.

  <output> must have room for 4 bytes: three characters ('*' for the
  epsilon symbol, otherwise a letter) followed by a terminating zero byte.

  Returns 0 on success, -1 if <output> is NULL.
*/
int blurrily_tokeniser_trigram(trigram_t input, char* output)
{
  if (output == NULL) return -1;

  /* Same decoder the tokeniser uses for its own log output. */
  code_to_string(input, output);
  return 0;
}
|
@@ -0,0 +1,41 @@
|
|
1
|
+
/*
|
2
|
+
|
3
|
+
tokeniser.h --
|
4
|
+
|
5
|
+
Split a string into an array of trigrams.
|
6
|
+
|
7
|
+
The input string should be only lowercase latin letters and spaces
|
8
|
+
(convert using iconv).
|
9
|
+
|
10
|
+
Each trigram is a three-symbol tuple consisting of letters and the
|
11
|
+
"epsilon" character used to represent spaces and beginning-of-word/end-of-
|
12
|
+
word anchors.
|
13
|
+
|
14
|
+
Each trigram is represented by a 16-bit integer.
|
15
|
+
|
16
|
+
*/
|
17
|
+
#include <inttypes.h>
|
18
|
+
|
19
|
+
#define TRIGRAM_BASE 28
|
20
|
+
|
21
|
+
typedef uint16_t trigram_t;
|
22
|
+
|
23
|
+
/*
|
24
|
+
Parse the <input> string and store the result in <output>.
|
25
|
+
<output> must be allocated by the caller and provide at least as many slots
|
26
|
+
as characters in <input>, plus one.
|
27
|
+
(not all will necessarily be filled)
|
28
|
+
|
29
|
+
Returns the number of trigrams on success, a negative number on failure.
|
30
|
+
*/
|
31
|
+
int blurrily_tokeniser_parse_string(const char* input, trigram_t* output);
|
32
|
+
|
33
|
+
|
34
|
+
/*
|
35
|
+
Given an <input> returns a string representation of the trigram in <output>.
|
36
|
+
<output> must be allocated by caller and will always be exactly 3
|
37
|
+
characters plus a terminating NUL.
|
38
|
+
|
39
|
+
Returns positive on success, negative on failure.
|
40
|
+
*/
|
41
|
+
int blurrily_tokeniser_trigram(trigram_t input, char* output);
|
data/lib/blurrily.rb
ADDED
data/lib/blurrily/map.rb
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'blurrily/map_ext'
|
2
|
+
require 'active_support/all' # fixme: we only need enough to get mb_chars and alias_method_chain in
|
3
|
+
|
4
|
+
module Blurrily
  Map.class_eval do

    # Normalises +needle+ (see #normalize_string) and then delegates to the
    # original +put+, passing +reference+ and +weight+ through unchanged.
    def put_with_string_normalize(needle, reference, weight=0)
      needle = normalize_string needle
      put_without_string_normalize(needle, reference, weight)
    end

    alias_method_chain :put, :string_normalize


    # Normalises +needle+ (see #normalize_string) and then delegates to the
    # original +find+, passing +limit+ through unchanged.
    def find_with_string_normalize(needle, limit=10)
      needle = normalize_string needle
      find_without_string_normalize(needle, limit)
    end

    alias_method_chain :find, :string_normalize


    private

    # Reduces +needle+ to lowercase latin letters separated by single spaces.
    #
    # The string is downcased. Unless it already consists solely of a-z and
    # spaces, it is KD-normalised, stripped of non-ASCII characters, and every
    # remaining character other than a-z is replaced by a space. Finally,
    # whitespace runs are collapsed to one space and the ends are trimmed.
    def normalize_string(needle)
      result = needle.downcase
      # \A and \z anchor to the whole string; ^ and $ would match a single
      # clean line inside multi-line input and skip the clean-up below.
      unless result =~ /\A[a-z ]+\z/
        result = result.mb_chars.normalize(:kd).gsub(/[^\x00-\x7F]/,'').to_s.gsub(/[^a-z]/,' ')
      end
      result.gsub(/\s+/,' ').strip
    end

  end
end
|
File without changes
|
metadata
ADDED
@@ -0,0 +1,199 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: blurrily
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Julien Letessier
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-03-27 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: activesupport
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: eventmachine
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: json
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rspec
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - '>='
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rake
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - '>='
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - '>='
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: rake-compiler
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - '>='
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - '>='
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: pry
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - '>='
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - '>='
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: pry-nav
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - '>='
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - '>='
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0'
|
125
|
+
- !ruby/object:Gem::Dependency
|
126
|
+
name: pry-doc
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - '>='
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: '0'
|
132
|
+
type: :development
|
133
|
+
prerelease: false
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - '>='
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '0'
|
139
|
+
- !ruby/object:Gem::Dependency
|
140
|
+
name: progressbar
|
141
|
+
requirement: !ruby/object:Gem::Requirement
|
142
|
+
requirements:
|
143
|
+
- - '>='
|
144
|
+
- !ruby/object:Gem::Version
|
145
|
+
version: '0'
|
146
|
+
type: :development
|
147
|
+
prerelease: false
|
148
|
+
version_requirements: !ruby/object:Gem::Requirement
|
149
|
+
requirements:
|
150
|
+
- - '>='
|
151
|
+
- !ruby/object:Gem::Version
|
152
|
+
version: '0'
|
153
|
+
description: Native fuzzy string search
|
154
|
+
email:
|
155
|
+
- julien.letessier@gmail.com
|
156
|
+
executables: []
|
157
|
+
extensions:
|
158
|
+
- ext/blurrily/extconf.rb
|
159
|
+
extra_rdoc_files: []
|
160
|
+
files:
|
161
|
+
- lib/blurrily/map.rb
|
162
|
+
- lib/blurrily/server.rb
|
163
|
+
- lib/blurrily/version.rb
|
164
|
+
- lib/blurrily.rb
|
165
|
+
- ext/blurrily/map_ext.c
|
166
|
+
- ext/blurrily/storage.c
|
167
|
+
- ext/blurrily/tokeniser.c
|
168
|
+
- ext/blurrily/blurrily.h
|
169
|
+
- ext/blurrily/log.h
|
170
|
+
- ext/blurrily/storage.h
|
171
|
+
- ext/blurrily/tokeniser.h
|
172
|
+
- ext/blurrily/extconf.rb
|
173
|
+
- README.md
|
174
|
+
- LICENSE.txt
|
175
|
+
homepage: http://github.com/mezis/blurrily
|
176
|
+
licenses: []
|
177
|
+
metadata: {}
|
178
|
+
post_install_message:
|
179
|
+
rdoc_options: []
|
180
|
+
require_paths:
|
181
|
+
- lib
|
182
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
183
|
+
requirements:
|
184
|
+
- - '>='
|
185
|
+
- !ruby/object:Gem::Version
|
186
|
+
version: '0'
|
187
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
188
|
+
requirements:
|
189
|
+
- - '>='
|
190
|
+
- !ruby/object:Gem::Version
|
191
|
+
version: '0'
|
192
|
+
requirements: []
|
193
|
+
rubyforge_project:
|
194
|
+
rubygems_version: 2.0.0
|
195
|
+
signing_key:
|
196
|
+
specification_version: 4
|
197
|
+
summary: Native fuzzy string search
|
198
|
+
test_files: []
|
199
|
+
has_rdoc:
|