blurrily 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE.txt +22 -0
- data/README.md +36 -0
- data/ext/blurrily/blurrily.h +2 -0
- data/ext/blurrily/extconf.rb +17 -0
- data/ext/blurrily/log.h +5 -0
- data/ext/blurrily/map_ext.c +174 -0
- data/ext/blurrily/storage.c +541 -0
- data/ext/blurrily/storage.h +109 -0
- data/ext/blurrily/tokeniser.c +127 -0
- data/ext/blurrily/tokeniser.h +41 -0
- data/lib/blurrily.rb +3 -0
- data/lib/blurrily/map.rb +34 -0
- data/lib/blurrily/server.rb +0 -0
- data/lib/blurrily/version.rb +3 -0
- metadata +199 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: ed39eb955b4d71f3b924a16be4430046ba1d02ab
|
4
|
+
data.tar.gz: 1c5a5b42b6877ad3d66928a0fe0520ea73defa9b
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 54fdb049c894470cf18afdafe18053607e1b4336b6f7353866ae8d81115e87a97ed6f5273270d930a88c292bf02a361868280997b6dbe5668c894aa456745950
|
7
|
+
data.tar.gz: b8c280aa93d062a9a89fbda80cdf3365efcb34ed3e3c28d8dadf6c9b9ee5deba389a3a0233c788e8e182894b1067254ec9a9ef4ae80e7c1676a60edd6cd50e83
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 HouseTrip Ltd.
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
# Blurrily — Fast fuzzy text search
|
2
|
+
|
3
|
+
[![Build Status](https://travis-ci.org/mezis/blurrily.png?branch=master)](https://travis-ci.org/mezis/blurrily)
|
4
|
+
[![Dependency Status](https://gemnasium.com/mezis/blurrily.png)](https://gemnasium.com/mezis/blurrily)
|
5
|
+
[![Code Climate](https://codeclimate.com/github/mezis/blurrily.png)](https://codeclimate.com/github/mezis/blurrily)
|
6
|
+
|
7
|
+
This will be a C version of [fuzzily](http://github.com/mezis/fuzzily), a
|
8
|
+
Ruby gem to perform fuzzy text searching.
|
9
|
+
|
10
|
+
WORK IN PROGRESS.
|
11
|
+
|
12
|
+
## Installation
|
13
|
+
|
14
|
+
Add this line to your application's Gemfile:
|
15
|
+
|
16
|
+
gem 'blurrily'
|
17
|
+
|
18
|
+
And then execute:
|
19
|
+
|
20
|
+
$ bundle
|
21
|
+
|
22
|
+
Or install it yourself as:
|
23
|
+
|
24
|
+
$ gem install blurrily
|
25
|
+
|
26
|
+
## Usage
|
27
|
+
|
28
|
+
TODO: Write usage instructions here
|
29
|
+
|
30
|
+
## Contributing
|
31
|
+
|
32
|
+
1. Fork it
|
33
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
34
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
35
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
36
|
+
5. Create new Pull Request
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'mkmf'

# Upper-cased name reported by uname(1); passed to the C sources as
# -DPLATFORM_<NAME> so they can select platform-specific code.
PLATFORM = `uname`.strip.upcase

# Build the flag string in a local first: appending to an already-assigned
# constant makes Ruby emit an "already initialized constant" warning.
shared_flags = "-DPLATFORM_#{PLATFORM} --std=c99 -Wall -Wextra -Werror"

case PLATFORM
when 'LINUX'
  shared_flags += ' -D_XOPEN_SOURCE=500' # for ftruncate to be present
end

SHARED_FLAGS = shared_flags.freeze

# production
$CFLAGS += " #{SHARED_FLAGS} -O3 -fno-fast-math"

# development
# $CFLAGS += " #{SHARED_FLAGS} -O0 -g"

create_makefile('blurrily/map_ext')
|
data/ext/blurrily/log.h
ADDED
@@ -0,0 +1,174 @@
|
|
1
|
+
#include <ruby.h>
|
2
|
+
#include <assert.h>
|
3
|
+
#include "storage.h"
|
4
|
+
#include "blurrily.h"
|
5
|
+
|
6
|
+
/******************************************************************************/
|
7
|
+
|
8
|
+
static void blurrily_free(void* haystack)
|
9
|
+
{
|
10
|
+
int res = -1;
|
11
|
+
|
12
|
+
res = blurrily_storage_close((trigram_map*) &haystack);
|
13
|
+
assert(res >= 0);
|
14
|
+
}
|
15
|
+
|
16
|
+
/******************************************************************************/
|
17
|
+
|
18
|
+
/*
  Blurrily::Map.new -- creates an empty, memory-resident trigram map and
  wraps it in a Ruby object of <class>, with blurrily_free as its free hook.

  Calls rb_sys_fail() if the storage layer reports a failure.
*/
static VALUE blurrily_new(VALUE class) {
  trigram_map map    = (trigram_map)NULL;
  VALUE       object = Qnil;

  if (blurrily_storage_new(&map) < 0) {
    rb_sys_fail(NULL);
    return Qnil;
  }

  object = Data_Wrap_Struct(class, 0, blurrily_free, (void*)map);
  rb_obj_call_init(object, 0, NULL);
  return object;
}
|
30
|
+
|
31
|
+
/******************************************************************************/
|
32
|
+
|
33
|
+
/*
  Blurrily::Map.load(path) -- loads a trigram map previously written by
  #save and wraps it in a Ruby object of <class>.

  <rb_path> must be a String (or convertible to one) without embedded NUL
  bytes; StringValueCStr raises otherwise, instead of silently truncating
  the path at the first NUL.

  Calls rb_sys_fail() if the storage layer reports a failure.
*/
static VALUE blurrily_load(VALUE class, VALUE rb_path) {
  char*       path     = StringValueCStr(rb_path);
  VALUE       wrapper  = Qnil;
  trigram_map haystack = (trigram_map)NULL;
  int         res      = -1;

  res = blurrily_storage_load(&haystack, path);
  if (res < 0) { rb_sys_fail(NULL); return Qnil; }

  wrapper = Data_Wrap_Struct(class, 0, blurrily_free, (void*)haystack);
  rb_obj_call_init(wrapper, 0, NULL);
  return wrapper;
}
|
46
|
+
|
47
|
+
/******************************************************************************/
|
48
|
+
|
49
|
+
/*
  Blurrily::Map#initialize -- nothing to set up here; the native map is
  attached to the object by the "new" / "load" singleton methods.
*/
static VALUE blurrily_initialize(VALUE UNUSED(self))
{
  return Qtrue;
}
|
52
|
+
|
53
|
+
/******************************************************************************/
|
54
|
+
|
55
|
+
/*
  Blurrily::Map#put(needle, reference, weight) -- indexes <needle> under the
  caller's <reference>, with <weight> used as a tie-breaker by #find.

  Always returns nil. A storage failure trips an assertion.
*/
static VALUE blurrily_put(VALUE self, VALUE rb_needle, VALUE rb_reference, VALUE rb_weight) {
  /* argument conversions first (they may raise), in declaration order */
  char*       text   = StringValuePtr(rb_needle);
  uint32_t    ref    = NUM2UINT(rb_reference);
  uint32_t    weight = NUM2UINT(rb_weight);
  trigram_map map    = (trigram_map)NULL;
  int         status = -1;

  Data_Get_Struct(self, struct trigram_map_t, map);

  status = blurrily_storage_put(map, text, ref, weight);
  assert(status >= 0);

  return Qnil;
}
|
69
|
+
|
70
|
+
/******************************************************************************/
|
71
|
+
|
72
|
+
/*
  Blurrily::Map#delete(reference) -- removes every entry stored for
  <reference> and returns the value reported by blurrily_storage_delete()
  as a Ruby Integer.
*/
static VALUE blurrily_delete(VALUE self, VALUE rb_reference) {
  uint32_t    ref     = NUM2UINT(rb_reference);
  trigram_map map     = (trigram_map)NULL;
  int         removed = -1;

  Data_Get_Struct(self, struct trigram_map_t, map);

  removed = blurrily_storage_delete(map, ref);
  assert(removed >= 0);

  return INT2NUM(removed);
}
|
84
|
+
|
85
|
+
/******************************************************************************/
|
86
|
+
|
87
|
+
/*
  Blurrily::Map#save(path) -- persists the map to <path>.

  <rb_path> must be a String without embedded NUL bytes (StringValueCStr
  raises otherwise).

  Always returns nil. A negative result from the storage layer is reported
  with rb_sys_fail(), like .new and .load do, rather than through an
  assertion that aborts the interpreter (or vanishes under NDEBUG).
*/
static VALUE blurrily_save(VALUE self, VALUE rb_path) {
  trigram_map haystack = (trigram_map)NULL;
  int         res      = -1;
  const char* path     = StringValueCStr(rb_path);

  Data_Get_Struct(self, struct trigram_map_t, haystack);

  res = blurrily_storage_save(haystack, path);
  if (res < 0) { rb_sys_fail(path); return Qnil; }

  return Qnil;
}
|
99
|
+
|
100
|
+
/******************************************************************************/
|
101
|
+
|
102
|
+
/*
  Blurrily::Map#find(needle, limit) -- returns up to <limit> matches for
  <needle> as an Array of [reference, matches, weight] triples, in the order
  produced by blurrily_storage_find().

  A <limit> of zero (or one that does not fit an int) falls back to 10.
  The limit is capped at UINT16_MAX because blurrily_storage_find() takes a
  uint16_t; without the cap a larger value would be truncated silently.
*/
static VALUE blurrily_find(VALUE self, VALUE rb_needle, VALUE rb_limit) {
  trigram_map   haystack   = (trigram_map)NULL;
  int           res        = -1;
  const char*   needle     = StringValuePtr(rb_needle);
  int           limit      = NUM2UINT(rb_limit);
  trigram_match matches    = NULL;
  VALUE         rb_matches = Qnil;

  if (limit <= 0)         { limit = 10; }
  if (limit > UINT16_MAX) { limit = UINT16_MAX; }

  Data_Get_Struct(self, struct trigram_map_t, haystack);

  /* scratch buffer filled by the storage layer; released before returning */
  matches = (trigram_match) malloc(limit * sizeof(trigram_match_t));
  if (matches == NULL) { rb_sys_fail(NULL); return Qnil; }

  res = blurrily_storage_find(haystack, needle, (uint16_t) limit, matches);
  assert(res >= 0);

  /* wrap the matches into a Ruby array */
  rb_matches = rb_ary_new();
  for (int k = 0; k < res; ++k) {
    VALUE rb_match = rb_ary_new();
    rb_ary_push(rb_match, rb_uint_new(matches[k].reference));
    rb_ary_push(rb_match, rb_uint_new(matches[k].matches));
    rb_ary_push(rb_match, rb_uint_new(matches[k].weight));
    rb_ary_push(rb_matches, rb_match);
  }

  free(matches);
  return rb_matches;
}
|
129
|
+
|
130
|
+
|
131
|
+
/******************************************************************************/
|
132
|
+
|
133
|
+
/*
  Blurrily::Map#stats -- returns a Hash with the :references and :trigrams
  counters reported by blurrily_storage_stats().
*/
static VALUE blurrily_stats(VALUE self)
{
  VALUE          summary = rb_hash_new();
  trigram_map    map     = (trigram_map)NULL;
  trigram_stat_t counters;
  int            status  = -1;

  Data_Get_Struct(self, struct trigram_map_t, map);

  status = blurrily_storage_stats(map, &counters);
  assert(status >= 0);

  (void) rb_hash_aset(summary, ID2SYM(rb_intern("references")), UINT2NUM(counters.references));
  (void) rb_hash_aset(summary, ID2SYM(rb_intern("trigrams")),   UINT2NUM(counters.trigrams));

  return summary;
}
|
150
|
+
|
151
|
+
/******************************************************************************/
|
152
|
+
|
153
|
+
void Init_map_ext(void) {
|
154
|
+
VALUE module = Qnil;
|
155
|
+
VALUE klass = Qnil;
|
156
|
+
|
157
|
+
/* assume we haven't yet defined blurrily */
|
158
|
+
module = rb_define_module("Blurrily");
|
159
|
+
assert(module != Qnil);
|
160
|
+
|
161
|
+
klass = rb_define_class_under(module, "Map", rb_cObject);
|
162
|
+
assert(klass != Qnil);
|
163
|
+
|
164
|
+
rb_define_singleton_method(klass, "new", blurrily_new, 0);
|
165
|
+
rb_define_singleton_method(klass, "load", blurrily_load, 1);
|
166
|
+
|
167
|
+
rb_define_method(klass, "initialize", blurrily_initialize, 0);
|
168
|
+
rb_define_method(klass, "put", blurrily_put, 3);
|
169
|
+
rb_define_method(klass, "delete", blurrily_delete, 1);
|
170
|
+
rb_define_method(klass, "save", blurrily_save, 1);
|
171
|
+
rb_define_method(klass, "find", blurrily_find, 2);
|
172
|
+
rb_define_method(klass, "stats", blurrily_stats, 0);
|
173
|
+
return;
|
174
|
+
}
|
@@ -0,0 +1,541 @@
|
|
1
|
+
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <sys/stat.h>
|
9
|
+
|
10
|
+
#ifdef PLATFORM_LINUX
|
11
|
+
#include <linux/limits.h>
|
12
|
+
#define MERGESORT fake_mergesort
|
13
|
+
#else
|
14
|
+
#include <limits.h>
|
15
|
+
#define MERGESORT mergesort
|
16
|
+
#endif
|
17
|
+
|
18
|
+
#ifndef PATH_MAX
|
19
|
+
/* safe default ... */
|
20
|
+
#define PATH_MAX 1024
|
21
|
+
#endif
|
22
|
+
|
23
|
+
#include "storage.h"
|
24
|
+
|
25
|
+
#include "log.h"
|
26
|
+
|
27
|
+
/******************************************************************************/
|
28
|
+
|
29
|
+
/* granularity used when laying out the on-disk file */
#define PAGE_SIZE 4096
/* number of slots in the trigram map */
#define TRIGRAM_COUNT (TRIGRAM_BASE * TRIGRAM_BASE * TRIGRAM_BASE)
/* initial number of entries allocated for a trigram; parenthesised so the */
/* macro expands safely inside larger expressions (e.g. x / START_SIZE)    */
#define TRIGRAM_ENTRIES_START_SIZE (PAGE_SIZE/8)
|
32
|
+
|
33
|
+
/******************************************************************************/
|
34
|
+
|
35
|
+
/* one trigram entry -- client reference and sorting weight */
|
36
|
+
struct PACKED_STRUCT trigram_entry_t
|
37
|
+
{
|
38
|
+
uint32_t reference;
|
39
|
+
uint32_t weight;
|
40
|
+
};
|
41
|
+
typedef struct trigram_entry_t trigram_entry_t;
|
42
|
+
|
43
|
+
|
44
|
+
/* collection of entries for a given trigram */
|
45
|
+
/* <entries> points to an array of <buckets> entries */
|
46
|
+
/* of which <used> are filled */
|
47
|
+
struct PACKED_STRUCT trigram_entries_t
|
48
|
+
{
|
49
|
+
uint32_t buckets;
|
50
|
+
uint32_t used;
|
51
|
+
|
52
|
+
trigram_entry_t* entries; /* set when the structure is in memory */
|
53
|
+
size_t entries_offset; /* set when the structure is on disk */
|
54
|
+
|
55
|
+
uint8_t dirty; /* not optimised (presorted) yet */
|
56
|
+
};
|
57
|
+
typedef struct trigram_entries_t trigram_entries_t;
|
58
|
+
|
59
|
+
|
60
|
+
/* hash map of all possible trigrams to collection of entries */
|
61
|
+
/* there are 28^3 = 19,683 possible trigrams */
|
62
|
+
struct PACKED_STRUCT trigram_map_t
|
63
|
+
{
|
64
|
+
char magic[6]; /* the string "trigra" */
|
65
|
+
uint8_t big_endian;
|
66
|
+
uint8_t pointer_size;
|
67
|
+
|
68
|
+
uint32_t total_references;
|
69
|
+
uint32_t total_trigrams;
|
70
|
+
size_t mapped_size; /* when mapped from disk, the number of bytes mapped */
|
71
|
+
int mapped_fd; /* when mapped from disk, the file descriptor */
|
72
|
+
|
73
|
+
trigram_entries_t map[TRIGRAM_COUNT]; /* this whole structure is ~500KB */
|
74
|
+
};
|
75
|
+
typedef struct trigram_map_t trigram_map_t;
|
76
|
+
|
77
|
+
/******************************************************************************/
|
78
|
+
|
79
|
+
#ifdef PLATFORM_LINUX
|
80
|
+
/* Stand-in for mergesort(3), which Linux lacks: same signature, but the */
/* work is delegated to qsort(3). Always reports success (0).            */
static int fake_mergesort(void *base, size_t nel, size_t width, int (*compar)(const void *, const void *))
{
  const int success = 0;

  qsort(base, nel, width, compar);
  return success;
}
|
87
|
+
#endif
|
88
|
+
|
89
|
+
/******************************************************************************/
|
90
|
+
|
91
|
+
/* Byte-order tag of the running host: 1 -> little endian, 2 -> big endian. */
/* Determined by looking at the first byte in memory of a known 32-bit      */
/* constant.                                                                */
static uint8_t get_big_endian()
{
  const uint32_t probe      = 0xAA0000BB;
  uint8_t        first_byte = 0;

  memcpy(&first_byte, &probe, sizeof(first_byte));

  if (first_byte == 0xBB) {
    return 1; /* least significant byte stored first */
  }
  return 2;
}
|
99
|
+
|
100
|
+
/******************************************************************************/
|
101
|
+
|
102
|
+
/* Width of a data pointer on the running host, in bytes (4 or 8 on the */
/* usual 32- and 64-bit targets).                                       */
static uint8_t get_pointer_size()
{
  const size_t width = sizeof(void*);

  return (uint8_t) width;
}
|
107
|
+
|
108
|
+
/******************************************************************************/
|
109
|
+
|
110
|
+
static int compare_entries(const void* left_p, const void* right_p)
|
111
|
+
{
|
112
|
+
trigram_entry_t* left = (trigram_entry_t*)left_p;
|
113
|
+
trigram_entry_t* right = (trigram_entry_t*)right_p;
|
114
|
+
return (int)left->reference - (int)right->reference;
|
115
|
+
}
|
116
|
+
|
117
|
+
/* compares matches on #matches (descending) then weight (ascending) */
|
118
|
+
static int compare_matches(const void* left_p, const void* right_p)
|
119
|
+
{
|
120
|
+
trigram_match_t* left = (trigram_match_t*)left_p;
|
121
|
+
trigram_match_t* right = (trigram_match_t*)right_p;
|
122
|
+
/* int delta = (int)left->matches - (int)right->matches; */
|
123
|
+
int delta = (int)right->matches - (int)left->matches;
|
124
|
+
|
125
|
+
return (delta != 0) ? delta : ((int)left->weight - (int)right->weight);
|
126
|
+
|
127
|
+
}
|
128
|
+
|
129
|
+
/******************************************************************************/
|
130
|
+
|
131
|
+
/*
  Sorts the used entries of <map> with compare_entries and clears its
  dirty flag. Does nothing when the flag is not set.
*/
static void sort_map_if_dirty(trigram_entries_t* map)
{
  int status = -1;

  if (map->dirty) {
    status = MERGESORT(map->entries, map->used, sizeof(trigram_entry_t), &compare_entries);
    assert(status >= 0);
    map->dirty = 0;
  }
}
|
140
|
+
|
141
|
+
/******************************************************************************/
|
142
|
+
|
143
|
+
static size_t round_to_page(size_t value)
|
144
|
+
{
|
145
|
+
if (value % PAGE_SIZE == 0) return value;
|
146
|
+
return (value / PAGE_SIZE + 1) * PAGE_SIZE;
|
147
|
+
}
|
148
|
+
|
149
|
+
/******************************************************************************/
|
150
|
+
|
151
|
+
/*
  Size in bytes of the entries array allocated for trigram <index>
  (capacity, not just the used part).
*/
static size_t get_map_size(trigram_map haystack, int index)
{
  const trigram_entries_t* slot = &haystack->map[index];

  return slot->buckets * sizeof(trigram_entry_t);
}
|
155
|
+
|
156
|
+
/******************************************************************************/
|
157
|
+
|
158
|
+
/* Releases <ptr> with free(3) unless it is NULL. */
static void free_if(void* ptr)
{
  if (ptr != NULL) {
    free(ptr);
  }
}
|
164
|
+
|
165
|
+
/******************************************************************************/
|
166
|
+
|
167
|
+
/*
  Allocates an empty trigram map on the heap and stores its handle in
  <haystack_ptr>.

  Returns 0 on success, -1 if the allocation failed (in which case
  <haystack_ptr> is left untouched).
*/
int blurrily_storage_new(trigram_map* haystack_ptr)
{
  trigram_map fresh = (trigram_map)NULL;

  LOG("blurrily_storage_new\n");

  /* zero-filled allocation of the whole table */
  fresh = (trigram_map) calloc(1, sizeof(trigram_map_t));
  if (fresh == NULL) return -1;

  /* file header fields */
  memcpy(fresh->magic, "trigra", 6);
  fresh->big_endian   = get_big_endian();
  fresh->pointer_size = get_pointer_size();

  /* not mapped, as we just created it in memory */
  fresh->mapped_size = 0;
  fresh->mapped_fd   = 0;

  fresh->total_references = 0;
  fresh->total_trigrams   = 0;

  /* every trigram starts without any storage */
  for (int slot = 0; slot < TRIGRAM_COUNT; ++slot) {
    trigram_entries_t* bucket = &fresh->map[slot];

    bucket->buckets = 0;
    bucket->used    = 0;
    bucket->dirty   = 0;
    bucket->entries = (trigram_entry_t*)NULL;
  }

  *haystack_ptr = fresh;
  return 0;
}
|
197
|
+
|
198
|
+
/******************************************************************************/
|
199
|
+
|
200
|
+
/*
  Maps the file at <path> (as written by blurrily_storage_save) into memory
  with a private, writable mapping and stores the resulting handle in
  <haystack>.

  The file is validated before use: it must be at least as large as the
  header, carry the "trigra" magic, match this host's byte order and pointer
  size, and every entries block must lie inside the file.

  Returns 0 on success. On failure returns -1 with errno set (EINVAL for a
  malformed file), releases the mapping and file descriptor, and leaves
  <haystack> untouched.
*/
int blurrily_storage_load(trigram_map* haystack, const char* path)
{
  int         fd          = -1;
  int         saved_errno = 0;
  size_t      file_size   = 0;
  void*       mapping     = MAP_FAILED;
  trigram_map header      = NULL;
  uint8_t*    origin      = NULL;
  struct stat metadata;

  /* open and map file */
  fd = open(path, O_RDONLY);
  if (fd < 0) goto failure;

  if (fstat(fd, &metadata) < 0) goto failure;

  /* the header is dereferenced below, so the file must contain all of it */
  if (metadata.st_size < (off_t) sizeof(trigram_map_t)) {
    errno = EINVAL;
    goto failure;
  }
  file_size = (size_t) metadata.st_size;

  /* mmap reports failure with MAP_FAILED, never NULL */
  mapping = mmap(NULL, file_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
  if (mapping == MAP_FAILED) goto failure;
  header = (trigram_map) mapping;

  /* check magic and host compatibility */
  if (memcmp(header->magic, "trigra", 6) != 0
      || header->big_endian   != get_big_endian()
      || header->pointer_size != get_pointer_size()) {
    errno = EINVAL;
    goto failure;
  }

  /* fix header data */
  header->mapped_size = file_size;
  header->mapped_fd   = fd;
  origin = (uint8_t*)header;
  for (int k = 0; k < TRIGRAM_COUNT; ++k) {
    trigram_entries_t* map   = header->map + k;
    size_t             block = (size_t) map->buckets * sizeof(trigram_entry_t);

    if (map->entries_offset == 0) continue;

    /* refuse offsets or sizes that would point outside the mapping */
    if (map->used > map->buckets
        || map->entries_offset > file_size
        || block > file_size - map->entries_offset) {
      errno = EINVAL;
      goto failure;
    }

    /* turn the on-disk offset into an in-memory pointer */
    map->entries        = (trigram_entry_t*) (origin + map->entries_offset);
    map->entries_offset = 0;
  }

  *haystack = header;
  return 0;

failure:
  saved_errno = errno; /* munmap/close may clobber it */
  if (mapping != MAP_FAILED) (void) munmap(mapping, file_size);
  if (fd >= 0)               (void) close(fd);
  errno = saved_errno;
  return -1;
}
|
236
|
+
|
237
|
+
/******************************************************************************/
|
238
|
+
|
239
|
+
/*
  Releases the map referenced by <haystack_ptr> and resets the handle to
  NULL.

  A map that was mapped from disk (non-zero mapped_size) is unmapped and its
  file descriptor closed; a heap-allocated map has each entries array and
  then the map itself freed.

  Always returns 0; unmap/close failures trip an assertion.
*/
int blurrily_storage_close(trigram_map* haystack_ptr)
{
  trigram_map map    = *haystack_ptr;
  int         status = -1;

  LOG("blurrily_storage_close\n");

  if (map->mapped_size == 0) {
    /* heap-resident map */
    for (int slot = 0; slot < TRIGRAM_COUNT; ++slot) {
      free(map->map[slot].entries);
    }
    free(map);
  } else {
    /* read the descriptor first: the header lives inside the mapping */
    int fd = map->mapped_fd;

    status = munmap(map, map->mapped_size);
    assert(status >= 0);

    status = close(fd);
    assert(status >= 0);
  }

  *haystack_ptr = NULL;
  return 0;
}
|
266
|
+
|
267
|
+
/******************************************************************************/
|
268
|
+
|
269
|
+
/*
  Writes <haystack> to <path>.

  The data is first written to "<path>.tmp" through a shared mapping (header
  page-aligned, followed by one page-aligned block per non-empty trigram),
  then renamed over <path> so readers never see a partial file. Dirty
  trigram buckets are sorted in memory before being written.

  Returns 0 on success. On failure returns -1 with errno set, and the
  temporary file, mapping and descriptor are cleaned up.
*/
int blurrily_storage_save(trigram_map haystack, const char* path)
{
  int         fd          = -1;
  int         res         = -1;
  int         written     = -1;
  int         saved_errno = 0;
  void*       mapping     = MAP_FAILED;
  uint8_t*    ptr         = (uint8_t*)NULL;
  size_t      total_size  = 0;
  size_t      offset      = 0;
  trigram_map header      = NULL;
  char        path_tmp[PATH_MAX];

  /* cleanup maps in memory */
  for (int k = 0; k < TRIGRAM_COUNT; ++k) {
    sort_map_if_dirty(haystack->map + k);
  }

  /* path for temporary file; refuse a truncated name rather than write */
  /* to an unintended location                                          */
  written = snprintf(path_tmp, PATH_MAX, "%s.tmp", path);
  if (written < 0 || written >= PATH_MAX) {
    errno = ENAMETOOLONG;
    return -1;
  }

  /* compute storage space required */
  total_size += round_to_page(sizeof(trigram_map_t));

  for (int k = 0; k < TRIGRAM_COUNT; ++k) {
    total_size += round_to_page(get_map_size(haystack, k));
  }

  /* open and map file */
  fd = open(path_tmp, O_RDWR | O_CREAT | O_TRUNC, 0644);
  if (fd < 0) return -1;

  if (ftruncate(fd, total_size) < 0) goto failure;

  /* mmap reports failure with MAP_FAILED, never NULL */
  mapping = mmap(NULL, total_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
  if (mapping == MAP_FAILED) goto failure;
  ptr = (uint8_t*) mapping;

  /* flush data */
  memset(ptr, 0x00, total_size);

  /* copy header & clean copy */
  memcpy(ptr, (void*)haystack, sizeof(trigram_map_t));
  offset += round_to_page(sizeof(trigram_map_t));
  header = (trigram_map)ptr;

  /* mapping details are process-local and must not be persisted */
  header->mapped_size = 0;
  header->mapped_fd   = 0;

  /* copy each map, set offset in header */
  for (int k = 0; k < TRIGRAM_COUNT; ++k) {
    size_t block_size = get_map_size(haystack, k);

    /* pointers are meaningless on disk; offsets replace them */
    header->map[k].entries        = NULL;
    header->map[k].entries_offset = 0;

    if (block_size > 0) {
      memcpy(ptr+offset, haystack->map[k].entries, block_size);
      header->map[k].entries_offset = offset;
      offset += round_to_page(block_size);
    }
  }
  assert(offset == total_size);

  res = munmap(mapping, total_size);
  mapping = MAP_FAILED;
  if (res < 0) goto failure;

  res = close(fd);
  fd = -1;
  if (res < 0) goto failure;

  /* commit by renaming the file */
  if (rename(path_tmp, path) < 0) goto failure;

  return 0;

failure:
  saved_errno = errno; /* the cleanup calls may clobber it */
  if (mapping != MAP_FAILED) (void) munmap(mapping, total_size);
  if (fd >= 0)               (void) close(fd);
  (void) unlink(path_tmp);
  errno = saved_errno;
  return -1;
}
|
345
|
+
|
346
|
+
/******************************************************************************/
|
347
|
+
|
348
|
+
/*
  Indexes <needle> in <haystack>: one entry {reference, weight} is appended
  to the bucket of every trigram produced by the tokeniser.

  If <weight> is zero it is replaced by the length of <needle>.

  Returns 0 on success, -1 if memory could not be allocated. On failure,
  entries already appended for earlier trigrams of this needle are kept and
  the global counters are not updated.

  NOTE(review): buckets are grown with realloc(); for a map obtained from
  blurrily_storage_load() the entries pointers point into the file mapping,
  not the heap, so growing such a bucket looks unsafe -- confirm whether
  putting into a loaded map is meant to be supported.
*/
int blurrily_storage_put(trigram_map haystack, const char* needle, uint32_t reference, uint32_t weight)
{
  int        nb_trigrams = -1;
  int        length      = strlen(needle);
  trigram_t* trigrams    = (trigram_t*)NULL;

  /* the tokeniser is given room for length+1 trigrams */
  trigrams = (trigram_t*)malloc((length+1) * sizeof(trigram_t));
  if (trigrams == NULL) return -1;

  nb_trigrams = blurrily_tokeniser_parse_string(needle, trigrams);

  if (weight == 0) weight = length;

  for (int k = 0; k < nb_trigrams; ++k) {
    trigram_t          t     = trigrams[k];
    trigram_entries_t* map   = NULL;
    trigram_entry_t    entry = { reference, weight };

    /* validate the index before using it */
    assert(t < TRIGRAM_COUNT);
    map = &haystack->map[t];
    assert(map->used <= map->buckets);

    /* allocate more space as needed (exponential growth) */
    if (map->buckets == 0) {
      trigram_entry_t* first = NULL;
      LOG("- alloc for %d\n", t);

      first = (trigram_entry_t*) calloc(TRIGRAM_ENTRIES_START_SIZE, sizeof(trigram_entry_t));
      if (first == NULL) goto failure;

      map->entries = first;
      map->buckets = TRIGRAM_ENTRIES_START_SIZE;
    }
    if (map->used == map->buckets) {
      /* same value as buckets * 4/3, without the intermediate overflow */
      uint32_t         new_buckets = map->buckets + map->buckets / 3;
      trigram_entry_t* new_entries = NULL;
      LOG("- realloc for %d\n", t);

      /* tiny buckets would not grow at all with integer division */
      if (new_buckets <= map->buckets) new_buckets = map->buckets + 1;

      /* keep the old pointer valid if realloc fails */
      new_entries = (trigram_entry_t*) realloc(map->entries, new_buckets * sizeof(trigram_entry_t));
      if (new_entries == NULL) goto failure;

      /* zero extra space */
      memset(new_entries + map->buckets, 0x00, (new_buckets - map->buckets) * sizeof(trigram_entry_t));

      map->buckets = new_buckets;
      map->entries = new_entries;
    }

    map->entries[map->used] = entry;
    map->used += 1;
    map->dirty = 1; /* appended out of order; needs sorting before use */
  }
  haystack->total_trigrams   += nb_trigrams;
  haystack->total_references += 1;

  free((void*)trigrams);
  return 0;

failure:
  free((void*)trigrams);
  return -1;
}
|
400
|
+
|
401
|
+
/******************************************************************************/
|
402
|
+
|
403
|
+
/*
  Searches <haystack> for references sharing trigrams with <needle>.

  The entries of every trigram of the needle are gathered and sorted by
  reference, then reduced to one match per distinct reference (counting how
  many trigrams it shares with the needle). Matches are ordered with
  compare_matches and the first <limit> are copied to <results>, which the
  caller must have allocated with room for <limit> elements.

  Returns the number of results written (possibly 0), or -1 if memory could
  not be allocated.
*/
int blurrily_storage_find(trigram_map haystack, const char* needle, uint16_t limit, trigram_match results)
{
  int              nb_trigrams = -1;
  int              length      = strlen(needle);
  trigram_t*       trigrams    = (trigram_t*)NULL;
  int              nb_entries  = 0;
  trigram_entry_t* entries     = NULL;
  trigram_entry_t* entry_ptr   = NULL;
  int              nb_matches  = 0;
  trigram_match_t* matches     = NULL;
  trigram_match_t* match_ptr   = NULL;
  int              nb_results  = 0;

  trigrams = (trigram_t*)malloc((length+1) * sizeof(trigram_t));
  if (trigrams == NULL) { nb_results = -1; goto cleanup; }

  nb_trigrams = blurrily_tokeniser_parse_string(needle, trigrams);
  if (nb_trigrams <= 0) goto cleanup;

  LOG("%d trigrams in '%s'\n", nb_trigrams, needle);

  /* measure size required for sorting */
  for (int k = 0; k < nb_trigrams; ++k) {
    trigram_t t = trigrams[k];
    nb_entries += haystack->map[t].used;
  }
  if (nb_entries == 0) goto cleanup;

  /* allocate sorting memory */
  entries = (trigram_entry_t*) malloc(nb_entries * sizeof(trigram_entry_t));
  if (entries == NULL) { nb_results = -1; goto cleanup; }
  LOG("allocated space for %d trigrams entries\n", nb_entries);

  /* copy data for sorting */
  entry_ptr = entries;
  for (int k = 0; k < nb_trigrams; ++k) {
    trigram_t t    = trigrams[k];
    size_t    used = haystack->map[t].used;

    /* empty buckets may have a NULL entries pointer: nothing to copy */
    if (used == 0) continue;

    sort_map_if_dirty(haystack->map + t);
    memcpy(entry_ptr, haystack->map[t].entries, used * sizeof(trigram_entry_t));
    entry_ptr += used;
  }
  assert(entry_ptr == entries + nb_entries);

  /* sort data, so entries of the same reference become adjacent */
  MERGESORT(entries, nb_entries, sizeof(trigram_entry_t), &compare_entries);
  LOG("sorting entries\n");

  /* count distinct matches by comparing neighbours; no sentinel value is */
  /* used, so every uint32_t reference (including UINT32_MAX) is counted  */
  nb_matches = 1;
  for (int k = 1; k < nb_entries; ++k) {
    if (entries[k].reference != entries[k - 1].reference) {
      ++nb_matches;
    }
  }
  LOG("total %d distinct matches\n", nb_matches);

  /* allocate matches result */
  matches = (trigram_match_t*) calloc(nb_matches, sizeof(trigram_match_t));
  if (matches == NULL) { nb_results = -1; goto cleanup; }

  /* reduction, counting matches per reference */
  entry_ptr = entries;
  match_ptr = matches;
  match_ptr->matches   = 0;
  match_ptr->reference = entry_ptr->reference; /* setup the first match to */
  match_ptr->weight    = entry_ptr->weight;    /* simplify the loop        */
  for (int k = 0; k < nb_entries; ++k) {
    if (entry_ptr->reference != match_ptr->reference) {
      ++match_ptr;
      match_ptr->reference = entry_ptr->reference;
      match_ptr->weight    = entry_ptr->weight;
      match_ptr->matches   = 1;
    } else {
      match_ptr->matches  += 1;
    }
    assert((int) match_ptr->matches <= nb_trigrams);
    ++entry_ptr;
  }
  assert(match_ptr == matches + nb_matches - 1);
  assert(entry_ptr == entries + nb_entries);

  /* sort by number of matches, then weight (qsort) */
  qsort(matches, nb_matches, sizeof(trigram_match_t), &compare_matches);

  /* output results */
  nb_results = (limit < nb_matches) ? limit : nb_matches;
  for (int k = 0; k < nb_results; ++k) {
    results[k] = matches[k];
    LOG("match %d: reference %d, matchiness %d, weight %d\n", k, matches[k].reference, matches[k].matches, matches[k].weight);
  }

cleanup:
  free(entries);
  free(matches);
  free(trigrams);
  return nb_results;
}
|
507
|
+
|
508
|
+
/******************************************************************************/
|
509
|
+
|
510
|
+
/*
  Removes every entry stored for <reference>, scanning all trigram buckets.

  An entry is removed by overwriting it with the bucket's last used entry,
  which breaks the bucket's sort order; the bucket is therefore marked dirty
  so it gets re-sorted before it is next relied upon.

  Returns the number of entries removed (0 if <reference> was not found).
  The reference counter is only decremented when something was actually
  removed, so deleting an unknown reference cannot make it wrap around.
*/
int blurrily_storage_delete(trigram_map haystack, uint32_t reference)
{
  int trigrams_deleted = 0;

  for (int k = 0; k < TRIGRAM_COUNT; ++k) {
    trigram_entries_t* map = haystack->map + k;
    unsigned int       j   = 0;

    while (j < map->used) {
      trigram_entry_t* entry = map->entries + j;

      if (entry->reference != reference) {
        ++j;
        continue;
      }

      /* swap-remove; stay on index j to examine the entry moved into it */
      *entry = map->entries[map->used - 1];
      map->used -= 1;
      map->dirty = 1;

      ++trigrams_deleted;
    }
  }

  haystack->total_trigrams -= trigrams_deleted;
  if (trigrams_deleted > 0) {
    haystack->total_references -= 1;
  }
  return trigrams_deleted;
}
|
533
|
+
|
534
|
+
/******************************************************************************/
|
535
|
+
|
536
|
+
/*
  Copies the map-wide counters (number of references and of trigram
  entries) from <haystack> into <stats>. Always returns 0.
*/
int blurrily_storage_stats(trigram_map haystack, trigram_stat_t* stats)
{
  const uint32_t reference_count = haystack->total_references;
  const uint32_t trigram_count   = haystack->total_trigrams;

  stats->references = reference_count;
  stats->trigrams   = trigram_count;
  return 0;
}
|
@@ -0,0 +1,109 @@
|
|
1
|
+
/*
|
2
|
+
|
3
|
+
storage.h --
|
4
|
+
|
5
|
+
Trigram map creation, persistence, and querying.
|
6
|
+
|
7
|
+
*/
|
8
|
+
#include <inttypes.h>
|
9
|
+
#include "tokeniser.h"
|
10
|
+
#include "blurrily.h"
|
11
|
+
|
12
|
+
struct trigram_map_t;
|
13
|
+
typedef struct trigram_map_t* trigram_map;
|
14
|
+
|
15
|
+
/* One search result, as written to the <results> array of */
/* blurrily_storage_find().                                */
struct PACKED_STRUCT trigram_match_t
{
  uint32_t reference; /* identifier given when the string was put */
  uint32_t matches;   /* number of trigram entries shared with the needle */
  uint32_t weight;    /* weight stored with the string */
};
typedef struct trigram_match_t  trigram_match_t;
typedef struct trigram_match_t* trigram_match;
|
22
|
+
|
23
|
+
/* Map-wide counters, as filled in by blurrily_storage_stats(). */
typedef struct trigram_stat_t
{
  uint32_t references; /* total references counter of the map */
  uint32_t trigrams;   /* total trigrams counter of the map */
} trigram_stat_t;
|
28
|
+
|
29
|
+
|
30
|
+
/*
|
31
|
+
Create a new trigram map, resident in memory.
|
32
|
+
*/
|
33
|
+
int blurrily_storage_new(trigram_map* haystack);
|
34
|
+
|
35
|
+
/*
|
36
|
+
Load an existing trigram map from disk.
|
37
|
+
*/
|
38
|
+
int blurrily_storage_load(trigram_map* haystack, const char* path);
|
39
|
+
|
40
|
+
/*
|
41
|
+
Release resources claimed by <new> or <load>.
|
42
|
+
*/
|
43
|
+
int blurrily_storage_close(trigram_map* haystack);
|
44
|
+
|
45
|
+
/*
|
46
|
+
Persist to disk what <blurrily_storage_new> or <blurrily_storage_load>
|
47
|
+
gave you.
|
48
|
+
*/
|
49
|
+
int blurrily_storage_save(trigram_map haystack, const char* path);
|
50
|
+
|
51
|
+
/*
|
52
|
+
Add a new string to the map. <reference> is your identifier for that
|
53
|
+
string, <weight> will be used to discriminate entries that match "as
|
54
|
+
well" when searching.
|
55
|
+
|
56
|
+
If <weight> is zero, it will be replaced by the number of characters in
|
57
|
+
the <needle>.
|
58
|
+
|
59
|
+
Returns zero on success, negative on failure.
|
60
|
+
*/
|
61
|
+
int blurrily_storage_put(trigram_map haystack, const char* needle, uint32_t reference, uint32_t weight);
|
62
|
+
|
63
|
+
/*
|
64
|
+
Check the map for an existing <reference>.
|
65
|
+
|
66
|
+
Returns < 0 on error, 0 if the reference is not found, the number of trigrams
|
67
|
+
for that reference otherwise.
|
68
|
+
|
69
|
+
If <weight> is not NULL, will be set to the weight value passed to the put
|
70
|
+
method on return (if the reference is found).
|
71
|
+
|
72
|
+
If <trigrams> is not NULL, it should point to an array <nb_trigrams> long,
|
73
|
+
and up to <nb_trigrams> will be copied into it matching the <needle>
|
74
|
+
originally passed to the put method.
|
75
|
+
|
76
|
+
Note that this is an O(n) method: the whole map will be read.
|
77
|
+
*/
|
78
|
+
// int blurrily_storage_get(trigram_map haystack, uint32_t reference, uint32_t* weight, int nb_trigrams, trigram_t* trigrams);
|
79
|
+
|
80
|
+
/*
|
81
|
+
Remove a <reference> from the map.
|
82
|
+
|
83
|
+
Note that this is very inefficient.
|
84
|
+
|
85
|
+
Returns positive on success, negative on failure.
|
86
|
+
*/
|
87
|
+
int blurrily_storage_delete(trigram_map haystack, uint32_t reference);
|
88
|
+
|
89
|
+
/*
|
90
|
+
Return at most <limit> entries matching <needle> from the <haystack>.
|
91
|
+
|
92
|
+
Results are written to <results>. The first results are the entries
|
93
|
+
sharing the most trigrams with the <needle>. Amongst entries with the same
|
94
|
+
number of matches, the lightest ones (lowest <weight>) will be returned
|
95
|
+
first.
|
96
|
+
|
97
|
+
<results> should be allocated by the caller.
|
98
|
+
|
99
|
+
Returns number of matches on success, negative on failure.
|
100
|
+
*/
|
101
|
+
int blurrily_storage_find(trigram_map haystack, const char* needle, uint16_t limit, trigram_match results);
|
102
|
+
|
103
|
+
/*
|
104
|
+
Copies metadata into <stats>
|
105
|
+
|
106
|
+
Returns positive on success, negative on failure.
|
107
|
+
*/
|
108
|
+
int blurrily_storage_stats(trigram_map haystack, trigram_stat_t* stats);
|
109
|
+
|
@@ -0,0 +1,127 @@
|
|
1
|
+
#include <stdlib.h>
|
2
|
+
#include <string.h>
|
3
|
+
#include <stdio.h>
|
4
|
+
#include <math.h>
|
5
|
+
#include "tokeniser.h"
|
6
|
+
#include "log.h"
|
7
|
+
#include "blurrily.h"
|
8
|
+
|
9
|
+
|
10
|
+
/******************************************************************************/
|
11
|
+
|
12
|
+
/*
  Integer exponentiation by repeated multiplication.

  Returns <a> raised to the power <b>; any <b> of zero or less yields 1.
*/
static int ipow(int a, int b)
{
  int product = 1;

  for (int remaining = b; remaining > 0; --remaining) {
    product *= a;
  }
  return product;
}
|
19
|
+
|
20
|
+
/******************************************************************************/
|
21
|
+
|
22
|
+
/*
  Encode the first three characters of <input> as a base-TRIGRAM_BASE number
  and store it in <output>.

  A letter 'a'..'z' at position k contributes (letter index + 1) times
  TRIGRAM_BASE^k. Any other character (including the '*' placeholder, which
  lies outside 'a'..'z') contributes nothing, i.e. is encoded as digit 0.
*/
static void string_to_code(const char* input, trigram_t *output)
{
  trigram_t code = 0;
  int       pos  = 0;

  while (pos < 3) {
    const char symbol = input[pos];

    if (symbol >= 'a' && symbol <= 'z') {
      code += ipow(TRIGRAM_BASE, pos) * (symbol - 'a' + 1);
    }
    ++pos;
  }

  *output = code;
}
|
33
|
+
|
34
|
+
/******************************************************************************/
|
35
|
+
|
36
|
+
/*
  Decode the trigram code <input> into its three-character text form.

  Each base-TRIGRAM_BASE digit becomes one character: digit 0 is rendered as
  '*', any other digit d as the character 'a' + d - 1. <output> receives
  three characters followed by a terminating zero byte, so it must have room
  for four chars.
*/
static void code_to_string(trigram_t input, char* output)
{
  int pos;

  for (pos = 0; pos < 3; pos++) {
    uint16_t digit = input / ipow(TRIGRAM_BASE, pos) % TRIGRAM_BASE;

    output[pos] = (digit == 0) ? '*' : ('a' + digit - 1);
  }
  output[3] = 0;
}
|
48
|
+
|
49
|
+
/******************************************************************************/
|
50
|
+
|
51
|
+
static int blurrily_compare_trigrams(const void* left_p, const void* right_p)
|
52
|
+
{
|
53
|
+
trigram_t* left = (trigram_t*)left_p;
|
54
|
+
trigram_t* right = (trigram_t*)right_p;
|
55
|
+
return (int)*left - (int)*right;
|
56
|
+
}
|
57
|
+
|
58
|
+
/******************************************************************************/
|
59
|
+
|
60
|
+
/*
  Split <input> into trigram codes, written sorted and de-duplicated to
  <output>.

  The input is padded to "**<input>*", spaces are turned into the '*'
  placeholder, and every window of three consecutive characters is encoded,
  giving strlen(input)+1 codes. <output> must therefore provide at least
  strlen(input)+1 slots. Duplicate codes are overwritten with a sentinel and
  sorted to the tail, so slots past the returned count hold the sentinel
  rather than valid trigrams.

  Returns the number of distinct trigrams, or a negative number on failure
  (NULL argument or out of memory).
*/
int blurrily_tokeniser_parse_string(const char* input, trigram_t* output)
{
  int   length;
  char* normalized;
  int   duplicates = 0;

  if (input == NULL || output == NULL) return -1;

  length     = (int) strlen(input);
  normalized = (char*) malloc(length+5);
  if (normalized == NULL) return -1;

  /* "**" + input + "*" is length+3 characters plus the terminator */
  snprintf(normalized, length+4, "**%s*", input);

  /* replace spaces with '*' */
  for (int k = 0; k < length+3; ++k) {
    if (normalized[k] == ' ') normalized[k] = '*';
  }

  /* compute trigrams */
  for (int k = 0; k <= length; ++k) {
    string_to_code(normalized+k, output+k);
  }

  /* print results */
  LOG("-- normalization\n");
  LOG("%s -> %s\n", input, normalized);
  LOG("-- tokenisation\n");
  for (int k = 0; k <= length; ++k) {
    char res[4];

    code_to_string(output[k], res);

    LOG("%c%c%c -> %d -> %s\n",
      normalized[k], normalized[k+1], normalized[k+2],
      output[k], res
    );
  }

  /* sort */
  qsort((void*)output, length+1, sizeof(trigram_t), &blurrily_compare_trigrams);

  /* remove duplicates: in each run of equal codes, all but the last are
     replaced by 32768, which is above every code string_to_code can produce
     (at most TRIGRAM_BASE^3 - 1) */
  for (int k = 1; k <= length; ++k) {
    trigram_t* previous = output + k - 1;
    trigram_t* current  = output + k;

    if (*previous == *current) {
      *previous = 32768;
      ++duplicates;
    }
  }

  /* compact: re-sorting pushes the sentinels to the end of the array */
  qsort((void*)output, length+1, sizeof(trigram_t), &blurrily_compare_trigrams);

  /* print again */
  LOG("-- after sort/compact\n");
  for (int k = 0; k <= length-duplicates; ++k) {
    char res[4];
    code_to_string(output[k], res);
    LOG("%d -> %s\n", output[k], res);
  }

  free((void*)normalized);
  return length+1 - duplicates;
}
|
121
|
+
|
122
|
+
/******************************************************************************/
|
123
|
+
|
124
|
+
/*
  Write the text form of the trigram code <input> into <output>.

  <output> must be allocated by the caller with room for three characters
  plus the terminating zero byte.

  Returns positive on success, negative if <output> is NULL.
*/
int blurrily_tokeniser_trigram(trigram_t input, char* output)
{
  if (output == NULL) return -1;

  code_to_string(input, output);
  return 1;
}
|
@@ -0,0 +1,41 @@
|
|
1
|
+
/*
|
2
|
+
|
3
|
+
tokeniser.h --
|
4
|
+
|
5
|
+
Split a string into an array of trigrams.
|
6
|
+
|
7
|
+
The input string should be only lowercase latin letters and spaces
|
8
|
+
(convert using iconv).
|
9
|
+
|
10
|
+
Each trigram is a three-symbol tuple consisting of letters and the
|
11
|
+
"epsilon" character used to represent spaces and beginning-of-word/end-of-
|
12
|
+
word anchors.
|
13
|
+
|
14
|
+
Each trigram is represented by a 16-bit integer.
|
15
|
+
|
16
|
+
*/
|
17
|
+
#include <inttypes.h>
|
18
|
+
|
19
|
+
#define TRIGRAM_BASE 28
|
20
|
+
|
21
|
+
typedef uint16_t trigram_t;
|
22
|
+
|
23
|
+
/*
|
24
|
+
Parse the <input> string and store the result in <output>.
|
25
|
+
<output> must be allocated by the caller and provide at least as many slots
|
26
|
+
as characters in <input>, plus one.
|
27
|
+
(not all will necessarily be filled)
|
28
|
+
|
29
|
+
Returns the number of trigrams on success, a negative number on failure.
|
30
|
+
*/
|
31
|
+
int blurrily_tokeniser_parse_string(const char* input, trigram_t* output);
|
32
|
+
|
33
|
+
|
34
|
+
/*
|
35
|
+
Given an <input> returns a string representation of the trigram in <output>.
|
36
|
+
<output> must be allocated by caller and will always be exactly 3
|
37
|
+
characters plus NUL.
|
38
|
+
|
39
|
+
Returns positive on success, negative on failure.
|
40
|
+
*/
|
41
|
+
int blurrily_tokeniser_trigram(trigram_t input, char* output);
|
data/lib/blurrily.rb
ADDED
data/lib/blurrily/map.rb
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'blurrily/map_ext'
|
2
|
+
require 'active_support/all' # fixme: we only need enough to get mb_chars and alias_method_chain in
|
3
|
+
|
4
|
+
module Blurrily
  Map.class_eval do

    # Normalizes +needle+, then delegates to the original +put+.
    def put_with_string_normalize(needle, reference, weight=0)
      needle = normalize_string needle
      put_without_string_normalize(needle, reference, weight)
    end

    alias_method_chain :put, :string_normalize


    # Normalizes +needle+, then delegates to the original +find+.
    def find_with_string_normalize(needle, limit=10)
      needle = normalize_string needle
      find_without_string_normalize(needle, limit)
    end

    alias_method_chain :find, :string_normalize


    private

    # Reduces +needle+ to lowercase a-z words separated by single spaces.
    #
    # Strings already made only of a-z and spaces skip the transliteration
    # step. The check is anchored with \A and \z (whole string) rather than
    # ^ and $ (any single line), so a multi-line needle with one clean line
    # no longer bypasses it.
    def normalize_string(needle)
      result = needle.downcase
      unless result =~ /\A[a-z ]+\z/
        # Decompose accented characters, drop the non-ASCII remainder, and
        # turn everything that is still not a-z into a space.
        result = result.mb_chars.normalize(:kd).gsub(/[^\x00-\x7F]/,'').to_s.gsub(/[^a-z]/,' ')
      end
      result.gsub(/\s+/,' ').strip
    end

  end
end
|
File without changes
|
metadata
ADDED
@@ -0,0 +1,199 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: blurrily
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Julien Letessier
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-03-27 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: activesupport
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: eventmachine
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: json
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rspec
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - '>='
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rake
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - '>='
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - '>='
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: rake-compiler
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - '>='
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - '>='
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: pry
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - '>='
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - '>='
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: pry-nav
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - '>='
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - '>='
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0'
|
125
|
+
- !ruby/object:Gem::Dependency
|
126
|
+
name: pry-doc
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - '>='
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: '0'
|
132
|
+
type: :development
|
133
|
+
prerelease: false
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - '>='
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '0'
|
139
|
+
- !ruby/object:Gem::Dependency
|
140
|
+
name: progressbar
|
141
|
+
requirement: !ruby/object:Gem::Requirement
|
142
|
+
requirements:
|
143
|
+
- - '>='
|
144
|
+
- !ruby/object:Gem::Version
|
145
|
+
version: '0'
|
146
|
+
type: :development
|
147
|
+
prerelease: false
|
148
|
+
version_requirements: !ruby/object:Gem::Requirement
|
149
|
+
requirements:
|
150
|
+
- - '>='
|
151
|
+
- !ruby/object:Gem::Version
|
152
|
+
version: '0'
|
153
|
+
description: Native fuzzy string search
|
154
|
+
email:
|
155
|
+
- julien.letessier@gmail.com
|
156
|
+
executables: []
|
157
|
+
extensions:
|
158
|
+
- ext/blurrily/extconf.rb
|
159
|
+
extra_rdoc_files: []
|
160
|
+
files:
|
161
|
+
- lib/blurrily/map.rb
|
162
|
+
- lib/blurrily/server.rb
|
163
|
+
- lib/blurrily/version.rb
|
164
|
+
- lib/blurrily.rb
|
165
|
+
- ext/blurrily/map_ext.c
|
166
|
+
- ext/blurrily/storage.c
|
167
|
+
- ext/blurrily/tokeniser.c
|
168
|
+
- ext/blurrily/blurrily.h
|
169
|
+
- ext/blurrily/log.h
|
170
|
+
- ext/blurrily/storage.h
|
171
|
+
- ext/blurrily/tokeniser.h
|
172
|
+
- ext/blurrily/extconf.rb
|
173
|
+
- README.md
|
174
|
+
- LICENSE.txt
|
175
|
+
homepage: http://github.com/mezis/blurrily
|
176
|
+
licenses: []
|
177
|
+
metadata: {}
|
178
|
+
post_install_message:
|
179
|
+
rdoc_options: []
|
180
|
+
require_paths:
|
181
|
+
- lib
|
182
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
183
|
+
requirements:
|
184
|
+
- - '>='
|
185
|
+
- !ruby/object:Gem::Version
|
186
|
+
version: '0'
|
187
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
188
|
+
requirements:
|
189
|
+
- - '>='
|
190
|
+
- !ruby/object:Gem::Version
|
191
|
+
version: '0'
|
192
|
+
requirements: []
|
193
|
+
rubyforge_project:
|
194
|
+
rubygems_version: 2.0.0
|
195
|
+
signing_key:
|
196
|
+
specification_version: 4
|
197
|
+
summary: Native fuzzy string search
|
198
|
+
test_files: []
|
199
|
+
has_rdoc:
|