vinted-blurrily 1.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE.txt +22 -0
- data/README.md +213 -0
- data/bin/blurrily +43 -0
- data/ext/blurrily/blurrily.h +21 -0
- data/ext/blurrily/extconf.rb +21 -0
- data/ext/blurrily/map_ext.c +230 -0
- data/ext/blurrily/search_tree.c +66 -0
- data/ext/blurrily/search_tree.h +30 -0
- data/ext/blurrily/storage.c +629 -0
- data/ext/blurrily/storage.h +119 -0
- data/ext/blurrily/tokeniser.c +126 -0
- data/ext/blurrily/tokeniser.h +46 -0
- data/lib/blurrily.rb +1 -0
- data/lib/blurrily/client.rb +136 -0
- data/lib/blurrily/command_processor.rb +53 -0
- data/lib/blurrily/defaults.rb +10 -0
- data/lib/blurrily/map.rb +49 -0
- data/lib/blurrily/map_group.rb +39 -0
- data/lib/blurrily/server.rb +49 -0
- data/lib/blurrily/version.rb +3 -0
- metadata +280 -0
@@ -0,0 +1,119 @@
|
|
1
|
+
/*
|
2
|
+
|
3
|
+
storage.h --
|
4
|
+
|
5
|
+
Trigram map creation, persistence, and querying.
|
6
|
+
|
7
|
+
*/
|
8
|
+
#ifndef __STORAGE_H__
|
9
|
+
#define __STORAGE_H__
|
10
|
+
|
11
|
+
#include <inttypes.h>
|
12
|
+
#include "tokeniser.h"
|
13
|
+
#include "blurrily.h"
|
14
|
+
|
15
|
+
struct trigram_map_t;
|
16
|
+
typedef struct trigram_map_t* trigram_map;
|
17
|
+
|
18
|
+
/*
  One search hit, as written into the <results> array handed to
  blurrily_storage_find.
*/
struct BR_PACKED_STRUCT trigram_match_t {
  uint32_t reference; /* caller-chosen identifier given to blurrily_storage_put */
  uint32_t matches;   /* presumably the number of trigrams shared with the needle -- confirm in storage.c */
  uint32_t weight;    /* weight given to blurrily_storage_put */
};

/* Value type and pointer-to-match handle for the struct above. */
typedef struct trigram_match_t trigram_match_t, *trigram_match;
|
25
|
+
|
26
|
+
/*
  Map metadata, filled in by blurrily_storage_stats.
*/
struct trigram_stat_t {
  uint32_t references; /* presumably the count of references held by the map -- confirm in storage.c */
  uint32_t trigrams;   /* presumably the count of trigram entries held by the map -- confirm in storage.c */
};

typedef struct trigram_stat_t trigram_stat_t;
|
31
|
+
|
32
|
+
|
33
|
+
/*
|
34
|
+
Create a new trigram map, resident in memory.
|
35
|
+
*/
|
36
|
+
int blurrily_storage_new(trigram_map* haystack);
|
37
|
+
|
38
|
+
/*
|
39
|
+
Load an existing trigram map from disk.
|
40
|
+
*/
|
41
|
+
int blurrily_storage_load(trigram_map* haystack, const char* path);
|
42
|
+
|
43
|
+
/*
|
44
|
+
Release resources claimed by <blurrily_storage_new> or <blurrily_storage_load>.
|
45
|
+
*/
|
46
|
+
int blurrily_storage_close(trigram_map* haystack);
|
47
|
+
|
48
|
+
/*
|
49
|
+
Mark resources managed by Ruby GC.
|
50
|
+
*/
|
51
|
+
void blurrily_storage_mark(trigram_map haystack);
|
52
|
+
|
53
|
+
|
54
|
+
/*
|
55
|
+
Persist to disk what <blurrily_storage_new> or <blurrily_storage_load>
|
56
|
+
gave you.
|
57
|
+
*/
|
58
|
+
int blurrily_storage_save(trigram_map haystack, const char* path);
|
59
|
+
|
60
|
+
/*
|
61
|
+
Add a new string to the map. <reference> is your identifier for that
|
62
|
+
string, <weight> will be used to discriminate entries that match "as
|
63
|
+
well" when searching.
|
64
|
+
|
65
|
+
If <weight> is zero, it will be replaced by the number of characters in
|
66
|
+
the <needle>.
|
67
|
+
|
68
|
+
Returns positive on success, negative on failure.
|
69
|
+
*/
|
70
|
+
int blurrily_storage_put(trigram_map haystack, const char* needle, uint32_t reference, uint32_t weight);
|
71
|
+
|
72
|
+
/*
|
73
|
+
Check the map for an existing <reference>.
|
74
|
+
|
75
|
+
Returns < 0 on error, 0 if the reference is not found, the number of trigrams
|
76
|
+
for that reference otherwise.
|
77
|
+
|
78
|
+
If <weight> is not NULL, will be set to the weight value passed to the put
|
79
|
+
method on return (if the reference is found).
|
80
|
+
|
81
|
+
If <trigrams> is not NULL, it should point to an array <nb_trigrams> long,
|
82
|
+
and up to <nb_trigrams> will be copied into it matching the <needle>
|
83
|
+
originally passed to the put method.
|
84
|
+
|
85
|
+
Note that this is an O(n) method: the whole map will be read.
|
86
|
+
*/
|
87
|
+
// int blurrily_storage_get(trigram_map haystack, uint32_t reference, uint32_t* weight, int nb_trigrams, trigram_t* trigrams);
|
88
|
+
|
89
|
+
/*
|
90
|
+
Remove a <reference> from the map.
|
91
|
+
|
92
|
+
Note that this is very inefficient.
|
93
|
+
|
94
|
+
Returns positive on success, negative on failure.
|
95
|
+
*/
|
96
|
+
int blurrily_storage_delete(trigram_map haystack, uint32_t reference);
|
97
|
+
|
98
|
+
/*
|
99
|
+
Return at most <limit> entries matching <needle> from the <haystack>.
|
100
|
+
|
101
|
+
Results are written to <results>. The first results are the entries
|
102
|
+
sharing the most trigrams with the <needle>. Amongst entries with the same
|
103
|
+
number of matches, the lightest ones (lowest <weight>) will be returned
|
104
|
+
first.
|
105
|
+
|
106
|
+
<results> should be allocated by the caller.
|
107
|
+
|
108
|
+
Returns number of matches on success, negative on failure.
|
109
|
+
*/
|
110
|
+
int blurrily_storage_find(trigram_map haystack, const char* needle, uint16_t limit, trigram_match results);
|
111
|
+
|
112
|
+
/*
|
113
|
+
Copies metadata into <stats>
|
114
|
+
|
115
|
+
Returns positive on success, negative on failure.
|
116
|
+
*/
|
117
|
+
int blurrily_storage_stats(trigram_map haystack, trigram_stat_t* stats);
|
118
|
+
|
119
|
+
#endif
|
@@ -0,0 +1,126 @@
|
|
1
|
+
#include <stdlib.h>
|
2
|
+
#include <string.h>
|
3
|
+
#include <stdio.h>
|
4
|
+
#include <math.h>
|
5
|
+
#include "tokeniser.h"
|
6
|
+
#include "blurrily.h"
|
7
|
+
|
8
|
+
|
9
|
+
/******************************************************************************/
|
10
|
+
|
11
|
+
/*
  Integer power: <a> multiplied by itself <b> times.
  Yields 1 when <b> is zero or negative. Overflow is not checked.
*/
static int ipow(int a, int b)
{
  int product = 1;
  int remaining;

  for (remaining = b; remaining > 0; --remaining) {
    product *= a;
  }

  return product;
}
|
18
|
+
|
19
|
+
/******************************************************************************/
|
20
|
+
|
21
|
+
/*
  Encode the three characters starting at <input> as one number in base
  TRIGRAM_BASE and store it in <output>.

  The character at position k contributes its letter index ('a' = 1 ...
  'z' = 26) times TRIGRAM_BASE^k. Any character outside 'a'..'z', which
  includes the '*' epsilon marker, contributes nothing (digit 0).
*/
static void string_to_code(const char* input, trigram_t *output)
{
  trigram_t code = 0;
  int       pos  = 0;

  while (pos < 3) {
    const char symbol = input[pos];

    if (symbol >= 'a' && symbol <= 'z') {
      code += ipow(TRIGRAM_BASE, pos) * (symbol - 'a' + 1);
    }
    ++pos;
  }

  *output = code;
}
|
32
|
+
|
33
|
+
/******************************************************************************/
|
34
|
+
|
35
|
+
/*
  Decode a trigram code into its three-symbol text form.

  Each base-TRIGRAM_BASE digit of <input>, least significant first, becomes
  one character: digit 0 is written as '*', digit d > 0 as the d-th letter
  counting from 'a'. <output> receives the three characters followed by a
  terminating zero byte, so it must have room for 4 chars.
*/
static void code_to_string(trigram_t input, char* output)
{
  int pos;

  for (pos = 0; pos < 3; ++pos) {
    const uint16_t digit = input / ipow(TRIGRAM_BASE, pos) % TRIGRAM_BASE;

    output[pos] = (digit == 0) ? '*' : (char) ('a' + digit - 1);
  }

  output[3] = '\0';
}
|
47
|
+
|
48
|
+
/******************************************************************************/
|
49
|
+
|
50
|
+
static int blurrily_compare_trigrams(const void* left_p, const void* right_p)
|
51
|
+
{
|
52
|
+
trigram_t* left = (trigram_t*)left_p;
|
53
|
+
trigram_t* right = (trigram_t*)right_p;
|
54
|
+
return (int)*left - (int)*right;
|
55
|
+
}
|
56
|
+
|
57
|
+
/******************************************************************************/
|
58
|
+
|
59
|
+
/*
  Split <input> into trigrams and store their codes in <output>.

  The input is wrapped in epsilon markers ("**" in front, "*" behind) and
  every space becomes an epsilon, which yields strlen(input) + 1
  overlapping trigrams. The codes are sorted in ascending order and
  duplicates are dropped, so only the first <return value> slots of
  <output> are meaningful. Slots freed by de-duplication are left holding
  32768, which is larger than any code string_to_code can produce
  (at most 26 * (1 + 28 + 28*28) = 21138) and therefore sorts last.

  <output> must provide at least strlen(input) + 1 slots.

  Returns the number of distinct trigrams, or -1 if <input> or <output> is
  NULL or the working buffer cannot be allocated.
*/
int blurrily_tokeniser_parse_string(const char* input, trigram_t* output)
{
  /* marks a slot emptied by de-duplication; sorts after every real code */
  static const trigram_t duplicate_marker = 32768;

  if (input == NULL || output == NULL) return -1;

  size_t length     = strlen(input);
  size_t duplicates = 0;
  char*  normalized = malloc(length+5);

  /* malloc can fail: report it rather than writing through NULL */
  if (normalized == NULL) return -1;

  snprintf(normalized, length+4, "**%s*", input);

  /* replace spaces with '*' */
  for (size_t k = 0; k < length+3; ++k) {
    if (normalized[k] == ' ') normalized[k] = '*';
  }

  /* compute trigrams */
  for (size_t k = 0; k <= length; ++k) {
    string_to_code(normalized+k, output+k);
  }

  /* print results */
  LOG("-- normalization\n");
  LOG("%s -> %s\n", input, normalized);
  LOG("-- tokenisation\n");
  for (size_t k = 0; k <= length; ++k) {
    char res[4];

    code_to_string(output[k], res);

    LOG("%c%c%c -> %d -> %s\n",
      normalized[k], normalized[k+1], normalized[k+2],
      output[k], res
    );
  }

  /* sort, so that equal codes become neighbours */
  qsort((void*)output, length+1, sizeof(trigram_t), &blurrily_compare_trigrams);

  /* remove duplicates: keep the last of each run, mark the others */
  for (size_t k = 1; k <= length; ++k) {
    trigram_t* previous = output + k - 1;
    trigram_t* current  = output + k;

    if (*previous == *current) {
      *previous = duplicate_marker;
      ++duplicates;
    }
  }

  /* compact: a second sort pushes every marker to the tail */
  qsort((void*)output, length+1, sizeof(trigram_t), &blurrily_compare_trigrams);

  /* print again */
  LOG("-- after sort/compact\n");
  for (size_t k = 0; k <= length-duplicates; ++k) {
    char res[4];
    code_to_string(output[k], res);
    LOG("%d -> %s\n", output[k], res);
  }

  free(normalized);
  return (int) (length + 1 - duplicates);
}
|
120
|
+
|
121
|
+
/******************************************************************************/
|
122
|
+
|
123
|
+
/*
  Write the text form of the trigram code <input> into <output>.

  <output> must be allocated by the caller with room for 4 chars: three
  symbols ('a'..'z', or '*' for epsilon) plus the terminating zero byte.

  Returns 1 on success. Returns -1 without touching <output> when <output>
  is NULL or <input> is not a three-digit base-TRIGRAM_BASE number.
*/
int blurrily_tokeniser_trigram(trigram_t input, char* output)
{
  if (output == NULL) return -1;

  /* codes at or above TRIGRAM_BASE^3 cannot come from three symbols */
  if ((int) input >= ipow(TRIGRAM_BASE, 3)) return -1;

  code_to_string(input, output);
  return 1;
}
|
@@ -0,0 +1,46 @@
|
|
1
|
+
/*
|
2
|
+
|
3
|
+
tokeniser.h --
|
4
|
+
|
5
|
+
Split a string into an array of trigrams.
|
6
|
+
|
7
|
+
The input string should be only lowercase latin letters and spaces
|
8
|
+
(convert using iconv).
|
9
|
+
|
10
|
+
Each trigram is a three-symbol tuple consisting of letters and the
|
11
|
+
"epsilon" character used to represent spaces and beginning-of-word/end-of-
|
12
|
+
word anchors.
|
13
|
+
|
14
|
+
Each trigram is represented by a 16-bit integer.
|
15
|
+
|
16
|
+
*/
|
17
|
+
#ifndef __TOKENISER_H__
|
18
|
+
#define __TOKENISER_H__
|
19
|
+
|
20
|
+
#include <inttypes.h>
|
21
|
+
|
22
|
+
#define TRIGRAM_BASE 28
|
23
|
+
|
24
|
+
typedef uint16_t trigram_t;
|
25
|
+
|
26
|
+
/*
|
27
|
+
Parse the <input> string and store the result in <output>.
|
28
|
+
<output> must be allocated by the caller and provide at least as many slots
|
29
|
+
as characters in <input>, plus one.
|
30
|
+
(not all will necessarily be filled)
|
31
|
+
|
32
|
+
Returns the number of trigrams on success, a negative number on failure.
|
33
|
+
*/
|
34
|
+
int blurrily_tokeniser_parse_string(const char* input, trigram_t* output);
|
35
|
+
|
36
|
+
|
37
|
+
/*
|
38
|
+
Given an <input> returns a string representation of the trigram in <output>.
|
39
|
+
<output> must be allocated by caller and will always be exactly 3
|
40
|
+
characters plus the terminating NUL.
|
41
|
+
|
42
|
+
Returns positive on success, negative on failure.
|
43
|
+
*/
|
44
|
+
int blurrily_tokeniser_trigram(trigram_t input, char* output);
|
45
|
+
|
46
|
+
#endif
|
data/lib/blurrily.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require 'blurrily/version'
|
@@ -0,0 +1,136 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'socket'
|
4
|
+
require 'ipaddr'
|
5
|
+
require 'blurrily/defaults'
|
6
|
+
|
7
|
+
module Blurrily
  # Client for the tab-separated, line-based protocol spoken by
  # Blurrily::Server. Every command is sent as one line and answered by one
  # line starting with "OK" or "ERROR".
  class Client
    Error = Class.new(RuntimeError)

    # Initialize a new {Blurrily::Client} for a {Blurrily::Server}.
    #
    # No socket is opened here; the connection is made on the first command.
    #
    # @param options Hash of settings, all optional:
    #   :host    - IP Address or FQDN of the Blurrily::Server.
    #              Defaults to Blurrily::DEFAULT_HOST.
    #   :port    - Port Blurrily::Server is listening on.
    #              Defaults to Blurrily::DEFAULT_PORT.
    #   :db_name - Name of the data store being targeted.
    #              Defaults to Blurrily::DEFAULT_DATABASE.
    #
    # Examples
    #
    #  ```
    #  Blurrily::Client.new(:host => '127.0.0.1', :port => 12021, :db_name => 'location_en')
    #  # => #<Blurrily::Client:0x007fcd0d33e708 @host="127.0.0.1", @port=12021, @db_name="location_en">
    #  ```
    #
    # @returns the instance of {Blurrily::Client}
    def initialize(options = {})
      @host    = options.fetch(:host,    DEFAULT_HOST)
      @port    = options.fetch(:port,    DEFAULT_PORT)
      @db_name = options.fetch(:db_name, DEFAULT_DATABASE)
    end

    # Find record references based on a given string (needle)
    #
    # @param needle The string you're searching for matches on.
    #               Must be a non-empty String without tabs.
    #               Required
    # @param limit  Limit the number of results returned.
    #               Must lie in Blurrily::LIMIT_RANGE; defaults to
    #               Blurrily::LIMIT_DEFAULT.
    #               Optional
    #
    # Examples
    #
    #  ```
    #  @client.find('London')
    #  # => [[123,6,3],[124,5,3]...]
    #  ```
    #
    # @returns an Array of matching [`ref`,`score`,`weight`] ordered by score. `ref` is the identifying value of the original record.
    #          Note that unless modified, `weight` is simply the string length.
    # @raises ArgumentError on an invalid needle or limit,
    #         Blurrily::Client::Error when the server reports an error.
    def find(needle, limit = nil)
      limit ||= LIMIT_DEFAULT
      check_valid_needle(needle)
      raise(ArgumentError, "LIMIT value must be in #{LIMIT_RANGE}") unless LIMIT_RANGE.include?(limit)

      cmd = ["FIND", @db_name, needle, limit]
      send_cmd_and_get_results(cmd).map(&:to_i).each_slice(3).to_a
    end

    # Index a given record in the data store chosen at construction time.
    #
    # @param needle  The string you wish to index. Must not contain tabs. Required
    # @param ref     The identifying value of the record being indexed.
    #                Must lie in Blurrily::REF_RANGE. Required
    # @param weight  Weight of this particular reference. Default 0. Don't change unless you know what you're doing. Optional.
    #
    # Examples
    #
    #  ```
    #  @client.put('London', 123, 0)
    #  # => nil
    #  ```
    #
    # @returns nil; failures are signalled by exceptions only.
    # @raises ArgumentError on an invalid needle, ref or weight,
    #         Blurrily::Client::Error when the server reports an error.
    def put(needle, ref, weight = 0)
      check_valid_needle(needle)
      check_valid_ref(ref)
      raise(ArgumentError, "WEIGHT value must be in #{WEIGHT_RANGE}") unless WEIGHT_RANGE.include?(weight)

      cmd = ["PUT", @db_name, needle, ref, weight]
      send_cmd_and_get_results(cmd)
      return
    end

    # Remove a record from the data store.
    #
    # @param ref The identifying value the record was indexed with.
    #            Must lie in Blurrily::REF_RANGE. Required
    #
    # @returns nil
    def delete(ref)
      check_valid_ref(ref)
      cmd = ['DELETE', @db_name, ref]
      send_cmd_and_get_results(cmd)
      return
    end

    # Send the CLEAR command for the data store.
    #
    # @returns nil
    def clear()
      send_cmd_and_get_results(['CLEAR', @db_name])
      return
    end


    private


    PORT_RANGE = 1025..32768

    # Raises ArgumentError unless needle is a non-empty String free of tabs
    # (a tab would be read as a field separator by the protocol).
    def check_valid_needle(needle)
      raise(ArgumentError, "bad needle") if !needle.kind_of?(String) || needle.empty? || needle.include?("\t")
    end

    # Raises ArgumentError unless ref lies in Blurrily::REF_RANGE.
    def check_valid_ref(ref)
      raise(ArgumentError, "REF value must be in #{REF_RANGE}") unless REF_RANGE.include?(ref)
    end


    # Lazily opened socket to the server, reused across commands.
    def connection
      @connection ||= TCPSocket.new(@host, @port)
    end

    # Forget the cached socket so that the next command reconnects instead of
    # failing forever on a connection the server has already closed.
    def reset_connection
      stale, @connection = @connection, nil
      stale.close if stale && !stale.closed?
    rescue SystemCallError, IOError
      # closing a broken socket may itself fail; it is discarded either way
      nil
    end

    # Send one command (fields joined by tabs) and parse the one-line reply.
    #
    # @returns the reply fields after "OK" as an Array of Strings
    #          (empty for a bare "OK").
    # @raises Blurrily::Client::Error on an ERROR reply, on disconnection,
    #         or on a reply that does not follow the protocol. Socket-level
    #         exceptions are re-raised unchanged.
    def send_cmd_and_get_results(argv)
      output = argv.join("\t")
      connection.puts output
      input = connection.gets
      case input
      when "OK\n"
        return []
      when /^OK\t(.*)\n/
        return $1.split("\t")
      when /^ERROR\t(.*)\n/
        raise Error, $1
      when nil
        reset_connection
        raise Error, 'Server disconnected'
      else
        raise Error, 'Server did not respect protocol'
      end
    rescue SystemCallError, IOError
      reset_connection
      raise
    end

  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'blurrily/defaults'
|
3
|
+
|
4
|
+
module Blurrily
  # Turns one line of the tab-separated server protocol into a call on a map
  # group and renders the one-line reply.
  class CommandProcessor
    ProtocolError = Class.new(StandardError)

    # @param map_group object answering `map(name)` and `clear(name)`.
    def initialize(map_group)
      @map_group = map_group
    end

    # Run one protocol line of the form "COMMAND\tdb_name[\targ...]".
    #
    # @returns "OK" followed by tab-separated results, or "ERROR\t<message>"
    #          when the command is unknown, an argument is invalid, or the
    #          number of arguments is wrong (ArgumentError is rescued too).
    def process_command(line)
      command, map_name, *args = line.split(/\t/)
      raise ProtocolError, 'Unknown command' unless COMMANDS.include? command
      raise ProtocolError, 'Invalid database name' unless map_name =~ MAP_NAME_PATTERN
      result = send("on_#{command}", map_name, *args)
      ['OK', *result].compact.join("\t")
    rescue ArgumentError, ProtocolError => e
      ['ERROR', e.message].join("\t")
    end

    private

    COMMANDS = %w(FIND PUT DELETE CLEAR)

    # \A anchors at the start of the string, so a value with an embedded
    # newline such as "db\nanything" is rejected (with ^ and $ only one of
    # its lines had to match). \Z still tolerates a single trailing newline,
    # exactly as $ did, so an unchomped line keeps working.
    MAP_NAME_PATTERN = /\A[a-z_]+\Z/
    NUMBER_PATTERN   = /\A\d+\Z/

    # PUT: index <needle> under the numeric <ref>, with an optional <weight>
    # (a missing weight is passed on as 0).
    def on_PUT(map_name, needle, ref, weight = nil)
      raise ProtocolError, 'Invalid reference' unless ref =~ NUMBER_PATTERN && REF_RANGE.include?(ref.to_i)
      raise ProtocolError, 'Invalid weight' unless weight.nil? || (weight =~ NUMBER_PATTERN && WEIGHT_RANGE.include?(weight.to_i))

      @map_group.map(map_name).put(needle, ref.to_i, weight.to_i)
      return
    end

    # DELETE: remove the numeric <ref> from the map.
    def on_DELETE(map_name, ref)
      raise ProtocolError, 'Invalid reference' unless ref =~ NUMBER_PATTERN && REF_RANGE.include?(ref.to_i)

      @map_group.map(map_name).delete(ref.to_i)
      return
    end

    # FIND: look <needle> up and return the flattened result list. A given
    # <limit> must consist of digits only and lie in LIMIT_RANGE; when it is
    # omitted the map's own default applies.
    def on_FIND(map_name, needle, limit = nil)
      raise ProtocolError, 'Limit must be a number' if limit && !(limit =~ NUMBER_PATTERN && LIMIT_RANGE.include?(limit.to_i))

      results = @map_group.map(map_name).find(*[needle, limit && limit.to_i].compact)
      return results.flatten
    end

    # CLEAR: delegate to the map group's clear for this map name.
    def on_CLEAR(map_name)
      @map_group.clear(map_name)
      return
    end
  end
end
|