vinted-blurrily 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +22 -0
- data/README.md +213 -0
- data/bin/blurrily +43 -0
- data/ext/blurrily/blurrily.h +21 -0
- data/ext/blurrily/extconf.rb +21 -0
- data/ext/blurrily/map_ext.c +230 -0
- data/ext/blurrily/search_tree.c +66 -0
- data/ext/blurrily/search_tree.h +30 -0
- data/ext/blurrily/storage.c +629 -0
- data/ext/blurrily/storage.h +119 -0
- data/ext/blurrily/tokeniser.c +126 -0
- data/ext/blurrily/tokeniser.h +46 -0
- data/lib/blurrily.rb +1 -0
- data/lib/blurrily/client.rb +136 -0
- data/lib/blurrily/command_processor.rb +53 -0
- data/lib/blurrily/defaults.rb +10 -0
- data/lib/blurrily/map.rb +49 -0
- data/lib/blurrily/map_group.rb +39 -0
- data/lib/blurrily/server.rb +49 -0
- data/lib/blurrily/version.rb +3 -0
- metadata +280 -0
@@ -0,0 +1,119 @@
|
|
1
|
+
/*
|
2
|
+
|
3
|
+
storage.h --
|
4
|
+
|
5
|
+
Trigram map creation, persistence, and querying.
|
6
|
+
|
7
|
+
*/
|
8
|
+
#ifndef __STORAGE_H__
|
9
|
+
#define __STORAGE_H__
|
10
|
+
|
11
|
+
#include <inttypes.h>
|
12
|
+
#include "tokeniser.h"
|
13
|
+
#include "blurrily.h"
|
14
|
+
|
15
|
+
struct trigram_map_t;
|
16
|
+
typedef struct trigram_map_t* trigram_map;
|
17
|
+
|
18
|
+
struct BR_PACKED_STRUCT trigram_match_t {
|
19
|
+
uint32_t reference;
|
20
|
+
uint32_t matches;
|
21
|
+
uint32_t weight;
|
22
|
+
};
|
23
|
+
typedef struct trigram_match_t trigram_match_t;
|
24
|
+
typedef struct trigram_match_t* trigram_match;
|
25
|
+
|
26
|
+
typedef struct trigram_stat_t {
|
27
|
+
uint32_t references;
|
28
|
+
uint32_t trigrams;
|
29
|
+
|
30
|
+
} trigram_stat_t;
|
31
|
+
|
32
|
+
|
33
|
+
/*
|
34
|
+
Create a new trigram map, resident in memory.
|
35
|
+
*/
|
36
|
+
int blurrily_storage_new(trigram_map* haystack);
|
37
|
+
|
38
|
+
/*
|
39
|
+
Load an existing trigram map from disk.
|
40
|
+
*/
|
41
|
+
int blurrily_storage_load(trigram_map* haystack, const char* path);
|
42
|
+
|
43
|
+
/*
|
44
|
+
Release resources claimed by <blurrily_storage_new> or <blurrily_storage_load>.
|
45
|
+
*/
|
46
|
+
int blurrily_storage_close(trigram_map* haystack);
|
47
|
+
|
48
|
+
/*
|
49
|
+
Mark resources managed by Ruby GC.
|
50
|
+
*/
|
51
|
+
void blurrily_storage_mark(trigram_map haystack);
|
52
|
+
|
53
|
+
|
54
|
+
/*
|
55
|
+
Persist to disk what <blurrily_storage_new> or <blurrily_storage_load>
|
56
|
+
gave you.
|
57
|
+
*/
|
58
|
+
int blurrily_storage_save(trigram_map haystack, const char* path);
|
59
|
+
|
60
|
+
/*
|
61
|
+
Add a new string to the map. <reference> is your identifier for that
|
62
|
+
string, <weight> will be used to discriminate entries that match "as
|
63
|
+
well" when searching.
|
64
|
+
|
65
|
+
If <weight> is zero, it will be replaced by the number of characters in
|
66
|
+
the <needle>.
|
67
|
+
|
68
|
+
Returns positive on success, negative on failure.
|
69
|
+
*/
|
70
|
+
int blurrily_storage_put(trigram_map haystack, const char* needle, uint32_t reference, uint32_t weight);
|
71
|
+
|
72
|
+
/*
|
73
|
+
Check the map for an existing <reference>.
|
74
|
+
|
75
|
+
Returns < 0 on error, 0 if the reference is not found, the number of trigrams
|
76
|
+
for that reference otherwise.
|
77
|
+
|
78
|
+
If <weight> is not NULL, will be set to the weight value passed to the put
|
79
|
+
method on return (if the reference is found).
|
80
|
+
|
81
|
+
If <trigrams> is not NULL, it should point to an array <nb_trigrams> long,
|
82
|
+
and up to <nb_trigrams> will be copied into it matching the <needle>
|
83
|
+
originally passed to the put method.
|
84
|
+
|
85
|
+
Note that this is an O(n) method: the whole map will be read.
|
86
|
+
*/
|
87
|
+
// int blurrily_storage_get(trigram_map haystack, uint32_t reference, uint32_t* weight, int nb_trigrams, trigram_t* trigrams);
|
88
|
+
|
89
|
+
/*
|
90
|
+
Remove a <reference> from the map.
|
91
|
+
|
92
|
+
Note that this is very inefficient.
|
93
|
+
|
94
|
+
Returns positive on success, negative on failure.
|
95
|
+
*/
|
96
|
+
int blurrily_storage_delete(trigram_map haystack, uint32_t reference);
|
97
|
+
|
98
|
+
/*
|
99
|
+
Return at most <limit> entries matching <needle> from the <haystack>.
|
100
|
+
|
101
|
+
Results are written to <results>. The first results are the entries
|
102
|
+
sharing the most trigrams with the <needle>. Amongst entries with the same
|
103
|
+
number of matches, the lightest ones (lowest <weight>) will be returned
|
104
|
+
first.
|
105
|
+
|
106
|
+
<results> should be allocated by the caller.
|
107
|
+
|
108
|
+
Returns number of matches on success, negative on failure.
|
109
|
+
*/
|
110
|
+
int blurrily_storage_find(trigram_map haystack, const char* needle, uint16_t limit, trigram_match results);
|
111
|
+
|
112
|
+
/*
|
113
|
+
Copies metadata into <stats>
|
114
|
+
|
115
|
+
Returns positive on success, negative on failure.
|
116
|
+
*/
|
117
|
+
int blurrily_storage_stats(trigram_map haystack, trigram_stat_t* stats);
|
118
|
+
|
119
|
+
#endif
|
@@ -0,0 +1,126 @@
|
|
1
|
+
#include <stdlib.h>
|
2
|
+
#include <string.h>
|
3
|
+
#include <stdio.h>
|
4
|
+
#include <math.h>
|
5
|
+
#include "tokeniser.h"
|
6
|
+
#include "blurrily.h"
|
7
|
+
|
8
|
+
|
9
|
+
/******************************************************************************/
|
10
|
+
|
11
|
+
/*
  Integer exponentiation: returns <a> multiplied by itself <b> times.
  A zero or negative <b> yields 1.
*/
static int ipow(int a, int b)
{
  int product = 1;

  for (int remaining = b; remaining > 0; --remaining) {
    product *= a;
  }

  return product;
}
|
18
|
+
|
19
|
+
/******************************************************************************/
|
20
|
+
|
21
|
+
/*
  Encode the three symbols starting at <input> as a single trigram code,
  written to <output>.

  The code is a base-TRIGRAM_BASE number whose least significant digit is the
  first symbol. Lowercase letters 'a'..'z' map to digits 1..26; any other
  symbol (including the '*' anchor) contributes 0.
*/
static void string_to_code(const char* input, trigram_t *output)
{
  trigram_t code     = 0;
  int       position = 0;

  while (position < 3) {
    const char symbol = input[position];

    /* '*' sorts below 'a', so the range check alone also skips anchors */
    if (symbol >= 'a' && symbol <= 'z') {
      code += ipow(TRIGRAM_BASE, position) * (symbol - 'a' + 1);
    }
    ++position;
  }

  *output = code;
}
|
32
|
+
|
33
|
+
/******************************************************************************/
|
34
|
+
|
35
|
+
/*
  Decode the trigram code <input> into its three-symbol text form.

  <output> receives three characters followed by a terminating 0, so it must
  have room for 4 bytes. A zero digit is rendered as '*', digit d > 0 as the
  character 'a' + d - 1.
*/
static void code_to_string(trigram_t input, char* output)
{
  int digit = 0;

  while (digit < 3) {
    /* extract the base-TRIGRAM_BASE digit at this position */
    const uint16_t value = input / ipow(TRIGRAM_BASE, digit) % TRIGRAM_BASE;

    output[digit] = (value == 0) ? '*' : ('a' + value - 1);
    ++digit;
  }

  output[3] = 0;
}
|
47
|
+
|
48
|
+
/******************************************************************************/
|
49
|
+
|
50
|
+
static int blurrily_compare_trigrams(const void* left_p, const void* right_p)
|
51
|
+
{
|
52
|
+
trigram_t* left = (trigram_t*)left_p;
|
53
|
+
trigram_t* right = (trigram_t*)right_p;
|
54
|
+
return (int)*left - (int)*right;
|
55
|
+
}
|
56
|
+
|
57
|
+
/******************************************************************************/
|
58
|
+
|
59
|
+
/*
  Split <input> into trigrams and store their codes in <output>, sorted in
  ascending order with duplicates removed.

  The input is first normalised to "**<input>*" with spaces replaced by '*',
  which yields strlen(input) + 1 trigrams before de-duplication; <output> must
  therefore provide at least that many slots.

  Returns the number of unique trigrams stored at the front of <output>, or a
  negative number on failure (NULL argument or out of memory).
*/
int blurrily_tokeniser_parse_string(const char* input, trigram_t* output)
{
  /* Marker written over duplicate entries. It is larger than any code
     string_to_code can produce, so re-sorting pushes duplicates to the end. */
  const trigram_t duplicate_marker = 32768;
  size_t length     = 0;
  size_t duplicates = 0;
  char*  normalized = NULL;

  if (input == NULL || output == NULL) return -1;

  length     = strlen(input);
  normalized = (char*) malloc(length+5);
  if (normalized == NULL) return -1;

  /* two leading anchors + input + one trailing anchor + terminator */
  snprintf(normalized, length+4, "**%s*", input);

  /* replace spaces with '*' */
  for (size_t k = 0; k < length+3; ++k) {
    if (normalized[k] == ' ') normalized[k] = '*';
  }

  /* compute trigrams */
  for (size_t k = 0; k <= length; ++k) {
    string_to_code(normalized+k, output+k);
  }

  /* print results */
  LOG("-- normalization\n");
  LOG("%s -> %s\n", input, normalized);
  LOG("-- tokenisation\n");
  for (size_t k = 0; k <= length; ++k) {
    char res[4];

    code_to_string(output[k], res);

    LOG("%c%c%c -> %d -> %s\n",
      normalized[k], normalized[k+1], normalized[k+2],
      output[k], res
    );
  }

  /* sort */
  qsort((void*)output, length+1, sizeof(trigram_t), &blurrily_compare_trigrams);

  /* remove duplicates: in a sorted array equal codes are adjacent, so
     overwrite all but the last entry of each run with the marker */
  for (size_t k = 1; k <= length; ++k) {
    trigram_t* previous = output + k - 1;
    trigram_t* current  = output + k;

    if (*previous == *current) {
      *previous = duplicate_marker;
      ++duplicates;
    }
  }

  /* compact: markers sort after every real code */
  qsort((void*)output, length+1, sizeof(trigram_t), &blurrily_compare_trigrams);

  /* print again */
  LOG("-- after sort/compact\n");
  for (size_t k = 0; k <= length-duplicates; ++k) {
    char res[4];
    code_to_string(output[k], res);
    LOG("%d -> %s\n", output[k], res);
  }

  free((void*)normalized);
  return (int) (length + 1 - duplicates);
}
|
120
|
+
|
121
|
+
/******************************************************************************/
|
122
|
+
|
123
|
+
/*
  Write the three-symbol text form of the trigram code <input> into <output>,
  followed by a terminating 0 (<output> must have room for 4 bytes).

  Returns positive on success, negative on failure (NULL <output>, or an
  <input> too large to be expressed as three base-TRIGRAM_BASE digits).
*/
int blurrily_tokeniser_trigram(trigram_t input, char* output)
{
  if (output == NULL) return -1;

  /* codes at or above TRIGRAM_BASE^3 do not correspond to any trigram */
  if ((int) input >= ipow(TRIGRAM_BASE, 3)) return -1;

  code_to_string(input, output);
  return 1;
}
|
@@ -0,0 +1,46 @@
|
|
1
|
+
/*
|
2
|
+
|
3
|
+
tokeniser.h --
|
4
|
+
|
5
|
+
Split a string into an array of trigrams.
|
6
|
+
|
7
|
+
The input string should be only lowercase latin letters and spaces
|
8
|
+
(convert using iconv).
|
9
|
+
|
10
|
+
Each trigram is a three-symbol tuple consisting of letters and the
|
11
|
+
"epsilon" character used to represent spaces and beginning-of-word/end-of-
|
12
|
+
word anchors.
|
13
|
+
|
14
|
+
Each trigram is represented by a 16-bit integer.
|
15
|
+
|
16
|
+
*/
|
17
|
+
#ifndef __TOKENISER_H__
|
18
|
+
#define __TOKENISER_H__
|
19
|
+
|
20
|
+
#include <inttypes.h>
|
21
|
+
|
22
|
+
#define TRIGRAM_BASE 28
|
23
|
+
|
24
|
+
typedef uint16_t trigram_t;
|
25
|
+
|
26
|
+
/*
|
27
|
+
Parse the <input> string and store the result in <output>.
|
28
|
+
<output> must be allocated by the caller and provide at least as many slots
|
29
|
+
as characters in <input>, plus one.
|
30
|
+
(not all will necessarily be filled)
|
31
|
+
|
32
|
+
Returns the number of trigrams on success, a negative number on failure.
|
33
|
+
*/
|
34
|
+
int blurrily_tokeniser_parse_string(const char* input, trigram_t* output);
|
35
|
+
|
36
|
+
|
37
|
+
/*
|
38
|
+
Given an <input> returns a string representation of the trigram in <output>.
|
39
|
+
<output> must be allocated by caller and will always be exactly 3
|
40
|
+
characters plus a terminating NUL.
|
41
|
+
|
42
|
+
Returns positive on success, negative on failure.
|
43
|
+
*/
|
44
|
+
int blurrily_tokeniser_trigram(trigram_t input, char* output);
|
45
|
+
|
46
|
+
#endif
|
data/lib/blurrily.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require 'blurrily/version'
|
@@ -0,0 +1,136 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'socket'
|
4
|
+
require 'ipaddr'
|
5
|
+
require 'blurrily/defaults'
|
6
|
+
|
7
|
+
module Blurrily
|
8
|
+
class Client
  Error = Class.new(RuntimeError)

  # Initialize a new {Blurrily::Client} for talking to a {Blurrily::Server}.
  # The TCP connection itself is opened lazily, on the first command sent.
  #
  # @param options Hash of settings; every key is optional:
  #                :host    - IP Address or FQDN of the Blurrily::Server.
  #                           Defaults to Blurrily::DEFAULT_HOST.
  #                :port    - Port Blurrily::Server is listening on.
  #                           Defaults to Blurrily::DEFAULT_PORT.
  #                :db_name - Name of the data store being targeted.
  #                           Defaults to Blurrily::DEFAULT_DATABASE.
  #
  # Examples
  #
  # ```
  # Blurrily::Client.new(:host => '127.0.0.1', :port => 12021, :db_name => 'location_en')
  # # => #<Blurrily::Client:0x007fcd0d33e708 @host="127.0.0.1", @port=12021, @db_name="location_en">
  # ```
  #
  # @returns the instance of {Blurrily::Client}
  def initialize(options = {})
    @host    = options.fetch(:host,    DEFAULT_HOST)
    @port    = options.fetch(:port,    DEFAULT_PORT)
    @db_name = options.fetch(:db_name, DEFAULT_DATABASE)
  end

  # Find record references based on a given string (needle)
  #
  # @param needle The string you're searching for matches on.
  #               Must be a non-empty String and must not contain tabs.
  #               Required
  # @param limit  Limit the number of results returned.
  #               Must be in LIMIT_RANGE; LIMIT_DEFAULT is used when omitted.
  #               Optional
  #
  # Examples
  #
  # ```
  # @client.find('London')
  # # => [[123,6,3],[124,5,3]...]
  # ```
  #
  # @returns an Array of matching [`ref`,`score`,`weight`] ordered by score. `ref` is the identifying value of the original record.
  #          Note that unless modified, `weight` is simply the string length.
  # @raises ArgumentError on an invalid needle or limit, {Error} when the
  #         server reports an error, disconnects, or breaks protocol.
  def find(needle, limit = nil)
    limit ||= LIMIT_DEFAULT
    check_valid_needle(needle)
    raise(ArgumentError, "LIMIT value must be in #{LIMIT_RANGE}") unless LIMIT_RANGE.include?(limit)

    cmd = ["FIND", @db_name, needle, limit]
    send_cmd_and_get_results(cmd).map(&:to_i).each_slice(3).to_a
  end

  # Index a given record in the data store this client targets.
  #
  # @param needle The string you wish to index. Must not contain tabs. Required
  # @param ref    The identifying value of the record being indexed. Must be in REF_RANGE. Required
  # @param weight Weight of this particular reference. Default 0. Must be in WEIGHT_RANGE.
  #               Don't change unless you know what you're doing. Optional.
  #
  # Examples
  #
  # ```
  # @client.put('London', 123, 0)
  # # => nil
  # ```
  #
  # @returns nil on success.
  # @raises ArgumentError on invalid arguments, {Error} when the server
  #         reports an error, disconnects, or breaks protocol.
  def put(needle, ref, weight = 0)
    check_valid_needle(needle)
    check_valid_ref(ref)
    raise(ArgumentError, "WEIGHT value must be in #{WEIGHT_RANGE}") unless WEIGHT_RANGE.include?(weight)

    cmd = ["PUT", @db_name, needle, ref, weight]
    send_cmd_and_get_results(cmd)
    return
  end

  # Remove the record identified by `ref` from the data store.
  #
  # @param ref The identifying value of the record. Must be in REF_RANGE. Required
  #
  # @returns nil on success.
  def delete(ref)
    check_valid_ref(ref)
    cmd = ['DELETE', @db_name, ref]
    send_cmd_and_get_results(cmd)
    return
  end

  # Ask the server to clear the data store this client targets.
  #
  # @returns nil on success.
  def clear()
    send_cmd_and_get_results(['CLEAR', @db_name])
    return
  end


  private


  PORT_RANGE = 1025..32768

  # Raises ArgumentError unless needle is a non-empty String without tabs
  # (tabs are the protocol's field separator).
  def check_valid_needle(needle)
    raise(ArgumentError, "bad needle") if !needle.kind_of?(String) || needle.empty? || needle.include?("\t")
  end

  # Raises ArgumentError unless ref lies within REF_RANGE.
  def check_valid_ref(ref)
    raise(ArgumentError, "REF value must be in #{REF_RANGE}") unless REF_RANGE.include?(ref)
  end


  # Returns the socket to the server, opening it on first use.
  def connection
    @connection ||= TCPSocket.new(@host, @port)
  end

  # Closes and forgets the current socket so the next command reconnects.
  def disconnect
    socket, @connection = @connection, nil
    socket.close if socket && !socket.closed?
  end

  # Sends one tab-separated command line and parses the single-line reply.
  #
  # @returns an Array of the tab-separated fields following "OK"
  #          (empty when the reply is a bare "OK").
  # @raises Error when the server answers with ERROR, closes the connection,
  #         or sends anything else.
  def send_cmd_and_get_results(argv)
    output = argv.join("\t")
    connection.puts output
    input = connection.gets
    case input
    when "OK\n"
      return []
    when /^OK\t(.*)\n/
      return $1.split("\t")
    when /^ERROR\t(.*)\n/
      raise Error, $1
    when nil
      # The socket is dead; drop it instead of reusing it on every later call.
      disconnect
      raise Error, 'Server disconnected'
    else
      raise Error, 'Server did not respect protocol'
    end
  end

end
|
136
|
+
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'blurrily/defaults'
|
3
|
+
|
4
|
+
module Blurrily
|
5
|
+
# Parses one line of the tab-separated text protocol, runs the command
# against the given map group and formats the reply line.
class CommandProcessor
  ProtocolError = Class.new(StandardError)

  # @param map_group object that answers `map(name)` (returning something
  #                  with put/delete/find) and `clear(name)`.
  def initialize(map_group)
    @map_group = map_group
  end

  # Processes a single command line of the form
  # "COMMAND\tdatabase[\targ...]".
  #
  # @returns "OK" followed by any tab-separated results on success, or
  #          "ERROR\t<message>" when the command is unknown, malformed, or
  #          the handler raises ArgumentError.
  def process_command(line)
    command, map_name, *args = line.split(/\t/)
    raise ProtocolError, 'Unknown command' unless COMMANDS.include? command
    # \A and \Z anchor the whole string; ^ and $ only anchor a line, which
    # would let "name\n<anything>" through.
    raise ProtocolError, 'Invalid database name' unless map_name =~ /\A[a-z_]+\Z/
    # A wrong number of fields surfaces as ArgumentError from this dispatch.
    result = send("on_#{command}", map_name, *args)
    ['OK', *result].compact.join("\t")
  rescue ArgumentError, ProtocolError => e
    ['ERROR', e.message].join("\t")
  end

  private

  COMMANDS = %w(FIND PUT DELETE CLEAR)

  # PUT <db> <needle> <ref> [<weight>]: index a needle. Returns nil.
  def on_PUT(map_name, needle, ref, weight = nil)
    raise ProtocolError, 'Invalid reference' unless ref =~ /\A\d+\Z/ && REF_RANGE.include?(ref.to_i)
    raise ProtocolError, 'Invalid weight' unless weight.nil? || (weight =~ /\A\d+\Z/ && WEIGHT_RANGE.include?(weight.to_i))

    @map_group.map(map_name).put(*[needle, ref.to_i, weight.to_i].compact)
    return
  end

  # DELETE <db> <ref>: remove a reference. Returns nil.
  def on_DELETE(map_name, ref)
    raise ProtocolError, 'Invalid reference' unless ref =~ /\A\d+\Z/ && REF_RANGE.include?(ref.to_i)

    @map_group.map(map_name).delete(ref.to_i)
    return
  end

  # FIND <db> <needle> [<limit>]: returns the flattened search results.
  def on_FIND(map_name, needle, limit = nil)
    raise ProtocolError, 'Limit must be a number' if limit && !LIMIT_RANGE.include?(limit.to_i)

    results = @map_group.map(map_name).find(*[needle, limit && limit.to_i].compact)
    return results.flatten
  end

  # CLEAR <db>: clear the named map. Returns nil.
  def on_CLEAR(map_name)
    @map_group.clear(map_name)
    return
  end
end
|
53
|
+
end
|