vinted-blurrily 1.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,119 @@
1
+ /*
2
+
3
+ storage.h --
4
+
5
+ Trigram map creation, persistence, and querying.
6
+
7
+ */
8
+ #ifndef __STORAGE_H__
9
+ #define __STORAGE_H__
10
+
11
+ #include <inttypes.h>
12
+ #include "tokeniser.h"
13
+ #include "blurrily.h"
14
+
15
+ struct trigram_map_t;
16
+ typedef struct trigram_map_t* trigram_map;
17
+
18
+ struct BR_PACKED_STRUCT trigram_match_t {
19
+ uint32_t reference;
20
+ uint32_t matches;
21
+ uint32_t weight;
22
+ };
23
+ typedef struct trigram_match_t trigram_match_t;
24
+ typedef struct trigram_match_t* trigram_match;
25
+
26
/*
  Map metadata, as copied out by blurrily_storage_stats().
*/
struct trigram_stat_t {
  uint32_t references;
  uint32_t trigrams;
};

typedef struct trigram_stat_t trigram_stat_t;
31
+
32
+
33
+ /*
34
+ Create a new trigram map, resident in memory.
35
+ */
36
+ int blurrily_storage_new(trigram_map* haystack);
37
+
38
+ /*
39
+ Load an existing trigram map from disk.
40
+ */
41
+ int blurrily_storage_load(trigram_map* haystack, const char* path);
42
+
43
+ /*
44
+ Release resources claimed by <blurrily_storage_new> or <blurrily_storage_load>.
45
+ */
46
+ int blurrily_storage_close(trigram_map* haystack);
47
+
48
+ /*
49
+ Mark resources managed by Ruby GC.
50
+ */
51
+ void blurrily_storage_mark(trigram_map haystack);
52
+
53
+
54
+ /*
55
+ Persist to disk what <blurrily_storage_new> or <blurrily_storage_load>
56
+ gave you.
57
+ */
58
+ int blurrily_storage_save(trigram_map haystack, const char* path);
59
+
60
+ /*
61
+ Add a new string to the map. <reference> is your identifier for that
62
+ string, <weight> will be used to discriminate entries that match "as
63
+ well" when searching.
64
+
65
+ If <weight> is zero, it will be replaced by the number of characters in
66
+ the <needle>.
67
+
68
+ Returns positive on success, negative on failure.
69
+ */
70
+ int blurrily_storage_put(trigram_map haystack, const char* needle, uint32_t reference, uint32_t weight);
71
+
72
+ /*
73
+ Check the map for an existing <reference>.
74
+
75
+ Returns < 0 on error, 0 if the reference is not found, the number of trigrams
76
+ for that reference otherwise.
77
+
78
+ If <weight> is not NULL, will be set to the weight value passed to the put
79
+ method on return (if the reference is found).
80
+
81
+ If <trigrams> is not NULL, it should point to an array <nb_trigrams> long,
82
+ and up to <nb_trigrams> will be copied into it matching the <needle>
83
+ originally passed to the put method.
84
+
85
+ Note that this is an O(n) method: the whole map will be read.
86
+ */
87
+ // int blurrily_storage_get(trigram_map haystack, uint32_t reference, uint32_t* weight, int nb_trigrams, trigram_t* trigrams);
88
+
89
+ /*
90
+ Remove a <reference> from the map.
91
+
92
+ Note that this is very inefficient.
93
+
94
+ Returns positive on success, negative on failure.
95
+ */
96
+ int blurrily_storage_delete(trigram_map haystack, uint32_t reference);
97
+
98
+ /*
99
+ Return at most <limit> entries matching <needle> from the <haystack>.
100
+
101
+ Results are written to <results>. The first results are the entries
102
+ sharing the most trigrams with the <needle>. Amongst entries with the same
103
+ number of matches, the lightest ones (lowest <weight>) will be returned
104
+ first.
105
+
106
+ <results> should be allocated by the caller.
107
+
108
+ Returns number of matches on success, negative on failure.
109
+ */
110
+ int blurrily_storage_find(trigram_map haystack, const char* needle, uint16_t limit, trigram_match results);
111
+
112
+ /*
113
+ Copies metadata into <stats>
114
+
115
+ Returns positive on success, negative on failure.
116
+ */
117
+ int blurrily_storage_stats(trigram_map haystack, trigram_stat_t* stats);
118
+
119
+ #endif
@@ -0,0 +1,126 @@
1
+ #include <stdlib.h>
2
+ #include <string.h>
3
+ #include <stdio.h>
4
+ #include <math.h>
5
+ #include "tokeniser.h"
6
+ #include "blurrily.h"
7
+
8
+
9
+ /******************************************************************************/
10
+
11
/*
  Integer exponentiation: returns <a> multiplied by itself <b> times.
  A zero or negative <b> yields 1.
*/
static int ipow(int a, int b)
{
  int product = 1;

  for (int remaining = b; remaining > 0; --remaining) {
    product *= a;
  }
  return product;
}
18
+
19
+ /******************************************************************************/
20
+
21
+ static void string_to_code(const char* input, trigram_t *output)
22
+ {
23
+ trigram_t result = 0;
24
+
25
+ for (int k = 0 ; k < 3; ++k) {
26
+ if (input[k] == '*' || input[k] < 'a' || input[k] > 'z') continue;
27
+ result += ipow(TRIGRAM_BASE, k) * (input[k] - 'a' + 1);
28
+ }
29
+
30
+ *output = result;
31
+ }
32
+
33
+ /******************************************************************************/
34
+
35
+ static void code_to_string(trigram_t input, char* output)
36
+ {
37
+ for (int k = 0 ; k < 3; ++k) {
38
+ uint16_t elem = input / ipow(TRIGRAM_BASE, k) % TRIGRAM_BASE;
39
+ if (elem == 0) {
40
+ output[k] = '*';
41
+ } else {
42
+ output[k] = ('a' + elem - 1);
43
+ }
44
+ }
45
+ output[3] = 0;
46
+ }
47
+
48
+ /******************************************************************************/
49
+
50
+ static int blurrily_compare_trigrams(const void* left_p, const void* right_p)
51
+ {
52
+ trigram_t* left = (trigram_t*)left_p;
53
+ trigram_t* right = (trigram_t*)right_p;
54
+ return (int)*left - (int)*right;
55
+ }
56
+
57
+ /******************************************************************************/
58
+
59
+ int blurrily_tokeniser_parse_string(const char* input, trigram_t* output)
60
+ {
61
+ size_t length = strlen(input);
62
+ char* normalized = (char*) malloc(length+5);
63
+ size_t duplicates = 0;
64
+
65
+ snprintf(normalized, length+4, "**%s*", input);
66
+
67
+ /* replace spaces with '*' */
68
+ for (size_t k = 0; k < length+3; ++k) {
69
+ if (normalized[k] == ' ') normalized[k] = '*';
70
+ }
71
+
72
+ /* compute trigrams */
73
+ for (size_t k = 0; k <= length; ++k) {
74
+ string_to_code(normalized+k, output+k);
75
+ }
76
+
77
+ /* print results */
78
+ LOG("-- normalization\n");
79
+ LOG("%s -> %s\n", input, normalized);
80
+ LOG("-- tokenisation\n");
81
+ for (size_t k = 0; k <= length; ++k) {
82
+ char res[4];
83
+
84
+ code_to_string(output[k], res);
85
+
86
+ LOG("%c%c%c -> %d -> %s\n",
87
+ normalized[k], normalized[k+1], normalized[k+2],
88
+ output[k], res
89
+ );
90
+ }
91
+
92
+ /* sort */
93
+ qsort((void*)output, length+1, sizeof(trigram_t), &blurrily_compare_trigrams);
94
+
95
+ /* remove duplicates */
96
+ for (size_t k = 1; k <= length; ++k) {
97
+ trigram_t* previous = output + k - 1;
98
+ trigram_t* current = output + k;
99
+
100
+ if (*previous == *current) {
101
+ *previous = 32768;
102
+ ++duplicates;
103
+ }
104
+ }
105
+
106
+ /* compact */
107
+ qsort((void*)output, length+1, sizeof(trigram_t), &blurrily_compare_trigrams);
108
+
109
+ /* print again */
110
+ LOG("-- after sort/compact\n");
111
+ for (size_t k = 0; k <= length-duplicates; ++k) {
112
+ char res[4];
113
+ code_to_string(output[k], res);
114
+ LOG("%d -> %s\n", output[k], res);
115
+ }
116
+
117
+ free((void*)normalized);
118
+ return (int) (length + 1 - duplicates);
119
+ }
120
+
121
+ /******************************************************************************/
122
+
123
+ int blurrily_tokeniser_trigram(trigram_t UNUSED(input), char* UNUSED(output))
124
+ {
125
+ return 0;
126
+ }
@@ -0,0 +1,46 @@
1
+ /*
2
+
3
+ tokeniser.h --
4
+
5
+ Split a string into an array of trigrams.
6
+
7
+ The input string should be only lowercase latin letters and spaces
8
+ (convert using iconv).
9
+
10
+ Each trigram is a three-symbol tuple consisting of letters and the
11
+ "epsilon" character used to represent spaces and beginning-of-word/end-of-
12
+ word anchors.
13
+
14
+ Each trigram is represented by a 16-bit integer.
15
+
16
+ */
17
+ #ifndef __TOKENISER_H__
18
+ #define __TOKENISER_H__
19
+
20
+ #include <inttypes.h>
21
+
22
+ #define TRIGRAM_BASE 28
23
+
24
+ typedef uint16_t trigram_t;
25
+
26
+ /*
27
+ Parse the <input> string and store the result in <output>.
28
+ <output> must be allocated by the caller and provide at least as many slots
29
+ as characters in <input>, plus one.
30
+ (not all will necessarily be filled)
31
+
32
+ Returns the number of trigrams on success, a negative number on failure.
33
+ */
34
+ int blurrily_tokeniser_parse_string(const char* input, trigram_t* output);
35
+
36
+
37
+ /*
38
+ Given an <input> returns a string representation of the trigram in <output>.
39
+ <output> must be allocated by caller and will always be exactly 3
40
+ characters plus a terminating NUL.
41
+
42
+ Returns positive on success, negative on failure.
43
+ */
44
+ int blurrily_tokeniser_trigram(trigram_t input, char* output);
45
+
46
+ #endif
data/lib/blurrily.rb ADDED
@@ -0,0 +1 @@
1
+ require 'blurrily/version'
@@ -0,0 +1,136 @@
1
+ # encoding: utf-8
2
+
3
+ require 'socket'
4
+ require 'ipaddr'
5
+ require 'blurrily/defaults'
6
+
7
module Blurrily
  # TCP client for a Blurrily::Server.
  #
  # Each command is sent as one line of tab-separated fields; the server is
  # expected to answer with one line starting with +OK+ or +ERROR+.
  class Client
    Error = Class.new(RuntimeError)

    # Initialize a new {Blurrily::Client}. The socket itself is opened
    # lazily, when the first command is sent.
    #
    # @param options [Hash] connection settings
    # @option options :host    IP Address or FQDN of the Blurrily::Server.
    #                          Defaults to Blurrily::DEFAULT_HOST.
    # @option options :port    Port Blurrily::Server is listening on.
    #                          Defaults to Blurrily::DEFAULT_PORT.
    # @option options :db_name Name of the data store being targeted.
    #                          Defaults to Blurrily::DEFAULT_DATABASE.
    #
    # Examples
    #
    #  ```
    #  Blurrily::Client.new(:host => '127.0.0.1', :port => 12021, :db_name => 'location_en')
    #  # => #<Blurrily::Client:0x007fcd0d33e708 @host="127.0.0.1", @port=12021, @db_name="location_en">
    #  ```
    #
    # @return the instance of {Blurrily::Client}
    def initialize(options = {})
      @host    = options.fetch(:host,    DEFAULT_HOST)
      @port    = options.fetch(:port,    DEFAULT_PORT)
      @db_name = options.fetch(:db_name, DEFAULT_DATABASE)
    end

    # Find record references based on a given string (needle)
    #
    # @param needle The string you're searching for matches on.
    #               Must be a non-empty String without tabs or newlines.
    #               Required
    # @param limit  Limit the number of results returned.
    #               Must be within Blurrily::LIMIT_RANGE; defaults to
    #               Blurrily::LIMIT_DEFAULT.
    #               Optional
    #
    # Examples
    #
    #  ```
    #  @client.find('London')
    #  # => [[123,6,3],[124,5,3]...]
    #  ```
    #
    # @return an Array of matching [`ref`,`score`,`weight`] ordered by score. `ref` is the identifying value of the original record.
    #         Note that unless modified, `weight` is simply the string length.
    # @raise [ArgumentError] if needle or limit is invalid
    # @raise [Blurrily::Client::Error] if the server reports an error,
    #         disconnects, or answers with an unexpected line
    def find(needle, limit = nil)
      limit ||= LIMIT_DEFAULT
      check_valid_needle(needle)
      raise(ArgumentError, "LIMIT value must be in #{LIMIT_RANGE}") unless LIMIT_RANGE.include?(limit)

      cmd = ["FIND", @db_name, needle, limit]
      send_cmd_and_get_results(cmd).map(&:to_i).each_slice(3).to_a
    end

    # Index a given record in the data store this client targets.
    #
    # @param needle   The string you wish to index. Must be a non-empty
    #                 String without tabs or newlines. Required
    # @param ref      The identifying value of the record being indexed.
    #                 Must be within Blurrily::REF_RANGE. Required
    # @param weight   Weight of this particular reference. Default 0. Don't change unless you know what you're doing. Optional.
    #
    # Examples
    #
    #  ```
    #  @client.put('London', 123, 0)
    #  # => nil
    #  ```
    #
    # @return [nil]
    # @raise [ArgumentError] if needle, ref or weight is invalid
    # @raise [Blurrily::Client::Error] if the server reports an error,
    #         disconnects, or answers with an unexpected line
    def put(needle, ref, weight = 0)
      check_valid_needle(needle)
      check_valid_ref(ref)
      raise(ArgumentError, "WEIGHT value must be in #{WEIGHT_RANGE}") unless WEIGHT_RANGE.include?(weight)

      cmd = ["PUT", @db_name, needle, ref, weight]
      send_cmd_and_get_results(cmd)
      return
    end

    # Remove a record from the data store this client targets.
    #
    # @param ref The identifying value of the record to remove.
    #            Must be within Blurrily::REF_RANGE. Required
    #
    # @return [nil]
    # @raise [ArgumentError] if ref is invalid
    # @raise [Blurrily::Client::Error] if the server reports an error,
    #         disconnects, or answers with an unexpected line
    def delete(ref)
      check_valid_ref(ref)
      cmd = ['DELETE', @db_name, ref]
      send_cmd_and_get_results(cmd)
      return
    end

    # Send the CLEAR command for the data store this client targets.
    #
    # @return [nil]
    # @raise [Blurrily::Client::Error] if the server reports an error,
    #         disconnects, or answers with an unexpected line
    def clear()
      send_cmd_and_get_results(['CLEAR', @db_name])
      return
    end


    private


    PORT_RANGE = 1025..32768

    # A needle travels as one field of a tab-separated, newline-terminated
    # line, so it must not contain either delimiter: a newline would split
    # the command in two and desynchronise requests and responses.
    def check_valid_needle(needle)
      valid = needle.kind_of?(String) &&
              !needle.empty? &&
              !needle.include?("\t") &&
              !needle.include?("\n")
      raise(ArgumentError, "bad needle") unless valid
    end

    def check_valid_ref(ref)
      raise(ArgumentError, "REF value must be in #{REF_RANGE}") unless REF_RANGE.include?(ref)
    end


    # Lazily opened socket to the server, reused across commands.
    def connection
      @connection ||= TCPSocket.new(@host, @port)
    end

    # Forget the cached socket (closing it if possible) so that the next
    # command opens a fresh connection instead of reusing a dead one.
    def reset_connection
      @connection.close if @connection && !@connection.closed?
    rescue SystemCallError, IOError
      nil
    ensure
      @connection = nil
    end

    # Send one command line and parse the single-line answer.
    #
    # @param argv [Array] command fields, joined with tabs
    # @return [Array<String>] the fields following "OK" (possibly empty)
    # @raise [Blurrily::Client::Error] on an ERROR answer, a disconnect, or
    #         an answer that matches neither form
    def send_cmd_and_get_results(argv)
      output = argv.join("\t")
      begin
        connection.puts output
        input = connection.gets
      rescue SystemCallError, IOError
        # the socket is unusable: drop it, then let the caller see the error
        reset_connection
        raise
      end

      case input
      when "OK\n"
        return []
      when /^OK\t(.*)\n/
        return $1.split("\t")
      when /^ERROR\t(.*)\n/
        raise Error, $1
      when nil
        # end of stream: the peer closed the connection
        reset_connection
        raise Error, 'Server disconnected'
      else
        raise Error, 'Server did not respect protocol'
      end
    end

  end
end
@@ -0,0 +1,53 @@
1
+ # encoding: utf-8
2
+ require 'blurrily/defaults'
3
+
4
module Blurrily
  # Executes one protocol line against a group of maps and builds the reply
  # line to send back.
  class CommandProcessor
    ProtocolError = Class.new(StandardError)

    # @param map_group object answering +map(name)+ and +clear(name)+
    def initialize(map_group)
      @map_group = map_group
    end

    # Run a single command.
    #
    # @param line [String] tab-separated fields: command, database name,
    #   then the command's own arguments
    # @return [String] "OK" followed by tab-separated results, or "ERROR"
    #   followed by a message when an ArgumentError or ProtocolError is raised
    def process_command(line)
      command, map_name, *args = line.split(/\t/)
      raise ProtocolError, 'Unknown command' unless COMMANDS.include? command
      raise ProtocolError, 'Invalid database name' unless map_name =~ MAP_NAME_FORMAT
      result = send("on_#{command}", map_name, *args)
      ['OK', *result].compact.join("\t")
    rescue ArgumentError, ProtocolError => e
      ['ERROR', e.message].join("\t")
    end

    private

    COMMANDS = %w(FIND PUT DELETE CLEAR)

    # \A and \Z anchor to the whole string, not to individual lines, so a
    # value with an embedded newline cannot pass on the strength of one
    # well-formed line. \Z still tolerates a single trailing newline.
    MAP_NAME_FORMAT = /\A[a-z_]+\Z/
    INTEGER_FORMAT  = /\A\d+\Z/

    # PUT <map> <needle> <ref> [<weight>]
    def on_PUT(map_name, needle, ref, weight = nil)
      raise ProtocolError, 'Invalid reference' unless ref =~ INTEGER_FORMAT && REF_RANGE.include?(ref.to_i)
      raise ProtocolError, 'Invalid weight' unless weight.nil? || (weight =~ INTEGER_FORMAT && WEIGHT_RANGE.include?(weight.to_i))

      # nil.to_i is 0, so an omitted weight is forwarded as 0
      @map_group.map(map_name).put(needle, ref.to_i, weight.to_i)
      return
    end

    # DELETE <map> <ref>
    def on_DELETE(map_name, ref)
      raise ProtocolError, 'Invalid reference' unless ref =~ INTEGER_FORMAT && REF_RANGE.include?(ref.to_i)

      @map_group.map(map_name).delete(ref.to_i)
      return
    end

    # FIND <map> <needle> [<limit>]
    def on_FIND(map_name, needle, limit = nil)
      # String#to_i ignores trailing junk ("5abc" => 5), so check the
      # format before converting
      if limit && !(limit =~ INTEGER_FORMAT && LIMIT_RANGE.include?(limit.to_i))
        raise ProtocolError, 'Limit must be a number'
      end

      results = @map_group.map(map_name).find(*[needle, limit && limit.to_i].compact)
      return results.flatten
    end

    # CLEAR <map>
    def on_CLEAR(map_name)
      @map_group.clear(map_name)
      return
    end
  end
end