vinted-blurrily 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,119 @@
1
+ /*
2
+
3
+ storage.h --
4
+
5
+ Trigram map creation, persistence, and querying.
6
+
7
+ */
8
+ #ifndef __STORAGE_H__
9
+ #define __STORAGE_H__
10
+
11
+ #include <inttypes.h>
12
+ #include "tokeniser.h"
13
+ #include "blurrily.h"
14
+
15
+ struct trigram_map_t;
16
+ typedef struct trigram_map_t* trigram_map;
17
+
18
+ struct BR_PACKED_STRUCT trigram_match_t { /* one search result; blurrily_storage_find receives a pointer to these as <results> */
19
+ uint32_t reference; /* presumably the caller's identifier given to blurrily_storage_put -- TODO confirm in storage.c */
20
+ uint32_t matches; /* presumably the number of trigrams shared with the needle -- TODO confirm in storage.c */
21
+ uint32_t weight; /* presumably the weight given to blurrily_storage_put (lower sorts first among equal matches) -- TODO confirm */
22
+ };
23
+ typedef struct trigram_match_t trigram_match_t;
24
+ typedef struct trigram_match_t* trigram_match;
25
+
26
+ typedef struct trigram_stat_t { /* metadata record filled in by blurrily_storage_stats */
27
+ uint32_t references; /* presumably the count of references held in the map -- TODO confirm in storage.c */
28
+ uint32_t trigrams; /* presumably the count of trigram entries held in the map -- TODO confirm in storage.c */
29
+
30
+ } trigram_stat_t;
31
+
32
+
33
+ /*
34
+ Create a new trigram map, resident in memory.
35
+ */
36
+ int blurrily_storage_new(trigram_map* haystack);
37
+
38
+ /*
39
+ Load an existing trigram map from disk.
40
+ */
41
+ int blurrily_storage_load(trigram_map* haystack, const char* path);
42
+
43
+ /*
44
+ Release resources claimed by <blurrily_storage_new> or <blurrily_storage_load>.
45
+ */
46
+ int blurrily_storage_close(trigram_map* haystack);
47
+
48
+ /*
49
+ Mark resources managed by Ruby GC.
50
+ */
51
+ void blurrily_storage_mark(trigram_map haystack);
52
+
53
+
54
+ /*
55
+ Persist to disk what <blurrily_storage_new> or <blurrily_storage_load>
56
+ gave you.
57
+ */
58
+ int blurrily_storage_save(trigram_map haystack, const char* path);
59
+
60
+ /*
61
+ Add a new string to the map. <reference> is your identifier for that
62
+ string, <weight> will be used to discriminate entries that match "as
63
+ well" when searching.
64
+
65
+ If <weight> is zero, it will be replaced by the number of characters in
66
+ the <needle>.
67
+
68
+ Returns positive on success, negative on failure.
69
+ */
70
+ int blurrily_storage_put(trigram_map haystack, const char* needle, uint32_t reference, uint32_t weight);
71
+
72
+ /*
73
+ Check the map for an existing <reference>.
74
+
75
+ Returns < 0 on error, 0 if the reference is not found, the number of trigrams
76
+ for that reference otherwise.
77
+
78
+ If <weight> is not NULL, will be set to the weight value passed to the put
79
+ method on return (if the reference is found).
80
+
81
+ If <trigrams> is not NULL, it should point to an array <nb_trigrams> long,
82
+ and up to <nb_trigrams> will be copied into it matching the <needle>
83
+ originally passed to the put method.
84
+
85
+ Note that this is an O(n) method: the whole map will be read.
86
+ */
87
+ // int blurrily_storage_get(trigram_map haystack, uint32_t reference, uint32_t* weight, int nb_trigrams, trigram_t* trigrams);
88
+
89
+ /*
90
+ Remove a <reference> from the map.
91
+
92
+ Note that this is very inefficient.
93
+
94
+ Returns positive on success, negative on failure.
95
+ */
96
+ int blurrily_storage_delete(trigram_map haystack, uint32_t reference);
97
+
98
+ /*
99
+ Return at most <limit> entries matching <needle> from the <haystack>.
100
+
101
+ Results are written to <results>. The first results are the entries
102
+ sharing the most trigrams with the <needle>. Amongst entries with the same
103
+ number of matches, the lightest ones (lowest <weight>) will be returned
104
+ first.
105
+
106
+ <results> should be allocated by the caller.
107
+
108
+ Returns number of matches on success, negative on failure.
109
+ */
110
+ int blurrily_storage_find(trigram_map haystack, const char* needle, uint16_t limit, trigram_match results);
111
+
112
+ /*
113
+ Copies metadata into <stats>
114
+
115
+ Returns positive on success, negative on failure.
116
+ */
117
+ int blurrily_storage_stats(trigram_map haystack, trigram_stat_t* stats);
118
+
119
+ #endif
@@ -0,0 +1,126 @@
1
+ #include <stdlib.h>
2
+ #include <string.h>
3
+ #include <stdio.h>
4
+ #include <math.h>
5
+ #include "tokeniser.h"
6
+ #include "blurrily.h"
7
+
8
+
9
+ /******************************************************************************/
10
+
11
/*
  Integer power: multiplies <a> by itself <b> times.
  Any <b> that is zero or negative yields 1. No overflow checking is done.
*/
static int ipow(int a, int b)
{
  int product = 1;

  for (int remaining = b; remaining > 0; --remaining) {
    product *= a;
  }

  return product;
}
18
+
19
+ /******************************************************************************/
20
+
21
+ static void string_to_code(const char* input, trigram_t *output)
22
+ {
23
+ trigram_t result = 0;
24
+
25
+ for (int k = 0 ; k < 3; ++k) {
26
+ if (input[k] == '*' || input[k] < 'a' || input[k] > 'z') continue;
27
+ result += ipow(TRIGRAM_BASE, k) * (input[k] - 'a' + 1);
28
+ }
29
+
30
+ *output = result;
31
+ }
32
+
33
+ /******************************************************************************/
34
+
35
+ static void code_to_string(trigram_t input, char* output)
36
+ {
37
+ for (int k = 0 ; k < 3; ++k) {
38
+ uint16_t elem = input / ipow(TRIGRAM_BASE, k) % TRIGRAM_BASE;
39
+ if (elem == 0) {
40
+ output[k] = '*';
41
+ } else {
42
+ output[k] = ('a' + elem - 1);
43
+ }
44
+ }
45
+ output[3] = 0;
46
+ }
47
+
48
+ /******************************************************************************/
49
+
50
+ static int blurrily_compare_trigrams(const void* left_p, const void* right_p)
51
+ {
52
+ trigram_t* left = (trigram_t*)left_p;
53
+ trigram_t* right = (trigram_t*)right_p;
54
+ return (int)*left - (int)*right;
55
+ }
56
+
57
+ /******************************************************************************/
58
+
59
+ int blurrily_tokeniser_parse_string(const char* input, trigram_t* output)
60
+ {
61
+ size_t length = strlen(input);
62
+ char* normalized = (char*) malloc(length+5);
63
+ size_t duplicates = 0;
64
+
65
+ snprintf(normalized, length+4, "**%s*", input);
66
+
67
+ /* replace spaces with '*' */
68
+ for (size_t k = 0; k < length+3; ++k) {
69
+ if (normalized[k] == ' ') normalized[k] = '*';
70
+ }
71
+
72
+ /* compute trigrams */
73
+ for (size_t k = 0; k <= length; ++k) {
74
+ string_to_code(normalized+k, output+k);
75
+ }
76
+
77
+ /* print results */
78
+ LOG("-- normalization\n");
79
+ LOG("%s -> %s\n", input, normalized);
80
+ LOG("-- tokenisation\n");
81
+ for (size_t k = 0; k <= length; ++k) {
82
+ char res[4];
83
+
84
+ code_to_string(output[k], res);
85
+
86
+ LOG("%c%c%c -> %d -> %s\n",
87
+ normalized[k], normalized[k+1], normalized[k+2],
88
+ output[k], res
89
+ );
90
+ }
91
+
92
+ /* sort */
93
+ qsort((void*)output, length+1, sizeof(trigram_t), &blurrily_compare_trigrams);
94
+
95
+ /* remove duplicates */
96
+ for (size_t k = 1; k <= length; ++k) {
97
+ trigram_t* previous = output + k - 1;
98
+ trigram_t* current = output + k;
99
+
100
+ if (*previous == *current) {
101
+ *previous = 32768;
102
+ ++duplicates;
103
+ }
104
+ }
105
+
106
+ /* compact */
107
+ qsort((void*)output, length+1, sizeof(trigram_t), &blurrily_compare_trigrams);
108
+
109
+ /* print again */
110
+ LOG("-- after sort/compact\n");
111
+ for (size_t k = 0; k <= length-duplicates; ++k) {
112
+ char res[4];
113
+ code_to_string(output[k], res);
114
+ LOG("%d -> %s\n", output[k], res);
115
+ }
116
+
117
+ free((void*)normalized);
118
+ return (int) (length + 1 - duplicates);
119
+ }
120
+
121
+ /******************************************************************************/
122
+
123
+ int blurrily_tokeniser_trigram(trigram_t UNUSED(input), char* UNUSED(output)) /* Stub: both arguments are ignored and <output> is never written. */
124
+ {
125
+ return 0; /* NOTE(review): tokeniser.h documents "positive on success" and a 3-character result; this looks unimplemented (code_to_string in this file appears to do the conversion) -- confirm intent */
126
+ }
@@ -0,0 +1,46 @@
1
+ /*
2
+
3
+ tokeniser.h --
4
+
5
+ Split a string into an array of trigrams.
6
+
7
+ The input string should be only lowercase latin letters and spaces
8
+ (convert using iconv).
9
+
10
+ Each trigram is a three-symbol tuple consisting of letters and the
11
+ "epsilon" character used to represent spaces and beginning-of-word/end-of-
12
+ word anchors.
13
+
14
+ Each trigram is represented by a 16-bit integer.
15
+
16
+ */
17
+ #ifndef __TOKENISER_H__
18
+ #define __TOKENISER_H__
19
+
20
+ #include <inttypes.h>
21
+
22
+ #define TRIGRAM_BASE 28
23
+
24
+ typedef uint16_t trigram_t;
25
+
26
+ /*
27
+ Parse the <input> string and store the result in <output>.
28
+ <output> must be allocated by the caller and provide at least as many slots
29
+ as characters in <input>, plus one.
30
+ (not all will necessarily be filled)
31
+
32
+ Returns the number of trigrams on success, a negative number on failure.
33
+ */
34
+ int blurrily_tokeniser_parse_string(const char* input, trigram_t* output);
35
+
36
+
37
+ /*
38
+ Given an <input> returns a string representation of the trigram in <output>.
39
+ <output> must be allocated by caller and will always be exactly 3
40
+ characters plus a terminating NUL.
41
+
42
+ Returns positive on success, negative on failure.
43
+ */
44
+ int blurrily_tokeniser_trigram(trigram_t input, char* output);
45
+
46
+ #endif
data/lib/blurrily.rb ADDED
@@ -0,0 +1 @@
1
+ require 'blurrily/version'
@@ -0,0 +1,136 @@
1
+ # encoding: utf-8
2
+
3
+ require 'socket'
4
+ require 'ipaddr'
5
+ require 'blurrily/defaults'
6
+
7
module Blurrily
  # Client for a Blurrily server. Commands are sent as one tab-separated line
  # over TCP and the single-line reply ("OK...", "ERROR...") is parsed.
  class Client
    # Raised when the server replies with ERROR, disconnects, or sends a line
    # that does not follow the protocol.
    Error = Class.new(RuntimeError)

    # Initialize a new {Blurrily::Client} for a {Blurrily::Server}.
    # The TCP socket is not opened here; it is opened on the first command.
    #
    # @param options [Hash] connection settings:
    #   :host    - IP Address or FQDN of the Blurrily::Server.
    #              Defaults to Blurrily::DEFAULT_HOST.
    #   :port    - Port Blurrily::Server is listening on.
    #              Defaults to Blurrily::DEFAULT_PORT.
    #   :db_name - Name of the data store being targeted.
    #              Defaults to Blurrily::DEFAULT_DATABASE.
    #
    # Examples
    #
    #  ```
    #  Blurrily::Client.new(:host => '127.0.0.1', :port => 12021, :db_name => 'location_en')
    #  # => #<Blurrily::Client:0x007fcd0d33e708 @host="127.0.0.1", @port=12021, @db_name="location_en">
    #  ```
    #
    # @returns the instance of {Blurrily::Client}
    def initialize(options = {})
      @host    = options.fetch(:host,    DEFAULT_HOST)
      @port    = options.fetch(:port,    DEFAULT_PORT)
      @db_name = options.fetch(:db_name, DEFAULT_DATABASE)
    end

    # Find record references based on a given string (needle)
    #
    # @param needle The string you're searching for matches on.
    #               Must be a non-empty String and must not contain tabs.
    #               Required
    # @param limit  Limit the number of results returned
    #               (default: Blurrily::LIMIT_DEFAULT).
    #               Must be within Blurrily::LIMIT_RANGE.
    #               Optional
    #
    # Examples
    #
    #  ```
    #  @client.find('London')
    #  # => [[123,6,3],[124,5,3]...]
    #  ```
    #
    # @returns an Array of matching [`ref`,`score`,`weight`] ordered by score. `ref` is the identifying value of the original record.
    #  Note that unless modified, `weight` is simply the string length.
    # @raise [ArgumentError] if the needle or limit is invalid.
    # @raise [Blurrily::Client::Error] if the server reports a failure.
    def find(needle, limit = nil)
      limit ||= LIMIT_DEFAULT
      check_valid_needle(needle)
      raise(ArgumentError, "LIMIT value must be in #{LIMIT_RANGE}") unless LIMIT_RANGE.include?(limit)

      cmd = ["FIND", @db_name, needle, limit]
      # the reply is a flat list of integers; regroup it into triples
      send_cmd_and_get_results(cmd).map(&:to_i).each_slice(3).to_a
    end

    # Index a given record in the data store this client targets.
    #
    # @param needle The string you wish to index. Must be a non-empty String
    #               without tabs. Required
    # @param ref    The identifying value of the record being indexed.
    #               Must be within Blurrily::REF_RANGE. Required
    # @param weight Weight of this particular reference. Default 0.
    #               Must be within Blurrily::WEIGHT_RANGE.
    #               Don't change unless you know what you're doing. Optional.
    #
    # Examples
    #
    #  ```
    #  @client.put('London', 123, 0)
    #  # => nil
    #  ```
    #
    # @returns nil on success.
    # @raise [ArgumentError] if an argument is invalid.
    # @raise [Blurrily::Client::Error] if the server reports a failure.
    def put(needle, ref, weight = 0)
      check_valid_needle(needle)
      check_valid_ref(ref)
      raise(ArgumentError, "WEIGHT value must be in #{WEIGHT_RANGE}") unless WEIGHT_RANGE.include?(weight)

      cmd = ["PUT", @db_name, needle, ref, weight]
      send_cmd_and_get_results(cmd)
      return
    end

    # Remove a record from the data store this client targets.
    #
    # @param ref The identifying value of the record to remove.
    #            Must be within Blurrily::REF_RANGE. Required
    #
    # @returns nil on success.
    # @raise [ArgumentError] if the ref is invalid.
    # @raise [Blurrily::Client::Error] if the server reports a failure.
    def delete(ref)
      check_valid_ref(ref)
      cmd = ['DELETE', @db_name, ref]
      send_cmd_and_get_results(cmd)
      return
    end

    # Send the CLEAR command for the data store this client targets.
    #
    # @returns nil on success.
    # @raise [Blurrily::Client::Error] if the server reports a failure.
    def clear()
      send_cmd_and_get_results(['CLEAR', @db_name])
      return
    end


    private


    # Not referenced anywhere in this class; kept for anything that may read it.
    PORT_RANGE = 1025..32768

    # Needles travel inside a tab-separated line, so tabs cannot be allowed.
    def check_valid_needle(needle)
      raise(ArgumentError, "bad needle") if !needle.kind_of?(String) || needle.empty? || needle.include?("\t")
    end

    def check_valid_ref(ref)
      raise(ArgumentError, "REF value must be in #{REF_RANGE}") unless REF_RANGE.include?(ref)
    end


    # Lazily opened socket, reused across commands until it is reset.
    def connection
      @connection ||= TCPSocket.new(@host, @port)
    end

    # Drop the cached socket so that the next command opens a fresh one.
    def reset_connection
      @connection.close if @connection
    rescue IOError, SystemCallError
      nil # the socket is already unusable; there is nothing more to release
    ensure
      @connection = nil
    end

    # Send one command line and parse the one-line reply.
    #
    # @param argv Array of command fields; they are joined with tabs.
    # @returns an Array of reply fields (Strings), empty for a bare "OK".
    # @raise [Blurrily::Client::Error] on an ERROR reply, a disconnect, or a
    #        malformed reply.
    def send_cmd_and_get_results(argv)
      output = argv.join("\t")

      begin
        connection.puts output
        input = connection.gets
      rescue IOError, SystemCallError
        # do not keep a broken socket around; let the caller see the failure
        reset_connection
        raise
      end

      case input
      when "OK\n"
        return []
      when /^OK\t(.*)\n/
        return $1.split("\t")
      when /^ERROR\t(.*)\n/
        raise Error, $1
      when nil
        # EOF: forget the dead socket so that a later call can reconnect
        reset_connection
        raise Error, 'Server disconnected'
      else
        raise Error, 'Server did not respect protocol'
      end
    end

  end
end
@@ -0,0 +1,53 @@
1
+ # encoding: utf-8
2
+ require 'blurrily/defaults'
3
+
4
module Blurrily
  # Turns one line of the tab-separated protocol into a call on a map group
  # and renders the reply line ("OK..." or "ERROR...").
  class CommandProcessor
    ProtocolError = Class.new(StandardError)

    # Commands accepted on the wire; each maps to a private on_<COMMAND> method.
    COMMANDS = %w(FIND PUT DELETE CLEAR).freeze

    # Whole-string patterns. \A and \z are used instead of ^ and $ because the
    # latter match at every line boundary, which lets values with an embedded
    # newline (e.g. "ok\n../x") slip through.
    MAP_NAME_PATTERN = /\A[a-z_]+\z/
    DIGITS_PATTERN   = /\A\d+\z/

    # @param map_group object answering `map(name)` and `clear(name)`.
    def initialize(map_group)
      @map_group = map_group
    end

    # Execute a single protocol line.
    #
    # @param line [String] "COMMAND\tdb_name[\targ...]"; one trailing line
    #   terminator, if present, is ignored.
    # @returns [String] "OK" followed by tab-separated results, or
    #   "ERROR\t<message>" for protocol and argument errors.
    def process_command(line)
      command, map_name, *args = line.chomp.split(/\t/)
      raise ProtocolError, 'Unknown command' unless COMMANDS.include? command
      raise ProtocolError, 'Invalid database name' unless map_name =~ MAP_NAME_PATTERN
      # safe dynamic dispatch: command was checked against COMMANDS above
      result = send("on_#{command}", map_name, *args)
      ['OK', *result].compact.join("\t")
    rescue ArgumentError, ProtocolError => e
      # ArgumentError also covers a wrong number of fields for the command
      ['ERROR', e.message].join("\t")
    end

    private

    # PUT db needle ref [weight] -- a missing weight is stored as 0.
    def on_PUT(map_name, needle, ref, weight = nil)
      raise ProtocolError, 'Invalid reference' unless valid_number?(ref, REF_RANGE)
      raise ProtocolError, 'Invalid weight' unless weight.nil? || valid_number?(weight, WEIGHT_RANGE)

      @map_group.map(map_name).put(needle, ref.to_i, weight.to_i)
      return
    end

    # DELETE db ref
    def on_DELETE(map_name, ref)
      raise ProtocolError, 'Invalid reference' unless valid_number?(ref, REF_RANGE)

      @map_group.map(map_name).delete(ref.to_i)
      return
    end

    # FIND db needle [limit] -- without a limit the map's own default applies.
    def on_FIND(map_name, needle, limit = nil)
      raise ProtocolError, 'Limit must be a number' if limit && !valid_number?(limit, LIMIT_RANGE)

      results = @map_group.map(map_name).find(*[needle, limit && limit.to_i].compact)
      return results.flatten
    end

    # CLEAR db
    def on_CLEAR(map_name)
      @map_group.clear(map_name)
      return
    end

    # True when +text+ consists solely of decimal digits and its value lies in +range+.
    def valid_number?(text, range)
      !!(text =~ DIGITS_PATTERN) && range.include?(text.to_i)
    end
  end
end