hmsearch-postgres 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 6819501caf23acccdaa5b8fd3f06087395c29322
4
+ data.tar.gz: 9d065fdd3196ba0404529f744d84a80f361c8124
5
+ SHA512:
6
+ metadata.gz: bd3b9373805b316ddf0269fafe616cb22a2b70b40c39ad60c8d42783f2a6a08818ecd50ebf69a3e5f143d820f84272eb8322edd5c9771049f0509d3a8adcb51c
7
+ data.tar.gz: f1864f94b7069b97b646c84d4b32b04029951648dd64ee32497a1554178861d9ab687d9cc3a39fb12ad07d921539ea00e12e4b733c20bb676ad72684b4f81e1b
data/.gitignore ADDED
@@ -0,0 +1,3 @@
1
+ Gemfile.lock
2
+ tmp
3
+ lib/hmsearch/postgres_ext.*
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source "https://rubygems.org"
2
+
3
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2014 Mine
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
File without changes
data/Rakefile ADDED
@@ -0,0 +1,27 @@
1
require 'rake/clean'
require 'rbconfig'

# Scratch directory in which extconf.rb writes its Makefile and `make` leaves
# the object files and the compiled extension.
BUILD_DIR = 'tmp'

# The Makefile is generated inside BUILD_DIR (extconf.rb is run from there),
# so the build directory itself has to be cleaned as well.
CLEAN.include('lib/hmsearch/Makefile', BUILD_DIR)

CLOBBER.include('lib/hmsearch/postgres_ext.*')

task default: :test

desc 'build native extension'
task :build do
  mkdir_p BUILD_DIR
  chdir BUILD_DIR do
    # Rake's `ruby` helper runs the interpreter that is executing rake, so the
    # extension is configured for the same Ruby that will later load it.
    ruby '../ext/hmsearch/extconf.rb'
    sh 'make'
  end

  cp "#{BUILD_DIR}/postgres_ext.#{RbConfig::CONFIG['DLEXT']}", 'lib/hmsearch/'
end

desc 'run tests'
task test: :build do
  # The tests require 'hmsearch/postgres', which lives under lib/.
  lib_dir = File.expand_path('../lib', __FILE__)
  $LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)

  FileList['./test/*_test.rb'].each { |test_file| load test_file }
end
@@ -0,0 +1,6 @@
1
require 'mkmf'
require 'rbconfig'

# Link the extension against the math library and libpqxx.
$LDFLAGS = "#{$LDFLAGS} -lm -lpqxx"

create_makefile('hmsearch/postgres_ext')
@@ -0,0 +1,613 @@
1
+ /* HmSearch hash lookup library
2
+ *
3
+ * http://commonsmachinery.se/
4
+ * Distributed under an MIT license
5
+ *
6
+ * Copyright (c) 2014 Commons Machinery
7
+ *
8
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
9
+ * of this software and associated documentation files (the "Software"), to deal
10
+ * in the Software without restriction, including without limitation the rights
11
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
+ * copies of the Software, and to permit persons to whom the Software is
13
+ * furnished to do so, subject to the following conditions:
14
+ *
15
+ * The above copyright notice and this permission notice shall be included in all
16
+ * copies or substantial portions of the Software.
17
+ *
18
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24
+ * SOFTWARE.
25
+ */
26
+
27
+ #include <math.h>
28
+ #include <stdlib.h>
29
+ #include <stdio.h>
30
+ #include <string.h>
31
+
32
+ #include <memory>
33
+ #include <algorithm>
34
+ #include <map>
35
+ #include <iostream>
36
+
37
+ #include <pqxx/pqxx>
38
+
39
+ #include "hmsearch.h"
40
+
41
+ /** The actual implementation of the HmSearch database.
42
+ *
43
+ * A difference between this implementation and the HmSearch algorithm
44
+ * in the paper is that only exact-matches are stored in the database,
45
+ * not the 1-matches. The 1-var partitions are instead generated
46
+ * during lookup. This drastically reduces database size, which
47
+ * speeds up insertion and probably lookups too.
48
+ *
49
+ * The database contains some setting records controlling the
50
+ * operation:
51
+ *
52
+ * _hb: hash bits
53
+ * _me: max errors
54
+ * _se: global sequence when adding new keys
55
+ *
56
+ * These can't be changed once the database has been initialised.
57
+ *
58
+ * Each partition is stored as a key on the following format:
59
+ * Byte 0: 'P'
60
+ * Byte 1: Partition number (thus limiting to max error 518)
61
+ * Bytes 2-N: Partition bits.
62
+ * Bytes N-M: Sequence number (globally incremented with each addition)
63
+ */
64
+ class HmSearchImpl : public HmSearch
65
+ {
66
+ public:
67
+ HmSearchImpl(std::string connstr, int hash_bits, int max_error)
68
+ : _hash_bits(hash_bits)
69
+ , _max_error(max_error)
70
+ , _hash_bytes((hash_bits + 7) / 8)
71
+ , _partitions((max_error + 3) / 2)
72
+ , _partition_bits(ceil((double)hash_bits / _partitions))
73
+ , _partition_bytes((_partition_bits + 7) / 8 + 1)
74
+ {
75
+ _db = new pqxx::connection(connstr);
76
+ }
77
+
78
+ ~HmSearchImpl() {
79
+ close();
80
+ }
81
+
82
+ bool insert(const hash_string& hash,
83
+ std::string* error_msg = NULL);
84
+
85
+ bool print_copystring(const hash_string& hash,
86
+ std::string* error_msg = NULL);
87
+
88
+ bool lookup(const hash_string& query,
89
+ LookupResultList& result,
90
+ int max_error = -1,
91
+ std::string* error_msg = NULL);
92
+
93
+ bool close(std::string* error_msg = NULL);
94
+
95
+ private:
96
+ struct Candidate {
97
+ Candidate() : matches(0), first_match(0), second_match(0) {}
98
+ int matches;
99
+ int first_match;
100
+ int second_match;
101
+ };
102
+
103
+ typedef std::map<hash_string, Candidate> CandidateMap;
104
+
105
+ void get_candidates(const hash_string& query, CandidateMap& candidates);
106
+ void add_hash_candidates(CandidateMap& candidates, int match,
107
+ const uint8_t* hashes, size_t length);
108
+ hash_string get_multiple_keys(uint8_t *key, int partition);
109
+ bool valid_candidate(const Candidate& candidate);
110
+ int hamming_distance(const hash_string& query, const hash_string& hash);
111
+
112
+ int get_partition_key(const hash_string& hash, int partition, uint8_t *key);
113
+
114
+ pqxx::connection *_db;
115
+ int _hash_bits;
116
+ int _max_error;
117
+ int _hash_bytes;
118
+ int _partitions;
119
+ int _partition_bits;
120
+ int _partition_bytes;
121
+
122
+ static int one_bits[256];
123
+ };
124
+
125
+
126
+ bool HmSearch::init(const std::string& path,
127
+ unsigned hash_bits, unsigned max_error,
128
+ uint64_t num_hashes,
129
+ std::string* error_msg)
130
+ {
131
+ std::string dummy;
132
+ if (!error_msg) {
133
+ error_msg = &dummy;
134
+ }
135
+ *error_msg = "";
136
+
137
+ if (hash_bits == 0 || (hash_bits & 7)) {
138
+ *error_msg = "invalid hash_bits value";
139
+ return false;
140
+ }
141
+
142
+ if (max_error == 0 || max_error >= hash_bits || max_error > 518) {
143
+ *error_msg = "invalid max_error value";
144
+ return false;
145
+ }
146
+
147
+ pqxx::connection db(path);
148
+ if (!db.is_open()) {
149
+ *error_msg = "Can't open database";
150
+ return false;
151
+ }
152
+
153
+ std::string sql;
154
+ sql = "INSERT INTO config VALUES ($1, $2)";
155
+ db.prepare("hash_max", sql);
156
+
157
+ pqxx::work W(db);
158
+ sql = "CREATE TABLE IF NOT EXISTS config ("\
159
+ " hash_bits int,"\
160
+ " max_error int); TRUNCATE config";
161
+ W.exec(sql);
162
+
163
+ W.prepared("hash_max")(hash_bits)(max_error).exec();
164
+
165
+ for (unsigned int i = 0; i < ((max_error + 3) / 2); i++) {
166
+ {
167
+ std::stringstream s;
168
+ s << "CREATE TABLE IF NOT EXISTS partition" << i << " ("
169
+ << " hash bytea,"
170
+ << " key bytea); TRUNCATE partition" << i;
171
+ W.exec(s.str());
172
+ }
173
+
174
+ {
175
+ std::stringstream s;
176
+ s << "DROP INDEX IF EXISTS ix_key_" << i;
177
+ W.exec(s.str());
178
+ }
179
+
180
+ {
181
+ std::stringstream s;
182
+ s << "CREATE INDEX ix_key_" << i << " ON partition" << i << "(key)";
183
+ W.exec(s.str());
184
+ }
185
+ }
186
+
187
+ W.commit();
188
+
189
+ db.disconnect();
190
+
191
+ return true;
192
+ }
193
+
194
+
195
+ HmSearch* HmSearch::open(const std::string& path,
196
+ std::string* error_msg)
197
+ {
198
+ std::string dummy;
199
+
200
+ if (!error_msg) {
201
+ error_msg = &dummy;
202
+ }
203
+ *error_msg = "";
204
+
205
+ try {
206
+ pqxx::connection db(path);
207
+
208
+ std::string sql;
209
+
210
+ sql = "SELECT max_error, hash_bits FROM config";
211
+ pqxx::nontransaction n(db);
212
+ pqxx::result res(n.exec(sql));
213
+
214
+ pqxx::result::const_iterator c = res.begin(); // We retrieve just one row
215
+
216
+ unsigned long hash_bits, max_error;
217
+ max_error = c[0].as<long>();
218
+ hash_bits = c[1].as<long>();
219
+
220
+ db.disconnect();
221
+
222
+ HmSearch* hm = new HmSearchImpl(path, hash_bits, max_error);
223
+ if (!hm) {
224
+ *error_msg = "out of memory";
225
+ return NULL;
226
+ }
227
+
228
+ return hm;
229
+
230
+ }
231
+ catch (const pqxx::broken_connection& e) {
232
+ *error_msg = e.what();
233
+ return NULL;
234
+ }
235
+ }
236
+
237
+
238
+ HmSearch::hash_string HmSearch::parse_hexhash(const std::string& hexhash)
239
+ {
240
+ int len = hexhash.length() / 2;
241
+ uint8_t hash[len];
242
+
243
+ for (int i = 0; i < len; i++) {
244
+ char buf[3];
245
+ char* err;
246
+
247
+ buf[0] = hexhash[i * 2];
248
+ buf[1] = hexhash[i * 2 + 1];
249
+ buf[2] = 0;
250
+
251
+ hash[i] = strtoul(buf, &err, 16);
252
+
253
+ if (*err != '\0') {
254
+ return hash_string();
255
+ }
256
+ }
257
+
258
+ return hash_string(hash, len);
259
+ }
260
+
261
+ std::string HmSearch::format_hexhash(const HmSearch::hash_string& hash)
262
+ {
263
+ char hex[hash.length() * 2 + 1];
264
+
265
+ for (size_t i = 0; i < hash.length(); i++) {
266
+ sprintf(hex + 2 * i, "%02x", hash[i]);
267
+ }
268
+
269
+ return hex;
270
+ }
271
+
272
+
273
+ bool HmSearchImpl::print_copystring(const hash_string& hash,
274
+ std::string* error_msg)
275
+ {
276
+ std::string dummy;
277
+ if (!error_msg) {
278
+ error_msg = &dummy;
279
+ }
280
+ *error_msg = "";
281
+
282
+ if (hash.length() != (size_t) _hash_bytes) {
283
+ *error_msg = "incorrect hash length";
284
+ return false;
285
+ }
286
+
287
+ for (int i = 0; i < _partitions; i++) {
288
+ uint8_t key[_partition_bytes];
289
+
290
+ get_partition_key(hash, i, key);
291
+
292
+ std::cout << "\\\\x" << format_hexhash(hash)
293
+ << " "
294
+ << int(i)
295
+ << " "
296
+ << "\\\\x" << format_hexhash(hash_string(key, _partition_bytes))
297
+ << std::endl;
298
+
299
+ }
300
+
301
+ return true;
302
+ }
303
+
304
+ bool HmSearchImpl::insert(const hash_string& hash,
305
+ std::string* error_msg)
306
+ {
307
+ std::string dummy;
308
+ if (!error_msg) {
309
+ error_msg = &dummy;
310
+ }
311
+ *error_msg = "";
312
+
313
+ if (hash.length() != (size_t) _hash_bytes) {
314
+ *error_msg = "incorrect hash length";
315
+ return false;
316
+ }
317
+
318
+ if (!_db->is_open()) {
319
+ *error_msg = "database is closed";
320
+ return false;
321
+ }
322
+
323
+ for (int i = 0; i < _partitions; i++) {
324
+ std::stringstream s;
325
+ s << "INSERT INTO partition" << i
326
+ << " VALUES ($1, $2)";
327
+ _db->prepare("insert_"+i, s.str());
328
+ }
329
+ pqxx::work W(*_db);
330
+ for (int i = 0; i < _partitions; i++) {
331
+ uint8_t key[_partition_bytes];
332
+
333
+ get_partition_key(hash, i, key);
334
+
335
+ pqxx::binarystring key_blob(key, _partition_bytes);
336
+ pqxx::binarystring hash_blob(hash.data(), hash.length());
337
+
338
+ W.prepared("insert_"+i)(hash_blob)(key_blob).exec();
339
+ }
340
+
341
+ W.commit();
342
+
343
+ return true;
344
+ }
345
+
346
+
347
+ bool HmSearchImpl::lookup(const hash_string& query,
348
+ LookupResultList& result,
349
+ int reduced_error,
350
+ std::string* error_msg)
351
+ {
352
+ std::string dummy;
353
+ if (!error_msg) {
354
+ error_msg = &dummy;
355
+ }
356
+ *error_msg = "";
357
+
358
+ if (query.length() != (size_t) _hash_bytes) {
359
+ *error_msg = "incorrect hash length";
360
+ return false;
361
+ }
362
+
363
+ if (!_db->is_open()) {
364
+ *error_msg = "database is closed";
365
+ return false;
366
+ }
367
+
368
+ try {
369
+ CandidateMap candidates;
370
+ get_candidates(query, candidates);
371
+
372
+ for (CandidateMap::const_iterator i = candidates.begin(); i != candidates.end(); ++i) {
373
+ if (valid_candidate(i->second)) {
374
+ int distance = hamming_distance(query, i->first);
375
+
376
+ if (distance <= _max_error
377
+ && (reduced_error < 0 || distance <= reduced_error)) {
378
+ result.push_back(LookupResult(i->first, distance));
379
+ }
380
+ }
381
+ }
382
+ }
383
+ catch (const pqxx::pqxx_exception &e) {
384
+ *error_msg = e.base().what();
385
+ return false;
386
+ }
387
+
388
+ return true;
389
+ }
390
+
391
+
392
+ bool HmSearchImpl::close(std::string* error_msg)
393
+ {
394
+ std::string dummy;
395
+ if (!error_msg) {
396
+ error_msg = &dummy;
397
+ }
398
+ *error_msg = "";
399
+
400
+ if (!_db->is_open()) {
401
+ // Already closed
402
+ return true;
403
+ }
404
+
405
+ _db->disconnect();
406
+
407
+ return true;
408
+ }
409
+
410
+
411
+ HmSearchImpl::hash_string HmSearchImpl::get_multiple_keys(
412
+ uint8_t *key,
413
+ int partition)
414
+ {
415
+ hash_string hashes;
416
+
417
+ pqxx::nontransaction n(*_db);
418
+ pqxx::binarystring key_blob(key, _partition_bytes);
419
+ pqxx::result res = n.prepared("select_"+partition)(key_blob).exec();
420
+
421
+ for (pqxx::result::const_iterator c = res.begin(); c != res.end(); ++c) {
422
+ pqxx::binarystring hash_result(c[0]);
423
+ hashes.append(hash_string(hash_result.data(), hash_result.size()));
424
+ }
425
+
426
+ return hashes;
427
+ }
428
+
429
+ void HmSearchImpl::get_candidates(
430
+ const HmSearchImpl::hash_string& query,
431
+ HmSearchImpl::CandidateMap& candidates)
432
+ {
433
+ uint8_t key[_partition_bytes];
434
+
435
+ for (int i = 0; i < _partitions; i++) {
436
+ int psize = _hash_bits - i * _partition_bits;
437
+ if (psize > _partition_bits) {
438
+ psize = _partition_bits;
439
+ }
440
+ std::stringstream single;
441
+ single << "SELECT hash FROM partition" << i
442
+ << " WHERE key=$1";
443
+
444
+ std::stringstream s;
445
+ s << "SELECT hash FROM partition"
446
+ << i
447
+ << " INNER JOIN (SELECT $1::bytea AS key";
448
+ for (int j = 2; j <= psize; j++) {
449
+ s << " UNION ALL SELECT "
450
+ << "$"
451
+ << j;
452
+ }
453
+ s << ") AS x ON partition"
454
+ << i
455
+ << ".key = x.key";
456
+ std::string sql;
457
+ sql.append(s.str());
458
+ _db->prepare("select_multiple_"+i, sql);
459
+
460
+ sql.clear();
461
+ sql.append(single.str());
462
+ _db->prepare("select_"+i, sql);
463
+ }
464
+ for (int i = 0; i < _partitions; i++) {
465
+ hash_string hashes;
466
+
467
+ int bits = get_partition_key(query, i, key);
468
+
469
+ // Get exact matches
470
+
471
+ hashes = get_multiple_keys(key, i);
472
+
473
+ if (hashes.length() > 0) {
474
+ add_hash_candidates(candidates, 0, (const uint8_t*)hashes.data(), hashes.length());
475
+ }
476
+
477
+ // Get 1-variant matches
478
+ pqxx::nontransaction n(*_db);
479
+ pqxx::prepare::invocation prep = n.prepared("select_multiple_"+i);
480
+ int pbyte = (i * _partition_bits) / 8;
481
+ int count = 0;
482
+ for (int pbit = i * _partition_bits; bits > 0; pbit++, bits--, count++) {
483
+ uint8_t flip = 1 << (7 - (pbit % 8));
484
+ key[pbit / 8 - pbyte] ^= flip;
485
+ pqxx::binarystring key_blob(key, _partition_bytes);
486
+ prep(key_blob);
487
+ key[pbit / 8 - pbyte] ^= flip;
488
+ }
489
+ pqxx::result res = prep.exec();
490
+
491
+ hashes.clear();
492
+ for (pqxx::result::const_iterator c = res.begin(); c != res.end(); ++c) {
493
+ pqxx::binarystring hash_result(c[0]);
494
+ hashes.append(hash_string(hash_result.data(), hash_result.size()));
495
+ }
496
+ add_hash_candidates(candidates, 1, (const uint8_t*)hashes.data(), hashes.length());
497
+ }
498
+ }
499
+
500
+
501
+ void HmSearchImpl::add_hash_candidates(
502
+ HmSearchImpl::CandidateMap& candidates, int match,
503
+ const uint8_t* hashes, size_t length)
504
+ {
505
+ for (size_t n = 0; n < length; n += _hash_bytes) {
506
+ hash_string hash = hash_string(hashes + n, _hash_bytes);
507
+ Candidate& cand = candidates[hash];
508
+
509
+ ++cand.matches;
510
+ if (cand.matches == 1) {
511
+ cand.first_match = match;
512
+ }
513
+ else if (cand.matches == 2) {
514
+ cand.second_match = match;
515
+ }
516
+ }
517
+ }
518
+
519
+
520
+ bool HmSearchImpl::valid_candidate(
521
+ const HmSearchImpl::Candidate& candidate)
522
+ {
523
+ if (_max_error & 1) {
524
+ // Odd k
525
+ if (candidate.matches < 3) {
526
+ if (candidate.matches == 1 || (candidate.first_match && candidate.second_match)) {
527
+ return false;
528
+ }
529
+ }
530
+ }
531
+ else {
532
+ // Even k
533
+ if (candidate.matches < 2) {
534
+ if (candidate.first_match)
535
+ return false;
536
+ }
537
+ }
538
+
539
+ return true;
540
+ }
541
+
542
+
543
+ int HmSearchImpl::hamming_distance(
544
+ const HmSearchImpl::hash_string& query,
545
+ const HmSearchImpl::hash_string& hash)
546
+ {
547
+ int distance = 0;
548
+
549
+ for (size_t i = 0; i < query.length(); i++) {
550
+ distance += one_bits[query[i] ^ hash[i]];
551
+ }
552
+
553
+ return distance;
554
+ }
555
+
556
+
557
+ int HmSearchImpl::get_partition_key(const hash_string& hash, int partition, uint8_t *key)
558
+ {
559
+ int psize, hash_bit, bits_left;
560
+
561
+ psize = _hash_bits - partition * _partition_bits;
562
+ if (psize > _partition_bits) {
563
+ psize = _partition_bits;
564
+ }
565
+
566
+ // Copy bytes, masking out some bits at the start and end
567
+ bits_left = psize;
568
+ hash_bit = partition * _partition_bits;
569
+
570
+ for (int i = 0; i < _partition_bytes; i++) {
571
+ int byte = hash_bit / 8;
572
+ int bit = hash_bit % 8;
573
+ int bits = 8 - bit;
574
+
575
+ if (bits > bits_left) {
576
+ bits = bits_left;
577
+ }
578
+
579
+ bits_left -= bits;
580
+ hash_bit += bits;
581
+
582
+ key[i] = hash[byte] & (((1 << bits) - 1) << (8 - bit - bits));
583
+ }
584
+
585
+ return psize;
586
+ }
587
+
588
+
589
+ int HmSearchImpl::one_bits[256] = {
590
+ 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
591
+ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
592
+ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
593
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
594
+ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
595
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
596
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
597
+ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
598
+ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
599
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
600
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
601
+ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
602
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
603
+ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
604
+ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
605
+ 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
606
+ };
607
+
608
+ /*
609
+ Local Variables:
610
+ c-file-style: "stroustrup"
611
+ indent-tabs-mode:nil
612
+ End:
613
+ */
@@ -0,0 +1,204 @@
1
+ /* HmSearch hash lookup library
2
+ *
3
+ * http://commonsmachinery.se/
4
+ * Distributed under an MIT license
5
+ *
6
+ * Copyright (c) 2014 Commons Machinery
7
+ *
8
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
9
+ * of this software and associated documentation files (the "Software"), to deal
10
+ * in the Software without restriction, including without limitation the rights
11
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
+ * copies of the Software, and to permit persons to whom the Software is
13
+ * furnished to do so, subject to the following conditions:
14
+ *
15
+ * The above copyright notice and this permission notice shall be included in all
16
+ * copies or substantial portions of the Software.
17
+ *
18
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24
+ * SOFTWARE.
25
+ */
26
+
27
+ #ifndef __HMSEARCH_H_INCLUDED__
28
+ #define __HMSEARCH_H_INCLUDED__
29
+
30
+ #include <string>
31
+ #include <list>
32
+ #include <stdint.h>
33
+
34
+ /** Interface to a HmSearch database.
35
+ *
36
+ * It cannot be instantiated directly, instead open() must be used to
37
+ * get a pointer to a database object.
38
+ *
39
+ * The database is closed when the object is deleted. Until it is closed,
40
+ * any added hashes may not yet have been written to disk.
41
+ *
42
+ * NOTE(review): insert() and lookup() share a single database connection
43
+ * per object; confirm thread-safety before calling them concurrently.
44
+ *
45
+ * HmSearch::close() is not thread-safe, so the caller must ensure
46
+ * that no inserts or lookups are in progress.
47
+ *
48
+ * NOTE(review): the single-process restriction came from the LevelDB
49
+ * backend; this implementation connects to PostgreSQL through libpqxx.
50
+ */
51
+ class HmSearch
52
+ {
53
+ public:
54
+ /** A string for passing around raw (i.e. not hexadecimal) hashes.
55
+ */
56
+ typedef std::basic_string<uint8_t> hash_string;
57
+
58
+ /** A record holding a hash found by lookup() and its hamming distance
59
+ * from the query hash.
60
+ */
61
+ struct LookupResult {
62
+ LookupResult(const hash_string& h, int d) : hash(h), distance(d) {}
63
+ hash_string hash;
64
+ int distance;
65
+ };
66
+
67
+ typedef std::list<LookupResult> LookupResultList;
68
+
69
+ /** Initialise a new hash database file.
70
+ *
71
+ * The database file should not exist, or if it does it must not
72
+ * contain any records.
73
+ *
74
+ * Parameters:
75
+ *
76
+ * - path: connection string passed to pqxx::connection
77
+ *
78
+ * - hash_bits: number of bits in the hash (must be a multiple of 8)
79
+ *
80
+ * - max_error: maximum hamming distance, must be less than hash_bits
81
+ *
82
+ * - num_hashes: target number of hashes, used for tuning the database
83
+ *
84
+ * - error_msg: if provided, will be set to a string describing any
85
+ * error, or to an empty string if no error occurred.
86
+ *
87
+ * Returns true if the database could be initialised, false on errors.
88
+ */
89
+ static bool init(const std::string& path,
90
+ unsigned hash_bits, unsigned max_error,
91
+ uint64_t num_hashes,
92
+ std::string* error_msg = NULL);
93
+
94
+ /** Open a database file.
95
+ *
96
+ * The returned object must be deleted when not used any longer to
97
+ * ensure that the database is synced and closed.
98
+ *
99
+ * Parameters:
100
+ *
101
+ * - path: connection string passed to pqxx::connection
102
+ *
103
+ * - mode: not a parameter of this function (stale entry)
104
+ *
105
+ * - error_msg: if provided, will be set to a string describing any
106
+ * error, or to an empty string if no error occurred.
107
+ *
108
+ * Returns the new object on success, or NULL on error.
109
+ */
110
+ static HmSearch* open(const std::string& path,
111
+ std::string* error_msg = NULL);
112
+
113
+
114
+ /** Parse a hash in hexadecimal format, returning
115
+ * a string of raw bytes.
116
+ */
117
+ static hash_string parse_hexhash(const std::string& hexhash);
118
+
119
+ /** Format a hash of raw bytes into a hexadecimal string.
120
+ */
121
+ static std::string format_hexhash(const hash_string& hash);
122
+
123
+
124
+ /** Prints a string suitable for PostgreSQL COPY onto stdout.
125
+ *
126
+ * No check is made if the hash already exists in the database,
127
+ * so this may result in duplicate records.
128
+ *
129
+ * Parameters:
130
+ * - hash: The hash to print, as raw bytes
131
+ * - error_msg: if provided, will be set to a string describing any
132
+ * error, or to an empty string if no error occurred.
133
+ *
134
+ * Returns true if the command succeeded, false on any error.
135
+ */
136
+ virtual bool print_copystring(const hash_string& hash,
137
+ std::string* error_msg = NULL) = 0;
138
+
139
+ /** Insert a hash into the database.
140
+ *
141
+ * No check is made if the hash already exists in the database,
142
+ * so this may result in duplicate records.
143
+ *
144
+ * Parameters:
145
+ * - hash: The hash to insert, as raw bytes
146
+ * - error_msg: if provided, will be set to a string describing any
147
+ * error, or to an empty string if no error occurred.
148
+ *
149
+ * Returns true if the insert succeeded, false on any error.
150
+ */
151
+ virtual bool insert(const hash_string& hash,
152
+ std::string* error_msg = NULL) = 0;
153
+
154
+ /** Lookup a hash in the database, returning a list of matches.
155
+ *
156
+ * Parameters:
157
+ *
158
+ * - query: query hash string
159
+ *
160
+ * - result: matches are added to this list (which is not emptied)
161
+ *
162
+ * - max_error: if >= 0, reduce the maximum accepted error
163
+ * from the database default
164
+ *
165
+ * - error_msg: if provided, will be set to a string describing any
166
+ * error, or to an empty string if no error occurred.
167
+ *
168
+ * Returns true if the lookup could be performed (even if no
169
+ * hashes were found), false if an error occurred.
170
+ */
171
+ virtual bool lookup(const hash_string& query,
172
+ LookupResultList& result,
173
+ int max_error = -1,
174
+ std::string* error_msg = NULL) = 0;
175
+
176
+ /** Explicitly sync and close the database file.
177
+ *
178
+ * Parameter:
179
+ * - error_msg: if provided, will be set to a string describing any
180
+ * error, or to an empty string if no error occurred.
181
+ *
182
+ * Returns true if all went well, false on errors.
183
+ */
184
+ virtual bool close(std::string* error_msg = NULL) = 0;
185
+
186
+ /** Delete the database object, syncing and closing the database
187
+ * file if not already done.
188
+ */
189
+ virtual ~HmSearch() {}
190
+
191
+ protected:
192
+ HmSearch() {}
193
+ };
194
+
195
+
196
+ /*
197
+ Local Variables:
198
+ c-file-style: "stroustrup"
199
+ indent-tabs-mode:nil
200
+ End:
201
+ */
202
+
203
+ #endif // __HMSEARCH_H_INCLUDED__
204
+
@@ -0,0 +1,127 @@
1
#include "ruby.h"
#include "hmsearch.h"

#include <exception>
#include <string>
3
+
4
+ extern "C" void Init_postgres_ext();
5
+
6
+ static VALUE eHmSearchError;
7
+ static ID id_hash;
8
+ static ID id_distance;
9
+
10
+ static HmSearch *HmSearch_ptr(VALUE obj);
11
+ static void HmSearch_free(void *p);
12
+ static VALUE HmSearch_initdb(VALUE klass, VALUE _path, VALUE _hash_bits, VALUE _max_error, VALUE _num_hashes);
13
+ static VALUE HmSearch_open(VALUE klass, VALUE _path);
14
+ static VALUE HmSearch_close(VALUE self);
15
+ static VALUE HmSearch_insert(VALUE self, VALUE _hash);
16
+ static VALUE HmSearch_lookup(int argc, VALUE *argv, VALUE self);
17
+
18
+ static HmSearch *HmSearch_ptr(VALUE obj)
19
+ {
20
+ HmSearch *p;
21
+ Data_Get_Struct(obj, HmSearch, p);
22
+ return p;
23
+ }
24
+
25
+ static void HmSearch_free(void *p)
26
+ {
27
+ static_cast<HmSearch*>(p)->~HmSearch();
28
+ }
29
+
30
+ static VALUE HmSearch_initdb(VALUE klass, VALUE _path, VALUE _hash_bits, VALUE _max_error, VALUE _num_hashes)
31
+ {
32
+ const char *path = StringValueCStr(_path);
33
+ unsigned hash_bits = FIX2UINT(_hash_bits);
34
+ unsigned max_error = FIX2UINT(_max_error);
35
+ uint64_t num_hashes = NUM2ULONG(_num_hashes);
36
+
37
+ std::string error_msg;
38
+ if (!HmSearch::init(path, hash_bits, max_error, num_hashes, &error_msg)) {
39
+ rb_raise(eHmSearchError, "%s", error_msg.c_str());
40
+ }
41
+
42
+ return Qnil;
43
+ }
44
+
45
+ static VALUE HmSearch_open(VALUE klass, VALUE _path)
46
+ {
47
+ const char *path = StringValueCStr(_path);
48
+
49
+ std::string error_msg;
50
+ HmSearch* p = HmSearch::open(path, &error_msg);
51
+ if (!p) {
52
+ rb_raise(eHmSearchError, "%s", error_msg.c_str());
53
+ }
54
+
55
+ VALUE obj = Data_Wrap_Struct(klass, NULL, HmSearch_free, p);
56
+
57
+ if (rb_block_given_p()) {
58
+ return rb_ensure(RUBY_METHOD_FUNC(rb_yield), obj, RUBY_METHOD_FUNC(HmSearch_close), obj);
59
+ }
60
+
61
+ return obj;
62
+ }
63
+
64
+ static VALUE HmSearch_close(VALUE self)
65
+ {
66
+ HmSearch_ptr(self)->close();
67
+ return Qnil;
68
+ }
69
+
70
+ static VALUE HmSearch_insert(VALUE self, VALUE _hash)
71
+ {
72
+ HmSearch_ptr(self)->insert(HmSearch::parse_hexhash(StringValueCStr(_hash)));
73
+ return Qnil;
74
+ }
75
+
76
+ static VALUE HmSearch_lookup(int argc, VALUE *argv, VALUE self)
77
+ {
78
+ VALUE _hash, _reduced_error;
79
+ rb_scan_args(argc, argv, "11", &_hash, &_reduced_error);
80
+
81
+ const char *hash = StringValueCStr(_hash);
82
+
83
+ int reduced_error = NIL_P(_reduced_error) ? -1 : FIX2INT(_reduced_error);
84
+
85
+ HmSearch::LookupResultList matches;
86
+ std::string error_msg;
87
+ if (!HmSearch_ptr(self)->lookup(HmSearch::parse_hexhash(hash), matches, reduced_error, &error_msg)) {
88
+ rb_raise(eHmSearchError, "%s", error_msg.c_str());
89
+ }
90
+
91
+ VALUE results = rb_ary_new2(matches.size());
92
+
93
+ for (HmSearch::LookupResultList::const_iterator it=matches.begin(); it != matches.end(); ++it) {
94
+ VALUE result = rb_hash_new();
95
+
96
+ rb_hash_aset(result, ID2SYM(id_hash), rb_str_new2(HmSearch::format_hexhash(it->hash).c_str()));
97
+ rb_hash_aset(result, ID2SYM(id_distance), INT2NUM(it->distance));
98
+
99
+ rb_ary_push(results, result);
100
+ }
101
+
102
+ return results;
103
+ }
104
+
105
+ void Init_postgres_ext() {
106
+ VALUE mHmSearch = rb_define_module("HmSearch");
107
+
108
+ eHmSearchError = rb_define_class_under(mHmSearch, "HmSearchError", rb_eStandardError);
109
+
110
+ VALUE cPostgres = rb_define_class_under(mHmSearch, "Postgres", rb_cObject);
111
+
112
+ VALUE cPostgresSingleton = rb_singleton_class(cPostgres);
113
+
114
+ id_hash = rb_intern("hash");
115
+ id_distance = rb_intern("distance");
116
+
117
+ rb_define_method(cPostgresSingleton, "initdb", RUBY_METHOD_FUNC(HmSearch_initdb), 4);
118
+
119
+ rb_define_method(cPostgresSingleton, "open", RUBY_METHOD_FUNC(HmSearch_open), 1);
120
+ // we have to allocate via open
121
+ rb_undef_method(cPostgresSingleton, "new");
122
+ rb_undef_method(cPostgresSingleton, "allocate");
123
+
124
+ rb_define_method(cPostgres, "close", RUBY_METHOD_FUNC(HmSearch_close), 0);
125
+ rb_define_method(cPostgres, "insert", RUBY_METHOD_FUNC(HmSearch_insert), 1);
126
+ rb_define_method(cPostgres, "lookup", RUBY_METHOD_FUNC(HmSearch_lookup), -1);
127
+ }
@@ -0,0 +1,22 @@
1
# Gem specification for the hmsearch PostgreSQL bindings.
Gem::Specification.new do |spec|
  spec.name        = 'hmsearch-postgres'
  spec.version     = '0.1.0'
  spec.license     = 'MIT'
  spec.authors     = ['Kris Selden']
  spec.email       = 'kris.selden@gmail.com'
  spec.homepage    = 'https://github.com/denisnazarov/hmsearch-postgres-ruby'

  spec.summary     = 'hmsearch postgres client'
  spec.description = <<-EOF
    A ruby wrapper for the hmsearch postgres client.
  EOF

  # Package every file tracked by git.
  spec.files         = %x(git ls-files).split("\n")
  spec.require_paths = ['lib']

  # The native extension is compiled at install time.
  spec.extensions = ['ext/hmsearch/extconf.rb']

  %w[rake pg].each { |gem_name| spec.add_development_dependency gem_name }
end
@@ -0,0 +1 @@
1
+ require 'hmsearch/postgres_ext'
@@ -0,0 +1,40 @@
1
require 'hmsearch/postgres'
require 'minitest/autorun'
require 'pg'

# End-to-end test against a PostgreSQL server on localhost:5432.
# Each test starts from a freshly created hmsearch_test database.
class TestHmsearchPostgres < MiniTest::Unit::TestCase
  SERVER      = 'host=localhost port=5432'.freeze
  DATABASE    = 'hmsearch_test'.freeze
  CONNSTR     = "#{SERVER} dbname=#{DATABASE}".freeze
  STORED_HASH = '6e6fb315fa8c43fe9c2687d5be14575abb7252104236747d571b97e003563df0'.freeze
  # Same as STORED_HASH except for one bit in the last byte.
  QUERY_HASH  = '6e6fb315fa8c43fe9c2687d5be14575abb7252104236747d571b97e003563df2'.freeze

  def setup
    admin = PG::Connection.open(SERVER)
    # IF EXISTS tolerates a missing database without hiding other errors.
    admin.exec("DROP DATABASE IF EXISTS #{DATABASE}")
    admin.exec("CREATE DATABASE #{DATABASE}")
  ensure
    admin.close if admin
  end

  def test_hmsearch
    HmSearch::Postgres.initdb(CONNSTR, 256, 10, 100_000_000)

    # Insert and lookup run on separate connections. When this test was
    # written, the bundled library named its prepared statements with
    # "literal" + int, which moves the char pointer instead of appending the
    # number (clang: -Wstring-plus-int); that worked for one operation per
    # connection but not for subsequent ones.
    HmSearch::Postgres.open(CONNSTR) { |db| db.insert(STORED_HASH) }

    HmSearch::Postgres.open(CONNSTR) do |db|
      actual = db.lookup(QUERY_HASH, -1)
      expected = [{ hash: STORED_HASH, distance: 1 }]

      assert_equal(expected, actual)
    end
  end
end
metadata ADDED
@@ -0,0 +1,85 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: hmsearch-postgres
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Kris Selden
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-12-13 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rake
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: pg
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ description: |2
42
+ A ruby wrapper for the hmsearch postgres client.
43
+ email: kris.selden@gmail.com
44
+ executables: []
45
+ extensions:
46
+ - ext/hmsearch/extconf.rb
47
+ extra_rdoc_files: []
48
+ files:
49
+ - ".gitignore"
50
+ - Gemfile
51
+ - LICENSE
52
+ - README.md
53
+ - Rakefile
54
+ - ext/hmsearch/extconf.rb
55
+ - ext/hmsearch/hmsearch.cc
56
+ - ext/hmsearch/hmsearch.h
57
+ - ext/hmsearch/postgres_ext.cc
58
+ - hmsearch-postgres.gemspec
59
+ - lib/hmsearch/postgres.rb
60
+ - test/hmsearch_test.rb
61
+ homepage: https://github.com/denisnazarov/hmsearch-postgres-ruby
62
+ licenses:
63
+ - MIT
64
+ metadata: {}
65
+ post_install_message:
66
+ rdoc_options: []
67
+ require_paths:
68
+ - lib
69
+ required_ruby_version: !ruby/object:Gem::Requirement
70
+ requirements:
71
+ - - ">="
72
+ - !ruby/object:Gem::Version
73
+ version: '0'
74
+ required_rubygems_version: !ruby/object:Gem::Requirement
75
+ requirements:
76
+ - - ">="
77
+ - !ruby/object:Gem::Version
78
+ version: '0'
79
+ requirements: []
80
+ rubyforge_project:
81
+ rubygems_version: 2.2.2
82
+ signing_key:
83
+ specification_version: 4
84
+ summary: hmsearch postgres client
85
+ test_files: []