hmsearch-postgres 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6819501caf23acccdaa5b8fd3f06087395c29322
4
- data.tar.gz: 9d065fdd3196ba0404529f744d84a80f361c8124
3
+ metadata.gz: 2f4777db8ccdba08da153ddf1ce3bb44a5b74a7a
4
+ data.tar.gz: 15033367730bc27eb38137fe97b4e74d04512d5e
5
5
  SHA512:
6
- metadata.gz: bd3b9373805b316ddf0269fafe616cb22a2b70b40c39ad60c8d42783f2a6a08818ecd50ebf69a3e5f143d820f84272eb8322edd5c9771049f0509d3a8adcb51c
7
- data.tar.gz: f1864f94b7069b97b646c84d4b32b04029951648dd64ee32497a1554178861d9ab687d9cc3a39fb12ad07d921539ea00e12e4b733c20bb676ad72684b4f81e1b
6
+ metadata.gz: 963af2c221832611a18146dbbb83230d1e758b780743d027a2e23b7aa8e4f756d486e2e2f7c96ff60057ed7041e54a58e1b1dc5084a15ef0d6c7d655cfef22b9
7
+ data.tar.gz: f5a49475ba134cb7dcbc87adfd382bd5c74668b5639c1dca3afb78d6823054b231cb36508d82164a4262e42669ec4e20b60942526aa63edab016dccfc5b6e5fb
@@ -2,19 +2,19 @@
2
2
  *
3
3
  * http://commonsmachinery.se/
4
4
  * Distributed under an MIT license
5
- *
5
+ *
6
6
  * Copyright (c) 2014 Commons Machinery
7
- *
7
+ *
8
8
  * Permission is hereby granted, free of charge, to any person obtaining a copy
9
9
  * of this software and associated documentation files (the "Software"), to deal
10
10
  * in the Software without restriction, including without limitation the rights
11
11
  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
12
  * copies of the Software, and to permit persons to whom the Software is
13
13
  * furnished to do so, subject to the following conditions:
14
- *
14
+ *
15
15
  * The above copyright notice and this permission notice shall be included in all
16
16
  * copies or substantial portions of the Software.
17
- *
17
+ *
18
18
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
19
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
20
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -65,33 +65,33 @@ class HmSearchImpl : public HmSearch
65
65
  {
66
66
  public:
67
67
  HmSearchImpl(std::string connstr, int hash_bits, int max_error)
68
- : _hash_bits(hash_bits)
69
- , _max_error(max_error)
70
- , _hash_bytes((hash_bits + 7) / 8)
71
- , _partitions((max_error + 3) / 2)
72
- , _partition_bits(ceil((double)hash_bits / _partitions))
73
- , _partition_bytes((_partition_bits + 7) / 8 + 1)
68
+ : _hash_bits(hash_bits)
69
+ , _max_error(max_error)
70
+ , _hash_bytes((hash_bits + 7) / 8)
71
+ , _partitions((max_error + 3) / 2)
72
+ , _partition_bits(ceil((double)hash_bits / _partitions))
73
+ , _partition_bytes((_partition_bits + 7) / 8 + 1)
74
74
  {
75
- _db = new pqxx::connection(connstr);
75
+ _db = new pqxx::connection(connstr);
76
76
  }
77
-
77
+
78
78
  ~HmSearchImpl() {
79
79
  close();
80
80
  }
81
81
 
82
82
  bool insert(const hash_string& hash,
83
83
  std::string* error_msg = NULL);
84
-
84
+
85
85
  bool print_copystring(const hash_string& hash,
86
- std::string* error_msg = NULL);
87
-
86
+ std::string* error_msg = NULL);
87
+
88
88
  bool lookup(const hash_string& query,
89
89
  LookupResultList& result,
90
90
  int max_error = -1,
91
91
  std::string* error_msg = NULL);
92
-
92
+
93
93
  bool close(std::string* error_msg = NULL);
94
-
94
+
95
95
  private:
96
96
  struct Candidate {
97
97
  Candidate() : matches(0), first_match(0), second_match(0) {}
@@ -99,7 +99,7 @@ private:
99
99
  int first_match;
100
100
  int second_match;
101
101
  };
102
-
102
+
103
103
  typedef std::map<hash_string, Candidate> CandidateMap;
104
104
 
105
105
  void get_candidates(const hash_string& query, CandidateMap& candidates);
@@ -110,7 +110,7 @@ private:
110
110
  int hamming_distance(const hash_string& query, const hash_string& hash);
111
111
 
112
112
  int get_partition_key(const hash_string& hash, int partition, uint8_t *key);
113
-
113
+
114
114
  pqxx::connection *_db;
115
115
  int _hash_bits;
116
116
  int _max_error;
@@ -118,7 +118,7 @@ private:
118
118
  int _partitions;
119
119
  int _partition_bits;
120
120
  int _partition_bytes;
121
-
121
+
122
122
  static int one_bits[256];
123
123
  };
124
124
 
@@ -133,61 +133,61 @@ bool HmSearch::init(const std::string& path,
133
133
  error_msg = &dummy;
134
134
  }
135
135
  *error_msg = "";
136
-
136
+
137
137
  if (hash_bits == 0 || (hash_bits & 7)) {
138
138
  *error_msg = "invalid hash_bits value";
139
139
  return false;
140
140
  }
141
-
141
+
142
142
  if (max_error == 0 || max_error >= hash_bits || max_error > 518) {
143
143
  *error_msg = "invalid max_error value";
144
144
  return false;
145
145
  }
146
-
146
+
147
147
  pqxx::connection db(path);
148
148
  if (!db.is_open()) {
149
- *error_msg = "Can't open database";
150
- return false;
149
+ *error_msg = "Can't open database";
150
+ return false;
151
151
  }
152
-
152
+
153
153
  std::string sql;
154
154
  sql = "INSERT INTO config VALUES ($1, $2)";
155
155
  db.prepare("hash_max", sql);
156
-
156
+
157
157
  pqxx::work W(db);
158
158
  sql = "CREATE TABLE IF NOT EXISTS config ("\
159
- " hash_bits int,"\
160
- " max_error int); TRUNCATE config";
159
+ " hash_bits int,"\
160
+ " max_error int); TRUNCATE config";
161
161
  W.exec(sql);
162
-
162
+
163
163
  W.prepared("hash_max")(hash_bits)(max_error).exec();
164
-
164
+
165
165
  for (unsigned int i = 0; i < ((max_error + 3) / 2); i++) {
166
166
  {
167
167
  std::stringstream s;
168
168
  s << "CREATE TABLE IF NOT EXISTS partition" << i << " ("
169
- << " hash bytea,"
170
- << " key bytea); TRUNCATE partition" << i;
169
+ << " hash bytea,"
170
+ << " key bytea); TRUNCATE partition" << i;
171
171
  W.exec(s.str());
172
172
  }
173
-
173
+
174
174
  {
175
175
  std::stringstream s;
176
176
  s << "DROP INDEX IF EXISTS ix_key_" << i;
177
177
  W.exec(s.str());
178
178
  }
179
-
179
+
180
180
  {
181
181
  std::stringstream s;
182
182
  s << "CREATE INDEX ix_key_" << i << " ON partition" << i << "(key)";
183
183
  W.exec(s.str());
184
184
  }
185
185
  }
186
-
186
+
187
187
  W.commit();
188
-
188
+
189
189
  db.disconnect();
190
-
190
+
191
191
  return true;
192
192
  }
193
193
 
@@ -196,37 +196,37 @@ HmSearch* HmSearch::open(const std::string& path,
196
196
  std::string* error_msg)
197
197
  {
198
198
  std::string dummy;
199
-
199
+
200
200
  if (!error_msg) {
201
201
  error_msg = &dummy;
202
202
  }
203
203
  *error_msg = "";
204
-
204
+
205
205
  try {
206
206
  pqxx::connection db(path);
207
-
207
+
208
208
  std::string sql;
209
-
209
+
210
210
  sql = "SELECT max_error, hash_bits FROM config";
211
211
  pqxx::nontransaction n(db);
212
212
  pqxx::result res(n.exec(sql));
213
-
213
+
214
214
  pqxx::result::const_iterator c = res.begin(); // We retrieve just one row
215
-
215
+
216
216
  unsigned long hash_bits, max_error;
217
217
  max_error = c[0].as<long>();
218
218
  hash_bits = c[1].as<long>();
219
-
219
+
220
220
  db.disconnect();
221
-
222
- HmSearch* hm = new HmSearchImpl(path, hash_bits, max_error);
221
+
222
+ HmSearch* hm = new HmSearchImpl(path, (int)hash_bits, (int)max_error);
223
223
  if (!hm) {
224
224
  *error_msg = "out of memory";
225
225
  return NULL;
226
226
  }
227
-
227
+
228
228
  return hm;
229
-
229
+
230
230
  }
231
231
  catch (const pqxx::broken_connection& e) {
232
232
  *error_msg = e.what();
@@ -237,7 +237,7 @@ HmSearch* HmSearch::open(const std::string& path,
237
237
 
238
238
  HmSearch::hash_string HmSearch::parse_hexhash(const std::string& hexhash)
239
239
  {
240
- int len = hexhash.length() / 2;
240
+ int len = (int)hexhash.length() / 2;
241
241
  uint8_t hash[len];
242
242
 
243
243
  for (int i = 0; i < len; i++) {
@@ -247,57 +247,57 @@ HmSearch::hash_string HmSearch::parse_hexhash(const std::string& hexhash)
247
247
  buf[0] = hexhash[i * 2];
248
248
  buf[1] = hexhash[i * 2 + 1];
249
249
  buf[2] = 0;
250
-
250
+
251
251
  hash[i] = strtoul(buf, &err, 16);
252
252
 
253
253
  if (*err != '\0') {
254
254
  return hash_string();
255
255
  }
256
256
  }
257
-
257
+
258
258
  return hash_string(hash, len);
259
259
  }
260
260
 
261
261
  std::string HmSearch::format_hexhash(const HmSearch::hash_string& hash)
262
262
  {
263
263
  char hex[hash.length() * 2 + 1];
264
-
264
+
265
265
  for (size_t i = 0; i < hash.length(); i++) {
266
266
  sprintf(hex + 2 * i, "%02x", hash[i]);
267
267
  }
268
-
268
+
269
269
  return hex;
270
270
  }
271
271
 
272
272
 
273
273
  bool HmSearchImpl::print_copystring(const hash_string& hash,
274
- std::string* error_msg)
274
+ std::string* error_msg)
275
275
  {
276
276
  std::string dummy;
277
277
  if (!error_msg) {
278
278
  error_msg = &dummy;
279
279
  }
280
280
  *error_msg = "";
281
-
281
+
282
282
  if (hash.length() != (size_t) _hash_bytes) {
283
283
  *error_msg = "incorrect hash length";
284
284
  return false;
285
285
  }
286
-
286
+
287
287
  for (int i = 0; i < _partitions; i++) {
288
288
  uint8_t key[_partition_bytes];
289
-
289
+
290
290
  get_partition_key(hash, i, key);
291
-
291
+
292
292
  std::cout << "\\\\x" << format_hexhash(hash)
293
- << " "
294
- << int(i)
295
- << " "
296
- << "\\\\x" << format_hexhash(hash_string(key, _partition_bytes))
297
- << std::endl;
298
-
293
+ << " "
294
+ << int(i)
295
+ << " "
296
+ << "\\\\x" << format_hexhash(hash_string(key, _partition_bytes))
297
+ << std::endl;
298
+
299
299
  }
300
-
300
+
301
301
  return true;
302
302
  }
303
303
 
@@ -309,37 +309,37 @@ bool HmSearchImpl::insert(const hash_string& hash,
309
309
  error_msg = &dummy;
310
310
  }
311
311
  *error_msg = "";
312
-
312
+
313
313
  if (hash.length() != (size_t) _hash_bytes) {
314
314
  *error_msg = "incorrect hash length";
315
315
  return false;
316
316
  }
317
-
317
+
318
318
  if (!_db->is_open()) {
319
319
  *error_msg = "database is closed";
320
320
  return false;
321
321
  }
322
-
322
+
323
323
  for (int i = 0; i < _partitions; i++) {
324
- std::stringstream s;
325
- s << "INSERT INTO partition" << i
326
- << " VALUES ($1, $2)";
327
- _db->prepare("insert_"+i, s.str());
324
+ std::stringstream s;
325
+ s << "INSERT INTO partition" << i
326
+ << " VALUES ($1, $2)";
327
+ _db->prepare("insert_"+std::to_string(i), s.str());
328
328
  }
329
329
  pqxx::work W(*_db);
330
330
  for (int i = 0; i < _partitions; i++) {
331
331
  uint8_t key[_partition_bytes];
332
-
332
+
333
333
  get_partition_key(hash, i, key);
334
-
334
+
335
335
  pqxx::binarystring key_blob(key, _partition_bytes);
336
336
  pqxx::binarystring hash_blob(hash.data(), hash.length());
337
-
338
- W.prepared("insert_"+i)(hash_blob)(key_blob).exec();
337
+
338
+ W.prepared("insert_"+std::to_string(i))(hash_blob)(key_blob).exec();
339
339
  }
340
-
340
+
341
341
  W.commit();
342
-
342
+
343
343
  return true;
344
344
  }
345
345
 
@@ -354,25 +354,25 @@ bool HmSearchImpl::lookup(const hash_string& query,
354
354
  error_msg = &dummy;
355
355
  }
356
356
  *error_msg = "";
357
-
357
+
358
358
  if (query.length() != (size_t) _hash_bytes) {
359
359
  *error_msg = "incorrect hash length";
360
360
  return false;
361
361
  }
362
-
362
+
363
363
  if (!_db->is_open()) {
364
364
  *error_msg = "database is closed";
365
365
  return false;
366
366
  }
367
-
367
+
368
368
  try {
369
369
  CandidateMap candidates;
370
370
  get_candidates(query, candidates);
371
-
371
+
372
372
  for (CandidateMap::const_iterator i = candidates.begin(); i != candidates.end(); ++i) {
373
373
  if (valid_candidate(i->second)) {
374
374
  int distance = hamming_distance(query, i->first);
375
-
375
+
376
376
  if (distance <= _max_error
377
377
  && (reduced_error < 0 || distance <= reduced_error)) {
378
378
  result.push_back(LookupResult(i->first, distance));
@@ -384,7 +384,7 @@ bool HmSearchImpl::lookup(const hash_string& query,
384
384
  *error_msg = e.base().what();
385
385
  return false;
386
386
  }
387
-
387
+
388
388
  return true;
389
389
  }
390
390
 
@@ -396,42 +396,43 @@ bool HmSearchImpl::close(std::string* error_msg)
396
396
  error_msg = &dummy;
397
397
  }
398
398
  *error_msg = "";
399
-
399
+
400
400
  if (!_db->is_open()) {
401
401
  // Already closed
402
402
  return true;
403
403
  }
404
-
404
+
405
405
  _db->disconnect();
406
-
406
+
407
407
  return true;
408
408
  }
409
409
 
410
410
 
411
411
  HmSearchImpl::hash_string HmSearchImpl::get_multiple_keys(
412
- uint8_t *key,
413
- int partition)
412
+ uint8_t *key,
413
+ int partition)
414
414
  {
415
415
  hash_string hashes;
416
-
416
+
417
417
  pqxx::nontransaction n(*_db);
418
418
  pqxx::binarystring key_blob(key, _partition_bytes);
419
- pqxx::result res = n.prepared("select_"+partition)(key_blob).exec();
420
-
419
+ pqxx::result res = n.prepared("select_"+std::to_string(partition))(key_blob).exec();
420
+
421
421
  for (pqxx::result::const_iterator c = res.begin(); c != res.end(); ++c) {
422
- pqxx::binarystring hash_result(c[0]);
423
- hashes.append(hash_string(hash_result.data(), hash_result.size()));
422
+ pqxx::binarystring hash_result(c[0]);
423
+ hashes.append(hash_string(hash_result.data(), hash_result.size()));
424
424
  }
425
-
425
+
426
426
  return hashes;
427
427
  }
428
428
 
429
429
  void HmSearchImpl::get_candidates(
430
- const HmSearchImpl::hash_string& query,
431
- HmSearchImpl::CandidateMap& candidates)
430
+ const HmSearchImpl::hash_string& query,
431
+ HmSearchImpl::CandidateMap& candidates)
432
432
  {
433
433
  uint8_t key[_partition_bytes];
434
-
434
+ memset(key, 0, _partition_bytes);
435
+
435
436
  for (int i = 0; i < _partitions; i++) {
436
437
  int psize = _hash_bits - i * _partition_bits;
437
438
  if (psize > _partition_bits) {
@@ -439,44 +440,44 @@ void HmSearchImpl::get_candidates(
439
440
  }
440
441
  std::stringstream single;
441
442
  single << "SELECT hash FROM partition" << i
442
- << " WHERE key=$1";
443
-
443
+ << " WHERE key=$1";
444
+
444
445
  std::stringstream s;
445
446
  s << "SELECT hash FROM partition"
446
- << i
447
- << " INNER JOIN (SELECT $1::bytea AS key";
447
+ << i
448
+ << " INNER JOIN (SELECT $1::bytea AS key";
448
449
  for (int j = 2; j <= psize; j++) {
449
- s << " UNION ALL SELECT "
450
- << "$"
451
- << j;
450
+ s << " UNION ALL SELECT "
451
+ << "$"
452
+ << j;
452
453
  }
453
454
  s << ") AS x ON partition"
454
- << i
455
- << ".key = x.key";
455
+ << i
456
+ << ".key = x.key";
456
457
  std::string sql;
457
458
  sql.append(s.str());
458
- _db->prepare("select_multiple_"+i, sql);
459
-
459
+ _db->prepare("select_multiple_"+std::to_string(i), sql);
460
+
460
461
  sql.clear();
461
462
  sql.append(single.str());
462
- _db->prepare("select_"+i, sql);
463
+ _db->prepare("select_"+std::to_string(i), sql);
463
464
  }
464
465
  for (int i = 0; i < _partitions; i++) {
465
466
  hash_string hashes;
466
467
 
467
468
  int bits = get_partition_key(query, i, key);
468
-
469
+
469
470
  // Get exact matches
470
-
471
+
471
472
  hashes = get_multiple_keys(key, i);
472
-
473
+
473
474
  if (hashes.length() > 0) {
474
475
  add_hash_candidates(candidates, 0, (const uint8_t*)hashes.data(), hashes.length());
475
476
  }
476
-
477
+
477
478
  // Get 1-variant matches
478
479
  pqxx::nontransaction n(*_db);
479
- pqxx::prepare::invocation prep = n.prepared("select_multiple_"+i);
480
+ pqxx::prepare::invocation prep = n.prepared("select_multiple_"+std::to_string(i));
480
481
  int pbyte = (i * _partition_bits) / 8;
481
482
  int count = 0;
482
483
  for (int pbit = i * _partition_bits; bits > 0; pbit++, bits--, count++) {
@@ -487,7 +488,7 @@ void HmSearchImpl::get_candidates(
487
488
  key[pbit / 8 - pbyte] ^= flip;
488
489
  }
489
490
  pqxx::result res = prep.exec();
490
-
491
+
491
492
  hashes.clear();
492
493
  for (pqxx::result::const_iterator c = res.begin(); c != res.end(); ++c) {
493
494
  pqxx::binarystring hash_result(c[0]);
@@ -499,13 +500,13 @@ void HmSearchImpl::get_candidates(
499
500
 
500
501
 
501
502
  void HmSearchImpl::add_hash_candidates(
502
- HmSearchImpl::CandidateMap& candidates, int match,
503
- const uint8_t* hashes, size_t length)
503
+ HmSearchImpl::CandidateMap& candidates, int match,
504
+ const uint8_t* hashes, size_t length)
504
505
  {
505
506
  for (size_t n = 0; n < length; n += _hash_bytes) {
506
507
  hash_string hash = hash_string(hashes + n, _hash_bytes);
507
508
  Candidate& cand = candidates[hash];
508
-
509
+
509
510
  ++cand.matches;
510
511
  if (cand.matches == 1) {
511
512
  cand.first_match = match;
@@ -518,7 +519,7 @@ void HmSearchImpl::add_hash_candidates(
518
519
 
519
520
 
520
521
  bool HmSearchImpl::valid_candidate(
521
- const HmSearchImpl::Candidate& candidate)
522
+ const HmSearchImpl::Candidate& candidate)
522
523
  {
523
524
  if (_max_error & 1) {
524
525
  // Odd k
@@ -535,21 +536,21 @@ bool HmSearchImpl::valid_candidate(
535
536
  return false;
536
537
  }
537
538
  }
538
-
539
+
539
540
  return true;
540
541
  }
541
542
 
542
543
 
543
544
  int HmSearchImpl::hamming_distance(
544
- const HmSearchImpl::hash_string& query,
545
- const HmSearchImpl::hash_string& hash)
545
+ const HmSearchImpl::hash_string& query,
546
+ const HmSearchImpl::hash_string& hash)
546
547
  {
547
548
  int distance = 0;
548
-
549
+
549
550
  for (size_t i = 0; i < query.length(); i++) {
550
551
  distance += one_bits[query[i] ^ hash[i]];
551
552
  }
552
-
553
+
553
554
  return distance;
554
555
  }
555
556
 
@@ -557,31 +558,31 @@ int HmSearchImpl::hamming_distance(
557
558
  int HmSearchImpl::get_partition_key(const hash_string& hash, int partition, uint8_t *key)
558
559
  {
559
560
  int psize, hash_bit, bits_left;
560
-
561
+
561
562
  psize = _hash_bits - partition * _partition_bits;
562
563
  if (psize > _partition_bits) {
563
564
  psize = _partition_bits;
564
565
  }
565
-
566
+
566
567
  // Copy bytes, masking out some bits at the start and end
567
568
  bits_left = psize;
568
569
  hash_bit = partition * _partition_bits;
569
-
570
+
570
571
  for (int i = 0; i < _partition_bytes; i++) {
571
572
  int byte = hash_bit / 8;
572
573
  int bit = hash_bit % 8;
573
574
  int bits = 8 - bit;
574
-
575
+
575
576
  if (bits > bits_left) {
576
577
  bits = bits_left;
577
578
  }
578
-
579
+
579
580
  bits_left -= bits;
580
581
  hash_bit += bits;
581
-
582
+
582
583
  key[i] = hash[byte] & (((1 << bits) - 1) << (8 - bit - bits));
583
584
  }
584
-
585
+
585
586
  return psize;
586
587
  }
587
588
 
@@ -606,8 +607,8 @@ int HmSearchImpl::one_bits[256] = {
606
607
  };
607
608
 
608
609
  /*
609
- Local Variables:
610
- c-file-style: "stroustrup"
611
- indent-tabs-mode:nil
612
- End:
613
- */
610
+ Local Variables:
611
+ c-file-style: "stroustrup"
612
+ indent-tabs-mode:nil
613
+ End:
614
+ */
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'hmsearch-postgres'
3
- s.version = '0.1.0'
3
+ s.version = '0.1.1'
4
4
 
5
5
  s.summary = 'hmsearch postgres client'
6
6
  s.description = <<-EOF
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: hmsearch-postgres
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kris Selden
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-12-13 00:00:00.000000000 Z
11
+ date: 2014-12-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake