whistlepig 0.11.2 → 0.12

Sign up to get free protection for your applications and to get access to all the features.
data/README CHANGED
@@ -8,7 +8,7 @@ full-text search without the frills, Whistlepig may be for you.
8
8
  Whistlepig is written in ANSI C99. It currently provides a C API and Ruby
9
9
  bindings.
10
10
 
11
- Latest version: 0.11.2, released 2012-05-19.
11
+ Latest version: 0.12, released 2012-06-09.
12
12
  Status: beta
13
13
  News: http://all-thing.net/label/whistlepig/
14
14
  Homepage: http://masanjin.net/whistlepig/
@@ -16,7 +16,7 @@ Latest version: 0.11.2, released 2012-05-19.
16
16
 
17
17
  = Getting it
18
18
 
19
- Tarball: http://masanjin.net/whistlepig/whistlepig-0.11.2.tar.gz
19
+ Tarball: http://masanjin.net/whistlepig/whistlepig-0.12.tar.gz
20
20
  Rubygem: gem install whistlepig
21
21
  Git: git clone git://github.com/wmorgan/whistlepig.git
22
22
 
@@ -1,9 +1,10 @@
1
1
  #include <stdlib.h>
2
2
  #include "error.h"
3
3
 
4
- wp_error* wp_error_new(const char* msg, const char* src) {
4
+ wp_error* wp_error_new(const char* msg, const char* src, unsigned char type) {
5
5
  wp_error* ret = malloc(sizeof(wp_error));
6
6
  ret->msg = msg;
7
+ ret->type = type;
7
8
  ret->size = 1;
8
9
  ret->srcs = malloc(sizeof(const char*));
9
10
  ret->srcs[0] = src;
@@ -25,8 +25,13 @@
25
25
  #include <stdio.h>
26
26
  #include <string.h>
27
27
 
28
+ #define WP_ERROR_TYPE_BASIC 1
29
+ #define WP_ERROR_TYPE_SYSTEM 2
30
+ #define WP_ERROR_TYPE_VERSION 3
31
+
28
32
  // pseudo-backtrace
29
33
  typedef struct wp_error {
34
+ unsigned char type;
30
35
  unsigned int size;
31
36
  const char* msg;
32
37
  const char** srcs;
@@ -39,25 +44,30 @@ typedef struct wp_error {
39
44
  // API methods
40
45
 
41
46
  // private: make a new error object with a message and source line
42
- wp_error* wp_error_new(const char* msg, const char* src) RAISES_ERROR;
47
+ wp_error* wp_error_new(const char* msg, const char* src, unsigned char type) RAISES_ERROR;
43
48
  // private: add a source line to a pre-existing error
44
49
  wp_error* wp_error_chain(wp_error* e, const char* src) RAISES_ERROR;
45
50
 
46
51
  // public: free an error, once handled
47
52
  void wp_error_free(wp_error* e);
48
53
 
49
- // public: raise an error with a printf-style message
50
- #define RAISE_ERROR(fmt, ...) do { \
54
+ // private: internal mechanics for raising an error
55
+ #define RAISE_ERROR_OF_TYPE(type, fmt, ...) do { \
51
56
  char* msg = malloc(1024); \
52
57
  char* src = malloc(1024); \
53
58
  snprintf(msg, 1024, fmt, ## __VA_ARGS__); \
54
59
  snprintf(src, 1024, "%s (%s:%d)", __PRETTY_FUNCTION__, __FILE__, __LINE__); \
55
- return wp_error_new(msg, src); \
60
+ return wp_error_new(msg, src, type); \
56
61
  } while(0)
57
62
 
58
- // public: raise an error with a printf-style message and have strerror() autoamtically
59
- // appended
60
- #define RAISE_SYSERROR(fmt, ...) RAISE_ERROR(fmt ": %s", ## __VA_ARGS__, strerror(errno))
63
+ // public: raise a basic error
64
+ #define RAISE_ERROR(fmt, ...) RAISE_ERROR_OF_TYPE(WP_ERROR_TYPE_BASIC, fmt, ## __VA_ARGS__)
65
+
66
+ // public: raise a version error
67
+ #define RAISE_VERSION_ERROR(fmt, ...) RAISE_ERROR_OF_TYPE(WP_ERROR_TYPE_VERSION, fmt, ## __VA_ARGS__)
68
+
69
+ // public: raise a system error with strerror() automatically appended to the message
70
+ #define RAISE_SYSERROR(fmt, ...) RAISE_ERROR_OF_TYPE(WP_ERROR_TYPE_SYSTEM, fmt ": %s", ## __VA_ARGS__, strerror(errno))
61
71
 
62
72
  // public: relay an error up the stack if the called function returns one.
63
73
  #define RELAY_ERROR(e) do { \
@@ -32,7 +32,6 @@ RAISING_STATIC(release_lock(wp_index* index)) {
32
32
  return NO_ERROR;
33
33
  }
34
34
 
35
-
36
35
  RAISING_STATIC(index_info_init(index_info* ii, uint32_t index_version)) {
37
36
  ii->index_version = index_version;
38
37
  ii->num_segments = 0;
@@ -42,7 +41,7 @@ RAISING_STATIC(index_info_init(index_info* ii, uint32_t index_version)) {
42
41
  }
43
42
 
44
43
  RAISING_STATIC(index_info_validate(index_info* ii, uint32_t index_version)) {
45
- if(ii->index_version != index_version) RAISE_ERROR("index has type %u; expecting type %u", ii->index_version, index_version);
44
+ if(ii->index_version != index_version) RAISE_VERSION_ERROR("index has type %u; expecting type %u", ii->index_version, index_version);
46
45
  return NO_ERROR;
47
46
  }
48
47
 
@@ -149,6 +148,57 @@ wp_error* wp_index_setup_query(wp_index* index, wp_query* query) {
149
148
  return NO_ERROR;
150
149
  }
151
150
 
151
+ #define RESULT_BUF_SIZE 1024
152
+ // count the results by running the query until it stops. slow!
153
+ RAISING_STATIC(count_query_by_running_it(wp_index* index, wp_query* query, uint32_t* num_results)) {
154
+ uint64_t results[RESULT_BUF_SIZE];
155
+
156
+ *num_results = 0;
157
+ while(1) {
158
+ uint32_t this_num_results;
159
+ RELAY_ERROR(wp_index_run_query(index, query, RESULT_BUF_SIZE, &this_num_results, results));
160
+ *num_results += this_num_results;
161
+ if(this_num_results < RESULT_BUF_SIZE) break; // done
162
+ }
163
+
164
+ return NO_ERROR;
165
+ }
166
+
167
+ RAISING_STATIC(count_query_from_posting_list_header(wp_index* index, wp_query* query, uint32_t* num_results)) {
168
+ // make sure we know about all segments (one could've been added by a writer)
169
+ RELAY_ERROR(grab_readlock(index));
170
+ RELAY_ERROR(ensure_all_segments(index));
171
+ RELAY_ERROR(release_lock(index));
172
+
173
+ *num_results = 0;
174
+ for(int i = 0; i < index->num_segments; i++) {
175
+ uint32_t this_num_results;
176
+
177
+ DEBUG("counting on segment %d", i);
178
+ wp_segment* seg = &index->segments[i];
179
+ RELAY_ERROR(wp_segment_grab_readlock(seg));
180
+ RELAY_ERROR(wp_segment_reload(seg));
181
+ RELAY_ERROR(wp_segment_count_term(seg, query->field, query->word, &this_num_results));
182
+ RELAY_ERROR(wp_segment_release_lock(seg));
183
+ *num_results += this_num_results;
184
+ DEBUG("got %d results from segment %d", this_num_results, i);
185
+ }
186
+
187
+ return NO_ERROR;
188
+ }
189
+
190
+ RAISING_STATIC(count_query(wp_index* index, wp_query* query, uint32_t* num_results)) {
191
+ switch(query->type) {
192
+ case WP_QUERY_TERM:
193
+ case WP_QUERY_LABEL:
194
+ RELAY_ERROR(count_query_from_posting_list_header(index, query, num_results));
195
+ break;
196
+ case WP_QUERY_EVERY: // TODO -- special case this
197
+ default:
198
+ RELAY_ERROR(count_query_by_running_it(index, query, num_results));
199
+ }
200
+ return NO_ERROR;
201
+ }
152
202
  // can be called multiple times to resume
153
203
  wp_error* wp_index_run_query(wp_index* index, wp_query* query, uint32_t max_num_results, uint32_t* num_results, uint64_t* results) {
154
204
  *num_results = 0;
@@ -209,20 +259,10 @@ wp_error* wp_index_run_query(wp_index* index, wp_query* query, uint32_t max_num_
209
259
  return NO_ERROR;
210
260
  }
211
261
 
212
- #define RESULT_BUF_SIZE 1024
213
- // count the results by just running the query until it stops. slow!
262
+ // just count the results, don't return them
214
263
  wp_error* wp_index_count_results(wp_index* index, wp_query* query, uint32_t* num_results) {
215
- uint64_t results[RESULT_BUF_SIZE];
216
-
217
- *num_results = 0;
218
264
  RELAY_ERROR(wp_index_setup_query(index, query));
219
- while(1) {
220
- uint32_t this_num_results;
221
- RELAY_ERROR(wp_index_run_query(index, query, RESULT_BUF_SIZE, &this_num_results, results));
222
- *num_results += this_num_results;
223
- if(this_num_results < RESULT_BUF_SIZE) break; // done
224
- }
225
-
265
+ RELAY_ERROR(count_query(index, query, num_results));
226
266
  RELAY_ERROR(wp_index_teardown_query(index, query));
227
267
 
228
268
  return NO_ERROR;
@@ -369,7 +409,7 @@ wp_error* wp_index_add_label(wp_index* index, const char* label, uint64_t doc_id
369
409
  if(doc_id > index->docid_offsets[i - 1]) {
370
410
  wp_segment* seg = &index->segments[i - 1];
371
411
 
372
- DEBUG("found doc %llu in segment %u", doc_id, i - 1);
412
+ DEBUG("found doc %"PRIu64" in segment %u", doc_id, i - 1);
373
413
  RELAY_ERROR(wp_segment_grab_writelock(seg));
374
414
  RELAY_ERROR(wp_segment_reload(seg));
375
415
  RELAY_ERROR(wp_segment_add_label(seg, label, (docid_t)(doc_id - index->docid_offsets[i - 1])));
@@ -377,7 +417,7 @@ wp_error* wp_index_add_label(wp_index* index, const char* label, uint64_t doc_id
377
417
  found = 1;
378
418
  break;
379
419
  }
380
- else DEBUG("did not find doc %llu in segment %u", doc_id, i - 1);
420
+ else DEBUG("did not find doc %"PRIu64" in segment %u", doc_id, i - 1);
381
421
  }
382
422
 
383
423
  if(!found) RAISE_ERROR("couldn't find doc id %"PRIu64, doc_id);
@@ -396,7 +436,7 @@ wp_error* wp_index_remove_label(wp_index* index, const char* label, uint64_t doc
396
436
  if(doc_id > index->docid_offsets[i - 1]) {
397
437
  wp_segment* seg = &index->segments[i - 1];
398
438
 
399
- DEBUG("found doc %llu in segment %u", doc_id, i - 1);
439
+ DEBUG("found doc %"PRIu64" in segment %u", doc_id, i - 1);
400
440
  RELAY_ERROR(wp_segment_grab_writelock(seg));
401
441
  RELAY_ERROR(wp_segment_reload(seg));
402
442
  RELAY_ERROR(wp_segment_remove_label(seg, label, (docid_t)(doc_id - index->docid_offsets[i - 1])));
@@ -404,7 +444,7 @@ wp_error* wp_index_remove_label(wp_index* index, const char* label, uint64_t doc
404
444
  found = 1;
405
445
  break;
406
446
  }
407
- else DEBUG("did not find doc %llu in segment %u", doc_id, i - 1);
447
+ else DEBUG("did not find doc %"PRIu64" in segment %u", doc_id, i - 1);
408
448
  }
409
449
 
410
450
  if(!found) RAISE_ERROR("couldn't find doc id %"PRIu64, doc_id);
@@ -14,7 +14,7 @@ wp_error* mmap_obj_create(mmap_obj* o, const char* magic, const char* pathname,
14
14
  if(o->fd == -1) RAISE_SYSERROR("cannot create %s", pathname);
15
15
 
16
16
  uint32_t size = initial_size + (uint32_t)sizeof(mmap_obj_header);
17
- DEBUG("creating %s with %u + %u = %u bytes for %s object", pathname, initial_size, sizeof(mmap_obj_header), size, magic);
17
+ DEBUG("creating %s with %u + %zu = %u bytes for %s object", pathname, initial_size, sizeof(mmap_obj_header), size, magic); // sizeof yields size_t, hence %zu
18
18
  lseek(o->fd, size - 1, SEEK_SET);
19
19
  ssize_t num_bytes = write(o->fd, "", 1);
20
20
  if(num_bytes == -1) RAISE_SYSERROR("write");
@@ -35,14 +35,14 @@ wp_error* mmap_obj_load(mmap_obj* o, const char* magic, const char* pathname) {
35
35
  // load header
36
36
  o->content = mmap(NULL, sizeof(mmap_obj_header), PROT_READ | PROT_WRITE, MAP_SHARED, o->fd, 0);
37
37
  if(o->content == MAP_FAILED) RAISE_SYSERROR("header mmap");
38
- DEBUG("loaded header of %u bytes for %s object", sizeof(mmap_obj_header), magic);
38
+ DEBUG("loaded header of %zu bytes for %s object", sizeof(mmap_obj_header), magic); // sizeof yields size_t, hence %zu
39
39
 
40
40
  RELAY_ERROR(validate(o->content, magic));
41
41
 
42
42
  o->loaded_size = o->content->size;
43
43
 
44
44
  uint32_t size = o->content->size + (uint32_t)sizeof(mmap_obj_header);
45
- DEBUG("full size is %u bytes (including %u-byte header)", size, sizeof(mmap_obj_header));
45
+ DEBUG("full size is %u bytes (including %zu-byte header)", size, sizeof(mmap_obj_header)); // sizeof yields size_t, hence %zu
46
46
  if(munmap(o->content, sizeof(mmap_obj_header)) == -1) RAISE_SYSERROR("munmap");
47
47
 
48
48
  o->content = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, o->fd, 0);
@@ -85,7 +85,7 @@ wp_error* mmap_obj_resize(mmap_obj* o, uint32_t data_size) {
85
85
  }
86
86
 
87
87
  wp_error* mmap_obj_unload(mmap_obj* o) {
88
- DEBUG("unloading %u bytes", sizeof(mmap_obj_header) + o->content->size);
88
+ DEBUG("unloading %zu bytes", sizeof(mmap_obj_header) + o->content->size); // the sum has type size_t, hence %zu
89
89
  if(munmap(o->content, sizeof(mmap_obj_header) + o->content->size) == -1) RAISE_SYSERROR("munmap");
90
90
  o->content = NULL;
91
91
  return NO_ERROR;
@@ -195,8 +195,14 @@ static wp_error* term_init_search_state(wp_query* q, wp_segment* seg) {
195
195
 
196
196
  t.word_s = stringmap_string_to_int(sh, sp, q->word);
197
197
 
198
- uint32_t offset = termhash_get_val(th, t);
199
- if(offset == (uint32_t)-1) offset = OFFSET_NONE;
198
+ uint32_t offset;
199
+ posting_list_header* plh = termhash_get_val(th, t);
200
+
201
+ DEBUG("posting list header for %s:%s (-> %u:%u) is %p", q->field, q->word, t.field_s, t.word_s, plh);
202
+ if(plh == NULL) offset = OFFSET_NONE;
203
+ else offset = plh->next_offset;
204
+
205
+ if(plh) DEBUG("posting list header has count=%u next_offset=%u", plh->count, plh->next_offset);
200
206
 
201
207
  if(offset == OFFSET_NONE) state->done = 1; // no entry in term hash
202
208
  else {
@@ -475,8 +481,8 @@ static wp_error* conj_next_doc(wp_query* q, wp_segment* seg, search_result* resu
475
481
 
476
482
  while(!found && !*done) {
477
483
  RELAY_ERROR(query_next_doc(master, seg, result, done));
478
- DEBUG("master reports doc %u done %d", result->doc_id, *done);
479
484
  if(!*done) {
485
+ DEBUG("master reports doc %u done %d", result->doc_id, *done);
480
486
  search_doc = result->doc_id;
481
487
  wp_search_result_free(result); // sigh
482
488
  RELAY_ERROR(conj_advance_to_doc(q, seg, search_doc, result, &found, done));
@@ -564,8 +570,8 @@ static wp_error* phrase_next_doc(wp_query* q, wp_segment* seg, search_result* re
564
570
 
565
571
  while(!found && !*done) {
566
572
  RELAY_ERROR(query_next_doc(master, seg, result, done));
567
- DEBUG("master reports doc %u done %d", result->doc_id, *done);
568
573
  if(!*done) {
574
+ DEBUG("master reports doc %u done %d", result->doc_id, *done);
569
575
  search_doc = result->doc_id;
570
576
  wp_search_result_free(result); // sigh
571
577
  RELAY_ERROR(phrase_advance_to_doc(q, seg, search_doc, result, &found, done));
@@ -784,7 +790,6 @@ static wp_error* every_advance_to_doc(wp_query* q, wp_segment* seg, docid_t doc_
784
790
 
785
791
  wp_error* wp_search_run_query_on_segment(struct wp_query* q, struct wp_segment* s, uint32_t max_num_results, uint32_t* num_results, search_result* results) {
786
792
  int done;
787
-
788
793
  *num_results = 0;
789
794
 
790
795
  #ifdef DEBUG
@@ -3,13 +3,16 @@
3
3
  #include <unistd.h>
4
4
  #include "whistlepig.h"
5
5
 
6
- #define POSTINGS_REGION_TYPE_IMMUTABLE_VBE 1
6
+ #define POSTINGS_REGION_TYPE_IMMUTABLE_VBE 1
7
7
  #define POSTINGS_REGION_TYPE_MUTABLE_NO_POSITIONS 2 // bigger, mutable
8
8
 
9
- #define SEGMENT_VERSION 3
9
+ #define SEGMENT_VERSION 4
10
10
 
11
11
  #define wp_segment_label_posting_at(posting_region, offset) ((label_posting*)(posting_region->postings + offset))
12
12
 
13
+ static posting_list_header blank_plh = { .count = 0, .next_offset = OFFSET_NONE };
14
+ static term dead_term = { .field_s = 0, .word_s = 0 };
15
+
13
16
  wp_error* wp_segment_grab_readlock(wp_segment* seg) {
14
17
  segment_info* si = MMAP_OBJ(seg->seginfo, segment_info);
15
18
  RELAY_ERROR(wp_lock_grab(&si->lock, WP_LOCK_READLOCK));
@@ -28,6 +31,23 @@ wp_error* wp_segment_release_lock(wp_segment* seg) {
28
31
  return NO_ERROR;
29
32
  }
30
33
 
34
+ wp_error* wp_segment_count_term(wp_segment* seg, const char* field, const char* word, uint32_t* num_results) {
35
+ stringmap* sh = MMAP_OBJ(seg->stringmap, stringmap);
36
+ stringpool* sp = MMAP_OBJ(seg->stringpool, stringpool);
37
+ termhash* th = MMAP_OBJ(seg->termhash, termhash);
38
+
39
+ term t;
40
+ if(field == NULL) t.field_s = 0; // label sentinel
41
+ else t.field_s = stringmap_string_to_int(sh, sp, field);
42
+ t.word_s = stringmap_string_to_int(sh, sp, word);
43
+
44
+ posting_list_header* plh = termhash_get_val(th, t);
45
+ if(plh == NULL) *num_results = 0;
46
+ else *num_results = plh->count;
47
+
48
+ return NO_ERROR;
49
+ }
50
+
31
51
  static void postings_region_init(postings_region* pr, uint32_t initial_size, uint32_t postings_type_and_flags) {
32
52
  pr->postings_type_and_flags = postings_type_and_flags;
33
53
  pr->num_postings = 0;
@@ -44,7 +64,7 @@ RAISING_STATIC(segment_info_init(segment_info* si, uint32_t segment_version)) {
44
64
  }
45
65
 
46
66
  RAISING_STATIC(segment_info_validate(segment_info* si, uint32_t segment_version)) {
47
- if(si->segment_version != segment_version) RAISE_ERROR("segment has type %u; expecting type %u", si->segment_version, segment_version);
67
+ if(si->segment_version != segment_version) RAISE_VERSION_ERROR("segment has type %u; expecting type %u", si->segment_version, segment_version);
48
68
  return NO_ERROR;
49
69
  }
50
70
 
@@ -449,7 +469,7 @@ wp_error* wp_segment_add_posting(wp_segment* s, const char* field, const char* w
449
469
  RELAY_ERROR(bump_stringpool(s, &success));
450
470
  RELAY_ERROR(bump_termhash(s, &success));
451
471
 
452
- DEBUG("adding posting for %s:%s and doc %u", field, word, doc_id);
472
+ DEBUG("adding posting for %s:%s and doc %u with %u positions", field, word, doc_id, num_positions);
453
473
 
454
474
  postings_region* pr = MMAP_OBJ(s->postings, postings_region);
455
475
  stringmap* sh = MMAP_OBJ(s->stringmap, stringmap);
@@ -461,26 +481,38 @@ wp_error* wp_segment_add_posting(wp_segment* s, const char* field, const char* w
461
481
  RELAY_ERROR(stringmap_add(sh, sp, field, &t.field_s));
462
482
  RELAY_ERROR(stringmap_add(sh, sp, word, &t.word_s));
463
483
 
484
+ DEBUG("%s:%s maps to %u:%u", field, word, t.field_s, t.word_s);
485
+
464
486
  // find the offset of the next posting
487
+ posting_list_header* plh = termhash_get_val(th, t);
488
+ if(plh == NULL) {
489
+ RELAY_ERROR(termhash_put_val(th, t, &blank_plh));
490
+ plh = termhash_get_val(th, t);
491
+ }
492
+ DEBUG("posting list header for %s:%s is at %p", field, word, plh);
493
+
465
494
  posting po;
466
- uint32_t next_offset = termhash_get_val(th, t);
467
- if(next_offset == (uint32_t)-1) next_offset = OFFSET_NONE;
468
- if(next_offset != OFFSET_NONE) { // TODO remove this check for speed once happy
495
+ uint32_t next_offset = plh->next_offset;
496
+
497
+ if(next_offset != OFFSET_NONE) { // TODO remove this check for speed once happy [PERFORMANCE]
469
498
  RELAY_ERROR(wp_segment_read_posting(s, next_offset, &po, 0));
470
499
  if(po.doc_id >= doc_id) RAISE_ERROR("cannot add a doc_id out of sorted order");
471
500
  }
472
501
 
473
502
  // write the entry to the postings region
474
503
  uint32_t entry_offset = pr->postings_head;
475
- //DEBUG("entry will be at offset %u, prev offset is %u and next offset is %u", entry_offset, prev_offset, next_offset);
504
+ DEBUG("writing posting at offset %u. next offset is %u.", entry_offset, next_offset);
505
+
476
506
  po.doc_id = doc_id;
477
507
  po.next_offset = next_offset;
478
508
  po.num_positions = num_positions;
479
509
  RELAY_ERROR(write_posting(s, &po, positions)); // prev_docid is 0 for th
480
- DEBUG("postings list head now at %u", pr->postings_head);
510
+ DEBUG("posting list head now at %u", pr->postings_head);
481
511
 
482
512
  // really finally, update the tail pointer so that readers can access this posting
483
- RELAY_ERROR(termhash_put_val(th, t, entry_offset));
513
+ plh->count++;
514
+ plh->next_offset = entry_offset;
515
+ DEBUG("posting list header for %s:%s now reads count=%u offset=%u", field, word, plh->count, plh->next_offset);
484
516
 
485
517
  return NO_ERROR;
486
518
  }
@@ -537,22 +569,25 @@ wp_error* wp_segment_add_label(wp_segment* s, const char* label, docid_t doc_id)
537
569
 
538
570
  // find the previous and next label postings, between which we'll insert this
539
571
  // posting
540
- uint32_t prev_offset = OFFSET_NONE;
541
- uint32_t next_offset = termhash_get_val(th, t);
572
+ posting_list_header* plh = termhash_get_val(th, t);
573
+ if(plh == NULL) {
574
+ RELAY_ERROR(termhash_put_val(th, t, &blank_plh));
575
+ plh = termhash_get_val(th, t);
576
+ }
577
+
578
+ uint32_t next_offset = plh->next_offset;
542
579
  docid_t last_docid = DOCID_NONE;
580
+ uint32_t prev_offset = OFFSET_NONE;
543
581
 
544
- if(next_offset == (uint32_t)-1) next_offset = OFFSET_NONE;
545
582
  DEBUG("start offset is %u (none is %u)", next_offset, OFFSET_NONE);
546
583
 
547
584
  while(next_offset != OFFSET_NONE) {
548
585
  label_posting* lp = wp_segment_label_posting_at(pr, next_offset);
549
586
 
550
- if((last_docid != DOCID_NONE) && (lp->doc_id >= last_docid)) {
587
+ if((last_docid != DOCID_NONE) && (lp->doc_id >= last_docid))
551
588
  RAISE_ERROR("whistlepig index corruption! lp %u has docid %u but last docid at lp %u was %u", next_offset, lp->doc_id, prev_offset, last_docid);
552
- }
553
- else {
589
+ else
554
590
  last_docid = lp->doc_id;
555
- }
556
591
 
557
592
  DEBUG("got doc id %u next_offset %u at offset %u (looking for doc id %u)", lp->doc_id, lp->next_offset, next_offset, doc_id);
558
593
  if(lp->doc_id == doc_id) {
@@ -567,18 +602,23 @@ wp_error* wp_segment_add_label(wp_segment* s, const char* label, docid_t doc_id)
567
602
  // find a space for the posting by first checking for a free postings in the
568
603
  // dead list. the dead list is the list stored under the sentinel term with
569
604
  // field 0 and word 0.
570
- term dead_term = { .field_s = 0, .word_s = 0 };
605
+ posting_list_header* dead_plh = termhash_get_val(th, dead_term);
606
+ if(dead_plh == NULL) {
607
+ RELAY_ERROR(termhash_put_val(th, dead_term, &blank_plh));
608
+ dead_plh = termhash_get_val(th, dead_term); // re-fetch the dead list's own header (just inserted above), not the label's header
609
+ }
610
+
571
611
  uint32_t entry_offset;
572
- uint32_t dead_offset = termhash_get_val(th, dead_term);
573
- if(dead_offset == (uint32_t)-1) dead_offset = OFFSET_NONE;
612
+ uint32_t dead_offset = dead_plh->next_offset;
574
613
 
575
614
  if(dead_offset == OFFSET_NONE) { // make a new posting
576
615
  entry_offset = pr->postings_head;
577
616
  }
578
617
  else { // we'll use this one; remove it from the linked list
579
618
  DEBUG("offset from dead list is %u, using it for the new posting!", dead_offset);
580
- entry_offset = dead_offset;
581
- RELAY_ERROR(termhash_put_val(th, dead_term, wp_segment_label_posting_at(pr, dead_offset)->next_offset));
619
+ entry_offset = dead_plh->next_offset;
620
+ dead_plh->next_offset = wp_segment_label_posting_at(pr, dead_offset)->next_offset;
621
+ dead_plh->count--;
582
622
  }
583
623
 
584
624
  // finally, write the entry to the label postings region
@@ -588,11 +628,12 @@ wp_error* wp_segment_add_label(wp_segment* s, const char* label, docid_t doc_id)
588
628
  po->next_offset = next_offset;
589
629
 
590
630
  pr->postings_head += (uint32_t)sizeof(label_posting);
591
- DEBUG("label postings list head now at %u", pr->postings_head);
631
+ DEBUG("label posting list head now at %u", pr->postings_head);
592
632
 
593
633
  // really finally, update either the previous offset or the tail pointer
594
634
  // for this label so that readers can access this posting
595
- if(prev_offset == OFFSET_NONE) RELAY_ERROR(termhash_put_val(th, t, entry_offset));
635
+ plh->count++;
636
+ if(prev_offset == OFFSET_NONE) plh->next_offset = entry_offset;
596
637
  else wp_segment_label_posting_at(pr, prev_offset)->next_offset = entry_offset;
597
638
 
598
639
  return NO_ERROR;
@@ -615,13 +656,16 @@ wp_error* wp_segment_remove_label(wp_segment* s, const char* label, docid_t doc_
615
656
  t.word_s = stringmap_string_to_int(sh, sp, label); // will be -1 if not there
616
657
 
617
658
  // find the posting and the previous posting in the list, if any
618
- uint32_t prev_offset = OFFSET_NONE;
619
- uint32_t offset = termhash_get_val(th, t);
620
659
  docid_t last_docid = DOCID_NONE;
660
+ uint32_t prev_offset = OFFSET_NONE;
661
+ posting_list_header* plh = termhash_get_val(th, t);
662
+ if(plh == NULL) {
663
+ DEBUG("no such label %s", label);
664
+ return NO_ERROR;
665
+ }
621
666
 
622
- if(offset == (uint32_t)-1) offset = OFFSET_NONE;
667
+ uint32_t offset = plh->next_offset;
623
668
  label_posting* lp = NULL;
624
-
625
669
  while(offset != OFFSET_NONE) {
626
670
  lp = wp_segment_label_posting_at(pr, offset);
627
671
 
@@ -646,17 +690,22 @@ wp_error* wp_segment_remove_label(wp_segment* s, const char* label, docid_t doc_
646
690
  }
647
691
 
648
692
  // we've found the posting; now remove it from the list
649
- if(prev_offset == OFFSET_NONE) RELAY_ERROR(termhash_put_val(th, t, lp->next_offset));
693
+ if(prev_offset == OFFSET_NONE) plh->next_offset = lp->next_offset;
650
694
  else wp_segment_label_posting_at(pr, prev_offset)->next_offset = lp->next_offset;
695
+ plh->count--;
651
696
 
652
697
  // now add it to the dead list for later reclamation
653
- term dead_term = { .field_s = 0, .word_s = 0 };
654
- uint32_t dead_offset = termhash_get_val(th, dead_term);
655
- if(dead_offset == (uint32_t)-1) dead_offset = OFFSET_NONE;
698
+ posting_list_header* dead_plh = termhash_get_val(th, dead_term);
699
+ if(dead_plh == NULL) {
700
+ RELAY_ERROR(termhash_put_val(th, dead_term, &blank_plh));
701
+ dead_plh = termhash_get_val(th, dead_term); // re-fetch the dead list's own header (just inserted above), not the label's header
702
+ }
656
703
 
657
- lp->next_offset = dead_offset;
658
704
  DEBUG("adding dead label posting %u to head of deadlist with next_offset %u", offset, lp->next_offset);
659
- RELAY_ERROR(termhash_put_val(th, dead_term, offset));
705
+
706
+ uint32_t dead_offset = dead_plh->next_offset;
707
+ lp->next_offset = dead_offset;
708
+ dead_plh->next_offset = offset;
660
709
 
661
710
  return NO_ERROR;
662
711
  }
@@ -150,4 +150,7 @@ wp_error* wp_segment_ensure_fit(wp_segment* seg, uint32_t postings_bytes, uint32
150
150
  // private: return the size on disk of a position array
151
151
  wp_error* wp_segment_sizeof_posarray(wp_segment* seg, uint32_t num_positions, pos_t* positions, uint32_t* size) RAISES_ERROR;
152
152
 
153
+ // private: count the number of occurrences of a particular term
154
+ wp_error* wp_segment_count_term(wp_segment* seg, const char* field, const char* term, uint32_t* num_results);
155
+
153
156
  #endif
@@ -87,7 +87,7 @@ RAISING_STATIC(snippetize_from_lexer(wp_query* query, lexinfo* charpos, yyscan_t
87
87
 
88
88
  RARRAY_INIT(pword, words);
89
89
  while(yylex(*scanner) != TOK_DONE) {
90
- pword pw = { strdup(yyget_text(*scanner)), charpos->start, charpos->end };
90
+ pword pw = { .token = strdup(yyget_text(*scanner)), .start = charpos->start, .end = charpos->end };
91
91
  RARRAY_ADD(pword, words, pw);
92
92
  }
93
93
 
@@ -106,15 +106,15 @@ wp_error* termhash_bump_size(termhash *h) {
106
106
 
107
107
  // get pointers to the old locations
108
108
  term* oldkeys = TERMHASH_KEYS(h);
109
- uint32_t* oldvals = TERMHASH_VALS(h);
109
+ posting_list_header* oldvals = TERMHASH_VALS(h);
110
110
 
111
111
  // set pointers to the new locations
112
112
  uint32_t* newflags = (uint32_t*)h->boundary;
113
113
  term* newkeys = (term*)(newflags + ((new_n_buckets >> 4) + 1));
114
- uint32_t* newvals = (uint32_t*)(newkeys + new_n_buckets);
114
+ posting_list_header* newvals = (posting_list_header*)(newkeys + new_n_buckets);
115
115
 
116
116
  // move the vals and keys
117
- memmove(newvals, oldvals, h->n_buckets * sizeof(uint32_t));
117
+ memmove(newvals, oldvals, h->n_buckets * sizeof(posting_list_header));
118
118
  memmove(newkeys, oldkeys, h->n_buckets * sizeof(term));
119
119
 
120
120
  // clear the new flags
@@ -124,8 +124,7 @@ wp_error* termhash_bump_size(termhash *h) {
124
124
  for (unsigned int j = 0; j != h->n_buckets; ++j) {
125
125
  if (iseither(flagbaks, j) == 0) {
126
126
  term key = newkeys[j];
127
- uint32_t val;
128
- val = newvals[j];
127
+ posting_list_header val = newvals[j];
129
128
  set_isdel_true(flagbaks, j);
130
129
  while (1) {
131
130
  uint32_t inc, k, i;
@@ -139,7 +138,7 @@ wp_error* termhash_bump_size(termhash *h) {
139
138
  set_isempty_false(newflags, i);
140
139
  if (i < h->n_buckets && iseither(flagbaks, i) == 0) {
141
140
  { term tmp = newkeys[i]; newkeys[i] = key; key = tmp; }
142
- { uint32_t tmp = newvals[i]; newvals[i] = val; val = tmp; }
141
+ { posting_list_header tmp = newvals[i]; newvals[i] = val; val = tmp; }
143
142
  set_isdel_true(flagbaks, i);
144
143
  } else {
145
144
  newkeys[i] = key;
@@ -235,20 +234,20 @@ void termhash_del(termhash *h, uint32_t x) {
235
234
  }
236
235
  }
237
236
 
238
- uint32_t termhash_get_val(termhash* h, term t) {
239
- uint32_t* vals = TERMHASH_VALS(h);
237
+ posting_list_header* termhash_get_val(termhash* h, term t) {
238
+ posting_list_header* vals = TERMHASH_VALS(h);
240
239
  uint32_t idx = termhash_get(h, t);
241
- if(idx == h->n_buckets) return (uint32_t)-1;
242
- return vals[idx];
240
+ if(idx == h->n_buckets) return NULL;
241
+ return &vals[idx];
243
242
  }
244
243
 
245
- wp_error* termhash_put_val(termhash* h, term t, uint32_t val) {
244
+ wp_error* termhash_put_val(termhash* h, term t, posting_list_header* val) {
246
245
  int status;
247
- uint32_t* vals = TERMHASH_VALS(h);
246
+ posting_list_header* vals = TERMHASH_VALS(h);
248
247
  uint32_t loc = termhash_put(h, t, &status);
249
248
  DEBUG("put(%u,%u) has status %d and loc %u (error val is %u)", t.field_s, t.word_s, status, loc, h->n_buckets);
250
249
  if(status == -1) RAISE_ERROR("out of space in hash");
251
- vals[loc] = val;
250
+ memcpy(&vals[loc], val, sizeof(posting_list_header));
252
251
  return NO_ERROR;
253
252
  }
254
253
 
@@ -257,22 +256,22 @@ int termhash_needs_bump(termhash* h) {
257
256
  }
258
257
 
259
258
  // returns the total size in bytes
260
- // memory layout: termhash, then:
259
+ // memory layout: termhash struct, then:
261
260
  // ((n_buckets >> 4) + 1) uint32_t's for the flags
262
261
  // n_buckets terms for the keys
263
- // n_buckets uint32_t's for the vals (offsets into postings lists)
262
+ // n_buckets posting_list_header for the vals (offsets into postings lists)
264
263
  static uint32_t size(uint32_t n_buckets) {
265
264
  uint32_t size = (uint32_t)sizeof(termhash) +
266
265
  (((n_buckets >> 4) + 1) * (uint32_t)sizeof(uint32_t)) +
267
266
  (n_buckets * (uint32_t)sizeof(term)) +
268
- (n_buckets * (uint32_t)sizeof(uint32_t));
267
+ (n_buckets * (uint32_t)sizeof(posting_list_header));
269
268
 
270
269
  DEBUG("size of a termhash with %u buckets is %lu + %lu + %lu + %lu = %u",
271
270
  n_buckets,
272
271
  (long)sizeof(termhash),
273
272
  (long)(((n_buckets >> 4) + 1) * sizeof(uint32_t)),
274
273
  (long)(n_buckets * sizeof(term)),
275
- (long)(n_buckets * sizeof(uint32_t)),
274
+ (long)(n_buckets * sizeof(posting_list_header)),
276
275
  size);
277
276
 
278
277
  return size;
@@ -22,6 +22,18 @@ typedef struct term {
22
22
  uint32_t word_s;
23
23
  } term;
24
24
 
25
+ typedef struct posting_list_header {
26
+ uint32_t count;
27
+ uint32_t next_offset;
28
+ } posting_list_header;
29
+
30
+ typedef struct block_header {
31
+ uint32_t max_docid;
32
+ uint32_t next_offset;
33
+ uint32_t block_start;
34
+ uint8_t data[];
35
+ } block_header;
36
+
25
37
  #define INITIAL_N_BUCKETS_IDX 1
26
38
 
27
39
  typedef struct termhash {
@@ -31,12 +43,12 @@ typedef struct termhash {
31
43
  // in memory at this point
32
44
  // ((n_buckets >> 4) + 1) uint32_t's for the flags
33
45
  // n_buckets terms for the keys
34
- // n_buckets uint32_t's for the vals (offsets into postings lists)
46
+ // n_buckets posting_list_header for the vals
35
47
  } termhash;
36
48
 
37
49
  #define TERMHASH_FLAGS(h) ((uint32_t*)(h)->boundary)
38
50
  #define TERMHASH_KEYS(h) ((term*)((uint32_t*)(h)->boundary + (((h)->n_buckets >> 4) + 1)))
39
- #define TERMHASH_VALS(h) ((uint32_t*)(TERMHASH_KEYS(h) + (h)->n_buckets))
51
+ #define TERMHASH_VALS(h) ((posting_list_header*)(TERMHASH_KEYS(h) + (h)->n_buckets))
40
52
 
41
53
  // API methods
42
54
 
@@ -50,14 +62,14 @@ uint32_t termhash_get(termhash *h, term t);
50
62
 
51
63
  // public: get a pointer to the posting list header for a term. returns NULL if
52
64
  // the term is not in the hash.
53
- uint32_t termhash_get_val(termhash* h, term t); // convenience
65
+ posting_list_header* termhash_get_val(termhash* h, term t); // convenience
54
66
 
55
67
  // private: khash-style setter: insert a term into the hash. see the code
56
68
  // for details on what all the return values mean.
57
69
  uint32_t termhash_put(termhash* h, term t, int *ret); // khash-style
58
70
 
59
71
  // public: adds a term to the hash with the given value
60
- wp_error* termhash_put_val(termhash* h, term t, uint32_t val) RAISES_ERROR; // convenience
72
+ wp_error* termhash_put_val(termhash* h, term t, posting_list_header* val) RAISES_ERROR; // convenience
61
73
 
62
74
  // public: returns the byte size of the termhash
63
75
  uint32_t termhash_size(termhash* h);
@@ -7,7 +7,9 @@ static VALUE c_index;
7
7
  static VALUE c_entry;
8
8
  static VALUE c_query;
9
9
  static VALUE c_error;
10
- static VALUE c_parseerror;
10
+ static VALUE c_parse_error;
11
+ static VALUE c_sys_error;
12
+ static VALUE c_version_error;
11
13
 
12
14
  static void index_free(wp_index* index) {
13
15
  wp_error* e = wp_index_free(index);
@@ -20,7 +22,12 @@ static void index_free(wp_index* index) {
20
22
 
21
23
  #define RAISE_IF_NECESSARY(e) do { \
22
24
  if(e != NULL) { \
23
- VALUE exc = rb_exc_new2(c_error, e->msg); \
25
+ VALUE exc; \
26
+ switch(e->type) { \
27
+ case WP_ERROR_TYPE_SYSTEM: exc = rb_exc_new2(c_sys_error, e->msg); break; \
28
+ case WP_ERROR_TYPE_VERSION: exc = rb_exc_new2(c_version_error, e->msg); break; \
29
+ default: exc = rb_exc_new2(c_error, e->msg); break; \
30
+ } \
24
31
  wp_error_free(e); \
25
32
  rb_exc_raise(exc); \
26
33
  } \
@@ -331,7 +338,7 @@ static VALUE query_new(VALUE class, VALUE default_field, VALUE string) {
331
338
  wp_query* query;
332
339
  wp_error* e = wp_query_parse(RSTRING_PTR(string), RSTRING_PTR(default_field), &query);
333
340
  if(e != NULL) {
334
- VALUE exc = rb_exc_new2(c_parseerror, e->msg);
341
+ VALUE exc = rb_exc_new2(c_parse_error, e->msg);
335
342
  wp_error_free(e);
336
343
  rb_exc_raise(exc);
337
344
  }
@@ -614,5 +621,7 @@ void Init_whistlepig() {
614
621
  rb_define_attr(c_query, "query", 1, 0);
615
622
 
616
623
  c_error = rb_define_class_under(m_whistlepig, "Error", rb_eStandardError);
617
- c_parseerror = rb_define_class_under(m_whistlepig, "ParseError", rb_eStandardError);
624
+ c_parse_error = rb_define_class_under(m_whistlepig, "ParseError", c_error);
625
+ c_sys_error = rb_define_class_under(m_whistlepig, "SystemError", c_error);
626
+ c_version_error = rb_define_class_under(m_whistlepig, "VersionError", c_error);
618
627
  }
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: whistlepig
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.11.2
4
+ version: '0.12'
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-05-19 00:00:00.000000000 Z
12
+ date: 2012-06-09 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: Whistlepig is a minimalist realtime full-text search index. Its goal
15
15
  is to be as small and minimally-featured as possible, while still remaining useful,