whistlepig 0.11.2 → 0.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README CHANGED
@@ -8,7 +8,7 @@ full-text search without the frills, Whistlepig may be for you.
  Whistlepig is written in ANSI C99. It currently provides a C API and Ruby
  bindings.

- Latest version: 0.11.2, released 2012-05-19.
+ Latest version: 0.12, released 2012-06-09.
  Status: beta
  News: http://all-thing.net/label/whistlepig/
  Homepage: http://masanjin.net/whistlepig/
@@ -16,7 +16,7 @@ Latest version: 0.11.2, released 2012-05-19.

  = Getting it

- Tarball: http://masanjin.net/whistlepig/whistlepig-0.11.2.tar.gz
+ Tarball: http://masanjin.net/whistlepig/whistlepig-0.12.tar.gz
  Rubygem: gem install whistlepig
  Git: git clone git://github.com/wmorgan/whistlepig.git

@@ -1,9 +1,10 @@
  #include <stdlib.h>
  #include "error.h"

- wp_error* wp_error_new(const char* msg, const char* src) {
+ wp_error* wp_error_new(const char* msg, const char* src, unsigned char type) {
  wp_error* ret = malloc(sizeof(wp_error));
  ret->msg = msg;
+ ret->type = type;
  ret->size = 1;
  ret->srcs = malloc(sizeof(const char*));
  ret->srcs[0] = src;
@@ -25,8 +25,13 @@
  #include <stdio.h>
  #include <string.h>

+ #define WP_ERROR_TYPE_BASIC 1
+ #define WP_ERROR_TYPE_SYSTEM 2
+ #define WP_ERROR_TYPE_VERSION 3
+
  // pseudo-backtrace
  typedef struct wp_error {
+ unsigned char type;
  unsigned int size;
  const char* msg;
  const char** srcs;
@@ -39,25 +44,30 @@ typedef struct wp_error {
  // API methods

  // private: make a new error object with a message and source line
- wp_error* wp_error_new(const char* msg, const char* src) RAISES_ERROR;
+ wp_error* wp_error_new(const char* msg, const char* src, unsigned char type) RAISES_ERROR;
  // private: add a source line to a pre-existing error
  wp_error* wp_error_chain(wp_error* e, const char* src) RAISES_ERROR;

  // public: free an error, once handled
  void wp_error_free(wp_error* e);

- // public: raise an error with a printf-style message
- #define RAISE_ERROR(fmt, ...) do { \
+ // private: internal mechanics for raising an error
+ #define RAISE_ERROR_OF_TYPE(type, fmt, ...) do { \
  char* msg = malloc(1024); \
  char* src = malloc(1024); \
  snprintf(msg, 1024, fmt, ## __VA_ARGS__); \
  snprintf(src, 1024, "%s (%s:%d)", __PRETTY_FUNCTION__, __FILE__, __LINE__); \
- return wp_error_new(msg, src); \
+ return wp_error_new(msg, src, type); \
  } while(0)

- // public: raise an error with a printf-style message and have strerror() autoamtically
- // appended
- #define RAISE_SYSERROR(fmt, ...) RAISE_ERROR(fmt ": %s", ## __VA_ARGS__, strerror(errno))
+ // public: raise a basic error
+ #define RAISE_ERROR(fmt, ...) RAISE_ERROR_OF_TYPE(WP_ERROR_TYPE_BASIC, fmt, ## __VA_ARGS__)
+
+ // public: raise a version error
+ #define RAISE_VERSION_ERROR(fmt, ...) RAISE_ERROR_OF_TYPE(WP_ERROR_TYPE_VERSION, fmt, ## __VA_ARGS__)
+
+ // public: raise a system error with strerror() automatically appended to the message
+ #define RAISE_SYSERROR(fmt, ...) RAISE_ERROR_OF_TYPE(WP_ERROR_TYPE_SYSTEM, fmt ": %s", ## __VA_ARGS__, strerror(errno))

  // public: relay an error up the stack if the called function returns one.
  #define RELAY_ERROR(e) do { \
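
As an illustration only (hypothetical caller code, not part of the package), here is a minimal C sketch of how the typed errors introduced above might be consumed. It uses only names visible in these hunks (wp_error, e->type, WP_ERROR_TYPE_VERSION, RAISE_VERSION_ERROR, wp_error_free) and assumes NO_ERROR is the success value defined in error.h; check_version() and its version numbers are made up for the example:

    #include <stdio.h>
    #include "error.h"

    // hypothetical: raise a typed error when an on-disk version doesn't match
    static wp_error* check_version(unsigned int found, unsigned int expected) {
      if(found != expected) RAISE_VERSION_ERROR("found version %u; expecting %u", found, expected);
      return NO_ERROR;
    }

    int main(void) {
      wp_error* e = check_version(3, 4);
      if(e != NULL) {
        // the new type field lets callers distinguish version mismatches from other failures
        if(e->type == WP_ERROR_TYPE_VERSION) printf("version mismatch: %s\n", e->msg);
        else printf("error: %s\n", e->msg);
        wp_error_free(e);
      }
      return 0;
    }
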
@@ -32,7 +32,6 @@ RAISING_STATIC(release_lock(wp_index* index)) {
  return NO_ERROR;
  }

-
  RAISING_STATIC(index_info_init(index_info* ii, uint32_t index_version)) {
  ii->index_version = index_version;
  ii->num_segments = 0;
@@ -42,7 +41,7 @@ RAISING_STATIC(index_info_init(index_info* ii, uint32_t index_version)) {
  }

  RAISING_STATIC(index_info_validate(index_info* ii, uint32_t index_version)) {
- if(ii->index_version != index_version) RAISE_ERROR("index has type %u; expecting type %u", ii->index_version, index_version);
+ if(ii->index_version != index_version) RAISE_VERSION_ERROR("index has type %u; expecting type %u", ii->index_version, index_version);
  return NO_ERROR;
  }

@@ -149,6 +148,57 @@ wp_error* wp_index_setup_query(wp_index* index, wp_query* query) {
  return NO_ERROR;
  }

+ #define RESULT_BUF_SIZE 1024
+ // count the results by running the query until it stops. slow!
+ RAISING_STATIC(count_query_by_running_it(wp_index* index, wp_query* query, uint32_t* num_results)) {
+ uint64_t results[RESULT_BUF_SIZE];
+
+ *num_results = 0;
+ while(1) {
+ uint32_t this_num_results;
+ RELAY_ERROR(wp_index_run_query(index, query, RESULT_BUF_SIZE, &this_num_results, results));
+ *num_results += this_num_results;
+ if(this_num_results < RESULT_BUF_SIZE) break; // done
+ }
+
+ return NO_ERROR;
+ }
+
+ RAISING_STATIC(count_query_from_posting_list_header(wp_index* index, wp_query* query, uint32_t* num_results)) {
+ // make sure we have know about all segments (one could've been added by a writer)
+ RELAY_ERROR(grab_readlock(index));
+ RELAY_ERROR(ensure_all_segments(index));
+ RELAY_ERROR(release_lock(index));
+
+ *num_results = 0;
+ for(int i = 0; i < index->num_segments; i++) {
+ uint32_t this_num_results;
+
+ DEBUG("counting on segment %d", i);
+ wp_segment* seg = &index->segments[i];
+ RELAY_ERROR(wp_segment_grab_readlock(seg));
+ RELAY_ERROR(wp_segment_reload(seg));
+ RELAY_ERROR(wp_segment_count_term(seg, query->field, query->word, &this_num_results));
+ RELAY_ERROR(wp_segment_release_lock(seg));
+ *num_results += this_num_results;
+ DEBUG("got %d results from segment %d", this_num_results, i);
+ }
+
+ return NO_ERROR;
+ }
+
+ RAISING_STATIC(count_query(wp_index* index, wp_query* query, uint32_t* num_results)) {
+ switch(query->type) {
+ case WP_QUERY_TERM:
+ case WP_QUERY_LABEL:
+ RELAY_ERROR(count_query_from_posting_list_header(index, query, num_results));
+ break;
+ case WP_QUERY_EVERY: // TODO -- special case this
+ default:
+ RELAY_ERROR(count_query_by_running_it(index, query, num_results));
+ }
+ return NO_ERROR;
+ }
  // can be called multiple times to resume
  wp_error* wp_index_run_query(wp_index* index, wp_query* query, uint32_t max_num_results, uint32_t* num_results, uint64_t* results) {
  *num_results = 0;
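
For context, an illustrative sketch rather than code from the package: with this change, a plain term or label query is counted from the per-term posting list headers instead of by iterating every posting. A hypothetical caller, assuming an index already opened elsewhere (for example via wp_index_load from index.h) and assuming wp_query_free from the query API, might look like:

    #include <stdint.h>
    #include "whistlepig.h"

    // hypothetical helper: count matches for a single term without materializing docids
    static wp_error* count_term(wp_index* index, const char* field, const char* word,
                                uint32_t* num_results) {
      wp_query* query;
      RELAY_ERROR(wp_query_parse(word, field, &query));               // e.g. "bob" in field "body"
      RELAY_ERROR(wp_index_count_results(index, query, num_results)); // now header-based for term queries
      wp_query_free(query);                                           // assumed query cleanup call
      return NO_ERROR;
    }
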
@@ -209,20 +259,10 @@ wp_error* wp_index_run_query(wp_index* index, wp_query* query, uint32_t max_num_
  return NO_ERROR;
  }

- #define RESULT_BUF_SIZE 1024
- // count the results by just running the query until it stops. slow!
+ // just count the results, don't return them
  wp_error* wp_index_count_results(wp_index* index, wp_query* query, uint32_t* num_results) {
- uint64_t results[RESULT_BUF_SIZE];
-
- *num_results = 0;
  RELAY_ERROR(wp_index_setup_query(index, query));
- while(1) {
- uint32_t this_num_results;
- RELAY_ERROR(wp_index_run_query(index, query, RESULT_BUF_SIZE, &this_num_results, results));
- *num_results += this_num_results;
- if(this_num_results < RESULT_BUF_SIZE) break; // done
- }
-
+ RELAY_ERROR(count_query(index, query, num_results));
  RELAY_ERROR(wp_index_teardown_query(index, query));

  return NO_ERROR;
@@ -369,7 +409,7 @@ wp_error* wp_index_add_label(wp_index* index, const char* label, uint64_t doc_id
  if(doc_id > index->docid_offsets[i - 1]) {
  wp_segment* seg = &index->segments[i - 1];

- DEBUG("found doc %llu in segment %u", doc_id, i - 1);
+ DEBUG("found doc %"PRIu64" in segment %u", doc_id, i - 1);
  RELAY_ERROR(wp_segment_grab_writelock(seg));
  RELAY_ERROR(wp_segment_reload(seg));
  RELAY_ERROR(wp_segment_add_label(seg, label, (docid_t)(doc_id - index->docid_offsets[i - 1])));
@@ -377,7 +417,7 @@ wp_error* wp_index_add_label(wp_index* index, const char* label, uint64_t doc_id
  found = 1;
  break;
  }
- else DEBUG("did not find doc %llu in segment %u", doc_id, i - 1);
+ else DEBUG("did not find doc %"PRIu64" in segment %u", doc_id, i - 1);
  }

  if(!found) RAISE_ERROR("couldn't find doc id %"PRIu64, doc_id);
@@ -396,7 +436,7 @@ wp_error* wp_index_remove_label(wp_index* index, const char* label, uint64_t doc
  if(doc_id > index->docid_offsets[i - 1]) {
  wp_segment* seg = &index->segments[i - 1];

- DEBUG("found doc %llu in segment %u", doc_id, i - 1);
+ DEBUG("found doc %"PRIu64" in segment %u", doc_id, i - 1);
  RELAY_ERROR(wp_segment_grab_writelock(seg));
  RELAY_ERROR(wp_segment_reload(seg));
  RELAY_ERROR(wp_segment_remove_label(seg, label, (docid_t)(doc_id - index->docid_offsets[i - 1])));
@@ -404,7 +444,7 @@ wp_error* wp_index_remove_label(wp_index* index, const char* label, uint64_t doc
  found = 1;
  break;
  }
- else DEBUG("did not find doc %llu in segment %u", doc_id, i - 1);
+ else DEBUG("did not find doc %"PRIu64" in segment %u", doc_id, i - 1);
  }

  if(!found) RAISE_ERROR("couldn't find doc id %"PRIu64, doc_id);
@@ -14,7 +14,7 @@ wp_error* mmap_obj_create(mmap_obj* o, const char* magic, const char* pathname,
  if(o->fd == -1) RAISE_SYSERROR("cannot create %s", pathname);

  uint32_t size = initial_size + (uint32_t)sizeof(mmap_obj_header);
- DEBUG("creating %s with %u + %u = %u bytes for %s object", pathname, initial_size, sizeof(mmap_obj_header), size, magic);
+ DEBUG("creating %s with %u + %lu = %u bytes for %s object", pathname, initial_size, sizeof(mmap_obj_header), size, magic);
  lseek(o->fd, size - 1, SEEK_SET);
  ssize_t num_bytes = write(o->fd, "", 1);
  if(num_bytes == -1) RAISE_SYSERROR("write");
@@ -35,14 +35,14 @@ wp_error* mmap_obj_load(mmap_obj* o, const char* magic, const char* pathname) {
  // load header
  o->content = mmap(NULL, sizeof(mmap_obj_header), PROT_READ | PROT_WRITE, MAP_SHARED, o->fd, 0);
  if(o->content == MAP_FAILED) RAISE_SYSERROR("header mmap");
- DEBUG("loaded header of %u bytes for %s object", sizeof(mmap_obj_header), magic);
+ DEBUG("loaded header of %lu bytes for %s object", sizeof(mmap_obj_header), magic);

  RELAY_ERROR(validate(o->content, magic));

  o->loaded_size = o->content->size;

  uint32_t size = o->content->size + (uint32_t)sizeof(mmap_obj_header);
- DEBUG("full size is %u bytes (including %u-byte header)", size, sizeof(mmap_obj_header));
+ DEBUG("full size is %u bytes (including %lu-byte header)", size, sizeof(mmap_obj_header));
  if(munmap(o->content, sizeof(mmap_obj_header)) == -1) RAISE_SYSERROR("munmap");

  o->content = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, o->fd, 0);
@@ -85,7 +85,7 @@ wp_error* mmap_obj_resize(mmap_obj* o, uint32_t data_size) {
  }

  wp_error* mmap_obj_unload(mmap_obj* o) {
- DEBUG("unloading %u bytes", sizeof(mmap_obj_header) + o->content->size);
+ DEBUG("unloading %lu bytes", sizeof(mmap_obj_header) + o->content->size);
  if(munmap(o->content, sizeof(mmap_obj_header) + o->content->size) == -1) RAISE_SYSERROR("munmap");
  o->content = NULL;
  return NO_ERROR;
@@ -195,8 +195,14 @@ static wp_error* term_init_search_state(wp_query* q, wp_segment* seg) {

  t.word_s = stringmap_string_to_int(sh, sp, q->word);

- uint32_t offset = termhash_get_val(th, t);
- if(offset == (uint32_t)-1) offset = OFFSET_NONE;
+ uint32_t offset;
+ posting_list_header* plh = termhash_get_val(th, t);
+
+ DEBUG("posting list header for %s:%s (-> %u:%u) is %p", q->field, q->word, t.field_s, t.word_s, plh);
+ if(plh == NULL) offset = OFFSET_NONE;
+ else offset = plh->next_offset;
+
+ if(plh) DEBUG("posting list header has count=%u next_offset=%u", plh->count, plh->next_offset);

  if(offset == OFFSET_NONE) state->done = 1; // no entry in term hash
  else {
@@ -475,8 +481,8 @@ static wp_error* conj_next_doc(wp_query* q, wp_segment* seg, search_result* resu

  while(!found && !*done) {
  RELAY_ERROR(query_next_doc(master, seg, result, done));
- DEBUG("master reports doc %u done %d", result->doc_id, *done);
  if(!*done) {
+ DEBUG("master reports doc %u done %d", result->doc_id, *done);
  search_doc = result->doc_id;
  wp_search_result_free(result); // sigh
  RELAY_ERROR(conj_advance_to_doc(q, seg, search_doc, result, &found, done));
@@ -564,8 +570,8 @@ static wp_error* phrase_next_doc(wp_query* q, wp_segment* seg, search_result* re

  while(!found && !*done) {
  RELAY_ERROR(query_next_doc(master, seg, result, done));
- DEBUG("master reports doc %u done %d", result->doc_id, *done);
  if(!*done) {
+ DEBUG("master reports doc %u done %d", result->doc_id, *done);
  search_doc = result->doc_id;
  wp_search_result_free(result); // sigh
  RELAY_ERROR(phrase_advance_to_doc(q, seg, search_doc, result, &found, done));
@@ -784,7 +790,6 @@ static wp_error* every_advance_to_doc(wp_query* q, wp_segment* seg, docid_t doc_

  wp_error* wp_search_run_query_on_segment(struct wp_query* q, struct wp_segment* s, uint32_t max_num_results, uint32_t* num_results, search_result* results) {
  int done;
-
  *num_results = 0;

  #ifdef DEBUG
@@ -3,13 +3,16 @@
  #include <unistd.h>
  #include "whistlepig.h"

- #define POSTINGS_REGION_TYPE_IMMUTABLE_VBE 1
+ #define POSTINGS_REGION_TYPE_IMMUTABLE_VBE 1
  #define POSTINGS_REGION_TYPE_MUTABLE_NO_POSITIONS 2 // bigger, mutable

- #define SEGMENT_VERSION 3
+ #define SEGMENT_VERSION 4

  #define wp_segment_label_posting_at(posting_region, offset) ((label_posting*)(posting_region->postings + offset))

+ static posting_list_header blank_plh = { .count = 0, .next_offset = OFFSET_NONE };
+ static term dead_term = { .field_s = 0, .word_s = 0 };
+
  wp_error* wp_segment_grab_readlock(wp_segment* seg) {
  segment_info* si = MMAP_OBJ(seg->seginfo, segment_info);
  RELAY_ERROR(wp_lock_grab(&si->lock, WP_LOCK_READLOCK));
@@ -28,6 +31,23 @@ wp_error* wp_segment_release_lock(wp_segment* seg) {
  return NO_ERROR;
  }

+ wp_error* wp_segment_count_term(wp_segment* seg, const char* field, const char* word, uint32_t* num_results) {
+ stringmap* sh = MMAP_OBJ(seg->stringmap, stringmap);
+ stringpool* sp = MMAP_OBJ(seg->stringpool, stringpool);
+ termhash* th = MMAP_OBJ(seg->termhash, termhash);
+
+ term t;
+ if(field == NULL) t.field_s = 0; // label sentinel
+ else t.field_s = stringmap_string_to_int(sh, sp, field);
+ t.word_s = stringmap_string_to_int(sh, sp, word);
+
+ posting_list_header* plh = termhash_get_val(th, t);
+ if(plh == NULL) *num_results = 0;
+ else *num_results = plh->count;
+
+ return NO_ERROR;
+ }
+
  static void postings_region_init(postings_region* pr, uint32_t initial_size, uint32_t postings_type_and_flags) {
  pr->postings_type_and_flags = postings_type_and_flags;
  pr->num_postings = 0;
@@ -44,7 +64,7 @@ RAISING_STATIC(segment_info_init(segment_info* si, uint32_t segment_version)) {
  }

  RAISING_STATIC(segment_info_validate(segment_info* si, uint32_t segment_version)) {
- if(si->segment_version != segment_version) RAISE_ERROR("segment has type %u; expecting type %u", si->segment_version, segment_version);
+ if(si->segment_version != segment_version) RAISE_VERSION_ERROR("segment has type %u; expecting type %u", si->segment_version, segment_version);
  return NO_ERROR;
  }

@@ -449,7 +469,7 @@ wp_error* wp_segment_add_posting(wp_segment* s, const char* field, const char* w
  RELAY_ERROR(bump_stringpool(s, &success));
  RELAY_ERROR(bump_termhash(s, &success));

- DEBUG("adding posting for %s:%s and doc %u", field, word, doc_id);
+ DEBUG("adding posting for %s:%s and doc %u with %u positions", field, word, doc_id, num_positions);

  postings_region* pr = MMAP_OBJ(s->postings, postings_region);
  stringmap* sh = MMAP_OBJ(s->stringmap, stringmap);
@@ -461,26 +481,38 @@ wp_error* wp_segment_add_posting(wp_segment* s, const char* field, const char* w
  RELAY_ERROR(stringmap_add(sh, sp, field, &t.field_s));
  RELAY_ERROR(stringmap_add(sh, sp, word, &t.word_s));

+ DEBUG("%s:%s maps to %u:%u", field, word, t.field_s, t.word_s);
+
  // find the offset of the next posting
+ posting_list_header* plh = termhash_get_val(th, t);
+ if(plh == NULL) {
+ RELAY_ERROR(termhash_put_val(th, t, &blank_plh));
+ plh = termhash_get_val(th, t);
+ }
+ DEBUG("posting list header for %s:%s is at %p", field, word, plh);
+
  posting po;
- uint32_t next_offset = termhash_get_val(th, t);
- if(next_offset == (uint32_t)-1) next_offset = OFFSET_NONE;
- if(next_offset != OFFSET_NONE) { // TODO remove this check for speed once happy
+ uint32_t next_offset = plh->next_offset;
+
+ if(next_offset != OFFSET_NONE) { // TODO remove this check for speed once happy [PERFORMANCE]
  RELAY_ERROR(wp_segment_read_posting(s, next_offset, &po, 0));
  if(po.doc_id >= doc_id) RAISE_ERROR("cannot add a doc_id out of sorted order");
  }

  // write the entry to the postings region
  uint32_t entry_offset = pr->postings_head;
- //DEBUG("entry will be at offset %u, prev offset is %u and next offset is %u", entry_offset, prev_offset, next_offset);
+ DEBUG("writing posting at offset %u. next offset is %u.", entry_offset, next_offset);
+
  po.doc_id = doc_id;
  po.next_offset = next_offset;
  po.num_positions = num_positions;
  RELAY_ERROR(write_posting(s, &po, positions)); // prev_docid is 0 for th
- DEBUG("postings list head now at %u", pr->postings_head);
+ DEBUG("posting list head now at %u", pr->postings_head);

  // really finally, update the tail pointer so that readers can access this posting
- RELAY_ERROR(termhash_put_val(th, t, entry_offset));
+ plh->count++;
+ plh->next_offset = entry_offset;
+ DEBUG("posting list header for %s:%s now reads count=%u offset=%u", field, word, plh->count, plh->next_offset);

  return NO_ERROR;
  }
@@ -537,22 +569,25 @@ wp_error* wp_segment_add_label(wp_segment* s, const char* label, docid_t doc_id)

  // find the previous and next label postings, between which we'll insert this
  // posting
- uint32_t prev_offset = OFFSET_NONE;
- uint32_t next_offset = termhash_get_val(th, t);
+ posting_list_header* plh = termhash_get_val(th, t);
+ if(plh == NULL) {
+ RELAY_ERROR(termhash_put_val(th, t, &blank_plh));
+ plh = termhash_get_val(th, t);
+ }
+
+ uint32_t next_offset = plh->next_offset;
  docid_t last_docid = DOCID_NONE;
+ uint32_t prev_offset = OFFSET_NONE;

- if(next_offset == (uint32_t)-1) next_offset = OFFSET_NONE;
  DEBUG("start offset is %u (none is %u)", next_offset, OFFSET_NONE);

  while(next_offset != OFFSET_NONE) {
  label_posting* lp = wp_segment_label_posting_at(pr, next_offset);

- if((last_docid != DOCID_NONE) && (lp->doc_id >= last_docid)) {
+ if((last_docid != DOCID_NONE) && (lp->doc_id >= last_docid))
  RAISE_ERROR("whistlepig index corruption! lp %u has docid %u but last docid at lp %u was %u", next_offset, lp->doc_id, prev_offset, last_docid);
- }
- else {
+ else
  last_docid = lp->doc_id;
- }

  DEBUG("got doc id %u next_offset %u at offset %u (looking for doc id %u)", lp->doc_id, lp->next_offset, next_offset, doc_id);
  if(lp->doc_id == doc_id) {
@@ -567,18 +602,23 @@ wp_error* wp_segment_add_label(wp_segment* s, const char* label, docid_t doc_id)
  // find a space for the posting by first checking for a free postings in the
  // dead list. the dead list is the list stored under the sentinel term with
  // field 0 and word 0.
- term dead_term = { .field_s = 0, .word_s = 0 };
+ posting_list_header* dead_plh = termhash_get_val(th, dead_term);
+ if(dead_plh == NULL) {
+ RELAY_ERROR(termhash_put_val(th, dead_term, &blank_plh));
+ dead_plh = termhash_get_val(th, t);
+ }
+
  uint32_t entry_offset;
- uint32_t dead_offset = termhash_get_val(th, dead_term);
- if(dead_offset == (uint32_t)-1) dead_offset = OFFSET_NONE;
+ uint32_t dead_offset = dead_plh->next_offset;

  if(dead_offset == OFFSET_NONE) { // make a new posting
  entry_offset = pr->postings_head;
  }
  else { // we'll use this one; remove it from the linked list
  DEBUG("offset from dead list is %u, using it for the new posting!", dead_offset);
- entry_offset = dead_offset;
- RELAY_ERROR(termhash_put_val(th, dead_term, wp_segment_label_posting_at(pr, dead_offset)->next_offset));
+ entry_offset = dead_plh->next_offset;
+ dead_plh->next_offset = wp_segment_label_posting_at(pr, dead_offset)->next_offset;
+ dead_plh->count--;
  }

  // finally, write the entry to the label postings region
@@ -588,11 +628,12 @@ wp_error* wp_segment_add_label(wp_segment* s, const char* label, docid_t doc_id)
  po->next_offset = next_offset;

  pr->postings_head += (uint32_t)sizeof(label_posting);
- DEBUG("label postings list head now at %u", pr->postings_head);
+ DEBUG("label posting list head now at %u", pr->postings_head);

  // really finally, update either the previous offset or the tail pointer
  // for this label so that readers can access this posting
- if(prev_offset == OFFSET_NONE) RELAY_ERROR(termhash_put_val(th, t, entry_offset));
+ plh->count++;
+ if(prev_offset == OFFSET_NONE) plh->next_offset = entry_offset;
  else wp_segment_label_posting_at(pr, prev_offset)->next_offset = entry_offset;

  return NO_ERROR;
@@ -615,13 +656,16 @@ wp_error* wp_segment_remove_label(wp_segment* s, const char* label, docid_t doc_
  t.word_s = stringmap_string_to_int(sh, sp, label); // will be -1 if not there

  // find the posting and the previous posting in the list, if any
- uint32_t prev_offset = OFFSET_NONE;
- uint32_t offset = termhash_get_val(th, t);
  docid_t last_docid = DOCID_NONE;
+ uint32_t prev_offset = OFFSET_NONE;
+ posting_list_header* plh = termhash_get_val(th, t);
+ if(plh == NULL) {
+ DEBUG("no such label %s", label);
+ return NO_ERROR;
+ }

- if(offset == (uint32_t)-1) offset = OFFSET_NONE;
+ uint32_t offset = plh->next_offset;
  label_posting* lp = NULL;
-
  while(offset != OFFSET_NONE) {
  lp = wp_segment_label_posting_at(pr, offset);

@@ -646,17 +690,22 @@ wp_error* wp_segment_remove_label(wp_segment* s, const char* label, docid_t doc_
  }

  // we've found the posting; now remove it from the list
- if(prev_offset == OFFSET_NONE) RELAY_ERROR(termhash_put_val(th, t, lp->next_offset));
+ if(prev_offset == OFFSET_NONE) plh->next_offset = lp->next_offset;
  else wp_segment_label_posting_at(pr, prev_offset)->next_offset = lp->next_offset;
+ plh->count--;

  // now add it to the dead list for later reclamation
- term dead_term = { .field_s = 0, .word_s = 0 };
- uint32_t dead_offset = termhash_get_val(th, dead_term);
- if(dead_offset == (uint32_t)-1) dead_offset = OFFSET_NONE;
+ posting_list_header* dead_plh = termhash_get_val(th, dead_term);
+ if(dead_plh == NULL) {
+ RELAY_ERROR(termhash_put_val(th, dead_term, &blank_plh));
+ dead_plh = termhash_get_val(th, t);
+ }

- lp->next_offset = dead_offset;
  DEBUG("adding dead label posting %u to head of deadlist with next_offset %u", offset, lp->next_offset);
- RELAY_ERROR(termhash_put_val(th, dead_term, offset));
+
+ uint32_t dead_offset = dead_plh->next_offset;
+ lp->next_offset = dead_offset;
+ dead_plh->next_offset = offset;

  return NO_ERROR;
  }
@@ -150,4 +150,7 @@ wp_error* wp_segment_ensure_fit(wp_segment* seg, uint32_t postings_bytes, uint32
  // private: return the size on disk of a position array
  wp_error* wp_segment_sizeof_posarray(wp_segment* seg, uint32_t num_positions, pos_t* positions, uint32_t* size) RAISES_ERROR;

+ // private: count the number of occurences of a particular term
+ wp_error* wp_segment_count_term(wp_segment* seg, const char* field, const char* term, uint32_t* num_results);
+
  #endif
@@ -87,7 +87,7 @@ RAISING_STATIC(snippetize_from_lexer(wp_query* query, lexinfo* charpos, yyscan_t

  RARRAY_INIT(pword, words);
  while(yylex(*scanner) != TOK_DONE) {
- pword pw = { strdup(yyget_text(*scanner)), charpos->start, charpos->end };
+ pword pw = { .token = strdup(yyget_text(*scanner)), .start = charpos->start, .end = charpos->end };
  RARRAY_ADD(pword, words, pw);
  }

@@ -106,15 +106,15 @@ wp_error* termhash_bump_size(termhash *h) {

  // get pointers to the old locations
  term* oldkeys = TERMHASH_KEYS(h);
- uint32_t* oldvals = TERMHASH_VALS(h);
+ posting_list_header* oldvals = TERMHASH_VALS(h);

  // set pointers to the new locations
  uint32_t* newflags = (uint32_t*)h->boundary;
  term* newkeys = (term*)(newflags + ((new_n_buckets >> 4) + 1));
- uint32_t* newvals = (uint32_t*)(newkeys + new_n_buckets);
+ posting_list_header* newvals = (posting_list_header*)(newkeys + new_n_buckets);

  // move the vals and keys
- memmove(newvals, oldvals, h->n_buckets * sizeof(uint32_t));
+ memmove(newvals, oldvals, h->n_buckets * sizeof(posting_list_header));
  memmove(newkeys, oldkeys, h->n_buckets * sizeof(term));

  // clear the new flags
@@ -124,8 +124,7 @@ wp_error* termhash_bump_size(termhash *h) {
  for (unsigned int j = 0; j != h->n_buckets; ++j) {
  if (iseither(flagbaks, j) == 0) {
  term key = newkeys[j];
- uint32_t val;
- val = newvals[j];
+ posting_list_header val = newvals[j];
  set_isdel_true(flagbaks, j);
  while (1) {
  uint32_t inc, k, i;
@@ -139,7 +138,7 @@ wp_error* termhash_bump_size(termhash *h) {
  set_isempty_false(newflags, i);
  if (i < h->n_buckets && iseither(flagbaks, i) == 0) {
  { term tmp = newkeys[i]; newkeys[i] = key; key = tmp; }
- { uint32_t tmp = newvals[i]; newvals[i] = val; val = tmp; }
+ { posting_list_header tmp = newvals[i]; newvals[i] = val; val = tmp; }
  set_isdel_true(flagbaks, i);
  } else {
  newkeys[i] = key;
@@ -235,20 +234,20 @@ void termhash_del(termhash *h, uint32_t x) {
  }
  }

- uint32_t termhash_get_val(termhash* h, term t) {
- uint32_t* vals = TERMHASH_VALS(h);
+ posting_list_header* termhash_get_val(termhash* h, term t) {
+ posting_list_header* vals = TERMHASH_VALS(h);
  uint32_t idx = termhash_get(h, t);
- if(idx == h->n_buckets) return (uint32_t)-1;
- return vals[idx];
+ if(idx == h->n_buckets) return NULL;
+ return &vals[idx];
  }

- wp_error* termhash_put_val(termhash* h, term t, uint32_t val) {
+ wp_error* termhash_put_val(termhash* h, term t, posting_list_header* val) {
  int status;
- uint32_t* vals = TERMHASH_VALS(h);
+ posting_list_header* vals = TERMHASH_VALS(h);
  uint32_t loc = termhash_put(h, t, &status);
  DEBUG("put(%u,%u) has status %d and loc %u (error val is %u)", t.field_s, t.word_s, status, loc, h->n_buckets);
  if(status == -1) RAISE_ERROR("out of space in hash");
- vals[loc] = val;
+ memcpy(&vals[loc], val, sizeof(posting_list_header));
  return NO_ERROR;
  }

@@ -257,22 +256,22 @@ int termhash_needs_bump(termhash* h) {
  }

  // returns the total size in bytes
- // memory layout: termhash, then:
+ // memory layout: termhash struct, then:
  // ((n_buckets >> 4) + 1) uint32_t's for the flags
  // n_buckets terms for the keys
- // n_buckets uint32_t's for the vals (offsets into postings lists)
+ // n_buckets posting_list_header for the vals (offsets into postings lists)
  static uint32_t size(uint32_t n_buckets) {
  uint32_t size = (uint32_t)sizeof(termhash) +
  (((n_buckets >> 4) + 1) * (uint32_t)sizeof(uint32_t)) +
  (n_buckets * (uint32_t)sizeof(term)) +
- (n_buckets * (uint32_t)sizeof(uint32_t));
+ (n_buckets * (uint32_t)sizeof(posting_list_header));

  DEBUG("size of a termhash with %u buckets is %lu + %lu + %lu + %lu = %u",
  n_buckets,
  (long)sizeof(termhash),
  (long)(((n_buckets >> 4) + 1) * sizeof(uint32_t)),
  (long)(n_buckets * sizeof(term)),
- (long)(n_buckets * sizeof(uint32_t)),
+ (long)(n_buckets * sizeof(posting_list_header)),
  size);

  return size;
@@ -22,6 +22,18 @@ typedef struct term {
  uint32_t word_s;
  } term;

+ typedef struct posting_list_header {
+ uint32_t count;
+ uint32_t next_offset;
+ } posting_list_header;
+
+ typedef struct block_header {
+ uint32_t max_docid;
+ uint32_t next_offset;
+ uint32_t block_start;
+ uint8_t data[];
+ } block_header;
+
  #define INITIAL_N_BUCKETS_IDX 1

  typedef struct termhash {
@@ -31,12 +43,12 @@ typedef struct termhash {
  // in memory at this point
  // ((n_buckets >> 4) + 1) uint32_t's for the flags
  // n_buckets terms for the keys
- // n_buckets uint32_t's for the vals (offsets into postings lists)
+ // n_buckets posting_list_header for the vals
  } termhash;

  #define TERMHASH_FLAGS(h) ((uint32_t*)(h)->boundary)
  #define TERMHASH_KEYS(h) ((term*)((uint32_t*)(h)->boundary + (((h)->n_buckets >> 4) + 1)))
- #define TERMHASH_VALS(h) ((uint32_t*)(TERMHASH_KEYS(h) + (h)->n_buckets))
+ #define TERMHASH_VALS(h) ((posting_list_header*)(TERMHASH_KEYS(h) + (h)->n_buckets))

  // API methods

@@ -50,14 +62,14 @@ uint32_t termhash_get(termhash *h, term t);

  // public: get an int given a term. returns (uint32_t)-1 if the term is not in
  // the hash.
- uint32_t termhash_get_val(termhash* h, term t); // convenience
+ posting_list_header* termhash_get_val(termhash* h, term t); // convenience

  // private: khash-style setter: insert a term into the hash. see the code
  // for details on what all the return values mean.
  uint32_t termhash_put(termhash* h, term t, int *ret); // khash-style

  // public: adds a term to the hash with the given value
- wp_error* termhash_put_val(termhash* h, term t, uint32_t val) RAISES_ERROR; // convenience
+ wp_error* termhash_put_val(termhash* h, term t, posting_list_header* val) RAISES_ERROR; // convenience

  // public: returns the byte size of the termhash
  uint32_t termhash_size(termhash* h);
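
To illustrate the new value type, a hypothetical fragment that mirrors the idiom in segment.c above rather than code from the package; OFFSET_NONE and the interned field_s/word_s ids are assumed to come from the surrounding whistlepig headers:

    #include "whistlepig.h"

    // hypothetical sketch: prepend a posting at new_entry_offset to a term's list,
    // using the in-place posting_list_header values the termhash now stores.
    static wp_error* prepend_posting(termhash* th, uint32_t field_s, uint32_t word_s,
                                     uint32_t new_entry_offset) {
      term t = { .field_s = field_s, .word_s = word_s };

      posting_list_header* plh = termhash_get_val(th, t);     // NULL if the term is absent
      if(plh == NULL) {
        posting_list_header blank = { .count = 0, .next_offset = OFFSET_NONE };
        RELAY_ERROR(termhash_put_val(th, t, &blank));          // insert an empty list...
        plh = termhash_get_val(th, t);                         // ...then point into the hash
      }

      plh->count++;                                            // per-term count (read by wp_segment_count_term)
      plh->next_offset = new_entry_offset;                     // new head of the posting list
      return NO_ERROR;
    }
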
@@ -7,7 +7,9 @@ static VALUE c_index;
  static VALUE c_entry;
  static VALUE c_query;
  static VALUE c_error;
- static VALUE c_parseerror;
+ static VALUE c_parse_error;
+ static VALUE c_sys_error;
+ static VALUE c_version_error;

  static void index_free(wp_index* index) {
  wp_error* e = wp_index_free(index);
@@ -20,7 +22,12 @@ static void index_free(wp_index* index) {

  #define RAISE_IF_NECESSARY(e) do { \
  if(e != NULL) { \
- VALUE exc = rb_exc_new2(c_error, e->msg); \
+ VALUE exc; \
+ switch(e->type) { \
+ case WP_ERROR_TYPE_SYSTEM: exc = rb_exc_new2(c_sys_error, e->msg); break; \
+ case WP_ERROR_TYPE_VERSION: exc = rb_exc_new2(c_version_error, e->msg); break; \
+ default: exc = rb_exc_new2(c_error, e->msg); break; \
+ } \
  wp_error_free(e); \
  rb_exc_raise(exc); \
  } \
@@ -331,7 +338,7 @@ static VALUE query_new(VALUE class, VALUE default_field, VALUE string) {
  wp_query* query;
  wp_error* e = wp_query_parse(RSTRING_PTR(string), RSTRING_PTR(default_field), &query);
  if(e != NULL) {
- VALUE exc = rb_exc_new2(c_parseerror, e->msg);
+ VALUE exc = rb_exc_new2(c_parse_error, e->msg);
  wp_error_free(e);
  rb_exc_raise(exc);
  }
@@ -614,5 +621,7 @@ void Init_whistlepig() {
  rb_define_attr(c_query, "query", 1, 0);

  c_error = rb_define_class_under(m_whistlepig, "Error", rb_eStandardError);
- c_parseerror = rb_define_class_under(m_whistlepig, "ParseError", rb_eStandardError);
+ c_parse_error = rb_define_class_under(m_whistlepig, "ParseError", c_error);
+ c_sys_error = rb_define_class_under(m_whistlepig, "SystemError", c_error);
+ c_version_error = rb_define_class_under(m_whistlepig, "VersionError", c_error);
  }
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: whistlepig
  version: !ruby/object:Gem::Version
- version: 0.11.2
+ version: '0.12'
  prerelease:
  platform: ruby
  authors:
@@ -9,7 +9,7 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2012-05-19 00:00:00.000000000 Z
+ date: 2012-06-09 00:00:00.000000000 Z
  dependencies: []
  description: Whistlepig is a minimalist realtime full-text search index. Its goal
  is to be as small and minimally-featured as possible, while still remaining useful,