whistlepig 0.7 → 0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README CHANGED
@@ -8,8 +8,8 @@ the frills, Whistlepig may be for you.
8
8
  Whistlepig is written in ANSI C99. It currently provides a C API and Ruby
9
9
  bindings.
10
10
 
11
- Latest version: 0.7, released 2011-07-19.
12
- Status: alpha
11
+ Latest version: 0.8, released 2012-03-13.
12
+ Status: beta
13
13
  News: http://all-thing.net/label/whistlepig/
14
14
  Homepage: http://masanjin.net/whistlepig/
15
15
  Bug reports: http://github.com/wmorgan/whistlepig/issues
@@ -0,0 +1,65 @@
1
+ #include <stdio.h>
2
+ #include "whistlepig.h"
3
+
4
// Bucket-flag accessors for the term hash. Each bucket i owns two adjacent
// bits in the flag array (16 buckets per flag word): the upper bit of the
// pair means "empty", the lower bit means "deleted". Arguments are fully
// parenthesized so that expressions such as `a | b` can be passed safely.
#define isempty(flag, i) (((flag)[(i) >> 4] >> (((i) & 0xfU) << 1)) & 2)
#define isdel(flag, i) (((flag)[(i) >> 4] >> (((i) & 0xfU) << 1)) & 1)
// Address of the pooled key referenced by slot i of h.
#define KEY(h, i) (&((h)->pool[(h)->keys[(i)]]))
7
+
8
+ RAISING_STATIC(dump_posting_list(wp_segment* s, uint32_t offset)) {
9
+ posting po;
10
+
11
+ while(offset != OFFSET_NONE) {
12
+ RELAY_ERROR(wp_segment_read_posting(s, offset, &po, 1));
13
+
14
+ printf(" @%u doc %u:", offset, po.doc_id);
15
+ for(uint32_t i = 0; i < po.num_positions; i++) {
16
+ printf(" %d", po.positions[i]);
17
+ }
18
+ printf("\n");
19
+
20
+ offset = po.next_offset;
21
+ free(po.positions);
22
+ }
23
+
24
+ return NO_ERROR;
25
+ }
26
+
27
+ RAISING_STATIC(dump(wp_segment* segment)) {
28
+ termhash* th = MMAP_OBJ(segment->termhash, termhash);
29
+ stringmap* sh = MMAP_OBJ(segment->stringmap, stringmap);
30
+
31
+ for(uint32_t i = 0; i < th->n_buckets; i++) {
32
+ if(isempty(th->flags, i)); // do nothing
33
+ else if(isdel(th->flags, i)) printf("%u: [deleted]", i);
34
+ else {
35
+ term t = th->keys[i];
36
+ const char* field = stringmap_int_to_string(sh, t.field_s);
37
+ const char* word = stringmap_int_to_string(sh, t.word_s);
38
+ printf("%u: %s:'%s'\n", i, field, word);
39
+ RELAY_ERROR(dump_posting_list(segment, th->vals[i]));
40
+ }
41
+ }
42
+
43
+ return NO_ERROR;
44
+ }
45
+
46
+ int main(int argc, char* argv[]) {
47
+ if(argc != 2) {
48
+ fprintf(stderr, "Usage: %s <segment filename>\n", argv[0]);
49
+ return -1;
50
+ }
51
+
52
+ wp_index* index;
53
+ DIE_IF_ERROR(wp_index_load(&index, argv[1]));
54
+ DIE_IF_ERROR(wp_index_dumpinfo(index, stdout));
55
+
56
+ for(int i = 0; i < index->num_segments; i++) {
57
+ printf("\nsegment %d details:\n", i);
58
+ DIE_IF_ERROR(dump(&index->segments[i]));
59
+ }
60
+
61
+ DIE_IF_ERROR(wp_index_unload(index));
62
+
63
+ return 0;
64
+ }
65
+
@@ -3,4 +3,4 @@ require 'mkmf'
3
3
  $CFLAGS = "-g -O3 -std=c99 $(cflags) -D_ANSI_SOURCE"
4
4
 
5
5
  create_header
6
- create_makefile "whistlepigc"
6
+ create_makefile "whistlepig"
@@ -12,7 +12,17 @@ static wp_query* wp_query_new() {
12
12
  return ret;
13
13
  }
14
14
 
15
// Substituter that keeps every word as it is: returns a freshly strdup'ed
// copy of `word`, or NULL when there is no word. The field is ignored.
static const char* identity(const char* field, const char* word) {
  (void)field; // not needed to copy the word
  return word ? strdup(word) : NULL;
}
20
+
15
21
// public: deep clone of a query. Implemented as a substitution whose
// substituter hands back a copy of each word unchanged.
wp_query* wp_query_clone(wp_query* other) {
  wp_query* copy = wp_query_substitute(other, identity);
  return copy;
}
24
+
25
+ wp_query* wp_query_substitute(wp_query* other, const char *(*substituter)(const char* field, const char* word)) {
16
26
  wp_query* ret = malloc(sizeof(wp_query));
17
27
  ret->type = other->type;
18
28
  ret->num_children = other->num_children;
@@ -21,12 +31,12 @@ wp_query* wp_query_clone(wp_query* other) {
21
31
  if(other->field) ret->field = strdup(other->field);
22
32
  else ret->field = NULL;
23
33
 
24
- if(other->word) ret->word = strdup(other->word);
34
+ if(other->field && other->word) ret->word = substituter(other->field, other->word);
25
35
  else ret->word = NULL;
26
36
 
27
37
  ret->children = ret->next = ret->last = NULL; // set below
28
38
  for(wp_query* child = other->children; child != NULL; child = child->next) {
29
- wp_query* clone = wp_query_clone(child);
39
+ wp_query* clone = wp_query_substitute(child, substituter);
30
40
  if(ret->last == NULL) ret->children = ret->last = clone;
31
41
  else {
32
42
  ret->last->next = clone;
@@ -64,9 +64,12 @@ wp_query* wp_query_new_empty();
64
64
  // public: make an every-document query node.
65
65
  wp_query* wp_query_new_every();
66
66
 
67
- // public: deep clone of a query, but dropping all search state.
67
+ // public: deep clone of a query, dropping all search state.
68
68
  wp_query* wp_query_clone(wp_query* other);
69
69
 
70
+ // public: build a new query by substituting words from the old query, dropping all search state
71
+ wp_query* wp_query_substitute(wp_query* other, const char *(*substituter)(const char* field, const char* word));
72
+
70
73
  // public: add a query node as a child of another
71
74
  wp_query* wp_query_add(wp_query* a, wp_query* b);
72
75
 
@@ -793,6 +793,7 @@ wp_error* wp_search_run_query_on_segment(struct wp_query* q, struct wp_segment*
793
793
  #endif
794
794
 
795
795
  while(*num_results < max_num_results) {
796
+ DEBUG("got %d results so far (max is %d)", *num_results, max_num_results);
796
797
  RELAY_ERROR(query_next_doc(q, s, &results[*num_results], &done));
797
798
  if(done) break;
798
799
  DEBUG("got result %u (%u doc matches)", results[*num_results].doc_id, results[*num_results].num_doc_matches);
@@ -396,6 +396,9 @@ wp_error* wp_segment_read_posting(wp_segment* s, uint32_t offset, posting* po, i
396
396
  wp_error* wp_segment_add_posting(wp_segment* s, const char* field, const char* word, docid_t doc_id, uint32_t num_positions, pos_t positions[]) {
397
397
  // TODO move this logic up to ensure_fit()
398
398
  int success;
399
+
400
+ if(doc_id == 0) RAISE_ERROR("can't add a label to doc 0");
401
+
399
402
  RELAY_ERROR(bump_stringmap(s, &success));
400
403
  RELAY_ERROR(bump_stringpool(s, &success));
401
404
  RELAY_ERROR(bump_termhash(s, &success));
@@ -465,11 +468,14 @@ wp_error* wp_segment_read_label(wp_segment* s, uint32_t offset, posting* po) {
465
468
  wp_error* wp_segment_add_label(wp_segment* s, const char* label, docid_t doc_id) {
466
469
  // TODO move this logic up to ensure_fit()
467
470
  int success;
471
+
472
+ if(doc_id == 0) RAISE_ERROR("can't add a label to doc 0");
473
+
468
474
  RELAY_ERROR(bump_stringmap(s, &success));
469
475
  RELAY_ERROR(bump_stringpool(s, &success));
470
476
  RELAY_ERROR(bump_termhash(s, &success));
471
477
 
472
- DEBUG("adding label %s to doc %u", label, doc_id);
478
+ DEBUG("adding label '%s' to doc %u", label, doc_id);
473
479
 
474
480
  postings_region* pr = MMAP_OBJ(s->labels, postings_region);
475
481
  stringmap* sh = MMAP_OBJ(s->stringmap, stringmap);
@@ -485,22 +491,34 @@ wp_error* wp_segment_add_label(wp_segment* s, const char* label, docid_t doc_id)
485
491
  // posting
486
492
  uint32_t prev_offset = OFFSET_NONE;
487
493
  uint32_t next_offset = termhash_get_val(th, t);
494
+ docid_t last_docid = DOCID_NONE;
495
+
488
496
  if(next_offset == (uint32_t)-1) next_offset = OFFSET_NONE;
497
+ DEBUG("start offset is %u (none is %u)", next_offset, OFFSET_NONE);
489
498
 
490
499
  while(next_offset != OFFSET_NONE) {
491
- label_posting* po = wp_segment_label_posting_at(pr, next_offset);
492
- if(po->doc_id == doc_id) {
500
+ label_posting* lp = wp_segment_label_posting_at(pr, next_offset);
501
+
502
+ if((last_docid != DOCID_NONE) && (lp->doc_id >= last_docid)) {
503
+ RAISE_ERROR("whistlepig index corruption! lp %u has docid %u but last docid at lp %u was %u", next_offset, lp->doc_id, prev_offset, last_docid);
504
+ }
505
+ else {
506
+ last_docid = lp->doc_id;
507
+ }
508
+
509
+ DEBUG("got doc id %u next_offset %u at offset %u (looking for doc id %u)", lp->doc_id, lp->next_offset, next_offset, doc_id);
510
+ if(lp->doc_id == doc_id) {
493
511
  DEBUG("already have label '%s' for doc %u; returning", label, doc_id);
494
512
  return NO_ERROR;
495
513
  }
496
- else if(po->doc_id < doc_id) break;
514
+ else if(lp->doc_id < doc_id) break;
497
515
  prev_offset = next_offset;
498
- next_offset = po->next_offset;
516
+ next_offset = lp->next_offset;
499
517
  }
500
518
 
501
519
  // find a space for the posting by first checking for a free postings in the
502
- // dead list. the dead list is the list stored under the sentinel term
503
- // with field 0 and word 0.
520
+ // dead list. the dead list is the list stored under the sentinel term with
521
+ // field 0 and word 0.
504
522
  term dead_term = { .field_s = 0, .word_s = 0 };
505
523
  uint32_t entry_offset;
506
524
  uint32_t dead_offset = termhash_get_val(th, dead_term);
@@ -550,11 +568,21 @@ wp_error* wp_segment_remove_label(wp_segment* s, const char* label, docid_t doc_
550
568
  // find the posting and the previous posting in the list, if any
551
569
  uint32_t prev_offset = OFFSET_NONE;
552
570
  uint32_t offset = termhash_get_val(th, t);
571
+ docid_t last_docid = DOCID_NONE;
572
+
553
573
  if(offset == (uint32_t)-1) offset = OFFSET_NONE;
554
574
  label_posting* lp = NULL;
555
575
 
556
576
  while(offset != OFFSET_NONE) {
557
577
  lp = wp_segment_label_posting_at(pr, offset);
578
+
579
+ if((last_docid != DOCID_NONE) && (lp->doc_id >= last_docid)) {
580
+ RAISE_ERROR("whistlepig index corruption! lp %u has docid %u but last docid at lp %u was %u", offset, lp->doc_id, prev_offset, last_docid);
581
+ }
582
+ else {
583
+ last_docid = lp->doc_id;
584
+ }
585
+
558
586
  if(lp->doc_id < doc_id) offset = OFFSET_NONE; // nasty hack to induce failure
559
587
  if(lp->doc_id <= doc_id) break;
560
588
  prev_offset = offset;
@@ -0,0 +1,404 @@
1
+ #include "test.h"
2
+ #include "segment.h"
3
+ #include "tokenizer.lex.h"
4
+ #include "query.h"
5
+ #include "index.h"
6
+
7
+ #define SEGMENT_PATH "/tmp/segment-test"
8
+
9
+ wp_error* setup(wp_segment* segment) {
10
+ RELAY_ERROR(wp_segment_delete(SEGMENT_PATH));
11
+ RELAY_ERROR(wp_segment_create(segment, SEGMENT_PATH));
12
+ return NO_ERROR;
13
+ }
14
+
15
// Adds one posting for `word` in field "body" at position `pos` to the
// current document, after making sure the segment has room for it.
// Uses these names from the expansion site: segment, doc_id, positions,
// postings_bytes and success. Wrapped in do { } while(0) so that the macro
// behaves as a single statement wherever it is used.
#define ADD_DOC(word, pos) do { \
  positions[0] = (pos); \
  RELAY_ERROR(wp_segment_ensure_fit(segment, postings_bytes, 0, &success)); \
  if(success != 1) RAISE_ERROR("couldn't ensure segment fit"); \
  RELAY_ERROR(wp_segment_add_posting(segment, "body", (word), doc_id, 1, positions)); \
} while(0)
20
+
21
+ wp_error* add_docs(wp_segment* segment) {
22
+ docid_t doc_id;
23
+ pos_t positions[10];
24
+ uint32_t postings_bytes;
25
+ int success;
26
+
27
+ RELAY_ERROR(wp_segment_sizeof_posarray(segment, 1, NULL, &postings_bytes));
28
+
29
+ RELAY_ERROR(wp_segment_grab_docid(segment, &doc_id));
30
+ ADD_DOC("one", 0);
31
+ ADD_DOC("two", 1);
32
+ ADD_DOC("three", 2);
33
+
34
+ RELAY_ERROR(wp_segment_grab_docid(segment, &doc_id));
35
+ ADD_DOC("two", 0);
36
+ ADD_DOC("three", 1);
37
+ ADD_DOC("four", 2);
38
+
39
+ RELAY_ERROR(wp_segment_grab_docid(segment, &doc_id));
40
+ ADD_DOC("three", 0);
41
+ ADD_DOC("four", 1);
42
+ ADD_DOC("five", 2);
43
+
44
+ return NO_ERROR;
45
+ }
46
+
47
+ TEST(initial_state) {
48
+ wp_segment segment;
49
+ RELAY_ERROR(setup(&segment));
50
+
51
+ postings_region* pr = MMAP_OBJ(segment.postings, postings_region);
52
+ ASSERT(pr->num_docs == 0);
53
+ ASSERT(pr->num_postings == 0);
54
+
55
+ RELAY_ERROR(wp_segment_unload(&segment));
56
+ return NO_ERROR;
57
+ }
58
+
59
+ TEST(adding_a_doc_increments_counts) {
60
+ wp_segment segment;
61
+ pos_t positions[10];
62
+ docid_t doc_id;
63
+
64
+ RELAY_ERROR(setup(&segment));
65
+ RELAY_ERROR(wp_segment_grab_docid(&segment, &doc_id));
66
+
67
+ positions[0] = 0;
68
+ RELAY_ERROR(wp_segment_add_posting(&segment, "body", "hello", doc_id, 1, positions));
69
+ positions[0] = 1;
70
+ RELAY_ERROR(wp_segment_add_posting(&segment, "body", "there", doc_id, 1, positions));
71
+
72
+ postings_region* pr = MMAP_OBJ(segment.postings, postings_region);
73
+ ASSERT(pr->num_docs == 1);
74
+ ASSERT(pr->num_postings == 2);
75
+
76
+ RELAY_ERROR(wp_segment_unload(&segment));
77
+ return NO_ERROR;
78
+ }
79
+
80
// Runs `query` against the test segment: sets up its search state, collects
// up to 10 hits into results[] / num_results, then releases the search state.
// Uses these names from the expansion site: segment, num_results, results.
// Wrapped in do { } while(0) so that the macro behaves as a single statement.
#define RUN_QUERY(query) do { \
  RELAY_ERROR(wp_search_init_search_state((query), &segment)); \
  RELAY_ERROR(wp_search_run_query_on_segment((query), &segment, 10, &num_results, &results[0])); \
  RELAY_ERROR(wp_search_release_search_state((query))); \
} while(0)
84
+
85
+ TEST(simple_term_queries) {
86
+ wp_segment segment;
87
+ uint32_t num_results;
88
+ search_result results[10];
89
+ wp_query* query;
90
+
91
+ RELAY_ERROR(setup(&segment));
92
+ RELAY_ERROR(add_docs(&segment));
93
+
94
+ query = wp_query_new_term("body", "one");
95
+ RUN_QUERY(query);
96
+
97
+ ASSERT(num_results == 1);
98
+ ASSERT(results[0].doc_id == 1);
99
+
100
+ query = wp_query_new_term("body", "two");
101
+ RUN_QUERY(query);
102
+
103
+ ASSERT(num_results == 2);
104
+ ASSERT(results[0].doc_id == 2);
105
+ ASSERT(results[1].doc_id == 1);
106
+
107
+ RELAY_ERROR(wp_segment_unload(&segment));
108
+ return NO_ERROR;
109
+ }
110
+
111
+ TEST(simple_conjunctive_queries) {
112
+ wp_segment segment;
113
+ uint32_t num_results;
114
+ search_result results[10];
115
+ wp_query* query;
116
+
117
+ RELAY_ERROR(setup(&segment));
118
+ RELAY_ERROR(add_docs(&segment));
119
+
120
+ query = wp_query_new_conjunction();
121
+ query = wp_query_add(query, wp_query_new_term("body", "one"));
122
+ query = wp_query_add(query, wp_query_new_term("body", "two"));
123
+
124
+ RUN_QUERY(query);
125
+
126
+ ASSERT(num_results == 1);
127
+ ASSERT(results[0].doc_id == 1);
128
+
129
+ query = wp_query_new_conjunction();
130
+ query = wp_query_add(query, wp_query_new_term("body", "four"));
131
+ query = wp_query_add(query, wp_query_new_term("body", "two"));
132
+
133
+ RUN_QUERY(query);
134
+
135
+ ASSERT(num_results == 1);
136
+ ASSERT(results[0].doc_id == 2);
137
+
138
+ // <empty>
139
+ query = wp_query_new_conjunction();
140
+ RUN_QUERY(query);
141
+ ASSERT(num_results == 0);
142
+
143
+ RELAY_ERROR(wp_segment_unload(&segment));
144
+ return NO_ERROR;
145
+ }
146
+
147
+ TEST(simple_phrasal_queries) {
148
+ wp_segment segment;
149
+ uint32_t num_results;
150
+ search_result results[10];
151
+ wp_query* query;
152
+
153
+ RELAY_ERROR(setup(&segment));
154
+ RELAY_ERROR(add_docs(&segment));
155
+
156
+ query = wp_query_new_phrase();
157
+ query = wp_query_add(query, wp_query_new_term("body", "one"));
158
+ query = wp_query_add(query, wp_query_new_term("body", "two"));
159
+ RUN_QUERY(query);
160
+ ASSERT(num_results == 1);
161
+ ASSERT(results[0].doc_id == 1);
162
+
163
+ query = wp_query_new_phrase();
164
+ query = wp_query_add(query, wp_query_new_term("body", "two"));
165
+ query = wp_query_add(query, wp_query_new_term("body", "one"));
166
+ RUN_QUERY(query);
167
+ ASSERT(num_results == 0);
168
+
169
+ query = wp_query_new_phrase();
170
+ query = wp_query_add(query, wp_query_new_term("body", "two"));
171
+ query = wp_query_add(query, wp_query_new_term("body", "three"));
172
+ RUN_QUERY(query);
173
+ ASSERT(num_results == 2);
174
+ ASSERT(results[0].doc_id == 2);
175
+ ASSERT(results[1].doc_id == 1);
176
+
177
+ query = wp_query_new_phrase();
178
+ query = wp_query_add(query, wp_query_new_term("body", "one"));
179
+ query = wp_query_add(query, wp_query_new_term("body", "two"));
180
+ query = wp_query_add(query, wp_query_new_term("body", "three"));
181
+ RUN_QUERY(query);
182
+ ASSERT(num_results == 1);
183
+ ASSERT(results[0].doc_id == 1);
184
+
185
+ RELAY_ERROR(wp_segment_unload(&segment));
186
+ return NO_ERROR;
187
+ }
188
+
189
+ TEST(segment_conjuction_of_phrase_queries) {
190
+ wp_segment segment;
191
+ uint32_t num_results;
192
+ search_result results[10];
193
+ wp_query* query;
194
+ wp_query* subquery;
195
+
196
+ RELAY_ERROR(setup(&segment));
197
+ RELAY_ERROR(add_docs(&segment));
198
+
199
+ // one "two three"
200
+ subquery = wp_query_new_phrase();
201
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "two"));
202
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "three"));
203
+ query = wp_query_new_conjunction();
204
+ query = wp_query_add(query, wp_query_new_term("body", "one"));
205
+ query = wp_query_add(query, subquery);
206
+
207
+ RUN_QUERY(query);
208
+ ASSERT(num_results == 1);
209
+ ASSERT(results[0].doc_id == 1);
210
+
211
+ // "two three" one
212
+ subquery = wp_query_new_phrase();
213
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "two"));
214
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "three"));
215
+ query = wp_query_new_conjunction();
216
+ query = wp_query_add(query, subquery);
217
+ query = wp_query_add(query, wp_query_new_term("body", "one"));
218
+
219
+ RUN_QUERY(query);
220
+ ASSERT(num_results == 1);
221
+ ASSERT(results[0].doc_id == 1);
222
+
223
+ // one "three two"
224
+ subquery = wp_query_new_phrase();
225
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "three"));
226
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "two"));
227
+ query = wp_query_new_conjunction();
228
+ query = wp_query_add(query, wp_query_new_term("body", "one"));
229
+ query = wp_query_add(query, subquery);
230
+
231
+ RUN_QUERY(query);
232
+ ASSERT(num_results == 0);
233
+
234
+ // two "two three"
235
+ subquery = wp_query_new_phrase();
236
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "two"));
237
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "three"));
238
+ query = wp_query_new_conjunction();
239
+ query = wp_query_add(query, wp_query_new_term("body", "two"));
240
+ query = wp_query_add(query, subquery);
241
+
242
+ RUN_QUERY(query);
243
+ ASSERT(num_results == 2);
244
+ ASSERT(results[0].doc_id == 2);
245
+ ASSERT(results[1].doc_id == 1);
246
+
247
+ RELAY_ERROR(wp_segment_unload(&segment));
248
+ return NO_ERROR;
249
+ }
250
+
251
+ TEST(negation_queries) {
252
+ wp_segment segment;
253
+ uint32_t num_results;
254
+ search_result results[10];
255
+ wp_query* query;
256
+ wp_query* subquery;
257
+
258
+ RELAY_ERROR(setup(&segment));
259
+ RELAY_ERROR(add_docs(&segment));
260
+
261
+ // one "two three"
262
+ subquery = wp_query_new_phrase();
263
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "two"));
264
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "three"));
265
+ query = wp_query_new_conjunction();
266
+ query = wp_query_add(query, wp_query_new_term("body", "one"));
267
+ query = wp_query_add(query, subquery);
268
+
269
+ RUN_QUERY(query);
270
+ ASSERT(num_results == 1);
271
+ ASSERT(results[0].doc_id == 1);
272
+
273
+ // "two three" one
274
+ subquery = wp_query_new_phrase();
275
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "two"));
276
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "three"));
277
+ query = wp_query_new_conjunction();
278
+ query = wp_query_add(query, subquery);
279
+ query = wp_query_add(query, wp_query_new_term("body", "one"));
280
+
281
+ RUN_QUERY(query);
282
+ ASSERT(num_results == 1);
283
+ ASSERT(results[0].doc_id == 1);
284
+
285
+ // one "three two"
286
+ subquery = wp_query_new_phrase();
287
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "three"));
288
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "two"));
289
+ query = wp_query_new_conjunction();
290
+ query = wp_query_add(query, wp_query_new_term("body", "one"));
291
+ query = wp_query_add(query, subquery);
292
+
293
+ RUN_QUERY(query);
294
+ ASSERT(num_results == 0);
295
+
296
+ // two "two three"
297
+ subquery = wp_query_new_phrase();
298
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "two"));
299
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "three"));
300
+ query = wp_query_new_conjunction();
301
+ query = wp_query_add(query, wp_query_new_term("body", "two"));
302
+ query = wp_query_add(query, subquery);
303
+
304
+ RUN_QUERY(query);
305
+ ASSERT(num_results == 2);
306
+ ASSERT(results[0].doc_id == 2);
307
+ ASSERT(results[1].doc_id == 1);
308
+
309
+ // <empty>
310
+ query = wp_query_new_conjunction();
311
+ RUN_QUERY(query);
312
+ ASSERT(num_results == 0);
313
+
314
+ // -one
315
+ subquery = wp_query_new_term("body", "one");
316
+ query = wp_query_new_negation();
317
+ query = wp_query_add(query, subquery);
318
+ RUN_QUERY(query);
319
+ ASSERT(num_results == 2);
320
+ ASSERT(results[0].doc_id == 3);
321
+ ASSERT(results[1].doc_id == 2);
322
+
323
+ // -two
324
+ subquery = wp_query_new_term("body", "two");
325
+ query = wp_query_new_negation();
326
+ query = wp_query_add(query, subquery);
327
+ RUN_QUERY(query);
328
+ ASSERT(num_results == 1);
329
+ ASSERT(results[0].doc_id == 3);
330
+
331
+ // -three
332
+ subquery = wp_query_new_term("body", "three");
333
+ query = wp_query_new_negation();
334
+ query = wp_query_add(query, subquery);
335
+ RUN_QUERY(query);
336
+ ASSERT(num_results == 0);
337
+
338
+ // -potato
339
+ subquery = wp_query_new_term("body", "potato");
340
+ query = wp_query_new_negation();
341
+ query = wp_query_add(query, subquery);
342
+ RUN_QUERY(query);
343
+ ASSERT(num_results == 3);
344
+ ASSERT(results[0].doc_id == 3);
345
+ ASSERT(results[1].doc_id == 2);
346
+ ASSERT(results[2].doc_id == 1);
347
+
348
+ // -"one two"
349
+ subquery = wp_query_new_conjunction();
350
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "one"));
351
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "two"));
352
+ query = wp_query_new_negation();
353
+ query = wp_query_add(query, subquery);
354
+ RUN_QUERY(query);
355
+ ASSERT(num_results == 2);
356
+ ASSERT(results[0].doc_id == 3);
357
+ ASSERT(results[1].doc_id == 2);
358
+
359
+ // -(AND one three)
360
+ subquery = wp_query_new_conjunction();
361
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "one"));
362
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "three"));
363
+ query = wp_query_new_negation();
364
+ query = wp_query_add(query, subquery);
365
+ RUN_QUERY(query);
366
+ ASSERT(num_results == 2);
367
+ ASSERT(results[0].doc_id == 3);
368
+ ASSERT(results[1].doc_id == 2);
369
+
370
+ // -"one three"
371
+ subquery = wp_query_new_phrase();
372
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "one"));
373
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "three"));
374
+ query = wp_query_new_negation();
375
+ query = wp_query_add(query, subquery);
376
+ RUN_QUERY(query);
377
+ ASSERT(num_results == 3);
378
+
379
+ // (AND -one three)
380
+ subquery = wp_query_new_negation();
381
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "one"));
382
+ query = wp_query_new_conjunction();
383
+ query = wp_query_add(query, subquery);
384
+ query = wp_query_add(query, wp_query_new_term("body", "three"));
385
+ RUN_QUERY(query);
386
+ ASSERT(num_results == 2);
387
+ ASSERT(results[0].doc_id == 3);
388
+ ASSERT(results[1].doc_id == 2);
389
+
390
+ // (AND three -one)
391
+ subquery = wp_query_new_negation();
392
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "one"));
393
+ query = wp_query_new_conjunction();
394
+ query = wp_query_add(query, wp_query_new_term("body", "three"));
395
+ query = wp_query_add(query, subquery);
396
+ RUN_QUERY(query);
397
+ ASSERT(num_results == 2);
398
+ ASSERT(results[0].doc_id == 3);
399
+ ASSERT(results[1].doc_id == 2);
400
+
401
+ RELAY_ERROR(wp_segment_unload(&segment));
402
+ return NO_ERROR;
403
+ }
404
+
@@ -0,0 +1,82 @@
1
+ #include <string.h>
2
+ #include "stringmap.h"
3
+ #include "error.h"
4
+ #include "test.h"
5
+
6
+ static stringmap* setup() {
7
+ stringpool* p = malloc(stringpool_initial_size());
8
+ stringpool_init(p);
9
+ stringmap* q = malloc(stringmap_initial_size());
10
+ stringmap_init(q, p);
11
+ return q;
12
+ }
13
+
14
+ TEST(stringmap_initial_state) {
15
+ stringmap* q = setup();
16
+ ASSERT(q->n_occupied == 0);
17
+ ASSERT(!stringmap_needs_bump(q));
18
+
19
+ free(q);
20
+ return NO_ERROR;
21
+ }
22
+
23
+ TEST(stringmap_lookups_on_empty) {
24
+ stringmap* q = setup();
25
+
26
+ ASSERT(stringmap_string_to_int(q, "hot potato") == (uint32_t)-1);
27
+ ASSERT(stringmap_int_to_string(q, 0) == NULL);
28
+ ASSERT(stringmap_int_to_string(q, 1234) == NULL);
29
+
30
+ free(q);
31
+ return NO_ERROR;
32
+ }
33
+
34
+ TEST(stringmap_multiple_adds) {
35
+ stringmap* q = setup();
36
+
37
+ ASSERT(stringmap_string_to_int(q, "hot potato") == (uint32_t)-1);
38
+ uint32_t x, y;
39
+ RELAY_ERROR(stringmap_add(q, "hot potato", &x));
40
+ ASSERT(x != (uint32_t)-1);
41
+ RELAY_ERROR(stringmap_add(q, "hot potato", &y));
42
+ ASSERT(y != (uint32_t)-1);
43
+ ASSERT(x == y);
44
+
45
+ free(q);
46
+ return NO_ERROR;
47
+ }
48
+
49
+ TEST(stringmap_hashing_is_preserved) {
50
+ stringmap* q = setup();
51
+
52
+ uint32_t x, y;
53
+ RELAY_ERROR(stringmap_add(q, "hello there", &x));
54
+ ASSERT(x != (uint32_t)-1);
55
+ const char* a = stringmap_int_to_string(q, x);
56
+ ASSERT(strcmp(a, "hello there") == 0);
57
+
58
+ RELAY_ERROR(stringmap_add(q, "how are you?", &y));
59
+ const char* b = stringmap_int_to_string(q, y);
60
+ ASSERT(strcmp(b, "how are you?") == 0);
61
+
62
+ ASSERT(x != y);
63
+
64
+ free(q);
65
+ return NO_ERROR;
66
+ }
67
+
68
+ TEST(stringmap_detects_out_of_room) {
69
+ stringmap* q = setup();
70
+
71
+ uint32_t x, y, z, w;
72
+ RELAY_ERROR(stringmap_add(q, "one", &x));
73
+ RELAY_ERROR(stringmap_add(q, "two", &y));
74
+ RELAY_ERROR(stringmap_add(q, "three", &z));
75
+
76
+ wp_error* e = stringmap_add(q, "four", &w);
77
+ ASSERT(e != NULL);
78
+ wp_error_free(e);
79
+
80
+ free(q);
81
+ return NO_ERROR;
82
+ }
@@ -0,0 +1,67 @@
1
+ #include <string.h>
2
+ #include "stringpool.h"
3
+ #include "error.h"
4
+ #include "test.h"
5
+
6
+ TEST(stringpool_initial_state) {
7
+ stringpool* p = malloc(stringpool_initial_size());
8
+ stringpool_init(p);
9
+
10
+ ASSERT(!stringpool_needs_bump(p));
11
+
12
+ free(p);
13
+ return NO_ERROR;
14
+ }
15
+
16
+ TEST(stringpool_add_gives_unique_ids) {
17
+ stringpool* p = malloc(stringpool_initial_size());
18
+ stringpool_init(p);
19
+
20
+ uint32_t ret1 = stringpool_add(p, "potato");
21
+ ASSERT(ret1 > 0);
22
+
23
+ uint32_t ret2 = stringpool_add(p, "monkey");
24
+ ASSERT(ret2 > 0);
25
+
26
+ ASSERT(ret1 != ret2);
27
+
28
+ free(p);
29
+ return NO_ERROR;
30
+ }
31
+
32
+ TEST(stringpool_add_gives_ids_that_lookup_returns) {
33
+ stringpool* p = malloc(stringpool_initial_size());
34
+ stringpool_init(p);
35
+
36
+ uint32_t ret;
37
+ char* s;
38
+
39
+ ret = stringpool_add(p, "potato");
40
+ s = stringpool_lookup(p, ret);
41
+ ASSERT(!strcmp(s, "potato"));
42
+
43
+ ret = stringpool_add(p, "monkey");
44
+ s = stringpool_lookup(p, ret);
45
+ ASSERT(!strcmp(s, "monkey"));
46
+
47
+ free(p);
48
+ return NO_ERROR;
49
+ }
50
+
51
+ TEST(stringpool_detects_out_of_room) {
52
+ stringpool* p = malloc(stringpool_initial_size());
53
+ stringpool_init(p);
54
+
55
+ uint32_t ret;
56
+ int times = stringpool_initial_size() / 6;
57
+ for(int i = 0; i < times - 1; i++) {
58
+ ret = stringpool_add(p, "12345");
59
+ ASSERT(ret != (uint32_t)-1);
60
+ }
61
+
62
+ ret = stringpool_add(p, "12345");
63
+ ASSERT(ret == (uint32_t)-1);
64
+
65
+ return NO_ERROR;
66
+ }
67
+
@@ -0,0 +1,95 @@
1
+ #include "termhash.h"
2
+ #include "test.h"
3
+ #include "error.h"
4
+
5
+ TEST(termhash_initial_state) {
6
+ termhash* h = malloc(termhash_initial_size());
7
+ termhash_init(h);
8
+
9
+ ASSERT(h->n_occupied == 0);
10
+ //ASSERT(!termhash_getting_full(h));
11
+
12
+ free(h);
13
+ return NO_ERROR;
14
+ }
15
+
16
+ TEST(termhash_lookups_on_empty) {
17
+ termhash* h = malloc(termhash_initial_size());
18
+ termhash_init(h);
19
+
20
+ term t1 = {0, 0};
21
+ term t2 = {10, 20};
22
+ term t3 = {123, 345};
23
+
24
+ ASSERT(termhash_get_val(h, t1) == (uint32_t)-1);
25
+ ASSERT(termhash_get_val(h, t2) == (uint32_t)-1);
26
+ ASSERT(termhash_get_val(h, t3) == (uint32_t)-1);
27
+
28
+ free(h);
29
+ return NO_ERROR;
30
+ }
31
+
32
+ TEST(termhash_overwriting) {
33
+ termhash* h = malloc(termhash_initial_size());
34
+ termhash_init(h);
35
+
36
+ term t1 = {5, 11};
37
+
38
+ ASSERT(termhash_get_val(h, t1) == (uint32_t)-1);
39
+ RELAY_ERROR(termhash_put_val(h, t1, 1234));
40
+ ASSERT(termhash_get_val(h, t1) == 1234);
41
+
42
+ RELAY_ERROR(termhash_put_val(h, t1, 2345));
43
+ ASSERT(termhash_get_val(h, t1) == 2345);
44
+
45
+ RELAY_ERROR(termhash_put_val(h, t1, 1));
46
+ ASSERT(termhash_get_val(h, t1) == 1);
47
+
48
+ free(h);
49
+ return NO_ERROR;
50
+ }
51
+
52
+ TEST(termhash_many_puts) { // try and force a resize
53
+ termhash* h = malloc(termhash_initial_size());
54
+ termhash_init(h);
55
+
56
+ term t1 = {1, 0};
57
+
58
+ for(int i = 1; i < 100; i++) {
59
+ t1.word_s = i;
60
+ RELAY_ERROR(termhash_put_val(h, t1, 1000 + i));
61
+ if(termhash_needs_bump(h)) {
62
+ h = realloc(h, termhash_next_size(h));
63
+ if(h == NULL) RAISE_SYSERROR("realloc");
64
+ termhash_setup(h);
65
+ RELAY_ERROR(termhash_bump_size(h));
66
+ }
67
+ }
68
+
69
+ t1.word_s = 55;
70
+ uint32_t v = termhash_get_val(h, t1);
71
+ ASSERT(v == 1055);
72
+
73
+ free(h);
74
+ return NO_ERROR;
75
+ }
76
+
77
+ TEST(termhash_detects_out_of_room) {
78
+ termhash* h = malloc(termhash_initial_size());
79
+ termhash_init(h);
80
+
81
+ term t = {1, 0};
82
+
83
+ for(int i = 0; i < 3; i++) {
84
+ t.word_s = i;
85
+ RELAY_ERROR(termhash_put_val(h, t, 100 + i));
86
+ }
87
+
88
+ t.word_s = 999;
89
+ wp_error* e = termhash_put_val(h, t, 999);
90
+ ASSERT(e != NULL);
91
+ wp_error_free(e);
92
+
93
+ free(h);
94
+ return NO_ERROR;
95
+ }
@@ -0,0 +1,55 @@
1
+ #include "test.h"
2
+ #include "tokenizer.lex.h"
3
+
4
// Pulls the next token from `scanner` (taken from the expansion site) and
// asserts that it is a word token whose text equals `word`.
// Wrapped in do { } while(0) so the macro plus its trailing semicolon forms
// exactly one statement.
#define ASSERT_NEXT_WORD(word) do { \
  int token_type = yylex(scanner); \
  ASSERT(token_type == TOK_WORD); \
  ASSERT(!strcmp(word, yyget_text(scanner))); \
} while(0)
9
+
10
// Pulls the next token from `scanner` (taken from the expansion site) and
// asserts that the tokenizer reports the end of input.
// Wrapped in do { } while(0) so the macro plus its trailing semicolon forms
// exactly one statement.
#define ASSERT_DONE do { \
  int token_type = yylex(scanner); \
  ASSERT(token_type == TOK_DONE); \
} while(0)
14
+
15
+ TEST(tokenizes_easy_words) {
16
+ yyscan_t scanner;
17
+ lexinfo charpos = {0, 0};
18
+
19
+ yylex_init_extra(&charpos, &scanner);
20
+
21
+ const char* string = "i love mice";
22
+ YY_BUFFER_STATE state = yy_scan_string(string, scanner);
23
+
24
+ ASSERT_NEXT_WORD("i");
25
+ ASSERT_NEXT_WORD("love");
26
+ ASSERT_NEXT_WORD("mice");
27
+ ASSERT_DONE;
28
+
29
+ yy_delete_buffer(state, scanner);
30
+ yylex_destroy(scanner);
31
+
32
+ return NO_ERROR;
33
+ }
34
+
35
+ TEST(strips_trailing_punctuation) {
36
+ yyscan_t scanner;
37
+ lexinfo charpos = {0, 0};
38
+
39
+ yylex_init_extra(&charpos, &scanner);
40
+
41
+ const char* string = "hey! this: you're <cool>";
42
+ YY_BUFFER_STATE state = yy_scan_string(string, scanner);
43
+
44
+ ASSERT_NEXT_WORD("hey");
45
+ ASSERT_NEXT_WORD("this");
46
+ ASSERT_NEXT_WORD("you're");
47
+ ASSERT_NEXT_WORD("cool");
48
+ ASSERT_DONE;
49
+
50
+ yy_delete_buffer(state, scanner);
51
+ yylex_destroy(scanner);
52
+
53
+ return NO_ERROR;
54
+ }
55
+
@@ -0,0 +1,38 @@
1
+ #ifndef WP_TEST_H_
2
+ #define WP_TEST_H_
3
+
4
+ // whistlepig test header file
5
+ // (c) 2011 William Morgan. See COPYING for license terms.
6
+ //
7
+ // macros for the c unit tests
8
+
9
// Counts one assertion in *asserts; when the condition is false it prints
// the failing expression with function, file and line, sets *fail to 1 and
// returns NO_ERROR from the enclosing test function.
// Needs `int* fail` and `int* asserts` in scope, as in a TEST() function.
// The expression text is passed as a %s argument rather than spliced into
// the format string, so a '%' inside the expression is printed literally.
// __func__ is the standard C99 spelling of the function name.
#define ASSERT(x) do { \
  (*asserts)++; \
  if(!(x)) { \
    printf("-- test failure: (%s) is FALSE in %s (%s:%d)\n\n", #x, __func__, __FILE__, __LINE__); \
    *fail = 1; \
    return NO_ERROR; \
  } \
} while(0)
17
+
18
// Signature of a test function: test_<x> receives a failure flag and an
// assertion counter to fill in, and returns a wp_error*.
#define TEST(x) wp_error* test_##x(int* fail, int* asserts)

// Runs test_<x> and reports the outcome on stdout as PASS, FAIL or ERR.
// Updates these counters from the expansion site: tests, asserts, failures
// and errors. A test that set its failure flag is reported as FAIL; otherwise
// a returned error is reported as ERR and printed.
#define RUNTEST(x) do { \
  int failed_ = 0; \
  int checked_ = 0; \
  tests++; \
  wp_error* error_ = test_##x(&failed_, &checked_); \
  asserts += checked_; \
  if(!failed_ && !error_) printf("PASS %d/%d " #x "\n", checked_, checked_); \
  else if(failed_) { \
    printf("FAIL " #x "\n"); \
    failures++; \
  } \
  else { \
    errors++; \
    printf(" ERR " #x "\n"); \
    PRINT_ERROR(error_, stdout); \
  } \
} while(0)
37
+
38
+ #endif
@@ -0,0 +1,28 @@
1
#ifndef WP_TIMER_H_
#define WP_TIMER_H_

// whistlepig timer header file
// (c) 2011 William Morgan. See COPYING for license terms.
//
// just some timer macros
//
// Every macro takes a `name` that is pasted into the variable names
// <name>_startt, <name>_endt and <name>_elapsed. The statement macros already
// end in a semicolon.

#include <stddef.h>   // NULL
#include <sys/time.h> // gettimeofday, struct timeval

// Declares the start/end timestamps and the elapsed-time variable for `name`.
#define TIMER(name) \
  struct timeval name##_startt, name##_endt; \
  long name##_elapsed;

// Declares the timer variables (via TIMER) and records the start time.
// Because it declares variables, use it once per name and scope.
#define START_TIMER(name) \
  TIMER(name) \
  gettimeofday(&name##_startt, NULL);

// Records a new start time for an already-declared timer.
#define RESET_TIMER(name) gettimeofday(&name##_startt, NULL);

// Records the end time and stores the time since the last start/reset, in
// milliseconds, in <name>_elapsed. Seconds and microseconds are converted
// separately with integer arithmetic.
#define MARK_TIMER(name) \
  gettimeofday(&name##_endt, NULL); \
  name##_elapsed = ((name##_endt.tv_sec - name##_startt.tv_sec) * 1000) + ((name##_endt.tv_usec - name##_startt.tv_usec) / 1000);

// Milliseconds computed by the most recent MARK_TIMER(name).
#define TIMER_MS(name) name##_elapsed

#endif
@@ -367,6 +367,28 @@ static VALUE query_clone(VALUE self) {
367
367
  return o_query;
368
368
  }
369
369
 
370
+ static const char* yielding_substituter(const char* field, const char* term) {
371
+ VALUE result = rb_yield_values(2, rb_str_new2(field), rb_str_new2(term));
372
+ if(NIL_P(result)) return strdup(term);
373
+ else return strdup(RSTRING_PTR(result));
374
+ }
375
+
376
+ /*
377
+ * Returns a new query that's the result of applying the block to each
378
+ * word in the query. Useful for transforming queries programmatically
379
+ * after they've been parsed.
380
+ *
381
+ */
382
+ static VALUE query_map_terms(VALUE self) {
383
+ char buf[1024];
384
+
385
+ wp_query* query; Data_Get_Struct(self, wp_query, query);
386
+ wp_query* result = wp_query_substitute(query, yielding_substituter);
387
+
388
+ VALUE o_query = Data_Wrap_Struct(c_query, NULL, wp_query_free, result);
389
+ return o_query;
390
+ }
391
+
370
392
  /*
371
393
  * call-seq: and(other)
372
394
  *
@@ -504,7 +526,7 @@ static VALUE index_run_query(VALUE self, VALUE v_query, VALUE v_max_num_results)
504
526
  return array;
505
527
  }
506
528
 
507
- void Init_whistlepigc() {
529
+ void Init_whistlepig() {
508
530
  VALUE m_whistlepig;
509
531
 
510
532
  m_whistlepig = rb_define_module("Whistlepig");
@@ -541,6 +563,7 @@ void Init_whistlepigc() {
541
563
  rb_define_method(c_query, "or", query_or, 1);
542
564
  rb_define_method(c_query, "to_s", query_to_s, 0);
543
565
  rb_define_method(c_query, "clone", query_clone, 0);
566
+ rb_define_method(c_query, "term_map", query_map_terms, 0);
544
567
  rb_define_attr(c_query, "query", 1, 0);
545
568
 
546
569
  c_error = rb_define_class_under(m_whistlepig, "Error", rb_eStandardError);
data/lib/whistlepig.rb CHANGED
@@ -1,4 +1,4 @@
1
- require "whistlepigc"
1
+ require "whistlepig/whistlepig"
2
2
 
3
3
  module Whistlepig
4
4
  ## A full-text index. You can add entries to it, and you can run queries
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: whistlepig
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.7'
4
+ version: '0.8'
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2011-06-19 13:10:35.000000000 -07:00
12
+ date: 2012-03-13 14:35:03.000000000 -07:00
13
13
  default_executable:
14
14
  dependencies: []
15
15
  description: Whistlepig is a minimalist realtime full-text search index. Its goal
@@ -22,43 +22,51 @@ extensions:
22
22
  - ext/whistlepig/extconf.rb
23
23
  extra_rdoc_files:
24
24
  - README
25
- - ext/whistlepig/whistlepigc.c
25
+ - ext/whistlepig/whistlepig.c
26
26
  files:
27
27
  - README
28
28
  - ext/whistlepig/extconf.rb
29
29
  - lib/whistlepig.rb
30
30
  - ext/whistlepig/query-parser.lex.h
31
31
  - ext/whistlepig/entry.h
32
- - ext/whistlepig/whistlepigc.c
33
32
  - ext/whistlepig/stringmap.c
34
33
  - ext/whistlepig/tokenizer.lex.h
35
34
  - ext/whistlepig/whistlepig.h
36
35
  - ext/whistlepig/error.c
37
36
  - ext/whistlepig/extconf.h
38
37
  - ext/whistlepig/stringmap.h
38
+ - ext/whistlepig/timer.h
39
39
  - ext/whistlepig/query-parser.lex.c
40
40
  - ext/whistlepig/defaults.h
41
41
  - ext/whistlepig/tokenizer.lex.c
42
+ - ext/whistlepig/test-stringpool.c
42
43
  - ext/whistlepig/termhash.c
43
44
  - ext/whistlepig/query-parser.h
44
45
  - ext/whistlepig/index.c
45
46
  - ext/whistlepig/stringpool.c
47
+ - ext/whistlepig/test-termhash.c
46
48
  - ext/whistlepig/query.h
47
49
  - ext/whistlepig/query-parser.c
48
50
  - ext/whistlepig/stringpool.h
49
51
  - ext/whistlepig/mmap-obj.c
52
+ - ext/whistlepig/whistlepig.c
50
53
  - ext/whistlepig/search.c
51
54
  - ext/whistlepig/termhash.h
52
55
  - ext/whistlepig/query.c
56
+ - ext/whistlepig/dump.c
53
57
  - ext/whistlepig/query-parser.tab.h
58
+ - ext/whistlepig/test.h
54
59
  - ext/whistlepig/khash.h
55
60
  - ext/whistlepig/query-parser.tab.c
56
61
  - ext/whistlepig/entry.c
57
62
  - ext/whistlepig/index.h
58
63
  - ext/whistlepig/segment.h
64
+ - ext/whistlepig/test-stringmap.c
59
65
  - ext/whistlepig/mmap-obj.h
60
66
  - ext/whistlepig/segment.c
67
+ - ext/whistlepig/test-segment.c
61
68
  - ext/whistlepig/search.h
69
+ - ext/whistlepig/test-tokenizer.c
62
70
  - ext/whistlepig/error.h
63
71
  has_rdoc: true
64
72
  homepage: http://masanjin.net/whistlepig