whistlepig 0.7 → 0.8

Sign up to get free protection for your applications and to get access to all the features.
data/README CHANGED
@@ -8,8 +8,8 @@ the frills, Whistlepig may be for you.
8
8
  Whistlepig is written in ANSI C99. It currently provides a C API and Ruby
9
9
  bindings.
10
10
 
11
- Latest version: 0.7, released 2011-07-19.
12
- Status: alpha
11
+ Latest version: 0.8, released 2012-03-13.
12
+ Status: beta
13
13
  News: http://all-thing.net/label/whistlepig/
14
14
  Homepage: http://masanjin.net/whistlepig/
15
15
  Bug reports: http://github.com/wmorgan/whistlepig/issues
@@ -0,0 +1,65 @@
1
+ #include <stdio.h>
2
+ #include "whistlepig.h"
3
+
4
// Bucket-flag accessors. Each bucket owns a 2-bit entry in the flags array,
// packed 16 entries per array element: bit 1 of the entry marks the bucket as
// empty, bit 0 marks it as deleted. Arguments are fully parenthesized so that
// arbitrary expressions (e.g. `base | n`) can be passed safely.
#define isempty(flag, i) (((flag)[(i) >> 4] >> (((i) & 0xfU) << 1)) & 2)
#define isdel(flag, i) (((flag)[(i) >> 4] >> (((i) & 0xfU) << 1)) & 1)
// Address of the pool entry referenced by key slot i of hash h.
#define KEY(h, i) (&((h)->pool[(h)->keys[(i)]]))
7
+
8
+ RAISING_STATIC(dump_posting_list(wp_segment* s, uint32_t offset)) {
9
+ posting po;
10
+
11
+ while(offset != OFFSET_NONE) {
12
+ RELAY_ERROR(wp_segment_read_posting(s, offset, &po, 1));
13
+
14
+ printf(" @%u doc %u:", offset, po.doc_id);
15
+ for(uint32_t i = 0; i < po.num_positions; i++) {
16
+ printf(" %d", po.positions[i]);
17
+ }
18
+ printf("\n");
19
+
20
+ offset = po.next_offset;
21
+ free(po.positions);
22
+ }
23
+
24
+ return NO_ERROR;
25
+ }
26
+
27
+ RAISING_STATIC(dump(wp_segment* segment)) {
28
+ termhash* th = MMAP_OBJ(segment->termhash, termhash);
29
+ stringmap* sh = MMAP_OBJ(segment->stringmap, stringmap);
30
+
31
+ for(uint32_t i = 0; i < th->n_buckets; i++) {
32
+ if(isempty(th->flags, i)); // do nothing
33
+ else if(isdel(th->flags, i)) printf("%u: [deleted]", i);
34
+ else {
35
+ term t = th->keys[i];
36
+ const char* field = stringmap_int_to_string(sh, t.field_s);
37
+ const char* word = stringmap_int_to_string(sh, t.word_s);
38
+ printf("%u: %s:'%s'\n", i, field, word);
39
+ RELAY_ERROR(dump_posting_list(segment, th->vals[i]));
40
+ }
41
+ }
42
+
43
+ return NO_ERROR;
44
+ }
45
+
46
+ int main(int argc, char* argv[]) {
47
+ if(argc != 2) {
48
+ fprintf(stderr, "Usage: %s <segment filename>\n", argv[0]);
49
+ return -1;
50
+ }
51
+
52
+ wp_index* index;
53
+ DIE_IF_ERROR(wp_index_load(&index, argv[1]));
54
+ DIE_IF_ERROR(wp_index_dumpinfo(index, stdout));
55
+
56
+ for(int i = 0; i < index->num_segments; i++) {
57
+ printf("\nsegment %d details:\n", i);
58
+ DIE_IF_ERROR(dump(&index->segments[i]));
59
+ }
60
+
61
+ DIE_IF_ERROR(wp_index_unload(index));
62
+
63
+ return 0;
64
+ }
65
+
@@ -3,4 +3,4 @@ require 'mkmf'
3
3
  $CFLAGS = "-g -O3 -std=c99 $(cflags) -D_ANSI_SOURCE"
4
4
 
5
5
  create_header
6
- create_makefile "whistlepigc"
6
+ create_makefile "whistlepig"
@@ -12,7 +12,17 @@ static wp_query* wp_query_new() {
12
12
  return ret;
13
13
  }
14
14
 
15
// Substituter that leaves every word unchanged: hands back a freshly
// allocated copy of `word`, or NULL when `word` is NULL. `field` is ignored.
static const char* identity(const char* field, const char* word) {
  (void)field; // unused
  return word ? strdup(word) : NULL;
}
20
+
15
21
  wp_query* wp_query_clone(wp_query* other) {
22
+ return wp_query_substitute(other, identity);
23
+ }
24
+
25
+ wp_query* wp_query_substitute(wp_query* other, const char *(*substituter)(const char* field, const char* word)) {
16
26
  wp_query* ret = malloc(sizeof(wp_query));
17
27
  ret->type = other->type;
18
28
  ret->num_children = other->num_children;
@@ -21,12 +31,12 @@ wp_query* wp_query_clone(wp_query* other) {
21
31
  if(other->field) ret->field = strdup(other->field);
22
32
  else ret->field = NULL;
23
33
 
24
- if(other->word) ret->word = strdup(other->word);
34
+ if(other->field && other->word) ret->word = substituter(other->field, other->word);
25
35
  else ret->word = NULL;
26
36
 
27
37
  ret->children = ret->next = ret->last = NULL; // set below
28
38
  for(wp_query* child = other->children; child != NULL; child = child->next) {
29
- wp_query* clone = wp_query_clone(child);
39
+ wp_query* clone = wp_query_substitute(child, substituter);
30
40
  if(ret->last == NULL) ret->children = ret->last = clone;
31
41
  else {
32
42
  ret->last->next = clone;
@@ -64,9 +64,12 @@ wp_query* wp_query_new_empty();
64
64
  // public: make an every-document query node.
65
65
  wp_query* wp_query_new_every();
66
66
 
67
- // public: deep clone of a query, but dropping all search state.
67
+ // public: deep clone of a query, dropping all search state.
68
68
  wp_query* wp_query_clone(wp_query* other);
69
69
 
70
+ // public: build a new query by substituting words from the old query, dropping all search state
71
+ wp_query* wp_query_substitute(wp_query* other, const char *(*substituter)(const char* field, const char* word));
72
+
70
73
  // public: add a query node as a child of another
71
74
  wp_query* wp_query_add(wp_query* a, wp_query* b);
72
75
 
@@ -793,6 +793,7 @@ wp_error* wp_search_run_query_on_segment(struct wp_query* q, struct wp_segment*
793
793
  #endif
794
794
 
795
795
  while(*num_results < max_num_results) {
796
+ DEBUG("got %d results so far (max is %d)", *num_results, max_num_results);
796
797
  RELAY_ERROR(query_next_doc(q, s, &results[*num_results], &done));
797
798
  if(done) break;
798
799
  DEBUG("got result %u (%u doc matches)", results[*num_results].doc_id, results[*num_results].num_doc_matches);
@@ -396,6 +396,9 @@ wp_error* wp_segment_read_posting(wp_segment* s, uint32_t offset, posting* po, i
396
396
  wp_error* wp_segment_add_posting(wp_segment* s, const char* field, const char* word, docid_t doc_id, uint32_t num_positions, pos_t positions[]) {
397
397
  // TODO move this logic up to ensure_fit()
398
398
  int success;
399
+
400
+ if(doc_id == 0) RAISE_ERROR("can't add a label to doc 0");
401
+
399
402
  RELAY_ERROR(bump_stringmap(s, &success));
400
403
  RELAY_ERROR(bump_stringpool(s, &success));
401
404
  RELAY_ERROR(bump_termhash(s, &success));
@@ -465,11 +468,14 @@ wp_error* wp_segment_read_label(wp_segment* s, uint32_t offset, posting* po) {
465
468
  wp_error* wp_segment_add_label(wp_segment* s, const char* label, docid_t doc_id) {
466
469
  // TODO move this logic up to ensure_fit()
467
470
  int success;
471
+
472
+ if(doc_id == 0) RAISE_ERROR("can't add a label to doc 0");
473
+
468
474
  RELAY_ERROR(bump_stringmap(s, &success));
469
475
  RELAY_ERROR(bump_stringpool(s, &success));
470
476
  RELAY_ERROR(bump_termhash(s, &success));
471
477
 
472
- DEBUG("adding label %s to doc %u", label, doc_id);
478
+ DEBUG("adding label '%s' to doc %u", label, doc_id);
473
479
 
474
480
  postings_region* pr = MMAP_OBJ(s->labels, postings_region);
475
481
  stringmap* sh = MMAP_OBJ(s->stringmap, stringmap);
@@ -485,22 +491,34 @@ wp_error* wp_segment_add_label(wp_segment* s, const char* label, docid_t doc_id)
485
491
  // posting
486
492
  uint32_t prev_offset = OFFSET_NONE;
487
493
  uint32_t next_offset = termhash_get_val(th, t);
494
+ docid_t last_docid = DOCID_NONE;
495
+
488
496
  if(next_offset == (uint32_t)-1) next_offset = OFFSET_NONE;
497
+ DEBUG("start offset is %u (none is %u)", next_offset, OFFSET_NONE);
489
498
 
490
499
  while(next_offset != OFFSET_NONE) {
491
- label_posting* po = wp_segment_label_posting_at(pr, next_offset);
492
- if(po->doc_id == doc_id) {
500
+ label_posting* lp = wp_segment_label_posting_at(pr, next_offset);
501
+
502
+ if((last_docid != DOCID_NONE) && (lp->doc_id >= last_docid)) {
503
+ RAISE_ERROR("whistlepig index corruption! lp %u has docid %u but last docid at lp %u was %u", next_offset, lp->doc_id, prev_offset, last_docid);
504
+ }
505
+ else {
506
+ last_docid = lp->doc_id;
507
+ }
508
+
509
+ DEBUG("got doc id %u next_offset %u at offset %u (looking for doc id %u)", lp->doc_id, lp->next_offset, next_offset, doc_id);
510
+ if(lp->doc_id == doc_id) {
493
511
  DEBUG("already have label '%s' for doc %u; returning", label, doc_id);
494
512
  return NO_ERROR;
495
513
  }
496
- else if(po->doc_id < doc_id) break;
514
+ else if(lp->doc_id < doc_id) break;
497
515
  prev_offset = next_offset;
498
- next_offset = po->next_offset;
516
+ next_offset = lp->next_offset;
499
517
  }
500
518
 
501
519
  // find a space for the posting by first checking for a free postings in the
502
- // dead list. the dead list is the list stored under the sentinel term
503
- // with field 0 and word 0.
520
+ // dead list. the dead list is the list stored under the sentinel term with
521
+ // field 0 and word 0.
504
522
  term dead_term = { .field_s = 0, .word_s = 0 };
505
523
  uint32_t entry_offset;
506
524
  uint32_t dead_offset = termhash_get_val(th, dead_term);
@@ -550,11 +568,21 @@ wp_error* wp_segment_remove_label(wp_segment* s, const char* label, docid_t doc_
550
568
  // find the posting and the previous posting in the list, if any
551
569
  uint32_t prev_offset = OFFSET_NONE;
552
570
  uint32_t offset = termhash_get_val(th, t);
571
+ docid_t last_docid = DOCID_NONE;
572
+
553
573
  if(offset == (uint32_t)-1) offset = OFFSET_NONE;
554
574
  label_posting* lp = NULL;
555
575
 
556
576
  while(offset != OFFSET_NONE) {
557
577
  lp = wp_segment_label_posting_at(pr, offset);
578
+
579
+ if((last_docid != DOCID_NONE) && (lp->doc_id >= last_docid)) {
580
+ RAISE_ERROR("whistlepig index corruption! lp %u has docid %u but last docid at lp %u was %u", offset, lp->doc_id, prev_offset, last_docid);
581
+ }
582
+ else {
583
+ last_docid = lp->doc_id;
584
+ }
585
+
558
586
  if(lp->doc_id < doc_id) offset = OFFSET_NONE; // nasty hack to induce failure
559
587
  if(lp->doc_id <= doc_id) break;
560
588
  prev_offset = offset;
@@ -0,0 +1,404 @@
1
+ #include "test.h"
2
+ #include "segment.h"
3
+ #include "tokenizer.lex.h"
4
+ #include "query.h"
5
+ #include "index.h"
6
+
7
+ #define SEGMENT_PATH "/tmp/segment-test"
8
+
9
+ wp_error* setup(wp_segment* segment) {
10
+ RELAY_ERROR(wp_segment_delete(SEGMENT_PATH));
11
+ RELAY_ERROR(wp_segment_create(segment, SEGMENT_PATH));
12
+ return NO_ERROR;
13
+ }
14
+
15
// Adds one posting for `word` at position `pos` in the "body" field of the
// current document. Expects `segment`, `positions`, `postings_bytes`,
// `success` and `doc_id` to be in scope at the call site. Wrapped in
// do/while(0) so the macro expands to a single statement and stays correct
// under an unbraced if/else.
#define ADD_DOC(word, pos) do { \
  positions[0] = (pos); \
  RELAY_ERROR(wp_segment_ensure_fit(segment, postings_bytes, 0, &success)); \
  if(success != 1) RAISE_ERROR("couldn't ensure segment fit"); \
  RELAY_ERROR(wp_segment_add_posting(segment, "body", (word), doc_id, 1, positions)); \
} while(0)
20
+
21
+ wp_error* add_docs(wp_segment* segment) {
22
+ docid_t doc_id;
23
+ pos_t positions[10];
24
+ uint32_t postings_bytes;
25
+ int success;
26
+
27
+ RELAY_ERROR(wp_segment_sizeof_posarray(segment, 1, NULL, &postings_bytes));
28
+
29
+ RELAY_ERROR(wp_segment_grab_docid(segment, &doc_id));
30
+ ADD_DOC("one", 0);
31
+ ADD_DOC("two", 1);
32
+ ADD_DOC("three", 2);
33
+
34
+ RELAY_ERROR(wp_segment_grab_docid(segment, &doc_id));
35
+ ADD_DOC("two", 0);
36
+ ADD_DOC("three", 1);
37
+ ADD_DOC("four", 2);
38
+
39
+ RELAY_ERROR(wp_segment_grab_docid(segment, &doc_id));
40
+ ADD_DOC("three", 0);
41
+ ADD_DOC("four", 1);
42
+ ADD_DOC("five", 2);
43
+
44
+ return NO_ERROR;
45
+ }
46
+
47
+ TEST(initial_state) {
48
+ wp_segment segment;
49
+ RELAY_ERROR(setup(&segment));
50
+
51
+ postings_region* pr = MMAP_OBJ(segment.postings, postings_region);
52
+ ASSERT(pr->num_docs == 0);
53
+ ASSERT(pr->num_postings == 0);
54
+
55
+ RELAY_ERROR(wp_segment_unload(&segment));
56
+ return NO_ERROR;
57
+ }
58
+
59
+ TEST(adding_a_doc_increments_counts) {
60
+ wp_segment segment;
61
+ pos_t positions[10];
62
+ docid_t doc_id;
63
+
64
+ RELAY_ERROR(setup(&segment));
65
+ RELAY_ERROR(wp_segment_grab_docid(&segment, &doc_id));
66
+
67
+ positions[0] = 0;
68
+ RELAY_ERROR(wp_segment_add_posting(&segment, "body", "hello", doc_id, 1, positions));
69
+ positions[0] = 1;
70
+ RELAY_ERROR(wp_segment_add_posting(&segment, "body", "there", doc_id, 1, positions));
71
+
72
+ postings_region* pr = MMAP_OBJ(segment.postings, postings_region);
73
+ ASSERT(pr->num_docs == 1);
74
+ ASSERT(pr->num_postings == 2);
75
+
76
+ RELAY_ERROR(wp_segment_unload(&segment));
77
+ return NO_ERROR;
78
+ }
79
+
80
// Runs `query` against the local `segment`, asking for at most 10 hits.
// Expects `segment`, `num_results` and `results` to be in scope at the call
// site. Search state is initialized before and released after the run.
// Wrapped in do/while(0) so the macro expands to a single statement.
#define RUN_QUERY(query) do { \
  RELAY_ERROR(wp_search_init_search_state((query), &segment)); \
  RELAY_ERROR(wp_search_run_query_on_segment((query), &segment, 10, &num_results, &results[0])); \
  RELAY_ERROR(wp_search_release_search_state((query))); \
} while(0)
84
+
85
+ TEST(simple_term_queries) {
86
+ wp_segment segment;
87
+ uint32_t num_results;
88
+ search_result results[10];
89
+ wp_query* query;
90
+
91
+ RELAY_ERROR(setup(&segment));
92
+ RELAY_ERROR(add_docs(&segment));
93
+
94
+ query = wp_query_new_term("body", "one");
95
+ RUN_QUERY(query);
96
+
97
+ ASSERT(num_results == 1);
98
+ ASSERT(results[0].doc_id == 1);
99
+
100
+ query = wp_query_new_term("body", "two");
101
+ RUN_QUERY(query);
102
+
103
+ ASSERT(num_results == 2);
104
+ ASSERT(results[0].doc_id == 2);
105
+ ASSERT(results[1].doc_id == 1);
106
+
107
+ RELAY_ERROR(wp_segment_unload(&segment));
108
+ return NO_ERROR;
109
+ }
110
+
111
+ TEST(simple_conjunctive_queries) {
112
+ wp_segment segment;
113
+ uint32_t num_results;
114
+ search_result results[10];
115
+ wp_query* query;
116
+
117
+ RELAY_ERROR(setup(&segment));
118
+ RELAY_ERROR(add_docs(&segment));
119
+
120
+ query = wp_query_new_conjunction();
121
+ query = wp_query_add(query, wp_query_new_term("body", "one"));
122
+ query = wp_query_add(query, wp_query_new_term("body", "two"));
123
+
124
+ RUN_QUERY(query);
125
+
126
+ ASSERT(num_results == 1);
127
+ ASSERT(results[0].doc_id == 1);
128
+
129
+ query = wp_query_new_conjunction();
130
+ query = wp_query_add(query, wp_query_new_term("body", "four"));
131
+ query = wp_query_add(query, wp_query_new_term("body", "two"));
132
+
133
+ RUN_QUERY(query);
134
+
135
+ ASSERT(num_results == 1);
136
+ ASSERT(results[0].doc_id == 2);
137
+
138
+ // <empty>
139
+ query = wp_query_new_conjunction();
140
+ RUN_QUERY(query);
141
+ ASSERT(num_results == 0);
142
+
143
+ RELAY_ERROR(wp_segment_unload(&segment));
144
+ return NO_ERROR;
145
+ }
146
+
147
+ TEST(simple_phrasal_queries) {
148
+ wp_segment segment;
149
+ uint32_t num_results;
150
+ search_result results[10];
151
+ wp_query* query;
152
+
153
+ RELAY_ERROR(setup(&segment));
154
+ RELAY_ERROR(add_docs(&segment));
155
+
156
+ query = wp_query_new_phrase();
157
+ query = wp_query_add(query, wp_query_new_term("body", "one"));
158
+ query = wp_query_add(query, wp_query_new_term("body", "two"));
159
+ RUN_QUERY(query);
160
+ ASSERT(num_results == 1);
161
+ ASSERT(results[0].doc_id == 1);
162
+
163
+ query = wp_query_new_phrase();
164
+ query = wp_query_add(query, wp_query_new_term("body", "two"));
165
+ query = wp_query_add(query, wp_query_new_term("body", "one"));
166
+ RUN_QUERY(query);
167
+ ASSERT(num_results == 0);
168
+
169
+ query = wp_query_new_phrase();
170
+ query = wp_query_add(query, wp_query_new_term("body", "two"));
171
+ query = wp_query_add(query, wp_query_new_term("body", "three"));
172
+ RUN_QUERY(query);
173
+ ASSERT(num_results == 2);
174
+ ASSERT(results[0].doc_id == 2);
175
+ ASSERT(results[1].doc_id == 1);
176
+
177
+ query = wp_query_new_phrase();
178
+ query = wp_query_add(query, wp_query_new_term("body", "one"));
179
+ query = wp_query_add(query, wp_query_new_term("body", "two"));
180
+ query = wp_query_add(query, wp_query_new_term("body", "three"));
181
+ RUN_QUERY(query);
182
+ ASSERT(num_results == 1);
183
+ ASSERT(results[0].doc_id == 1);
184
+
185
+ RELAY_ERROR(wp_segment_unload(&segment));
186
+ return NO_ERROR;
187
+ }
188
+
189
+ TEST(segment_conjuction_of_phrase_queries) {
190
+ wp_segment segment;
191
+ uint32_t num_results;
192
+ search_result results[10];
193
+ wp_query* query;
194
+ wp_query* subquery;
195
+
196
+ RELAY_ERROR(setup(&segment));
197
+ RELAY_ERROR(add_docs(&segment));
198
+
199
+ // one "two three"
200
+ subquery = wp_query_new_phrase();
201
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "two"));
202
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "three"));
203
+ query = wp_query_new_conjunction();
204
+ query = wp_query_add(query, wp_query_new_term("body", "one"));
205
+ query = wp_query_add(query, subquery);
206
+
207
+ RUN_QUERY(query);
208
+ ASSERT(num_results == 1);
209
+ ASSERT(results[0].doc_id == 1);
210
+
211
+ // "two three" one
212
+ subquery = wp_query_new_phrase();
213
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "two"));
214
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "three"));
215
+ query = wp_query_new_conjunction();
216
+ query = wp_query_add(query, subquery);
217
+ query = wp_query_add(query, wp_query_new_term("body", "one"));
218
+
219
+ RUN_QUERY(query);
220
+ ASSERT(num_results == 1);
221
+ ASSERT(results[0].doc_id == 1);
222
+
223
+ // one "three two"
224
+ subquery = wp_query_new_phrase();
225
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "three"));
226
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "two"));
227
+ query = wp_query_new_conjunction();
228
+ query = wp_query_add(query, wp_query_new_term("body", "one"));
229
+ query = wp_query_add(query, subquery);
230
+
231
+ RUN_QUERY(query);
232
+ ASSERT(num_results == 0);
233
+
234
+ // two "two three"
235
+ subquery = wp_query_new_phrase();
236
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "two"));
237
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "three"));
238
+ query = wp_query_new_conjunction();
239
+ query = wp_query_add(query, wp_query_new_term("body", "two"));
240
+ query = wp_query_add(query, subquery);
241
+
242
+ RUN_QUERY(query);
243
+ ASSERT(num_results == 2);
244
+ ASSERT(results[0].doc_id == 2);
245
+ ASSERT(results[1].doc_id == 1);
246
+
247
+ RELAY_ERROR(wp_segment_unload(&segment));
248
+ return NO_ERROR;
249
+ }
250
+
251
+ TEST(negation_queries) {
252
+ wp_segment segment;
253
+ uint32_t num_results;
254
+ search_result results[10];
255
+ wp_query* query;
256
+ wp_query* subquery;
257
+
258
+ RELAY_ERROR(setup(&segment));
259
+ RELAY_ERROR(add_docs(&segment));
260
+
261
+ // one "two three"
262
+ subquery = wp_query_new_phrase();
263
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "two"));
264
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "three"));
265
+ query = wp_query_new_conjunction();
266
+ query = wp_query_add(query, wp_query_new_term("body", "one"));
267
+ query = wp_query_add(query, subquery);
268
+
269
+ RUN_QUERY(query);
270
+ ASSERT(num_results == 1);
271
+ ASSERT(results[0].doc_id == 1);
272
+
273
+ // "two three" one
274
+ subquery = wp_query_new_phrase();
275
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "two"));
276
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "three"));
277
+ query = wp_query_new_conjunction();
278
+ query = wp_query_add(query, subquery);
279
+ query = wp_query_add(query, wp_query_new_term("body", "one"));
280
+
281
+ RUN_QUERY(query);
282
+ ASSERT(num_results == 1);
283
+ ASSERT(results[0].doc_id == 1);
284
+
285
+ // one "three two"
286
+ subquery = wp_query_new_phrase();
287
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "three"));
288
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "two"));
289
+ query = wp_query_new_conjunction();
290
+ query = wp_query_add(query, wp_query_new_term("body", "one"));
291
+ query = wp_query_add(query, subquery);
292
+
293
+ RUN_QUERY(query);
294
+ ASSERT(num_results == 0);
295
+
296
+ // two "two three"
297
+ subquery = wp_query_new_phrase();
298
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "two"));
299
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "three"));
300
+ query = wp_query_new_conjunction();
301
+ query = wp_query_add(query, wp_query_new_term("body", "two"));
302
+ query = wp_query_add(query, subquery);
303
+
304
+ RUN_QUERY(query);
305
+ ASSERT(num_results == 2);
306
+ ASSERT(results[0].doc_id == 2);
307
+ ASSERT(results[1].doc_id == 1);
308
+
309
+ // <empty>
310
+ query = wp_query_new_conjunction();
311
+ RUN_QUERY(query);
312
+ ASSERT(num_results == 0);
313
+
314
+ // -one
315
+ subquery = wp_query_new_term("body", "one");
316
+ query = wp_query_new_negation();
317
+ query = wp_query_add(query, subquery);
318
+ RUN_QUERY(query);
319
+ ASSERT(num_results == 2);
320
+ ASSERT(results[0].doc_id == 3);
321
+ ASSERT(results[1].doc_id == 2);
322
+
323
+ // -two
324
+ subquery = wp_query_new_term("body", "two");
325
+ query = wp_query_new_negation();
326
+ query = wp_query_add(query, subquery);
327
+ RUN_QUERY(query);
328
+ ASSERT(num_results == 1);
329
+ ASSERT(results[0].doc_id == 3);
330
+
331
+ // -three
332
+ subquery = wp_query_new_term("body", "three");
333
+ query = wp_query_new_negation();
334
+ query = wp_query_add(query, subquery);
335
+ RUN_QUERY(query);
336
+ ASSERT(num_results == 0);
337
+
338
+ // -potato
339
+ subquery = wp_query_new_term("body", "potato");
340
+ query = wp_query_new_negation();
341
+ query = wp_query_add(query, subquery);
342
+ RUN_QUERY(query);
343
+ ASSERT(num_results == 3);
344
+ ASSERT(results[0].doc_id == 3);
345
+ ASSERT(results[1].doc_id == 2);
346
+ ASSERT(results[2].doc_id == 1);
347
+
348
+ // -"one two"
349
+ subquery = wp_query_new_conjunction();
350
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "one"));
351
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "two"));
352
+ query = wp_query_new_negation();
353
+ query = wp_query_add(query, subquery);
354
+ RUN_QUERY(query);
355
+ ASSERT(num_results == 2);
356
+ ASSERT(results[0].doc_id == 3);
357
+ ASSERT(results[1].doc_id == 2);
358
+
359
+ // -(AND one three)
360
+ subquery = wp_query_new_conjunction();
361
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "one"));
362
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "three"));
363
+ query = wp_query_new_negation();
364
+ query = wp_query_add(query, subquery);
365
+ RUN_QUERY(query);
366
+ ASSERT(num_results == 2);
367
+ ASSERT(results[0].doc_id == 3);
368
+ ASSERT(results[1].doc_id == 2);
369
+
370
+ // -"one three"
371
+ subquery = wp_query_new_phrase();
372
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "one"));
373
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "three"));
374
+ query = wp_query_new_negation();
375
+ query = wp_query_add(query, subquery);
376
+ RUN_QUERY(query);
377
+ ASSERT(num_results == 3);
378
+
379
+ // (AND -one three)
380
+ subquery = wp_query_new_negation();
381
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "one"));
382
+ query = wp_query_new_conjunction();
383
+ query = wp_query_add(query, subquery);
384
+ query = wp_query_add(query, wp_query_new_term("body", "three"));
385
+ RUN_QUERY(query);
386
+ ASSERT(num_results == 2);
387
+ ASSERT(results[0].doc_id == 3);
388
+ ASSERT(results[1].doc_id == 2);
389
+
390
+ // (AND three -one)
391
+ subquery = wp_query_new_negation();
392
+ subquery = wp_query_add(subquery, wp_query_new_term("body", "one"));
393
+ query = wp_query_new_conjunction();
394
+ query = wp_query_add(query, wp_query_new_term("body", "three"));
395
+ query = wp_query_add(query, subquery);
396
+ RUN_QUERY(query);
397
+ ASSERT(num_results == 2);
398
+ ASSERT(results[0].doc_id == 3);
399
+ ASSERT(results[1].doc_id == 2);
400
+
401
+ RELAY_ERROR(wp_segment_unload(&segment));
402
+ return NO_ERROR;
403
+ }
404
+
@@ -0,0 +1,82 @@
1
+ #include <string.h>
2
+ #include "stringmap.h"
3
+ #include "error.h"
4
+ #include "test.h"
5
+
6
+ static stringmap* setup() {
7
+ stringpool* p = malloc(stringpool_initial_size());
8
+ stringpool_init(p);
9
+ stringmap* q = malloc(stringmap_initial_size());
10
+ stringmap_init(q, p);
11
+ return q;
12
+ }
13
+
14
+ TEST(stringmap_initial_state) {
15
+ stringmap* q = setup();
16
+ ASSERT(q->n_occupied == 0);
17
+ ASSERT(!stringmap_needs_bump(q));
18
+
19
+ free(q);
20
+ return NO_ERROR;
21
+ }
22
+
23
+ TEST(stringmap_lookups_on_empty) {
24
+ stringmap* q = setup();
25
+
26
+ ASSERT(stringmap_string_to_int(q, "hot potato") == (uint32_t)-1);
27
+ ASSERT(stringmap_int_to_string(q, 0) == NULL);
28
+ ASSERT(stringmap_int_to_string(q, 1234) == NULL);
29
+
30
+ free(q);
31
+ return NO_ERROR;
32
+ }
33
+
34
+ TEST(stringmap_multiple_adds) {
35
+ stringmap* q = setup();
36
+
37
+ ASSERT(stringmap_string_to_int(q, "hot potato") == (uint32_t)-1);
38
+ uint32_t x, y;
39
+ RELAY_ERROR(stringmap_add(q, "hot potato", &x));
40
+ ASSERT(x != (uint32_t)-1);
41
+ RELAY_ERROR(stringmap_add(q, "hot potato", &y));
42
+ ASSERT(y != (uint32_t)-1);
43
+ ASSERT(x == y);
44
+
45
+ free(q);
46
+ return NO_ERROR;
47
+ }
48
+
49
+ TEST(stringmap_hashing_is_preserved) {
50
+ stringmap* q = setup();
51
+
52
+ uint32_t x, y;
53
+ RELAY_ERROR(stringmap_add(q, "hello there", &x));
54
+ ASSERT(x != (uint32_t)-1);
55
+ const char* a = stringmap_int_to_string(q, x);
56
+ ASSERT(strcmp(a, "hello there") == 0);
57
+
58
+ RELAY_ERROR(stringmap_add(q, "how are you?", &y));
59
+ const char* b = stringmap_int_to_string(q, y);
60
+ ASSERT(strcmp(b, "how are you?") == 0);
61
+
62
+ ASSERT(x != y);
63
+
64
+ free(q);
65
+ return NO_ERROR;
66
+ }
67
+
68
+ TEST(stringmap_detects_out_of_room) {
69
+ stringmap* q = setup();
70
+
71
+ uint32_t x, y, z, w;
72
+ RELAY_ERROR(stringmap_add(q, "one", &x));
73
+ RELAY_ERROR(stringmap_add(q, "two", &y));
74
+ RELAY_ERROR(stringmap_add(q, "three", &z));
75
+
76
+ wp_error* e = stringmap_add(q, "four", &w);
77
+ ASSERT(e != NULL);
78
+ wp_error_free(e);
79
+
80
+ free(q);
81
+ return NO_ERROR;
82
+ }
@@ -0,0 +1,67 @@
1
+ #include <string.h>
2
+ #include "stringpool.h"
3
+ #include "error.h"
4
+ #include "test.h"
5
+
6
+ TEST(stringpool_initial_state) {
7
+ stringpool* p = malloc(stringpool_initial_size());
8
+ stringpool_init(p);
9
+
10
+ ASSERT(!stringpool_needs_bump(p));
11
+
12
+ free(p);
13
+ return NO_ERROR;
14
+ }
15
+
16
+ TEST(stringpool_add_gives_unique_ids) {
17
+ stringpool* p = malloc(stringpool_initial_size());
18
+ stringpool_init(p);
19
+
20
+ uint32_t ret1 = stringpool_add(p, "potato");
21
+ ASSERT(ret1 > 0);
22
+
23
+ uint32_t ret2 = stringpool_add(p, "monkey");
24
+ ASSERT(ret2 > 0);
25
+
26
+ ASSERT(ret1 != ret2);
27
+
28
+ free(p);
29
+ return NO_ERROR;
30
+ }
31
+
32
+ TEST(stringpool_add_gives_ids_that_lookup_returns) {
33
+ stringpool* p = malloc(stringpool_initial_size());
34
+ stringpool_init(p);
35
+
36
+ uint32_t ret;
37
+ char* s;
38
+
39
+ ret = stringpool_add(p, "potato");
40
+ s = stringpool_lookup(p, ret);
41
+ ASSERT(!strcmp(s, "potato"));
42
+
43
+ ret = stringpool_add(p, "monkey");
44
+ s = stringpool_lookup(p, ret);
45
+ ASSERT(!strcmp(s, "monkey"));
46
+
47
+ free(p);
48
+ return NO_ERROR;
49
+ }
50
+
51
+ TEST(stringpool_detects_out_of_room) {
52
+ stringpool* p = malloc(stringpool_initial_size());
53
+ stringpool_init(p);
54
+
55
+ uint32_t ret;
56
+ int times = stringpool_initial_size() / 6;
57
+ for(int i = 0; i < times - 1; i++) {
58
+ ret = stringpool_add(p, "12345");
59
+ ASSERT(ret != (uint32_t)-1);
60
+ }
61
+
62
+ ret = stringpool_add(p, "12345");
63
+ ASSERT(ret == (uint32_t)-1);
64
+
65
+ return NO_ERROR;
66
+ }
67
+
@@ -0,0 +1,95 @@
1
+ #include "termhash.h"
2
+ #include "test.h"
3
+ #include "error.h"
4
+
5
+ TEST(termhash_initial_state) {
6
+ termhash* h = malloc(termhash_initial_size());
7
+ termhash_init(h);
8
+
9
+ ASSERT(h->n_occupied == 0);
10
+ //ASSERT(!termhash_getting_full(h));
11
+
12
+ free(h);
13
+ return NO_ERROR;
14
+ }
15
+
16
+ TEST(termhash_lookups_on_empty) {
17
+ termhash* h = malloc(termhash_initial_size());
18
+ termhash_init(h);
19
+
20
+ term t1 = {0, 0};
21
+ term t2 = {10, 20};
22
+ term t3 = {123, 345};
23
+
24
+ ASSERT(termhash_get_val(h, t1) == (uint32_t)-1);
25
+ ASSERT(termhash_get_val(h, t2) == (uint32_t)-1);
26
+ ASSERT(termhash_get_val(h, t3) == (uint32_t)-1);
27
+
28
+ free(h);
29
+ return NO_ERROR;
30
+ }
31
+
32
+ TEST(termhash_overwriting) {
33
+ termhash* h = malloc(termhash_initial_size());
34
+ termhash_init(h);
35
+
36
+ term t1 = {5, 11};
37
+
38
+ ASSERT(termhash_get_val(h, t1) == (uint32_t)-1);
39
+ RELAY_ERROR(termhash_put_val(h, t1, 1234));
40
+ ASSERT(termhash_get_val(h, t1) == 1234);
41
+
42
+ RELAY_ERROR(termhash_put_val(h, t1, 2345));
43
+ ASSERT(termhash_get_val(h, t1) == 2345);
44
+
45
+ RELAY_ERROR(termhash_put_val(h, t1, 1));
46
+ ASSERT(termhash_get_val(h, t1) == 1);
47
+
48
+ free(h);
49
+ return NO_ERROR;
50
+ }
51
+
52
+ TEST(termhash_many_puts) { // try and force a resize
53
+ termhash* h = malloc(termhash_initial_size());
54
+ termhash_init(h);
55
+
56
+ term t1 = {1, 0};
57
+
58
+ for(int i = 1; i < 100; i++) {
59
+ t1.word_s = i;
60
+ RELAY_ERROR(termhash_put_val(h, t1, 1000 + i));
61
+ if(termhash_needs_bump(h)) {
62
+ h = realloc(h, termhash_next_size(h));
63
+ if(h == NULL) RAISE_SYSERROR("realloc");
64
+ termhash_setup(h);
65
+ RELAY_ERROR(termhash_bump_size(h));
66
+ }
67
+ }
68
+
69
+ t1.word_s = 55;
70
+ uint32_t v = termhash_get_val(h, t1);
71
+ ASSERT(v == 1055);
72
+
73
+ free(h);
74
+ return NO_ERROR;
75
+ }
76
+
77
+ TEST(termhash_detects_out_of_room) {
78
+ termhash* h = malloc(termhash_initial_size());
79
+ termhash_init(h);
80
+
81
+ term t = {1, 0};
82
+
83
+ for(int i = 0; i < 3; i++) {
84
+ t.word_s = i;
85
+ RELAY_ERROR(termhash_put_val(h, t, 100 + i));
86
+ }
87
+
88
+ t.word_s = 999;
89
+ wp_error* e = termhash_put_val(h, t, 999);
90
+ ASSERT(e != NULL);
91
+ wp_error_free(e);
92
+
93
+ free(h);
94
+ return NO_ERROR;
95
+ }
@@ -0,0 +1,55 @@
1
+ #include "test.h"
2
+ #include "tokenizer.lex.h"
3
+
4
// Pulls the next token from the local `scanner` and asserts that it is a
// TOK_WORD whose text equals `word`. Wrapped in do/while(0) so the macro
// expands to a single statement.
#define ASSERT_NEXT_WORD(word) do { \
  int token_type = yylex(scanner); \
  ASSERT(token_type == TOK_WORD); \
  ASSERT(!strcmp(word, yyget_text(scanner))); \
} while(0)

// Pulls the next token from the local `scanner` and asserts that the input is
// exhausted (TOK_DONE).
#define ASSERT_DONE do { \
  int token_type = yylex(scanner); \
  ASSERT(token_type == TOK_DONE); \
} while(0)
14
+
15
+ TEST(tokenizes_easy_words) {
16
+ yyscan_t scanner;
17
+ lexinfo charpos = {0, 0};
18
+
19
+ yylex_init_extra(&charpos, &scanner);
20
+
21
+ const char* string = "i love mice";
22
+ YY_BUFFER_STATE state = yy_scan_string(string, scanner);
23
+
24
+ ASSERT_NEXT_WORD("i");
25
+ ASSERT_NEXT_WORD("love");
26
+ ASSERT_NEXT_WORD("mice");
27
+ ASSERT_DONE;
28
+
29
+ yy_delete_buffer(state, scanner);
30
+ yylex_destroy(scanner);
31
+
32
+ return NO_ERROR;
33
+ }
34
+
35
+ TEST(strips_trailing_punctuation) {
36
+ yyscan_t scanner;
37
+ lexinfo charpos = {0, 0};
38
+
39
+ yylex_init_extra(&charpos, &scanner);
40
+
41
+ const char* string = "hey! this: you're <cool>";
42
+ YY_BUFFER_STATE state = yy_scan_string(string, scanner);
43
+
44
+ ASSERT_NEXT_WORD("hey");
45
+ ASSERT_NEXT_WORD("this");
46
+ ASSERT_NEXT_WORD("you're");
47
+ ASSERT_NEXT_WORD("cool");
48
+ ASSERT_DONE;
49
+
50
+ yy_delete_buffer(state, scanner);
51
+ yylex_destroy(scanner);
52
+
53
+ return NO_ERROR;
54
+ }
55
+
@@ -0,0 +1,38 @@
1
#ifndef WP_TEST_H_
#define WP_TEST_H_

// whistlepig test header file
// (c) 2011 William Morgan. See COPYING for license terms.
//
// macros for the c unit tests

#include <stdio.h> // printf, used by ASSERT and RUNTEST

// Checks condition x inside a TEST body. Every call bumps *asserts; when x is
// false it prints the failed expression with function, file and line, sets
// *fail and returns NO_ERROR from the enclosing test. Uses the standard C99
// __func__ rather than the compiler-specific __PRETTY_FUNCTION__.
#define ASSERT(x) do { \
  (*asserts)++; \
  if(!(x)) { \
    printf("-- test failure: (" #x ") is FALSE in %s (%s:%d)\n\n", __func__, __FILE__, __LINE__); \
    *fail = 1; \
    return NO_ERROR; \
  } \
} while(0)

// Declares the test function test_<x>. `fail` and `asserts` are the output
// slots that ASSERT writes to.
#define TEST(x) wp_error* test_##x(int* fail, int* asserts)

// Runs test_<x> and reports PASS / FAIL / ERR on stdout. Expects the counters
// `tests`, `asserts`, `failures` and `errors` to be in scope at the call
// site. A failed assertion takes precedence over a returned error.
#define RUNTEST(x) do { \
  int fail = 0; \
  int this_asserts = 0; \
  tests++; \
  wp_error* err = test_##x(&fail, &this_asserts); \
  asserts += this_asserts; \
  if(fail) { \
    printf("FAIL " #x "\n"); \
    failures++; \
  } \
  else if(err) { \
    errors++; \
    printf(" ERR " #x "\n"); \
    PRINT_ERROR(err, stdout); \
  } \
  else printf("PASS %d/%d " #x "\n", this_asserts, this_asserts); \
} while(0)

#endif
@@ -0,0 +1,28 @@
1
#ifndef WP_TIMER_H_
#define WP_TIMER_H_

// whistlepig timer header file
// (c) 2011 William Morgan. See COPYING for license terms.
//
// just some timer macros

#include <sys/time.h>

// Declares the storage for timer `name`: start and end timestamps plus the
// elapsed time in milliseconds.
#define TIMER(name) \
  struct timeval name##_startt, name##_endt; \
  long name##_elapsed;

// Declares timer `name` and records its start time.
#define START_TIMER(name) \
  TIMER(name) \
  gettimeofday(&name##_startt, NULL);

// Restarts an already declared timer.
#define RESET_TIMER(name) gettimeofday(&name##_startt, NULL);

// Records the end time and stores the elapsed milliseconds since the start
// (or last reset) of the timer.
#define MARK_TIMER(name) \
  gettimeofday(&name##_endt, NULL); \
  name##_elapsed = ((name##_endt.tv_sec - name##_startt.tv_sec) * 1000) + ((name##_endt.tv_usec - name##_startt.tv_usec) / 1000);

// Elapsed milliseconds as of the most recent MARK_TIMER.
#define TIMER_MS(name) name##_elapsed

#endif
@@ -367,6 +367,28 @@ static VALUE query_clone(VALUE self) {
367
367
  return o_query;
368
368
  }
369
369
 
370
+ static const char* yielding_substituter(const char* field, const char* term) {
371
+ VALUE result = rb_yield_values(2, rb_str_new2(field), rb_str_new2(term));
372
+ if(NIL_P(result)) return strdup(term);
373
+ else return strdup(RSTRING_PTR(result));
374
+ }
375
+
376
+ /*
377
+ * Returns a new query that's the result of applying the block to each
378
+ * word in the query. Useful for transforming queries programmatically
379
+ * after they've been parsed.
380
+ *
381
+ */
382
+ static VALUE query_map_terms(VALUE self) {
383
+ char buf[1024];
384
+
385
+ wp_query* query; Data_Get_Struct(self, wp_query, query);
386
+ wp_query* result = wp_query_substitute(query, yielding_substituter);
387
+
388
+ VALUE o_query = Data_Wrap_Struct(c_query, NULL, wp_query_free, result);
389
+ return o_query;
390
+ }
391
+
370
392
  /*
371
393
  * call-seq: and(other)
372
394
  *
@@ -504,7 +526,7 @@ static VALUE index_run_query(VALUE self, VALUE v_query, VALUE v_max_num_results)
504
526
  return array;
505
527
  }
506
528
 
507
- void Init_whistlepigc() {
529
+ void Init_whistlepig() {
508
530
  VALUE m_whistlepig;
509
531
 
510
532
  m_whistlepig = rb_define_module("Whistlepig");
@@ -541,6 +563,7 @@ void Init_whistlepigc() {
541
563
  rb_define_method(c_query, "or", query_or, 1);
542
564
  rb_define_method(c_query, "to_s", query_to_s, 0);
543
565
  rb_define_method(c_query, "clone", query_clone, 0);
566
+ rb_define_method(c_query, "term_map", query_map_terms, 0);
544
567
  rb_define_attr(c_query, "query", 1, 0);
545
568
 
546
569
  c_error = rb_define_class_under(m_whistlepig, "Error", rb_eStandardError);
data/lib/whistlepig.rb CHANGED
@@ -1,4 +1,4 @@
1
- require "whistlepigc"
1
+ require "whistlepig/whistlepig"
2
2
 
3
3
  module Whistlepig
4
4
  ## A full-text index. You can add entries to it, and you can run queries
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: whistlepig
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.7'
4
+ version: '0.8'
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2011-06-19 13:10:35.000000000 -07:00
12
+ date: 2012-03-13 14:35:03.000000000 -07:00
13
13
  default_executable:
14
14
  dependencies: []
15
15
  description: Whistlepig is a minimalist realtime full-text search index. Its goal
@@ -22,43 +22,51 @@ extensions:
22
22
  - ext/whistlepig/extconf.rb
23
23
  extra_rdoc_files:
24
24
  - README
25
- - ext/whistlepig/whistlepigc.c
25
+ - ext/whistlepig/whistlepig.c
26
26
  files:
27
27
  - README
28
28
  - ext/whistlepig/extconf.rb
29
29
  - lib/whistlepig.rb
30
30
  - ext/whistlepig/query-parser.lex.h
31
31
  - ext/whistlepig/entry.h
32
- - ext/whistlepig/whistlepigc.c
33
32
  - ext/whistlepig/stringmap.c
34
33
  - ext/whistlepig/tokenizer.lex.h
35
34
  - ext/whistlepig/whistlepig.h
36
35
  - ext/whistlepig/error.c
37
36
  - ext/whistlepig/extconf.h
38
37
  - ext/whistlepig/stringmap.h
38
+ - ext/whistlepig/timer.h
39
39
  - ext/whistlepig/query-parser.lex.c
40
40
  - ext/whistlepig/defaults.h
41
41
  - ext/whistlepig/tokenizer.lex.c
42
+ - ext/whistlepig/test-stringpool.c
42
43
  - ext/whistlepig/termhash.c
43
44
  - ext/whistlepig/query-parser.h
44
45
  - ext/whistlepig/index.c
45
46
  - ext/whistlepig/stringpool.c
47
+ - ext/whistlepig/test-termhash.c
46
48
  - ext/whistlepig/query.h
47
49
  - ext/whistlepig/query-parser.c
48
50
  - ext/whistlepig/stringpool.h
49
51
  - ext/whistlepig/mmap-obj.c
52
+ - ext/whistlepig/whistlepig.c
50
53
  - ext/whistlepig/search.c
51
54
  - ext/whistlepig/termhash.h
52
55
  - ext/whistlepig/query.c
56
+ - ext/whistlepig/dump.c
53
57
  - ext/whistlepig/query-parser.tab.h
58
+ - ext/whistlepig/test.h
54
59
  - ext/whistlepig/khash.h
55
60
  - ext/whistlepig/query-parser.tab.c
56
61
  - ext/whistlepig/entry.c
57
62
  - ext/whistlepig/index.h
58
63
  - ext/whistlepig/segment.h
64
+ - ext/whistlepig/test-stringmap.c
59
65
  - ext/whistlepig/mmap-obj.h
60
66
  - ext/whistlepig/segment.c
67
+ - ext/whistlepig/test-segment.c
61
68
  - ext/whistlepig/search.h
69
+ - ext/whistlepig/test-tokenizer.c
62
70
  - ext/whistlepig/error.h
63
71
  has_rdoc: true
64
72
  homepage: http://masanjin.net/whistlepig