whistlepig 0.1

Sign up to get free protection for your applications and to get access to all the features.
data/README ADDED
@@ -0,0 +1,86 @@
1
+ = Whistlepig
2
+
3
+ Whistlepig is a minimalist realtime full-text search index. Its goal is to be
4
+ as small and feature-free as possible, while still remaining useful, performant
5
+ and scalable to large corpora. If you want realtime full-text search without
6
+ the frills, Whistlepig may be for you.
7
+
8
+ Whistlepig is written in ANSI C99. It currently provides a C API and Ruby
9
+ bindings.
10
+
11
+ Latest version: 0.1, released 2010-02-08.
12
+ Status: alpha
13
+ News: http://all-thing.net/label/whistlepig/
14
+ Homepage: http://masanjin.net/whistlepig/
15
+
16
+ = Getting it
17
+
18
+ Tarball: http://masanjin.net/whistlepig/whistlepig-0.1.tar.gz
19
+ Rubygem: gem install whistlepig
20
+ Git: git clone git://github.com/wmorgan/whistlepig.git
21
+
22
+ = Realtime search
23
+
24
+ Roughly speaking, realtime search means:
25
+ - documents are available to queries immediately after indexing, without
26
+ any further index merging steps; and
27
+ - later documents are more important than earlier documents.
28
+
29
+ Whistlepig takes these principles to an extreme. In particular:
30
+ - It only returns documents in the reverse of the order in which they were added
31
+ (i.e. LIFO order), and performs no ranking, reordering, or scoring.
32
+ - It only supports incremental indexing. There is no notion of batch indexing
33
+ or index merging.
34
+ - It does not support document deletion or modification (except in the
35
+ special case of labels; see below).
36
+ - It only supports in-memory indexes.
37
+
38
+ Features that Whistlepig does provide:
39
+ - Incremental indexing. Updates to the index are immediately available to
40
+ readers.
41
+ - Fielded terms with arbitrary fields.
42
+ - A full query language and parser with conjunctions, disjunctions, phrases,
43
+ negations, grouping, and nesting.
44
+ - Labels: arbitrary tokens which can be added to and removed from documents
45
+ at any point, and incorporated into search queries. (This is the only
46
+ mutable aspect of a document once it has been indexed.)
47
+ - Early query termination.
48
+ - Resumable queries.
49
+ - A tiny, < 3 KLOC ANSI C99 implementation.
50
+
51
+ == Synopsis (using Ruby bindings)
52
+
53
+ require 'rubygems'
54
+ require 'whistlepig'
55
+
56
+ include Whistlepig
57
+
58
+ index = Index.new "index"
59
+
60
+ entry1 = Entry.new
61
+ entry1.add_string "body", "hello there bob"
62
+ docid1 = index.add_entry entry1 # => 1
63
+
64
+ entry2 = Entry.new
65
+ entry2.add_string "body", "goodbye bob"
66
+ docid2 = index.add_entry entry2 # => 2
67
+
68
+ q1 = Query.new "body", "bob"
69
+ results1 = index.search q1 # => [2, 1]
70
+
71
+ q2 = q1.and Query.new("body", "hello")
72
+ results2 = index.search q2 # => [1]
73
+
74
+ index.add_label docid2, "funny"
75
+
76
+ q3 = Query.new "body", "bob ~funny"
77
+ results3 = index.search q3 # => [2]
78
+
79
+ == A note on concurrency:
80
+
81
+ Whistlepig is currently single-process and single-thread only. However, it is
82
+ built with multi-process access in mind. Per-segment single-writer,
83
+ multi-reader support is planned in the near future. Multi-writer support can be
84
+ accomplished via index striping and is planned for the distant future.
85
+
86
+ Please send bug reports and comments to: wmorgan-whistlepig-design@masanjin.net.
@@ -0,0 +1,28 @@
1
+ #ifndef WP_DEFAULTS_H_
2
+ #define WP_DEFAULTS_H_
3
+
4
+ // whistlepig defaults
5
+ // (c) 2011 William Morgan. See COPYING for license terms.
6
+ //
7
+ // just some generic definitions that we use in many places.
8
+
9
+ #include <stdint.h>
10
+
11
+ // these two types are segment-specific. an index as a whole uses a larger
12
+ // datatype for docids, and doesn't do anything with positions. but we
13
+ // refer to them all over the place, so it's convenient to break them out
14
+ // here rather than in segment.h.
15
+ typedef uint32_t docid_t;
16
+ typedef uint32_t pos_t; // position of a term within a document
17
+
18
// DEBUG(fmt, ...) is a printf-style trace statement. without DEBUGOUTPUT it
// expands to an empty statement; with DEBUGOUTPUT defined, every call prints
// the file, line and enclosing function followed by the formatted message to
// stdout.
#ifndef DEBUGOUTPUT
#define DEBUG(fmt, ...) do { } while(0)
#else
#define DEBUG(fmt, ...) do { \
  fprintf(stdout, "DEBUG %s:%d (%s): " fmt "\n", __FILE__, __LINE__, __PRETTY_FUNCTION__, ## __VA_ARGS__); \
} while(0)
#endif
27
+
28
+ #endif
@@ -0,0 +1,181 @@
1
+ #include "whistlepig.h"
2
+ #include "tokenizer.lex.h"
3
+
4
+ static posarray* posarray_new(pos_t i) {
5
+ posarray* ret = malloc(sizeof(posarray));
6
+ ret->data = malloc(sizeof(pos_t));
7
+ ret->data[0] = i;
8
+ ret->size = ret->next = 1;
9
+ return ret;
10
+ }
11
+
12
+ static void posarray_free(posarray* p) {
13
+ free(p->data);
14
+ free(p);
15
+ }
16
+
17
+ static void posarray_add(posarray* p, pos_t a) {
18
+ while(p->next >= p->size) {
19
+ p->size *= 2;
20
+ p->data = realloc(p->data, p->size * sizeof(pos_t));
21
+ }
22
+ p->data[p->next++] = a;
23
+ }
24
+
25
+ static inline pos_t posarray_get(posarray* p, int i) { return p->data[i]; }
26
+
27
+ static inline khint_t khash_hash_string(const char *s) {
28
+ khint_t h = *s;
29
+ if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s;
30
+ return h;
31
+ }
32
+
33
+ inline khint_t fielded_term_hash(fielded_term ft) {
34
+ return khash_hash_string(ft.field) ^ khash_hash_string(ft.term);
35
+ }
36
+
37
+ inline khint_t fielded_term_equals(fielded_term a, fielded_term b) {
38
+ return (strcmp(a.field, b.field) == 0) && (strcmp(a.term, b.term) == 0);
39
+ }
40
+
41
+ wp_entry* wp_entry_new() {
42
+ wp_entry* ret = malloc(sizeof(wp_entry));
43
+ ret->entries = kh_init(entries);
44
+ ret->next_offset = 0;
45
+
46
+ return ret;
47
+ }
48
+
49
+ RAISING_STATIC(add_token(wp_entry* entry, const char* field, const char* term, int field_len, int term_len)) {
50
+ fielded_term ft;
51
+ int status;
52
+
53
+ // copy field and term
54
+ ft.field = calloc(field_len + 1, sizeof(char));
55
+ strncpy(ft.field, field, field_len);
56
+
57
+ ft.term = calloc(term_len + 1, sizeof(char));
58
+ strncpy(ft.term, term, term_len);
59
+
60
+ khiter_t k = kh_put(entries, entry->entries, ft, &status);
61
+ if(status == 1) { // not found
62
+ kh_value(entry->entries, k) = posarray_new(entry->next_offset);
63
+ }
64
+ else { // just add the next offset to the array
65
+ posarray_add(kh_value(entry->entries, k), entry->next_offset);
66
+
67
+ // don't need these guys any more
68
+ free(ft.field);
69
+ free(ft.term);
70
+ }
71
+
72
+ entry->next_offset++;
73
+
74
+ return NO_ERROR;
75
+ }
76
+
77
+ uint32_t wp_entry_size(wp_entry* entry) {
78
+ uint32_t ret = 0;
79
+
80
+ for(khiter_t i = kh_begin(entry->entries); i < kh_end(entry->entries); i++) {
81
+ if(kh_exist(entry->entries, i)) {
82
+ posarray* positions = kh_val(entry->entries, i);
83
+ ret += positions->next;
84
+ }
85
+ }
86
+
87
+ return ret;
88
+ }
89
+
90
+ RAISING_STATIC(add_from_lexer(wp_entry* entry, yyscan_t* scanner, const char* field)) {
91
+ int token_type;
92
+ int field_len = strlen(field);
93
+
94
+ do {
95
+ token_type = yylex(*scanner);
96
+ if(token_type == TOK_WORD) {
97
+ RELAY_ERROR(add_token(entry, field, yyget_text(*scanner), field_len, yyget_leng(*scanner)));
98
+ }
99
+ } while(token_type != TOK_DONE);
100
+
101
+ return NO_ERROR;
102
+ }
103
+
104
+ wp_error* wp_entry_add_token(wp_entry* entry, const char* field, const char* term) {
105
+ RELAY_ERROR(add_token(entry, field, term, strlen(field), strlen(term)));
106
+
107
+ return NO_ERROR;
108
+ }
109
+
110
+ // tokenizes and adds everything under a single field
111
+ wp_error* wp_entry_add_string(wp_entry* entry, const char* field, const char* string) {
112
+ yyscan_t scanner;
113
+ lexinfo charpos = {0, 0};
114
+
115
+ yylex_init_extra(&charpos, &scanner);
116
+ YY_BUFFER_STATE state = yy_scan_string(string, scanner);
117
+ RELAY_ERROR(add_from_lexer(entry, &scanner, field));
118
+ yy_delete_buffer(state, scanner);
119
+ yylex_destroy(scanner);
120
+
121
+ return NO_ERROR;
122
+ }
123
+
124
+ // tokenizes and adds everything from a file under a single field
125
+ wp_error* wp_entry_add_file(wp_entry* entry, const char* field, FILE* f) {
126
+ yyscan_t scanner;
127
+ lexinfo charpos = {0, 0};
128
+
129
+ yylex_init_extra(&charpos, &scanner);
130
+ yyset_in(f, scanner);
131
+ RELAY_ERROR(add_from_lexer(entry, &scanner, field));
132
+ yylex_destroy(scanner);
133
+
134
+ return NO_ERROR;
135
+ }
136
+
137
+ wp_error* wp_entry_write_to_segment(wp_entry* entry, wp_segment* seg, docid_t doc_id) {
138
+ for(khiter_t i = kh_begin(entry->entries); i < kh_end(entry->entries); i++) {
139
+ if(kh_exist(entry->entries, i)) {
140
+ fielded_term ft = kh_key(entry->entries, i);
141
+ posarray* positions = kh_val(entry->entries, i);
142
+ RELAY_ERROR(wp_segment_add_posting(seg, ft.field, ft.term, doc_id, positions->next, positions->data));
143
+ }
144
+ }
145
+
146
+ return NO_ERROR;
147
+ }
148
+
149
+ // currently this is a crazy overestimate (it's calculating the size without
150
+ // VBE) but that's fine. as long as we're not an underestimate, we should be ok.
151
+ wp_error* wp_entry_sizeof_postings_region(wp_entry* entry, wp_segment* seg, uint32_t* size) {
152
+ *size = 0;
153
+ for(khiter_t i = kh_begin(entry->entries); i < kh_end(entry->entries); i++) {
154
+ if(kh_exist(entry->entries, i)) {
155
+ posarray* positions = kh_val(entry->entries, i);
156
+
157
+ uint32_t this_size;
158
+ RELAY_ERROR(wp_segment_sizeof_posarray(seg, positions->next, positions->data, &this_size));
159
+ *size += this_size;
160
+ }
161
+ }
162
+
163
+ return NO_ERROR;
164
+ }
165
+
166
+ wp_error* wp_entry_free(wp_entry* entry) {
167
+ for(khiter_t k = kh_begin(entry->entries); k < kh_end(entry->entries); k++) {
168
+ if(kh_exist(entry->entries, k)) {
169
+ fielded_term ft = kh_key(entry->entries, k);
170
+ posarray* positions = kh_val(entry->entries, k);
171
+ free(ft.term);
172
+ free(ft.field);
173
+ posarray_free(positions);
174
+ }
175
+ }
176
+
177
+ kh_destroy(entries, entry->entries);
178
+ free(entry);
179
+
180
+ return NO_ERROR;
181
+ }
@@ -0,0 +1,66 @@
1
+ #ifndef WP_ENTRY_H_
2
+ #define WP_ENTRY_H_
3
+
4
+ // whistlepig entry
5
+ // (c) 2011 William Morgan. See COPYING for license terms.
6
+ //
7
+ // an entry is a document before being added to the index. it's nothing more
8
+ // than a map of (field,term) pairs to a (sorted) list of positions. you can
9
+ // use this to incrementally build up a document in memory before adding it to
10
+ // the index.
11
+
12
+ #include "defaults.h"
13
+ #include "error.h"
14
+ #include "segment.h"
15
+ #include "khash.h"
16
+
17
+ typedef struct posarray {
18
+ uint16_t size;
19
+ uint16_t next;
20
+ pos_t* data;
21
+ } posarray;
22
+
23
+ typedef struct fielded_term {
24
+ char* field;
25
+ char* term;
26
+ } fielded_term;
27
+
28
+ khint_t fielded_term_hash(fielded_term ft);
29
+ khint_t fielded_term_equals(fielded_term a, fielded_term b);
30
+
31
+ KHASH_INIT(entries, fielded_term, posarray*, 1, fielded_term_hash, fielded_term_equals);
32
+
33
+ typedef struct wp_entry {
34
+ khash_t(entries)* entries;
35
+ pos_t next_offset;
36
+ } wp_entry;
37
+
38
+ struct wp_segment;
39
+
40
+ // API methods
41
+
42
+ // public: make a new entry
43
+ wp_entry* wp_entry_new();
44
+
45
+ // public: return the number of token occurrences in the entry
46
+ uint32_t wp_entry_size(wp_entry* entry);
47
+
48
+ // public: add an individual token
49
+ wp_error* wp_entry_add_token(wp_entry* entry, const char* field, const char* term) RAISES_ERROR;
50
+
51
+ // public: add a string, which will be tokenized at spaces only
52
+ wp_error* wp_entry_add_string(wp_entry* entry, const char* field, const char* string) RAISES_ERROR;
53
+
54
+ // public: add a file, which will be tokenized at spaces only
55
+ wp_error* wp_entry_add_file(wp_entry* entry, const char* field, FILE* f) RAISES_ERROR;
56
+
57
+ // public: free an entry.
58
+ wp_error* wp_entry_free(wp_entry* entry) RAISES_ERROR;
59
+
60
+ // private: write to a segment
61
+ wp_error* wp_entry_write_to_segment(wp_entry* entry, struct wp_segment* seg, docid_t doc_id) RAISES_ERROR;
62
+
63
+ // private: calculate the size needed for a postings region
64
+ wp_error* wp_entry_sizeof_postings_region(wp_entry* entry, struct wp_segment* seg, uint32_t* size) RAISES_ERROR;
65
+
66
+ #endif
@@ -0,0 +1,24 @@
1
+ #include <stdlib.h>
2
+ #include "error.h"
3
+
4
+ wp_error* wp_error_new(const char* msg, const char* src) {
5
+ wp_error* ret = malloc(sizeof(wp_error));
6
+ ret->msg = msg;
7
+ ret->size = 1;
8
+ ret->srcs = malloc(sizeof(const char*));
9
+ ret->srcs[0] = src;
10
+
11
+ return ret;
12
+ }
13
+
14
+ wp_error* wp_error_chain(wp_error* e, const char* src) {
15
+ e->size++;
16
+ e->srcs = realloc(e->srcs, sizeof(const char*) * e->size);
17
+ e->srcs[e->size - 1] = src;
18
+ return e;
19
+ }
20
+
21
+ void wp_error_free(wp_error* e) {
22
+ free(e->srcs);
23
+ free(e);
24
+ }
@@ -0,0 +1,94 @@
1
+ #ifndef WP_ERROR_H_
2
+ #define WP_ERROR_H_
3
+
4
+ // whistlepig errors
5
+ // (c) 2011 William Morgan. See COPYING for license terms.
6
+ //
7
+ // a pseudo-backtrace calling convention that whistlepig uses extensively to
8
+ // systematically detect, relay and report errors. no fancy longjmp magic; just
9
+ // macros around return statements, basically.
10
+ //
11
+ // to write a new function that fits in this system:
12
+ //
13
+ // 1. have your function return a wp_error*.
14
+ // 2. mark your function as RAISES_ERROR in the declaration (or use
15
+ // RAISING_STATIC for static functions that don't need a separate
16
+ // declaration).
17
+ // 3. within the function, use RAISE_ERROR or RAISE_SYSERROR to raise a new
18
+ // error and return.
19
+ // 4. within the function, use RELAY_ERROR to wrap all calls to functions that
20
+ // return wp_error*.
21
+ // 5. return NO_ERROR if nothing happened.
22
+
23
+ #include <errno.h>
24
+ #include <stdlib.h>
25
+ #include <stdio.h>
26
+ #include <string.h>
27
+
28
+ // pseudo-backtrace
29
+ typedef struct wp_error {
30
+ unsigned int size;
31
+ const char* msg;
32
+ const char** srcs;
33
+ } wp_error;
34
+
35
+ // for functions
36
+ #define RAISES_ERROR __attribute__ ((warn_unused_result))
37
+ #define RAISING_STATIC(f) static wp_error* f RAISES_ERROR; static wp_error* f
38
+
39
+ // API methods
40
+
41
+ // private: make a new error object with a message and source line
42
+ wp_error* wp_error_new(const char* msg, const char* src) RAISES_ERROR;
43
+ // private: add a source line to a pre-existing error
44
+ wp_error* wp_error_chain(wp_error* e, const char* src) RAISES_ERROR;
45
+
46
+ // public: free an error, once handled
47
+ void wp_error_free(wp_error* e);
48
+
49
// public: raise an error with a printf-style message
//
// returns from the enclosing function with a new wp_error. the message is the
// formatted text, truncated to fit a 1024-byte buffer; the first source entry
// names the enclosing function, file and line. the macro's own locals carry a
// wp_raise_ prefix so that caller variables such as `msg` or `src` can be
// used as format arguments without being shadowed.
#define RAISE_ERROR(fmt, ...) do { \
  char* wp_raise_msg_ = malloc(1024); \
  char* wp_raise_src_ = malloc(1024); \
  snprintf(wp_raise_msg_, 1024, fmt, ## __VA_ARGS__); \
  snprintf(wp_raise_src_, 1024, "%s (%s:%d)", __PRETTY_FUNCTION__, __FILE__, __LINE__); \
  return wp_error_new(wp_raise_msg_, wp_raise_src_); \
} while(0)
57
+
58
+ // public: raise an error with a printf-style message and have strerror() automatically
59
+ // appended
60
+ #define RAISE_SYSERROR(fmt, ...) RAISE_ERROR(fmt ": %s", ## __VA_ARGS__, strerror(errno))
61
+
62
// public: wrap a call that returns wp_error*. if the call produced an error,
// append the current function, file and line to it and return it from the
// enclosing function; otherwise fall through.
#define RELAY_ERROR(e) do { \
  wp_error* __e = e; \
  if(__e != NULL) { \
    char* frame = malloc(1024); \
    snprintf(frame, 1024, "%s (%s:%d)", __PRETTY_FUNCTION__, __FILE__, __LINE__); \
    return wp_error_chain(__e, frame); \
  } \
} while(0)
71
+
72
// public: print an error to stream: the message on the first line, then one
// line per recorded source entry. the error argument is evaluated exactly
// once, so it is safe to pass an expression such as a function call.
#define PRINT_ERROR(e, stream) do { \
  wp_error* __e = (e); \
  fprintf(stream, "Error: %s\n", __e->msg); \
  for(unsigned int i = 0; i < __e->size; i++) fprintf(stream, " at %s\n", __e->srcs[i]); \
} while(0)
78
+
79
// public: wrap a call that returns wp_error*. if the call produced an error,
// append the current function, file and line to it, print it to stderr and
// exit the process with status -1; otherwise fall through.
#define DIE_IF_ERROR(e) do { \
  wp_error* __e = e; \
  if(__e != NULL) { \
    char* frame = malloc(1024); \
    snprintf(frame, 1024, "%s (%s:%d)", __PRETTY_FUNCTION__, __FILE__, __LINE__); \
    wp_error* chained = wp_error_chain(__e, frame); \
    PRINT_ERROR(chained, stderr); \
    exit(-1); \
  } \
} while(0)
90
+
91
+ // return me if no error happens
92
+ #define NO_ERROR NULL
93
+
94
+ #endif
@@ -0,0 +1,6 @@
1
+ require 'mkmf'
2
+
3
+ $CFLAGS = "-g -O3 -std=c99 $(cflags)"
4
+
5
+ create_header
6
+ create_makefile "whistlepigc"