whistlepig 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README ADDED
@@ -0,0 +1,86 @@
1
+ = Whistlepig
2
+
3
+ Whistlepig is a minimalist realtime full-text search index. Its goal is to be
4
+ as small and feature-free as possible, while still remaining useful, performant
5
+ and scalable to large corpora. If you want realtime full-text search without
6
+ the frills, Whistlepig may be for you.
7
+
8
+ Whistlepig is written in ANSI C99. It currently provides a C API and Ruby
9
+ bindings.
10
+
11
+ Latest version: 0.1, released 2010-02-08.
12
+ Status: alpha
13
+ News: http://all-thing.net/label/whistlepig/
14
+ Homepage: http://masanjin.net/whistlepig/
15
+
16
+ = Getting it
17
+
18
+ Tarball: http://masanjin.net/whistlepig/whistlepig-0.1.tar.gz
19
+ Rubygem: gem install whistlepig
20
+ Git: git clone git://github.com/wmorgan/whistlepig.git
21
+
22
+ = Realtime search
23
+
24
+ Roughly speaking, realtime search means:
25
+ - documents are available to queries immediately after indexing, without
26
+ any further index merging steps; and
27
+ - later documents are more important than earlier documents.
28
+
29
+ Whistlepig takes these principles to an extreme. In particular:
30
+ - It only returns documents in the reverse of the order in which they were added
31
+ (i.e. LIFO order), and performs no ranking, reordering, or scoring.
32
+ - It only supports incremental indexing. There is no notion of batch indexing
33
+ or index merging.
34
+ - It does not support document deletion or modification (except in the
35
+ special case of labels; see below).
36
+ - It only supports in-memory indexes.
37
+
38
+ Features that Whistlepig does provide:
39
+ - Incremental indexing. Updates to the index are immediately available to
40
+ readers.
41
+ - Fielded terms with arbitrary fields.
42
+ - A full query language and parser with conjunctions, disjunctions, phrases,
43
+ negations, grouping, and nesting.
44
+ - Labels: arbitrary tokens which can be added to and removed from documents
45
+ at any point, and incorporated into search queries. (This is the only
46
+ mutable aspect of a document once it has been indexed.)
47
+ - Early query termination.
48
+ - Resumable queries.
49
+ - A tiny, < 3 KLOC ANSI C99 implementation.
50
+
51
+ == Synopsis (using Ruby bindings)
52
+
53
+ require 'rubygems'
54
+ require 'whistlepig'
55
+
56
+ include Whistlepig
57
+
58
+ index = Index.new "index"
59
+
60
+ entry1 = Entry.new
61
+ entry1.add_string "body", "hello there bob"
62
+ docid1 = index.add_entry entry1 # => 1
63
+
64
+ entry2 = Entry.new
65
+ entry2.add_string "body", "goodbye bob"
66
+ docid2 = index.add_entry entry2 # => 2
67
+
68
+ q1 = Query.new "body", "bob"
69
+ results1 = index.search q1 # => [2, 1]
70
+
71
+ q2 = q1.and Query.new("body", "hello")
72
+ results2 = index.search q2 # => [1]
73
+
74
+ index.add_label docid2, "funny"
75
+
76
+ q3 = Query.new "body", "bob ~funny"
77
+ results3 = index.search q3 # => [2]
78
+
79
+ == A note on concurrency:
80
+
81
+ Whistlepig is currently single-process and single-thread only. However, it is
82
+ built with multi-process access in mind. Per-segment single-writer,
83
+ multi-reader support is planned in the near future. Multi-writer support can be
84
+ accomplished via index striping and is planned for the distant future.
85
+
86
+ Please send bug reports and comments to: wmorgan-whistlepig-design@masanjin.net.
@@ -0,0 +1,28 @@
1
+ #ifndef WP_DEFAULTS_H_
2
+ #define WP_DEFAULTS_H_
3
+
4
+ // whistlepig defaults
5
+ // (c) 2011 William Morgan. See COPYING for license terms.
6
+ //
7
+ // just some generic definitions that we use in many places.
8
+
9
+ #include <stdint.h>
10
+
11
+ // these two types are segment-specific. an index as a whole uses a larger
12
+ // datatype for docids, and doesn't do anything with positions. but we
13
+ // refer to them all over the place, so it's convenient to break them out
14
+ // here rather than in segment.h.
15
+ typedef uint32_t docid_t;
16
+ typedef uint32_t pos_t; // position of a term within a document
17
+
18
+ // if you define DEBUGOUTPUT, every DEBUG(fmt, ...) statement prints one line to
19
+ // stdout, tagged with the file, line and function it appears in. otherwise DEBUG expands to an empty statement and its arguments are not evaluated. note that this header includes only <stdint.h>, not <stdio.h>.
20
+ #ifdef DEBUGOUTPUT
21
+ #define DEBUG(fmt, ...) do { \
22
+ fprintf(stdout, "DEBUG %s:%d (%s): " fmt "\n", __FILE__, __LINE__, __PRETTY_FUNCTION__, ## __VA_ARGS__); \
23
+ } while(0)
24
+ #else
25
+ #define DEBUG(fmt, ...) do { } while(0)
26
+ #endif
27
+
28
+ #endif
@@ -0,0 +1,181 @@
1
+ #include "whistlepig.h"
2
+ #include "tokenizer.lex.h"
3
+
4
+ static posarray* posarray_new(pos_t i) {
5
+ posarray* ret = malloc(sizeof(posarray));
6
+ ret->data = malloc(sizeof(pos_t));
7
+ ret->data[0] = i;
8
+ ret->size = ret->next = 1;
9
+ return ret;
10
+ }
11
+
12
+ static void posarray_free(posarray* p) {
13
+ free(p->data);
14
+ free(p);
15
+ }
16
+
17
+ static void posarray_add(posarray* p, pos_t a) {
18
+ while(p->next >= p->size) {
19
+ p->size *= 2;
20
+ p->data = realloc(p->data, p->size * sizeof(pos_t));
21
+ }
22
+ p->data[p->next++] = a;
23
+ }
24
+
25
+ static inline pos_t posarray_get(posarray* p, int i) { return p->data[i]; }
26
+
27
+ static inline khint_t khash_hash_string(const char *s) {
28
+ khint_t h = *s;
29
+ if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s;
30
+ return h;
31
+ }
32
+
33
+ inline khint_t fielded_term_hash(fielded_term ft) {
34
+ return khash_hash_string(ft.field) ^ khash_hash_string(ft.term);
35
+ }
36
+
37
+ inline khint_t fielded_term_equals(fielded_term a, fielded_term b) {
38
+ return (strcmp(a.field, b.field) == 0) && (strcmp(a.term, b.term) == 0);
39
+ }
40
+
41
+ wp_entry* wp_entry_new() {
42
+ wp_entry* ret = malloc(sizeof(wp_entry));
43
+ ret->entries = kh_init(entries);
44
+ ret->next_offset = 0;
45
+
46
+ return ret;
47
+ }
48
+
49
+ RAISING_STATIC(add_token(wp_entry* entry, const char* field, const char* term, int field_len, int term_len)) {
50
+ fielded_term ft;
51
+ int status;
52
+
53
+ // copy field and term
54
+ ft.field = calloc(field_len + 1, sizeof(char));
55
+ strncpy(ft.field, field, field_len);
56
+
57
+ ft.term = calloc(term_len + 1, sizeof(char));
58
+ strncpy(ft.term, term, term_len);
59
+
60
+ khiter_t k = kh_put(entries, entry->entries, ft, &status);
61
+ if(status == 1) { // not found
62
+ kh_value(entry->entries, k) = posarray_new(entry->next_offset);
63
+ }
64
+ else { // just add the next offset to the array
65
+ posarray_add(kh_value(entry->entries, k), entry->next_offset);
66
+
67
+ // don't need these guys any more
68
+ free(ft.field);
69
+ free(ft.term);
70
+ }
71
+
72
+ entry->next_offset++;
73
+
74
+ return NO_ERROR;
75
+ }
76
+
77
+ uint32_t wp_entry_size(wp_entry* entry) {
78
+ uint32_t ret = 0;
79
+
80
+ for(khiter_t i = kh_begin(entry->entries); i < kh_end(entry->entries); i++) {
81
+ if(kh_exist(entry->entries, i)) {
82
+ posarray* positions = kh_val(entry->entries, i);
83
+ ret += positions->next;
84
+ }
85
+ }
86
+
87
+ return ret;
88
+ }
89
+
90
+ RAISING_STATIC(add_from_lexer(wp_entry* entry, yyscan_t* scanner, const char* field)) {
91
+ int token_type;
92
+ int field_len = strlen(field);
93
+
94
+ do {
95
+ token_type = yylex(*scanner);
96
+ if(token_type == TOK_WORD) {
97
+ RELAY_ERROR(add_token(entry, field, yyget_text(*scanner), field_len, yyget_leng(*scanner)));
98
+ }
99
+ } while(token_type != TOK_DONE);
100
+
101
+ return NO_ERROR;
102
+ }
103
+
104
+ wp_error* wp_entry_add_token(wp_entry* entry, const char* field, const char* term) {
105
+ RELAY_ERROR(add_token(entry, field, term, strlen(field), strlen(term)));
106
+
107
+ return NO_ERROR;
108
+ }
109
+
110
+ // tokenizes and adds everything under a single field
111
+ wp_error* wp_entry_add_string(wp_entry* entry, const char* field, const char* string) {
112
+ yyscan_t scanner;
113
+ lexinfo charpos = {0, 0};
114
+
115
+ yylex_init_extra(&charpos, &scanner);
116
+ YY_BUFFER_STATE state = yy_scan_string(string, scanner);
117
+ RELAY_ERROR(add_from_lexer(entry, &scanner, field));
118
+ yy_delete_buffer(state, scanner);
119
+ yylex_destroy(scanner);
120
+
121
+ return NO_ERROR;
122
+ }
123
+
124
+ // tokenizes and adds everything from a file under a single field
125
+ wp_error* wp_entry_add_file(wp_entry* entry, const char* field, FILE* f) {
126
+ yyscan_t scanner;
127
+ lexinfo charpos = {0, 0};
128
+
129
+ yylex_init_extra(&charpos, &scanner);
130
+ yyset_in(f, scanner);
131
+ RELAY_ERROR(add_from_lexer(entry, &scanner, field));
132
+ yylex_destroy(scanner);
133
+
134
+ return NO_ERROR;
135
+ }
136
+
137
+ wp_error* wp_entry_write_to_segment(wp_entry* entry, wp_segment* seg, docid_t doc_id) {
138
+ for(khiter_t i = kh_begin(entry->entries); i < kh_end(entry->entries); i++) {
139
+ if(kh_exist(entry->entries, i)) {
140
+ fielded_term ft = kh_key(entry->entries, i);
141
+ posarray* positions = kh_val(entry->entries, i);
142
+ RELAY_ERROR(wp_segment_add_posting(seg, ft.field, ft.term, doc_id, positions->next, positions->data));
143
+ }
144
+ }
145
+
146
+ return NO_ERROR;
147
+ }
148
+
149
+ // currently this is a crazy overestimate (it's calculating the size without
150
+ // VBE) but that's fine. as long as we're not an underestimate, we should be ok.
151
+ wp_error* wp_entry_sizeof_postings_region(wp_entry* entry, wp_segment* seg, uint32_t* size) {
152
+ *size = 0;
153
+ for(khiter_t i = kh_begin(entry->entries); i < kh_end(entry->entries); i++) {
154
+ if(kh_exist(entry->entries, i)) {
155
+ posarray* positions = kh_val(entry->entries, i);
156
+
157
+ uint32_t this_size;
158
+ RELAY_ERROR(wp_segment_sizeof_posarray(seg, positions->next, positions->data, &this_size));
159
+ *size += this_size;
160
+ }
161
+ }
162
+
163
+ return NO_ERROR;
164
+ }
165
+
166
+ wp_error* wp_entry_free(wp_entry* entry) {
167
+ for(khiter_t k = kh_begin(entry->entries); k < kh_end(entry->entries); k++) {
168
+ if(kh_exist(entry->entries, k)) {
169
+ fielded_term ft = kh_key(entry->entries, k);
170
+ posarray* positions = kh_val(entry->entries, k);
171
+ free(ft.term);
172
+ free(ft.field);
173
+ posarray_free(positions);
174
+ }
175
+ }
176
+
177
+ kh_destroy(entries, entry->entries);
178
+ free(entry);
179
+
180
+ return NO_ERROR;
181
+ }
@@ -0,0 +1,66 @@
1
+ #ifndef WP_ENTRY_H_
2
+ #define WP_ENTRY_H_
3
+
4
+ // whistlepig entry
5
+ // (c) 2011 William Morgan. See COPYING for license terms.
6
+ //
7
+ // an entry is a document before being added to the index. it's nothing more
8
+ // than a map of (field,term) pairs to a (sorted) list of positions. you can
9
+ // use this to incrementally build up a document in memory before adding it to
10
+ // the index.
11
+
12
+ #include "defaults.h"
13
+ #include "error.h"
14
+ #include "segment.h"
15
+ #include "khash.h"
16
+
17
+ typedef struct posarray { // growable array of the positions at which one (field, term) pair occurs
18
+ uint16_t size; // capacity of data, in elements
19
+ uint16_t next; // index of the next free slot in data, i.e. the number of positions stored
20
+ pos_t* data; // heap-allocated buffer holding the positions
21
+ } posarray;
22
+
23
+ typedef struct fielded_term { // key of the entries table: a (field, term) pair of NUL-terminated strings
24
+ char* field; // field name; for keys stored in an entry this is a heap copy owned by the table
25
+ char* term; // term text; for keys stored in an entry this is a heap copy owned by the table
26
+ } fielded_term;
27
+
28
+ khint_t fielded_term_hash(fielded_term ft);
29
+ khint_t fielded_term_equals(fielded_term a, fielded_term b);
30
+
31
+ KHASH_INIT(entries, fielded_term, posarray*, 1, fielded_term_hash, fielded_term_equals);
32
+
33
+ typedef struct wp_entry { // a document being built up in memory
34
+ khash_t(entries)* entries; // map from (field, term) to the posarray of its positions
35
+ pos_t next_offset; // position given to the next token added; starts at 0 and grows by one per token
36
+ } wp_entry;
37
+
38
+ struct wp_segment;
39
+
40
+ // API methods
41
+
42
+ // public: make a new entry
43
+ wp_entry* wp_entry_new();
44
+
45
+ // public: return the number of token occurrences in the entry
46
+ uint32_t wp_entry_size(wp_entry* entry);
47
+
48
+ // public: add an individual token
49
+ wp_error* wp_entry_add_token(wp_entry* entry, const char* field, const char* term) RAISES_ERROR;
50
+
51
+ // public: add a string, which will be tokenized at spaces only
52
+ wp_error* wp_entry_add_string(wp_entry* entry, const char* field, const char* string) RAISES_ERROR;
53
+
54
+ // public: add a file, which will be tokenized at spaces only
55
+ wp_error* wp_entry_add_file(wp_entry* entry, const char* field, FILE* f) RAISES_ERROR;
56
+
57
+ // public: free an entry.
58
+ wp_error* wp_entry_free(wp_entry* entry) RAISES_ERROR;
59
+
60
+ // private: write to a segment
61
+ wp_error* wp_entry_write_to_segment(wp_entry* entry, struct wp_segment* seg, docid_t doc_id) RAISES_ERROR;
62
+
63
+ // private: calculate the size needed for a postings region
64
+ wp_error* wp_entry_sizeof_postings_region(wp_entry* entry, struct wp_segment* seg, uint32_t* size) RAISES_ERROR;
65
+
66
+ #endif
@@ -0,0 +1,24 @@
1
+ #include <stdlib.h>
2
+ #include "error.h"
3
+
4
+ wp_error* wp_error_new(const char* msg, const char* src) {
5
+ wp_error* ret = malloc(sizeof(wp_error));
6
+ ret->msg = msg;
7
+ ret->size = 1;
8
+ ret->srcs = malloc(sizeof(const char*));
9
+ ret->srcs[0] = src;
10
+
11
+ return ret;
12
+ }
13
+
14
+ wp_error* wp_error_chain(wp_error* e, const char* src) {
15
+ e->size++;
16
+ e->srcs = realloc(e->srcs, sizeof(const char*) * e->size);
17
+ e->srcs[e->size - 1] = src;
18
+ return e;
19
+ }
20
+
21
+ void wp_error_free(wp_error* e) {
22
+ free(e->srcs);
23
+ free(e);
24
+ }
@@ -0,0 +1,94 @@
1
+ #ifndef WP_ERROR_H_
2
+ #define WP_ERROR_H_
3
+
4
+ // whistlepig errors
5
+ // (c) 2011 William Morgan. See COPYING for license terms.
6
+ //
7
+ // a pseudo-backtrace calling convention that whistlepig uses extensively to
8
+ // systematically detect, relay and report errors. no fancy longjmp magic; just
9
+ // macros around return statements, basically.
10
+ //
11
+ // to write a new function that fits in this system:
12
+ //
13
+ // 1. have your function return a wp_error*.
14
+ // 2. mark your function as RAISES_ERROR in the declaration (or use
15
+ // RAISING_STATIC for static functions that don't need a separate
16
+ // declaration).
17
+ // 3. within the function, use RAISE_ERROR or RAISE_SYSERROR to raise a new
18
+ // error and return.
19
+ // 4. within the function, use RELAY_ERROR to wrap all calls to functions that
20
+ // return wp_error*.
21
+ // 5. return NO_ERROR if nothing happened.
22
+
23
+ #include <errno.h>
24
+ #include <stdlib.h>
25
+ #include <stdio.h>
26
+ #include <string.h>
27
+
28
+ // pseudo-backtrace: an error message plus the list of source lines it passed through
29
+ typedef struct wp_error {
30
+ unsigned int size; // number of entries in srcs
31
+ const char* msg; // the error message
32
+ const char** srcs; // source lines; srcs[0] is where the error was raised, later entries are the callers that relayed it
33
+ } wp_error;
34
+
35
+ // for functions: RAISES_ERROR makes the compiler warn when a caller ignores the returned value; RAISING_STATIC(f) first declares the static function f, returning wp_error*, with that attribute and then begins its definition.
36
+ #define RAISES_ERROR __attribute__ ((warn_unused_result))
37
+ #define RAISING_STATIC(f) static wp_error* f RAISES_ERROR; static wp_error* f
38
+
39
+ // API methods
40
+
41
+ // private: make a new error object with a message and source line
42
+ wp_error* wp_error_new(const char* msg, const char* src) RAISES_ERROR;
43
+ // private: add a source line to a pre-existing error
44
+ wp_error* wp_error_chain(wp_error* e, const char* src) RAISES_ERROR;
45
+
46
+ // public: free an error, once handled
47
+ void wp_error_free(wp_error* e);
48
+
49
// public: raise an error with a printf-style message. formats the message
// and a "function (file:line)" source line into two freshly allocated
// 1024-byte buffers (longer text is truncated), wraps them in a new wp_error
// and returns it from the enclosing function.
//
// the buffer names carry a wp_raise_ prefix so that message arguments named
// `msg` or `src` at the call site refer to the caller's variables rather
// than to these buffers.
//
// if a buffer cannot be allocated, a fixed placeholder literal is used in
// its place. this is safe because wp_error_free does not free the message or
// the source-line strings.
#define RAISE_ERROR(fmt, ...) do { \
  char* wp_raise_msg_ = malloc(1024); \
  char* wp_raise_src_ = malloc(1024); \
  if(wp_raise_msg_ != NULL) snprintf(wp_raise_msg_, 1024, fmt, ## __VA_ARGS__); \
  if(wp_raise_src_ != NULL) snprintf(wp_raise_src_, 1024, "%s (%s:%d)", __PRETTY_FUNCTION__, __FILE__, __LINE__); \
  return wp_error_new(wp_raise_msg_ != NULL ? wp_raise_msg_ : "(out of memory formatting error message)", \
                      wp_raise_src_ != NULL ? wp_raise_src_ : "(out of memory formatting source line)"); \
} while(0)
57
+
58
// public: raise an error with a printf-style message and have strerror()
// automatically appended. errno is saved before anything else runs, because
// the allocations done while building the error may change it.
#define RAISE_SYSERROR(fmt, ...) do { \
  int wp_saved_errno_ = errno; \
  RAISE_ERROR(fmt ": %s", ## __VA_ARGS__, strerror(wp_saved_errno_)); \
} while(0)
61
+
62
// public: relay an error up the stack if the called function returns one.
// `e` is evaluated exactly once. when it is non-NULL, a "function
// (file:line)" source line for the current location is appended to the
// error's pseudo-backtrace and the error is returned from the enclosing
// function; when it is NULL, execution simply continues.
//
// if the source-line buffer cannot be allocated, a fixed placeholder literal
// is recorded instead. this is safe because wp_error_free does not free the
// source-line strings.
#define RELAY_ERROR(e) do { \
  wp_error* __e = (e); \
  if(__e != NULL) { \
    char* src = malloc(1024); \
    if(src != NULL) snprintf(src, 1024, "%s (%s:%d)", __PRETTY_FUNCTION__, __FILE__, __LINE__); \
    return wp_error_chain(__e, src != NULL ? src : "(out of memory formatting source line)"); \
  } \
} while(0)
71
+
72
// public: print an error to stream: first the message, then one " at ..."
// line per entry of the pseudo-backtrace, in the order they were recorded.
// both `e` and `stream` are evaluated exactly once, so it is safe to pass
// expressions with side effects (such as a function call) for either.
#define PRINT_ERROR(e, stream) do { \
  wp_error* __e = (e); \
  FILE* __stream = (stream); \
  fprintf(__stream, "Error: %s\n", __e->msg); \
  for(unsigned int i = 0; i < __e->size; i++) fprintf(__stream, " at %s\n", __e->srcs[i]); \
} while(0)
78
+
79
// public: print and exit if an error exists. `e` is evaluated exactly once.
// when it is non-NULL, the current location is appended to the error's
// pseudo-backtrace, the error is printed to stderr and the process exits
// with status -1; when it is NULL, execution simply continues.
//
// if the source-line buffer cannot be allocated, a fixed placeholder literal
// is recorded instead.
#define DIE_IF_ERROR(e) do { \
  wp_error* __e = (e); \
  if(__e != NULL) { \
    char* src = malloc(1024); \
    if(src != NULL) snprintf(src, 1024, "%s (%s:%d)", __PRETTY_FUNCTION__, __FILE__, __LINE__); \
    wp_error* err = wp_error_chain(__e, src != NULL ? src : "(out of memory formatting source line)"); \
    PRINT_ERROR(err, stderr); \
    exit(-1); \
  } \
} while(0)
90
+
91
+ // return this from a wp_error*-returning function when no error happened. it is NULL, which is what RELAY_ERROR and DIE_IF_ERROR test for.
92
+ #define NO_ERROR NULL
93
+
94
+ #endif
@@ -0,0 +1,6 @@
1
+ require 'mkmf'
2
+
3
+ $CFLAGS = "-g -O3 -std=c99 $(cflags)"
4
+
5
+ create_header
6
+ create_makefile "whistlepigc"