RubyGems - whistlepig - Versions diffs - 0.1 - Mend

whistlepig 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

data/README +86 -0
data/ext/whistlepig/defaults.h +28 -0
data/ext/whistlepig/entry.c +181 -0
data/ext/whistlepig/entry.h +66 -0
data/ext/whistlepig/error.c +24 -0
data/ext/whistlepig/error.h +94 -0
data/ext/whistlepig/extconf.rb +6 -0
data/ext/whistlepig/index.c +294 -0
data/ext/whistlepig/index.h +88 -0
data/ext/whistlepig/khash.h +316 -0
data/ext/whistlepig/mmap-obj.c +76 -0
data/ext/whistlepig/mmap-obj.h +52 -0
data/ext/whistlepig/query-parser.c +37 -0
data/ext/whistlepig/query-parser.h +25 -0
data/ext/whistlepig/query-parser.lex.c +2249 -0
data/ext/whistlepig/query-parser.lex.h +359 -0
data/ext/whistlepig/query-parser.tab.c +1757 -0
data/ext/whistlepig/query-parser.tab.h +85 -0
data/ext/whistlepig/query.c +194 -0
data/ext/whistlepig/query.h +78 -0
data/ext/whistlepig/search.c +746 -0
data/ext/whistlepig/search.h +76 -0
data/ext/whistlepig/segment.c +615 -0
data/ext/whistlepig/segment.h +137 -0
data/ext/whistlepig/stringmap.c +278 -0
data/ext/whistlepig/stringmap.h +82 -0
data/ext/whistlepig/stringpool.c +44 -0
data/ext/whistlepig/stringpool.h +58 -0
data/ext/whistlepig/termhash.c +294 -0
data/ext/whistlepig/termhash.h +79 -0
data/ext/whistlepig/tokenizer.lex.c +2263 -0
data/ext/whistlepig/tokenizer.lex.h +360 -0
data/ext/whistlepig/whistlepig.h +15 -0
data/ext/whistlepig/whistlepigc.c +537 -0
data/lib/whistlepig.rb +119 -0
metadata +103 -0

data/README ADDED Viewed

@@ -0,0 +1,86 @@
+= Whistlepig
+Whistlepig is a minimalist realtime full-text search index. Its goal is to be
+as small and feature-free as possible, while still remaining useful, performant
+and scalable to large corpora. If you want realtime full-text search without
+the frills, Whistlepig may be for you.
+Whistlepig is written in ANSI C99. It currently provides a C API and Ruby
+bindings.
+Latest version: 0.1, released 2010-02-08.
+        Status: alpha
+          News: http://all-thing.net/label/whistlepig/
+      Homepage: http://masanjin.net/whistlepig/
+= Getting it
+       Tarball:  http://masanjin.net/whistlepig/whistlepig-0.1.tar.gz
+       Rubygem:  gem install whistlepig
+           Git:  git clone git://github.com/wmorgan/whistlepig.git
+= Realtime search
+Roughly speaking, realtime search means:
+- documents are available to queries immediately after indexing, without
+  any further index merging steps; and
+- later documents are more important than earlier documents.
+Whistlepig takes these principles to an extreme. In particular:
+- It only returns documents in the reverse order to which they were added
+  (i.e. LIFO order), and performs no ranking, reordering, or scoring.
+- It only supports incremental indexing. There is no notion of batch indexing
+  or index merging.
+- It does not support document deletion or modification (except in the
+  special case of labels; see below).
+- It only supports in-memory indexes.
+Features that Whistlepig does provide:
+- Incremental indexing. Updates to the index are immediately available to
+  readers.
+- Fielded terms with arbitrary fields.
+- A full query language and parser with conjunctions, disjunctions, phrases,
+  negations, grouping, and nesting.
+- Labels: arbitrary tokens which can be added to and removed from documents
+  at any point, and incorporated into search queries. (This is the only
+  mutable aspect of a document once it has been indexed.)
+- Early query termination.
+- Resumable queries.
+- A tiny, < 3 KLOC ANSI C99 implementation.
+== Synopsis (using Ruby bindings)
+  require 'rubygems'
+  require 'whistlepig'
+  include Whistlepig
+  index = Index.new "index"
+  entry1 = Entry.new
+  entry1.add_string "body", "hello there bob"
+  docid1 = index.add_entry entry1              # => 1
+  entry2 = Entry.new
+  entry2.add_string "body", "goodbye bob"
+  docid2 = index.add_entry entry2              # => 2
+  q1 = Query.new "body", "bob"
+  results1 = index.search q1                   # => [2, 1]
+  q2 = q1.and Query.new("body", "hello")
+  results2 = index.search q2                   # => [1]
+  index.add_label docid2, "funny"
+  q3 = Query.new "body", "bob ~funny"
+  results3 = index.search q3                   # => [2]
+== A note on concurrency:
+Whistlepig is currently single-process and single-thread only. However, it is
+built with multi-process access in mind. Per-segment single-writer,
+multi-reader support is planned in the near future. Multi-writer support can be
+accomplished via index striping and is planned for the distant future.
+Please send bug reports and comments to: wmorgan-whistlepig-design@masanjin.net.

data/ext/whistlepig/defaults.h ADDED Viewed

@@ -0,0 +1,28 @@
+#ifndef WP_DEFAULTS_H_
+#define WP_DEFAULTS_H_
+// whistlepig defaults
+// (c) 2011 William Morgan. See COPYING for license terms.
+//
+// just some generic definitions that we use in many places.
+#include <stdint.h>
+// DEBUG expands to fprintf(stdout, ...), so pull in stdio here rather than
+// relying on every includer having done it first.
+#include <stdio.h>
+// these two types are segment-specific. an index as a whole uses a larger
+// datatype for docids, and doesn't do anything with positions. but we
+// refer to them all over the place, so it's convenient to break them out
+// here rather than in segment.h.
+typedef uint32_t docid_t;
+typedef uint32_t pos_t; // position of a term within a document
+// if you define DEBUGOUTPUT, all the DEBUG statements will magically start
+// printing stuff out...
+// each message goes to stdout, prefixed with the file, line and function it
+// came from. without DEBUGOUTPUT, DEBUG compiles to an empty statement and
+// its arguments are never evaluated.
+#ifdef DEBUGOUTPUT
+#define DEBUG(fmt, ...) do { \
+  fprintf(stdout, "DEBUG %s:%d (%s): " fmt "\n", __FILE__, __LINE__, __PRETTY_FUNCTION__, ## __VA_ARGS__); \
+} while(0)
+#else
+#define DEBUG(fmt, ...) do { } while(0)
+#endif
+#endif

data/ext/whistlepig/entry.c ADDED Viewed

@@ -0,0 +1,181 @@
+#include "whistlepig.h"
+#include "tokenizer.lex.h"
+// allocates a position array that holds the single position i and has room
+// for exactly that one element. release the result with posarray_free().
+static posarray* posarray_new(pos_t i) {
+  posarray* arr = malloc(sizeof(*arr));
+  arr->size = 1;
+  arr->next = 1;
+  arr->data = malloc(sizeof(*arr->data));
+  arr->data[0] = i;
+  return arr;
+}
+// releases a position array: the element buffer first, then the struct
+// that pointed to it.
+static void posarray_free(posarray* p) {
+  pos_t* buffer = p->data;
+  free(buffer);
+  free(p);
+}
+// appends position a to p, doubling the buffer when it is full.
+//
+// posarray keeps its capacity and fill count in uint16_t fields, so the
+// capacity is clamped to UINT16_MAX elements. if the array is already at that
+// limit, or the buffer cannot be grown, the position is dropped and p is left
+// untouched (this function has no way to report an error).
+static void posarray_add(posarray* p, pos_t a) {
+  if(p->next >= p->size) {
+    // do the doubling in a wider type: in the uint16_t field itself,
+    // 32768 * 2 wraps to 0, which would shrink the buffer to nothing and
+    // never satisfy the "room for one more" condition.
+    uint32_t new_size = (uint32_t)p->size * 2;
+    if(new_size == 0) new_size = 1;
+    if(new_size > UINT16_MAX) new_size = UINT16_MAX;
+    if(new_size <= p->next) return; // already holding UINT16_MAX positions
+
+    // grow through a temporary so the old buffer survives a failed realloc
+    pos_t* grown = realloc(p->data, new_size * sizeof(pos_t));
+    if(grown == NULL) return;
+    p->data = grown;
+    p->size = (uint16_t)new_size;
+  }
+  p->data[p->next++] = a;
+}
+// returns the i-th stored position. no bounds checking is performed.
+static inline pos_t posarray_get(posarray* p, int i) {
+  return *(p->data + i);
+}
+// string hash: starts from the first character and folds in each following
+// one as h = h * 31 + c (written as a shift and a subtract). the empty
+// string hashes to 0.
+static inline khint_t khash_hash_string(const char *s) {
+  khint_t hash = *s;
+  if(hash == 0) return hash;
+  while(*++s) {
+    hash = (hash << 5) - hash + *s;
+  }
+  return hash;
+}
+// hash of a (field, term) pair: the XOR of the hashes of its two strings.
+inline khint_t fielded_term_hash(fielded_term ft) {
+  khint_t field_hash = khash_hash_string(ft.field);
+  khint_t term_hash = khash_hash_string(ft.term);
+  return field_hash ^ term_hash;
+}
+// returns 1 when both the field and the term strings of a and b match,
+// 0 otherwise.
+inline khint_t fielded_term_equals(fielded_term a, fielded_term b) {
+  if(strcmp(a.field, b.field) != 0) return 0;
+  return strcmp(a.term, b.term) == 0;
+}
+// public: makes a new, empty entry whose first token will get position 0.
+// release it with wp_entry_free().
+wp_entry* wp_entry_new() {
+  wp_entry* fresh = malloc(sizeof(*fresh));
+  fresh->next_offset = 0;
+  fresh->entries = kh_init(entries);
+  return fresh;
+}
+// private: records one occurrence of (field, term) at the entry's next
+// position and then advances that position.
+//
+// field and term do not need to be NUL-terminated; field_len and term_len
+// give the number of bytes to copy from each. raises an error if the copies
+// cannot be allocated, in which case the entry is left unchanged.
+RAISING_STATIC(add_token(wp_entry* entry, const char* field, const char* term, int field_len, int term_len)) {
+  fielded_term ft;
+  int status;
+
+  // copy field and term. calloc zero-fills the extra byte, so the copies are
+  // always NUL-terminated even though strncpy copies at most *_len bytes.
+  ft.field = calloc(field_len + 1, sizeof(char));
+  ft.term = calloc(term_len + 1, sizeof(char));
+  if((ft.field == NULL) || (ft.term == NULL)) {
+    free(ft.field); // free(NULL) is a no-op, so this covers either failure
+    free(ft.term);
+    RAISE_ERROR("out of memory copying a token for field '%s'", field);
+  }
+  strncpy(ft.field, field, field_len);
+  strncpy(ft.term, term, term_len);
+
+  khiter_t k = kh_put(entries, entry->entries, ft, &status);
+  if(status == 1) { // not found: the table keeps ft's strings as the new key
+    kh_value(entry->entries, k) = posarray_new(entry->next_offset);
+  }
+  else { // just add the next offset to the array
+    posarray_add(kh_value(entry->entries, k), entry->next_offset);
+    // don't need these guys any more
+    free(ft.field);
+    free(ft.term);
+  }
+
+  entry->next_offset++;
+  return NO_ERROR;
+}
+// public: returns the number of token occurrences in the entry, i.e. the
+// position counts of all (field, term) pairs added together.
+uint32_t wp_entry_size(wp_entry* entry) {
+  khash_t(entries)* table = entry->entries;
+  uint32_t total = 0;
+  for(khiter_t slot = kh_begin(table); slot < kh_end(table); slot++) {
+    if(!kh_exist(table, slot)) continue; // unused bucket
+    total += kh_val(table, slot)->next;
+  }
+  return total;
+}
+// private: pulls tokens from the scanner until it reports TOK_DONE, adding
+// every TOK_WORD token to the entry under the given field. any other token
+// type is skipped.
+RAISING_STATIC(add_from_lexer(wp_entry* entry, yyscan_t* scanner, const char* field)) {
+  int field_len = strlen(field);
+
+  for(;;) {
+    int token_type = yylex(*scanner);
+    if(token_type == TOK_WORD) {
+      RELAY_ERROR(add_token(entry, field, yyget_text(*scanner), field_len, yyget_leng(*scanner)));
+    }
+    if(token_type == TOK_DONE) break;
+  }
+
+  return NO_ERROR;
+}
+// public: adds a single, already-tokenized term under the given field.
+// both strings must be NUL-terminated; they are copied, not retained.
+wp_error* wp_entry_add_token(wp_entry* entry, const char* field, const char* term) {
+  int field_len = strlen(field);
+  int term_len = strlen(term);
+  RELAY_ERROR(add_token(entry, field, term, field_len, term_len));
+  return NO_ERROR;
+}
+// tokenizes and adds everything under a single field
+//
+// raises an error if the tokenizer cannot be set up, and relays any error
+// from adding the tokens. the scanner and its buffer are released on every
+// path once they have been created.
+wp_error* wp_entry_add_string(wp_entry* entry, const char* field, const char* string) {
+  yyscan_t scanner;
+  lexinfo charpos = {0, 0};
+
+  if(yylex_init_extra(&charpos, &scanner) != 0) {
+    RAISE_SYSERROR("initializing tokenizer");
+  }
+  YY_BUFFER_STATE state = yy_scan_string(string, scanner);
+
+  // hold on to the result instead of relaying it right away: RELAY_ERROR
+  // returns on failure, which would skip the cleanup below and leak both the
+  // buffer and the scanner.
+  wp_error* e = add_from_lexer(entry, &scanner, field);
+  yy_delete_buffer(state, scanner);
+  yylex_destroy(scanner);
+  RELAY_ERROR(e);
+
+  return NO_ERROR;
+}
+// tokenizes and adds everything from a file under a single field
+//
+// f is read through the tokenizer but is not closed here. raises an error if
+// the tokenizer cannot be set up, and relays any error from adding the
+// tokens. the scanner is released on every path once it has been created.
+wp_error* wp_entry_add_file(wp_entry* entry, const char* field, FILE* f) {
+  yyscan_t scanner;
+  lexinfo charpos = {0, 0};
+
+  if(yylex_init_extra(&charpos, &scanner) != 0) {
+    RAISE_SYSERROR("initializing tokenizer");
+  }
+  yyset_in(f, scanner);
+
+  // hold on to the result instead of relaying it right away: RELAY_ERROR
+  // returns on failure, which would skip yylex_destroy and leak the scanner.
+  wp_error* e = add_from_lexer(entry, &scanner, field);
+  yylex_destroy(scanner);
+  RELAY_ERROR(e);
+
+  return NO_ERROR;
+}
+// private: adds one posting to seg for every (field, term) pair in the
+// entry, all under doc_id. stops at, and relays, the first error.
+wp_error* wp_entry_write_to_segment(wp_entry* entry, wp_segment* seg, docid_t doc_id) {
+  khash_t(entries)* table = entry->entries;
+  for(khiter_t slot = kh_begin(table); slot < kh_end(table); slot++) {
+    if(!kh_exist(table, slot)) continue; // unused bucket
+    fielded_term key = kh_key(table, slot);
+    posarray* positions = kh_val(table, slot);
+    RELAY_ERROR(wp_segment_add_posting(seg, key.field, key.term, doc_id, positions->next, positions->data));
+  }
+  return NO_ERROR;
+}
+// private: stores in *size the space needed to hold the entry's postings,
+// summing what the segment reports for each (field, term) position list.
+// this is currently a large overestimate (the size is calculated without
+// VBE), which is fine: it only must never be an underestimate.
+// *size is updated as the sum is built, so it holds a partial total if an
+// error is relayed.
+wp_error* wp_entry_sizeof_postings_region(wp_entry* entry, wp_segment* seg, uint32_t* size) {
+  khash_t(entries)* table = entry->entries;
+  *size = 0;
+  for(khiter_t slot = kh_begin(table); slot < kh_end(table); slot++) {
+    if(!kh_exist(table, slot)) continue; // unused bucket
+    posarray* positions = kh_val(table, slot);
+    uint32_t posting_size;
+    RELAY_ERROR(wp_segment_sizeof_posarray(seg, positions->next, positions->data, &posting_size));
+    *size += posting_size;
+  }
+  return NO_ERROR;
+}
+// public: frees an entry together with everything it owns: the copied field
+// and term strings of every key, every position array, the hash table and
+// the entry struct itself. always returns NO_ERROR.
+wp_error* wp_entry_free(wp_entry* entry) {
+  khash_t(entries)* table = entry->entries;
+  for(khiter_t slot = kh_begin(table); slot < kh_end(table); slot++) {
+    if(!kh_exist(table, slot)) continue; // unused bucket
+    fielded_term key = kh_key(table, slot);
+    free(key.term);
+    free(key.field);
+    posarray_free(kh_val(table, slot));
+  }
+  kh_destroy(entries, table);
+  free(entry);
+  return NO_ERROR;
+}

data/ext/whistlepig/entry.h ADDED Viewed

@@ -0,0 +1,66 @@
+#ifndef WP_ENTRY_H_
+#define WP_ENTRY_H_
+// whistlepig entry
+// (c) 2011 William Morgan. See COPYING for license terms.
+//
+// an entry is a document before being added to the index. it's nothing more
+// than a map of (field,term) pairs to a (sorted) list of positions.  you can
+// use this to incrementally build up a document in memory before adding it to
+// the index.
+#include "defaults.h"
+#include "error.h"
+#include "segment.h"
+#include "khash.h"
+// growable array of the positions at which one (field,term) pair occurs.
+// both counters are 16 bits wide, so one array can describe at most 65535
+// positions.
+typedef struct posarray {
+  uint16_t size; // number of elements data has room for
+  uint16_t next; // number of elements in use; also the index of the next free slot
+  pos_t* data;   // heap-allocated element buffer
+} posarray;
+// hash key: a term together with the field it occurs in. entry.c stores
+// heap-allocated copies of both strings in the table and frees them in
+// wp_entry_free.
+typedef struct fielded_term {
+  char* field;
+  char* term;
+} fielded_term;
+// hash and equality functions handed to KHASH_INIT below
+khint_t fielded_term_hash(fielded_term ft);
+khint_t fielded_term_equals(fielded_term a, fielded_term b);
+// defines the "entries" hash table type: fielded_term keys, posarray* values
+KHASH_INIT(entries, fielded_term, posarray*, 1, fielded_term_hash, fielded_term_equals);
+typedef struct wp_entry {
+  khash_t(entries)* entries; // (field,term) -> positions
+  pos_t next_offset;         // position that the next added token will receive
+} wp_entry;
+struct wp_segment;
+// API methods
+// public: make a new entry
+wp_entry* wp_entry_new();
+// public: return the number of token occurrences in the entry
+uint32_t wp_entry_size(wp_entry* entry);
+// public: add an individual token
+wp_error* wp_entry_add_token(wp_entry* entry, const char* field, const char* term) RAISES_ERROR;
+// public: add a string, which will be tokenized at spaces only
+wp_error* wp_entry_add_string(wp_entry* entry, const char* field, const char* string) RAISES_ERROR;
+// public: add a file, which will be tokenized at spaces only
+wp_error* wp_entry_add_file(wp_entry* entry, const char* field, FILE* f) RAISES_ERROR;
+// public: free an entry.
+wp_error* wp_entry_free(wp_entry* entry) RAISES_ERROR;
+// private: write to a segment
+wp_error* wp_entry_write_to_segment(wp_entry* entry, struct wp_segment* seg, docid_t doc_id) RAISES_ERROR;
+// private: calculate the size needed for a postings region
+wp_error* wp_entry_sizeof_postings_region(wp_entry* entry, struct wp_segment* seg, uint32_t* size) RAISES_ERROR;
+#endif

data/ext/whistlepig/error.c ADDED Viewed

@@ -0,0 +1,24 @@
+#include <stdlib.h>
+#include "error.h"
+wp_error* wp_error_new(const char* msg, const char* src) {
+  wp_error* ret = malloc(sizeof(wp_error));
+  ret->msg = msg;
+  ret->size = 1;
+  ret->srcs = malloc(sizeof(const char*));
+  ret->srcs[0] = src;
+  return ret;
+}
+// private: appends src to e's list of source locations and returns e.
+// src is stored as given, not copied. if the list cannot be grown, e is
+// returned unchanged (src is not recorded) so that the original error still
+// reaches the caller.
+wp_error* wp_error_chain(wp_error* e, const char* src) {
+  // grow through a temporary: assigning realloc's result straight to e->srcs
+  // would lose the existing list, and crash on the store below, on failure.
+  const char** grown = realloc(e->srcs, sizeof(const char*) * (e->size + 1));
+  if(grown == NULL) return e;
+  e->srcs = grown;
+  e->srcs[e->size] = src;
+  e->size++;
+  return e;
+}
+void wp_error_free(wp_error* e) {
+  free(e->srcs);
+  free(e);
+}

data/ext/whistlepig/error.h ADDED Viewed

@@ -0,0 +1,94 @@
+#ifndef WP_ERROR_H_
+#define WP_ERROR_H_
+// whistlepig errors
+// (c) 2011 William Morgan. See COPYING for license terms.
+//
+// a pseudo-backtrace calling convention that whistlepig uses extensively to
+// systematically detect, relay and report errors. no fancy longjmp magic; just
+// macros around return statements, basically.
+//
+// to write a new function that fits in this system:
+//
+// 1. have your function return a wp_error*.
+// 2. mark your function as RAISES_ERROR in the declaration (or use
+//    RAISING_STATIC for static functions that don't need a separate
+//    declaration).
+// 3. within the function, use RAISE_ERROR or RAISE_SYSERROR to raise a new
+//    error and return.
+// 4. within the function, use RELAY_ERROR to wrap all calls to functions that
+//    return wp_error*.
+// 5. return NO_ERROR if nothing happened.
+#include <errno.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+// pseudo-backtrace: one message plus the list of source locations the error
+// passed through, oldest first (wp_error_new stores the first location and
+// wp_error_chain appends each later one).
+typedef struct wp_error {
+  unsigned int size;  // number of entries in srcs
+  const char* msg;    // the error message
+  const char** srcs;  // heap-allocated array of size source-location strings
+} wp_error;
+// for functions
+// RAISES_ERROR goes after a declaration and asks the compiler to warn when a
+// caller ignores the returned wp_error*.
+#define RAISES_ERROR __attribute__ ((warn_unused_result))
+// RAISING_STATIC(f) wraps the signature of a static function definition: it
+// emits a RAISES_ERROR-marked declaration of f followed by the start of the
+// definition itself, so the function body comes right after the macro call.
+#define RAISING_STATIC(f) static wp_error* f RAISES_ERROR; static wp_error* f
+// API methods
+// private: make a new error object with a message and source line
+// (both pointers are stored as given, not copied)
+wp_error* wp_error_new(const char* msg, const char* src) RAISES_ERROR;
+// private: add a source line to a pre-existing error
+// (returns the same error object that was passed in)
+wp_error* wp_error_chain(wp_error* e, const char* src) RAISES_ERROR;
+// public: free an error, once handled
+void wp_error_free(wp_error* e);
+// public: raise an error with a printf-style message
+// formats the message and the current function/file/line into two freshly
+// malloc'ed 1024-byte buffers (longer text is truncated by snprintf) and
+// returns a new wp_error built from them. because it expands to a return
+// statement, it can only be used inside a function returning wp_error*.
+#define RAISE_ERROR(fmt, ...) do { \
+  char* msg = malloc(1024); \
+  char* src = malloc(1024); \
+  snprintf(msg, 1024, fmt, ## __VA_ARGS__); \
+  snprintf(src, 1024, "%s (%s:%d)", __PRETTY_FUNCTION__, __FILE__, __LINE__); \
+  return wp_error_new(msg, src); \
+} while(0)
+// public: raise an error with a printf-style message and have strerror() automatically
+// appended
+// the text for the current errno is added after the message as ": <text>".
+// fmt must be a string literal, since it is joined to ": %s" by literal
+// concatenation.
+#define RAISE_SYSERROR(fmt, ...) RAISE_ERROR(fmt ": %s", ## __VA_ARGS__, strerror(errno))
+// public: relay an error up the stack if the called function returns one.
+// e is evaluated exactly once. if it yields a non-NULL error, the current
+// function/file/line is appended to that error's source list and the error
+// is returned from the enclosing function; otherwise execution continues.
+// like RAISE_ERROR, it can only be used inside a function returning wp_error*.
+#define RELAY_ERROR(e) do { \
+  wp_error* __e = e; \
+  if(__e != NULL) { \
+    char* src = malloc(1024); \
+    snprintf(src, 1024, "%s (%s:%d)", __PRETTY_FUNCTION__, __FILE__, __LINE__); \
+    return wp_error_chain(__e, src); \
+  } \
+} while(0)
+// public: print an error to stream
+// writes "Error: <msg>" followed by one "  at <source>" line per recorded
+// source location. e is evaluated exactly once, so it may be any expression
+// yielding a wp_error* (including one with side effects); everything after
+// that goes through the local copy.
+#define PRINT_ERROR(e, stream) do { \
+  wp_error* __e = (e); \
+  fprintf((stream), "Error: %s\n", __e->msg); \
+  for(unsigned int i = 0; i < __e->size; i++) fprintf((stream), "  at %s\n", __e->srcs[i]); \
+} while(0)
+// public: print and exit if an error exists
+// e is evaluated exactly once. if it yields a non-NULL error, the current
+// function/file/line is appended to its source list, the error is printed to
+// stderr, and the process exits with status -1. otherwise nothing happens.
+#define DIE_IF_ERROR(e) do { \
+  wp_error* __e = e; \
+  if(__e != NULL) { \
+    char* src = malloc(1024); \
+    snprintf(src, 1024, "%s (%s:%d)", __PRETTY_FUNCTION__, __FILE__, __LINE__); \
+    wp_error* err = wp_error_chain(__e, src); \
+    PRINT_ERROR(err, stderr); \
+    exit(-1); \
+  } \
+} while(0)
+// return me if no error happens
+// (it is NULL, which is what RELAY_ERROR and DIE_IF_ERROR test against)
+#define NO_ERROR NULL
+#endif

data/ext/whistlepig/extconf.rb ADDED Viewed

@@ -0,0 +1,6 @@
+# build configuration for the whistlepigc C extension
+require 'mkmf'
+# compile as C99 with debug info and -O3. "$(cflags)" is not Ruby
+# interpolation; it is written into the Makefile as-is.
+$CFLAGS = "-g -O3 -std=c99 $(cflags)"
+create_header
+# generate the Makefile for the extension named "whistlepigc"
+create_makefile "whistlepigc"