RubyGems - fuzz_ball - Versions diffs - 0.9.0 - Mend

fuzz_ball 0.9.0

Files changed (13) hide show

data/ext/fuzz_ball/duple_index/DupleIndex.c +276 -0
data/ext/fuzz_ball/duple_index/DupleIndex.h +60 -0
data/ext/fuzz_ball/duple_index/extconf.rb +5 -0
data/ext/fuzz_ball/duple_index/utarray.h +226 -0
data/ext/fuzz_ball/duple_index/uthash.h +904 -0
data/ext/fuzz_ball/duple_index/utlist.h +522 -0
data/ext/fuzz_ball/duple_index/utstring.h +148 -0
data/ext/fuzz_ball/smith_waterman/SmithWaterman.c +211 -0
data/ext/fuzz_ball/smith_waterman/SmithWaterman.h +19 -0
data/ext/fuzz_ball/smith_waterman/extconf.rb +5 -0
data/lib/fuzz_ball.rb +6 -0
data/lib/fuzz_ball/searcher.rb +92 -0
metadata +66 -0

data/ext/fuzz_ball/duple_index/DupleIndex.c ADDED Viewed

@@ -0,0 +1,276 @@
+// Include the Ruby headers and goodies
+#include <ruby.h>
+#include "uthash.h"
+#include <DupleIndex.h>
+/* Init_duple_index
+ *
+ * The initialization method for this module. Defines the FuzzBall module
+ * and the FuzzBall::DupleIndex class beneath it, then registers the custom
+ * allocator and the two public instance methods.
+ */
+void Init_duple_index() {
+  FuzzBall   = rb_define_module("FuzzBall");
+  DupleIndex = rb_define_class_under(FuzzBall, "DupleIndex", rb_cObject);
+  rb_define_alloc_func(DupleIndex, method_alloc_index);   // allocator builds the hidden-entry hash
+  rb_define_method(DupleIndex, "add", method_add, 2);     // add(str_id, chars)
+  rb_define_method(DupleIndex, "match", method_match, 1); // match(needle)
+}
+/* method_alloc_index
+ *
+ * Custom allocation function registered for FuzzBall::DupleIndex. It builds
+ * the uthash table that serves as the duple index and wraps the table's
+ * head pointer inside the new Ruby object, with method_free_index as the
+ * free callback.
+ *
+ * The table is seeded with a single hidden entry (duple (-1, -1), string
+ * id -1, position -1) so the head pointer is never NULL. add_duple receives
+ * the head by value, so it could not publish a brand-new head back to the
+ * Ruby object.
+ *
+ * self: the VALUE handed to the allocator; it is passed straight through to
+ *       Data_Wrap_Struct.
+ * Returns the wrapped object. Raises NoMemoryError if malloc fails.
+ */
+VALUE method_alloc_index(VALUE self) {
+  // uthash requires the head pointer to start out as NULL; HASH_ADD_INT on
+  // an uninitialized pointer would dereference garbage.
+  struct duples_hash *duples = NULL;
+  struct duples_hash *ptr;
+  struct duple_pos *hidden_pos;
+
+  // Build the position node first so that nothing is leaked if the second
+  // allocation fails.
+  hidden_pos = create_duple_pos(-1, -1, NULL, NULL);
+  ptr        = malloc(sizeof(struct duples_hash));
+  if (ptr == NULL) {
+    free(hidden_pos);
+    rb_memerror();
+  }
+  ptr->id      = duple_id(-1, -1);
+  ptr->strings = hidden_pos;
+  HASH_ADD_INT(duples, id, ptr);
+  return Data_Wrap_Struct(self, NULL, method_free_index, duples);
+}
+/* method_free_index
+ *
+ * Free callback given to Data_Wrap_Struct by method_alloc_index. Ruby hands
+ * back the wrapped hash head as an untyped pointer; this forwards it to
+ * destroy_index, which releases every hash member.
+ */
+static void method_free_index(void *duples) {
+  struct duples_hash *index_head = duples;
+
+  destroy_index(index_head);
+}
+/* destroy_index
+ *
+ * Walks every member of the duples hash. For each one it frees the linked
+ * list of duple positions, unlinks the member from the hash, and frees the
+ * member itself.
+ */
+void destroy_index(struct duples_hash *duples) {
+  struct duples_hash *entry;
+  struct duples_hash *lookahead;   // HASH_ITER's saved successor, so entry may be deleted
+
+  HASH_ITER(hh, duples, entry, lookahead) {
+    destroy_duple_pos(entry->strings);
+    HASH_DEL(duples, entry);
+    free(entry);
+  }
+}
+/* destroy_duple_pos
+ *
+ * Frees every node of a duple_pos list, starting at head and following the
+ * next pointers until the end of the list.
+ *
+ * head: first node of the list; NULL is accepted and treated as an empty
+ *       list (previously a NULL head was dereferenced).
+ */
+void destroy_duple_pos(struct duple_pos *head) {
+  struct duple_pos *n_pos;
+
+  while (head != NULL) {
+    n_pos = head->next;   // read the successor before the node is released
+    free(head);
+    head = n_pos;
+  }
+}
+/* method_add
+ *
+ * Ruby public method that adds a string to the duple index. The string is
+ * identified by a unique integer (r_str_id) and given as an Array of
+ * integers, one per character (r_str). Every pair of adjacent characters
+ * is recorded together with the string id and the offset of the pair's
+ * first character, so later lookups can tell which strings contain a duple
+ * and where.
+ *
+ * Raises TypeError if r_str is not an Array, and whatever NUM2INT raises
+ * for elements or ids that are not convertible to int. Duples indexed
+ * before such an error stay in the index.
+ * Returns Qtrue.
+ */
+VALUE method_add(VALUE self, VALUE r_str_id, VALUE r_str) {
+  int i, str_id, str_len, c_a, c_b;
+  struct duples_hash *duples;
+
+  // RARRAY_LEN reads the Array header directly, so anything that is not an
+  // Array must be rejected before it is touched.
+  Check_Type(r_str, T_ARRAY);
+  Data_Get_Struct(self, struct duples_hash, duples);
+  str_len = (int) RARRAY_LEN(r_str);
+  str_id  = NUM2INT( r_str_id );
+  for (i=0; i<(str_len-1); i++) {
+    // rb_ary_entry is bounds-checked (it yields nil past the end), unlike
+    // indexing the raw element pointer.
+    c_a = NUM2INT( rb_ary_entry(r_str, i) );
+    c_b = NUM2INT( rb_ary_entry(r_str, i+1) );
+    add_duple(duples, c_a, c_b, str_id, i);
+  }
+  return Qtrue;
+}
+/* duple_id
+ *
+ * Folds the two integers that represent a duple into one integer used as
+ * the hash key: c_a is reduced modulo MAX_CHARS and scaled by MAX_CHARS,
+ * then c_b is added as-is.
+ *
+ * NOTE(review): c_b is not reduced, so distinct duples can share a key once
+ * either value falls outside 0..MAX_CHARS-1 -- confirm callers stay within
+ * that range.
+ */
+int duple_id(int c_a, int c_b) {
+  int scaled_first = (c_a % MAX_CHARS) * MAX_CHARS;
+
+  return scaled_first + c_b;
+}
+/* duple_at
+ *
+ * Looks up the hash member for the duple (c_a, c_b). The member's strings
+ * list tells which strings, and where in those strings, the duple appears.
+ * Returns NULL when the duple has not been indexed.
+ */
+struct duples_hash *duple_at(struct duples_hash *duples, int c_a, int c_b) {
+  struct duples_hash *found;
+  int key = duple_id(c_a, c_b);
+
+  HASH_FIND_INT(duples, &key, found);
+  return found;
+}
+/* add_duple
+ *
+ * Records that the duple (c_a, c_b) appears in string `index` at offset
+ * `pos`. If the duple is new, a hash member is created for it; otherwise
+ * the new position is pushed onto the front of the member's list.
+ *
+ * duples must be the non-NULL head of the index: it is received by value,
+ * so a head created here would not be visible to the caller.
+ * Raises NoMemoryError if the hash member cannot be allocated.
+ */
+void add_duple(struct duples_hash *duples, int c_a, int c_b, int index, int pos) {
+  struct duples_hash *ptr;
+  struct duple_pos *d_pos;
+
+  ptr = duple_at(duples, c_a, c_b);
+  if (ptr != NULL) {
+    // Known duple: link the new node in front of the current list head.
+    d_pos              = create_duple_pos(index, pos, ptr->strings, NULL);
+    ptr->strings->prev = d_pos;
+    ptr->strings       = d_pos;
+    return;
+  }
+
+  // New duple: build the position node first so a failed allocation of the
+  // hash member leaves the index untouched.
+  d_pos = create_duple_pos(index, pos, NULL, NULL);
+  ptr   = malloc(sizeof(struct duples_hash));
+  if (ptr == NULL) {
+    free(d_pos);
+    rb_memerror();
+  }
+  ptr->id      = duple_id(c_a, c_b);
+  ptr->strings = d_pos;
+  HASH_ADD_INT(duples, id, ptr);
+}
+/* create_duple_pos
+ *
+ * Allocates one duple_pos list node and fills in the string id (index), the
+ * offset within that string (pos) and both link pointers. The caller owns
+ * the node; destroy_duple_pos releases whole lists of them.
+ *
+ * Raises NoMemoryError when malloc fails instead of writing through NULL.
+ */
+struct duple_pos *create_duple_pos(int index, int pos, struct duple_pos *next, struct duple_pos *prev) {
+  struct duple_pos *ptr;
+
+  ptr = malloc( sizeof(struct duple_pos) );
+  if (ptr == NULL)
+    rb_memerror();
+  ptr->index = index;
+  ptr->pos   = pos;
+  ptr->next  = next;
+  ptr->prev  = prev;
+  return ptr;
+}
+/* method_match
+ *
+ * Ruby public method that queries the duple index. The needle is an Array
+ * of integers, one per character. For every duple in the needle we look up
+ * the indexed strings containing it and keep a per-string count of matched
+ * duples in a temporary uthash table.
+ *
+ * Returns a Hash that maps a match count to the Array of string ids which
+ * reached that count. A needle with fewer than two elements yields an empty
+ * Hash. Raises TypeError if needle is not an Array.
+ *
+ * NOTE(review): if NUM2INT raises part-way through the needle, the
+ * temporary matches table is not released.
+ */
+VALUE method_match(VALUE self, VALUE needle) {
+  int i, n_needle, c_a, c_b, match_id;
+  struct match *matches, *match, *match_tmp;
+  struct duples_hash *duples, *duple;
+  struct duple_pos *pos;
+  VALUE matches_by_score, score, arr;
+
+  // RARRAY_LEN reads the Array header directly, so anything that is not an
+  // Array must be rejected before it is touched.
+  Check_Type(needle, T_ARRAY);
+
+  matches_by_score = rb_hash_new();
+  Data_Get_Struct(self, struct duples_hash, duples);
+  matches = NULL;
+  n_needle = (int) RARRAY_LEN(needle);
+  if (n_needle < 2)
+    return matches_by_score; // fewer than two chars means there is no duple
+
+  // Loop over each duple in the needle string
+  for (i=0; i<(n_needle-1); i++) {
+    // rb_ary_entry is bounds-checked (nil past the end), unlike indexing
+    // the raw element pointer.
+    c_a = NUM2INT( rb_ary_entry(needle, i) );
+    c_b = NUM2INT( rb_ary_entry(needle, i+1) );
+    duple = duple_at(duples, c_a, c_b); // strings and positions holding this duple
+    if (duple == NULL)
+      continue;
+
+    // Loop over every (string, position) where the duple was indexed
+    for (pos = duple->strings; pos != NULL; pos = pos->next) {
+      match_id = pos->index;
+      HASH_FIND_INT(matches, &match_id, match);
+      if (match == NULL) {
+        // First duple matched for this string: start its counter at 1
+        match = create_match(match_id, pos->pos, c_a, c_b);
+        HASH_ADD_INT(matches, id, match);
+      } else if ((match->last_matched_position < pos->pos) && (match->last_matched_ca != c_a) && (match->last_matched_cb != c_b)) {
+        /* Only count the duple if it appears *AFTER* the last matched
+         * duple in the indexed string. With a needle of 'abc' and an
+         * indexed string of 'bcab', 'bc' must not count because it
+         * precedes 'ab' in the indexed string but follows it in the needle.
+         *
+         * NOTE(review): the character test requires BOTH characters to
+         * differ from the last matched duple, so a different duple that
+         * shares one character is skipped too -- confirm this is intended.
+         */
+        update_match( match, pos->pos, c_a, c_b );
+      }
+    }
+  }
+
+  // Build the Ruby hash keyed by match count, freeing the temporary table
+  // as we go.
+  HASH_ITER(hh, matches, match, match_tmp) {
+    score = INT2NUM(match->n_matches);
+    arr   = rb_hash_aref(matches_by_score, score);
+    if (arr == Qnil) {
+      arr = rb_ary_new();
+      rb_hash_aset(matches_by_score, score, arr);
+    }
+    rb_ary_push(arr, INT2NUM(match->id));
+    HASH_DEL(matches, match);
+    free(match);
+  }
+  return matches_by_score;
+}
+/* create_match
+ *
+ * Allocates the bookkeeping record for a string that has just matched its
+ * first needle duple: the count starts at 1 and the position and characters
+ * of that duple are remembered as the last match. The caller owns the
+ * record (method_match frees it).
+ *
+ * Raises NoMemoryError when malloc fails instead of writing through NULL.
+ */
+struct match *create_match(int id, int pos, int c_a, int c_b) {
+  struct match *new_match;
+
+  new_match = malloc( sizeof(struct match) );
+  if (new_match == NULL)
+    rb_memerror();
+  new_match->id                    = id;
+  new_match->n_matches             = 1;
+  new_match->last_matched_position = pos;
+  new_match->last_matched_ca       = c_a;
+  new_match->last_matched_cb       = c_b;
+  return new_match;
+}
+/* update_match
+ *
+ * Registers one more matched duple on an existing match record and
+ * remembers that duple's position and characters as the most recent match.
+ */
+void update_match(struct match* match, int pos, int c_a, int c_b) {
+  match->last_matched_ca       = c_a;
+  match->last_matched_cb       = c_b;
+  match->last_matched_position = pos;
+  match->n_matches            += 1;
+}

data/ext/fuzz_ball/duple_index/DupleIndex.h ADDED Viewed

@@ -0,0 +1,60 @@
+#define MAX_CHARS 1000 /* modulus and multiplier duple_id applies to the first character of a duple */
+/* duples_hash is the struct that is the "core" of the duple index: one
+ * uthash member per distinct duple.
+ * The id field is the integer key that duple_id computes for the duple.
+ * The strings pointer is the head of a linked list of duple_pos nodes, one
+ * for each (string, position) where the duple has been indexed.
+ * The hh field is the handle uthash uses to make the struct hashable.
+ */
+struct duples_hash {
+  int id;
+  struct duple_pos *strings;
+  UT_hash_handle hh;
+};
+/* duple_pos is a node on a doubly-linked list that records one occurrence
+ * of a duple: index is the id of the string it occurs in and pos is the
+ * offset of the duple's first character inside that string.
+ * The list holds every location where a duple appears; add_duple pushes
+ * new nodes at the head, so head->prev is NULL.
+ */
+struct duple_pos {
+  int index;
+  int pos;
+  struct duple_pos *next;
+  struct duple_pos *prev;
+};
+/* match is a struct that keeps track of how many duples match in a given
+ * string while method_match runs. The id field is the id of the string and
+ * also the uthash key, n_matches is the number of duples that have matched
+ * (create_match starts it at 1), last_matched_position is the position of
+ * the last matching duple inside the indexed string, and last_matched_ca
+ * and last_matched_cb record the first and second character of the last
+ * matched duple. hh is the uthash handle.
+ */
+struct match {
+  int id;
+  int n_matches;
+  int last_matched_position;
+  int last_matched_ca;
+  int last_matched_cb;
+  UT_hash_handle hh;
+};
+// Ruby-related declarations: the module/class handles plus the functions registered with Ruby
+VALUE FuzzBall   = Qnil; // NOTE(review): defined (not merely declared) in a header -- fine only while a single .c file includes it
+VALUE DupleIndex = Qnil; // set by Init_duple_index
+void        Init_duple_index();
+VALUE       method_alloc_index(VALUE self);
+static void method_free_index(void *duples);
+VALUE       method_add(VALUE self, VALUE r_str_id, VALUE r_str);
+VALUE       method_match(VALUE self, VALUE needle);
+// Internally-used C-declarations (i.e., private methods); none of these are registered with Ruby
+void               add_duple(struct duples_hash *duples, int c_a, int c_b, int index, int pos);
+struct duples_hash *duple_at(struct duples_hash *duples, int c_a, int c_b);
+int                duple_id(int c_a, int c_b);
+struct duple_pos   *create_duple_pos(int index, int pos, struct duple_pos *next, struct duple_pos *prev);
+void               destroy_index(struct duples_hash *duples);
+void               destroy_duple_pos(struct duple_pos *head);
+struct match       *create_match(int id, int pos, int c_a, int c_b);
+void               update_match(struct match* match, int pos, int c_a, int c_b);

data/ext/fuzz_ball/duple_index/extconf.rb ADDED Viewed

@@ -0,0 +1,5 @@
+require 'mkmf'                            # supplies dir_config and create_makefile
+dir_config("fuzz_ball/duple_index")       # NOTE(review): presumably registers --with-...-dir build options for this target; confirm against the mkmf docs
+create_makefile("fuzz_ball/duple_index")  # generates the Makefile for the fuzz_ball/duple_index extension

data/ext/fuzz_ball/duple_index/utarray.h ADDED Viewed

@@ -0,0 +1,226 @@
+/*
+Copyright (c) 2008-2011, Troy D. Hanson   http://uthash.sourceforge.net
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+/* a dynamic array implementation using macros
+ * see http://uthash.sourceforge.net/utarray
+ */
+#ifndef UTARRAY_H
+#define UTARRAY_H
+#define UTARRAY_VERSION 1.9.4
+#ifdef __GNUC__
+#define _UNUSED_ __attribute__ ((__unused__))
+#else
+#define _UNUSED_
+#endif
+#include <stddef.h>  /* size_t */
+#include <string.h>  /* memset, etc */
+#include <stdlib.h>  /* exit */
+#define oom() exit(-1)
+typedef void (ctor_f)(void *dst, const void *src); /* copy-construct the element at dst from src */
+typedef void (dtor_f)(void *elt);                  /* release whatever the element at elt owns */
+typedef void (init_f)(void *elt);                  /* initialize a freshly added element */
+typedef struct {
+    size_t sz;     /* size of one element in bytes */
+    init_f *init;  /* optional; when NULL new elements are zero-filled */
+    ctor_f *copy;  /* optional; when NULL elements are copied with memcpy */
+    dtor_f *dtor;  /* optional; when NULL elements are dropped without cleanup */
+} UT_icd;
+typedef struct {
+    unsigned i,n;/* i: index of next available slot, n: num slots */
+    const UT_icd *icd; /* initializer, copy and destructor functions */
+    char *d;     /* n slots of size icd->sz*/
+} UT_array;
+#define utarray_init(a,_icd) do { /* zero the UT_array header, then attach its icd */ \
+  memset(a,0,sizeof(UT_array));                                               \
+  (a)->icd=_icd;                                                              \
+} while(0)
+#define utarray_done(a) do { /* destroy live elements, free the slot buffer, set capacity to 0 */ \
+  if ((a)->n) {                                                               \
+    if ((a)->icd->dtor) {                                                     \
+      size_t _ut_i;                                                           \
+      for(_ut_i=0; _ut_i < (a)->i; _ut_i++) {                                 \
+        (a)->icd->dtor(utarray_eltptr(a,_ut_i));                              \
+      }                                                                       \
+    }                                                                         \
+    free((a)->d);                                                             \
+  }                                                                           \
+  (a)->n=0;                                                                   \
+} while(0)
+/* utarray_new(a,_icd)
+ * Heap-allocates a UT_array header into `a` and initializes it with _icd.
+ * Calls oom() when malloc fails rather than handing NULL to utarray_init's
+ * memset. Release with utarray_free.
+ */
+#define utarray_new(a,_icd) do {                                              \
+  (a)=(UT_array*)malloc(sizeof(UT_array));                                    \
+  if ((a) == NULL) oom();                                                     \
+  utarray_init(a,_icd);                                                       \
+} while(0)
+#define utarray_free(a) do { /* utarray_done, then free the header itself */  \
+  utarray_done(a);                                                            \
+  free(a);                                                                    \
+} while(0)
+/* utarray_reserve(a,by)
+ * Ensures there is room for `by` more elements beyond the current length,
+ * doubling the capacity (starting at 8) until it fits. Calls oom() when
+ * realloc fails. `by` is parenthesized so that expression arguments such
+ * as `x ? 1 : 2` or `n - m` expand correctly.
+ */
+#define utarray_reserve(a,by) do {                                            \
+  if (((a)->i+(by)) > ((a)->n)) {                                             \
+    while(((a)->i+(by)) > ((a)->n)) { (a)->n = ((a)->n ? (2*(a)->n) : 8); }   \
+    if ( ((a)->d=(char*)realloc((a)->d, (a)->n*(a)->icd->sz)) == NULL) oom(); \
+  }                                                                           \
+} while(0)
+#define utarray_push_back(a,p) do { /* append a copy of *p (icd copy, else memcpy) */ \
+  utarray_reserve(a,1);                                                       \
+  if ((a)->icd->copy) { (a)->icd->copy( _utarray_eltptr(a,(a)->i++), p); }    \
+  else { memcpy(_utarray_eltptr(a,(a)->i++), p, (a)->icd->sz); };             \
+} while(0)
+#define utarray_pop_back(a) do { /* drop the last element, running dtor if set; no empty check */ \
+  if ((a)->icd->dtor) { (a)->icd->dtor( _utarray_eltptr(a,--((a)->i))); }     \
+  else { (a)->i--; }                                                          \
+} while(0)
+#define utarray_extend_back(a) do { /* append one element built by icd init, else zero-filled */ \
+  utarray_reserve(a,1);                                                       \
+  if ((a)->icd->init) { (a)->icd->init(_utarray_eltptr(a,(a)->i)); }          \
+  else { memset(_utarray_eltptr(a,(a)->i),0,(a)->icd->sz); }                  \
+  (a)->i++;                                                                   \
+} while(0)
+#define utarray_len(a) ((a)->i) /* number of elements currently stored */
+#define utarray_eltptr(a,j) (((j) < (a)->i) ? _utarray_eltptr(a,j) : NULL) /* bounds-checked: NULL when j >= length */
+#define _utarray_eltptr(a,j) ((char*)((a)->d + ((a)->icd->sz*(j) ))) /* unchecked address of slot j */
+/* utarray_insert(a,p,j)
+ * Inserts a copy of *p at index j, shifting later elements up by one.
+ * Does nothing (beyond reserving capacity) when j is past the end. The
+ * range check parenthesizes j so expression arguments expand correctly.
+ */
+#define utarray_insert(a,p,j) do {                                            \
+  utarray_reserve(a,1);                                                       \
+  if ((j) > (a)->i) break;                                                    \
+  if ((j) < (a)->i) {                                                         \
+    memmove( _utarray_eltptr(a,(j)+1), _utarray_eltptr(a,j),                  \
+             ((a)->i - (j))*((a)->icd->sz));                                  \
+  }                                                                           \
+  if ((a)->icd->copy) { (a)->icd->copy( _utarray_eltptr(a,j), p); }           \
+  else { memcpy(_utarray_eltptr(a,j), p, (a)->icd->sz); };                    \
+  (a)->i++;                                                                   \
+} while(0)
+/* utarray_inserta(a,w,j)
+ * Inserts copies of every element of array w into a at index j, shifting
+ * later elements up. Does nothing when w is empty or j is past the end.
+ * Uses a's icd for sizes and copying, so both arrays are expected to share
+ * an element type. j is parenthesized wherever it is used directly.
+ */
+#define utarray_inserta(a,w,j) do {                                           \
+  if (utarray_len(w) == 0) break;                                             \
+  if ((j) > (a)->i) break;                                                    \
+  utarray_reserve(a,utarray_len(w));                                          \
+  if ((j) < (a)->i) {                                                         \
+    memmove(_utarray_eltptr(a,(j)+utarray_len(w)),                            \
+            _utarray_eltptr(a,j),                                             \
+            ((a)->i - (j))*((a)->icd->sz));                                   \
+  }                                                                           \
+  if ((a)->icd->copy) {                                                       \
+    size_t _ut_i;                                                             \
+    for(_ut_i=0;_ut_i<(w)->i;_ut_i++) {                                       \
+      (a)->icd->copy(_utarray_eltptr(a,(j)+_ut_i), _utarray_eltptr(w,_ut_i)); \
+    }                                                                         \
+  } else {                                                                    \
+    memcpy(_utarray_eltptr(a,j), _utarray_eltptr(w,0),                        \
+           utarray_len(w)*((a)->icd->sz));                                    \
+  }                                                                           \
+  (a)->i += utarray_len(w);                                                   \
+} while(0)
+/* utarray_resize(dst,num)
+ * Sets the length of dst to exactly num elements. When shrinking, the
+ * dropped elements are passed to the icd dtor (if any). When growing, the
+ * new elements are built with the icd init function, or zero-filled when
+ * there is none.
+ *
+ * The new slots lie at indices >= the current length, where the
+ * bounds-checked utarray_eltptr yields NULL, so they are addressed with
+ * _utarray_eltptr. dst and num are parenthesized throughout.
+ */
+#define utarray_resize(dst,num) do {                                          \
+  size_t _ut_i;                                                               \
+  if ((dst)->i > (size_t)(num)) {                                             \
+    if ((dst)->icd->dtor) {                                                   \
+      for(_ut_i=(size_t)(num); _ut_i < (dst)->i; _ut_i++) {                   \
+        (dst)->icd->dtor(_utarray_eltptr(dst,_ut_i));                         \
+      }                                                                       \
+    }                                                                         \
+  } else if ((dst)->i < (size_t)(num)) {                                      \
+    utarray_reserve(dst,(num)-(dst)->i);                                      \
+    if ((dst)->icd->init) {                                                   \
+      for(_ut_i=(dst)->i; _ut_i < (size_t)(num); _ut_i++) {                   \
+        (dst)->icd->init(_utarray_eltptr(dst,_ut_i));                         \
+      }                                                                       \
+    } else {                                                                  \
+      memset(_utarray_eltptr(dst,(dst)->i),0,                                 \
+             (dst)->icd->sz*((num)-(dst)->i));                                \
+    }                                                                         \
+  }                                                                           \
+  (dst)->i = (num);                                                           \
+} while(0)
+#define utarray_concat(dst,src) do { /* append every element of src to the end of dst */ \
+  utarray_inserta((dst),(src),utarray_len(dst));                                  \
+} while(0)
+/* utarray_erase(a,pos,len)
+ * Removes len elements starting at index pos: each is passed to the icd
+ * dtor (if any), then the tail is moved down over the gap and the length
+ * reduced by len. The caller is responsible for pos+len not exceeding the
+ * current length. All macro arguments are parenthesized so expression
+ * arguments expand correctly.
+ */
+#define utarray_erase(a,pos,len) do {                                         \
+  if ((a)->icd->dtor) {                                                       \
+    size_t _ut_i;                                                             \
+    for(_ut_i=0; _ut_i < (len); _ut_i++) {                                    \
+      (a)->icd->dtor(utarray_eltptr(a,(pos)+_ut_i));                          \
+    }                                                                         \
+  }                                                                           \
+  if ((a)->i > ((pos)+(len))) {                                               \
+    memmove( _utarray_eltptr(a,pos), _utarray_eltptr(a,(pos)+(len)),          \
+            ((a)->i-((pos)+(len)))*((a)->icd->sz));                           \
+  }                                                                           \
+  (a)->i -= (len);                                                            \
+} while(0)
+#define utarray_clear(a) do { /* destroy all elements and set length to 0; capacity is kept */ \
+  if ((a)->i > 0) {                                                           \
+    if ((a)->icd->dtor) {                                                     \
+      size_t _ut_i;                                                           \
+      for(_ut_i=0; _ut_i < (a)->i; _ut_i++) {                                 \
+        (a)->icd->dtor(utarray_eltptr(a,_ut_i));                              \
+      }                                                                       \
+    }                                                                         \
+    (a)->i = 0;                                                               \
+  }                                                                           \
+} while(0)
+#define utarray_sort(a,cmp) do { /* qsort the live elements in place with cmp */ \
+  qsort((a)->d, (a)->i, (a)->icd->sz, cmp);                                   \
+} while(0)
+#define utarray_find(a,v,cmp) bsearch((v),(a)->d,(a)->i,(a)->icd->sz,cmp) /* binary search; the array must already be ordered by cmp */
+#define utarray_front(a) (((a)->i) ? (_utarray_eltptr(a,0)) : NULL) /* first element, or NULL when empty */
+#define utarray_next(a,e) (((e)==NULL) ? utarray_front(a) : ((((a)->i) > (utarray_eltidx(a,e)+1)) ? _utarray_eltptr(a,utarray_eltidx(a,e)+1) : NULL)) /* element after e (first when e is NULL); NULL at the end */
+#define utarray_back(a) (((a)->i) ? (_utarray_eltptr(a,(a)->i-1)) : NULL) /* last element, or NULL when empty */
+#define utarray_eltidx(a,e) (((char*)(e) >= (char*)((a)->d)) ? (((char*)(e) - (char*)((a)->d))/(a)->icd->sz) : -1) /* index of the element at e; -1 when e lies before the buffer */
+/* Finally, ready-made icd descriptors for the two common cases: arrays of
+ * C strings (each element owns a strdup'd copy) and arrays of plain ints. */
+
+/* Copy hook for string arrays: duplicates the source string into the
+ * destination slot, or stores NULL when the source slot holds NULL. */
+static void utarray_str_cpy(void *dst, const void *src) {
+  char **to   = (char**)dst;
+  char **from = (char**)src;
+  if (*from == NULL) {
+    *to = NULL;
+  } else {
+    *to = strdup(*from);
+  }
+}
+/* Destructor hook for string arrays: frees the string held in the slot,
+ * if there is one. */
+static void utarray_str_dtor(void *elt) {
+  char **slot = (char**)elt;
+  if (*slot != NULL) {
+    free(*slot);
+  }
+}
+static const UT_icd ut_str_icd _UNUSED_ = {sizeof(char*),NULL,utarray_str_cpy,utarray_str_dtor};
+static const UT_icd ut_int_icd _UNUSED_ = {sizeof(int),NULL,NULL,NULL};
+#endif /* UTARRAY_H */