RubyGems - ferret - Versions diffs - 0.10.6 → 0.10.7 - Mend

ferret 0.10.6 → 0.10.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (51) hide show

data/ext/analysis.c +136 -107
data/ext/analysis.h +4 -0
data/ext/bitvector.c +2 -2
data/ext/bitvector.h +1 -1
data/ext/compound_io.c +4 -4
data/ext/defines.h +0 -2
data/ext/filter.c +3 -3
data/ext/fs_store.c +4 -4
data/ext/hash.c +29 -18
data/ext/hash.h +34 -16
data/ext/hashset.c +6 -3
data/ext/hashset.h +1 -1
data/ext/index.c +22 -20
data/ext/q_boolean.c +3 -3
data/ext/q_const_score.c +1 -1
data/ext/q_fuzzy.c +1 -1
data/ext/q_match_all.c +1 -1
data/ext/q_multi_term.c +2 -2
data/ext/q_parser.c +21 -6
data/ext/q_phrase.c +2 -2
data/ext/q_prefix.c +1 -1
data/ext/q_range.c +3 -3
data/ext/q_span.c +8 -8
data/ext/q_term.c +1 -1
data/ext/q_wildcard.c +1 -1
data/ext/r_analysis.c +10 -4
data/ext/r_index.c +89 -12
data/ext/r_qparser.c +67 -4
data/ext/r_search.c +11 -1
data/ext/r_store.c +51 -35
data/ext/ram_store.c +18 -18
data/ext/search.c +1 -1
data/ext/search.h +25 -23
data/ext/similarity.c +1 -1
data/ext/sort.c +1 -1
data/ext/store.c +22 -3
data/ext/store.h +8 -2
data/lib/ferret/index.rb +14 -4
data/lib/ferret_version.rb +1 -1
data/test/test_helper.rb +3 -0
data/test/unit/analysis/tc_analyzer.rb +5 -5
data/test/unit/analysis/tc_token_stream.rb +3 -3
data/test/unit/index/tc_index_writer.rb +1 -1
data/test/unit/query_parser/tc_query_parser.rb +7 -5
data/test/unit/search/tc_filter.rb +1 -1
data/test/unit/search/tc_fuzzy_query.rb +1 -1
data/test/unit/search/tc_index_searcher.rb +1 -1
data/test/unit/search/tc_multi_searcher.rb +1 -1
data/test/unit/search/tc_search_and_sort.rb +1 -1
data/test/unit/search/tc_spans.rb +1 -1
metadata +4 -3

data/ext/analysis.c CHANGED Viewed

@@ -230,6 +230,43 @@ Analyzer *analyzer_new(TokenStream *ts,
     return a;
 }
+/****************************************************************************
+ *
+ * Non
+ *
+ ****************************************************************************/
+/*
+ * NonTokenizer
+ */
+static Token *nt_next(TokenStream *ts)
+{
+    if (ts->t) {
+        size_t len = strlen(ts->t);
+        ts->t = NULL;
+        return tk_set(&(CTS(ts)->token), ts->text, len, 0, len, 1);
+    }
+    else {
+        return NULL;
+    }
+}
+TokenStream *non_tokenizer_new()
+{
+    TokenStream *ts = cts_new();
+    ts->next = &nt_next;
+    return ts;
+}
+/*
+ * NonAnalyzer
+ */
+Analyzer *non_analyzer_new()
+{
+    return analyzer_new(non_tokenizer_new(), NULL, NULL);
+}
 /****************************************************************************
  *
  * Whitespace
@@ -520,7 +557,7 @@ static int std_get_alpha(TokenStream *ts, char *token)
 {
     int i = 0;
     char *t = ts->t;
-    while (t[i] != '\0' && isalpha(t[i])) {
+    while (t[i] != '\0' && isalnum(t[i])) {
         if (i < MAX_WORD_SIZE) {
             token[i] = t[i];
         }
@@ -538,7 +575,7 @@ static int mb_std_get_alpha(TokenStream *ts, char *token)
     i = mb_next_char(&wchr, t, &state);
-    while (wchr != 0 && iswalpha(wchr)) {
+    while (wchr != 0 && iswalnum(wchr)) {
         t += i;
         i = mb_next_char(&wchr, t, &state);
     }
@@ -820,133 +857,125 @@ static Token *std_next(TokenStream *ts)
     }
     start = t = ts->t;
-    if ((isdigit(*t) || isnumpunc(*t))
-        && ((len = std_get_number(t)) > 0)) {
-        t += len;
+    token_i = std_tz->get_alpha(ts, token);
+    t += token_i;
+    if (!std_tz->is_tok_char(t)) {
+        /* very common case, ie a plain word, so check and return */
         ts->t = t;
-        tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
+        return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
     }
-    else {
-        token_i = std_tz->get_alpha(ts, token);
-        t += token_i;
-        if (!std_tz->is_tok_char(t)) {
-            /* very common case, ie a plain word, so check and return */
-            ts->t = t;
-            return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
+    if (*t == '\'') {       /* apostrophe case. */
+        t += std_tz->get_apostrophe(t);
+        ts->t = t;
+        len = (int)(t - start);
+        /* strip possessive */
+        if ((t[-1] == 's' || t[-1] == 'S') && t[-2] == '\'') {
+            t -= 2;
+            tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
+            CTS(ts)->token.end += 2;
+        }
+        else {
+            tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
         }
-        if (*t == '\'') {       /* apostrophe case. */
-            t += std_tz->get_apostrophe(t);
-            ts->t = t;
-            len = (int)(t - start);
-            /* strip possesive */
-            if ((t[-1] == 's' || t[-1] == 'S') && t[-2] == '\'') {
-                t -= 2;
-                tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
-                CTS(ts)->token.end += 2;
-            }
-            else {
-                tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
-            }
+        return &(CTS(ts)->token);
+    }
-            return &(CTS(ts)->token);
-        }
+    if (*t == '&') {        /* ampersand (company name) case. */
+        t += std_get_company_name(t);
+        ts->t = t;
+        return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
+    }
-        if (*t == '&') {        /* apostrophe case. */
-            t += std_get_company_name(t);
-            ts->t = t;
-            return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
+    if ((isdigit(*t) || isnumpunc(*t))       /* possibly a number */
+        && (len = std_get_number(t) > 0)) {
+        num_end = start + len;
+        if (!std_tz->is_tok_char(num_end)) { /* won't find a longer token */
+            ts->t = num_end;
+            return tk_set_ts(&(CTS(ts)->token), start, num_end, ts->text, 1);
         }
+        /* else there may be a longer token so check */
+    }
-        if ((isdigit(*t) || isnumpunc(*t))       /* possibly a number */
-            && (len = std_get_number(t) > 0)) {
-            num_end = start + len;
-            if (!std_tz->is_tok_char(num_end)) { /* won't find a longer token */
-                ts->t = num_end;
-                return tk_set_ts(&(CTS(ts)->token), start, num_end, ts->text, 1);
-            }
-            /* else there may be a longer token so check */
+    if (t[0] == ':' && t[1] == '/' && t[2] == '/') {
+        /* check for a known url start */
+        token[token_i] = '\0';
+        t += 3;
+        while (*t == '/') {
+            t++;
         }
-        if (t[0] == ':' && t[1] == '/' && t[2] == '/') {
-            /* check for a known url start */
-            token[token_i] = '\0';
-            t += 3;
-            while (*t == '/') {
-                t++;
-            }
-            if (isalpha(*t) &&
-                (memcmp(token, "ftp", 3) == 0 ||
-                 memcmp(token, "http", 4) == 0 ||
-                 memcmp(token, "https", 5) == 0 ||
-                 memcmp(token, "file", 4) == 0)) {
-                len = std_get_url(t, token, 0); /* dispose of first part of the URL */
-            }
-            else {              /* still treat as url but keep the first part */
-                token_i = (int)(t - start);
-                memcpy(token, start, token_i * sizeof(char));
-                len = token_i + std_get_url(t, token, token_i); /* keep start */
-            }
-            ts->t = t + len;
-            token[len] = 0;
-            return tk_set(&(CTS(ts)->token), token, len, (int)(start - ts->text),
-                   (int)(ts->t - ts->text), 1);
+        if (isalpha(*t) &&
+            (memcmp(token, "ftp", 3) == 0 ||
+             memcmp(token, "http", 4) == 0 ||
+             memcmp(token, "https", 5) == 0 ||
+             memcmp(token, "file", 4) == 0)) {
+            len = std_get_url(t, token, 0); /* dispose of first part of the URL */
         }
-        /* now see how long a url we can find. */
-        is_acronym = true;
-        seen_at_symbol = false;
-        while (isurlxatc(*t)) {
-            if (is_acronym && !isalpha(*t) && (*t != '.')) {
-                is_acronym = false;
-            }
-            if (isurlxatpunc(*t) && isurlxatpunc(t[-1])) {
-                break; /* can't have two punctuation characters in a row */
+        else {              /* still treat as url but keep the first part */
+            token_i = (int)(t - start);
+            memcpy(token, start, token_i * sizeof(char));
+            len = token_i + std_get_url(t, token, token_i); /* keep start */
+        }
+        ts->t = t + len;
+        token[len] = 0;
+        return tk_set(&(CTS(ts)->token), token, len, (int)(start - ts->text),
+               (int)(ts->t - ts->text), 1);
+    }
+    /* now see how long a url we can find. */
+    is_acronym = true;
+    seen_at_symbol = false;
+    while (isurlxatc(*t)) {
+        if (is_acronym && !isalpha(*t) && (*t != '.')) {
+            is_acronym = false;
+        }
+        if (isurlxatpunc(*t) && isurlxatpunc(t[-1])) {
+            break; /* can't have two punctuation characters in a row */
+        }
+        if (*t == '@') {
+            if (seen_at_symbol) {
+                break; /* we can only have one @ symbol */
             }
-            if (*t == '@') {
-                if (seen_at_symbol) {
-                    break; /* we can only have one @ symbol */
-                }
-                else {
-                    seen_at_symbol = true;
-                }
+            else {
+                seen_at_symbol = true;
             }
-            t++;
-        }
-        while (isurlxatpunc(t[-1])) {
-            t--;                /* strip trailing punctuation */
         }
+        t++;
+    }
+    while (isurlxatpunc(t[-1])) {
+        t--;                /* strip trailing punctuation */
+    }
-        if (num_end == NULL || t > num_end) {
-            ts->t = t;
+    if (num_end == NULL || t > num_end) {
+        ts->t = t;
-            if (is_acronym) {   /* check it is one letter followed by one '.' */
-                for (s = start; s < t - 1; s++) {
-                    if (isalpha(*s) && (s[1] != '.'))
-                        is_acronym = false;
-                }
+        if (is_acronym) {   /* check it is one letter followed by one '.' */
+            for (s = start; s < t - 1; s++) {
+                if (isalpha(*s) && (s[1] != '.'))
+                    is_acronym = false;
             }
-            if (is_acronym) {   /* strip '.'s */
-                for (s = start + token_i; s < t; s++) {
-                    if (*s != '.') {
-                        token[token_i] = *s;
-                        token_i++;
-                    }
+        }
+        if (is_acronym) {   /* strip '.'s */
+            for (s = start + token_i; s < t; s++) {
+                if (*s != '.') {
+                    token[token_i] = *s;
+                    token_i++;
                 }
-                tk_set(&(CTS(ts)->token), token, token_i,
-                       (int)(start - ts->text),
-                       (int)(t - ts->text), 1);
-            }
-            else { /* just return the url as is */
-                tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
             }
+            tk_set(&(CTS(ts)->token), token, token_i,
+                   (int)(start - ts->text),
+                   (int)(t - ts->text), 1);
         }
-        else {                  /* return the number */
-            ts->t = num_end;
-            tk_set_ts(&(CTS(ts)->token), start, num_end, ts->text, 1);
+        else { /* just return the url as is */
+            tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
         }
     }
+    else {                  /* return the number */
+        ts->t = num_end;
+        tk_set_ts(&(CTS(ts)->token), start, num_end, ts->text, 1);
+    }
     return &(CTS(ts)->token);
 }

data/ext/analysis.h CHANGED Viewed

@@ -112,6 +112,8 @@ typedef struct StemFilter
 extern void ts_deref(TokenStream *ts);
+extern TokenStream *non_tokenizer_new();
 extern TokenStream *whitespace_tokenizer_new();
 extern TokenStream *mb_whitespace_tokenizer_new(bool lowercase);
@@ -172,6 +174,8 @@ extern Analyzer *analyzer_new(TokenStream *ts,
                                                      char *field,
                                                      char *text));
 extern void a_standard_destroy(Analyzer *a);
+extern Analyzer *non_analyzer_new();
 extern Analyzer *whitespace_analyzer_new(bool lowercase);
 extern Analyzer *mb_whitespace_analyzer_new(bool lowercase);

data/ext/bitvector.c CHANGED Viewed

@@ -345,9 +345,9 @@ int bv_eq(BitVector *bv1, BitVector *bv2)
     return true;
 }
-ulong bv_hash(BitVector *bv)
+unsigned long bv_hash(BitVector *bv)
 {
-    ulong hash = 0;
+    unsigned long hash = 0;
     const f_u32 empty_word = bv->extends_as_ones ? 0xFFFFFFFF : 0;
     int i;
     for (i = (bv->size >> 5); i >= 0; i--) {

data/ext/bitvector.h CHANGED Viewed

@@ -193,7 +193,7 @@ extern int bv_eq(BitVector *bv1, BitVector *bv2);
  * @param bv the BitVector to hash
  * @return A hash value for the BitVector
  */
-extern ulong bv_hash(BitVector *bv);
+extern unsigned long bv_hash(BitVector *bv);
 /**
  * ANDs two BitVectors (+bv1+ and +bv2+) together and return the resultant

data/ext/compound_io.c CHANGED Viewed

@@ -181,7 +181,7 @@ static OutStream *cmpd_new_output(Store *store, const char *file_name)
     return NULL;
 }
-static Lock *cmpd_open_lock(Store *store, char *lock_name)
+static Lock *cmpd_open_lock_i(Store *store, char *lock_name)
 {
     (void)store;
     (void)lock_name;
@@ -189,7 +189,7 @@ static Lock *cmpd_open_lock(Store *store, char *lock_name)
     return NULL;
 }
-static void cmpd_close_lock(Lock *lock)
+static void cmpd_close_lock_i(Lock *lock)
 {
     (void)lock;
     RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
@@ -246,8 +246,8 @@ Store *open_cmpd_store(Store *store, const char *name)
     new_store->close_i      = &cmpd_close_i;
     new_store->new_output   = &cmpd_new_output;
     new_store->open_input   = &cmpd_open_input;
-    new_store->open_lock    = &cmpd_open_lock;
-    new_store->close_lock   = &cmpd_close_lock;
+    new_store->open_lock_i  = &cmpd_open_lock_i;
+    new_store->close_lock_i = &cmpd_close_lock_i;
     return new_store;
 }

data/ext/defines.h CHANGED Viewed

@@ -13,8 +13,6 @@
 typedef unsigned int        bool;
 typedef unsigned char       uchar;
-typedef unsigned int        uint;
-typedef unsigned long int   ulong;
 typedef posh_u16_t f_u16;
 typedef posh_i16_t f_i16;

data/ext/filter.c CHANGED Viewed

@@ -41,7 +41,7 @@ static char *filt_to_s_i(Filter *filt)
     return estrdup(filt->name);
 }
-ulong filt_hash_default(Filter *filt)
+unsigned long filt_hash_default(Filter *filt)
 {
     (void)filt;
     return 0;
@@ -66,7 +66,7 @@ Filter *filt_create(size_t size, const char *name)
     return filt;
 }
-ulong filt_hash(Filter *filt)
+unsigned long filt_hash(Filter *filt)
 {
     return str_hash(filt->name) ^ filt->hash(filt);
 }
@@ -118,7 +118,7 @@ static BitVector *qfilt_get_bv_i(Filter *filt, IndexReader *ir)
     return bv;
 }
-static ulong qfilt_hash(Filter *filt)
+static unsigned long qfilt_hash(Filter *filt)
 {
     return q_hash(QF(filt)->query);
 }

data/ext/fs_store.c CHANGED Viewed

@@ -384,7 +384,7 @@ void fs_lock_release(Lock *lock)
     remove(lock->name);
 }
-static Lock *fs_open_lock(Store *store, char *lockname)
+static Lock *fs_open_lock_i(Store *store, char *lockname)
 {
     Lock *lock = ALLOC(Lock);
     char lname[100];
@@ -398,7 +398,7 @@ static Lock *fs_open_lock(Store *store, char *lockname)
     return lock;
 }
-static void fs_close_lock(Lock *lock)
+static void fs_close_lock_i(Lock *lock)
 {
     remove(lock->name);
     free(lock->name);
@@ -447,8 +447,8 @@ static Store *fs_store_new(const char *pathname)
     new_store->each          = &fs_each;
     new_store->new_output    = &fs_new_output;
     new_store->open_input    = &fs_open_input;
-    new_store->open_lock     = &fs_open_lock;
-    new_store->close_lock    = &fs_close_lock;
+    new_store->open_lock_i   = &fs_open_lock_i;
+    new_store->close_lock_i  = &fs_close_lock_i;
     return new_store;
 }

data/ext/hash.c CHANGED Viewed

@@ -18,16 +18,16 @@ static char *dummy_key = "";
 static HashTable *free_hts[MAX_FREE_HASH_TABLES];
 static int num_free_hts = 0;
-ulong *imalloc(ulong value)
+unsigned long *imalloc(unsigned long value)
 {
-  ulong *p = ALLOC(ulong);
+  unsigned long *p = ALLOC(unsigned long);
   *p = value;
   return p;
 }
-ulong str_hash(const char *const str)
+unsigned long str_hash(const char *const str)
 {
-    register ulong h = 0;
+    register unsigned long h = 0;
     register unsigned char *p = (unsigned char *) str;
     for (; *p; p++) {
@@ -37,6 +37,16 @@ ulong str_hash(const char *const str)
     return h;
 }
+unsigned long ptr_hash(const void *const ptr)
+{
+    return (unsigned long)ptr;
+}
+int ptr_eq(const void *q1, const void *q2)
+{
+    return q1 == q2;
+}
 static int int_eq(const void *q1, const void *q2)
 {
     (void)q1;
@@ -44,9 +54,9 @@ static int int_eq(const void *q1, const void *q2)
     return true;
 }
-static ulong int_hash(const void *i)
+static unsigned long int_hash(const void *i)
 {
-    return *((ulong *)i);
+    return *((unsigned long *)i);
 }
 typedef HashEntry *(*lookup_ft)(struct HashTable *ht, register const void *key);
@@ -58,9 +68,10 @@ typedef HashEntry *(*lookup_ft)(struct HashTable *ht, register const void *key);
  * @param ht the HashTable to do the fast lookup in
  * @param the hashkey we are looking for
  */
-static __inline HashEntry *h_resize_lookup(HashTable *ht, register const ulong hash)
+static __inline HashEntry *h_resize_lookup(HashTable *ht,
+                                           register const unsigned long hash)
 {
-    register ulong perturb;
+    register unsigned long perturb;
     register int mask = ht->mask;
     register HashEntry *he0 = ht->table;
     register int i = hash & mask;
@@ -83,8 +94,8 @@ static __inline HashEntry *h_resize_lookup(HashTable *ht, register const ulong h
 HashEntry *h_lookup_int(HashTable *ht, const void *key)
 {
-    register ulong hash = *((int *)key);
-    register ulong perturb;
+    register unsigned long hash = *((int *)key);
+    register unsigned long perturb;
     register int mask = ht->mask;
     register HashEntry *he0 = ht->table;
     register int i = hash & mask;
@@ -120,8 +131,8 @@ HashEntry *h_lookup_int(HashTable *ht, const void *key)
 HashEntry *h_lookup_str(HashTable *ht, register const char *key)
 {
-    register ulong hash = str_hash(key);
-    register ulong perturb;
+    register unsigned long hash = str_hash(key);
+    register unsigned long perturb;
     register int mask = ht->mask;
     register HashEntry *he0 = ht->table;
     register int i = hash & mask;
@@ -477,32 +488,32 @@ int h_has_key(HashTable *ht, const void *key)
     }
 }
-void *h_get_int(HashTable *self, const ulong key)
+void *h_get_int(HashTable *self, const unsigned long key)
 {
   return h_get(self, &key);
 }
-int h_del_int(HashTable *self, const ulong key)
+int h_del_int(HashTable *self, const unsigned long key)
 {
   return h_del(self, &key);
 }
-void *h_rem_int(HashTable *self, const ulong key)
+void *h_rem_int(HashTable *self, const unsigned long key)
 {
   return h_rem(self, &key, false);
 }
-int h_set_int(HashTable *self, const ulong key, void *value)
+int h_set_int(HashTable *self, const unsigned long key, void *value)
 {
   return h_set(self, &key, value);
 }
-int h_set_safe_int(HashTable *self, const ulong key, void *value)
+int h_set_safe_int(HashTable *self, const unsigned long key, void *value)
 {
   return h_set_safe(self, &key, value);
 }
-int h_has_key_int(HashTable *self, const ulong key)
+int h_has_key_int(HashTable *self, const unsigned long key)
 {
   return h_has_key(self, &key);
 }