RubyGems - ferret - Versions diffs - 0.10.2 → 0.10.3 - Mend

ferret 0.10.2 → 0.10.3

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (22)

data/Rakefile +31 -36
data/ext/analysis.c +97 -37
data/ext/analysis.h +11 -0
data/ext/ferret.c +10 -0
data/ext/ferret.h +2 -0
data/ext/inc/lang.h +1 -0
data/ext/index.c +2 -2
data/ext/lang.h +1 -0
data/ext/q_parser.c +25 -5
data/ext/r_analysis.c +97 -53
data/ext/r_index.c +0 -1
data/ext/r_search.c +1 -1
data/ext/search.c +7 -3
data/ext/term_vectors.c +1 -1
data/lib/ferret/index.rb +94 -48
data/lib/ferret_version.rb +1 -1
data/test/unit/analysis/tc_analyzer.rb +24 -8
data/test/unit/analysis/tc_token_stream.rb +7 -0
data/test/unit/index/tc_index.rb +2 -2
data/test/unit/query_parser/tc_query_parser.rb +3 -3
metadata +12 -7
data/ext/tags +0 -7841

data/Rakefile CHANGED

@@ -121,7 +121,17 @@ file "ext/#{EXT}" => ["ext/Makefile"] do
   cp "ext/inc/threading.h", "ext/threading.h"
   cd "ext"
   if (/mswin/ =~ RUBY_PLATFORM) and ENV['make'].nil?
-    sh "nmake"
+    begin
+      sh "nmake"
+    rescue Exception => e
+      puts
+      puts "**********************************************************************"
+      puts "You may need to call VCVARS32.BAT to set the environment variables."
+      puts '  c:\Program Files\Microsoft Visual Studio\VC98\Bin\VCVARS32.BAT'
+      puts "**********************************************************************"
+      puts
+      raise e
+    end
   else
     sh "make"
   end
@@ -132,6 +142,7 @@ file "ext/lang.h" => ["ext/inc/lang.h"] do
   rm_f "ext/lang.h"
   cp "ext/inc/lang.h", "ext/lang.h"
 end
 file "ext/threading.h" => ["ext/inc/threading.h"] do
   rm_f "ext/threading.h"
   cp "ext/inc/threading.h", "ext/threading.h"
@@ -158,7 +169,7 @@ end
 PKG_FILES = FileList[
   'setup.rb',
   '[-A-Z]*',
-  'ext/**/*',
+  'ext/**/*.[ch]',
   'lib/**/*.rb',
   'test/**/*.rb',
   'test/**/wordfile',
@@ -176,7 +187,6 @@ else
   spec = Gem::Specification.new do |s|
     #### Basic information.
     s.name = 'ferret'
     s.version = PKG_VERSION
     s.summary = "Ruby indexing library."
@@ -186,29 +196,17 @@ else
     EOF
     #### Dependencies and requirements.
-    #s.add_dependency('log4r', '> 1.0.4')
-    #s.requirements << ""
-    #### Which files are to be included in this gem?  Everything!  (Except CVS directories.)
+    s.add_dependency('rake')
     s.files = PKG_FILES.to_a
-    #### C code extensions.
     s.extensions << "ext/extconf.rb"
-    #### Load-time details: library and application (you will need one or both).
-    s.require_path = 'lib'                         # Use these for libraries.
+    s.require_path = 'lib'
     s.autorequire = 'ferret'
-    #s.bindir = "bin"                               # Use these for applications.
-    #s.executables = ["rake"]
-    #s.default_executable = "rake"
-    #### Documentation and testing.
+    #### Author and project details.
+    s.author = "David Balmain"
+    s.email = "dbalmain@gmail.com"
+    s.homepage = "http://ferret.davebalmain.com/trac"
+    s.rubyforge_project = "ferret"
     s.has_rdoc = true
     s.extra_rdoc_files = rd.rdoc_files.reject { |fn| fn =~ /\.rb$/ }.to_a
@@ -217,21 +215,18 @@ else
       '--main' << 'README' << '--line-numbers' <<
       'TUTORIAL' << 'TODO'
-    #### Author and project details.
-    s.author = "David Balmain"
-    s.email = "dbalmain@gmail.com"
-    s.homepage = "http://ferret.davebalmain.com/trac"
-    s.rubyforge_project = "ferret"
-#     if ENV['CERT_DIR']
-#       s.signing_key = File.join(ENV['CERT_DIR'], 'gem-private_key.pem')
-#       s.cert_chain  = [File.join(ENV['CERT_DIR'], 'gem-public_cert.pem')]
-#     end
+    if RUBY_PLATFORM =~ /mswin/
+      s.files = PKG_FILES.to_a + ["ext/#{EXT}"]
+      s.extensions.clear
+      s.platform = Gem::Platform::WIN32
+    end
   end
   package_task = Rake::GemPackageTask.new(spec) do |pkg|
-    pkg.need_zip = true
-    pkg.need_tar = true
+    unless RUBY_PLATFORM =~ /mswin/
+      pkg.need_zip = true
+      pkg.need_tar = true
+    end
   end
 end
@@ -309,11 +304,11 @@ task :update_version => [:prerelease] do
     announce "No version change ... skipping version update"
   else
     announce "Updating Ferret version to #{PKG_VERSION}"
-    reversion("lib/ferret.rb")
+    reversion("lib/ferret_version.rb")
     if ENV['RELTEST']
       announce "Release Task Testing, skipping commiting of new version"
     else
-      sh %{svn ci -m "Updated to version #{PKG_VERSION}" lib/ferret.rb}
+      sh %{svn ci -m "Updated to version #{PKG_VERSION}" lib/ferret_version.rb}
     end
   end
 end

data/ext/analysis.c CHANGED

@@ -55,7 +55,8 @@ __inline Token *w_tk_set(Token *tk, wchar_t *text, int start, int end,
 int tk_eq(Token *tk1, Token *tk2)
 {
     return (strcmp((char *)tk1->text, (char *)tk2->text) == 0 &&
-            tk1->start == tk2->start && tk1->end == tk2->end);
+            tk1->start == tk2->start && tk1->end == tk2->end &&
+            tk1->pos_inc == tk2->pos_inc);
 }
 int tk_cmp(Token *tk1, Token *tk2)
@@ -724,7 +725,7 @@ static int std_get_url(char *input, char *token, int i)
 {
     while (isurlc(input[i])) {
         if (isurlpunc(input[i]) && isurlpunc(input[i - 1])) {
-            break;              /* can't have to puncs in a row */
+            break; /* can't have two puncs in a row */
         }
         if (i < MAX_WORD_SIZE) {
             token[i] = input[i];
@@ -1061,18 +1062,18 @@ static TokenStream *sf_clone_i(TokenStream *orig_ts)
 static Token *sf_next(TokenStream *ts)
 {
-    int pos_inc = 1;
+    int pos_inc = 0;
     HashTable *words = StopFilt(ts)->words;
     TokenFilter *tf = TkFilt(ts);
     Token *tk = tf->sub_ts->next(tf->sub_ts);
     while ((tk != NULL) && (h_get(words, tk->text) != NULL)) {
+        pos_inc += tk->pos_inc;
         tk = tf->sub_ts->next(tf->sub_ts);
-        pos_inc++;
     }
     if (tk != NULL) {
-        tk->pos_inc = pos_inc;
+        tk->pos_inc += pos_inc;
     }
     return tk;
@@ -1122,6 +1123,85 @@ TokenStream *stop_filter_new(TokenStream *ts)
     return stop_filter_new_with_words(ts, FULL_ENGLISH_STOP_WORDS);
 }
+/****************************************************************************
+ * HyphenFilter
+ ****************************************************************************/
+#define HyphenFilt(filter) ((HyphenFilter *)(filter))
+static TokenStream *hf_clone_i(TokenStream *orig_ts)
+{
+    TokenStream *new_ts = filter_clone_size(orig_ts, sizeof(HyphenFilter));
+    return new_ts;
+}
+static Token *hf_next(TokenStream *ts)
+{
+    HyphenFilter *hf = HyphenFilt(ts);
+    TokenFilter *tf = TkFilt(ts);
+    Token *tk = hf->tk;
+    if (hf->pos < hf->len) {
+        const int pos = hf->pos;
+        const int text_len = strlen(hf->text + pos);
+        strcpy(tk->text, hf->text + pos);
+        tk->pos_inc = ((pos != 0) ? 1 : 0);
+        tk->start = hf->start + pos;
+        tk->end = tk->start + text_len;
+        hf->pos += text_len + 1;
+        tk->len = text_len;
+        return tk;
+    }
+    else {
+        char *p;
+        bool seen_hyphen = false;
+        bool seen_other_punc = false;
+        hf->tk = tk = tf->sub_ts->next(tf->sub_ts);
+        if (NULL == tk) return NULL;
+        p = tk->text + 1;
+        while (*p) {
+            if (*p == '-') {
+                seen_hyphen = true;
+            }
+            else if (!isalpha(*p)) {
+                seen_other_punc = true;
+                break;
+            }
+            p++;
+        }
+        if (seen_hyphen && !seen_other_punc) {
+            char *q = hf->text;
+            char *r = tk->text;
+            p = tk->text;
+            while (*p) {
+                if (*p == '-') {
+                    *q = '\0';
+                }
+                else {
+                    *r = *q = *p;
+                    r++;
+                }
+                q++;
+                p++;
+            }
+            *r = *q = '\0';
+            hf->start = tk->start;
+            hf->pos = 0;
+            hf->len = q - hf->text;
+            tk->len = r - tk->text;
+        }
+    }
+    return tk;
+}
+TokenStream *hyphen_filter_new(TokenStream *sub_ts)
+{
+    TokenStream *ts = tf_new(HyphenFilter, sub_ts);
+    ts->next        = &hf_next;
+    ts->clone_i     = &hf_clone_i;
+    return ts;
+}
 /****************************************************************************
  * LowerCaseFilter
  ****************************************************************************/
@@ -1257,64 +1337,44 @@ TokenStream *stem_filter_new(TokenStream *ts, const char *algorithm,
 Analyzer *standard_analyzer_new_with_words_len(const char **words, int len,
                                                bool lowercase)
 {
-    TokenStream *ts;
+    TokenStream *ts = standard_tokenizer_new();
     if (lowercase) {
-        ts = stop_filter_new_with_words_len(lowercase_filter_new
-                                            (standard_tokenizer_new()),
-                                            words, len);
-    }
-    else {
-        ts = stop_filter_new_with_words_len(standard_tokenizer_new(),
-                                            words, len);
+        ts = lowercase_filter_new(ts);
     }
+    ts = hyphen_filter_new(stop_filter_new_with_words_len(ts, words, len));
     return analyzer_new(ts, NULL, NULL);
 }
 Analyzer *standard_analyzer_new_with_words(const char **words,
                                            bool lowercase)
 {
-    TokenStream *ts;
+    TokenStream *ts = standard_tokenizer_new();
     if (lowercase) {
-        ts = stop_filter_new_with_words(lowercase_filter_new
-                                        (standard_tokenizer_new()),
-                                        words);
-    }
-    else {
-        ts = stop_filter_new_with_words(standard_tokenizer_new(),
-                                        words);
+        ts = lowercase_filter_new(ts);
     }
+    ts = hyphen_filter_new(stop_filter_new_with_words(ts, words));
     return analyzer_new(ts, NULL, NULL);
 }
 Analyzer *mb_standard_analyzer_new_with_words_len(const char **words,
                                                   int len, bool lowercase)
 {
-    TokenStream *ts;
+    TokenStream *ts = mb_standard_tokenizer_new();
     if (lowercase) {
-        ts = stop_filter_new_with_words_len(mb_lowercase_filter_new
-                                            (mb_standard_tokenizer_new
-                                             ()), words, len);
-    }
-    else {
-        ts = stop_filter_new_with_words_len(mb_standard_tokenizer_new(),
-                                            words, len);
+        ts = mb_lowercase_filter_new(ts);
     }
+    ts = hyphen_filter_new(stop_filter_new_with_words_len(ts, words, len));
     return analyzer_new(ts, NULL, NULL);
 }
 Analyzer *mb_standard_analyzer_new_with_words(const char **words,
                                               bool lowercase)
 {
-    TokenStream *ts;
+    TokenStream *ts = mb_standard_tokenizer_new();
     if (lowercase) {
-        ts = stop_filter_new_with_words(mb_lowercase_filter_new
-                                        (mb_standard_tokenizer_new()),
-                                        words);
-    }
-    else {
-        ts = stop_filter_new_with_words(mb_standard_tokenizer_new(),
-                                        words);
+        ts = mb_lowercase_filter_new(ts);
     }
+    ts = hyphen_filter_new(stop_filter_new_with_words(ts, words));
     return analyzer_new(ts, NULL, NULL);
 }

data/ext/analysis.h CHANGED

@@ -89,6 +89,16 @@ typedef struct StopFilter
     HashTable  *words;
 } StopFilter;
+typedef struct HyphenFilter
+{
+    TokenFilter super;
+    char text[MAX_WORD_SIZE];
+    int start;
+    int pos;
+    int len;
+    Token *tk;
+} HyphenFilter;
 typedef struct StemFilter
 {
     TokenFilter        super;
@@ -111,6 +121,7 @@ extern TokenStream *mb_letter_tokenizer_new(bool lowercase);
 extern TokenStream *standard_tokenizer_new();
 extern TokenStream *mb_standard_tokenizer_new();
+extern TokenStream *hyphen_filter_new(TokenStream *ts);
 extern TokenStream *lowercase_filter_new(TokenStream *ts);
 extern TokenStream *mb_lowercase_filter_new(TokenStream *ts);

data/ext/ferret.c CHANGED

@@ -16,6 +16,7 @@ ID id_lt;
 ID id_call;
 ID id_is_directory;
 ID id_close;
+ID id_cclass;
 ID id_data;
 static ID id_mkdir_p;
@@ -97,6 +98,13 @@ VALUE frt_data_alloc(VALUE klass)
     return Frt_Make_Struct(klass);
 }
+VALUE frt_define_class_under(VALUE module, char *name, VALUE super)
+{
+    VALUE klass = rb_define_class_under(module, name, super);
+    rb_ivar_set(klass, id_cclass, Qtrue);
+    return klass;
+}
 void frt_deref_free(void *p)
 {
     object_del(p);
@@ -255,6 +263,8 @@ void Init_ferret_ext(void)
     id_is_directory = rb_intern("directory?");
     id_close = rb_intern("close");
+    id_cclass = rb_intern("cclass");
     id_data = rb_intern("@data");
     /* Symbols */

data/ext/ferret.h CHANGED

@@ -13,6 +13,7 @@ extern ID id_lt;
 extern ID id_call;
 extern ID id_is_directory;
 extern ID id_close;
+extern ID id_cclass;
 extern ID id_data;
 /* Symbols */
@@ -60,6 +61,7 @@ extern void frt_create_dir(VALUE rpath);
 extern VALUE frt_hs_to_rb_ary(HashSet *hs);
 extern void *frt_rb_data_ptr(VALUE val);
 extern char * frt_field(VALUE rfield);
+extern VALUE frt_define_class_under(VALUE module, char *name, VALUE super);
 #define Frt_Make_Struct(klass)\
   rb_data_object_alloc(klass,NULL,(RUBY_DATA_FUNC)NULL,(RUBY_DATA_FUNC)NULL)

data/ext/inc/lang.h CHANGED

@@ -8,6 +8,7 @@
 #undef close
 #undef rename
+#undef read
 #define frt_malloc xmalloc
 #define frt_calloc(n) xcalloc(n, 1)

data/ext/index.c CHANGED

@@ -722,8 +722,8 @@ void lazy_df_get_bytes(LazyDocField *self, char *buf, int start, int len)
         RAISE(IO_ERROR, "start out of range in LazyDocField#get_bytes. %d "
               "is not between 0 and %d", start, self->len);
     }
-    if (len < 0) {
-        RAISE(IO_ERROR, "len %d should be greater than 0", len);
+    if (len <= 0) {
+        RAISE(IO_ERROR, "len = %d, but should be greater than 0", len);
     }
     if (start + len > self->len) {
         RAISE(IO_ERROR, "Tried to read past end of field. Field is only %d "

data/ext/lang.h CHANGED

@@ -8,6 +8,7 @@
 #undef close
 #undef rename
+#undef read
 #define frt_malloc xmalloc
 #define frt_calloc(n) xcalloc(n, 1)

data/ext/q_parser.c CHANGED

@@ -1984,7 +1984,14 @@ static Query *get_term_q(QParser *qp, char *field, char *word)
             q->destroy_i(q);
             q = phq;
             do {
-                phq_add_term(q, token->text, token->pos_inc);
+                if (token->pos_inc) {
+                    phq_add_term(q, token->text, token->pos_inc);
+                    /* add some slop since single term  was expected */
+                    ((PhraseQuery *)q)->slop++;
+                }
+                else {
+                    phq_append_multi_term(q, token->text);
+                }
             } while ((token = ts_next(stream)) != NULL);
         }
     }
@@ -2157,7 +2164,7 @@ static Phrase *ph_add_multi_word(Phrase *self, char *word)
 }
 static Query *get_phrase_query(QParser *qp, char *field,
-                           Phrase *phrase, char *slop_str)
+                               Phrase *phrase, char *slop_str)
 {
     const int pos_cnt = phrase->size;
     Query *q = NULL;
@@ -2180,6 +2187,7 @@ static Query *get_phrase_query(QParser *qp, char *field,
         Token *token;
         TokenStream *stream;
         int i, j;
+        int pos_inc = 0;
         q = phq_new(field);
         if (slop_str) {
             int slop;
@@ -2188,14 +2196,24 @@ static Query *get_phrase_query(QParser *qp, char *field,
         }
         for (i = 0; i < pos_cnt; i++) {
-            int pos_inc = phrase->positions[i].pos; /* Actually holds pos_inc */
             char **words = phrase->positions[i].terms;
             const int word_count = ary_size(words);
+            if (pos_inc) {
+                ((PhraseQuery *)q)->slop++;
+            }
+            pos_inc += phrase->positions[i].pos + 1; /* Actually holds pos_inc*/
             if (word_count == 1) {
                 stream = get_cached_ts(qp, field, words[0]);
                 while ((token = ts_next(stream))) {
-                    phq_add_term(q, token->text, token->pos_inc + pos_inc);
+                    if (token->pos_inc) {
+                        phq_add_term(q, token->text,
+                                     pos_inc ? pos_inc : token->pos_inc);
+                    }
+                    else {
+                        phq_append_multi_term(q, token->text);
+                        ((PhraseQuery *)q)->slop++;
+                    }
                     pos_inc = 0;
                 }
             }
@@ -2206,8 +2224,10 @@ static Query *get_phrase_query(QParser *qp, char *field,
                     stream = get_cached_ts(qp, field, words[j]);
                     if ((token = ts_next(stream))) {
                         if (!added_position) {
-                            phq_add_term(q, token->text, token->pos_inc + pos_inc);
+                            phq_add_term(q, token->text,
+                                         pos_inc ? pos_inc : token->pos_inc);
                             added_position = true;
+                            pos_inc = 0;
                         }
                         else {
                             phq_append_multi_term(q, token->text);