ferret 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. data/ext/Makefile +2 -2
  2. data/ext/ferret.c +27 -2
  3. data/ext/ferret.h +59 -16
  4. data/ext/ferret_ext.so +0 -0
  5. data/ext/index_io.c +72 -77
  6. data/ext/priority_queue.c +150 -145
  7. data/ext/ram_directory.c +47 -42
  8. data/ext/segment_merge_queue.c +4 -8
  9. data/ext/segment_term_enum.c +324 -0
  10. data/ext/similarity.c +59 -0
  11. data/ext/string_helper.c +2 -2
  12. data/ext/tags +150 -46
  13. data/ext/term.c +107 -152
  14. data/ext/term_buffer.c +105 -174
  15. data/ext/term_infos_reader.c +54 -0
  16. data/ext/terminfo.c +160 -0
  17. data/ext/token.c +93 -0
  18. data/lib/ferret.rb +1 -1
  19. data/lib/ferret/analysis/analyzers.rb +18 -0
  20. data/lib/ferret/analysis/standard_tokenizer.rb +19 -14
  21. data/lib/ferret/analysis/token.rb +8 -1
  22. data/lib/ferret/analysis/tokenizers.rb +10 -5
  23. data/lib/ferret/document/field.rb +33 -11
  24. data/lib/ferret/index/document_writer.rb +3 -2
  25. data/lib/ferret/index/field_infos.rb +38 -12
  26. data/lib/ferret/index/fields_io.rb +10 -4
  27. data/lib/ferret/index/index.rb +20 -4
  28. data/lib/ferret/index/index_reader.rb +19 -4
  29. data/lib/ferret/index/index_writer.rb +1 -1
  30. data/lib/ferret/index/multi_reader.rb +21 -7
  31. data/lib/ferret/index/segment_merge_info.rb +24 -22
  32. data/lib/ferret/index/segment_merge_queue.rb +2 -2
  33. data/lib/ferret/index/segment_merger.rb +28 -11
  34. data/lib/ferret/index/segment_reader.rb +19 -4
  35. data/lib/ferret/index/segment_term_enum.rb +3 -11
  36. data/lib/ferret/index/term_buffer.rb +13 -16
  37. data/lib/ferret/index/term_doc_enum.rb +8 -5
  38. data/lib/ferret/index/term_enum.rb +2 -2
  39. data/lib/ferret/index/term_info.rb +1 -5
  40. data/lib/ferret/index/term_infos_io.rb +2 -0
  41. data/lib/ferret/query_parser/query_parser.tab.rb +7 -7
  42. data/lib/ferret/search/phrase_scorer.rb +0 -1
  43. data/lib/ferret/search/similarity.rb +2 -2
  44. data/lib/ferret/search/term_scorer.rb +2 -2
  45. data/lib/ferret/store/directory.rb +2 -0
  46. data/lib/ferret/store/fs_store.rb +16 -3
  47. data/lib/ferret/store/ram_store.rb +2 -2
  48. data/test/unit/document/tc_field.rb +9 -0
  49. data/test/unit/index/tc_field_infos.rb +29 -21
  50. data/test/unit/index/tc_index.rb +44 -7
  51. data/test/unit/index/tc_term_buffer.rb +3 -3
  52. data/test/unit/index/tc_term_info.rb +1 -1
  53. data/test/unit/query_parser/tc_query_parser.rb +1 -1
  54. data/test/unit/search/tc_index_searcher.rb +3 -0
  55. data/test/unit/store/tc_fs_store.rb +47 -16
  56. data/test/unit/store/tc_ram_store.rb +1 -1
  57. metadata +8 -3
@@ -0,0 +1,54 @@
1
+ #include "ferret.h"
2
+
3
+ static ID frt_id_index_terms;
4
+ /****************************************************************************
5
+ *
6
+ * TermInfosReader Methods
7
+ *
8
+ ****************************************************************************/
9
+
10
+ static VALUE
11
+ frt_tir_get_index_offset(VALUE self, VALUE rterm)
12
+ {
13
+ VALUE index_terms = rb_ivar_get(self, frt_id_index_terms);
14
+
15
+ register int lo = 0; // binary search @index_terms[]
16
+ register int hi = RARRAY(index_terms)->len - 1;
17
+ register int mid, delta;
18
+
19
+ Term *term, *tmp_term;
20
+ Data_Get_Struct(rterm, Term, term);
21
+
22
+ while (hi >= lo) {
23
+ mid = (lo + hi) >> 1;
24
+
25
+ Data_Get_Struct(RARRAY(index_terms)->ptr[mid], Term, tmp_term);
26
+ delta = frt_term_cmp(term, tmp_term);
27
+ if (delta < 0) {
28
+ hi = mid - 1;
29
+ } else if (delta > 0) {
30
+ lo = mid + 1;
31
+ } else {
32
+ return INT2FIX(mid);
33
+ }
34
+ }
35
+ return INT2FIX(hi);
36
+ }
37
+
38
+ /****************************************************************************
39
+ *
40
+ * Init Function
41
+ *
42
+ ****************************************************************************/
43
+
44
+ void
45
+ Init_term_infos_reader(void)
46
+ {
47
+ /* IDs */
48
+ frt_id_index_terms = rb_intern("@index_terms");
49
+
50
+ /* TermInfosReader */
51
+ cTermInfosReader = rb_define_class_under(mIndex, "TermInfosReader", rb_cObject);
52
+
53
+ rb_define_method(cTermInfosReader, "get_index_offset", frt_tir_get_index_offset, 1);
54
+ }
data/ext/terminfo.c ADDED
@@ -0,0 +1,160 @@
1
+ #include "ferret.h"
2
+
3
+
4
+ /****************************************************************************
5
+ *
6
+ * TermInfo Methods
7
+ *
8
+ ****************************************************************************/
9
+
10
/*
 * Free callback registered with the wrapped TermInfo objects.
 *
 * p - pointer to the TermInfo struct that frt_ti_alloc obtained through
 *     Ruby's ALLOC().
 *
 * ALLOC() takes memory from Ruby's xmalloc, so the block is handed back
 * through the matching xfree rather than the C library's free().
 */
void
frt_ti_free(void *p)
{
  xfree(p);
}
15
+
16
+ static VALUE
17
+ frt_ti_alloc(VALUE klass)
18
+ {
19
+ TermInfo *ti = (TermInfo *)ALLOC(TermInfo);
20
+ VALUE rbuffer = Data_Wrap_Struct(klass, NULL, frt_ti_free, ti);
21
+ return rbuffer;
22
+ }
23
+
24
+ #define GET_TI TermInfo *ti; Data_Get_Struct(self, TermInfo, ti)
25
+ inline VALUE
26
+ frt_ti_set(int argc, VALUE *argv, VALUE self)
27
+ {
28
+ VALUE df, fp, pp, so;
29
+ GET_TI;
30
+ MEMZERO(ti, TermInfo, 1);
31
+ rb_scan_args(argc, argv, "04", &df, &fp, &pp, &so);
32
+ switch (argc) {
33
+ case 4:
34
+ ti->skip_offset = FIX2INT(so);
35
+ case 3:
36
+ ti->prox_pointer = FIX2INT(pp);
37
+ case 2:
38
+ ti->freq_pointer = FIX2INT(fp);
39
+ case 1:
40
+ ti->doc_freq = FIX2INT(df);
41
+ case 0:
42
+ break;
43
+ }
44
+ return Qnil;
45
+ }
46
+
47
+ static VALUE
48
+ frt_ti_init(int argc, VALUE *argv, VALUE self)
49
+ {
50
+ frt_ti_set(argc, argv, self);
51
+ return self;
52
+ }
53
+
54
+ static VALUE
55
+ frt_ti_init_copy(VALUE self, VALUE rother)
56
+ {
57
+ TermInfo *other_ti;
58
+ GET_TI;
59
+ Data_Get_Struct(rother, TermInfo, other_ti);
60
+ MEMCPY(ti, other_ti, TermInfo, 1);
61
+ return self;
62
+ }
63
+
64
+ static VALUE
65
+ frt_ti_eql(VALUE self, VALUE rother)
66
+ {
67
+ TermInfo *other_ti;
68
+ GET_TI;
69
+ if (NIL_P(rother)) return Qfalse;
70
+ Data_Get_Struct(rother, TermInfo, other_ti);
71
+ return (MEMCMP(ti, other_ti, TermInfo, 1) == 0) ? Qtrue : Qfalse;
72
+ }
73
+
74
+ static VALUE
75
+ frt_ti_get_df(VALUE self)
76
+ {
77
+ GET_TI;
78
+ return INT2FIX(ti->doc_freq);
79
+ }
80
+
81
+ static VALUE
82
+ frt_ti_get_fp(VALUE self)
83
+ {
84
+ GET_TI;
85
+ return INT2FIX(ti->freq_pointer);
86
+ }
87
+
88
+ static VALUE
89
+ frt_ti_get_pp(VALUE self)
90
+ {
91
+ GET_TI;
92
+ return INT2FIX(ti->prox_pointer);
93
+ }
94
+
95
+ static VALUE
96
+ frt_ti_get_so(VALUE self)
97
+ {
98
+ GET_TI;
99
+ return INT2FIX(ti->skip_offset);
100
+ }
101
+
102
+ static VALUE
103
+ frt_ti_set_df(VALUE self, VALUE val)
104
+ {
105
+ GET_TI;
106
+ ti->doc_freq = FIX2INT(val);
107
+ return Qnil;
108
+ }
109
+
110
+ static VALUE
111
+ frt_ti_set_fp(VALUE self, VALUE val)
112
+ {
113
+ GET_TI;
114
+ ti->freq_pointer = FIX2INT(val);
115
+ return Qnil;
116
+ }
117
+
118
+ static VALUE
119
+ frt_ti_set_pp(VALUE self, VALUE val)
120
+ {
121
+ GET_TI;
122
+ ti->prox_pointer = FIX2INT(val);
123
+ return Qnil;
124
+ }
125
+
126
+ static VALUE
127
+ frt_ti_set_so(VALUE self, VALUE val)
128
+ {
129
+ GET_TI;
130
+ ti->skip_offset = FIX2INT(val);
131
+ return Qnil;
132
+ }
133
+
134
+ /****************************************************************************
135
+ *
136
+ * Init Function
137
+ *
138
+ ****************************************************************************/
139
+
140
+ void
141
+ Init_term_info(void)
142
+ {
143
+ /* TermInfo */
144
+ cTermInfo = rb_define_class_under(mIndex, "TermInfo", rb_cObject);
145
+ rb_define_alloc_func(cTermInfo, frt_ti_alloc);
146
+
147
+ rb_define_method(cTermInfo, "initialize", frt_ti_init, -1);
148
+ rb_define_method(cTermInfo, "set_values!", frt_ti_set, -1);
149
+ rb_define_method(cTermInfo, "initialize_copy", frt_ti_init_copy, 1);
150
+ rb_define_method(cTermInfo, "set!", frt_ti_init_copy, 1);
151
+ rb_define_method(cTermInfo, "==", frt_ti_eql, 1);
152
+ rb_define_method(cTermInfo, "doc_freq", frt_ti_get_df, 0);
153
+ rb_define_method(cTermInfo, "doc_freq=", frt_ti_set_df, 1);
154
+ rb_define_method(cTermInfo, "freq_pointer", frt_ti_get_fp, 0);
155
+ rb_define_method(cTermInfo, "freq_pointer=", frt_ti_set_fp, 1);
156
+ rb_define_method(cTermInfo, "prox_pointer", frt_ti_get_pp, 0);
157
+ rb_define_method(cTermInfo, "prox_pointer=", frt_ti_set_pp, 1);
158
+ rb_define_method(cTermInfo, "skip_offset", frt_ti_get_so, 0);
159
+ rb_define_method(cTermInfo, "skip_offset=", frt_ti_set_so, 1);
160
+ }
data/ext/token.c ADDED
@@ -0,0 +1,93 @@
1
+ #include "ferret.h"
2
+
3
+ /****************************************************************************
4
+ *
5
+ * Token Methods
6
+ *
7
+ ****************************************************************************/
8
+
9
+ ID id_tk_text, id_tk_pos_inc, id_tk_start_offset, id_tk_end_offset, id_tk_type;
10
+ ID id_tk_pos_inc_set;
11
+
12
+ static VALUE
13
+ frt_token_pos_inc (VALUE self, VALUE pI)
14
+ {
15
+ if(FIX2INT(pI) < 0)
16
+ rb_raise(rb_eArgError, "position_increment < 0");
17
+ rb_ivar_set(self, id_tk_pos_inc, pI);
18
+ return self;
19
+ }
20
+
21
+ static VALUE
22
+ frt_token_init(int argc, VALUE *argv, VALUE self)
23
+ {
24
+ VALUE text, start_offset, end_offset, type, pos_inc;
25
+ rb_scan_args(argc, argv, "32", &text,
26
+ &start_offset, &end_offset, &type, &pos_inc);
27
+ rb_ivar_set(self, id_tk_text, text);
28
+ rb_ivar_set(self, id_tk_start_offset, start_offset);
29
+ rb_ivar_set(self, id_tk_end_offset, end_offset);
30
+ if (argc < 4) {
31
+ rb_ivar_set(self, id_tk_type, rb_str_new("word", 4));
32
+ } else {
33
+ rb_ivar_set(self, id_tk_type, type);
34
+ }
35
+ if (argc < 5) {
36
+ rb_ivar_set(self, id_tk_pos_inc, INT2FIX(1));
37
+ } else {
38
+ rb_ivar_set(self, id_tk_pos_inc, pos_inc);
39
+ }
40
+ return self;
41
+ }
42
+
43
+ static VALUE
44
+ frt_token_eql(VALUE self, VALUE other)
45
+ {
46
+ VALUE rself_text, rother_text;
47
+ char *self_text, *other_text;
48
+ if (!rb_respond_to(other, id_tk_pos_inc_set))
49
+ return Qfalse;
50
+ rself_text = rb_ivar_get(self, id_tk_text);
51
+ rother_text = rb_ivar_get(other, id_tk_text);
52
+ self_text = StringValuePtr(rself_text);
53
+ other_text = StringValuePtr(rother_text);
54
+ if (rb_ivar_get(self, id_tk_start_offset) == rb_ivar_get(other, id_tk_start_offset) &&
55
+ rb_ivar_get(self, id_tk_end_offset) == rb_ivar_get(other, id_tk_end_offset) &&
56
+ (strcmp(self_text, other_text) == 0))
57
+ return Qtrue;
58
+ else
59
+ return Qfalse;
60
+ }
61
+
62
+ /****************************************************************************
63
+ *
64
+ * Init Function
65
+ *
66
+ ****************************************************************************/
67
+
68
+ void
69
+ Init_token(void)
70
+ {
71
+ /* IDs */
72
+ id_tk_text = rb_intern("@term_text");
73
+ id_tk_start_offset = rb_intern("@start_offset");
74
+ id_tk_end_offset = rb_intern("@end_offset");
75
+ id_tk_type = rb_intern("@type");
76
+ id_tk_pos_inc = rb_intern("@position_increment");
77
+ id_tk_pos_inc_set = rb_intern("position_increment=");
78
+
79
+
80
+ /* IndexWriter */
81
+ cToken = rb_define_class_under(mAnalysis, "Token", rb_cObject);
82
+
83
+ rb_define_method(cToken, "initialize", frt_token_init, -1);
84
+ rb_define_method(cToken, "position_increment=", frt_token_pos_inc, 1);
85
+ rb_define_method(cToken, "==", frt_token_eql, 1);
86
+ rb_define_method(cToken, "eql", frt_token_eql, 1);
87
+
88
+ rb_define_attr(cToken, "term_text", 1, 1);
89
+ rb_define_attr(cToken, "position_increment", 1, 0);
90
+ rb_define_attr(cToken, "start_offset", 1, 0);
91
+ rb_define_attr(cToken, "end_offset", 1, 0);
92
+ rb_define_attr(cToken, "type", 1, 1);
93
+ }
data/lib/ferret.rb CHANGED
@@ -22,7 +22,7 @@
22
22
  #++
23
23
  # :include: ../TUTORIAL
24
24
  module Ferret
25
- VERSION = '0.2.2'
25
+ VERSION = '0.3.0'
26
26
  end
27
27
 
28
28
  require 'ferret/utils'
@@ -17,6 +17,24 @@ module Ferret::Analysis
17
17
  def token_stream(field, string)
18
18
  return LowerCaseTokenizer.new(string)
19
19
  end
20
+
21
+ # Invoked before indexing a Field instance if
22
+ # terms have already been added to that field. This allows custom
23
+ # analyzers to place an automatic position increment gap between
24
+ # Field instances using the same field name. The default
25
+ # position increment gap is 0. With a 0 position increment gap and
26
+ # the typical default token position increment of 1, all terms in a field,
27
+ # including across Field instances, are in successive positions, allowing
28
+ # exact PhraseQuery matches, for instance, across Field instance boundaries.
29
+ #
30
+ # field_name:: Field name being indexed.
31
+ # returns:: the position increment gap, which is added to the next token
32
+ # emitted from #token_stream(field, string)
33
+ #
34
+ def position_increment_gap(field_name)
35
+ return 0
36
+ end
37
+
20
38
  end
21
39
 
22
40
  # An Analyzer that uses WhiteSpaceTokenizer.
@@ -18,7 +18,21 @@ module Ferret::Analysis
18
18
  ACRONYM = /#{ALPHA}\.(#{ALPHA}\.)+/
19
19
  P = /[_\/.,-]/
20
20
  HASDIGIT = /\w*\d\w*/
21
+ TOKEN_RE = /[[:alpha:]]+(('[[:alpha:]]+)+
22
+ |\.([[:alpha:]]\.)+
23
+ |(@|\&)\w+([-.]\w+)*
24
+ )
25
+ |\w+(([\-._]\w+)*\@\w+([-.]\w+)+
26
+ |#{P}#{HASDIGIT}(#{P}\w+#{P}#{HASDIGIT})*(#{P}\w+)?
27
+ |(\.\w+)+
28
+ |
29
+ )
30
+ /x
21
31
 
32
+ ACRONYM_WORD = /^#{ACRONYM}$/
33
+ APOSTROPHE_WORD = /^#{APOSTROPHE}$/
34
+ DOT = /\./
35
+ APOSTROPHE_S = /'[sS]$/
22
36
  protected
23
37
 
24
38
  # Collects only characters which are not spaces, tabs or carriage returns
@@ -27,24 +41,15 @@ module Ferret::Analysis
27
41
  # This is a simplified version of the original Lucene standard
28
42
  # tokenizer. I think it works better. I hope so anyway. Any way to
29
43
  # do this more neatly?
30
- /[[:alpha:]]+(('[[:alpha:]]+)+
31
- |\.([[:alpha:]]\.)+
32
- |(@|\&)\w+([-.]\w+)*
33
- )
34
- |\w+(([\-._]\w+)*\@\w+([-.]\w+)+
35
- |#{P}#{HASDIGIT}(#{P}\w+#{P}#{HASDIGIT})*(#{P}\w+)?
36
- |(\.\w+)+
37
- |
38
- )
39
- /x
44
+ TOKEN_RE
40
45
  end
41
46
 
42
47
  # stem the 's and remove the '.'s from acronyms
43
48
  def normalize(str)
44
- if str =~ /^#{ACRONYM}$/
45
- str.gsub!(/\./, '')
46
- elsif str =~ /^#{APOSTROPHE}$/
47
- str.gsub!(/'[sS]$/, '')
49
+ if str =~ ACRONYM_WORD
50
+ str.gsub!(DOT, '')
51
+ elsif str =~ APOSTROPHE_WORD
52
+ str.gsub!(APOSTROPHE_S, '')
48
53
  end
49
54
  str
50
55
  end
@@ -35,9 +35,16 @@ module Ferret::Analysis
35
35
  @position_increment = pos_inc
36
36
  end
37
37
 
38
+ def set!(txt, so, eo)
39
+ @term_text = txt
40
+ @start_offset = so
41
+ @end_offset = eo
42
+ self
43
+ end
44
+
38
45
  def eql?(o)
39
46
  return (o.instance_of?(Token) and @start_offset == o.start_offset and
40
- @end_offset == o.end_offset and @term_text = o.term_text)
47
+ @end_offset == o.end_offset and @term_text == o.term_text)
41
48
  end
42
49
  alias :== :eql?
43
50
 
@@ -36,6 +36,7 @@ module Ferret::Analysis
36
36
  # input:: must have a read(count) method which returns an array or string
37
37
  # of _count_ chars.
38
38
  def initialize(input)
39
+ #@token_buffer = Token.new("", 0, 0)
39
40
  if input.is_a? String
40
41
  @ss = StringScanner.new(input)
41
42
  else
@@ -53,6 +54,7 @@ module Ferret::Analysis
53
54
  return nil
54
55
  end
55
56
 
57
+ #return @token_buffer.set!(normalize(term), term_start, term_end)
56
58
  return Token.new(normalize(term), term_start, term_end)
57
59
  end
58
60
 
@@ -62,8 +64,9 @@ module Ferret::Analysis
62
64
 
63
65
  protected
64
66
  # returns the regular expression used to find the next token
67
+ TOKEN_RE = /[[:alpha:]]+/
65
68
  def token_re
66
- /[[:alpha:]]+/
69
+ TOKEN_RE
67
70
  end
68
71
 
69
72
  # Called on each token to normalize it before it is added to the
@@ -80,8 +83,9 @@ module Ferret::Analysis
80
83
  protected
81
84
  # Collects only characters which satisfy the regular expression
82
85
  # _/[[:alpha:]]+/_.
83
- def token_re()
84
- /[[:alpha:]]+/
86
+ TOKEN_RE = /[[:alpha:]]+/
87
+ def token_re
88
+ TOKEN_RE
85
89
  end
86
90
  end
87
91
 
@@ -100,8 +104,9 @@ module Ferret::Analysis
100
104
  class WhiteSpaceTokenizer < RegExpTokenizer
101
105
  protected
102
106
  # Collects only characters which are not spaces, tabs or carriage returns
103
- def token_re()
104
- /\S+/
107
+ TOKEN_RE = /\S+/
108
+ def token_re
109
+ TOKEN_RE
105
110
  end
106
111
  end
107
112
  end