ferret 0.2.2 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (57) hide show
  1. data/ext/Makefile +2 -2
  2. data/ext/ferret.c +27 -2
  3. data/ext/ferret.h +59 -16
  4. data/ext/ferret_ext.so +0 -0
  5. data/ext/index_io.c +72 -77
  6. data/ext/priority_queue.c +150 -145
  7. data/ext/ram_directory.c +47 -42
  8. data/ext/segment_merge_queue.c +4 -8
  9. data/ext/segment_term_enum.c +324 -0
  10. data/ext/similarity.c +59 -0
  11. data/ext/string_helper.c +2 -2
  12. data/ext/tags +150 -46
  13. data/ext/term.c +107 -152
  14. data/ext/term_buffer.c +105 -174
  15. data/ext/term_infos_reader.c +54 -0
  16. data/ext/terminfo.c +160 -0
  17. data/ext/token.c +93 -0
  18. data/lib/ferret.rb +1 -1
  19. data/lib/ferret/analysis/analyzers.rb +18 -0
  20. data/lib/ferret/analysis/standard_tokenizer.rb +19 -14
  21. data/lib/ferret/analysis/token.rb +8 -1
  22. data/lib/ferret/analysis/tokenizers.rb +10 -5
  23. data/lib/ferret/document/field.rb +33 -11
  24. data/lib/ferret/index/document_writer.rb +3 -2
  25. data/lib/ferret/index/field_infos.rb +38 -12
  26. data/lib/ferret/index/fields_io.rb +10 -4
  27. data/lib/ferret/index/index.rb +20 -4
  28. data/lib/ferret/index/index_reader.rb +19 -4
  29. data/lib/ferret/index/index_writer.rb +1 -1
  30. data/lib/ferret/index/multi_reader.rb +21 -7
  31. data/lib/ferret/index/segment_merge_info.rb +24 -22
  32. data/lib/ferret/index/segment_merge_queue.rb +2 -2
  33. data/lib/ferret/index/segment_merger.rb +28 -11
  34. data/lib/ferret/index/segment_reader.rb +19 -4
  35. data/lib/ferret/index/segment_term_enum.rb +3 -11
  36. data/lib/ferret/index/term_buffer.rb +13 -16
  37. data/lib/ferret/index/term_doc_enum.rb +8 -5
  38. data/lib/ferret/index/term_enum.rb +2 -2
  39. data/lib/ferret/index/term_info.rb +1 -5
  40. data/lib/ferret/index/term_infos_io.rb +2 -0
  41. data/lib/ferret/query_parser/query_parser.tab.rb +7 -7
  42. data/lib/ferret/search/phrase_scorer.rb +0 -1
  43. data/lib/ferret/search/similarity.rb +2 -2
  44. data/lib/ferret/search/term_scorer.rb +2 -2
  45. data/lib/ferret/store/directory.rb +2 -0
  46. data/lib/ferret/store/fs_store.rb +16 -3
  47. data/lib/ferret/store/ram_store.rb +2 -2
  48. data/test/unit/document/tc_field.rb +9 -0
  49. data/test/unit/index/tc_field_infos.rb +29 -21
  50. data/test/unit/index/tc_index.rb +44 -7
  51. data/test/unit/index/tc_term_buffer.rb +3 -3
  52. data/test/unit/index/tc_term_info.rb +1 -1
  53. data/test/unit/query_parser/tc_query_parser.rb +1 -1
  54. data/test/unit/search/tc_index_searcher.rb +3 -0
  55. data/test/unit/store/tc_fs_store.rb +47 -16
  56. data/test/unit/store/tc_ram_store.rb +1 -1
  57. metadata +8 -3
@@ -0,0 +1,54 @@
1
+ #include "ferret.h"
2
+
3
+ /* Interned "@index_terms" ivar name; assigned in Init_term_infos_reader(). */
+ static ID frt_id_index_terms;
4
+ /****************************************************************************
5
+ *
6
+ * TermInfosReader Methods
7
+ *
8
+ ****************************************************************************/
9
+
10
+ static VALUE
11
+ frt_tir_get_index_offset(VALUE self, VALUE rterm)
12
+ {
13
+ VALUE index_terms = rb_ivar_get(self, frt_id_index_terms);
14
+
15
+ register int lo = 0; // binary search @index_terms[]
16
+ register int hi = RARRAY(index_terms)->len - 1;
17
+ register int mid, delta;
18
+
19
+ Term *term, *tmp_term;
20
+ Data_Get_Struct(rterm, Term, term);
21
+
22
+ while (hi >= lo) {
23
+ mid = (lo + hi) >> 1;
24
+
25
+ Data_Get_Struct(RARRAY(index_terms)->ptr[mid], Term, tmp_term);
26
+ delta = frt_term_cmp(term, tmp_term);
27
+ if (delta < 0) {
28
+ hi = mid - 1;
29
+ } else if (delta > 0) {
30
+ lo = mid + 1;
31
+ } else {
32
+ return INT2FIX(mid);
33
+ }
34
+ }
35
+ return INT2FIX(hi);
36
+ }
37
+
38
+ /****************************************************************************
39
+ *
40
+ * Init Function
41
+ *
42
+ ****************************************************************************/
43
+
44
+ void
45
+ Init_term_infos_reader(void)
46
+ {
47
+ /* IDs */
48
+ frt_id_index_terms = rb_intern("@index_terms");
49
+
50
+ /* TermInfosReader */
51
+ cTermInfosReader = rb_define_class_under(mIndex, "TermInfosReader", rb_cObject);
52
+
53
+ rb_define_method(cTermInfosReader, "get_index_offset", frt_tir_get_index_offset, 1);
54
+ }
data/ext/terminfo.c ADDED
@@ -0,0 +1,160 @@
1
+ #include "ferret.h"
2
+
3
+
4
+ /****************************************************************************
5
+ *
6
+ * TermInfo Methods
7
+ *
8
+ ****************************************************************************/
9
+
10
/*
 * GC free callback for wrapped TermInfo structs.
 *
 * The struct is obtained with ALLOC() (ruby_xmalloc) in frt_ti_alloc, so it
 * has to be handed back through xfree() rather than the C library's free():
 * the two allocators are only guaranteed to pair with themselves.
 */
void
frt_ti_free(void *p)
{
  xfree(p);
}
15
+
16
+ static VALUE
17
+ frt_ti_alloc(VALUE klass)
18
+ {
19
+ TermInfo *ti = (TermInfo *)ALLOC(TermInfo);
20
+ VALUE rbuffer = Data_Wrap_Struct(klass, NULL, frt_ti_free, ti);
21
+ return rbuffer;
22
+ }
23
+
24
+ #define GET_TI TermInfo *ti; Data_Get_Struct(self, TermInfo, ti)
25
+ inline VALUE
26
+ frt_ti_set(int argc, VALUE *argv, VALUE self)
27
+ {
28
+ VALUE df, fp, pp, so;
29
+ GET_TI;
30
+ MEMZERO(ti, TermInfo, 1);
31
+ rb_scan_args(argc, argv, "04", &df, &fp, &pp, &so);
32
+ switch (argc) {
33
+ case 4:
34
+ ti->skip_offset = FIX2INT(so);
35
+ case 3:
36
+ ti->prox_pointer = FIX2INT(pp);
37
+ case 2:
38
+ ti->freq_pointer = FIX2INT(fp);
39
+ case 1:
40
+ ti->doc_freq = FIX2INT(df);
41
+ case 0:
42
+ break;
43
+ }
44
+ return Qnil;
45
+ }
46
+
47
+ static VALUE
48
+ frt_ti_init(int argc, VALUE *argv, VALUE self)
49
+ {
50
+ frt_ti_set(argc, argv, self);
51
+ return self;
52
+ }
53
+
54
+ static VALUE
55
+ frt_ti_init_copy(VALUE self, VALUE rother)
56
+ {
57
+ TermInfo *other_ti;
58
+ GET_TI;
59
+ Data_Get_Struct(rother, TermInfo, other_ti);
60
+ MEMCPY(ti, other_ti, TermInfo, 1);
61
+ return self;
62
+ }
63
+
64
+ static VALUE
65
+ frt_ti_eql(VALUE self, VALUE rother)
66
+ {
67
+ TermInfo *other_ti;
68
+ GET_TI;
69
+ if (NIL_P(rother)) return Qfalse;
70
+ Data_Get_Struct(rother, TermInfo, other_ti);
71
+ return (MEMCMP(ti, other_ti, TermInfo, 1) == 0) ? Qtrue : Qfalse;
72
+ }
73
+
74
+ static VALUE
75
+ frt_ti_get_df(VALUE self)
76
+ {
77
+ GET_TI;
78
+ return INT2FIX(ti->doc_freq);
79
+ }
80
+
81
+ static VALUE
82
+ frt_ti_get_fp(VALUE self)
83
+ {
84
+ GET_TI;
85
+ return INT2FIX(ti->freq_pointer);
86
+ }
87
+
88
+ static VALUE
89
+ frt_ti_get_pp(VALUE self)
90
+ {
91
+ GET_TI;
92
+ return INT2FIX(ti->prox_pointer);
93
+ }
94
+
95
+ static VALUE
96
+ frt_ti_get_so(VALUE self)
97
+ {
98
+ GET_TI;
99
+ return INT2FIX(ti->skip_offset);
100
+ }
101
+
102
+ static VALUE
103
+ frt_ti_set_df(VALUE self, VALUE val)
104
+ {
105
+ GET_TI;
106
+ ti->doc_freq = FIX2INT(val);
107
+ return Qnil;
108
+ }
109
+
110
+ static VALUE
111
+ frt_ti_set_fp(VALUE self, VALUE val)
112
+ {
113
+ GET_TI;
114
+ ti->freq_pointer = FIX2INT(val);
115
+ return Qnil;
116
+ }
117
+
118
+ static VALUE
119
+ frt_ti_set_pp(VALUE self, VALUE val)
120
+ {
121
+ GET_TI;
122
+ ti->prox_pointer = FIX2INT(val);
123
+ return Qnil;
124
+ }
125
+
126
+ static VALUE
127
+ frt_ti_set_so(VALUE self, VALUE val)
128
+ {
129
+ GET_TI;
130
+ ti->skip_offset = FIX2INT(val);
131
+ return Qnil;
132
+ }
133
+
134
+ /****************************************************************************
135
+ *
136
+ * Init Function
137
+ *
138
+ ****************************************************************************/
139
+
140
+ void
141
+ Init_term_info(void)
142
+ {
143
+ /* TermInfo */
144
+ cTermInfo = rb_define_class_under(mIndex, "TermInfo", rb_cObject);
145
+ rb_define_alloc_func(cTermInfo, frt_ti_alloc);
146
+
147
+ rb_define_method(cTermInfo, "initialize", frt_ti_init, -1);
148
+ rb_define_method(cTermInfo, "set_values!", frt_ti_set, -1);
149
+ rb_define_method(cTermInfo, "initialize_copy", frt_ti_init_copy, 1);
150
+ rb_define_method(cTermInfo, "set!", frt_ti_init_copy, 1);
151
+ rb_define_method(cTermInfo, "==", frt_ti_eql, 1);
152
+ rb_define_method(cTermInfo, "doc_freq", frt_ti_get_df, 0);
153
+ rb_define_method(cTermInfo, "doc_freq=", frt_ti_set_df, 1);
154
+ rb_define_method(cTermInfo, "freq_pointer", frt_ti_get_fp, 0);
155
+ rb_define_method(cTermInfo, "freq_pointer=", frt_ti_set_fp, 1);
156
+ rb_define_method(cTermInfo, "prox_pointer", frt_ti_get_pp, 0);
157
+ rb_define_method(cTermInfo, "prox_pointer=", frt_ti_set_pp, 1);
158
+ rb_define_method(cTermInfo, "skip_offset", frt_ti_get_so, 0);
159
+ rb_define_method(cTermInfo, "skip_offset=", frt_ti_set_so, 1);
160
+ }
data/ext/token.c ADDED
@@ -0,0 +1,93 @@
1
+ #include "ferret.h"
2
+
3
+ /****************************************************************************
4
+ *
5
+ * Token Methods
6
+ *
7
+ ****************************************************************************/
8
+
9
+ ID id_tk_text, id_tk_pos_inc, id_tk_start_offset, id_tk_end_offset, id_tk_type;
10
+ ID id_tk_pos_inc_set;
11
+
12
+ static VALUE
13
+ frt_token_pos_inc (VALUE self, VALUE pI)
14
+ {
15
+ if(FIX2INT(pI) < 0)
16
+ rb_raise(rb_eArgError, "position_increment < 0");
17
+ rb_ivar_set(self, id_tk_pos_inc, pI);
18
+ return self;
19
+ }
20
+
21
+ static VALUE
22
+ frt_token_init(int argc, VALUE *argv, VALUE self)
23
+ {
24
+ VALUE text, start_offset, end_offset, type, pos_inc;
25
+ rb_scan_args(argc, argv, "32", &text,
26
+ &start_offset, &end_offset, &type, &pos_inc);
27
+ rb_ivar_set(self, id_tk_text, text);
28
+ rb_ivar_set(self, id_tk_start_offset, start_offset);
29
+ rb_ivar_set(self, id_tk_end_offset, end_offset);
30
+ if (argc < 4) {
31
+ rb_ivar_set(self, id_tk_type, rb_str_new("word", 4));
32
+ } else {
33
+ rb_ivar_set(self, id_tk_type, type);
34
+ }
35
+ if (argc < 5) {
36
+ rb_ivar_set(self, id_tk_pos_inc, INT2FIX(1));
37
+ } else {
38
+ rb_ivar_set(self, id_tk_pos_inc, pos_inc);
39
+ }
40
+ return self;
41
+ }
42
+
43
+ static VALUE
44
+ frt_token_eql(VALUE self, VALUE other)
45
+ {
46
+ VALUE rself_text, rother_text;
47
+ char *self_text, *other_text;
48
+ if (!rb_respond_to(other, id_tk_pos_inc_set))
49
+ return Qfalse;
50
+ rself_text = rb_ivar_get(self, id_tk_text);
51
+ rother_text = rb_ivar_get(other, id_tk_text);
52
+ self_text = StringValuePtr(rself_text);
53
+ other_text = StringValuePtr(rother_text);
54
+ if (rb_ivar_get(self, id_tk_start_offset) == rb_ivar_get(other, id_tk_start_offset) &&
55
+ rb_ivar_get(self, id_tk_end_offset) == rb_ivar_get(other, id_tk_end_offset) &&
56
+ (strcmp(self_text, other_text) == 0))
57
+ return Qtrue;
58
+ else
59
+ return Qfalse;
60
+ }
61
+
62
+ /****************************************************************************
63
+ *
64
+ * Init Function
65
+ *
66
+ ****************************************************************************/
67
+
68
+ void
69
+ Init_token(void)
70
+ {
71
+ /* IDs */
72
+ id_tk_text = rb_intern("@term_text");
73
+ id_tk_start_offset = rb_intern("@start_offset");
74
+ id_tk_end_offset = rb_intern("@end_offset");
75
+ id_tk_type = rb_intern("@type");
76
+ id_tk_pos_inc = rb_intern("@position_increment");
77
+ id_tk_pos_inc_set = rb_intern("position_increment=");
78
+
79
+
80
+ /* IndexWriter */
81
+ cToken = rb_define_class_under(mAnalysis, "Token", rb_cObject);
82
+
83
+ rb_define_method(cToken, "initialize", frt_token_init, -1);
84
+ rb_define_method(cToken, "position_increment=", frt_token_pos_inc, 1);
85
+ rb_define_method(cToken, "==", frt_token_eql, 1);
86
+ rb_define_method(cToken, "eql", frt_token_eql, 1);
87
+
88
+ rb_define_attr(cToken, "term_text", 1, 1);
89
+ rb_define_attr(cToken, "position_increment", 1, 0);
90
+ rb_define_attr(cToken, "start_offset", 1, 0);
91
+ rb_define_attr(cToken, "end_offset", 1, 0);
92
+ rb_define_attr(cToken, "type", 1, 1);
93
+ }
data/lib/ferret.rb CHANGED
@@ -22,7 +22,7 @@
22
22
  #++
23
23
  # :include: ../TUTORIAL
24
24
  module Ferret
25
- VERSION = '0.2.2'
25
+ VERSION = '0.3.0'
26
26
  end
27
27
 
28
28
  require 'ferret/utils'
@@ -17,6 +17,24 @@ module Ferret::Analysis
17
17
  def token_stream(field, string)
18
18
  return LowerCaseTokenizer.new(string)
19
19
  end
20
+
21
+ # Invoked before indexing a Field instance if
22
+ # terms have already been added to that field. This allows custom
23
+ # analyzers to place an automatic position increment gap between
24
+ # Field instances using the same field name. The default
25
+ # position increment gap is 0. With a 0 position increment gap and
26
+ # the typical default token position increment of 1, all terms in a field,
27
+ # including across Field instances, are in successive positions, allowing
28
+ # exact PhraseQuery matches, for instance, across Field instance boundaries.
29
+ #
30
+ # field_name:: Field name being indexed.
31
+ # position_increment_gap:: added to the next token emitted from
32
+ # #token_stream(String,Reader)
33
+ #
34
# Position gap to insert between successive Field instances that share
# +field_name+; this implementation always answers 0 (no gap).
def position_increment_gap(field_name)
  0
end
37
+
20
38
  end
21
39
 
22
40
  # An Analyzer that uses WhiteSpaceTokenizer.
@@ -18,7 +18,21 @@ module Ferret::Analysis
18
18
  ACRONYM = /#{ALPHA}\.(#{ALPHA}\.)+/
19
19
  P = /[_\/.,-]/
20
20
  HASDIGIT = /\w*\d\w*/
21
+ TOKEN_RE = /[[:alpha:]]+(('[[:alpha:]]+)+
22
+ |\.([[:alpha:]]\.)+
23
+ |(@|\&)\w+([-.]\w+)*
24
+ )
25
+ |\w+(([\-._]\w+)*\@\w+([-.]\w+)+
26
+ |#{P}#{HASDIGIT}(#{P}\w+#{P}#{HASDIGIT})*(#{P}\w+)?
27
+ |(\.\w+)+
28
+ |
29
+ )
30
+ /x
21
31
 
32
+ ACRONYM_WORD = /^#{ACRONYM}$/
33
+ APOSTROPHE_WORD = /^#{APOSTROPHE}$/
34
+ DOT = /\./
35
+ APOSTROPHE_S = /'[sS]$/
22
36
  protected
23
37
 
24
38
  # Collects only characters which are not spaces, tabs or carriage returns
@@ -27,24 +41,15 @@ module Ferret::Analysis
27
41
  # This is a simplified version of the original Lucene standard
28
42
  # tokenizer. I think it works better. I hope so anyway. Any way to
29
43
  # do this more neatly?
30
- /[[:alpha:]]+(('[[:alpha:]]+)+
31
- |\.([[:alpha:]]\.)+
32
- |(@|\&)\w+([-.]\w+)*
33
- )
34
- |\w+(([\-._]\w+)*\@\w+([-.]\w+)+
35
- |#{P}#{HASDIGIT}(#{P}\w+#{P}#{HASDIGIT})*(#{P}\w+)?
36
- |(\.\w+)+
37
- |
38
- )
39
- /x
44
+ TOKEN_RE
40
45
  end
41
46
 
42
47
  # stem the 's and remove the '.'s from acronyms
43
48
  def normalize(str)
44
- if str =~ /^#{ACRONYM}$/
45
- str.gsub!(/\./, '')
46
- elsif str =~ /^#{APOSTROPHE}$/
47
- str.gsub!(/'[sS]$/, '')
49
+ if str =~ ACRONYM_WORD
50
+ str.gsub!(DOT, '')
51
+ elsif str =~ APOSTROPHE_WORD
52
+ str.gsub!(APOSTROPHE_S, '')
48
53
  end
49
54
  str
50
55
  end
@@ -35,9 +35,16 @@ module Ferret::Analysis
35
35
  @position_increment = pos_inc
36
36
  end
37
37
 
38
# Overwrites this token's text and offsets in place and returns the token
# itself. Only @term_text, @start_offset and @end_offset are assigned.
#
# txt:: new term text
# so::  new start offset
# eo::  new end offset
def set!(txt, so, eo)
  @term_text, @start_offset, @end_offset = txt, so, eo
  self
end
44
+
38
45
  def eql?(o)
39
46
  return (o.instance_of?(Token) and @start_offset == o.start_offset and
40
- @end_offset == o.end_offset and @term_text = o.term_text)
47
+ @end_offset == o.end_offset and @term_text == o.term_text)
41
48
  end
42
49
  alias :== :eql?
43
50
 
@@ -36,6 +36,7 @@ module Ferret::Analysis
36
36
  # input:: must have a read(count) method which returns an array or string
37
37
  # of _count_ chars.
38
38
  def initialize(input)
39
+ #@token_buffer = Token.new("", 0, 0)
39
40
  if input.is_a? String
40
41
  @ss = StringScanner.new(input)
41
42
  else
@@ -53,6 +54,7 @@ module Ferret::Analysis
53
54
  return nil
54
55
  end
55
56
 
57
+ #return @token_buffer.set!(normalize(term), term_start, term_end)
56
58
  return Token.new(normalize(term), term_start, term_end)
57
59
  end
58
60
 
@@ -62,8 +64,9 @@ module Ferret::Analysis
62
64
 
63
65
  protected
64
66
  # returns the regular expression used to find the next token
67
+ TOKEN_RE = /[[:alpha:]]+/
65
68
  def token_re
66
- /[[:alpha:]]+/
69
+ TOKEN_RE
67
70
  end
68
71
 
69
72
  # Called on each token to normalize it before it is added to the
@@ -80,8 +83,9 @@ module Ferret::Analysis
80
83
  protected
81
84
  # Collects only characters which satisfy the regular expression
82
85
  # _/[[:alpha:]]+/_.
83
- def token_re()
84
- /[[:alpha:]]+/
86
+ TOKEN_RE = /[[:alpha:]]+/
87
+ def token_re
88
+ TOKEN_RE
85
89
  end
86
90
  end
87
91
 
@@ -100,8 +104,9 @@ module Ferret::Analysis
100
104
  class WhiteSpaceTokenizer < RegExpTokenizer
101
105
  protected
102
106
  # Collects only characters which are not spaces, tabs or carriage returns
103
- def token_re()
104
- /\S+/
107
+ TOKEN_RE = /\S+/
108
+ def token_re
109
+ TOKEN_RE
105
110
  end
106
111
  end
107
112
  end