ferret 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (202) hide show
  1. data/MIT-LICENSE +20 -0
  2. data/README +109 -0
  3. data/Rakefile +275 -0
  4. data/TODO +9 -0
  5. data/TUTORIAL +197 -0
  6. data/ext/extconf.rb +3 -0
  7. data/ext/ferret.c +23 -0
  8. data/ext/ferret.h +85 -0
  9. data/ext/index_io.c +543 -0
  10. data/ext/priority_queue.c +227 -0
  11. data/ext/ram_directory.c +316 -0
  12. data/ext/segment_merge_queue.c +41 -0
  13. data/ext/string_helper.c +42 -0
  14. data/ext/tags +240 -0
  15. data/ext/term.c +261 -0
  16. data/ext/term_buffer.c +299 -0
  17. data/ext/util.c +12 -0
  18. data/lib/ferret.rb +41 -0
  19. data/lib/ferret/analysis.rb +11 -0
  20. data/lib/ferret/analysis/analyzers.rb +93 -0
  21. data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
  22. data/lib/ferret/analysis/token.rb +79 -0
  23. data/lib/ferret/analysis/token_filters.rb +86 -0
  24. data/lib/ferret/analysis/token_stream.rb +26 -0
  25. data/lib/ferret/analysis/tokenizers.rb +107 -0
  26. data/lib/ferret/analysis/word_list_loader.rb +27 -0
  27. data/lib/ferret/document.rb +2 -0
  28. data/lib/ferret/document/document.rb +152 -0
  29. data/lib/ferret/document/field.rb +304 -0
  30. data/lib/ferret/index.rb +26 -0
  31. data/lib/ferret/index/compound_file_io.rb +343 -0
  32. data/lib/ferret/index/document_writer.rb +288 -0
  33. data/lib/ferret/index/field_infos.rb +259 -0
  34. data/lib/ferret/index/fields_io.rb +175 -0
  35. data/lib/ferret/index/index.rb +228 -0
  36. data/lib/ferret/index/index_file_names.rb +33 -0
  37. data/lib/ferret/index/index_reader.rb +462 -0
  38. data/lib/ferret/index/index_writer.rb +488 -0
  39. data/lib/ferret/index/multi_reader.rb +363 -0
  40. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
  41. data/lib/ferret/index/segment_infos.rb +130 -0
  42. data/lib/ferret/index/segment_merge_info.rb +47 -0
  43. data/lib/ferret/index/segment_merge_queue.rb +16 -0
  44. data/lib/ferret/index/segment_merger.rb +337 -0
  45. data/lib/ferret/index/segment_reader.rb +380 -0
  46. data/lib/ferret/index/segment_term_enum.rb +178 -0
  47. data/lib/ferret/index/segment_term_vector.rb +58 -0
  48. data/lib/ferret/index/term.rb +49 -0
  49. data/lib/ferret/index/term_buffer.rb +88 -0
  50. data/lib/ferret/index/term_doc_enum.rb +283 -0
  51. data/lib/ferret/index/term_enum.rb +52 -0
  52. data/lib/ferret/index/term_info.rb +41 -0
  53. data/lib/ferret/index/term_infos_io.rb +312 -0
  54. data/lib/ferret/index/term_vector_offset_info.rb +20 -0
  55. data/lib/ferret/index/term_vectors_io.rb +552 -0
  56. data/lib/ferret/query_parser.rb +274 -0
  57. data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
  58. data/lib/ferret/search.rb +49 -0
  59. data/lib/ferret/search/boolean_clause.rb +100 -0
  60. data/lib/ferret/search/boolean_query.rb +303 -0
  61. data/lib/ferret/search/boolean_scorer.rb +294 -0
  62. data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
  63. data/lib/ferret/search/conjunction_scorer.rb +99 -0
  64. data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
  65. data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
  66. data/lib/ferret/search/explanation.rb +41 -0
  67. data/lib/ferret/search/field_cache.rb +216 -0
  68. data/lib/ferret/search/field_doc.rb +31 -0
  69. data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
  70. data/lib/ferret/search/filter.rb +11 -0
  71. data/lib/ferret/search/filtered_query.rb +130 -0
  72. data/lib/ferret/search/filtered_term_enum.rb +79 -0
  73. data/lib/ferret/search/fuzzy_query.rb +153 -0
  74. data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
  75. data/lib/ferret/search/hit_collector.rb +34 -0
  76. data/lib/ferret/search/hit_queue.rb +11 -0
  77. data/lib/ferret/search/index_searcher.rb +173 -0
  78. data/lib/ferret/search/match_all_docs_query.rb +104 -0
  79. data/lib/ferret/search/multi_phrase_query.rb +204 -0
  80. data/lib/ferret/search/multi_term_query.rb +65 -0
  81. data/lib/ferret/search/non_matching_scorer.rb +22 -0
  82. data/lib/ferret/search/phrase_positions.rb +55 -0
  83. data/lib/ferret/search/phrase_query.rb +217 -0
  84. data/lib/ferret/search/phrase_scorer.rb +153 -0
  85. data/lib/ferret/search/prefix_query.rb +47 -0
  86. data/lib/ferret/search/query.rb +111 -0
  87. data/lib/ferret/search/query_filter.rb +51 -0
  88. data/lib/ferret/search/range_filter.rb +103 -0
  89. data/lib/ferret/search/range_query.rb +139 -0
  90. data/lib/ferret/search/req_excl_scorer.rb +125 -0
  91. data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
  92. data/lib/ferret/search/score_doc.rb +38 -0
  93. data/lib/ferret/search/score_doc_comparator.rb +114 -0
  94. data/lib/ferret/search/scorer.rb +91 -0
  95. data/lib/ferret/search/similarity.rb +278 -0
  96. data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
  97. data/lib/ferret/search/sort.rb +105 -0
  98. data/lib/ferret/search/sort_comparator.rb +60 -0
  99. data/lib/ferret/search/sort_field.rb +87 -0
  100. data/lib/ferret/search/spans.rb +12 -0
  101. data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
  102. data/lib/ferret/search/spans/span_first_query.rb +79 -0
  103. data/lib/ferret/search/spans/span_near_query.rb +108 -0
  104. data/lib/ferret/search/spans/span_not_query.rb +130 -0
  105. data/lib/ferret/search/spans/span_or_query.rb +176 -0
  106. data/lib/ferret/search/spans/span_query.rb +25 -0
  107. data/lib/ferret/search/spans/span_scorer.rb +74 -0
  108. data/lib/ferret/search/spans/span_term_query.rb +105 -0
  109. data/lib/ferret/search/spans/span_weight.rb +84 -0
  110. data/lib/ferret/search/spans/spans_enum.rb +44 -0
  111. data/lib/ferret/search/term_query.rb +128 -0
  112. data/lib/ferret/search/term_scorer.rb +181 -0
  113. data/lib/ferret/search/top_docs.rb +24 -0
  114. data/lib/ferret/search/top_field_docs.rb +17 -0
  115. data/lib/ferret/search/weight.rb +54 -0
  116. data/lib/ferret/search/wildcard_query.rb +26 -0
  117. data/lib/ferret/search/wildcard_term_enum.rb +61 -0
  118. data/lib/ferret/stemmers.rb +1 -0
  119. data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
  120. data/lib/ferret/store.rb +5 -0
  121. data/lib/ferret/store/buffered_index_io.rb +191 -0
  122. data/lib/ferret/store/directory.rb +139 -0
  123. data/lib/ferret/store/fs_store.rb +338 -0
  124. data/lib/ferret/store/index_io.rb +259 -0
  125. data/lib/ferret/store/ram_store.rb +282 -0
  126. data/lib/ferret/utils.rb +7 -0
  127. data/lib/ferret/utils/bit_vector.rb +105 -0
  128. data/lib/ferret/utils/date_tools.rb +138 -0
  129. data/lib/ferret/utils/number_tools.rb +91 -0
  130. data/lib/ferret/utils/parameter.rb +41 -0
  131. data/lib/ferret/utils/priority_queue.rb +120 -0
  132. data/lib/ferret/utils/string_helper.rb +47 -0
  133. data/lib/ferret/utils/weak_key_hash.rb +51 -0
  134. data/rake_utils/code_statistics.rb +106 -0
  135. data/setup.rb +1551 -0
  136. data/test/benchmark/tb_ram_store.rb +76 -0
  137. data/test/benchmark/tb_rw_vint.rb +26 -0
  138. data/test/longrunning/tc_numbertools.rb +60 -0
  139. data/test/longrunning/tm_store.rb +19 -0
  140. data/test/test_all.rb +9 -0
  141. data/test/test_helper.rb +6 -0
  142. data/test/unit/analysis/tc_analyzer.rb +21 -0
  143. data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
  144. data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
  145. data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
  146. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
  147. data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
  148. data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
  149. data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
  150. data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
  151. data/test/unit/analysis/tc_stop_filter.rb +14 -0
  152. data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
  153. data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
  154. data/test/unit/analysis/tc_word_list_loader.rb +32 -0
  155. data/test/unit/document/tc_document.rb +47 -0
  156. data/test/unit/document/tc_field.rb +80 -0
  157. data/test/unit/index/tc_compound_file_io.rb +107 -0
  158. data/test/unit/index/tc_field_infos.rb +119 -0
  159. data/test/unit/index/tc_fields_io.rb +167 -0
  160. data/test/unit/index/tc_index.rb +140 -0
  161. data/test/unit/index/tc_index_reader.rb +622 -0
  162. data/test/unit/index/tc_index_writer.rb +57 -0
  163. data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
  164. data/test/unit/index/tc_segment_infos.rb +74 -0
  165. data/test/unit/index/tc_segment_term_docs.rb +17 -0
  166. data/test/unit/index/tc_segment_term_enum.rb +60 -0
  167. data/test/unit/index/tc_segment_term_vector.rb +71 -0
  168. data/test/unit/index/tc_term.rb +22 -0
  169. data/test/unit/index/tc_term_buffer.rb +57 -0
  170. data/test/unit/index/tc_term_info.rb +19 -0
  171. data/test/unit/index/tc_term_infos_io.rb +192 -0
  172. data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
  173. data/test/unit/index/tc_term_vectors_io.rb +108 -0
  174. data/test/unit/index/th_doc.rb +244 -0
  175. data/test/unit/query_parser/tc_query_parser.rb +84 -0
  176. data/test/unit/search/tc_filter.rb +113 -0
  177. data/test/unit/search/tc_fuzzy_query.rb +136 -0
  178. data/test/unit/search/tc_index_searcher.rb +188 -0
  179. data/test/unit/search/tc_search_and_sort.rb +98 -0
  180. data/test/unit/search/tc_similarity.rb +37 -0
  181. data/test/unit/search/tc_sort.rb +48 -0
  182. data/test/unit/search/tc_sort_field.rb +27 -0
  183. data/test/unit/search/tc_spans.rb +153 -0
  184. data/test/unit/store/tc_fs_store.rb +84 -0
  185. data/test/unit/store/tc_ram_store.rb +35 -0
  186. data/test/unit/store/tm_store.rb +180 -0
  187. data/test/unit/store/tm_store_lock.rb +68 -0
  188. data/test/unit/ts_analysis.rb +16 -0
  189. data/test/unit/ts_document.rb +4 -0
  190. data/test/unit/ts_index.rb +18 -0
  191. data/test/unit/ts_query_parser.rb +3 -0
  192. data/test/unit/ts_search.rb +10 -0
  193. data/test/unit/ts_store.rb +6 -0
  194. data/test/unit/ts_utils.rb +10 -0
  195. data/test/unit/utils/tc_bit_vector.rb +65 -0
  196. data/test/unit/utils/tc_date_tools.rb +50 -0
  197. data/test/unit/utils/tc_number_tools.rb +59 -0
  198. data/test/unit/utils/tc_parameter.rb +40 -0
  199. data/test/unit/utils/tc_priority_queue.rb +62 -0
  200. data/test/unit/utils/tc_string_helper.rb +21 -0
  201. data/test/unit/utils/tc_weak_key_hash.rb +25 -0
  202. metadata +251 -0
@@ -0,0 +1,299 @@
1
+ #include "ferret.h"
2
+
3
+ ID field_name;
4
+
5
+ /****************************************************************************
6
+ *
7
+ * TermBuffer Methods
8
+ *
9
+ ****************************************************************************/
10
+
11
+ void
12
+ frt_termbuffer_free(void *p)
13
+ {
14
+ TermBuffer *tb;
15
+ tb = (TermBuffer *)p;
16
+ free((void *)(tb->text));
17
+ free((void *)(tb->field));
18
+ free(p);
19
+ }
20
+
21
+ static VALUE
22
+ frt_termbuffer_alloc(VALUE klass)
23
+ {
24
+ TermBuffer *tb;
25
+ tb = (TermBuffer *)ALLOC(TermBuffer);
26
+ tb->text = NULL;
27
+ tb->field = NULL;
28
+ tb->tlen = 0;
29
+ tb->flen = 0;
30
+
31
+ VALUE rbuffer = Data_Wrap_Struct(klass, NULL, frt_termbuffer_free, tb);
32
+ return rbuffer;
33
+ }
34
+
35
+ static VALUE
36
+ frt_termbuffer_init(VALUE self)
37
+ {
38
+ rb_iv_set(self, "@term", Qnil);
39
+ return Qnil;
40
+ }
41
+
42
+ static VALUE
43
+ frt_termbuffer_get_text_length(VALUE self)
44
+ {
45
+
46
+ TermBuffer *tb;
47
+ Data_Get_Struct(self, TermBuffer, tb);
48
+ return INT2FIX(tb->tlen);
49
+ }
50
+
51
+ static VALUE
52
+ frt_termbuffer_get_text(VALUE self)
53
+ {
54
+
55
+ TermBuffer *tb;
56
+ Data_Get_Struct(self, TermBuffer, tb);
57
+ return rb_str_new(tb->text, tb->tlen);
58
+ }
59
+
60
+ static VALUE
61
+ frt_termbuffer_get_field_name(VALUE self)
62
+ {
63
+
64
+ TermBuffer *tb;
65
+ Data_Get_Struct(self, TermBuffer, tb);
66
+ return rb_str_new(tb->field, tb->flen);
67
+ }
68
+
69
+ static VALUE
70
+ frt_termbuffer_reset(VALUE self)
71
+ {
72
+ TermBuffer *tb;
73
+ Data_Get_Struct(self, TermBuffer, tb);
74
+
75
+ tb->field = NULL;
76
+ tb->text = NULL;
77
+ tb->tlen = 0;
78
+ tb->flen = 0;
79
+
80
+ return Qnil;
81
+ }
82
+
83
+ static VALUE
84
+ frt_termbuffer_to_term(VALUE self)
85
+ {
86
+ TermBuffer *tb;
87
+ Data_Get_Struct(self, TermBuffer, tb);
88
+
89
+ if(tb->field == NULL) {
90
+ return Qnil;
91
+ } else {
92
+ VALUE field = rb_str_new(tb->field, tb->flen);
93
+ VALUE text = rb_str_new(tb->text, tb->tlen);
94
+ return rb_funcall(cTerm, frt_newobj, 2, field, text);
95
+ }
96
+ }
97
+
98
+ int
99
+ frt_termbuffer_compare_to_int(VALUE self, VALUE rother)
100
+ {
101
+ int comp, size, my_len, o_len;
102
+ TermBuffer *tb, *other;
103
+ Data_Get_Struct(self, TermBuffer, tb);
104
+ Data_Get_Struct(rother, TermBuffer, other);
105
+
106
+ my_len = tb->flen;
107
+ o_len = other->flen;
108
+ size = my_len >= o_len ? o_len : my_len;
109
+ comp = memcmp(tb->field, other->field, size);
110
+ if(comp == 0){
111
+ if(my_len == o_len) {
112
+ my_len = tb->tlen;
113
+ o_len = other->tlen;
114
+ size = my_len >= o_len ? o_len : my_len;
115
+ comp = memcmp(tb->text, other->text, size);
116
+ if(comp == 0 && my_len != o_len)
117
+ comp = my_len > o_len ? 1 : -1;
118
+ } else {
119
+ comp = my_len > o_len ? 1 : -1;
120
+ }
121
+ }
122
+ return comp;
123
+ }
124
+
125
+ VALUE
126
+ frt_termbuffer_lt(VALUE self, VALUE rother)
127
+ {
128
+ return frt_termbuffer_compare_to_int(self, rother) < 0 ? Qtrue : Qfalse;
129
+ }
130
+
131
+ VALUE
132
+ frt_termbuffer_gt(VALUE self, VALUE rother)
133
+ {
134
+ return frt_termbuffer_compare_to_int(self, rother) > 0 ? Qtrue : Qfalse;
135
+ }
136
+
137
+ VALUE
138
+ frt_termbuffer_le(VALUE self, VALUE rother)
139
+ {
140
+ return frt_termbuffer_compare_to_int(self, rother) <= 0 ? Qtrue : Qfalse;
141
+ }
142
+
143
+ VALUE
144
+ frt_termbuffer_ge(VALUE self, VALUE rother)
145
+ {
146
+ return frt_termbuffer_compare_to_int(self, rother) >= 0 ? Qtrue : Qfalse;
147
+ }
148
+
149
+ VALUE
150
+ frt_termbuffer_eq(VALUE self, VALUE rother)
151
+ {
152
+ if (rother == Qnil)
153
+ return Qfalse;
154
+ return frt_termbuffer_compare_to_int(self, rother) == 0 ? Qtrue : Qfalse;
155
+ }
156
+
157
+ static VALUE
158
+ frt_termbuffer_compare_to(VALUE self, VALUE rother)
159
+ {
160
+ return INT2FIX(frt_termbuffer_compare_to_int(self, rother));
161
+ }
162
+
163
+ static VALUE
164
+ frt_termbuffer_set_term(VALUE self, VALUE rterm)
165
+ {
166
+ TermBuffer *tb;
167
+ Term *term;
168
+ int tlen, flen;
169
+
170
+ Data_Get_Struct(self, TermBuffer, tb);
171
+ Data_Get_Struct(rterm, Term, term);
172
+
173
+ tlen = term->tlen;
174
+ flen = term->flen;
175
+
176
+ if(tb->field == NULL){
177
+ tb->field = (char *)ALLOC_N(char, flen+1);
178
+ tb->text = (char *)ALLOC_N(char, tlen+1);
179
+ } else {
180
+ REALLOC_N(tb->text, char, tlen+1);
181
+ REALLOC_N(tb->field, char, flen+1);
182
+ }
183
+
184
+ tb->flen = flen;
185
+ tb->tlen = tlen;
186
+ MEMCPY(tb->text, term->text, char, tlen);
187
+ MEMCPY(tb->field, term->field, char, flen);
188
+
189
+ return Qnil;
190
+ }
191
+
192
+ static VALUE
193
+ frt_termbuffer_init_copy(VALUE self, VALUE rother)
194
+ {
195
+ TermBuffer *tb, *other;
196
+ int tlen, flen;
197
+
198
+ Data_Get_Struct(self, TermBuffer, tb);
199
+ Data_Get_Struct(rother, TermBuffer, other);
200
+
201
+ tlen = other->tlen;
202
+ flen = other->flen;
203
+
204
+ if(tb->field == NULL){
205
+ tb->field = (char *)ALLOC_N(char, flen+1);
206
+ tb->text = (char *)ALLOC_N(char, tlen+1);
207
+ } else {
208
+ REALLOC_N(tb->text, char, tlen+1);
209
+ REALLOC_N(tb->field, char, flen+1);
210
+ }
211
+
212
+ tb->flen = flen;
213
+ tb->tlen = tlen;
214
+ MEMCPY(tb->text, other->text, char, tlen);
215
+ MEMCPY(tb->field, other->field, char, flen);
216
+
217
+ return Qnil;
218
+ }
219
+
220
+ static VALUE
221
+ frt_termbuffer_read(VALUE self, VALUE input, VALUE info)
222
+ {
223
+ TermBuffer *tb;
224
+ int tlen, flen, start, length;
225
+ VALUE field, fnum;
226
+ Data_Get_Struct(self, TermBuffer, tb);
227
+
228
+ start = frt_read_vint(input);
229
+ length = frt_read_vint(input);
230
+ tlen = start + length;
231
+
232
+ if(tb->field == NULL){
233
+ tb->text = (char *)ALLOC_N(char, tlen+1);
234
+ } else {
235
+ REALLOC_N(tb->text, char, tlen+1);
236
+ }
237
+
238
+ frt_read_chars(input, tb->text, start, length);
239
+ fnum = INT2FIX(frt_read_vint(input));
240
+ field = rb_funcall(info, field_name, 1, fnum);
241
+ flen = RSTRING(field)->len;
242
+
243
+ REALLOC_N(tb->field, char, flen+1);
244
+
245
+ MEMCPY(tb->field, RSTRING(field)->ptr, char, flen);
246
+
247
+ tb->flen = flen;
248
+ tb->tlen = tlen;
249
+ return Qnil;
250
+ }
251
+
252
+ static VALUE
253
+ frt_termbuffer_hash(VALUE self)
254
+ {
255
+ TermBuffer *tb;
256
+ Data_Get_Struct(self, TermBuffer, tb);
257
+ return INT2FIX(frt_hash(tb->text, tb->tlen) +
258
+ frt_hash(tb->field, tb->flen));
259
+ }
260
+
261
+ /****************************************************************************
262
+ *
263
+ * Init Function
264
+ *
265
+ ****************************************************************************/
266
+
267
+
268
+ void
269
+ Init_term_buffer(void) {
270
+ // IDs
271
+ field_name = rb_intern("name");
272
+
273
+ // TermBuffer
274
+ cTermBuffer = rb_define_class_under(mIndex, "TermBuffer", rb_cObject);
275
+ rb_define_alloc_func(cTermBuffer, frt_termbuffer_alloc);
276
+ rb_include_module(cTermBuffer, rb_mComparable);
277
+
278
+ // Methods
279
+ rb_define_method(cTermBuffer, "initialize", frt_termbuffer_init, 0);
280
+ rb_define_method(cTermBuffer, "initialize_copy", frt_termbuffer_init_copy, 1);
281
+ rb_define_method(cTermBuffer, "text", frt_termbuffer_get_text, 0);
282
+ rb_define_method(cTermBuffer, "field", frt_termbuffer_get_field_name, 0);
283
+ rb_define_method(cTermBuffer, "text_length", frt_termbuffer_get_text_length, 0);
284
+ rb_define_method(cTermBuffer, "<=>", frt_termbuffer_compare_to, 1);
285
+ rb_define_method(cTermBuffer, "<", frt_termbuffer_lt, 1);
286
+ rb_define_method(cTermBuffer, ">", frt_termbuffer_gt, 1);
287
+ rb_define_method(cTermBuffer, "<=", frt_termbuffer_le, 1);
288
+ rb_define_method(cTermBuffer, ">=", frt_termbuffer_ge, 1);
289
+ rb_define_method(cTermBuffer, "eql?", frt_termbuffer_eq, 1);
290
+ rb_define_method(cTermBuffer, "==", frt_termbuffer_eq, 1);
291
+ rb_define_method(cTermBuffer, "hash", frt_termbuffer_hash, 0);
292
+ rb_define_method(cTermBuffer, "read", frt_termbuffer_read, 2);
293
+ rb_define_method(cTermBuffer, "reset", frt_termbuffer_reset, 0);
294
+ rb_define_method(cTermBuffer, "to_term", frt_termbuffer_to_term, 0);
295
+ rb_define_method(cTermBuffer, "term", frt_termbuffer_to_term, 0);
296
+ rb_define_method(cTermBuffer, "term=", frt_termbuffer_set_term, 1);
297
+ rb_define_method(cTermBuffer, "set!", frt_termbuffer_init_copy, 1);
298
+ rb_define_method(cTermBuffer, "text_str", frt_termbuffer_get_text, 0);
299
+ }
@@ -0,0 +1,12 @@
/*
 * Hashes a run of bytes: every byte is folded in as key = key * 65599 + byte
 * and the final key is mixed with key + (key >> 5).
 *
 * p::   first byte to hash
 * len:: number of bytes to hash; zero or a negative value hashes nothing
 *       and yields 0
 *
 * The accumulation is done in unsigned arithmetic so that wrap-around is
 * well defined (signed overflow is undefined behaviour in C). The loop
 * tests len > 0 so that a negative length cannot send it running through
 * memory.
 */
int
frt_hash(register char *p, register int len)
{
  unsigned int key = 0;
  int folded;

  while (len-- > 0) {
    key = key * 65599u + (unsigned int)*p;
    p++;
  }

  /* shift the signed value, as the original signed accumulator did */
  folded = (int)key;
  return (int)((unsigned int)folded + (unsigned int)(folded >> 5));
}
@@ -0,0 +1,41 @@
1
+ #--
2
+ # Copyright (c) 2005 David Balmain
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+ #++
23
+ # :include: ../TUTORIAL
module Ferret
  # Version string of this release of the library.
  VERSION = '0.1.0'
end
27
+
28
+ require 'ferret/utils'
29
+ require 'ferret/document'
30
+ require 'ferret/stemmers'
31
+ require 'ferret/analysis'
32
+ require 'ferret/store'
33
+ require 'ferret/index'
34
+ require 'ferret/search'
35
+ require 'ferret/query_parser'
36
+
# The C extension is optional: load it when it is available and carry on
# without it when it is not. Only load failures and ordinary runtime errors
# are tolerated here; rescuing Exception would also swallow Interrupt,
# SystemExit and NoMemoryError.
begin
  require 'ferret_ext'
rescue LoadError, StandardError
  # extension missing or unusable; continue without it
end
@@ -0,0 +1,11 @@
# Namespace for the text analysis classes: tokens, token streams,
# tokenizers, token filters, the word list loader and the analyzers.
module Ferret::Analysis
end
4
+
5
+ require 'ferret/analysis/token'
6
+ require 'ferret/analysis/token_stream'
7
+ require 'ferret/analysis/tokenizers'
8
+ require 'ferret/analysis/standard_tokenizer'
9
+ require 'ferret/analysis/token_filters'
10
+ require 'ferret/analysis/word_list_loader'
11
+ require 'ferret/analysis/analyzers'
@@ -0,0 +1,93 @@
module Ferret::Analysis
  # An Analyzer builds TokenStreams, which analyze text. It thus represents
  # a policy for extracting index terms from text.
  #
  # Typical implementations first build a Tokenizer, which breaks the text
  # into raw Tokens. One or more TokenFilters may then be applied to the
  # output of the Tokenizer.
  #
  # This base Analyzer simply creates a LowerCaseTokenizer. See
  # LowerCaseTokenizer for more details.
  class Analyzer
    # Creates a TokenStream which tokenizes all the text in the provided
    # string. Override to choose a strategy based on the field.
    #
    # field::  name of the field. Not used by this implementation.
    # string:: the string representing the text in the field
    def token_stream(field, string)
      LowerCaseTokenizer.new(string)
    end
  end

  # An Analyzer that uses WhiteSpaceTokenizer.
  class WhiteSpaceAnalyzer < Analyzer
    # Returns a WhiteSpaceTokenizer over +string+; +field+ is not used.
    def token_stream(field, string)
      WhiteSpaceTokenizer.new(string)
    end
  end

  # An Analyzer that filters a LowerCaseTokenizer with a StopFilter.
  class StopAnalyzer < Analyzer

    # An array containing some common English words that are not usually
    # useful for searching.
    ENGLISH_STOP_WORDS = %w[
      a an and are as at be but by
      for if in into is it
      no not of on or s such
      t that the their then there these
      they this to was will with
    ]

    # Builds an analyzer which removes the words in the provided array.
    #
    # stop_words:: array of words to drop; defaults to ENGLISH_STOP_WORDS
    def initialize(stop_words = ENGLISH_STOP_WORDS)
      @stop_words = stop_words
    end

    # Returns a StopFilter, configured with this analyzer's stop words,
    # wrapped around a LowerCaseTokenizer over +string+.
    def token_stream(field, string)
      tokenizer = LowerCaseTokenizer.new(string)
      StopFilter.new(tokenizer, @stop_words)
    end
  end

  # An Analyzer that passes a StandardTokenizer through a LowerCaseFilter
  # and then through a StopFilter using the stop words given to the
  # constructor inherited from StopAnalyzer.
  class StandardAnalyzer < StopAnalyzer
    # Returns the StandardTokenizer -> LowerCaseFilter -> StopFilter chain
    # over +string+; +field+ is not used.
    def token_stream(field, string)
      tokenizer = StandardTokenizer.new(string)
      lowercased = LowerCaseFilter.new(tokenizer)
      StopFilter.new(lowercased, @stop_words)
    end
  end

  # This analyzer is used to facilitate scenarios where different
  # fields require different analysis techniques. Use #add_analyzer
  # to add a non-default analyzer on a field name basis.
  # See tc_per_field_analyzer_wrapper for example usage.
  class PerFieldAnalyzerWrapper < Analyzer

    # Constructs the wrapper with its default analyzer.
    #
    # default_analyzer:: used for every field that has not been given an
    #                    analyzer of its own through #add_analyzer.
    def initialize(default_analyzer)
      @default_analyzer = default_analyzer
      @analyzers = {}
    end

    # Defines an analyzer to use for the specified field.
    #
    # field::    field name requiring a non-default analyzer.
    # analyzer:: non-default analyzer to use for field
    def add_analyzer(field, analyzer)
      @analyzers[field] = analyzer
    end

    # Delegates to the analyzer registered for +field+, or to the default
    # analyzer when none has been registered.
    def token_stream(field, string)
      chosen = @analyzers[field]
      chosen = @default_analyzer if chosen.nil?
      chosen.token_stream(field, string)
    end
  end
end