ferret 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202) hide show
  1. data/MIT-LICENSE +20 -0
  2. data/README +109 -0
  3. data/Rakefile +275 -0
  4. data/TODO +9 -0
  5. data/TUTORIAL +197 -0
  6. data/ext/extconf.rb +3 -0
  7. data/ext/ferret.c +23 -0
  8. data/ext/ferret.h +85 -0
  9. data/ext/index_io.c +543 -0
  10. data/ext/priority_queue.c +227 -0
  11. data/ext/ram_directory.c +316 -0
  12. data/ext/segment_merge_queue.c +41 -0
  13. data/ext/string_helper.c +42 -0
  14. data/ext/tags +240 -0
  15. data/ext/term.c +261 -0
  16. data/ext/term_buffer.c +299 -0
  17. data/ext/util.c +12 -0
  18. data/lib/ferret.rb +41 -0
  19. data/lib/ferret/analysis.rb +11 -0
  20. data/lib/ferret/analysis/analyzers.rb +93 -0
  21. data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
  22. data/lib/ferret/analysis/token.rb +79 -0
  23. data/lib/ferret/analysis/token_filters.rb +86 -0
  24. data/lib/ferret/analysis/token_stream.rb +26 -0
  25. data/lib/ferret/analysis/tokenizers.rb +107 -0
  26. data/lib/ferret/analysis/word_list_loader.rb +27 -0
  27. data/lib/ferret/document.rb +2 -0
  28. data/lib/ferret/document/document.rb +152 -0
  29. data/lib/ferret/document/field.rb +304 -0
  30. data/lib/ferret/index.rb +26 -0
  31. data/lib/ferret/index/compound_file_io.rb +343 -0
  32. data/lib/ferret/index/document_writer.rb +288 -0
  33. data/lib/ferret/index/field_infos.rb +259 -0
  34. data/lib/ferret/index/fields_io.rb +175 -0
  35. data/lib/ferret/index/index.rb +228 -0
  36. data/lib/ferret/index/index_file_names.rb +33 -0
  37. data/lib/ferret/index/index_reader.rb +462 -0
  38. data/lib/ferret/index/index_writer.rb +488 -0
  39. data/lib/ferret/index/multi_reader.rb +363 -0
  40. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
  41. data/lib/ferret/index/segment_infos.rb +130 -0
  42. data/lib/ferret/index/segment_merge_info.rb +47 -0
  43. data/lib/ferret/index/segment_merge_queue.rb +16 -0
  44. data/lib/ferret/index/segment_merger.rb +337 -0
  45. data/lib/ferret/index/segment_reader.rb +380 -0
  46. data/lib/ferret/index/segment_term_enum.rb +178 -0
  47. data/lib/ferret/index/segment_term_vector.rb +58 -0
  48. data/lib/ferret/index/term.rb +49 -0
  49. data/lib/ferret/index/term_buffer.rb +88 -0
  50. data/lib/ferret/index/term_doc_enum.rb +283 -0
  51. data/lib/ferret/index/term_enum.rb +52 -0
  52. data/lib/ferret/index/term_info.rb +41 -0
  53. data/lib/ferret/index/term_infos_io.rb +312 -0
  54. data/lib/ferret/index/term_vector_offset_info.rb +20 -0
  55. data/lib/ferret/index/term_vectors_io.rb +552 -0
  56. data/lib/ferret/query_parser.rb +274 -0
  57. data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
  58. data/lib/ferret/search.rb +49 -0
  59. data/lib/ferret/search/boolean_clause.rb +100 -0
  60. data/lib/ferret/search/boolean_query.rb +303 -0
  61. data/lib/ferret/search/boolean_scorer.rb +294 -0
  62. data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
  63. data/lib/ferret/search/conjunction_scorer.rb +99 -0
  64. data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
  65. data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
  66. data/lib/ferret/search/explanation.rb +41 -0
  67. data/lib/ferret/search/field_cache.rb +216 -0
  68. data/lib/ferret/search/field_doc.rb +31 -0
  69. data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
  70. data/lib/ferret/search/filter.rb +11 -0
  71. data/lib/ferret/search/filtered_query.rb +130 -0
  72. data/lib/ferret/search/filtered_term_enum.rb +79 -0
  73. data/lib/ferret/search/fuzzy_query.rb +153 -0
  74. data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
  75. data/lib/ferret/search/hit_collector.rb +34 -0
  76. data/lib/ferret/search/hit_queue.rb +11 -0
  77. data/lib/ferret/search/index_searcher.rb +173 -0
  78. data/lib/ferret/search/match_all_docs_query.rb +104 -0
  79. data/lib/ferret/search/multi_phrase_query.rb +204 -0
  80. data/lib/ferret/search/multi_term_query.rb +65 -0
  81. data/lib/ferret/search/non_matching_scorer.rb +22 -0
  82. data/lib/ferret/search/phrase_positions.rb +55 -0
  83. data/lib/ferret/search/phrase_query.rb +217 -0
  84. data/lib/ferret/search/phrase_scorer.rb +153 -0
  85. data/lib/ferret/search/prefix_query.rb +47 -0
  86. data/lib/ferret/search/query.rb +111 -0
  87. data/lib/ferret/search/query_filter.rb +51 -0
  88. data/lib/ferret/search/range_filter.rb +103 -0
  89. data/lib/ferret/search/range_query.rb +139 -0
  90. data/lib/ferret/search/req_excl_scorer.rb +125 -0
  91. data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
  92. data/lib/ferret/search/score_doc.rb +38 -0
  93. data/lib/ferret/search/score_doc_comparator.rb +114 -0
  94. data/lib/ferret/search/scorer.rb +91 -0
  95. data/lib/ferret/search/similarity.rb +278 -0
  96. data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
  97. data/lib/ferret/search/sort.rb +105 -0
  98. data/lib/ferret/search/sort_comparator.rb +60 -0
  99. data/lib/ferret/search/sort_field.rb +87 -0
  100. data/lib/ferret/search/spans.rb +12 -0
  101. data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
  102. data/lib/ferret/search/spans/span_first_query.rb +79 -0
  103. data/lib/ferret/search/spans/span_near_query.rb +108 -0
  104. data/lib/ferret/search/spans/span_not_query.rb +130 -0
  105. data/lib/ferret/search/spans/span_or_query.rb +176 -0
  106. data/lib/ferret/search/spans/span_query.rb +25 -0
  107. data/lib/ferret/search/spans/span_scorer.rb +74 -0
  108. data/lib/ferret/search/spans/span_term_query.rb +105 -0
  109. data/lib/ferret/search/spans/span_weight.rb +84 -0
  110. data/lib/ferret/search/spans/spans_enum.rb +44 -0
  111. data/lib/ferret/search/term_query.rb +128 -0
  112. data/lib/ferret/search/term_scorer.rb +181 -0
  113. data/lib/ferret/search/top_docs.rb +24 -0
  114. data/lib/ferret/search/top_field_docs.rb +17 -0
  115. data/lib/ferret/search/weight.rb +54 -0
  116. data/lib/ferret/search/wildcard_query.rb +26 -0
  117. data/lib/ferret/search/wildcard_term_enum.rb +61 -0
  118. data/lib/ferret/stemmers.rb +1 -0
  119. data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
  120. data/lib/ferret/store.rb +5 -0
  121. data/lib/ferret/store/buffered_index_io.rb +191 -0
  122. data/lib/ferret/store/directory.rb +139 -0
  123. data/lib/ferret/store/fs_store.rb +338 -0
  124. data/lib/ferret/store/index_io.rb +259 -0
  125. data/lib/ferret/store/ram_store.rb +282 -0
  126. data/lib/ferret/utils.rb +7 -0
  127. data/lib/ferret/utils/bit_vector.rb +105 -0
  128. data/lib/ferret/utils/date_tools.rb +138 -0
  129. data/lib/ferret/utils/number_tools.rb +91 -0
  130. data/lib/ferret/utils/parameter.rb +41 -0
  131. data/lib/ferret/utils/priority_queue.rb +120 -0
  132. data/lib/ferret/utils/string_helper.rb +47 -0
  133. data/lib/ferret/utils/weak_key_hash.rb +51 -0
  134. data/rake_utils/code_statistics.rb +106 -0
  135. data/setup.rb +1551 -0
  136. data/test/benchmark/tb_ram_store.rb +76 -0
  137. data/test/benchmark/tb_rw_vint.rb +26 -0
  138. data/test/longrunning/tc_numbertools.rb +60 -0
  139. data/test/longrunning/tm_store.rb +19 -0
  140. data/test/test_all.rb +9 -0
  141. data/test/test_helper.rb +6 -0
  142. data/test/unit/analysis/tc_analyzer.rb +21 -0
  143. data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
  144. data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
  145. data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
  146. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
  147. data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
  148. data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
  149. data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
  150. data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
  151. data/test/unit/analysis/tc_stop_filter.rb +14 -0
  152. data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
  153. data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
  154. data/test/unit/analysis/tc_word_list_loader.rb +32 -0
  155. data/test/unit/document/tc_document.rb +47 -0
  156. data/test/unit/document/tc_field.rb +80 -0
  157. data/test/unit/index/tc_compound_file_io.rb +107 -0
  158. data/test/unit/index/tc_field_infos.rb +119 -0
  159. data/test/unit/index/tc_fields_io.rb +167 -0
  160. data/test/unit/index/tc_index.rb +140 -0
  161. data/test/unit/index/tc_index_reader.rb +622 -0
  162. data/test/unit/index/tc_index_writer.rb +57 -0
  163. data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
  164. data/test/unit/index/tc_segment_infos.rb +74 -0
  165. data/test/unit/index/tc_segment_term_docs.rb +17 -0
  166. data/test/unit/index/tc_segment_term_enum.rb +60 -0
  167. data/test/unit/index/tc_segment_term_vector.rb +71 -0
  168. data/test/unit/index/tc_term.rb +22 -0
  169. data/test/unit/index/tc_term_buffer.rb +57 -0
  170. data/test/unit/index/tc_term_info.rb +19 -0
  171. data/test/unit/index/tc_term_infos_io.rb +192 -0
  172. data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
  173. data/test/unit/index/tc_term_vectors_io.rb +108 -0
  174. data/test/unit/index/th_doc.rb +244 -0
  175. data/test/unit/query_parser/tc_query_parser.rb +84 -0
  176. data/test/unit/search/tc_filter.rb +113 -0
  177. data/test/unit/search/tc_fuzzy_query.rb +136 -0
  178. data/test/unit/search/tc_index_searcher.rb +188 -0
  179. data/test/unit/search/tc_search_and_sort.rb +98 -0
  180. data/test/unit/search/tc_similarity.rb +37 -0
  181. data/test/unit/search/tc_sort.rb +48 -0
  182. data/test/unit/search/tc_sort_field.rb +27 -0
  183. data/test/unit/search/tc_spans.rb +153 -0
  184. data/test/unit/store/tc_fs_store.rb +84 -0
  185. data/test/unit/store/tc_ram_store.rb +35 -0
  186. data/test/unit/store/tm_store.rb +180 -0
  187. data/test/unit/store/tm_store_lock.rb +68 -0
  188. data/test/unit/ts_analysis.rb +16 -0
  189. data/test/unit/ts_document.rb +4 -0
  190. data/test/unit/ts_index.rb +18 -0
  191. data/test/unit/ts_query_parser.rb +3 -0
  192. data/test/unit/ts_search.rb +10 -0
  193. data/test/unit/ts_store.rb +6 -0
  194. data/test/unit/ts_utils.rb +10 -0
  195. data/test/unit/utils/tc_bit_vector.rb +65 -0
  196. data/test/unit/utils/tc_date_tools.rb +50 -0
  197. data/test/unit/utils/tc_number_tools.rb +59 -0
  198. data/test/unit/utils/tc_parameter.rb +40 -0
  199. data/test/unit/utils/tc_priority_queue.rb +62 -0
  200. data/test/unit/utils/tc_string_helper.rb +21 -0
  201. data/test/unit/utils/tc_weak_key_hash.rb +25 -0
  202. metadata +251 -0
@@ -0,0 +1,299 @@
1
+ #include "ferret.h"
2
+
3
+ ID field_name;
4
+
5
+ /****************************************************************************
6
+ *
7
+ * TermBuffer Methods
8
+ *
9
+ ****************************************************************************/
10
+
11
+ void
12
+ frt_termbuffer_free(void *p)
13
+ {
14
+ TermBuffer *tb;
15
+ tb = (TermBuffer *)p;
16
+ free((void *)(tb->text));
17
+ free((void *)(tb->field));
18
+ free(p);
19
+ }
20
+
21
+ static VALUE
22
+ frt_termbuffer_alloc(VALUE klass)
23
+ {
24
+ TermBuffer *tb;
25
+ tb = (TermBuffer *)ALLOC(TermBuffer);
26
+ tb->text = NULL;
27
+ tb->field = NULL;
28
+ tb->tlen = 0;
29
+ tb->flen = 0;
30
+
31
+ VALUE rbuffer = Data_Wrap_Struct(klass, NULL, frt_termbuffer_free, tb);
32
+ return rbuffer;
33
+ }
34
+
35
+ static VALUE
36
+ frt_termbuffer_init(VALUE self)
37
+ {
38
+ rb_iv_set(self, "@term", Qnil);
39
+ return Qnil;
40
+ }
41
+
42
+ static VALUE
43
+ frt_termbuffer_get_text_length(VALUE self)
44
+ {
45
+
46
+ TermBuffer *tb;
47
+ Data_Get_Struct(self, TermBuffer, tb);
48
+ return INT2FIX(tb->tlen);
49
+ }
50
+
51
+ static VALUE
52
+ frt_termbuffer_get_text(VALUE self)
53
+ {
54
+
55
+ TermBuffer *tb;
56
+ Data_Get_Struct(self, TermBuffer, tb);
57
+ return rb_str_new(tb->text, tb->tlen);
58
+ }
59
+
60
+ static VALUE
61
+ frt_termbuffer_get_field_name(VALUE self)
62
+ {
63
+
64
+ TermBuffer *tb;
65
+ Data_Get_Struct(self, TermBuffer, tb);
66
+ return rb_str_new(tb->field, tb->flen);
67
+ }
68
+
69
+ static VALUE
70
+ frt_termbuffer_reset(VALUE self)
71
+ {
72
+ TermBuffer *tb;
73
+ Data_Get_Struct(self, TermBuffer, tb);
74
+
75
+ tb->field = NULL;
76
+ tb->text = NULL;
77
+ tb->tlen = 0;
78
+ tb->flen = 0;
79
+
80
+ return Qnil;
81
+ }
82
+
83
+ static VALUE
84
+ frt_termbuffer_to_term(VALUE self)
85
+ {
86
+ TermBuffer *tb;
87
+ Data_Get_Struct(self, TermBuffer, tb);
88
+
89
+ if(tb->field == NULL) {
90
+ return Qnil;
91
+ } else {
92
+ VALUE field = rb_str_new(tb->field, tb->flen);
93
+ VALUE text = rb_str_new(tb->text, tb->tlen);
94
+ return rb_funcall(cTerm, frt_newobj, 2, field, text);
95
+ }
96
+ }
97
+
98
+ int
99
+ frt_termbuffer_compare_to_int(VALUE self, VALUE rother)
100
+ {
101
+ int comp, size, my_len, o_len;
102
+ TermBuffer *tb, *other;
103
+ Data_Get_Struct(self, TermBuffer, tb);
104
+ Data_Get_Struct(rother, TermBuffer, other);
105
+
106
+ my_len = tb->flen;
107
+ o_len = other->flen;
108
+ size = my_len >= o_len ? o_len : my_len;
109
+ comp = memcmp(tb->field, other->field, size);
110
+ if(comp == 0){
111
+ if(my_len == o_len) {
112
+ my_len = tb->tlen;
113
+ o_len = other->tlen;
114
+ size = my_len >= o_len ? o_len : my_len;
115
+ comp = memcmp(tb->text, other->text, size);
116
+ if(comp == 0 && my_len != o_len)
117
+ comp = my_len > o_len ? 1 : -1;
118
+ } else {
119
+ comp = my_len > o_len ? 1 : -1;
120
+ }
121
+ }
122
+ return comp;
123
+ }
124
+
125
+ VALUE
126
+ frt_termbuffer_lt(VALUE self, VALUE rother)
127
+ {
128
+ return frt_termbuffer_compare_to_int(self, rother) < 0 ? Qtrue : Qfalse;
129
+ }
130
+
131
+ VALUE
132
+ frt_termbuffer_gt(VALUE self, VALUE rother)
133
+ {
134
+ return frt_termbuffer_compare_to_int(self, rother) > 0 ? Qtrue : Qfalse;
135
+ }
136
+
137
+ VALUE
138
+ frt_termbuffer_le(VALUE self, VALUE rother)
139
+ {
140
+ return frt_termbuffer_compare_to_int(self, rother) <= 0 ? Qtrue : Qfalse;
141
+ }
142
+
143
+ VALUE
144
+ frt_termbuffer_ge(VALUE self, VALUE rother)
145
+ {
146
+ return frt_termbuffer_compare_to_int(self, rother) >= 0 ? Qtrue : Qfalse;
147
+ }
148
+
149
+ VALUE
150
+ frt_termbuffer_eq(VALUE self, VALUE rother)
151
+ {
152
+ if (rother == Qnil)
153
+ return Qfalse;
154
+ return frt_termbuffer_compare_to_int(self, rother) == 0 ? Qtrue : Qfalse;
155
+ }
156
+
157
+ static VALUE
158
+ frt_termbuffer_compare_to(VALUE self, VALUE rother)
159
+ {
160
+ return INT2FIX(frt_termbuffer_compare_to_int(self, rother));
161
+ }
162
+
163
+ static VALUE
164
+ frt_termbuffer_set_term(VALUE self, VALUE rterm)
165
+ {
166
+ TermBuffer *tb;
167
+ Term *term;
168
+ int tlen, flen;
169
+
170
+ Data_Get_Struct(self, TermBuffer, tb);
171
+ Data_Get_Struct(rterm, Term, term);
172
+
173
+ tlen = term->tlen;
174
+ flen = term->flen;
175
+
176
+ if(tb->field == NULL){
177
+ tb->field = (char *)ALLOC_N(char, flen+1);
178
+ tb->text = (char *)ALLOC_N(char, tlen+1);
179
+ } else {
180
+ REALLOC_N(tb->text, char, tlen+1);
181
+ REALLOC_N(tb->field, char, flen+1);
182
+ }
183
+
184
+ tb->flen = flen;
185
+ tb->tlen = tlen;
186
+ MEMCPY(tb->text, term->text, char, tlen);
187
+ MEMCPY(tb->field, term->field, char, flen);
188
+
189
+ return Qnil;
190
+ }
191
+
192
+ static VALUE
193
+ frt_termbuffer_init_copy(VALUE self, VALUE rother)
194
+ {
195
+ TermBuffer *tb, *other;
196
+ int tlen, flen;
197
+
198
+ Data_Get_Struct(self, TermBuffer, tb);
199
+ Data_Get_Struct(rother, TermBuffer, other);
200
+
201
+ tlen = other->tlen;
202
+ flen = other->flen;
203
+
204
+ if(tb->field == NULL){
205
+ tb->field = (char *)ALLOC_N(char, flen+1);
206
+ tb->text = (char *)ALLOC_N(char, tlen+1);
207
+ } else {
208
+ REALLOC_N(tb->text, char, tlen+1);
209
+ REALLOC_N(tb->field, char, flen+1);
210
+ }
211
+
212
+ tb->flen = flen;
213
+ tb->tlen = tlen;
214
+ MEMCPY(tb->text, other->text, char, tlen);
215
+ MEMCPY(tb->field, other->field, char, flen);
216
+
217
+ return Qnil;
218
+ }
219
+
220
+ static VALUE
221
+ frt_termbuffer_read(VALUE self, VALUE input, VALUE info)
222
+ {
223
+ TermBuffer *tb;
224
+ int tlen, flen, start, length;
225
+ VALUE field, fnum;
226
+ Data_Get_Struct(self, TermBuffer, tb);
227
+
228
+ start = frt_read_vint(input);
229
+ length = frt_read_vint(input);
230
+ tlen = start + length;
231
+
232
+ if(tb->field == NULL){
233
+ tb->text = (char *)ALLOC_N(char, tlen+1);
234
+ } else {
235
+ REALLOC_N(tb->text, char, tlen+1);
236
+ }
237
+
238
+ frt_read_chars(input, tb->text, start, length);
239
+ fnum = INT2FIX(frt_read_vint(input));
240
+ field = rb_funcall(info, field_name, 1, fnum);
241
+ flen = RSTRING(field)->len;
242
+
243
+ REALLOC_N(tb->field, char, flen+1);
244
+
245
+ MEMCPY(tb->field, RSTRING(field)->ptr, char, flen);
246
+
247
+ tb->flen = flen;
248
+ tb->tlen = tlen;
249
+ return Qnil;
250
+ }
251
+
252
+ static VALUE
253
+ frt_termbuffer_hash(VALUE self)
254
+ {
255
+ TermBuffer *tb;
256
+ Data_Get_Struct(self, TermBuffer, tb);
257
+ return INT2FIX(frt_hash(tb->text, tb->tlen) +
258
+ frt_hash(tb->field, tb->flen));
259
+ }
260
+
261
+ /****************************************************************************
262
+ *
263
+ * Init Function
264
+ *
265
+ ****************************************************************************/
266
+
267
+
268
+ void
269
+ Init_term_buffer(void) {
270
+ // IDs
271
+ field_name = rb_intern("name");
272
+
273
+ // TermBuffer
274
+ cTermBuffer = rb_define_class_under(mIndex, "TermBuffer", rb_cObject);
275
+ rb_define_alloc_func(cTermBuffer, frt_termbuffer_alloc);
276
+ rb_include_module(cTermBuffer, rb_mComparable);
277
+
278
+ // Methods
279
+ rb_define_method(cTermBuffer, "initialize", frt_termbuffer_init, 0);
280
+ rb_define_method(cTermBuffer, "initialize_copy", frt_termbuffer_init_copy, 1);
281
+ rb_define_method(cTermBuffer, "text", frt_termbuffer_get_text, 0);
282
+ rb_define_method(cTermBuffer, "field", frt_termbuffer_get_field_name, 0);
283
+ rb_define_method(cTermBuffer, "text_length", frt_termbuffer_get_text_length, 0);
284
+ rb_define_method(cTermBuffer, "<=>", frt_termbuffer_compare_to, 1);
285
+ rb_define_method(cTermBuffer, "<", frt_termbuffer_lt, 1);
286
+ rb_define_method(cTermBuffer, ">", frt_termbuffer_gt, 1);
287
+ rb_define_method(cTermBuffer, "<=", frt_termbuffer_le, 1);
288
+ rb_define_method(cTermBuffer, ">=", frt_termbuffer_ge, 1);
289
+ rb_define_method(cTermBuffer, "eql?", frt_termbuffer_eq, 1);
290
+ rb_define_method(cTermBuffer, "==", frt_termbuffer_eq, 1);
291
+ rb_define_method(cTermBuffer, "hash", frt_termbuffer_hash, 0);
292
+ rb_define_method(cTermBuffer, "read", frt_termbuffer_read, 2);
293
+ rb_define_method(cTermBuffer, "reset", frt_termbuffer_reset, 0);
294
+ rb_define_method(cTermBuffer, "to_term", frt_termbuffer_to_term, 0);
295
+ rb_define_method(cTermBuffer, "term", frt_termbuffer_to_term, 0);
296
+ rb_define_method(cTermBuffer, "term=", frt_termbuffer_set_term, 1);
297
+ rb_define_method(cTermBuffer, "set!", frt_termbuffer_init_copy, 1);
298
+ rb_define_method(cTermBuffer, "text_str", frt_termbuffer_get_text, 0);
299
+ }
@@ -0,0 +1,12 @@
1
/*
 * Hashes the +len+ bytes starting at +p+.
 *
 * Each byte is folded in as key = key * 65599 + byte, and the final key is
 * mixed with itself shifted right by five bits.
 *
 * The accumulation is done in unsigned arithmetic, where wrap-around is
 * well defined; the original signed accumulation overflowed (undefined
 * behaviour) for all but very short inputs. Results are the same as the
 * signed version produced on two's-complement machines where overflow
 * wrapped.
 *
 * p::   bytes to hash; not read when len <= 0
 * len:: number of bytes; zero or negative yields 0
 */
int
frt_hash(register char *p, register int len)
{
  register unsigned int key = 0;
  int folded;

  while (len-- > 0) {
    key = key*65599u + (unsigned int)*p;
    p++;
  }

  /* Shift the signed view of the key so the mixing step matches the
   * original signed shift, then add without risking signed overflow. */
  folded = (int)key;
  return (int)(key + (unsigned int)(folded>>5));
}
@@ -0,0 +1,41 @@
1
+ #--
2
+ # Copyright (c) 2005 David Balmain
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+ #++
23
+ # :include: ../TUTORIAL
24
module Ferret
  # Version of the Ferret library. Frozen so the shared string cannot be
  # modified in place by code that reads it.
  VERSION = '0.1.0'.freeze
end
27
+
28
+ require 'ferret/utils'
29
+ require 'ferret/document'
30
+ require 'ferret/stemmers'
31
+ require 'ferret/analysis'
32
+ require 'ferret/store'
33
+ require 'ferret/index'
34
+ require 'ferret/search'
35
+ require 'ferret/query_parser'
36
+
37
# Try to load the C extension. It is optional: when it is missing or fails
# to load, the Ruby implementations required above stay in place.
begin
  require 'ferret_ext'
rescue LoadError, StandardError
  # Deliberately ignored. Only load failures and ordinary errors are
  # swallowed; rescuing Exception would also hide interrupts, SystemExit
  # and out-of-memory conditions.
end
@@ -0,0 +1,11 @@
1
+ # Namespace for Ferret's text analysis code. Declared empty here; the requires that follow load the token, token stream, tokenizer, token filter, word list loader and analyzer files.
2
+ module Ferret::Analysis
3
+ end
4
+
5
+ require 'ferret/analysis/token'
6
+ require 'ferret/analysis/token_stream'
7
+ require 'ferret/analysis/tokenizers'
8
+ require 'ferret/analysis/standard_tokenizer'
9
+ require 'ferret/analysis/token_filters'
10
+ require 'ferret/analysis/word_list_loader'
11
+ require 'ferret/analysis/analyzers'
@@ -0,0 +1,93 @@
1
+ module Ferret::Analysis
2
# An Analyzer builds TokenStreams, which analyze text. It is therefore the
# policy that decides how index terms are extracted from text.
#
# A typical implementation starts with a Tokenizer that splits the input
# into raw Tokens and then wraps it in one or more TokenFilters.
#
# This base Analyzer simply returns a LowerCaseTokenizer over the text. See
# LowerCaseTokenizer for details.
class Analyzer
  # Builds the TokenStream that tokenizes +string+. Subclasses override
  # this to pick a strategy based on the field.
  #
  # field::  name of the field. Not used by this implementation.
  # string:: the string representing the text in the field
  def token_stream(field, string)
    LowerCaseTokenizer.new(string)
  end
end
21
+
22
# An Analyzer whose token stream is a plain WhiteSpaceTokenizer.
class WhiteSpaceAnalyzer < Analyzer
  # field::  name of the field. Not used.
  # string:: the text to tokenize
  def token_stream(field, string)
    WhiteSpaceTokenizer.new(string)
  end
end
28
+
29
# An Analyzer that runs a LowerCaseTokenizer through a StopFilter.
class StopAnalyzer < Analyzer

  # Common English words that are not usually useful for searching. Used as
  # the default stop word list.
  ENGLISH_STOP_WORDS = %w[
    a an and are as at be but by
    for if in into is it
    no not of on or s such
    t that the their then there these
    they this to was will with
  ]

  # Builds an analyzer which removes the words in the given array.
  #
  # stop_words:: array of words to remove; defaults to ENGLISH_STOP_WORDS
  def initialize(stop_words = ENGLISH_STOP_WORDS)
    @stop_words = stop_words
  end

  # Returns a StopFilter, configured with this analyzer's stop words, over
  # a LowerCaseTokenizer for +string+.
  #
  # field::  name of the field. Not used.
  # string:: the text to tokenize
  def token_stream(field, string)
    lowercased = LowerCaseTokenizer.new(string)
    StopFilter.new(lowercased, @stop_words)
  end
end
52
+
53
# An Analyzer that passes StandardTokenizer output through a LowerCaseFilter
# and then a StopFilter. The stop words are the ones given to the inherited
# StopAnalyzer constructor.
class StandardAnalyzer < StopAnalyzer
  # field::  name of the field. Not used.
  # string:: the text to tokenize
  def token_stream(field, string)
    tokens = StandardTokenizer.new(string)
    lowercased = LowerCaseFilter.new(tokens)
    StopFilter.new(lowercased, @stop_words)
  end
end
59
+
60
+
61
# An Analyzer for indexes whose fields need different analysis. Register a
# field-specific analyzer with #add_analyzer; every other field is handled
# by the default analyzer given to the constructor.
# See tc_per_field_analyzer_wrapper for example usage.
class PerFieldAnalyzerWrapper < Analyzer

  # Creates the wrapper around a default analyzer.
  #
  # default_analyzer:: analyzer used for any field that has no analyzer of
  #                    its own registered
  def initialize(default_analyzer)
    @default_analyzer = default_analyzer
    @analyzers = {}
  end

  # Registers the analyzer to use for one field.
  #
  # field::    field name requiring a non-default analyzer
  # analyzer:: analyzer to use for that field
  def add_analyzer(field, analyzer)
    @analyzers[field] = analyzer
  end

  # Delegates to the analyzer registered for +field+, or to the default
  # analyzer when none is registered.
  #
  # field::  name of the field, used to pick the analyzer
  # string:: the text to tokenize
  def token_stream(field, string)
    chosen = @analyzers[field]
    chosen = @default_analyzer if chosen == nil
    chosen.token_stream(field, string)
  end
end
93
+ end