ferret 0.9.6 → 0.10.0

Files changed (295)
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
data/ext/q_wildcard.c CHANGED
@@ -7,152 +7,165 @@
  *
 ****************************************************************************/
 
-char *wcq_to_s(Query *self, char *field)
-{
-  char *buffer, *bptr;
-  Term *term = (Term *)self->data;
-  size_t tlen = strlen(term->text);
-  size_t flen = strlen(term->field);
-  bptr = buffer = ALLOC_N(char, tlen + flen + 35);
-
-  if (strcmp(term->field, field) != 0) {
-    sprintf(bptr, "%s:", term->field);
-    bptr += strlen(term->field) + 1;
-  }
-  sprintf(bptr, "%s", term->text);
-  bptr = buffer + strlen(buffer);
-  if (self->boost != 1.0) {
-    *bptr = '^';
-    dbl_to_s(++bptr, self->boost);
-  }
-
-  return buffer;
-}
+#define WCQ(query) ((WildCardQuery *)(query))
 
-bool wc_match(char *pattern, char *text)
+static char *wcq_to_s(Query *self, const char *current_field)
 {
-  char *p = pattern, *t = text, *xt;
+    char *buffer, *bptr;
+    const char *field = WCQ(self)->field;
+    const char *pattern = WCQ(self)->pattern;
+    size_t flen = strlen(field);
+    size_t plen = strlen(pattern);
+    bptr = buffer = ALLOC_N(char, plen + flen + 35);
+
+    if (strcmp(field, current_field) != 0) {
+        sprintf(bptr, "%s:", field);
+        bptr += flen + 1;
+    }
+    sprintf(bptr, "%s", pattern);
+    bptr += plen;
 
-  /* include '\0' as we need to match empty string */
-  char *text_last = t + strlen(t);
+    if (self->boost != 1.0) {
+        *bptr = '^';
+        dbl_to_s(++bptr, self->boost);
+    }
 
-  for (;; p++, t++) {
+    return buffer;
+}
 
-    /* end of text so make sure end of pattern doesn't matter */
-    if (*t == '\0') {
-      while (*p) {
-        if (*p != WILD_STRING) return false;
-        p++;
-      }
-      return true;
-    }
+bool wc_match(const char *pattern, const char *text)
+{
+    const char *p = pattern, *t = text, *xt;
+
+    /* include '\0' as we need to match empty string */
+    const char *text_last = t + strlen(t);
+
+    for (;; p++, t++) {
+
+        /* end of text so make sure end of pattern doesn't matter */
+        if (*t == '\0') {
+            while (*p) {
+                if (*p != WILD_STRING) {
+                    return false;
+                }
+                p++;
+            }
+            return true;
+        }
 
-    /* If we've gone past the end of the pattern, return false. */
-    if (*p == '\0') return false;
+        /* If we've gone past the end of the pattern, return false. */
+        if (*p == '\0') {
+            return false;
+        }
 
-    /* Match a single character, so continue. */
-    if (*p == WILD_CHAR) continue;
+        /* Match a single character, so continue. */
+        if (*p == WILD_CHAR) {
+            continue;
+        }
 
-    if (*p == WILD_STRING) {
-      // Look at the character beyond the '*'.
-      p++;
-      // Examine the string, starting at the last character.
-      for (xt = text_last; xt >= t; xt--) {
-        if (wc_match(p, xt)) return true;
-      }
-      return false;
+        if (*p == WILD_STRING) {
+            /* Look at the character beyond the '*'. */
+            p++;
+            /* Examine the string, starting at the last character. */
+            for (xt = text_last; xt >= t; xt--) {
+                if (wc_match(p, xt)) return true;
+            }
+            return false;
+        }
+        if (*p != *t) {
+            return false;
+        }
     }
-    if (*p != *t)
-      return false;
-  }
 
-  return false;
+    return false;
 }
 
-Query *wcq_rewrite(Query *self, IndexReader *ir)
+static Query *wcq_rewrite(Query *self, IndexReader *ir)
 {
-  Query *q;
-  Query *tq;
-
-  Term *term = (Term *)self->data;
-  char *text = term->text;
-  char *field = term->field;
-  char *first_star = strchr(text, WILD_STRING);
-  char *first_ques = strchr(text, WILD_CHAR);
-
-  if (first_star == NULL && first_ques == NULL) {
-    q = tq_create(term_clone(term));
-  } else {
-    TermEnum *te;
-    Term prefix_term;
-    char *prefix = NULL;
-
-    char *pattern = (first_ques && (!first_star || (first_star > first_ques)))
-      ? first_ques : first_star;
-
-    int prefix_len = (int)(pattern - text);
-
-    prefix_term.field = field;
-    prefix_term.text = (char *)EMPTY_STRING;
-    if (prefix_len > 0) {
-      prefix = ALLOC_N(char, prefix_len + 1);
-      strncpy(prefix, text, prefix_len);
-      prefix_term.text = prefix;
-      prefix_term.text[prefix_len] = '\0';
+    Query *q;
+    const char *field = WCQ(self)->field;
+    const char *pattern = WCQ(self)->pattern;
+    const char *first_star = strchr(pattern, WILD_STRING);
+    const char *first_ques = strchr(pattern, WILD_CHAR);
+
+    if (NULL == first_star && NULL == first_ques) {
+        q = tq_new(field, pattern);
+        q->boost = self->boost;
     }
-    te = ir->terms_from(ir, &prefix_term);
-
-    q = bq_create(true);
-    if (te) {
-      TermBuffer *tb = te->tb_curr;
-      do {
-        if (strcmp(tb->field, field) != 0 ||
-            (prefix && strncmp(tb->text, prefix, prefix_len) != 0))
-          break;
-
-        if (wc_match(pattern, tb->text + prefix_len)) {
-          tq = tq_create(term_create(tb->field, tb->text)); /* found match */
-          tq->boost = self->boost;                          /* set boost */
-          bq_add_query(q, tq, BC_SHOULD);                   /* add query */
+    else {
+        const int field_num = fis_get_field_num(ir->fis, field);
+        q = multi_tq_new_conf(field, MTQMaxTerms(self), 0.0);
+
+        if (field_num >= 0) {
+            TermEnum *te;
+            char prefix[MAX_WORD_SIZE] = "";
+            int prefix_len;
+
+            pattern = (first_ques && (!first_star || first_star > first_ques))
+                      ? first_ques : first_star;
+
+            prefix_len = (int)(pattern - WCQ(self)->pattern);
+
+            if (prefix_len > 0) {
+                memcpy(prefix, WCQ(self)->pattern, prefix_len);
+                prefix[prefix_len] = '\0';
+            }
+
+            te = ir->terms_from(ir, field_num, prefix);
+
+            if (te != NULL) {
+                const char *term = te->curr_term;
+                const char *pat_term = term + prefix_len;
+                do {
+                    if (prefix && strncmp(term, prefix, prefix_len) != 0) {
+                        break;
+                    }
+
+                    if (wc_match(pattern, pat_term)) {
+                        multi_tq_add_term(q, term);
+                    }
+                } while (te->next(te) != NULL);
+                te->close(te);
+            }
         }
-      } while ((tb = te->next(te)) != NULL);
-      te->close(te);
     }
-    free(prefix);
-  }
 
-  return q;
+    return q;
 }
 
 static void wcq_destroy(Query *self)
 {
-  if (self->destroy_all) term_destroy((Term *)self->data);
-  q_destroy_i(self);
+    free(WCQ(self)->field);
+    free(WCQ(self)->pattern);
+    q_destroy_i(self);
 }
 
-static uint wcq_hash(Query *self)
+static ulong wcq_hash(Query *self)
 {
-  return term_hash((Term *)self->data);
+    return str_hash(WCQ(self)->field) ^ str_hash(WCQ(self)->pattern);
 }
 
 static int wcq_eq(Query *self, Query *o)
 {
-  return term_eq((Term *)self->data, (Term *)o->data);
+    return (strcmp(WCQ(self)->pattern, WCQ(o)->pattern) == 0)
+        && (strcmp(WCQ(self)->field, WCQ(o)->field) == 0);
 }
 
-Query *wcq_create(Term *term)
+Query *wcq_new(const char *field, const char *pattern)
 {
-  Query *self = q_create();
+    Query *self = q_new(WildCardQuery);
 
-  self->data = term;
+    WCQ(self)->field = estrdup(field);
+    WCQ(self)->pattern = estrdup(pattern);
+    MTQMaxTerms(self) = WILD_CARD_QUERY_MAX_TERMS;
 
-  self->type = WILD_CARD_QUERY;
-  self->rewrite = &wcq_rewrite;
-  self->to_s = &wcq_to_s;
-  self->hash = &wcq_hash;
-  self->eq = &wcq_eq;
-  self->destroy_i = &wcq_destroy;
-  self->create_weight_i = &q_create_weight_unsup;
+    self->type = WILD_CARD_QUERY;
+    self->rewrite = &wcq_rewrite;
+    self->to_s = &wcq_to_s;
+    self->hash = &wcq_hash;
+    self->eq = &wcq_eq;
+    self->destroy_i = &wcq_destroy;
+    self->create_weight_i = &q_create_weight_unsup;
 
-  return self;
+    return self;
 }
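
The new wc_match above defines the wildcard semantics for WildCardQuery: WILD_STRING ('*') matches any run of characters, WILD_CHAR ('?') matches exactly one, and the '*' case recurses over every suffix of the remaining text, longest first. For illustration only, the same algorithm as a minimal Ruby sketch (the gem itself uses the C function above; the method name here is merely mirrored from it):

    # Sketch of wc_match: '*' matches any run, '?' any single character.
    def wc_match(pattern, text)
      # end of text: the rest of the pattern must be all '*'s
      return pattern.delete('*').empty? if text.empty?
      # pattern exhausted before the text
      return false if pattern.empty?

      case pattern[0, 1]
      when '*'
        rest = pattern[1..-1]
        # try every suffix of the text, longest first (including "")
        text.length.downto(0) do |i|
          return true if wc_match(rest, text[i..-1])
        end
        false
      when '?'
        wc_match(pattern[1..-1], text[1..-1]) # consume any one character
      else
        pattern[0, 1] == text[0, 1] && wc_match(pattern[1..-1], text[1..-1])
      end
    end

    wc_match("d?v*.rb", "david.rb") #=> true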
data/ext/r_analysis.c CHANGED
@@ -1,7 +1,10 @@
 #include <regex.h>
+#include <locale.h>
+#include <st.h>
 #include "ferret.h"
 #include "analysis.h"
-#include "locale.h"
+
+static VALUE mAnalysis;
 
 static VALUE cToken;
 static VALUE cAsciiLetterTokenizer;
@@ -27,7 +30,6 @@ static VALUE cStandardAnalyzer
 static VALUE cPerFieldAnalyzer;
 static VALUE cRegExpAnalyzer;
 
-//static VALUE cRegexAnalyzer;
 static VALUE cTokenStream;
 
 /* TokenStream Methods */
@@ -40,9 +42,16 @@ static ID id_token_stream
 
 static VALUE object_space;
 
-extern TokenStream *ts_create();
-extern int ruby_re_search(struct re_pattern_buffer *, const char *, int, int, int,
-                          struct re_registers *);
+extern int ruby_re_search(struct re_pattern_buffer *, const char *, int, int,
+                          int, struct re_registers *);
+
+/*
+static int
+frt_rb_hash_size(VALUE hash)
+{
+    return RHASH(hash)->tbl->num_entries;
+}
+*/
 
 /****************************************************************************
  *
@@ -53,18 +62,18 @@ extern int ruby_re_search(struct re_pattern_buffer *, const char *, int, int, in
 static char **
 get_stopwords(VALUE rstop_words)
 {
-  char **stop_words;
-  int i, len;
-  VALUE rstr;
-  Check_Type(rstop_words, T_ARRAY);
-  len = RARRAY(rstop_words)->len;
-  stop_words = ALLOC_N(char *, RARRAY(rstop_words)->len + 1);
-  stop_words[len] = NULL;
-  for (i = 0; i < len; i++) {
-    rstr = rb_obj_as_string(RARRAY(rstop_words)->ptr[i]);
-    stop_words[i] = RSTRING(rstr)->ptr;
-  }
-  return stop_words;
+    char **stop_words;
+    int i, len;
+    VALUE rstr;
+    Check_Type(rstop_words, T_ARRAY);
+    len = RARRAY(rstop_words)->len;
+    stop_words = ALLOC_N(char *, RARRAY(rstop_words)->len + 1);
+    stop_words[len] = NULL;
+    for (i = 0; i < len; i++) {
+        rstr = rb_obj_as_string(RARRAY(rstop_words)->ptr[i]);
+        stop_words[i] = RSTRING(rstr)->ptr;
+    }
+    return stop_words;
 }
 
 /****************************************************************************
@@ -74,140 +83,295 @@ get_stopwords(VALUE rstop_words)
 ****************************************************************************/
 
 typedef struct RToken {
-  VALUE text;
-  int start;
-  int end;
-  int pos_inc;
+    VALUE text;
+    int start;
+    int end;
+    int pos_inc;
 } RToken;
 
 static void
 frt_token_free(void *p)
 {
-  free(p);
+    free(p);
 }
-
+
 static void
 frt_token_mark(void *p)
 {
-  RToken *token = (RToken *)p;
-  rb_gc_mark(token->text);
+    RToken *token = (RToken *)p;
+    rb_gc_mark(token->text);
 }
 
 static VALUE
 frt_token_alloc(VALUE klass)
 {
-  return Data_Wrap_Struct(klass, &frt_token_mark, &frt_token_free, ALLOC(RToken));
+    return Data_Wrap_Struct(klass, &frt_token_mark, &frt_token_free,
+                            ALLOC(RToken));
 }
 
 static VALUE
 get_token(Token *tk)
 {
-  RToken *token = ALLOC(RToken);
+    RToken *token = ALLOC(RToken);
 
-  token->text = rb_str_new2(tk->text);
-  token->start = tk->start;
-  token->end = tk->end;
-  token->pos_inc = tk->pos_inc;
-  return Data_Wrap_Struct(cToken, &frt_token_mark, &frt_token_free, token);
+    token->text = rb_str_new2(tk->text);
+    token->start = tk->start;
+    token->end = tk->end;
+    token->pos_inc = tk->pos_inc;
+    return Data_Wrap_Struct(cToken, &frt_token_mark, &frt_token_free, token);
 }
 
 Token *
 frt_set_token(Token *tk, VALUE rt)
 {
-  RToken *rtk;
+    RToken *rtk;
 
-  if (rt == Qnil) return NULL;
+    if (rt == Qnil) return NULL;
 
-  Data_Get_Struct(rt, RToken, rtk);
-  tk_set(tk, RSTRING(rtk->text)->ptr, RSTRING(rtk->text)->len,
-         rtk->start, rtk->end, rtk->pos_inc);
-  return tk;
+    Data_Get_Struct(rt, RToken, rtk);
+    tk_set(tk, RSTRING(rtk->text)->ptr, RSTRING(rtk->text)->len,
+           rtk->start, rtk->end, rtk->pos_inc);
+    return tk;
 }
 
-#define GET_TK RToken *token = (RToken *)DATA_PTR(self)
+#define GET_TK(tk, self) Data_Get_Struct(self, RToken, tk)
+
+/*
+ *  call-seq:
+ *     Token.new(text, start, end, pos_inc = 1) -> new Token
+ *
+ *  Creates a new token setting the text, start and end offsets of the token
+ *  and the position increment for the token.
+ *
+ *  The position increment is usually set to 1 but you can set it to other
+ *  values as needed. For example, if you have a stop word filter you will be
+ *  skipping tokens. Let's say you have the stop words "the" and "and" and you
+ *  parse the title "The Old Man and the Sea". The terms "Old", "Man" and
+ *  "Sea" will have the position increments 2, 1 and 3 respectively.
+ *
+ *  Another reason you might want to vary the position increment is if you are
+ *  adding synonyms to the index. For example let's say you have the synonym
+ *  group "quick", "fast" and "speedy". When tokenizing the phrase "Next day
+ *  speedy delivery", you'll add "speedy" first with a position increment of 1
+ *  and then "fast" and "quick" with position increments of 0 since they are
+ *  represented in the same position.
+ *
+ *  The +start+ and +end+ values should be byte offsets, not character
+ *  offsets. This makes it easy to use those offsets to quickly access the
+ *  token in the input string and also to insert highlighting tags when
+ *  necessary.
+ *
+ *  text::     the main text for the token.
+ *  start::    the start offset of the token in bytes.
+ *  end::      the end offset of the token in bytes.
+ *  pos_inc::  the position increment of a token. See above.
+ *  return::   a newly created and assigned Token object
+ */
 static VALUE
 frt_token_init(int argc, VALUE *argv, VALUE self)
 {
-  GET_TK;
-  VALUE rtext, rstart, rend, rpos_inc, rtype;
-  token->pos_inc = 1;
-  switch (rb_scan_args(argc, argv, "32", &rtext, &rstart, &rend, &rpos_inc, &rtype)) {
-    case 5: /* type gets ignored at this stage */
-    case 4: token->pos_inc = FIX2INT(rpos_inc);
-  }
-  token->text = rb_obj_as_string(rtext);
-  token->start = FIX2INT(rstart);
-  token->end = FIX2INT(rend);
-  return self;
+    RToken *token;
+    VALUE rtext, rstart, rend, rpos_inc, rtype;
+    GET_TK(token, self);
+    token->pos_inc = 1;
+    switch (rb_scan_args(argc, argv, "32", &rtext, &rstart,
+                         &rend, &rpos_inc, &rtype)) {
+        case 5: /* type gets ignored at this stage */
+        case 4: token->pos_inc = FIX2INT(rpos_inc);
+    }
+    token->text = rb_obj_as_string(rtext);
+    token->start = FIX2INT(rstart);
+    token->end = FIX2INT(rend);
+    return self;
 }
 
+/*
+ *  call-seq:
+ *     token.cmp(other_token) -> bool
+ *
+ *  Used to compare two tokens. Token is extended by Comparable so you can
+ *  also use +<+, +>+, +<=+, +>=+ etc. to compare tokens.
+ *
+ *  Tokens are sorted by the position in the text at which they occur, i.e.
+ *  the start offset. If two tokens have the same start offset (see
+ *  pos_inc=), they are sorted by the end offset and then lexically by the
+ *  token text.
+ */
 static VALUE
 frt_token_cmp(VALUE self, VALUE rother)
 {
-  RToken *other;
-  int cmp;
-  GET_TK;
-  Data_Get_Struct(rother, RToken, other);
-  if (token->start > other->start) {
-    cmp = 1;
-  } else if (token->start < other->start) {
-    cmp = -1;
-  } else {
-    if (token->end > other->end) {
-      cmp = 1;
-    } else if (token->end < other->end) {
-      cmp = -1;
+    RToken *token, *other;
+    int cmp;
+    GET_TK(token, self);
+    GET_TK(other, rother);
+    if (token->start > other->start) {
+        cmp = 1;
+    } else if (token->start < other->start) {
+        cmp = -1;
     } else {
-      cmp = strcmp(RSTRING(token->text)->ptr, RSTRING(other->text)->ptr);
+        if (token->end > other->end) {
+            cmp = 1;
+        } else if (token->end < other->end) {
+            cmp = -1;
+        } else {
+            cmp = strcmp(RSTRING(token->text)->ptr, RSTRING(other->text)->ptr);
+        }
     }
-  }
-  return INT2FIX(cmp);
+    return INT2FIX(cmp);
 }
 
+/*
+ *  call-seq:
+ *     token.text -> text
+ *
+ *  Returns the text that this token represents
+ */
 static VALUE
 frt_token_get_text(VALUE self)
 {
-  GET_TK;
-  return token->text;
+    RToken *token;
+    GET_TK(token, self);
+    return token->text;
 }
 
+/*
+ *  call-seq:
+ *     token.text = text -> text
+ *
+ *  Set the text for this token.
+ */
 static VALUE
 frt_token_set_text(VALUE self, VALUE rtext)
 {
-  GET_TK;
-  token->text = rtext;
-  return rtext;
+    RToken *token;
+    GET_TK(token, self);
+    token->text = rtext;
+    return rtext;
 }
 
+/*
+ *  call-seq:
+ *     token.start -> integer
+ *
+ *  Start byte-position of this token
+ */
 static VALUE
 frt_token_get_start_offset(VALUE self)
 {
-  GET_TK;
-  return INT2FIX(token->start);
+    RToken *token;
+    GET_TK(token, self);
+    return INT2FIX(token->start);
 }
 
+/*
+ *  call-seq:
+ *     token.end -> integer
+ *
+ *  End byte-position of this token
+ */
 static VALUE
 frt_token_get_end_offset(VALUE self)
 {
-  GET_TK;
-  return INT2FIX(token->end);
+    RToken *token;
+    GET_TK(token, self);
+    return INT2FIX(token->end);
 }
 
+/*
+ *  call-seq:
+ *     token.pos_inc -> integer
+ *
+ *  Position Increment for this token
+ */
 static VALUE
 frt_token_get_pos_inc(VALUE self)
 {
-  GET_TK;
-  return INT2FIX(token->pos_inc);
+    RToken *token;
+    GET_TK(token, self);
+    return INT2FIX(token->pos_inc);
 }
 
+/*
+ *  call-seq:
+ *     token.start = start -> integer
+ *
+ *  Set start byte-position of this token
+ */
+static VALUE
+frt_token_set_start_offset(VALUE self, VALUE rstart)
+{
+    RToken *token;
+    GET_TK(token, self);
+    token->start = FIX2INT(rstart);
+    return rstart;
+}
+
+/*
+ *  call-seq:
+ *     token.end = end -> integer
+ *
+ *  Set end byte-position of this token
+ */
+static VALUE
+frt_token_set_end_offset(VALUE self, VALUE rend)
+{
+    RToken *token;
+    GET_TK(token, self);
+    token->end = FIX2INT(rend);
+    return rend;
+}
+
+/*
+ *  call-seq:
+ *     token.pos_inc = pos_inc -> integer
+ *
+ *  Set the position increment. This determines the position of this token
+ *  relative to the previous Token in a TokenStream, used in phrase
+ *  searching.
+ *
+ *  The default value is 1.
+ *
+ *  Some common uses for this are:
+ *
+ *  * Set it to zero to put multiple terms in the same position. This is
+ *    useful if, e.g., a word has multiple stems. Searches for phrases
+ *    including either stem will match. In this case, all but the first
+ *    stem's increment should be set to zero: the increment of the first
+ *    instance should be one. Repeating a token with an increment of zero
+ *    can also be used to boost the scores of matches on that token.
+ *
+ *  * Set it to values greater than one to inhibit exact phrase matches.
+ *    If, for example, one does not want phrases to match across removed
+ *    stop words, then one could build a stop word filter that removes stop
+ *    words and also sets the increment to the number of stop words removed
+ *    before each non-stop word. Then exact phrase queries will only match
+ *    when the terms occur with no intervening stop words.
+ *
+ */
+static VALUE
+frt_token_set_pos_inc(VALUE self, VALUE rpos_inc)
+{
+    RToken *token;
+    GET_TK(token, self);
+    token->pos_inc = FIX2INT(rpos_inc);
+    return rpos_inc;
+}
+
+/*
+ *  call-seq:
+ *     token.to_s -> token_str
+ *
+ *  Return a string representation of the token
+ */
 static VALUE
 frt_token_to_s(VALUE self)
 {
-  GET_TK;
-  char *buf = alloca(RSTRING(token->text)->len + 80);
-  sprintf(buf, "token[\"%s\":%d:%d:%d]", RSTRING(token->text)->ptr, token->start,
-          token->end, token->pos_inc);
-  return rb_str_new2(buf);
+    RToken *token;
+    GET_TK(token, self);
+    char *buf = alloca(RSTRING(token->text)->len + 80);
+    sprintf(buf, "token[\"%s\":%d:%d:%d]", RSTRING(token->text)->ptr, token->start,
+            token->end, token->pos_inc);
+    return rb_str_new2(buf);
 }
 
 /****************************************************************************
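
The position-increment convention in the Token documentation above can be made concrete with the very example it uses. Assuming the class is exposed as Ferret::Analysis::Token (the registrations at the top of this file suggest as much), a stop filter dropping "the" and "and" from "The Old Man and the Sea" would emit tokens like these, with byte offsets into the original string:

    include Ferret::Analysis

    # "The Old Man and the Sea"
    #  0   4   8   12  16  20
    Token.new("Old", 4, 7, 2)   # increment 2: skips "The"
    Token.new("Man", 8, 11)     # increment defaults to 1
    Token.new("Sea", 20, 23, 3) # increment 3: skips "and the"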
@@ -216,143 +380,210 @@ frt_token_to_s(VALUE self)
  *
 ****************************************************************************/
 
+#define GET_TS(ts, self) Data_Get_Struct(self, TokenStream, ts)
+
 static void
 frt_ts_mark(void *p)
 {
-  TokenStream *ts = (TokenStream *)p;
-  if (ts->text) frt_gc_mark(&ts->text);
-  if (ts->sub_ts) frt_gc_mark(&ts->sub_ts);
+    TokenStream *ts = (TokenStream *)p;
+    if (ts->text) frt_gc_mark(&ts->text);
 }
 
 static void
 frt_ts_free(TokenStream *ts)
 {
-  if (object_get(&ts->text) != Qnil) object_del(&ts->text);
-  if (ts->sub_ts && (object_get(&ts->sub_ts) != Qnil)) object_del(&ts->sub_ts);
-  object_del(ts);
-  ts_deref(ts);
+    if (object_get(&ts->text) != Qnil) {
+        object_del(&ts->text);
+    }
+    object_del(ts);
+    ts_deref(ts);
 }
 
+static void frt_rets_free(TokenStream *ts);
+static void frt_rets_mark(TokenStream *ts);
+static Token *rets_next(TokenStream *ts);
+
 static VALUE
-get_token_stream(TokenStream *ts)
-{
-  VALUE rts = object_get(ts);
-  if (rts == Qnil) {
-    rts = Data_Wrap_Struct(cTokenStream, &frt_ts_mark, &frt_ts_free, ts);
-    object_add(ts, rts);
-  }
-  return rts;
+get_rb_token_stream(TokenStream *ts)
+{
+    VALUE rts = object_get(ts);
+    if (rts == Qnil) {
+        if (ts->next == &rets_next) {
+            rts = Data_Wrap_Struct(cTokenStream, &frt_rets_mark,
+                                   &frt_rets_free, ts);
+        } else {
+            rts = Data_Wrap_Struct(cTokenStream, &frt_ts_mark,
+                                   &frt_ts_free, ts);
+        }
+        object_add(ts, rts);
+    }
+    return rts;
 }
 
 static inline VALUE
 get_wrapped_ts(VALUE self, VALUE rstr, TokenStream *ts)
 {
-  rstr = rb_obj_as_string(rstr);
-  ts->reset(ts, RSTRING(rstr)->ptr);
-  Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
-  object_add(&ts->text, rstr);
-  object_add(ts, self);
-  return self;
+    StringValue(rstr);
+    ts->reset(ts, RSTRING(rstr)->ptr);
+    Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
+    object_add(&ts->text, rstr);
+    object_add(ts, self);
+    return self;
 }
 
+/*
+ *  call-seq:
+ *     token_stream.text = text -> text
+ *
+ *  Set the text attribute of the TokenStream to the text you wish to be
+ *  tokenized. For example, you may do this:
+ *
+ *      token_stream.text = File.read(file_name)
+ */
 static VALUE
 frt_ts_set_text(VALUE self, VALUE rtext)
 {
-  TokenStream *ts;
-  Data_Get_Struct(self, TokenStream, ts);
-  rtext = rb_obj_as_string(rtext);
-  ts->reset(ts, RSTRING(rtext)->ptr);
-  object_set(&ts->text, rtext);
+    TokenStream *ts;
+    Data_Get_Struct(self, TokenStream, ts);
+    StringValue(rtext);
+    ts->reset(ts, RSTRING(rtext)->ptr);
+    object_set(&ts->text, rtext);
 
-  return rtext;
+    return rtext;
 }
 
+/*
+ *  call-seq:
+ *     token_stream.text -> text
+ *
+ *  Return the text that the TokenStream is tokenizing
+ */
 static VALUE
 frt_ts_get_text(VALUE self)
 {
-  VALUE rtext = Qnil;
-  TokenStream *ts;
-  Data_Get_Struct(self, TokenStream, ts);
-  if (ts->text) {
-    if ((rtext = object_get(&ts->text)) == Qnil) {
-      rtext = rb_str_new2(ts->text);
-      object_set(&ts->text, rtext);
-    }
-  }
-  return rtext;
+    VALUE rtext = Qnil;
+    TokenStream *ts;
+    Data_Get_Struct(self, TokenStream, ts);
+    if (ts->text) {
+        if ((rtext = object_get(&ts->text)) == Qnil) {
+            rtext = rb_str_new2(ts->text);
+            object_set(&ts->text, rtext);
+        }
+    }
+    return rtext;
 }
 
+/*
+ *  call-seq:
+ *     token_stream.next -> token
+ *
+ *  Return the next token from the TokenStream or nil if there are no more
+ *  tokens.
+ */
 static VALUE
 frt_ts_next(VALUE self)
 {
-  TokenStream *ts = (TokenStream *)DATA_PTR(self);
-  Token *next = ts->next(ts);
-  if (next == NULL) {
-    return Qnil;
-  }
+    TokenStream *ts;
+    GET_TS(ts, self);
+    Token *next = ts->next(ts);
+    if (next == NULL) {
+        return Qnil;
+    }
 
-  return get_token(next);
+    return get_token(next);
 }
 
+/****************************************************************************
+ * TokenFilter
+ ****************************************************************************/
+
+#define TkFilt(filter) ((TokenFilter *)(filter))
+
+static void
+frt_tf_mark(void *p)
+{
+    TokenStream *ts = (TokenStream *)p;
+    if (TkFilt(ts)->sub_ts) {
+        frt_gc_mark(&TkFilt(ts)->sub_ts);
+    }
+}
+
+static void
+frt_tf_free(TokenStream *ts)
+{
+    if (TkFilt(ts)->sub_ts && (object_get(&TkFilt(ts)->sub_ts) != Qnil)) {
+        object_del(&TkFilt(ts)->sub_ts);
+    }
+    object_del(ts);
+    ts_deref(ts);
+}
+
+
 /****************************************************************************
  * CWrappedTokenStream
 ****************************************************************************/
 
+#define CachedTS(token_stream) ((CachedTokenStream *)(token_stream))
+#define CWTS(token_stream) ((CWrappedTokenStream *)(token_stream))
+
+typedef struct CWrappedTokenStream {
+    CachedTokenStream super;
+    VALUE rts;
+} CWrappedTokenStream;
+
 static void
-cwrts_destroy(TokenStream *ts)
+cwrts_destroy_i(TokenStream *ts)
 {
-  rb_hash_delete(object_space, LONG2NUM((long)ts->data));
-  free(ts->token);
-  free(ts);
+    rb_hash_delete(object_space, LONG2NUM(CWTS(ts)->rts));
+    /*printf("rb_hash_size = %d\n", frt_rb_hash_size(object_space)); */
+    free(ts);
 }
 
 static Token *
 cwrts_next(TokenStream *ts)
 {
-  VALUE rts = (VALUE)ts->data;
-  VALUE rtoken = rb_funcall(rts, id_next, 0);
-  return frt_set_token(ts->token, rtoken);
+    VALUE rtoken = rb_funcall(CWTS(ts)->rts, id_next, 0);
+    return frt_set_token(&(CachedTS(ts)->token), rtoken);
 }
 
-static void
+static TokenStream *
 cwrts_reset(TokenStream *ts, char *text)
 {
-  VALUE rts = (VALUE)ts->data;
-  ts->t = ts->text = text;
-  rb_funcall(rts, id_reset, 1, rb_str_new2(text));
+    ts->t = ts->text = text;
+    rb_funcall(CWTS(ts)->rts, id_reset, 1, rb_str_new2(text));
+    return ts;
 }
 
-static void
-cwrts_clone_i(TokenStream *orig_ts, TokenStream *new_ts)
+static TokenStream *
+cwrts_clone_i(TokenStream *orig_ts)
 {
-  VALUE rorig_ts = (VALUE)orig_ts->data;
-  new_ts->data = (void *)rb_funcall(rorig_ts, id_clone, 0);
+    TokenStream *new_ts = ts_clone_size(orig_ts, sizeof(CWrappedTokenStream));
+    CWTS(new_ts)->rts = rb_funcall(CWTS(orig_ts)->rts, id_clone, 0);
+    return new_ts;
 }
 
 static TokenStream *
 frt_get_cwrapped_rts(VALUE rts)
 {
-  TokenStream *ts;
-  switch (TYPE(rts)) {
-    case T_DATA:
-      Data_Get_Struct(rts, TokenStream, ts);
-      ref(ts);
-      break;
-    default:
-      ts = ALLOC(TokenStream);
-      ts->token = ALLOC(Token);
-      ts->data = (void *)rts;
-      ts->next = &cwrts_next;
-      ts->reset = &cwrts_reset;
-      ts->clone_i = &cwrts_clone_i;
-      ts->destroy = &cwrts_destroy;
-      ts->sub_ts = NULL;
-      // prevent from being garbage collected
-      rb_hash_aset(object_space, LONG2NUM(rts), rts);
-      ts->ref_cnt = 1;
-      break;
-  }
-  return ts;
+    TokenStream *ts;
+    switch (TYPE(rts)) {
+        case T_DATA:
+            GET_TS(ts, rts);
+            REF(ts);
+            break;
+        default:
+            ts = ts_new(CWrappedTokenStream);
+            CWTS(ts)->rts = rts;
+            ts->next = &cwrts_next;
+            ts->reset = &cwrts_reset;
+            ts->clone_i = &cwrts_clone_i;
+            ts->destroy_i = &cwrts_destroy_i;
+            /* prevent from being garbage collected */
+            rb_hash_aset(object_space, LONG2NUM(rts), rts);
+            ts->ref_cnt = 1;
+            break;
+    }
+    return ts;
 }
 
 /****************************************************************************
@@ -364,165 +595,181 @@ frt_get_cwrapped_rts(VALUE rts)
 #define ALPHA "[-_[:alpha:]]"
 #define ALNUM "[-_[:alnum:]]"
 
-static char *token_re =
-  ALPHA "+(('" ALPHA "+)+|\\.(" ALPHA "\\.)+|"
-  "(@|\\&)\\w+([-.]\\w+)*|:\\/\\/" ALNUM "+([-.\\/]" ALNUM "+)*)?"
-  "|\\w+(([-._]\\w+)*\\@\\w+([-.]\\w+)+"
+#define RETS(token_stream) ((RegExpTokenStream *)(token_stream))
+
+static const char *TOKEN_RE =
+    ALPHA "+(('" ALPHA "+)+|\\.(" ALPHA "\\.)+|"
+    "(@|\\&)\\w+([-.]\\w+)*|:\\/\\/" ALNUM "+([-.\\/]" ALNUM "+)*)?"
+    "|\\w+(([-._]\\w+)*\\@\\w+([-.]\\w+)+"
     "|" P HASDIGIT "(" P "\\w+" P HASDIGIT ")*(" P "\\w+)?"
     "|(\\.\\w+)+"
    "|"
-  ")";
+    ")";
 static VALUE rtoken_re;
 
 typedef struct RegExpTokenStream {
-  VALUE rtext;
-  VALUE regex;
-  VALUE proc;
-  int curr_ind;
+    CachedTokenStream super;
+    VALUE rtext;
+    VALUE regex;
+    VALUE proc;
+    int curr_ind;
 } RegExpTokenStream;
 
 static void
-rets_destroy(TokenStream *ts)
+rets_destroy_i(TokenStream *ts)
 {
-  rb_hash_delete(object_space, LONG2NUM((long)object_get(ts)));
-  free(ts->data);
-  free(ts->token);
-  free(ts);
+    free(ts);
 }
 
 static void
 frt_rets_free(TokenStream *ts)
 {
-  object_del(ts);
-  ts_deref(ts);
+    if (object_get(&ts->text) != Qnil) {
+        object_del(&ts->text);
+    }
+    object_del(ts);
+    ts_deref(ts);
 }
 
 static void
 frt_rets_mark(TokenStream *ts)
 {
-  RegExpTokenStream *rets = (RegExpTokenStream *)ts->data;
-  rb_gc_mark(rets->rtext);
-  rb_gc_mark(rets->regex);
-  rb_gc_mark(rets->proc);
+    if (ts->text) frt_gc_mark(&ts->text);
+    rb_gc_mark(RETS(ts)->rtext);
+    rb_gc_mark(RETS(ts)->regex);
+    rb_gc_mark(RETS(ts)->proc);
 }
 
+/*
+ *  call-seq:
+ *     tokenizer.text = text -> text
+ *
+ *  Set the text to be tokenized by the tokenizer. The tokenizer gets reset to
+ *  tokenize the text from the beginning.
+ */
 static VALUE
 frt_rets_set_text(VALUE self, VALUE rtext)
 {
-  TokenStream *ts;
-  RegExpTokenStream *rets;
-  Data_Get_Struct(self, TokenStream, ts);
+    TokenStream *ts;
+    GET_TS(ts, self);
+
+    StringValue(rtext);
+    RETS(ts)->rtext = rtext;
+    RETS(ts)->curr_ind = 0;
 
-  StringValue(rtext);
-  rets = (RegExpTokenStream *)ts->data;
-  rets->rtext = rtext;
-  rets->curr_ind = 0;
-
-  return rtext;
+    return rtext;
 }
 
+/*
+ *  call-seq:
+ *     tokenizer.text -> text
+ *
+ *  Get the text being tokenized by the tokenizer.
+ */
 static VALUE
 frt_rets_get_text(VALUE self)
 {
-  TokenStream *ts;
-  RegExpTokenStream *rets;
-  Data_Get_Struct(self, TokenStream, ts);
-  rets = (RegExpTokenStream *)ts->data;
-  return rets->rtext;
+    TokenStream *ts;
+    GET_TS(ts, self);
+    return RETS(ts)->rtext;
}
 
 static Token *
 rets_next(TokenStream *ts)
 {
-  static struct re_registers regs;
-  int ret, beg, end;
-  RegExpTokenStream *rets = (RegExpTokenStream *)ts->data;
-  struct RString *rtext = RSTRING(rets->rtext);
-  Check_Type(rets->regex, T_REGEXP);
-  ret = ruby_re_search(RREGEXP(rets->regex)->ptr,
-                       rtext->ptr, rtext->len,
-                       rets->curr_ind, rtext->len - rets->curr_ind,
-                       &regs);
-
-  if (ret == -2) rb_raise(rb_eStandardError, "regexp buffer overflow");
-  if (ret < 0) return NULL; /* not matched */
-
-  beg = regs.beg[0];
-  rets->curr_ind = end = regs.end[0];
-  if (NIL_P(rets->proc)) {
-    return tk_set(ts->token, rtext->ptr + beg, end - beg, beg, end, 1);
-  } else {
-    VALUE rtok = rb_str_new(rtext->ptr + beg, end - beg);
-    rtok = rb_funcall(rets->proc, id_call, 1, rtok);
-    return tk_set(ts->token, RSTRING(rtok)->ptr, RSTRING(rtok)->len, beg, end, 1);
-  }
+    static struct re_registers regs;
+    int ret, beg, end;
+    struct RString *rtext = RSTRING(RETS(ts)->rtext);
+    Check_Type(RETS(ts)->regex, T_REGEXP);
+    ret = ruby_re_search(RREGEXP(RETS(ts)->regex)->ptr,
+                         rtext->ptr, rtext->len,
+                         RETS(ts)->curr_ind, rtext->len - RETS(ts)->curr_ind,
+                         &regs);
+
+    if (ret == -2) rb_raise(rb_eStandardError, "regexp buffer overflow");
+    if (ret < 0) return NULL; /* not matched */
+
+    beg = regs.beg[0];
+    RETS(ts)->curr_ind = end = regs.end[0];
+    if (NIL_P(RETS(ts)->proc)) {
+        return tk_set(&(CachedTS(ts)->token), rtext->ptr + beg, end - beg,
+                      beg, end, 1);
+    } else {
+        VALUE rtok = rb_str_new(rtext->ptr + beg, end - beg);
+        rtok = rb_funcall(RETS(ts)->proc, id_call, 1, rtok);
+        return tk_set(&(CachedTS(ts)->token), RSTRING(rtok)->ptr,
+                      RSTRING(rtok)->len, beg, end, 1);
+    }
 }
 
-static void
+static TokenStream *
 rets_reset(TokenStream *ts, char *text)
 {
-  RegExpTokenStream *rets = (RegExpTokenStream *)ts->data;
-  rets->rtext = rb_str_new2(text);
-  rets->curr_ind = 0;
+    RETS(ts)->rtext = rb_str_new2(text);
+    RETS(ts)->curr_ind = 0;
+    return ts;
 }
 
-void
-rets_clone_i(TokenStream *orig_ts, TokenStream *new_ts)
+static TokenStream *
+rets_clone_i(TokenStream *orig_ts)
 {
-  RegExpTokenStream *new_rets = ALLOC(RegExpTokenStream);
-  RegExpTokenStream *orig_rets = (RegExpTokenStream *)orig_ts->data;
-  memcpy(new_rets, orig_rets, sizeof(RegExpTokenStream));
-  new_ts->data = new_rets;
+    TokenStream *ts = ts_clone_size(orig_ts, sizeof(RegExpTokenStream));
+    return ts;
 }
 
 static TokenStream *
-rets_create(VALUE rtext, VALUE regex, VALUE proc)
+rets_new(VALUE rtext, VALUE regex, VALUE proc)
 {
-  RegExpTokenStream *rets;
-  TokenStream *ts;
+    TokenStream *ts;
 
-  if (rtext != Qnil) {
-    rtext = StringValue(rtext);
-  }
-  ts = ts_create();
-  ts->reset = &rets_reset;
-  ts->next = &rets_next;
-  ts->clone_i = &rets_clone_i;
-  ts->destroy = &rets_destroy;
-  ts->ref_cnt = 1;
-
-  rets = ALLOC(RegExpTokenStream);
-  rets->curr_ind = 0;
-  rets->rtext = rtext;
-  rets->proc = proc;
-  if (NIL_P(regex)) {
-    rets->regex = rtoken_re;
-  } else {
-    Check_Type(regex, T_REGEXP);
-    rets->regex = regex;
-  }
-
-  ts->data = rets;
+    if (rtext != Qnil) {
+        rtext = StringValue(rtext);
+    }
+    ts = ts_new(RegExpTokenStream);
+    ts->reset = &rets_reset;
+    ts->next = &rets_next;
+    ts->clone_i = &rets_clone_i;
+    ts->destroy_i = &rets_destroy_i;
+
+    RETS(ts)->curr_ind = 0;
+    RETS(ts)->rtext = rtext;
+    RETS(ts)->proc = proc;
+
+    if (NIL_P(regex)) {
+        RETS(ts)->regex = rtoken_re;
+    } else {
+        Check_Type(regex, T_REGEXP);
+        RETS(ts)->regex = regex;
+    }
 
-  return ts;
+    return ts;
 }
 
+/*
+ *  call-seq:
+ *     RegExpTokenizer.new(input, /[[:alpha:]]+/)
+ *
+ *  Create a new tokenizer based on a regular expression
+ *
+ *  input::  text to tokenize
+ *  regexp:: regular expression used to recognize tokens in the input
+ */
 static VALUE
 frt_rets_init(int argc, VALUE *argv, VALUE self)
 {
-  VALUE rtext, regex, proc;
-  TokenStream *ts;
+    VALUE rtext, regex, proc;
+    TokenStream *ts;
 
-  rb_scan_args(argc, argv, "11&", &rtext, &regex, &proc);
+    rb_scan_args(argc, argv, "11&", &rtext, &regex, &proc);
 
-  ts = rets_create(rtext, regex, proc);
+    ts = rets_new(rtext, regex, proc);
 
-  Frt_Wrap_Struct(self, &frt_rets_mark, &frt_rets_free, ts);
-  object_add(ts, self);
-  /* no need to add to object space as it is going to ruby space
-   * rb_hash_aset(object_space, LONG2NUM((long)self), self);
-   */
-  return self;
+    Frt_Wrap_Struct(self, &frt_rets_mark, &frt_rets_free, ts);
+    object_add(ts, self);
+    /* no need to add to object space as it is going to ruby space
+     * rb_hash_aset(object_space, LONG2NUM((long)self), self);
+     */
+    return self;
 }
 
 /****************************************************************************
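
Putting the RegExpTokenizer pieces above together: the constructor takes the input string, an optional regular expression (TOKEN_RE is used when none is given), and an optional block which, per rets_next, rewrites each matched token's text. A sketch:

    include Ferret::Analysis

    # one token per alphabetic run, downcased by the block
    tokenizer = RegExpTokenizer.new("Hello, World!", /[[:alpha:]]+/) do |word|
      word.downcase
    end
    tokenizer.next.text #=> "hello"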
@@ -530,47 +777,92 @@ frt_rets_init(int argc, VALUE *argv, VALUE self)
 ****************************************************************************/
 
 #define TS_ARGS(dflt) \
-  bool lower;\
-  VALUE rlower, rstr;\
-  rb_scan_args(argc, argv, "11", &rstr, &rlower);\
-  lower = (argc ? RTEST(rlower) : dflt)
-
+    bool lower;\
+    VALUE rlower, rstr;\
+    rb_scan_args(argc, argv, "11", &rstr, &rlower);\
+    lower = (argc ? RTEST(rlower) : dflt)
+
+/*
+ *  call-seq:
+ *     AsciiLetterTokenizer.new() -> tokenizer
+ *
+ *  Create a new AsciiLetterTokenizer
+ */
 static VALUE
 frt_a_letter_tokenizer_init(VALUE self, VALUE rstr)
 {
-  return get_wrapped_ts(self, rstr, letter_tokenizer_create());
+    return get_wrapped_ts(self, rstr, letter_tokenizer_new());
 }
 
+/*
+ *  call-seq:
+ *     LetterTokenizer.new(lower = true) -> tokenizer
+ *
+ *  Create a new LetterTokenizer which optionally downcases tokens. Downcasing
+ *  is done according to the current locale.
+ *
+ *  lower:: set to false if you don't wish to downcase tokens
+ */
 static VALUE
 frt_letter_tokenizer_init(int argc, VALUE *argv, VALUE self)
 {
-  TS_ARGS(false);
-  return get_wrapped_ts(self, rstr, mb_letter_tokenizer_create(lower));
+    TS_ARGS(false);
+    return get_wrapped_ts(self, rstr, mb_letter_tokenizer_new(lower));
 }
 
+/*
+ *  call-seq:
+ *     AsciiWhiteSpaceTokenizer.new() -> tokenizer
+ *
+ *  Create a new AsciiWhiteSpaceTokenizer
+ */
 static VALUE
 frt_a_whitespace_tokenizer_init(VALUE self, VALUE rstr)
 {
-  return get_wrapped_ts(self, rstr, whitespace_tokenizer_create());
+    return get_wrapped_ts(self, rstr, whitespace_tokenizer_new());
 }
 
+/*
+ *  call-seq:
+ *     WhiteSpaceTokenizer.new(lower = true) -> tokenizer
+ *
+ *  Create a new WhiteSpaceTokenizer which optionally downcases tokens.
+ *  Downcasing is done according to the current locale.
+ *
+ *  lower:: set to false if you don't wish to downcase tokens
+ */
 static VALUE
 frt_whitespace_tokenizer_init(int argc, VALUE *argv, VALUE self)
 {
-  TS_ARGS(false);
-  return get_wrapped_ts(self, rstr, mb_whitespace_tokenizer_create(lower));
+    TS_ARGS(false);
+    return get_wrapped_ts(self, rstr, mb_whitespace_tokenizer_new(lower));
 }
 
+/*
+ *  call-seq:
+ *     AsciiStandardTokenizer.new() -> tokenizer
+ *
+ *  Create a new AsciiStandardTokenizer
+ */
 static VALUE
 frt_a_standard_tokenizer_init(VALUE self, VALUE rstr)
 {
-  return get_wrapped_ts(self, rstr, standard_tokenizer_create());
+    return get_wrapped_ts(self, rstr, standard_tokenizer_new());
 }
 
+/*
+ *  call-seq:
+ *     StandardTokenizer.new(lower = true) -> tokenizer
+ *
+ *  Create a new StandardTokenizer which optionally downcases tokens.
+ *  Downcasing is done according to the current locale.
+ *
+ *  lower:: set to false if you don't wish to downcase tokens
+ */
 static VALUE
 frt_standard_tokenizer_init(VALUE self, VALUE rstr)
 {
-  return get_wrapped_ts(self, rstr, mb_standard_tokenizer_create());
+    return get_wrapped_ts(self, rstr, mb_standard_tokenizer_new());
 }
 
 /****************************************************************************
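
All six tokenizer constructors above share one pattern: the Ascii variants take only the input string, while the multi-byte variants accept a trailing lower flag (unpacked by TS_ARGS) that controls locale-aware downcasing. A sketch:

    include Ferret::Analysis

    AsciiLetterTokenizer.new("FAST Cars")   # ASCII-only, case left alone
    LetterTokenizer.new("FAST Cars", true)  # locale-aware, tokens downcased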
@@ -578,71 +870,114 @@ frt_standard_tokenizer_init(VALUE self, VALUE rstr)
578
870
  ****************************************************************************/
579
871
 
580
872
 
873
+ /*
874
+ * call-seq:
875
+ * AsciiLowerCaseFilter.new(token_stream) -> token_stream
876
+ *
877
+ * Create an AsciiLowerCaseFilter which normalizes a token's text to
878
+ * lowercase but only for Ascii characters. For other characters use
879
+ * LowerCaseFilter.
880
+ */
581
881
  static VALUE
582
882
  frt_a_lowercase_filter_init(VALUE self, VALUE rsub_ts)
583
883
  {
584
- TokenStream *ts = frt_get_cwrapped_rts(rsub_ts);
585
- ts = lowercase_filter_create(ts);
586
- object_add(&ts->sub_ts, rsub_ts);
884
+ TokenStream *ts = frt_get_cwrapped_rts(rsub_ts);
885
+ ts = lowercase_filter_new(ts);
886
+ object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
587
887
 
588
- Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
589
- object_add(ts, self);
590
- return self;
888
+ Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
889
+ object_add(ts, self);
890
+ return self;
591
891
  }
592
892
 
893
+ /*
894
+ * call-seq:
895
+ * LowerCaseFilter.new(token_stream) -> token_stream
896
+ *
897
+ * Create an LowerCaseFilter which normalizes a token's text to
898
+ * lowercase based on the current locale.
899
+ */
593
900
  static VALUE
594
901
  frt_lowercase_filter_init(VALUE self, VALUE rsub_ts)
595
902
  {
596
- TokenStream *ts = frt_get_cwrapped_rts(rsub_ts);
597
- ts = mb_lowercase_filter_create(ts);
598
- object_add(&ts->sub_ts, rsub_ts);
903
+ TokenStream *ts = frt_get_cwrapped_rts(rsub_ts);
904
+ ts = mb_lowercase_filter_new(ts);
905
+ object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
599
906
 
600
- Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
601
- object_add(ts, self);
602
- return self;
907
+ Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
908
+ object_add(ts, self);
909
+ return self;
603
910
  }
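Since each filter takes a token stream as its argument, chains are built by nesting constructors. A hedged sketch (the tokenizer's own downcasing is disabled here so the filter does the work):

    require 'ferret'
    include Ferret::Analysis

    ts = LowerCaseFilter.new(LetterTokenizer.new("One TWO Three", false))
    while token = ts.next
      puts token.text  # => "one", "two", "three"
    end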
 
+ /*
+ * call-seq:
+ * StopFilter.new(token_stream) -> token_stream
+ * StopFilter.new(token_stream, ["the", "and", "it"]) -> token_stream
+ *
+ * Create a StopFilter which removes *stop-words* from a TokenStream. You can
+ * optionally specify the stop-words you wish to have removed.
+ *
+ * token_stream:: TokenStream to be filtered
+ * stop_words:: Array of *stop-words* you wish to be filtered out. This
+ * defaults to a list of English stop-words. The
+ * Ferret::Analysis module contains a number of stop-word lists.
+ */
  static VALUE
  frt_stop_filter_init(int argc, VALUE *argv, VALUE self)
  {
- VALUE rsub_ts, rstop_words;
- TokenStream *ts;
- rb_scan_args(argc, argv, "11", &rsub_ts, &rstop_words);
- ts = frt_get_cwrapped_rts(rsub_ts);
- if (rstop_words != Qnil) {
- char **stop_words = get_stopwords(rstop_words);
- ts = stop_filter_create_with_words(ts, (const char **)stop_words);
+ VALUE rsub_ts, rstop_words;
+ TokenStream *ts;
+ rb_scan_args(argc, argv, "11", &rsub_ts, &rstop_words);
+ ts = frt_get_cwrapped_rts(rsub_ts);
+ if (rstop_words != Qnil) {
+ char **stop_words = get_stopwords(rstop_words);
+ ts = stop_filter_new_with_words(ts, (const char **)stop_words);
 
- free(stop_words);
- } else {
- ts = stop_filter_create(ts);
- }
- object_add(&ts->sub_ts, rsub_ts);
+ free(stop_words);
+ } else {
+ ts = stop_filter_new(ts);
+ }
+ object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
 
- Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
- object_add(ts, self);
- return self;
+ Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
+ object_add(ts, self);
+ return self;
  }
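A short sketch of both call-seq forms accepted above; the second supplies a custom stop-word array in place of the default English list:

    require 'ferret'
    include Ferret::Analysis

    # default English stop-words
    ts = StopFilter.new(AsciiWhiteSpaceTokenizer.new("the pig and whistle"))
    # custom stop-words
    ts = StopFilter.new(AsciiWhiteSpaceTokenizer.new("the pig and whistle"),
                        ["the", "and"])
    while token = ts.next
      puts token.text  # => "pig", "whistle"
    end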
 
+ /*
+ * call-seq:
+ * StemFilter.new(token_stream) -> token_stream
+ * StemFilter.new(token_stream,
+ * algorithm="english",
+ * encoding=locale-specific) -> token_stream
+ *
+ * Create a StemFilter which uses a Snowball stemmer (thank you, Martin
+ * Porter) to stem words. You can optionally specify the algorithm (default:
+ * "english") and encoding (default: "UTF-8").
+ *
+ * token_stream:: TokenStream to be filtered
+ * algorithm:: The algorithm (or language) to use
+ * encoding:: The encoding of the data (default: "UTF-8")
+ */
  static VALUE
  frt_stem_filter_init(int argc, VALUE *argv, VALUE self)
  {
- VALUE rsub_ts, ralgorithm, rcharenc;
- char *algorithm = "english";
- char *charenc = NULL;
- TokenStream *ts;
- rb_scan_args(argc, argv, "12", &rsub_ts, &ralgorithm, &rcharenc);
- ts = frt_get_cwrapped_rts(rsub_ts);
- switch (argc) {
- case 3: charenc = RSTRING(rb_obj_as_string(rcharenc))->ptr;
- case 2: algorithm = RSTRING(rb_obj_as_string(ralgorithm))->ptr;
- }
- ts = stem_filter_create(ts, algorithm, charenc);
- object_add(&ts->sub_ts, rsub_ts);
+ VALUE rsub_ts, ralgorithm, rcharenc;
+ char *algorithm = "english";
+ char *charenc = NULL;
+ TokenStream *ts;
+ rb_scan_args(argc, argv, "12", &rsub_ts, &ralgorithm, &rcharenc);
+ ts = frt_get_cwrapped_rts(rsub_ts);
+ switch (argc) {
+ case 3: charenc = RSTRING(rb_obj_as_string(rcharenc))->ptr;
+ case 2: algorithm = RSTRING(rb_obj_as_string(ralgorithm))->ptr;
+ }
+ ts = stem_filter_new(ts, algorithm, charenc);
+ object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
 
- Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
- object_add(ts, self);
- return self;
+ Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
+ object_add(ts, self);
+ return self;
  }
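A sketch of the optional arguments handled by the fall-through switch above; note the stemmer expects lower-case input, so a LowerCaseFilter sits between it and the tokenizer:

    require 'ferret'
    include Ferret::Analysis

    ts = StandardTokenizer.new("debate debates debated")
    ts = StemFilter.new(LowerCaseFilter.new(ts), "english", "UTF-8")
    while token = ts.next
      puts token.text  # => "debat", "debat", "debat"
    end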
 
  /****************************************************************************
@@ -655,216 +990,327 @@ frt_stem_filter_init(int argc, VALUE *argv, VALUE self)
  * CWrappedAnalyzer Methods
  ****************************************************************************/
 
+ #define GET_A(a, self) Data_Get_Struct(self, Analyzer, a)
+
+ #define CWA(analyzer) ((CWrappedAnalyzer *)(analyzer))
+ typedef struct CWrappedAnalyzer
+ {
+ Analyzer super;
+ VALUE ranalyzer;
+ } CWrappedAnalyzer;
+
  static void
- cwa_destroy(Analyzer *a)
+ cwa_destroy_i(Analyzer *a)
  {
- rb_hash_delete(object_space, LONG2NUM((long)a->data));
- a_standard_destroy(a);
+ rb_hash_delete(object_space, LONG2NUM(CWA(a)->ranalyzer));
+ /*printf("rb_hash_size = %d\n", frt_rb_hash_size(object_space)); */
+ free(a);
  }
 
  static TokenStream *
  cwa_get_ts(Analyzer *a, char *field, char *text)
  {
- VALUE ranalyzer = (VALUE)a->data;
- VALUE rts = rb_funcall(ranalyzer, id_token_stream, 2,
- rb_str_new2(field), rb_str_new2(text));
- return frt_get_cwrapped_rts(rts);
+ VALUE rts = rb_funcall(CWA(a)->ranalyzer, id_token_stream, 2,
+ rb_str_new2(field), rb_str_new2(text));
+ return frt_get_cwrapped_rts(rts);
  }
 
  Analyzer *
- frt_get_cwrapped_analyzer(ranalyzer)
- {
- Analyzer *a = NULL;
- switch (TYPE(ranalyzer)) {
- case T_DATA:
- Data_Get_Struct(ranalyzer, Analyzer, a);
- ref(a);
- break;
- default:
- a = analyzer_create((void *)ranalyzer, NULL, &cwa_destroy, &cwa_get_ts);
- // prevent from being garbage collected
- rb_hash_aset(object_space, LONG2NUM(ranalyzer), ranalyzer);
- break;
- }
- return a;
+ frt_get_cwrapped_analyzer(VALUE ranalyzer)
+ {
+ Analyzer *a = NULL;
+ switch (TYPE(ranalyzer)) {
+ case T_DATA:
+ Data_Get_Struct(ranalyzer, Analyzer, a);
+ REF(a);
+ break;
+ default:
+ a = (Analyzer *)ecalloc(sizeof(CWrappedAnalyzer));
+ a->destroy_i = &cwa_destroy_i;
+ a->get_ts = &cwa_get_ts;
+ a->ref_cnt = 1;
+ ((CWrappedAnalyzer *)a)->ranalyzer = ranalyzer;
+ /* prevent from being garbage collected */
+ rb_hash_aset(object_space, LONG2NUM(ranalyzer), ranalyzer);
+ break;
+ }
+ return a;
  }
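The default branch above is what makes duck-typed analyzers work: any Ruby object that responds to token_stream can be wrapped and driven from the C side via cwa_get_ts. A hedged Ruby-side sketch (the class name is invented for illustration):

    require 'ferret'
    include Ferret::Analysis

    class PlainAnalyzer  # hypothetical example class
      def token_stream(field, str)
        WhiteSpaceTokenizer.new(str)
      end
    end

    # the index wraps this Ruby object via frt_get_cwrapped_analyzer
    index = Ferret::Index::Index.new(:analyzer => PlainAnalyzer.new)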
 
  static void
  frt_analyzer_free(Analyzer *a)
  {
- object_del(a);
- a_deref(a);
+ object_del(a);
+ a_deref(a);
  }
 
  VALUE
  frt_get_analyzer(Analyzer *a)
  {
- VALUE self = Data_Wrap_Struct(cAnalyzer, NULL, &frt_analyzer_free, a);
- object_add(a, self);
- return self;
+ VALUE self = Qnil;
+ if (a) {
+ self = object_get(a);
+ if (self == Qnil) {
+ self = Data_Wrap_Struct(cAnalyzer, NULL, &frt_analyzer_free, a);
+ REF(a);
+ object_add(a, self);
+ }
+ }
+ return self;
  }
 
+ /*
+ * call-seq:
+ * analyzer.token_stream(field_name, input) -> token_stream
+ *
+ * Create a new TokenStream to tokenize +input+. The TokenStream created may
+ * also depend on the +field_name+, although this parameter is typically
+ * ignored.
+ *
+ * field_name:: name of the field to be tokenized
+ * input:: data from the field to be tokenized
+ */
  static VALUE
  frt_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
  {
- TokenStream *ts;
- Analyzer *a = (Analyzer *)DATA_PTR(self);
+ TokenStream *ts;
+ Analyzer *a;
+ GET_A(a, self);
+
+ StringValue(rfield);
+ StringValue(rstring);
 
- rfield = rb_obj_as_string(rfield);
- rstring = rb_obj_as_string(rstring);
-
- ts = a_get_new_ts(a, RSTRING(rfield)->ptr, RSTRING(rstring)->ptr);
+ ts = a_get_ts(a, RSTRING(rfield)->ptr, RSTRING(rstring)->ptr);
 
- /* Make sure that there is no entry already */
- object_set(&ts->text, rstring);
- return get_token_stream(ts);
+ /* Make sure that there is no entry already */
+ object_set(&ts->text, rstring);
+ return get_rb_token_stream(ts);
  }
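In Ruby terms this is Analyzer#token_stream. A small sketch (the field argument must be string-like now that StringValue is used):

    require 'ferret'
    include Ferret::Analysis

    analyzer = StandardAnalyzer.new
    ts = analyzer.token_stream("content", "Ferret is a Ruby port of Lucene")
    while token = ts.next
      puts token.text
    end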
 
  #define GET_LOWER(dflt) \
- bool lower;\
- VALUE rlower;\
- rb_scan_args(argc, argv, "01", &rlower);\
- lower = (argc ? RTEST(rlower) : dflt)
-
- /*** AsciiWhiteSpaceAnalyzer ***/
+ bool lower;\
+ VALUE rlower;\
+ rb_scan_args(argc, argv, "01", &rlower);\
+ lower = (argc ? RTEST(rlower) : dflt)
+
+ /*
+ * call-seq:
+ * AsciiWhiteSpaceAnalyzer.new(lower = true) -> analyzer
+ *
+ * Create a new AsciiWhiteSpaceAnalyzer which downcases tokens by default
+ * but can optionally leave case as is. Lowercasing will only be done to
+ * ascii characters.
+ *
+ * lower:: set to false if you don't want the field's tokens to be downcased
+ */
  static VALUE
  frt_a_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
  {
- Analyzer *a;
- GET_LOWER(false);
- a = whitespace_analyzer_create(lower);
- Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
- object_add(a, self);
- return self;
+ Analyzer *a;
+ GET_LOWER(false);
+ a = whitespace_analyzer_new(lower);
+ Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
+ object_add(a, self);
+ return self;
  }
 
- /*** WhiteSpaceAnalyzer ***/
+ /*
+ * call-seq:
+ * WhiteSpaceAnalyzer.new(lower = true) -> analyzer
+ *
+ * Create a new WhiteSpaceAnalyzer which downcases tokens by default but can
+ * optionally leave case as is. Lowercasing will be done based on the current
+ * locale.
+ *
+ * lower:: set to false if you don't want the field's tokens to be downcased
+ */
  static VALUE
  frt_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
  {
- Analyzer *a;
- GET_LOWER(false);
- a = mb_whitespace_analyzer_create(lower);
- Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
- object_add(a, self);
- return self;
+ Analyzer *a;
+ GET_LOWER(false);
+ a = mb_whitespace_analyzer_new(lower);
+ Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
+ object_add(a, self);
+ return self;
  }
 
- /*** AsciiLetterAnalyzer ***/
+ /*
+ * call-seq:
+ * AsciiLetterAnalyzer.new(lower = true) -> analyzer
+ *
+ * Create a new AsciiLetterAnalyzer which downcases tokens by default
+ * but can optionally leave case as is. Lowercasing will only be done to
+ * ascii characters.
+ *
+ * lower:: set to false if you don't want the field's tokens to be downcased
+ */
  static VALUE
  frt_a_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
  {
- Analyzer *a;
- GET_LOWER(true);
- a = letter_analyzer_create(lower);
- Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
- object_add(a, self);
- return self;
+ Analyzer *a;
+ GET_LOWER(true);
+ a = letter_analyzer_new(lower);
+ Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
+ object_add(a, self);
+ return self;
  }
 
- /*** LetterAnalyzer ***/
+ /*
+ * call-seq:
+ * LetterAnalyzer.new(lower = true) -> analyzer
+ *
+ * Create a new LetterAnalyzer which downcases tokens by default but can
+ * optionally leave case as is. Lowercasing will be done based on the current
+ * locale.
+ *
+ * lower:: set to false if you don't want the field's tokens to be downcased
+ */
  static VALUE
  frt_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
  {
- Analyzer *a;
- GET_LOWER(true);
- a = mb_letter_analyzer_create(lower);
- Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
- object_add(a, self);
- return self;
+ Analyzer *a;
+ GET_LOWER(true);
+ a = mb_letter_analyzer_new(lower);
+ Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
+ object_add(a, self);
+ return self;
  }
 
  static VALUE
  get_rstopwords(const char **stop_words)
  {
- char **w = (char **)stop_words;
- VALUE rstopwords = rb_ary_new();
+ char **w = (char **)stop_words;
+ VALUE rstopwords = rb_ary_new();
 
- while (*w) {
- rb_ary_push(rstopwords, rb_str_new2(*w));
- w++;
- }
- return rstopwords;
+ while (*w) {
+ rb_ary_push(rstopwords, rb_str_new2(*w));
+ w++;
+ }
+ return rstopwords;
  }
 
- /*** AsciiStandardAnalyzer ***/
+ /*
+ * call-seq:
+ * AsciiStandardAnalyzer.new(lower = true, stop_words = ENGLISH_STOP_WORDS)
+ * -> analyzer
+ *
+ * Create a new AsciiStandardAnalyzer which downcases tokens by default but
+ * can optionally leave case as is. Lowercasing will only be done to ascii
+ * characters. You can also set the list of stop-words to be used by the
+ * StopFilter.
+ *
+ * lower:: set to false if you don't want the field's tokens to be downcased
+ * stop_words:: list of stop-words to pass to the StopFilter
+ */
  static VALUE
  frt_a_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
  {
- bool lower;
- VALUE rlower, rstop_words;
- Analyzer *a;
- rb_scan_args(argc, argv, "02", &rlower, &rstop_words);
- lower = ((rlower == Qnil) ? true : RTEST(rlower));
- if (rstop_words != Qnil) {
- char **stop_words = get_stopwords(rstop_words);
- a = standard_analyzer_create_with_words((const char **)stop_words, lower);
- free(stop_words);
- } else {
- a = standard_analyzer_create(lower);
- }
- Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
- object_add(a, self);
- return self;
- }
-
- /*** StandardAnalyzer ***/
+ bool lower;
+ VALUE rlower, rstop_words;
+ Analyzer *a;
+ rb_scan_args(argc, argv, "02", &rlower, &rstop_words);
+ lower = ((rlower == Qnil) ? true : RTEST(rlower));
+ if (rstop_words != Qnil) {
+ char **stop_words = get_stopwords(rstop_words);
+ a = standard_analyzer_new_with_words((const char **)stop_words, lower);
+ free(stop_words);
+ } else {
+ a = standard_analyzer_new(lower);
+ }
+ Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
+ object_add(a, self);
+ return self;
+ }
+
1227
+ * call-seq:
1228
+ * StandardAnalyzer.new(lower = true, stop_words = ENGLISH_STOP_WORDS)
1229
+ * -> analyzer
1230
+ *
1231
+ * Create a new StandardAnalyzer which downcases tokens by default but can
1232
+ * optionally leave case as is. Lowercasing will be done based on the current
1233
+ * locale. You can also set the list of stop-words to be used by the
1234
+ * StopFilter.
1235
+ *
1236
+ * lower:: set to false if you don't want the field's tokens to be downcased
1237
+ * stop_words:: list of stop-words to pass to the StopFilter
1238
+ */
812
1239
  static VALUE
813
1240
  frt_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
814
1241
  {
815
- bool lower;
816
- VALUE rlower, rstop_words;
817
- Analyzer *a;
818
- rb_scan_args(argc, argv, "02", &rstop_words, &rlower);
819
- lower = ((rlower == Qnil) ? true : RTEST(rlower));
820
- if (rstop_words != Qnil) {
821
- char **stop_words = get_stopwords(rstop_words);
822
- a = mb_standard_analyzer_create_with_words((const char **)stop_words, lower);
823
- free(stop_words);
824
- } else {
825
- a = mb_standard_analyzer_create(lower);
826
- }
827
- Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
828
- object_add(a, self);
829
- return self;
1242
+ bool lower;
1243
+ VALUE rlower, rstop_words;
1244
+ Analyzer *a;
1245
+ rb_scan_args(argc, argv, "02", &rstop_words, &rlower);
1246
+ lower = ((rlower == Qnil) ? true : RTEST(rlower));
1247
+ if (rstop_words != Qnil) {
1248
+ char **stop_words = get_stopwords(rstop_words);
1249
+ a = mb_standard_analyzer_new_with_words((const char **)stop_words, lower);
1250
+ free(stop_words);
1251
+ } else {
1252
+ a = mb_standard_analyzer_new(lower);
1253
+ }
1254
+ Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
1255
+ object_add(a, self);
1256
+ return self;
830
1257
  }
831
1258
 
832
- void
1259
+ static void
833
1260
  frt_h_mark_values_i(void *key, void *value, void *arg)
834
1261
  {
835
- frt_gc_mark(value);
1262
+ frt_gc_mark(value);
836
1263
  }
837
1264
 
838
- void
1265
+ static void
839
1266
  frt_pfa_mark(void *p)
840
1267
  {
841
- Analyzer *a = (Analyzer *)p;
842
- PerFieldAnalyzer *pfa = (PerFieldAnalyzer *)a->data;
843
- frt_gc_mark(pfa->def);
844
- h_each(pfa->dict, &frt_h_mark_values_i, NULL);
1268
+ frt_gc_mark(PFA(p)->default_a);
1269
+ h_each(PFA(p)->dict, &frt_h_mark_values_i, NULL);
845
1270
  }
846
1271
 
847
1272
  /*** PerFieldAnalyzer ***/
848
1273
 
1274
+ /*
1275
+ * call-seq:
1276
+ * PerFieldAnalyzer.new(default_analyzer) -> analyzer
1277
+ *
1278
+ * Create a new PerFieldAnalyzer specifying the default analyzer to use on
1279
+ * all fields that are set specifically.
1280
+ *
1281
+ * default_analyzer:: analyzer to be used on fields that aren't otherwise
1282
+ * specified
1283
+ */
849
1284
  static VALUE
850
1285
  frt_per_field_analyzer_init(VALUE self, VALUE ranalyzer)
851
1286
  {
852
- Analyzer *def = frt_get_cwrapped_analyzer(ranalyzer);
853
- Analyzer *a = per_field_analyzer_create(def);
854
- Frt_Wrap_Struct(self, &frt_pfa_mark, &frt_analyzer_free, a);
855
- object_add(a, self);
856
- return self;
1287
+ Analyzer *def = frt_get_cwrapped_analyzer(ranalyzer);
1288
+ Analyzer *a = per_field_analyzer_new(def);
1289
+ Frt_Wrap_Struct(self, &frt_pfa_mark, &frt_analyzer_free, a);
1290
+ object_add(a, self);
1291
+ return self;
857
1292
  }
858
1293
 
1294
+ /*
1295
+ * call-seq:
1296
+ * per_field_analyzer.add_field(field_name, default_analyzer) -> self
1297
+ * per_field_analyzer[field_name] = default_analyzer -> self
1298
+ *
1299
+ * Set the analyzer to be used on field +field_name+. Note that field_name
1300
+ * should be a symbol.
1301
+ *
1302
+ * field_name:: field we wish to set the analyzer for
1303
+ * analyzer:: analyzer to be used on +field_name+
1304
+ */
859
1305
  static VALUE
860
1306
  frt_per_field_analyzer_add_field(VALUE self, VALUE rfield, VALUE ranalyzer)
861
1307
  {
862
- Analyzer *pfa, *a;
863
- Data_Get_Struct(self, Analyzer, pfa);
864
- a = frt_get_cwrapped_analyzer(ranalyzer);
1308
+ Analyzer *pfa, *a;
1309
+ Data_Get_Struct(self, Analyzer, pfa);
1310
+ a = frt_get_cwrapped_analyzer(ranalyzer);
865
1311
 
866
- pfa_add_field(pfa, StringValuePtr(rfield), a);
867
- return self;
1312
+ pfa_add_field(pfa, StringValuePtr(rfield), a);
1313
+ return self;
868
1314
  }
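Since add_field and []= are registered to the same C function (see Init_PerFieldAnalyzer below), both spellings behave identically. A short sketch:

    require 'ferret'
    include Ferret::Analysis

    pfa = PerFieldAnalyzer.new(StandardAnalyzer.new)
    pfa.add_field(:title, WhiteSpaceAnalyzer.new(false))
    pfa[:body] = StandardAnalyzer.new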
 
  /*** RegExpAnalyzer ***/
@@ -872,36 +1318,46 @@ frt_per_field_analyzer_add_field(VALUE self, VALUE rfield, VALUE ranalyzer)
  static void
  frt_re_analyzer_mark(Analyzer *a)
  {
- frt_gc_mark(a->current_ts);
+ frt_gc_mark(a->current_ts);
  }
 
  static void
- re_analyzer_destroy(Analyzer *a)
+ re_analyzer_destroy_i(Analyzer *a)
  {
- free(a->data);
- a_standard_destroy(a);
+ ts_deref(a->current_ts);
+ free(a);
  }
 
+ /*
+ * call-seq:
+ * RegExpAnalyzer.new(reg_exp, lower = true) -> analyzer
+ *
+ * Create a new RegExpAnalyzer which will create tokenizers based on the
+ * regular expression and lowercasing if required.
+ *
+ * reg_exp:: the token matcher for the tokenizer to use
+ * lower:: set to false if you don't want to downcase the tokens
+ */
  static VALUE
  frt_re_analyzer_init(int argc, VALUE *argv, VALUE self)
  {
- VALUE lower, rets, regex, proc;
- Analyzer *a;
- TokenStream *ts;
- rb_scan_args(argc, argv, "02&", &regex, &lower, &proc);
+ VALUE lower, rets, regex, proc;
+ Analyzer *a;
+ TokenStream *ts;
+ rb_scan_args(argc, argv, "02&", &regex, &lower, &proc);
 
- ts = rets_create(Qnil, regex, proc);
- rets = Data_Wrap_Struct(cRegExpTokenizer, &frt_rets_mark, &frt_rets_free, ts);
- ref(ts);
- rb_hash_aset(object_space, LONG2NUM((long)rets), rets);
- object_add(ts, rets);
+ ts = rets_new(Qnil, regex, proc);
+ rets = Data_Wrap_Struct(cRegExpTokenizer, &frt_rets_mark, &frt_rets_free, ts);
+ REF(ts);
+ /* rb_hash_aset(object_space, LONG2NUM((long)rets), rets); */
+ object_add(ts, rets);
 
- if (lower != Qfalse) ts = mb_lowercase_filter_create(ts);
+ if (lower != Qfalse) ts = mb_lowercase_filter_new(ts);
 
- a = analyzer_create(NULL, ts, &re_analyzer_destroy, NULL);
- Frt_Wrap_Struct(self, &frt_re_analyzer_mark, &frt_analyzer_free, a);
- object_add(a, self);
- return self;
+ a = analyzer_new(ts, &re_analyzer_destroy_i, NULL);
+ Frt_Wrap_Struct(self, &frt_re_analyzer_mark, &frt_analyzer_free, a);
+ object_add(a, self);
+ return self;
  }
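A hedged sketch of the initializer above; the "02&" scan-args spec means both arguments are optional and a block may be given (the block, stored by rets_new, is assumed to post-process each token's text):

    require 'ferret'
    include Ferret::Analysis

    csv_analyzer = RegExpAnalyzer.new(/[^,]+/, false)
    ts = csv_analyzer.token_stream("ignored", "one,TWO,three")
    while token = ts.next
      puts token.text  # => "one", "TWO", "three"
    end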
 
  /****************************************************************************
@@ -912,265 +1368,818 @@ frt_re_analyzer_init(int argc, VALUE *argv, VALUE self)
 
  static char *frt_locale = NULL;
 
- static VALUE frt_getlocale(VALUE self, VALUE locale)
+ /*
+ * call-seq:
+ * Ferret.locale -> locale_str
+ *
+ * Returns a string corresponding to the locale set. For example;
+ *
+ * puts Ferret.locale #=> "en_US.UTF-8"
+ */
+ static VALUE frt_get_locale(VALUE self, VALUE locale)
  {
- return (frt_locale ? rb_str_new2(frt_locale) : Qnil);
+ return (frt_locale ? rb_str_new2(frt_locale) : Qnil);
  }
 
- static VALUE frt_setlocale(VALUE self, VALUE locale)
+ /*
+ * call-seq:
+ * Ferret.locale = "en_US.UTF-8"
+ *
+ * Set the global locale. You should use this method to set different locales
+ * when indexing documents with different encodings.
+ */
+ static VALUE frt_set_locale(VALUE self, VALUE locale)
  {
- char *l = ((locale == Qnil) ? NULL : RSTRING(rb_obj_as_string(locale))->ptr);
- frt_locale = setlocale(LC_ALL, l);
- return frt_locale ? rb_str_new2(frt_locale) : Qnil;
+ char *l = ((locale == Qnil) ? NULL : RSTRING(rb_obj_as_string(locale))->ptr);
+ frt_locale = setlocale(LC_ALL, l);
+ return frt_locale ? rb_str_new2(frt_locale) : Qnil;
  }
 
  /****************************************************************************
  *
- * Init Function
+ * Init Functions
  *
  ****************************************************************************/
 
+ /*
+ * Document-class: Ferret::Analysis::Token
+ *
+ * == Summary
+ *
+ * A Token is an occurrence of a term from the text of a field. It consists
+ * of a term's text and the start and end offset of the term in the text of
+ * the field.
+ *
+ * The start and end offsets permit applications to re-associate a token with
+ * its source text, e.g., to display highlighted query terms in a document
+ * browser, or to show matching text fragments in a KWIC (KeyWord In Context)
+ * display, etc.
+ *
+ * === Attributes
+ *
+ * text:: the term's text, which may have been modified by a TokenFilter or
+ * Tokenizer from the text originally found in the document
+ * start:: is the position of the first character corresponding to
+ * this token in the source text
+ * end:: is equal to one greater than the position of the last
+ * character corresponding to this token. Note that the
+ * difference between @end_offset and @start_offset may not be
+ * equal to @text.length(), as the term text may have been
+ * altered by a stemmer or some other filter.
+ */
+ static void Init_Token(void)
+ {
+ cToken = rb_define_class_under(mAnalysis, "Token", rb_cObject);
+ rb_define_alloc_func(cToken, frt_token_alloc);
+ rb_include_module(cToken, rb_mComparable);
+
+ rb_define_method(cToken, "initialize", frt_token_init, -1);
+ rb_define_method(cToken, "<=>", frt_token_cmp, 1);
+ rb_define_method(cToken, "text", frt_token_get_text, 0);
+ rb_define_method(cToken, "text=", frt_token_set_text, 1);
+ rb_define_method(cToken, "start", frt_token_get_start_offset, 0);
+ rb_define_method(cToken, "start=", frt_token_set_start_offset, 1);
+ rb_define_method(cToken, "end", frt_token_get_end_offset, 0);
+ rb_define_method(cToken, "end=", frt_token_set_end_offset, 1);
+ rb_define_method(cToken, "pos_inc", frt_token_get_pos_inc, 0);
+ rb_define_method(cToken, "pos_inc=", frt_token_set_pos_inc, 1);
+ rb_define_method(cToken, "to_s", frt_token_to_s, 0);
+ }
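A sketch of the accessors registered above (the positional argument order for Token.new is assumed to be text, start, end, following the attribute list):

    require 'ferret'
    include Ferret::Analysis

    token = Token.new("beginning", 10, 19)
    token.text = "begin"                 # a stemmer might rewrite the text...
    puts "#{token.start}..#{token.end}"  # ...but offsets still map to the source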
+
+ /*
+ * Document-class: Ferret::Analysis::TokenStream
+ *
+ * A TokenStream enumerates the sequence of tokens, either from
+ * fields of a document or from query text.
+ *
+ * This is an abstract class. Concrete subclasses are:
+ *
+ * Tokenizer:: a TokenStream whose input is a string
+ * TokenFilter:: a TokenStream whose input is another TokenStream
+ */
+ static void Init_TokenStream(void)
+ {
+ cTokenStream = rb_define_class_under(mAnalysis, "TokenStream", rb_cObject);
+ rb_define_method(cTokenStream, "next", frt_ts_next, 0);
+ rb_define_method(cTokenStream, "text=", frt_ts_set_text, 1);
+ rb_define_method(cTokenStream, "text", frt_ts_get_text, 0);
+ }
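These three methods are the entire abstract interface. A sketch of re-using one stream across inputs via text=:

    require 'ferret'
    include Ferret::Analysis

    ts = AsciiLetterTokenizer.new("one two")
    puts ts.next.text       # => "one"
    ts.text = "three four"  # point the same stream at new input
    puts ts.next.text       # => "three"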
+
+ /*
+ * Document-class: Ferret::Analysis::AsciiLetterTokenizer
+ *
+ * An AsciiLetterTokenizer is a tokenizer that divides text at non-letters
+ * (ASCII only). That is to say, it defines tokens as maximal strings of
+ * adjacent letters, as defined by the regular expression _/[A-Za-z]+/_.
+ *
+ * === Example
+ *
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
+ * => ["Dave", "s", "r", "sum", "at", "http", "www", "davebalmain", "com"]
+ */
+ static void Init_AsciiLetterTokenizer(void)
+ {
+ cAsciiLetterTokenizer =
+ rb_define_class_under(mAnalysis, "AsciiLetterTokenizer", cTokenStream);
+ rb_define_alloc_func(cAsciiLetterTokenizer, frt_data_alloc);
+ rb_define_method(cAsciiLetterTokenizer, "initialize",
+ frt_a_letter_tokenizer_init, 1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::LetterTokenizer
+ *
+ * A LetterTokenizer is a tokenizer that divides text at non-letters. That is
+ * to say, it defines tokens as maximal strings of adjacent letters, as
+ * defined by the regular expression _/[[:alpha:]]+/_ where [:alpha:] matches
+ * all characters in your local locale.
+ *
+ * === Example
+ *
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
+ * => ["Dave", "s", "résumé", "at", "http", "www", "davebalmain", "com"]
+ */
+ static void Init_LetterTokenizer(void)
+ {
+ cLetterTokenizer =
+ rb_define_class_under(mAnalysis, "LetterTokenizer", cTokenStream);
+ rb_define_alloc_func(cLetterTokenizer, frt_data_alloc);
+ rb_define_method(cLetterTokenizer, "initialize",
+ frt_letter_tokenizer_init, -1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::AsciiWhiteSpaceTokenizer
+ *
+ * An AsciiWhiteSpaceTokenizer is a tokenizer that divides text at
+ * white-space. Adjacent sequences of non-WhiteSpace characters form tokens.
+ *
+ * === Example
+ *
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
+ * => ["Dave's", "résumé,", "at", "http://www.davebalmain.com", "1234"]
+ */
+ static void Init_AsciiWhiteSpaceTokenizer(void)
+ {
+ cAsciiWhiteSpaceTokenizer =
+ rb_define_class_under(mAnalysis, "AsciiWhiteSpaceTokenizer",
+ cTokenStream);
+ rb_define_alloc_func(cAsciiWhiteSpaceTokenizer, frt_data_alloc);
+ rb_define_method(cAsciiWhiteSpaceTokenizer, "initialize",
+ frt_a_whitespace_tokenizer_init, 1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::WhiteSpaceTokenizer
+ *
+ * A WhiteSpaceTokenizer is a tokenizer that divides text at white-space.
+ * Adjacent sequences of non-WhiteSpace characters form tokens.
+ *
+ * === Example
+ *
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
+ * => ["Dave's", "résumé,", "at", "http://www.davebalmain.com", "1234"]
+ */
+ static void Init_WhiteSpaceTokenizer(void)
+ {
+ cWhiteSpaceTokenizer =
+ rb_define_class_under(mAnalysis, "WhiteSpaceTokenizer", cTokenStream);
+ rb_define_alloc_func(cWhiteSpaceTokenizer, frt_data_alloc);
+ rb_define_method(cWhiteSpaceTokenizer, "initialize",
+ frt_whitespace_tokenizer_init, -1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::AsciiStandardTokenizer
+ *
+ * The standard tokenizer is an advanced tokenizer which tokenizes most
+ * words correctly as well as tokenizing things like email addresses, web
+ * addresses, phone numbers, etc.
+ *
+ * === Example
+ *
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
+ * => ["Dave's", "r", "sum", "at", "http://www.davebalmain.com", "1234"]
+ */
+ static void Init_AsciiStandardTokenizer(void)
+ {
+ cAsciiStandardTokenizer =
+ rb_define_class_under(mAnalysis, "AsciiStandardTokenizer", cTokenStream);
+ rb_define_alloc_func(cAsciiStandardTokenizer, frt_data_alloc);
+ rb_define_method(cAsciiStandardTokenizer, "initialize",
+ frt_a_standard_tokenizer_init, 1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::StandardTokenizer
+ *
+ * The standard tokenizer is an advanced tokenizer which tokenizes most
+ * words correctly as well as tokenizing things like email addresses, web
+ * addresses, phone numbers, etc.
+ *
+ * === Example
+ *
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
+ * => ["Dave's", "résumé", "at", "http://www.davebalmain.com", "1234"]
+ */
+ static void Init_StandardTokenizer(void)
+ {
+ cStandardTokenizer =
+ rb_define_class_under(mAnalysis, "StandardTokenizer", cTokenStream);
+ rb_define_alloc_func(cStandardTokenizer, frt_data_alloc);
+ rb_define_method(cStandardTokenizer, "initialize",
+ frt_standard_tokenizer_init, 1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::RegExpTokenizer
+ *
+ * A tokenizer that recognizes tokens based on a regular expression passed to
+ * the constructor. Most tokenizers you might need can be created using this
+ * class.
+ *
+ * === Example
+ *
+ * Below is an example of a simple implementation of a LetterTokenizer using
+ * a RegExpTokenizer. Basically, a token is a sequence of alphabetic
+ * characters separated by one or more non-alphabetic characters.
+ *
+ * # of course you would add more than just é
+ * RegExpTokenizer.new(input, /[[:alpha:]é]+/)
+ *
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
+ * => ["Dave", "s", "résumé", "at", "http", "www", "davebalmain", "com"]
+ */
+ static void Init_RegExpTokenizer(void)
+ {
+ cRegExpTokenizer =
+ rb_define_class_under(mAnalysis, "RegExpTokenizer", cTokenStream);
+ rtoken_re = rb_reg_new(TOKEN_RE, strlen(TOKEN_RE), 0);
+ rb_define_const(cRegExpTokenizer, "REGEXP", rtoken_re);
+ rb_define_alloc_func(cRegExpTokenizer, frt_data_alloc);
+ rb_define_method(cRegExpTokenizer, "initialize",
+ frt_rets_init, -1);
+ rb_define_method(cRegExpTokenizer, "text=", frt_rets_set_text, 1);
+ rb_define_method(cRegExpTokenizer, "text", frt_rets_get_text, 0);
+ }
+
+ /***************/
+ /*** Filters ***/
+ /***************/
+
+ /*
+ * Document-class: Ferret::Analysis::AsciiLowerCaseFilter
+ *
+ * AsciiLowerCaseFilter normalizes a token's text to lowercase but only for
+ * Ascii characters. For other characters use LowerCaseFilter.
+ *
+ * === Example
+ *
+ * ["One", "TWO", "three", "RÉSUMÉ"] => ["one", "two", "three", "rÉsumÉ"]
+ *
+ */
+ static void Init_AsciiLowerCaseFilter(void)
+ {
+ cAsciiLowerCaseFilter =
+ rb_define_class_under(mAnalysis, "AsciiLowerCaseFilter", cTokenStream);
+ rb_define_alloc_func(cAsciiLowerCaseFilter, frt_data_alloc);
+ rb_define_method(cAsciiLowerCaseFilter, "initialize",
+ frt_a_lowercase_filter_init, 1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::LowerCaseFilter
+ *
+ * LowerCaseFilter normalizes a token's text to lowercase based on the
+ * current locale.
+ *
+ * === Example
+ *
+ * ["One", "TWO", "three", "RÉSUMÉ"] => ["one", "two", "three", "résumé"]
+ *
+ */
+ static void Init_LowerCaseFilter(void)
+ {
+ cLowerCaseFilter =
+ rb_define_class_under(mAnalysis, "LowerCaseFilter", cTokenStream);
+ rb_define_alloc_func(cLowerCaseFilter, frt_data_alloc);
+ rb_define_method(cLowerCaseFilter, "initialize",
+ frt_lowercase_filter_init, 1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::StopFilter
+ *
+ * A StopFilter filters *stop-words* from a TokenStream. Stop-words are words
+ * that you don't wish to be indexed. Usually they will be common words like
+ * "the" and "and" although you can specify whichever words you want.
+ *
+ * === Example
+ *
+ * ["the", "pig", "and", "whistle"] => ["pig", "whistle"]
+ */
+ static void Init_StopFilter(void)
+ {
+ cStopFilter =
+ rb_define_class_under(mAnalysis, "StopFilter", cTokenStream);
+ rb_define_alloc_func(cStopFilter, frt_data_alloc);
+ rb_define_method(cStopFilter, "initialize",
+ frt_stop_filter_init, -1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::StemFilter
+ *
+ * == Summary
+ *
+ * A StemFilter takes a term and transforms the term as per the Snowball
+ * stemming algorithm. Note: the input to the stemming filter must already
+ * be in lower case, so you will need to use LowerCaseFilter or
+ * LowerCaseTokenizer further down the Tokenizer chain in order for this to
+ * work properly!
+ *
+ * To use this filter with other analyzers, you'll want to write an Analyzer
+ * class that sets up the TokenStream chain as you want it. To use this with
+ * LowerCaseTokenizer, for example, you'd write an analyzer like the one in
+ * the example below.
+ *
+ * === Available algorithms and encodings
+ *
+ * Algorithm Algorithm Pseudonyms Encoding
+ * ----------------------------------------------------------------
+ * "danish", | "da", "dan" | "ISO_8859_1", "UTF_8"
+ * "dutch", | "dut", "nld" | "ISO_8859_1", "UTF_8"
+ * "english", | "en", "eng" | "ISO_8859_1", "UTF_8"
+ * "finnish", | "fi", "fin" | "ISO_8859_1", "UTF_8"
+ * "french", | "fr", "fra", "fre" | "ISO_8859_1", "UTF_8"
+ * "german", | "de", "deu", "ge", "ger" | "ISO_8859_1", "UTF_8"
+ * "italian", | "it", "ita" | "ISO_8859_1", "UTF_8"
+ * "norwegian", | "nl", "no" | "ISO_8859_1", "UTF_8"
+ * "porter", | | "ISO_8859_1", "UTF_8"
+ * "portuguese", | "por", "pt" | "ISO_8859_1", "UTF_8"
+ * "russian", | "ru", "rus" | "KOI8_R", "UTF_8"
+ * "spanish", | "es", "esl" | "ISO_8859_1", "UTF_8"
+ * "swedish", | "sv", "swe" | "ISO_8859_1", "UTF_8"
+ *
+ * === Example
+ *
+ * class MyAnalyzer < Analyzer
+ * def token_stream(field, str)
+ * return StemFilter.new(LowerCaseFilter.new(StandardTokenizer.new(str)))
+ * end
+ * end
+ *
+ * "debate debates debated debating debater"
+ * => ["debat", "debat", "debat", "debat", "debat"]
+ *
+ * === Attributes
+ *
+ * token_stream:: TokenStream to be filtered
+ * algorithm:: The algorithm (or language) to use (default: "english")
+ * encoding:: The encoding of the data (default: "UTF-8")
+ */
+ static void Init_StemFilter(void)
+ {
+ cStemFilter =
+ rb_define_class_under(mAnalysis, "StemFilter", cTokenStream);
+ rb_define_alloc_func(cStemFilter, frt_data_alloc);
+ rb_define_method(cStemFilter, "initialize",
+ frt_stem_filter_init, -1);
+ }
+
+ /*************************/
+ /*** * * Analyzers * * ***/
+ /*************************/
+
+ /*
+ * Document-class: Ferret::Analysis::Analyzer
+ *
+ * == Summary
+ *
+ * An Analyzer builds TokenStreams, which analyze text. It thus represents
+ * a policy for extracting index terms from text.
+ *
+ * Typical implementations first build a Tokenizer, which breaks the stream
+ * of characters from the Reader into raw Tokens. One or more TokenFilters
+ * may then be applied to the output of the Tokenizer.
+ *
+ * The default Analyzer just creates a LowerCaseTokenizer which converts
+ * all text to lowercase tokens. See LowerCaseTokenizer for more details.
+ *
+ * === Example
+ *
+ * To create your own custom Analyzer you simply need to implement a
+ * token_stream method which takes the field name and the data to be
+ * tokenized as parameters and returns a TokenStream. Most analyzers
+ * typically ignore the field name.
+ *
+ * Here we'll create a StemmingAnalyzer;
+ *
+ * class MyAnalyzer < Analyzer
+ * def token_stream(field, str)
+ * return StemFilter.new(LowerCaseFilter.new(StandardTokenizer.new(str)))
+ * end
+ * end
+ */
+ static void Init_Analyzer(void)
+ {
+ cAnalyzer =
+ rb_define_class_under(mAnalysis, "Analyzer", rb_cObject);
+ rb_define_alloc_func(cAnalyzer, frt_data_alloc);
+ rb_define_method(cAnalyzer, "initialize", frt_letter_analyzer_init, -1);
+ rb_define_method(cAnalyzer, "token_stream", frt_analyzer_token_stream, 2);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::AsciiLetterAnalyzer
+ *
+ * == Summary
+ *
+ * An AsciiLetterAnalyzer creates a TokenStream that splits the input up into
+ * maximal strings of Ascii letters. If implemented in Ruby it would look
+ * like;
+ *
+ * class AsciiLetterAnalyzer
+ * def initialize(lower = true)
+ * @lower = lower
+ * end
+ *
+ * def token_stream(field, str)
+ * if @lower
+ * return AsciiLowerCaseFilter.new(AsciiLetterTokenizer.new(str))
+ * else
+ * return AsciiLetterTokenizer.new(str)
+ * end
+ * end
+ * end
+ *
+ * As you can see it makes use of the AsciiLetterTokenizer and
+ * AsciiLowerCaseFilter. Note that this analyzer won't recognize non-ascii
+ * characters, so you should use the LetterAnalyzer if you want to analyze
+ * multi-byte data like "UTF-8".
+ */
+ static void Init_AsciiLetterAnalyzer(void)
+ {
+ cAsciiLetterAnalyzer =
+ rb_define_class_under(mAnalysis, "AsciiLetterAnalyzer", cAnalyzer);
+ rb_define_alloc_func(cAsciiLetterAnalyzer, frt_data_alloc);
+ rb_define_method(cAsciiLetterAnalyzer, "initialize",
+ frt_a_letter_analyzer_init, -1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::LetterAnalyzer
+ *
+ * == Summary
+ *
+ * A LetterAnalyzer creates a TokenStream that splits the input up into
+ * maximal strings of characters as recognized by the current locale. If
+ * implemented in Ruby it would look like;
+ *
+ * class LetterAnalyzer
+ * def initialize(lower = true)
+ * @lower = lower
+ * end
+ *
+ * def token_stream(field, str)
+ * return LetterTokenizer.new(str, @lower)
+ * end
+ * end
+ *
+ * As you can see it makes use of the LetterTokenizer.
+ */
+ static void Init_LetterAnalyzer(void)
+ {
+ cLetterAnalyzer =
+ rb_define_class_under(mAnalysis, "LetterAnalyzer", cAnalyzer);
+ rb_define_alloc_func(cLetterAnalyzer, frt_data_alloc);
+ rb_define_method(cLetterAnalyzer, "initialize",
+ frt_letter_analyzer_init, -1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::AsciiWhiteSpaceAnalyzer
+ *
+ * == Summary
+ *
+ * The AsciiWhiteSpaceAnalyzer recognizes tokens as maximal strings of
+ * non-whitespace characters. If implemented in Ruby the
+ * AsciiWhiteSpaceAnalyzer would look like;
+ *
+ * class AsciiWhiteSpaceAnalyzer
+ * def initialize(lower = true)
+ * @lower = lower
+ * end
+ *
+ * def token_stream(field, str)
+ * if @lower
+ * return AsciiLowerCaseFilter.new(AsciiWhiteSpaceTokenizer.new(str))
+ * else
+ * return AsciiWhiteSpaceTokenizer.new(str)
+ * end
+ * end
+ * end
+ *
+ * As you can see it makes use of the AsciiWhiteSpaceTokenizer. You should
+ * use WhiteSpaceAnalyzer if you want to recognize multibyte encodings such
+ * as "UTF-8".
+ */
+ static void Init_AsciiWhiteSpaceAnalyzer(void)
+ {
+ cAsciiWhiteSpaceAnalyzer =
+ rb_define_class_under(mAnalysis, "AsciiWhiteSpaceAnalyzer", cAnalyzer);
+ rb_define_alloc_func(cAsciiWhiteSpaceAnalyzer, frt_data_alloc);
+ rb_define_method(cAsciiWhiteSpaceAnalyzer, "initialize",
+ frt_a_white_space_analyzer_init, -1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::WhiteSpaceAnalyzer
+ *
+ * == Summary
+ *
+ * The WhiteSpaceAnalyzer recognizes tokens as maximal strings of
+ * non-whitespace characters. If implemented in Ruby the WhiteSpaceAnalyzer
+ * would look like;
+ *
+ * class WhiteSpaceAnalyzer
+ * def initialize(lower = true)
+ * @lower = lower
+ * end
+ *
+ * def token_stream(field, str)
+ * return WhiteSpaceTokenizer.new(str, @lower)
+ * end
+ * end
+ *
+ * As you can see it makes use of the WhiteSpaceTokenizer.
+ */
+ static void Init_WhiteSpaceAnalyzer(void)
+ {
+ cWhiteSpaceAnalyzer =
+ rb_define_class_under(mAnalysis, "WhiteSpaceAnalyzer", cAnalyzer);
+ rb_define_alloc_func(cWhiteSpaceAnalyzer, frt_data_alloc);
+ rb_define_method(cWhiteSpaceAnalyzer, "initialize",
+ frt_white_space_analyzer_init, -1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::AsciiStandardAnalyzer
+ *
+ * == Summary
+ *
+ * The AsciiStandardAnalyzer is the most advanced of the available
+ * ascii-analyzers. If it were implemented in Ruby it would look like this;
+ *
+ * class AsciiStandardAnalyzer
+ * def initialize(lower = true, stop_words = ENGLISH_STOP_WORDS)
+ * @lower = lower
+ * @stop_words = stop_words
+ * end
+ *
+ * def token_stream(field, str)
+ * if @lower
+ * return StopFilter.new(AsciiLowerCaseFilter.new(
+ * AsciiStandardTokenizer.new(str)), @stop_words)
+ * else
+ * return StopFilter.new(AsciiStandardTokenizer.new(str), @stop_words)
+ * end
+ * end
+ * end
+ *
+ * As you can see it makes use of the AsciiStandardTokenizer and you can also
+ * add your own list of stop-words if you wish. Note that this analyzer
+ * won't recognize non-ascii characters so you should use the
+ * StandardAnalyzer if you want to analyze multi-byte data like "UTF-8".
+ */
+ static void Init_AsciiStandardAnalyzer(void)
+ {
+ cAsciiStandardAnalyzer =
+ rb_define_class_under(mAnalysis, "AsciiStandardAnalyzer", cAnalyzer);
+ rb_define_alloc_func(cAsciiStandardAnalyzer, frt_data_alloc);
+ rb_define_method(cAsciiStandardAnalyzer, "initialize",
+ frt_a_standard_analyzer_init, -1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::StandardAnalyzer
+ *
+ * == Summary
+ *
+ * The StandardAnalyzer is the most advanced of the available analyzers. If
+ * it were implemented in Ruby it would look like this;
+ *
+ * class StandardAnalyzer
+ * def initialize(lower = true, stop_words = ENGLISH_STOP_WORDS)
+ * @lower = lower
+ * @stop_words = stop_words
+ * end
+ *
+ * def token_stream(field, str)
+ * return StopFilter.new(StandardTokenizer.new(str, @lower), @stop_words)
+ * end
+ * end
+ *
+ * As you can see it makes use of the StandardTokenizer and you can also add
+ * your own list of stop-words if you wish.
+ */
+ static void Init_StandardAnalyzer(void)
+ {
+ cStandardAnalyzer =
+ rb_define_class_under(mAnalysis, "StandardAnalyzer", cAnalyzer);
+ rb_define_alloc_func(cStandardAnalyzer, frt_data_alloc);
+ rb_define_method(cStandardAnalyzer, "initialize",
+ frt_standard_analyzer_init, -1);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::PerFieldAnalyzer
+ *
+ * == Summary
+ *
+ * The PerFieldAnalyzer is for use when you want to analyze different fields
+ * with different analyzers. With the PerFieldAnalyzer you can specify how
+ * you want each field analyzed.
+ *
+ * === Example
+ *
+ * # Create a new PerFieldAnalyzer which uses StandardAnalyzer by default
+ * pfa = PerFieldAnalyzer.new(StandardAnalyzer.new())
+ *
+ * # Use the WhiteSpaceAnalyzer with no lowercasing on the :title field
+ * pfa[:title] = WhiteSpaceAnalyzer.new(false)
+ *
+ * # Use a custom analyzer on the :created_at field
+ * pfa[:created_at] = DateAnalyzer.new
+ */
+ static void Init_PerFieldAnalyzer(void)
+ {
+ cPerFieldAnalyzer =
+ rb_define_class_under(mAnalysis, "PerFieldAnalyzer", cAnalyzer);
+ rb_define_alloc_func(cPerFieldAnalyzer, frt_data_alloc);
+ rb_define_method(cPerFieldAnalyzer, "initialize",
+ frt_per_field_analyzer_init, 1);
+ rb_define_method(cPerFieldAnalyzer, "add_field",
+ frt_per_field_analyzer_add_field, 2);
+ rb_define_method(cPerFieldAnalyzer, "[]=",
+ frt_per_field_analyzer_add_field, 2);
+ }
+
+ /*
+ * Document-class: Ferret::Analysis::RegExpAnalyzer
+ *
+ * == Summary
+ *
+ * Using a RegExpAnalyzer is a simple way to create a custom analyzer. If
+ * implemented in Ruby it would look like this;
+ *
+ * class RegExpAnalyzer
+ * def initialize(reg_exp, lower = true)
+ * @lower = lower
+ * @reg_exp = reg_exp
+ * end
+ *
+ * def token_stream(field, str)
+ * if @lower
+ * return LowerCaseFilter.new(RegExpTokenizer.new(str, @reg_exp))
+ * else
+ * return RegExpTokenizer.new(str, @reg_exp)
+ * end
+ * end
+ * end
+ *
+ * === Example
+ *
+ * csv_analyzer = RegExpAnalyzer.new(/[^,]+/, false)
+ */
+ static void Init_RegExpAnalyzer(void)
+ {
+ cRegExpAnalyzer =
+ rb_define_class_under(mAnalysis, "RegExpAnalyzer", cAnalyzer);
+ rb_define_alloc_func(cRegExpAnalyzer, frt_data_alloc);
+ rb_define_method(cRegExpAnalyzer, "initialize",
+ frt_re_analyzer_init, -1);
+ }
+
+ /* rdoc hack
+ extern VALUE mFerret = rb_define_module("Ferret");
+ */
+
+ /*
+ * Document-module: Ferret::Analysis
+ *
+ * == Summary
+ *
+ * The Analysis module contains all the classes used to analyze and tokenize
+ * the data to be indexed. There are three main classes you need to know
+ * about when dealing with analysis: Analyzer, TokenStream and Token.
+ *
+ * == Classes
+ *
+ * === Analyzer
+ *
+ * Analyzers handle all of your tokenizing needs. You pass an Analyzer to the
+ * indexing class when you create it and it will create the TokenStreams
+ * necessary to tokenize the fields in the documents. Most of the time you
+ * won't need to worry about TokenStreams and Tokens; one of the Analyzers
+ * distributed with Ferret will do exactly what you need. Otherwise you'll
+ * need to implement a custom analyzer.
+ *
+ * === TokenStream
+ *
+ * A TokenStream is an enumeration of Tokens. There are two standard types of
+ * TokenStream: Tokenizer and TokenFilter. A Tokenizer takes a String and
+ * turns it into a list of Tokens. A TokenFilter takes another TokenStream
+ * and post-processes the Tokens. You can chain as many TokenFilters together
+ * as you like but they always need to finish with a Tokenizer.
+ *
+ * === Token
+ *
+ * A Token is a single term from a document field. A token contains the text
+ * representing the term as well as the start and end offset of the token.
+ * The start and end offset will represent the token as it appears in the
+ * source field. Some TokenFilters may change the text in the Token but the
+ * start and end offsets should stay the same so (end - start) won't
+ * necessarily be equal to the length of text in the token. For example using
+ * a stemming TokenFilter the term "Beginning" might have start and end
+ * offsets of 10 and 19 respectively ("Beginning".length == 9) but Token#text
+ * might be "begin" (after stemming).
+ */
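Tying the three classes together, a hedged end-to-end sketch of a custom Analyzer whose TokenFilter chain terminates in a Tokenizer, as described above (the class name is invented):

    require 'ferret'
    include Ferret::Analysis

    class StemmedAnalyzer < Analyzer  # hypothetical example class
      def token_stream(field, str)
        StemFilter.new(StopFilter.new(LowerCaseFilter.new(
            StandardTokenizer.new(str))))
      end
    end

    ts = StemmedAnalyzer.new.token_stream("field", "The Beginning")
    while token = ts.next
      puts token.text  # => "begin"
    end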
933
2106
  void
934
- Init_analysis(void)
935
- {
936
- /* TokenStream Methods */
937
- id_next = rb_intern("next");
938
- id_reset = rb_intern("text=");
939
- id_clone = rb_intern("clone");
940
-
941
- /* Analyzer Methods */
942
- id_token_stream = rb_intern("token_stream");
943
-
944
- object_space = rb_hash_new();
945
- rb_define_const(mFerret, "OBJECT_SPACE", object_space);
946
-
947
- /*** * * Locale stuff * * ***/
948
- frt_locale = setlocale(LC_ALL, "");
949
- rb_define_singleton_method(mFerret, "locale=", frt_setlocale, 1);
950
- rb_define_singleton_method(mFerret, "locale", frt_getlocale, 0);
951
-
952
- /*********************/
953
- /*** * * Token * * ***/
954
- /*********************/
955
- cToken = rb_define_class_under(mAnalysis, "Token", rb_cObject);
956
- rb_define_alloc_func(cToken, frt_token_alloc);
957
- rb_include_module(cToken, rb_mComparable);
958
-
959
- rb_define_method(cToken, "initialize", frt_token_init, -1);
960
- rb_define_method(cToken, "<=>", frt_token_cmp, 1);
961
- rb_define_method(cToken, "text", frt_token_get_text, 0);
962
- rb_define_method(cToken, "text=", frt_token_set_text, 1);
963
- rb_define_method(cToken, "start_offset", frt_token_get_start_offset, 0);
964
- rb_define_method(cToken, "end_offset", frt_token_get_end_offset, 0);
965
- rb_define_method(cToken, "pos_inc", frt_token_get_pos_inc, 0);
966
- rb_define_method(cToken, "to_s", frt_token_to_s, 0);
967
-
968
- /****************************/
969
- /*** * * TokenStreams * * ***/
970
- /****************************/
971
-
972
- cTokenStream = rb_define_class_under(mAnalysis, "TokenStream", rb_cObject);
973
- rb_define_method(cTokenStream, "next", frt_ts_next, 0);
974
- rb_define_method(cTokenStream, "text=", frt_ts_set_text, 1);
975
- rb_define_method(cTokenStream, "text", frt_ts_get_text, 0);
976
-
977
- /******************/
978
- /*** Tokenizers ***/
979
- /******************/
980
-
981
- /*** * * AsciiLetterTokenizer * * ***/
982
- cAsciiLetterTokenizer =
983
- rb_define_class_under(mAnalysis, "AsciiLetterTokenizer", cTokenStream);
984
- rb_define_alloc_func(cAsciiLetterTokenizer, frt_data_alloc);
985
- rb_define_method(cAsciiLetterTokenizer, "initialize",
986
- frt_a_letter_tokenizer_init, 1);
987
-
988
- /*** * * LetterTokenizer * * ***/
989
- cLetterTokenizer =
990
- rb_define_class_under(mAnalysis, "LetterTokenizer", cTokenStream);
991
- rb_define_alloc_func(cLetterTokenizer, frt_data_alloc);
992
- rb_define_method(cLetterTokenizer, "initialize",
993
- frt_letter_tokenizer_init, -1);
994
-
995
- /*** * * AsciiWhiteSpaceTokenizer * * ***/
996
- cAsciiWhiteSpaceTokenizer =
997
- rb_define_class_under(mAnalysis, "AsciiWhiteSpaceTokenizer", cTokenStream);
998
- rb_define_alloc_func(cAsciiWhiteSpaceTokenizer, frt_data_alloc);
999
- rb_define_method(cAsciiWhiteSpaceTokenizer, "initialize",
1000
- frt_a_whitespace_tokenizer_init, 1);
1001
-
1002
- /*** * * WhiteSpaceTokenizer * * ***/
1003
- cWhiteSpaceTokenizer =
1004
- rb_define_class_under(mAnalysis, "WhiteSpaceTokenizer", cTokenStream);
1005
- rb_define_alloc_func(cWhiteSpaceTokenizer, frt_data_alloc);
1006
- rb_define_method(cWhiteSpaceTokenizer, "initialize",
1007
- frt_whitespace_tokenizer_init, -1);
1008
-
1009
- /*** * * AsciiStandardTokenizer * * ***/
1010
- cAsciiStandardTokenizer =
1011
- rb_define_class_under(mAnalysis, "AsciiStandardTokenizer", cTokenStream);
1012
- rb_define_alloc_func(cAsciiStandardTokenizer, frt_data_alloc);
1013
- rb_define_method(cAsciiStandardTokenizer, "initialize",
1014
- frt_a_standard_tokenizer_init, 1);
1015
-
1016
- /*** * * StandardTokenizer * * ***/
1017
- cStandardTokenizer =
1018
- rb_define_class_under(mAnalysis, "StandardTokenizer", cTokenStream);
1019
- rb_define_alloc_func(cStandardTokenizer, frt_data_alloc);
1020
- rb_define_method(cStandardTokenizer, "initialize",
1021
- frt_standard_tokenizer_init, 1);
1022
-
1023
- /*** * * RegExpTokenizer * * ***/
1024
- cRegExpTokenizer =
1025
- rb_define_class_under(mAnalysis, "RegExpTokenizer", cTokenStream);
1026
- rtoken_re = rb_reg_new(token_re, strlen(token_re), 0);
1027
- rb_define_const(cRegExpTokenizer, "REGEXP", rtoken_re);
1028
- rb_define_alloc_func(cRegExpTokenizer, frt_data_alloc);
1029
- rb_define_method(cRegExpTokenizer, "initialize",
1030
- frt_rets_init, -1);
1031
- rb_define_method(cRegExpTokenizer, "next", frt_ts_next, 0);
1032
- rb_define_method(cRegExpTokenizer, "text=", frt_rets_set_text, 1);
1033
- rb_define_method(cRegExpTokenizer, "text", frt_rets_get_text, 0);
1034
-
-    /***************/
-    /*** Filters ***/
-    /***************/
-    rb_define_const(mAnalysis, "ENGLISH_STOP_WORDS",
-                    get_rstopwords(ENGLISH_STOP_WORDS));
-    rb_define_const(mAnalysis, "FULL_ENGLISH_STOP_WORDS",
-                    get_rstopwords(FULL_ENGLISH_STOP_WORDS));
-    rb_define_const(mAnalysis, "EXTENDED_ENGLISH_STOP_WORDS",
-                    get_rstopwords(EXTENDED_ENGLISH_STOP_WORDS));
-    rb_define_const(mAnalysis, "FULL_FRENCH_STOP_WORDS",
-                    get_rstopwords(FULL_FRENCH_STOP_WORDS));
-    rb_define_const(mAnalysis, "FULL_SPANISH_STOP_WORDS",
-                    get_rstopwords(FULL_SPANISH_STOP_WORDS));
-    rb_define_const(mAnalysis, "FULL_PORTUGUESE_STOP_WORDS",
-                    get_rstopwords(FULL_PORTUGUESE_STOP_WORDS));
-    rb_define_const(mAnalysis, "FULL_ITALIAN_STOP_WORDS",
-                    get_rstopwords(FULL_ITALIAN_STOP_WORDS));
-    rb_define_const(mAnalysis, "FULL_GERMAN_STOP_WORDS",
-                    get_rstopwords(FULL_GERMAN_STOP_WORDS));
-    rb_define_const(mAnalysis, "FULL_DUTCH_STOP_WORDS",
-                    get_rstopwords(FULL_DUTCH_STOP_WORDS));
-    rb_define_const(mAnalysis, "FULL_SWEDISH_STOP_WORDS",
-                    get_rstopwords(FULL_SWEDISH_STOP_WORDS));
-    rb_define_const(mAnalysis, "FULL_NORWEGIAN_STOP_WORDS",
-                    get_rstopwords(FULL_NORWEGIAN_STOP_WORDS));
-    rb_define_const(mAnalysis, "FULL_DANISH_STOP_WORDS",
-                    get_rstopwords(FULL_DANISH_STOP_WORDS));
-    rb_define_const(mAnalysis, "FULL_RUSSIAN_STOP_WORDS",
-                    get_rstopwords(FULL_RUSSIAN_STOP_WORDS));
-    rb_define_const(mAnalysis, "FULL_FINNISH_STOP_WORDS",
-                    get_rstopwords(FULL_FINNISH_STOP_WORDS));
-
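Each constant above surfaces a C stop-word table as a Ruby word list, ready to hand to the `StopFilter` registered just below. A brief sketch:

    include Ferret::Analysis

    FULL_FRENCH_STOP_WORDS.class    #=> Array -- each constant is a word list
    ts = LetterTokenizer.new("le chat noir")
    ts = StopFilter.new(ts, FULL_FRENCH_STOP_WORDS)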
-    cAsciiLowerCaseFilter =
-        rb_define_class_under(mAnalysis, "AsciiLowerCaseFilter", cTokenStream);
-    rb_define_alloc_func(cAsciiLowerCaseFilter, frt_data_alloc);
-    rb_define_method(cAsciiLowerCaseFilter, "initialize",
-                     frt_a_lowercase_filter_init, 1);
-
-    cLowerCaseFilter =
-        rb_define_class_under(mAnalysis, "LowerCaseFilter", cTokenStream);
-    rb_define_alloc_func(cLowerCaseFilter, frt_data_alloc);
-    rb_define_method(cLowerCaseFilter, "initialize",
-                     frt_lowercase_filter_init, 1);
-
-    cStopFilter =
-        rb_define_class_under(mAnalysis, "StopFilter", cTokenStream);
-    rb_define_alloc_func(cStopFilter, frt_data_alloc);
-    rb_define_method(cStopFilter, "initialize",
-                     frt_stop_filter_init, -1);
-
-    cStemFilter =
-        rb_define_class_under(mAnalysis, "StemFilter", cTokenStream);
-    rb_define_alloc_func(cStemFilter, frt_data_alloc);
-    rb_define_method(cStemFilter, "initialize",
-                     frt_stem_filter_init, -1);
-
-
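Filters are token streams that wrap other token streams, so analysis chains are built by plain nesting. A sketch, assuming the optional arguments implied by the arities:

    include Ferret::Analysis

    ts = StandardTokenizer.new("Debate Debates DEBATED DEBating Debater")
    ts = LowerCaseFilter.new(ts)                 # filters wrap and return streams
    ts = StopFilter.new(ts, ENGLISH_STOP_WORDS)
    ts = StemFilter.new(ts)                      # arity -1: stemmer options look optional
    while t = ts.next
      print t.text, " "                          # roughly: debat debat debat ...
    end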
-    /*************************/
-    /*** * * Analyzers * * ***/
-    /*************************/
-
-    /*** * * Analyzer * * ***/
-    cAnalyzer =
-        rb_define_class_under(mAnalysis, "Analyzer", rb_cObject);
-    rb_define_alloc_func(cAnalyzer, frt_data_alloc);
-    rb_define_method(cAnalyzer, "initialize", frt_letter_analyzer_init, -1);
-    rb_define_method(cAnalyzer, "token_stream", frt_analyzer_token_stream, 2);
-
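Because `token_stream` is an ordinary method with arity 2, a Ruby subclass only has to override it to define a custom analyzer. A sketch with invented field names:

    include Ferret::Analysis

    class MyAnalyzer < Analyzer
      # token_stream(field, text) is the whole Analyzer contract (arity 2 above).
      def token_stream(field, text)
        if field == :title                  # :title is an illustrative field name
          LetterTokenizer.new(text)
        else
          StopFilter.new(LowerCaseFilter.new(StandardTokenizer.new(text)))
        end
      end
    end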
-    /*** * * AsciiLetterAnalyzer * * ***/
-    cAsciiLetterAnalyzer =
-        rb_define_class_under(mAnalysis, "AsciiLetterAnalyzer", cAnalyzer);
-    rb_define_alloc_func(cAsciiLetterAnalyzer, frt_data_alloc);
-    rb_define_method(cAsciiLetterAnalyzer, "initialize",
-                     frt_a_letter_analyzer_init, -1);
-
-    /*** * * LetterAnalyzer * * ***/
-    cLetterAnalyzer =
-        rb_define_class_under(mAnalysis, "LetterAnalyzer", cAnalyzer);
-    rb_define_alloc_func(cLetterAnalyzer, frt_data_alloc);
-    rb_define_method(cLetterAnalyzer, "initialize",
-                     frt_letter_analyzer_init, -1);
-
-    /*** * * AsciiWhiteSpaceAnalyzer * * ***/
-    cAsciiWhiteSpaceAnalyzer =
-        rb_define_class_under(mAnalysis, "AsciiWhiteSpaceAnalyzer", cAnalyzer);
-    rb_define_alloc_func(cAsciiWhiteSpaceAnalyzer, frt_data_alloc);
-    rb_define_method(cAsciiWhiteSpaceAnalyzer, "initialize",
-                     frt_a_white_space_analyzer_init, -1);
-
-    /*** * * WhiteSpaceAnalyzer * * ***/
-    cWhiteSpaceAnalyzer =
-        rb_define_class_under(mAnalysis, "WhiteSpaceAnalyzer", cAnalyzer);
-    rb_define_alloc_func(cWhiteSpaceAnalyzer, frt_data_alloc);
-    rb_define_method(cWhiteSpaceAnalyzer, "initialize",
-                     frt_white_space_analyzer_init, -1);
-
-    /*** * * AsciiStandardAnalyzer * * ***/
-    cAsciiStandardAnalyzer =
-        rb_define_class_under(mAnalysis, "AsciiStandardAnalyzer", cAnalyzer);
-    rb_define_alloc_func(cAsciiStandardAnalyzer, frt_data_alloc);
-    rb_define_method(cAsciiStandardAnalyzer, "initialize",
-                     frt_a_standard_analyzer_init, -1);
-
-    /*** * * StandardAnalyzer * * ***/
-    cStandardAnalyzer =
-        rb_define_class_under(mAnalysis, "StandardAnalyzer", cAnalyzer);
-    rb_define_alloc_func(cStandardAnalyzer, frt_data_alloc);
-    rb_define_method(cStandardAnalyzer, "initialize",
-                     frt_standard_analyzer_init, -1);
-
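All of the concrete analyzers are registered with arity -1, so their constructor options appear optional. A hedged sketch of `StandardAnalyzer`:

    include Ferret::Analysis

    a  = StandardAnalyzer.new
    ts = a.token_stream(:content, "The Quick Brown Fox")   # :content is illustrative
    while t = ts.next
      puts t.text             # lower-cased terms with stop words removed
    end
    # Arity -1 suggests optional arguments, e.g. an alternative stop list
    # (an assumption from the arity alone):
    StandardAnalyzer.new(FULL_ENGLISH_STOP_WORDS)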
-    /*** * * PerFieldAnalyzer * * ***/
-    cPerFieldAnalyzer =
-        rb_define_class_under(mAnalysis, "PerFieldAnalyzer", cAnalyzer);
-    rb_define_alloc_func(cPerFieldAnalyzer, frt_data_alloc);
-    rb_define_method(cPerFieldAnalyzer, "initialize",
-                     frt_per_field_analyzer_init, 1);
-    rb_define_method(cPerFieldAnalyzer, "add_field",
-                     frt_per_field_analyzer_add_field, 2);
-    rb_define_method(cPerFieldAnalyzer, "[]=",
-                     frt_per_field_analyzer_add_field, 2);
-    rb_define_class_under(mAnalysis, "PerFieldAnalyzerWrapper", cPerFieldAnalyzer);
-
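`PerFieldAnalyzer` routes each field to its own analyzer, with `[]=` registered as sugar for `add_field` and the Lucene-style `PerFieldAnalyzerWrapper` name kept as a compatibility subclass. A sketch with invented field names:

    include Ferret::Analysis

    pfa = PerFieldAnalyzer.new(StandardAnalyzer.new)  # one argument: the default analyzer
    pfa[:code] = WhiteSpaceAnalyzer.new               # []= aliases add_field
    pfa.add_field(:name, LetterAnalyzer.new)
    pfa.token_stream(:code, "foo_bar() + baz")        # routed to the WhiteSpaceAnalyzer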
-    /*** * * RegexAnalyzer * * ***/
-    cRegExpAnalyzer =
-        rb_define_class_under(mAnalysis, "RegExpAnalyzer", cAnalyzer);
-    rb_define_alloc_func(cRegExpAnalyzer, frt_data_alloc);
-    rb_define_method(cRegExpAnalyzer, "initialize",
-                     frt_re_analyzer_init, -1);
-
-    /*
-    cRegexAnalyzer =
-        rb_define_class_under(mAnalysis, "RegexAnalyzer", cAnalyzer);
-    rb_define_alloc_func(cRegexAnalyzer, frt_data_alloc);
-    rb_define_method(cRegexAnalyzer, "initialize",
-                     frt_regex_analyzer_init, 0);
-    rb_define_method(cRegexAnalyzer, "token_stream",
-                     frt_regex_analyzer_token_stream, 2);
-    rb_define_method(cRegexAnalyzer, "setlocale",
-                     frt_regex_analyzer_setlocale, 1);
-    */
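Only the `RegExp` spelling survives; the older `RegexAnalyzer` registration is left commented out. A sketch of the surviving class, with the call shape inferred from arity -1:

    include Ferret::Analysis

    a = RegExpAnalyzer.new          # presumably uses RegExpTokenizer::REGEXP
    # Assumed from arity -1: an optional custom pattern may be passed:
    a = RegExpAnalyzer.new(/[[:alpha:]]+/)
    ts = a.token_stream(:field, "tokenize this, please")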
+Init_Analysis(void)
+{
+    mAnalysis = rb_define_module_under(mFerret, "Analysis");
+
+    /* TokenStream Methods */
+    id_next = rb_intern("next");
+    id_reset = rb_intern("text=");
+    id_clone = rb_intern("clone");
+
+    /* Analyzer Methods */
+    id_token_stream = rb_intern("token_stream");
+
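These interned IDs are what let the C core call back into Ruby-defined streams and analyzers: any object answering `next`, `text=` and `clone` can sit in an analysis chain. A hypothetical pure-Ruby filter illustrating the contract (the class and its behaviour are invented for illustration):

    include Ferret::Analysis

    # The C core drives this through the interned methods above.
    class UpcaseFilter < TokenStream
      def initialize(sub_stream)
        @sub = sub_stream
      end
      def next
        t = @sub.next
        t.text = t.text.upcase if t
        t
      end
      def text=(text)
        @sub.text = text
      end
    end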
+    object_space = rb_hash_new();
+    rb_define_const(mFerret, "OBJECT_SPACE", object_space);
+
+    /*** * * Locale stuff * * ***/
+    frt_locale = setlocale(LC_ALL, "");
+    rb_define_singleton_method(mFerret, "locale=", frt_set_locale, 1);
+    rb_define_singleton_method(mFerret, "locale", frt_get_locale, 0);
+
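The process locale is captured from the environment at extension load time and exposed through module-level accessors. A sketch:

    require 'ferret'

    Ferret.locale                  #=> whatever setlocale(LC_ALL, "") picked up
    Ferret.locale = "en_US.UTF-8"  # value is illustrative; any installed locale name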
+    rb_define_const(mAnalysis, "ENGLISH_STOP_WORDS",
+                    get_rstopwords(ENGLISH_STOP_WORDS));
+    rb_define_const(mAnalysis, "FULL_ENGLISH_STOP_WORDS",
+                    get_rstopwords(FULL_ENGLISH_STOP_WORDS));
+    rb_define_const(mAnalysis, "EXTENDED_ENGLISH_STOP_WORDS",
+                    get_rstopwords(EXTENDED_ENGLISH_STOP_WORDS));
+    rb_define_const(mAnalysis, "FULL_FRENCH_STOP_WORDS",
+                    get_rstopwords(FULL_FRENCH_STOP_WORDS));
+    rb_define_const(mAnalysis, "FULL_SPANISH_STOP_WORDS",
+                    get_rstopwords(FULL_SPANISH_STOP_WORDS));
+    rb_define_const(mAnalysis, "FULL_PORTUGUESE_STOP_WORDS",
+                    get_rstopwords(FULL_PORTUGUESE_STOP_WORDS));
+    rb_define_const(mAnalysis, "FULL_ITALIAN_STOP_WORDS",
+                    get_rstopwords(FULL_ITALIAN_STOP_WORDS));
+    rb_define_const(mAnalysis, "FULL_GERMAN_STOP_WORDS",
+                    get_rstopwords(FULL_GERMAN_STOP_WORDS));
+    rb_define_const(mAnalysis, "FULL_DUTCH_STOP_WORDS",
+                    get_rstopwords(FULL_DUTCH_STOP_WORDS));
+    rb_define_const(mAnalysis, "FULL_SWEDISH_STOP_WORDS",
+                    get_rstopwords(FULL_SWEDISH_STOP_WORDS));
+    rb_define_const(mAnalysis, "FULL_NORWEGIAN_STOP_WORDS",
+                    get_rstopwords(FULL_NORWEGIAN_STOP_WORDS));
+    rb_define_const(mAnalysis, "FULL_DANISH_STOP_WORDS",
+                    get_rstopwords(FULL_DANISH_STOP_WORDS));
+    rb_define_const(mAnalysis, "FULL_RUSSIAN_STOP_WORDS",
+                    get_rstopwords(FULL_RUSSIAN_STOP_WORDS));
+    rb_define_const(mAnalysis, "FULL_FINNISH_STOP_WORDS",
+                    get_rstopwords(FULL_FINNISH_STOP_WORDS));
+
+    Init_Token();
+    Init_TokenStream();
+
+    Init_AsciiLetterTokenizer();
+    Init_LetterTokenizer();
+
+    Init_AsciiWhiteSpaceTokenizer();
+    Init_WhiteSpaceTokenizer();
+
+    Init_AsciiStandardTokenizer();
+    Init_StandardTokenizer();
+
+    Init_RegExpTokenizer();
+
+    Init_AsciiLowerCaseFilter();
+    Init_LowerCaseFilter();
+    Init_StopFilter();
+    Init_StemFilter();
+
+    Init_Analyzer();
+    Init_AsciiLetterAnalyzer();
+    Init_LetterAnalyzer();
+    Init_AsciiWhiteSpaceAnalyzer();
+    Init_WhiteSpaceAnalyzer();
+    Init_AsciiStandardAnalyzer();
+    Init_StandardAnalyzer();
+    Init_PerFieldAnalyzer();
+    Init_RegExpAnalyzer();
 
 }