ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295) hide show
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
data/ext/q_wildcard.c CHANGED
@@ -7,152 +7,165 @@
7
7
  *
8
8
  ****************************************************************************/
9
9
 
10
- char *wcq_to_s(Query *self, char *field)
11
- {
12
- char *buffer, *bptr;
13
- Term *term = (Term *)self->data;
14
- size_t tlen = strlen(term->text);
15
- size_t flen = strlen(term->field);
16
- bptr = buffer = ALLOC_N(char, tlen + flen + 35);
17
-
18
- if (strcmp(term->field, field) != 0) {
19
- sprintf(bptr, "%s:", term->field);
20
- bptr += strlen(term->field) + 1;
21
- }
22
- sprintf(bptr, "%s", term->text);
23
- bptr = buffer + strlen(buffer);
24
- if (self->boost != 1.0) {
25
- *bptr = '^';
26
- dbl_to_s(++bptr, self->boost);
27
- }
28
-
29
- return buffer;
30
- }
10
+ #define WCQ(query) ((WildCardQuery *)(query))
31
11
 
32
- bool wc_match(char *pattern, char *text)
12
+ static char *wcq_to_s(Query *self, const char *current_field)
33
13
  {
34
- char *p = pattern, *t = text, *xt;
14
+ char *buffer, *bptr;
15
+ const char *field = WCQ(self)->field;
16
+ const char *pattern = WCQ(self)->pattern;
17
+ size_t flen = strlen(field);
18
+ size_t plen = strlen(pattern);
19
+ bptr = buffer = ALLOC_N(char, plen + flen + 35);
20
+
21
+ if (strcmp(field, current_field) != 0) {
22
+ sprintf(bptr, "%s:", field);
23
+ bptr += flen + 1;
24
+ }
25
+ sprintf(bptr, "%s", pattern);
26
+ bptr += plen;
35
27
 
36
- /* include '\0' as we need to match empty string */
37
- char *text_last = t + strlen(t);
28
+ if (self->boost != 1.0) {
29
+ *bptr = '^';
30
+ dbl_to_s(++bptr, self->boost);
31
+ }
38
32
 
39
- for (;; p++, t++) {
33
+ return buffer;
34
+ }
40
35
 
41
- /* end of text so make sure end of pattern doesn't matter */
42
- if (*t == '\0') {
43
- while (*p) {
44
- if (*p != WILD_STRING) return false;
45
- p++;
46
- }
47
- return true;
48
- }
36
+ bool wc_match(const char *pattern, const char *text)
37
+ {
38
+ const char *p = pattern, *t = text, *xt;
39
+
40
+ /* include '\0' as we need to match empty string */
41
+ const char *text_last = t + strlen(t);
42
+
43
+ for (;; p++, t++) {
44
+
45
+ /* end of text so make sure end of pattern doesn't matter */
46
+ if (*t == '\0') {
47
+ while (*p) {
48
+ if (*p != WILD_STRING) {
49
+ return false;
50
+ }
51
+ p++;
52
+ }
53
+ return true;
54
+ }
49
55
 
50
- /* If we've gone past the end of the pattern, return false. */
51
- if (*p == '\0') return false;
56
+ /* If we've gone past the end of the pattern, return false. */
57
+ if (*p == '\0') {
58
+ return false;
59
+ }
52
60
 
53
- /* Match a single character, so continue. */
54
- if (*p == WILD_CHAR) continue;
61
+ /* Match a single character, so continue. */
62
+ if (*p == WILD_CHAR) {
63
+ continue;
64
+ }
55
65
 
56
- if (*p == WILD_STRING) {
57
- // Look at the character beyond the '*'.
58
- p++;
59
- // Examine the string, starting at the last character.
60
- for (xt = text_last; xt >= t; xt--) {
61
- if (wc_match(p, xt)) return true;
62
- }
63
- return false;
66
+ if (*p == WILD_STRING) {
67
+ /* Look at the character beyond the '*'. */
68
+ p++;
69
+ /* Examine the string, starting at the last character. */
70
+ for (xt = text_last; xt >= t; xt--) {
71
+ if (wc_match(p, xt)) return true;
72
+ }
73
+ return false;
74
+ }
75
+ if (*p != *t) {
76
+ return false;
77
+ }
64
78
  }
65
- if (*p != *t)
66
- return false;
67
- }
68
79
 
69
- return false;
80
+ return false;
70
81
  }
71
82
 
72
- Query *wcq_rewrite(Query *self, IndexReader *ir)
83
+ static Query *wcq_rewrite(Query *self, IndexReader *ir)
73
84
  {
74
- Query *q;
75
- Query *tq;
76
-
77
- Term *term = (Term *)self->data;
78
- char *text = term->text;
79
- char *field = term->field;
80
- char *first_star = strchr(text, WILD_STRING);
81
- char *first_ques = strchr(text, WILD_CHAR);
82
-
83
- if (first_star == NULL && first_ques == NULL) {
84
- q = tq_create(term_clone(term));
85
- } else {
86
- TermEnum *te;
87
- Term prefix_term;
88
- char *prefix = NULL;
89
-
90
- char *pattern = (first_ques && (!first_star || (first_star > first_ques)))
91
- ? first_ques : first_star;
92
-
93
- int prefix_len = (int)(pattern - text);
94
-
95
- prefix_term.field = field;
96
- prefix_term.text = (char *)EMPTY_STRING;
97
- if (prefix_len > 0) {
98
- prefix = ALLOC_N(char, prefix_len + 1);
99
- strncpy(prefix, text, prefix_len);
100
- prefix_term.text = prefix;
101
- prefix_term.text[prefix_len] = '\0';
85
+ Query *q;
86
+ const char *field = WCQ(self)->field;
87
+ const char *pattern = WCQ(self)->pattern;
88
+ const char *first_star = strchr(pattern, WILD_STRING);
89
+ const char *first_ques = strchr(pattern, WILD_CHAR);
90
+
91
+ if (NULL == first_star && NULL == first_ques) {
92
+ q = tq_new(field, pattern);
93
+ q->boost = self->boost;
102
94
  }
103
- te = ir->terms_from(ir, &prefix_term);
104
-
105
- q = bq_create(true);
106
- if (te) {
107
- TermBuffer *tb = te->tb_curr;
108
- do {
109
- if (strcmp(tb->field, field) != 0 ||
110
- (prefix && strncmp(tb->text, prefix, prefix_len) != 0))
111
- break;
112
-
113
- if (wc_match(pattern, tb->text + prefix_len)) {
114
- tq = tq_create(term_create(tb->field, tb->text)); /* found match */
115
- tq->boost = self->boost; /* set boost */
116
- bq_add_query(q, tq, BC_SHOULD); /* add query */
95
+ else {
96
+ const int field_num = fis_get_field_num(ir->fis, field);
97
+ q = multi_tq_new_conf(field, MTQMaxTerms(self), 0.0);
98
+
99
+ if (field_num >= 0) {
100
+ TermEnum *te;
101
+ char prefix[MAX_WORD_SIZE] = "";
102
+ int prefix_len;
103
+
104
+ pattern = (first_ques && (!first_star || first_star > first_ques))
105
+ ? first_ques : first_star;
106
+
107
+ prefix_len = (int)(pattern - WCQ(self)->pattern);
108
+
109
+ if (prefix_len > 0) {
110
+ memcpy(prefix, WCQ(self)->pattern, prefix_len);
111
+ prefix[prefix_len] = '\0';
112
+ }
113
+
114
+ te = ir->terms_from(ir, field_num, prefix);
115
+
116
+ if (te != NULL) {
117
+ const char *term = te->curr_term;
118
+ const char *pat_term = term + prefix_len;
119
+ do {
120
+ if (prefix && strncmp(term, prefix, prefix_len) != 0) {
121
+ break;
122
+ }
123
+
124
+ if (wc_match(pattern, pat_term)) {
125
+ multi_tq_add_term(q, term);
126
+ }
127
+ } while (te->next(te) != NULL);
128
+ te->close(te);
129
+ }
117
130
  }
118
- } while ((tb = te->next(te)) != NULL);
119
- te->close(te);
120
131
  }
121
- free(prefix);
122
- }
123
132
 
124
- return q;
133
+ return q;
125
134
  }
126
135
 
127
136
  static void wcq_destroy(Query *self)
128
137
  {
129
- if (self->destroy_all) term_destroy((Term *)self->data);
130
- q_destroy_i(self);
138
+ free(WCQ(self)->field);
139
+ free(WCQ(self)->pattern);
140
+ q_destroy_i(self);
131
141
  }
132
142
 
133
- static uint wcq_hash(Query *self)
143
+ static ulong wcq_hash(Query *self)
134
144
  {
135
- return term_hash((Term *)self->data);
145
+ return str_hash(WCQ(self)->field) ^ str_hash(WCQ(self)->pattern);
136
146
  }
137
147
 
138
148
  static int wcq_eq(Query *self, Query *o)
139
149
  {
140
- return term_eq((Term *)self->data, (Term *)o->data);
150
+ return (strcmp(WCQ(self)->pattern, WCQ(o)->pattern) == 0)
151
+ && (strcmp(WCQ(self)->field, WCQ(o)->field) == 0);
141
152
  }
142
153
 
143
- Query *wcq_create(Term *term)
154
+ Query *wcq_new(const char *field, const char *pattern)
144
155
  {
145
- Query *self = q_create();
156
+ Query *self = q_new(WildCardQuery);
146
157
 
147
- self->data = term;
158
+ WCQ(self)->field = estrdup(field);
159
+ WCQ(self)->pattern = estrdup(pattern);
160
+ MTQMaxTerms(self) = WILD_CARD_QUERY_MAX_TERMS;
148
161
 
149
- self->type = WILD_CARD_QUERY;
150
- self->rewrite = &wcq_rewrite;
151
- self->to_s = &wcq_to_s;
152
- self->hash = &wcq_hash;
153
- self->eq = &wcq_eq;
154
- self->destroy_i = &wcq_destroy;
155
- self->create_weight_i = &q_create_weight_unsup;
162
+ self->type = WILD_CARD_QUERY;
163
+ self->rewrite = &wcq_rewrite;
164
+ self->to_s = &wcq_to_s;
165
+ self->hash = &wcq_hash;
166
+ self->eq = &wcq_eq;
167
+ self->destroy_i = &wcq_destroy;
168
+ self->create_weight_i = &q_create_weight_unsup;
156
169
 
157
- return self;
170
+ return self;
158
171
  }
data/ext/r_analysis.c CHANGED
@@ -1,7 +1,10 @@
1
1
  #include <regex.h>
2
+ #include <locale.h>
3
+ #include <st.h>
2
4
  #include "ferret.h"
3
5
  #include "analysis.h"
4
- #include "locale.h"
6
+
7
+ static VALUE mAnalysis;
5
8
 
6
9
  static VALUE cToken;
7
10
  static VALUE cAsciiLetterTokenizer;
@@ -27,7 +30,6 @@ static VALUE cStandardAnalyzer;
27
30
  static VALUE cPerFieldAnalyzer;
28
31
  static VALUE cRegExpAnalyzer;
29
32
 
30
- //static VALUE cRegexAnalyzer;
31
33
  static VALUE cTokenStream;
32
34
 
33
35
  /* TokenStream Methods */
@@ -40,9 +42,16 @@ static ID id_token_stream;
40
42
 
41
43
  static VALUE object_space;
42
44
 
43
- extern TokenStream *ts_create();
44
- extern int ruby_re_search(struct re_pattern_buffer *, const char *, int, int, int,
45
- struct re_registers *);
45
+ extern int ruby_re_search(struct re_pattern_buffer *, const char *, int, int,
46
+ int, struct re_registers *);
47
+
48
+ /*
49
+ static int
50
+ frt_rb_hash_size(VALUE hash)
51
+ {
52
+ return RHASH(hash)->tbl->num_entries;
53
+ }
54
+ */
46
55
 
47
56
  /****************************************************************************
48
57
  *
@@ -53,18 +62,18 @@ extern int ruby_re_search(struct re_pattern_buffer *, const char *, int, int, in
53
62
  static char **
54
63
  get_stopwords(VALUE rstop_words)
55
64
  {
56
- char **stop_words;
57
- int i, len;
58
- VALUE rstr;
59
- Check_Type(rstop_words, T_ARRAY);
60
- len = RARRAY(rstop_words)->len;
61
- stop_words = ALLOC_N(char *, RARRAY(rstop_words)->len + 1);
62
- stop_words[len] = NULL;
63
- for (i = 0; i < len; i++) {
64
- rstr = rb_obj_as_string(RARRAY(rstop_words)->ptr[i]);
65
- stop_words[i] = RSTRING(rstr)->ptr;
66
- }
67
- return stop_words;
65
+ char **stop_words;
66
+ int i, len;
67
+ VALUE rstr;
68
+ Check_Type(rstop_words, T_ARRAY);
69
+ len = RARRAY(rstop_words)->len;
70
+ stop_words = ALLOC_N(char *, RARRAY(rstop_words)->len + 1);
71
+ stop_words[len] = NULL;
72
+ for (i = 0; i < len; i++) {
73
+ rstr = rb_obj_as_string(RARRAY(rstop_words)->ptr[i]);
74
+ stop_words[i] = RSTRING(rstr)->ptr;
75
+ }
76
+ return stop_words;
68
77
  }
69
78
 
70
79
  /****************************************************************************
@@ -74,140 +83,295 @@ get_stopwords(VALUE rstop_words)
74
83
  ****************************************************************************/
75
84
 
76
85
  typedef struct RToken {
77
- VALUE text;
78
- int start;
79
- int end;
80
- int pos_inc;
86
+ VALUE text;
87
+ int start;
88
+ int end;
89
+ int pos_inc;
81
90
  } RToken;
82
91
 
83
92
  static void
84
93
  frt_token_free(void *p)
85
94
  {
86
- free(p);
95
+ free(p);
87
96
  }
88
-
97
+
89
98
  static void
90
99
  frt_token_mark(void *p)
91
100
  {
92
- RToken *token = (RToken *)p;
93
- rb_gc_mark(token->text);
101
+ RToken *token = (RToken *)p;
102
+ rb_gc_mark(token->text);
94
103
  }
95
104
 
96
105
  static VALUE
97
106
  frt_token_alloc(VALUE klass)
98
107
  {
99
- return Data_Wrap_Struct(klass, &frt_token_mark, &frt_token_free, ALLOC(RToken));
108
+ return Data_Wrap_Struct(klass, &frt_token_mark, &frt_token_free,
109
+ ALLOC(RToken));
100
110
  }
101
111
 
102
112
  static VALUE
103
113
  get_token(Token *tk)
104
114
  {
105
- RToken *token = ALLOC(RToken);
115
+ RToken *token = ALLOC(RToken);
106
116
 
107
- token->text = rb_str_new2(tk->text);
108
- token->start = tk->start;
109
- token->end = tk->end;
110
- token->pos_inc = tk->pos_inc;
111
- return Data_Wrap_Struct(cToken, &frt_token_mark, &frt_token_free, token);
117
+ token->text = rb_str_new2(tk->text);
118
+ token->start = tk->start;
119
+ token->end = tk->end;
120
+ token->pos_inc = tk->pos_inc;
121
+ return Data_Wrap_Struct(cToken, &frt_token_mark, &frt_token_free, token);
112
122
  }
113
123
 
114
124
  Token *
115
125
  frt_set_token(Token *tk, VALUE rt)
116
126
  {
117
- RToken *rtk;
127
+ RToken *rtk;
118
128
 
119
- if (rt == Qnil) return NULL;
129
+ if (rt == Qnil) return NULL;
120
130
 
121
- Data_Get_Struct(rt, RToken, rtk);
122
- tk_set(tk, RSTRING(rtk->text)->ptr, RSTRING(rtk->text)->len,
123
- rtk->start, rtk->end, rtk->pos_inc);
124
- return tk;
131
+ Data_Get_Struct(rt, RToken, rtk);
132
+ tk_set(tk, RSTRING(rtk->text)->ptr, RSTRING(rtk->text)->len,
133
+ rtk->start, rtk->end, rtk->pos_inc);
134
+ return tk;
125
135
  }
126
136
 
127
- #define GET_TK RToken *token = (RToken *)DATA_PTR(self)
137
+ #define GET_TK(tk, self) Data_Get_Struct(self, RToken, tk)
138
+
139
+ /*
140
+ * call-seq:
141
+ * Token.new(text, start, end, pos_inc = 1) -> new Token
142
+ *
143
+ * Creates a new token setting the text, start and end offsets of the token
144
+ * and the position increment for the token.
145
+ *
146
+ * The position increment is usually set to 1 but you can set it to other
147
+ * values as needed. For example, if you have a stop word filter you will be
148
+ * skipping tokens. Let's say you have the stop words "the" and "and" and you
149
+ * parse the title "The Old Man and the Sea". The terms "Old", "Man" and
150
+ * "Sea" will have the position incerements 2, 1 and 3 respectively.
151
+ *
152
+ * Another reason you might want to vary the position increment is if you are
153
+ * adding synonyms to the index. For example let's say you have the synonym
154
+ * group "quick", "fast" and "speedy". When tokenizing the phrase "Next day
155
+ * speedy delivery", you'll add "speedy" first with a position increment of 1
156
+ * and then "fast" and "quick" with position increments of 0 since they are
157
+ * represented in the same position.
158
+ *
159
+ * The offset set values +start+ and +end+ should be byte offsets, not
160
+ * character offsets. This makes it easy to use those offsets to quickly
161
+ * access the token in the input string and also to insert highlighting tags
162
+ * when necessary.
163
+ *
164
+ * text:: the main text for the token.
165
+ * start:: the start offset of the token in bytes.
166
+ * end:: the end offset of the token in bytes.
167
+ * pos_inc:: the position increment of a token. See above.
168
+ * return:: a newly created and assigned Token object
169
+ */
128
170
  static VALUE
129
171
  frt_token_init(int argc, VALUE *argv, VALUE self)
130
172
  {
131
- GET_TK;
132
- VALUE rtext, rstart, rend, rpos_inc, rtype;
133
- token->pos_inc = 1;
134
- switch (rb_scan_args(argc, argv, "32", &rtext, &rstart, &rend, &rpos_inc, &rtype)) {
135
- case 5: /* type gets ignored at this stage */
136
- case 4: token->pos_inc = FIX2INT(rpos_inc);
137
- }
138
- token->text = rb_obj_as_string(rtext);
139
- token->start = FIX2INT(rstart);
140
- token->end = FIX2INT(rend);
141
- return self;
173
+ RToken *token;
174
+ VALUE rtext, rstart, rend, rpos_inc, rtype;
175
+ GET_TK(token, self);
176
+ token->pos_inc = 1;
177
+ switch (rb_scan_args(argc, argv, "32", &rtext, &rstart,
178
+ &rend, &rpos_inc, &rtype)) {
179
+ case 5: /* type gets ignored at this stage */
180
+ case 4: token->pos_inc = FIX2INT(rpos_inc);
181
+ }
182
+ token->text = rb_obj_as_string(rtext);
183
+ token->start = FIX2INT(rstart);
184
+ token->end = FIX2INT(rend);
185
+ return self;
142
186
  }
143
187
 
188
+ /*
189
+ * call-seq:
190
+ * token.cmp(other_token) -> bool
191
+ *
192
+ * Used to compare two tokens. Token is extended by Comparable so you can
193
+ * also use +<+, +>+, +<=+, +>=+ etc. to compare tokens.
194
+ *
195
+ * Tokens are sorted by the position in the text at which they occur, ie
196
+ * the start offset. If two tokens have the same start offset, (see
197
+ * pos_inc=) then, they are sorted by the end offset and then
198
+ * lexically by the token text.
199
+ */
144
200
  static VALUE
145
201
  frt_token_cmp(VALUE self, VALUE rother)
146
202
  {
147
- RToken *other;
148
- int cmp;
149
- GET_TK;
150
- Data_Get_Struct(rother, RToken, other);
151
- if (token->start > other->start) {
152
- cmp = 1;
153
- } else if (token->start < other->start) {
154
- cmp = -1;
155
- } else {
156
- if (token->end > other->end) {
157
- cmp = 1;
158
- } else if (token->end < other->end) {
159
- cmp = -1;
203
+ RToken *token, *other;
204
+ int cmp;
205
+ GET_TK(token, self);
206
+ GET_TK(other, rother);
207
+ if (token->start > other->start) {
208
+ cmp = 1;
209
+ } else if (token->start < other->start) {
210
+ cmp = -1;
160
211
  } else {
161
- cmp = strcmp(RSTRING(token->text)->ptr, RSTRING(other->text)->ptr);
212
+ if (token->end > other->end) {
213
+ cmp = 1;
214
+ } else if (token->end < other->end) {
215
+ cmp = -1;
216
+ } else {
217
+ cmp = strcmp(RSTRING(token->text)->ptr, RSTRING(other->text)->ptr);
218
+ }
162
219
  }
163
- }
164
- return INT2FIX(cmp);
220
+ return INT2FIX(cmp);
165
221
  }
166
222
 
223
+ /*
224
+ * call-seq:
225
+ * token.text -> text
226
+ *
227
+ * Returns the text that this token represents
228
+ */
167
229
  static VALUE
168
230
  frt_token_get_text(VALUE self)
169
231
  {
170
- GET_TK;
171
- return token->text;
232
+ RToken *token;
233
+ GET_TK(token, self);
234
+ return token->text;
172
235
  }
173
236
 
237
+ /*
238
+ * call-seq:
239
+ * token.text = text -> text
240
+ *
241
+ * Set the text for this token.
242
+ */
174
243
  static VALUE
175
244
  frt_token_set_text(VALUE self, VALUE rtext)
176
245
  {
177
- GET_TK;
178
- token->text = rtext;
179
- return rtext;
246
+ RToken *token;
247
+ GET_TK(token, self);
248
+ token->text = rtext;
249
+ return rtext;
180
250
  }
181
251
 
252
+ /*
253
+ * call-seq:
254
+ * token.start -> integer
255
+ *
256
+ * Start byte-position of this token
257
+ */
182
258
  static VALUE
183
259
  frt_token_get_start_offset(VALUE self)
184
260
  {
185
- GET_TK;
186
- return INT2FIX(token->start);
261
+ RToken *token;
262
+ GET_TK(token, self);
263
+ return INT2FIX(token->start);
187
264
  }
188
265
 
266
+ /*
267
+ * call-seq:
268
+ * token.end -> integer
269
+ *
270
+ * End byte-position of this token
271
+ */
189
272
  static VALUE
190
273
  frt_token_get_end_offset(VALUE self)
191
274
  {
192
- GET_TK;
193
- return INT2FIX(token->end);
275
+ RToken *token;
276
+ GET_TK(token, self);
277
+ return INT2FIX(token->end);
194
278
  }
195
279
 
280
+ /*
281
+ * call-seq:
282
+ * token.pos_inc -> integer
283
+ *
284
+ * Position Increment for this token
285
+ */
196
286
  static VALUE
197
287
  frt_token_get_pos_inc(VALUE self)
198
288
  {
199
- GET_TK;
200
- return INT2FIX(token->pos_inc);
289
+ RToken *token;
290
+ GET_TK(token, self);
291
+ return INT2FIX(token->pos_inc);
201
292
  }
202
293
 
294
+ /*
295
+ * call-seq:
296
+ * token.start = start -> integer
297
+ *
298
+ * Set start byte-position of this token
299
+ */
300
+ static VALUE
301
+ frt_token_set_start_offset(VALUE self, VALUE rstart)
302
+ {
303
+ RToken *token;
304
+ GET_TK(token, self);
305
+ token->start = FIX2INT(rstart);
306
+ return rstart;
307
+ }
308
+
309
+ /*
310
+ * call-seq:
311
+ * token.end = end -> integer
312
+ *
313
+ * Set end byte-position of this token
314
+ */
315
+ static VALUE
316
+ frt_token_set_end_offset(VALUE self, VALUE rend)
317
+ {
318
+ RToken *token;
319
+ GET_TK(token, self);
320
+ token->end = FIX2INT(rend);
321
+ return rend;
322
+ }
323
+
324
+ /*
325
+ * call-seq:
326
+ * token.pos_inc = pos_inc -> integer
327
+ *
328
+ * Set the position increment. This determines the position of this token
329
+ * relative to the previous Token in a TokenStream, used in phrase
330
+ * searching.
331
+ *
332
+ * The default value is 1.
333
+ *
334
+ * Some common uses for this are:
335
+ *
336
+ * * Set it to zero to put multiple terms in the same position. This is
337
+ * useful if, e.g., a word has multiple stems. Searches for phrases
338
+ * including either stem will match. In this case, all but the first
339
+ * stem's increment should be set to zero: the increment of the first
340
+ * instance should be one. Repeating a token with an increment of zero
341
+ * can also be used to boost the scores of matches on that token.
342
+ *
343
+ * * Set it to values greater than one to inhibit exact phrase matches.
344
+ * If, for example, one does not want phrases to match across removed
345
+ * stop words, then one could build a stop word filter that removes stop
346
+ * words and also sets the increment to the number of stop words removed
347
+ * before each non-stop word. Then exact phrase queries will only match
348
+ * when the terms occur with no intervening stop words.
349
+ *
350
+ */
351
+ static VALUE
352
+ frt_token_set_pos_inc(VALUE self, VALUE rpos_inc)
353
+ {
354
+ RToken *token;
355
+ GET_TK(token, self);
356
+ token->pos_inc = FIX2INT(rpos_inc);
357
+ return rpos_inc;
358
+ }
359
+
360
+ /*
361
+ * call-seq:
362
+ * token.to_s -> token_str
363
+ *
364
+ * Return a string representation of the token
365
+ */
203
366
  static VALUE
204
367
  frt_token_to_s(VALUE self)
205
368
  {
206
- GET_TK;
207
- char *buf = alloca(RSTRING(token->text)->len + 80);
208
- sprintf(buf, "token[\"%s\":%d:%d:%d]", RSTRING(token->text)->ptr, token->start,
209
- token->end, token->pos_inc);
210
- return rb_str_new2(buf);
369
+ RToken *token;
370
+ GET_TK(token, self);
371
+ char *buf = alloca(RSTRING(token->text)->len + 80);
372
+ sprintf(buf, "token[\"%s\":%d:%d:%d]", RSTRING(token->text)->ptr, token->start,
373
+ token->end, token->pos_inc);
374
+ return rb_str_new2(buf);
211
375
  }
212
376
 
213
377
  /****************************************************************************
@@ -216,143 +380,210 @@ frt_token_to_s(VALUE self)
216
380
  *
217
381
  ****************************************************************************/
218
382
 
383
+ #define GET_TS(ts, self) Data_Get_Struct(self, TokenStream, ts)
384
+
219
385
  static void
220
386
  frt_ts_mark(void *p)
221
387
  {
222
- TokenStream *ts = (TokenStream *)p;
223
- if (ts->text) frt_gc_mark(&ts->text);
224
- if (ts->sub_ts) frt_gc_mark(&ts->sub_ts);
388
+ TokenStream *ts = (TokenStream *)p;
389
+ if (ts->text) frt_gc_mark(&ts->text);
225
390
  }
226
391
 
227
392
  static void
228
393
  frt_ts_free(TokenStream *ts)
229
394
  {
230
- if (object_get(&ts->text) != Qnil) object_del(&ts->text);
231
- if (ts->sub_ts && (object_get(&ts->sub_ts) != Qnil)) object_del(&ts->sub_ts);
232
- object_del(ts);
233
- ts_deref(ts);
395
+ if (object_get(&ts->text) != Qnil) {
396
+ object_del(&ts->text);
397
+ }
398
+ object_del(ts);
399
+ ts_deref(ts);
234
400
  }
235
401
 
402
+ static void frt_rets_free(TokenStream *ts);
403
+ static void frt_rets_mark(TokenStream *ts);
404
+ static Token *rets_next(TokenStream *ts);
405
+
236
406
  static VALUE
237
- get_token_stream(TokenStream *ts)
238
- {
239
- VALUE rts = object_get(ts);
240
- if (rts == Qnil) {
241
- rts = Data_Wrap_Struct(cTokenStream, &frt_ts_mark, &frt_ts_free, ts);
242
- object_add(ts, rts);
243
- }
244
- return rts;
407
+ get_rb_token_stream(TokenStream *ts)
408
+ {
409
+ VALUE rts = object_get(ts);
410
+ if (rts == Qnil) {
411
+ if (ts->next == &rets_next) {
412
+ rts = Data_Wrap_Struct(cTokenStream, &frt_rets_mark,
413
+ &frt_rets_free, ts);
414
+ } else {
415
+ rts = Data_Wrap_Struct(cTokenStream, &frt_ts_mark,
416
+ &frt_ts_free, ts);
417
+ }
418
+ object_add(ts, rts);
419
+ }
420
+ return rts;
245
421
  }
246
422
 
247
423
  static inline VALUE
248
424
  get_wrapped_ts(VALUE self, VALUE rstr, TokenStream *ts)
249
425
  {
250
- rstr = rb_obj_as_string(rstr);
251
- ts->reset(ts, RSTRING(rstr)->ptr);
252
- Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
253
- object_add(&ts->text, rstr);
254
- object_add(ts, self);
255
- return self;
426
+ StringValue(rstr);
427
+ ts->reset(ts, RSTRING(rstr)->ptr);
428
+ Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
429
+ object_add(&ts->text, rstr);
430
+ object_add(ts, self);
431
+ return self;
256
432
  }
257
433
 
434
+ /*
435
+ * call-seq:
436
+ * token_stream.text = text -> text
437
+ *
438
+ * Set the text attribute of the TokenStream to the text you wish to be
439
+ * tokenized. For example, you may do this;
440
+ *
441
+ * token_stream.text = File.read(file_name)
442
+ */
258
443
  static VALUE
259
444
  frt_ts_set_text(VALUE self, VALUE rtext)
260
445
  {
261
- TokenStream *ts;
262
- Data_Get_Struct(self, TokenStream, ts);
263
- rtext = rb_obj_as_string(rtext);
264
- ts->reset(ts, RSTRING(rtext)->ptr);
265
- object_set(&ts->text, rtext);
446
+ TokenStream *ts;
447
+ Data_Get_Struct(self, TokenStream, ts);
448
+ StringValue(rtext);
449
+ ts->reset(ts, RSTRING(rtext)->ptr);
450
+ object_set(&ts->text, rtext);
266
451
 
267
- return rtext;
452
+ return rtext;
268
453
  }
269
454
 
455
+ /*
456
+ * call-seq:
457
+ * token_stream.text = text -> text
458
+ *
459
+ * Return the text that the TokenStream is tokenizing
460
+ */
270
461
  static VALUE
271
462
  frt_ts_get_text(VALUE self)
272
463
  {
273
- VALUE rtext = Qnil;
274
- TokenStream *ts;
275
- Data_Get_Struct(self, TokenStream, ts);
276
- if (ts->text) {
277
- if ((rtext = object_get(&ts->text)) == Qnil) {
278
- rtext = rb_str_new2(ts->text);
279
- object_set(&ts->text, rtext);
280
- }
281
- }
282
- return rtext;
464
+ VALUE rtext = Qnil;
465
+ TokenStream *ts;
466
+ Data_Get_Struct(self, TokenStream, ts);
467
+ if (ts->text) {
468
+ if ((rtext = object_get(&ts->text)) == Qnil) {
469
+ rtext = rb_str_new2(ts->text);
470
+ object_set(&ts->text, rtext);
471
+ }
472
+ }
473
+ return rtext;
283
474
  }
284
475
 
476
+ /*
477
+ * call-seq:
478
+ * token_stream.next -> token
479
+ *
480
+ * Return the next token from the TokenStream or nil if there are no more
481
+ * tokens.
482
+ */
285
483
  static VALUE
286
484
  frt_ts_next(VALUE self)
287
485
  {
288
- TokenStream *ts = (TokenStream *)DATA_PTR(self);
289
- Token *next = ts->next(ts);
290
- if (next == NULL) {
291
- return Qnil;
292
- }
486
+ TokenStream *ts;
487
+ GET_TS(ts, self);
488
+ Token *next = ts->next(ts);
489
+ if (next == NULL) {
490
+ return Qnil;
491
+ }
293
492
 
294
- return get_token(next);
493
+ return get_token(next);
295
494
  }
296
495
 
496
+ /****************************************************************************
497
+ * TokenFilter
498
+ ****************************************************************************/
499
+
500
+ #define TkFilt(filter) ((TokenFilter *)(filter))
501
+
502
+ static void
503
+ frt_tf_mark(void *p)
504
+ {
505
+ TokenStream *ts = (TokenStream *)p;
506
+ if (TkFilt(ts)->sub_ts) {
507
+ frt_gc_mark(&TkFilt(ts)->sub_ts);
508
+ }
509
+ }
510
+
511
+ static void
512
+ frt_tf_free(TokenStream *ts)
513
+ {
514
+ if (TkFilt(ts)->sub_ts && (object_get(&TkFilt(ts)->sub_ts) != Qnil)) {
515
+ object_del(&TkFilt(ts)->sub_ts);
516
+ }
517
+ object_del(ts);
518
+ ts_deref(ts);
519
+ }
520
+
521
+
297
522
  /****************************************************************************
298
523
  * CWrappedTokenStream
299
524
  ****************************************************************************/
300
525
 
526
+ #define CachedTS(token_stream) ((CachedTokenStream *)(token_stream))
527
+ #define CWTS(token_stream) ((CWrappedTokenStream *)(token_stream))
528
+
529
+ typedef struct CWrappedTokenStream {
530
+ CachedTokenStream super;
531
+ VALUE rts;
532
+ } CWrappedTokenStream;
533
+
301
534
  static void
302
- cwrts_destroy(TokenStream *ts)
535
+ cwrts_destroy_i(TokenStream *ts)
303
536
  {
304
- rb_hash_delete(object_space, LONG2NUM((long)ts->data));
305
- free(ts->token);
306
- free(ts);
537
+ rb_hash_delete(object_space, LONG2NUM(CWTS(ts)->rts));
538
+ /*printf("rb_hash_size = %d\n", frt_rb_hash_size(object_space)); */
539
+ free(ts);
307
540
  }
308
541
 
309
542
  static Token *
310
543
  cwrts_next(TokenStream *ts)
311
544
  {
312
- VALUE rts = (VALUE)ts->data;
313
- VALUE rtoken = rb_funcall(rts, id_next, 0);
314
- return frt_set_token(ts->token, rtoken);
545
+ VALUE rtoken = rb_funcall(CWTS(ts)->rts, id_next, 0);
546
+ return frt_set_token(&(CachedTS(ts)->token), rtoken);
315
547
  }
316
548
 
317
- static void
549
+ static TokenStream *
318
550
  cwrts_reset(TokenStream *ts, char *text)
319
551
  {
320
- VALUE rts = (VALUE)ts->data;
321
- ts->t = ts->text = text;
322
- rb_funcall(rts, id_reset, 1, rb_str_new2(text));
552
+ ts->t = ts->text = text;
553
+ rb_funcall(CWTS(ts)->rts, id_reset, 1, rb_str_new2(text));
554
+ return ts;
323
555
  }
324
556
 
325
- static void
326
- cwrts_clone_i(TokenStream *orig_ts, TokenStream *new_ts)
557
+ static TokenStream *
558
+ cwrts_clone_i(TokenStream *orig_ts)
327
559
  {
328
- VALUE rorig_ts = (VALUE)orig_ts->data;
329
- new_ts->data = (void *)rb_funcall(rorig_ts, id_clone, 0);
560
+ TokenStream *new_ts = ts_clone_size(orig_ts, sizeof(CWrappedTokenStream));
561
+ CWTS(new_ts)->rts = rb_funcall(CWTS(orig_ts)->rts, id_clone, 0);
562
+ return new_ts;
330
563
  }
331
564
 
332
565
  static TokenStream *
333
566
  frt_get_cwrapped_rts(VALUE rts)
334
567
  {
335
- TokenStream *ts;
336
- switch (TYPE(rts)) {
337
- case T_DATA:
338
- Data_Get_Struct(rts, TokenStream, ts);
339
- ref(ts);
340
- break;
341
- default:
342
- ts = ALLOC(TokenStream);
343
- ts->token = ALLOC(Token);
344
- ts->data = (void *)rts;
345
- ts->next = &cwrts_next;
346
- ts->reset = &cwrts_reset;
347
- ts->clone_i = &cwrts_clone_i;
348
- ts->destroy = &cwrts_destroy;
349
- ts->sub_ts = NULL;
350
- // prevent from being garbage collected
351
- rb_hash_aset(object_space, LONG2NUM(rts), rts);
352
- ts->ref_cnt = 1;
353
- break;
354
- }
355
- return ts;
568
+ TokenStream *ts;
569
+ switch (TYPE(rts)) {
570
+ case T_DATA:
571
+ GET_TS(ts, rts);
572
+ REF(ts);
573
+ break;
574
+ default:
575
+ ts = ts_new(CWrappedTokenStream);
576
+ CWTS(ts)->rts = rts;
577
+ ts->next = &cwrts_next;
578
+ ts->reset = &cwrts_reset;
579
+ ts->clone_i = &cwrts_clone_i;
580
+ ts->destroy_i = &cwrts_destroy_i;
581
+ /* prevent from being garbage collected */
582
+ rb_hash_aset(object_space, LONG2NUM(rts), rts);
583
+ ts->ref_cnt = 1;
584
+ break;
585
+ }
586
+ return ts;
356
587
  }
357
588
 
358
589
  /****************************************************************************
@@ -364,165 +595,181 @@ frt_get_cwrapped_rts(VALUE rts)
364
595
  #define ALPHA "[-_[:alpha:]]"
365
596
  #define ALNUM "[-_[:alnum:]]"
366
597
 
367
- static char *token_re =
368
- ALPHA "+(('" ALPHA "+)+|\\.(" ALPHA "\\.)+|"
369
- "(@|\\&)\\w+([-.]\\w+)*|:\\/\\/" ALNUM "+([-.\\/]" ALNUM "+)*)?"
370
- "|\\w+(([-._]\\w+)*\\@\\w+([-.]\\w+)+"
598
+ #define RETS(token_stream) ((RegExpTokenStream *)(token_stream))
599
+
600
+ static const char *TOKEN_RE =
601
+ ALPHA "+(('" ALPHA "+)+|\\.(" ALPHA "\\.)+|"
602
+ "(@|\\&)\\w+([-.]\\w+)*|:\\/\\/" ALNUM "+([-.\\/]" ALNUM "+)*)?"
603
+ "|\\w+(([-._]\\w+)*\\@\\w+([-.]\\w+)+"
371
604
  "|" P HASDIGIT "(" P "\\w+" P HASDIGIT ")*(" P "\\w+)?"
372
605
  "|(\\.\\w+)+"
373
606
  "|"
374
- ")";
607
+ ")";
375
608
  static VALUE rtoken_re;
376
609
 
377
610
  typedef struct RegExpTokenStream {
378
- VALUE rtext;
379
- VALUE regex;
380
- VALUE proc;
381
- int curr_ind;
611
+ CachedTokenStream super;
612
+ VALUE rtext;
613
+ VALUE regex;
614
+ VALUE proc;
615
+ int curr_ind;
382
616
  } RegExpTokenStream;
383
617
 
384
618
  static void
385
- rets_destroy(TokenStream *ts)
619
+ rets_destroy_i(TokenStream *ts)
386
620
  {
387
- rb_hash_delete(object_space, LONG2NUM((long)object_get(ts)));
388
- free(ts->data);
389
- free(ts->token);
390
- free(ts);
621
+ free(ts);
391
622
  }
392
623
 
393
624
  static void
394
625
  frt_rets_free(TokenStream *ts)
395
626
  {
396
- object_del(ts);
397
- ts_deref(ts);
627
+ if (object_get(&ts->text) != Qnil) {
628
+ object_del(&ts->text);
629
+ }
630
+ object_del(ts);
631
+ ts_deref(ts);
398
632
  }
399
633
 
400
634
  static void
401
635
  frt_rets_mark(TokenStream *ts)
402
636
  {
403
- RegExpTokenStream *rets = (RegExpTokenStream *)ts->data;
404
- rb_gc_mark(rets->rtext);
405
- rb_gc_mark(rets->regex);
406
- rb_gc_mark(rets->proc);
637
+ if (ts->text) frt_gc_mark(&ts->text);
638
+ rb_gc_mark(RETS(ts)->rtext);
639
+ rb_gc_mark(RETS(ts)->regex);
640
+ rb_gc_mark(RETS(ts)->proc);
407
641
  }
408
642
 
643
+ /*
644
+ * call-seq:
645
+ * tokenizer.text = text -> text
646
+ *
647
+ * Set the text to be tokenized by the tokenizer. The tokenizer gets reset to
648
+ * tokenize the text from the beginning.
649
+ */
409
650
  static VALUE
410
651
  frt_rets_set_text(VALUE self, VALUE rtext)
411
652
  {
412
- TokenStream *ts;
413
- RegExpTokenStream *rets;
414
- Data_Get_Struct(self, TokenStream, ts);
653
+ TokenStream *ts;
654
+ GET_TS(ts, self);
655
+
656
+ StringValue(rtext);
657
+ RETS(ts)->rtext = rtext;
658
+ RETS(ts)->curr_ind = 0;
415
659
 
416
- StringValue(rtext);
417
- rets = (RegExpTokenStream *)ts->data;
418
- rets->rtext = rtext;
419
- rets->curr_ind = 0;
420
-
421
- return rtext;
660
+ return rtext;
422
661
  }
423
662
 
663
+ /*
664
+ * call-seq:
665
+ * tokenizer.text = text -> text
666
+ *
667
+ * Get the text being tokenized by the tokenizer.
668
+ */
424
669
  static VALUE
425
670
  frt_rets_get_text(VALUE self)
426
671
  {
427
- TokenStream *ts;
428
- RegExpTokenStream *rets;
429
- Data_Get_Struct(self, TokenStream, ts);
430
- rets = (RegExpTokenStream *)ts->data;
431
- return rets->rtext;
672
+ TokenStream *ts;
673
+ GET_TS(ts, self);
674
+ return RETS(ts)->rtext;
432
675
  }
433
676
 
434
677
  static Token *
435
678
  rets_next(TokenStream *ts)
436
679
  {
437
- static struct re_registers regs;
438
- int ret, beg, end;
439
- RegExpTokenStream *rets = (RegExpTokenStream *)ts->data;
440
- struct RString *rtext = RSTRING(rets->rtext);
441
- Check_Type(rets->regex, T_REGEXP);
442
- ret = ruby_re_search(RREGEXP(rets->regex)->ptr,
443
- rtext->ptr, rtext->len,
444
- rets->curr_ind, rtext->len - rets->curr_ind,
445
- &regs);
446
-
447
- if (ret == -2) rb_raise(rb_eStandardError, "regexp buffer overflow");
448
- if (ret < 0) return NULL; /* not matched */
449
-
450
- beg = regs.beg[0];
451
- rets->curr_ind = end = regs.end[0];
452
- if (NIL_P(rets->proc)) {
453
- return tk_set(ts->token, rtext->ptr + beg, end - beg, beg, end, 1);
454
- } else {
455
- VALUE rtok = rb_str_new(rtext->ptr + beg, end - beg);
456
- rtok = rb_funcall(rets->proc, id_call, 1, rtok);
457
- return tk_set(ts->token, RSTRING(rtok)->ptr, RSTRING(rtok)->len, beg, end, 1);
458
- }
680
+ static struct re_registers regs;
681
+ int ret, beg, end;
682
+ struct RString *rtext = RSTRING(RETS(ts)->rtext);
683
+ Check_Type(RETS(ts)->regex, T_REGEXP);
684
+ ret = ruby_re_search(RREGEXP(RETS(ts)->regex)->ptr,
685
+ rtext->ptr, rtext->len,
686
+ RETS(ts)->curr_ind, rtext->len - RETS(ts)->curr_ind,
687
+ &regs);
688
+
689
+ if (ret == -2) rb_raise(rb_eStandardError, "regexp buffer overflow");
690
+ if (ret < 0) return NULL; /* not matched */
691
+
692
+ beg = regs.beg[0];
693
+ RETS(ts)->curr_ind = end = regs.end[0];
694
+ if (NIL_P(RETS(ts)->proc)) {
695
+ return tk_set(&(CachedTS(ts)->token), rtext->ptr + beg, end - beg,
696
+ beg, end, 1);
697
+ } else {
698
+ VALUE rtok = rb_str_new(rtext->ptr + beg, end - beg);
699
+ rtok = rb_funcall(RETS(ts)->proc, id_call, 1, rtok);
700
+ return tk_set(&(CachedTS(ts)->token), RSTRING(rtok)->ptr,
701
+ RSTRING(rtok)->len, beg, end, 1);
702
+ }
459
703
  }
460
704
 
461
- static void
705
+ static TokenStream *
462
706
  rets_reset(TokenStream *ts, char *text)
463
707
  {
464
- RegExpTokenStream *rets = (RegExpTokenStream *)ts->data;
465
- rets->rtext = rb_str_new2(text);
466
- rets->curr_ind = 0;
708
+ RETS(ts)->rtext = rb_str_new2(text);
709
+ RETS(ts)->curr_ind = 0;
710
+ return ts;
467
711
  }
468
712
 
469
- void
470
- rets_clone_i(TokenStream *orig_ts, TokenStream *new_ts)
713
+ static TokenStream *
714
+ rets_clone_i(TokenStream *orig_ts)
471
715
  {
472
- RegExpTokenStream *new_rets = ALLOC(RegExpTokenStream);
473
- RegExpTokenStream *orig_rets = (RegExpTokenStream *)orig_ts->data;
474
- memcpy(new_rets, orig_rets, sizeof(RegExpTokenStream));
475
- new_ts->data = new_rets;
716
+ TokenStream *ts = ts_clone_size(orig_ts, sizeof(RegExpTokenStream));
717
+ return ts;
476
718
  }
477
719
 
478
720
  static TokenStream *
479
- rets_create(VALUE rtext, VALUE regex, VALUE proc)
721
+ rets_new(VALUE rtext, VALUE regex, VALUE proc)
480
722
  {
481
- RegExpTokenStream *rets;
482
- TokenStream *ts;
723
+ TokenStream *ts;
483
724
 
484
- if (rtext != Qnil) {
485
- rtext = StringValue(rtext);
486
- }
487
- ts = ts_create();
488
- ts->reset = &rets_reset;
489
- ts->next = &rets_next;
490
- ts->clone_i = &rets_clone_i;
491
- ts->destroy = &rets_destroy;
492
- ts->ref_cnt = 1;
493
-
494
- rets = ALLOC(RegExpTokenStream);
495
- rets->curr_ind = 0;
496
- rets->rtext = rtext;
497
- rets->proc = proc;
498
- if (NIL_P(regex)) {
499
- rets->regex = rtoken_re;
500
- } else {
501
- Check_Type(regex, T_REGEXP);
502
- rets->regex = regex;
503
- }
504
-
505
- ts->data = rets;
725
+ if (rtext != Qnil) {
726
+ rtext = StringValue(rtext);
727
+ }
728
+ ts = ts_new(RegExpTokenStream);
729
+ ts->reset = &rets_reset;
730
+ ts->next = &rets_next;
731
+ ts->clone_i = &rets_clone_i;
732
+ ts->destroy_i = &rets_destroy_i;
733
+
734
+ RETS(ts)->curr_ind = 0;
735
+ RETS(ts)->rtext = rtext;
736
+ RETS(ts)->proc = proc;
737
+
738
+ if (NIL_P(regex)) {
739
+ RETS(ts)->regex = rtoken_re;
740
+ } else {
741
+ Check_Type(regex, T_REGEXP);
742
+ RETS(ts)->regex = regex;
743
+ }
506
744
 
507
- return ts;
745
+ return ts;
508
746
  }
509
747
 
748
+ /*
749
+ * call-seq:
750
+ * RegExpTokenizer.new(input, /[[:alpha:]]+/)
751
+ *
752
+ * Create a new tokenizer based on a regular expression
753
+ *
754
+ * input:: text to tokenizer
755
+ * regexp:: regular expression used to recognize tokens in the input
756
+ */
510
757
  static VALUE
511
758
  frt_rets_init(int argc, VALUE *argv, VALUE self)
512
759
  {
513
- VALUE rtext, regex, proc;
514
- TokenStream *ts;
760
+ VALUE rtext, regex, proc;
761
+ TokenStream *ts;
515
762
 
516
- rb_scan_args(argc, argv, "11&", &rtext, &regex, &proc);
763
+ rb_scan_args(argc, argv, "11&", &rtext, &regex, &proc);
517
764
 
518
- ts = rets_create(rtext, regex, proc);
765
+ ts = rets_new(rtext, regex, proc);
519
766
 
520
- Frt_Wrap_Struct(self, &frt_rets_mark, &frt_rets_free, ts);
521
- object_add(ts, self);
522
- /* no need to add to object space as it is going to ruby space
523
- * rb_hash_aset(object_space, LONG2NUM((long)self), self);
524
- */
525
- return self;
767
+ Frt_Wrap_Struct(self, &frt_rets_mark, &frt_rets_free, ts);
768
+ object_add(ts, self);
769
+ /* no need to add to object space as it is going to ruby space
770
+ * rb_hash_aset(object_space, LONG2NUM((long)self), self);
771
+ */
772
+ return self;
526
773
  }
527
774
 
528
775
  /****************************************************************************
@@ -530,47 +777,92 @@ frt_rets_init(int argc, VALUE *argv, VALUE self)
530
777
  ****************************************************************************/
531
778
 
532
779
  #define TS_ARGS(dflt) \
533
- bool lower;\
534
- VALUE rlower, rstr;\
535
- rb_scan_args(argc, argv, "11", &rstr, &rlower);\
536
- lower = (argc ? RTEST(rlower) : dflt)
537
-
780
+ bool lower;\
781
+ VALUE rlower, rstr;\
782
+ rb_scan_args(argc, argv, "11", &rstr, &rlower);\
783
+ lower = (argc ? RTEST(rlower) : dflt)
784
+
785
+ /*
786
+ * call-seq:
787
+ * AsciiLetterTokenizer.new() -> tokenizer
788
+ *
789
+ * Create a new AsciiLetterTokenizer
790
+ */
538
791
  static VALUE
539
792
  frt_a_letter_tokenizer_init(VALUE self, VALUE rstr)
540
793
  {
541
- return get_wrapped_ts(self, rstr, letter_tokenizer_create());
794
+ return get_wrapped_ts(self, rstr, letter_tokenizer_new());
542
795
  }
543
796
 
797
+ /*
798
+ * call-seq:
799
+ * LetterTokenizer.new(lower = true) -> tokenizer
800
+ *
801
+ * Create a new LetterTokenizer which optionally downcases tokens. Downcasing
802
+ * is done according the the current locale.
803
+ *
804
+ * lower:: set to false if you don't wish to downcase tokens
805
+ */
544
806
  static VALUE
545
807
  frt_letter_tokenizer_init(int argc, VALUE *argv, VALUE self)
546
808
  {
547
- TS_ARGS(false);
548
- return get_wrapped_ts(self, rstr, mb_letter_tokenizer_create(lower));
809
+ TS_ARGS(false);
810
+ return get_wrapped_ts(self, rstr, mb_letter_tokenizer_new(lower));
549
811
  }
550
812
 
813
+ /*
814
+ * call-seq:
815
+ * AsciiWhiteSpaceTokenizer.new() -> tokenizer
816
+ *
817
+ * Create a new AsciiWhiteSpaceTokenizer
818
+ */
551
819
  static VALUE
552
820
  frt_a_whitespace_tokenizer_init(VALUE self, VALUE rstr)
553
821
  {
554
- return get_wrapped_ts(self, rstr, whitespace_tokenizer_create());
822
+ return get_wrapped_ts(self, rstr, whitespace_tokenizer_new());
555
823
  }
556
824
 
825
+ /*
826
+ * call-seq:
827
+ * WhiteSpaceTokenizer.new(lower = true) -> tokenizer
828
+ *
829
+ * Create a new WhiteSpaceTokenizer which optionally downcases tokens.
830
+ * Downcasing is done according the the current locale.
831
+ *
832
+ * lower:: set to false if you don't wish to downcase tokens
833
+ */
557
834
  static VALUE
558
835
  frt_whitespace_tokenizer_init(int argc, VALUE *argv, VALUE self)
559
836
  {
560
- TS_ARGS(false);
561
- return get_wrapped_ts(self, rstr, mb_whitespace_tokenizer_create(lower));
837
+ TS_ARGS(false);
838
+ return get_wrapped_ts(self, rstr, mb_whitespace_tokenizer_new(lower));
562
839
  }
563
840
 
841
+ /*
842
+ * call-seq:
843
+ * AsciiStandardTokenizer.new() -> tokenizer
844
+ *
845
+ * Create a new AsciiStandardTokenizer
846
+ */
564
847
  static VALUE
565
848
  frt_a_standard_tokenizer_init(VALUE self, VALUE rstr)
566
849
  {
567
- return get_wrapped_ts(self, rstr, standard_tokenizer_create());
850
+ return get_wrapped_ts(self, rstr, standard_tokenizer_new());
568
851
  }
569
852
 
853
+ /*
854
+ * call-seq:
855
+ * StandardTokenizer.new(lower = true) -> tokenizer
856
+ *
857
+ * Create a new StandardTokenizer which optionally downcases tokens.
858
+ * Downcasing is done according the the current locale.
859
+ *
860
+ * lower:: set to false if you don't wish to downcase tokens
861
+ */
570
862
  static VALUE
571
863
  frt_standard_tokenizer_init(VALUE self, VALUE rstr)
572
864
  {
573
- return get_wrapped_ts(self, rstr, mb_standard_tokenizer_create());
865
+ return get_wrapped_ts(self, rstr, mb_standard_tokenizer_new());
574
866
  }
575
867
 
576
868
  /****************************************************************************
@@ -578,71 +870,114 @@ frt_standard_tokenizer_init(VALUE self, VALUE rstr)
578
870
  ****************************************************************************/
579
871
 
580
872
 
873
+ /*
874
+ * call-seq:
875
+ * AsciiLowerCaseFilter.new(token_stream) -> token_stream
876
+ *
877
+ * Create an AsciiLowerCaseFilter which normalizes a token's text to
878
+ * lowercase but only for Ascii characters. For other characters use
879
+ * LowerCaseFilter.
880
+ */
581
881
  static VALUE
582
882
  frt_a_lowercase_filter_init(VALUE self, VALUE rsub_ts)
583
883
  {
584
- TokenStream *ts = frt_get_cwrapped_rts(rsub_ts);
585
- ts = lowercase_filter_create(ts);
586
- object_add(&ts->sub_ts, rsub_ts);
884
+ TokenStream *ts = frt_get_cwrapped_rts(rsub_ts);
885
+ ts = lowercase_filter_new(ts);
886
+ object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
587
887
 
588
- Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
589
- object_add(ts, self);
590
- return self;
888
+ Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
889
+ object_add(ts, self);
890
+ return self;
591
891
  }
592
892
 
893
+ /*
894
+ * call-seq:
895
+ * LowerCaseFilter.new(token_stream) -> token_stream
896
+ *
897
+ * Create a LowerCaseFilter which normalizes a token's text to
898
+ * lowercase based on the current locale.
899
+ */
593
900
  static VALUE
594
901
  frt_lowercase_filter_init(VALUE self, VALUE rsub_ts)
595
902
  {
596
- TokenStream *ts = frt_get_cwrapped_rts(rsub_ts);
597
- ts = mb_lowercase_filter_create(ts);
598
- object_add(&ts->sub_ts, rsub_ts);
903
+ TokenStream *ts = frt_get_cwrapped_rts(rsub_ts);
904
+ ts = mb_lowercase_filter_new(ts);
905
+ object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
599
906
 
600
- Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
601
- object_add(ts, self);
602
- return self;
907
+ Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
908
+ object_add(ts, self);
909
+ return self;
603
910
  }
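A sketch of chaining the two lowercasing filters onto a tokenizer, since both initializers above take a sub TokenStream; output is indicative:

  require 'ferret'
  include Ferret::Analysis

  # Locale-aware lowercasing of every token produced by the tokenizer
  ts = LowerCaseFilter.new(StandardTokenizer.new("One TWO Résumé"))
  while token = ts.next
    puts token.text
  end

  # Ascii-only variant; non-ascii characters are left untouched
  ats = AsciiLowerCaseFilter.new(AsciiLetterTokenizer.new("One TWO three"))
  puts ats.next.text   #=> "one"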
604
911
 
912
+ /*
913
+ * call-seq:
914
+ * StopFilter.new(token_stream) -> token_stream
915
+ * StopFilter.new(token_stream, ["the", "and", "it"]) -> token_stream
916
+ *
917
+ * Create a StopFilter which removes *stop-words* from a TokenStream. You can
918
+ * optionally specify the stopwords you wish to have removed.
919
+ *
920
+ * token_stream:: TokenStream to be filtered
921
+ * stop_words:: Array of *stop-words* you wish to be filtered out. This
922
+ * defaults to a list of English stop-words. The
923
+ * Ferret::Analysis module contains a number of stop-word lists.
924
+ */
605
925
  static VALUE
606
926
  frt_stop_filter_init(int argc, VALUE *argv, VALUE self)
607
927
  {
608
- VALUE rsub_ts, rstop_words;
609
- TokenStream *ts;
610
- rb_scan_args(argc, argv, "11", &rsub_ts, &rstop_words);
611
- ts = frt_get_cwrapped_rts(rsub_ts);
612
- if (rstop_words != Qnil) {
613
- char **stop_words = get_stopwords(rstop_words);
614
- ts = stop_filter_create_with_words(ts, (const char **)stop_words);
928
+ VALUE rsub_ts, rstop_words;
929
+ TokenStream *ts;
930
+ rb_scan_args(argc, argv, "11", &rsub_ts, &rstop_words);
931
+ ts = frt_get_cwrapped_rts(rsub_ts);
932
+ if (rstop_words != Qnil) {
933
+ char **stop_words = get_stopwords(rstop_words);
934
+ ts = stop_filter_new_with_words(ts, (const char **)stop_words);
615
935
 
616
- free(stop_words);
617
- } else {
618
- ts = stop_filter_create(ts);
619
- }
620
- object_add(&ts->sub_ts, rsub_ts);
936
+ free(stop_words);
937
+ } else {
938
+ ts = stop_filter_new(ts);
939
+ }
940
+ object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
621
941
 
622
- Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
623
- object_add(ts, self);
624
- return self;
942
+ Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
943
+ object_add(ts, self);
944
+ return self;
625
945
  }
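A minimal sketch of the two StopFilter forms documented above, once with the default English stop-word list and once with an explicit list; the surviving tokens shown are indicative:

  require 'ferret'
  include Ferret::Analysis

  # Default English stop-word list
  ts = StopFilter.new(StandardTokenizer.new("the pig and whistle"))

  # Explicit stop-word list
  ts = StopFilter.new(StandardTokenizer.new("the pig and whistle"),
                      ["the", "and", "it"])
  while token = ts.next
    puts token.text        # only "pig" and "whistle" should remain
  end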
626
946
 
947
+ /*
948
+ * call-seq:
949
+ * StemFilter.new(token_stream) -> token_stream
950
+ * StemFilter.new(token_stream,
951
+ * algorithm="english",
952
+ * encoding=locale-specific) -> token_stream
953
+ *
954
+ * Create a StemFilter which uses a Snowball stemmer (thank you Martin
955
+ * Porter) to stem words. You can optionally specify the algorithm (default:
956
+ * "english") and encoding (default: "UTF-8").
957
+ *
958
+ * token_stream:: TokenStream to be filtered
959
+ * algorithm:: The algorithm (or language) to use
960
+ * encoding:: The encoding of the data (default: "UTF-8")
961
+ */
627
962
  static VALUE
628
963
  frt_stem_filter_init(int argc, VALUE *argv, VALUE self)
629
964
  {
630
- VALUE rsub_ts, ralgorithm, rcharenc;
631
- char *algorithm = "english";
632
- char *charenc = NULL;
633
- TokenStream *ts;
634
- rb_scan_args(argc, argv, "12", &rsub_ts, &ralgorithm, &rcharenc);
635
- ts = frt_get_cwrapped_rts(rsub_ts);
636
- switch (argc) {
637
- case 3: charenc = RSTRING(rb_obj_as_string(rcharenc))->ptr;
638
- case 2: algorithm = RSTRING(rb_obj_as_string(ralgorithm))->ptr;
639
- }
640
- ts = stem_filter_create(ts, algorithm, charenc);
641
- object_add(&ts->sub_ts, rsub_ts);
965
+ VALUE rsub_ts, ralgorithm, rcharenc;
966
+ char *algorithm = "english";
967
+ char *charenc = NULL;
968
+ TokenStream *ts;
969
+ rb_scan_args(argc, argv, "12", &rsub_ts, &ralgorithm, &rcharenc);
970
+ ts = frt_get_cwrapped_rts(rsub_ts);
971
+ switch (argc) {
972
+ case 3: charenc = RSTRING(rb_obj_as_string(rcharenc))->ptr;
973
+ case 2: algorithm = RSTRING(rb_obj_as_string(ralgorithm))->ptr;
974
+ }
975
+ ts = stem_filter_new(ts, algorithm, charenc);
976
+ object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
642
977
 
643
- Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
644
- object_add(ts, self);
645
- return self;
978
+ Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
979
+ object_add(ts, self);
980
+ return self;
646
981
  }
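A sketch of the StemFilter constructor with its optional algorithm and encoding arguments; remember from the notes above that the input should already be lower-cased, and the stemmed output is indicative:

  require 'ferret'
  include Ferret::Analysis

  input = "debate debates debated debating debater"
  ts = StemFilter.new(LowerCaseFilter.new(StandardTokenizer.new(input)),
                      "english", "UTF-8")
  while token = ts.next
    puts token.text        # each term should stem to "debat"
  end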
647
982
 
648
983
  /****************************************************************************
@@ -655,216 +990,327 @@ frt_stem_filter_init(int argc, VALUE *argv, VALUE self)
655
990
  * CWrappedAnalyzer Methods
656
991
  ****************************************************************************/
657
992
 
993
+ #define GET_A(a, self) Data_Get_Struct(self, Analyzer, a)
994
+
995
+ #define CWA(analyzer) ((CWrappedAnalyzer *)(analyzer))
996
+ typedef struct CWrappedAnalyzer
997
+ {
998
+ Analyzer super;
999
+ VALUE ranalyzer;
1000
+ } CWrappedAnalyzer;
1001
+
658
1002
  static void
659
- cwa_destroy(Analyzer *a)
1003
+ cwa_destroy_i(Analyzer *a)
660
1004
  {
661
- rb_hash_delete(object_space, LONG2NUM((long)a->data));
662
- a_standard_destroy(a);
1005
+ rb_hash_delete(object_space, LONG2NUM(CWA(a)->ranalyzer));
1006
+ /*printf("rb_hash_size = %d\n", frt_rb_hash_size(object_space)); */
1007
+ free(a);
663
1008
  }
664
1009
 
665
1010
  static TokenStream *
666
1011
  cwa_get_ts(Analyzer *a, char *field, char *text)
667
1012
  {
668
- VALUE ranalyzer = (VALUE)a->data;
669
- VALUE rts = rb_funcall(ranalyzer, id_token_stream, 2,
670
- rb_str_new2(field), rb_str_new2(text));
671
- return frt_get_cwrapped_rts(rts);
1013
+ VALUE rts = rb_funcall(CWA(a)->ranalyzer, id_token_stream, 2,
1014
+ rb_str_new2(field), rb_str_new2(text));
1015
+ return frt_get_cwrapped_rts(rts);
672
1016
  }
673
1017
 
674
1018
  Analyzer *
675
- frt_get_cwrapped_analyzer(ranalyzer)
676
- {
677
- Analyzer *a = NULL;
678
- switch (TYPE(ranalyzer)) {
679
- case T_DATA:
680
- Data_Get_Struct(ranalyzer, Analyzer, a);
681
- ref(a);
682
- break;
683
- default:
684
- a = analyzer_create((void *)ranalyzer, NULL, &cwa_destroy, &cwa_get_ts);
685
- // prevent from being garbage collected
686
- rb_hash_aset(object_space, LONG2NUM(ranalyzer), ranalyzer);
687
- break;
688
- }
689
- return a;
1019
+ frt_get_cwrapped_analyzer(VALUE ranalyzer)
1020
+ {
1021
+ Analyzer *a = NULL;
1022
+ switch (TYPE(ranalyzer)) {
1023
+ case T_DATA:
1024
+ Data_Get_Struct(ranalyzer, Analyzer, a);
1025
+ REF(a);
1026
+ break;
1027
+ default:
1028
+ a = (Analyzer *)ecalloc(sizeof(CWrappedAnalyzer));
1029
+ a->destroy_i = &cwa_destroy_i;
1030
+ a->get_ts = &cwa_get_ts;
1031
+ a->ref_cnt = 1;
1032
+ ((CWrappedAnalyzer *)a)->ranalyzer = ranalyzer;
1033
+ /* prevent from being garbage collected */
1034
+ rb_hash_aset(object_space, LONG2NUM(ranalyzer), ranalyzer);
1035
+ break;
1036
+ }
1037
+ return a;
690
1038
  }
691
1039
 
692
1040
  static void
693
1041
  frt_analyzer_free(Analyzer *a)
694
1042
  {
695
- object_del(a);
696
- a_deref(a);
1043
+ object_del(a);
1044
+ a_deref(a);
697
1045
  }
698
1046
 
699
1047
  VALUE
700
1048
  frt_get_analyzer(Analyzer *a)
701
1049
  {
702
- VALUE self = Data_Wrap_Struct(cAnalyzer, NULL, &frt_analyzer_free, a);
703
- object_add(a, self);
704
- return self;
1050
+ VALUE self = Qnil;
1051
+ if (a) {
1052
+ self = object_get(a);
1053
+ if (self == Qnil) {
1054
+ self = Data_Wrap_Struct(cAnalyzer, NULL, &frt_analyzer_free, a);
1055
+ REF(a);
1056
+ object_add(a, self);
1057
+ }
1058
+ }
1059
+ return self;
705
1060
  }
706
1061
 
1062
+ /*
1063
+ * call-seq:
1064
+ * analyzer.token_stream(field_name, input) -> token_stream
1065
+ *
1066
+ * Create a new TokenStream to tokenize +input+. The TokenStream created may
1067
+ * also depend on the +field_name+, although this parameter is typically
1068
+ * ignored.
1069
+ *
1070
+ * field_name:: name of the field to be tokenized
1071
+ * input:: data from the field to be tokenized
1072
+ */
707
1073
  static VALUE
708
1074
  frt_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
709
1075
  {
710
- TokenStream *ts;
711
- Analyzer *a = (Analyzer *)DATA_PTR(self);
1076
+ TokenStream *ts;
1077
+ Analyzer *a;
1078
+ GET_A(a, self);
1079
+
1080
+ StringValue(rfield);
1081
+ StringValue(rstring);
712
1082
 
713
- rfield = rb_obj_as_string(rfield);
714
- rstring = rb_obj_as_string(rstring);
715
-
716
- ts = a_get_new_ts(a, RSTRING(rfield)->ptr, RSTRING(rstring)->ptr);
1083
+ ts = a_get_ts(a, RSTRING(rfield)->ptr, RSTRING(rstring)->ptr);
717
1084
 
718
- /* Make sure that there is no entry already */
719
- object_set(&ts->text, rstring);
720
- return get_token_stream(ts);
1085
+ /* Make sure that there is no entry already */
1086
+ object_set(&ts->text, rstring);
1087
+ return get_rb_token_stream(ts);
721
1088
  }
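A sketch of calling token_stream directly on an analyzer, which is what the indexer does internally for each field; any of the analyzers defined below can be used in place of StandardAnalyzer:

  require 'ferret'
  include Ferret::Analysis

  analyzer = StandardAnalyzer.new
  ts = analyzer.token_stream("title", "A Motion Picture")
  while token = ts.next
    puts token.text
  end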
722
1089
 
723
1090
  #define GET_LOWER(dflt) \
724
- bool lower;\
725
- VALUE rlower;\
726
- rb_scan_args(argc, argv, "01", &rlower);\
727
- lower = (argc ? RTEST(rlower) : dflt)
728
-
729
- /*** AsciiWhiteSpaceAnalyzer ***/
1091
+ bool lower;\
1092
+ VALUE rlower;\
1093
+ rb_scan_args(argc, argv, "01", &rlower);\
1094
+ lower = (argc ? RTEST(rlower) : dflt)
1095
+
1096
+ /*
1097
+ * call-seq:
1098
+ * AsciiWhiteSpaceAnalyzer.new(lower = true) -> analyzer
1099
+ *
1100
+ * Create a new AsciiWhiteSpaceAnalyzer which downcases tokens by default
1101
+ * but can optionally leave case as is. Lowercasing will only be done to
1102
+ * ascii characters.
1103
+ *
1104
+ * lower:: set to false if you don't want the field's tokens to be downcased
1105
+ */
730
1106
  static VALUE
731
1107
  frt_a_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
732
1108
  {
733
- Analyzer *a;
734
- GET_LOWER(false);
735
- a = whitespace_analyzer_create(lower);
736
- Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
737
- object_add(a, self);
738
- return self;
1109
+ Analyzer *a;
1110
+ GET_LOWER(false);
1111
+ a = whitespace_analyzer_new(lower);
1112
+ Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
1113
+ object_add(a, self);
1114
+ return self;
739
1115
  }
740
1116
 
741
- /*** WhiteSpaceAnalyzer ***/
1117
+ /*
1118
+ * call-seq:
1119
+ * WhiteSpaceAnalyzer.new(lower = true) -> analyzer
1120
+ *
1121
+ * Create a new WhiteSpaceAnalyzer which downcases tokens by default but can
1122
+ * optionally leave case as is. Lowercasing will be done based on the current
1123
+ * locale.
1124
+ *
1125
+ * lower:: set to false if you don't want the field's tokens to be downcased
1126
+ */
742
1127
  static VALUE
743
1128
  frt_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
744
1129
  {
745
- Analyzer *a;
746
- GET_LOWER(false);
747
- a = mb_whitespace_analyzer_create(lower);
748
- Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
749
- object_add(a, self);
750
- return self;
1130
+ Analyzer *a;
1131
+ GET_LOWER(false);
1132
+ a = mb_whitespace_analyzer_new(lower);
1133
+ Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
1134
+ object_add(a, self);
1135
+ return self;
751
1136
  }
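A small sketch contrasting the two whitespace analyzers initialized above; the lower flag is passed explicitly here and the tokens shown are indicative:

  require 'ferret'
  include Ferret::Analysis

  a1 = WhiteSpaceAnalyzer.new(true)           # lowercase tokens
  a2 = AsciiWhiteSpaceAnalyzer.new(false)     # keep the original case
  ts = a1.token_stream("content", "Dave's Résumé 1234")
  while token = ts.next
    puts token.text
  end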
752
1137
 
753
- /*** AsciiLetterAnalyzer ***/
1138
+ /*
1139
+ * call-seq:
1140
+ * AsciiLetterAnalyzer.new(lower = true) -> analyzer
1141
+ *
1142
+ * Create a new AsciiLetterAnalyzer which downcases tokens by default
1143
+ * but can optionally leave case as is. Lowercasing will only be done to
1144
+ * ascii characters.
1145
+ *
1146
+ * lower:: set to false if you don't want the field's tokens to be downcased
1147
+ */
754
1148
  static VALUE
755
1149
  frt_a_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
756
1150
  {
757
- Analyzer *a;
758
- GET_LOWER(true);
759
- a = letter_analyzer_create(lower);
760
- Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
761
- object_add(a, self);
762
- return self;
1151
+ Analyzer *a;
1152
+ GET_LOWER(true);
1153
+ a = letter_analyzer_new(lower);
1154
+ Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
1155
+ object_add(a, self);
1156
+ return self;
763
1157
  }
764
1158
 
765
- /*** LetterAnalyzer ***/
1159
+ /*
1160
+ * call-seq:
1161
+ * LetterAnalyzer.new(lower = true) -> analyzer
1162
+ *
1163
+ * Create a new LetterAnalyzer which downcases tokens by default but can
1164
+ * optionally leave case as is. Lowercasing will be done based on the current
1165
+ * locale.
1166
+ *
1167
+ * lower:: set to false if you don't want the field's tokens to be downcased
1168
+ */
766
1169
  static VALUE
767
1170
  frt_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
768
1171
  {
769
- Analyzer *a;
770
- GET_LOWER(true);
771
- a = mb_letter_analyzer_create(lower);
772
- Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
773
- object_add(a, self);
774
- return self;
1172
+ Analyzer *a;
1173
+ GET_LOWER(true);
1174
+ a = mb_letter_analyzer_new(lower);
1175
+ Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
1176
+ object_add(a, self);
1177
+ return self;
775
1178
  }
776
1179
 
777
1180
  static VALUE
778
1181
  get_rstopwords(const char **stop_words)
779
1182
  {
780
- char **w = (char **)stop_words;
781
- VALUE rstopwords = rb_ary_new();
1183
+ char **w = (char **)stop_words;
1184
+ VALUE rstopwords = rb_ary_new();
782
1185
 
783
- while (*w) {
784
- rb_ary_push(rstopwords, rb_str_new2(*w));
785
- w++;
786
- }
787
- return rstopwords;
1186
+ while (*w) {
1187
+ rb_ary_push(rstopwords, rb_str_new2(*w));
1188
+ w++;
1189
+ }
1190
+ return rstopwords;
788
1191
  }
789
1192
 
790
- /*** AsciiStandardAnalyzer ***/
1193
+ /*
1194
+ * call-seq:
1195
+ * AsciiStandardAnalyzer.new(lower = true, stop_words = ENGLISH_STOP_WORDS)
1196
+ * -> analyzer
1197
+ *
1198
+ * Create a new AsciiStandardAnalyzer which downcases tokens by default but
1199
+ * can optionally leave case as is. Lowercasing will only be done to ascii
1200
+ * characters. You can also set the list of stop-words to be used by the
1201
+ * StopFilter.
1202
+ *
1203
+ * lower:: set to false if you don't want the field's tokens to be downcased
1204
+ * stop_words:: list of stop-words to pass to the StopFilter
1205
+ */
791
1206
  static VALUE
792
1207
  frt_a_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
793
1208
  {
794
- bool lower;
795
- VALUE rlower, rstop_words;
796
- Analyzer *a;
797
- rb_scan_args(argc, argv, "02", &rlower, &rstop_words);
798
- lower = ((rlower == Qnil) ? true : RTEST(rlower));
799
- if (rstop_words != Qnil) {
800
- char **stop_words = get_stopwords(rstop_words);
801
- a = standard_analyzer_create_with_words((const char **)stop_words, lower);
802
- free(stop_words);
803
- } else {
804
- a = standard_analyzer_create(lower);
805
- }
806
- Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
807
- object_add(a, self);
808
- return self;
809
- }
810
-
811
- /*** StandardAnalyzer ***/
1209
+ bool lower;
1210
+ VALUE rlower, rstop_words;
1211
+ Analyzer *a;
1212
+ rb_scan_args(argc, argv, "02", &rlower, &rstop_words);
1213
+ lower = ((rlower == Qnil) ? true : RTEST(rlower));
1214
+ if (rstop_words != Qnil) {
1215
+ char **stop_words = get_stopwords(rstop_words);
1216
+ a = standard_analyzer_new_with_words((const char **)stop_words, lower);
1217
+ free(stop_words);
1218
+ } else {
1219
+ a = standard_analyzer_new(lower);
1220
+ }
1221
+ Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
1222
+ object_add(a, self);
1223
+ return self;
1224
+ }
1225
+
1226
+ /*
1227
+ * call-seq:
1228
+ * StandardAnalyzer.new(lower = true, stop_words = ENGLISH_STOP_WORDS)
1229
+ * -> analyzer
1230
+ *
1231
+ * Create a new StandardAnalyzer which downcases tokens by default but can
1232
+ * optionally leave case as is. Lowercasing will be done based on the current
1233
+ * locale. You can also set the list of stop-words to be used by the
1234
+ * StopFilter.
1235
+ *
1236
+ * lower:: set to false if you don't want the field's tokens to be downcased
1237
+ * stop_words:: list of stop-words to pass to the StopFilter
1238
+ */
812
1239
  static VALUE
813
1240
  frt_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
814
1241
  {
815
- bool lower;
816
- VALUE rlower, rstop_words;
817
- Analyzer *a;
818
- rb_scan_args(argc, argv, "02", &rstop_words, &rlower);
819
- lower = ((rlower == Qnil) ? true : RTEST(rlower));
820
- if (rstop_words != Qnil) {
821
- char **stop_words = get_stopwords(rstop_words);
822
- a = mb_standard_analyzer_create_with_words((const char **)stop_words, lower);
823
- free(stop_words);
824
- } else {
825
- a = mb_standard_analyzer_create(lower);
826
- }
827
- Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
828
- object_add(a, self);
829
- return self;
1242
+ bool lower;
1243
+ VALUE rlower, rstop_words;
1244
+ Analyzer *a;
1245
+ rb_scan_args(argc, argv, "02", &rstop_words, &rlower);
1246
+ lower = ((rlower == Qnil) ? true : RTEST(rlower));
1247
+ if (rstop_words != Qnil) {
1248
+ char **stop_words = get_stopwords(rstop_words);
1249
+ a = mb_standard_analyzer_new_with_words((const char **)stop_words, lower);
1250
+ free(stop_words);
1251
+ } else {
1252
+ a = mb_standard_analyzer_new(lower);
1253
+ }
1254
+ Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
1255
+ object_add(a, self);
1256
+ return self;
830
1257
  }
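A sketch of the stop-word handling shared by both standard analyzers; note that, per the rb_scan_args call above, the first optional argument to StandardAnalyzer.new is the stop-word list, and the tokens shown are indicative:

  require 'ferret'
  include Ferret::Analysis

  analyzer = StandardAnalyzer.new(["the", "and", "it"])
  ts = analyzer.token_stream("content", "The Pig and Whistle")
  while token = ts.next
    puts token.text          # "pig", "whistle"
  end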
831
1258
 
832
- void
1259
+ static void
833
1260
  frt_h_mark_values_i(void *key, void *value, void *arg)
834
1261
  {
835
- frt_gc_mark(value);
1262
+ frt_gc_mark(value);
836
1263
  }
837
1264
 
838
- void
1265
+ static void
839
1266
  frt_pfa_mark(void *p)
840
1267
  {
841
- Analyzer *a = (Analyzer *)p;
842
- PerFieldAnalyzer *pfa = (PerFieldAnalyzer *)a->data;
843
- frt_gc_mark(pfa->def);
844
- h_each(pfa->dict, &frt_h_mark_values_i, NULL);
1268
+ frt_gc_mark(PFA(p)->default_a);
1269
+ h_each(PFA(p)->dict, &frt_h_mark_values_i, NULL);
845
1270
  }
846
1271
 
847
1272
  /*** PerFieldAnalyzer ***/
848
1273
 
1274
+ /*
1275
+ * call-seq:
1276
+ * PerFieldAnalyzer.new(default_analyzer) -> analyzer
1277
+ *
1278
+ * Create a new PerFieldAnalyzer specifying the default analyzer to use on
1279
+ * all fields that aren't set specifically.
1280
+ *
1281
+ * default_analyzer:: analyzer to be used on fields that aren't otherwise
1282
+ * specified
1283
+ */
849
1284
  static VALUE
850
1285
  frt_per_field_analyzer_init(VALUE self, VALUE ranalyzer)
851
1286
  {
852
- Analyzer *def = frt_get_cwrapped_analyzer(ranalyzer);
853
- Analyzer *a = per_field_analyzer_create(def);
854
- Frt_Wrap_Struct(self, &frt_pfa_mark, &frt_analyzer_free, a);
855
- object_add(a, self);
856
- return self;
1287
+ Analyzer *def = frt_get_cwrapped_analyzer(ranalyzer);
1288
+ Analyzer *a = per_field_analyzer_new(def);
1289
+ Frt_Wrap_Struct(self, &frt_pfa_mark, &frt_analyzer_free, a);
1290
+ object_add(a, self);
1291
+ return self;
857
1292
  }
858
1293
 
1294
+ /*
1295
+ * call-seq:
1296
+ * per_field_analyzer.add_field(field_name, analyzer) -> self
1297
+ * per_field_analyzer[field_name] = analyzer -> self
1298
+ *
1299
+ * Set the analyzer to be used on field +field_name+. Note that field_name
1300
+ * should be a symbol.
1301
+ *
1302
+ * field_name:: field we wish to set the analyzer for
1303
+ * analyzer:: analyzer to be used on +field_name+
1304
+ */
859
1305
  static VALUE
860
1306
  frt_per_field_analyzer_add_field(VALUE self, VALUE rfield, VALUE ranalyzer)
861
1307
  {
862
- Analyzer *pfa, *a;
863
- Data_Get_Struct(self, Analyzer, pfa);
864
- a = frt_get_cwrapped_analyzer(ranalyzer);
1308
+ Analyzer *pfa, *a;
1309
+ Data_Get_Struct(self, Analyzer, pfa);
1310
+ a = frt_get_cwrapped_analyzer(ranalyzer);
865
1311
 
866
- pfa_add_field(pfa, StringValuePtr(rfield), a);
867
- return self;
1312
+ pfa_add_field(pfa, StringValuePtr(rfield), a);
1313
+ return self;
868
1314
  }
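A sketch of configuring per-field analysis with the two setter forms registered below ([]= is an alias for add_field); field names are given as strings here since the C code converts them with StringValuePtr:

  require 'ferret'
  include Ferret::Analysis

  pfa = PerFieldAnalyzer.new(StandardAnalyzer.new)
  pfa.add_field("title", WhiteSpaceAnalyzer.new(false))
  pfa["content"] = LetterAnalyzer.new
  ts = pfa.token_stream("title", "A Motion Picture")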
869
1315
 
870
1316
  /*** RegExpAnalyzer ***/
@@ -872,36 +1318,46 @@ frt_per_field_analyzer_add_field(VALUE self, VALUE rfield, VALUE ranalyzer)
872
1318
  static void
873
1319
  frt_re_analyzer_mark(Analyzer *a)
874
1320
  {
875
- frt_gc_mark(a->current_ts);
1321
+ frt_gc_mark(a->current_ts);
876
1322
  }
877
1323
 
878
1324
  static void
879
- re_analyzer_destroy(Analyzer *a)
1325
+ re_analyzer_destroy_i(Analyzer *a)
880
1326
  {
881
- free(a->data);
882
- a_standard_destroy(a);
1327
+ ts_deref(a->current_ts);
1328
+ free(a);
883
1329
  }
884
1330
 
1331
+ /*
1332
+ * call-seq:
1333
+ * RegExpAnalyzer.new(reg_exp, lower = true) -> analyzer
1334
+ *
1335
+ * Create a new RegExpAnalyzer which will create tokenizers based on the
1336
+ * regular expression and lowercasing if required.
1337
+ *
1338
+ * reg_exp:: the token matcher for the tokenizer to use
1339
+ * lower:: set to false if you don't want to downcase the tokens
1340
+ */
885
1341
  static VALUE
886
1342
  frt_re_analyzer_init(int argc, VALUE *argv, VALUE self)
887
1343
  {
888
- VALUE lower, rets, regex, proc;
889
- Analyzer *a;
890
- TokenStream *ts;
891
- rb_scan_args(argc, argv, "02&", &regex, &lower, &proc);
1344
+ VALUE lower, rets, regex, proc;
1345
+ Analyzer *a;
1346
+ TokenStream *ts;
1347
+ rb_scan_args(argc, argv, "02&", &regex, &lower, &proc);
892
1348
 
893
- ts = rets_create(Qnil, regex, proc);
894
- rets = Data_Wrap_Struct(cRegExpTokenizer, &frt_rets_mark, &frt_rets_free, ts);
895
- ref(ts);
896
- rb_hash_aset(object_space, LONG2NUM((long)rets), rets);
897
- object_add(ts, rets);
1349
+ ts = rets_new(Qnil, regex, proc);
1350
+ rets = Data_Wrap_Struct(cRegExpTokenizer, &frt_rets_mark, &frt_rets_free, ts);
1351
+ REF(ts);
1352
+ /* rb_hash_aset(object_space, LONG2NUM((long)rets), rets); */
1353
+ object_add(ts, rets);
898
1354
 
899
- if (lower != Qfalse) ts = mb_lowercase_filter_create(ts);
1355
+ if (lower != Qfalse) ts = mb_lowercase_filter_new(ts);
900
1356
 
901
- a = analyzer_create(NULL, ts, &re_analyzer_destroy, NULL);
902
- Frt_Wrap_Struct(self, &frt_re_analyzer_mark, &frt_analyzer_free, a);
903
- object_add(a, self);
904
- return self;
1357
+ a = analyzer_new(ts, &re_analyzer_destroy_i, NULL);
1358
+ Frt_Wrap_Struct(self, &frt_re_analyzer_mark, &frt_analyzer_free, a);
1359
+ object_add(a, self);
1360
+ return self;
905
1361
  }
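A sketch of the RegExpAnalyzer initializer above, reusing the csv_analyzer form from the class documentation further down; the tokens shown are indicative:

  require 'ferret'
  include Ferret::Analysis

  # Tokens are whatever the regular expression matches; lower = false
  csv_analyzer = RegExpAnalyzer.new(/[^,]+/, false)
  ts = csv_analyzer.token_stream("row", "one,TWO,three")
  while token = ts.next
    puts token.text          # "one", "TWO", "three"
  end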
906
1362
 
907
1363
  /****************************************************************************
@@ -912,265 +1368,818 @@ frt_re_analyzer_init(int argc, VALUE *argv, VALUE self)
912
1368
 
913
1369
  static char *frt_locale = NULL;
914
1370
 
915
- static VALUE frt_getlocale(VALUE self, VALUE locale)
1371
+ /*
1372
+ * call-seq:
1373
+ * Ferret.locale -> locale_str
1374
+ *
1375
+ * Returns a string corresponding to the locale set. For example;
1376
+ *
1377
+ * puts Ferret.locale #=> "en_US.UTF-8"
1378
+ */
1379
+ static VALUE frt_get_locale(VALUE self, VALUE locale)
916
1380
  {
917
- return (frt_locale ? rb_str_new2(frt_locale) : Qnil);
1381
+ return (frt_locale ? rb_str_new2(frt_locale) : Qnil);
918
1382
  }
919
1383
 
920
- static VALUE frt_setlocale(VALUE self, VALUE locale)
1384
+ /*
1385
+ * call-seq:
1386
+ * Ferret.locale = "en_US.UTF-8"
1387
+ *
1388
+ * Set the global locale. You should use this method to set different locales
1389
+ * when indexing documents with different encodings.
1390
+ */
1391
+ static VALUE frt_set_locale(VALUE self, VALUE locale)
921
1392
  {
922
- char *l = ((locale == Qnil) ? NULL : RSTRING(rb_obj_as_string(locale))->ptr);
923
- frt_locale = setlocale(LC_ALL, l);
924
- return frt_locale ? rb_str_new2(frt_locale) : Qnil;
1393
+ char *l = ((locale == Qnil) ? NULL : RSTRING(rb_obj_as_string(locale))->ptr);
1394
+ frt_locale = setlocale(LC_ALL, l);
1395
+ return frt_locale ? rb_str_new2(frt_locale) : Qnil;
925
1396
  }
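A short sketch of the locale accessors defined here; the exact locale string depends on the system:

  require 'ferret'

  puts Ferret.locale           #=> e.g. "en_US.UTF-8"
  Ferret.locale = "de_DE.UTF-8"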
926
1397
 
927
1398
  /****************************************************************************
928
1399
  *
929
- * Init Function
1400
+ * Init Functions
930
1401
  *
931
1402
  ****************************************************************************/
932
1403
 
1404
+ /*
1405
+ * Document-class: Ferret::Analysis::Token
1406
+ *
1407
+ * == Summary
1408
+ *
1409
+ * A Token is an occurrence of a term from the text of a field. It consists
1410
+ * of a term's text and the start and end offset of the term in the text of
1411
+ * the field;
1412
+ *
1413
+ * The start and end offsets permit applications to re-associate a token with
1414
+ * its source text, e.g., to display highlighted query terms in a document
1415
+ * browser, or to show matching text fragments in a KWIC (KeyWord In Context)
1416
+ * display, etc.
1417
+ *
1418
+ * === Attributes
1419
+ *
1420
+ * text:: the terms text which may have been modified by a Token Filter or
1421
+ * Tokenizer from the text originally found in the document
1422
+ * start:: is the position of the first character corresponding to
1423
+ * this token in the source text
1424
+ * end:: is equal to one greater than the position of the last
1425
+ * character corresponding to this token. Note that the
1426
+ * difference between @end_offset and @start_offset may not be
1427
+ * equal to @text.length(), as the term text may have been
1428
+ * altered by a stemmer or some other filter.
1429
+ */
1430
+ static void Init_Token(void)
1431
+ {
1432
+ cToken = rb_define_class_under(mAnalysis, "Token", rb_cObject);
1433
+ rb_define_alloc_func(cToken, frt_token_alloc);
1434
+ rb_include_module(cToken, rb_mComparable);
1435
+
1436
+ rb_define_method(cToken, "initialize", frt_token_init, -1);
1437
+ rb_define_method(cToken, "<=>", frt_token_cmp, 1);
1438
+ rb_define_method(cToken, "text", frt_token_get_text, 0);
1439
+ rb_define_method(cToken, "text=", frt_token_set_text, 1);
1440
+ rb_define_method(cToken, "start", frt_token_get_start_offset, 0);
1441
+ rb_define_method(cToken, "start=", frt_token_set_start_offset, 1);
1442
+ rb_define_method(cToken, "end", frt_token_get_end_offset, 0);
1443
+ rb_define_method(cToken, "end=", frt_token_set_end_offset, 1);
1444
+ rb_define_method(cToken, "pos_inc", frt_token_get_pos_inc, 0);
1445
+ rb_define_method(cToken, "pos_inc=", frt_token_set_pos_inc, 1);
1446
+ rb_define_method(cToken, "to_s", frt_token_to_s, 0);
1447
+ }
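A minimal sketch of reading the Token attributes listed above from a token stream; the offsets shown are indicative:

  require 'ferret'
  include Ferret::Analysis

  token = StandardTokenizer.new("One shot").next
  puts token.text       #=> "One"
  puts token.start      #=> 0
  puts token.end        #=> 3
  puts token.pos_inc    #=> 1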
1448
+
1449
+ /*
1450
+ * Document-class: Ferret::Analysis::TokenStream
1451
+ *
1452
+ * A TokenStream enumerates the sequence of tokens, either from
1453
+ * fields of a document or from query text.
1454
+ *
1455
+ * This is an abstract class. Concrete subclasses are:
1456
+ *
1457
+ * Tokenizer:: a TokenStream whose input is a string
1458
+ * TokenFilter:: a TokenStream whose input is another TokenStream
1459
+ */
1460
+ static void Init_TokenStream(void)
1461
+ {
1462
+ cTokenStream = rb_define_class_under(mAnalysis, "TokenStream", rb_cObject);
1463
+ rb_define_method(cTokenStream, "next", frt_ts_next, 0);
1464
+ rb_define_method(cTokenStream, "text=", frt_ts_set_text, 1);
1465
+ rb_define_method(cTokenStream, "text", frt_ts_get_text, 0);
1466
+ }
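A sketch of the basic TokenStream protocol defined here: next returns successive Tokens (nil at the end) and text= points the same stream at new input:

  require 'ferret'
  include Ferret::Analysis

  ts = WhiteSpaceTokenizer.new("one two three")
  while token = ts.next
    puts token.text
  end

  ts.text = "four five"      # reuse the same tokenizer on new text
  puts ts.next.text          #=> "four"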
1467
+
1468
+ /*
1469
+ * Document-class: Ferret::Analysis::AsciiLetterTokenizer
1470
+ *
1471
+ * An AsciiLetterTokenizer is a tokenizer that divides text at non-ascii letters.
1472
+ * That is to say, it defines tokens as maximal strings of adjacent letters,
1473
+ * as defined by the regular expression _/[A-Za-z]+/_.
1474
+ *
1475
+ * === Example
1476
+ *
1477
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1478
+ * => ["Dave", "s", "r", "sum", "at", "http", "www", "davebalmain", "com"]
1479
+ */
1480
+ static void Init_AsciiLetterTokenizer(void)
1481
+ {
1482
+ cAsciiLetterTokenizer =
1483
+ rb_define_class_under(mAnalysis, "AsciiLetterTokenizer", cTokenStream);
1484
+ rb_define_alloc_func(cAsciiLetterTokenizer, frt_data_alloc);
1485
+ rb_define_method(cAsciiLetterTokenizer, "initialize",
1486
+ frt_a_letter_tokenizer_init, 1);
1487
+ }
1488
+
1489
+ /*
1490
+ * Document-class: Ferret::Analysis::LetterTokenizer
1491
+ *
1492
+ * A LetterTokenizer is a tokenizer that divides text at non-letters. That is
1493
+ * to say, it defines tokens as maximal strings of adjacent letters, as
1494
+ * defined by the regular expression _/[[:alpha:]]+/_ where [[:alpha:]] matches
1495
+ * all characters in your current locale.
1496
+ *
1497
+ * === Example
1498
+ *
1499
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1500
+ * => ["Dave", "s", "résumé", "at", "http", "www", "davebalmain", "com"]
1501
+ */
1502
+ static void Init_LetterTokenizer(void)
1503
+ {
1504
+ cLetterTokenizer =
1505
+ rb_define_class_under(mAnalysis, "LetterTokenizer", cTokenStream);
1506
+ rb_define_alloc_func(cLetterTokenizer, frt_data_alloc);
1507
+ rb_define_method(cLetterTokenizer, "initialize",
1508
+ frt_letter_tokenizer_init, -1);
1509
+ }
1510
+
1511
+ /*
1512
+ * Document-class: Ferret::Analysis::AsciiWhiteSpaceTokenizer
1513
+ *
1514
+ * A WhiteSpaceTokenizer is a tokenizer that divides text at white-space.
1515
+ * Adjacent sequences of non-WhiteSpace characters form tokens.
1516
+ *
1517
+ * === Example
1518
+ *
1519
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1520
+ * => ["Dave's", "résumé,", "at", "http://www.davebalmain.com", "1234"]
1521
+ */
1522
+ static void Init_AsciiWhiteSpaceTokenizer(void)
1523
+ {
1524
+ cAsciiWhiteSpaceTokenizer =
1525
+ rb_define_class_under(mAnalysis, "AsciiWhiteSpaceTokenizer",
1526
+ cTokenStream);
1527
+ rb_define_alloc_func(cAsciiWhiteSpaceTokenizer, frt_data_alloc);
1528
+ rb_define_method(cAsciiWhiteSpaceTokenizer, "initialize",
1529
+ frt_a_whitespace_tokenizer_init, 1);
1530
+ }
1531
+
1532
+ /*
1533
+ * Document-class: Ferret::Analysis::WhiteSpaceTokenizer
1534
+ *
1535
+ * A WhiteSpaceTokenizer is a tokenizer that divides text at white-space.
1536
+ * Adjacent sequences of non-WhiteSpace characters form tokens.
1537
+ *
1538
+ * === Example
1539
+ *
1540
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1541
+ * => ["Dave's", "résumé,", "at", "http://www.davebalmain.com", "1234"]
1542
+ */
1543
+ static void Init_WhiteSpaceTokenizer(void)
1544
+ {
1545
+ cWhiteSpaceTokenizer =
1546
+ rb_define_class_under(mAnalysis, "WhiteSpaceTokenizer", cTokenStream);
1547
+ rb_define_alloc_func(cWhiteSpaceTokenizer, frt_data_alloc);
1548
+ rb_define_method(cWhiteSpaceTokenizer, "initialize",
1549
+ frt_whitespace_tokenizer_init, -1);
1550
+ }
1551
+
1552
+ /*
1553
+ * Document-class: Ferret::Analysis::AsciiStandardTokenizer
1554
+ *
1555
+ * The standard tokenizer is an advanced tokenizer which tokenizes most
1556
+ * words correctly as well as tokenizing things like email addresses, web
1557
+ * addresses, phone numbers, etc.
1558
+ *
1559
+ * === Example
1560
+ *
1561
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1562
+ * => ["Dave's", "r", "sum", "at", "http://www.davebalmain.com", "1234"]
1563
+ */
1564
+ static void Init_AsciiStandardTokenizer(void)
1565
+ {
1566
+ cAsciiStandardTokenizer =
1567
+ rb_define_class_under(mAnalysis, "AsciiStandardTokenizer", cTokenStream);
1568
+ rb_define_alloc_func(cAsciiStandardTokenizer, frt_data_alloc);
1569
+ rb_define_method(cAsciiStandardTokenizer, "initialize",
1570
+ frt_a_standard_tokenizer_init, 1);
1571
+ }
1572
+
1573
+ /*
1574
+ * Document-class: Ferret::Analysis::StandardTokenizer
1575
+ *
1576
+ * The standard tokenizer is an advanced tokenizer which tokenizes most
1577
+ * words correctly as well as tokenizing things like email addresses, web
1578
+ * addresses, phone numbers, etc.
1579
+ *
1580
+ * === Example
1581
+ *
1582
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1583
+ * => ["Dave's", "résumé", "at", "http://www.davebalmain.com", "1234"]
1584
+ */
1585
+ static void Init_StandardTokenizer(void)
1586
+ {
1587
+ cStandardTokenizer =
1588
+ rb_define_class_under(mAnalysis, "StandardTokenizer", cTokenStream);
1589
+ rb_define_alloc_func(cStandardTokenizer, frt_data_alloc);
1590
+ rb_define_method(cStandardTokenizer, "initialize",
1591
+ frt_standard_tokenizer_init, 1);
1592
+ }
1593
+
1594
+ /*
1595
+ * Document-class: Ferret::Analysis::RegExpTokenizer
1596
+ *
1597
+ * A tokenizer that recognizes tokens based on a regular expression passed to
1598
+ * the constructor. Most possible tokenizers can be created using this class.
1599
+ *
1600
+ * === Example
1601
+ *
1602
+ * Below is an example of a simple implementation of a LetterTokenizer using
1603
+ * a RegExpTokenizer. Basically, a token is a sequence of alphabetic
1604
+ * characters separated by one or more non-alphabetic characters.
1605
+ *
1606
+ * # of course you would add more than just é
1607
+ * RegExpTokenizer.new(input, /[[:alpha:]é]+/)
1608
+ *
1609
+ * "Dave's résumé, at http://www.davebalmain.com/ 1234"
1610
+ * => ["Dave", "s", "résumé", "at", "http", "www", "davebalmain", "com"]
1611
+ */
1612
+ static void Init_RegExpTokenizer(void)
1613
+ {
1614
+ cRegExpTokenizer =
1615
+ rb_define_class_under(mAnalysis, "RegExpTokenizer", cTokenStream);
1616
+ rtoken_re = rb_reg_new(TOKEN_RE, strlen(TOKEN_RE), 0);
1617
+ rb_define_const(cRegExpTokenizer, "REGEXP", rtoken_re);
1618
+ rb_define_alloc_func(cRegExpTokenizer, frt_data_alloc);
1619
+ rb_define_method(cRegExpTokenizer, "initialize",
1620
+ frt_rets_init, -1);
1621
+ rb_define_method(cRegExpTokenizer, "text=", frt_rets_set_text, 1);
1622
+ rb_define_method(cRegExpTokenizer, "text", frt_rets_get_text, 0);
1623
+ }
1624
+
1625
+ /***************/
1626
+ /*** Filters ***/
1627
+ /***************/
1628
+
1629
+ /*
1630
+ * Document-class: Ferret::Analysis::AsciiLowerCaseFilter
1631
+ *
1632
+ * AsciiLowerCaseFilter normalizes a token's text to lowercase but only for
1633
+ * Ascii characters. For other characters use LowerCaseFilter.
1634
+ *
1635
+ * === Example
1636
+ *
1637
+ * ["One", "TWO", "three", "RÉSUMÉ"] => ["one", "two", "three", "rÉsumÉ"]
1638
+ *
1639
+ */
1640
+ static void Init_AsciiLowerCaseFilter(void)
1641
+ {
1642
+ cAsciiLowerCaseFilter =
1643
+ rb_define_class_under(mAnalysis, "AsciiLowerCaseFilter", cTokenStream);
1644
+ rb_define_alloc_func(cAsciiLowerCaseFilter, frt_data_alloc);
1645
+ rb_define_method(cAsciiLowerCaseFilter, "initialize",
1646
+ frt_a_lowercase_filter_init, 1);
1647
+ }
1648
+
1649
+ /*
1650
+ * Document-class: Ferret::Analysis::LowerCaseFilter
1651
+ *
1652
+ * LowerCaseFilter normalizes a token's text to lowercase based on the
1653
+ * current locale.
1654
+ *
1655
+ * === Example
1656
+ *
1657
+ * ["One", "TWO", "three", "RÉSUMÉ"] => ["one", "two", "three", "résumé"]
1658
+ *
1659
+ */
1660
+ static void Init_LowerCaseFilter(void)
1661
+ {
1662
+ cLowerCaseFilter =
1663
+ rb_define_class_under(mAnalysis, "LowerCaseFilter", cTokenStream);
1664
+ rb_define_alloc_func(cLowerCaseFilter, frt_data_alloc);
1665
+ rb_define_method(cLowerCaseFilter, "initialize",
1666
+ frt_lowercase_filter_init, 1);
1667
+ }
1668
+
1669
+ /*
1670
+ * Document-class: Ferret::Analysis::StopFilter
1671
+ *
1672
+ * A StopFilter filters *stop-words* from a TokenStream. Stop-words are words
1673
+ * that you don't wish to be indexed. Usually they will be common words like
1674
+ * "the" and "and" although you can specify whichever words you want.
1675
+ *
1676
+ * === Example
1677
+ *
1678
+ * ["the", "pig", "and", "whistle"] => ["pig", "whistle"]
1679
+ */
1680
+ static void Init_StopFilter(void)
1681
+ {
1682
+ cStopFilter =
1683
+ rb_define_class_under(mAnalysis, "StopFilter", cTokenStream);
1684
+ rb_define_alloc_func(cStopFilter, frt_data_alloc);
1685
+ rb_define_method(cStopFilter, "initialize",
1686
+ frt_stop_filter_init, -1);
1687
+ }
1688
+
1689
+ /*
1690
+ * Document-class: Ferret::Analysis::StemFilter
1691
+ *
1692
+ * == Summary
1693
+ *
1694
+ * A StemFilter takes a term and transforms the term as per the SnowBall
1695
+ * stemming algorithm. Note: the input to the stemming filter must already
1696
+ * be in lower case, so you will need to use LowerCaseFilter or
1697
+ * LowerCaseTokenizer further down the Tokenizer chain in order for this to
1698
+ * work properly!
1699
+ *
1700
+ * To use this filter with other analyzers, you'll want to write an Analyzer
1701
+ * class that sets up the TokenStream chain as you want it. To use this with
1702
+ * LowerCaseTokenizer, for example, you'd write an analyzer like this:
1703
+ *
1704
+ * === Available algorithms and encodings
1705
+ *
1706
+ * Algorithm Algorithm Pseudonyms Encoding
1707
+ * ----------------------------------------------------------------
1708
+ * "danish", | "da", "dan" | "ISO_8859_1", "UTF_8"
1709
+ * "dutch", | "dut", "nld" | "ISO_8859_1", "UTF_8"
1710
+ * "english", | "en", "eng" | "ISO_8859_1", "UTF_8"
1711
+ * "finnish", | "fi", "fin" | "ISO_8859_1", "UTF_8"
1712
+ * "french", | "fr", "fra", "fre" | "ISO_8859_1", "UTF_8"
1713
+ * "german", | "de", "deu", "ge", "ger" | "ISO_8859_1", "UTF_8"
1714
+ * "italian", | "it", "ita" | "ISO_8859_1", "UTF_8"
1715
+ * "norwegian", | "nl", "no" | "ISO_8859_1", "UTF_8"
1716
+ * "porter", | | "ISO_8859_1", "UTF_8"
1717
+ * "portuguese", | "por", "pt" | "ISO_8859_1", "UTF_8"
1718
+ * "russian", | "ru", "rus" | "KOI8_R", "UTF_8"
1719
+ * "spanish", | "es", "esl" | "ISO_8859_1", "UTF_8"
1720
+ * "swedish", | "sv", "swe" | "ISO_8859_1", "UTF_8"
1721
+ *
1722
+ * === Example
1723
+ *
1724
+ * class MyAnalyzer < Analyzer
1725
+ * def token_stream(field, str)
1726
+ * return StemFilter.new(LowerCaseFilter.new(StandardTokenizer.new(str)))
1727
+ * end
1728
+ * end
1729
+ *
1730
+ * "debate debates debated debating debater"
1731
+ * => ["debat", "debat", "debat", "debat", "debat"]
1732
+ *
1733
+ * === Attributes
1734
+ *
1735
+ * token_stream:: TokenStream to be filtered
1736
+ * algorithm:: The algorithm (or language) to use (default: "english")
1737
+ * encoding:: The encoding of the data (default: "UTF-8")
1738
+ */
1739
+ static void Init_StemFilter(void)
1740
+ {
1741
+ cStemFilter =
1742
+ rb_define_class_under(mAnalysis, "StemFilter", cTokenStream);
1743
+ rb_define_alloc_func(cStemFilter, frt_data_alloc);
1744
+ rb_define_method(cStemFilter, "initialize",
1745
+ frt_stem_filter_init, -1);
1746
+ }
1747
+
1748
+ /*************************/
1749
+ /*** * * Analyzers * * ***/
1750
+ /*************************/
1751
+
1752
+ /*
1753
+ * Document-class: Ferret::Analysis::Analyzer
1754
+ *
1755
+ * == Summary
1756
+ *
1757
+ * An Analyzer builds TokenStreams, which analyze text. It thus represents
1758
+ * a policy for extracting index terms from text.
1759
+ *
1760
+ * Typical implementations first build a Tokenizer, which breaks the stream
1761
+ * of characters from the Reader into raw Tokens. One or more TokenFilters
1762
+ * may then be applied to the output of the Tokenizer.
1763
+ *
1764
+ * The default Analyzer just creates a LowerCaseTokenizer which converts
1765
+ * all text to lowercase tokens. See LowerCaseTokenizer for more details.
1766
+ *
1767
+ * === Example
1768
+ *
1769
+ * To create your own custom Analyzer you simply need to implement a
1770
+ * token_stream method which takes the field name and the data to be
1771
+ * tokenized as parameters and returns a TokenStream. Most analyzers
1772
+ * typically ignore the field name.
1773
+ *
1774
+ * Here we'll create a StemmingAnalyzer;
1775
+ *
1776
+ * class MyAnalyzer < Analyzer
1777
+ * def token_stream(field, str)
1778
+ * return StemFilter.new(LowerCaseFilter.new(StandardTokenizer.new(str)))
1779
+ * end
1780
+ * end
1781
+ */
1782
+ static void Init_Analyzer(void)
1783
+ {
1784
+ cAnalyzer =
1785
+ rb_define_class_under(mAnalysis, "Analyzer", rb_cObject);
1786
+ rb_define_alloc_func(cAnalyzer, frt_data_alloc);
1787
+ rb_define_method(cAnalyzer, "initialize", frt_letter_analyzer_init, -1);
1788
+ rb_define_method(cAnalyzer, "token_stream", frt_analyzer_token_stream, 2);
1789
+ }
1790
+
1791
+ /*
1792
+ * Document-class: Ferret::Analysis::AsciiLetterAnalyzer
1793
+ *
1794
+ * == Summary
1795
+ *
1796
+ * An AsciiLetterAnalyzer creates a TokenStream that splits the input up into
1797
+ * maximal strings of Ascii characters. If implemented in Ruby it would look
1798
+ * like;
1799
+ *
1800
+ * class AsciiLetterAnalyzer
1801
+ * def initialize(lower = true)
1802
+ * @lower = lower
1803
+ * end
1804
+ *
1805
+ * def token_stream(field, str)
1806
+ * if @lower
1807
+ * return AsciiLowerCaseFilter.new(AsciiLetterTokenizer.new(str))
1808
+ * else
1809
+ * return AsciiLetterTokenizer.new(str)
1810
+ * end
1811
+ * end
1812
+ * end
1813
+ *
1814
+ * As you can see it makes use of the AsciiLetterTokenizer and
1815
+ * AsciiLowerCaseFilter. Note that this tokenizer won't recognize non-ascii
1816
+ * characters so you should use the LetterAnalyzer if you want to analyze
1817
+ * multi-byte data like "UTF-8".
1818
+ */
1819
+ static void Init_AsciiLetterAnalyzer(void)
1820
+ {
1821
+ cAsciiLetterAnalyzer =
1822
+ rb_define_class_under(mAnalysis, "AsciiLetterAnalyzer", cAnalyzer);
1823
+ rb_define_alloc_func(cAsciiLetterAnalyzer, frt_data_alloc);
1824
+ rb_define_method(cAsciiLetterAnalyzer, "initialize",
1825
+ frt_a_letter_analyzer_init, -1);
1826
+ }
1827
+
1828
+ /*
1829
+ * Document-class: Ferret::Analysis::LetterAnalyzer
1830
+ *
1831
+ * == Summary
1832
+ *
1833
+ * A LetterAnalyzer creates a TokenStream that splits the input up into
1834
+ * maximal strings of characters as recognized by the current locale. If
1835
+ * implemented in Ruby it would look like;
1836
+ *
1837
+ * class LetterAnalyzer
1838
+ * def initialize(lower = true)
1839
+ * @lower = lower
1840
+ * end
1841
+ *
1842
+ * def token_stream(field, str)
1843
+ * return LetterTokenizer.new(str, @lower)
1844
+ * end
1845
+ * end
1846
+ *
1847
+ * As you can see it makes use of the LetterTokenizer.
1848
+ */
1849
+ static void Init_LetterAnalyzer(void)
1850
+ {
1851
+ cLetterAnalyzer =
1852
+ rb_define_class_under(mAnalysis, "LetterAnalyzer", cAnalyzer);
1853
+ rb_define_alloc_func(cLetterAnalyzer, frt_data_alloc);
1854
+ rb_define_method(cLetterAnalyzer, "initialize",
1855
+ frt_letter_analyzer_init, -1);
1856
+ }
1857
+
1858
+ /*
1859
+ * Document-class: Ferret::Analysis::AsciiWhiteSpaceAnalyzer
1860
+ *
1861
+ * == Summary
1862
+ *
1863
+ * The AsciiWhiteSpaceAnalyzer recognizes tokens as maximal strings of
1864
+ * non-whitespace characters. If implemented in Ruby the
1865
+ * AsciiWhiteSpaceAnalyzer would look like;
1866
+ *
1867
+ * class AsciiWhiteSpaceAnalyzer
1868
+ * def initialize(lower = true)
1869
+ * @lower = lower
1870
+ * end
1871
+ *
1872
+ * def token_stream(field, str)
1873
+ * if @lower
1874
+ * return AsciiLowerCaseFilter.new(AsciiWhiteSpaceTokenizer.new(str))
1875
+ * else
1876
+ * return AsciiWhiteSpaceTokenizer.new(str)
1877
+ * end
1878
+ * end
1879
+ * end
1880
+ *
1881
+ * As you can see it makes use of the AsciiWhiteSpaceTokenizer. You should
1882
+ * use WhiteSpaceAnalyzer if you want to recognize multibyte encodings such
1883
+ * as "UTF-8".
1884
+ */
1885
+ static void Init_AsciiWhiteSpaceAnalyzer(void)
1886
+ {
1887
+ cAsciiWhiteSpaceAnalyzer =
1888
+ rb_define_class_under(mAnalysis, "AsciiWhiteSpaceAnalyzer", cAnalyzer);
1889
+ rb_define_alloc_func(cAsciiWhiteSpaceAnalyzer, frt_data_alloc);
1890
+ rb_define_method(cAsciiWhiteSpaceAnalyzer, "initialize",
1891
+ frt_a_white_space_analyzer_init, -1);
1892
+ }
1893
+
1894
+ /*
1895
+ * Document-class: Ferret::Analysis::WhiteSpaceAnalyzer
1896
+ *
1897
+ * == Summary
1898
+ *
1899
+ * The WhiteSpaceAnalyzer recognizes tokens as maximal strings of
1900
+ * non-whitespace characters. If implemented in Ruby the WhiteSpaceAnalyzer
1901
+ * would look like;
1902
+ *
1903
+ * class WhiteSpaceAnalyzer
1904
+ * def initialize(lower = true)
1905
+ * @lower = lower
1906
+ * end
1907
+ *
1908
+ * def token_stream(field, str)
1909
+ * return WhiteSpaceTokenizer.new(str, @lower)
1910
+ * end
1911
+ * end
1912
+ *
1913
+ * As you can see it makes use of the WhiteSpaceTokenizer.
1914
+ */
1915
+ static void Init_WhiteSpaceAnalyzer(void)
1916
+ {
1917
+ cWhiteSpaceAnalyzer =
1918
+ rb_define_class_under(mAnalysis, "WhiteSpaceAnalyzer", cAnalyzer);
1919
+ rb_define_alloc_func(cWhiteSpaceAnalyzer, frt_data_alloc);
1920
+ rb_define_method(cWhiteSpaceAnalyzer, "initialize",
1921
+ frt_white_space_analyzer_init, -1);
1922
+ }
1923
+
1924
+ /*
1925
+ * Document-class: Ferret::Analysis::AsciiStandardAnalyzer
1926
+ *
1927
+ * == Summary
1928
+ *
1929
+ * The AsciiStandardAnalyzer is the most advanced of the available
1930
+ * ascii-analyzers. If it were implemented in Ruby it would look like this;
1931
+ *
1932
+ * class AsciiStandardAnalyzer
1933
+ * def initialize(lower = true, stop_words = ENGLISH_STOP_WORDS)
1934
+ * @lower = lower
1935
+ * @stop_words = stop_words
1936
+ * end
1937
+ *
1938
+ * def token_stream(field, str)
1939
+ * if @lower
1940
+ * return StopFilter.new(AsciiLowerCaseFilter.new(
1941
+ * AsciiStandardTokenizer.new(str)), @stop_words)
1942
+ * else
1943
+ * return StopFilter.new(AsciiStandardTokenizer.new(str), @stop_words)
1944
+ * end
1945
+ * end
1946
+ * end
1947
+ *
1948
+ * As you can see it makes use of the AsciiStandardTokenizer and you can also
1949
+ * add your own list of stop-words if you wish. Note that this tokenizer
1950
+ * won't recognize non-ascii characters so you should use the
1951
+ * StandardAnalyzer if you want to analyze multi-byte data like "UTF-8".
1952
+ */
1953
+ static void Init_AsciiStandardAnalyzer(void)
1954
+ {
1955
+ cAsciiStandardAnalyzer =
1956
+ rb_define_class_under(mAnalysis, "AsciiStandardAnalyzer", cAnalyzer);
1957
+ rb_define_alloc_func(cAsciiStandardAnalyzer, frt_data_alloc);
1958
+ rb_define_method(cAsciiStandardAnalyzer, "initialize",
1959
+ frt_a_standard_analyzer_init, -1);
1960
+ }
1961
+
1962
+ /*
1963
+ * Document-class: Ferret::Analysis::StandardAnalyzer
1964
+ *
1965
+ * == Summary
1966
+ *
1967
+ * The StandardAnalyzer is the most advanced of the available analyzers. If
1968
+ * it were implemented in Ruby it would look like this;
1969
+ *
1970
+ * class StandardAnalyzer
1971
+ * def initialize(lower = true, stop_words = ENGLISH_STOP_WORDS)
1972
+ * @lower = lower
1973
+ * @stop_words = stop_words
1974
+ * end
1975
+ *
1976
+ * def token_stream(field, str)
1977
+ * return StopFilter.new(StandardTokenizer.new(str, @lower), @stop_words)
1978
+ * end
1979
+ * end
1980
+ *
1981
+ * As you can see it makes use of the StandardTokenizer and you can also add
1982
+ * your own list of stopwords if you wish.
1983
+ */
1984
+ static void Init_StandardAnalyzer(void)
1985
+ {
1986
+ cStandardAnalyzer =
1987
+ rb_define_class_under(mAnalysis, "StandardAnalyzer", cAnalyzer);
1988
+ rb_define_alloc_func(cStandardAnalyzer, frt_data_alloc);
1989
+ rb_define_method(cStandardAnalyzer, "initialize",
1990
+ frt_standard_analyzer_init, -1);
1991
+ }
1992
+
1993
+ /*
1994
+ * Document-class: Ferret::Analysis::PerFieldAnalyzer
1995
+ *
1996
+ * == Summary
1997
+ *
1998
+ * The PerFieldAnalyzer is for use when you want to analyze different fields
1999
+ * with different analyzers. With the PerFieldAnalyzer you can specify how
2000
+ * you want each field analyzed.
2001
+ *
2002
+ * === Example
2003
+ *
2004
+ * # Create a new PerFieldAnalyzer which uses StandardAnalyzer by default
2005
+ * pfa = PerFieldAnalyzer.new(StandardAnalyzer.new())
2006
+ *
2007
+ * # Use the WhiteSpaceAnalyzer with no lowercasing on the :title field
2008
+ * pfa[:title] = WhiteSpaceAnalyzer.new(false)
2009
+ *
2010
+ * # Use a custom analyzer on the :created_at field
2011
+ * pfa[:created_at] = DateAnalyzer.new
2012
+ */
2013
+ static void Init_PerFieldAnalyzer(void)
2014
+ {
2015
+ cPerFieldAnalyzer =
2016
+ rb_define_class_under(mAnalysis, "PerFieldAnalyzer", cAnalyzer);
2017
+ rb_define_alloc_func(cPerFieldAnalyzer, frt_data_alloc);
2018
+ rb_define_method(cPerFieldAnalyzer, "initialize",
2019
+ frt_per_field_analyzer_init, 1);
2020
+ rb_define_method(cPerFieldAnalyzer, "add_field",
2021
+ frt_per_field_analyzer_add_field, 2);
2022
+ rb_define_method(cPerFieldAnalyzer, "[]=",
2023
+ frt_per_field_analyzer_add_field, 2);
2024
+ }
2025
+
2026
+ /*
2027
+ * Document-class: Ferret::Analysis::RegExpAnalyzer
2028
+ *
2029
+ * == Summary
2030
+ *
2031
+ * Using a RegExpAnalyzer is a simple way to create a custom analyzer. If
2032
+ * implemented in Ruby it would look like this;
2033
+ *
2034
+ * class RegExpAnalyzer
2035
+ * def initialize(reg_exp, lower = true)
2036
+ * @lower = lower
2037
+ * @reg_exp = reg_exp
2038
+ * end
2039
+ *
2040
+ * def token_stream(field, str)
2041
+ * if @lower
2042
+ * return LowerCaseFilter.new(RegExpTokenizer.new(str, @reg_exp))
2043
+ * else
2044
+ * return RegExpTokenizer.new(str, @reg_exp)
2045
+ * end
2046
+ * end
2047
+ * end
2048
+ *
2049
+ * === Example
2050
+ *
2051
+ * csv_analyzer = RegExpAnalyzer.new(/[^,]+/, false)
2052
+ */
2053
+ static void Init_RegExpAnalyzer(void)
2054
+ {
2055
+ cRegExpAnalyzer =
2056
+ rb_define_class_under(mAnalysis, "RegExpAnalyzer", cAnalyzer);
2057
+ rb_define_alloc_func(cRegExpAnalyzer, frt_data_alloc);
2058
+ rb_define_method(cRegExpAnalyzer, "initialize",
2059
+ frt_re_analyzer_init, -1);
2060
+ }
2061
+
2062
+ /* rdoc hack
2063
+ extern VALUE mFerret = rb_define_module("Ferret");
2064
+ */
2065
+
2066
+ /*
2067
+ * Document-module: Ferret::Analysis
2068
+ *
2069
+ * == Summary
2070
+ *
2071
+ * The Analysis module contains all the classes used to analyze and tokenize
2072
+ * the data to be indexed. There are three main classes you need to know
2073
+ * about when dealing with analysis; Analyzer, TokenStream and Token.
2074
+ *
2075
+ * == Classes
2076
+ *
2077
+ * === Analyzer
2078
+ *
2079
+ * Analyzers handle all of your tokenizing needs. You pass an Analyzer to the
2080
+ * indexing class when you create it and it will create the TokenStreams
2081
+ * necessary to tokenize the fields in the documents. Most of the time you
2082
+ * won't need to worry about TokenStreams and Tokens, one of the Analyzers
2083
+ * distributed with Ferret will do exactly what you need. Otherwise you'll
2084
+ * need to implement a custom analyzer.
2085
+ *
2086
+ * === TokenStream
2087
+ *
2088
+ * A TokenStream is an enumeration of Tokens. There are two standard types of
2089
+ * TokenStream; Tokenizer and TokenFilter. A Tokenizer takes a String and
2090
+ * turns it into a list of Tokens. A TokenFilter takes another TokenStream
2091
+ * and post-processes the Tokens. You can chain as many TokenFilters together
2092
+ * as you like but they always need to finish with a Tokenizer.
2093
+ *
2094
+ * === Token
2095
+ *
2096
+ * A Token is a single term from a document field. A token contains the text
2097
+ * representing the term as well as the start and end offset of the token.
2098
+ * The start and end offset will represent the token as it appears in the
2099
+ * source field. Some TokenFilters may change the text in the Token but the
2100
+ * start and end offsets should stay the same so (end - start) won't
2101
+ * necessarily be equal to the length of text in the token. For example using
2102
+ * a stemming TokenFilter the term "Beginning" might have start and end
2103
+ * offsets of 10 and 19 respectively ("Beginning".length == 9) but Token#text
2104
+ * might be "begin" (after stemming).
2105
+ */
933
2106
  void
934
- Init_analysis(void)
935
- {
936
- /* TokenStream Methods */
937
- id_next = rb_intern("next");
938
- id_reset = rb_intern("text=");
939
- id_clone = rb_intern("clone");
940
-
941
- /* Analyzer Methods */
942
- id_token_stream = rb_intern("token_stream");
943
-
944
- object_space = rb_hash_new();
945
- rb_define_const(mFerret, "OBJECT_SPACE", object_space);
946
-
947
- /*** * * Locale stuff * * ***/
948
- frt_locale = setlocale(LC_ALL, "");
949
- rb_define_singleton_method(mFerret, "locale=", frt_setlocale, 1);
950
- rb_define_singleton_method(mFerret, "locale", frt_getlocale, 0);
951
-
952
- /*********************/
953
- /*** * * Token * * ***/
954
- /*********************/
955
- cToken = rb_define_class_under(mAnalysis, "Token", rb_cObject);
956
- rb_define_alloc_func(cToken, frt_token_alloc);
957
- rb_include_module(cToken, rb_mComparable);
958
-
959
- rb_define_method(cToken, "initialize", frt_token_init, -1);
960
- rb_define_method(cToken, "<=>", frt_token_cmp, 1);
961
- rb_define_method(cToken, "text", frt_token_get_text, 0);
962
- rb_define_method(cToken, "text=", frt_token_set_text, 1);
963
- rb_define_method(cToken, "start_offset", frt_token_get_start_offset, 0);
964
- rb_define_method(cToken, "end_offset", frt_token_get_end_offset, 0);
965
- rb_define_method(cToken, "pos_inc", frt_token_get_pos_inc, 0);
966
- rb_define_method(cToken, "to_s", frt_token_to_s, 0);
967
-
968
- /****************************/
969
- /*** * * TokenStreams * * ***/
970
- /****************************/
-
- cTokenStream = rb_define_class_under(mAnalysis, "TokenStream", rb_cObject);
- rb_define_method(cTokenStream, "next", frt_ts_next, 0);
- rb_define_method(cTokenStream, "text=", frt_ts_set_text, 1);
- rb_define_method(cTokenStream, "text", frt_ts_get_text, 0);
-
- /******************/
- /*** Tokenizers ***/
- /******************/
-
- /*** * * AsciiLetterTokenizer * * ***/
- cAsciiLetterTokenizer =
- rb_define_class_under(mAnalysis, "AsciiLetterTokenizer", cTokenStream);
- rb_define_alloc_func(cAsciiLetterTokenizer, frt_data_alloc);
- rb_define_method(cAsciiLetterTokenizer, "initialize",
- frt_a_letter_tokenizer_init, 1);
-
- /*** * * LetterTokenizer * * ***/
- cLetterTokenizer =
- rb_define_class_under(mAnalysis, "LetterTokenizer", cTokenStream);
- rb_define_alloc_func(cLetterTokenizer, frt_data_alloc);
- rb_define_method(cLetterTokenizer, "initialize",
- frt_letter_tokenizer_init, -1);
-
- /*** * * AsciiWhiteSpaceTokenizer * * ***/
- cAsciiWhiteSpaceTokenizer =
- rb_define_class_under(mAnalysis, "AsciiWhiteSpaceTokenizer", cTokenStream);
- rb_define_alloc_func(cAsciiWhiteSpaceTokenizer, frt_data_alloc);
- rb_define_method(cAsciiWhiteSpaceTokenizer, "initialize",
- frt_a_whitespace_tokenizer_init, 1);
-
- /*** * * WhiteSpaceTokenizer * * ***/
- cWhiteSpaceTokenizer =
- rb_define_class_under(mAnalysis, "WhiteSpaceTokenizer", cTokenStream);
- rb_define_alloc_func(cWhiteSpaceTokenizer, frt_data_alloc);
- rb_define_method(cWhiteSpaceTokenizer, "initialize",
- frt_whitespace_tokenizer_init, -1);
-
- /*** * * AsciiStandardTokenizer * * ***/
- cAsciiStandardTokenizer =
- rb_define_class_under(mAnalysis, "AsciiStandardTokenizer", cTokenStream);
- rb_define_alloc_func(cAsciiStandardTokenizer, frt_data_alloc);
- rb_define_method(cAsciiStandardTokenizer, "initialize",
- frt_a_standard_tokenizer_init, 1);
-
- /*** * * StandardTokenizer * * ***/
- cStandardTokenizer =
- rb_define_class_under(mAnalysis, "StandardTokenizer", cTokenStream);
- rb_define_alloc_func(cStandardTokenizer, frt_data_alloc);
- rb_define_method(cStandardTokenizer, "initialize",
- frt_standard_tokenizer_init, 1);
-
- /*** * * RegExpTokenizer * * ***/
- cRegExpTokenizer =
- rb_define_class_under(mAnalysis, "RegExpTokenizer", cTokenStream);
- rtoken_re = rb_reg_new(token_re, strlen(token_re), 0);
- rb_define_const(cRegExpTokenizer, "REGEXP", rtoken_re);
- rb_define_alloc_func(cRegExpTokenizer, frt_data_alloc);
- rb_define_method(cRegExpTokenizer, "initialize",
- frt_rets_init, -1);
- rb_define_method(cRegExpTokenizer, "next", frt_ts_next, 0);
- rb_define_method(cRegExpTokenizer, "text=", frt_rets_set_text, 1);
- rb_define_method(cRegExpTokenizer, "text", frt_rets_get_text, 0);
-
- /***************/
- /*** Filters ***/
- /***************/
- rb_define_const(mAnalysis, "ENGLISH_STOP_WORDS",
- get_rstopwords(ENGLISH_STOP_WORDS));
- rb_define_const(mAnalysis, "FULL_ENGLISH_STOP_WORDS",
- get_rstopwords(FULL_ENGLISH_STOP_WORDS));
- rb_define_const(mAnalysis, "EXTENDED_ENGLISH_STOP_WORDS",
- get_rstopwords(EXTENDED_ENGLISH_STOP_WORDS));
- rb_define_const(mAnalysis, "FULL_FRENCH_STOP_WORDS",
- get_rstopwords(FULL_FRENCH_STOP_WORDS));
- rb_define_const(mAnalysis, "FULL_SPANISH_STOP_WORDS",
- get_rstopwords(FULL_SPANISH_STOP_WORDS));
- rb_define_const(mAnalysis, "FULL_PORTUGUESE_STOP_WORDS",
- get_rstopwords(FULL_PORTUGUESE_STOP_WORDS));
- rb_define_const(mAnalysis, "FULL_ITALIAN_STOP_WORDS",
- get_rstopwords(FULL_ITALIAN_STOP_WORDS));
- rb_define_const(mAnalysis, "FULL_GERMAN_STOP_WORDS",
- get_rstopwords(FULL_GERMAN_STOP_WORDS));
- rb_define_const(mAnalysis, "FULL_DUTCH_STOP_WORDS",
- get_rstopwords(FULL_DUTCH_STOP_WORDS));
- rb_define_const(mAnalysis, "FULL_SWEDISH_STOP_WORDS",
- get_rstopwords(FULL_SWEDISH_STOP_WORDS));
- rb_define_const(mAnalysis, "FULL_NORWEGIAN_STOP_WORDS",
- get_rstopwords(FULL_NORWEGIAN_STOP_WORDS));
- rb_define_const(mAnalysis, "FULL_DANISH_STOP_WORDS",
- get_rstopwords(FULL_DANISH_STOP_WORDS));
- rb_define_const(mAnalysis, "FULL_RUSSIAN_STOP_WORDS",
- get_rstopwords(FULL_RUSSIAN_STOP_WORDS));
- rb_define_const(mAnalysis, "FULL_FINNISH_STOP_WORDS",
- get_rstopwords(FULL_FINNISH_STOP_WORDS));
-
- cAsciiLowerCaseFilter =
- rb_define_class_under(mAnalysis, "AsciiLowerCaseFilter", cTokenStream);
- rb_define_alloc_func(cAsciiLowerCaseFilter, frt_data_alloc);
- rb_define_method(cAsciiLowerCaseFilter, "initialize",
- frt_a_lowercase_filter_init, 1);
-
- cLowerCaseFilter =
- rb_define_class_under(mAnalysis, "LowerCaseFilter", cTokenStream);
- rb_define_alloc_func(cLowerCaseFilter, frt_data_alloc);
- rb_define_method(cLowerCaseFilter, "initialize",
- frt_lowercase_filter_init, 1);
-
- cStopFilter =
- rb_define_class_under(mAnalysis, "StopFilter", cTokenStream);
- rb_define_alloc_func(cStopFilter, frt_data_alloc);
- rb_define_method(cStopFilter, "initialize",
- frt_stop_filter_init, -1);
-
- cStemFilter =
- rb_define_class_under(mAnalysis, "StemFilter", cTokenStream);
- rb_define_alloc_func(cStemFilter, frt_data_alloc);
- rb_define_method(cStemFilter, "initialize",
- frt_stem_filter_init, -1);
-
-
- /*************************/
- /*** * * Analyzers * * ***/
- /*************************/
-
- /*** * * Analyzer * * ***/
- cAnalyzer =
- rb_define_class_under(mAnalysis, "Analyzer", rb_cObject);
- rb_define_alloc_func(cAnalyzer, frt_data_alloc);
- rb_define_method(cAnalyzer, "initialize", frt_letter_analyzer_init, -1);
- rb_define_method(cAnalyzer, "token_stream", frt_analyzer_token_stream, 2);
-
- /*** * * AsciiLetterAnalyzer * * ***/
- cAsciiLetterAnalyzer =
- rb_define_class_under(mAnalysis, "AsciiLetterAnalyzer", cAnalyzer);
- rb_define_alloc_func(cAsciiLetterAnalyzer, frt_data_alloc);
- rb_define_method(cAsciiLetterAnalyzer, "initialize",
- frt_a_letter_analyzer_init, -1);
-
- /*** * * LetterAnalyzer * * ***/
- cLetterAnalyzer =
- rb_define_class_under(mAnalysis, "LetterAnalyzer", cAnalyzer);
- rb_define_alloc_func(cLetterAnalyzer, frt_data_alloc);
- rb_define_method(cLetterAnalyzer, "initialize",
- frt_letter_analyzer_init, -1);
-
- /*** * * AsciiWhiteSpaceAnalyzer * * ***/
- cAsciiWhiteSpaceAnalyzer =
- rb_define_class_under(mAnalysis, "AsciiWhiteSpaceAnalyzer", cAnalyzer);
- rb_define_alloc_func(cAsciiWhiteSpaceAnalyzer, frt_data_alloc);
- rb_define_method(cAsciiWhiteSpaceAnalyzer, "initialize",
- frt_a_white_space_analyzer_init, -1);
-
- /*** * * WhiteSpaceAnalyzer * * ***/
- cWhiteSpaceAnalyzer =
- rb_define_class_under(mAnalysis, "WhiteSpaceAnalyzer", cAnalyzer);
- rb_define_alloc_func(cWhiteSpaceAnalyzer, frt_data_alloc);
- rb_define_method(cWhiteSpaceAnalyzer, "initialize",
- frt_white_space_analyzer_init, -1);
-
- /*** * * AsciiStandardAnalyzer * * ***/
- cAsciiStandardAnalyzer =
- rb_define_class_under(mAnalysis, "AsciiStandardAnalyzer", cAnalyzer);
- rb_define_alloc_func(cAsciiStandardAnalyzer, frt_data_alloc);
- rb_define_method(cAsciiStandardAnalyzer, "initialize",
- frt_a_standard_analyzer_init, -1);
-
- /*** * * StandardAnalyzer * * ***/
- cStandardAnalyzer =
- rb_define_class_under(mAnalysis, "StandardAnalyzer", cAnalyzer);
- rb_define_alloc_func(cStandardAnalyzer, frt_data_alloc);
- rb_define_method(cStandardAnalyzer, "initialize",
- frt_standard_analyzer_init, -1);
-
- /*** * * PerFieldAnalyzer * * ***/
- cPerFieldAnalyzer =
- rb_define_class_under(mAnalysis, "PerFieldAnalyzer", cAnalyzer);
- rb_define_alloc_func(cPerFieldAnalyzer, frt_data_alloc);
- rb_define_method(cPerFieldAnalyzer, "initialize",
- frt_per_field_analyzer_init, 1);
- rb_define_method(cPerFieldAnalyzer, "add_field",
- frt_per_field_analyzer_add_field, 2);
- rb_define_method(cPerFieldAnalyzer, "[]=",
- frt_per_field_analyzer_add_field, 2);
- rb_define_class_under(mAnalysis, "PerFieldAnalyzerWrapper", cPerFieldAnalyzer);
-
- /*** * * RegexAnalyzer * * ***/
- cRegExpAnalyzer =
- rb_define_class_under(mAnalysis, "RegExpAnalyzer", cAnalyzer);
- rb_define_alloc_func(cRegExpAnalyzer, frt_data_alloc);
- rb_define_method(cRegExpAnalyzer, "initialize",
- frt_re_analyzer_init, -1);
-
- /*
- cRegexAnalyzer =
- rb_define_class_under(mAnalysis, "RegexAnalyzer", cAnalyzer);
- rb_define_alloc_func(cRegexAnalyzer, frt_data_alloc);
- rb_define_method(cRegexAnalyzer, "initialize",
- frt_regex_analyzer_init, 0);
- rb_define_method(cRegexAnalyzer, "token_stream",
- frt_regex_analyzer_token_stream, 2);
- rb_define_method(cRegexAnalyzer, "setlocale",
- frt_regex_analyzer_setlocale, 1);
- */
+ Init_Analysis(void)
+ {
+ mAnalysis = rb_define_module_under(mFerret, "Analysis");
+
+ /* TokenStream Methods */
+ id_next = rb_intern("next");
+ id_reset = rb_intern("text=");
+ id_clone = rb_intern("clone");
+
+ /* Analyzer Methods */
+ id_token_stream = rb_intern("token_stream");
+
+ object_space = rb_hash_new();
+ rb_define_const(mFerret, "OBJECT_SPACE", object_space);
+
+ /*** * * Locale stuff * * ***/
+ frt_locale = setlocale(LC_ALL, "");
+ rb_define_singleton_method(mFerret, "locale=", frt_set_locale, 1);
+ rb_define_singleton_method(mFerret, "locale", frt_get_locale, 0);
+
+ rb_define_const(mAnalysis, "ENGLISH_STOP_WORDS",
+ get_rstopwords(ENGLISH_STOP_WORDS));
+ rb_define_const(mAnalysis, "FULL_ENGLISH_STOP_WORDS",
+ get_rstopwords(FULL_ENGLISH_STOP_WORDS));
+ rb_define_const(mAnalysis, "EXTENDED_ENGLISH_STOP_WORDS",
+ get_rstopwords(EXTENDED_ENGLISH_STOP_WORDS));
+ rb_define_const(mAnalysis, "FULL_FRENCH_STOP_WORDS",
+ get_rstopwords(FULL_FRENCH_STOP_WORDS));
+ rb_define_const(mAnalysis, "FULL_SPANISH_STOP_WORDS",
+ get_rstopwords(FULL_SPANISH_STOP_WORDS));
+ rb_define_const(mAnalysis, "FULL_PORTUGUESE_STOP_WORDS",
+ get_rstopwords(FULL_PORTUGUESE_STOP_WORDS));
+ rb_define_const(mAnalysis, "FULL_ITALIAN_STOP_WORDS",
+ get_rstopwords(FULL_ITALIAN_STOP_WORDS));
+ rb_define_const(mAnalysis, "FULL_GERMAN_STOP_WORDS",
+ get_rstopwords(FULL_GERMAN_STOP_WORDS));
+ rb_define_const(mAnalysis, "FULL_DUTCH_STOP_WORDS",
+ get_rstopwords(FULL_DUTCH_STOP_WORDS));
+ rb_define_const(mAnalysis, "FULL_SWEDISH_STOP_WORDS",
+ get_rstopwords(FULL_SWEDISH_STOP_WORDS));
+ rb_define_const(mAnalysis, "FULL_NORWEGIAN_STOP_WORDS",
+ get_rstopwords(FULL_NORWEGIAN_STOP_WORDS));
+ rb_define_const(mAnalysis, "FULL_DANISH_STOP_WORDS",
+ get_rstopwords(FULL_DANISH_STOP_WORDS));
+ rb_define_const(mAnalysis, "FULL_RUSSIAN_STOP_WORDS",
+ get_rstopwords(FULL_RUSSIAN_STOP_WORDS));
+ rb_define_const(mAnalysis, "FULL_FINNISH_STOP_WORDS",
+ get_rstopwords(FULL_FINNISH_STOP_WORDS));
+
+ Init_Token();
+ Init_TokenStream();
+
+ Init_AsciiLetterTokenizer();
+ Init_LetterTokenizer();
+
+ Init_AsciiWhiteSpaceTokenizer();
+ Init_WhiteSpaceTokenizer();
+
+ Init_AsciiStandardTokenizer();
+ Init_StandardTokenizer();
+
+ Init_RegExpTokenizer();
+
+ Init_AsciiLowerCaseFilter();
+ Init_LowerCaseFilter();
+ Init_StopFilter();
+ Init_StemFilter();
+
+ Init_Analyzer();
+ Init_AsciiLetterAnalyzer();
+ Init_LetterAnalyzer();
+ Init_AsciiWhiteSpaceAnalyzer();
+ Init_WhiteSpaceAnalyzer();
+ Init_AsciiStandardAnalyzer();
+ Init_StandardAnalyzer();
+ Init_PerFieldAnalyzer();
+ Init_RegExpAnalyzer();
 
  }
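
In 0.10.0 the single monolithic registration block above is split into one Init_* helper per exported class, with Init_Analysis reduced to a driver that interns the shared method ids, sets up the locale accessors and stop-word constants, and then calls each helper in turn. The helper bodies fall outside this hunk; as a sketch only (assuming each helper simply wraps the same rb_define_* calls the removed 0.9.6 block made inline, reusing the file-scope VALUE globals, the frt_data_alloc allocator, and the frt_standard_analyzer_init initializer shown above), Init_StandardAnalyzer would look roughly like this:

    /* Hypothetical shape of one per-class helper; the real bodies are
     * defined elsewhere in r_analysis.c and are not part of this hunk.
     * mAnalysis, cAnalyzer and cStandardAnalyzer are file-scope VALUEs. */
    static void
    Init_StandardAnalyzer(void)
    {
        /* register Ferret::Analysis::StandardAnalyzer as a subclass of Analyzer */
        cStandardAnalyzer =
            rb_define_class_under(mAnalysis, "StandardAnalyzer", cAnalyzer);
        rb_define_alloc_func(cStandardAnalyzer, frt_data_alloc);
        rb_define_method(cStandardAnalyzer, "initialize",
                         frt_standard_analyzer_init, -1);
    }

The visible effect of the split is organizational rather than behavioral: the same module constants and analyzer, tokenizer, and filter classes are still registered under Ferret::Analysis, but each one now has its own initialization entry point.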