ferret 0.9.0 → 0.9.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (187) hide show
  1. data/Rakefile +23 -5
  2. data/TODO +2 -1
  3. data/ext/analysis.c +838 -177
  4. data/ext/analysis.h +55 -7
  5. data/ext/api.c +69 -0
  6. data/ext/api.h +27 -0
  7. data/ext/array.c +8 -5
  8. data/ext/compound_io.c +132 -96
  9. data/ext/document.c +58 -28
  10. data/ext/except.c +59 -0
  11. data/ext/except.h +88 -0
  12. data/ext/ferret.c +47 -3
  13. data/ext/ferret.h +3 -0
  14. data/ext/field.c +15 -9
  15. data/ext/filter.c +1 -1
  16. data/ext/fs_store.c +215 -34
  17. data/ext/global.c +72 -3
  18. data/ext/global.h +4 -3
  19. data/ext/hash.c +44 -3
  20. data/ext/hash.h +9 -0
  21. data/ext/header.h +58 -0
  22. data/ext/inc/except.h +88 -0
  23. data/ext/inc/lang.h +23 -13
  24. data/ext/ind.c +16 -10
  25. data/ext/index.h +2 -22
  26. data/ext/index_io.c +3 -11
  27. data/ext/index_rw.c +245 -193
  28. data/ext/lang.h +23 -13
  29. data/ext/libstemmer.c +92 -0
  30. data/ext/libstemmer.h +79 -0
  31. data/ext/modules.h +162 -0
  32. data/ext/q_boolean.c +34 -21
  33. data/ext/q_const_score.c +6 -12
  34. data/ext/q_filtered_query.c +206 -0
  35. data/ext/q_fuzzy.c +18 -15
  36. data/ext/q_match_all.c +3 -7
  37. data/ext/q_multi_phrase.c +10 -14
  38. data/ext/q_parser.c +29 -2
  39. data/ext/q_phrase.c +14 -21
  40. data/ext/q_prefix.c +15 -12
  41. data/ext/q_range.c +30 -28
  42. data/ext/q_span.c +13 -21
  43. data/ext/q_term.c +17 -26
  44. data/ext/r_analysis.c +693 -21
  45. data/ext/r_doc.c +11 -12
  46. data/ext/r_index_io.c +4 -1
  47. data/ext/r_qparser.c +21 -2
  48. data/ext/r_search.c +285 -18
  49. data/ext/ram_store.c +5 -2
  50. data/ext/search.c +11 -17
  51. data/ext/search.h +21 -45
  52. data/ext/similarity.h +67 -0
  53. data/ext/sort.c +30 -25
  54. data/ext/stem_ISO_8859_1_danish.c +338 -0
  55. data/ext/stem_ISO_8859_1_danish.h +16 -0
  56. data/ext/stem_ISO_8859_1_dutch.c +635 -0
  57. data/ext/stem_ISO_8859_1_dutch.h +16 -0
  58. data/ext/stem_ISO_8859_1_english.c +1156 -0
  59. data/ext/stem_ISO_8859_1_english.h +16 -0
  60. data/ext/stem_ISO_8859_1_finnish.c +792 -0
  61. data/ext/stem_ISO_8859_1_finnish.h +16 -0
  62. data/ext/stem_ISO_8859_1_french.c +1276 -0
  63. data/ext/stem_ISO_8859_1_french.h +16 -0
  64. data/ext/stem_ISO_8859_1_german.c +512 -0
  65. data/ext/stem_ISO_8859_1_german.h +16 -0
  66. data/ext/stem_ISO_8859_1_italian.c +1091 -0
  67. data/ext/stem_ISO_8859_1_italian.h +16 -0
  68. data/ext/stem_ISO_8859_1_norwegian.c +296 -0
  69. data/ext/stem_ISO_8859_1_norwegian.h +16 -0
  70. data/ext/stem_ISO_8859_1_porter.c +776 -0
  71. data/ext/stem_ISO_8859_1_porter.h +16 -0
  72. data/ext/stem_ISO_8859_1_portuguese.c +1035 -0
  73. data/ext/stem_ISO_8859_1_portuguese.h +16 -0
  74. data/ext/stem_ISO_8859_1_spanish.c +1119 -0
  75. data/ext/stem_ISO_8859_1_spanish.h +16 -0
  76. data/ext/stem_ISO_8859_1_swedish.c +307 -0
  77. data/ext/stem_ISO_8859_1_swedish.h +16 -0
  78. data/ext/stem_KOI8_R_russian.c +701 -0
  79. data/ext/stem_KOI8_R_russian.h +16 -0
  80. data/ext/stem_UTF_8_danish.c +344 -0
  81. data/ext/stem_UTF_8_danish.h +16 -0
  82. data/ext/stem_UTF_8_dutch.c +653 -0
  83. data/ext/stem_UTF_8_dutch.h +16 -0
  84. data/ext/stem_UTF_8_english.c +1176 -0
  85. data/ext/stem_UTF_8_english.h +16 -0
  86. data/ext/stem_UTF_8_finnish.c +808 -0
  87. data/ext/stem_UTF_8_finnish.h +16 -0
  88. data/ext/stem_UTF_8_french.c +1296 -0
  89. data/ext/stem_UTF_8_french.h +16 -0
  90. data/ext/stem_UTF_8_german.c +526 -0
  91. data/ext/stem_UTF_8_german.h +16 -0
  92. data/ext/stem_UTF_8_italian.c +1113 -0
  93. data/ext/stem_UTF_8_italian.h +16 -0
  94. data/ext/stem_UTF_8_norwegian.c +302 -0
  95. data/ext/stem_UTF_8_norwegian.h +16 -0
  96. data/ext/stem_UTF_8_porter.c +794 -0
  97. data/ext/stem_UTF_8_porter.h +16 -0
  98. data/ext/stem_UTF_8_portuguese.c +1055 -0
  99. data/ext/stem_UTF_8_portuguese.h +16 -0
  100. data/ext/stem_UTF_8_russian.c +709 -0
  101. data/ext/stem_UTF_8_russian.h +16 -0
  102. data/ext/stem_UTF_8_spanish.c +1137 -0
  103. data/ext/stem_UTF_8_spanish.h +16 -0
  104. data/ext/stem_UTF_8_swedish.c +313 -0
  105. data/ext/stem_UTF_8_swedish.h +16 -0
  106. data/ext/stopwords.c +325 -0
  107. data/ext/store.c +34 -2
  108. data/ext/tags +2953 -0
  109. data/ext/term.c +21 -15
  110. data/ext/termdocs.c +5 -3
  111. data/ext/utilities.c +446 -0
  112. data/ext/vector.c +27 -13
  113. data/lib/ferret/document/document.rb +1 -1
  114. data/lib/ferret/index/index.rb +44 -6
  115. data/lib/ferret/query_parser/query_parser.tab.rb +7 -3
  116. data/lib/rferret.rb +2 -1
  117. data/test/test_helper.rb +2 -2
  118. data/test/unit/analysis/ctc_analyzer.rb +401 -0
  119. data/test/unit/analysis/ctc_tokenstream.rb +423 -0
  120. data/test/unit/analysis/{tc_letter_tokenizer.rb → rtc_letter_tokenizer.rb} +0 -0
  121. data/test/unit/analysis/{tc_lower_case_filter.rb → rtc_lower_case_filter.rb} +0 -0
  122. data/test/unit/analysis/{tc_lower_case_tokenizer.rb → rtc_lower_case_tokenizer.rb} +0 -0
  123. data/test/unit/analysis/{tc_per_field_analyzer_wrapper.rb → rtc_per_field_analyzer_wrapper.rb} +0 -0
  124. data/test/unit/analysis/{tc_porter_stem_filter.rb → rtc_porter_stem_filter.rb} +0 -0
  125. data/test/unit/analysis/{tc_standard_analyzer.rb → rtc_standard_analyzer.rb} +0 -0
  126. data/test/unit/analysis/{tc_standard_tokenizer.rb → rtc_standard_tokenizer.rb} +0 -0
  127. data/test/unit/analysis/{tc_stop_analyzer.rb → rtc_stop_analyzer.rb} +0 -0
  128. data/test/unit/analysis/{tc_stop_filter.rb → rtc_stop_filter.rb} +0 -0
  129. data/test/unit/analysis/{tc_white_space_analyzer.rb → rtc_white_space_analyzer.rb} +0 -0
  130. data/test/unit/analysis/{tc_white_space_tokenizer.rb → rtc_white_space_tokenizer.rb} +0 -0
  131. data/test/unit/analysis/{tc_word_list_loader.rb → rtc_word_list_loader.rb} +0 -0
  132. data/test/unit/analysis/tc_analyzer.rb +1 -2
  133. data/test/unit/analysis/{c_token.rb → tc_token.rb} +0 -0
  134. data/test/unit/document/rtc_field.rb +28 -0
  135. data/test/unit/document/{c_document.rb → tc_document.rb} +0 -0
  136. data/test/unit/document/tc_field.rb +82 -12
  137. data/test/unit/index/{tc_compound_file_io.rb → rtc_compound_file_io.rb} +0 -0
  138. data/test/unit/index/{tc_field_infos.rb → rtc_field_infos.rb} +0 -0
  139. data/test/unit/index/{tc_fields_io.rb → rtc_fields_io.rb} +0 -0
  140. data/test/unit/index/{tc_multiple_term_doc_pos_enum.rb → rtc_multiple_term_doc_pos_enum.rb} +0 -0
  141. data/test/unit/index/{tc_segment_infos.rb → rtc_segment_infos.rb} +0 -0
  142. data/test/unit/index/{tc_segment_term_docs.rb → rtc_segment_term_docs.rb} +0 -0
  143. data/test/unit/index/{tc_segment_term_enum.rb → rtc_segment_term_enum.rb} +0 -0
  144. data/test/unit/index/{tc_segment_term_vector.rb → rtc_segment_term_vector.rb} +0 -0
  145. data/test/unit/index/{tc_term_buffer.rb → rtc_term_buffer.rb} +0 -0
  146. data/test/unit/index/{tc_term_info.rb → rtc_term_info.rb} +0 -0
  147. data/test/unit/index/{tc_term_infos_io.rb → rtc_term_infos_io.rb} +0 -0
  148. data/test/unit/index/{tc_term_vectors_io.rb → rtc_term_vectors_io.rb} +0 -0
  149. data/test/unit/index/{c_index.rb → tc_index.rb} +26 -6
  150. data/test/unit/index/{c_index_reader.rb → tc_index_reader.rb} +0 -0
  151. data/test/unit/index/{c_index_writer.rb → tc_index_writer.rb} +0 -0
  152. data/test/unit/index/{c_term.rb → tc_term.rb} +0 -0
  153. data/test/unit/index/{c_term_voi.rb → tc_term_voi.rb} +0 -0
  154. data/test/unit/query_parser/{c_query_parser.rb → rtc_query_parser.rb} +14 -14
  155. data/test/unit/query_parser/tc_query_parser.rb +24 -16
  156. data/test/unit/search/{tc_similarity.rb → rtc_similarity.rb} +0 -0
  157. data/test/unit/search/rtc_sort_field.rb +14 -0
  158. data/test/unit/search/{c_filter.rb → tc_filter.rb} +11 -11
  159. data/test/unit/search/{c_fuzzy_query.rb → tc_fuzzy_query.rb} +0 -0
  160. data/test/unit/search/{c_index_searcher.rb → tc_index_searcher.rb} +0 -0
  161. data/test/unit/search/{c_search_and_sort.rb → tc_search_and_sort.rb} +0 -0
  162. data/test/unit/search/{c_sort.rb → tc_sort.rb} +0 -0
  163. data/test/unit/search/tc_sort_field.rb +20 -7
  164. data/test/unit/search/{c_spans.rb → tc_spans.rb} +0 -0
  165. data/test/unit/store/rtc_fs_store.rb +62 -0
  166. data/test/unit/store/rtc_ram_store.rb +15 -0
  167. data/test/unit/store/rtm_store.rb +150 -0
  168. data/test/unit/store/rtm_store_lock.rb +2 -0
  169. data/test/unit/store/tc_fs_store.rb +54 -40
  170. data/test/unit/store/tc_ram_store.rb +20 -0
  171. data/test/unit/store/tm_store.rb +30 -146
  172. data/test/unit/store/tm_store_lock.rb +66 -0
  173. data/test/unit/utils/{tc_bit_vector.rb → rtc_bit_vector.rb} +0 -0
  174. data/test/unit/utils/{tc_date_tools.rb → rtc_date_tools.rb} +0 -0
  175. data/test/unit/utils/{tc_number_tools.rb → rtc_number_tools.rb} +0 -0
  176. data/test/unit/utils/{tc_parameter.rb → rtc_parameter.rb} +0 -0
  177. data/test/unit/utils/{tc_priority_queue.rb → rtc_priority_queue.rb} +0 -0
  178. data/test/unit/utils/{tc_string_helper.rb → rtc_string_helper.rb} +0 -0
  179. data/test/unit/utils/{tc_thread.rb → rtc_thread.rb} +0 -0
  180. data/test/unit/utils/{tc_weak_key_hash.rb → rtc_weak_key_hash.rb} +0 -0
  181. metadata +360 -289
  182. data/test/unit/document/c_field.rb +0 -98
  183. data/test/unit/search/c_sort_field.rb +0 -27
  184. data/test/unit/store/c_fs_store.rb +0 -76
  185. data/test/unit/store/c_ram_store.rb +0 -35
  186. data/test/unit/store/m_store.rb +0 -34
  187. data/test/unit/store/m_store_lock.rb +0 -68
@@ -17,7 +17,7 @@ char *prq_to_s(Query *self, char *field)
17
17
 
18
18
  if (strcmp(term->field, field) != 0) {
19
19
  sprintf(bptr, "%s:", term->field);
20
- bptr++;
20
+ bptr += strlen(bptr);
21
21
  }
22
22
  sprintf(bptr, "%s*", term->text);
23
23
  if (self->boost != 1.0) {
@@ -38,17 +38,20 @@ Query *prq_rewrite(Query *self, IndexReader *ir)
38
38
  Query *tq;
39
39
  Query *bq = bq_create(true);
40
40
 
41
- do {
42
- TermBuffer *tb = te->tb_curr;
43
- if (!tb || strcmp(tb->field, prefix_field) != 0 ||
44
- strncmp(tb->text, prefix_text, prefix_length) != 0) {
45
- break;
46
- }
47
- tq = tq_create(term_create(tb->field, tb->text)); // found a match
48
- tq->boost = self->boost; // set the boost
49
- bq_add_query(bq, tq, BC_SHOULD); // add to query
50
- } while (te->next(te));
51
- te->close(te);
41
+ TRY
42
+ do {
43
+ TermBuffer *tb = te->tb_curr;
44
+ if (!tb || strcmp(tb->field, prefix_field) != 0 ||
45
+ strncmp(tb->text, prefix_text, prefix_length) != 0) {
46
+ break;
47
+ }
48
+ tq = tq_create(term_create(tb->field, tb->text)); // found a match
49
+ tq->boost = self->boost; // set the boost
50
+ bq_add_query(bq, tq, BC_SHOULD); // add to query
51
+ } while (te->next(te));
52
+ XFINALLY
53
+ te->close(te);
54
+ XENDTRY
52
55
 
53
56
  if (self->rewritten) self->rewritten->destroy(self->rewritten);
54
57
  return self->rewritten = bq;
@@ -1,6 +1,11 @@
1
1
  #include <string.h>
2
2
  #include "search.h"
3
3
 
4
+ static char * const NIL_BOUNDS_ERROR_MSG = "At least one value must be non-nil";
5
+ static char * const LOWER_BOUND_ERROR_MSG = "The lower bound must be non-nil to be inclusive";
6
+ static char * const UPPER_BOUND_ERROR_MSG = "The upper bound must be non-nil to be inclusive";
7
+ static char * const BOUND_ORDER_ERROR_MSG = "The lower bound must less than the upper bound";
8
+
4
9
  /*****************************************************************************
5
10
  *
6
11
  * Range
@@ -9,51 +14,50 @@
9
14
 
10
15
  char *range_to_s(Range *range, char *field, float boost)
11
16
  {
12
- char *buffer, *bptr;
17
+ char *buffer, *b;
13
18
  int flen, llen, ulen;
14
19
 
15
20
  flen = strlen(range->field);
16
21
  llen = range->lower_term ? strlen(range->lower_term) : 0;
17
22
  ulen = range->upper_term ? strlen(range->upper_term) : 0;
18
23
  buffer = ALLOC_N(char, flen + llen + ulen + 40);
19
- bptr = buffer;
24
+ b = buffer;
20
25
 
21
26
  if (strcmp(field, range->field)) {
22
27
  memcpy(buffer, range->field, flen * sizeof(char));
23
- bptr += flen;
24
- *bptr = ':';
25
- bptr++;
28
+ b += flen;
29
+ *b = ':';
30
+ b++;
26
31
  }
27
32
 
28
33
  if (range->lower_term) {
29
- *bptr = range->include_lower ? '[' : '{';
30
- bptr++;
31
- memcpy(bptr, range->lower_term, llen);
32
- bptr += llen;
34
+ *b = range->include_lower ? '[' : '{';
35
+ b++;
36
+ memcpy(b, range->lower_term, llen);
37
+ b += llen;
33
38
  } else {
34
- *bptr = '<';
35
- bptr++;
39
+ *b = '<';
40
+ b++;
36
41
  }
37
42
 
38
43
  if (range->upper_term && range->lower_term) {
39
- *bptr = ' '; bptr++;
44
+ *b = ' '; b++;
40
45
  }
41
46
 
42
47
  if (range->upper_term) {
43
- memcpy(bptr, range->upper_term, ulen);
44
- bptr += ulen;
45
- *bptr = range->include_upper ? ']' : '}';
46
- bptr++;
48
+ memcpy(b, range->upper_term, ulen);
49
+ b += ulen;
50
+ *b = range->include_upper ? ']' : '}';
51
+ b++;
47
52
  } else {
48
- *bptr = '>';
49
- bptr++;
53
+ *b = '>';
54
+ b++;
50
55
  }
51
56
 
52
- *bptr = 0;
57
+ *b = 0;
53
58
  if (boost != 1.0) {
54
- char dbuf[32];
55
- dbl_to_s(dbuf, boost);
56
- sprintf(bptr, "^%s", dbuf);
59
+ *b = '^';
60
+ dbl_to_s(b + 1, boost);
57
61
  }
58
62
  return buffer;
59
63
  }
@@ -73,15 +77,13 @@ Range *range_create(const char *field, char *lower_term, char *upper_term,
73
77
  Range *range;
74
78
 
75
79
  if (!lower_term && !upper_term)
76
- eprintf(ARG_ERROR, "At least one value must be non-nil");
80
+ RAISE(ARG_ERROR, NIL_BOUNDS_ERROR_MSG);
77
81
  if (include_lower && !lower_term)
78
- eprintf(ARG_ERROR, "The lower bound must be non-nil to be inclusive");
82
+ RAISE(ARG_ERROR, LOWER_BOUND_ERROR_MSG);
79
83
  if (include_upper && !upper_term)
80
- eprintf(ARG_ERROR, "The upper bound must be non-nil to be inclusive");
84
+ RAISE(ARG_ERROR, UPPER_BOUND_ERROR_MSG);
81
85
  if (upper_term && lower_term && (strcmp(upper_term, lower_term) < 0))
82
- eprintf(ARG_ERROR,
83
- "The lower bound must less than the upper bound, %s > %s",
84
- upper_term, upper_term);
86
+ RAISE(ARG_ERROR, BOUND_ORDER_ERROR_MSG);
85
87
 
86
88
  range = ALLOC(Range);
87
89
 
@@ -39,21 +39,19 @@ Explanation *spanw_explain(Weight *self, IndexReader *ir, int target)
39
39
  }
40
40
 
41
41
  Explanation *expl = expl_create(0.0,
42
- epstrdup("weight(%s in %d), product of:",
43
- strlen(query_str) + 20,
44
- query_str, target));
42
+ strfmt("weight(%s in %d), product of:", query_str, target));
45
43
 
46
44
  /* We need two of these as it's included in both the query explanation
47
45
  * and the field explanation */
48
46
  Explanation *idf_expl1 = expl_create(self->idf,
49
- epstrdup("idf(%s: %s)", strlen(field) + df_i, field, doc_freqs));
47
+ strfmt("idf(%s: %s)", field, doc_freqs));
50
48
  Explanation *idf_expl2 = expl_create(self->idf,
51
- epstrdup("idf(%s: %s)", strlen(field) + df_i, field, doc_freqs));
49
+ strfmt("idf(%s: %s)", field, doc_freqs));
52
50
  if (terms->size > 0) free(doc_freqs); /* only free if allocated */
53
51
 
54
52
  /* explain query weight */
55
53
  Explanation *query_expl = expl_create(0.0,
56
- epstrdup("query_weight(%s), product of:", strlen(query_str), query_str));
54
+ strfmt("query_weight(%s), product of:", query_str));
57
55
 
58
56
  if (self->query->boost != 1.0) {
59
57
  expl_add_detail(query_expl, expl_create(self->query->boost, estrdup("boost")));
@@ -70,9 +68,7 @@ Explanation *spanw_explain(Weight *self, IndexReader *ir, int target)
70
68
 
71
69
  /* explain field weight */
72
70
  Explanation *field_expl = expl_create(0.0,
73
- epstrdup("field_weight(%s:%s in %d), product of:",
74
- strlen(field) + strlen(query_str) + 20,
75
- field, query_str, target));
71
+ strfmt("field_weight(%s:%s in %d), product of:", field, query_str, target));
76
72
  free(query_str);
77
73
 
78
74
  Scorer *scorer = self->scorer(self, ir);
@@ -84,8 +80,7 @@ Explanation *spanw_explain(Weight *self, IndexReader *ir, int target)
84
80
  uchar *field_norms = ir->get_norms(ir, field);
85
81
  float field_norm = (field_norms ? sim_decode_norm(self->similarity, field_norms[target]) : 0.0);
86
82
  Explanation *field_norm_expl = expl_create(field_norm,
87
- epstrdup("field_norm(field=%s, doc=%d)",
88
- strlen(field) + 20, field, target));
83
+ strfmt("field_norm(field=%s, doc=%d)", field, target));
89
84
  expl_add_detail(field_expl, field_norm_expl);
90
85
 
91
86
  field_expl->value = tf_expl->value * idf_expl2->value * field_norm_expl->value;
@@ -103,9 +98,7 @@ Explanation *spanw_explain(Weight *self, IndexReader *ir, int target)
103
98
 
104
99
  char *spanw_to_s(Weight *self)
105
100
  {
106
- char dbuf[32];
107
- dbl_to_s(dbuf, self->value);
108
- return epstrdup("SpanWeight(%s)", strlen(dbuf), dbuf);
101
+ return strfmt("SpanWeight(%f)", self->value);
109
102
  }
110
103
 
111
104
  void spanw_destroy(void *p)
@@ -317,7 +310,7 @@ char *spanfe_to_s(SpanEnum *self)
317
310
  {
318
311
  char *field = ((SpanQuery *)self->query->data)->field;
319
312
  char *query_str = self->query->to_s(self->query, field);
320
- char *res = epstrdup("SpanFirstEnum(%s)", strlen(query_str), query_str);
313
+ char *res = strfmt("SpanFirstEnum(%s)", query_str);
321
314
  free(query_str);
322
315
  return res;
323
316
  }
@@ -888,7 +881,7 @@ char *spanxe_to_s(SpanEnum *self)
888
881
  {
889
882
  char *field = ((SpanQuery *)self->query->data)->field;
890
883
  char *query_str = self->query->to_s(self->query, field);
891
- char *res = epstrdup("SpanNotEnum(%s)", strlen(query_str), query_str);
884
+ char *res = strfmt("SpanNotEnum(%s)", query_str);
892
885
  free(query_str);
893
886
  return res;
894
887
  }
@@ -954,7 +947,7 @@ char *spantq_to_s(Query *self, char *field)
954
947
  } else {
955
948
  term_str = term_to_s(term);
956
949
  }
957
- res = epstrdup("span_term(%s)", strlen(term_str), term_str);
950
+ res = strfmt("span_term(%s)", term_str);
958
951
  free(term_str);
959
952
  return res;
960
953
  }
@@ -1015,7 +1008,7 @@ char *spanfq_to_s(Query *self, char *field)
1015
1008
  SpanFirstQuery *sfq = (SpanFirstQuery *)((SpanQuery *)self->data)->data;
1016
1009
  Query *match = sfq->match;
1017
1010
  char *q_str = match->to_s(match, field);
1018
- char *res = epstrdup("span_first(%s, %d)", strlen(q_str) + 20, q_str, sfq->end);
1011
+ char *res = strfmt("span_first(%s, %d)", q_str, sfq->end);
1019
1012
  free(q_str);
1020
1013
  return res;
1021
1014
  }
@@ -1372,8 +1365,7 @@ char *spanxq_to_s(Query *self, char *field)
1372
1365
  SpanNotQuery *sxq = (SpanNotQuery *)((SpanQuery *)self->data)->data;
1373
1366
  char *inc_s = sxq->inc->to_s(sxq->inc, field);
1374
1367
  char *exc_s = sxq->exc->to_s(sxq->exc, field);
1375
- char *res = epstrdup("span_not(inc:<%s>, exc:<%s>)",
1376
- strlen(inc_s) + strlen(exc_s), inc_s, exc_s);
1368
+ char *res = strfmt("span_not(inc:<%s>, exc:<%s>)", inc_s, exc_s);
1377
1369
 
1378
1370
  free(inc_s);
1379
1371
  free(exc_s);
@@ -1526,7 +1518,7 @@ Explanation *spansc_explain(Scorer *self, int target)
1526
1518
  phrase_freq = (self->doc == target) ? spansc->freq : 0.0;
1527
1519
 
1528
1520
  Explanation *tf_explanation = expl_create(sim_tf(self->similarity, phrase_freq),
1529
- epstrdup("tf(phrase_freq(%#.5g)", 32, phrase_freq));
1521
+ strfmt("tf(phrase_freq(%f)", phrase_freq));
1530
1522
 
1531
1523
  return tf_explanation;
1532
1524
  }
@@ -24,20 +24,18 @@ Explanation *tw_explain(Weight *self, IndexReader *ir, int doc_num)
24
24
  char *field_name = term->field;
25
25
 
26
26
  Explanation *expl = expl_create(0.0,
27
- epstrdup("weight(%s in %d), product of:",
28
- strlen(query_str) + 20,
29
- query_str, doc_num));
27
+ strfmt("weight(%s in %d), product of:", query_str, doc_num));
30
28
 
31
29
  // We need two of these as it's included in both the query explanation
32
30
  // and the field explanation
33
31
  Explanation *idf_expl1 = expl_create(self->idf,
34
- epstrdup("idf(doc_freq=%d)", 20, ir->doc_freq(ir, tq->term)));
32
+ strfmt("idf(doc_freq=%d)", ir->doc_freq(ir, tq->term)));
35
33
  Explanation *idf_expl2 = expl_create(self->idf,
36
- epstrdup("idf(doc_freq=%d)", 20, ir->doc_freq(ir, tq->term)));
34
+ strfmt("idf(doc_freq=%d)", ir->doc_freq(ir, tq->term)));
37
35
 
38
36
  // explain query weight
39
37
  Explanation *query_expl = expl_create(0.0,
40
- epstrdup("query_weight(%s), product of:", strlen(query_str), query_str));
38
+ strfmt("query_weight(%s), product of:", query_str));
41
39
  free(query_str);
42
40
 
43
41
  if (self->query->boost != 1.0) {
@@ -55,8 +53,7 @@ Explanation *tw_explain(Weight *self, IndexReader *ir, int doc_num)
55
53
 
56
54
  // explain field weight
57
55
  Explanation *field_expl = expl_create(0.0,
58
- epstrdup("field_weight(%s:%s in %d), product of:",
59
- strlen(field_name) + strlen(term->text) + 20,
56
+ strfmt("field_weight(%s:%s in %d), product of:",
60
57
  field_name, term->text, doc_num));
61
58
 
62
59
  Scorer *scorer = self->scorer(self, ir);
@@ -68,8 +65,7 @@ Explanation *tw_explain(Weight *self, IndexReader *ir, int doc_num)
68
65
  uchar *field_norms = ir->get_norms(ir, field_name);
69
66
  float field_norm = (field_norms ? sim_decode_norm(self->similarity, field_norms[doc_num]) : 0.0);
70
67
  Explanation *field_norm_expl = expl_create(field_norm,
71
- epstrdup("field_norm(field=%s, doc=%d)",
72
- strlen(field_name) + 20, field_name, doc_num));
68
+ strfmt("field_norm(field=%s, doc=%d)", field_name, doc_num));
73
69
  expl_add_detail(field_expl, field_norm_expl);
74
70
 
75
71
  field_expl->value = tf_expl->value * idf_expl2->value * field_norm_expl->value;
@@ -87,9 +83,7 @@ Explanation *tw_explain(Weight *self, IndexReader *ir, int doc_num)
87
83
 
88
84
  char *tw_to_s(Weight *self)
89
85
  {
90
- char dbuf[32];
91
- dbl_to_s(dbuf, self->value);
92
- return epstrdup("TermWeight(%#.5g)", strlen(dbuf), dbuf);
86
+ return strfmt("TermWeight(%f)", self->value);
93
87
  }
94
88
 
95
89
  void tw_destroy(void *p)
@@ -141,19 +135,18 @@ char *tq_to_s(Query *self, char *field)
141
135
  int flen = strlen(term->field);
142
136
  int tlen = strlen(term->text);
143
137
  char *buffer = ALLOC_N(char, 34 + flen + tlen);
144
- char *bp = buffer;
138
+ char *b = buffer;
145
139
  if (strcmp(field, term->field) != 0) {
146
- memcpy(bp, term->field, sizeof(char) * flen);
147
- bp[flen] = ':';
148
- bp += flen + 1;
140
+ memcpy(b, term->field, sizeof(char) * flen);
141
+ b[flen] = ':';
142
+ b += flen + 1;
149
143
  }
150
- memcpy(bp, term->text, tlen);
151
- bp += tlen;
152
- *bp = 0;
144
+ memcpy(b, term->text, tlen);
145
+ b += tlen;
146
+ *b = 0;
153
147
  if (self->boost != 1.0) {
154
- char dbuf[32];
155
- dbl_to_s(dbuf, self->boost);
156
- sprintf(bp, "^%s", dbuf);
148
+ *b = '^';
149
+ dbl_to_s(b+1, self->boost);
157
150
  }
158
151
  return buffer;
159
152
  }
@@ -268,9 +261,7 @@ Explanation *tsc_explain(Scorer *self, int doc_num)
268
261
  tde->close(tde);
269
262
  ts->tde = NULL;
270
263
  Explanation *tf_explanation = expl_create(sim_tf(self->similarity, tf),
271
- epstrdup("tf(term_freq(%s:%s)=%ld)",
272
- strlen(term->field) + strlen(term->text) + 20,
273
- term->field, term->text, tf));
264
+ strfmt("tf(term_freq(%s:%s)=%d)", term->field, term->text, tf));
274
265
 
275
266
  return tf_explanation;
276
267
  }
@@ -1,17 +1,62 @@
1
1
  #include "ferret.h"
2
2
  #include "analysis.h"
3
+ #include "locale.h"
3
4
 
4
5
  static VALUE cToken;
6
+ static VALUE cAsciiLetterTokenizer;
5
7
  static VALUE cLetterTokenizer;
8
+ static VALUE cAsciiWhiteSpaceTokenizer;
9
+ static VALUE cWhiteSpaceTokenizer;
10
+ static VALUE cAsciiStandardTokenizer;
11
+ static VALUE cStandardTokenizer;
12
+
13
+ static VALUE cAsciiLowerCaseFilter;
14
+ static VALUE cLowerCaseFilter;
15
+ static VALUE cStopFilter;
16
+ static VALUE cStemFilter;
6
17
 
7
18
  static VALUE cAnalyzer;
19
+ static VALUE cAsciiLetterAnalyzer;
8
20
  static VALUE cLetterAnalyzer;
21
+ static VALUE cAsciiWhiteSpaceAnalyzer;
9
22
  static VALUE cWhiteSpaceAnalyzer;
23
+ static VALUE cAsciiStandardAnalyzer;
10
24
  static VALUE cStandardAnalyzer;
25
+ static VALUE cPerFieldAnalyzer;
26
+
27
+ //static VALUE cRegexAnalyzer;
28
+ static VALUE cTokenStream;
29
+
30
+ static ID id_next;
31
+ static ID id_reset;
32
+ static ID id_clone;
11
33
 
12
34
  /****************************************************************************
13
35
  *
14
- * Token Methods
36
+ * Utility Methods
37
+ *
38
+ ****************************************************************************/
39
+
40
+ static char **
41
+ get_stopwords(VALUE rstop_words)
42
+ {
43
+ char **stop_words;
44
+ int i, len;
45
+ VALUE rstr;
46
+ Check_Type(rstop_words, T_ARRAY);
47
+ len = RARRAY(rstop_words)->len;
48
+ stop_words = ALLOC_N(char *, RARRAY(rstop_words)->len + 1);
49
+ stop_words[len] = NULL;
50
+ for (i = 0; i < len; i++) {
51
+ rstr = rb_obj_as_string(RARRAY(rstop_words)->ptr[i]);
52
+ stop_words[i] = RSTRING(rstr)->ptr;
53
+ }
54
+ return stop_words;
55
+ }
56
+
57
+ /****************************************************************************
58
+ *
59
+ * token methods
15
60
  *
16
61
  ****************************************************************************/
17
62
 
@@ -41,6 +86,31 @@ frt_token_alloc(VALUE klass)
41
86
  return Data_Wrap_Struct(klass, &frt_token_mark, &frt_token_free, ALLOC(RToken));
42
87
  }
43
88
 
89
+ static VALUE
90
+ get_token(Token *tk)
91
+ {
92
+ RToken *token = ALLOC(RToken);
93
+
94
+ token->text = rb_str_new2(tk->text);
95
+ token->start = tk->start;
96
+ token->end = tk->end;
97
+ token->pos_inc = tk->pos_inc;
98
+ return Data_Wrap_Struct(cToken, &frt_token_mark, &frt_token_free, token);
99
+ }
100
+
101
+ Token *
102
+ frt_set_token(Token *tk, VALUE rt)
103
+ {
104
+ RToken *rtk;
105
+
106
+ if (rt == Qnil) return NULL;
107
+
108
+ Data_Get_Struct(rt, RToken, rtk);
109
+ tk_set(tk, RSTRING(rtk->text)->ptr, RSTRING(rtk->text)->len,
110
+ rtk->start, rtk->end, rtk->pos_inc);
111
+ return tk;
112
+ }
113
+
44
114
  #define GET_TK RToken *token; Data_Get_Struct(self, RToken, token);
45
115
  static VALUE
46
116
  frt_token_init(int argc, VALUE *argv, VALUE self)
@@ -129,23 +199,270 @@ frt_token_to_s(VALUE self)
129
199
 
130
200
  /****************************************************************************
131
201
  *
132
- * Tokenizer Methods
202
+ * TokenStream Methods
133
203
  *
134
204
  ****************************************************************************/
135
205
 
136
206
  static void
137
- frt_tokenizer_free(void *p)
207
+ frt_ts_mark(void *p)
138
208
  {
139
209
  TokenStream *ts = (TokenStream *)p;
140
- object_del(p);
210
+ if (ts->text) frt_gc_mark(&ts->text);
211
+ if (ts->sub_ts) frt_gc_mark(&ts->sub_ts);
212
+ }
213
+
214
+ static void
215
+ frt_ts_free(void *p)
216
+ {
217
+ TokenStream *ts = (TokenStream *)p;
218
+ if (object_get(&ts->text) != Qnil) object_del(&ts->text);
219
+ if (ts->sub_ts && (object_get(&ts->sub_ts) != Qnil)) object_del(&ts->sub_ts);
220
+ object_del(ts);
141
221
  ts->destroy(ts);
142
222
  }
143
223
 
144
224
  static VALUE
145
- frt_letter_tokenizer_init(VALUE self, VALUE rstr)
225
+ get_token_stream(TokenStream *ts)
226
+ {
227
+ VALUE rts = object_get(ts);
228
+ if (rts == Qnil) {
229
+ rts = Data_Wrap_Struct(cTokenStream, &frt_ts_mark, &frt_ts_free, ts);
230
+ object_add(ts, rts);
231
+ }
232
+ return rts;
233
+ }
234
+
235
+ static inline VALUE
236
+ get_wrapped_ts(VALUE self, VALUE rstr, TokenStream *ts)
146
237
  {
147
- TokenStream *ts = letter_tokenizer_create();
148
- Frt_Wrap_Struct(self, NULL, &frt_tokenizer_free, ts);
238
+ rstr = rb_obj_as_string(rstr);
239
+ ts->reset(ts, RSTRING(rstr)->ptr);
240
+ Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
241
+ object_add(&ts->text, rstr);
242
+ object_add(ts, self);
243
+ return self;
244
+ }
245
+
246
+ static VALUE
247
+ frt_ts_set_text(VALUE self, VALUE rtext)
248
+ {
249
+ TokenStream *ts;
250
+ Data_Get_Struct(self, TokenStream, ts);
251
+ rtext = rb_obj_as_string(rtext);
252
+ ts->reset(ts, RSTRING(rtext)->ptr);
253
+ object_set(&ts->text, rtext);
254
+
255
+ return rtext;
256
+ }
257
+
258
+ static VALUE
259
+ frt_ts_get_text(VALUE self)
260
+ {
261
+ VALUE rtext = Qnil;
262
+ TokenStream *ts;
263
+ Data_Get_Struct(self, TokenStream, ts);
264
+ if (ts->text) {
265
+ if ((rtext = object_get(&ts->text)) == Qnil) {
266
+ rtext = rb_str_new2(ts->text);
267
+ object_set(&ts->text, rtext);
268
+ }
269
+ }
270
+ return rtext;
271
+ }
272
+
273
+ static VALUE
274
+ frt_ts_next(VALUE self)
275
+ {
276
+ TokenStream *ts;
277
+ Data_Get_Struct(self, TokenStream, ts);
278
+ Token *next = ts->next(ts);
279
+ if (next == NULL) {
280
+ return Qnil;
281
+ }
282
+
283
+ return get_token(next);
284
+ }
285
+
286
+ /****************************************************************************
287
+ * CWrappedTokenStream
288
+ ****************************************************************************/
289
+
290
+ void cwrts_destroy(void *p)
291
+ {
292
+ TokenStream *ts = (TokenStream *)p;
293
+ free(ts->token);
294
+ free(ts);
295
+ }
296
+
297
+ Token *cwrts_next(TokenStream *ts)
298
+ {
299
+ VALUE rts = (VALUE)ts->data;
300
+ VALUE rtoken = rb_funcall(rts, id_next, 0);
301
+ return frt_set_token(ts->token, rtoken);
302
+ }
303
+
304
+ void cwrts_reset(TokenStream *ts, char *text)
305
+ {
306
+ VALUE rts = (VALUE)ts->data;
307
+ ts->t = ts->text = text;
308
+ rb_funcall(rts, id_reset, 1, rb_str_new2(text));
309
+ }
310
+
311
+ void cwrts_clone_i(TokenStream *orig_ts, TokenStream *new_ts)
312
+ {
313
+ VALUE rorig_ts = (VALUE)orig_ts->data;
314
+ new_ts->data = (void *)rb_funcall(rorig_ts, id_clone, 0);
315
+ }
316
+
317
+ static TokenStream *
318
+ get_cwrapped_rts(VALUE rts, bool *self_destroy)
319
+ {
320
+ TokenStream *ts;
321
+ switch (TYPE(rts)) {
322
+ case T_DATA:
323
+ Data_Get_Struct(rts, TokenStream, ts);
324
+ *self_destroy = true;
325
+ break;
326
+ default:
327
+ ts = ALLOC(TokenStream);
328
+ ts->token = ALLOC(Token);
329
+ ts->data = (void *)rts;
330
+ ts->next = &cwrts_next;
331
+ ts->reset = &cwrts_reset;
332
+ ts->clone_i = &cwrts_clone_i;
333
+ ts->destroy = &cwrts_destroy;
334
+ ts->sub_ts = NULL;
335
+ *self_destroy = false;
336
+ break;
337
+ }
338
+ return ts;
339
+ }
340
+
341
+ /****************************************************************************
342
+ * Tokenizers
343
+ ****************************************************************************/
344
+
345
+ #define TS_ARGS(dflt) \
346
+ bool lower;\
347
+ VALUE rlower, rstr;\
348
+ rb_scan_args(argc, argv, "11", &rstr, &rlower);\
349
+ lower = (argc ? RTEST(rlower) : dflt)
350
+
351
+ static VALUE
352
+ frt_a_letter_tokenizer_init(VALUE self, VALUE rstr)
353
+ {
354
+ return get_wrapped_ts(self, rstr, letter_tokenizer_create());
355
+ }
356
+
357
+ static VALUE
358
+ frt_letter_tokenizer_init(int argc, VALUE *argv, VALUE self)
359
+ {
360
+ TS_ARGS(false);
361
+ return get_wrapped_ts(self, rstr, mb_letter_tokenizer_create(lower));
362
+ }
363
+
364
+ static VALUE
365
+ frt_a_whitespace_tokenizer_init(VALUE self, VALUE rstr)
366
+ {
367
+ return get_wrapped_ts(self, rstr, whitespace_tokenizer_create());
368
+ }
369
+
370
+ static VALUE
371
+ frt_whitespace_tokenizer_init(int argc, VALUE *argv, VALUE self)
372
+ {
373
+ TS_ARGS(false);
374
+ return get_wrapped_ts(self, rstr, mb_whitespace_tokenizer_create(lower));
375
+ }
376
+
377
+ static VALUE
378
+ frt_a_standard_tokenizer_init(VALUE self, VALUE rstr)
379
+ {
380
+ return get_wrapped_ts(self, rstr, standard_tokenizer_create());
381
+ }
382
+
383
+ static VALUE
384
+ frt_standard_tokenizer_init(VALUE self, VALUE rstr)
385
+ {
386
+ return get_wrapped_ts(self, rstr, mb_standard_tokenizer_create());
387
+ }
388
+
389
+ /****************************************************************************
390
+ * Filters
391
+ ****************************************************************************/
392
+
393
+
394
+ static VALUE
395
+ frt_a_lowercase_filter_init(VALUE self, VALUE rsub_ts)
396
+ {
397
+ bool self_destroy;
398
+ TokenStream *ts = lowercase_filter_create(
399
+ get_cwrapped_rts(rsub_ts, &self_destroy));
400
+ ts->destroy_sub = !self_destroy;
401
+ object_add(&ts->sub_ts, rsub_ts);
402
+
403
+ Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
404
+ object_add(ts, self);
405
+ return self;
406
+ }
407
+
408
+ static VALUE
409
+ frt_lowercase_filter_init(VALUE self, VALUE rsub_ts)
410
+ {
411
+ bool self_destroy;
412
+ TokenStream *ts = mb_lowercase_filter_create(
413
+ get_cwrapped_rts(rsub_ts, &self_destroy));
414
+ ts->destroy_sub = !self_destroy;
415
+ object_add(&ts->sub_ts, rsub_ts);
416
+
417
+ Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
418
+ object_add(ts, self);
419
+ return self;
420
+ }
421
+
422
+ static VALUE
423
+ frt_stop_filter_init(int argc, VALUE *argv, VALUE self)
424
+ {
425
+ VALUE rsub_ts, rstop_words;
426
+ bool self_destroy;
427
+ TokenStream *ts;
428
+ rb_scan_args(argc, argv, "11", &rsub_ts, &rstop_words);
429
+ if (rstop_words != Qnil) {
430
+ char **stop_words = get_stopwords(rstop_words);
431
+ ts = stop_filter_create_with_words(
432
+ get_cwrapped_rts(rsub_ts, &self_destroy), (const char **)stop_words);
433
+ free(stop_words);
434
+ } else {
435
+ ts = stop_filter_create(
436
+ get_cwrapped_rts(rsub_ts, &self_destroy));
437
+ }
438
+ ts->destroy_sub = !self_destroy;
439
+ object_add(&ts->sub_ts, rsub_ts);
440
+
441
+ Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
442
+ object_add(ts, self);
443
+ return self;
444
+ }
445
+
446
+ static VALUE
447
+ frt_stem_filter_init(int argc, VALUE *argv, VALUE self)
448
+ {
449
+ VALUE rsub_ts, ralgorithm, rcharenc;
450
+ char *algorithm = "english";
451
+ char *charenc = NULL;
452
+ bool self_destroy;
453
+ TokenStream *ts;
454
+ rb_scan_args(argc, argv, "12", &rsub_ts, &ralgorithm, &rcharenc);
455
+ switch (argc) {
456
+ case 3: charenc = RSTRING(rb_obj_as_string(rcharenc))->ptr;
457
+ case 2: algorithm = RSTRING(rb_obj_as_string(ralgorithm))->ptr;
458
+ }
459
+ ts = stem_filter_create(
460
+ get_cwrapped_rts(rsub_ts, &self_destroy), algorithm, charenc);
461
+ ts->destroy_sub = !self_destroy;
462
+ object_add(&ts->sub_ts, rsub_ts);
463
+
464
+ Frt_Wrap_Struct(self, &frt_ts_mark, &frt_ts_free, ts);
465
+ object_add(ts, self);
149
466
  return self;
150
467
  }
151
468
 
@@ -155,6 +472,28 @@ frt_letter_tokenizer_init(VALUE self, VALUE rstr)
155
472
  *
156
473
  ****************************************************************************/
157
474
 
475
+ Analyzer *get_cwrapped_analyzer(ranalyzer)
476
+ {
477
+ Analyzer *a = NULL;
478
+ switch (TYPE(ranalyzer)) {
479
+ case T_DATA:
480
+ Data_Get_Struct(ranalyzer, Analyzer, a);
481
+ break;
482
+ default:
483
+ printf("Oh RFuck\n");
484
+ //ts = ALLOC(TokenStream);
485
+ //ts->token = ALLOC(Token);
486
+ //ts->data = (void *)rts;
487
+ //ts->next = &cwrts_next;
488
+ //ts->reset = &cwrts_reset;
489
+ //ts->clone_i = &cwrts_clone_i;
490
+ //ts->destroy = &cwrts_destroy;
491
+ //ts->sub_ts = NULL;
492
+ break;
493
+ }
494
+ return a;
495
+ }
496
+
158
497
  static void
159
498
  frt_analyzer_free(void *p)
160
499
  {
@@ -171,11 +510,53 @@ frt_get_analyzer(Analyzer *a)
171
510
  return self;
172
511
  }
173
512
 
513
+ static VALUE
514
+ frt_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
515
+ {
516
+ Analyzer *a = ((struct RData *)(self))->data;
517
+ rfield = rb_obj_as_string(rfield);
518
+ rstring = rb_obj_as_string(rstring);
519
+
520
+ TokenStream *ts = a_get_new_ts(a, RSTRING(rfield)->ptr, RSTRING(rstring)->ptr);
521
+
522
+ object_set(&ts->text, rstring); // Make sure that there is no entry already
523
+ return get_token_stream(ts);
524
+ }
525
+
526
/*
 * Declares `bool lower` and `VALUE rlower` in the enclosing scope and sets
 * `lower` from the single optional method argument: RTEST() of the argument
 * when one was passed (argc != 0), otherwise `dflt`.
 *
 * Requires `argc` and `argv` to be in scope. The expansion introduces
 * declarations and ends without a semicolon, so it must be used as a
 * statement at a point where declarations are allowed.
 */
#define GET_LOWER(dflt) \
    bool lower;\
    VALUE rlower;\
    rb_scan_args(argc, argv, "01", &rlower);\
    lower = (argc ? RTEST(rlower) : (dflt))
531
+
532
+ /*** AsciiWhiteSpaceAnalyzer ***/
533
+ static VALUE
534
+ frt_a_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
535
+ {
536
+ GET_LOWER(false);
537
+ Analyzer *a = whitespace_analyzer_create(lower);
538
+ Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
539
+ object_add(a, self);
540
+ return self;
541
+ }
542
+
174
543
  /*** WhiteSpaceAnalyzer ***/
175
544
  static VALUE
176
- frt_white_space_analyzer_init(VALUE self)
545
+ frt_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
546
+ {
547
+ GET_LOWER(false);
548
+ Analyzer *a = mb_whitespace_analyzer_create(lower);
549
+ Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
550
+ object_add(a, self);
551
+ return self;
552
+ }
553
+
554
+ /*** AsciiLetterAnalyzer ***/
555
+ static VALUE
556
+ frt_a_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
177
557
  {
178
- Analyzer *a = whitespace_analyzer_create();
558
+ GET_LOWER(true);
559
+ Analyzer *a = letter_analyzer_create(lower);
179
560
  Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
180
561
  object_add(a, self);
181
562
  return self;
@@ -183,9 +564,44 @@ frt_white_space_analyzer_init(VALUE self)
183
564
 
184
565
  /*** LetterAnalyzer ***/
185
566
  static VALUE
186
- frt_letter_analyzer_init(VALUE self)
567
+ frt_letter_analyzer_init(int argc, VALUE *argv, VALUE self)
187
568
  {
188
- Analyzer *a = letter_analyzer_create();
569
+ GET_LOWER(true);
570
+ Analyzer *a = mb_letter_analyzer_create(lower);
571
+ Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
572
+ object_add(a, self);
573
+ return self;
574
+ }
575
+
576
+ static VALUE
577
+ get_rstopwords(const char **stop_words)
578
+ {
579
+ char **w = (char **)stop_words;
580
+ VALUE rstopwords = rb_ary_new();
581
+
582
+ while (*w) {
583
+ rb_ary_push(rstopwords, rb_str_new2(*w));
584
+ w++;
585
+ }
586
+ return rstopwords;
587
+ }
588
+
589
+ /*** AsciiStandardAnalyzer ***/
590
+ static VALUE
591
+ frt_a_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
592
+ {
593
+ bool lower;
594
+ VALUE rlower, rstop_words;
595
+ Analyzer *a;
596
+ rb_scan_args(argc, argv, "02", &rlower, &rstop_words);
597
+ lower = ((rlower == Qnil) ? true : RTEST(rlower));
598
+ if (rstop_words != Qnil) {
599
+ char **stop_words = get_stopwords(rstop_words);
600
+ a = standard_analyzer_create_with_words((const char **)stop_words, lower);
601
+ free(stop_words);
602
+ } else {
603
+ a = standard_analyzer_create(lower);
604
+ }
189
605
  Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
190
606
  object_add(a, self);
191
607
  return self;
@@ -193,14 +609,98 @@ frt_letter_analyzer_init(VALUE self)
193
609
 
194
610
  /*** StandardAnalyzer ***/
195
611
  static VALUE
196
- frt_standard_analyzer_init(VALUE self)
612
+ frt_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
197
613
  {
198
- Analyzer *a = standard_analyzer_create();
614
+ bool lower;
615
+ VALUE rlower, rstop_words;
616
+ Analyzer *a;
617
+ rb_scan_args(argc, argv, "02", &rstop_words, &rlower);
618
+ lower = ((rlower == Qnil) ? true : RTEST(rlower));
619
+ if (rstop_words != Qnil) {
620
+ char **stop_words = get_stopwords(rstop_words);
621
+ a = mb_standard_analyzer_create_with_words((const char **)stop_words, lower);
622
+ free(stop_words);
623
+ } else {
624
+ a = mb_standard_analyzer_create(lower);
625
+ }
199
626
  Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
200
627
  object_add(a, self);
201
628
  return self;
202
629
  }
203
630
 
631
+ /*** PerFieldAnalyzer ***/
632
+ static VALUE
633
+ frt_per_field_analyzer_init(VALUE self, VALUE ranalyzer)
634
+ {
635
+ Analyzer *def = get_cwrapped_analyzer(ranalyzer);
636
+ Analyzer *a = per_field_analyzer_create(def, false);
637
+ Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
638
+ object_add(a, self);
639
+ return self;
640
+ }
641
+
642
+ static VALUE
643
+ frt_per_field_analyzer_add_field(VALUE self, VALUE rfield, VALUE ranalyzer)
644
+ {
645
+ Analyzer *pfa, *a;
646
+ Data_Get_Struct(self, Analyzer, pfa);
647
+ Data_Get_Struct(ranalyzer, Analyzer, a);
648
+
649
+ pfa_add_field(pfa, StringValuePtr(rfield), a);
650
+ return self;
651
+ }
652
+
653
+
654
+ /** RegexAnalyzer **/
655
+ /*
656
+ static VALUE
657
+ frt_regex_analyzer_init(VALUE self)
658
+ {
659
+ Analyzer *a = regex_analyzer_create();
660
+ // no idea why the macro is used here instead of Data_Wrap_Struct:
661
+ Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
662
+ // what is this for?:
663
+ object_add(a, self);
664
+ return self;
665
+ }
666
+
667
+ // convenience method
668
+ // XXX this sets the locale for the entire program
669
+ static VALUE
670
+ frt_regex_analyzer_token_stream(VALUE self, VALUE field, VALUE string)
671
+ {
672
+ Analyzer *a =((struct RData *)(self))->data;
673
+ TokenStream *ts = a->get_ts( a, StringValuePtr(field), StringValuePtr(string) );
674
+ // already freed via analyzer's free()
675
+ VALUE token_stream = Data_Wrap_Struct(cTokenStream, NULL, NULL, ts);
676
+ return token_stream;
677
+ }
678
+ */
679
+ /** /RegexAnalyzer **/
680
+
681
+ /** TokenStream **/
682
+ /** /TokenStream **/
683
+
684
+ /****************************************************************************
685
+ *
686
+ * Locale stuff
687
+ *
688
+ ****************************************************************************/
689
+
690
+ static char *frt_locale = NULL;
691
+
692
+ static VALUE frt_getlocale(VALUE self, VALUE locale)
693
+ {
694
+ return (frt_locale ? rb_str_new2(frt_locale) : Qnil);
695
+ }
696
+
697
+ static VALUE frt_setlocale(VALUE self, VALUE locale)
698
+ {
699
+ char *l = ((locale == Qnil) ? NULL : RSTRING(rb_obj_as_string(locale))->ptr);
700
+ frt_locale = setlocale(LC_ALL, l);
701
+ return frt_locale ? rb_str_new2(frt_locale) : Qnil;
702
+ }
703
+
204
704
  /****************************************************************************
205
705
  *
206
706
  * Init Function
@@ -210,6 +710,18 @@ frt_standard_analyzer_init(VALUE self)
210
710
  void
211
711
  Init_analysis(void)
212
712
  {
713
+ id_next = rb_intern("next");
714
+ id_reset = rb_intern("text=");
715
+ id_clone = rb_intern("clone");
716
+
717
+ /*** * * Locale stuff * * ***/
718
+ frt_locale = setlocale(LC_ALL, "");
719
+ rb_define_singleton_method(mFerret, "locale=", frt_setlocale, 1);
720
+ rb_define_singleton_method(mFerret, "locale", frt_getlocale, 0);
721
+
722
+ /*********************/
723
+ /*** * * Token * * ***/
724
+ /*********************/
213
725
  cToken = rb_define_class_under(mAnalysis, "Token", rb_cObject);
214
726
  rb_define_alloc_func(cToken, frt_token_alloc);
215
727
  rb_include_module(cToken, rb_mComparable);
@@ -223,33 +735,193 @@ Init_analysis(void)
223
735
  rb_define_method(cToken, "pos_inc", frt_token_get_pos_inc, 0);
224
736
  rb_define_method(cToken, "to_s", frt_token_to_s, 0);
225
737
 
738
+ /****************************/
739
+ /*** * * TokenStreams * * ***/
740
+ /****************************/
741
+
742
+ cTokenStream = rb_define_class_under(mAnalysis, "TokenStream", rb_cObject);
743
+ rb_define_method(cTokenStream, "next", frt_ts_next, 0);
744
+ rb_define_method(cTokenStream, "text=", frt_ts_set_text, 1);
745
+ rb_define_method(cTokenStream, "text", frt_ts_get_text, 0);
746
+
747
+ /******************/
748
+ /*** Tokenizers ***/
749
+ /******************/
750
+
751
+ /*** * * AsciiLetterTokenizer * * ***/
752
+ cAsciiLetterTokenizer =
753
+ rb_define_class_under(mAnalysis, "AsciiLetterTokenizer", cTokenStream);
754
+ rb_define_alloc_func(cAsciiLetterTokenizer, frt_data_alloc);
755
+ rb_define_method(cAsciiLetterTokenizer, "initialize",
756
+ frt_a_letter_tokenizer_init, 1);
757
+
758
+ /*** * * LetterTokenizer * * ***/
226
759
  cLetterTokenizer =
227
- rb_define_class_under(mAnalysis, "LetterTokenizer", rb_cObject);
760
+ rb_define_class_under(mAnalysis, "LetterTokenizer", cTokenStream);
228
761
  rb_define_alloc_func(cLetterTokenizer, frt_data_alloc);
229
762
  rb_define_method(cLetterTokenizer, "initialize",
230
- frt_letter_tokenizer_init, 1);
763
+ frt_letter_tokenizer_init, -1);
764
+
765
+ /*** * * AsciiWhiteSpaceTokenizer * * ***/
766
+ cAsciiWhiteSpaceTokenizer =
767
+ rb_define_class_under(mAnalysis, "AsciiWhiteSpaceTokenizer", cTokenStream);
768
+ rb_define_alloc_func(cAsciiWhiteSpaceTokenizer, frt_data_alloc);
769
+ rb_define_method(cAsciiWhiteSpaceTokenizer, "initialize",
770
+ frt_a_whitespace_tokenizer_init, 1);
771
+
772
+ /*** * * WhiteSpaceTokenizer * * ***/
773
+ cWhiteSpaceTokenizer =
774
+ rb_define_class_under(mAnalysis, "WhiteSpaceTokenizer", cTokenStream);
775
+ rb_define_alloc_func(cWhiteSpaceTokenizer, frt_data_alloc);
776
+ rb_define_method(cWhiteSpaceTokenizer, "initialize",
777
+ frt_whitespace_tokenizer_init, -1);
778
+
779
+ /*** * * AsciiStandardTokenizer * * ***/
780
+ cAsciiStandardTokenizer =
781
+ rb_define_class_under(mAnalysis, "AsciiStandardTokenizer", cTokenStream);
782
+ rb_define_alloc_func(cAsciiStandardTokenizer, frt_data_alloc);
783
+ rb_define_method(cAsciiStandardTokenizer, "initialize",
784
+ frt_a_standard_tokenizer_init, 1);
231
785
 
786
+ /*** * * StandardTokenizer * * ***/
787
+ cStandardTokenizer =
788
+ rb_define_class_under(mAnalysis, "StandardTokenizer", cTokenStream);
789
+ rb_define_alloc_func(cStandardTokenizer, frt_data_alloc);
790
+ rb_define_method(cStandardTokenizer, "initialize",
791
+ frt_standard_tokenizer_init, 1);
792
+
793
+ /***************/
794
+ /*** Filters ***/
795
+ /***************/
796
+ rb_define_const(mAnalysis, "ENGLISH_STOP_WORDS",
797
+ get_rstopwords(ENGLISH_STOP_WORDS));
798
+ rb_define_const(mAnalysis, "FULL_ENGLISH_STOP_WORDS",
799
+ get_rstopwords(FULL_ENGLISH_STOP_WORDS));
800
+ rb_define_const(mAnalysis, "EXTENDED_ENGLISH_STOP_WORDS",
801
+ get_rstopwords(EXTENDED_ENGLISH_STOP_WORDS));
802
+ rb_define_const(mAnalysis, "FULL_FRENCH_STOP_WORDS",
803
+ get_rstopwords(FULL_FRENCH_STOP_WORDS));
804
+ rb_define_const(mAnalysis, "FULL_SPANISH_STOP_WORDS",
805
+ get_rstopwords(FULL_SPANISH_STOP_WORDS));
806
+ rb_define_const(mAnalysis, "FULL_PORTUGUESE_STOP_WORDS",
807
+ get_rstopwords(FULL_PORTUGUESE_STOP_WORDS));
808
+ rb_define_const(mAnalysis, "FULL_ITALIAN_STOP_WORDS",
809
+ get_rstopwords(FULL_ITALIAN_STOP_WORDS));
810
+ rb_define_const(mAnalysis, "FULL_GERMAN_STOP_WORDS",
811
+ get_rstopwords(FULL_GERMAN_STOP_WORDS));
812
+ rb_define_const(mAnalysis, "FULL_DUTCH_STOP_WORDS",
813
+ get_rstopwords(FULL_DUTCH_STOP_WORDS));
814
+ rb_define_const(mAnalysis, "FULL_SWEDISH_STOP_WORDS",
815
+ get_rstopwords(FULL_SWEDISH_STOP_WORDS));
816
+ rb_define_const(mAnalysis, "FULL_NORWEGIAN_STOP_WORDS",
817
+ get_rstopwords(FULL_NORWEGIAN_STOP_WORDS));
818
+ rb_define_const(mAnalysis, "FULL_DANISH_STOP_WORDS",
819
+ get_rstopwords(FULL_DANISH_STOP_WORDS));
820
+ rb_define_const(mAnalysis, "FULL_RUSSIAN_STOP_WORDS",
821
+ get_rstopwords(FULL_RUSSIAN_STOP_WORDS));
822
+ rb_define_const(mAnalysis, "FULL_FINNISH_STOP_WORDS",
823
+ get_rstopwords(FULL_FINNISH_STOP_WORDS));
824
+
825
+ cAsciiLowerCaseFilter =
826
+ rb_define_class_under(mAnalysis, "AsciiLowerCaseFilter", cTokenStream);
827
+ rb_define_alloc_func(cAsciiLowerCaseFilter, frt_data_alloc);
828
+ rb_define_method(cAsciiLowerCaseFilter, "initialize",
829
+ frt_a_lowercase_filter_init, 1);
830
+
831
+ cLowerCaseFilter =
832
+ rb_define_class_under(mAnalysis, "LowerCaseFilter", cTokenStream);
833
+ rb_define_alloc_func(cLowerCaseFilter, frt_data_alloc);
834
+ rb_define_method(cLowerCaseFilter, "initialize",
835
+ frt_lowercase_filter_init, 1);
836
+
837
+ cStopFilter =
838
+ rb_define_class_under(mAnalysis, "StopFilter", cTokenStream);
839
+ rb_define_alloc_func(cStopFilter, frt_data_alloc);
840
+ rb_define_method(cStopFilter, "initialize",
841
+ frt_stop_filter_init, -1);
842
+
843
+ cStemFilter =
844
+ rb_define_class_under(mAnalysis, "StemFilter", cTokenStream);
845
+ rb_define_alloc_func(cStemFilter, frt_data_alloc);
846
+ rb_define_method(cStemFilter, "initialize",
847
+ frt_stem_filter_init, -1);
848
+
849
+
850
+ /*************************/
851
+ /*** * * Analyzers * * ***/
852
+ /*************************/
853
+
854
+ /*** * * Analyzer * * ***/
232
855
  cAnalyzer =
233
856
  rb_define_class_under(mAnalysis, "Analyzer", rb_cObject);
234
857
  rb_define_alloc_func(cAnalyzer, frt_data_alloc);
235
- rb_define_method(cAnalyzer, "initialize",
236
- frt_letter_analyzer_init, 0);
858
+ rb_define_method(cAnalyzer, "initialize", frt_letter_analyzer_init, -1);
859
+ rb_define_method(cAnalyzer, "token_stream", frt_analyzer_token_stream, 2);
860
+
861
+ /*** * * AsciiLetterAnalyzer * * ***/
862
+ cAsciiLetterAnalyzer =
863
+ rb_define_class_under(mAnalysis, "AsciiLetterAnalyzer", cAnalyzer);
864
+ rb_define_alloc_func(cAsciiLetterAnalyzer, frt_data_alloc);
865
+ rb_define_method(cAsciiLetterAnalyzer, "initialize",
866
+ frt_a_letter_analyzer_init, -1);
237
867
 
868
+ /*** * * LetterAnalyzer * * ***/
238
869
  cLetterAnalyzer =
239
870
  rb_define_class_under(mAnalysis, "LetterAnalyzer", cAnalyzer);
240
871
  rb_define_alloc_func(cLetterAnalyzer, frt_data_alloc);
241
- rb_define_method(cAnalyzer, "initialize",
242
- frt_letter_analyzer_init, 0);
872
+ rb_define_method(cLetterAnalyzer, "initialize",
873
+ frt_letter_analyzer_init, -1);
243
874
 
875
+ /*** * * AsciiWhiteSpaceAnalyzer * * ***/
876
+ cAsciiWhiteSpaceAnalyzer =
877
+ rb_define_class_under(mAnalysis, "AsciiWhiteSpaceAnalyzer", cAnalyzer);
878
+ rb_define_alloc_func(cAsciiWhiteSpaceAnalyzer, frt_data_alloc);
879
+ rb_define_method(cAsciiWhiteSpaceAnalyzer, "initialize",
880
+ frt_a_white_space_analyzer_init, -1);
881
+
882
+ /*** * * WhiteSpaceAnalyzer * * ***/
244
883
  cWhiteSpaceAnalyzer =
245
884
  rb_define_class_under(mAnalysis, "WhiteSpaceAnalyzer", cAnalyzer);
246
885
  rb_define_alloc_func(cWhiteSpaceAnalyzer, frt_data_alloc);
247
886
  rb_define_method(cWhiteSpaceAnalyzer, "initialize",
248
- frt_white_space_analyzer_init, 0);
887
+ frt_white_space_analyzer_init, -1);
888
+
889
+ /*** * * AsciiStandardAnalyzer * * ***/
890
+ cAsciiStandardAnalyzer =
891
+ rb_define_class_under(mAnalysis, "AsciiStandardAnalyzer", cAnalyzer);
892
+ rb_define_alloc_func(cAsciiStandardAnalyzer, frt_data_alloc);
893
+ rb_define_method(cAsciiStandardAnalyzer, "initialize",
894
+ frt_a_standard_analyzer_init, -1);
249
895
 
896
+ /*** * * StandardAnalyzer * * ***/
250
897
  cStandardAnalyzer =
251
898
  rb_define_class_under(mAnalysis, "StandardAnalyzer", cAnalyzer);
252
899
  rb_define_alloc_func(cStandardAnalyzer, frt_data_alloc);
253
900
  rb_define_method(cStandardAnalyzer, "initialize",
254
- frt_standard_analyzer_init, 0);
901
+ frt_standard_analyzer_init, -1);
902
+
903
+ /*** * * PerFieldAnalyzer * * ***/
904
+ cPerFieldAnalyzer =
905
+ rb_define_class_under(mAnalysis, "PerFieldAnalyzer", cAnalyzer);
906
+ rb_define_alloc_func(cPerFieldAnalyzer, frt_data_alloc);
907
+ rb_define_method(cPerFieldAnalyzer, "initialize",
908
+ frt_per_field_analyzer_init, 1);
909
+ rb_define_method(cPerFieldAnalyzer, "add_field",
910
+ frt_per_field_analyzer_add_field, 2);
911
+ rb_define_method(cPerFieldAnalyzer, "[]=",
912
+ frt_per_field_analyzer_add_field, 2);
913
+
914
+ /** RegexAnalyzer **/
915
+ /*
916
+ cRegexAnalyzer =
917
+ rb_define_class_under(mAnalysis, "RegexAnalyzer", cAnalyzer);
918
+ rb_define_alloc_func(cRegexAnalyzer, frt_data_alloc);
919
+ rb_define_method(cRegexAnalyzer, "initialize",
920
+ frt_regex_analyzer_init, 0);
921
+ rb_define_method(cRegexAnalyzer, "token_stream",
922
+ frt_regex_analyzer_token_stream, 2);
923
+ rb_define_method(cRegexAnalyzer, "setlocale",
924
+ frt_regex_analyzer_setlocale, 1);
925
+ */
926
+
255
927
  }