ferret 0.9.0 → 0.9.1

Files changed (187)
  1. data/Rakefile +23 -5
  2. data/TODO +2 -1
  3. data/ext/analysis.c +838 -177
  4. data/ext/analysis.h +55 -7
  5. data/ext/api.c +69 -0
  6. data/ext/api.h +27 -0
  7. data/ext/array.c +8 -5
  8. data/ext/compound_io.c +132 -96
  9. data/ext/document.c +58 -28
  10. data/ext/except.c +59 -0
  11. data/ext/except.h +88 -0
  12. data/ext/ferret.c +47 -3
  13. data/ext/ferret.h +3 -0
  14. data/ext/field.c +15 -9
  15. data/ext/filter.c +1 -1
  16. data/ext/fs_store.c +215 -34
  17. data/ext/global.c +72 -3
  18. data/ext/global.h +4 -3
  19. data/ext/hash.c +44 -3
  20. data/ext/hash.h +9 -0
  21. data/ext/header.h +58 -0
  22. data/ext/inc/except.h +88 -0
  23. data/ext/inc/lang.h +23 -13
  24. data/ext/ind.c +16 -10
  25. data/ext/index.h +2 -22
  26. data/ext/index_io.c +3 -11
  27. data/ext/index_rw.c +245 -193
  28. data/ext/lang.h +23 -13
  29. data/ext/libstemmer.c +92 -0
  30. data/ext/libstemmer.h +79 -0
  31. data/ext/modules.h +162 -0
  32. data/ext/q_boolean.c +34 -21
  33. data/ext/q_const_score.c +6 -12
  34. data/ext/q_filtered_query.c +206 -0
  35. data/ext/q_fuzzy.c +18 -15
  36. data/ext/q_match_all.c +3 -7
  37. data/ext/q_multi_phrase.c +10 -14
  38. data/ext/q_parser.c +29 -2
  39. data/ext/q_phrase.c +14 -21
  40. data/ext/q_prefix.c +15 -12
  41. data/ext/q_range.c +30 -28
  42. data/ext/q_span.c +13 -21
  43. data/ext/q_term.c +17 -26
  44. data/ext/r_analysis.c +693 -21
  45. data/ext/r_doc.c +11 -12
  46. data/ext/r_index_io.c +4 -1
  47. data/ext/r_qparser.c +21 -2
  48. data/ext/r_search.c +285 -18
  49. data/ext/ram_store.c +5 -2
  50. data/ext/search.c +11 -17
  51. data/ext/search.h +21 -45
  52. data/ext/similarity.h +67 -0
  53. data/ext/sort.c +30 -25
  54. data/ext/stem_ISO_8859_1_danish.c +338 -0
  55. data/ext/stem_ISO_8859_1_danish.h +16 -0
  56. data/ext/stem_ISO_8859_1_dutch.c +635 -0
  57. data/ext/stem_ISO_8859_1_dutch.h +16 -0
  58. data/ext/stem_ISO_8859_1_english.c +1156 -0
  59. data/ext/stem_ISO_8859_1_english.h +16 -0
  60. data/ext/stem_ISO_8859_1_finnish.c +792 -0
  61. data/ext/stem_ISO_8859_1_finnish.h +16 -0
  62. data/ext/stem_ISO_8859_1_french.c +1276 -0
  63. data/ext/stem_ISO_8859_1_french.h +16 -0
  64. data/ext/stem_ISO_8859_1_german.c +512 -0
  65. data/ext/stem_ISO_8859_1_german.h +16 -0
  66. data/ext/stem_ISO_8859_1_italian.c +1091 -0
  67. data/ext/stem_ISO_8859_1_italian.h +16 -0
  68. data/ext/stem_ISO_8859_1_norwegian.c +296 -0
  69. data/ext/stem_ISO_8859_1_norwegian.h +16 -0
  70. data/ext/stem_ISO_8859_1_porter.c +776 -0
  71. data/ext/stem_ISO_8859_1_porter.h +16 -0
  72. data/ext/stem_ISO_8859_1_portuguese.c +1035 -0
  73. data/ext/stem_ISO_8859_1_portuguese.h +16 -0
  74. data/ext/stem_ISO_8859_1_spanish.c +1119 -0
  75. data/ext/stem_ISO_8859_1_spanish.h +16 -0
  76. data/ext/stem_ISO_8859_1_swedish.c +307 -0
  77. data/ext/stem_ISO_8859_1_swedish.h +16 -0
  78. data/ext/stem_KOI8_R_russian.c +701 -0
  79. data/ext/stem_KOI8_R_russian.h +16 -0
  80. data/ext/stem_UTF_8_danish.c +344 -0
  81. data/ext/stem_UTF_8_danish.h +16 -0
  82. data/ext/stem_UTF_8_dutch.c +653 -0
  83. data/ext/stem_UTF_8_dutch.h +16 -0
  84. data/ext/stem_UTF_8_english.c +1176 -0
  85. data/ext/stem_UTF_8_english.h +16 -0
  86. data/ext/stem_UTF_8_finnish.c +808 -0
  87. data/ext/stem_UTF_8_finnish.h +16 -0
  88. data/ext/stem_UTF_8_french.c +1296 -0
  89. data/ext/stem_UTF_8_french.h +16 -0
  90. data/ext/stem_UTF_8_german.c +526 -0
  91. data/ext/stem_UTF_8_german.h +16 -0
  92. data/ext/stem_UTF_8_italian.c +1113 -0
  93. data/ext/stem_UTF_8_italian.h +16 -0
  94. data/ext/stem_UTF_8_norwegian.c +302 -0
  95. data/ext/stem_UTF_8_norwegian.h +16 -0
  96. data/ext/stem_UTF_8_porter.c +794 -0
  97. data/ext/stem_UTF_8_porter.h +16 -0
  98. data/ext/stem_UTF_8_portuguese.c +1055 -0
  99. data/ext/stem_UTF_8_portuguese.h +16 -0
  100. data/ext/stem_UTF_8_russian.c +709 -0
  101. data/ext/stem_UTF_8_russian.h +16 -0
  102. data/ext/stem_UTF_8_spanish.c +1137 -0
  103. data/ext/stem_UTF_8_spanish.h +16 -0
  104. data/ext/stem_UTF_8_swedish.c +313 -0
  105. data/ext/stem_UTF_8_swedish.h +16 -0
  106. data/ext/stopwords.c +325 -0
  107. data/ext/store.c +34 -2
  108. data/ext/tags +2953 -0
  109. data/ext/term.c +21 -15
  110. data/ext/termdocs.c +5 -3
  111. data/ext/utilities.c +446 -0
  112. data/ext/vector.c +27 -13
  113. data/lib/ferret/document/document.rb +1 -1
  114. data/lib/ferret/index/index.rb +44 -6
  115. data/lib/ferret/query_parser/query_parser.tab.rb +7 -3
  116. data/lib/rferret.rb +2 -1
  117. data/test/test_helper.rb +2 -2
  118. data/test/unit/analysis/ctc_analyzer.rb +401 -0
  119. data/test/unit/analysis/ctc_tokenstream.rb +423 -0
  120. data/test/unit/analysis/{tc_letter_tokenizer.rb → rtc_letter_tokenizer.rb} +0 -0
  121. data/test/unit/analysis/{tc_lower_case_filter.rb → rtc_lower_case_filter.rb} +0 -0
  122. data/test/unit/analysis/{tc_lower_case_tokenizer.rb → rtc_lower_case_tokenizer.rb} +0 -0
  123. data/test/unit/analysis/{tc_per_field_analyzer_wrapper.rb → rtc_per_field_analyzer_wrapper.rb} +0 -0
  124. data/test/unit/analysis/{tc_porter_stem_filter.rb → rtc_porter_stem_filter.rb} +0 -0
  125. data/test/unit/analysis/{tc_standard_analyzer.rb → rtc_standard_analyzer.rb} +0 -0
  126. data/test/unit/analysis/{tc_standard_tokenizer.rb → rtc_standard_tokenizer.rb} +0 -0
  127. data/test/unit/analysis/{tc_stop_analyzer.rb → rtc_stop_analyzer.rb} +0 -0
  128. data/test/unit/analysis/{tc_stop_filter.rb → rtc_stop_filter.rb} +0 -0
  129. data/test/unit/analysis/{tc_white_space_analyzer.rb → rtc_white_space_analyzer.rb} +0 -0
  130. data/test/unit/analysis/{tc_white_space_tokenizer.rb → rtc_white_space_tokenizer.rb} +0 -0
  131. data/test/unit/analysis/{tc_word_list_loader.rb → rtc_word_list_loader.rb} +0 -0
  132. data/test/unit/analysis/tc_analyzer.rb +1 -2
  133. data/test/unit/analysis/{c_token.rb → tc_token.rb} +0 -0
  134. data/test/unit/document/rtc_field.rb +28 -0
  135. data/test/unit/document/{c_document.rb → tc_document.rb} +0 -0
  136. data/test/unit/document/tc_field.rb +82 -12
  137. data/test/unit/index/{tc_compound_file_io.rb → rtc_compound_file_io.rb} +0 -0
  138. data/test/unit/index/{tc_field_infos.rb → rtc_field_infos.rb} +0 -0
  139. data/test/unit/index/{tc_fields_io.rb → rtc_fields_io.rb} +0 -0
  140. data/test/unit/index/{tc_multiple_term_doc_pos_enum.rb → rtc_multiple_term_doc_pos_enum.rb} +0 -0
  141. data/test/unit/index/{tc_segment_infos.rb → rtc_segment_infos.rb} +0 -0
  142. data/test/unit/index/{tc_segment_term_docs.rb → rtc_segment_term_docs.rb} +0 -0
  143. data/test/unit/index/{tc_segment_term_enum.rb → rtc_segment_term_enum.rb} +0 -0
  144. data/test/unit/index/{tc_segment_term_vector.rb → rtc_segment_term_vector.rb} +0 -0
  145. data/test/unit/index/{tc_term_buffer.rb → rtc_term_buffer.rb} +0 -0
  146. data/test/unit/index/{tc_term_info.rb → rtc_term_info.rb} +0 -0
  147. data/test/unit/index/{tc_term_infos_io.rb → rtc_term_infos_io.rb} +0 -0
  148. data/test/unit/index/{tc_term_vectors_io.rb → rtc_term_vectors_io.rb} +0 -0
  149. data/test/unit/index/{c_index.rb → tc_index.rb} +26 -6
  150. data/test/unit/index/{c_index_reader.rb → tc_index_reader.rb} +0 -0
  151. data/test/unit/index/{c_index_writer.rb → tc_index_writer.rb} +0 -0
  152. data/test/unit/index/{c_term.rb → tc_term.rb} +0 -0
  153. data/test/unit/index/{c_term_voi.rb → tc_term_voi.rb} +0 -0
  154. data/test/unit/query_parser/{c_query_parser.rb → rtc_query_parser.rb} +14 -14
  155. data/test/unit/query_parser/tc_query_parser.rb +24 -16
  156. data/test/unit/search/{tc_similarity.rb → rtc_similarity.rb} +0 -0
  157. data/test/unit/search/rtc_sort_field.rb +14 -0
  158. data/test/unit/search/{c_filter.rb → tc_filter.rb} +11 -11
  159. data/test/unit/search/{c_fuzzy_query.rb → tc_fuzzy_query.rb} +0 -0
  160. data/test/unit/search/{c_index_searcher.rb → tc_index_searcher.rb} +0 -0
  161. data/test/unit/search/{c_search_and_sort.rb → tc_search_and_sort.rb} +0 -0
  162. data/test/unit/search/{c_sort.rb → tc_sort.rb} +0 -0
  163. data/test/unit/search/tc_sort_field.rb +20 -7
  164. data/test/unit/search/{c_spans.rb → tc_spans.rb} +0 -0
  165. data/test/unit/store/rtc_fs_store.rb +62 -0
  166. data/test/unit/store/rtc_ram_store.rb +15 -0
  167. data/test/unit/store/rtm_store.rb +150 -0
  168. data/test/unit/store/rtm_store_lock.rb +2 -0
  169. data/test/unit/store/tc_fs_store.rb +54 -40
  170. data/test/unit/store/tc_ram_store.rb +20 -0
  171. data/test/unit/store/tm_store.rb +30 -146
  172. data/test/unit/store/tm_store_lock.rb +66 -0
  173. data/test/unit/utils/{tc_bit_vector.rb → rtc_bit_vector.rb} +0 -0
  174. data/test/unit/utils/{tc_date_tools.rb → rtc_date_tools.rb} +0 -0
  175. data/test/unit/utils/{tc_number_tools.rb → rtc_number_tools.rb} +0 -0
  176. data/test/unit/utils/{tc_parameter.rb → rtc_parameter.rb} +0 -0
  177. data/test/unit/utils/{tc_priority_queue.rb → rtc_priority_queue.rb} +0 -0
  178. data/test/unit/utils/{tc_string_helper.rb → rtc_string_helper.rb} +0 -0
  179. data/test/unit/utils/{tc_thread.rb → rtc_thread.rb} +0 -0
  180. data/test/unit/utils/{tc_weak_key_hash.rb → rtc_weak_key_hash.rb} +0 -0
  181. metadata +360 -289
  182. data/test/unit/document/c_field.rb +0 -98
  183. data/test/unit/search/c_sort_field.rb +0 -27
  184. data/test/unit/store/c_fs_store.rb +0 -76
  185. data/test/unit/store/c_ram_store.rb +0 -35
  186. data/test/unit/store/m_store.rb +0 -34
  187. data/test/unit/store/m_store_lock.rb +0 -68
data/Rakefile CHANGED
@@ -1,6 +1,6 @@
 $:. << 'lib'
 # Some parts of this Rakefile where taken from Jim Weirich's Rakefile for
-# Rake. Other parts where stolen from the David Heinemeier Hansson's Rails
+# Rake. Other parts where taken from the David Heinemeier Hansson's Rails
 # Rakefile. Both are under MIT-LICENSE. Thanks to both for their excellent
 # projects.
 
@@ -32,12 +32,13 @@ end
 $VERBOSE = nil
 
 EXT = "ferret_ext.so"
-EXT_SRC = FileList["src/*/*.[ch]"]
+EXT_SRC = FileList["src/**/*.[ch]"]
+
 EXT_SRC_DEST = EXT_SRC.map {|fn| File.join("ext", File.basename(fn))}
 SRC = (FileList["ext/*.[ch]"] + EXT_SRC_DEST).uniq
 
 CLEAN.include(FileList['**/*.o', 'InstalledFiles', '.config'])
-CLOBBER.include(FileList['**/*.so'], 'ext/Makefile')
+CLOBBER.include(FileList['**/*.so'], 'ext/Makefile', EXT_SRC_DEST)
 
 task :default => :all_tests
 desc "Run all tests"
@@ -57,7 +58,7 @@ end
 desc "run unit tests in test/unit for C ferret"
 Rake::TestTask.new("test_cunits" => :ext) do |t|
   t.libs << "test/unit"
-  t.pattern = 'test/unit/t[cs]_*.rb'
+  t.pattern = 'test/unit/ts_*.rb'
   t.verbose = true
 end
 
@@ -102,6 +103,15 @@ EXT_SRC.each do |fn|
   dest_fn = File.join("ext", File.basename(fn))
   file dest_fn => fn do |t|
     cp fn, dest_fn
+    if fn =~ /stemmer/
+      # flatten the directory structure for lib_stemmer
+      open(dest_fn) do |in_f|
+        open(dest_fn + ".out", "w") do |out_f|
+          in_f.each {|line| out_f.write(line.sub(/(#include ["<])[.a-z_\/]*\//) {"#{$1}"})}
+        end
+      end
+      mv dest_fn + ".out", dest_fn
+    end
   end
 end
 
@@ -110,9 +120,17 @@ task :ext => ["ext/#{EXT}"] + SRC
 
 file "ext/#{EXT}" => ["ext/Makefile"] do
   cp "ext/inc/lang.h", "ext/lang.h"
+  cp "ext/inc/except.h", "ext/except.h"
   sh "cd ext; make"
 end
 
+file "ext/lang.h" => ["ext/inc/lang.h"] do
+  cp "ext/inc/lang.h", "ext/lang.h"
+end
+file "ext/except.h" => ["ext/inc/except.h"] do
+  cp "ext/inc/except.h", "ext/except.h"
+end
+
 file "ext/Makefile" => SRC do
   sh "cd ext; ruby extconf.rb"
 end
@@ -220,7 +238,7 @@ task :repackage => EXT_SRC_DEST
 task :package => EXT_SRC_DEST
 task :tag => [:prerelease]
 task :update_version => [:prerelease]
-task :release => [:tag, :update_version, :package] do
+task :release do #=> [:tag, :update_version, :package] do
   announce
   announce "**************************************************************"
   announce "* Release #{PKG_VERSION} Complete."
data/TODO CHANGED
@@ -4,11 +4,12 @@ Send suggestions for this list to mailto:dbalmain@gmail.com
 
 === To Do
 
-* Add the ability to persist an in memory index to Ferret::Index::Index
 * Make a dll for people on Windows
+* pure ruby ConstantScoreQuery
 
 === Done
 
+* Add the ability to persist an in memory index to Ferret::Index::Index
 * Add UTF-8 support
 * Multi Field Query
 * Test threading
data/ext/analysis.c CHANGED
@@ -1,7 +1,16 @@
 #include <analysis.h>
 #include <string.h>
 #include <ctype.h>
-#include <hash.h>
+#include <wctype.h>
+#include <wchar.h>
+#include "hash.h"
+#include "libstemmer.h"
+
+/****************************************************************************
+ *
+ * Token
+ *
+ ****************************************************************************/
 
 Token *tk_create()
 {
@@ -24,6 +33,11 @@ inline Token *tk_set(Token *tk, char *text, int tlen, int start, int end, int po
   return tk;
 }
 
+inline Token *tk_set_ts(Token *tk, char *start, char *end, char *text, int pos_inc)
+{
+  return tk_set(tk, start, end - start, start - text, end - text, pos_inc);
+}
+
 inline Token *tk_set_no_len(Token *tk, char *text, int start, int end, int pos_inc)
 {
   return tk_set(tk, text, strlen(text), start, end, pos_inc);
@@ -31,11 +45,8 @@ inline Token *tk_set_no_len(Token *tk, char *text, int start, int end, int pos_i
 
 int tk_eq(Token *tk1, Token *tk2)
 {
-  if (strcmp((char *)tk1->text, (char *)tk2->text) == 0 &&
-      tk1->start == tk2->start && tk1->end == tk2->end)
-    return true;
-  else
-    return false;
+  return (strcmp((char *)tk1->text, (char *)tk2->text) == 0 &&
+      tk1->start == tk2->start && tk1->end == tk2->end);
 }
 
 int tk_cmp(Token *tk1, Token *tk2)
@@ -57,46 +68,152 @@ int tk_cmp(Token *tk1, Token *tk2)
   return cmp;
 }
 
+
+/****************************************************************************
+ *
+ * TokenStream
+ *
+ ****************************************************************************/
+
 void ts_standard_destroy(void *p)
 {
   TokenStream *ts = (TokenStream *)p;
   tk_destroy(ts->token);
-  free(p);
+  free(ts);
 }
 
 void ts_reset(TokenStream *ts, char *text)
 {
-  ts->text = text;
-  ts->pos = 0;
+  ts->t = ts->text = text;
 }
 
 TokenStream *ts_create()
 {
   TokenStream *ts = ALLOC(TokenStream);
-  ts->pos = -1;
   ts->text = NULL;
   ts->token = tk_create();
   ts->destroy = &ts_standard_destroy;
   ts->reset = &ts_reset;
+  ts->sub_ts = NULL;
+  ts->clone_i = NULL;
   return ts;
 }
 
+TokenStream *ts_clone(TokenStream *orig_ts)
+{
+  TokenStream *ts = ALLOC(TokenStream);
+  memcpy(ts, orig_ts, sizeof(TokenStream));
+  if (orig_ts->token) {
+    ts->token = ALLOC(Token);
+    memcpy(ts->token, orig_ts->token, sizeof(Token));
+  }
+  if (orig_ts->sub_ts) ts->sub_ts = ts_clone(orig_ts->sub_ts);
+  if (orig_ts->clone_i) orig_ts->clone_i(orig_ts, ts);
+  return ts;
+}
+
+/* * Multi-byte TokenStream * */
+static char * const ENC_ERR_MSG = "Error decoding input string. "
+    "Check that you have the locale set correctly";
+#define MB_NEXT_CHAR \
+    if ((i = mbrtowc(&wchr, t, MB_CUR_MAX, (mbstate_t *)ts->data)) < 0)\
+        RAISE(IO_ERROR, ENC_ERR_MSG)
+
+inline Token *w_tk_set(Token *tk, wchar_t *text, int start, int end, int pos_inc)
+{
+  tk->text[wcstombs(tk->text, text, MAX_WORD_SIZE - 1)] = '\0';
+  tk->start = start;
+  tk->end = end;
+  tk->pos_inc = pos_inc;
+  return tk;
+}
+
+void mb_ts_standard_destroy(void *p)
+{
+  TokenStream *ts = (TokenStream *)p;
+  tk_destroy(ts->token);
+  free(ts->data);
+  free(ts);
+}
+
+void mb_ts_reset(TokenStream *ts, char *text)
+{
+  ZEROSET(ts->data, mbstate_t, 1);
+  ts_reset(ts, text);
+}
+
+void mb_ts_clone_i(TokenStream *orig_ts, TokenStream *new_ts)
+{
+  new_ts->data = ALLOC(mbstate_t);
+  memcpy(new_ts->data, orig_ts->data, sizeof(mbstate_t));
+}
+
+TokenStream *mb_ts_create()
+{
+  TokenStream *ts = ALLOC(TokenStream);
+  ts->data = ALLOC(mbstate_t);
+  ts->text = NULL;
+  ts->token = tk_create();
+  ts->destroy = &mb_ts_standard_destroy;
+  ts->reset = &mb_ts_reset;
+  ts->clone_i = &mb_ts_clone_i;
+  ts->sub_ts = NULL;
+  return ts;
+}
+
+/****************************************************************************
+ *
+ * Analyzer
+ *
+ ****************************************************************************/
+
+void a_standard_destroy(void *p)
+{
+  Analyzer *a = (Analyzer *)p;
+  ts_destroy(a->current_ts);
+  free(p);
+}
+
+TokenStream *a_standard_get_ts(Analyzer *a, char *field, char *text)
+{
+  a->current_ts->reset(a->current_ts, text);
+  return a->current_ts;
+}
+
+Analyzer *analyzer_create(void *data, TokenStream *ts, void (*destroy)(void *),
+    TokenStream *(*get_ts)(Analyzer *a, char *field, char *text))
+{
+  Analyzer *a = ALLOC(Analyzer);
+  a->data = data;
+  a->current_ts = ts;
+  a->destroy = (destroy ? destroy : &a_standard_destroy);
+  a->get_ts = (get_ts ? get_ts : &a_standard_get_ts);
+  return a;
+}
+
+/****************************************************************************
+ *
+ * Whitespace
+ *
+ ****************************************************************************/
+
+/*
+ * WhitespaceTokenizer
+ */
 Token *wst_next(TokenStream *ts)
 {
-  int i = ts->pos;
-  int start, end;
-  char *text = ts->text;
+  char *t = ts->t;
+  char *start;
 
-  while (text[i] != '\0' && isspace(text[i]))
-    i++;
-  if (text[i] == '\0')
-    return NULL;
+  while (*t != '\0' && isspace(*t)) t++;
 
-  start = i;
-  while (text[i] != '\0' && !isspace(text[i]))
-    i++;
-  ts->pos = end = i;
-  tk_set(ts->token, text+start, end-start, start, end, 1);
+  if (*t == '\0') return NULL;
+
+  start = t;
+  while (*t != '\0' && !isspace(*t)) t++;
+
+  ts->t = t;
+  tk_set_ts(ts->token, start, t, ts->text, 1);
   return ts->token;
 }
 
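The hunk above replaces the old index-based cursor (ts->pos) with a raw character pointer (ts->t), with token offsets recovered by pointer subtraction in tk_set_ts(). A minimal consumer of the new stream, sketched after the #ifdef ALONE main() at the bottom of this file and assuming the declarations in analysis.h:

    #include <stdio.h>
    #include <analysis.h>

    int print_tokens(TokenStream *ts, char *text)
    {
      Token *tk;
      int count = 0;
      ts->reset(ts, text);  /* ts_reset() now just points ts->t at the text */
      while ((tk = ts->next(ts)) != NULL) {
        /* start/end are byte offsets derived from pointer differences */
        printf("<%s:%ld:%ld>\n", tk->text, tk->start, tk->end);
        count++;
      }
      return count;
    }

For example, print_tokens(whitespace_tokenizer_create(), "two tokens") would print <two:0:3> and <tokens:4:10>.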
@@ -107,22 +224,121 @@ TokenStream *whitespace_tokenizer_create()
   return ts;
 }
 
+/*
+ * Multi-byte WhitespaceTokenizer
+ */
+Token *mb_wst_next(TokenStream *ts)
+{
+  int i;
+  char *start;
+  char *t = ts->t;
+  wchar_t wchr;
+
+  MB_NEXT_CHAR;
+  while (wchr != 0 && iswspace(wchr)) {
+    t += i;
+    MB_NEXT_CHAR;
+  }
+  if (wchr == 0) return NULL;
+
+  start = t;
+  t += i;
+  MB_NEXT_CHAR;
+  while (wchr != 0 && !iswspace(wchr)) {
+    t += i;
+    MB_NEXT_CHAR;
+  }
+  tk_set_ts(ts->token, start, t, ts->text, 1);
+  ts->t = t;
+  return ts->token;
+}
+
+/*
+ * Lowercasing Multi-byte WhitespaceTokenizer
+ */
+Token *mb_wst_next_lc(TokenStream *ts)
+{
+  int i;
+  char *start;
+  char *t = ts->t;
+  wchar_t wchr;
+  wchar_t wbuf[MAX_WORD_SIZE+1], *w, *w_end;
+
+  w = wbuf;
+  w_end = &wbuf[MAX_WORD_SIZE];
+
+  MB_NEXT_CHAR;
+  while (wchr != 0 && iswspace(wchr)) {
+    t += i;
+    MB_NEXT_CHAR;
+  }
+  if (wchr == 0) return NULL;
+
+  start = t;
+  t += i;
+  *w++ = towlower(wchr);
+  MB_NEXT_CHAR;
+  while (wchr != 0 && !iswspace(wchr)) {
+    if (w < w_end) *w++ = towlower(wchr);
+    t += i;
+    MB_NEXT_CHAR;
+  }
+  *w = 0;
+  w_tk_set(ts->token, wbuf, start - ts->text, t - ts->text, 1);
+  ts->t = t;
+  return ts->token;
+}
+
+TokenStream *mb_whitespace_tokenizer_create(bool lowercase)
+{
+  TokenStream *ts = mb_ts_create();
+  ts->next = lowercase ? &mb_wst_next_lc : &mb_wst_next;
+  return ts;
+}
+
+/*
+ * WhitespaceAnalyzers
+ */
+Analyzer *whitespace_analyzer_create(bool lowercase)
+{
+  TokenStream *ts;
+  if (lowercase) {
+    ts = lowercase_filter_create(whitespace_tokenizer_create());
+  } else {
+    ts = whitespace_tokenizer_create();
+  }
+  return analyzer_create(NULL, ts, NULL, NULL);
+}
+
+Analyzer *mb_whitespace_analyzer_create(bool lowercase)
+{
+  return analyzer_create(NULL, mb_whitespace_tokenizer_create(lowercase),
+      NULL, NULL);
+}
+
+/****************************************************************************
+ *
+ * Letter
+ *
+ ****************************************************************************/
+
+/*
+ * LetterTokenizer
+ */
 Token *lt_next(TokenStream *ts)
 {
-  int i = ts->pos;
-  int start, end;
-  char *text = ts->text;
+  char *start;
+  char *t = ts->t;
 
-  while (text[i] != '\0' && !isalpha(text[i]))
-    i++;
-  if (text[i] == '\0')
-    return NULL;
+  while (*t != '\0' && !isalpha(*t)) t++;
 
-  start = i;
-  while (text[i] != '\0' && isalpha(text[i]))
-    i++;
-  ts->pos = end = i;
-  tk_set(ts->token, text+start, end-start, start, end, 1);
+  if (*t == '\0') return NULL;
+
+  start = t;
+  while (*t != '\0' && isalpha(*t)) t++;
+
+  tk_set_ts(ts->token, start, t, ts->text, 1);
+  ts->t = t;
   return ts->token;
 }
 
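The multi-byte tokenizers decode input through the MB_NEXT_CHAR macro, i.e. mbrtowc() with the conversion state kept in ts->data and zeroed on every mb_ts_reset(). A standalone sketch of that decoding loop in plain libc (not part of this changeset), assuming the locale has been configured:

    #include <locale.h>
    #include <stdio.h>
    #include <wchar.h>

    void dump_chars(const char *s)
    {
      mbstate_t state = {0};    /* same role as the mbstate_t held in ts->data */
      wchar_t wchr;
      int i;
      setlocale(LC_CTYPE, "");  /* e.g. en_US.UTF-8; decoding fails without it */
      while ((i = (int)mbrtowc(&wchr, s, MB_CUR_MAX, &state)) > 0) {
        printf("U+%04lX in %d byte(s)\n", (unsigned long)wchr, i);
        s += i;  /* advance by the decoded length, exactly as t += i above */
      }
      /* i == 0 marks the '\0' terminator; i < 0 is the error case the
       * tokenizers RAISE as IO_ERROR with ENC_ERR_MSG */
    }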
@@ -133,54 +349,174 @@ TokenStream *letter_tokenizer_create()
   return ts;
 }
 
-void a_standard_destroy(void *p)
+/*
+ * Multi-byte LetterTokenizer
+ */
+Token *mb_lt_next(TokenStream *ts)
 {
-  Analyzer *a = (Analyzer *)p;
-  ts_destroy(a->current_ts);
-  free(p);
+  int i;
+  char *start;
+  char *t = ts->t;
+  wchar_t wchr;
+
+  MB_NEXT_CHAR;
+  while (wchr != 0 && !iswalpha(wchr)) {
+    t += i;
+    MB_NEXT_CHAR;
+  }
+  if (wchr == 0) return NULL;
+
+  start = t;
+  t += i;
+  MB_NEXT_CHAR;
+  while (wchr != 0 && iswalpha(wchr)) {
+    t += i;
+    MB_NEXT_CHAR;
+  }
+  tk_set_ts(ts->token, start, t, ts->text, 1);
+  ts->t = t;
+  return ts->token;
 }
 
-TokenStream *a_standard_get_ts(Analyzer *a, char *field, char *text)
+/*
+ * Lowercasing Multi-byte LetterTokenizer
+ */
+Token *mb_lt_next_lc(TokenStream *ts)
 {
-  a->current_ts->reset(a->current_ts, text);
-  return a->current_ts;
+  int i;
+  char *start;
+  char *t = ts->t;
+  wchar_t wchr;
+  wchar_t wbuf[MAX_WORD_SIZE+1], *w, *w_end;
+
+  w = wbuf;
+  w_end = &wbuf[MAX_WORD_SIZE];
+
+  MB_NEXT_CHAR;
+  while (wchr != 0 && !iswalpha(wchr)) {
+    t += i;
+    MB_NEXT_CHAR;
+  }
+  if (wchr == 0) return NULL;
+
+  start = t;
+  t += i;
+  *w++ = towlower(wchr);
+  MB_NEXT_CHAR;
+  while (wchr != 0 && iswalpha(wchr)) {
+    if (w < w_end) *w++ = towlower(wchr);
+    t += i;
+    MB_NEXT_CHAR;
+  }
+  *w = 0;
+  w_tk_set(ts->token, wbuf, start - ts->text, t - ts->text, 1);
+  ts->t = t;
+  return ts->token;
 }
 
-Analyzer *whitespace_analyzer_create()
+TokenStream *mb_letter_tokenizer_create(bool lowercase)
 {
-  Analyzer *a = ALLOC(Analyzer);
-  a->data = NULL;
-  a->current_ts = whitespace_tokenizer_create();
-  a->destroy = &a_standard_destroy;
-  a->get_ts = &a_standard_get_ts;
-  return a;
+  TokenStream *ts = mb_ts_create();
+  ts->next = lowercase ? &mb_lt_next_lc : &mb_lt_next;
+  return ts;
 }
 
-int std_get_alpha(char *input, char *token)
+/*
+ * LetterAnalyzers
+ */
+Analyzer *letter_analyzer_create(bool lowercase)
+{
+  TokenStream *ts;
+  if (lowercase) {
+    ts = lowercase_filter_create(letter_tokenizer_create());
+  } else {
+    ts = letter_tokenizer_create();
+  }
+  return analyzer_create(NULL, ts, NULL, NULL);
+}
+
+Analyzer *mb_letter_analyzer_create(bool lowercase)
+{
+  return analyzer_create(NULL,
+      mb_letter_tokenizer_create(lowercase), NULL, NULL);
+}
+
+/****************************************************************************
+ *
+ * Standard
+ *
+ ****************************************************************************/
+
+/*
+ * StandardTokenizer
+ */
+int std_get_alpha(TokenStream *ts, char *token)
 {
   int i = 0;
-  while (input[i] != '\0' && isalpha(input[i])) {
-    token[i] = input[i];
+  char *t = ts->t;
+  while (t[i] != '\0' && isalpha(t[i])) {
+    if (i < MAX_WORD_SIZE) token[i] = t[i];
     i++;
   }
   return i;
 }
 
-int std_get_alnum(char *input, char *token)
+int mb_std_get_alpha(TokenStream *ts, char *token)
+{
+  char *t = ts->t;
+  wchar_t w;
+  int i;
+  if ((i = mbtowc(&w, t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
+  while (w != 0 && iswalpha(w)) {
+    t += i;
+    if ((i = mbtowc(&w, t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
+  }
+
+  i = t - ts->t;
+  if (i > MAX_WORD_SIZE) i = MAX_WORD_SIZE - 1;
+  memcpy(token, ts->t, i);
+  return i;
+}
+
+int std_get_alnum(TokenStream *ts, char *token)
 {
   int i = 0;
-  while (input[i] != '\0' && isalnum(input[i])) {
-    token[i] = input[i];
+  char *t = ts->t;
+  while (t[i] != '\0' && isalnum(t[i])) {
+    if (i < MAX_WORD_SIZE) token[i] = t[i];
     i++;
   }
   return i;
 }
 
+int mb_std_get_alnum(char *text, char *token, TokenStream *ts)
+{
+  char *t = ts->t;
+  wchar_t w;
+  int i;
+  if ((i = mbtowc(&w, t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
+  while (w != 0 && iswalnum(w)) {
+    t += i;
+    if ((i = mbtowc(&w, t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
+  }
+
+  i = t - ts->t;
+  if (i > MAX_WORD_SIZE) i = MAX_WORD_SIZE - 1;
+  memcpy(token, ts->t, i);
+  return i;
+
+}
+
 int isnumpunc(char c)
 {
   return (c == '.' || c == ',' || c == '\\' || c == '/' || c == '_' || c == '-');
 }
 
+int w_isnumpunc(wchar_t c)
+{
+  return (c == L'.' || c == L',' || c == L'\\' || c == L'/' || c == L'_' || c == L'-');
+}
+
 int isurlpunc(char c)
 {
   return (c == '.' || c == '/' || c == '-' || c == '_');
@@ -201,11 +537,23 @@ int isurlxatc(char c)
   return (c == '.' || c == '/' || c == '-' || c == '_' || c == '@' || isalnum(c));
 }
 
-int isstdtokchar(char c)
+bool std_is_tok_char(char *c)
 {
-  if (isspace(c)) return false; // most common so check first.
-  if (isalnum(c) || isnumpunc(c) || c == '&' ||
-      c == '@' || c == '\'' || c == ':')
+  if (isspace(*c)) return false; // most common so check first.
+  if (isalnum(*c) || isnumpunc(*c) || *c == '&' ||
+      *c == '@' || *c == '\'' || *c == ':')
+    return true;
+  return false;
+}
+
+bool w_std_is_tok_char(char *t)
+{
+  wchar_t c;
+  if ((mbtowc(&c, t, MB_CUR_MAX)) < 0)
+    RAISE(IO_ERROR, ENC_ERR_MSG);
+  if (iswspace(c)) return false; // most common so check first.
+  if (iswalnum(c) || w_isnumpunc(c) || c == L'&' ||
+      c == L'@' || c == L'\'' || c == L':')
     return true;
   return false;
 }
@@ -246,22 +594,34 @@ int std_get_number(char *input)
 
 int std_get_apostrophe(char *input)
 {
-  int i = 0;
+  char *t = input;
 
-  while (isalpha(input[i]) || input[i] == '\'')
-    i++;
+  while (isalpha(*t) || *t == '\'')
+    t++;
 
-  return i;
+  return t - input;
 }
 
-int std_get_url(char *input, char *token)
+int mb_std_get_apostrophe(char *input)
 {
-  int i = 0;
+  char *t = input;
+  wchar_t w;
+  int i;
 
+  if ((i = mbtowc(&w, t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
+  while (iswalpha(w) || w == L'\'') {
+    t += i;
+    if ((i = mbtowc(&w, t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
+  }
+  return t - input;
+}
+
+int std_get_url(char *input, char *token, int i)
+{
   while (isurlc(input[i])) {
     if (isurlpunc(input[i]) && isurlpunc(input[i-1]))
       break; // can't have to puncs in a row
-    token[i] = input[i];
+    if (i < MAX_WORD_SIZE) token[i] = input[i];
     i++;
   }
 
@@ -282,148 +642,229 @@ int std_get_company_name(char *input)
   return i;
 }
 
+int mb_std_get_company_name(char *input, TokenStream *ts)
+{
+  char *t = input;
+  wchar_t wchr;
+  int i;
+
+  MB_NEXT_CHAR;
+  while (iswalpha(wchr) || wchr == L'@' || wchr == L'&') {
+    t += i;
+    MB_NEXT_CHAR;
+  }
+
+  return t - input;
+}
+
+bool std_advance_to_start(TokenStream *ts)
+{
+  char *t = ts->t;
+  while (*t != '\0' && !isalnum(*t)) t++;
+
+  ts->t = t;
+
+  return (*t != '\0');
+}
+
+bool mb_std_advance_to_start(TokenStream *ts)
+{
+  int i;
+  wchar_t w;
+
+  if ((i = mbtowc(&w, ts->t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
+  while (w != 0 && !iswalnum(w)) {
+    ts->t += i;
+    if ((i = mbtowc(&w, ts->t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
+  }
+
+  return (w != 0);
+}
+
+typedef struct StandardTokenizer {
+  bool (*advance_to_start)(TokenStream *ts);
+  bool (*is_tok_char)(char *c);
+  int (*get_alpha)(TokenStream *ts, char *token);
+  int (*get_apostrophe)(char *input);
+} StandardTokenizer;
+
 Token *std_next(TokenStream *ts)
 {
-  int i = ts->pos, j;
-  int start;
-  char *text = ts->text;
+  StandardTokenizer *std_tz = (StandardTokenizer *)ts->data;
+  char *s;
+  char *t;
+  char *start = NULL;
+  char *num_end = NULL;
   char token[MAX_WORD_SIZE];
   int token_i = 0;
   int len;
-  int num_end = 0;
-  int is_acronym;
-  int seen_at_symbol;
+  bool is_acronym;
+  bool seen_at_symbol;
 
-  while (text[i] != '\0' && !isalnum(text[i]))
-    i++;
-  if (text[i] == '\0')
-    return NULL;
-
-  start = i;
-  if (isdigit(text[i])) {
-    i += std_get_number(text + i);
-    ts->pos = i;
-    tk_set(ts->token, text+start, i - start, start, ts->pos, 1);
+
+  if (!std_tz->advance_to_start(ts)) return NULL;
+
+  start = t = ts->t;
+  if (isdigit(*t)) {
+    t += std_get_number(t);
+    ts->t = t;
+    tk_set_ts(ts->token, start, t, ts->text, 1);
   } else {
-    token_i = std_get_alpha(text + i, token);
-    i += token_i;
+    token_i = std_tz->get_alpha(ts, token);
+    t += token_i;
 
-    if (!isstdtokchar(text[i])) {
+    if (!std_tz->is_tok_char(t)) {
       // very common case, ie a plain word, so check and return
-      tk_set(ts->token, text+start, i-start, start, i, 1);
-      ts->pos = i;
+      tk_set_ts(ts->token, start, t, ts->text, 1);
+      ts->t = t;
       return ts->token;
     }
 
-    if (text[i] == '\'') { // apostrophe case.
-      i += std_get_apostrophe(text + i);
-      ts->pos = i;
-      len = i - start;
+    if (*t == '\'') { // apostrophe case.
+      t += std_tz->get_apostrophe(t);
+      ts->t = t;
+      len = t - start;
       // strip possesive
-      if ((text[i-1] == 's' || text[i-1] == 'S') && text[i-2] == '\'')
-        len -= 2;
-      tk_set(ts->token, text+start, len, start, i, 1);
+      if ((t[-1] == 's' || t[-1] == 'S') && t[-2] == '\'') t -= 2;
+
+      tk_set_ts(ts->token, start, t, ts->text, 1);
       return ts->token;
     }
-    if (text[i] == '&') { // apostrophe case.
-      i += std_get_company_name(text + i);
-      ts->pos = i;
-      tk_set(ts->token, text+start, i - start, start, i, 1);
+
+    if (*t == '&') { // apostrophe case.
+      t += std_get_company_name(t);
+      ts->t = t;
+      tk_set_ts(ts->token, start, t, ts->text, 1);
       return ts->token;
     }
 
-    if (isdigit(text[i]) || isnumpunc(text[i])) { // possibly a number
-      num_end = start + std_get_number(text + start);
-      if (!isstdtokchar(text[num_end])) { // we won't find a longer token
-        ts->pos = num_end;
-        tk_set(ts->token, text+start, num_end-start, start, ts->pos, 1);
+    if (isdigit(*t) || isnumpunc(*t)) { // possibly a number
+      num_end = start + std_get_number(start);
+      if (!std_tz->is_tok_char(num_end)) { // we won't find a longer token
+        ts->t = num_end;
+        tk_set_ts(ts->token, start, num_end, ts->text, 1);
        return ts->token;
      }
      // else there may be a longer token so check
    }
 
-    if (text[i] == ':' && text[i+1] == '/' && text[i+2] == '/') {
+    if (t[0] == ':' && t[1] == '/' && t[2] == '/') {
       // check for a known url start
       token[token_i] = '\0';
-      i += 3;
-      while (text[i] == '/') i++;
-      if (isalpha(text[i]) &&
-          (strcmp(token, "ftp") == 0 ||
-          strcmp(token, "http") == 0 ||
-          strcmp(token, "https") == 0 ||
-          strcmp(token, "file") == 0)) {
-        len = std_get_url(text + i, token); // dispose of first part of the URL
+      t += 3;
+      while (*t == '/') t++;
+      if (isalpha(*t) &&
+          (memcmp(token, "ftp", 3) == 0 ||
+          memcmp(token, "http", 4) == 0 ||
+          memcmp(token, "https", 5) == 0 ||
+          memcmp(token, "file", 4) == 0)) {
+        len = std_get_url(t, token, 0); // dispose of first part of the URL
       } else { //still treat as url but keep the first part
-        token_i = i - start;
-        memcpy(token, text + start, token_i * sizeof(char));
-        len = token_i + std_get_url(text + i, token + token_i); // keep start
+        token_i = t - start;
+        memcpy(token, start, token_i * sizeof(char));
+        len = token_i + std_get_url(t, token, token_i); // keep start
       }
-      ts->pos = i + len;
+      ts->t = t + len;
       token[len] = 0;
-      tk_set(ts->token, token, len, start, ts->pos, 1);
+      tk_set(ts->token, token, len, start - ts->text, ts->t - ts->text, 1);
       return ts->token;
     }
 
-    // now see how int a url we can find.
+    // now see how long a url we can find.
     is_acronym = true;
     seen_at_symbol = false;
-    while (isurlxatc(text[i])) {
-      if (is_acronym && !isalpha(text[i]) && (text[i] != '.')) {
+    while (isurlxatc(*t)) {
+      if (is_acronym && !isalpha(*t) && (*t != '.')) {
         is_acronym = false;
       }
-      if (isurlxatpunc(text[i]) && isurlxatpunc(text[i-1]))
-        break; // can't have to punctuation characters in a row
-      if (text[i] == '@') {
-        if (seen_at_symbol)
+      if (isurlxatpunc(*t) && isurlxatpunc(t[-1])) {
+        break; // can't have two punctuation characters in a row
+      }
+      if (*t == '@') {
+        if (seen_at_symbol) {
          break; // we can only have one @ symbol
-        else
+        } else {
          seen_at_symbol = true;
+        }
      }
-      i++;
+      t++;
    }
-    while (isurlxatpunc(text[i-1])) i--; // strip trailing punctuation
-    if (i > num_end) {
-      ts->pos = i;
+    while (isurlxatpunc(t[-1])) t--; // strip trailing punctuation
+
+    if (t > num_end) {
+      ts->t = t;
 
       if (is_acronym) { // check that it is one letter followed by one '.'
-        for (j = start; j < i-1; j++) {
-          if (isalpha(text[j]) && (text[j+1] != '.')) is_acronym = false;
+        for (s = start; s < t-1; s++) {
+          if (isalpha(*s) && (s[1] != '.')) is_acronym = false;
        }
      }
      if (is_acronym) {// strip '.'s
-        for (j = start + token_i; j < i; j++) {
-          if (text[j] != '.') {
-            token[token_i] = text[j];
+        for (s = start + token_i; s < t; s++) {
+          if (*s != '.') {
+            token[token_i] = *s;
            token_i++;
          }
        }
-        tk_set(ts->token, token, token_i, start, ts->pos, 1);
+        tk_set(ts->token, token, token_i, start - ts->text, t - ts->text, 1);
      } else { // just return the url as is
-        tk_set(ts->token, text+start, i-start, start, ts->pos, 1);
+        tk_set_ts(ts->token, start, t, ts->text, 1);
      }
    } else { // return the number
-      ts->pos = num_end;
-      tk_set(ts->token, text+start, num_end-start, start, ts->pos, 1);
+      ts->t = num_end;
+      tk_set_ts(ts->token, start, num_end, ts->text, 1);
    }
  }
 
  return ts->token;
 }
 
+void std_ts_destroy(void *p)
+{
+  TokenStream *ts = (TokenStream *)p;
+  free(ts->data);
+  ts_standard_destroy(ts);
+}
+
+void std_ts_clone_i(TokenStream *orig_ts, TokenStream *new_ts)
+{
+  new_ts->data = ALLOC(StandardTokenizer);
+  memcpy(new_ts->data, orig_ts->data, sizeof(StandardTokenizer));
+}
+
 TokenStream *standard_tokenizer_create()
 {
   TokenStream *ts = ts_create();
+
+  StandardTokenizer *std_tz = ALLOC(StandardTokenizer);
+  std_tz->advance_to_start = &std_advance_to_start;
+  std_tz->get_alpha = &std_get_alpha;
+  std_tz->is_tok_char = &std_is_tok_char;
+  std_tz->get_apostrophe = &std_get_apostrophe;
+
+  ts->data = std_tz;
+  ts->destroy = &std_ts_destroy;
+  ts->clone_i = &std_ts_clone_i;
   ts->next = &std_next;
   return ts;
 }
 
-const char *ENGLISH_STOP_WORDS[] = {
-  "a", "an", "and", "are", "as", "at", "be", "but", "by",
-  "for", "if", "in", "into", "is", "it",
-  "no", "not", "of", "on", "or", "s", "such",
-  "t", "that", "the", "their", "then", "there", "these",
-  "they", "this", "to", "was", "will", "with"
-};
+TokenStream *mb_standard_tokenizer_create()
+{
+  TokenStream *ts = ts_create();
+
+  StandardTokenizer *std_tz = ALLOC(StandardTokenizer);
+  std_tz->advance_to_start = &mb_std_advance_to_start;
+  std_tz->get_alpha = &mb_std_get_alpha;
+  std_tz->is_tok_char = &w_std_is_tok_char;
+  std_tz->get_apostrophe = &mb_std_get_apostrophe;
+
+  ts->data = std_tz;
+  ts->destroy = &std_ts_destroy;
+  ts->clone_i = &std_ts_clone_i;
+  ts->next = &std_next;
+  return ts;
+}
 
 void filter_reset(TokenStream *ts, char *text)
 {
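For reference, a walkthrough (not part of the diff) of what the rewritten std_next() yields for the cases it special-cases, assuming the declarations in analysis.h:

    #include <stdio.h>
    #include <analysis.h>

    void std_next_examples()
    {
      TokenStream *ts = standard_tokenizer_create();
      ts->reset(ts, "O'Reilly's page at http://example.com/a I.B.M.");
      /* following the branches of std_next() above:
       *  - "O'Reilly"   apostrophe case; the possessive "'s" is stripped (t -= 2)
       *  - "page", "at" the very common plain-word case
       *  - the URL      known scheme, so the token text becomes "example.com/a"
       *                 ("dispose of first part of the URL") while start/end
       *                 still span the full "http://example.com/a"
       *  - "IBM"        acronym case; the '.'s are stripped */
      Token *tk;
      while ((tk = ts->next(ts)) != NULL) {
        printf("<%s:%ld:%ld> ", tk->text, tk->start, tk->end);
      }
      ts->destroy(ts);
    }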
@@ -432,10 +873,10 @@ void filter_reset(TokenStream *ts, char *text)
 
 void filter_destroy(void *p)
 {
-  TokenStream *ts = (TokenStream *)p;
-  ts->sub_ts->destroy(ts->sub_ts);
-  if (ts->token != NULL) tk_destroy(ts->token);
-  free(ts);
+  TokenStream *tf = (TokenStream *)p;
+  if (tf->destroy_sub) tf->sub_ts->destroy(tf->sub_ts);
+  if (tf->token != NULL) tk_destroy(tf->token);
+  free(tf);
 }
 
 void sf_destroy(void *p)
@@ -445,40 +886,109 @@ void sf_destroy(void *p)
   filter_destroy(p);
 }
 
-Token *sf_next(TokenStream *ts)
+void sf_clone_i_i(void *key, void *value, void *arg)
+{
+  HshTable *wordtable = (HshTable *)arg;
+  char *w = estrdup(key);
+  h_set(wordtable, w, w);
+}
+
+void sf_clone_i(TokenStream *orig_ts, TokenStream *new_ts)
+{
+  new_ts->data = h_new_str(&free, NULL);
+  h_each(orig_ts->data, &sf_clone_i_i, new_ts->data);
+}
+
+Token *sf_next(TokenStream *tf)
 {
   int pos_inc = 1;
-  HshTable *words = (HshTable *)ts->data;
-  Token *tk = ts->sub_ts->next(ts->sub_ts);
+  HshTable *words = (HshTable *)tf->data;
+  Token *tk = tf->sub_ts->next(tf->sub_ts);
   while ((tk != NULL) && (h_get(words, tk->text) != NULL)) {
-    tk = ts->sub_ts->next(ts->sub_ts);
+    tk = tf->sub_ts->next(tf->sub_ts);
     pos_inc++;
   }
   if (tk != NULL) tk->pos_inc = pos_inc;
   return tk;
 }
 
-TokenStream *stop_filter_create_with_words(TokenStream *ts, char **words, int len)
+TokenStream *stop_filter_create_with_words_len(TokenStream *ts,
+    const char **words, int len)
 {
   int i;
+  char *w;
   TokenStream *tf = ALLOC(TokenStream);
   tf->sub_ts = ts;
-  HshTable *wordtable = h_new_str(NULL, NULL);
+  tf->destroy_sub = true;
+  HshTable *wordtable = h_new_str(&free, NULL);
   for (i = 0; i < len; i++) {
-    h_set(wordtable, words[i], words[i]);
+    w = estrdup(words[i]);
+    h_set(wordtable, w, w);
+  }
+  tf->data = wordtable;
+  tf->token = NULL;
+  tf->next = &sf_next;
+  tf->reset = &filter_reset;
+  tf->destroy = &sf_destroy;
+  tf->clone_i = &sf_clone_i;
+  return tf;
+}
+
+TokenStream *stop_filter_create_with_words(TokenStream *ts, const char **words)
+{
+  char *w;
+  TokenStream *tf = ALLOC(TokenStream);
+  tf->sub_ts = ts;
+  tf->destroy_sub = true;
+  HshTable *wordtable = h_new_str(&free, NULL);
+  while (*words) {
+    w = estrdup(*words);
+    h_set(wordtable, w, w);
+    words++;
   }
   tf->data = wordtable;
   tf->token = NULL;
   tf->next = &sf_next;
   tf->reset = &filter_reset;
   tf->destroy = &sf_destroy;
+  tf->clone_i = &sf_clone_i;
   return tf;
 }
 
 TokenStream *stop_filter_create(TokenStream *ts)
 {
-  return stop_filter_create_with_words(ts,
-      (char **)ENGLISH_STOP_WORDS, NELEMS(ENGLISH_STOP_WORDS));
+  return stop_filter_create_with_words(ts, FULL_ENGLISH_STOP_WORDS);
+}
+
+Token *mb_lcf_next(TokenStream *ts)
+{
+  wchar_t wbuf[MAX_WORD_SIZE], *w;
+  //mbstate_t state = {0};
+  int i;
+  Token *tk = ts->sub_ts->next(ts->sub_ts);
+  if (tk == NULL) return tk;
+
+  i = mbstowcs(wbuf, tk->text, MAX_WORD_SIZE);
+  w = wbuf;
+  while (*w != 0) {
+    *w = towlower(*w);
+    w++;
+  }
+  wcstombs(tk->text, wbuf, MAX_WORD_SIZE);
+  return tk;
+}
+
+TokenStream *mb_lowercase_filter_create(TokenStream *ts)
+{
+  TokenStream *tf = ALLOC(TokenStream);
+  tf->token = NULL;
+  tf->next = &mb_lcf_next;
+  tf->reset = &filter_reset;
+  tf->destroy = &filter_destroy;
+  tf->sub_ts = ts;
+  tf->destroy_sub = true;
+  tf->clone_i = NULL;
+  return tf;
 }
 
 Token *lcf_next(TokenStream *ts)
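How the filters compose: each filter is itself a TokenStream that wraps sub_ts, and the destroy_sub flag set by every creator above lets a single destroy() call tear down the whole chain. A sketch, assuming "the" appears in the FULL_ENGLISH_STOP_WORDS list from stopwords.c:

    #include <analysis.h>

    void filter_chain_example()
    {
      TokenStream *ts = stop_filter_create(      /* drops stop words */
          lowercase_filter_create(               /* lowercases each token */
              standard_tokenizer_create()));     /* produces the raw tokens */

      ts->reset(ts, "The Quick Brown Fox");
      /* sf_next() pulls from lcf_next(), which pulls from std_next(); "The"
       * lowercases to "the", matches the stop table and is skipped, so
       * "quick" arrives with pos_inc == 2, then "brown" and "fox" follow */
      Token *tk;
      while ((tk = ts->next(ts)) != NULL) { /* consume */ }

      ts->destroy(ts);  /* sf_destroy -> filter_destroy -> wrapped streams */
    }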
@@ -501,48 +1011,199 @@ TokenStream *lowercase_filter_create(TokenStream *ts)
   tf->reset = &filter_reset;
   tf->destroy = &filter_destroy;
   tf->sub_ts = ts;
+  tf->destroy_sub = true;
+  tf->clone_i = NULL;
   return tf;
 }
 
-Analyzer *letter_analyzer_create()
+typedef struct StemFilter {
+  struct sb_stemmer *stemmer;
+  char *algorithm;
+  char *charenc;
+} StemFilter;
+
+void stemf_destroy(void *p)
 {
-  Analyzer *a = ALLOC(Analyzer);
-  a->data = NULL;
-  a->current_ts = lowercase_filter_create(letter_tokenizer_create());
-  a->destroy = &a_standard_destroy;
-  a->get_ts = &a_standard_get_ts;
-  return a;
+  TokenStream *ts = (TokenStream *)p;
+  StemFilter *stemf = (StemFilter *)ts->data;
+  sb_stemmer_delete(stemf->stemmer);
+  free(stemf->algorithm);
+  free(stemf->charenc);
+  free(stemf);
+  filter_destroy(ts);
 }
 
+Token *stemf_next(TokenStream *ts)
+{
+  int len;
+  const sb_symbol *stemmed;
+  struct sb_stemmer *stemmer = ((StemFilter *)ts->data)->stemmer;
+  Token *tk = ts->sub_ts->next(ts->sub_ts);
+  if (tk == NULL) return tk;
+  stemmed = sb_stemmer_stem(stemmer, (sb_symbol *)tk->text, strlen(tk->text));
+  len = sb_stemmer_length(stemmer);
+  if (len >= MAX_WORD_SIZE) len = MAX_WORD_SIZE - 1;
+  memcpy(tk->text, stemmed, len);
+  tk->text[len] = '\0';
+  return tk;
+}
 
-Analyzer *standard_analyzer_create_with_words(char **words, int len)
+void stemf_clone_i(TokenStream *orig_ts, TokenStream *new_ts)
 {
-  Analyzer *a = ALLOC(Analyzer);
-  a->data = NULL;
-  a->current_ts =
-      stop_filter_create_with_words(
+  StemFilter *orig_stemf = (StemFilter *)orig_ts->data;
+  StemFilter *stemf = ALLOC(StemFilter);
+  stemf->stemmer = sb_stemmer_new(orig_stemf->algorithm, orig_stemf->charenc);
+  stemf->algorithm = orig_stemf->algorithm ? estrdup(orig_stemf->algorithm) : NULL;
+  stemf->charenc = orig_stemf->charenc ? estrdup(orig_stemf->charenc) : NULL;
+  new_ts->data = stemf;
+}
+
+TokenStream *stem_filter_create(TokenStream *ts, const char * algorithm,
+    const char * charenc)
+{
+  TokenStream *tf = ALLOC(TokenStream);
+  StemFilter *stemf = ALLOC(StemFilter);
+  stemf->stemmer = sb_stemmer_new(algorithm, charenc);
+  stemf->algorithm = algorithm ? estrdup(algorithm) : NULL;
+  stemf->charenc = charenc ? estrdup(charenc) : NULL;
+  tf->data = stemf;
+
+  tf->token = NULL;
+  tf->next = &stemf_next;
+  tf->reset = &filter_reset;
+  tf->destroy = &stemf_destroy;
+  tf->clone_i = &stemf_clone_i;
+  tf->sub_ts = ts;
+  tf->destroy_sub = true;
+  return tf;
+}
+
+Analyzer *standard_analyzer_create_with_words_len(
+    const char **words, int len, bool lowercase)
+{
+  TokenStream *ts;
+  if (lowercase) {
+    ts = stop_filter_create_with_words_len(
         lowercase_filter_create(standard_tokenizer_create()), words, len);
-  a->destroy = &a_standard_destroy;
-  a->get_ts = &a_standard_get_ts;
-  return a;
+  } else {
+    ts = stop_filter_create_with_words_len(
+        standard_tokenizer_create(), words, len);
+  }
+  return analyzer_create(NULL, ts, NULL, NULL);
+}
+
+Analyzer *standard_analyzer_create_with_words(const char **words, bool lowercase)
+{
+  TokenStream *ts;
+  if (lowercase) {
+    ts = stop_filter_create_with_words(
+        lowercase_filter_create(standard_tokenizer_create()), words);
+  } else {
+    ts = stop_filter_create_with_words(
+        standard_tokenizer_create(), words);
+  }
+  return analyzer_create(NULL, ts, NULL, NULL);
+}
+
+Analyzer *mb_standard_analyzer_create_with_words_len(
+    const char **words, int len, bool lowercase)
+{
+  TokenStream *ts;
+  if (lowercase) {
+    ts = stop_filter_create_with_words_len(
+        mb_lowercase_filter_create(mb_standard_tokenizer_create()), words, len);
+  } else {
+    ts = stop_filter_create_with_words_len(
+        mb_standard_tokenizer_create(), words, len);
+  }
+  return analyzer_create(NULL, ts, NULL, NULL);
+}
+
+Analyzer *mb_standard_analyzer_create_with_words(
+    const char **words, bool lowercase)
+{
+  TokenStream *ts;
+  if (lowercase) {
+    ts = stop_filter_create_with_words(
+        mb_lowercase_filter_create(mb_standard_tokenizer_create()), words);
+  } else {
+    ts = stop_filter_create_with_words(mb_standard_tokenizer_create(), words);
+  }
+  return analyzer_create(NULL, ts, NULL, NULL);
+}
+
+Analyzer *standard_analyzer_create(bool lowercase)
+{
+  return standard_analyzer_create_with_words(FULL_ENGLISH_STOP_WORDS, lowercase);
+}
+
+Analyzer *mb_standard_analyzer_create(bool lowercase)
+{
+  return mb_standard_analyzer_create_with_words(FULL_ENGLISH_STOP_WORDS, lowercase);
+}
+
+/****************************************************************************
+ *
+ * PerFieldAnalyzer
+ *
+ ****************************************************************************/
+
+typedef struct PerFieldAnalyzer {
+  HshTable *dict;
+  Analyzer *def;
+  bool destroy_subs : 1;
+} PerFieldAnalyzer;
+
+void pfa_destroy(void *p)
+{
+  Analyzer *self = (Analyzer *)p;
+  PerFieldAnalyzer *pfa = (PerFieldAnalyzer *)self->data;
+  h_destroy(pfa->dict);
+
+  if (pfa->destroy_subs) a_destroy(pfa->def);
+  free(pfa);
+  free(self);
+}
+
+TokenStream *pfa_get_ts(Analyzer *self, char *field, char *text)
+{
+  PerFieldAnalyzer *pfa = (PerFieldAnalyzer *)self->data;
+  Analyzer *a = h_get(pfa->dict, field);
+  if (a == NULL) a = pfa->def;
+  return a_get_ts(a, field, text);
+}
+
+void pfa_sub_a_destroy(void *p)
+{
+  Analyzer *a = (Analyzer *)p;
+  a->destroy(a);
+}
+
+void pfa_add_field(Analyzer *self, char *field, Analyzer *analyzer)
+{
+  PerFieldAnalyzer *pfa = (PerFieldAnalyzer *)self->data;
+  h_set(pfa->dict, estrdup(field), analyzer);
 }
 
-Analyzer *standard_analyzer_create()
+Analyzer *per_field_analyzer_create(Analyzer *def, bool destroy_subs)
 {
-  return standard_analyzer_create_with_words(
-      (char **)ENGLISH_STOP_WORDS, NELEMS(ENGLISH_STOP_WORDS));
+  PerFieldAnalyzer *pfa = ALLOC(PerFieldAnalyzer);
+  pfa->def = def;
+  pfa->destroy_subs = destroy_subs;
+  pfa->dict = destroy_subs ? h_new_str(&free, &pfa_sub_a_destroy)
+      : h_new_str(&free, NULL);
+  return analyzer_create(pfa, NULL, &pfa_destroy, &pfa_get_ts);
 }
 
 #ifdef ALONE
 int main(int argc, char **argv)
 {
   char buf[10000];
-  Analyzer *a = standard_analyzer_create();
+  Analyzer *a = standard_analyzer_create(true);
   TokenStream *ts;
   Token *tk;
   while (fgets(buf, 9999, stdin) != NULL) {
     ts = a->get_ts(a, "hello", buf);
-    ts->pos = 0;
     while ((tk = ts->next(ts)) != NULL) {
       printf("<%s:%ld:%ld> ", tk->text, tk->start, tk->end);
     }
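Finally, a sketch (not from this changeset) of plugging in the new StemFilter: stem_filter_create() hands algorithm and charenc straight to libstemmer's sb_stemmer_new(), where a NULL encoding selects the UTF-8 tables such as the stem_UTF_8_english.c added in this release:

    #include <stdio.h>
    #include <analysis.h>

    void stem_filter_example()
    {
      TokenStream *ts = stem_filter_create(
          lowercase_filter_create(standard_tokenizer_create()),
          "english",  /* algorithm name, passed to sb_stemmer_new() */
          NULL);      /* charenc; NULL means UTF-8 to libstemmer */

      ts->reset(ts, "dancing dancers danced");
      /* stemf_next() rewrites each token's text in place with the form
       * returned by sb_stemmer_stem(), truncated at MAX_WORD_SIZE */
      Token *tk;
      while ((tk = ts->next(ts)) != NULL) {
        printf("%s ", tk->text);
      }
      ts->destroy(ts);  /* stemf_destroy() also sb_stemmer_delete()s */
    }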