ferret 0.9.0 → 0.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187)
  1. data/Rakefile +23 -5
  2. data/TODO +2 -1
  3. data/ext/analysis.c +838 -177
  4. data/ext/analysis.h +55 -7
  5. data/ext/api.c +69 -0
  6. data/ext/api.h +27 -0
  7. data/ext/array.c +8 -5
  8. data/ext/compound_io.c +132 -96
  9. data/ext/document.c +58 -28
  10. data/ext/except.c +59 -0
  11. data/ext/except.h +88 -0
  12. data/ext/ferret.c +47 -3
  13. data/ext/ferret.h +3 -0
  14. data/ext/field.c +15 -9
  15. data/ext/filter.c +1 -1
  16. data/ext/fs_store.c +215 -34
  17. data/ext/global.c +72 -3
  18. data/ext/global.h +4 -3
  19. data/ext/hash.c +44 -3
  20. data/ext/hash.h +9 -0
  21. data/ext/header.h +58 -0
  22. data/ext/inc/except.h +88 -0
  23. data/ext/inc/lang.h +23 -13
  24. data/ext/ind.c +16 -10
  25. data/ext/index.h +2 -22
  26. data/ext/index_io.c +3 -11
  27. data/ext/index_rw.c +245 -193
  28. data/ext/lang.h +23 -13
  29. data/ext/libstemmer.c +92 -0
  30. data/ext/libstemmer.h +79 -0
  31. data/ext/modules.h +162 -0
  32. data/ext/q_boolean.c +34 -21
  33. data/ext/q_const_score.c +6 -12
  34. data/ext/q_filtered_query.c +206 -0
  35. data/ext/q_fuzzy.c +18 -15
  36. data/ext/q_match_all.c +3 -7
  37. data/ext/q_multi_phrase.c +10 -14
  38. data/ext/q_parser.c +29 -2
  39. data/ext/q_phrase.c +14 -21
  40. data/ext/q_prefix.c +15 -12
  41. data/ext/q_range.c +30 -28
  42. data/ext/q_span.c +13 -21
  43. data/ext/q_term.c +17 -26
  44. data/ext/r_analysis.c +693 -21
  45. data/ext/r_doc.c +11 -12
  46. data/ext/r_index_io.c +4 -1
  47. data/ext/r_qparser.c +21 -2
  48. data/ext/r_search.c +285 -18
  49. data/ext/ram_store.c +5 -2
  50. data/ext/search.c +11 -17
  51. data/ext/search.h +21 -45
  52. data/ext/similarity.h +67 -0
  53. data/ext/sort.c +30 -25
  54. data/ext/stem_ISO_8859_1_danish.c +338 -0
  55. data/ext/stem_ISO_8859_1_danish.h +16 -0
  56. data/ext/stem_ISO_8859_1_dutch.c +635 -0
  57. data/ext/stem_ISO_8859_1_dutch.h +16 -0
  58. data/ext/stem_ISO_8859_1_english.c +1156 -0
  59. data/ext/stem_ISO_8859_1_english.h +16 -0
  60. data/ext/stem_ISO_8859_1_finnish.c +792 -0
  61. data/ext/stem_ISO_8859_1_finnish.h +16 -0
  62. data/ext/stem_ISO_8859_1_french.c +1276 -0
  63. data/ext/stem_ISO_8859_1_french.h +16 -0
  64. data/ext/stem_ISO_8859_1_german.c +512 -0
  65. data/ext/stem_ISO_8859_1_german.h +16 -0
  66. data/ext/stem_ISO_8859_1_italian.c +1091 -0
  67. data/ext/stem_ISO_8859_1_italian.h +16 -0
  68. data/ext/stem_ISO_8859_1_norwegian.c +296 -0
  69. data/ext/stem_ISO_8859_1_norwegian.h +16 -0
  70. data/ext/stem_ISO_8859_1_porter.c +776 -0
  71. data/ext/stem_ISO_8859_1_porter.h +16 -0
  72. data/ext/stem_ISO_8859_1_portuguese.c +1035 -0
  73. data/ext/stem_ISO_8859_1_portuguese.h +16 -0
  74. data/ext/stem_ISO_8859_1_spanish.c +1119 -0
  75. data/ext/stem_ISO_8859_1_spanish.h +16 -0
  76. data/ext/stem_ISO_8859_1_swedish.c +307 -0
  77. data/ext/stem_ISO_8859_1_swedish.h +16 -0
  78. data/ext/stem_KOI8_R_russian.c +701 -0
  79. data/ext/stem_KOI8_R_russian.h +16 -0
  80. data/ext/stem_UTF_8_danish.c +344 -0
  81. data/ext/stem_UTF_8_danish.h +16 -0
  82. data/ext/stem_UTF_8_dutch.c +653 -0
  83. data/ext/stem_UTF_8_dutch.h +16 -0
  84. data/ext/stem_UTF_8_english.c +1176 -0
  85. data/ext/stem_UTF_8_english.h +16 -0
  86. data/ext/stem_UTF_8_finnish.c +808 -0
  87. data/ext/stem_UTF_8_finnish.h +16 -0
  88. data/ext/stem_UTF_8_french.c +1296 -0
  89. data/ext/stem_UTF_8_french.h +16 -0
  90. data/ext/stem_UTF_8_german.c +526 -0
  91. data/ext/stem_UTF_8_german.h +16 -0
  92. data/ext/stem_UTF_8_italian.c +1113 -0
  93. data/ext/stem_UTF_8_italian.h +16 -0
  94. data/ext/stem_UTF_8_norwegian.c +302 -0
  95. data/ext/stem_UTF_8_norwegian.h +16 -0
  96. data/ext/stem_UTF_8_porter.c +794 -0
  97. data/ext/stem_UTF_8_porter.h +16 -0
  98. data/ext/stem_UTF_8_portuguese.c +1055 -0
  99. data/ext/stem_UTF_8_portuguese.h +16 -0
  100. data/ext/stem_UTF_8_russian.c +709 -0
  101. data/ext/stem_UTF_8_russian.h +16 -0
  102. data/ext/stem_UTF_8_spanish.c +1137 -0
  103. data/ext/stem_UTF_8_spanish.h +16 -0
  104. data/ext/stem_UTF_8_swedish.c +313 -0
  105. data/ext/stem_UTF_8_swedish.h +16 -0
  106. data/ext/stopwords.c +325 -0
  107. data/ext/store.c +34 -2
  108. data/ext/tags +2953 -0
  109. data/ext/term.c +21 -15
  110. data/ext/termdocs.c +5 -3
  111. data/ext/utilities.c +446 -0
  112. data/ext/vector.c +27 -13
  113. data/lib/ferret/document/document.rb +1 -1
  114. data/lib/ferret/index/index.rb +44 -6
  115. data/lib/ferret/query_parser/query_parser.tab.rb +7 -3
  116. data/lib/rferret.rb +2 -1
  117. data/test/test_helper.rb +2 -2
  118. data/test/unit/analysis/ctc_analyzer.rb +401 -0
  119. data/test/unit/analysis/ctc_tokenstream.rb +423 -0
  120. data/test/unit/analysis/{tc_letter_tokenizer.rb → rtc_letter_tokenizer.rb} +0 -0
  121. data/test/unit/analysis/{tc_lower_case_filter.rb → rtc_lower_case_filter.rb} +0 -0
  122. data/test/unit/analysis/{tc_lower_case_tokenizer.rb → rtc_lower_case_tokenizer.rb} +0 -0
  123. data/test/unit/analysis/{tc_per_field_analyzer_wrapper.rb → rtc_per_field_analyzer_wrapper.rb} +0 -0
  124. data/test/unit/analysis/{tc_porter_stem_filter.rb → rtc_porter_stem_filter.rb} +0 -0
  125. data/test/unit/analysis/{tc_standard_analyzer.rb → rtc_standard_analyzer.rb} +0 -0
  126. data/test/unit/analysis/{tc_standard_tokenizer.rb → rtc_standard_tokenizer.rb} +0 -0
  127. data/test/unit/analysis/{tc_stop_analyzer.rb → rtc_stop_analyzer.rb} +0 -0
  128. data/test/unit/analysis/{tc_stop_filter.rb → rtc_stop_filter.rb} +0 -0
  129. data/test/unit/analysis/{tc_white_space_analyzer.rb → rtc_white_space_analyzer.rb} +0 -0
  130. data/test/unit/analysis/{tc_white_space_tokenizer.rb → rtc_white_space_tokenizer.rb} +0 -0
  131. data/test/unit/analysis/{tc_word_list_loader.rb → rtc_word_list_loader.rb} +0 -0
  132. data/test/unit/analysis/tc_analyzer.rb +1 -2
  133. data/test/unit/analysis/{c_token.rb → tc_token.rb} +0 -0
  134. data/test/unit/document/rtc_field.rb +28 -0
  135. data/test/unit/document/{c_document.rb → tc_document.rb} +0 -0
  136. data/test/unit/document/tc_field.rb +82 -12
  137. data/test/unit/index/{tc_compound_file_io.rb → rtc_compound_file_io.rb} +0 -0
  138. data/test/unit/index/{tc_field_infos.rb → rtc_field_infos.rb} +0 -0
  139. data/test/unit/index/{tc_fields_io.rb → rtc_fields_io.rb} +0 -0
  140. data/test/unit/index/{tc_multiple_term_doc_pos_enum.rb → rtc_multiple_term_doc_pos_enum.rb} +0 -0
  141. data/test/unit/index/{tc_segment_infos.rb → rtc_segment_infos.rb} +0 -0
  142. data/test/unit/index/{tc_segment_term_docs.rb → rtc_segment_term_docs.rb} +0 -0
  143. data/test/unit/index/{tc_segment_term_enum.rb → rtc_segment_term_enum.rb} +0 -0
  144. data/test/unit/index/{tc_segment_term_vector.rb → rtc_segment_term_vector.rb} +0 -0
  145. data/test/unit/index/{tc_term_buffer.rb → rtc_term_buffer.rb} +0 -0
  146. data/test/unit/index/{tc_term_info.rb → rtc_term_info.rb} +0 -0
  147. data/test/unit/index/{tc_term_infos_io.rb → rtc_term_infos_io.rb} +0 -0
  148. data/test/unit/index/{tc_term_vectors_io.rb → rtc_term_vectors_io.rb} +0 -0
  149. data/test/unit/index/{c_index.rb → tc_index.rb} +26 -6
  150. data/test/unit/index/{c_index_reader.rb → tc_index_reader.rb} +0 -0
  151. data/test/unit/index/{c_index_writer.rb → tc_index_writer.rb} +0 -0
  152. data/test/unit/index/{c_term.rb → tc_term.rb} +0 -0
  153. data/test/unit/index/{c_term_voi.rb → tc_term_voi.rb} +0 -0
  154. data/test/unit/query_parser/{c_query_parser.rb → rtc_query_parser.rb} +14 -14
  155. data/test/unit/query_parser/tc_query_parser.rb +24 -16
  156. data/test/unit/search/{tc_similarity.rb → rtc_similarity.rb} +0 -0
  157. data/test/unit/search/rtc_sort_field.rb +14 -0
  158. data/test/unit/search/{c_filter.rb → tc_filter.rb} +11 -11
  159. data/test/unit/search/{c_fuzzy_query.rb → tc_fuzzy_query.rb} +0 -0
  160. data/test/unit/search/{c_index_searcher.rb → tc_index_searcher.rb} +0 -0
  161. data/test/unit/search/{c_search_and_sort.rb → tc_search_and_sort.rb} +0 -0
  162. data/test/unit/search/{c_sort.rb → tc_sort.rb} +0 -0
  163. data/test/unit/search/tc_sort_field.rb +20 -7
  164. data/test/unit/search/{c_spans.rb → tc_spans.rb} +0 -0
  165. data/test/unit/store/rtc_fs_store.rb +62 -0
  166. data/test/unit/store/rtc_ram_store.rb +15 -0
  167. data/test/unit/store/rtm_store.rb +150 -0
  168. data/test/unit/store/rtm_store_lock.rb +2 -0
  169. data/test/unit/store/tc_fs_store.rb +54 -40
  170. data/test/unit/store/tc_ram_store.rb +20 -0
  171. data/test/unit/store/tm_store.rb +30 -146
  172. data/test/unit/store/tm_store_lock.rb +66 -0
  173. data/test/unit/utils/{tc_bit_vector.rb → rtc_bit_vector.rb} +0 -0
  174. data/test/unit/utils/{tc_date_tools.rb → rtc_date_tools.rb} +0 -0
  175. data/test/unit/utils/{tc_number_tools.rb → rtc_number_tools.rb} +0 -0
  176. data/test/unit/utils/{tc_parameter.rb → rtc_parameter.rb} +0 -0
  177. data/test/unit/utils/{tc_priority_queue.rb → rtc_priority_queue.rb} +0 -0
  178. data/test/unit/utils/{tc_string_helper.rb → rtc_string_helper.rb} +0 -0
  179. data/test/unit/utils/{tc_thread.rb → rtc_thread.rb} +0 -0
  180. data/test/unit/utils/{tc_weak_key_hash.rb → rtc_weak_key_hash.rb} +0 -0
  181. metadata +360 -289
  182. data/test/unit/document/c_field.rb +0 -98
  183. data/test/unit/search/c_sort_field.rb +0 -27
  184. data/test/unit/store/c_fs_store.rb +0 -76
  185. data/test/unit/store/c_ram_store.rb +0 -35
  186. data/test/unit/store/m_store.rb +0 -34
  187. data/test/unit/store/m_store_lock.rb +0 -68
data/Rakefile CHANGED
@@ -1,6 +1,6 @@
1
1
  $:. << 'lib'
2
2
  # Some parts of this Rakefile where taken from Jim Weirich's Rakefile for
3
- # Rake. Other parts where stolen from the David Heinemeier Hansson's Rails
3
+ # Rake. Other parts where taken from the David Heinemeier Hansson's Rails
4
4
  # Rakefile. Both are under MIT-LICENSE. Thanks to both for their excellent
5
5
  # projects.
6
6
 
@@ -32,12 +32,13 @@ end
32
32
  $VERBOSE = nil
33
33
 
34
34
  EXT = "ferret_ext.so"
35
- EXT_SRC = FileList["src/*/*.[ch]"]
35
+ EXT_SRC = FileList["src/**/*.[ch]"]
36
+
36
37
  EXT_SRC_DEST = EXT_SRC.map {|fn| File.join("ext", File.basename(fn))}
37
38
  SRC = (FileList["ext/*.[ch]"] + EXT_SRC_DEST).uniq
38
39
 
39
40
  CLEAN.include(FileList['**/*.o', 'InstalledFiles', '.config'])
40
- CLOBBER.include(FileList['**/*.so'], 'ext/Makefile')
41
+ CLOBBER.include(FileList['**/*.so'], 'ext/Makefile', EXT_SRC_DEST)
41
42
 
42
43
  task :default => :all_tests
43
44
  desc "Run all tests"
@@ -57,7 +58,7 @@ end
57
58
  desc "run unit tests in test/unit for C ferret"
58
59
  Rake::TestTask.new("test_cunits" => :ext) do |t|
59
60
  t.libs << "test/unit"
60
- t.pattern = 'test/unit/t[cs]_*.rb'
61
+ t.pattern = 'test/unit/ts_*.rb'
61
62
  t.verbose = true
62
63
  end
63
64
 
@@ -102,6 +103,15 @@ EXT_SRC.each do |fn|
102
103
  dest_fn = File.join("ext", File.basename(fn))
103
104
  file dest_fn => fn do |t|
104
105
  cp fn, dest_fn
106
+ if fn =~ /stemmer/
107
+ # flatten the directory structure for lib_stemmer
108
+ open(dest_fn) do |in_f|
109
+ open(dest_fn + ".out", "w") do |out_f|
110
+ in_f.each {|line| out_f.write(line.sub(/(#include ["<])[.a-z_\/]*\//) {"#{$1}"})}
111
+ end
112
+ end
113
+ mv dest_fn + ".out", dest_fn
114
+ end
105
115
  end
106
116
  end
107
117
 
@@ -110,9 +120,17 @@ task :ext => ["ext/#{EXT}"] + SRC
110
120
 
111
121
  file "ext/#{EXT}" => ["ext/Makefile"] do
112
122
  cp "ext/inc/lang.h", "ext/lang.h"
123
+ cp "ext/inc/except.h", "ext/except.h"
113
124
  sh "cd ext; make"
114
125
  end
115
126
 
127
+ file "ext/lang.h" => ["ext/inc/lang.h"] do
128
+ cp "ext/inc/lang.h", "ext/lang.h"
129
+ end
130
+ file "ext/except.h" => ["ext/inc/except.h"] do
131
+ cp "ext/inc/except.h", "ext/except.h"
132
+ end
133
+
116
134
  file "ext/Makefile" => SRC do
117
135
  sh "cd ext; ruby extconf.rb"
118
136
  end
@@ -220,7 +238,7 @@ task :repackage => EXT_SRC_DEST
220
238
  task :package => EXT_SRC_DEST
221
239
  task :tag => [:prerelease]
222
240
  task :update_version => [:prerelease]
223
- task :release => [:tag, :update_version, :package] do
241
+ task :release do #=> [:tag, :update_version, :package] do
224
242
  announce
225
243
  announce "**************************************************************"
226
244
  announce "* Release #{PKG_VERSION} Complete."
data/TODO CHANGED
@@ -4,11 +4,12 @@ Send suggestions for this list to mailto:dbalmain@gmail.com
4
4
 
5
5
  === To Do
6
6
 
7
- * Add the ability to persist an in memory index to Ferret::Index::Index
8
7
  * Make a dll for people on Windows
8
+ * pure ruby ConstantScoreQuery
9
9
 
10
10
  === Done
11
11
 
12
+ * Add the ability to persist an in memory index to Ferret::Index::Index
12
13
  * Add UTF-8 support
13
14
  * Multi Field Query
14
15
  * Test threading
@@ -1,7 +1,16 @@
1
1
  #include <analysis.h>
2
2
  #include <string.h>
3
3
  #include <ctype.h>
4
- #include <hash.h>
4
+ #include <wctype.h>
5
+ #include <wchar.h>
6
+ #include "hash.h"
7
+ #include "libstemmer.h"
8
+
9
+ /****************************************************************************
10
+ *
11
+ * Token
12
+ *
13
+ ****************************************************************************/
5
14
 
6
15
  Token *tk_create()
7
16
  {
@@ -24,6 +33,11 @@ inline Token *tk_set(Token *tk, char *text, int tlen, int start, int end, int po
24
33
  return tk;
25
34
  }
26
35
 
36
+ inline Token *tk_set_ts(Token *tk, char *start, char *end, char *text, int pos_inc)
37
+ {
38
+ return tk_set(tk, start, end - start, start - text, end - text, pos_inc);
39
+ }
40
+
27
41
  inline Token *tk_set_no_len(Token *tk, char *text, int start, int end, int pos_inc)
28
42
  {
29
43
  return tk_set(tk, text, strlen(text), start, end, pos_inc);
@@ -31,11 +45,8 @@ inline Token *tk_set_no_len(Token *tk, char *text, int start, int end, int pos_i
31
45
 
32
46
  int tk_eq(Token *tk1, Token *tk2)
33
47
  {
34
- if (strcmp((char *)tk1->text, (char *)tk2->text) == 0 &&
35
- tk1->start == tk2->start && tk1->end == tk2->end)
36
- return true;
37
- else
38
- return false;
48
+ return (strcmp((char *)tk1->text, (char *)tk2->text) == 0 &&
49
+ tk1->start == tk2->start && tk1->end == tk2->end);
39
50
  }
40
51
 
41
52
  int tk_cmp(Token *tk1, Token *tk2)
@@ -57,46 +68,152 @@ int tk_cmp(Token *tk1, Token *tk2)
57
68
  return cmp;
58
69
  }
59
70
 
71
+
72
+ /****************************************************************************
73
+ *
74
+ * TokenStream
75
+ *
76
+ ****************************************************************************/
77
+
60
78
  void ts_standard_destroy(void *p)
61
79
  {
62
80
  TokenStream *ts = (TokenStream *)p;
63
81
  tk_destroy(ts->token);
64
- free(p);
82
+ free(ts);
65
83
  }
66
84
 
67
85
  void ts_reset(TokenStream *ts, char *text)
68
86
  {
69
- ts->text = text;
70
- ts->pos = 0;
87
+ ts->t = ts->text = text;
71
88
  }
72
89
 
73
90
  TokenStream *ts_create()
74
91
  {
75
92
  TokenStream *ts = ALLOC(TokenStream);
76
- ts->pos = -1;
77
93
  ts->text = NULL;
78
94
  ts->token = tk_create();
79
95
  ts->destroy = &ts_standard_destroy;
80
96
  ts->reset = &ts_reset;
97
+ ts->sub_ts = NULL;
98
+ ts->clone_i = NULL;
81
99
  return ts;
82
100
  }
83
101
 
102
+ TokenStream *ts_clone(TokenStream *orig_ts)
103
+ {
104
+ TokenStream *ts = ALLOC(TokenStream);
105
+ memcpy(ts, orig_ts, sizeof(TokenStream));
106
+ if (orig_ts->token) {
107
+ ts->token = ALLOC(Token);
108
+ memcpy(ts->token, orig_ts->token, sizeof(Token));
109
+ }
110
+ if (orig_ts->sub_ts) ts->sub_ts = ts_clone(orig_ts->sub_ts);
111
+ if (orig_ts->clone_i) orig_ts->clone_i(orig_ts, ts);
112
+ return ts;
113
+ }
114
+
115
+ /* * Multi-byte TokenStream * */
116
+ static char * const ENC_ERR_MSG = "Error decoding input string. "
117
+ "Check that you have the locale set correctly";
118
+ #define MB_NEXT_CHAR \
119
+ if ((i = mbrtowc(&wchr, t, MB_CUR_MAX, (mbstate_t *)ts->data)) < 0)\
120
+ RAISE(IO_ERROR, ENC_ERR_MSG)
121
+
122
+ inline Token *w_tk_set(Token *tk, wchar_t *text, int start, int end, int pos_inc)
123
+ {
124
+ tk->text[wcstombs(tk->text, text, MAX_WORD_SIZE - 1)] = '\0';
125
+ tk->start = start;
126
+ tk->end = end;
127
+ tk->pos_inc = pos_inc;
128
+ return tk;
129
+ }
130
+
131
+ void mb_ts_standard_destroy(void *p)
132
+ {
133
+ TokenStream *ts = (TokenStream *)p;
134
+ tk_destroy(ts->token);
135
+ free(ts->data);
136
+ free(ts);
137
+ }
138
+
139
+ void mb_ts_reset(TokenStream *ts, char *text)
140
+ {
141
+ ZEROSET(ts->data, mbstate_t, 1);
142
+ ts_reset(ts, text);
143
+ }
144
+
145
+ void mb_ts_clone_i(TokenStream *orig_ts, TokenStream *new_ts)
146
+ {
147
+ new_ts->data = ALLOC(mbstate_t);
148
+ memcpy(new_ts->data, orig_ts->data, sizeof(mbstate_t));
149
+ }
150
+
151
+ TokenStream *mb_ts_create()
152
+ {
153
+ TokenStream *ts = ALLOC(TokenStream);
154
+ ts->data = ALLOC(mbstate_t);
155
+ ts->text = NULL;
156
+ ts->token = tk_create();
157
+ ts->destroy = &mb_ts_standard_destroy;
158
+ ts->reset = &mb_ts_reset;
159
+ ts->clone_i = &mb_ts_clone_i;
160
+ ts->sub_ts = NULL;
161
+ return ts;
162
+ }
163
+
164
+ /****************************************************************************
165
+ *
166
+ * Analyzer
167
+ *
168
+ ****************************************************************************/
169
+
170
+ void a_standard_destroy(void *p)
171
+ {
172
+ Analyzer *a = (Analyzer *)p;
173
+ ts_destroy(a->current_ts);
174
+ free(p);
175
+ }
176
+
177
+ TokenStream *a_standard_get_ts(Analyzer *a, char *field, char *text)
178
+ {
179
+ a->current_ts->reset(a->current_ts, text);
180
+ return a->current_ts;
181
+ }
182
+
183
+ Analyzer *analyzer_create(void *data, TokenStream *ts, void (*destroy)(void *),
184
+ TokenStream *(*get_ts)(Analyzer *a, char *field, char *text))
185
+ {
186
+ Analyzer *a = ALLOC(Analyzer);
187
+ a->data = data;
188
+ a->current_ts = ts;
189
+ a->destroy = (destroy ? destroy : &a_standard_destroy);
190
+ a->get_ts = (get_ts ? get_ts : &a_standard_get_ts);
191
+ return a;
192
+ }
193
+
194
+ /****************************************************************************
195
+ *
196
+ * Whitespace
197
+ *
198
+ ****************************************************************************/
199
+
200
+ /*
201
+ * WhitespaceTokenizer
202
+ */
84
203
  Token *wst_next(TokenStream *ts)
85
204
  {
86
- int i = ts->pos;
87
- int start, end;
88
- char *text = ts->text;
205
+ char *t = ts->t;
206
+ char *start;
89
207
 
90
- while (text[i] != '\0' && isspace(text[i]))
91
- i++;
92
- if (text[i] == '\0')
93
- return NULL;
208
+ while (*t != '\0' && isspace(*t)) t++;
94
209
 
95
- start = i;
96
- while (text[i] != '\0' && !isspace(text[i]))
97
- i++;
98
- ts->pos = end = i;
99
- tk_set(ts->token, text+start, end-start, start, end, 1);
210
+ if (*t == '\0') return NULL;
211
+
212
+ start = t;
213
+ while (*t != '\0' && !isspace(*t)) t++;
214
+
215
+ ts->t = t;
216
+ tk_set_ts(ts->token, start, t, ts->text, 1);
100
217
  return ts->token;
101
218
  }
102
219
 
@@ -107,22 +224,121 @@ TokenStream *whitespace_tokenizer_create()
107
224
  return ts;
108
225
  }
109
226
 
227
+ /*
228
+ * Multi-byte WhitespaceTokenizer
229
+ */
230
+ Token *mb_wst_next(TokenStream *ts)
231
+ {
232
+ int i;
233
+ char *start;
234
+ char *t = ts->t;
235
+ wchar_t wchr;
236
+
237
+ MB_NEXT_CHAR;
238
+ while (wchr != 0 && iswspace(wchr)) {
239
+ t += i;
240
+ MB_NEXT_CHAR;
241
+ }
242
+ if (wchr == 0) return NULL;
243
+
244
+ start = t;
245
+ t += i;
246
+ MB_NEXT_CHAR;
247
+ while (wchr != 0 && !iswspace(wchr)) {
248
+ t += i;
249
+ MB_NEXT_CHAR;
250
+ }
251
+ tk_set_ts(ts->token, start, t, ts->text, 1);
252
+ ts->t = t;
253
+ return ts->token;
254
+ }
255
+
256
+ /*
257
+ * Lowercasing Multi-byte WhitespaceTokenizer
258
+ */
259
+ Token *mb_wst_next_lc(TokenStream *ts)
260
+ {
261
+ int i;
262
+ char *start;
263
+ char *t = ts->t;
264
+ wchar_t wchr;
265
+ wchar_t wbuf[MAX_WORD_SIZE+1], *w, *w_end;
266
+
267
+ w = wbuf;
268
+ w_end = &wbuf[MAX_WORD_SIZE];
269
+
270
+ MB_NEXT_CHAR;
271
+ while (wchr != 0 && iswspace(wchr)) {
272
+ t += i;
273
+ MB_NEXT_CHAR;
274
+ }
275
+ if (wchr == 0) return NULL;
276
+
277
+ start = t;
278
+ t += i;
279
+ *w++ = towlower(wchr);
280
+ MB_NEXT_CHAR;
281
+ while (wchr != 0 && !iswspace(wchr)) {
282
+ if (w < w_end) *w++ = towlower(wchr);
283
+ t += i;
284
+ MB_NEXT_CHAR;
285
+ }
286
+ *w = 0;
287
+ w_tk_set(ts->token, wbuf, start - ts->text, t - ts->text, 1);
288
+ ts->t = t;
289
+ return ts->token;
290
+ }
291
+
292
+ TokenStream *mb_whitespace_tokenizer_create(bool lowercase)
293
+ {
294
+ TokenStream *ts = mb_ts_create();
295
+ ts->next = lowercase ? &mb_wst_next_lc : &mb_wst_next;
296
+ return ts;
297
+ }
298
+
299
+ /*
300
+ * WhitespaceAnalyzers
301
+ */
302
+ Analyzer *whitespace_analyzer_create(bool lowercase)
303
+ {
304
+ TokenStream *ts;
305
+ if (lowercase) {
306
+ ts = lowercase_filter_create(whitespace_tokenizer_create());
307
+ } else {
308
+ ts = whitespace_tokenizer_create();
309
+ }
310
+ return analyzer_create(NULL, ts, NULL, NULL);
311
+ }
312
+
313
+ Analyzer *mb_whitespace_analyzer_create(bool lowercase)
314
+ {
315
+ return analyzer_create(NULL, mb_whitespace_tokenizer_create(lowercase),
316
+ NULL, NULL);
317
+ }
318
+
319
+ /****************************************************************************
320
+ *
321
+ * Letter
322
+ *
323
+ ****************************************************************************/
324
+
325
+ /*
326
+ * LetterTokenizer
327
+ */
110
328
  Token *lt_next(TokenStream *ts)
111
329
  {
112
- int i = ts->pos;
113
- int start, end;
114
- char *text = ts->text;
330
+ char *start;
331
+ char *t = ts->t;
115
332
 
116
- while (text[i] != '\0' && !isalpha(text[i]))
117
- i++;
118
- if (text[i] == '\0')
119
- return NULL;
333
+ while (*t != '\0' && !isalpha(*t)) t++;
120
334
 
121
- start = i;
122
- while (text[i] != '\0' && isalpha(text[i]))
123
- i++;
124
- ts->pos = end = i;
125
- tk_set(ts->token, text+start, end-start, start, end, 1);
335
+ if (*t == '\0') return NULL;
336
+
337
+ start = t;
338
+ while (*t != '\0' && isalpha(*t)) t++;
339
+
340
+ tk_set_ts(ts->token, start, t, ts->text, 1);
341
+ ts->t = t;
126
342
  return ts->token;
127
343
  }
128
344
 
@@ -133,54 +349,174 @@ TokenStream *letter_tokenizer_create()
133
349
  return ts;
134
350
  }
135
351
 
136
- void a_standard_destroy(void *p)
352
+ /*
353
+ * Multi-byte LetterTokenizer
354
+ */
355
+ Token *mb_lt_next(TokenStream *ts)
137
356
  {
138
- Analyzer *a = (Analyzer *)p;
139
- ts_destroy(a->current_ts);
140
- free(p);
357
+ int i;
358
+ char *start;
359
+ char *t = ts->t;
360
+ wchar_t wchr;
361
+
362
+ MB_NEXT_CHAR;
363
+ while (wchr != 0 && !iswalpha(wchr)) {
364
+ t += i;
365
+ MB_NEXT_CHAR;
366
+ }
367
+ if (wchr == 0) return NULL;
368
+
369
+ start = t;
370
+ t += i;
371
+ MB_NEXT_CHAR;
372
+ while (wchr != 0 && iswalpha(wchr)) {
373
+ t += i;
374
+ MB_NEXT_CHAR;
375
+ }
376
+ tk_set_ts(ts->token, start, t, ts->text, 1);
377
+ ts->t = t;
378
+ return ts->token;
141
379
  }
142
380
 
143
- TokenStream *a_standard_get_ts(Analyzer *a, char *field, char *text)
381
+ /*
382
+ * Lowercasing Multi-byte LetterTokenizer
383
+ */
384
+ Token *mb_lt_next_lc(TokenStream *ts)
144
385
  {
145
- a->current_ts->reset(a->current_ts, text);
146
- return a->current_ts;
386
+ int i;
387
+ char *start;
388
+ char *t = ts->t;
389
+ wchar_t wchr;
390
+ wchar_t wbuf[MAX_WORD_SIZE+1], *w, *w_end;
391
+
392
+ w = wbuf;
393
+ w_end = &wbuf[MAX_WORD_SIZE];
394
+
395
+ MB_NEXT_CHAR;
396
+ while (wchr != 0 && !iswalpha(wchr)) {
397
+ t += i;
398
+ MB_NEXT_CHAR;
399
+ }
400
+ if (wchr == 0) return NULL;
401
+
402
+ start = t;
403
+ t += i;
404
+ *w++ = towlower(wchr);
405
+ MB_NEXT_CHAR;
406
+ while (wchr != 0 && iswalpha(wchr)) {
407
+ if (w < w_end) *w++ = towlower(wchr);
408
+ t += i;
409
+ MB_NEXT_CHAR;
410
+ }
411
+ *w = 0;
412
+ w_tk_set(ts->token, wbuf, start - ts->text, t - ts->text, 1);
413
+ ts->t = t;
414
+ return ts->token;
147
415
  }
148
416
 
149
- Analyzer *whitespace_analyzer_create()
417
+ TokenStream *mb_letter_tokenizer_create(bool lowercase)
150
418
  {
151
- Analyzer *a = ALLOC(Analyzer);
152
- a->data = NULL;
153
- a->current_ts = whitespace_tokenizer_create();
154
- a->destroy = &a_standard_destroy;
155
- a->get_ts = &a_standard_get_ts;
156
- return a;
419
+ TokenStream *ts = mb_ts_create();
420
+ ts->next = lowercase ? &mb_lt_next_lc : &mb_lt_next;
421
+ return ts;
157
422
  }
158
423
 
159
- int std_get_alpha(char *input, char *token)
424
+ /*
425
+ * LetterAnalyzers
426
+ */
427
+ Analyzer *letter_analyzer_create(bool lowercase)
428
+ {
429
+ TokenStream *ts;
430
+ if (lowercase) {
431
+ ts = lowercase_filter_create(letter_tokenizer_create());
432
+ } else {
433
+ ts = letter_tokenizer_create();
434
+ }
435
+ return analyzer_create(NULL, ts, NULL, NULL);
436
+ }
437
+
438
+ Analyzer *mb_letter_analyzer_create(bool lowercase)
439
+ {
440
+ return analyzer_create(NULL,
441
+ mb_letter_tokenizer_create(lowercase), NULL, NULL);
442
+ }
443
+
444
+ /****************************************************************************
445
+ *
446
+ * Standard
447
+ *
448
+ ****************************************************************************/
449
+
450
+ /*
451
+ * StandardTokenizer
452
+ */
453
+ int std_get_alpha(TokenStream *ts, char *token)
160
454
  {
161
455
  int i = 0;
162
- while (input[i] != '\0' && isalpha(input[i])) {
163
- token[i] = input[i];
456
+ char *t = ts->t;
457
+ while (t[i] != '\0' && isalpha(t[i])) {
458
+ if (i < MAX_WORD_SIZE) token[i] = t[i];
164
459
  i++;
165
460
  }
166
461
  return i;
167
462
  }
168
463
 
169
- int std_get_alnum(char *input, char *token)
464
+ int mb_std_get_alpha(TokenStream *ts, char *token)
465
+ {
466
+ char *t = ts->t;
467
+ wchar_t w;
468
+ int i;
469
+ if ((i = mbtowc(&w, t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
470
+ while (w != 0 && iswalpha(w)) {
471
+ t += i;
472
+ if ((i = mbtowc(&w, t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
473
+ }
474
+
475
+ i = t - ts->t;
476
+ if (i > MAX_WORD_SIZE) i = MAX_WORD_SIZE - 1;
477
+ memcpy(token, ts->t, i);
478
+ return i;
479
+ }
480
+
481
+ int std_get_alnum(TokenStream *ts, char *token)
170
482
  {
171
483
  int i = 0;
172
- while (input[i] != '\0' && isalnum(input[i])) {
173
- token[i] = input[i];
484
+ char *t = ts->t;
485
+ while (t[i] != '\0' && isalnum(t[i])) {
486
+ if (i < MAX_WORD_SIZE) token[i] = t[i];
174
487
  i++;
175
488
  }
176
489
  return i;
177
490
  }
178
491
 
492
+ int mb_std_get_alnum(char *text, char *token, TokenStream *ts)
493
+ {
494
+ char *t = ts->t;
495
+ wchar_t w;
496
+ int i;
497
+ if ((i = mbtowc(&w, t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
498
+ while (w != 0 && iswalnum(w)) {
499
+ t += i;
500
+ if ((i = mbtowc(&w, t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
501
+ }
502
+
503
+ i = t - ts->t;
504
+ if (i > MAX_WORD_SIZE) i = MAX_WORD_SIZE - 1;
505
+ memcpy(token, ts->t, i);
506
+ return i;
507
+
508
+ }
509
+
179
510
  int isnumpunc(char c)
180
511
  {
181
512
  return (c == '.' || c == ',' || c == '\\' || c == '/' || c == '_' || c == '-');
182
513
  }
183
514
 
515
+ int w_isnumpunc(wchar_t c)
516
+ {
517
+ return (c == L'.' || c == L',' || c == L'\\' || c == L'/' || c == L'_' || c == L'-');
518
+ }
519
+
184
520
  int isurlpunc(char c)
185
521
  {
186
522
  return (c == '.' || c == '/' || c == '-' || c == '_');
@@ -201,11 +537,23 @@ int isurlxatc(char c)
201
537
  return (c == '.' || c == '/' || c == '-' || c == '_' || c == '@' || isalnum(c));
202
538
  }
203
539
 
204
- int isstdtokchar(char c)
540
+ bool std_is_tok_char(char *c)
205
541
  {
206
- if (isspace(c)) return false; // most common so check first.
207
- if (isalnum(c) || isnumpunc(c) || c == '&' ||
208
- c == '@' || c == '\'' || c == ':')
542
+ if (isspace(*c)) return false; // most common so check first.
543
+ if (isalnum(*c) || isnumpunc(*c) || *c == '&' ||
544
+ *c == '@' || *c == '\'' || *c == ':')
545
+ return true;
546
+ return false;
547
+ }
548
+
549
+ bool w_std_is_tok_char(char *t)
550
+ {
551
+ wchar_t c;
552
+ if ((mbtowc(&c, t, MB_CUR_MAX)) < 0)
553
+ RAISE(IO_ERROR, ENC_ERR_MSG);
554
+ if (iswspace(c)) return false; // most common so check first.
555
+ if (iswalnum(c) || w_isnumpunc(c) || c == L'&' ||
556
+ c == L'@' || c == L'\'' || c == L':')
209
557
  return true;
210
558
  return false;
211
559
  }
@@ -246,22 +594,34 @@ int std_get_number(char *input)
246
594
 
247
595
  int std_get_apostrophe(char *input)
248
596
  {
249
- int i = 0;
597
+ char *t = input;
250
598
 
251
- while (isalpha(input[i]) || input[i] == '\'')
252
- i++;
599
+ while (isalpha(*t) || *t == '\'')
600
+ t++;
253
601
 
254
- return i;
602
+ return t - input;
255
603
  }
256
604
 
257
- int std_get_url(char *input, char *token)
605
+ int mb_std_get_apostrophe(char *input)
258
606
  {
259
- int i = 0;
607
+ char *t = input;
608
+ wchar_t w;
609
+ int i;
260
610
 
611
+ if ((i = mbtowc(&w, t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
612
+ while (iswalpha(w) || w == L'\'') {
613
+ t += i;
614
+ if ((i = mbtowc(&w, t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
615
+ }
616
+ return t - input;
617
+ }
618
+
619
+ int std_get_url(char *input, char *token, int i)
620
+ {
261
621
  while (isurlc(input[i])) {
262
622
  if (isurlpunc(input[i]) && isurlpunc(input[i-1]))
263
623
  break; // can't have to puncs in a row
264
- token[i] = input[i];
624
+ if (i < MAX_WORD_SIZE) token[i] = input[i];
265
625
  i++;
266
626
  }
267
627
 
@@ -282,148 +642,229 @@ int std_get_company_name(char *input)
282
642
  return i;
283
643
  }
284
644
 
645
+ int mb_std_get_company_name(char *input, TokenStream *ts)
646
+ {
647
+ char *t = input;
648
+ wchar_t wchr;
649
+ int i;
650
+
651
+ MB_NEXT_CHAR;
652
+ while (iswalpha(wchr) || wchr == L'@' || wchr == L'&') {
653
+ t += i;
654
+ MB_NEXT_CHAR;
655
+ }
656
+
657
+ return t - input;
658
+ }
659
+
660
+ bool std_advance_to_start(TokenStream *ts)
661
+ {
662
+ char *t = ts->t;
663
+ while (*t != '\0' && !isalnum(*t)) t++;
664
+
665
+ ts->t = t;
666
+
667
+ return (*t != '\0');
668
+ }
669
+
670
+ bool mb_std_advance_to_start(TokenStream *ts)
671
+ {
672
+ int i;
673
+ wchar_t w;
674
+
675
+ if ((i = mbtowc(&w, ts->t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
676
+ while (w != 0 && !iswalnum(w)) {
677
+ ts->t += i;
678
+ if ((i = mbtowc(&w, ts->t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
679
+ }
680
+
681
+ return (w != 0);
682
+ }
683
+
684
+ typedef struct StandardTokenizer {
685
+ bool (*advance_to_start)(TokenStream *ts);
686
+ bool (*is_tok_char)(char *c);
687
+ int (*get_alpha)(TokenStream *ts, char *token);
688
+ int (*get_apostrophe)(char *input);
689
+ } StandardTokenizer;
690
+
285
691
  Token *std_next(TokenStream *ts)
286
692
  {
287
- int i = ts->pos, j;
288
- int start;
289
- char *text = ts->text;
693
+ StandardTokenizer *std_tz = (StandardTokenizer *)ts->data;
694
+ char *s;
695
+ char *t;
696
+ char *start = NULL;
697
+ char *num_end = NULL;
290
698
  char token[MAX_WORD_SIZE];
291
699
  int token_i = 0;
292
700
  int len;
293
- int num_end = 0;
294
- int is_acronym;
295
- int seen_at_symbol;
701
+ bool is_acronym;
702
+ bool seen_at_symbol;
296
703
 
297
- while (text[i] != '\0' && !isalnum(text[i]))
298
- i++;
299
- if (text[i] == '\0')
300
- return NULL;
301
-
302
- start = i;
303
- if (isdigit(text[i])) {
304
- i += std_get_number(text + i);
305
- ts->pos = i;
306
- tk_set(ts->token, text+start, i - start, start, ts->pos, 1);
704
+
705
+ if (!std_tz->advance_to_start(ts)) return NULL;
706
+
707
+ start = t = ts->t;
708
+ if (isdigit(*t)) {
709
+ t += std_get_number(t);
710
+ ts->t = t;
711
+ tk_set_ts(ts->token, start, t, ts->text, 1);
307
712
  } else {
308
- token_i = std_get_alpha(text + i, token);
309
- i += token_i;
713
+ token_i = std_tz->get_alpha(ts, token);
714
+ t += token_i;
310
715
 
311
- if (!isstdtokchar(text[i])) {
716
+ if (!std_tz->is_tok_char(t)) {
312
717
  // very common case, ie a plain word, so check and return
313
- tk_set(ts->token, text+start, i-start, start, i, 1);
314
- ts->pos = i;
718
+ tk_set_ts(ts->token, start, t, ts->text, 1);
719
+ ts->t = t;
315
720
  return ts->token;
316
721
  }
317
722
 
318
- if (text[i] == '\'') { // apostrophe case.
319
- i += std_get_apostrophe(text + i);
320
- ts->pos = i;
321
- len = i - start;
723
+ if (*t == '\'') { // apostrophe case.
724
+ t += std_tz->get_apostrophe(t);
725
+ ts->t = t;
726
+ len = t - start;
322
727
  // strip possesive
323
- if ((text[i-1] == 's' || text[i-1] == 'S') && text[i-2] == '\'')
324
- len -= 2;
325
- tk_set(ts->token, text+start, len, start, i, 1);
728
+ if ((t[-1] == 's' || t[-1] == 'S') && t[-2] == '\'') t -= 2;
729
+
730
+ tk_set_ts(ts->token, start, t, ts->text, 1);
326
731
  return ts->token;
327
732
  }
328
- if (text[i] == '&') { // apostrophe case.
329
- i += std_get_company_name(text + i);
330
- ts->pos = i;
331
- tk_set(ts->token, text+start, i - start, start, i, 1);
733
+
734
+ if (*t == '&') { // apostrophe case.
735
+ t += std_get_company_name(t);
736
+ ts->t = t;
737
+ tk_set_ts(ts->token, start, t, ts->text, 1);
332
738
  return ts->token;
333
739
  }
334
740
 
335
- if (isdigit(text[i]) || isnumpunc(text[i])) { // possibly a number
336
- num_end = start + std_get_number(text + start);
337
- if (!isstdtokchar(text[num_end])) { // we won't find a longer token
338
- ts->pos = num_end;
339
- tk_set(ts->token, text+start, num_end-start, start, ts->pos, 1);
741
+ if (isdigit(*t) || isnumpunc(*t)) { // possibly a number
742
+ num_end = start + std_get_number(start);
743
+ if (!std_tz->is_tok_char(num_end)) { // we won't find a longer token
744
+ ts->t = num_end;
745
+ tk_set_ts(ts->token, start, num_end, ts->text, 1);
340
746
  return ts->token;
341
747
  }
342
748
  // else there may be a longer token so check
343
749
  }
344
750
 
345
- if (text[i] == ':' && text[i+1] == '/' && text[i+2] == '/') {
751
+ if (t[0] == ':' && t[1] == '/' && t[2] == '/') {
346
752
  // check for a known url start
347
753
  token[token_i] = '\0';
348
- i += 3;
349
- while (text[i] == '/') i++;
350
- if (isalpha(text[i]) &&
351
- (strcmp(token, "ftp") == 0 ||
352
- strcmp(token, "http") == 0 ||
353
- strcmp(token, "https") == 0 ||
354
- strcmp(token, "file") == 0)) {
355
- len = std_get_url(text + i, token); // dispose of first part of the URL
754
+ t += 3;
755
+ while (*t == '/') t++;
756
+ if (isalpha(*t) &&
757
+ (memcmp(token, "ftp", 3) == 0 ||
758
+ memcmp(token, "http", 4) == 0 ||
759
+ memcmp(token, "https", 5) == 0 ||
760
+ memcmp(token, "file", 4) == 0)) {
761
+ len = std_get_url(t, token, 0); // dispose of first part of the URL
356
762
  } else { //still treat as url but keep the first part
357
- token_i = i - start;
358
- memcpy(token, text + start, token_i * sizeof(char));
359
- len = token_i + std_get_url(text + i, token + token_i); // keep start
763
+ token_i = t - start;
764
+ memcpy(token, start, token_i * sizeof(char));
765
+ len = token_i + std_get_url(t, token, token_i); // keep start
360
766
  }
361
- ts->pos = i + len;
767
+ ts->t = t + len;
362
768
  token[len] = 0;
363
- tk_set(ts->token, token, len, start, ts->pos, 1);
769
+ tk_set(ts->token, token, len, start - ts->text, ts->t - ts->text, 1);
364
770
  return ts->token;
365
771
  }
366
772
 
367
- // now see how int a url we can find.
773
+ // now see how long a url we can find.
368
774
  is_acronym = true;
369
775
  seen_at_symbol = false;
370
- while (isurlxatc(text[i])) {
371
- if (is_acronym && !isalpha(text[i]) && (text[i] != '.')) {
776
+ while (isurlxatc(*t)) {
777
+ if (is_acronym && !isalpha(*t) && (*t != '.')) {
372
778
  is_acronym = false;
373
779
  }
374
- if (isurlxatpunc(text[i]) && isurlxatpunc(text[i-1]))
375
- break; // can't have to punctuation characters in a row
376
- if (text[i] == '@') {
377
- if (seen_at_symbol)
780
+ if (isurlxatpunc(*t) && isurlxatpunc(t[-1])) {
781
+ break; // can't have two punctuation characters in a row
782
+ }
783
+ if (*t == '@') {
784
+ if (seen_at_symbol) {
378
785
  break; // we can only have one @ symbol
379
- else
786
+ } else {
380
787
  seen_at_symbol = true;
788
+ }
381
789
  }
382
- i++;
790
+ t++;
383
791
  }
384
- while (isurlxatpunc(text[i-1])) i--; // strip trailing punctuation
385
- if (i > num_end) {
386
- ts->pos = i;
792
+ while (isurlxatpunc(t[-1])) t--; // strip trailing punctuation
793
+
794
+ if (t > num_end) {
795
+ ts->t = t;
387
796
 
388
797
  if (is_acronym) { // check that it is one letter followed by one '.'
389
- for (j = start; j < i-1; j++) {
390
- if (isalpha(text[j]) && (text[j+1] != '.')) is_acronym = false;
798
+ for (s = start; s < t-1; s++) {
799
+ if (isalpha(*s) && (s[1] != '.')) is_acronym = false;
391
800
  }
392
801
  }
393
802
  if (is_acronym) {// strip '.'s
394
- for (j = start + token_i; j < i; j++) {
395
- if (text[j] != '.') {
396
- token[token_i] = text[j];
803
+ for (s = start + token_i; s < t; s++) {
804
+ if (*s != '.') {
805
+ token[token_i] = *s;
397
806
  token_i++;
398
807
  }
399
808
  }
400
- tk_set(ts->token, token, token_i, start, ts->pos, 1);
809
+ tk_set(ts->token, token, token_i, start - ts->text, t - ts->text, 1);
401
810
  } else { // just return the url as is
402
- tk_set(ts->token, text+start, i-start, start, ts->pos, 1);
811
+ tk_set_ts(ts->token, start, t, ts->text, 1);
403
812
  }
404
813
  } else { // return the number
405
- ts->pos = num_end;
406
- tk_set(ts->token, text+start, num_end-start, start, ts->pos, 1);
814
+ ts->t = num_end;
815
+ tk_set_ts(ts->token, start, num_end, ts->text, 1);
407
816
  }
408
817
  }
409
818
 
410
819
  return ts->token;
411
820
  }
412
821
 
822
+ void std_ts_destroy(void *p)
823
+ {
824
+ TokenStream *ts = (TokenStream *)p;
825
+ free(ts->data);
826
+ ts_standard_destroy(ts);
827
+ }
828
+
829
+ void std_ts_clone_i(TokenStream *orig_ts, TokenStream *new_ts)
830
+ {
831
+ new_ts->data = ALLOC(StandardTokenizer);
832
+ memcpy(new_ts->data, orig_ts->data, sizeof(StandardTokenizer));
833
+ }
834
+
413
835
  TokenStream *standard_tokenizer_create()
414
836
  {
415
837
  TokenStream *ts = ts_create();
838
+
839
+ StandardTokenizer *std_tz = ALLOC(StandardTokenizer);
840
+ std_tz->advance_to_start = &std_advance_to_start;
841
+ std_tz->get_alpha = &std_get_alpha;
842
+ std_tz->is_tok_char = &std_is_tok_char;
843
+ std_tz->get_apostrophe = &std_get_apostrophe;
844
+
845
+ ts->data = std_tz;
846
+ ts->destroy = &std_ts_destroy;
847
+ ts->clone_i = &std_ts_clone_i;
416
848
  ts->next = &std_next;
417
849
  return ts;
418
850
  }
419
851
 
420
- const char *ENGLISH_STOP_WORDS[] = {
421
- "a", "an", "and", "are", "as", "at", "be", "but", "by",
422
- "for", "if", "in", "into", "is", "it",
423
- "no", "not", "of", "on", "or", "s", "such",
424
- "t", "that", "the", "their", "then", "there", "these",
425
- "they", "this", "to", "was", "will", "with"
426
- };
852
+ TokenStream *mb_standard_tokenizer_create()
853
+ {
854
+ TokenStream *ts = ts_create();
855
+
856
+ StandardTokenizer *std_tz = ALLOC(StandardTokenizer);
857
+ std_tz->advance_to_start = &mb_std_advance_to_start;
858
+ std_tz->get_alpha = &mb_std_get_alpha;
859
+ std_tz->is_tok_char = &w_std_is_tok_char;
860
+ std_tz->get_apostrophe = &mb_std_get_apostrophe;
861
+
862
+ ts->data = std_tz;
863
+ ts->destroy = &std_ts_destroy;
864
+ ts->clone_i = &std_ts_clone_i;
865
+ ts->next = &std_next;
866
+ return ts;
867
+ }
427
868
 
428
869
  void filter_reset(TokenStream *ts, char *text)
429
870
  {
@@ -432,10 +873,10 @@ void filter_reset(TokenStream *ts, char *text)
432
873
 
433
874
  void filter_destroy(void *p)
434
875
  {
435
- TokenStream *ts = (TokenStream *)p;
436
- ts->sub_ts->destroy(ts->sub_ts);
437
- if (ts->token != NULL) tk_destroy(ts->token);
438
- free(ts);
876
+ TokenStream *tf = (TokenStream *)p;
877
+ if (tf->destroy_sub) tf->sub_ts->destroy(tf->sub_ts);
878
+ if (tf->token != NULL) tk_destroy(tf->token);
879
+ free(tf);
439
880
  }
440
881
 
441
882
  void sf_destroy(void *p)
@@ -445,40 +886,109 @@ void sf_destroy(void *p)
445
886
  filter_destroy(p);
446
887
  }
447
888
 
448
- Token *sf_next(TokenStream *ts)
889
+ void sf_clone_i_i(void *key, void *value, void *arg)
890
+ {
891
+ HshTable *wordtable = (HshTable *)arg;
892
+ char *w = estrdup(key);
893
+ h_set(wordtable, w, w);
894
+ }
895
+
896
+ void sf_clone_i(TokenStream *orig_ts, TokenStream *new_ts)
897
+ {
898
+ new_ts->data = h_new_str(&free, NULL);
899
+ h_each(orig_ts->data, &sf_clone_i_i, new_ts->data);
900
+ }
901
+
902
+ Token *sf_next(TokenStream *tf)
449
903
  {
450
904
  int pos_inc = 1;
451
- HshTable *words = (HshTable *)ts->data;
452
- Token *tk = ts->sub_ts->next(ts->sub_ts);
905
+ HshTable *words = (HshTable *)tf->data;
906
+ Token *tk = tf->sub_ts->next(tf->sub_ts);
453
907
  while ((tk != NULL) && (h_get(words, tk->text) != NULL)) {
454
- tk = ts->sub_ts->next(ts->sub_ts);
908
+ tk = tf->sub_ts->next(tf->sub_ts);
455
909
  pos_inc++;
456
910
  }
457
911
  if (tk != NULL) tk->pos_inc = pos_inc;
458
912
  return tk;
459
913
  }
460
914
 
461
- TokenStream *stop_filter_create_with_words(TokenStream *ts, char **words, int len)
915
+ TokenStream *stop_filter_create_with_words_len(TokenStream *ts,
916
+ const char **words, int len)
462
917
  {
463
918
  int i;
919
+ char *w;
464
920
  TokenStream *tf = ALLOC(TokenStream);
465
921
  tf->sub_ts = ts;
466
- HshTable *wordtable = h_new_str(NULL, NULL);
922
+ tf->destroy_sub = true;
923
+ HshTable *wordtable = h_new_str(&free, NULL);
467
924
  for (i = 0; i < len; i++) {
468
- h_set(wordtable, words[i], words[i]);
925
+ w = estrdup(words[i]);
926
+ h_set(wordtable, w, w);
927
+ }
928
+ tf->data = wordtable;
929
+ tf->token = NULL;
930
+ tf->next = &sf_next;
931
+ tf->reset = &filter_reset;
932
+ tf->destroy = &sf_destroy;
933
+ tf->clone_i = &sf_clone_i;
934
+ return tf;
935
+ }
936
+
937
+ TokenStream *stop_filter_create_with_words(TokenStream *ts, const char **words)
938
+ {
939
+ char *w;
940
+ TokenStream *tf = ALLOC(TokenStream);
941
+ tf->sub_ts = ts;
942
+ tf->destroy_sub = true;
943
+ HshTable *wordtable = h_new_str(&free, NULL);
944
+ while (*words) {
945
+ w = estrdup(*words);
946
+ h_set(wordtable, w, w);
947
+ words++;
469
948
  }
470
949
  tf->data = wordtable;
471
950
  tf->token = NULL;
472
951
  tf->next = &sf_next;
473
952
  tf->reset = &filter_reset;
474
953
  tf->destroy = &sf_destroy;
954
+ tf->clone_i = &sf_clone_i;
475
955
  return tf;
476
956
  }
477
957
 
478
958
  TokenStream *stop_filter_create(TokenStream *ts)
479
959
  {
480
- return stop_filter_create_with_words(ts,
481
- (char **)ENGLISH_STOP_WORDS, NELEMS(ENGLISH_STOP_WORDS));
960
+ return stop_filter_create_with_words(ts, FULL_ENGLISH_STOP_WORDS);
961
+ }
962
+
963
+ Token *mb_lcf_next(TokenStream *ts)
964
+ {
965
+ wchar_t wbuf[MAX_WORD_SIZE], *w;
966
+ //mbstate_t state = {0};
967
+ int i;
968
+ Token *tk = ts->sub_ts->next(ts->sub_ts);
969
+ if (tk == NULL) return tk;
970
+
971
+ i = mbstowcs(wbuf, tk->text, MAX_WORD_SIZE);
972
+ w = wbuf;
973
+ while (*w != 0) {
974
+ *w = towlower(*w);
975
+ w++;
976
+ }
977
+ wcstombs(tk->text, wbuf, MAX_WORD_SIZE);
978
+ return tk;
979
+ }
980
+
981
+ TokenStream *mb_lowercase_filter_create(TokenStream *ts)
982
+ {
983
+ TokenStream *tf = ALLOC(TokenStream);
984
+ tf->token = NULL;
985
+ tf->next = &mb_lcf_next;
986
+ tf->reset = &filter_reset;
987
+ tf->destroy = &filter_destroy;
988
+ tf->sub_ts = ts;
989
+ tf->destroy_sub = true;
990
+ tf->clone_i = NULL;
991
+ return tf;
482
992
  }
483
993
 
484
994
  Token *lcf_next(TokenStream *ts)
@@ -501,48 +1011,199 @@ TokenStream *lowercase_filter_create(TokenStream *ts)
501
1011
  tf->reset = &filter_reset;
502
1012
  tf->destroy = &filter_destroy;
503
1013
  tf->sub_ts = ts;
1014
+ tf->destroy_sub = true;
1015
+ tf->clone_i = NULL;
504
1016
  return tf;
505
1017
  }
506
1018
 
507
- Analyzer *letter_analyzer_create()
1019
+ typedef struct StemFilter {
1020
+ struct sb_stemmer *stemmer;
1021
+ char *algorithm;
1022
+ char *charenc;
1023
+ } StemFilter;
1024
+
1025
+ void stemf_destroy(void *p)
508
1026
  {
509
- Analyzer *a = ALLOC(Analyzer);
510
- a->data = NULL;
511
- a->current_ts = lowercase_filter_create(letter_tokenizer_create());
512
- a->destroy = &a_standard_destroy;
513
- a->get_ts = &a_standard_get_ts;
514
- return a;
1027
+ TokenStream *ts = (TokenStream *)p;
1028
+ StemFilter *stemf = (StemFilter *)ts->data;
1029
+ sb_stemmer_delete(stemf->stemmer);
1030
+ free(stemf->algorithm);
1031
+ free(stemf->charenc);
1032
+ free(stemf);
1033
+ filter_destroy(ts);
515
1034
  }
516
1035
 
1036
+ Token *stemf_next(TokenStream *ts)
1037
+ {
1038
+ int len;
1039
+ const sb_symbol *stemmed;
1040
+ struct sb_stemmer *stemmer = ((StemFilter *)ts->data)->stemmer;
1041
+ Token *tk = ts->sub_ts->next(ts->sub_ts);
1042
+ if (tk == NULL) return tk;
1043
+ stemmed = sb_stemmer_stem(stemmer, (sb_symbol *)tk->text, strlen(tk->text));
1044
+ len = sb_stemmer_length(stemmer);
1045
+ if (len >= MAX_WORD_SIZE) len = MAX_WORD_SIZE - 1;
1046
+ memcpy(tk->text, stemmed, len);
1047
+ tk->text[len] = '\0';
1048
+ return tk;
1049
+ }
517
1050
 
518
- Analyzer *standard_analyzer_create_with_words(char **words, int len)
1051
+ void stemf_clone_i(TokenStream *orig_ts, TokenStream *new_ts)
519
1052
  {
520
- Analyzer *a = ALLOC(Analyzer);
521
- a->data = NULL;
522
- a->current_ts =
523
- stop_filter_create_with_words(
1053
+ StemFilter *orig_stemf = (StemFilter *)orig_ts->data;
1054
+ StemFilter *stemf = ALLOC(StemFilter);
1055
+ stemf->stemmer = sb_stemmer_new(orig_stemf->algorithm, orig_stemf->charenc);
1056
+ stemf->algorithm = orig_stemf->algorithm ? estrdup(orig_stemf->algorithm) : NULL;
1057
+ stemf->charenc = orig_stemf->charenc ? estrdup(orig_stemf->charenc) : NULL;
1058
+ new_ts->data = stemf;
1059
+ }
1060
+
1061
+ TokenStream *stem_filter_create(TokenStream *ts, const char * algorithm,
1062
+ const char * charenc)
1063
+ {
1064
+ TokenStream *tf = ALLOC(TokenStream);
1065
+ StemFilter *stemf = ALLOC(StemFilter);
1066
+ stemf->stemmer = sb_stemmer_new(algorithm, charenc);
1067
+ stemf->algorithm = algorithm ? estrdup(algorithm) : NULL;
1068
+ stemf->charenc = charenc ? estrdup(charenc) : NULL;
1069
+ tf->data = stemf;
1070
+
1071
+ tf->token = NULL;
1072
+ tf->next = &stemf_next;
1073
+ tf->reset = &filter_reset;
1074
+ tf->destroy = &stemf_destroy;
1075
+ tf->clone_i = &stemf_clone_i;
1076
+ tf->sub_ts = ts;
1077
+ tf->destroy_sub = true;
1078
+ return tf;
1079
+ }
1080
+
1081
+ Analyzer *standard_analyzer_create_with_words_len(
1082
+ const char **words, int len, bool lowercase)
1083
+ {
1084
+ TokenStream *ts;
1085
+ if (lowercase) {
1086
+ ts = stop_filter_create_with_words_len(
524
1087
  lowercase_filter_create(standard_tokenizer_create()), words, len);
525
- a->destroy = &a_standard_destroy;
526
- a->get_ts = &a_standard_get_ts;
527
- return a;
1088
+ } else {
1089
+ ts = stop_filter_create_with_words_len(
1090
+ standard_tokenizer_create(), words, len);
1091
+ }
1092
+ return analyzer_create(NULL, ts, NULL, NULL);
1093
+ }
1094
+
1095
+ Analyzer *standard_analyzer_create_with_words(const char **words, bool lowercase)
1096
+ {
1097
+ TokenStream *ts;
1098
+ if (lowercase) {
1099
+ ts = stop_filter_create_with_words(
1100
+ lowercase_filter_create(standard_tokenizer_create()), words);
1101
+ } else {
1102
+ ts = stop_filter_create_with_words(
1103
+ standard_tokenizer_create(), words);
1104
+ }
1105
+ return analyzer_create(NULL, ts, NULL, NULL);
1106
+ }
1107
+
1108
+ Analyzer *mb_standard_analyzer_create_with_words_len(
1109
+ const char **words, int len, bool lowercase)
1110
+ {
1111
+ TokenStream *ts;
1112
+ if (lowercase) {
1113
+ ts = stop_filter_create_with_words_len(
1114
+ mb_lowercase_filter_create(mb_standard_tokenizer_create()), words, len);
1115
+ } else {
1116
+ ts = stop_filter_create_with_words_len(
1117
+ mb_standard_tokenizer_create(), words, len);
1118
+ }
1119
+ return analyzer_create(NULL, ts, NULL, NULL);
1120
+ }
1121
+
1122
+ Analyzer *mb_standard_analyzer_create_with_words(
1123
+ const char **words, bool lowercase)
1124
+ {
1125
+ TokenStream *ts;
1126
+ if (lowercase) {
1127
+ ts = stop_filter_create_with_words(
1128
+ mb_lowercase_filter_create(mb_standard_tokenizer_create()), words);
1129
+ } else {
1130
+ ts = stop_filter_create_with_words(mb_standard_tokenizer_create(), words);
1131
+ }
1132
+ return analyzer_create(NULL, ts, NULL, NULL);
1133
+ }
1134
+
1135
+ Analyzer *standard_analyzer_create(bool lowercase)
1136
+ {
1137
+ return standard_analyzer_create_with_words(FULL_ENGLISH_STOP_WORDS, lowercase);
1138
+ }
1139
+
1140
+ Analyzer *mb_standard_analyzer_create(bool lowercase)
1141
+ {
1142
+ return mb_standard_analyzer_create_with_words(FULL_ENGLISH_STOP_WORDS, lowercase);
1143
+ }
1144
+
1145
+ /****************************************************************************
1146
+ *
1147
+ * PerFieldAnalyzer
1148
+ *
1149
+ ****************************************************************************/
1150
+
1151
+ typedef struct PerFieldAnalyzer {
1152
+ HshTable *dict;
1153
+ Analyzer *def;
1154
+ bool destroy_subs : 1;
1155
+ } PerFieldAnalyzer;
1156
+
1157
+ void pfa_destroy(void *p)
1158
+ {
1159
+ Analyzer *self = (Analyzer *)p;
1160
+ PerFieldAnalyzer *pfa = (PerFieldAnalyzer *)self->data;
1161
+ h_destroy(pfa->dict);
1162
+
1163
+ if (pfa->destroy_subs) a_destroy(pfa->def);
1164
+ free(pfa);
1165
+ free(self);
1166
+ }
1167
+
1168
+ TokenStream *pfa_get_ts(Analyzer *self, char *field, char *text)
1169
+ {
1170
+ PerFieldAnalyzer *pfa = (PerFieldAnalyzer *)self->data;
1171
+ Analyzer *a = h_get(pfa->dict, field);
1172
+ if (a == NULL) a = pfa->def;
1173
+ return a_get_ts(a, field, text);
1174
+ }
1175
+
1176
+ void pfa_sub_a_destroy(void *p)
1177
+ {
1178
+ Analyzer *a = (Analyzer *)p;
1179
+ a->destroy(a);
1180
+ }
1181
+
1182
+ void pfa_add_field(Analyzer *self, char *field, Analyzer *analyzer)
1183
+ {
1184
+ PerFieldAnalyzer *pfa = (PerFieldAnalyzer *)self->data;
1185
+ h_set(pfa->dict, estrdup(field), analyzer);
528
1186
  }
529
1187
 
530
- Analyzer *standard_analyzer_create()
1188
+ Analyzer *per_field_analyzer_create(Analyzer *def, bool destroy_subs)
531
1189
  {
532
- return standard_analyzer_create_with_words(
533
- (char **)ENGLISH_STOP_WORDS, NELEMS(ENGLISH_STOP_WORDS));
1190
+ PerFieldAnalyzer *pfa = ALLOC(PerFieldAnalyzer);
1191
+ pfa->def = def;
1192
+ pfa->destroy_subs = destroy_subs;
1193
+ pfa->dict = destroy_subs ? h_new_str(&free, &pfa_sub_a_destroy)
1194
+ : h_new_str(&free, NULL);
1195
+ return analyzer_create(pfa, NULL, &pfa_destroy, &pfa_get_ts);
534
1196
  }
535
1197
 
536
1198
  #ifdef ALONE
537
1199
  int main(int argc, char **argv)
538
1200
  {
539
1201
  char buf[10000];
540
- Analyzer *a = standard_analyzer_create();
1202
+ Analyzer *a = standard_analyzer_create(true);
541
1203
  TokenStream *ts;
542
1204
  Token *tk;
543
1205
  while (fgets(buf, 9999, stdin) != NULL) {
544
1206
  ts = a->get_ts(a, "hello", buf);
545
- ts->pos = 0;
546
1207
  while ((tk = ts->next(ts)) != NULL) {
547
1208
  printf("<%s:%ld:%ld> ", tk->text, tk->start, tk->end);
548
1209
  }