ferret 0.3.2 → 0.9.0

Files changed (141)
  1. data/CHANGELOG +9 -0
  2. data/Rakefile +51 -25
  3. data/ext/analysis.c +553 -0
  4. data/ext/analysis.h +76 -0
  5. data/ext/array.c +83 -0
  6. data/ext/array.h +19 -0
  7. data/ext/bitvector.c +164 -0
  8. data/ext/bitvector.h +29 -0
  9. data/ext/compound_io.c +335 -0
  10. data/ext/document.c +336 -0
  11. data/ext/document.h +87 -0
  12. data/ext/ferret.c +88 -47
  13. data/ext/ferret.h +43 -109
  14. data/ext/field.c +395 -0
  15. data/ext/filter.c +103 -0
  16. data/ext/fs_store.c +352 -0
  17. data/ext/global.c +219 -0
  18. data/ext/global.h +73 -0
  19. data/ext/hash.c +446 -0
  20. data/ext/hash.h +80 -0
  21. data/ext/hashset.c +141 -0
  22. data/ext/hashset.h +37 -0
  23. data/ext/helper.c +11 -0
  24. data/ext/helper.h +5 -0
  25. data/ext/inc/lang.h +41 -0
  26. data/ext/ind.c +389 -0
  27. data/ext/index.h +884 -0
  28. data/ext/index_io.c +269 -415
  29. data/ext/index_rw.c +2543 -0
  30. data/ext/lang.c +31 -0
  31. data/ext/lang.h +41 -0
  32. data/ext/priorityqueue.c +228 -0
  33. data/ext/priorityqueue.h +44 -0
  34. data/ext/q_boolean.c +1331 -0
  35. data/ext/q_const_score.c +154 -0
  36. data/ext/q_fuzzy.c +287 -0
  37. data/ext/q_match_all.c +142 -0
  38. data/ext/q_multi_phrase.c +343 -0
  39. data/ext/q_parser.c +2180 -0
  40. data/ext/q_phrase.c +657 -0
  41. data/ext/q_prefix.c +75 -0
  42. data/ext/q_range.c +247 -0
  43. data/ext/q_span.c +1566 -0
  44. data/ext/q_term.c +308 -0
  45. data/ext/q_wildcard.c +146 -0
  46. data/ext/r_analysis.c +255 -0
  47. data/ext/r_doc.c +578 -0
  48. data/ext/r_index_io.c +996 -0
  49. data/ext/r_qparser.c +158 -0
  50. data/ext/r_search.c +2321 -0
  51. data/ext/r_store.c +263 -0
  52. data/ext/r_term.c +219 -0
  53. data/ext/ram_store.c +447 -0
  54. data/ext/search.c +524 -0
  55. data/ext/search.h +1065 -0
  56. data/ext/similarity.c +143 -39
  57. data/ext/sort.c +661 -0
  58. data/ext/store.c +35 -0
  59. data/ext/store.h +152 -0
  60. data/ext/term.c +704 -143
  61. data/ext/termdocs.c +599 -0
  62. data/ext/vector.c +594 -0
  63. data/lib/ferret.rb +9 -10
  64. data/lib/ferret/analysis/analyzers.rb +2 -2
  65. data/lib/ferret/analysis/standard_tokenizer.rb +1 -1
  66. data/lib/ferret/analysis/token.rb +14 -14
  67. data/lib/ferret/analysis/token_filters.rb +3 -3
  68. data/lib/ferret/document/field.rb +16 -17
  69. data/lib/ferret/index/document_writer.rb +4 -4
  70. data/lib/ferret/index/index.rb +39 -23
  71. data/lib/ferret/index/index_writer.rb +2 -2
  72. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +1 -8
  73. data/lib/ferret/index/segment_term_vector.rb +4 -4
  74. data/lib/ferret/index/term.rb +5 -1
  75. data/lib/ferret/index/term_vector_offset_info.rb +6 -6
  76. data/lib/ferret/index/term_vectors_io.rb +5 -5
  77. data/lib/ferret/query_parser/query_parser.tab.rb +81 -77
  78. data/lib/ferret/search.rb +1 -1
  79. data/lib/ferret/search/boolean_query.rb +2 -1
  80. data/lib/ferret/search/field_sorted_hit_queue.rb +3 -3
  81. data/lib/ferret/search/fuzzy_query.rb +2 -1
  82. data/lib/ferret/search/index_searcher.rb +3 -0
  83. data/lib/ferret/search/{match_all_docs_query.rb → match_all_query.rb} +7 -7
  84. data/lib/ferret/search/multi_phrase_query.rb +6 -5
  85. data/lib/ferret/search/phrase_query.rb +3 -6
  86. data/lib/ferret/search/prefix_query.rb +4 -4
  87. data/lib/ferret/search/sort.rb +3 -1
  88. data/lib/ferret/search/sort_field.rb +9 -9
  89. data/lib/ferret/search/spans/near_spans_enum.rb +1 -1
  90. data/lib/ferret/search/spans/span_near_query.rb +1 -1
  91. data/lib/ferret/search/spans/span_weight.rb +1 -1
  92. data/lib/ferret/search/spans/spans_enum.rb +7 -7
  93. data/lib/ferret/store/fs_store.rb +10 -6
  94. data/lib/ferret/store/ram_store.rb +3 -3
  95. data/lib/rferret.rb +36 -0
  96. data/test/functional/thread_safety_index_test.rb +2 -2
  97. data/test/test_helper.rb +16 -2
  98. data/test/unit/analysis/c_token.rb +25 -0
  99. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +1 -1
  100. data/test/unit/analysis/tc_standard_analyzer.rb +1 -1
  101. data/test/unit/document/{tc_document.rb → c_document.rb} +0 -0
  102. data/test/unit/document/c_field.rb +98 -0
  103. data/test/unit/document/tc_field.rb +0 -66
  104. data/test/unit/index/{tc_index.rb → c_index.rb} +62 -6
  105. data/test/unit/index/{tc_index_reader.rb → c_index_reader.rb} +51 -10
  106. data/test/unit/index/{tc_index_writer.rb → c_index_writer.rb} +0 -4
  107. data/test/unit/index/{tc_term.rb → c_term.rb} +1 -3
  108. data/test/unit/index/{tc_term_vector_offset_info.rb → c_term_voi.rb} +5 -5
  109. data/test/unit/index/tc_segment_term_vector.rb +2 -2
  110. data/test/unit/index/tc_term_vectors_io.rb +4 -4
  111. data/test/unit/query_parser/c_query_parser.rb +138 -0
  112. data/test/unit/search/{tc_filter.rb → c_filter.rb} +24 -24
  113. data/test/unit/search/{tc_fuzzy_query.rb → c_fuzzy_query.rb} +0 -0
  114. data/test/unit/search/{tc_index_searcher.rb → c_index_searcher.rb} +9 -26
  115. data/test/unit/search/{tc_search_and_sort.rb → c_search_and_sort.rb} +15 -15
  116. data/test/unit/search/{tc_sort.rb → c_sort.rb} +2 -1
  117. data/test/unit/search/c_sort_field.rb +27 -0
  118. data/test/unit/search/{tc_spans.rb → c_spans.rb} +0 -0
  119. data/test/unit/search/tc_sort_field.rb +7 -20
  120. data/test/unit/store/c_fs_store.rb +76 -0
  121. data/test/unit/store/c_ram_store.rb +35 -0
  122. data/test/unit/store/m_store.rb +34 -0
  123. data/test/unit/store/m_store_lock.rb +68 -0
  124. data/test/unit/store/tc_fs_store.rb +0 -53
  125. data/test/unit/store/tc_ram_store.rb +0 -20
  126. data/test/unit/store/tm_store.rb +0 -30
  127. data/test/unit/store/tm_store_lock.rb +0 -66
  128. metadata +84 -31
  129. data/ext/Makefile +0 -140
  130. data/ext/ferret_ext.so +0 -0
  131. data/ext/priority_queue.c +0 -232
  132. data/ext/ram_directory.c +0 -321
  133. data/ext/segment_merge_queue.c +0 -37
  134. data/ext/segment_term_enum.c +0 -326
  135. data/ext/string_helper.c +0 -42
  136. data/ext/tags +0 -344
  137. data/ext/term_buffer.c +0 -230
  138. data/ext/term_infos_reader.c +0 -54
  139. data/ext/terminfo.c +0 -160
  140. data/ext/token.c +0 -93
  141. data/ext/util.c +0 -12
data/CHANGELOG ADDED
@@ -0,0 +1,9 @@
+ 20060316:
+  * changed Token#term_text to Token#text
+  * changed Token#position_increment to Token#pos_inc
+  * changed order of args to Token.new. Now Token.new(text, start_offset,
+    end_offset, pos_inc=1, type="text"). NOTE: type does nothing.
+  * changed TermVectorOffsetInfo#start_offset to TermVectorOffsetInfo#start
+  * changed TermVectorOffsetInfo#end_offset to TermVectorOffsetInfo#end
+  * added :id_field option to Index::Index class.
+
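In Ruby terms the renames above look like this (a minimal sketch; Token lives in Ferret::Analysis, and the accessor names are the 0.9.0 ones listed in this changelog):

    require 'ferret'
    include Ferret::Analysis

    # New argument order: text, start_offset, end_offset, pos_inc (type is ignored).
    tk = Token.new("ferret", 0, 6, 1)
    tk.text     # => "ferret"  (formerly Token#term_text)
    tk.pos_inc  # => 1         (formerly Token#position_increment)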
data/Rakefile CHANGED
@@ -9,7 +9,7 @@ require 'rake/testtask'
  require 'rake/rdoctask'
  require 'rake/clean'
  require 'rake_utils/code_statistics'
- require 'lib/ferret'
+ require 'lib/rferret'

  begin
    require 'rubygems'
@@ -30,18 +30,32 @@ def announce(msg='')
  end

  $VERBOSE = nil
+
+ EXT = "ferret_ext.so"
+ EXT_SRC = FileList["src/*/*.[ch]"]
+ EXT_SRC_DEST = EXT_SRC.map {|fn| File.join("ext", File.basename(fn))}
+ SRC = (FileList["ext/*.[ch]"] + EXT_SRC_DEST).uniq
+
  CLEAN.include(FileList['**/*.o', 'InstalledFiles', '.config'])
  CLOBBER.include(FileList['**/*.so'], 'ext/Makefile')

  task :default => :all_tests
  desc "Run all tests"
- task :all_tests => [ :test_units, :test_functional ]
+ task :all_tests => [ :test_runits, :test_cunits, :test_functional ]

  desc "Generate API documentation, and show coding stats"
  task :doc => [ :stats, :appdoc ]

- desc "run unit tests in test/unit"
- Rake::TestTask.new("test_units" => :parsers) do |t|
+ desc "run unit tests in test/unit for pure ruby ferret"
+ Rake::TestTask.new("test_runits" => :parsers) do |t|
+   t.ruby_opts = ["-r 'lib/rferret'"]
+   t.libs << "test/unit"
+   t.pattern = 'test/unit/ts_*.rb'
+   t.verbose = true
+ end
+
+ desc "run unit tests in test/unit for C ferret"
+ Rake::TestTask.new("test_cunits" => :ext) do |t|
    t.libs << "test/unit"
    t.pattern = 'test/unit/t[cs]_*.rb'
    t.verbose = true
@@ -84,22 +98,28 @@ rd = Rake::RDocTask.new("appdoc") do |rdoc|
    rdoc.rdoc_files.include('lib/**/*.rb')
  end

- EXT = "ferret_ext.so"
+ EXT_SRC.each do |fn|
+   dest_fn = File.join("ext", File.basename(fn))
+   file dest_fn => fn do |t|
+     cp fn, dest_fn
+   end
+ end

  desc "Build the extension"
- task :ext => "ext/#{EXT}"
+ task :ext => ["ext/#{EXT}"] + SRC

- file "ext/#{EXT}" => "ext/Makefile" do
+ file "ext/#{EXT}" => ["ext/Makefile"] do
+   cp "ext/inc/lang.h", "ext/lang.h"
    sh "cd ext; make"
  end

- file "ext/Makefile" do
+ file "ext/Makefile" => SRC do
    sh "cd ext; ruby extconf.rb"
  end

  # Make Parsers ---------------------------------------------------------------

- RACC_SRC = FileList["**/*.y"]
+ RACC_SRC = FileList["lib/**/*.y"]
  RACC_OUT = RACC_SRC.collect { |fn| fn.sub(/\.y$/, '.tab.rb') }

  task :parsers => RACC_OUT
@@ -195,8 +215,9 @@ end
  # Creating a release

  desc "Make a new release"
- task :prerelease => [:clobber, :all_tests, :parsers]
- task :package => [:prerelease]
+ task :prerelease => [:all_tests, :clobber]
+ task :repackage => EXT_SRC_DEST
+ task :package => EXT_SRC_DEST
  task :tag => [:prerelease]
  task :update_version => [:prerelease]
  task :release => [:tag, :update_version, :package] do
@@ -229,7 +250,7 @@ task :prerelease do
  end

  # Are all source files checked in?
- data = `svn -q status`
+ data = `svn -q --ignore-externals status`
  unless data =~ /^$/
    fail "'svn -q status' is not clean ... do you have unchecked-in files?"
  end
@@ -237,28 +258,33 @@ task :prerelease do
  announce "No outstanding checkins found ... OK"
  end

+ def reversion(fn)
+   open(fn) do |ferret_in|
+     open(fn + ".new", "w") do |ferret_out|
+       ferret_in.each do |line|
+         if line =~ /^ VERSION\s*=\s*/
+           ferret_out.puts " VERSION = '#{PKG_VERSION}'"
+         else
+           ferret_out.puts line
+         end
+       end
+     end
+   end
+ end
+
  task :update_version => [:prerelease] do
    if PKG_VERSION == CURRENT_VERSION
      announce "No version change ... skipping version update"
    else
      announce "Updating Ferret version to #{PKG_VERSION}"
-     open("lib/ferret.rb") do |ferret_in|
-       open("lib/ferret.rb.new", "w") do |ferret_out|
-         ferret_in.each do |line|
-           if line =~ /^ VERSION\s*=\s*/
-             ferret_out.puts " VERSION = '#{PKG_VERSION}'"
-           else
-             ferret_out.puts line
-           end
-         end
-       end
-     end
+     reversion("lib/ferret.rb")
+     reversion("lib/rferret.rb")
      if ENV['RELTEST']
        announce "Release Task Testing, skipping committing of new version"
      else
-       mv "lib/ferret.rb.new", "lib/ferret.rb"
+       mv "lib/rferret.rb.new", "lib/rferret.rb"
      end
-     sh %{svn ci -m "Updated to version #{PKG_VERSION}" lib/ferret.rb}
+     sh %{svn ci -m "Updated to version #{PKG_VERSION}" lib/rferret.rb}
    end
  end
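The build wiring above copies the C sources from src/ into ext/ with one file task per source, so rake only re-copies (and rebuilds) what actually changed. A minimal Rakefile sketch of the same FileList pattern (illustrative paths and task name, not the gem's exact layout):

    src  = FileList["src/*/*.[ch]"]
    dest = src.map { |fn| File.join("ext", File.basename(fn)) }

    # One file task per source: the ext/ copy is refreshed when the original changes.
    src.zip(dest).each do |from, to|
      file to => from do
        cp from, to
      end
    end

    task :copy_ext => dest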
data/ext/analysis.c ADDED
@@ -0,0 +1,553 @@
+ #include <analysis.h>
+ #include <string.h>
+ #include <ctype.h>
+ #include <hash.h>
+
+ Token *tk_create()
+ {
+   return ALLOC(Token);
+ }
+
+ void tk_destroy(void *p)
+ {
+   free(p);
+ }
+
+ inline Token *tk_set(Token *tk, char *text, int tlen, int start, int end, int pos_inc)
+ {
+   if (tlen >= MAX_WORD_SIZE) tlen = MAX_WORD_SIZE - 1;
+   memcpy(tk->text, text, sizeof(char) * tlen);
+   tk->text[tlen] = '\0';
+   tk->start = start;
+   tk->end = end;
+   tk->pos_inc = pos_inc;
+   return tk;
+ }
+
+ inline Token *tk_set_no_len(Token *tk, char *text, int start, int end, int pos_inc)
+ {
+   return tk_set(tk, text, strlen(text), start, end, pos_inc);
+ }
+
+ int tk_eq(Token *tk1, Token *tk2)
+ {
+   if (strcmp((char *)tk1->text, (char *)tk2->text) == 0 &&
+       tk1->start == tk2->start && tk1->end == tk2->end)
+     return true;
+   else
+     return false;
+ }
+
+ int tk_cmp(Token *tk1, Token *tk2)
+ {
+   int cmp;
+   if (tk1->start > tk2->start) {
+     cmp = 1;
+   } else if (tk1->start < tk2->start) {
+     cmp = -1;
+   } else {
+     if (tk1->end > tk2->end) {
+       cmp = 1;
+     } else if (tk1->end < tk2->end) {
+       cmp = -1;
+     } else {
+       cmp = strcmp((char *)tk1->text, (char *)tk2->text);
+     }
+   }
+   return cmp;
+ }
+
+ void ts_standard_destroy(void *p)
+ {
+   TokenStream *ts = (TokenStream *)p;
+   tk_destroy(ts->token);
+   free(p);
+ }
+
+ void ts_reset(TokenStream *ts, char *text)
+ {
+   ts->text = text;
+   ts->pos = 0;
+ }
+
+ TokenStream *ts_create()
+ {
+   TokenStream *ts = ALLOC(TokenStream);
+   ts->pos = -1;
+   ts->text = NULL;
+   ts->token = tk_create();
+   ts->destroy = &ts_standard_destroy;
+   ts->reset = &ts_reset;
+   return ts;
+ }
+
+ Token *wst_next(TokenStream *ts)
+ {
+   int i = ts->pos;
+   int start, end;
+   char *text = ts->text;
+
+   while (text[i] != '\0' && isspace(text[i]))
+     i++;
+   if (text[i] == '\0')
+     return NULL;
+
+   start = i;
+   while (text[i] != '\0' && !isspace(text[i]))
+     i++;
+   ts->pos = end = i;
+   tk_set(ts->token, text+start, end-start, start, end, 1);
+   return ts->token;
+ }
+
+ TokenStream *whitespace_tokenizer_create()
+ {
+   TokenStream *ts = ts_create();
+   ts->next = &wst_next;
+   return ts;
+ }
+
+ Token *lt_next(TokenStream *ts)
+ {
+   int i = ts->pos;
+   int start, end;
+   char *text = ts->text;
+
+   while (text[i] != '\0' && !isalpha(text[i]))
+     i++;
+   if (text[i] == '\0')
+     return NULL;
+
+   start = i;
+   while (text[i] != '\0' && isalpha(text[i]))
+     i++;
+   ts->pos = end = i;
+   tk_set(ts->token, text+start, end-start, start, end, 1);
+   return ts->token;
+ }
+
+ TokenStream *letter_tokenizer_create()
+ {
+   TokenStream *ts = ts_create();
+   ts->next = &lt_next;
+   return ts;
+ }
+
+ void a_standard_destroy(void *p)
+ {
+   Analyzer *a = (Analyzer *)p;
+   ts_destroy(a->current_ts);
+   free(p);
+ }
+
+ TokenStream *a_standard_get_ts(Analyzer *a, char *field, char *text)
+ {
+   a->current_ts->reset(a->current_ts, text);
+   return a->current_ts;
+ }
+
+ Analyzer *whitespace_analyzer_create()
+ {
+   Analyzer *a = ALLOC(Analyzer);
+   a->data = NULL;
+   a->current_ts = whitespace_tokenizer_create();
+   a->destroy = &a_standard_destroy;
+   a->get_ts = &a_standard_get_ts;
+   return a;
+ }
+
+ int std_get_alpha(char *input, char *token)
+ {
+   int i = 0;
+   while (input[i] != '\0' && isalpha(input[i])) {
+     token[i] = input[i];
+     i++;
+   }
+   return i;
+ }
+
+ int std_get_alnum(char *input, char *token)
+ {
+   int i = 0;
+   while (input[i] != '\0' && isalnum(input[i])) {
+     token[i] = input[i];
+     i++;
+   }
+   return i;
+ }
+
+ int isnumpunc(char c)
+ {
+   return (c == '.' || c == ',' || c == '\\' || c == '/' || c == '_' || c == '-');
+ }
+
+ int isurlpunc(char c)
+ {
+   return (c == '.' || c == '/' || c == '-' || c == '_');
+ }
+
+ int isurlc(char c)
+ {
+   return (c == '.' || c == '/' || c == '-' || c == '_' || isalnum(c));
+ }
+
+ int isurlxatpunc(char c)
+ {
+   return (c == '.' || c == '/' || c == '-' || c == '_' || c == '@');
+ }
+
+ int isurlxatc(char c)
+ {
+   return (c == '.' || c == '/' || c == '-' || c == '_' || c == '@' || isalnum(c));
+ }
+
+ int isstdtokchar(char c)
+ {
+   if (isspace(c)) return false; // most common so check first.
+   if (isalnum(c) || isnumpunc(c) || c == '&' ||
+       c == '@' || c == '\'' || c == ':')
+     return true;
+   return false;
+ }
+
+ /* (alnum)((punc)(alnum))+ where every second sequence of alnum must contain at
+  * least one digit, e.g. "23", "2006/03/16" or "1,000,000".
+  * (alnum) = [a-zA-Z0-9]
+  * (punc) = [_\/.,-]
+  */
+ int std_get_number(char *input)
+ {
+   int i = 0;
+   int count = 0;
+   int last_seen_digit = 2;
+   int seen_digit = false;
+
+   while (last_seen_digit >= 0) {
+     while ((input[i] != '\0') && isalnum(input[i])) {
+       if ((last_seen_digit < 2) && isdigit(input[i])) last_seen_digit = 2;
+       if ((seen_digit == false) && isdigit(input[i])) seen_digit = true;
+       i++;
+     }
+     last_seen_digit--;
+     if (!isnumpunc(input[i]) || !isalnum(input[i+1])) {
+       if (last_seen_digit >= 0)
+         count = i;
+       break;
+     }
+     count = i;
+     i++;
+   }
+   if (seen_digit)
+     return count;
+   else
+     return 0;
+ }
+
+ int std_get_apostrophe(char *input)
+ {
+   int i = 0;
+
+   while (isalpha(input[i]) || input[i] == '\'')
+     i++;
+
+   return i;
+ }
+
+ int std_get_url(char *input, char *token)
+ {
+   int i = 0;
+
+   while (isurlc(input[i])) {
+     if (isurlpunc(input[i]) && isurlpunc(input[i-1]))
+       break; // can't have two punctuation chars in a row
+     token[i] = input[i];
+     i++;
+   }
+
+   // strip trailing punctuation
+   while (isurlpunc(input[i-1])) i--;
+
+   return i;
+ }
+
+ /* Company names can contain '@' and '&' like AT&T and Excite@Home. Let's
+  * keep them together as single tokens.
+  */
+ int std_get_company_name(char *input)
+ {
+   int i = 0;
+   while (isalpha(input[i]) || input[i] == '@' || input[i] == '&')
+     i++;
+
+   return i;
+ }
+
+ Token *std_next(TokenStream *ts)
+ {
+   int i = ts->pos, j;
+   int start;
+   char *text = ts->text;
+   char token[MAX_WORD_SIZE];
+   int token_i = 0;
+   int len;
+   int num_end = 0;
+   int is_acronym;
+   int seen_at_symbol;
+
+   while (text[i] != '\0' && !isalnum(text[i]))
+     i++;
+   if (text[i] == '\0')
+     return NULL;
+
+   start = i;
+   if (isdigit(text[i])) {
+     i += std_get_number(text + i);
+     ts->pos = i;
+     tk_set(ts->token, text+start, i - start, start, ts->pos, 1);
+   } else {
+     token_i = std_get_alpha(text + i, token);
+     i += token_i;
+
+     if (!isstdtokchar(text[i])) {
+       // very common case, i.e. a plain word, so check and return
+       tk_set(ts->token, text+start, i-start, start, i, 1);
+       ts->pos = i;
+       return ts->token;
+     }
+
+     if (text[i] == '\'') { // apostrophe case.
+       i += std_get_apostrophe(text + i);
+       ts->pos = i;
+       len = i - start;
+       // strip possessive
+       if ((text[i-1] == 's' || text[i-1] == 'S') && text[i-2] == '\'')
+         len -= 2;
+       tk_set(ts->token, text+start, len, start, i, 1);
+       return ts->token;
+     }
+     if (text[i] == '&') { // company name case.
+       i += std_get_company_name(text + i);
+       ts->pos = i;
+       tk_set(ts->token, text+start, i - start, start, i, 1);
+       return ts->token;
+     }
+
+     if (isdigit(text[i]) || isnumpunc(text[i])) { // possibly a number
+       num_end = start + std_get_number(text + start);
+       if (!isstdtokchar(text[num_end])) { // we won't find a longer token
+         ts->pos = num_end;
+         tk_set(ts->token, text+start, num_end-start, start, ts->pos, 1);
+         return ts->token;
+       }
+       // else there may be a longer token so check
+     }
+
+     if (text[i] == ':' && text[i+1] == '/' && text[i+2] == '/') {
+       // check for a known url start
+       token[token_i] = '\0';
+       i += 3;
+       while (text[i] == '/') i++;
+       if (isalpha(text[i]) &&
+           (strcmp(token, "ftp") == 0 ||
+            strcmp(token, "http") == 0 ||
+            strcmp(token, "https") == 0 ||
+            strcmp(token, "file") == 0)) {
+         len = std_get_url(text + i, token); // dispose of first part of the URL
+       } else { // still treat as url but keep the first part
+         token_i = i - start;
+         memcpy(token, text + start, token_i * sizeof(char));
+         len = token_i + std_get_url(text + i, token + token_i); // keep start
+       }
+       ts->pos = i + len;
+       token[len] = 0;
+       tk_set(ts->token, token, len, start, ts->pos, 1);
+       return ts->token;
+     }
+
+     // now see how long a url we can find.
+     is_acronym = true;
+     seen_at_symbol = false;
+     while (isurlxatc(text[i])) {
+       if (is_acronym && !isalpha(text[i]) && (text[i] != '.')) {
+         is_acronym = false;
+       }
+       if (isurlxatpunc(text[i]) && isurlxatpunc(text[i-1]))
+         break; // can't have two punctuation characters in a row
+       if (text[i] == '@') {
+         if (seen_at_symbol)
+           break; // we can only have one @ symbol
+         else
+           seen_at_symbol = true;
+       }
+       i++;
+     }
+     while (isurlxatpunc(text[i-1])) i--; // strip trailing punctuation
+     if (i > num_end) {
+       ts->pos = i;
+
+       if (is_acronym) { // check that it is one letter followed by one '.'
+         for (j = start; j < i-1; j++) {
+           if (isalpha(text[j]) && (text[j+1] != '.')) is_acronym = false;
+         }
+       }
+       if (is_acronym) { // strip '.'s
+         for (j = start + token_i; j < i; j++) {
+           if (text[j] != '.') {
+             token[token_i] = text[j];
+             token_i++;
+           }
+         }
+         tk_set(ts->token, token, token_i, start, ts->pos, 1);
+       } else { // just return the url as is
+         tk_set(ts->token, text+start, i-start, start, ts->pos, 1);
+       }
+     } else { // return the number
+       ts->pos = num_end;
+       tk_set(ts->token, text+start, num_end-start, start, ts->pos, 1);
+     }
+   }
+
+   return ts->token;
+ }
+
+ TokenStream *standard_tokenizer_create()
+ {
+   TokenStream *ts = ts_create();
+   ts->next = &std_next;
+   return ts;
+ }
+
+ const char *ENGLISH_STOP_WORDS[] = {
+   "a", "an", "and", "are", "as", "at", "be", "but", "by",
+   "for", "if", "in", "into", "is", "it",
+   "no", "not", "of", "on", "or", "s", "such",
+   "t", "that", "the", "their", "then", "there", "these",
+   "they", "this", "to", "was", "will", "with"
+ };
+
+ void filter_reset(TokenStream *ts, char *text)
+ {
+   ts->sub_ts->reset(ts->sub_ts, text);
+ }
+
+ void filter_destroy(void *p)
+ {
+   TokenStream *ts = (TokenStream *)p;
+   ts->sub_ts->destroy(ts->sub_ts);
+   if (ts->token != NULL) tk_destroy(ts->token);
+   free(ts);
+ }
+
+ void sf_destroy(void *p)
+ {
+   HshTable *words = (HshTable *)((TokenStream *)p)->data;
+   h_destroy(words);
+   filter_destroy(p);
+ }
+
+ Token *sf_next(TokenStream *ts)
+ {
+   int pos_inc = 1;
+   HshTable *words = (HshTable *)ts->data;
+   Token *tk = ts->sub_ts->next(ts->sub_ts);
+   while ((tk != NULL) && (h_get(words, tk->text) != NULL)) {
+     tk = ts->sub_ts->next(ts->sub_ts);
+     pos_inc++;
+   }
+   if (tk != NULL) tk->pos_inc = pos_inc;
+   return tk;
+ }
+
+ TokenStream *stop_filter_create_with_words(TokenStream *ts, char **words, int len)
+ {
+   int i;
+   TokenStream *tf = ALLOC(TokenStream);
+   tf->sub_ts = ts;
+   HshTable *wordtable = h_new_str(NULL, NULL);
+   for (i = 0; i < len; i++) {
+     h_set(wordtable, words[i], words[i]);
+   }
+   tf->data = wordtable;
+   tf->token = NULL;
+   tf->next = &sf_next;
+   tf->reset = &filter_reset;
+   tf->destroy = &sf_destroy;
+   return tf;
+ }
+
+ TokenStream *stop_filter_create(TokenStream *ts)
+ {
+   return stop_filter_create_with_words(ts,
+     (char **)ENGLISH_STOP_WORDS, NELEMS(ENGLISH_STOP_WORDS));
+ }
+
+ Token *lcf_next(TokenStream *ts)
+ {
+   int i = 0;
+   Token *tk = ts->sub_ts->next(ts->sub_ts);
+   if (tk == NULL) return tk;
+   while (tk->text[i] != '\0') {
+     tk->text[i] = tolower(tk->text[i]);
+     i++;
+   }
+   return tk;
+ }
+
+ TokenStream *lowercase_filter_create(TokenStream *ts)
+ {
+   TokenStream *tf = ALLOC(TokenStream);
+   tf->token = NULL;
+   tf->next = &lcf_next;
+   tf->reset = &filter_reset;
+   tf->destroy = &filter_destroy;
+   tf->sub_ts = ts;
+   return tf;
+ }
+
+ Analyzer *letter_analyzer_create()
+ {
+   Analyzer *a = ALLOC(Analyzer);
+   a->data = NULL;
+   a->current_ts = lowercase_filter_create(letter_tokenizer_create());
+   a->destroy = &a_standard_destroy;
+   a->get_ts = &a_standard_get_ts;
+   return a;
+ }
+
+ Analyzer *standard_analyzer_create_with_words(char **words, int len)
+ {
+   Analyzer *a = ALLOC(Analyzer);
+   a->data = NULL;
+   a->current_ts =
+     stop_filter_create_with_words(
+       lowercase_filter_create(standard_tokenizer_create()), words, len);
+   a->destroy = &a_standard_destroy;
+   a->get_ts = &a_standard_get_ts;
+   return a;
+ }
+
+ Analyzer *standard_analyzer_create()
+ {
+   return standard_analyzer_create_with_words(
+     (char **)ENGLISH_STOP_WORDS, NELEMS(ENGLISH_STOP_WORDS));
+ }
+
+ #ifdef ALONE
+ int main(int argc, char **argv)
+ {
+   char buf[10000];
+   Analyzer *a = standard_analyzer_create();
+   TokenStream *ts;
+   Token *tk;
+   while (fgets(buf, 9999, stdin) != NULL) {
+     ts = a->get_ts(a, "hello", buf);
+     ts->pos = 0;
+     while ((tk = ts->next(ts)) != NULL) {
+       printf("<%s:%ld:%ld> ", tk->text, tk->start, tk->end);
+     }
+     printf("\n");
+   }
+   return 0;
+ }
+ #endif
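The #ifdef ALONE block above doubles as a standalone smoke test for the C tokenizer. The chain standard_analyzer_create builds (standard tokenizer → lowercase filter → stop filter) is the one r_analysis.c exposes to Ruby; a rough Ruby-side equivalent, assuming the stock Ferret::Analysis API of this release:

    require 'ferret'
    include Ferret::Analysis

    analyzer = StandardAnalyzer.new
    ts = analyzer.token_stream("content", "Dave's AT&T stock, see http://www.example.com/")
    while tk = ts.next        # returns nil at end of stream
      puts tk.text            # lowercased; stop words dropped, gaps recorded in pos_inc
    end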