ferret 0.3.2 → 0.9.0

This diff shows the changes between publicly released versions of the package as they appear in the supported public registries. It is provided for informational purposes only.
Files changed (141)
  1. data/CHANGELOG +9 -0
  2. data/Rakefile +51 -25
  3. data/ext/analysis.c +553 -0
  4. data/ext/analysis.h +76 -0
  5. data/ext/array.c +83 -0
  6. data/ext/array.h +19 -0
  7. data/ext/bitvector.c +164 -0
  8. data/ext/bitvector.h +29 -0
  9. data/ext/compound_io.c +335 -0
  10. data/ext/document.c +336 -0
  11. data/ext/document.h +87 -0
  12. data/ext/ferret.c +88 -47
  13. data/ext/ferret.h +43 -109
  14. data/ext/field.c +395 -0
  15. data/ext/filter.c +103 -0
  16. data/ext/fs_store.c +352 -0
  17. data/ext/global.c +219 -0
  18. data/ext/global.h +73 -0
  19. data/ext/hash.c +446 -0
  20. data/ext/hash.h +80 -0
  21. data/ext/hashset.c +141 -0
  22. data/ext/hashset.h +37 -0
  23. data/ext/helper.c +11 -0
  24. data/ext/helper.h +5 -0
  25. data/ext/inc/lang.h +41 -0
  26. data/ext/ind.c +389 -0
  27. data/ext/index.h +884 -0
  28. data/ext/index_io.c +269 -415
  29. data/ext/index_rw.c +2543 -0
  30. data/ext/lang.c +31 -0
  31. data/ext/lang.h +41 -0
  32. data/ext/priorityqueue.c +228 -0
  33. data/ext/priorityqueue.h +44 -0
  34. data/ext/q_boolean.c +1331 -0
  35. data/ext/q_const_score.c +154 -0
  36. data/ext/q_fuzzy.c +287 -0
  37. data/ext/q_match_all.c +142 -0
  38. data/ext/q_multi_phrase.c +343 -0
  39. data/ext/q_parser.c +2180 -0
  40. data/ext/q_phrase.c +657 -0
  41. data/ext/q_prefix.c +75 -0
  42. data/ext/q_range.c +247 -0
  43. data/ext/q_span.c +1566 -0
  44. data/ext/q_term.c +308 -0
  45. data/ext/q_wildcard.c +146 -0
  46. data/ext/r_analysis.c +255 -0
  47. data/ext/r_doc.c +578 -0
  48. data/ext/r_index_io.c +996 -0
  49. data/ext/r_qparser.c +158 -0
  50. data/ext/r_search.c +2321 -0
  51. data/ext/r_store.c +263 -0
  52. data/ext/r_term.c +219 -0
  53. data/ext/ram_store.c +447 -0
  54. data/ext/search.c +524 -0
  55. data/ext/search.h +1065 -0
  56. data/ext/similarity.c +143 -39
  57. data/ext/sort.c +661 -0
  58. data/ext/store.c +35 -0
  59. data/ext/store.h +152 -0
  60. data/ext/term.c +704 -143
  61. data/ext/termdocs.c +599 -0
  62. data/ext/vector.c +594 -0
  63. data/lib/ferret.rb +9 -10
  64. data/lib/ferret/analysis/analyzers.rb +2 -2
  65. data/lib/ferret/analysis/standard_tokenizer.rb +1 -1
  66. data/lib/ferret/analysis/token.rb +14 -14
  67. data/lib/ferret/analysis/token_filters.rb +3 -3
  68. data/lib/ferret/document/field.rb +16 -17
  69. data/lib/ferret/index/document_writer.rb +4 -4
  70. data/lib/ferret/index/index.rb +39 -23
  71. data/lib/ferret/index/index_writer.rb +2 -2
  72. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +1 -8
  73. data/lib/ferret/index/segment_term_vector.rb +4 -4
  74. data/lib/ferret/index/term.rb +5 -1
  75. data/lib/ferret/index/term_vector_offset_info.rb +6 -6
  76. data/lib/ferret/index/term_vectors_io.rb +5 -5
  77. data/lib/ferret/query_parser/query_parser.tab.rb +81 -77
  78. data/lib/ferret/search.rb +1 -1
  79. data/lib/ferret/search/boolean_query.rb +2 -1
  80. data/lib/ferret/search/field_sorted_hit_queue.rb +3 -3
  81. data/lib/ferret/search/fuzzy_query.rb +2 -1
  82. data/lib/ferret/search/index_searcher.rb +3 -0
  83. data/lib/ferret/search/{match_all_docs_query.rb → match_all_query.rb} +7 -7
  84. data/lib/ferret/search/multi_phrase_query.rb +6 -5
  85. data/lib/ferret/search/phrase_query.rb +3 -6
  86. data/lib/ferret/search/prefix_query.rb +4 -4
  87. data/lib/ferret/search/sort.rb +3 -1
  88. data/lib/ferret/search/sort_field.rb +9 -9
  89. data/lib/ferret/search/spans/near_spans_enum.rb +1 -1
  90. data/lib/ferret/search/spans/span_near_query.rb +1 -1
  91. data/lib/ferret/search/spans/span_weight.rb +1 -1
  92. data/lib/ferret/search/spans/spans_enum.rb +7 -7
  93. data/lib/ferret/store/fs_store.rb +10 -6
  94. data/lib/ferret/store/ram_store.rb +3 -3
  95. data/lib/rferret.rb +36 -0
  96. data/test/functional/thread_safety_index_test.rb +2 -2
  97. data/test/test_helper.rb +16 -2
  98. data/test/unit/analysis/c_token.rb +25 -0
  99. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +1 -1
  100. data/test/unit/analysis/tc_standard_analyzer.rb +1 -1
  101. data/test/unit/document/{tc_document.rb → c_document.rb} +0 -0
  102. data/test/unit/document/c_field.rb +98 -0
  103. data/test/unit/document/tc_field.rb +0 -66
  104. data/test/unit/index/{tc_index.rb → c_index.rb} +62 -6
  105. data/test/unit/index/{tc_index_reader.rb → c_index_reader.rb} +51 -10
  106. data/test/unit/index/{tc_index_writer.rb → c_index_writer.rb} +0 -4
  107. data/test/unit/index/{tc_term.rb → c_term.rb} +1 -3
  108. data/test/unit/index/{tc_term_vector_offset_info.rb → c_term_voi.rb} +5 -5
  109. data/test/unit/index/tc_segment_term_vector.rb +2 -2
  110. data/test/unit/index/tc_term_vectors_io.rb +4 -4
  111. data/test/unit/query_parser/c_query_parser.rb +138 -0
  112. data/test/unit/search/{tc_filter.rb → c_filter.rb} +24 -24
  113. data/test/unit/search/{tc_fuzzy_query.rb → c_fuzzy_query.rb} +0 -0
  114. data/test/unit/search/{tc_index_searcher.rb → c_index_searcher.rb} +9 -26
  115. data/test/unit/search/{tc_search_and_sort.rb → c_search_and_sort.rb} +15 -15
  116. data/test/unit/search/{tc_sort.rb → c_sort.rb} +2 -1
  117. data/test/unit/search/c_sort_field.rb +27 -0
  118. data/test/unit/search/{tc_spans.rb → c_spans.rb} +0 -0
  119. data/test/unit/search/tc_sort_field.rb +7 -20
  120. data/test/unit/store/c_fs_store.rb +76 -0
  121. data/test/unit/store/c_ram_store.rb +35 -0
  122. data/test/unit/store/m_store.rb +34 -0
  123. data/test/unit/store/m_store_lock.rb +68 -0
  124. data/test/unit/store/tc_fs_store.rb +0 -53
  125. data/test/unit/store/tc_ram_store.rb +0 -20
  126. data/test/unit/store/tm_store.rb +0 -30
  127. data/test/unit/store/tm_store_lock.rb +0 -66
  128. metadata +84 -31
  129. data/ext/Makefile +0 -140
  130. data/ext/ferret_ext.so +0 -0
  131. data/ext/priority_queue.c +0 -232
  132. data/ext/ram_directory.c +0 -321
  133. data/ext/segment_merge_queue.c +0 -37
  134. data/ext/segment_term_enum.c +0 -326
  135. data/ext/string_helper.c +0 -42
  136. data/ext/tags +0 -344
  137. data/ext/term_buffer.c +0 -230
  138. data/ext/term_infos_reader.c +0 -54
  139. data/ext/terminfo.c +0 -160
  140. data/ext/token.c +0 -93
  141. data/ext/util.c +0 -12
data/CHANGELOG ADDED
@@ -0,0 +1,9 @@
+ 20060316:
+   * changed Token#term_text to Token#text
+   * changed Token#position_increment to Token#pos_inc
+   * changed order of args to Token.new. Now Token.new(text, start_offset,
+     end_offset, pos_inc=1, type="text"). NOTE: type does nothing.
+   * changed TermVectorOffsetInfo#start_offset to TermVectorOffsetInfo#start
+   * changed TermVectorOffsetInfo#end_offset to TermVectorOffsetInfo#end
+   * added :id_field option to the Index::Index class.
+
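The Token renames above are the main upgrade hazard in this release. A minimal sketch of the new-style calls, assuming the class is reachable as Ferret::Analysis::Token (as the lib/ferret/analysis/token.rb layout suggests) and using illustrative text and offsets:

    require 'ferret'

    # Ferret 0.9.0 style, per the CHANGELOG entry above
    token = Ferret::Analysis::Token.new("ferret", 0, 6)  # text, start_offset, end_offset
    token.text      # formerly token.term_text
    token.pos_inc   # formerly token.position_increment; defaults to 1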
data/Rakefile CHANGED
@@ -9,7 +9,7 @@ require 'rake/testtask'
  require 'rake/rdoctask'
  require 'rake/clean'
  require 'rake_utils/code_statistics'
- require 'lib/ferret'
+ require 'lib/rferret'
 
  begin
    require 'rubygems'
@@ -30,18 +30,32 @@ def announce(msg='')
  end
 
  $VERBOSE = nil
+
+ EXT = "ferret_ext.so"
+ EXT_SRC = FileList["src/*/*.[ch]"]
+ EXT_SRC_DEST = EXT_SRC.map {|fn| File.join("ext", File.basename(fn))}
+ SRC = (FileList["ext/*.[ch]"] + EXT_SRC_DEST).uniq
+
  CLEAN.include(FileList['**/*.o', 'InstalledFiles', '.config'])
  CLOBBER.include(FileList['**/*.so'], 'ext/Makefile')
 
  task :default => :all_tests
  desc "Run all tests"
- task :all_tests => [ :test_units, :test_functional ]
+ task :all_tests => [ :test_runits, :test_cunits, :test_functional ]
 
  desc "Generate API documentation, and show coding stats"
  task :doc => [ :stats, :appdoc ]
 
- desc "run unit tests in test/unit"
- Rake::TestTask.new("test_units" => :parsers) do |t|
+ desc "run unit tests in test/unit for pure ruby ferret"
+ Rake::TestTask.new("test_runits" => :parsers) do |t|
+   t.ruby_opts = ["-r 'lib/rferret'"]
+   t.libs << "test/unit"
+   t.pattern = 'test/unit/ts_*.rb'
+   t.verbose = true
+ end
+
+ desc "run unit tests in test/unit for C ferret"
+ Rake::TestTask.new("test_cunits" => :ext) do |t|
    t.libs << "test/unit"
    t.pattern = 'test/unit/t[cs]_*.rb'
    t.verbose = true
@@ -84,22 +98,28 @@ rd = Rake::RDocTask.new("appdoc") do |rdoc|
    rdoc.rdoc_files.include('lib/**/*.rb')
  end
 
- EXT = "ferret_ext.so"
+ EXT_SRC.each do |fn|
+   dest_fn = File.join("ext", File.basename(fn))
+   file dest_fn => fn do |t|
+     cp fn, dest_fn
+   end
+ end
 
  desc "Build the extension"
- task :ext => "ext/#{EXT}"
+ task :ext => ["ext/#{EXT}"] + SRC
 
- file "ext/#{EXT}" => "ext/Makefile" do
+ file "ext/#{EXT}" => ["ext/Makefile"] do
+   cp "ext/inc/lang.h", "ext/lang.h"
    sh "cd ext; make"
  end
 
- file "ext/Makefile" do
+ file "ext/Makefile" => SRC do
    sh "cd ext; ruby extconf.rb"
  end
 
  # Make Parsers ---------------------------------------------------------------
 
- RACC_SRC = FileList["**/*.y"]
+ RACC_SRC = FileList["lib/**/*.y"]
  RACC_OUT = RACC_SRC.collect { |fn| fn.sub(/\.y$/, '.tab.rb') }
 
  task :parsers => RACC_OUT
@@ -195,8 +215,9 @@ end
  # Creating a release
 
  desc "Make a new release"
- task :prerelease => [:clobber, :all_tests, :parsers]
- task :package => [:prerelease]
+ task :prerelease => [:all_tests, :clobber]
+ task :repackage => EXT_SRC_DEST
+ task :package => EXT_SRC_DEST
  task :tag => [:prerelease]
  task :update_version => [:prerelease]
  task :release => [:tag, :update_version, :package] do
@@ -229,7 +250,7 @@ task :prerelease do
    end
 
    # Are all source files checked in?
-   data = `svn -q status`
+   data = `svn -q --ignore-externals status`
    unless data =~ /^$/
      fail "'svn -q status' is not clean ... do you have unchecked-in files?"
    end
@@ -237,28 +258,33 @@ task :prerelease do
    announce "No outstanding checkins found ... OK"
  end
 
+ def reversion(fn)
+   open(fn) do |ferret_in|
+     open(fn + ".new", "w") do |ferret_out|
+       ferret_in.each do |line|
+         if line =~ /^ VERSION\s*=\s*/
+           ferret_out.puts " VERSION = '#{PKG_VERSION}'"
+         else
+           ferret_out.puts line
+         end
+       end
+     end
+   end
+ end
+
  task :update_version => [:prerelease] do
    if PKG_VERSION == CURRENT_VERSION
      announce "No version change ... skipping version update"
    else
      announce "Updating Ferret version to #{PKG_VERSION}"
-     open("lib/ferret.rb") do |ferret_in|
-       open("lib/ferret.rb.new", "w") do |ferret_out|
-         ferret_in.each do |line|
-           if line =~ /^ VERSION\s*=\s*/
-             ferret_out.puts " VERSION = '#{PKG_VERSION}'"
-           else
-             ferret_out.puts line
-           end
-         end
-       end
-     end
+     reversion("lib/ferret.rb")
+     reversion("lib/rferret.rb")
      if ENV['RELTEST']
        announce "Release Task Testing, skipping committing of new version"
      else
-       mv "lib/ferret.rb.new", "lib/ferret.rb"
+       mv "lib/rferret.rb.new", "lib/rferret.rb"
      end
-     sh %{svn ci -m "Updated to version #{PKG_VERSION}" lib/ferret.rb}
+     sh %{svn ci -m "Updated to version #{PKG_VERSION}" lib/rferret.rb}
    end
  end
 
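Given the task definitions above, the two unit suites can now be driven independently; a typical session against this Rakefile might look like:

    rake test_runits   # pure-Ruby suite, loaded via lib/rferret
    rake test_cunits   # builds ext/ferret_ext.so via the :ext task, then runs the suite
    rake all_tests     # both unit suites plus test_functional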
data/ext/analysis.c ADDED
@@ -0,0 +1,553 @@
+ #include <analysis.h>
+ #include <string.h>
+ #include <ctype.h>
+ #include <hash.h>
+
+ Token *tk_create()
+ {
+   return ALLOC(Token);
+ }
+
+ void tk_destroy(void *p)
+ {
+   free(p);
+ }
+
+ inline Token *tk_set(Token *tk, char *text, int tlen, int start, int end, int pos_inc)
+ {
+   if (tlen >= MAX_WORD_SIZE) tlen = MAX_WORD_SIZE - 1;
+   memcpy(tk->text, text, sizeof(char) * tlen);
+   tk->text[tlen] = '\0';
+   tk->start = start;
+   tk->end = end;
+   tk->pos_inc = pos_inc;
+   return tk;
+ }
+
+ inline Token *tk_set_no_len(Token *tk, char *text, int start, int end, int pos_inc)
+ {
+   return tk_set(tk, text, strlen(text), start, end, pos_inc);
+ }
+
+ int tk_eq(Token *tk1, Token *tk2)
+ {
+   if (strcmp((char *)tk1->text, (char *)tk2->text) == 0 &&
+       tk1->start == tk2->start && tk1->end == tk2->end)
+     return true;
+   else
+     return false;
+ }
+
+ int tk_cmp(Token *tk1, Token *tk2)
+ {
+   int cmp;
+   if (tk1->start > tk2->start) {
+     cmp = 1;
+   } else if (tk1->start < tk2->start) {
+     cmp = -1;
+   } else {
+     if (tk1->end > tk2->end) {
+       cmp = 1;
+     } else if (tk1->end < tk2->end) {
+       cmp = -1;
+     } else {
+       cmp = strcmp((char *)tk1->text, (char *)tk2->text);
+     }
+   }
+   return cmp;
+ }
+
+ void ts_standard_destroy(void *p)
+ {
+   TokenStream *ts = (TokenStream *)p;
+   tk_destroy(ts->token);
+   free(p);
+ }
+
+ void ts_reset(TokenStream *ts, char *text)
+ {
+   ts->text = text;
+   ts->pos = 0;
+ }
+
+ TokenStream *ts_create()
+ {
+   TokenStream *ts = ALLOC(TokenStream);
+   ts->pos = -1;
+   ts->text = NULL;
+   ts->token = tk_create();
+   ts->destroy = &ts_standard_destroy;
+   ts->reset = &ts_reset;
+   return ts;
+ }
+
+ Token *wst_next(TokenStream *ts)
+ {
+   int i = ts->pos;
+   int start, end;
+   char *text = ts->text;
+
+   while (text[i] != '\0' && isspace(text[i]))
+     i++;
+   if (text[i] == '\0')
+     return NULL;
+
+   start = i;
+   while (text[i] != '\0' && !isspace(text[i]))
+     i++;
+   ts->pos = end = i;
+   tk_set(ts->token, text+start, end-start, start, end, 1);
+   return ts->token;
+ }
+
+ TokenStream *whitespace_tokenizer_create()
+ {
+   TokenStream *ts = ts_create();
+   ts->next = &wst_next;
+   return ts;
+ }
+
+ Token *lt_next(TokenStream *ts)
+ {
+   int i = ts->pos;
+   int start, end;
+   char *text = ts->text;
+
+   while (text[i] != '\0' && !isalpha(text[i]))
+     i++;
+   if (text[i] == '\0')
+     return NULL;
+
+   start = i;
+   while (text[i] != '\0' && isalpha(text[i]))
+     i++;
+   ts->pos = end = i;
+   tk_set(ts->token, text+start, end-start, start, end, 1);
+   return ts->token;
+ }
+
+ TokenStream *letter_tokenizer_create()
+ {
+   TokenStream *ts = ts_create();
+   ts->next = &lt_next;
+   return ts;
+ }
+
+ void a_standard_destroy(void *p)
+ {
+   Analyzer *a = (Analyzer *)p;
+   ts_destroy(a->current_ts);
+   free(p);
+ }
+
+ TokenStream *a_standard_get_ts(Analyzer *a, char *field, char *text)
+ {
+   a->current_ts->reset(a->current_ts, text);
+   return a->current_ts;
+ }
+
+ Analyzer *whitespace_analyzer_create()
+ {
+   Analyzer *a = ALLOC(Analyzer);
+   a->data = NULL;
+   a->current_ts = whitespace_tokenizer_create();
+   a->destroy = &a_standard_destroy;
+   a->get_ts = &a_standard_get_ts;
+   return a;
+ }
+
+ int std_get_alpha(char *input, char *token)
+ {
+   int i = 0;
+   while (input[i] != '\0' && isalpha(input[i])) {
+     token[i] = input[i];
+     i++;
+   }
+   return i;
+ }
+
+ int std_get_alnum(char *input, char *token)
+ {
+   int i = 0;
+   while (input[i] != '\0' && isalnum(input[i])) {
+     token[i] = input[i];
+     i++;
+   }
+   return i;
+ }
+
+ int isnumpunc(char c)
+ {
+   return (c == '.' || c == ',' || c == '\\' || c == '/' || c == '_' || c == '-');
+ }
+
+ int isurlpunc(char c)
+ {
+   return (c == '.' || c == '/' || c == '-' || c == '_');
+ }
+
+ int isurlc(char c)
+ {
+   return (c == '.' || c == '/' || c == '-' || c == '_' || isalnum(c));
+ }
+
+ int isurlxatpunc(char c)
+ {
+   return (c == '.' || c == '/' || c == '-' || c == '_' || c == '@');
+ }
+
+ int isurlxatc(char c)
+ {
+   return (c == '.' || c == '/' || c == '-' || c == '_' || c == '@' || isalnum(c));
+ }
+
+ int isstdtokchar(char c)
+ {
+   if (isspace(c)) return false; // most common case, so check it first
+   if (isalnum(c) || isnumpunc(c) || c == '&' ||
+       c == '@' || c == '\'' || c == ':')
+     return true;
+   return false;
+ }
+
+ /* (alnum)((punc)(alnum))+ where every second sequence of alnum must contain at
+  * least one digit.
+  * (alnum) = [a-zA-Z0-9]
+  * (punc)  = [_\/.,-]
+  */
+ int std_get_number(char *input)
+ {
+   int i = 0;
+   int count = 0;
+   int last_seen_digit = 2;
+   int seen_digit = false;
+
+   while (last_seen_digit >= 0) {
+     while ((input[i] != '\0') && isalnum(input[i])) {
+       if ((last_seen_digit < 2) && isdigit(input[i])) last_seen_digit = 2;
+       if ((seen_digit == false) && isdigit(input[i])) seen_digit = true;
+       i++;
+     }
+     last_seen_digit--;
+     if (!isnumpunc(input[i]) || !isalnum(input[i+1])) {
+
+       if (last_seen_digit >= 0)
+         count = i;
+       break;
+     }
+     count = i;
+     i++;
+   }
+   if (seen_digit)
+     return count;
+   else
+     return 0;
+ }
+
+ int std_get_apostrophe(char *input)
+ {
+   int i = 0;
+
+   while (isalpha(input[i]) || input[i] == '\'')
+     i++;
+
+   return i;
+ }
+
+ int std_get_url(char *input, char *token)
+ {
+   int i = 0;
+
+   while (isurlc(input[i])) {
+     if (isurlpunc(input[i]) && isurlpunc(input[i-1]))
+       break; // can't have two punctuation characters in a row
+     token[i] = input[i];
+     i++;
+   }
+
+   // strip trailing punctuation
+   while (isurlpunc(input[i-1])) i--;
+
+   return i;
+ }
+
+ /* Company names can contain '@' and '&', like AT&T and Excite@Home, so let's
+  * keep those characters in the token. */
+ int std_get_company_name(char *input)
+ {
+   int i = 0;
+   while (isalpha(input[i]) || input[i] == '@' || input[i] == '&')
+     i++;
+
+   return i;
+ }
+
+ Token *std_next(TokenStream *ts)
+ {
+   int i = ts->pos, j;
+   int start;
+   char *text = ts->text;
+   char token[MAX_WORD_SIZE];
+   int token_i = 0;
+   int len;
+   int num_end = 0;
+   int is_acronym;
+   int seen_at_symbol;
+
+   while (text[i] != '\0' && !isalnum(text[i]))
+     i++;
+   if (text[i] == '\0')
+     return NULL;
+
+   start = i;
+   if (isdigit(text[i])) {
+     i += std_get_number(text + i);
+     ts->pos = i;
+     tk_set(ts->token, text+start, i - start, start, ts->pos, 1);
+   } else {
+     token_i = std_get_alpha(text + i, token);
+     i += token_i;
+
+     if (!isstdtokchar(text[i])) {
+       // very common case, i.e. a plain word, so check and return
+       tk_set(ts->token, text+start, i-start, start, i, 1);
+       ts->pos = i;
+       return ts->token;
+     }
+
+     if (text[i] == '\'') { // apostrophe case
+       i += std_get_apostrophe(text + i);
+       ts->pos = i;
+       len = i - start;
+       // strip possessive ('s)
+       if ((text[i-1] == 's' || text[i-1] == 'S') && text[i-2] == '\'')
+         len -= 2;
+       tk_set(ts->token, text+start, len, start, i, 1);
+       return ts->token;
+     }
+     if (text[i] == '&') { // company name case, e.g. AT&T
+       i += std_get_company_name(text + i);
+       ts->pos = i;
+       tk_set(ts->token, text+start, i - start, start, i, 1);
+       return ts->token;
+     }
+
+     if (isdigit(text[i]) || isnumpunc(text[i])) { // possibly a number
+       num_end = start + std_get_number(text + start);
+       if (!isstdtokchar(text[num_end])) { // we won't find a longer token
+         ts->pos = num_end;
+         tk_set(ts->token, text+start, num_end-start, start, ts->pos, 1);
+         return ts->token;
+       }
+       // else there may be a longer token so check
+     }
+
+     if (text[i] == ':' && text[i+1] == '/' && text[i+2] == '/') {
+       // check for a known url start
+       token[token_i] = '\0';
+       i += 3;
+       while (text[i] == '/') i++;
+       if (isalpha(text[i]) &&
+           (strcmp(token, "ftp") == 0 ||
+            strcmp(token, "http") == 0 ||
+            strcmp(token, "https") == 0 ||
+            strcmp(token, "file") == 0)) {
+         len = std_get_url(text + i, token); // dispose of the first part of the URL
+       } else { // still treat it as a url but keep the first part
+         token_i = i - start;
+         memcpy(token, text + start, token_i * sizeof(char));
+         len = token_i + std_get_url(text + i, token + token_i); // keep start
+       }
+       ts->pos = i + len;
+       token[len] = 0;
+       tk_set(ts->token, token, len, start, ts->pos, 1);
+       return ts->token;
+     }
+
+     // now see how long a url we can find
+     is_acronym = true;
+     seen_at_symbol = false;
+     while (isurlxatc(text[i])) {
+       if (is_acronym && !isalpha(text[i]) && (text[i] != '.')) {
+         is_acronym = false;
+       }
+       if (isurlxatpunc(text[i]) && isurlxatpunc(text[i-1]))
+         break; // can't have two punctuation characters in a row
+       if (text[i] == '@') {
+         if (seen_at_symbol)
+           break; // we can only have one @ symbol
+         else
+           seen_at_symbol = true;
+       }
+       i++;
+     }
+     while (isurlxatpunc(text[i-1])) i--; // strip trailing punctuation
+     if (i > num_end) {
+       ts->pos = i;
+
+       if (is_acronym) { // check that every letter is followed by a '.'
+         for (j = start; j < i-1; j++) {
+           if (isalpha(text[j]) && (text[j+1] != '.')) is_acronym = false;
+         }
+       }
+       if (is_acronym) { // strip the '.'s
+         for (j = start + token_i; j < i; j++) {
+           if (text[j] != '.') {
+             token[token_i] = text[j];
+             token_i++;
+           }
+         }
+         tk_set(ts->token, token, token_i, start, ts->pos, 1);
+       } else { // just return the url as is
+         tk_set(ts->token, text+start, i-start, start, ts->pos, 1);
+       }
+     } else { // return the number
+       ts->pos = num_end;
+       tk_set(ts->token, text+start, num_end-start, start, ts->pos, 1);
+     }
+   }
+
+   return ts->token;
+ }
+
+ TokenStream *standard_tokenizer_create()
+ {
+   TokenStream *ts = ts_create();
+   ts->next = &std_next;
+   return ts;
+ }
+
+ const char *ENGLISH_STOP_WORDS[] = {
+   "a", "an", "and", "are", "as", "at", "be", "but", "by",
+   "for", "if", "in", "into", "is", "it",
+   "no", "not", "of", "on", "or", "s", "such",
+   "t", "that", "the", "their", "then", "there", "these",
+   "they", "this", "to", "was", "will", "with"
+ };
+
+ void filter_reset(TokenStream *ts, char *text)
+ {
+   ts->sub_ts->reset(ts->sub_ts, text);
+ }
+
+ void filter_destroy(void *p)
+ {
+   TokenStream *ts = (TokenStream *)p;
+   ts->sub_ts->destroy(ts->sub_ts);
+   if (ts->token != NULL) tk_destroy(ts->token);
+   free(ts);
+ }
+
+ void sf_destroy(void *p)
+ {
+   HshTable *words = (HshTable *)((TokenStream *)p)->data;
+   h_destroy(words);
+   filter_destroy(p);
+ }
+
+ Token *sf_next(TokenStream *ts)
+ {
+   int pos_inc = 1;
+   HshTable *words = (HshTable *)ts->data;
+   Token *tk = ts->sub_ts->next(ts->sub_ts);
+   while ((tk != NULL) && (h_get(words, tk->text) != NULL)) {
+     tk = ts->sub_ts->next(ts->sub_ts);
+     pos_inc++;
+   }
+   if (tk != NULL) tk->pos_inc = pos_inc;
+   return tk;
+ }
+
+ TokenStream *stop_filter_create_with_words(TokenStream *ts, char **words, int len)
+ {
+   int i;
+   TokenStream *tf = ALLOC(TokenStream);
+   tf->sub_ts = ts;
+   HshTable *wordtable = h_new_str(NULL, NULL);
+   for (i = 0; i < len; i++) {
+     h_set(wordtable, words[i], words[i]);
+   }
+   tf->data = wordtable;
+   tf->token = NULL;
+   tf->next = &sf_next;
+   tf->reset = &filter_reset;
+   tf->destroy = &sf_destroy;
+   return tf;
+ }
+
+ TokenStream *stop_filter_create(TokenStream *ts)
+ {
+   return stop_filter_create_with_words(ts,
+     (char **)ENGLISH_STOP_WORDS, NELEMS(ENGLISH_STOP_WORDS));
+ }
+
+ Token *lcf_next(TokenStream *ts)
+ {
+   int i = 0;
+   Token *tk = ts->sub_ts->next(ts->sub_ts);
+   if (tk == NULL) return tk;
+   while (tk->text[i] != '\0') {
+     tk->text[i] = tolower(tk->text[i]);
+     i++;
+   }
+   return tk;
+ }
+
+ TokenStream *lowercase_filter_create(TokenStream *ts)
+ {
+   TokenStream *tf = ALLOC(TokenStream);
+   tf->token = NULL;
+   tf->next = &lcf_next;
+   tf->reset = &filter_reset;
+   tf->destroy = &filter_destroy;
+   tf->sub_ts = ts;
+   return tf;
+ }
+
+ Analyzer *letter_analyzer_create()
+ {
+   Analyzer *a = ALLOC(Analyzer);
+   a->data = NULL;
+   a->current_ts = lowercase_filter_create(letter_tokenizer_create());
+   a->destroy = &a_standard_destroy;
+   a->get_ts = &a_standard_get_ts;
+   return a;
+ }
+
+
+ Analyzer *standard_analyzer_create_with_words(char **words, int len)
+ {
+   Analyzer *a = ALLOC(Analyzer);
+   a->data = NULL;
+   a->current_ts =
+     stop_filter_create_with_words(
+       lowercase_filter_create(standard_tokenizer_create()), words, len);
+   a->destroy = &a_standard_destroy;
+   a->get_ts = &a_standard_get_ts;
+   return a;
+ }
+
+ Analyzer *standard_analyzer_create()
+ {
+   return standard_analyzer_create_with_words(
+     (char **)ENGLISH_STOP_WORDS, NELEMS(ENGLISH_STOP_WORDS));
+ }
+
+ #ifdef ALONE
+ int main(int argc, char **argv)
+ {
+   char buf[10000];
+   Analyzer *a = standard_analyzer_create();
+   TokenStream *ts;
+   Token *tk;
+   while (fgets(buf, 9999, stdin) != NULL) {
+     ts = a->get_ts(a, "hello", buf);
+     ts->pos = 0;
+     while ((tk = ts->next(ts)) != NULL) {
+       printf("<%s:%ld:%ld> ", tk->text, tk->start, tk->end);
+     }
+     printf("\n");
+   }
+   return 0;
+ }
+ #endif
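The ALONE harness above shows the intended call pattern for this API. For orientation, here is a minimal standalone sketch of driving the same chain (stop filter over lowercase filter over the standard tokenizer), assuming Token's start and end fields are longs, as the harness's printf format implies:

    #include <stdio.h>
    #include <analysis.h>  /* Token, TokenStream, Analyzer */

    int main(void)
    {
      /* tokenize one string with the standard analyzer chain */
      Analyzer *a = standard_analyzer_create();
      TokenStream *ts = a->get_ts(a, "body", "AT&T's site is at http://example.com");
      Token *tk;
      while ((tk = ts->next(ts)) != NULL)
        printf("<%s:%ld:%ld>\n", tk->text, tk->start, tk->end);  /* stop words dropped, text lowercased */
      a->destroy(a);  /* frees the whole filter chain via the destroy pointers */
      return 0;
    }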