ferret 0.3.2 → 0.9.0
- data/CHANGELOG +9 -0
- data/Rakefile +51 -25
- data/ext/analysis.c +553 -0
- data/ext/analysis.h +76 -0
- data/ext/array.c +83 -0
- data/ext/array.h +19 -0
- data/ext/bitvector.c +164 -0
- data/ext/bitvector.h +29 -0
- data/ext/compound_io.c +335 -0
- data/ext/document.c +336 -0
- data/ext/document.h +87 -0
- data/ext/ferret.c +88 -47
- data/ext/ferret.h +43 -109
- data/ext/field.c +395 -0
- data/ext/filter.c +103 -0
- data/ext/fs_store.c +352 -0
- data/ext/global.c +219 -0
- data/ext/global.h +73 -0
- data/ext/hash.c +446 -0
- data/ext/hash.h +80 -0
- data/ext/hashset.c +141 -0
- data/ext/hashset.h +37 -0
- data/ext/helper.c +11 -0
- data/ext/helper.h +5 -0
- data/ext/inc/lang.h +41 -0
- data/ext/ind.c +389 -0
- data/ext/index.h +884 -0
- data/ext/index_io.c +269 -415
- data/ext/index_rw.c +2543 -0
- data/ext/lang.c +31 -0
- data/ext/lang.h +41 -0
- data/ext/priorityqueue.c +228 -0
- data/ext/priorityqueue.h +44 -0
- data/ext/q_boolean.c +1331 -0
- data/ext/q_const_score.c +154 -0
- data/ext/q_fuzzy.c +287 -0
- data/ext/q_match_all.c +142 -0
- data/ext/q_multi_phrase.c +343 -0
- data/ext/q_parser.c +2180 -0
- data/ext/q_phrase.c +657 -0
- data/ext/q_prefix.c +75 -0
- data/ext/q_range.c +247 -0
- data/ext/q_span.c +1566 -0
- data/ext/q_term.c +308 -0
- data/ext/q_wildcard.c +146 -0
- data/ext/r_analysis.c +255 -0
- data/ext/r_doc.c +578 -0
- data/ext/r_index_io.c +996 -0
- data/ext/r_qparser.c +158 -0
- data/ext/r_search.c +2321 -0
- data/ext/r_store.c +263 -0
- data/ext/r_term.c +219 -0
- data/ext/ram_store.c +447 -0
- data/ext/search.c +524 -0
- data/ext/search.h +1065 -0
- data/ext/similarity.c +143 -39
- data/ext/sort.c +661 -0
- data/ext/store.c +35 -0
- data/ext/store.h +152 -0
- data/ext/term.c +704 -143
- data/ext/termdocs.c +599 -0
- data/ext/vector.c +594 -0
- data/lib/ferret.rb +9 -10
- data/lib/ferret/analysis/analyzers.rb +2 -2
- data/lib/ferret/analysis/standard_tokenizer.rb +1 -1
- data/lib/ferret/analysis/token.rb +14 -14
- data/lib/ferret/analysis/token_filters.rb +3 -3
- data/lib/ferret/document/field.rb +16 -17
- data/lib/ferret/index/document_writer.rb +4 -4
- data/lib/ferret/index/index.rb +39 -23
- data/lib/ferret/index/index_writer.rb +2 -2
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +1 -8
- data/lib/ferret/index/segment_term_vector.rb +4 -4
- data/lib/ferret/index/term.rb +5 -1
- data/lib/ferret/index/term_vector_offset_info.rb +6 -6
- data/lib/ferret/index/term_vectors_io.rb +5 -5
- data/lib/ferret/query_parser/query_parser.tab.rb +81 -77
- data/lib/ferret/search.rb +1 -1
- data/lib/ferret/search/boolean_query.rb +2 -1
- data/lib/ferret/search/field_sorted_hit_queue.rb +3 -3
- data/lib/ferret/search/fuzzy_query.rb +2 -1
- data/lib/ferret/search/index_searcher.rb +3 -0
- data/lib/ferret/search/{match_all_docs_query.rb → match_all_query.rb} +7 -7
- data/lib/ferret/search/multi_phrase_query.rb +6 -5
- data/lib/ferret/search/phrase_query.rb +3 -6
- data/lib/ferret/search/prefix_query.rb +4 -4
- data/lib/ferret/search/sort.rb +3 -1
- data/lib/ferret/search/sort_field.rb +9 -9
- data/lib/ferret/search/spans/near_spans_enum.rb +1 -1
- data/lib/ferret/search/spans/span_near_query.rb +1 -1
- data/lib/ferret/search/spans/span_weight.rb +1 -1
- data/lib/ferret/search/spans/spans_enum.rb +7 -7
- data/lib/ferret/store/fs_store.rb +10 -6
- data/lib/ferret/store/ram_store.rb +3 -3
- data/lib/rferret.rb +36 -0
- data/test/functional/thread_safety_index_test.rb +2 -2
- data/test/test_helper.rb +16 -2
- data/test/unit/analysis/c_token.rb +25 -0
- data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +1 -1
- data/test/unit/analysis/tc_standard_analyzer.rb +1 -1
- data/test/unit/document/{tc_document.rb → c_document.rb} +0 -0
- data/test/unit/document/c_field.rb +98 -0
- data/test/unit/document/tc_field.rb +0 -66
- data/test/unit/index/{tc_index.rb → c_index.rb} +62 -6
- data/test/unit/index/{tc_index_reader.rb → c_index_reader.rb} +51 -10
- data/test/unit/index/{tc_index_writer.rb → c_index_writer.rb} +0 -4
- data/test/unit/index/{tc_term.rb → c_term.rb} +1 -3
- data/test/unit/index/{tc_term_vector_offset_info.rb → c_term_voi.rb} +5 -5
- data/test/unit/index/tc_segment_term_vector.rb +2 -2
- data/test/unit/index/tc_term_vectors_io.rb +4 -4
- data/test/unit/query_parser/c_query_parser.rb +138 -0
- data/test/unit/search/{tc_filter.rb → c_filter.rb} +24 -24
- data/test/unit/search/{tc_fuzzy_query.rb → c_fuzzy_query.rb} +0 -0
- data/test/unit/search/{tc_index_searcher.rb → c_index_searcher.rb} +9 -26
- data/test/unit/search/{tc_search_and_sort.rb → c_search_and_sort.rb} +15 -15
- data/test/unit/search/{tc_sort.rb → c_sort.rb} +2 -1
- data/test/unit/search/c_sort_field.rb +27 -0
- data/test/unit/search/{tc_spans.rb → c_spans.rb} +0 -0
- data/test/unit/search/tc_sort_field.rb +7 -20
- data/test/unit/store/c_fs_store.rb +76 -0
- data/test/unit/store/c_ram_store.rb +35 -0
- data/test/unit/store/m_store.rb +34 -0
- data/test/unit/store/m_store_lock.rb +68 -0
- data/test/unit/store/tc_fs_store.rb +0 -53
- data/test/unit/store/tc_ram_store.rb +0 -20
- data/test/unit/store/tm_store.rb +0 -30
- data/test/unit/store/tm_store_lock.rb +0 -66
- metadata +84 -31
- data/ext/Makefile +0 -140
- data/ext/ferret_ext.so +0 -0
- data/ext/priority_queue.c +0 -232
- data/ext/ram_directory.c +0 -321
- data/ext/segment_merge_queue.c +0 -37
- data/ext/segment_term_enum.c +0 -326
- data/ext/string_helper.c +0 -42
- data/ext/tags +0 -344
- data/ext/term_buffer.c +0 -230
- data/ext/term_infos_reader.c +0 -54
- data/ext/terminfo.c +0 -160
- data/ext/token.c +0 -93
- data/ext/util.c +0 -12
data/CHANGELOG
ADDED
@@ -0,0 +1,9 @@
+20060316:
+  * changed Token#term_text to Token#text
+  * changed Token#position_increment to Token#pos_inc
+  * changed order of args to Token.new. Now Token.new(text, start_offset,
+    end_offset, pos_inc=1, type="text"). NOTE: type does nothing.
+  * changed TermVectorOffsetInfo#start_offset to TermVectorOffsetInfo#start
+  * changed TermVectorOffsetInfo#end_offset to TermVectorOffsetInfo#end
+  * added :id_field option to Index::Index class.
+
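The renamed Token accessors mirror the new C core: in data/ext/analysis.c below, Token carries text, start, end and pos_inc, and tk_set() takes them in the same order as the new Token.new. A minimal sketch of that correspondence — hypothetical standalone compile, assuming analysis.h (added in this release) declares Token and the tk_* functions:

#include <analysis.h>  /* Token, tk_create(), tk_set(), tk_destroy() */
#include <stdio.h>

int main(void)
{
    char word[] = "ferret";
    Token *tk = tk_create();
    /* text, length, start_offset, end_offset, pos_inc -- same field order
     * as the new Ruby Token.new(text, start_offset, end_offset, pos_inc) */
    tk_set(tk, word, 6, 0, 6, 1);
    printf("%s [%ld,%ld) +%d\n",
           tk->text, (long)tk->start, (long)tk->end, (int)tk->pos_inc);
    tk_destroy(tk);
    return 0;
}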
data/Rakefile
CHANGED
@@ -9,7 +9,7 @@ require 'rake/testtask'
 require 'rake/rdoctask'
 require 'rake/clean'
 require 'rake_utils/code_statistics'
-require 'lib/ferret'
+require 'lib/rferret'
 
 begin
   require 'rubygems'
@@ -30,18 +30,32 @@ def announce(msg='')
 end
 
 $VERBOSE = nil
+
+EXT = "ferret_ext.so"
+EXT_SRC = FileList["src/*/*.[ch]"]
+EXT_SRC_DEST = EXT_SRC.map {|fn| File.join("ext", File.basename(fn))}
+SRC = (FileList["ext/*.[ch]"] + EXT_SRC_DEST).uniq
+
 CLEAN.include(FileList['**/*.o', 'InstalledFiles', '.config'])
 CLOBBER.include(FileList['**/*.so'], 'ext/Makefile')
 
 task :default => :all_tests
 desc "Run all tests"
-task :all_tests => [ :
+task :all_tests => [ :test_runits, :test_cunits, :test_functional ]
 
 desc "Generate API documentation, and show coding stats"
 task :doc => [ :stats, :appdoc ]
 
-desc "run unit tests in test/unit"
-Rake::TestTask.new("
+desc "run unit tests in test/unit for pure ruby ferret"
+Rake::TestTask.new("test_runits" => :parsers) do |t|
+  t.ruby_opts = ["-r 'lib/rferret'"]
+  t.libs << "test/unit"
+  t.pattern = 'test/unit/ts_*.rb'
+  t.verbose = true
+end
+
+desc "run unit tests in test/unit for C ferret"
+Rake::TestTask.new("test_cunits" => :ext) do |t|
   t.libs << "test/unit"
   t.pattern = 'test/unit/t[cs]_*.rb'
   t.verbose = true
@@ -84,22 +98,28 @@ rd = Rake::RDocTask.new("appdoc") do |rdoc|
   rdoc.rdoc_files.include('lib/**/*.rb')
 end
 
-
+EXT_SRC.each do |fn|
+  dest_fn = File.join("ext", File.basename(fn))
+  file dest_fn => fn do |t|
+    cp fn, dest_fn
+  end
+end
 
 desc "Build the extension"
-task :ext => "ext/#{EXT}"
+task :ext => ["ext/#{EXT}"] + SRC
 
-file "ext/#{EXT}" => "ext/Makefile" do
+file "ext/#{EXT}" => ["ext/Makefile"] do
+  cp "ext/inc/lang.h", "ext/lang.h"
   sh "cd ext; make"
 end
 
-file "ext/Makefile" do
+file "ext/Makefile" => SRC do
   sh "cd ext; ruby extconf.rb"
 end
 
 # Make Parsers ---------------------------------------------------------------
 
-RACC_SRC = FileList["
+RACC_SRC = FileList["lib/**/*.y"]
 RACC_OUT = RACC_SRC.collect { |fn| fn.sub(/\.y$/, '.tab.rb') }
 
 task :parsers => RACC_OUT
@@ -195,8 +215,9 @@ end
 # Creating a release
 
 desc "Make a new release"
-task :prerelease => [:
-task :
+task :prerelease => [:all_tests, :clobber]
+task :repackage => EXT_SRC_DEST
+task :package => EXT_SRC_DEST
 task :tag => [:prerelease]
 task :update_version => [:prerelease]
 task :release => [:tag, :update_version, :package] do
@@ -229,7 +250,7 @@ task :prerelease do
 end
 
 # Are all source files checked in?
-data = `svn -q status`
+data = `svn -q --ignore-externals status`
 unless data =~ /^$/
   fail "'svn -q status' is not clean ... do you have unchecked-in files?"
 end
@@ -237,28 +258,33 @@ task :prerelease do
   announce "No outstanding checkins found ... OK"
 end
 
+def reversion(fn)
+  open(fn) do |ferret_in|
+    open(fn + ".new", "w") do |ferret_out|
+      ferret_in.each do |line|
+        if line =~ /^ VERSION\s*=\s*/
+          ferret_out.puts " VERSION = '#{PKG_VERSION}'"
+        else
+          ferret_out.puts line
+        end
+      end
+    end
+  end
+end
+
 task :update_version => [:prerelease] do
   if PKG_VERSION == CURRENT_VERSION
     announce "No version change ... skipping version update"
   else
     announce "Updating Ferret version to #{PKG_VERSION}"
-    open("lib/ferret.rb") do |ferret_in|
-      open("lib/ferret.rb.new", "w") do |ferret_out|
-        ferret_in.each do |line|
-          if line =~ /^ VERSION\s*=\s*/
-            ferret_out.puts " VERSION = '#{PKG_VERSION}'"
-          else
-            ferret_out.puts line
-          end
-        end
-      end
-    end
+    reversion("lib/ferret.rb")
+    reversion("lib/rferret.rb")
     if ENV['RELTEST']
       announce "Release Task Testing, skipping commiting of new version"
     else
-      mv "lib/ferret.rb.new", "lib/ferret.rb"
+      mv "lib/rferret.rb.new", "lib/rferret.rb"
     end
-    sh %{svn ci -m "Updated to version #{PKG_VERSION}" lib/ferret.rb}
+    sh %{svn ci -m "Updated to version #{PKG_VERSION}" lib/rferret.rb}
  end
 end

data/ext/analysis.c
ADDED
@@ -0,0 +1,553 @@
+#include <analysis.h>
+#include <string.h>
+#include <ctype.h>
+#include <hash.h>
+
+Token *tk_create()
+{
+  return ALLOC(Token);
+}
+
+void tk_destroy(void *p)
+{
+  free(p);
+}
+
+inline Token *tk_set(Token *tk, char *text, int tlen, int start, int end, int pos_inc)
+{
+  if (tlen >= MAX_WORD_SIZE) tlen = MAX_WORD_SIZE - 1;
+  memcpy(tk->text, text, sizeof(char) * tlen);
+  tk->text[tlen] = '\0';
+  tk->start = start;
+  tk->end = end;
+  tk->pos_inc = pos_inc;
+  return tk;
+}
+
+inline Token *tk_set_no_len(Token *tk, char *text, int start, int end, int pos_inc)
+{
+  return tk_set(tk, text, strlen(text), start, end, pos_inc);
+}
+
+int tk_eq(Token *tk1, Token *tk2)
+{
+  if (strcmp((char *)tk1->text, (char *)tk2->text) == 0 &&
+      tk1->start == tk2->start && tk1->end == tk2->end)
+    return true;
+  else
+    return false;
+}
+
+int tk_cmp(Token *tk1, Token *tk2)
+{
+  int cmp;
+  if (tk1->start > tk2->start) {
+    cmp = 1;
+  } else if (tk1->start < tk2->start) {
+    cmp = -1;
+  } else {
+    if (tk1->end > tk2->end) {
+      cmp = 1;
+    } else if (tk1->end < tk2->end) {
+      cmp = -1;
+    } else {
+      cmp = strcmp((char *)tk1->text, (char *)tk2->text);
+    }
+  }
+  return cmp;
+}
+
+void ts_standard_destroy(void *p)
+{
+  TokenStream *ts = (TokenStream *)p;
+  tk_destroy(ts->token);
+  free(p);
+}
+
+void ts_reset(TokenStream *ts, char *text)
+{
+  ts->text = text;
+  ts->pos = 0;
+}
+
+TokenStream *ts_create()
+{
+  TokenStream *ts = ALLOC(TokenStream);
+  ts->pos = -1;
+  ts->text = NULL;
+  ts->token = tk_create();
+  ts->destroy = &ts_standard_destroy;
+  ts->reset = &ts_reset;
+  return ts;
+}
+
+Token *wst_next(TokenStream *ts)
+{
+  int i = ts->pos;
+  int start, end;
+  char *text = ts->text;
+
+  while (text[i] != '\0' && isspace(text[i]))
+    i++;
+  if (text[i] == '\0')
+    return NULL;
+
+  start = i;
+  while (text[i] != '\0' && !isspace(text[i]))
+    i++;
+  ts->pos = end = i;
+  tk_set(ts->token, text+start, end-start, start, end, 1);
+  return ts->token;
+}
+
+TokenStream *whitespace_tokenizer_create()
+{
+  TokenStream *ts = ts_create();
+  ts->next = &wst_next;
+  return ts;
+}
+
+Token *lt_next(TokenStream *ts)
+{
+  int i = ts->pos;
+  int start, end;
+  char *text = ts->text;
+
+  while (text[i] != '\0' && !isalpha(text[i]))
+    i++;
+  if (text[i] == '\0')
+    return NULL;
+
+  start = i;
+  while (text[i] != '\0' && isalpha(text[i]))
+    i++;
+  ts->pos = end = i;
+  tk_set(ts->token, text+start, end-start, start, end, 1);
+  return ts->token;
+}
+
+TokenStream *letter_tokenizer_create()
+{
+  TokenStream *ts = ts_create();
+  ts->next = &lt_next;
+  return ts;
+}
+
+void a_standard_destroy(void *p)
+{
+  Analyzer *a = (Analyzer *)p;
+  ts_destroy(a->current_ts);
+  free(p);
+}
+
+TokenStream *a_standard_get_ts(Analyzer *a, char *field, char *text)
+{
+  a->current_ts->reset(a->current_ts, text);
+  return a->current_ts;
+}
+
+Analyzer *whitespace_analyzer_create()
+{
+  Analyzer *a = ALLOC(Analyzer);
+  a->data = NULL;
+  a->current_ts = whitespace_tokenizer_create();
+  a->destroy = &a_standard_destroy;
+  a->get_ts = &a_standard_get_ts;
+  return a;
+}
+
+int std_get_alpha(char *input, char *token)
+{
+  int i = 0;
+  while (input[i] != '\0' && isalpha(input[i])) {
+    token[i] = input[i];
+    i++;
+  }
+  return i;
+}
+
+int std_get_alnum(char *input, char *token)
+{
+  int i = 0;
+  while (input[i] != '\0' && isalnum(input[i])) {
+    token[i] = input[i];
+    i++;
+  }
+  return i;
+}
+
+int isnumpunc(char c)
+{
+  return (c == '.' || c == ',' || c == '\\' || c == '/' || c == '_' || c == '-');
+}
+
+int isurlpunc(char c)
+{
+  return (c == '.' || c == '/' || c == '-' || c == '_');
+}
+
+int isurlc(char c)
+{
+  return (c == '.' || c == '/' || c == '-' || c == '_' || isalnum(c));
+}
+
+int isurlxatpunc(char c)
+{
+  return (c == '.' || c == '/' || c == '-' || c == '_' || c == '@');
+}
+
+int isurlxatc(char c)
+{
+  return (c == '.' || c == '/' || c == '-' || c == '_' || c == '@' || isalnum(c));
+}
+
+int isstdtokchar(char c)
+{
+  if (isspace(c)) return false; // most common so check first.
+  if (isalnum(c) || isnumpunc(c) || c == '&' ||
+      c == '@' || c == '\'' || c == ':')
+    return true;
+  return false;
+}
+
+/* (alnum)((punc)(alnum))+ where every second sequence of alnum must contain at
+ * least one digit.
+ * (alnum) = [a-zA-Z0-9]
+ * (punc) = [_\/.,-]
+ */
+int std_get_number(char *input)
+{
+  int i = 0;
+  int count = 0;
+  int last_seen_digit = 2;
+  int seen_digit = false;
+
+  while (last_seen_digit >= 0) {
+    while ((input[i] != '\0') && isalnum(input[i])) {
+      if ((last_seen_digit < 2) && isdigit(input[i])) last_seen_digit = 2;
+      if ((seen_digit == false) && isdigit(input[i])) seen_digit = true;
+      i++;
+    }
+    last_seen_digit--;
+    if (!isnumpunc(input[i]) || !isalnum(input[i+1])) {
+
+      if (last_seen_digit >= 0)
+        count = i;
+      break;
+    }
+    count = i;
+    i++;
+  }
+  if (seen_digit)
+    return count;
+  else
+    return 0;
+}
+
+int std_get_apostrophe(char *input)
+{
+  int i = 0;
+
+  while (isalpha(input[i]) || input[i] == '\'')
+    i++;
+
+  return i;
+}
+
+int std_get_url(char *input, char *token)
+{
+  int i = 0;
+
+  while (isurlc(input[i])) {
+    if (isurlpunc(input[i]) && isurlpunc(input[i-1]))
+      break; // can't have two puncs in a row
+    token[i] = input[i];
+    i++;
+  }
+
+  // strip trailing puncs
+  while (isurlpunc(input[i-1])) i--;
+
+  return i;
+}
+
+/* Company names can contain '@' and '&' like AT&T and Excite@Home. Let's
+ */
+int std_get_company_name(char *input)
+{
+  int i = 0;
+  while (isalpha(input[i]) || input[i] == '@' || input[i] == '&')
+    i++;
+
+  return i;
+}
+
+Token *std_next(TokenStream *ts)
+{
+  int i = ts->pos, j;
+  int start;
+  char *text = ts->text;
+  char token[MAX_WORD_SIZE];
+  int token_i = 0;
+  int len;
+  int num_end = 0;
+  int is_acronym;
+  int seen_at_symbol;
+
+  while (text[i] != '\0' && !isalnum(text[i]))
+    i++;
+  if (text[i] == '\0')
+    return NULL;
+
+  start = i;
+  if (isdigit(text[i])) {
+    i += std_get_number(text + i);
+    ts->pos = i;
+    tk_set(ts->token, text+start, i - start, start, ts->pos, 1);
+  } else {
+    token_i = std_get_alpha(text + i, token);
+    i += token_i;
+
+    if (!isstdtokchar(text[i])) {
+      // very common case, ie a plain word, so check and return
+      tk_set(ts->token, text+start, i-start, start, i, 1);
+      ts->pos = i;
+      return ts->token;
+    }
+
+    if (text[i] == '\'') { // apostrophe case.
+      i += std_get_apostrophe(text + i);
+      ts->pos = i;
+      len = i - start;
+      // strip possessive
+      if ((text[i-1] == 's' || text[i-1] == 'S') && text[i-2] == '\'')
+        len -= 2;
+      tk_set(ts->token, text+start, len, start, i, 1);
+      return ts->token;
+    }
+    if (text[i] == '&') { // company name case.
+      i += std_get_company_name(text + i);
+      ts->pos = i;
+      tk_set(ts->token, text+start, i - start, start, i, 1);
+      return ts->token;
+    }
+
+    if (isdigit(text[i]) || isnumpunc(text[i])) { // possibly a number
+      num_end = start + std_get_number(text + start);
+      if (!isstdtokchar(text[num_end])) { // we won't find a longer token
+        ts->pos = num_end;
+        tk_set(ts->token, text+start, num_end-start, start, ts->pos, 1);
+        return ts->token;
+      }
+      // else there may be a longer token so check
+    }
+
+    if (text[i] == ':' && text[i+1] == '/' && text[i+2] == '/') {
+      // check for a known url start
+      token[token_i] = '\0';
+      i += 3;
+      while (text[i] == '/') i++;
+      if (isalpha(text[i]) &&
+          (strcmp(token, "ftp") == 0 ||
+           strcmp(token, "http") == 0 ||
+           strcmp(token, "https") == 0 ||
+           strcmp(token, "file") == 0)) {
+        len = std_get_url(text + i, token); // dispose of first part of the URL
+      } else { // still treat as url but keep the first part
+        token_i = i - start;
+        memcpy(token, text + start, token_i * sizeof(char));
+        len = token_i + std_get_url(text + i, token + token_i); // keep start
+      }
+      ts->pos = i + len;
+      token[len] = 0;
+      tk_set(ts->token, token, len, start, ts->pos, 1);
+      return ts->token;
+    }
+
+    // now see how long a url we can find.
+    is_acronym = true;
+    seen_at_symbol = false;
+    while (isurlxatc(text[i])) {
+      if (is_acronym && !isalpha(text[i]) && (text[i] != '.')) {
+        is_acronym = false;
+      }
+      if (isurlxatpunc(text[i]) && isurlxatpunc(text[i-1]))
+        break; // can't have two punctuation characters in a row
+      if (text[i] == '@') {
+        if (seen_at_symbol)
+          break; // we can only have one @ symbol
+        else
+          seen_at_symbol = true;
+      }
+      i++;
+    }
+    while (isurlxatpunc(text[i-1])) i--; // strip trailing punctuation
+    if (i > num_end) {
+      ts->pos = i;
+
+      if (is_acronym) { // check that it is one letter followed by one '.'
+        for (j = start; j < i-1; j++) {
+          if (isalpha(text[j]) && (text[j+1] != '.')) is_acronym = false;
+        }
+      }
+      if (is_acronym) { // strip '.'s
+        for (j = start + token_i; j < i; j++) {
+          if (text[j] != '.') {
+            token[token_i] = text[j];
+            token_i++;
+          }
+        }
+        tk_set(ts->token, token, token_i, start, ts->pos, 1);
+      } else { // just return the url as is
+        tk_set(ts->token, text+start, i-start, start, ts->pos, 1);
+      }
+    } else { // return the number
+      ts->pos = num_end;
+      tk_set(ts->token, text+start, num_end-start, start, ts->pos, 1);
+    }
+  }
+
+  return ts->token;
+}
+
+TokenStream *standard_tokenizer_create()
+{
+  TokenStream *ts = ts_create();
+  ts->next = &std_next;
+  return ts;
+}
+
+const char *ENGLISH_STOP_WORDS[] = {
+  "a", "an", "and", "are", "as", "at", "be", "but", "by",
+  "for", "if", "in", "into", "is", "it",
+  "no", "not", "of", "on", "or", "s", "such",
+  "t", "that", "the", "their", "then", "there", "these",
+  "they", "this", "to", "was", "will", "with"
+};
+
+void filter_reset(TokenStream *ts, char *text)
+{
+  ts->sub_ts->reset(ts->sub_ts, text);
+}
+
+void filter_destroy(void *p)
+{
+  TokenStream *ts = (TokenStream *)p;
+  ts->sub_ts->destroy(ts->sub_ts);
+  if (ts->token != NULL) tk_destroy(ts->token);
+  free(ts);
+}
+
+void sf_destroy(void *p)
+{
+  HshTable *words = (HshTable *)((TokenStream *)p)->data;
+  h_destroy(words);
+  filter_destroy(p);
+}
+
+Token *sf_next(TokenStream *ts)
+{
+  int pos_inc = 1;
+  HshTable *words = (HshTable *)ts->data;
+  Token *tk = ts->sub_ts->next(ts->sub_ts);
+  while ((tk != NULL) && (h_get(words, tk->text) != NULL)) {
+    tk = ts->sub_ts->next(ts->sub_ts);
+    pos_inc++;
+  }
+  if (tk != NULL) tk->pos_inc = pos_inc;
+  return tk;
+}
+
+TokenStream *stop_filter_create_with_words(TokenStream *ts, char **words, int len)
+{
+  int i;
+  TokenStream *tf = ALLOC(TokenStream);
+  tf->sub_ts = ts;
+  HshTable *wordtable = h_new_str(NULL, NULL);
+  for (i = 0; i < len; i++) {
+    h_set(wordtable, words[i], words[i]);
+  }
+  tf->data = wordtable;
+  tf->token = NULL;
+  tf->next = &sf_next;
+  tf->reset = &filter_reset;
+  tf->destroy = &sf_destroy;
+  return tf;
+}
+
+TokenStream *stop_filter_create(TokenStream *ts)
+{
+  return stop_filter_create_with_words(ts,
+      (char **)ENGLISH_STOP_WORDS, NELEMS(ENGLISH_STOP_WORDS));
+}
+
+Token *lcf_next(TokenStream *ts)
+{
+  int i = 0;
+  Token *tk = ts->sub_ts->next(ts->sub_ts);
+  if (tk == NULL) return tk;
+  while (tk->text[i] != '\0') {
+    tk->text[i] = tolower(tk->text[i]);
+    i++;
+  }
+  return tk;
+}
+
+TokenStream *lowercase_filter_create(TokenStream *ts)
+{
+  TokenStream *tf = ALLOC(TokenStream);
+  tf->token = NULL;
+  tf->next = &lcf_next;
+  tf->reset = &filter_reset;
+  tf->destroy = &filter_destroy;
+  tf->sub_ts = ts;
+  return tf;
+}
+
+Analyzer *letter_analyzer_create()
+{
+  Analyzer *a = ALLOC(Analyzer);
+  a->data = NULL;
+  a->current_ts = lowercase_filter_create(letter_tokenizer_create());
+  a->destroy = &a_standard_destroy;
+  a->get_ts = &a_standard_get_ts;
+  return a;
+}
+
+
+Analyzer *standard_analyzer_create_with_words(char **words, int len)
+{
+  Analyzer *a = ALLOC(Analyzer);
+  a->data = NULL;
+  a->current_ts =
+    stop_filter_create_with_words(
+        lowercase_filter_create(standard_tokenizer_create()), words, len);
+  a->destroy = &a_standard_destroy;
+  a->get_ts = &a_standard_get_ts;
+  return a;
+}
+
+Analyzer *standard_analyzer_create()
+{
+  return standard_analyzer_create_with_words(
+      (char **)ENGLISH_STOP_WORDS, NELEMS(ENGLISH_STOP_WORDS));
+}
+
+#ifdef ALONE
+int main(int argc, char **argv)
+{
+  char buf[10000];
+  Analyzer *a = standard_analyzer_create();
+  TokenStream *ts;
+  Token *tk;
+  while (fgets(buf, 9999, stdin) != NULL) {
+    ts = a->get_ts(a, "hello", buf);
+    ts->pos = 0;
+    while ((tk = ts->next(ts)) != NULL) {
+      printf("<%s:%ld:%ld> ", tk->text, tk->start, tk->end);
+    }
+    printf("\n");
+  }
+  return 0;
+}
+#endif
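The ALONE main above is the file's own smoke test for the analyzer chain. A slightly expanded sketch of the same pattern — hypothetical, assuming the same standalone build against analysis.h and its companion sources — drives standard_analyzer_create() over a fixed string instead of stdin and also prints pos_inc, which is where the stop filter's behavior shows up (sf_next() drops stop words and folds each skipped position into the next token's increment). The "body" field name is arbitrary; a_standard_get_ts() ignores it.

#include <stdio.h>
#include <analysis.h>  /* Analyzer, TokenStream, Token, standard_analyzer_create() */

int main(void)
{
    /* "The" and "at" are in ENGLISH_STOP_WORDS, so the tokens after them
     * ("quick" and the URL) should come out with pos_inc == 2. */
    char text[] = "The quick fox slept at http://example.com/den";
    Analyzer *a = standard_analyzer_create();
    TokenStream *ts = a->get_ts(a, "body", text);  /* resets the whole filter chain */
    Token *tk;

    while ((tk = ts->next(ts)) != NULL) {
        printf("%s [%ld,%ld) +%d\n",
               tk->text, (long)tk->start, (long)tk->end, (int)tk->pos_inc);
    }
    a->destroy(a);
    return 0;
}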