ferret 0.3.2 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +9 -0
- data/Rakefile +51 -25
- data/ext/analysis.c +553 -0
- data/ext/analysis.h +76 -0
- data/ext/array.c +83 -0
- data/ext/array.h +19 -0
- data/ext/bitvector.c +164 -0
- data/ext/bitvector.h +29 -0
- data/ext/compound_io.c +335 -0
- data/ext/document.c +336 -0
- data/ext/document.h +87 -0
- data/ext/ferret.c +88 -47
- data/ext/ferret.h +43 -109
- data/ext/field.c +395 -0
- data/ext/filter.c +103 -0
- data/ext/fs_store.c +352 -0
- data/ext/global.c +219 -0
- data/ext/global.h +73 -0
- data/ext/hash.c +446 -0
- data/ext/hash.h +80 -0
- data/ext/hashset.c +141 -0
- data/ext/hashset.h +37 -0
- data/ext/helper.c +11 -0
- data/ext/helper.h +5 -0
- data/ext/inc/lang.h +41 -0
- data/ext/ind.c +389 -0
- data/ext/index.h +884 -0
- data/ext/index_io.c +269 -415
- data/ext/index_rw.c +2543 -0
- data/ext/lang.c +31 -0
- data/ext/lang.h +41 -0
- data/ext/priorityqueue.c +228 -0
- data/ext/priorityqueue.h +44 -0
- data/ext/q_boolean.c +1331 -0
- data/ext/q_const_score.c +154 -0
- data/ext/q_fuzzy.c +287 -0
- data/ext/q_match_all.c +142 -0
- data/ext/q_multi_phrase.c +343 -0
- data/ext/q_parser.c +2180 -0
- data/ext/q_phrase.c +657 -0
- data/ext/q_prefix.c +75 -0
- data/ext/q_range.c +247 -0
- data/ext/q_span.c +1566 -0
- data/ext/q_term.c +308 -0
- data/ext/q_wildcard.c +146 -0
- data/ext/r_analysis.c +255 -0
- data/ext/r_doc.c +578 -0
- data/ext/r_index_io.c +996 -0
- data/ext/r_qparser.c +158 -0
- data/ext/r_search.c +2321 -0
- data/ext/r_store.c +263 -0
- data/ext/r_term.c +219 -0
- data/ext/ram_store.c +447 -0
- data/ext/search.c +524 -0
- data/ext/search.h +1065 -0
- data/ext/similarity.c +143 -39
- data/ext/sort.c +661 -0
- data/ext/store.c +35 -0
- data/ext/store.h +152 -0
- data/ext/term.c +704 -143
- data/ext/termdocs.c +599 -0
- data/ext/vector.c +594 -0
- data/lib/ferret.rb +9 -10
- data/lib/ferret/analysis/analyzers.rb +2 -2
- data/lib/ferret/analysis/standard_tokenizer.rb +1 -1
- data/lib/ferret/analysis/token.rb +14 -14
- data/lib/ferret/analysis/token_filters.rb +3 -3
- data/lib/ferret/document/field.rb +16 -17
- data/lib/ferret/index/document_writer.rb +4 -4
- data/lib/ferret/index/index.rb +39 -23
- data/lib/ferret/index/index_writer.rb +2 -2
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +1 -8
- data/lib/ferret/index/segment_term_vector.rb +4 -4
- data/lib/ferret/index/term.rb +5 -1
- data/lib/ferret/index/term_vector_offset_info.rb +6 -6
- data/lib/ferret/index/term_vectors_io.rb +5 -5
- data/lib/ferret/query_parser/query_parser.tab.rb +81 -77
- data/lib/ferret/search.rb +1 -1
- data/lib/ferret/search/boolean_query.rb +2 -1
- data/lib/ferret/search/field_sorted_hit_queue.rb +3 -3
- data/lib/ferret/search/fuzzy_query.rb +2 -1
- data/lib/ferret/search/index_searcher.rb +3 -0
- data/lib/ferret/search/{match_all_docs_query.rb → match_all_query.rb} +7 -7
- data/lib/ferret/search/multi_phrase_query.rb +6 -5
- data/lib/ferret/search/phrase_query.rb +3 -6
- data/lib/ferret/search/prefix_query.rb +4 -4
- data/lib/ferret/search/sort.rb +3 -1
- data/lib/ferret/search/sort_field.rb +9 -9
- data/lib/ferret/search/spans/near_spans_enum.rb +1 -1
- data/lib/ferret/search/spans/span_near_query.rb +1 -1
- data/lib/ferret/search/spans/span_weight.rb +1 -1
- data/lib/ferret/search/spans/spans_enum.rb +7 -7
- data/lib/ferret/store/fs_store.rb +10 -6
- data/lib/ferret/store/ram_store.rb +3 -3
- data/lib/rferret.rb +36 -0
- data/test/functional/thread_safety_index_test.rb +2 -2
- data/test/test_helper.rb +16 -2
- data/test/unit/analysis/c_token.rb +25 -0
- data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +1 -1
- data/test/unit/analysis/tc_standard_analyzer.rb +1 -1
- data/test/unit/document/{tc_document.rb → c_document.rb} +0 -0
- data/test/unit/document/c_field.rb +98 -0
- data/test/unit/document/tc_field.rb +0 -66
- data/test/unit/index/{tc_index.rb → c_index.rb} +62 -6
- data/test/unit/index/{tc_index_reader.rb → c_index_reader.rb} +51 -10
- data/test/unit/index/{tc_index_writer.rb → c_index_writer.rb} +0 -4
- data/test/unit/index/{tc_term.rb → c_term.rb} +1 -3
- data/test/unit/index/{tc_term_vector_offset_info.rb → c_term_voi.rb} +5 -5
- data/test/unit/index/tc_segment_term_vector.rb +2 -2
- data/test/unit/index/tc_term_vectors_io.rb +4 -4
- data/test/unit/query_parser/c_query_parser.rb +138 -0
- data/test/unit/search/{tc_filter.rb → c_filter.rb} +24 -24
- data/test/unit/search/{tc_fuzzy_query.rb → c_fuzzy_query.rb} +0 -0
- data/test/unit/search/{tc_index_searcher.rb → c_index_searcher.rb} +9 -26
- data/test/unit/search/{tc_search_and_sort.rb → c_search_and_sort.rb} +15 -15
- data/test/unit/search/{tc_sort.rb → c_sort.rb} +2 -1
- data/test/unit/search/c_sort_field.rb +27 -0
- data/test/unit/search/{tc_spans.rb → c_spans.rb} +0 -0
- data/test/unit/search/tc_sort_field.rb +7 -20
- data/test/unit/store/c_fs_store.rb +76 -0
- data/test/unit/store/c_ram_store.rb +35 -0
- data/test/unit/store/m_store.rb +34 -0
- data/test/unit/store/m_store_lock.rb +68 -0
- data/test/unit/store/tc_fs_store.rb +0 -53
- data/test/unit/store/tc_ram_store.rb +0 -20
- data/test/unit/store/tm_store.rb +0 -30
- data/test/unit/store/tm_store_lock.rb +0 -66
- metadata +84 -31
- data/ext/Makefile +0 -140
- data/ext/ferret_ext.so +0 -0
- data/ext/priority_queue.c +0 -232
- data/ext/ram_directory.c +0 -321
- data/ext/segment_merge_queue.c +0 -37
- data/ext/segment_term_enum.c +0 -326
- data/ext/string_helper.c +0 -42
- data/ext/tags +0 -344
- data/ext/term_buffer.c +0 -230
- data/ext/term_infos_reader.c +0 -54
- data/ext/terminfo.c +0 -160
- data/ext/token.c +0 -93
- data/ext/util.c +0 -12
data/CHANGELOG
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
20060316:
|
2
|
+
* changed Token#term_text to Token#text
|
3
|
+
* changed Token#position_increment to Token#pos_inc
|
4
|
+
* changed order of args to Token.new. Now Token.new(text, start_offset,
|
5
|
+
end_offset, pos_inc=1, type="text"). NOTE: type does nothing.
|
6
|
+
* changed TermVectorOffsetInfo#start_offset to TermVectorOffsetInfo#start
|
7
|
+
* changed TermVectorOffsetInfo#end_offset to TermVectorOffsetInfo#end
|
8
|
+
* added :id_field option to Index::Index class.
|
9
|
+
|
data/Rakefile
CHANGED
@@ -9,7 +9,7 @@ require 'rake/testtask'
|
|
9
9
|
require 'rake/rdoctask'
|
10
10
|
require 'rake/clean'
|
11
11
|
require 'rake_utils/code_statistics'
|
12
|
-
require 'lib/
|
12
|
+
require 'lib/rferret'
|
13
13
|
|
14
14
|
begin
|
15
15
|
require 'rubygems'
|
@@ -30,18 +30,32 @@ def announce(msg='')
|
|
30
30
|
end
|
31
31
|
|
32
32
|
$VERBOSE = nil
|
33
|
+
|
34
|
+
EXT = "ferret_ext.so"
|
35
|
+
EXT_SRC = FileList["src/*/*.[ch]"]
|
36
|
+
EXT_SRC_DEST = EXT_SRC.map {|fn| File.join("ext", File.basename(fn))}
|
37
|
+
SRC = (FileList["ext/*.[ch]"] + EXT_SRC_DEST).uniq
|
38
|
+
|
33
39
|
CLEAN.include(FileList['**/*.o', 'InstalledFiles', '.config'])
|
34
40
|
CLOBBER.include(FileList['**/*.so'], 'ext/Makefile')
|
35
41
|
|
36
42
|
task :default => :all_tests
|
37
43
|
desc "Run all tests"
|
38
|
-
task :all_tests => [ :
|
44
|
+
task :all_tests => [ :test_runits, :test_cunits, :test_functional ]
|
39
45
|
|
40
46
|
desc "Generate API documentation, and show coding stats"
|
41
47
|
task :doc => [ :stats, :appdoc ]
|
42
48
|
|
43
|
-
desc "run unit tests in test/unit"
|
44
|
-
Rake::TestTask.new("
|
49
|
+
desc "run unit tests in test/unit for pure ruby ferret"
|
50
|
+
Rake::TestTask.new("test_runits" => :parsers) do |t|
|
51
|
+
t.ruby_opts = ["-r 'lib/rferret'"]
|
52
|
+
t.libs << "test/unit"
|
53
|
+
t.pattern = 'test/unit/ts_*.rb'
|
54
|
+
t.verbose = true
|
55
|
+
end
|
56
|
+
|
57
|
+
desc "run unit tests in test/unit for C ferret"
|
58
|
+
Rake::TestTask.new("test_cunits" => :ext) do |t|
|
45
59
|
t.libs << "test/unit"
|
46
60
|
t.pattern = 'test/unit/t[cs]_*.rb'
|
47
61
|
t.verbose = true
|
@@ -84,22 +98,28 @@ rd = Rake::RDocTask.new("appdoc") do |rdoc|
|
|
84
98
|
rdoc.rdoc_files.include('lib/**/*.rb')
|
85
99
|
end
|
86
100
|
|
87
|
-
|
101
|
+
EXT_SRC.each do |fn|
|
102
|
+
dest_fn = File.join("ext", File.basename(fn))
|
103
|
+
file dest_fn => fn do |t|
|
104
|
+
cp fn, dest_fn
|
105
|
+
end
|
106
|
+
end
|
88
107
|
|
89
108
|
desc "Build the extension"
|
90
|
-
task :ext => "ext/#{EXT}"
|
109
|
+
task :ext => ["ext/#{EXT}"] + SRC
|
91
110
|
|
92
|
-
file "ext/#{EXT}" => "ext/Makefile" do
|
111
|
+
file "ext/#{EXT}" => ["ext/Makefile"] do
|
112
|
+
cp "ext/inc/lang.h", "ext/lang.h"
|
93
113
|
sh "cd ext; make"
|
94
114
|
end
|
95
115
|
|
96
|
-
file "ext/Makefile" do
|
116
|
+
file "ext/Makefile" => SRC do
|
97
117
|
sh "cd ext; ruby extconf.rb"
|
98
118
|
end
|
99
119
|
|
100
120
|
# Make Parsers ---------------------------------------------------------------
|
101
121
|
|
102
|
-
RACC_SRC = FileList["
|
122
|
+
RACC_SRC = FileList["lib/**/*.y"]
|
103
123
|
RACC_OUT = RACC_SRC.collect { |fn| fn.sub(/\.y$/, '.tab.rb') }
|
104
124
|
|
105
125
|
task :parsers => RACC_OUT
|
@@ -195,8 +215,9 @@ end
|
|
195
215
|
# Creating a release
|
196
216
|
|
197
217
|
desc "Make a new release"
|
198
|
-
task :prerelease => [:
|
199
|
-
task :
|
218
|
+
task :prerelease => [:all_tests, :clobber]
|
219
|
+
task :repackage => EXT_SRC_DEST
|
220
|
+
task :package => EXT_SRC_DEST
|
200
221
|
task :tag => [:prerelease]
|
201
222
|
task :update_version => [:prerelease]
|
202
223
|
task :release => [:tag, :update_version, :package] do
|
@@ -229,7 +250,7 @@ task :prerelease do
|
|
229
250
|
end
|
230
251
|
|
231
252
|
# Are all source files checked in?
|
232
|
-
data = `svn -q status`
|
253
|
+
data = `svn -q --ignore-externals status`
|
233
254
|
unless data =~ /^$/
|
234
255
|
fail "'svn -q status' is not clean ... do you have unchecked-in files?"
|
235
256
|
end
|
@@ -237,28 +258,33 @@ task :prerelease do
|
|
237
258
|
announce "No outstanding checkins found ... OK"
|
238
259
|
end
|
239
260
|
|
261
|
+
# Copies the file +fn+ line by line into "<fn>.new", replacing any line that
# matches the VERSION assignment pattern with one carrying PKG_VERSION.
# The original file is left untouched; moving the ".new" file into place is
# up to the caller.
def reversion(fn)
  open(fn) do |ferret_in|
    open(fn + ".new", "w") do |ferret_out|
      ferret_in.each do |line|
        is_version_line = (line =~ /^ VERSION\s*=\s*/)
        ferret_out.puts(is_version_line ? " VERSION = '#{PKG_VERSION}'" : line)
      end
    end
  end
end
|
274
|
+
|
240
275
|
task :update_version => [:prerelease] do
|
241
276
|
if PKG_VERSION == CURRENT_VERSION
|
242
277
|
announce "No version change ... skipping version update"
|
243
278
|
else
|
244
279
|
announce "Updating Ferret version to #{PKG_VERSION}"
|
245
|
-
|
246
|
-
|
247
|
-
ferret_in.each do |line|
|
248
|
-
if line =~ /^ VERSION\s*=\s*/
|
249
|
-
ferret_out.puts " VERSION = '#{PKG_VERSION}'"
|
250
|
-
else
|
251
|
-
ferret_out.puts line
|
252
|
-
end
|
253
|
-
end
|
254
|
-
end
|
255
|
-
end
|
280
|
+
reversion("lib/ferret.rb")
|
281
|
+
reversion("lib/rferret.rb")
|
256
282
|
if ENV['RELTEST']
|
257
283
|
announce "Release Task Testing, skipping commiting of new version"
|
258
284
|
else
|
259
|
-
mv "lib/
|
285
|
+
mv "lib/rferret.rb.new", "lib/rferret.rb"
|
260
286
|
end
|
261
|
-
sh %{svn ci -m "Updated to version #{PKG_VERSION}" lib/
|
287
|
+
sh %{svn ci -m "Updated to version #{PKG_VERSION}" lib/rferret.rb}
|
262
288
|
end
|
263
289
|
end
|
264
290
|
|
data/ext/analysis.c
ADDED
@@ -0,0 +1,553 @@
|
|
1
|
+
#include <analysis.h>
|
2
|
+
#include <string.h>
|
3
|
+
#include <ctype.h>
|
4
|
+
#include <hash.h>
|
5
|
+
|
6
|
+
Token *tk_create()
|
7
|
+
{
|
8
|
+
return ALLOC(Token);
|
9
|
+
}
|
10
|
+
|
11
|
+
/* Release a token with free(). The parameter is an untyped pointer, so the
 * function can be passed wherever a generic destructor is expected. */
void tk_destroy(void *p)
{
  free(p);
}
|
15
|
+
|
16
|
+
/* Fill in a token.
 *
 * Copies tlen characters from text into tk->text, NUL-terminates the copy and
 * stores the start/end offsets and the position increment. Returns tk.
 *
 * tlen is clamped into [0, MAX_WORD_SIZE - 1]: overlong text is truncated,
 * and a negative length (which length arithmetic in the tokenizers can
 * produce) yields an empty token instead of being handed to memcpy() as a
 * huge unsigned size.
 * NOTE(review): assumes tk->text holds MAX_WORD_SIZE characters - confirm
 * against the Token declaration.
 */
inline Token *tk_set(Token *tk, char *text, int tlen, int start, int end, int pos_inc)
{
  if (tlen < 0) tlen = 0;
  if (tlen >= MAX_WORD_SIZE) tlen = MAX_WORD_SIZE - 1;
  memcpy(tk->text, text, sizeof(char) * (size_t)tlen);
  tk->text[tlen] = '\0';
  tk->start = start;
  tk->end = end;
  tk->pos_inc = pos_inc;
  return tk;
}
|
26
|
+
|
27
|
+
/* Same as tk_set() for a NUL-terminated text: the length is measured with
 * strlen() and everything else is forwarded unchanged. */
inline Token *tk_set_no_len(Token *tk, char *text, int start, int end, int pos_inc)
{
  int text_len = strlen(text);
  return tk_set(tk, text, text_len, start, end, pos_inc);
}
|
31
|
+
|
32
|
+
/* Two tokens are equal when their start offsets, end offsets and texts all
 * match. pos_inc is not part of the comparison. Returns true or false. */
int tk_eq(Token *tk1, Token *tk2)
{
  if (tk1->start != tk2->start) return false;
  if (tk1->end != tk2->end) return false;
  if (strcmp((char *)tk1->text, (char *)tk2->text) != 0) return false;
  return true;
}
|
40
|
+
|
41
|
+
/* Three-way comparison of two tokens: ordered by start offset, then by end
 * offset, then by strcmp() of the texts. Returns 1 / -1 for the offset
 * comparisons and the raw strcmp() result for the text comparison. */
int tk_cmp(Token *tk1, Token *tk2)
{
  if (tk1->start != tk2->start) {
    return (tk1->start > tk2->start) ? 1 : -1;
  }
  if (tk1->end != tk2->end) {
    return (tk1->end > tk2->end) ? 1 : -1;
  }
  return strcmp((char *)tk1->text, (char *)tk2->text);
}
|
59
|
+
|
60
|
+
void ts_standard_destroy(void *p)
|
61
|
+
{
|
62
|
+
TokenStream *ts = (TokenStream *)p;
|
63
|
+
tk_destroy(ts->token);
|
64
|
+
free(p);
|
65
|
+
}
|
66
|
+
|
67
|
+
/* Point the stream at a new text and rewind the read position to its first
 * character. The text pointer is stored, not copied. */
void ts_reset(TokenStream *ts, char *text)
{
  ts->pos = 0;
  ts->text = text;
}
|
72
|
+
|
73
|
+
TokenStream *ts_create()
|
74
|
+
{
|
75
|
+
TokenStream *ts = ALLOC(TokenStream);
|
76
|
+
ts->pos = -1;
|
77
|
+
ts->text = NULL;
|
78
|
+
ts->token = tk_create();
|
79
|
+
ts->destroy = &ts_standard_destroy;
|
80
|
+
ts->reset = &ts_reset;
|
81
|
+
return ts;
|
82
|
+
}
|
83
|
+
|
84
|
+
/* Whitespace tokenizer step.
 *
 * Skips whitespace from ts->pos, then takes every character up to the next
 * whitespace or the end of the text as one token. Returns the stream's own
 * token object (overwritten on every call), or NULL when only whitespace or
 * nothing is left. ts->pos is advanced past the token.
 *
 * Characters are cast to unsigned char before being passed to isspace():
 * plain char may be negative for non-ASCII bytes, which is undefined
 * behaviour for the <ctype.h> functions.
 */
Token *wst_next(TokenStream *ts)
{
  char *text = ts->text;
  int i = ts->pos;
  int start, end;

  while (text[i] != '\0' && isspace((unsigned char)text[i])) {
    i++;
  }
  if (text[i] == '\0') {
    return NULL;
  }

  start = i;
  while (text[i] != '\0' && !isspace((unsigned char)text[i])) {
    i++;
  }
  ts->pos = end = i;
  tk_set(ts->token, text + start, end - start, start, end, 1);
  return ts->token;
}
|
102
|
+
|
103
|
+
TokenStream *whitespace_tokenizer_create()
|
104
|
+
{
|
105
|
+
TokenStream *ts = ts_create();
|
106
|
+
ts->next = &wst_next;
|
107
|
+
return ts;
|
108
|
+
}
|
109
|
+
|
110
|
+
/* Letter tokenizer step.
 *
 * Skips everything that is not alphabetic from ts->pos, then takes the
 * following run of alphabetic characters as one token. Returns the stream's
 * own token object (overwritten on every call), or NULL when no letters are
 * left. ts->pos is advanced past the token.
 *
 * Characters are cast to unsigned char before being passed to isalpha():
 * plain char may be negative for non-ASCII bytes, which is undefined
 * behaviour for the <ctype.h> functions.
 */
Token *lt_next(TokenStream *ts)
{
  char *text = ts->text;
  int i = ts->pos;
  int start, end;

  while (text[i] != '\0' && !isalpha((unsigned char)text[i])) {
    i++;
  }
  if (text[i] == '\0') {
    return NULL;
  }

  start = i;
  while (text[i] != '\0' && isalpha((unsigned char)text[i])) {
    i++;
  }
  ts->pos = end = i;
  tk_set(ts->token, text + start, end - start, start, end, 1);
  return ts->token;
}
|
128
|
+
|
129
|
+
TokenStream *letter_tokenizer_create()
|
130
|
+
{
|
131
|
+
TokenStream *ts = ts_create();
|
132
|
+
ts->next = <_next;
|
133
|
+
return ts;
|
134
|
+
}
|
135
|
+
|
136
|
+
void a_standard_destroy(void *p)
|
137
|
+
{
|
138
|
+
Analyzer *a = (Analyzer *)p;
|
139
|
+
ts_destroy(a->current_ts);
|
140
|
+
free(p);
|
141
|
+
}
|
142
|
+
|
143
|
+
/* Default get_ts callback: resets the analyzer's single token stream onto
 * text and returns that same stream. The field argument is not consulted. */
TokenStream *a_standard_get_ts(Analyzer *a, char *field, char *text)
{
  TokenStream *stream = a->current_ts;

  stream->reset(stream, text);
  return stream;
}
|
148
|
+
|
149
|
+
Analyzer *whitespace_analyzer_create()
|
150
|
+
{
|
151
|
+
Analyzer *a = ALLOC(Analyzer);
|
152
|
+
a->data = NULL;
|
153
|
+
a->current_ts = whitespace_tokenizer_create();
|
154
|
+
a->destroy = &a_standard_destroy;
|
155
|
+
a->get_ts = &a_standard_get_ts;
|
156
|
+
return a;
|
157
|
+
}
|
158
|
+
|
159
|
+
/* Copy the leading run of alphabetic characters from input into token and
 * return its length.
 *
 * token is NOT NUL-terminated and no bound is applied to the copy: the
 * caller must supply a buffer at least as long as the run.
 *
 * Characters are cast to unsigned char before isalpha(): passing a negative
 * plain char to a <ctype.h> function is undefined behaviour.
 */
int std_get_alpha(char *input, char *token)
{
  int i = 0;

  while (input[i] != '\0' && isalpha((unsigned char)input[i])) {
    token[i] = input[i];
    i++;
  }
  return i;
}
|
168
|
+
|
169
|
+
/* Copy the leading run of alphanumeric characters from input into token and
 * return its length.
 *
 * token is NOT NUL-terminated and no bound is applied to the copy: the
 * caller must supply a buffer at least as long as the run.
 *
 * Characters are cast to unsigned char before isalnum(): passing a negative
 * plain char to a <ctype.h> function is undefined behaviour.
 */
int std_get_alnum(char *input, char *token)
{
  int i = 0;

  while (input[i] != '\0' && isalnum((unsigned char)input[i])) {
    token[i] = input[i];
    i++;
  }
  return i;
}
|
178
|
+
|
179
|
+
/* Returns 1 when c is one of the punctuation characters allowed inside a
 * number token: . , \ / _ -   and 0 otherwise. */
int isnumpunc(char c)
{
  switch (c) {
    case '.':
    case ',':
    case '\\':
    case '/':
    case '_':
    case '-':
      return 1;
    default:
      return 0;
  }
}
|
183
|
+
|
184
|
+
/* Returns 1 when c is one of the punctuation characters allowed inside a
 * URL token: . / - _   and 0 otherwise. */
int isurlpunc(char c)
{
  switch (c) {
    case '.':
    case '/':
    case '-':
    case '_':
      return 1;
    default:
      return 0;
  }
}
|
188
|
+
|
189
|
+
/* Returns 1 when c may appear in a URL token, i.e. it is alphanumeric or one
 * of . / - _   and 0 otherwise.
 *
 * c is cast to unsigned char before isalnum(): passing a negative plain char
 * to a <ctype.h> function is undefined behaviour.
 */
int isurlc(char c)
{
  return (c == '.' || c == '/' || c == '-' || c == '_' ||
          isalnum((unsigned char)c));
}
|
193
|
+
|
194
|
+
/* Returns 1 when c is URL punctuation or the at-sign: . / - _ @
 * and 0 otherwise. */
int isurlxatpunc(char c)
{
  switch (c) {
    case '.':
    case '/':
    case '-':
    case '_':
    case '@':
      return 1;
    default:
      return 0;
  }
}
|
198
|
+
|
199
|
+
/* Returns 1 when c may appear in a URL-or-email token, i.e. it is
 * alphanumeric or one of . / - _ @   and 0 otherwise.
 *
 * c is cast to unsigned char before isalnum(): passing a negative plain char
 * to a <ctype.h> function is undefined behaviour.
 */
int isurlxatc(char c)
{
  return (c == '.' || c == '/' || c == '-' || c == '_' || c == '@' ||
          isalnum((unsigned char)c));
}
|
203
|
+
|
204
|
+
int isstdtokchar(char c)
|
205
|
+
{
|
206
|
+
if (isspace(c)) return false; // most common so check first.
|
207
|
+
if (isalnum(c) || isnumpunc(c) || c == '&' ||
|
208
|
+
c == '@' || c == '\'' || c == ':')
|
209
|
+
return true;
|
210
|
+
return false;
|
211
|
+
}
|
212
|
+
|
213
|
+
/* (alnum)((punc)(alnum))+ where every second sequence of alnum must contain at
|
214
|
+
* least one digit.
|
215
|
+
* (alnum) = [a-zA-Z0-9]
|
216
|
+
* (punc) = [_\/.,-]
|
217
|
+
*/
|
218
|
+
int std_get_number(char *input)
|
219
|
+
{
|
220
|
+
int i = 0;
|
221
|
+
int count = 0;
|
222
|
+
int last_seen_digit = 2;
|
223
|
+
int seen_digit = false;
|
224
|
+
|
225
|
+
while (last_seen_digit >= 0) {
|
226
|
+
while ((input[i] != '\0') && isalnum(input[i])) {
|
227
|
+
if ((last_seen_digit < 2) && isdigit(input[i])) last_seen_digit = 2;
|
228
|
+
if ((seen_digit == false) && isdigit(input[i])) seen_digit = true;
|
229
|
+
i++;
|
230
|
+
}
|
231
|
+
last_seen_digit--;
|
232
|
+
if (!isnumpunc(input[i]) || !isalnum(input[i+1])) {
|
233
|
+
|
234
|
+
if (last_seen_digit >= 0)
|
235
|
+
count = i;
|
236
|
+
break;
|
237
|
+
}
|
238
|
+
count = i;
|
239
|
+
i++;
|
240
|
+
}
|
241
|
+
if (seen_digit)
|
242
|
+
return count;
|
243
|
+
else
|
244
|
+
return 0;
|
245
|
+
}
|
246
|
+
|
247
|
+
/* Returns the length of the leading run of input made up of alphabetic
 * characters and apostrophes (e.g. the "'s" or "'ll" tail of a word).
 *
 * Characters are cast to unsigned char before isalpha(): passing a negative
 * plain char to a <ctype.h> function is undefined behaviour.
 */
int std_get_apostrophe(char *input)
{
  int i = 0;

  while (isalpha((unsigned char)input[i]) || input[i] == '\'')
    i++;

  return i;
}
|
256
|
+
|
257
|
+
/* Copy the URL body at the start of input into token and return its length.
 *
 * Characters are accepted while isurlc() holds and no two punctuation
 * characters are adjacent; trailing punctuation is then dropped from the
 * returned length (the characters already copied into token are left in
 * place). token is NOT NUL-terminated and no bound is applied to the copy:
 * the caller must supply a large enough buffer.
 *
 * The function never looks at input[-1]. input is taken to start right after
 * a separator, so a leading punctuation character counts as the second
 * punctuation in a row and ends the scan immediately. The returned length is
 * never negative.
 */
int std_get_url(char *input, char *token)
{
  int i = 0;

  while (isurlc(input[i])) {
    if (isurlpunc(input[i]) && (i == 0 || isurlpunc(input[i-1])))
      break; // can't have two puncs in a row
    token[i] = input[i];
    i++;
  }

  // strip trailing puncs, without walking back past the start of input
  while (i > 0 && isurlpunc(input[i-1])) i--;

  return i;
}
|
273
|
+
|
274
|
+
/* Company names can contain '@' and '&' like AT&T and Excite@Home.
 *
 * Returns the length of the leading run of input made up of alphabetic
 * characters, '@' and '&'.
 *
 * Characters are cast to unsigned char before isalpha(): passing a negative
 * plain char to a <ctype.h> function is undefined behaviour.
 */
int std_get_company_name(char *input)
{
  int i = 0;

  while (isalpha((unsigned char)input[i]) || input[i] == '@' || input[i] == '&')
    i++;

  return i;
}
|
284
|
+
|
285
|
+
Token *std_next(TokenStream *ts)
|
286
|
+
{
|
287
|
+
int i = ts->pos, j;
|
288
|
+
int start;
|
289
|
+
char *text = ts->text;
|
290
|
+
char token[MAX_WORD_SIZE];
|
291
|
+
int token_i = 0;
|
292
|
+
int len;
|
293
|
+
int num_end = 0;
|
294
|
+
int is_acronym;
|
295
|
+
int seen_at_symbol;
|
296
|
+
|
297
|
+
while (text[i] != '\0' && !isalnum(text[i]))
|
298
|
+
i++;
|
299
|
+
if (text[i] == '\0')
|
300
|
+
return NULL;
|
301
|
+
|
302
|
+
start = i;
|
303
|
+
if (isdigit(text[i])) {
|
304
|
+
i += std_get_number(text + i);
|
305
|
+
ts->pos = i;
|
306
|
+
tk_set(ts->token, text+start, i - start, start, ts->pos, 1);
|
307
|
+
} else {
|
308
|
+
token_i = std_get_alpha(text + i, token);
|
309
|
+
i += token_i;
|
310
|
+
|
311
|
+
if (!isstdtokchar(text[i])) {
|
312
|
+
// very common case, ie a plain word, so check and return
|
313
|
+
tk_set(ts->token, text+start, i-start, start, i, 1);
|
314
|
+
ts->pos = i;
|
315
|
+
return ts->token;
|
316
|
+
}
|
317
|
+
|
318
|
+
if (text[i] == '\'') { // apostrophe case.
|
319
|
+
i += std_get_apostrophe(text + i);
|
320
|
+
ts->pos = i;
|
321
|
+
len = i - start;
|
322
|
+
// strip possesive
|
323
|
+
if ((text[i-1] == 's' || text[i-1] == 'S') && text[i-2] == '\'')
|
324
|
+
len -= 2;
|
325
|
+
tk_set(ts->token, text+start, len, start, i, 1);
|
326
|
+
return ts->token;
|
327
|
+
}
|
328
|
+
if (text[i] == '&') { // apostrophe case.
|
329
|
+
i += std_get_company_name(text + i);
|
330
|
+
ts->pos = i;
|
331
|
+
tk_set(ts->token, text+start, i - start, start, i, 1);
|
332
|
+
return ts->token;
|
333
|
+
}
|
334
|
+
|
335
|
+
if (isdigit(text[i]) || isnumpunc(text[i])) { // possibly a number
|
336
|
+
num_end = start + std_get_number(text + start);
|
337
|
+
if (!isstdtokchar(text[num_end])) { // we won't find a longer token
|
338
|
+
ts->pos = num_end;
|
339
|
+
tk_set(ts->token, text+start, num_end-start, start, ts->pos, 1);
|
340
|
+
return ts->token;
|
341
|
+
}
|
342
|
+
// else there may be a longer token so check
|
343
|
+
}
|
344
|
+
|
345
|
+
if (text[i] == ':' && text[i+1] == '/' && text[i+2] == '/') {
|
346
|
+
// check for a known url start
|
347
|
+
token[token_i] = '\0';
|
348
|
+
i += 3;
|
349
|
+
while (text[i] == '/') i++;
|
350
|
+
if (isalpha(text[i]) &&
|
351
|
+
(strcmp(token, "ftp") == 0 ||
|
352
|
+
strcmp(token, "http") == 0 ||
|
353
|
+
strcmp(token, "https") == 0 ||
|
354
|
+
strcmp(token, "file") == 0)) {
|
355
|
+
len = std_get_url(text + i, token); // dispose of first part of the URL
|
356
|
+
} else { //still treat as url but keep the first part
|
357
|
+
token_i = i - start;
|
358
|
+
memcpy(token, text + start, token_i * sizeof(char));
|
359
|
+
len = token_i + std_get_url(text + i, token + token_i); // keep start
|
360
|
+
}
|
361
|
+
ts->pos = i + len;
|
362
|
+
token[len] = 0;
|
363
|
+
tk_set(ts->token, token, len, start, ts->pos, 1);
|
364
|
+
return ts->token;
|
365
|
+
}
|
366
|
+
|
367
|
+
// now see how int a url we can find.
|
368
|
+
is_acronym = true;
|
369
|
+
seen_at_symbol = false;
|
370
|
+
while (isurlxatc(text[i])) {
|
371
|
+
if (is_acronym && !isalpha(text[i]) && (text[i] != '.')) {
|
372
|
+
is_acronym = false;
|
373
|
+
}
|
374
|
+
if (isurlxatpunc(text[i]) && isurlxatpunc(text[i-1]))
|
375
|
+
break; // can't have to punctuation characters in a row
|
376
|
+
if (text[i] == '@') {
|
377
|
+
if (seen_at_symbol)
|
378
|
+
break; // we can only have one @ symbol
|
379
|
+
else
|
380
|
+
seen_at_symbol = true;
|
381
|
+
}
|
382
|
+
i++;
|
383
|
+
}
|
384
|
+
while (isurlxatpunc(text[i-1])) i--; // strip trailing punctuation
|
385
|
+
if (i > num_end) {
|
386
|
+
ts->pos = i;
|
387
|
+
|
388
|
+
if (is_acronym) { // check that it is one letter followed by one '.'
|
389
|
+
for (j = start; j < i-1; j++) {
|
390
|
+
if (isalpha(text[j]) && (text[j+1] != '.')) is_acronym = false;
|
391
|
+
}
|
392
|
+
}
|
393
|
+
if (is_acronym) {// strip '.'s
|
394
|
+
for (j = start + token_i; j < i; j++) {
|
395
|
+
if (text[j] != '.') {
|
396
|
+
token[token_i] = text[j];
|
397
|
+
token_i++;
|
398
|
+
}
|
399
|
+
}
|
400
|
+
tk_set(ts->token, token, token_i, start, ts->pos, 1);
|
401
|
+
} else { // just return the url as is
|
402
|
+
tk_set(ts->token, text+start, i-start, start, ts->pos, 1);
|
403
|
+
}
|
404
|
+
} else { // return the number
|
405
|
+
ts->pos = num_end;
|
406
|
+
tk_set(ts->token, text+start, num_end-start, start, ts->pos, 1);
|
407
|
+
}
|
408
|
+
}
|
409
|
+
|
410
|
+
return ts->token;
|
411
|
+
}
|
412
|
+
|
413
|
+
TokenStream *standard_tokenizer_create()
|
414
|
+
{
|
415
|
+
TokenStream *ts = ts_create();
|
416
|
+
ts->next = &std_next;
|
417
|
+
return ts;
|
418
|
+
}
|
419
|
+
|
420
|
+
/* Default English stop-word list used by stop_filter_create() and
 * standard_analyzer_create(). Entries are lower case. */
const char *ENGLISH_STOP_WORDS[] = {
  "a", "an", "and", "are", "as",
  "at", "be", "but", "by", "for",
  "if", "in", "into", "is", "it",
  "no", "not", "of", "on", "or",
  "s", "such", "t", "that", "the",
  "their", "then", "there", "these", "they",
  "this", "to", "was", "will", "with"
};
|
427
|
+
|
428
|
+
/* reset callback for filters: forwards the new text to the wrapped stream's
 * own reset callback. The filter itself stores nothing. */
void filter_reset(TokenStream *ts, char *text)
{
  TokenStream *inner = ts->sub_ts;

  inner->reset(inner, text);
}
|
432
|
+
|
433
|
+
void filter_destroy(void *p)
|
434
|
+
{
|
435
|
+
TokenStream *ts = (TokenStream *)p;
|
436
|
+
ts->sub_ts->destroy(ts->sub_ts);
|
437
|
+
if (ts->token != NULL) tk_destroy(ts->token);
|
438
|
+
free(ts);
|
439
|
+
}
|
440
|
+
|
441
|
+
void sf_destroy(void *p)
|
442
|
+
{
|
443
|
+
HshTable *words = (HshTable *)((TokenStream *)p)->data;
|
444
|
+
h_destroy(words);
|
445
|
+
filter_destroy(p);
|
446
|
+
}
|
447
|
+
|
448
|
+
/* Stop filter step: pulls tokens from the wrapped stream and drops every one
 * whose text is found in the stop-word table. The first surviving token is
 * returned with pos_inc set to one plus the number of tokens dropped before
 * it. Returns NULL when the wrapped stream runs out. */
Token *sf_next(TokenStream *ts)
{
  HshTable *stop_words = (HshTable *)ts->data;
  Token *tk;
  int skipped = 0;

  for (tk = ts->sub_ts->next(ts->sub_ts);
       tk != NULL;
       tk = ts->sub_ts->next(ts->sub_ts)) {
    if (h_get(stop_words, tk->text) == NULL) {
      tk->pos_inc = skipped + 1;
      return tk;
    }
    skipped++;
  }
  return NULL;
}
|
460
|
+
|
461
|
+
/* Wrap ts in a stop filter that drops the len words given in words.
 *
 * Each word is entered into a string hash table as both key and value; the
 * strings themselves are not copied here. The filter owns no token of its
 * own (token is NULL) and is torn down by sf_destroy.
 */
TokenStream *stop_filter_create_with_words(TokenStream *ts, char **words, int len)
{
  TokenStream *filter = ALLOC(TokenStream);
  HshTable *stop_words;
  int n;

  filter->sub_ts = ts;

  stop_words = h_new_str(NULL, NULL);
  for (n = 0; n < len; n++) {
    h_set(stop_words, words[n], words[n]);
  }

  filter->data = stop_words;
  filter->token = NULL;
  filter->next = &sf_next;
  filter->reset = &filter_reset;
  filter->destroy = &sf_destroy;
  return filter;
}
|
477
|
+
|
478
|
+
/* Wrap ts in a stop filter that uses the built-in ENGLISH_STOP_WORDS list. */
TokenStream *stop_filter_create(TokenStream *ts)
{
  char **default_words = (char **)ENGLISH_STOP_WORDS;
  int word_count = NELEMS(ENGLISH_STOP_WORDS);

  return stop_filter_create_with_words(ts, default_words, word_count);
}
|
483
|
+
|
484
|
+
/* Lowercase filter step: fetches the next token from the wrapped stream and
 * lower-cases its text in place. Returns that token, or NULL when the wrapped
 * stream runs out.
 *
 * Each character is cast to unsigned char before tolower(): passing a
 * negative plain char to a <ctype.h> function is undefined behaviour.
 */
Token *lcf_next(TokenStream *ts)
{
  int i;
  Token *tk = ts->sub_ts->next(ts->sub_ts);

  if (tk == NULL) return tk;
  for (i = 0; tk->text[i] != '\0'; i++) {
    tk->text[i] = tolower((unsigned char)tk->text[i]);
  }
  return tk;
}
|
495
|
+
|
496
|
+
/* Wrap ts in a filter that lower-cases every token it passes through. The
 * filter owns no token of its own (token is NULL) and uses the common filter
 * reset/destroy callbacks. */
TokenStream *lowercase_filter_create(TokenStream *ts)
{
  TokenStream *filter = ALLOC(TokenStream);

  filter->sub_ts = ts;
  filter->token = NULL;
  filter->next = &lcf_next;
  filter->reset = &filter_reset;
  filter->destroy = &filter_destroy;
  return filter;
}
|
506
|
+
|
507
|
+
Analyzer *letter_analyzer_create()
|
508
|
+
{
|
509
|
+
Analyzer *a = ALLOC(Analyzer);
|
510
|
+
a->data = NULL;
|
511
|
+
a->current_ts = lowercase_filter_create(letter_tokenizer_create());
|
512
|
+
a->destroy = &a_standard_destroy;
|
513
|
+
a->get_ts = &a_standard_get_ts;
|
514
|
+
return a;
|
515
|
+
}
|
516
|
+
|
517
|
+
|
518
|
+
Analyzer *standard_analyzer_create_with_words(char **words, int len)
|
519
|
+
{
|
520
|
+
Analyzer *a = ALLOC(Analyzer);
|
521
|
+
a->data = NULL;
|
522
|
+
a->current_ts =
|
523
|
+
stop_filter_create_with_words(
|
524
|
+
lowercase_filter_create(standard_tokenizer_create()), words, len);
|
525
|
+
a->destroy = &a_standard_destroy;
|
526
|
+
a->get_ts = &a_standard_get_ts;
|
527
|
+
return a;
|
528
|
+
}
|
529
|
+
|
530
|
+
Analyzer *standard_analyzer_create()
|
531
|
+
{
|
532
|
+
return standard_analyzer_create_with_words(
|
533
|
+
(char **)ENGLISH_STOP_WORDS, NELEMS(ENGLISH_STOP_WORDS));
|
534
|
+
}
|
535
|
+
|
536
|
+
#ifdef ALONE
/* Stand-alone driver, built only when ALONE is defined: reads lines from
 * stdin, runs each through a standard analyzer and prints every token as
 * <text:start:end>, one output line per input line. */
int main(int argc, char **argv)
{
  char buf[10000];
  Analyzer *a = standard_analyzer_create();
  TokenStream *ts;
  Token *tk;

  (void)argc;
  (void)argv;

  /* the read size follows the buffer instead of repeating a literal */
  while (fgets(buf, sizeof(buf), stdin) != NULL) {
    ts = a->get_ts(a, "hello", buf);
    ts->pos = 0;
    while ((tk = ts->next(ts)) != NULL) {
      printf("<%s:%ld:%ld> ", tk->text, tk->start, tk->end);
    }
    printf("\n");
  }

  /* release the analyzer (and with it its token stream) before exiting */
  a->destroy(a);
  return 0;
}
#endif
|