ferret 0.9.1 → 0.9.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README +6 -5
- data/Rakefile +34 -13
- data/TODO +1 -0
- data/TUTORIAL +1 -1
- data/ext/analysis.c +87 -70
- data/ext/analysis.h +18 -6
- data/ext/array.c +1 -2
- data/ext/array.h +1 -1
- data/ext/bitvector.c +10 -6
- data/ext/bitvector.h +2 -2
- data/ext/compound_io.c +30 -27
- data/ext/document.c +15 -15
- data/ext/document.h +5 -5
- data/ext/except.c +2 -0
- data/ext/except.h +25 -23
- data/ext/extconf.rb +1 -0
- data/ext/ferret.c +10 -8
- data/ext/ferret.h +9 -8
- data/ext/field.c +29 -25
- data/ext/filter.c +52 -14
- data/ext/frtio.h +13 -0
- data/ext/fs_store.c +115 -170
- data/ext/global.c +9 -8
- data/ext/global.h +17 -13
- data/ext/hash.c +13 -19
- data/ext/hash.h +11 -11
- data/ext/hashset.c +5 -7
- data/ext/hashset.h +9 -8
- data/ext/helper.c +1 -1
- data/ext/helper.h +2 -1
- data/ext/inc/except.h +25 -23
- data/ext/inc/lang.h +11 -1
- data/ext/ind.c +33 -21
- data/ext/index.h +44 -39
- data/ext/index_io.c +61 -57
- data/ext/index_rw.c +418 -361
- data/ext/lang.c +10 -0
- data/ext/lang.h +11 -1
- data/ext/nix_io.c +135 -0
- data/ext/priorityqueue.c +16 -16
- data/ext/priorityqueue.h +9 -6
- data/ext/q_boolean.c +128 -76
- data/ext/q_const_score.c +20 -20
- data/ext/q_filtered_query.c +20 -20
- data/ext/q_fuzzy.c +37 -23
- data/ext/q_match_all.c +15 -19
- data/ext/q_multi_phrase.c +87 -46
- data/ext/q_parser.c +247 -119
- data/ext/q_phrase.c +86 -52
- data/ext/q_prefix.c +25 -14
- data/ext/q_range.c +59 -14
- data/ext/q_span.c +263 -172
- data/ext/q_term.c +62 -51
- data/ext/q_wildcard.c +24 -13
- data/ext/r_analysis.c +328 -80
- data/ext/r_doc.c +11 -6
- data/ext/r_index_io.c +40 -32
- data/ext/r_qparser.c +15 -14
- data/ext/r_search.c +270 -152
- data/ext/r_store.c +32 -17
- data/ext/ram_store.c +38 -22
- data/ext/search.c +617 -87
- data/ext/search.h +227 -163
- data/ext/similarity.c +54 -45
- data/ext/similarity.h +3 -3
- data/ext/sort.c +132 -53
- data/ext/store.c +21 -2
- data/ext/store.h +14 -14
- data/ext/tags +4322 -232
- data/ext/term.c +140 -109
- data/ext/termdocs.c +74 -60
- data/ext/vector.c +181 -152
- data/ext/w32_io.c +150 -0
- data/lib/ferret.rb +1 -1
- data/lib/ferret/analysis/standard_tokenizer.rb +4 -3
- data/lib/ferret/document/field.rb +1 -1
- data/lib/ferret/index/field_infos.rb +1 -1
- data/lib/ferret/index/term.rb +1 -1
- data/lib/ferret/query_parser/query_parser.tab.rb +8 -24
- data/lib/ferret/search.rb +1 -0
- data/lib/ferret/search/boolean_query.rb +0 -4
- data/lib/ferret/search/index_searcher.rb +21 -8
- data/lib/ferret/search/multi_phrase_query.rb +7 -0
- data/lib/ferret/search/multi_searcher.rb +261 -0
- data/lib/ferret/search/phrase_query.rb +1 -1
- data/lib/ferret/search/query.rb +34 -5
- data/lib/ferret/search/sort.rb +7 -3
- data/lib/ferret/search/sort_field.rb +8 -4
- data/lib/ferret/store/fs_store.rb +13 -6
- data/lib/ferret/store/index_io.rb +0 -14
- data/lib/ferret/store/ram_store.rb +3 -2
- data/lib/rferret.rb +1 -1
- data/test/unit/analysis/ctc_analyzer.rb +131 -0
- data/test/unit/analysis/ctc_tokenstream.rb +98 -9
- data/test/unit/index/tc_index.rb +40 -1
- data/test/unit/index/tc_term.rb +7 -0
- data/test/unit/index/th_doc.rb +8 -0
- data/test/unit/query_parser/tc_query_parser.rb +6 -4
- data/test/unit/search/rtc_sort_field.rb +6 -6
- data/test/unit/search/tc_index_searcher.rb +8 -0
- data/test/unit/search/tc_multi_searcher.rb +275 -0
- data/test/unit/search/tc_multi_searcher2.rb +126 -0
- data/test/unit/search/tc_search_and_sort.rb +66 -0
- metadata +31 -26
- data/test/unit/query_parser/rtc_query_parser.rb +0 -138
data/README
CHANGED
@@ -12,17 +12,18 @@ search for things in them later.
|
|
12
12
|
|
13
13
|
== Installation
|
14
14
|
|
15
|
-
If you have gems installed you can
|
15
|
+
If you have gems installed you can simply do;
|
16
16
|
|
17
17
|
gem install ferret
|
18
18
|
|
19
|
-
Otherwise,
|
19
|
+
Otherwise, you will need Rake installed. De-compress the archive and enter its top directory.
|
20
20
|
|
21
|
-
tar zxpvf ferret
|
22
|
-
cd ferret
|
21
|
+
tar zxpvf ferret-<version>.tar.gz
|
22
|
+
cd ferret-<version>
|
23
23
|
|
24
|
-
Run the
|
24
|
+
Run the following;
|
25
25
|
|
26
|
+
$ rake ext
|
26
27
|
$ ruby setup.rb config
|
27
28
|
$ ruby setup.rb setup
|
28
29
|
# ruby setup.rb install
|
data/Rakefile
CHANGED
@@ -33,16 +33,25 @@ $VERBOSE = nil
|
|
33
33
|
|
34
34
|
EXT = "ferret_ext.so"
|
35
35
|
EXT_SRC = FileList["src/**/*.[ch]"]
|
36
|
+
if (/mswin/ =~ RUBY_PLATFORM)
|
37
|
+
EXT_SRC.delete('src/io/nix_io.c')
|
38
|
+
end
|
36
39
|
|
37
40
|
EXT_SRC_DEST = EXT_SRC.map {|fn| File.join("ext", File.basename(fn))}
|
38
41
|
SRC = (FileList["ext/*.[ch]"] + EXT_SRC_DEST).uniq
|
39
42
|
|
40
|
-
CLEAN.include(FileList['**/*.o', 'InstalledFiles', '.config'])
|
43
|
+
CLEAN.include(FileList['**/*.o', '**/*.obj', 'InstalledFiles', '.config'])
|
41
44
|
CLOBBER.include(FileList['**/*.so'], 'ext/Makefile', EXT_SRC_DEST)
|
45
|
+
POLISH = Rake::FileList.new.include(FileList['**/*.so'], 'ext/Makefile')
|
46
|
+
|
47
|
+
desc "Clean specifically for the release."
|
48
|
+
task :polish => [:clean] do
|
49
|
+
POLISH.each { |fn| rm_r fn rescue nil }
|
50
|
+
end
|
42
51
|
|
43
|
-
task :default => :
|
52
|
+
task :default => :test_all
|
44
53
|
desc "Run all tests"
|
45
|
-
task :
|
54
|
+
task :test_all => [ :test_runits, :test_cunits, :test_functional ]
|
46
55
|
|
47
56
|
desc "Generate API documentation, and show coding stats"
|
48
57
|
task :doc => [ :stats, :appdoc ]
|
@@ -121,7 +130,13 @@ task :ext => ["ext/#{EXT}"] + SRC
|
|
121
130
|
file "ext/#{EXT}" => ["ext/Makefile"] do
|
122
131
|
cp "ext/inc/lang.h", "ext/lang.h"
|
123
132
|
cp "ext/inc/except.h", "ext/except.h"
|
124
|
-
|
133
|
+
cd "ext"
|
134
|
+
if (/mswin/ =~ RUBY_PLATFORM) and ENV['make'].nil?
|
135
|
+
sh "nmake"
|
136
|
+
else
|
137
|
+
sh "make"
|
138
|
+
end
|
139
|
+
cd ".."
|
125
140
|
end
|
126
141
|
|
127
142
|
file "ext/lang.h" => ["ext/inc/lang.h"] do
|
@@ -132,7 +147,9 @@ file "ext/except.h" => ["ext/inc/except.h"] do
|
|
132
147
|
end
|
133
148
|
|
134
149
|
file "ext/Makefile" => SRC do
|
135
|
-
|
150
|
+
cd "ext"
|
151
|
+
`ruby extconf.rb`
|
152
|
+
cd ".."
|
136
153
|
end
|
137
154
|
|
138
155
|
# Make Parsers ---------------------------------------------------------------
|
@@ -158,6 +175,9 @@ PKG_FILES = FileList[
|
|
158
175
|
'Rakefile'
|
159
176
|
]
|
160
177
|
PKG_FILES.exclude('**/*.o')
|
178
|
+
PKG_FILES.include('ext/termdocs.c')
|
179
|
+
PKG_FILES.exclude('**/Makefile')
|
180
|
+
PKG_FILES.exclude('ext/ferret_ext.so')
|
161
181
|
|
162
182
|
|
163
183
|
if ! defined?(Gem)
|
@@ -233,12 +253,13 @@ end
|
|
233
253
|
# Creating a release
|
234
254
|
|
235
255
|
desc "Make a new release"
|
236
|
-
task :
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
256
|
+
task :release => [
|
257
|
+
:prerelease,
|
258
|
+
:polish,
|
259
|
+
:test_all,
|
260
|
+
:update_version,
|
261
|
+
:package,
|
262
|
+
:tag] do
|
242
263
|
announce
|
243
264
|
announce "**************************************************************"
|
244
265
|
announce "* Release #{PKG_VERSION} Complete."
|
@@ -288,6 +309,7 @@ def reversion(fn)
|
|
288
309
|
end
|
289
310
|
end
|
290
311
|
end
|
312
|
+
mv fn + ".new", fn
|
291
313
|
end
|
292
314
|
|
293
315
|
task :update_version => [:prerelease] do
|
@@ -300,9 +322,8 @@ task :update_version => [:prerelease] do
|
|
300
322
|
if ENV['RELTEST']
|
301
323
|
announce "Release Task Testing, skipping commiting of new version"
|
302
324
|
else
|
303
|
-
|
325
|
+
sh %{svn ci -m "Updated to version #{PKG_VERSION}" lib/rferret.rb}
|
304
326
|
end
|
305
|
-
sh %{svn ci -m "Updated to version #{PKG_VERSION}" lib/rferret.rb}
|
306
327
|
end
|
307
328
|
end
|
308
329
|
|
data/TODO
CHANGED
data/TUTORIAL
CHANGED
@@ -22,7 +22,7 @@ search for later. If you'd like to use a different analyzer you can specify it
|
|
22
22
|
here, eg;
|
23
23
|
|
24
24
|
index = Index::Index.new(:path => '/path/to/index',
|
25
|
-
:analyzer => WhiteSpaceAnalyzer.new)
|
25
|
+
:analyzer => Analysis::WhiteSpaceAnalyzer.new)
|
26
26
|
|
27
27
|
For more options when creating an Index refer to Ferret::Index::Index.
|
28
28
|
|
data/ext/analysis.c
CHANGED
@@ -1,10 +1,11 @@
|
|
1
|
-
#include
|
1
|
+
#include "analysis.h"
|
2
|
+
#include "hash.h"
|
3
|
+
#include "libstemmer.h"
|
2
4
|
#include <string.h>
|
3
5
|
#include <ctype.h>
|
4
6
|
#include <wctype.h>
|
5
7
|
#include <wchar.h>
|
6
|
-
|
7
|
-
#include "libstemmer.h"
|
8
|
+
|
8
9
|
|
9
10
|
/****************************************************************************
|
10
11
|
*
|
@@ -22,9 +23,16 @@ void tk_destroy(void *p)
|
|
22
23
|
free(p);
|
23
24
|
}
|
24
25
|
|
25
|
-
inline Token *tk_set(Token *tk,
|
26
|
+
inline Token *tk_set(Token *tk,
|
27
|
+
char *text,
|
28
|
+
int tlen,
|
29
|
+
int start,
|
30
|
+
int end,
|
31
|
+
int pos_inc)
|
26
32
|
{
|
27
|
-
if (tlen >= MAX_WORD_SIZE)
|
33
|
+
if (tlen >= MAX_WORD_SIZE) {
|
34
|
+
tlen = MAX_WORD_SIZE - 1;
|
35
|
+
}
|
28
36
|
memcpy(tk->text, text, sizeof(char) * tlen);
|
29
37
|
tk->text[tlen] = '\0';
|
30
38
|
tk->start = start;
|
@@ -33,14 +41,23 @@ inline Token *tk_set(Token *tk, char *text, int tlen, int start, int end, int po
|
|
33
41
|
return tk;
|
34
42
|
}
|
35
43
|
|
36
|
-
inline Token *tk_set_ts(Token *tk,
|
44
|
+
inline Token *tk_set_ts(Token *tk,
|
45
|
+
char *start,
|
46
|
+
char *end,
|
47
|
+
char *text,
|
48
|
+
int pos_inc)
|
37
49
|
{
|
38
|
-
return tk_set(tk, start, end - start,
|
50
|
+
return tk_set(tk, start, (int)(end - start),
|
51
|
+
(int)(start - text), (int)(end - text), pos_inc);
|
39
52
|
}
|
40
53
|
|
41
|
-
inline Token *tk_set_no_len(Token *tk,
|
54
|
+
inline Token *tk_set_no_len(Token *tk,
|
55
|
+
char *text,
|
56
|
+
int start,
|
57
|
+
int end,
|
58
|
+
int pos_inc)
|
42
59
|
{
|
43
|
-
return tk_set(tk, text, strlen(text), start, end, pos_inc);
|
60
|
+
return tk_set(tk, text, (int)strlen(text), start, end, pos_inc);
|
44
61
|
}
|
45
62
|
|
46
63
|
int tk_eq(Token *tk1, Token *tk2)
|
@@ -75,9 +92,14 @@ int tk_cmp(Token *tk1, Token *tk2)
|
|
75
92
|
*
|
76
93
|
****************************************************************************/
|
77
94
|
|
78
|
-
void
|
95
|
+
void ts_deref(void *p)
|
79
96
|
{
|
80
97
|
TokenStream *ts = (TokenStream *)p;
|
98
|
+
if (--ts->ref_cnt <= 0) ts->destroy(ts);
|
99
|
+
}
|
100
|
+
|
101
|
+
void ts_standard_destroy(TokenStream *ts)
|
102
|
+
{
|
81
103
|
tk_destroy(ts->token);
|
82
104
|
free(ts);
|
83
105
|
}
|
@@ -89,13 +111,11 @@ void ts_reset(TokenStream *ts, char *text)
|
|
89
111
|
|
90
112
|
TokenStream *ts_create()
|
91
113
|
{
|
92
|
-
TokenStream *ts =
|
93
|
-
ts->text = NULL;
|
114
|
+
TokenStream *ts = ALLOC_AND_ZERO_N(TokenStream, 1);
|
94
115
|
ts->token = tk_create();
|
95
116
|
ts->destroy = &ts_standard_destroy;
|
96
117
|
ts->reset = &ts_reset;
|
97
|
-
ts->
|
98
|
-
ts->clone_i = NULL;
|
118
|
+
ts->ref_cnt = 1;
|
99
119
|
return ts;
|
100
120
|
}
|
101
121
|
|
@@ -109,6 +129,7 @@ TokenStream *ts_clone(TokenStream *orig_ts)
|
|
109
129
|
}
|
110
130
|
if (orig_ts->sub_ts) ts->sub_ts = ts_clone(orig_ts->sub_ts);
|
111
131
|
if (orig_ts->clone_i) orig_ts->clone_i(orig_ts, ts);
|
132
|
+
ts->ref_cnt = 1;
|
112
133
|
return ts;
|
113
134
|
}
|
114
135
|
|
@@ -116,7 +137,7 @@ TokenStream *ts_clone(TokenStream *orig_ts)
|
|
116
137
|
static char * const ENC_ERR_MSG = "Error decoding input string. "
|
117
138
|
"Check that you have the locale set correctly";
|
118
139
|
#define MB_NEXT_CHAR \
|
119
|
-
if ((i = mbrtowc(&wchr, t, MB_CUR_MAX, (mbstate_t *)ts->data)) < 0)\
|
140
|
+
if ((i = (int)mbrtowc(&wchr, t, MB_CUR_MAX, (mbstate_t *)ts->data)) < 0)\
|
120
141
|
RAISE(IO_ERROR, ENC_ERR_MSG)
|
121
142
|
|
122
143
|
inline Token *w_tk_set(Token *tk, wchar_t *text, int start, int end, int pos_inc)
|
@@ -128,9 +149,8 @@ inline Token *w_tk_set(Token *tk, wchar_t *text, int start, int end, int pos_inc
|
|
128
149
|
return tk;
|
129
150
|
}
|
130
151
|
|
131
|
-
void mb_ts_standard_destroy(
|
152
|
+
void mb_ts_standard_destroy(TokenStream *ts)
|
132
153
|
{
|
133
|
-
TokenStream *ts = (TokenStream *)p;
|
134
154
|
tk_destroy(ts->token);
|
135
155
|
free(ts->data);
|
136
156
|
free(ts);
|
@@ -150,14 +170,13 @@ void mb_ts_clone_i(TokenStream *orig_ts, TokenStream *new_ts)
|
|
150
170
|
|
151
171
|
TokenStream *mb_ts_create()
|
152
172
|
{
|
153
|
-
TokenStream *ts =
|
173
|
+
TokenStream *ts = ALLOC_AND_ZERO_N(TokenStream, 1);
|
154
174
|
ts->data = ALLOC(mbstate_t);
|
155
|
-
ts->text = NULL;
|
156
175
|
ts->token = tk_create();
|
157
176
|
ts->destroy = &mb_ts_standard_destroy;
|
158
177
|
ts->reset = &mb_ts_reset;
|
159
178
|
ts->clone_i = &mb_ts_clone_i;
|
160
|
-
ts->
|
179
|
+
ts->ref_cnt = 1;
|
161
180
|
return ts;
|
162
181
|
}
|
163
182
|
|
@@ -167,11 +186,16 @@ TokenStream *mb_ts_create()
|
|
167
186
|
*
|
168
187
|
****************************************************************************/
|
169
188
|
|
170
|
-
void
|
189
|
+
void a_deref(void *p)
|
171
190
|
{
|
172
191
|
Analyzer *a = (Analyzer *)p;
|
173
|
-
|
174
|
-
|
192
|
+
if (--a->ref_cnt <= 0) a->destroy(a);
|
193
|
+
}
|
194
|
+
|
195
|
+
void a_standard_destroy(Analyzer *a)
|
196
|
+
{
|
197
|
+
if (a->current_ts) ts_deref(a->current_ts);
|
198
|
+
free(a);
|
175
199
|
}
|
176
200
|
|
177
201
|
TokenStream *a_standard_get_ts(Analyzer *a, char *field, char *text)
|
@@ -180,7 +204,8 @@ TokenStream *a_standard_get_ts(Analyzer *a, char *field, char *text)
|
|
180
204
|
return a->current_ts;
|
181
205
|
}
|
182
206
|
|
183
|
-
Analyzer *analyzer_create(void *data, TokenStream *ts,
|
207
|
+
Analyzer *analyzer_create(void *data, TokenStream *ts,
|
208
|
+
void (*destroy)(Analyzer *a),
|
184
209
|
TokenStream *(*get_ts)(Analyzer *a, char *field, char *text))
|
185
210
|
{
|
186
211
|
Analyzer *a = ALLOC(Analyzer);
|
@@ -188,6 +213,7 @@ Analyzer *analyzer_create(void *data, TokenStream *ts, void (*destroy)(void *),
|
|
188
213
|
a->current_ts = ts;
|
189
214
|
a->destroy = (destroy ? destroy : &a_standard_destroy);
|
190
215
|
a->get_ts = (get_ts ? get_ts : &a_standard_get_ts);
|
216
|
+
a->ref_cnt = 1;
|
191
217
|
return a;
|
192
218
|
}
|
193
219
|
|
@@ -284,7 +310,7 @@ Token *mb_wst_next_lc(TokenStream *ts)
|
|
284
310
|
MB_NEXT_CHAR;
|
285
311
|
}
|
286
312
|
*w = 0;
|
287
|
-
w_tk_set(ts->token, wbuf, start - ts->text, t - ts->text, 1);
|
313
|
+
w_tk_set(ts->token, wbuf, (int)(start - ts->text), (int)(t - ts->text), 1);
|
288
314
|
ts->t = t;
|
289
315
|
return ts->token;
|
290
316
|
}
|
@@ -409,7 +435,7 @@ Token *mb_lt_next_lc(TokenStream *ts)
|
|
409
435
|
MB_NEXT_CHAR;
|
410
436
|
}
|
411
437
|
*w = 0;
|
412
|
-
w_tk_set(ts->token, wbuf, start - ts->text, t - ts->text, 1);
|
438
|
+
w_tk_set(ts->token, wbuf, (int)(start - ts->text), (int)(t - ts->text), 1);
|
413
439
|
ts->t = t;
|
414
440
|
return ts->token;
|
415
441
|
}
|
@@ -472,7 +498,7 @@ int mb_std_get_alpha(TokenStream *ts, char *token)
|
|
472
498
|
if ((i = mbtowc(&w, t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
|
473
499
|
}
|
474
500
|
|
475
|
-
i = t - ts->t;
|
501
|
+
i = (int)(t - ts->t);
|
476
502
|
if (i > MAX_WORD_SIZE) i = MAX_WORD_SIZE - 1;
|
477
503
|
memcpy(token, ts->t, i);
|
478
504
|
return i;
|
@@ -500,7 +526,7 @@ int mb_std_get_alnum(char *text, char *token, TokenStream *ts)
|
|
500
526
|
if ((i = mbtowc(&w, t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
|
501
527
|
}
|
502
528
|
|
503
|
-
i = t - ts->t;
|
529
|
+
i = (int)(t - ts->t);
|
504
530
|
if (i > MAX_WORD_SIZE) i = MAX_WORD_SIZE - 1;
|
505
531
|
memcpy(token, ts->t, i);
|
506
532
|
return i;
|
@@ -599,7 +625,7 @@ int std_get_apostrophe(char *input)
|
|
599
625
|
while (isalpha(*t) || *t == '\'')
|
600
626
|
t++;
|
601
627
|
|
602
|
-
return t - input;
|
628
|
+
return (int)(t - input);
|
603
629
|
}
|
604
630
|
|
605
631
|
int mb_std_get_apostrophe(char *input)
|
@@ -613,7 +639,7 @@ int mb_std_get_apostrophe(char *input)
|
|
613
639
|
t += i;
|
614
640
|
if ((i = mbtowc(&w, t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
|
615
641
|
}
|
616
|
-
return t - input;
|
642
|
+
return (int)(t - input);
|
617
643
|
}
|
618
644
|
|
619
645
|
int std_get_url(char *input, char *token, int i)
|
@@ -654,7 +680,7 @@ int mb_std_get_company_name(char *input, TokenStream *ts)
|
|
654
680
|
MB_NEXT_CHAR;
|
655
681
|
}
|
656
682
|
|
657
|
-
return t - input;
|
683
|
+
return (int)(t - input);
|
658
684
|
}
|
659
685
|
|
660
686
|
bool std_advance_to_start(TokenStream *ts)
|
@@ -723,7 +749,7 @@ Token *std_next(TokenStream *ts)
|
|
723
749
|
if (*t == '\'') { // apostrophe case.
|
724
750
|
t += std_tz->get_apostrophe(t);
|
725
751
|
ts->t = t;
|
726
|
-
len = t - start;
|
752
|
+
len = (int)(t - start);
|
727
753
|
// strip possesive
|
728
754
|
if ((t[-1] == 's' || t[-1] == 'S') && t[-2] == '\'') t -= 2;
|
729
755
|
|
@@ -760,13 +786,14 @@ Token *std_next(TokenStream *ts)
|
|
760
786
|
memcmp(token, "file", 4) == 0)) {
|
761
787
|
len = std_get_url(t, token, 0); // dispose of first part of the URL
|
762
788
|
} else { //still treat as url but keep the first part
|
763
|
-
token_i = t - start;
|
789
|
+
token_i = (int)(t - start);
|
764
790
|
memcpy(token, start, token_i * sizeof(char));
|
765
791
|
len = token_i + std_get_url(t, token, token_i); // keep start
|
766
792
|
}
|
767
793
|
ts->t = t + len;
|
768
794
|
token[len] = 0;
|
769
|
-
tk_set(ts->token, token, len, start - ts->text,
|
795
|
+
tk_set(ts->token, token, len, (int)(start - ts->text),
|
796
|
+
(int)(ts->t - ts->text), 1);
|
770
797
|
return ts->token;
|
771
798
|
}
|
772
799
|
|
@@ -806,7 +833,8 @@ Token *std_next(TokenStream *ts)
|
|
806
833
|
token_i++;
|
807
834
|
}
|
808
835
|
}
|
809
|
-
tk_set(ts->token, token, token_i, start - ts->text,
|
836
|
+
tk_set(ts->token, token, token_i, (int)(start - ts->text),
|
837
|
+
(int)(t - ts->text), 1);
|
810
838
|
} else { // just return the url as is
|
811
839
|
tk_set_ts(ts->token, start, t, ts->text, 1);
|
812
840
|
}
|
@@ -819,9 +847,8 @@ Token *std_next(TokenStream *ts)
|
|
819
847
|
return ts->token;
|
820
848
|
}
|
821
849
|
|
822
|
-
void std_ts_destroy(
|
850
|
+
void std_ts_destroy(TokenStream *ts)
|
823
851
|
{
|
824
|
-
TokenStream *ts = (TokenStream *)p;
|
825
852
|
free(ts->data);
|
826
853
|
ts_standard_destroy(ts);
|
827
854
|
}
|
@@ -871,19 +898,18 @@ void filter_reset(TokenStream *ts, char *text)
|
|
871
898
|
ts->sub_ts->reset(ts->sub_ts, text);
|
872
899
|
}
|
873
900
|
|
874
|
-
void filter_destroy(
|
901
|
+
void filter_destroy(TokenStream *tf)
|
875
902
|
{
|
876
|
-
|
877
|
-
if (tf->destroy_sub) tf->sub_ts->destroy(tf->sub_ts);
|
903
|
+
ts_deref(tf->sub_ts);
|
878
904
|
if (tf->token != NULL) tk_destroy(tf->token);
|
879
905
|
free(tf);
|
880
906
|
}
|
881
907
|
|
882
|
-
void sf_destroy(
|
908
|
+
void sf_destroy(TokenStream *tf)
|
883
909
|
{
|
884
|
-
HshTable *words = (HshTable *)
|
910
|
+
HshTable *words = (HshTable *)tf->data;
|
885
911
|
h_destroy(words);
|
886
|
-
filter_destroy(
|
912
|
+
filter_destroy(tf);
|
887
913
|
}
|
888
914
|
|
889
915
|
void sf_clone_i_i(void *key, void *value, void *arg)
|
@@ -917,10 +943,10 @@ TokenStream *stop_filter_create_with_words_len(TokenStream *ts,
|
|
917
943
|
{
|
918
944
|
int i;
|
919
945
|
char *w;
|
946
|
+
HshTable *wordtable = h_new_str(&free, (free_ft)NULL);
|
920
947
|
TokenStream *tf = ALLOC(TokenStream);
|
921
948
|
tf->sub_ts = ts;
|
922
|
-
|
923
|
-
HshTable *wordtable = h_new_str(&free, NULL);
|
949
|
+
|
924
950
|
for (i = 0; i < len; i++) {
|
925
951
|
w = estrdup(words[i]);
|
926
952
|
h_set(wordtable, w, w);
|
@@ -931,16 +957,16 @@ TokenStream *stop_filter_create_with_words_len(TokenStream *ts,
|
|
931
957
|
tf->reset = &filter_reset;
|
932
958
|
tf->destroy = &sf_destroy;
|
933
959
|
tf->clone_i = &sf_clone_i;
|
960
|
+
tf->ref_cnt = 1;
|
934
961
|
return tf;
|
935
962
|
}
|
936
963
|
|
937
964
|
TokenStream *stop_filter_create_with_words(TokenStream *ts, const char **words)
|
938
965
|
{
|
939
966
|
char *w;
|
967
|
+
HshTable *wordtable = h_new_str(&free, (free_ft)NULL);
|
940
968
|
TokenStream *tf = ALLOC(TokenStream);
|
941
969
|
tf->sub_ts = ts;
|
942
|
-
tf->destroy_sub = true;
|
943
|
-
HshTable *wordtable = h_new_str(&free, NULL);
|
944
970
|
while (*words) {
|
945
971
|
w = estrdup(*words);
|
946
972
|
h_set(wordtable, w, w);
|
@@ -952,6 +978,7 @@ TokenStream *stop_filter_create_with_words(TokenStream *ts, const char **words)
|
|
952
978
|
tf->reset = &filter_reset;
|
953
979
|
tf->destroy = &sf_destroy;
|
954
980
|
tf->clone_i = &sf_clone_i;
|
981
|
+
tf->ref_cnt = 1;
|
955
982
|
return tf;
|
956
983
|
}
|
957
984
|
|
@@ -968,7 +995,7 @@ Token *mb_lcf_next(TokenStream *ts)
|
|
968
995
|
Token *tk = ts->sub_ts->next(ts->sub_ts);
|
969
996
|
if (tk == NULL) return tk;
|
970
997
|
|
971
|
-
i = mbstowcs(wbuf, tk->text, MAX_WORD_SIZE);
|
998
|
+
i = (int)mbstowcs(wbuf, tk->text, MAX_WORD_SIZE);
|
972
999
|
w = wbuf;
|
973
1000
|
while (*w != 0) {
|
974
1001
|
*w = towlower(*w);
|
@@ -986,8 +1013,8 @@ TokenStream *mb_lowercase_filter_create(TokenStream *ts)
|
|
986
1013
|
tf->reset = &filter_reset;
|
987
1014
|
tf->destroy = &filter_destroy;
|
988
1015
|
tf->sub_ts = ts;
|
989
|
-
tf->destroy_sub = true;
|
990
1016
|
tf->clone_i = NULL;
|
1017
|
+
tf->ref_cnt = 1;
|
991
1018
|
return tf;
|
992
1019
|
}
|
993
1020
|
|
@@ -1011,8 +1038,8 @@ TokenStream *lowercase_filter_create(TokenStream *ts)
|
|
1011
1038
|
tf->reset = &filter_reset;
|
1012
1039
|
tf->destroy = &filter_destroy;
|
1013
1040
|
tf->sub_ts = ts;
|
1014
|
-
tf->destroy_sub = true;
|
1015
1041
|
tf->clone_i = NULL;
|
1042
|
+
tf->ref_cnt = 1;
|
1016
1043
|
return tf;
|
1017
1044
|
}
|
1018
1045
|
|
@@ -1022,15 +1049,14 @@ typedef struct StemFilter {
|
|
1022
1049
|
char *charenc;
|
1023
1050
|
} StemFilter;
|
1024
1051
|
|
1025
|
-
void stemf_destroy(
|
1052
|
+
void stemf_destroy(TokenStream *tf)
|
1026
1053
|
{
|
1027
|
-
|
1028
|
-
StemFilter *stemf = (StemFilter *)ts->data;
|
1054
|
+
StemFilter *stemf = (StemFilter *)tf->data;
|
1029
1055
|
sb_stemmer_delete(stemf->stemmer);
|
1030
1056
|
free(stemf->algorithm);
|
1031
1057
|
free(stemf->charenc);
|
1032
1058
|
free(stemf);
|
1033
|
-
filter_destroy(
|
1059
|
+
filter_destroy(tf);
|
1034
1060
|
}
|
1035
1061
|
|
1036
1062
|
Token *stemf_next(TokenStream *ts)
|
@@ -1040,7 +1066,7 @@ Token *stemf_next(TokenStream *ts)
|
|
1040
1066
|
struct sb_stemmer *stemmer = ((StemFilter *)ts->data)->stemmer;
|
1041
1067
|
Token *tk = ts->sub_ts->next(ts->sub_ts);
|
1042
1068
|
if (tk == NULL) return tk;
|
1043
|
-
stemmed = sb_stemmer_stem(stemmer, (sb_symbol *)tk->text, strlen(tk->text));
|
1069
|
+
stemmed = sb_stemmer_stem(stemmer, (sb_symbol *)tk->text, (int)strlen(tk->text));
|
1044
1070
|
len = sb_stemmer_length(stemmer);
|
1045
1071
|
if (len >= MAX_WORD_SIZE) len = MAX_WORD_SIZE - 1;
|
1046
1072
|
memcpy(tk->text, stemmed, len);
|
@@ -1074,7 +1100,7 @@ TokenStream *stem_filter_create(TokenStream *ts, const char * algorithm,
|
|
1074
1100
|
tf->destroy = &stemf_destroy;
|
1075
1101
|
tf->clone_i = &stemf_clone_i;
|
1076
1102
|
tf->sub_ts = ts;
|
1077
|
-
tf->
|
1103
|
+
tf->ref_cnt = 1;
|
1078
1104
|
return tf;
|
1079
1105
|
}
|
1080
1106
|
|
@@ -1148,19 +1174,12 @@ Analyzer *mb_standard_analyzer_create(bool lowercase)
|
|
1148
1174
|
*
|
1149
1175
|
****************************************************************************/
|
1150
1176
|
|
1151
|
-
|
1152
|
-
HshTable *dict;
|
1153
|
-
Analyzer *def;
|
1154
|
-
bool destroy_subs : 1;
|
1155
|
-
} PerFieldAnalyzer;
|
1156
|
-
|
1157
|
-
void pfa_destroy(void *p)
|
1177
|
+
void pfa_destroy(Analyzer *self)
|
1158
1178
|
{
|
1159
|
-
Analyzer *self = (Analyzer *)p;
|
1160
1179
|
PerFieldAnalyzer *pfa = (PerFieldAnalyzer *)self->data;
|
1161
1180
|
h_destroy(pfa->dict);
|
1162
1181
|
|
1163
|
-
|
1182
|
+
a_deref(pfa->def);
|
1164
1183
|
free(pfa);
|
1165
1184
|
free(self);
|
1166
1185
|
}
|
@@ -1176,7 +1195,7 @@ TokenStream *pfa_get_ts(Analyzer *self, char *field, char *text)
|
|
1176
1195
|
void pfa_sub_a_destroy(void *p)
|
1177
1196
|
{
|
1178
1197
|
Analyzer *a = (Analyzer *)p;
|
1179
|
-
|
1198
|
+
a_deref(a);
|
1180
1199
|
}
|
1181
1200
|
|
1182
1201
|
void pfa_add_field(Analyzer *self, char *field, Analyzer *analyzer)
|
@@ -1185,13 +1204,11 @@ void pfa_add_field(Analyzer *self, char *field, Analyzer *analyzer)
|
|
1185
1204
|
h_set(pfa->dict, estrdup(field), analyzer);
|
1186
1205
|
}
|
1187
1206
|
|
1188
|
-
Analyzer *per_field_analyzer_create(Analyzer *def
|
1207
|
+
Analyzer *per_field_analyzer_create(Analyzer *def)
|
1189
1208
|
{
|
1190
1209
|
PerFieldAnalyzer *pfa = ALLOC(PerFieldAnalyzer);
|
1191
1210
|
pfa->def = def;
|
1192
|
-
pfa->
|
1193
|
-
pfa->dict = destroy_subs ? h_new_str(&free, &pfa_sub_a_destroy)
|
1194
|
-
: h_new_str(&free, NULL);
|
1211
|
+
pfa->dict = h_new_str(&free, &pfa_sub_a_destroy);
|
1195
1212
|
return analyzer_create(pfa, NULL, &pfa_destroy, &pfa_get_ts);
|
1196
1213
|
}
|
1197
1214
|
|