ferret 0.9.1 → 0.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +6 -5
- data/Rakefile +34 -13
- data/TODO +1 -0
- data/TUTORIAL +1 -1
- data/ext/analysis.c +87 -70
- data/ext/analysis.h +18 -6
- data/ext/array.c +1 -2
- data/ext/array.h +1 -1
- data/ext/bitvector.c +10 -6
- data/ext/bitvector.h +2 -2
- data/ext/compound_io.c +30 -27
- data/ext/document.c +15 -15
- data/ext/document.h +5 -5
- data/ext/except.c +2 -0
- data/ext/except.h +25 -23
- data/ext/extconf.rb +1 -0
- data/ext/ferret.c +10 -8
- data/ext/ferret.h +9 -8
- data/ext/field.c +29 -25
- data/ext/filter.c +52 -14
- data/ext/frtio.h +13 -0
- data/ext/fs_store.c +115 -170
- data/ext/global.c +9 -8
- data/ext/global.h +17 -13
- data/ext/hash.c +13 -19
- data/ext/hash.h +11 -11
- data/ext/hashset.c +5 -7
- data/ext/hashset.h +9 -8
- data/ext/helper.c +1 -1
- data/ext/helper.h +2 -1
- data/ext/inc/except.h +25 -23
- data/ext/inc/lang.h +11 -1
- data/ext/ind.c +33 -21
- data/ext/index.h +44 -39
- data/ext/index_io.c +61 -57
- data/ext/index_rw.c +418 -361
- data/ext/lang.c +10 -0
- data/ext/lang.h +11 -1
- data/ext/nix_io.c +135 -0
- data/ext/priorityqueue.c +16 -16
- data/ext/priorityqueue.h +9 -6
- data/ext/q_boolean.c +128 -76
- data/ext/q_const_score.c +20 -20
- data/ext/q_filtered_query.c +20 -20
- data/ext/q_fuzzy.c +37 -23
- data/ext/q_match_all.c +15 -19
- data/ext/q_multi_phrase.c +87 -46
- data/ext/q_parser.c +247 -119
- data/ext/q_phrase.c +86 -52
- data/ext/q_prefix.c +25 -14
- data/ext/q_range.c +59 -14
- data/ext/q_span.c +263 -172
- data/ext/q_term.c +62 -51
- data/ext/q_wildcard.c +24 -13
- data/ext/r_analysis.c +328 -80
- data/ext/r_doc.c +11 -6
- data/ext/r_index_io.c +40 -32
- data/ext/r_qparser.c +15 -14
- data/ext/r_search.c +270 -152
- data/ext/r_store.c +32 -17
- data/ext/ram_store.c +38 -22
- data/ext/search.c +617 -87
- data/ext/search.h +227 -163
- data/ext/similarity.c +54 -45
- data/ext/similarity.h +3 -3
- data/ext/sort.c +132 -53
- data/ext/store.c +21 -2
- data/ext/store.h +14 -14
- data/ext/tags +4322 -232
- data/ext/term.c +140 -109
- data/ext/termdocs.c +74 -60
- data/ext/vector.c +181 -152
- data/ext/w32_io.c +150 -0
- data/lib/ferret.rb +1 -1
- data/lib/ferret/analysis/standard_tokenizer.rb +4 -3
- data/lib/ferret/document/field.rb +1 -1
- data/lib/ferret/index/field_infos.rb +1 -1
- data/lib/ferret/index/term.rb +1 -1
- data/lib/ferret/query_parser/query_parser.tab.rb +8 -24
- data/lib/ferret/search.rb +1 -0
- data/lib/ferret/search/boolean_query.rb +0 -4
- data/lib/ferret/search/index_searcher.rb +21 -8
- data/lib/ferret/search/multi_phrase_query.rb +7 -0
- data/lib/ferret/search/multi_searcher.rb +261 -0
- data/lib/ferret/search/phrase_query.rb +1 -1
- data/lib/ferret/search/query.rb +34 -5
- data/lib/ferret/search/sort.rb +7 -3
- data/lib/ferret/search/sort_field.rb +8 -4
- data/lib/ferret/store/fs_store.rb +13 -6
- data/lib/ferret/store/index_io.rb +0 -14
- data/lib/ferret/store/ram_store.rb +3 -2
- data/lib/rferret.rb +1 -1
- data/test/unit/analysis/ctc_analyzer.rb +131 -0
- data/test/unit/analysis/ctc_tokenstream.rb +98 -9
- data/test/unit/index/tc_index.rb +40 -1
- data/test/unit/index/tc_term.rb +7 -0
- data/test/unit/index/th_doc.rb +8 -0
- data/test/unit/query_parser/tc_query_parser.rb +6 -4
- data/test/unit/search/rtc_sort_field.rb +6 -6
- data/test/unit/search/tc_index_searcher.rb +8 -0
- data/test/unit/search/tc_multi_searcher.rb +275 -0
- data/test/unit/search/tc_multi_searcher2.rb +126 -0
- data/test/unit/search/tc_search_and_sort.rb +66 -0
- metadata +31 -26
- data/test/unit/query_parser/rtc_query_parser.rb +0 -138
data/README
CHANGED
@@ -12,17 +12,18 @@ search for things in them later.
|
|
12
12
|
|
13
13
|
== Installation
|
14
14
|
|
15
|
-
If you have gems installed you can
|
15
|
+
If you have gems installed you can simply do;
|
16
16
|
|
17
17
|
gem install ferret
|
18
18
|
|
19
|
-
Otherwise,
|
19
|
+
Otherwise, you will need Rake installed. De-compress the archive and enter its top directory.
|
20
20
|
|
21
|
-
tar zxpvf ferret
|
22
|
-
cd ferret
|
21
|
+
tar zxpvf ferret-<version>.tar.gz
|
22
|
+
cd ferret-<version>
|
23
23
|
|
24
|
-
Run the
|
24
|
+
Run the following;
|
25
25
|
|
26
|
+
$ rake ext
|
26
27
|
$ ruby setup.rb config
|
27
28
|
$ ruby setup.rb setup
|
28
29
|
# ruby setup.rb install
|
data/Rakefile
CHANGED
@@ -33,16 +33,25 @@ $VERBOSE = nil
|
|
33
33
|
|
34
34
|
EXT = "ferret_ext.so"
|
35
35
|
EXT_SRC = FileList["src/**/*.[ch]"]
|
36
|
+
if (/mswin/ =~ RUBY_PLATFORM)
|
37
|
+
EXT_SRC.delete('src/io/nix_io.c')
|
38
|
+
end
|
36
39
|
|
37
40
|
EXT_SRC_DEST = EXT_SRC.map {|fn| File.join("ext", File.basename(fn))}
|
38
41
|
SRC = (FileList["ext/*.[ch]"] + EXT_SRC_DEST).uniq
|
39
42
|
|
40
|
-
CLEAN.include(FileList['**/*.o', 'InstalledFiles', '.config'])
|
43
|
+
CLEAN.include(FileList['**/*.o', '**/*.obj', 'InstalledFiles', '.config'])
|
41
44
|
CLOBBER.include(FileList['**/*.so'], 'ext/Makefile', EXT_SRC_DEST)
|
45
|
+
POLISH = Rake::FileList.new.include(FileList['**/*.so'], 'ext/Makefile')
|
46
|
+
|
47
|
+
desc "Clean specifically for the release."
|
48
|
+
task :polish => [:clean] do
|
49
|
+
POLISH.each { |fn| rm_r fn rescue nil }
|
50
|
+
end
|
42
51
|
|
43
|
-
task :default => :
|
52
|
+
task :default => :test_all
|
44
53
|
desc "Run all tests"
|
45
|
-
task :
|
54
|
+
task :test_all => [ :test_runits, :test_cunits, :test_functional ]
|
46
55
|
|
47
56
|
desc "Generate API documentation, and show coding stats"
|
48
57
|
task :doc => [ :stats, :appdoc ]
|
@@ -121,7 +130,13 @@ task :ext => ["ext/#{EXT}"] + SRC
|
|
121
130
|
file "ext/#{EXT}" => ["ext/Makefile"] do
|
122
131
|
cp "ext/inc/lang.h", "ext/lang.h"
|
123
132
|
cp "ext/inc/except.h", "ext/except.h"
|
124
|
-
|
133
|
+
cd "ext"
|
134
|
+
if (/mswin/ =~ RUBY_PLATFORM) and ENV['make'].nil?
|
135
|
+
sh "nmake"
|
136
|
+
else
|
137
|
+
sh "make"
|
138
|
+
end
|
139
|
+
cd ".."
|
125
140
|
end
|
126
141
|
|
127
142
|
file "ext/lang.h" => ["ext/inc/lang.h"] do
|
@@ -132,7 +147,9 @@ file "ext/except.h" => ["ext/inc/except.h"] do
|
|
132
147
|
end
|
133
148
|
|
134
149
|
file "ext/Makefile" => SRC do
|
135
|
-
|
150
|
+
cd "ext"
|
151
|
+
`ruby extconf.rb`
|
152
|
+
cd ".."
|
136
153
|
end
|
137
154
|
|
138
155
|
# Make Parsers ---------------------------------------------------------------
|
@@ -158,6 +175,9 @@ PKG_FILES = FileList[
|
|
158
175
|
'Rakefile'
|
159
176
|
]
|
160
177
|
PKG_FILES.exclude('**/*.o')
|
178
|
+
PKG_FILES.include('ext/termdocs.c')
|
179
|
+
PKG_FILES.exclude('**/Makefile')
|
180
|
+
PKG_FILES.exclude('ext/ferret_ext.so')
|
161
181
|
|
162
182
|
|
163
183
|
if ! defined?(Gem)
|
@@ -233,12 +253,13 @@ end
|
|
233
253
|
# Creating a release
|
234
254
|
|
235
255
|
desc "Make a new release"
|
236
|
-
task :
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
256
|
+
task :release => [
|
257
|
+
:prerelease,
|
258
|
+
:polish,
|
259
|
+
:test_all,
|
260
|
+
:update_version,
|
261
|
+
:package,
|
262
|
+
:tag] do
|
242
263
|
announce
|
243
264
|
announce "**************************************************************"
|
244
265
|
announce "* Release #{PKG_VERSION} Complete."
|
@@ -288,6 +309,7 @@ def reversion(fn)
|
|
288
309
|
end
|
289
310
|
end
|
290
311
|
end
|
312
|
+
mv fn + ".new", fn
|
291
313
|
end
|
292
314
|
|
293
315
|
task :update_version => [:prerelease] do
|
@@ -300,9 +322,8 @@ task :update_version => [:prerelease] do
|
|
300
322
|
if ENV['RELTEST']
|
301
323
|
announce "Release Task Testing, skipping commiting of new version"
|
302
324
|
else
|
303
|
-
|
325
|
+
sh %{svn ci -m "Updated to version #{PKG_VERSION}" lib/rferret.rb}
|
304
326
|
end
|
305
|
-
sh %{svn ci -m "Updated to version #{PKG_VERSION}" lib/rferret.rb}
|
306
327
|
end
|
307
328
|
end
|
308
329
|
|
data/TODO
CHANGED
data/TUTORIAL
CHANGED
@@ -22,7 +22,7 @@ search for later. If you'd like to use a different analyzer you can specify it
|
|
22
22
|
here, eg;
|
23
23
|
|
24
24
|
index = Index::Index.new(:path => '/path/to/index',
|
25
|
-
:analyzer => WhiteSpaceAnalyzer.new)
|
25
|
+
:analyzer => Analysis::WhiteSpaceAnalyzer.new)
|
26
26
|
|
27
27
|
For more options when creating an Index refer to Ferret::Index::Index.
|
28
28
|
|
data/ext/analysis.c
CHANGED
@@ -1,10 +1,11 @@
|
|
1
|
-
#include
|
1
|
+
#include "analysis.h"
|
2
|
+
#include "hash.h"
|
3
|
+
#include "libstemmer.h"
|
2
4
|
#include <string.h>
|
3
5
|
#include <ctype.h>
|
4
6
|
#include <wctype.h>
|
5
7
|
#include <wchar.h>
|
6
|
-
|
7
|
-
#include "libstemmer.h"
|
8
|
+
|
8
9
|
|
9
10
|
/****************************************************************************
|
10
11
|
*
|
@@ -22,9 +23,16 @@ void tk_destroy(void *p)
|
|
22
23
|
free(p);
|
23
24
|
}
|
24
25
|
|
25
|
-
inline Token *tk_set(Token *tk,
|
26
|
+
inline Token *tk_set(Token *tk,
|
27
|
+
char *text,
|
28
|
+
int tlen,
|
29
|
+
int start,
|
30
|
+
int end,
|
31
|
+
int pos_inc)
|
26
32
|
{
|
27
|
-
if (tlen >= MAX_WORD_SIZE)
|
33
|
+
if (tlen >= MAX_WORD_SIZE) {
|
34
|
+
tlen = MAX_WORD_SIZE - 1;
|
35
|
+
}
|
28
36
|
memcpy(tk->text, text, sizeof(char) * tlen);
|
29
37
|
tk->text[tlen] = '\0';
|
30
38
|
tk->start = start;
|
@@ -33,14 +41,23 @@ inline Token *tk_set(Token *tk, char *text, int tlen, int start, int end, int po
|
|
33
41
|
return tk;
|
34
42
|
}
|
35
43
|
|
36
|
-
inline Token *tk_set_ts(Token *tk,
|
44
|
+
inline Token *tk_set_ts(Token *tk,
|
45
|
+
char *start,
|
46
|
+
char *end,
|
47
|
+
char *text,
|
48
|
+
int pos_inc)
|
37
49
|
{
|
38
|
-
return tk_set(tk, start, end - start,
|
50
|
+
return tk_set(tk, start, (int)(end - start),
|
51
|
+
(int)(start - text), (int)(end - text), pos_inc);
|
39
52
|
}
|
40
53
|
|
41
|
-
inline Token *tk_set_no_len(Token *tk,
|
54
|
+
inline Token *tk_set_no_len(Token *tk,
|
55
|
+
char *text,
|
56
|
+
int start,
|
57
|
+
int end,
|
58
|
+
int pos_inc)
|
42
59
|
{
|
43
|
-
return tk_set(tk, text, strlen(text), start, end, pos_inc);
|
60
|
+
return tk_set(tk, text, (int)strlen(text), start, end, pos_inc);
|
44
61
|
}
|
45
62
|
|
46
63
|
int tk_eq(Token *tk1, Token *tk2)
|
@@ -75,9 +92,14 @@ int tk_cmp(Token *tk1, Token *tk2)
|
|
75
92
|
*
|
76
93
|
****************************************************************************/
|
77
94
|
|
78
|
-
void
|
95
|
+
void ts_deref(void *p)
|
79
96
|
{
|
80
97
|
TokenStream *ts = (TokenStream *)p;
|
98
|
+
if (--ts->ref_cnt <= 0) ts->destroy(ts);
|
99
|
+
}
|
100
|
+
|
101
|
+
void ts_standard_destroy(TokenStream *ts)
|
102
|
+
{
|
81
103
|
tk_destroy(ts->token);
|
82
104
|
free(ts);
|
83
105
|
}
|
@@ -89,13 +111,11 @@ void ts_reset(TokenStream *ts, char *text)
|
|
89
111
|
|
90
112
|
TokenStream *ts_create()
|
91
113
|
{
|
92
|
-
TokenStream *ts =
|
93
|
-
ts->text = NULL;
|
114
|
+
TokenStream *ts = ALLOC_AND_ZERO_N(TokenStream, 1);
|
94
115
|
ts->token = tk_create();
|
95
116
|
ts->destroy = &ts_standard_destroy;
|
96
117
|
ts->reset = &ts_reset;
|
97
|
-
ts->
|
98
|
-
ts->clone_i = NULL;
|
118
|
+
ts->ref_cnt = 1;
|
99
119
|
return ts;
|
100
120
|
}
|
101
121
|
|
@@ -109,6 +129,7 @@ TokenStream *ts_clone(TokenStream *orig_ts)
|
|
109
129
|
}
|
110
130
|
if (orig_ts->sub_ts) ts->sub_ts = ts_clone(orig_ts->sub_ts);
|
111
131
|
if (orig_ts->clone_i) orig_ts->clone_i(orig_ts, ts);
|
132
|
+
ts->ref_cnt = 1;
|
112
133
|
return ts;
|
113
134
|
}
|
114
135
|
|
@@ -116,7 +137,7 @@ TokenStream *ts_clone(TokenStream *orig_ts)
|
|
116
137
|
static char * const ENC_ERR_MSG = "Error decoding input string. "
|
117
138
|
"Check that you have the locale set correctly";
|
118
139
|
#define MB_NEXT_CHAR \
|
119
|
-
if ((i = mbrtowc(&wchr, t, MB_CUR_MAX, (mbstate_t *)ts->data)) < 0)\
|
140
|
+
if ((i = (int)mbrtowc(&wchr, t, MB_CUR_MAX, (mbstate_t *)ts->data)) < 0)\
|
120
141
|
RAISE(IO_ERROR, ENC_ERR_MSG)
|
121
142
|
|
122
143
|
inline Token *w_tk_set(Token *tk, wchar_t *text, int start, int end, int pos_inc)
|
@@ -128,9 +149,8 @@ inline Token *w_tk_set(Token *tk, wchar_t *text, int start, int end, int pos_inc
|
|
128
149
|
return tk;
|
129
150
|
}
|
130
151
|
|
131
|
-
void mb_ts_standard_destroy(
|
152
|
+
void mb_ts_standard_destroy(TokenStream *ts)
|
132
153
|
{
|
133
|
-
TokenStream *ts = (TokenStream *)p;
|
134
154
|
tk_destroy(ts->token);
|
135
155
|
free(ts->data);
|
136
156
|
free(ts);
|
@@ -150,14 +170,13 @@ void mb_ts_clone_i(TokenStream *orig_ts, TokenStream *new_ts)
|
|
150
170
|
|
151
171
|
TokenStream *mb_ts_create()
|
152
172
|
{
|
153
|
-
TokenStream *ts =
|
173
|
+
TokenStream *ts = ALLOC_AND_ZERO_N(TokenStream, 1);
|
154
174
|
ts->data = ALLOC(mbstate_t);
|
155
|
-
ts->text = NULL;
|
156
175
|
ts->token = tk_create();
|
157
176
|
ts->destroy = &mb_ts_standard_destroy;
|
158
177
|
ts->reset = &mb_ts_reset;
|
159
178
|
ts->clone_i = &mb_ts_clone_i;
|
160
|
-
ts->
|
179
|
+
ts->ref_cnt = 1;
|
161
180
|
return ts;
|
162
181
|
}
|
163
182
|
|
@@ -167,11 +186,16 @@ TokenStream *mb_ts_create()
|
|
167
186
|
*
|
168
187
|
****************************************************************************/
|
169
188
|
|
170
|
-
void
|
189
|
+
void a_deref(void *p)
|
171
190
|
{
|
172
191
|
Analyzer *a = (Analyzer *)p;
|
173
|
-
|
174
|
-
|
192
|
+
if (--a->ref_cnt <= 0) a->destroy(a);
|
193
|
+
}
|
194
|
+
|
195
|
+
void a_standard_destroy(Analyzer *a)
|
196
|
+
{
|
197
|
+
if (a->current_ts) ts_deref(a->current_ts);
|
198
|
+
free(a);
|
175
199
|
}
|
176
200
|
|
177
201
|
TokenStream *a_standard_get_ts(Analyzer *a, char *field, char *text)
|
@@ -180,7 +204,8 @@ TokenStream *a_standard_get_ts(Analyzer *a, char *field, char *text)
|
|
180
204
|
return a->current_ts;
|
181
205
|
}
|
182
206
|
|
183
|
-
Analyzer *analyzer_create(void *data, TokenStream *ts,
|
207
|
+
Analyzer *analyzer_create(void *data, TokenStream *ts,
|
208
|
+
void (*destroy)(Analyzer *a),
|
184
209
|
TokenStream *(*get_ts)(Analyzer *a, char *field, char *text))
|
185
210
|
{
|
186
211
|
Analyzer *a = ALLOC(Analyzer);
|
@@ -188,6 +213,7 @@ Analyzer *analyzer_create(void *data, TokenStream *ts, void (*destroy)(void *),
|
|
188
213
|
a->current_ts = ts;
|
189
214
|
a->destroy = (destroy ? destroy : &a_standard_destroy);
|
190
215
|
a->get_ts = (get_ts ? get_ts : &a_standard_get_ts);
|
216
|
+
a->ref_cnt = 1;
|
191
217
|
return a;
|
192
218
|
}
|
193
219
|
|
@@ -284,7 +310,7 @@ Token *mb_wst_next_lc(TokenStream *ts)
|
|
284
310
|
MB_NEXT_CHAR;
|
285
311
|
}
|
286
312
|
*w = 0;
|
287
|
-
w_tk_set(ts->token, wbuf, start - ts->text, t - ts->text, 1);
|
313
|
+
w_tk_set(ts->token, wbuf, (int)(start - ts->text), (int)(t - ts->text), 1);
|
288
314
|
ts->t = t;
|
289
315
|
return ts->token;
|
290
316
|
}
|
@@ -409,7 +435,7 @@ Token *mb_lt_next_lc(TokenStream *ts)
|
|
409
435
|
MB_NEXT_CHAR;
|
410
436
|
}
|
411
437
|
*w = 0;
|
412
|
-
w_tk_set(ts->token, wbuf, start - ts->text, t - ts->text, 1);
|
438
|
+
w_tk_set(ts->token, wbuf, (int)(start - ts->text), (int)(t - ts->text), 1);
|
413
439
|
ts->t = t;
|
414
440
|
return ts->token;
|
415
441
|
}
|
@@ -472,7 +498,7 @@ int mb_std_get_alpha(TokenStream *ts, char *token)
|
|
472
498
|
if ((i = mbtowc(&w, t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
|
473
499
|
}
|
474
500
|
|
475
|
-
i = t - ts->t;
|
501
|
+
i = (int)(t - ts->t);
|
476
502
|
if (i > MAX_WORD_SIZE) i = MAX_WORD_SIZE - 1;
|
477
503
|
memcpy(token, ts->t, i);
|
478
504
|
return i;
|
@@ -500,7 +526,7 @@ int mb_std_get_alnum(char *text, char *token, TokenStream *ts)
|
|
500
526
|
if ((i = mbtowc(&w, t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
|
501
527
|
}
|
502
528
|
|
503
|
-
i = t - ts->t;
|
529
|
+
i = (int)(t - ts->t);
|
504
530
|
if (i > MAX_WORD_SIZE) i = MAX_WORD_SIZE - 1;
|
505
531
|
memcpy(token, ts->t, i);
|
506
532
|
return i;
|
@@ -599,7 +625,7 @@ int std_get_apostrophe(char *input)
|
|
599
625
|
while (isalpha(*t) || *t == '\'')
|
600
626
|
t++;
|
601
627
|
|
602
|
-
return t - input;
|
628
|
+
return (int)(t - input);
|
603
629
|
}
|
604
630
|
|
605
631
|
int mb_std_get_apostrophe(char *input)
|
@@ -613,7 +639,7 @@ int mb_std_get_apostrophe(char *input)
|
|
613
639
|
t += i;
|
614
640
|
if ((i = mbtowc(&w, t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
|
615
641
|
}
|
616
|
-
return t - input;
|
642
|
+
return (int)(t - input);
|
617
643
|
}
|
618
644
|
|
619
645
|
int std_get_url(char *input, char *token, int i)
|
@@ -654,7 +680,7 @@ int mb_std_get_company_name(char *input, TokenStream *ts)
|
|
654
680
|
MB_NEXT_CHAR;
|
655
681
|
}
|
656
682
|
|
657
|
-
return t - input;
|
683
|
+
return (int)(t - input);
|
658
684
|
}
|
659
685
|
|
660
686
|
bool std_advance_to_start(TokenStream *ts)
|
@@ -723,7 +749,7 @@ Token *std_next(TokenStream *ts)
|
|
723
749
|
if (*t == '\'') { // apostrophe case.
|
724
750
|
t += std_tz->get_apostrophe(t);
|
725
751
|
ts->t = t;
|
726
|
-
len = t - start;
|
752
|
+
len = (int)(t - start);
|
727
753
|
// strip possesive
|
728
754
|
if ((t[-1] == 's' || t[-1] == 'S') && t[-2] == '\'') t -= 2;
|
729
755
|
|
@@ -760,13 +786,14 @@ Token *std_next(TokenStream *ts)
|
|
760
786
|
memcmp(token, "file", 4) == 0)) {
|
761
787
|
len = std_get_url(t, token, 0); // dispose of first part of the URL
|
762
788
|
} else { //still treat as url but keep the first part
|
763
|
-
token_i = t - start;
|
789
|
+
token_i = (int)(t - start);
|
764
790
|
memcpy(token, start, token_i * sizeof(char));
|
765
791
|
len = token_i + std_get_url(t, token, token_i); // keep start
|
766
792
|
}
|
767
793
|
ts->t = t + len;
|
768
794
|
token[len] = 0;
|
769
|
-
tk_set(ts->token, token, len, start - ts->text,
|
795
|
+
tk_set(ts->token, token, len, (int)(start - ts->text),
|
796
|
+
(int)(ts->t - ts->text), 1);
|
770
797
|
return ts->token;
|
771
798
|
}
|
772
799
|
|
@@ -806,7 +833,8 @@ Token *std_next(TokenStream *ts)
|
|
806
833
|
token_i++;
|
807
834
|
}
|
808
835
|
}
|
809
|
-
tk_set(ts->token, token, token_i, start - ts->text,
|
836
|
+
tk_set(ts->token, token, token_i, (int)(start - ts->text),
|
837
|
+
(int)(t - ts->text), 1);
|
810
838
|
} else { // just return the url as is
|
811
839
|
tk_set_ts(ts->token, start, t, ts->text, 1);
|
812
840
|
}
|
@@ -819,9 +847,8 @@ Token *std_next(TokenStream *ts)
|
|
819
847
|
return ts->token;
|
820
848
|
}
|
821
849
|
|
822
|
-
void std_ts_destroy(
|
850
|
+
void std_ts_destroy(TokenStream *ts)
|
823
851
|
{
|
824
|
-
TokenStream *ts = (TokenStream *)p;
|
825
852
|
free(ts->data);
|
826
853
|
ts_standard_destroy(ts);
|
827
854
|
}
|
@@ -871,19 +898,18 @@ void filter_reset(TokenStream *ts, char *text)
|
|
871
898
|
ts->sub_ts->reset(ts->sub_ts, text);
|
872
899
|
}
|
873
900
|
|
874
|
-
void filter_destroy(
|
901
|
+
void filter_destroy(TokenStream *tf)
|
875
902
|
{
|
876
|
-
|
877
|
-
if (tf->destroy_sub) tf->sub_ts->destroy(tf->sub_ts);
|
903
|
+
ts_deref(tf->sub_ts);
|
878
904
|
if (tf->token != NULL) tk_destroy(tf->token);
|
879
905
|
free(tf);
|
880
906
|
}
|
881
907
|
|
882
|
-
void sf_destroy(
|
908
|
+
void sf_destroy(TokenStream *tf)
|
883
909
|
{
|
884
|
-
HshTable *words = (HshTable *)
|
910
|
+
HshTable *words = (HshTable *)tf->data;
|
885
911
|
h_destroy(words);
|
886
|
-
filter_destroy(
|
912
|
+
filter_destroy(tf);
|
887
913
|
}
|
888
914
|
|
889
915
|
void sf_clone_i_i(void *key, void *value, void *arg)
|
@@ -917,10 +943,10 @@ TokenStream *stop_filter_create_with_words_len(TokenStream *ts,
|
|
917
943
|
{
|
918
944
|
int i;
|
919
945
|
char *w;
|
946
|
+
HshTable *wordtable = h_new_str(&free, (free_ft)NULL);
|
920
947
|
TokenStream *tf = ALLOC(TokenStream);
|
921
948
|
tf->sub_ts = ts;
|
922
|
-
|
923
|
-
HshTable *wordtable = h_new_str(&free, NULL);
|
949
|
+
|
924
950
|
for (i = 0; i < len; i++) {
|
925
951
|
w = estrdup(words[i]);
|
926
952
|
h_set(wordtable, w, w);
|
@@ -931,16 +957,16 @@ TokenStream *stop_filter_create_with_words_len(TokenStream *ts,
|
|
931
957
|
tf->reset = &filter_reset;
|
932
958
|
tf->destroy = &sf_destroy;
|
933
959
|
tf->clone_i = &sf_clone_i;
|
960
|
+
tf->ref_cnt = 1;
|
934
961
|
return tf;
|
935
962
|
}
|
936
963
|
|
937
964
|
TokenStream *stop_filter_create_with_words(TokenStream *ts, const char **words)
|
938
965
|
{
|
939
966
|
char *w;
|
967
|
+
HshTable *wordtable = h_new_str(&free, (free_ft)NULL);
|
940
968
|
TokenStream *tf = ALLOC(TokenStream);
|
941
969
|
tf->sub_ts = ts;
|
942
|
-
tf->destroy_sub = true;
|
943
|
-
HshTable *wordtable = h_new_str(&free, NULL);
|
944
970
|
while (*words) {
|
945
971
|
w = estrdup(*words);
|
946
972
|
h_set(wordtable, w, w);
|
@@ -952,6 +978,7 @@ TokenStream *stop_filter_create_with_words(TokenStream *ts, const char **words)
|
|
952
978
|
tf->reset = &filter_reset;
|
953
979
|
tf->destroy = &sf_destroy;
|
954
980
|
tf->clone_i = &sf_clone_i;
|
981
|
+
tf->ref_cnt = 1;
|
955
982
|
return tf;
|
956
983
|
}
|
957
984
|
|
@@ -968,7 +995,7 @@ Token *mb_lcf_next(TokenStream *ts)
|
|
968
995
|
Token *tk = ts->sub_ts->next(ts->sub_ts);
|
969
996
|
if (tk == NULL) return tk;
|
970
997
|
|
971
|
-
i = mbstowcs(wbuf, tk->text, MAX_WORD_SIZE);
|
998
|
+
i = (int)mbstowcs(wbuf, tk->text, MAX_WORD_SIZE);
|
972
999
|
w = wbuf;
|
973
1000
|
while (*w != 0) {
|
974
1001
|
*w = towlower(*w);
|
@@ -986,8 +1013,8 @@ TokenStream *mb_lowercase_filter_create(TokenStream *ts)
|
|
986
1013
|
tf->reset = &filter_reset;
|
987
1014
|
tf->destroy = &filter_destroy;
|
988
1015
|
tf->sub_ts = ts;
|
989
|
-
tf->destroy_sub = true;
|
990
1016
|
tf->clone_i = NULL;
|
1017
|
+
tf->ref_cnt = 1;
|
991
1018
|
return tf;
|
992
1019
|
}
|
993
1020
|
|
@@ -1011,8 +1038,8 @@ TokenStream *lowercase_filter_create(TokenStream *ts)
|
|
1011
1038
|
tf->reset = &filter_reset;
|
1012
1039
|
tf->destroy = &filter_destroy;
|
1013
1040
|
tf->sub_ts = ts;
|
1014
|
-
tf->destroy_sub = true;
|
1015
1041
|
tf->clone_i = NULL;
|
1042
|
+
tf->ref_cnt = 1;
|
1016
1043
|
return tf;
|
1017
1044
|
}
|
1018
1045
|
|
@@ -1022,15 +1049,14 @@ typedef struct StemFilter {
|
|
1022
1049
|
char *charenc;
|
1023
1050
|
} StemFilter;
|
1024
1051
|
|
1025
|
-
void stemf_destroy(
|
1052
|
+
void stemf_destroy(TokenStream *tf)
|
1026
1053
|
{
|
1027
|
-
|
1028
|
-
StemFilter *stemf = (StemFilter *)ts->data;
|
1054
|
+
StemFilter *stemf = (StemFilter *)tf->data;
|
1029
1055
|
sb_stemmer_delete(stemf->stemmer);
|
1030
1056
|
free(stemf->algorithm);
|
1031
1057
|
free(stemf->charenc);
|
1032
1058
|
free(stemf);
|
1033
|
-
filter_destroy(
|
1059
|
+
filter_destroy(tf);
|
1034
1060
|
}
|
1035
1061
|
|
1036
1062
|
Token *stemf_next(TokenStream *ts)
|
@@ -1040,7 +1066,7 @@ Token *stemf_next(TokenStream *ts)
|
|
1040
1066
|
struct sb_stemmer *stemmer = ((StemFilter *)ts->data)->stemmer;
|
1041
1067
|
Token *tk = ts->sub_ts->next(ts->sub_ts);
|
1042
1068
|
if (tk == NULL) return tk;
|
1043
|
-
stemmed = sb_stemmer_stem(stemmer, (sb_symbol *)tk->text, strlen(tk->text));
|
1069
|
+
stemmed = sb_stemmer_stem(stemmer, (sb_symbol *)tk->text, (int)strlen(tk->text));
|
1044
1070
|
len = sb_stemmer_length(stemmer);
|
1045
1071
|
if (len >= MAX_WORD_SIZE) len = MAX_WORD_SIZE - 1;
|
1046
1072
|
memcpy(tk->text, stemmed, len);
|
@@ -1074,7 +1100,7 @@ TokenStream *stem_filter_create(TokenStream *ts, const char * algorithm,
|
|
1074
1100
|
tf->destroy = &stemf_destroy;
|
1075
1101
|
tf->clone_i = &stemf_clone_i;
|
1076
1102
|
tf->sub_ts = ts;
|
1077
|
-
tf->
|
1103
|
+
tf->ref_cnt = 1;
|
1078
1104
|
return tf;
|
1079
1105
|
}
|
1080
1106
|
|
@@ -1148,19 +1174,12 @@ Analyzer *mb_standard_analyzer_create(bool lowercase)
|
|
1148
1174
|
*
|
1149
1175
|
****************************************************************************/
|
1150
1176
|
|
1151
|
-
|
1152
|
-
HshTable *dict;
|
1153
|
-
Analyzer *def;
|
1154
|
-
bool destroy_subs : 1;
|
1155
|
-
} PerFieldAnalyzer;
|
1156
|
-
|
1157
|
-
void pfa_destroy(void *p)
|
1177
|
+
void pfa_destroy(Analyzer *self)
|
1158
1178
|
{
|
1159
|
-
Analyzer *self = (Analyzer *)p;
|
1160
1179
|
PerFieldAnalyzer *pfa = (PerFieldAnalyzer *)self->data;
|
1161
1180
|
h_destroy(pfa->dict);
|
1162
1181
|
|
1163
|
-
|
1182
|
+
a_deref(pfa->def);
|
1164
1183
|
free(pfa);
|
1165
1184
|
free(self);
|
1166
1185
|
}
|
@@ -1176,7 +1195,7 @@ TokenStream *pfa_get_ts(Analyzer *self, char *field, char *text)
|
|
1176
1195
|
void pfa_sub_a_destroy(void *p)
|
1177
1196
|
{
|
1178
1197
|
Analyzer *a = (Analyzer *)p;
|
1179
|
-
|
1198
|
+
a_deref(a);
|
1180
1199
|
}
|
1181
1200
|
|
1182
1201
|
void pfa_add_field(Analyzer *self, char *field, Analyzer *analyzer)
|
@@ -1185,13 +1204,11 @@ void pfa_add_field(Analyzer *self, char *field, Analyzer *analyzer)
|
|
1185
1204
|
h_set(pfa->dict, estrdup(field), analyzer);
|
1186
1205
|
}
|
1187
1206
|
|
1188
|
-
Analyzer *per_field_analyzer_create(Analyzer *def
|
1207
|
+
Analyzer *per_field_analyzer_create(Analyzer *def)
|
1189
1208
|
{
|
1190
1209
|
PerFieldAnalyzer *pfa = ALLOC(PerFieldAnalyzer);
|
1191
1210
|
pfa->def = def;
|
1192
|
-
pfa->
|
1193
|
-
pfa->dict = destroy_subs ? h_new_str(&free, &pfa_sub_a_destroy)
|
1194
|
-
: h_new_str(&free, NULL);
|
1211
|
+
pfa->dict = h_new_str(&free, &pfa_sub_a_destroy);
|
1195
1212
|
return analyzer_create(pfa, NULL, &pfa_destroy, &pfa_get_ts);
|
1196
1213
|
}
|
1197
1214
|
|