ferret 0.9.0 → 0.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +23 -5
- data/TODO +2 -1
- data/ext/analysis.c +838 -177
- data/ext/analysis.h +55 -7
- data/ext/api.c +69 -0
- data/ext/api.h +27 -0
- data/ext/array.c +8 -5
- data/ext/compound_io.c +132 -96
- data/ext/document.c +58 -28
- data/ext/except.c +59 -0
- data/ext/except.h +88 -0
- data/ext/ferret.c +47 -3
- data/ext/ferret.h +3 -0
- data/ext/field.c +15 -9
- data/ext/filter.c +1 -1
- data/ext/fs_store.c +215 -34
- data/ext/global.c +72 -3
- data/ext/global.h +4 -3
- data/ext/hash.c +44 -3
- data/ext/hash.h +9 -0
- data/ext/header.h +58 -0
- data/ext/inc/except.h +88 -0
- data/ext/inc/lang.h +23 -13
- data/ext/ind.c +16 -10
- data/ext/index.h +2 -22
- data/ext/index_io.c +3 -11
- data/ext/index_rw.c +245 -193
- data/ext/lang.h +23 -13
- data/ext/libstemmer.c +92 -0
- data/ext/libstemmer.h +79 -0
- data/ext/modules.h +162 -0
- data/ext/q_boolean.c +34 -21
- data/ext/q_const_score.c +6 -12
- data/ext/q_filtered_query.c +206 -0
- data/ext/q_fuzzy.c +18 -15
- data/ext/q_match_all.c +3 -7
- data/ext/q_multi_phrase.c +10 -14
- data/ext/q_parser.c +29 -2
- data/ext/q_phrase.c +14 -21
- data/ext/q_prefix.c +15 -12
- data/ext/q_range.c +30 -28
- data/ext/q_span.c +13 -21
- data/ext/q_term.c +17 -26
- data/ext/r_analysis.c +693 -21
- data/ext/r_doc.c +11 -12
- data/ext/r_index_io.c +4 -1
- data/ext/r_qparser.c +21 -2
- data/ext/r_search.c +285 -18
- data/ext/ram_store.c +5 -2
- data/ext/search.c +11 -17
- data/ext/search.h +21 -45
- data/ext/similarity.h +67 -0
- data/ext/sort.c +30 -25
- data/ext/stem_ISO_8859_1_danish.c +338 -0
- data/ext/stem_ISO_8859_1_danish.h +16 -0
- data/ext/stem_ISO_8859_1_dutch.c +635 -0
- data/ext/stem_ISO_8859_1_dutch.h +16 -0
- data/ext/stem_ISO_8859_1_english.c +1156 -0
- data/ext/stem_ISO_8859_1_english.h +16 -0
- data/ext/stem_ISO_8859_1_finnish.c +792 -0
- data/ext/stem_ISO_8859_1_finnish.h +16 -0
- data/ext/stem_ISO_8859_1_french.c +1276 -0
- data/ext/stem_ISO_8859_1_french.h +16 -0
- data/ext/stem_ISO_8859_1_german.c +512 -0
- data/ext/stem_ISO_8859_1_german.h +16 -0
- data/ext/stem_ISO_8859_1_italian.c +1091 -0
- data/ext/stem_ISO_8859_1_italian.h +16 -0
- data/ext/stem_ISO_8859_1_norwegian.c +296 -0
- data/ext/stem_ISO_8859_1_norwegian.h +16 -0
- data/ext/stem_ISO_8859_1_porter.c +776 -0
- data/ext/stem_ISO_8859_1_porter.h +16 -0
- data/ext/stem_ISO_8859_1_portuguese.c +1035 -0
- data/ext/stem_ISO_8859_1_portuguese.h +16 -0
- data/ext/stem_ISO_8859_1_spanish.c +1119 -0
- data/ext/stem_ISO_8859_1_spanish.h +16 -0
- data/ext/stem_ISO_8859_1_swedish.c +307 -0
- data/ext/stem_ISO_8859_1_swedish.h +16 -0
- data/ext/stem_KOI8_R_russian.c +701 -0
- data/ext/stem_KOI8_R_russian.h +16 -0
- data/ext/stem_UTF_8_danish.c +344 -0
- data/ext/stem_UTF_8_danish.h +16 -0
- data/ext/stem_UTF_8_dutch.c +653 -0
- data/ext/stem_UTF_8_dutch.h +16 -0
- data/ext/stem_UTF_8_english.c +1176 -0
- data/ext/stem_UTF_8_english.h +16 -0
- data/ext/stem_UTF_8_finnish.c +808 -0
- data/ext/stem_UTF_8_finnish.h +16 -0
- data/ext/stem_UTF_8_french.c +1296 -0
- data/ext/stem_UTF_8_french.h +16 -0
- data/ext/stem_UTF_8_german.c +526 -0
- data/ext/stem_UTF_8_german.h +16 -0
- data/ext/stem_UTF_8_italian.c +1113 -0
- data/ext/stem_UTF_8_italian.h +16 -0
- data/ext/stem_UTF_8_norwegian.c +302 -0
- data/ext/stem_UTF_8_norwegian.h +16 -0
- data/ext/stem_UTF_8_porter.c +794 -0
- data/ext/stem_UTF_8_porter.h +16 -0
- data/ext/stem_UTF_8_portuguese.c +1055 -0
- data/ext/stem_UTF_8_portuguese.h +16 -0
- data/ext/stem_UTF_8_russian.c +709 -0
- data/ext/stem_UTF_8_russian.h +16 -0
- data/ext/stem_UTF_8_spanish.c +1137 -0
- data/ext/stem_UTF_8_spanish.h +16 -0
- data/ext/stem_UTF_8_swedish.c +313 -0
- data/ext/stem_UTF_8_swedish.h +16 -0
- data/ext/stopwords.c +325 -0
- data/ext/store.c +34 -2
- data/ext/tags +2953 -0
- data/ext/term.c +21 -15
- data/ext/termdocs.c +5 -3
- data/ext/utilities.c +446 -0
- data/ext/vector.c +27 -13
- data/lib/ferret/document/document.rb +1 -1
- data/lib/ferret/index/index.rb +44 -6
- data/lib/ferret/query_parser/query_parser.tab.rb +7 -3
- data/lib/rferret.rb +2 -1
- data/test/test_helper.rb +2 -2
- data/test/unit/analysis/ctc_analyzer.rb +401 -0
- data/test/unit/analysis/ctc_tokenstream.rb +423 -0
- data/test/unit/analysis/{tc_letter_tokenizer.rb → rtc_letter_tokenizer.rb} +0 -0
- data/test/unit/analysis/{tc_lower_case_filter.rb → rtc_lower_case_filter.rb} +0 -0
- data/test/unit/analysis/{tc_lower_case_tokenizer.rb → rtc_lower_case_tokenizer.rb} +0 -0
- data/test/unit/analysis/{tc_per_field_analyzer_wrapper.rb → rtc_per_field_analyzer_wrapper.rb} +0 -0
- data/test/unit/analysis/{tc_porter_stem_filter.rb → rtc_porter_stem_filter.rb} +0 -0
- data/test/unit/analysis/{tc_standard_analyzer.rb → rtc_standard_analyzer.rb} +0 -0
- data/test/unit/analysis/{tc_standard_tokenizer.rb → rtc_standard_tokenizer.rb} +0 -0
- data/test/unit/analysis/{tc_stop_analyzer.rb → rtc_stop_analyzer.rb} +0 -0
- data/test/unit/analysis/{tc_stop_filter.rb → rtc_stop_filter.rb} +0 -0
- data/test/unit/analysis/{tc_white_space_analyzer.rb → rtc_white_space_analyzer.rb} +0 -0
- data/test/unit/analysis/{tc_white_space_tokenizer.rb → rtc_white_space_tokenizer.rb} +0 -0
- data/test/unit/analysis/{tc_word_list_loader.rb → rtc_word_list_loader.rb} +0 -0
- data/test/unit/analysis/tc_analyzer.rb +1 -2
- data/test/unit/analysis/{c_token.rb → tc_token.rb} +0 -0
- data/test/unit/document/rtc_field.rb +28 -0
- data/test/unit/document/{c_document.rb → tc_document.rb} +0 -0
- data/test/unit/document/tc_field.rb +82 -12
- data/test/unit/index/{tc_compound_file_io.rb → rtc_compound_file_io.rb} +0 -0
- data/test/unit/index/{tc_field_infos.rb → rtc_field_infos.rb} +0 -0
- data/test/unit/index/{tc_fields_io.rb → rtc_fields_io.rb} +0 -0
- data/test/unit/index/{tc_multiple_term_doc_pos_enum.rb → rtc_multiple_term_doc_pos_enum.rb} +0 -0
- data/test/unit/index/{tc_segment_infos.rb → rtc_segment_infos.rb} +0 -0
- data/test/unit/index/{tc_segment_term_docs.rb → rtc_segment_term_docs.rb} +0 -0
- data/test/unit/index/{tc_segment_term_enum.rb → rtc_segment_term_enum.rb} +0 -0
- data/test/unit/index/{tc_segment_term_vector.rb → rtc_segment_term_vector.rb} +0 -0
- data/test/unit/index/{tc_term_buffer.rb → rtc_term_buffer.rb} +0 -0
- data/test/unit/index/{tc_term_info.rb → rtc_term_info.rb} +0 -0
- data/test/unit/index/{tc_term_infos_io.rb → rtc_term_infos_io.rb} +0 -0
- data/test/unit/index/{tc_term_vectors_io.rb → rtc_term_vectors_io.rb} +0 -0
- data/test/unit/index/{c_index.rb → tc_index.rb} +26 -6
- data/test/unit/index/{c_index_reader.rb → tc_index_reader.rb} +0 -0
- data/test/unit/index/{c_index_writer.rb → tc_index_writer.rb} +0 -0
- data/test/unit/index/{c_term.rb → tc_term.rb} +0 -0
- data/test/unit/index/{c_term_voi.rb → tc_term_voi.rb} +0 -0
- data/test/unit/query_parser/{c_query_parser.rb → rtc_query_parser.rb} +14 -14
- data/test/unit/query_parser/tc_query_parser.rb +24 -16
- data/test/unit/search/{tc_similarity.rb → rtc_similarity.rb} +0 -0
- data/test/unit/search/rtc_sort_field.rb +14 -0
- data/test/unit/search/{c_filter.rb → tc_filter.rb} +11 -11
- data/test/unit/search/{c_fuzzy_query.rb → tc_fuzzy_query.rb} +0 -0
- data/test/unit/search/{c_index_searcher.rb → tc_index_searcher.rb} +0 -0
- data/test/unit/search/{c_search_and_sort.rb → tc_search_and_sort.rb} +0 -0
- data/test/unit/search/{c_sort.rb → tc_sort.rb} +0 -0
- data/test/unit/search/tc_sort_field.rb +20 -7
- data/test/unit/search/{c_spans.rb → tc_spans.rb} +0 -0
- data/test/unit/store/rtc_fs_store.rb +62 -0
- data/test/unit/store/rtc_ram_store.rb +15 -0
- data/test/unit/store/rtm_store.rb +150 -0
- data/test/unit/store/rtm_store_lock.rb +2 -0
- data/test/unit/store/tc_fs_store.rb +54 -40
- data/test/unit/store/tc_ram_store.rb +20 -0
- data/test/unit/store/tm_store.rb +30 -146
- data/test/unit/store/tm_store_lock.rb +66 -0
- data/test/unit/utils/{tc_bit_vector.rb → rtc_bit_vector.rb} +0 -0
- data/test/unit/utils/{tc_date_tools.rb → rtc_date_tools.rb} +0 -0
- data/test/unit/utils/{tc_number_tools.rb → rtc_number_tools.rb} +0 -0
- data/test/unit/utils/{tc_parameter.rb → rtc_parameter.rb} +0 -0
- data/test/unit/utils/{tc_priority_queue.rb → rtc_priority_queue.rb} +0 -0
- data/test/unit/utils/{tc_string_helper.rb → rtc_string_helper.rb} +0 -0
- data/test/unit/utils/{tc_thread.rb → rtc_thread.rb} +0 -0
- data/test/unit/utils/{tc_weak_key_hash.rb → rtc_weak_key_hash.rb} +0 -0
- metadata +360 -289
- data/test/unit/document/c_field.rb +0 -98
- data/test/unit/search/c_sort_field.rb +0 -27
- data/test/unit/store/c_fs_store.rb +0 -76
- data/test/unit/store/c_ram_store.rb +0 -35
- data/test/unit/store/m_store.rb +0 -34
- data/test/unit/store/m_store_lock.rb +0 -68
data/Rakefile
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
$:. << 'lib'
|
|
2
2
|
# Some parts of this Rakefile where taken from Jim Weirich's Rakefile for
|
|
3
|
-
# Rake. Other parts where
|
|
3
|
+
# Rake. Other parts where taken from the David Heinemeier Hansson's Rails
|
|
4
4
|
# Rakefile. Both are under MIT-LICENSE. Thanks to both for their excellent
|
|
5
5
|
# projects.
|
|
6
6
|
|
|
@@ -32,12 +32,13 @@ end
|
|
|
32
32
|
$VERBOSE = nil
|
|
33
33
|
|
|
34
34
|
EXT = "ferret_ext.so"
|
|
35
|
-
EXT_SRC = FileList["src
|
|
35
|
+
EXT_SRC = FileList["src/**/*.[ch]"]
|
|
36
|
+
|
|
36
37
|
EXT_SRC_DEST = EXT_SRC.map {|fn| File.join("ext", File.basename(fn))}
|
|
37
38
|
SRC = (FileList["ext/*.[ch]"] + EXT_SRC_DEST).uniq
|
|
38
39
|
|
|
39
40
|
CLEAN.include(FileList['**/*.o', 'InstalledFiles', '.config'])
|
|
40
|
-
CLOBBER.include(FileList['**/*.so'], 'ext/Makefile')
|
|
41
|
+
CLOBBER.include(FileList['**/*.so'], 'ext/Makefile', EXT_SRC_DEST)
|
|
41
42
|
|
|
42
43
|
task :default => :all_tests
|
|
43
44
|
desc "Run all tests"
|
|
@@ -57,7 +58,7 @@ end
|
|
|
57
58
|
desc "run unit tests in test/unit for C ferret"
|
|
58
59
|
Rake::TestTask.new("test_cunits" => :ext) do |t|
|
|
59
60
|
t.libs << "test/unit"
|
|
60
|
-
t.pattern = 'test/unit/
|
|
61
|
+
t.pattern = 'test/unit/ts_*.rb'
|
|
61
62
|
t.verbose = true
|
|
62
63
|
end
|
|
63
64
|
|
|
@@ -102,6 +103,15 @@ EXT_SRC.each do |fn|
|
|
|
102
103
|
dest_fn = File.join("ext", File.basename(fn))
|
|
103
104
|
file dest_fn => fn do |t|
|
|
104
105
|
cp fn, dest_fn
|
|
106
|
+
if fn =~ /stemmer/
|
|
107
|
+
# flatten the directory structure for lib_stemmer
|
|
108
|
+
open(dest_fn) do |in_f|
|
|
109
|
+
open(dest_fn + ".out", "w") do |out_f|
|
|
110
|
+
in_f.each {|line| out_f.write(line.sub(/(#include ["<])[.a-z_\/]*\//) {"#{$1}"})}
|
|
111
|
+
end
|
|
112
|
+
end
|
|
113
|
+
mv dest_fn + ".out", dest_fn
|
|
114
|
+
end
|
|
105
115
|
end
|
|
106
116
|
end
|
|
107
117
|
|
|
@@ -110,9 +120,17 @@ task :ext => ["ext/#{EXT}"] + SRC
|
|
|
110
120
|
|
|
111
121
|
file "ext/#{EXT}" => ["ext/Makefile"] do
|
|
112
122
|
cp "ext/inc/lang.h", "ext/lang.h"
|
|
123
|
+
cp "ext/inc/except.h", "ext/except.h"
|
|
113
124
|
sh "cd ext; make"
|
|
114
125
|
end
|
|
115
126
|
|
|
127
|
+
file "ext/lang.h" => ["ext/inc/lang.h"] do
|
|
128
|
+
cp "ext/inc/lang.h", "ext/lang.h"
|
|
129
|
+
end
|
|
130
|
+
file "ext/except.h" => ["ext/inc/except.h"] do
|
|
131
|
+
cp "ext/inc/except.h", "ext/except.h"
|
|
132
|
+
end
|
|
133
|
+
|
|
116
134
|
file "ext/Makefile" => SRC do
|
|
117
135
|
sh "cd ext; ruby extconf.rb"
|
|
118
136
|
end
|
|
@@ -220,7 +238,7 @@ task :repackage => EXT_SRC_DEST
|
|
|
220
238
|
task :package => EXT_SRC_DEST
|
|
221
239
|
task :tag => [:prerelease]
|
|
222
240
|
task :update_version => [:prerelease]
|
|
223
|
-
task :release
|
|
241
|
+
task :release do #=> [:tag, :update_version, :package] do
|
|
224
242
|
announce
|
|
225
243
|
announce "**************************************************************"
|
|
226
244
|
announce "* Release #{PKG_VERSION} Complete."
|
data/TODO
CHANGED
|
@@ -4,11 +4,12 @@ Send suggestions for this list to mailto:dbalmain@gmail.com
|
|
|
4
4
|
|
|
5
5
|
=== To Do
|
|
6
6
|
|
|
7
|
-
* Add the ability to persist an in memory index to Ferret::Index::Index
|
|
8
7
|
* Make a dll for people on Windows
|
|
8
|
+
* pure ruby ConstantScoreQuery
|
|
9
9
|
|
|
10
10
|
=== Done
|
|
11
11
|
|
|
12
|
+
* Add the ability to persist an in memory index to Ferret::Index::Index
|
|
12
13
|
* Add UTF-8 support
|
|
13
14
|
* Multi Field Query
|
|
14
15
|
* Test threading
|
data/ext/analysis.c
CHANGED
|
@@ -1,7 +1,16 @@
|
|
|
1
1
|
#include <analysis.h>
|
|
2
2
|
#include <string.h>
|
|
3
3
|
#include <ctype.h>
|
|
4
|
-
#include <
|
|
4
|
+
#include <wctype.h>
|
|
5
|
+
#include <wchar.h>
|
|
6
|
+
#include "hash.h"
|
|
7
|
+
#include "libstemmer.h"
|
|
8
|
+
|
|
9
|
+
/****************************************************************************
|
|
10
|
+
*
|
|
11
|
+
* Token
|
|
12
|
+
*
|
|
13
|
+
****************************************************************************/
|
|
5
14
|
|
|
6
15
|
Token *tk_create()
|
|
7
16
|
{
|
|
@@ -24,6 +33,11 @@ inline Token *tk_set(Token *tk, char *text, int tlen, int start, int end, int po
|
|
|
24
33
|
return tk;
|
|
25
34
|
}
|
|
26
35
|
|
|
36
|
+
inline Token *tk_set_ts(Token *tk, char *start, char *end, char *text, int pos_inc)
|
|
37
|
+
{
|
|
38
|
+
return tk_set(tk, start, end - start, start - text, end - text, pos_inc);
|
|
39
|
+
}
|
|
40
|
+
|
|
27
41
|
inline Token *tk_set_no_len(Token *tk, char *text, int start, int end, int pos_inc)
|
|
28
42
|
{
|
|
29
43
|
return tk_set(tk, text, strlen(text), start, end, pos_inc);
|
|
@@ -31,11 +45,8 @@ inline Token *tk_set_no_len(Token *tk, char *text, int start, int end, int pos_i
|
|
|
31
45
|
|
|
32
46
|
int tk_eq(Token *tk1, Token *tk2)
|
|
33
47
|
{
|
|
34
|
-
|
|
35
|
-
tk1->start == tk2->start && tk1->end == tk2->end)
|
|
36
|
-
return true;
|
|
37
|
-
else
|
|
38
|
-
return false;
|
|
48
|
+
return (strcmp((char *)tk1->text, (char *)tk2->text) == 0 &&
|
|
49
|
+
tk1->start == tk2->start && tk1->end == tk2->end);
|
|
39
50
|
}
|
|
40
51
|
|
|
41
52
|
int tk_cmp(Token *tk1, Token *tk2)
|
|
@@ -57,46 +68,152 @@ int tk_cmp(Token *tk1, Token *tk2)
|
|
|
57
68
|
return cmp;
|
|
58
69
|
}
|
|
59
70
|
|
|
71
|
+
|
|
72
|
+
/****************************************************************************
|
|
73
|
+
*
|
|
74
|
+
* TokenStream
|
|
75
|
+
*
|
|
76
|
+
****************************************************************************/
|
|
77
|
+
|
|
60
78
|
void ts_standard_destroy(void *p)
|
|
61
79
|
{
|
|
62
80
|
TokenStream *ts = (TokenStream *)p;
|
|
63
81
|
tk_destroy(ts->token);
|
|
64
|
-
free(
|
|
82
|
+
free(ts);
|
|
65
83
|
}
|
|
66
84
|
|
|
67
85
|
void ts_reset(TokenStream *ts, char *text)
|
|
68
86
|
{
|
|
69
|
-
ts->text = text;
|
|
70
|
-
ts->pos = 0;
|
|
87
|
+
ts->t = ts->text = text;
|
|
71
88
|
}
|
|
72
89
|
|
|
73
90
|
TokenStream *ts_create()
|
|
74
91
|
{
|
|
75
92
|
TokenStream *ts = ALLOC(TokenStream);
|
|
76
|
-
ts->pos = -1;
|
|
77
93
|
ts->text = NULL;
|
|
78
94
|
ts->token = tk_create();
|
|
79
95
|
ts->destroy = &ts_standard_destroy;
|
|
80
96
|
ts->reset = &ts_reset;
|
|
97
|
+
ts->sub_ts = NULL;
|
|
98
|
+
ts->clone_i = NULL;
|
|
81
99
|
return ts;
|
|
82
100
|
}
|
|
83
101
|
|
|
102
|
+
TokenStream *ts_clone(TokenStream *orig_ts)
|
|
103
|
+
{
|
|
104
|
+
TokenStream *ts = ALLOC(TokenStream);
|
|
105
|
+
memcpy(ts, orig_ts, sizeof(TokenStream));
|
|
106
|
+
if (orig_ts->token) {
|
|
107
|
+
ts->token = ALLOC(Token);
|
|
108
|
+
memcpy(ts->token, orig_ts->token, sizeof(Token));
|
|
109
|
+
}
|
|
110
|
+
if (orig_ts->sub_ts) ts->sub_ts = ts_clone(orig_ts->sub_ts);
|
|
111
|
+
if (orig_ts->clone_i) orig_ts->clone_i(orig_ts, ts);
|
|
112
|
+
return ts;
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
/* * Multi-byte TokenStream * */
|
|
116
|
+
static char * const ENC_ERR_MSG = "Error decoding input string. "
|
|
117
|
+
"Check that you have the locale set correctly";
|
|
118
|
+
#define MB_NEXT_CHAR \
|
|
119
|
+
if ((i = mbrtowc(&wchr, t, MB_CUR_MAX, (mbstate_t *)ts->data)) < 0)\
|
|
120
|
+
RAISE(IO_ERROR, ENC_ERR_MSG)
|
|
121
|
+
|
|
122
|
+
inline Token *w_tk_set(Token *tk, wchar_t *text, int start, int end, int pos_inc)
|
|
123
|
+
{
|
|
124
|
+
tk->text[wcstombs(tk->text, text, MAX_WORD_SIZE - 1)] = '\0';
|
|
125
|
+
tk->start = start;
|
|
126
|
+
tk->end = end;
|
|
127
|
+
tk->pos_inc = pos_inc;
|
|
128
|
+
return tk;
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
void mb_ts_standard_destroy(void *p)
|
|
132
|
+
{
|
|
133
|
+
TokenStream *ts = (TokenStream *)p;
|
|
134
|
+
tk_destroy(ts->token);
|
|
135
|
+
free(ts->data);
|
|
136
|
+
free(ts);
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
void mb_ts_reset(TokenStream *ts, char *text)
|
|
140
|
+
{
|
|
141
|
+
ZEROSET(ts->data, mbstate_t, 1);
|
|
142
|
+
ts_reset(ts, text);
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
void mb_ts_clone_i(TokenStream *orig_ts, TokenStream *new_ts)
|
|
146
|
+
{
|
|
147
|
+
new_ts->data = ALLOC(mbstate_t);
|
|
148
|
+
memcpy(new_ts->data, orig_ts->data, sizeof(mbstate_t));
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
TokenStream *mb_ts_create()
|
|
152
|
+
{
|
|
153
|
+
TokenStream *ts = ALLOC(TokenStream);
|
|
154
|
+
ts->data = ALLOC(mbstate_t);
|
|
155
|
+
ts->text = NULL;
|
|
156
|
+
ts->token = tk_create();
|
|
157
|
+
ts->destroy = &mb_ts_standard_destroy;
|
|
158
|
+
ts->reset = &mb_ts_reset;
|
|
159
|
+
ts->clone_i = &mb_ts_clone_i;
|
|
160
|
+
ts->sub_ts = NULL;
|
|
161
|
+
return ts;
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
/****************************************************************************
|
|
165
|
+
*
|
|
166
|
+
* Analyzer
|
|
167
|
+
*
|
|
168
|
+
****************************************************************************/
|
|
169
|
+
|
|
170
|
+
void a_standard_destroy(void *p)
|
|
171
|
+
{
|
|
172
|
+
Analyzer *a = (Analyzer *)p;
|
|
173
|
+
ts_destroy(a->current_ts);
|
|
174
|
+
free(p);
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
TokenStream *a_standard_get_ts(Analyzer *a, char *field, char *text)
|
|
178
|
+
{
|
|
179
|
+
a->current_ts->reset(a->current_ts, text);
|
|
180
|
+
return a->current_ts;
|
|
181
|
+
}
|
|
182
|
+
|
|
183
|
+
Analyzer *analyzer_create(void *data, TokenStream *ts, void (*destroy)(void *),
|
|
184
|
+
TokenStream *(*get_ts)(Analyzer *a, char *field, char *text))
|
|
185
|
+
{
|
|
186
|
+
Analyzer *a = ALLOC(Analyzer);
|
|
187
|
+
a->data = data;
|
|
188
|
+
a->current_ts = ts;
|
|
189
|
+
a->destroy = (destroy ? destroy : &a_standard_destroy);
|
|
190
|
+
a->get_ts = (get_ts ? get_ts : &a_standard_get_ts);
|
|
191
|
+
return a;
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
/****************************************************************************
|
|
195
|
+
*
|
|
196
|
+
* Whitespace
|
|
197
|
+
*
|
|
198
|
+
****************************************************************************/
|
|
199
|
+
|
|
200
|
+
/*
|
|
201
|
+
* WhitespaceTokenizer
|
|
202
|
+
*/
|
|
84
203
|
Token *wst_next(TokenStream *ts)
|
|
85
204
|
{
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
char *text = ts->text;
|
|
205
|
+
char *t = ts->t;
|
|
206
|
+
char *start;
|
|
89
207
|
|
|
90
|
-
while (
|
|
91
|
-
i++;
|
|
92
|
-
if (text[i] == '\0')
|
|
93
|
-
return NULL;
|
|
208
|
+
while (*t != '\0' && isspace(*t)) t++;
|
|
94
209
|
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
210
|
+
if (*t == '\0') return NULL;
|
|
211
|
+
|
|
212
|
+
start = t;
|
|
213
|
+
while (*t != '\0' && !isspace(*t)) t++;
|
|
214
|
+
|
|
215
|
+
ts->t = t;
|
|
216
|
+
tk_set_ts(ts->token, start, t, ts->text, 1);
|
|
100
217
|
return ts->token;
|
|
101
218
|
}
|
|
102
219
|
|
|
@@ -107,22 +224,121 @@ TokenStream *whitespace_tokenizer_create()
|
|
|
107
224
|
return ts;
|
|
108
225
|
}
|
|
109
226
|
|
|
227
|
+
/*
|
|
228
|
+
* Multi-byte WhitespaceTokenizer
|
|
229
|
+
*/
|
|
230
|
+
Token *mb_wst_next(TokenStream *ts)
|
|
231
|
+
{
|
|
232
|
+
int i;
|
|
233
|
+
char *start;
|
|
234
|
+
char *t = ts->t;
|
|
235
|
+
wchar_t wchr;
|
|
236
|
+
|
|
237
|
+
MB_NEXT_CHAR;
|
|
238
|
+
while (wchr != 0 && iswspace(wchr)) {
|
|
239
|
+
t += i;
|
|
240
|
+
MB_NEXT_CHAR;
|
|
241
|
+
}
|
|
242
|
+
if (wchr == 0) return NULL;
|
|
243
|
+
|
|
244
|
+
start = t;
|
|
245
|
+
t += i;
|
|
246
|
+
MB_NEXT_CHAR;
|
|
247
|
+
while (wchr != 0 && !iswspace(wchr)) {
|
|
248
|
+
t += i;
|
|
249
|
+
MB_NEXT_CHAR;
|
|
250
|
+
}
|
|
251
|
+
tk_set_ts(ts->token, start, t, ts->text, 1);
|
|
252
|
+
ts->t = t;
|
|
253
|
+
return ts->token;
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
/*
|
|
257
|
+
* Lowercasing Multi-byte WhitespaceTokenizer
|
|
258
|
+
*/
|
|
259
|
+
Token *mb_wst_next_lc(TokenStream *ts)
|
|
260
|
+
{
|
|
261
|
+
int i;
|
|
262
|
+
char *start;
|
|
263
|
+
char *t = ts->t;
|
|
264
|
+
wchar_t wchr;
|
|
265
|
+
wchar_t wbuf[MAX_WORD_SIZE+1], *w, *w_end;
|
|
266
|
+
|
|
267
|
+
w = wbuf;
|
|
268
|
+
w_end = &wbuf[MAX_WORD_SIZE];
|
|
269
|
+
|
|
270
|
+
MB_NEXT_CHAR;
|
|
271
|
+
while (wchr != 0 && iswspace(wchr)) {
|
|
272
|
+
t += i;
|
|
273
|
+
MB_NEXT_CHAR;
|
|
274
|
+
}
|
|
275
|
+
if (wchr == 0) return NULL;
|
|
276
|
+
|
|
277
|
+
start = t;
|
|
278
|
+
t += i;
|
|
279
|
+
*w++ = towlower(wchr);
|
|
280
|
+
MB_NEXT_CHAR;
|
|
281
|
+
while (wchr != 0 && !iswspace(wchr)) {
|
|
282
|
+
if (w < w_end) *w++ = towlower(wchr);
|
|
283
|
+
t += i;
|
|
284
|
+
MB_NEXT_CHAR;
|
|
285
|
+
}
|
|
286
|
+
*w = 0;
|
|
287
|
+
w_tk_set(ts->token, wbuf, start - ts->text, t - ts->text, 1);
|
|
288
|
+
ts->t = t;
|
|
289
|
+
return ts->token;
|
|
290
|
+
}
|
|
291
|
+
|
|
292
|
+
TokenStream *mb_whitespace_tokenizer_create(bool lowercase)
|
|
293
|
+
{
|
|
294
|
+
TokenStream *ts = mb_ts_create();
|
|
295
|
+
ts->next = lowercase ? &mb_wst_next_lc : &mb_wst_next;
|
|
296
|
+
return ts;
|
|
297
|
+
}
|
|
298
|
+
|
|
299
|
+
/*
|
|
300
|
+
* WhitespaceAnalyzers
|
|
301
|
+
*/
|
|
302
|
+
Analyzer *whitespace_analyzer_create(bool lowercase)
|
|
303
|
+
{
|
|
304
|
+
TokenStream *ts;
|
|
305
|
+
if (lowercase) {
|
|
306
|
+
ts = lowercase_filter_create(whitespace_tokenizer_create());
|
|
307
|
+
} else {
|
|
308
|
+
ts = whitespace_tokenizer_create();
|
|
309
|
+
}
|
|
310
|
+
return analyzer_create(NULL, ts, NULL, NULL);
|
|
311
|
+
}
|
|
312
|
+
|
|
313
|
+
Analyzer *mb_whitespace_analyzer_create(bool lowercase)
|
|
314
|
+
{
|
|
315
|
+
return analyzer_create(NULL, mb_whitespace_tokenizer_create(lowercase),
|
|
316
|
+
NULL, NULL);
|
|
317
|
+
}
|
|
318
|
+
|
|
319
|
+
/****************************************************************************
|
|
320
|
+
*
|
|
321
|
+
* Letter
|
|
322
|
+
*
|
|
323
|
+
****************************************************************************/
|
|
324
|
+
|
|
325
|
+
/*
|
|
326
|
+
* LetterTokenizer
|
|
327
|
+
*/
|
|
110
328
|
Token *lt_next(TokenStream *ts)
|
|
111
329
|
{
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
char *text = ts->text;
|
|
330
|
+
char *start;
|
|
331
|
+
char *t = ts->t;
|
|
115
332
|
|
|
116
|
-
while (
|
|
117
|
-
i++;
|
|
118
|
-
if (text[i] == '\0')
|
|
119
|
-
return NULL;
|
|
333
|
+
while (*t != '\0' && !isalpha(*t)) t++;
|
|
120
334
|
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
335
|
+
if (*t == '\0') return NULL;
|
|
336
|
+
|
|
337
|
+
start = t;
|
|
338
|
+
while (*t != '\0' && isalpha(*t)) t++;
|
|
339
|
+
|
|
340
|
+
tk_set_ts(ts->token, start, t, ts->text, 1);
|
|
341
|
+
ts->t = t;
|
|
126
342
|
return ts->token;
|
|
127
343
|
}
|
|
128
344
|
|
|
@@ -133,54 +349,174 @@ TokenStream *letter_tokenizer_create()
|
|
|
133
349
|
return ts;
|
|
134
350
|
}
|
|
135
351
|
|
|
136
|
-
|
|
352
|
+
/*
|
|
353
|
+
* Multi-byte LetterTokenizer
|
|
354
|
+
*/
|
|
355
|
+
Token *mb_lt_next(TokenStream *ts)
|
|
137
356
|
{
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
357
|
+
int i;
|
|
358
|
+
char *start;
|
|
359
|
+
char *t = ts->t;
|
|
360
|
+
wchar_t wchr;
|
|
361
|
+
|
|
362
|
+
MB_NEXT_CHAR;
|
|
363
|
+
while (wchr != 0 && !iswalpha(wchr)) {
|
|
364
|
+
t += i;
|
|
365
|
+
MB_NEXT_CHAR;
|
|
366
|
+
}
|
|
367
|
+
if (wchr == 0) return NULL;
|
|
368
|
+
|
|
369
|
+
start = t;
|
|
370
|
+
t += i;
|
|
371
|
+
MB_NEXT_CHAR;
|
|
372
|
+
while (wchr != 0 && iswalpha(wchr)) {
|
|
373
|
+
t += i;
|
|
374
|
+
MB_NEXT_CHAR;
|
|
375
|
+
}
|
|
376
|
+
tk_set_ts(ts->token, start, t, ts->text, 1);
|
|
377
|
+
ts->t = t;
|
|
378
|
+
return ts->token;
|
|
141
379
|
}
|
|
142
380
|
|
|
143
|
-
|
|
381
|
+
/*
|
|
382
|
+
* Lowercasing Multi-byte LetterTokenizer
|
|
383
|
+
*/
|
|
384
|
+
Token *mb_lt_next_lc(TokenStream *ts)
|
|
144
385
|
{
|
|
145
|
-
|
|
146
|
-
|
|
386
|
+
int i;
|
|
387
|
+
char *start;
|
|
388
|
+
char *t = ts->t;
|
|
389
|
+
wchar_t wchr;
|
|
390
|
+
wchar_t wbuf[MAX_WORD_SIZE+1], *w, *w_end;
|
|
391
|
+
|
|
392
|
+
w = wbuf;
|
|
393
|
+
w_end = &wbuf[MAX_WORD_SIZE];
|
|
394
|
+
|
|
395
|
+
MB_NEXT_CHAR;
|
|
396
|
+
while (wchr != 0 && !iswalpha(wchr)) {
|
|
397
|
+
t += i;
|
|
398
|
+
MB_NEXT_CHAR;
|
|
399
|
+
}
|
|
400
|
+
if (wchr == 0) return NULL;
|
|
401
|
+
|
|
402
|
+
start = t;
|
|
403
|
+
t += i;
|
|
404
|
+
*w++ = towlower(wchr);
|
|
405
|
+
MB_NEXT_CHAR;
|
|
406
|
+
while (wchr != 0 && iswalpha(wchr)) {
|
|
407
|
+
if (w < w_end) *w++ = towlower(wchr);
|
|
408
|
+
t += i;
|
|
409
|
+
MB_NEXT_CHAR;
|
|
410
|
+
}
|
|
411
|
+
*w = 0;
|
|
412
|
+
w_tk_set(ts->token, wbuf, start - ts->text, t - ts->text, 1);
|
|
413
|
+
ts->t = t;
|
|
414
|
+
return ts->token;
|
|
147
415
|
}
|
|
148
416
|
|
|
149
|
-
|
|
417
|
+
TokenStream *mb_letter_tokenizer_create(bool lowercase)
|
|
150
418
|
{
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
a->destroy = &a_standard_destroy;
|
|
155
|
-
a->get_ts = &a_standard_get_ts;
|
|
156
|
-
return a;
|
|
419
|
+
TokenStream *ts = mb_ts_create();
|
|
420
|
+
ts->next = lowercase ? &mb_lt_next_lc : &mb_lt_next;
|
|
421
|
+
return ts;
|
|
157
422
|
}
|
|
158
423
|
|
|
159
|
-
|
|
424
|
+
/*
|
|
425
|
+
* LetterAnalyzers
|
|
426
|
+
*/
|
|
427
|
+
Analyzer *letter_analyzer_create(bool lowercase)
|
|
428
|
+
{
|
|
429
|
+
TokenStream *ts;
|
|
430
|
+
if (lowercase) {
|
|
431
|
+
ts = lowercase_filter_create(letter_tokenizer_create());
|
|
432
|
+
} else {
|
|
433
|
+
ts = letter_tokenizer_create();
|
|
434
|
+
}
|
|
435
|
+
return analyzer_create(NULL, ts, NULL, NULL);
|
|
436
|
+
}
|
|
437
|
+
|
|
438
|
+
Analyzer *mb_letter_analyzer_create(bool lowercase)
|
|
439
|
+
{
|
|
440
|
+
return analyzer_create(NULL,
|
|
441
|
+
mb_letter_tokenizer_create(lowercase), NULL, NULL);
|
|
442
|
+
}
|
|
443
|
+
|
|
444
|
+
/****************************************************************************
|
|
445
|
+
*
|
|
446
|
+
* Standard
|
|
447
|
+
*
|
|
448
|
+
****************************************************************************/
|
|
449
|
+
|
|
450
|
+
/*
|
|
451
|
+
* StandardTokenizer
|
|
452
|
+
*/
|
|
453
|
+
int std_get_alpha(TokenStream *ts, char *token)
|
|
160
454
|
{
|
|
161
455
|
int i = 0;
|
|
162
|
-
|
|
163
|
-
|
|
456
|
+
char *t = ts->t;
|
|
457
|
+
while (t[i] != '\0' && isalpha(t[i])) {
|
|
458
|
+
if (i < MAX_WORD_SIZE) token[i] = t[i];
|
|
164
459
|
i++;
|
|
165
460
|
}
|
|
166
461
|
return i;
|
|
167
462
|
}
|
|
168
463
|
|
|
169
|
-
int
|
|
464
|
+
int mb_std_get_alpha(TokenStream *ts, char *token)
|
|
465
|
+
{
|
|
466
|
+
char *t = ts->t;
|
|
467
|
+
wchar_t w;
|
|
468
|
+
int i;
|
|
469
|
+
if ((i = mbtowc(&w, t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
|
|
470
|
+
while (w != 0 && iswalpha(w)) {
|
|
471
|
+
t += i;
|
|
472
|
+
if ((i = mbtowc(&w, t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
|
|
473
|
+
}
|
|
474
|
+
|
|
475
|
+
i = t - ts->t;
|
|
476
|
+
if (i > MAX_WORD_SIZE) i = MAX_WORD_SIZE - 1;
|
|
477
|
+
memcpy(token, ts->t, i);
|
|
478
|
+
return i;
|
|
479
|
+
}
|
|
480
|
+
|
|
481
|
+
int std_get_alnum(TokenStream *ts, char *token)
|
|
170
482
|
{
|
|
171
483
|
int i = 0;
|
|
172
|
-
|
|
173
|
-
|
|
484
|
+
char *t = ts->t;
|
|
485
|
+
while (t[i] != '\0' && isalnum(t[i])) {
|
|
486
|
+
if (i < MAX_WORD_SIZE) token[i] = t[i];
|
|
174
487
|
i++;
|
|
175
488
|
}
|
|
176
489
|
return i;
|
|
177
490
|
}
|
|
178
491
|
|
|
492
|
+
int mb_std_get_alnum(char *text, char *token, TokenStream *ts)
|
|
493
|
+
{
|
|
494
|
+
char *t = ts->t;
|
|
495
|
+
wchar_t w;
|
|
496
|
+
int i;
|
|
497
|
+
if ((i = mbtowc(&w, t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
|
|
498
|
+
while (w != 0 && iswalnum(w)) {
|
|
499
|
+
t += i;
|
|
500
|
+
if ((i = mbtowc(&w, t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
|
|
501
|
+
}
|
|
502
|
+
|
|
503
|
+
i = t - ts->t;
|
|
504
|
+
if (i > MAX_WORD_SIZE) i = MAX_WORD_SIZE - 1;
|
|
505
|
+
memcpy(token, ts->t, i);
|
|
506
|
+
return i;
|
|
507
|
+
|
|
508
|
+
}
|
|
509
|
+
|
|
179
510
|
int isnumpunc(char c)
|
|
180
511
|
{
|
|
181
512
|
return (c == '.' || c == ',' || c == '\\' || c == '/' || c == '_' || c == '-');
|
|
182
513
|
}
|
|
183
514
|
|
|
515
|
+
int w_isnumpunc(wchar_t c)
|
|
516
|
+
{
|
|
517
|
+
return (c == L'.' || c == L',' || c == L'\\' || c == L'/' || c == L'_' || c == L'-');
|
|
518
|
+
}
|
|
519
|
+
|
|
184
520
|
int isurlpunc(char c)
|
|
185
521
|
{
|
|
186
522
|
return (c == '.' || c == '/' || c == '-' || c == '_');
|
|
@@ -201,11 +537,23 @@ int isurlxatc(char c)
|
|
|
201
537
|
return (c == '.' || c == '/' || c == '-' || c == '_' || c == '@' || isalnum(c));
|
|
202
538
|
}
|
|
203
539
|
|
|
204
|
-
|
|
540
|
+
bool std_is_tok_char(char *c)
|
|
205
541
|
{
|
|
206
|
-
if (isspace(c)) return false; // most common so check first.
|
|
207
|
-
if (isalnum(c) || isnumpunc(c) || c == '&' ||
|
|
208
|
-
c == '@' || c == '\'' || c == ':')
|
|
542
|
+
if (isspace(*c)) return false; // most common so check first.
|
|
543
|
+
if (isalnum(*c) || isnumpunc(*c) || *c == '&' ||
|
|
544
|
+
*c == '@' || *c == '\'' || *c == ':')
|
|
545
|
+
return true;
|
|
546
|
+
return false;
|
|
547
|
+
}
|
|
548
|
+
|
|
549
|
+
bool w_std_is_tok_char(char *t)
|
|
550
|
+
{
|
|
551
|
+
wchar_t c;
|
|
552
|
+
if ((mbtowc(&c, t, MB_CUR_MAX)) < 0)
|
|
553
|
+
RAISE(IO_ERROR, ENC_ERR_MSG);
|
|
554
|
+
if (iswspace(c)) return false; // most common so check first.
|
|
555
|
+
if (iswalnum(c) || w_isnumpunc(c) || c == L'&' ||
|
|
556
|
+
c == L'@' || c == L'\'' || c == L':')
|
|
209
557
|
return true;
|
|
210
558
|
return false;
|
|
211
559
|
}
|
|
@@ -246,22 +594,34 @@ int std_get_number(char *input)
|
|
|
246
594
|
|
|
247
595
|
int std_get_apostrophe(char *input)
|
|
248
596
|
{
|
|
249
|
-
|
|
597
|
+
char *t = input;
|
|
250
598
|
|
|
251
|
-
while (isalpha(
|
|
252
|
-
|
|
599
|
+
while (isalpha(*t) || *t == '\'')
|
|
600
|
+
t++;
|
|
253
601
|
|
|
254
|
-
return
|
|
602
|
+
return t - input;
|
|
255
603
|
}
|
|
256
604
|
|
|
257
|
-
int
|
|
605
|
+
int mb_std_get_apostrophe(char *input)
|
|
258
606
|
{
|
|
259
|
-
|
|
607
|
+
char *t = input;
|
|
608
|
+
wchar_t w;
|
|
609
|
+
int i;
|
|
260
610
|
|
|
611
|
+
if ((i = mbtowc(&w, t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
|
|
612
|
+
while (iswalpha(w) || w == L'\'') {
|
|
613
|
+
t += i;
|
|
614
|
+
if ((i = mbtowc(&w, t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
|
|
615
|
+
}
|
|
616
|
+
return t - input;
|
|
617
|
+
}
|
|
618
|
+
|
|
619
|
+
int std_get_url(char *input, char *token, int i)
|
|
620
|
+
{
|
|
261
621
|
while (isurlc(input[i])) {
|
|
262
622
|
if (isurlpunc(input[i]) && isurlpunc(input[i-1]))
|
|
263
623
|
break; // can't have to puncs in a row
|
|
264
|
-
token[i] = input[i];
|
|
624
|
+
if (i < MAX_WORD_SIZE) token[i] = input[i];
|
|
265
625
|
i++;
|
|
266
626
|
}
|
|
267
627
|
|
|
@@ -282,148 +642,229 @@ int std_get_company_name(char *input)
|
|
|
282
642
|
return i;
|
|
283
643
|
}
|
|
284
644
|
|
|
645
|
+
int mb_std_get_company_name(char *input, TokenStream *ts)
|
|
646
|
+
{
|
|
647
|
+
char *t = input;
|
|
648
|
+
wchar_t wchr;
|
|
649
|
+
int i;
|
|
650
|
+
|
|
651
|
+
MB_NEXT_CHAR;
|
|
652
|
+
while (iswalpha(wchr) || wchr == L'@' || wchr == L'&') {
|
|
653
|
+
t += i;
|
|
654
|
+
MB_NEXT_CHAR;
|
|
655
|
+
}
|
|
656
|
+
|
|
657
|
+
return t - input;
|
|
658
|
+
}
|
|
659
|
+
|
|
660
|
+
/*
 * Advance ts->t past everything that cannot start a token, i.e. up to the
 * next alphanumeric byte or the terminating NUL.
 *
 * Returns true when ts->t now points at an alphanumeric byte, false when the
 * end of the text was reached.
 */
bool std_advance_to_start(TokenStream *ts)
{
    char *t = ts->t;

    // isalnum() is undefined for negative arguments other than EOF, so the
    // byte is widened through unsigned char first.
    while (*t != '\0' && !isalnum((unsigned char)*t)) {
        t++;
    }

    ts->t = t;

    return (*t != '\0');
}
|
|
669
|
+
|
|
670
|
+
/*
 * Multibyte counterpart of std_advance_to_start: move ts->t forward until it
 * points at a wide-alphanumeric character or at the terminating NUL.
 *
 * A negative mbtowc() result raises IO_ERROR with ENC_ERR_MSG.
 *
 * Returns true when a token start was found, false at end of text.
 */
bool mb_std_advance_to_start(TokenStream *ts)
{
    wchar_t wc;
    int step;

    for (;;) {
        step = mbtowc(&wc, ts->t, MB_CUR_MAX);
        if (step < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
        if (wc == 0 || iswalnum(wc)) {
            break;
        }
        ts->t += step;  // skip the non-token character
    }

    return (wc != 0);
}
|
|
683
|
+
|
|
684
|
+
typedef struct StandardTokenizer {
|
|
685
|
+
bool (*advance_to_start)(TokenStream *ts);
|
|
686
|
+
bool (*is_tok_char)(char *c);
|
|
687
|
+
int (*get_alpha)(TokenStream *ts, char *token);
|
|
688
|
+
int (*get_apostrophe)(char *input);
|
|
689
|
+
} StandardTokenizer;
|
|
690
|
+
|
|
285
691
|
/*
 * Produce the next token of the standard tokenizer.
 *
 * The scan starts at ts->t and uses the StandardTokenizer callbacks in
 * ts->data. Token kinds are tried in this order: number, plain word,
 * apostrophe word (a trailing "'s" is stripped), company name ('&'),
 * number with embedded letters, URL with a "://" scheme, and finally the
 * longest URL/e-mail/acronym run (the dots of an acronym are stripped).
 *
 * On return ts->t points just past the consumed text and ts->token holds the
 * token.
 *
 * Returns ts->token, or NULL when no further token start exists.
 */
Token *std_next(TokenStream *ts)
{
    StandardTokenizer *std_tz = (StandardTokenizer *)ts->data;
    char *s;
    char *t;
    char *start = NULL;
    char *num_end = NULL;
    char token[MAX_WORD_SIZE];
    int token_i = 0;
    int len;
    bool is_acronym;
    bool seen_at_symbol;

    if (!std_tz->advance_to_start(ts)) return NULL;

    start = t = ts->t;
    // ctype classifiers are undefined for negative char values, hence the
    // unsigned char casts throughout this function.
    if (isdigit((unsigned char)*t)) {
        t += std_get_number(t);
        ts->t = t;
        tk_set_ts(ts->token, start, t, ts->text, 1);
    } else {
        token_i = std_tz->get_alpha(ts, token);
        t += token_i;

        if (!std_tz->is_tok_char(t)) {
            // very common case, ie a plain word, so check and return
            tk_set_ts(ts->token, start, t, ts->text, 1);
            ts->t = t;
            return ts->token;
        }

        if (*t == '\'') { // apostrophe case.
            t += std_tz->get_apostrophe(t);
            ts->t = t;
            // strip possessive
            if ((t[-1] == 's' || t[-1] == 'S') && t[-2] == '\'') t -= 2;

            tk_set_ts(ts->token, start, t, ts->text, 1);
            return ts->token;
        }

        if (*t == '&') { // company name case.
            t += std_get_company_name(t);
            ts->t = t;
            tk_set_ts(ts->token, start, t, ts->text, 1);
            return ts->token;
        }

        if (isdigit((unsigned char)*t) || isnumpunc(*t)) { // possibly a number
            num_end = start + std_get_number(start);
            if (!std_tz->is_tok_char(num_end)) { // we won't find a longer token
                ts->t = num_end;
                tk_set_ts(ts->token, start, num_end, ts->text, 1);
                return ts->token;
            }
            // else there may be a longer token so check
        }

        if (t[0] == ':' && t[1] == '/' && t[2] == '/') {
            // check for a known url start
            token[token_i] = '\0';
            t += 3;
            while (*t == '/') t++;
            // The token_i guards keep memcmp from reading bytes of `token`
            // that were never written; a shorter token could not match
            // anyway. The "http" prefix test also covers "https".
            if (isalpha((unsigned char)*t) &&
                ((token_i >= 3 && memcmp(token, "ftp", 3) == 0) ||
                 (token_i >= 4 && memcmp(token, "http", 4) == 0) ||
                 (token_i >= 4 && memcmp(token, "file", 4) == 0))) {
                len = std_get_url(t, token, 0); // dispose of first part of the URL
            } else { //still treat as url but keep the first part
                token_i = t - start;
                // an arbitrary number of '/' may have been skipped, so the
                // prefix can be longer than the buffer
                memcpy(token, start,
                       (token_i < MAX_WORD_SIZE ? token_i : MAX_WORD_SIZE)
                       * sizeof(char));
                // NOTE(review): std_get_url indexes its input with the start
                // offset, so this call begins at t[token_i] rather than t[0];
                // passing `start` looks like the intent. Left unchanged
                // because std_get_url's return contract is not visible here.
                len = token_i + std_get_url(t, token, token_i); // keep start
            }
            ts->t = t + len;
            // std_get_url only bounds its writes, not its count, so len can
            // exceed the buffer; clamp before terminating and publishing.
            if (len >= MAX_WORD_SIZE) len = MAX_WORD_SIZE - 1;
            token[len] = 0;
            tk_set(ts->token, token, len, start - ts->text, ts->t - ts->text, 1);
            return ts->token;
        }

        // now see how long a url we can find.
        is_acronym = true;
        seen_at_symbol = false;
        while (isurlxatc(*t)) {
            if (is_acronym && !isalpha((unsigned char)*t) && (*t != '.')) {
                is_acronym = false;
            }
            if (isurlxatpunc(*t) && isurlxatpunc(t[-1])) {
                break; // can't have two punctuation characters in a row
            }
            if (*t == '@') {
                if (seen_at_symbol) {
                    break; // we can only have one @ symbol
                } else {
                    seen_at_symbol = true;
                }
            }
            t++;
        }
        while (isurlxatpunc(t[-1])) t--; // strip trailing punctuation

        // num_end is NULL when no number candidate was seen; test that
        // explicitly instead of ordering a pointer against NULL.
        if (num_end == NULL || t > num_end) {
            ts->t = t;

            if (is_acronym) { // check that it is one letter followed by one '.'
                for (s = start; s < t-1; s++) {
                    if (isalpha((unsigned char)*s) && (s[1] != '.')) is_acronym = false;
                }
            }
            if (is_acronym) {// strip '.'s
                for (s = start + token_i; s < t; s++) {
                    // stop copying once the buffer is full; the scanned run
                    // is not length-limited
                    if (*s != '.' && token_i < MAX_WORD_SIZE - 1) {
                        token[token_i] = *s;
                        token_i++;
                    }
                }
                tk_set(ts->token, token, token_i, start - ts->text, t - ts->text, 1);
            } else { // just return the url as is
                tk_set_ts(ts->token, start, t, ts->text, 1);
            }
        } else { // return the number
            ts->t = num_end;
            tk_set_ts(ts->token, start, num_end, ts->text, 1);
        }
    }

    return ts->token;
}
|
|
412
821
|
|
|
822
|
+
void std_ts_destroy(void *p)
|
|
823
|
+
{
|
|
824
|
+
TokenStream *ts = (TokenStream *)p;
|
|
825
|
+
free(ts->data);
|
|
826
|
+
ts_standard_destroy(ts);
|
|
827
|
+
}
|
|
828
|
+
|
|
829
|
+
/*
 * Clone hook for the standard tokenizer: gives `new_ts` its own copy of the
 * StandardTokenizer dispatch table owned by `orig_ts`.
 */
void std_ts_clone_i(TokenStream *orig_ts, TokenStream *new_ts)
{
    StandardTokenizer *copy = ALLOC(StandardTokenizer);

    new_ts->data = copy;
    *copy = *(StandardTokenizer *)orig_ts->data;
}
|
|
834
|
+
|
|
413
835
|
TokenStream *standard_tokenizer_create()
|
|
414
836
|
{
|
|
415
837
|
TokenStream *ts = ts_create();
|
|
838
|
+
|
|
839
|
+
StandardTokenizer *std_tz = ALLOC(StandardTokenizer);
|
|
840
|
+
std_tz->advance_to_start = &std_advance_to_start;
|
|
841
|
+
std_tz->get_alpha = &std_get_alpha;
|
|
842
|
+
std_tz->is_tok_char = &std_is_tok_char;
|
|
843
|
+
std_tz->get_apostrophe = &std_get_apostrophe;
|
|
844
|
+
|
|
845
|
+
ts->data = std_tz;
|
|
846
|
+
ts->destroy = &std_ts_destroy;
|
|
847
|
+
ts->clone_i = &std_ts_clone_i;
|
|
416
848
|
ts->next = &std_next;
|
|
417
849
|
return ts;
|
|
418
850
|
}
|
|
419
851
|
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
852
|
+
TokenStream *mb_standard_tokenizer_create()
|
|
853
|
+
{
|
|
854
|
+
TokenStream *ts = ts_create();
|
|
855
|
+
|
|
856
|
+
StandardTokenizer *std_tz = ALLOC(StandardTokenizer);
|
|
857
|
+
std_tz->advance_to_start = &mb_std_advance_to_start;
|
|
858
|
+
std_tz->get_alpha = &mb_std_get_alpha;
|
|
859
|
+
std_tz->is_tok_char = &w_std_is_tok_char;
|
|
860
|
+
std_tz->get_apostrophe = &mb_std_get_apostrophe;
|
|
861
|
+
|
|
862
|
+
ts->data = std_tz;
|
|
863
|
+
ts->destroy = &std_ts_destroy;
|
|
864
|
+
ts->clone_i = &std_ts_clone_i;
|
|
865
|
+
ts->next = &std_next;
|
|
866
|
+
return ts;
|
|
867
|
+
}
|
|
427
868
|
|
|
428
869
|
void filter_reset(TokenStream *ts, char *text)
|
|
429
870
|
{
|
|
@@ -432,10 +873,10 @@ void filter_reset(TokenStream *ts, char *text)
|
|
|
432
873
|
|
|
433
874
|
void filter_destroy(void *p)
|
|
434
875
|
{
|
|
435
|
-
TokenStream *
|
|
436
|
-
|
|
437
|
-
if (
|
|
438
|
-
free(
|
|
876
|
+
TokenStream *tf = (TokenStream *)p;
|
|
877
|
+
if (tf->destroy_sub) tf->sub_ts->destroy(tf->sub_ts);
|
|
878
|
+
if (tf->token != NULL) tk_destroy(tf->token);
|
|
879
|
+
free(tf);
|
|
439
880
|
}
|
|
440
881
|
|
|
441
882
|
void sf_destroy(void *p)
|
|
@@ -445,40 +886,109 @@ void sf_destroy(void *p)
|
|
|
445
886
|
filter_destroy(p);
|
|
446
887
|
}
|
|
447
888
|
|
|
448
|
-
|
|
889
|
+
void sf_clone_i_i(void *key, void *value, void *arg)
|
|
890
|
+
{
|
|
891
|
+
HshTable *wordtable = (HshTable *)arg;
|
|
892
|
+
char *w = estrdup(key);
|
|
893
|
+
h_set(wordtable, w, w);
|
|
894
|
+
}
|
|
895
|
+
|
|
896
|
+
/*
 * Clone hook for the stop filter: builds a fresh string-keyed table for
 * `new_ts` and copies every stop word of `orig_ts` into it, so the two
 * streams do not share a table.
 */
void sf_clone_i(TokenStream *orig_ts, TokenStream *new_ts)
{
    // keys are freed with free(); no value destructor (key and value alias)
    HshTable *copy = h_new_str(free, NULL);

    new_ts->data = copy;
    h_each(orig_ts->data, sf_clone_i_i, copy);
}
|
|
901
|
+
|
|
902
|
+
/*
 * Return the next token of the wrapped stream that is not a stop word.
 *
 * The returned token's pos_inc is set to one plus the number of stop words
 * skipped immediately before it.
 *
 * Returns NULL when the wrapped stream is exhausted.
 */
Token *sf_next(TokenStream *tf)
{
    HshTable *stop_words = (HshTable *)tf->data;
    Token *tk;
    int skipped;

    for (skipped = 0; ; skipped++) {
        tk = tf->sub_ts->next(tf->sub_ts);
        if (tk == NULL) {
            return NULL;
        }
        if (h_get(stop_words, tk->text) == NULL) {
            break;      // not a stop word: emit it
        }
    }

    tk->pos_inc = skipped + 1;
    return tk;
}
|
|
460
914
|
|
|
461
|
-
TokenStream *
|
|
915
|
+
/*
 * Wrap `ts` in a stop filter built from the first `len` entries of `words`.
 *
 * Every word is duplicated, so the caller keeps ownership of `words`. The
 * filter is marked to destroy `ts` when it is itself destroyed.
 *
 * Returns the new filter stream.
 */
TokenStream *stop_filter_create_with_words_len(TokenStream *ts,
                                               const char **words, int len)
{
    TokenStream *tf = ALLOC(TokenStream);
    // keys are freed with free(); key and value are the same pointer
    HshTable *wordtable = h_new_str(free, NULL);
    const char **word;
    char *copy;

    for (word = words; word < words + len; word++) {
        copy = estrdup(*word);
        h_set(wordtable, copy, copy);
    }

    tf->sub_ts      = ts;
    tf->destroy_sub = true;
    tf->data        = wordtable;
    tf->token       = NULL;
    tf->next        = sf_next;
    tf->reset       = filter_reset;
    tf->destroy     = sf_destroy;
    tf->clone_i     = sf_clone_i;
    return tf;
}
|
|
936
|
+
|
|
937
|
+
/*
 * Wrap `ts` in a stop filter built from a NULL-terminated array of words.
 *
 * Every word is duplicated, so the caller keeps ownership of `words`. The
 * filter is marked to destroy `ts` when it is itself destroyed.
 *
 * Returns the new filter stream.
 */
TokenStream *stop_filter_create_with_words(TokenStream *ts, const char **words)
{
    TokenStream *tf = ALLOC(TokenStream);
    // keys are freed with free(); key and value are the same pointer
    HshTable *wordtable = h_new_str(free, NULL);
    char *copy;

    for (; *words != NULL; words++) {
        copy = estrdup(*words);
        h_set(wordtable, copy, copy);
    }

    tf->sub_ts      = ts;
    tf->destroy_sub = true;
    tf->data        = wordtable;
    tf->token       = NULL;
    tf->next        = sf_next;
    tf->reset       = filter_reset;
    tf->destroy     = sf_destroy;
    tf->clone_i     = sf_clone_i;
    return tf;
}
|
|
477
957
|
|
|
478
958
|
/*
 * Wrap `ts` in a stop filter that uses the FULL_ENGLISH_STOP_WORDS list.
 */
TokenStream *stop_filter_create(TokenStream *ts)
{
    TokenStream *filter =
        stop_filter_create_with_words(ts, FULL_ENGLISH_STOP_WORDS);

    return filter;
}
|
|
962
|
+
|
|
963
|
+
/*
 * Fetch the next token from the wrapped stream and lowercase its text using
 * the current locale's multibyte encoding.
 *
 * The text is converted to wide characters, lowered with towlower(), and
 * converted back. If either conversion fails (invalid or unrepresentable
 * sequence) the token text is left untouched rather than being rebuilt from
 * a partially filled buffer.
 *
 * Returns the token, or NULL when the wrapped stream is exhausted.
 */
Token *mb_lcf_next(TokenStream *ts)
{
    wchar_t wbuf[MAX_WORD_SIZE];
    char lowered[MAX_WORD_SIZE];
    size_t n;
    size_t k;
    Token *tk = ts->sub_ts->next(ts->sub_ts);
    if (tk == NULL) return tk;

    // Convert at most MAX_WORD_SIZE - 1 characters so there is always room
    // for the terminator; mbstowcs does not write one when the limit is hit.
    n = mbstowcs(wbuf, tk->text, MAX_WORD_SIZE - 1);
    if (n == (size_t)-1) return tk;     // invalid sequence: keep text as is
    wbuf[n] = 0;

    for (k = 0; k < n; k++) {
        wbuf[k] = towlower(wbuf[k]);
    }

    // Convert into a scratch buffer first so a failure cannot leave
    // tk->text half overwritten; wcstombs likewise omits the terminator when
    // the byte limit is reached.
    n = wcstombs(lowered, wbuf, MAX_WORD_SIZE - 1);
    if (n == (size_t)-1) return tk;
    lowered[n] = '\0';
    memcpy(tk->text, lowered, n + 1);
    return tk;
}
|
|
980
|
+
|
|
981
|
+
/*
 * Wrap `ts` in a filter that lowercases token text through mb_lcf_next.
 *
 * The filter is marked to destroy `ts` when it is itself destroyed and has
 * no clone hook.
 *
 * Returns the new filter stream.
 */
TokenStream *mb_lowercase_filter_create(TokenStream *ts)
{
    TokenStream *filter = ALLOC(TokenStream);

    filter->sub_ts      = ts;
    filter->destroy_sub = true;
    filter->token       = NULL;
    filter->clone_i     = NULL;
    filter->next        = mb_lcf_next;
    filter->reset       = filter_reset;
    filter->destroy     = filter_destroy;
    return filter;
}
|
|
483
993
|
|
|
484
994
|
Token *lcf_next(TokenStream *ts)
|
|
@@ -501,48 +1011,199 @@ TokenStream *lowercase_filter_create(TokenStream *ts)
|
|
|
501
1011
|
tf->reset = &filter_reset;
|
|
502
1012
|
tf->destroy = &filter_destroy;
|
|
503
1013
|
tf->sub_ts = ts;
|
|
1014
|
+
tf->destroy_sub = true;
|
|
1015
|
+
tf->clone_i = NULL;
|
|
504
1016
|
return tf;
|
|
505
1017
|
}
|
|
506
1018
|
|
|
507
|
-
|
|
1019
|
+
/*
 * Per-stream state of the stemming filter, stored in TokenStream.data.
 * The two strings are private copies of the arguments the stemmer was
 * created with, kept so a clone can create an equivalent stemmer.
 */
typedef struct StemFilter {
    struct sb_stemmer *stemmer;     // handle returned by sb_stemmer_new()
    char *algorithm;                // copy of the algorithm name, or NULL
    char *charenc;                  // copy of the encoding name, or NULL
} StemFilter;
|
|
1024
|
+
|
|
1025
|
+
void stemf_destroy(void *p)
|
|
508
1026
|
{
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
1027
|
+
TokenStream *ts = (TokenStream *)p;
|
|
1028
|
+
StemFilter *stemf = (StemFilter *)ts->data;
|
|
1029
|
+
sb_stemmer_delete(stemf->stemmer);
|
|
1030
|
+
free(stemf->algorithm);
|
|
1031
|
+
free(stemf->charenc);
|
|
1032
|
+
free(stemf);
|
|
1033
|
+
filter_destroy(ts);
|
|
515
1034
|
}
|
|
516
1035
|
|
|
1036
|
+
/*
 * Fetch the next token from the wrapped stream and replace its text with the
 * stemmed form, truncated to MAX_WORD_SIZE - 1 bytes.
 *
 * When no stemmer is available, or the stemmer returns no result, the token
 * is passed through unchanged instead of dereferencing a NULL pointer.
 *
 * Returns the token, or NULL when the wrapped stream is exhausted.
 */
Token *stemf_next(TokenStream *ts)
{
    int len;
    const sb_symbol *stemmed;
    struct sb_stemmer *stemmer = ((StemFilter *)ts->data)->stemmer;
    Token *tk = ts->sub_ts->next(ts->sub_ts);
    if (tk == NULL) return tk;

    // sb_stemmer_new() yields NULL for an unknown algorithm/encoding and its
    // result is stored unchecked by the constructor.
    if (stemmer == NULL) return tk;

    stemmed = sb_stemmer_stem(stemmer, (sb_symbol *)tk->text, strlen(tk->text));
    // sb_stemmer_stem() reports out-of-memory by returning NULL
    if (stemmed == NULL) return tk;

    len = sb_stemmer_length(stemmer);
    if (len >= MAX_WORD_SIZE) len = MAX_WORD_SIZE - 1;
    memcpy(tk->text, stemmed, len);
    tk->text[len] = '\0';
    return tk;
}
|
|
517
1050
|
|
|
518
|
-
|
|
1051
|
+
/*
 * Clone hook for the stem filter: gives `new_ts` its own stemmer, created
 * with the same algorithm and encoding names as `orig_ts`, plus private
 * copies of those names.
 */
void stemf_clone_i(TokenStream *orig_ts, TokenStream *new_ts)
{
    StemFilter *source = (StemFilter *)orig_ts->data;
    StemFilter *copy = ALLOC(StemFilter);

    copy->stemmer = sb_stemmer_new(source->algorithm, source->charenc);

    if (source->algorithm) {
        copy->algorithm = estrdup(source->algorithm);
    } else {
        copy->algorithm = NULL;
    }

    if (source->charenc) {
        copy->charenc = estrdup(source->charenc);
    } else {
        copy->charenc = NULL;
    }

    new_ts->data = copy;
}
|
|
1060
|
+
|
|
1061
|
+
/*
 * Wrap `ts` in a stemming filter.
 *
 * algorithm - stemmer algorithm name passed to sb_stemmer_new(); may be NULL
 * charenc   - character encoding name passed to sb_stemmer_new(); may be NULL
 *
 * Both strings are copied. The result of sb_stemmer_new() is stored without
 * being checked. The filter is marked to destroy `ts` when it is itself
 * destroyed.
 *
 * Returns the new filter stream.
 */
TokenStream *stem_filter_create(TokenStream *ts, const char * algorithm,
                                const char * charenc)
{
    TokenStream *filter = ALLOC(TokenStream);
    StemFilter *state = ALLOC(StemFilter);

    state->stemmer = sb_stemmer_new(algorithm, charenc);

    if (algorithm) {
        state->algorithm = estrdup(algorithm);
    } else {
        state->algorithm = NULL;
    }

    if (charenc) {
        state->charenc = estrdup(charenc);
    } else {
        state->charenc = NULL;
    }

    filter->data        = state;
    filter->sub_ts      = ts;
    filter->destroy_sub = true;
    filter->token       = NULL;
    filter->next        = stemf_next;
    filter->reset       = filter_reset;
    filter->destroy     = stemf_destroy;
    filter->clone_i     = stemf_clone_i;
    return filter;
}
|
|
1080
|
+
|
|
1081
|
+
Analyzer *standard_analyzer_create_with_words_len(
|
|
1082
|
+
const char **words, int len, bool lowercase)
|
|
1083
|
+
{
|
|
1084
|
+
TokenStream *ts;
|
|
1085
|
+
if (lowercase) {
|
|
1086
|
+
ts = stop_filter_create_with_words_len(
|
|
524
1087
|
lowercase_filter_create(standard_tokenizer_create()), words, len);
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
1088
|
+
} else {
|
|
1089
|
+
ts = stop_filter_create_with_words_len(
|
|
1090
|
+
standard_tokenizer_create(), words, len);
|
|
1091
|
+
}
|
|
1092
|
+
return analyzer_create(NULL, ts, NULL, NULL);
|
|
1093
|
+
}
|
|
1094
|
+
|
|
1095
|
+
Analyzer *standard_analyzer_create_with_words(const char **words, bool lowercase)
|
|
1096
|
+
{
|
|
1097
|
+
TokenStream *ts;
|
|
1098
|
+
if (lowercase) {
|
|
1099
|
+
ts = stop_filter_create_with_words(
|
|
1100
|
+
lowercase_filter_create(standard_tokenizer_create()), words);
|
|
1101
|
+
} else {
|
|
1102
|
+
ts = stop_filter_create_with_words(
|
|
1103
|
+
standard_tokenizer_create(), words);
|
|
1104
|
+
}
|
|
1105
|
+
return analyzer_create(NULL, ts, NULL, NULL);
|
|
1106
|
+
}
|
|
1107
|
+
|
|
1108
|
+
Analyzer *mb_standard_analyzer_create_with_words_len(
|
|
1109
|
+
const char **words, int len, bool lowercase)
|
|
1110
|
+
{
|
|
1111
|
+
TokenStream *ts;
|
|
1112
|
+
if (lowercase) {
|
|
1113
|
+
ts = stop_filter_create_with_words_len(
|
|
1114
|
+
mb_lowercase_filter_create(mb_standard_tokenizer_create()), words, len);
|
|
1115
|
+
} else {
|
|
1116
|
+
ts = stop_filter_create_with_words_len(
|
|
1117
|
+
mb_standard_tokenizer_create(), words, len);
|
|
1118
|
+
}
|
|
1119
|
+
return analyzer_create(NULL, ts, NULL, NULL);
|
|
1120
|
+
}
|
|
1121
|
+
|
|
1122
|
+
Analyzer *mb_standard_analyzer_create_with_words(
|
|
1123
|
+
const char **words, bool lowercase)
|
|
1124
|
+
{
|
|
1125
|
+
TokenStream *ts;
|
|
1126
|
+
if (lowercase) {
|
|
1127
|
+
ts = stop_filter_create_with_words(
|
|
1128
|
+
mb_lowercase_filter_create(mb_standard_tokenizer_create()), words);
|
|
1129
|
+
} else {
|
|
1130
|
+
ts = stop_filter_create_with_words(mb_standard_tokenizer_create(), words);
|
|
1131
|
+
}
|
|
1132
|
+
return analyzer_create(NULL, ts, NULL, NULL);
|
|
1133
|
+
}
|
|
1134
|
+
|
|
1135
|
+
Analyzer *standard_analyzer_create(bool lowercase)
|
|
1136
|
+
{
|
|
1137
|
+
return standard_analyzer_create_with_words(FULL_ENGLISH_STOP_WORDS, lowercase);
|
|
1138
|
+
}
|
|
1139
|
+
|
|
1140
|
+
Analyzer *mb_standard_analyzer_create(bool lowercase)
|
|
1141
|
+
{
|
|
1142
|
+
return mb_standard_analyzer_create_with_words(FULL_ENGLISH_STOP_WORDS, lowercase);
|
|
1143
|
+
}
|
|
1144
|
+
|
|
1145
|
+
/****************************************************************************
|
|
1146
|
+
*
|
|
1147
|
+
* PerFieldAnalyzer
|
|
1148
|
+
*
|
|
1149
|
+
****************************************************************************/
|
|
1150
|
+
|
|
1151
|
+
typedef struct PerFieldAnalyzer {
|
|
1152
|
+
HshTable *dict;
|
|
1153
|
+
Analyzer *def;
|
|
1154
|
+
bool destroy_subs : 1;
|
|
1155
|
+
} PerFieldAnalyzer;
|
|
1156
|
+
|
|
1157
|
+
void pfa_destroy(void *p)
|
|
1158
|
+
{
|
|
1159
|
+
Analyzer *self = (Analyzer *)p;
|
|
1160
|
+
PerFieldAnalyzer *pfa = (PerFieldAnalyzer *)self->data;
|
|
1161
|
+
h_destroy(pfa->dict);
|
|
1162
|
+
|
|
1163
|
+
if (pfa->destroy_subs) a_destroy(pfa->def);
|
|
1164
|
+
free(pfa);
|
|
1165
|
+
free(self);
|
|
1166
|
+
}
|
|
1167
|
+
|
|
1168
|
+
/*
 * Get a token stream for `text` using the analyzer registered for `field`,
 * or the default analyzer when the field has none.
 */
TokenStream *pfa_get_ts(Analyzer *self, char *field, char *text)
{
    PerFieldAnalyzer *state = (PerFieldAnalyzer *)self->data;
    Analyzer *registered = h_get(state->dict, field);
    Analyzer *chosen = (registered != NULL) ? registered : state->def;

    return a_get_ts(chosen, field, text);
}
|
|
1175
|
+
|
|
1176
|
+
void pfa_sub_a_destroy(void *p)
|
|
1177
|
+
{
|
|
1178
|
+
Analyzer *a = (Analyzer *)p;
|
|
1179
|
+
a->destroy(a);
|
|
1180
|
+
}
|
|
1181
|
+
|
|
1182
|
+
/*
 * Register `analyzer` as the analyzer for `field`.
 *
 * The field name is duplicated for use as the table key; the analyzer
 * pointer is stored as given.
 */
void pfa_add_field(Analyzer *self, char *field, Analyzer *analyzer)
{
    PerFieldAnalyzer *state = (PerFieldAnalyzer *)self->data;
    char *key = estrdup(field);

    h_set(state->dict, key, analyzer);
}
|
|
529
1187
|
|
|
530
|
-
Analyzer *
|
|
1188
|
+
/*
 * Create a per-field analyzer.
 *
 * def          - analyzer used for fields without a registered analyzer
 * destroy_subs - when true, the field table is created with
 *                pfa_sub_a_destroy as its value destructor and pfa_destroy
 *                also destroys `def`; when false the table has no value
 *                destructor
 *
 * Returns the new Analyzer, using pfa_destroy and pfa_get_ts as callbacks.
 */
Analyzer *per_field_analyzer_create(Analyzer *def, bool destroy_subs)
{
    PerFieldAnalyzer *state = ALLOC(PerFieldAnalyzer);

    state->def = def;
    state->destroy_subs = destroy_subs;

    // field-name keys are always freed with free()
    if (destroy_subs) {
        state->dict = h_new_str(free, pfa_sub_a_destroy);
    } else {
        state->dict = h_new_str(free, NULL);
    }

    return analyzer_create(state, NULL, pfa_destroy, pfa_get_ts);
}
|
|
535
1197
|
|
|
536
1198
|
#ifdef ALONE
|
|
537
1199
|
int main(int argc, char **argv)
|
|
538
1200
|
{
|
|
539
1201
|
char buf[10000];
|
|
540
|
-
Analyzer *a = standard_analyzer_create();
|
|
1202
|
+
Analyzer *a = standard_analyzer_create(true);
|
|
541
1203
|
TokenStream *ts;
|
|
542
1204
|
Token *tk;
|
|
543
1205
|
while (fgets(buf, 9999, stdin) != NULL) {
|
|
544
1206
|
ts = a->get_ts(a, "hello", buf);
|
|
545
|
-
ts->pos = 0;
|
|
546
1207
|
while ((tk = ts->next(ts)) != NULL) {
|
|
547
1208
|
printf("<%s:%ld:%ld> ", tk->text, tk->start, tk->end);
|
|
548
1209
|
}
|