ferret 0.9.0 → 0.9.1
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +23 -5
- data/TODO +2 -1
- data/ext/analysis.c +838 -177
- data/ext/analysis.h +55 -7
- data/ext/api.c +69 -0
- data/ext/api.h +27 -0
- data/ext/array.c +8 -5
- data/ext/compound_io.c +132 -96
- data/ext/document.c +58 -28
- data/ext/except.c +59 -0
- data/ext/except.h +88 -0
- data/ext/ferret.c +47 -3
- data/ext/ferret.h +3 -0
- data/ext/field.c +15 -9
- data/ext/filter.c +1 -1
- data/ext/fs_store.c +215 -34
- data/ext/global.c +72 -3
- data/ext/global.h +4 -3
- data/ext/hash.c +44 -3
- data/ext/hash.h +9 -0
- data/ext/header.h +58 -0
- data/ext/inc/except.h +88 -0
- data/ext/inc/lang.h +23 -13
- data/ext/ind.c +16 -10
- data/ext/index.h +2 -22
- data/ext/index_io.c +3 -11
- data/ext/index_rw.c +245 -193
- data/ext/lang.h +23 -13
- data/ext/libstemmer.c +92 -0
- data/ext/libstemmer.h +79 -0
- data/ext/modules.h +162 -0
- data/ext/q_boolean.c +34 -21
- data/ext/q_const_score.c +6 -12
- data/ext/q_filtered_query.c +206 -0
- data/ext/q_fuzzy.c +18 -15
- data/ext/q_match_all.c +3 -7
- data/ext/q_multi_phrase.c +10 -14
- data/ext/q_parser.c +29 -2
- data/ext/q_phrase.c +14 -21
- data/ext/q_prefix.c +15 -12
- data/ext/q_range.c +30 -28
- data/ext/q_span.c +13 -21
- data/ext/q_term.c +17 -26
- data/ext/r_analysis.c +693 -21
- data/ext/r_doc.c +11 -12
- data/ext/r_index_io.c +4 -1
- data/ext/r_qparser.c +21 -2
- data/ext/r_search.c +285 -18
- data/ext/ram_store.c +5 -2
- data/ext/search.c +11 -17
- data/ext/search.h +21 -45
- data/ext/similarity.h +67 -0
- data/ext/sort.c +30 -25
- data/ext/stem_ISO_8859_1_danish.c +338 -0
- data/ext/stem_ISO_8859_1_danish.h +16 -0
- data/ext/stem_ISO_8859_1_dutch.c +635 -0
- data/ext/stem_ISO_8859_1_dutch.h +16 -0
- data/ext/stem_ISO_8859_1_english.c +1156 -0
- data/ext/stem_ISO_8859_1_english.h +16 -0
- data/ext/stem_ISO_8859_1_finnish.c +792 -0
- data/ext/stem_ISO_8859_1_finnish.h +16 -0
- data/ext/stem_ISO_8859_1_french.c +1276 -0
- data/ext/stem_ISO_8859_1_french.h +16 -0
- data/ext/stem_ISO_8859_1_german.c +512 -0
- data/ext/stem_ISO_8859_1_german.h +16 -0
- data/ext/stem_ISO_8859_1_italian.c +1091 -0
- data/ext/stem_ISO_8859_1_italian.h +16 -0
- data/ext/stem_ISO_8859_1_norwegian.c +296 -0
- data/ext/stem_ISO_8859_1_norwegian.h +16 -0
- data/ext/stem_ISO_8859_1_porter.c +776 -0
- data/ext/stem_ISO_8859_1_porter.h +16 -0
- data/ext/stem_ISO_8859_1_portuguese.c +1035 -0
- data/ext/stem_ISO_8859_1_portuguese.h +16 -0
- data/ext/stem_ISO_8859_1_spanish.c +1119 -0
- data/ext/stem_ISO_8859_1_spanish.h +16 -0
- data/ext/stem_ISO_8859_1_swedish.c +307 -0
- data/ext/stem_ISO_8859_1_swedish.h +16 -0
- data/ext/stem_KOI8_R_russian.c +701 -0
- data/ext/stem_KOI8_R_russian.h +16 -0
- data/ext/stem_UTF_8_danish.c +344 -0
- data/ext/stem_UTF_8_danish.h +16 -0
- data/ext/stem_UTF_8_dutch.c +653 -0
- data/ext/stem_UTF_8_dutch.h +16 -0
- data/ext/stem_UTF_8_english.c +1176 -0
- data/ext/stem_UTF_8_english.h +16 -0
- data/ext/stem_UTF_8_finnish.c +808 -0
- data/ext/stem_UTF_8_finnish.h +16 -0
- data/ext/stem_UTF_8_french.c +1296 -0
- data/ext/stem_UTF_8_french.h +16 -0
- data/ext/stem_UTF_8_german.c +526 -0
- data/ext/stem_UTF_8_german.h +16 -0
- data/ext/stem_UTF_8_italian.c +1113 -0
- data/ext/stem_UTF_8_italian.h +16 -0
- data/ext/stem_UTF_8_norwegian.c +302 -0
- data/ext/stem_UTF_8_norwegian.h +16 -0
- data/ext/stem_UTF_8_porter.c +794 -0
- data/ext/stem_UTF_8_porter.h +16 -0
- data/ext/stem_UTF_8_portuguese.c +1055 -0
- data/ext/stem_UTF_8_portuguese.h +16 -0
- data/ext/stem_UTF_8_russian.c +709 -0
- data/ext/stem_UTF_8_russian.h +16 -0
- data/ext/stem_UTF_8_spanish.c +1137 -0
- data/ext/stem_UTF_8_spanish.h +16 -0
- data/ext/stem_UTF_8_swedish.c +313 -0
- data/ext/stem_UTF_8_swedish.h +16 -0
- data/ext/stopwords.c +325 -0
- data/ext/store.c +34 -2
- data/ext/tags +2953 -0
- data/ext/term.c +21 -15
- data/ext/termdocs.c +5 -3
- data/ext/utilities.c +446 -0
- data/ext/vector.c +27 -13
- data/lib/ferret/document/document.rb +1 -1
- data/lib/ferret/index/index.rb +44 -6
- data/lib/ferret/query_parser/query_parser.tab.rb +7 -3
- data/lib/rferret.rb +2 -1
- data/test/test_helper.rb +2 -2
- data/test/unit/analysis/ctc_analyzer.rb +401 -0
- data/test/unit/analysis/ctc_tokenstream.rb +423 -0
- data/test/unit/analysis/{tc_letter_tokenizer.rb → rtc_letter_tokenizer.rb} +0 -0
- data/test/unit/analysis/{tc_lower_case_filter.rb → rtc_lower_case_filter.rb} +0 -0
- data/test/unit/analysis/{tc_lower_case_tokenizer.rb → rtc_lower_case_tokenizer.rb} +0 -0
- data/test/unit/analysis/{tc_per_field_analyzer_wrapper.rb → rtc_per_field_analyzer_wrapper.rb} +0 -0
- data/test/unit/analysis/{tc_porter_stem_filter.rb → rtc_porter_stem_filter.rb} +0 -0
- data/test/unit/analysis/{tc_standard_analyzer.rb → rtc_standard_analyzer.rb} +0 -0
- data/test/unit/analysis/{tc_standard_tokenizer.rb → rtc_standard_tokenizer.rb} +0 -0
- data/test/unit/analysis/{tc_stop_analyzer.rb → rtc_stop_analyzer.rb} +0 -0
- data/test/unit/analysis/{tc_stop_filter.rb → rtc_stop_filter.rb} +0 -0
- data/test/unit/analysis/{tc_white_space_analyzer.rb → rtc_white_space_analyzer.rb} +0 -0
- data/test/unit/analysis/{tc_white_space_tokenizer.rb → rtc_white_space_tokenizer.rb} +0 -0
- data/test/unit/analysis/{tc_word_list_loader.rb → rtc_word_list_loader.rb} +0 -0
- data/test/unit/analysis/tc_analyzer.rb +1 -2
- data/test/unit/analysis/{c_token.rb → tc_token.rb} +0 -0
- data/test/unit/document/rtc_field.rb +28 -0
- data/test/unit/document/{c_document.rb → tc_document.rb} +0 -0
- data/test/unit/document/tc_field.rb +82 -12
- data/test/unit/index/{tc_compound_file_io.rb → rtc_compound_file_io.rb} +0 -0
- data/test/unit/index/{tc_field_infos.rb → rtc_field_infos.rb} +0 -0
- data/test/unit/index/{tc_fields_io.rb → rtc_fields_io.rb} +0 -0
- data/test/unit/index/{tc_multiple_term_doc_pos_enum.rb → rtc_multiple_term_doc_pos_enum.rb} +0 -0
- data/test/unit/index/{tc_segment_infos.rb → rtc_segment_infos.rb} +0 -0
- data/test/unit/index/{tc_segment_term_docs.rb → rtc_segment_term_docs.rb} +0 -0
- data/test/unit/index/{tc_segment_term_enum.rb → rtc_segment_term_enum.rb} +0 -0
- data/test/unit/index/{tc_segment_term_vector.rb → rtc_segment_term_vector.rb} +0 -0
- data/test/unit/index/{tc_term_buffer.rb → rtc_term_buffer.rb} +0 -0
- data/test/unit/index/{tc_term_info.rb → rtc_term_info.rb} +0 -0
- data/test/unit/index/{tc_term_infos_io.rb → rtc_term_infos_io.rb} +0 -0
- data/test/unit/index/{tc_term_vectors_io.rb → rtc_term_vectors_io.rb} +0 -0
- data/test/unit/index/{c_index.rb → tc_index.rb} +26 -6
- data/test/unit/index/{c_index_reader.rb → tc_index_reader.rb} +0 -0
- data/test/unit/index/{c_index_writer.rb → tc_index_writer.rb} +0 -0
- data/test/unit/index/{c_term.rb → tc_term.rb} +0 -0
- data/test/unit/index/{c_term_voi.rb → tc_term_voi.rb} +0 -0
- data/test/unit/query_parser/{c_query_parser.rb → rtc_query_parser.rb} +14 -14
- data/test/unit/query_parser/tc_query_parser.rb +24 -16
- data/test/unit/search/{tc_similarity.rb → rtc_similarity.rb} +0 -0
- data/test/unit/search/rtc_sort_field.rb +14 -0
- data/test/unit/search/{c_filter.rb → tc_filter.rb} +11 -11
- data/test/unit/search/{c_fuzzy_query.rb → tc_fuzzy_query.rb} +0 -0
- data/test/unit/search/{c_index_searcher.rb → tc_index_searcher.rb} +0 -0
- data/test/unit/search/{c_search_and_sort.rb → tc_search_and_sort.rb} +0 -0
- data/test/unit/search/{c_sort.rb → tc_sort.rb} +0 -0
- data/test/unit/search/tc_sort_field.rb +20 -7
- data/test/unit/search/{c_spans.rb → tc_spans.rb} +0 -0
- data/test/unit/store/rtc_fs_store.rb +62 -0
- data/test/unit/store/rtc_ram_store.rb +15 -0
- data/test/unit/store/rtm_store.rb +150 -0
- data/test/unit/store/rtm_store_lock.rb +2 -0
- data/test/unit/store/tc_fs_store.rb +54 -40
- data/test/unit/store/tc_ram_store.rb +20 -0
- data/test/unit/store/tm_store.rb +30 -146
- data/test/unit/store/tm_store_lock.rb +66 -0
- data/test/unit/utils/{tc_bit_vector.rb → rtc_bit_vector.rb} +0 -0
- data/test/unit/utils/{tc_date_tools.rb → rtc_date_tools.rb} +0 -0
- data/test/unit/utils/{tc_number_tools.rb → rtc_number_tools.rb} +0 -0
- data/test/unit/utils/{tc_parameter.rb → rtc_parameter.rb} +0 -0
- data/test/unit/utils/{tc_priority_queue.rb → rtc_priority_queue.rb} +0 -0
- data/test/unit/utils/{tc_string_helper.rb → rtc_string_helper.rb} +0 -0
- data/test/unit/utils/{tc_thread.rb → rtc_thread.rb} +0 -0
- data/test/unit/utils/{tc_weak_key_hash.rb → rtc_weak_key_hash.rb} +0 -0
- metadata +360 -289
- data/test/unit/document/c_field.rb +0 -98
- data/test/unit/search/c_sort_field.rb +0 -27
- data/test/unit/store/c_fs_store.rb +0 -76
- data/test/unit/store/c_ram_store.rb +0 -35
- data/test/unit/store/m_store.rb +0 -34
- data/test/unit/store/m_store_lock.rb +0 -68
data/Rakefile
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
$:. << 'lib'
|
2
2
|
# Some parts of this Rakefile where taken from Jim Weirich's Rakefile for
|
3
|
-
# Rake. Other parts where
|
3
|
+
# Rake. Other parts where taken from the David Heinemeier Hansson's Rails
|
4
4
|
# Rakefile. Both are under MIT-LICENSE. Thanks to both for their excellent
|
5
5
|
# projects.
|
6
6
|
|
@@ -32,12 +32,13 @@ end
|
|
32
32
|
$VERBOSE = nil
|
33
33
|
|
34
34
|
EXT = "ferret_ext.so"
|
35
|
-
EXT_SRC = FileList["src
|
35
|
+
EXT_SRC = FileList["src/**/*.[ch]"]
|
36
|
+
|
36
37
|
EXT_SRC_DEST = EXT_SRC.map {|fn| File.join("ext", File.basename(fn))}
|
37
38
|
SRC = (FileList["ext/*.[ch]"] + EXT_SRC_DEST).uniq
|
38
39
|
|
39
40
|
CLEAN.include(FileList['**/*.o', 'InstalledFiles', '.config'])
|
40
|
-
CLOBBER.include(FileList['**/*.so'], 'ext/Makefile')
|
41
|
+
CLOBBER.include(FileList['**/*.so'], 'ext/Makefile', EXT_SRC_DEST)
|
41
42
|
|
42
43
|
task :default => :all_tests
|
43
44
|
desc "Run all tests"
|
@@ -57,7 +58,7 @@ end
|
|
57
58
|
desc "run unit tests in test/unit for C ferret"
|
58
59
|
Rake::TestTask.new("test_cunits" => :ext) do |t|
|
59
60
|
t.libs << "test/unit"
|
60
|
-
t.pattern = 'test/unit/
|
61
|
+
t.pattern = 'test/unit/ts_*.rb'
|
61
62
|
t.verbose = true
|
62
63
|
end
|
63
64
|
|
@@ -102,6 +103,15 @@ EXT_SRC.each do |fn|
|
|
102
103
|
dest_fn = File.join("ext", File.basename(fn))
|
103
104
|
file dest_fn => fn do |t|
|
104
105
|
cp fn, dest_fn
|
106
|
+
if fn =~ /stemmer/
|
107
|
+
# flatten the directory structure for lib_stemmer
|
108
|
+
open(dest_fn) do |in_f|
|
109
|
+
open(dest_fn + ".out", "w") do |out_f|
|
110
|
+
in_f.each {|line| out_f.write(line.sub(/(#include ["<])[.a-z_\/]*\//) {"#{$1}"})}
|
111
|
+
end
|
112
|
+
end
|
113
|
+
mv dest_fn + ".out", dest_fn
|
114
|
+
end
|
105
115
|
end
|
106
116
|
end
|
107
117
|
|
@@ -110,9 +120,17 @@ task :ext => ["ext/#{EXT}"] + SRC
|
|
110
120
|
|
111
121
|
file "ext/#{EXT}" => ["ext/Makefile"] do
|
112
122
|
cp "ext/inc/lang.h", "ext/lang.h"
|
123
|
+
cp "ext/inc/except.h", "ext/except.h"
|
113
124
|
sh "cd ext; make"
|
114
125
|
end
|
115
126
|
|
127
|
+
file "ext/lang.h" => ["ext/inc/lang.h"] do
|
128
|
+
cp "ext/inc/lang.h", "ext/lang.h"
|
129
|
+
end
|
130
|
+
file "ext/except.h" => ["ext/inc/except.h"] do
|
131
|
+
cp "ext/inc/except.h", "ext/except.h"
|
132
|
+
end
|
133
|
+
|
116
134
|
file "ext/Makefile" => SRC do
|
117
135
|
sh "cd ext; ruby extconf.rb"
|
118
136
|
end
|
@@ -220,7 +238,7 @@ task :repackage => EXT_SRC_DEST
|
|
220
238
|
task :package => EXT_SRC_DEST
|
221
239
|
task :tag => [:prerelease]
|
222
240
|
task :update_version => [:prerelease]
|
223
|
-
task :release
|
241
|
+
task :release do #=> [:tag, :update_version, :package] do
|
224
242
|
announce
|
225
243
|
announce "**************************************************************"
|
226
244
|
announce "* Release #{PKG_VERSION} Complete."
|
data/TODO
CHANGED
@@ -4,11 +4,12 @@ Send suggestions for this list to mailto:dbalmain@gmail.com
|
|
4
4
|
|
5
5
|
=== To Do
|
6
6
|
|
7
|
-
* Add the ability to persist an in memory index to Ferret::Index::Index
|
8
7
|
* Make a dll for people on Windows
|
8
|
+
* pure ruby ConstantScoreQuery
|
9
9
|
|
10
10
|
=== Done
|
11
11
|
|
12
|
+
* Add the ability to persist an in memory index to Ferret::Index::Index
|
12
13
|
* Add UTF-8 support
|
13
14
|
* Multi Field Query
|
14
15
|
* Test threading
|
data/ext/analysis.c
CHANGED
@@ -1,7 +1,16 @@
|
|
1
1
|
#include <analysis.h>
|
2
2
|
#include <string.h>
|
3
3
|
#include <ctype.h>
|
4
|
-
#include <
|
4
|
+
#include <wctype.h>
|
5
|
+
#include <wchar.h>
|
6
|
+
#include "hash.h"
|
7
|
+
#include "libstemmer.h"
|
8
|
+
|
9
|
+
/****************************************************************************
|
10
|
+
*
|
11
|
+
* Token
|
12
|
+
*
|
13
|
+
****************************************************************************/
|
5
14
|
|
6
15
|
Token *tk_create()
|
7
16
|
{
|
@@ -24,6 +33,11 @@ inline Token *tk_set(Token *tk, char *text, int tlen, int start, int end, int po
|
|
24
33
|
return tk;
|
25
34
|
}
|
26
35
|
|
36
|
+
inline Token *tk_set_ts(Token *tk, char *start, char *end, char *text, int pos_inc)
|
37
|
+
{
|
38
|
+
return tk_set(tk, start, end - start, start - text, end - text, pos_inc);
|
39
|
+
}
|
40
|
+
|
27
41
|
inline Token *tk_set_no_len(Token *tk, char *text, int start, int end, int pos_inc)
|
28
42
|
{
|
29
43
|
return tk_set(tk, text, strlen(text), start, end, pos_inc);
|
@@ -31,11 +45,8 @@ inline Token *tk_set_no_len(Token *tk, char *text, int start, int end, int pos_i
|
|
31
45
|
|
32
46
|
int tk_eq(Token *tk1, Token *tk2)
|
33
47
|
{
|
34
|
-
|
35
|
-
tk1->start == tk2->start && tk1->end == tk2->end)
|
36
|
-
return true;
|
37
|
-
else
|
38
|
-
return false;
|
48
|
+
return (strcmp((char *)tk1->text, (char *)tk2->text) == 0 &&
|
49
|
+
tk1->start == tk2->start && tk1->end == tk2->end);
|
39
50
|
}
|
40
51
|
|
41
52
|
int tk_cmp(Token *tk1, Token *tk2)
|
@@ -57,46 +68,152 @@ int tk_cmp(Token *tk1, Token *tk2)
|
|
57
68
|
return cmp;
|
58
69
|
}
|
59
70
|
|
71
|
+
|
72
|
+
/****************************************************************************
|
73
|
+
*
|
74
|
+
* TokenStream
|
75
|
+
*
|
76
|
+
****************************************************************************/
|
77
|
+
|
60
78
|
void ts_standard_destroy(void *p)
|
61
79
|
{
|
62
80
|
TokenStream *ts = (TokenStream *)p;
|
63
81
|
tk_destroy(ts->token);
|
64
|
-
free(
|
82
|
+
free(ts);
|
65
83
|
}
|
66
84
|
|
67
85
|
void ts_reset(TokenStream *ts, char *text)
|
68
86
|
{
|
69
|
-
ts->text = text;
|
70
|
-
ts->pos = 0;
|
87
|
+
ts->t = ts->text = text;
|
71
88
|
}
|
72
89
|
|
73
90
|
TokenStream *ts_create()
|
74
91
|
{
|
75
92
|
TokenStream *ts = ALLOC(TokenStream);
|
76
|
-
ts->pos = -1;
|
77
93
|
ts->text = NULL;
|
78
94
|
ts->token = tk_create();
|
79
95
|
ts->destroy = &ts_standard_destroy;
|
80
96
|
ts->reset = &ts_reset;
|
97
|
+
ts->sub_ts = NULL;
|
98
|
+
ts->clone_i = NULL;
|
81
99
|
return ts;
|
82
100
|
}
|
83
101
|
|
102
|
+
TokenStream *ts_clone(TokenStream *orig_ts)
|
103
|
+
{
|
104
|
+
TokenStream *ts = ALLOC(TokenStream);
|
105
|
+
memcpy(ts, orig_ts, sizeof(TokenStream));
|
106
|
+
if (orig_ts->token) {
|
107
|
+
ts->token = ALLOC(Token);
|
108
|
+
memcpy(ts->token, orig_ts->token, sizeof(Token));
|
109
|
+
}
|
110
|
+
if (orig_ts->sub_ts) ts->sub_ts = ts_clone(orig_ts->sub_ts);
|
111
|
+
if (orig_ts->clone_i) orig_ts->clone_i(orig_ts, ts);
|
112
|
+
return ts;
|
113
|
+
}
|
114
|
+
|
115
|
+
/* * Multi-byte TokenStream * */
|
116
|
+
static char * const ENC_ERR_MSG = "Error decoding input string. "
|
117
|
+
"Check that you have the locale set correctly";
|
118
|
+
#define MB_NEXT_CHAR \
|
119
|
+
if ((i = mbrtowc(&wchr, t, MB_CUR_MAX, (mbstate_t *)ts->data)) < 0)\
|
120
|
+
RAISE(IO_ERROR, ENC_ERR_MSG)
|
121
|
+
|
122
|
+
inline Token *w_tk_set(Token *tk, wchar_t *text, int start, int end, int pos_inc)
|
123
|
+
{
|
124
|
+
tk->text[wcstombs(tk->text, text, MAX_WORD_SIZE - 1)] = '\0';
|
125
|
+
tk->start = start;
|
126
|
+
tk->end = end;
|
127
|
+
tk->pos_inc = pos_inc;
|
128
|
+
return tk;
|
129
|
+
}
|
130
|
+
|
131
|
+
void mb_ts_standard_destroy(void *p)
|
132
|
+
{
|
133
|
+
TokenStream *ts = (TokenStream *)p;
|
134
|
+
tk_destroy(ts->token);
|
135
|
+
free(ts->data);
|
136
|
+
free(ts);
|
137
|
+
}
|
138
|
+
|
139
|
+
void mb_ts_reset(TokenStream *ts, char *text)
|
140
|
+
{
|
141
|
+
ZEROSET(ts->data, mbstate_t, 1);
|
142
|
+
ts_reset(ts, text);
|
143
|
+
}
|
144
|
+
|
145
|
+
void mb_ts_clone_i(TokenStream *orig_ts, TokenStream *new_ts)
|
146
|
+
{
|
147
|
+
new_ts->data = ALLOC(mbstate_t);
|
148
|
+
memcpy(new_ts->data, orig_ts->data, sizeof(mbstate_t));
|
149
|
+
}
|
150
|
+
|
151
|
+
TokenStream *mb_ts_create()
|
152
|
+
{
|
153
|
+
TokenStream *ts = ALLOC(TokenStream);
|
154
|
+
ts->data = ALLOC(mbstate_t);
|
155
|
+
ts->text = NULL;
|
156
|
+
ts->token = tk_create();
|
157
|
+
ts->destroy = &mb_ts_standard_destroy;
|
158
|
+
ts->reset = &mb_ts_reset;
|
159
|
+
ts->clone_i = &mb_ts_clone_i;
|
160
|
+
ts->sub_ts = NULL;
|
161
|
+
return ts;
|
162
|
+
}
|
163
|
+
|
164
|
+
/****************************************************************************
|
165
|
+
*
|
166
|
+
* Analyzer
|
167
|
+
*
|
168
|
+
****************************************************************************/
|
169
|
+
|
170
|
+
void a_standard_destroy(void *p)
|
171
|
+
{
|
172
|
+
Analyzer *a = (Analyzer *)p;
|
173
|
+
ts_destroy(a->current_ts);
|
174
|
+
free(p);
|
175
|
+
}
|
176
|
+
|
177
|
+
TokenStream *a_standard_get_ts(Analyzer *a, char *field, char *text)
|
178
|
+
{
|
179
|
+
a->current_ts->reset(a->current_ts, text);
|
180
|
+
return a->current_ts;
|
181
|
+
}
|
182
|
+
|
183
|
+
Analyzer *analyzer_create(void *data, TokenStream *ts, void (*destroy)(void *),
|
184
|
+
TokenStream *(*get_ts)(Analyzer *a, char *field, char *text))
|
185
|
+
{
|
186
|
+
Analyzer *a = ALLOC(Analyzer);
|
187
|
+
a->data = data;
|
188
|
+
a->current_ts = ts;
|
189
|
+
a->destroy = (destroy ? destroy : &a_standard_destroy);
|
190
|
+
a->get_ts = (get_ts ? get_ts : &a_standard_get_ts);
|
191
|
+
return a;
|
192
|
+
}
|
193
|
+
|
194
|
+
/****************************************************************************
|
195
|
+
*
|
196
|
+
* Whitespace
|
197
|
+
*
|
198
|
+
****************************************************************************/
|
199
|
+
|
200
|
+
/*
|
201
|
+
* WhitespaceTokenizer
|
202
|
+
*/
|
84
203
|
Token *wst_next(TokenStream *ts)
|
85
204
|
{
|
86
|
-
|
87
|
-
|
88
|
-
char *text = ts->text;
|
205
|
+
char *t = ts->t;
|
206
|
+
char *start;
|
89
207
|
|
90
|
-
while (
|
91
|
-
i++;
|
92
|
-
if (text[i] == '\0')
|
93
|
-
return NULL;
|
208
|
+
while (*t != '\0' && isspace(*t)) t++;
|
94
209
|
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
210
|
+
if (*t == '\0') return NULL;
|
211
|
+
|
212
|
+
start = t;
|
213
|
+
while (*t != '\0' && !isspace(*t)) t++;
|
214
|
+
|
215
|
+
ts->t = t;
|
216
|
+
tk_set_ts(ts->token, start, t, ts->text, 1);
|
100
217
|
return ts->token;
|
101
218
|
}
|
102
219
|
|
@@ -107,22 +224,121 @@ TokenStream *whitespace_tokenizer_create()
|
|
107
224
|
return ts;
|
108
225
|
}
|
109
226
|
|
227
|
+
/*
|
228
|
+
* Multi-byte WhitespaceTokenizer
|
229
|
+
*/
|
230
|
+
Token *mb_wst_next(TokenStream *ts)
|
231
|
+
{
|
232
|
+
int i;
|
233
|
+
char *start;
|
234
|
+
char *t = ts->t;
|
235
|
+
wchar_t wchr;
|
236
|
+
|
237
|
+
MB_NEXT_CHAR;
|
238
|
+
while (wchr != 0 && iswspace(wchr)) {
|
239
|
+
t += i;
|
240
|
+
MB_NEXT_CHAR;
|
241
|
+
}
|
242
|
+
if (wchr == 0) return NULL;
|
243
|
+
|
244
|
+
start = t;
|
245
|
+
t += i;
|
246
|
+
MB_NEXT_CHAR;
|
247
|
+
while (wchr != 0 && !iswspace(wchr)) {
|
248
|
+
t += i;
|
249
|
+
MB_NEXT_CHAR;
|
250
|
+
}
|
251
|
+
tk_set_ts(ts->token, start, t, ts->text, 1);
|
252
|
+
ts->t = t;
|
253
|
+
return ts->token;
|
254
|
+
}
|
255
|
+
|
256
|
+
/*
|
257
|
+
* Lowercasing Multi-byte WhitespaceTokenizer
|
258
|
+
*/
|
259
|
+
Token *mb_wst_next_lc(TokenStream *ts)
|
260
|
+
{
|
261
|
+
int i;
|
262
|
+
char *start;
|
263
|
+
char *t = ts->t;
|
264
|
+
wchar_t wchr;
|
265
|
+
wchar_t wbuf[MAX_WORD_SIZE+1], *w, *w_end;
|
266
|
+
|
267
|
+
w = wbuf;
|
268
|
+
w_end = &wbuf[MAX_WORD_SIZE];
|
269
|
+
|
270
|
+
MB_NEXT_CHAR;
|
271
|
+
while (wchr != 0 && iswspace(wchr)) {
|
272
|
+
t += i;
|
273
|
+
MB_NEXT_CHAR;
|
274
|
+
}
|
275
|
+
if (wchr == 0) return NULL;
|
276
|
+
|
277
|
+
start = t;
|
278
|
+
t += i;
|
279
|
+
*w++ = towlower(wchr);
|
280
|
+
MB_NEXT_CHAR;
|
281
|
+
while (wchr != 0 && !iswspace(wchr)) {
|
282
|
+
if (w < w_end) *w++ = towlower(wchr);
|
283
|
+
t += i;
|
284
|
+
MB_NEXT_CHAR;
|
285
|
+
}
|
286
|
+
*w = 0;
|
287
|
+
w_tk_set(ts->token, wbuf, start - ts->text, t - ts->text, 1);
|
288
|
+
ts->t = t;
|
289
|
+
return ts->token;
|
290
|
+
}
|
291
|
+
|
292
|
+
TokenStream *mb_whitespace_tokenizer_create(bool lowercase)
|
293
|
+
{
|
294
|
+
TokenStream *ts = mb_ts_create();
|
295
|
+
ts->next = lowercase ? &mb_wst_next_lc : &mb_wst_next;
|
296
|
+
return ts;
|
297
|
+
}
|
298
|
+
|
299
|
+
/*
|
300
|
+
* WhitespaceAnalyzers
|
301
|
+
*/
|
302
|
+
Analyzer *whitespace_analyzer_create(bool lowercase)
|
303
|
+
{
|
304
|
+
TokenStream *ts;
|
305
|
+
if (lowercase) {
|
306
|
+
ts = lowercase_filter_create(whitespace_tokenizer_create());
|
307
|
+
} else {
|
308
|
+
ts = whitespace_tokenizer_create();
|
309
|
+
}
|
310
|
+
return analyzer_create(NULL, ts, NULL, NULL);
|
311
|
+
}
|
312
|
+
|
313
|
+
Analyzer *mb_whitespace_analyzer_create(bool lowercase)
|
314
|
+
{
|
315
|
+
return analyzer_create(NULL, mb_whitespace_tokenizer_create(lowercase),
|
316
|
+
NULL, NULL);
|
317
|
+
}
|
318
|
+
|
319
|
+
/****************************************************************************
|
320
|
+
*
|
321
|
+
* Letter
|
322
|
+
*
|
323
|
+
****************************************************************************/
|
324
|
+
|
325
|
+
/*
|
326
|
+
* LetterTokenizer
|
327
|
+
*/
|
110
328
|
Token *lt_next(TokenStream *ts)
|
111
329
|
{
|
112
|
-
|
113
|
-
|
114
|
-
char *text = ts->text;
|
330
|
+
char *start;
|
331
|
+
char *t = ts->t;
|
115
332
|
|
116
|
-
while (
|
117
|
-
i++;
|
118
|
-
if (text[i] == '\0')
|
119
|
-
return NULL;
|
333
|
+
while (*t != '\0' && !isalpha(*t)) t++;
|
120
334
|
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
335
|
+
if (*t == '\0') return NULL;
|
336
|
+
|
337
|
+
start = t;
|
338
|
+
while (*t != '\0' && isalpha(*t)) t++;
|
339
|
+
|
340
|
+
tk_set_ts(ts->token, start, t, ts->text, 1);
|
341
|
+
ts->t = t;
|
126
342
|
return ts->token;
|
127
343
|
}
|
128
344
|
|
@@ -133,54 +349,174 @@ TokenStream *letter_tokenizer_create()
|
|
133
349
|
return ts;
|
134
350
|
}
|
135
351
|
|
136
|
-
|
352
|
+
/*
|
353
|
+
* Multi-byte LetterTokenizer
|
354
|
+
*/
|
355
|
+
Token *mb_lt_next(TokenStream *ts)
|
137
356
|
{
|
138
|
-
|
139
|
-
|
140
|
-
|
357
|
+
int i;
|
358
|
+
char *start;
|
359
|
+
char *t = ts->t;
|
360
|
+
wchar_t wchr;
|
361
|
+
|
362
|
+
MB_NEXT_CHAR;
|
363
|
+
while (wchr != 0 && !iswalpha(wchr)) {
|
364
|
+
t += i;
|
365
|
+
MB_NEXT_CHAR;
|
366
|
+
}
|
367
|
+
if (wchr == 0) return NULL;
|
368
|
+
|
369
|
+
start = t;
|
370
|
+
t += i;
|
371
|
+
MB_NEXT_CHAR;
|
372
|
+
while (wchr != 0 && iswalpha(wchr)) {
|
373
|
+
t += i;
|
374
|
+
MB_NEXT_CHAR;
|
375
|
+
}
|
376
|
+
tk_set_ts(ts->token, start, t, ts->text, 1);
|
377
|
+
ts->t = t;
|
378
|
+
return ts->token;
|
141
379
|
}
|
142
380
|
|
143
|
-
|
381
|
+
/*
|
382
|
+
* Lowercasing Multi-byte LetterTokenizer
|
383
|
+
*/
|
384
|
+
Token *mb_lt_next_lc(TokenStream *ts)
|
144
385
|
{
|
145
|
-
|
146
|
-
|
386
|
+
int i;
|
387
|
+
char *start;
|
388
|
+
char *t = ts->t;
|
389
|
+
wchar_t wchr;
|
390
|
+
wchar_t wbuf[MAX_WORD_SIZE+1], *w, *w_end;
|
391
|
+
|
392
|
+
w = wbuf;
|
393
|
+
w_end = &wbuf[MAX_WORD_SIZE];
|
394
|
+
|
395
|
+
MB_NEXT_CHAR;
|
396
|
+
while (wchr != 0 && !iswalpha(wchr)) {
|
397
|
+
t += i;
|
398
|
+
MB_NEXT_CHAR;
|
399
|
+
}
|
400
|
+
if (wchr == 0) return NULL;
|
401
|
+
|
402
|
+
start = t;
|
403
|
+
t += i;
|
404
|
+
*w++ = towlower(wchr);
|
405
|
+
MB_NEXT_CHAR;
|
406
|
+
while (wchr != 0 && iswalpha(wchr)) {
|
407
|
+
if (w < w_end) *w++ = towlower(wchr);
|
408
|
+
t += i;
|
409
|
+
MB_NEXT_CHAR;
|
410
|
+
}
|
411
|
+
*w = 0;
|
412
|
+
w_tk_set(ts->token, wbuf, start - ts->text, t - ts->text, 1);
|
413
|
+
ts->t = t;
|
414
|
+
return ts->token;
|
147
415
|
}
|
148
416
|
|
149
|
-
|
417
|
+
TokenStream *mb_letter_tokenizer_create(bool lowercase)
|
150
418
|
{
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
a->destroy = &a_standard_destroy;
|
155
|
-
a->get_ts = &a_standard_get_ts;
|
156
|
-
return a;
|
419
|
+
TokenStream *ts = mb_ts_create();
|
420
|
+
ts->next = lowercase ? &mb_lt_next_lc : &mb_lt_next;
|
421
|
+
return ts;
|
157
422
|
}
|
158
423
|
|
159
|
-
|
424
|
+
/*
|
425
|
+
* LetterAnalyzers
|
426
|
+
*/
|
427
|
+
Analyzer *letter_analyzer_create(bool lowercase)
|
428
|
+
{
|
429
|
+
TokenStream *ts;
|
430
|
+
if (lowercase) {
|
431
|
+
ts = lowercase_filter_create(letter_tokenizer_create());
|
432
|
+
} else {
|
433
|
+
ts = letter_tokenizer_create();
|
434
|
+
}
|
435
|
+
return analyzer_create(NULL, ts, NULL, NULL);
|
436
|
+
}
|
437
|
+
|
438
|
+
Analyzer *mb_letter_analyzer_create(bool lowercase)
|
439
|
+
{
|
440
|
+
return analyzer_create(NULL,
|
441
|
+
mb_letter_tokenizer_create(lowercase), NULL, NULL);
|
442
|
+
}
|
443
|
+
|
444
|
+
/****************************************************************************
|
445
|
+
*
|
446
|
+
* Standard
|
447
|
+
*
|
448
|
+
****************************************************************************/
|
449
|
+
|
450
|
+
/*
|
451
|
+
* StandardTokenizer
|
452
|
+
*/
|
453
|
+
int std_get_alpha(TokenStream *ts, char *token)
|
160
454
|
{
|
161
455
|
int i = 0;
|
162
|
-
|
163
|
-
|
456
|
+
char *t = ts->t;
|
457
|
+
while (t[i] != '\0' && isalpha(t[i])) {
|
458
|
+
if (i < MAX_WORD_SIZE) token[i] = t[i];
|
164
459
|
i++;
|
165
460
|
}
|
166
461
|
return i;
|
167
462
|
}
|
168
463
|
|
169
|
-
int
|
464
|
+
int mb_std_get_alpha(TokenStream *ts, char *token)
|
465
|
+
{
|
466
|
+
char *t = ts->t;
|
467
|
+
wchar_t w;
|
468
|
+
int i;
|
469
|
+
if ((i = mbtowc(&w, t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
|
470
|
+
while (w != 0 && iswalpha(w)) {
|
471
|
+
t += i;
|
472
|
+
if ((i = mbtowc(&w, t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
|
473
|
+
}
|
474
|
+
|
475
|
+
i = t - ts->t;
|
476
|
+
if (i > MAX_WORD_SIZE) i = MAX_WORD_SIZE - 1;
|
477
|
+
memcpy(token, ts->t, i);
|
478
|
+
return i;
|
479
|
+
}
|
480
|
+
|
481
|
+
int std_get_alnum(TokenStream *ts, char *token)
|
170
482
|
{
|
171
483
|
int i = 0;
|
172
|
-
|
173
|
-
|
484
|
+
char *t = ts->t;
|
485
|
+
while (t[i] != '\0' && isalnum(t[i])) {
|
486
|
+
if (i < MAX_WORD_SIZE) token[i] = t[i];
|
174
487
|
i++;
|
175
488
|
}
|
176
489
|
return i;
|
177
490
|
}
|
178
491
|
|
492
|
+
int mb_std_get_alnum(char *text, char *token, TokenStream *ts)
|
493
|
+
{
|
494
|
+
char *t = ts->t;
|
495
|
+
wchar_t w;
|
496
|
+
int i;
|
497
|
+
if ((i = mbtowc(&w, t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
|
498
|
+
while (w != 0 && iswalnum(w)) {
|
499
|
+
t += i;
|
500
|
+
if ((i = mbtowc(&w, t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
|
501
|
+
}
|
502
|
+
|
503
|
+
i = t - ts->t;
|
504
|
+
if (i > MAX_WORD_SIZE) i = MAX_WORD_SIZE - 1;
|
505
|
+
memcpy(token, ts->t, i);
|
506
|
+
return i;
|
507
|
+
|
508
|
+
}
|
509
|
+
|
179
510
|
int isnumpunc(char c)
|
180
511
|
{
|
181
512
|
return (c == '.' || c == ',' || c == '\\' || c == '/' || c == '_' || c == '-');
|
182
513
|
}
|
183
514
|
|
515
|
+
int w_isnumpunc(wchar_t c)
|
516
|
+
{
|
517
|
+
return (c == L'.' || c == L',' || c == L'\\' || c == L'/' || c == L'_' || c == L'-');
|
518
|
+
}
|
519
|
+
|
184
520
|
int isurlpunc(char c)
|
185
521
|
{
|
186
522
|
return (c == '.' || c == '/' || c == '-' || c == '_');
|
@@ -201,11 +537,23 @@ int isurlxatc(char c)
|
|
201
537
|
return (c == '.' || c == '/' || c == '-' || c == '_' || c == '@' || isalnum(c));
|
202
538
|
}
|
203
539
|
|
204
|
-
|
540
|
+
bool std_is_tok_char(char *c)
|
205
541
|
{
|
206
|
-
if (isspace(c)) return false; // most common so check first.
|
207
|
-
if (isalnum(c) || isnumpunc(c) || c == '&' ||
|
208
|
-
c == '@' || c == '\'' || c == ':')
|
542
|
+
if (isspace(*c)) return false; // most common so check first.
|
543
|
+
if (isalnum(*c) || isnumpunc(*c) || *c == '&' ||
|
544
|
+
*c == '@' || *c == '\'' || *c == ':')
|
545
|
+
return true;
|
546
|
+
return false;
|
547
|
+
}
|
548
|
+
|
549
|
+
bool w_std_is_tok_char(char *t)
|
550
|
+
{
|
551
|
+
wchar_t c;
|
552
|
+
if ((mbtowc(&c, t, MB_CUR_MAX)) < 0)
|
553
|
+
RAISE(IO_ERROR, ENC_ERR_MSG);
|
554
|
+
if (iswspace(c)) return false; // most common so check first.
|
555
|
+
if (iswalnum(c) || w_isnumpunc(c) || c == L'&' ||
|
556
|
+
c == L'@' || c == L'\'' || c == L':')
|
209
557
|
return true;
|
210
558
|
return false;
|
211
559
|
}
|
@@ -246,22 +594,34 @@ int std_get_number(char *input)
|
|
246
594
|
|
247
595
|
int std_get_apostrophe(char *input)
|
248
596
|
{
|
249
|
-
|
597
|
+
char *t = input;
|
250
598
|
|
251
|
-
while (isalpha(
|
252
|
-
|
599
|
+
while (isalpha(*t) || *t == '\'')
|
600
|
+
t++;
|
253
601
|
|
254
|
-
return
|
602
|
+
return t - input;
|
255
603
|
}
|
256
604
|
|
257
|
-
int
|
605
|
+
int mb_std_get_apostrophe(char *input)
|
258
606
|
{
|
259
|
-
|
607
|
+
char *t = input;
|
608
|
+
wchar_t w;
|
609
|
+
int i;
|
260
610
|
|
611
|
+
if ((i = mbtowc(&w, t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
|
612
|
+
while (iswalpha(w) || w == L'\'') {
|
613
|
+
t += i;
|
614
|
+
if ((i = mbtowc(&w, t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
|
615
|
+
}
|
616
|
+
return t - input;
|
617
|
+
}
|
618
|
+
|
619
|
+
int std_get_url(char *input, char *token, int i)
|
620
|
+
{
|
261
621
|
while (isurlc(input[i])) {
|
262
622
|
if (isurlpunc(input[i]) && isurlpunc(input[i-1]))
|
263
623
|
break; // can't have to puncs in a row
|
264
|
-
token[i] = input[i];
|
624
|
+
if (i < MAX_WORD_SIZE) token[i] = input[i];
|
265
625
|
i++;
|
266
626
|
}
|
267
627
|
|
@@ -282,148 +642,229 @@ int std_get_company_name(char *input)
|
|
282
642
|
return i;
|
283
643
|
}
|
284
644
|
|
645
|
+
int mb_std_get_company_name(char *input, TokenStream *ts)
|
646
|
+
{
|
647
|
+
char *t = input;
|
648
|
+
wchar_t wchr;
|
649
|
+
int i;
|
650
|
+
|
651
|
+
MB_NEXT_CHAR;
|
652
|
+
while (iswalpha(wchr) || wchr == L'@' || wchr == L'&') {
|
653
|
+
t += i;
|
654
|
+
MB_NEXT_CHAR;
|
655
|
+
}
|
656
|
+
|
657
|
+
return t - input;
|
658
|
+
}
|
659
|
+
|
660
|
+
bool std_advance_to_start(TokenStream *ts)
|
661
|
+
{
|
662
|
+
char *t = ts->t;
|
663
|
+
while (*t != '\0' && !isalnum(*t)) t++;
|
664
|
+
|
665
|
+
ts->t = t;
|
666
|
+
|
667
|
+
return (*t != '\0');
|
668
|
+
}
|
669
|
+
|
670
|
+
bool mb_std_advance_to_start(TokenStream *ts)
|
671
|
+
{
|
672
|
+
int i;
|
673
|
+
wchar_t w;
|
674
|
+
|
675
|
+
if ((i = mbtowc(&w, ts->t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
|
676
|
+
while (w != 0 && !iswalnum(w)) {
|
677
|
+
ts->t += i;
|
678
|
+
if ((i = mbtowc(&w, ts->t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
|
679
|
+
}
|
680
|
+
|
681
|
+
return (w != 0);
|
682
|
+
}
|
683
|
+
|
684
|
+
typedef struct StandardTokenizer {
|
685
|
+
bool (*advance_to_start)(TokenStream *ts);
|
686
|
+
bool (*is_tok_char)(char *c);
|
687
|
+
int (*get_alpha)(TokenStream *ts, char *token);
|
688
|
+
int (*get_apostrophe)(char *input);
|
689
|
+
} StandardTokenizer;
|
690
|
+
|
285
691
|
/*
 * Return the next token of a standard tokenizer stream, or NULL when
 * advance_to_start() reports that no token start is left.
 *
 * From the token start the scanner tries, in order:
 *   - a number, when the text starts with a digit;
 *   - a plain word, when the alpha run is not followed by a token character;
 *   - a word containing an apostrophe, with a trailing 's / 'S stripped;
 *   - a company name containing '&';
 *   - a number measured from the token start, when nothing longer can follow;
 *   - a URL introduced by "://";
 *   - otherwise the longest url/email-like run, collapsed to an acronym
 *     (dots removed) when it is made only of letters each followed by '.'.
 *
 * ts->t is left just past the consumed text and the result is stored in
 * ts->token, which is also the return value.
 *
 * Scan lengths may exceed MAX_WORD_SIZE, so every write into the local
 * token buffer is bounds-checked and lengths handed to tk_set() are clamped
 * to MAX_WORD_SIZE - 1.
 */
Token *std_next(TokenStream *ts)
{
  StandardTokenizer *std_tz = (StandardTokenizer *)ts->data;
  char *s;
  char *t;
  char *start = NULL;
  char *num_end = NULL;
  char token[MAX_WORD_SIZE];
  int token_i = 0;
  int len;
  int copy_len;
  bool is_acronym;
  bool seen_at_symbol;

  if (!std_tz->advance_to_start(ts)) return NULL;

  start = t = ts->t;
  // <ctype.h> functions require unsigned char values, hence the casts below
  if (isdigit((unsigned char)*t)) {
    t += std_get_number(t);
    ts->t = t;
    tk_set_ts(ts->token, start, t, ts->text, 1);
  } else {
    token_i = std_tz->get_alpha(ts, token);
    t += token_i;

    if (!std_tz->is_tok_char(t)) {
      // very common case, ie a plain word, so check and return
      tk_set_ts(ts->token, start, t, ts->text, 1);
      ts->t = t;
      return ts->token;
    }

    if (*t == '\'') { // apostrophe case.
      t += std_tz->get_apostrophe(t);
      // the stream position stays past the possessive even though the
      // token text below excludes it
      ts->t = t;
      // strip possessive
      if ((t[-1] == 's' || t[-1] == 'S') && t[-2] == '\'') t -= 2;

      tk_set_ts(ts->token, start, t, ts->text, 1);
      return ts->token;
    }

    if (*t == '&') { // company name case.
      t += std_get_company_name(t);
      ts->t = t;
      tk_set_ts(ts->token, start, t, ts->text, 1);
      return ts->token;
    }

    if (isdigit((unsigned char)*t) || isnumpunc(*t)) { // possibly a number
      num_end = start + std_get_number(start);
      if (!std_tz->is_tok_char(num_end)) { // we won't find a longer token
        ts->t = num_end;
        tk_set_ts(ts->token, start, num_end, ts->text, 1);
        return ts->token;
      }
      // else there may be a longer token so check
    }

    if (t[0] == ':' && t[1] == '/' && t[2] == '/') {
      // check for a known url start
      token[(token_i < MAX_WORD_SIZE) ? token_i : MAX_WORD_SIZE - 1] = '\0';
      t += 3;
      while (*t == '/') t++;
      if (isalpha((unsigned char)*t) &&
          (memcmp(token, "ftp", 3) == 0 ||
           memcmp(token, "http", 4) == 0 || // prefix test, covers "https" too
           memcmp(token, "file", 4) == 0)) {
        len = std_get_url(t, token, 0); // dispose of first part of the URL
      } else { // still treat as url but keep the first part
        token_i = t - start;
        copy_len = (token_i < MAX_WORD_SIZE) ? token_i : MAX_WORD_SIZE;
        memcpy(token, start, copy_len * sizeof(char));
        // NOTE(review): std_get_url() is not visible here; confirm that
        // adding token_i to its result does not count the prefix twice.
        len = token_i + std_get_url(t, token, token_i); // keep start
      }
      ts->t = t + len;
      copy_len = (len < MAX_WORD_SIZE) ? len : MAX_WORD_SIZE - 1;
      token[copy_len] = 0;
      tk_set(ts->token, token, copy_len, start - ts->text, ts->t - ts->text, 1);
      return ts->token;
    }

    // now see how long a url we can find.
    is_acronym = true;
    seen_at_symbol = false;
    while (isurlxatc(*t)) {
      if (is_acronym && !isalpha((unsigned char)*t) && (*t != '.')) {
        is_acronym = false;
      }
      if (isurlxatpunc(*t) && isurlxatpunc(t[-1])) {
        break; // can't have two punctuation characters in a row
      }
      if (*t == '@') {
        if (seen_at_symbol) {
          break; // we can only have one @ symbol
        } else {
          seen_at_symbol = true;
        }
      }
      t++;
    }
    while (isurlxatpunc(t[-1])) t--; // strip trailing punctuation

    // num_end is still NULL when no number candidate was seen; test that
    // explicitly instead of ordering a pointer against NULL
    if (num_end == NULL || t > num_end) {
      ts->t = t;

      if (is_acronym) { // check that it is one letter followed by one '.'
        for (s = start; s < t-1; s++) {
          if (isalpha((unsigned char)*s) && (s[1] != '.')) is_acronym = false;
        }
      }
      if (is_acronym) { // strip '.'s
        for (s = start + token_i; s < t; s++) {
          if (*s != '.') {
            if (token_i < MAX_WORD_SIZE) token[token_i] = *s;
            token_i++;
          }
        }
        copy_len = (token_i < MAX_WORD_SIZE) ? token_i : MAX_WORD_SIZE - 1;
        tk_set(ts->token, token, copy_len, start - ts->text, t - ts->text, 1);
      } else { // just return the url as is
        tk_set_ts(ts->token, start, t, ts->text, 1);
      }
    } else { // return the number
      ts->t = num_end;
      tk_set_ts(ts->token, start, num_end, ts->text, 1);
    }
  }

  return ts->token;
}
|
412
821
|
|
822
|
+
void std_ts_destroy(void *p)
|
823
|
+
{
|
824
|
+
TokenStream *ts = (TokenStream *)p;
|
825
|
+
free(ts->data);
|
826
|
+
ts_standard_destroy(ts);
|
827
|
+
}
|
828
|
+
|
829
|
+
/*
 * Clone hook for the standard tokenizers: gives new_ts its own copy of the
 * callback table so that destroying either stream does not free the other's
 * data.
 */
void std_ts_clone_i(TokenStream *orig_ts, TokenStream *new_ts)
{
  StandardTokenizer *copy = ALLOC(StandardTokenizer);

  *copy = *(StandardTokenizer *)orig_ts->data;
  new_ts->data = copy;
}
|
834
|
+
|
413
835
|
TokenStream *standard_tokenizer_create()
|
414
836
|
{
|
415
837
|
TokenStream *ts = ts_create();
|
838
|
+
|
839
|
+
StandardTokenizer *std_tz = ALLOC(StandardTokenizer);
|
840
|
+
std_tz->advance_to_start = &std_advance_to_start;
|
841
|
+
std_tz->get_alpha = &std_get_alpha;
|
842
|
+
std_tz->is_tok_char = &std_is_tok_char;
|
843
|
+
std_tz->get_apostrophe = &std_get_apostrophe;
|
844
|
+
|
845
|
+
ts->data = std_tz;
|
846
|
+
ts->destroy = &std_ts_destroy;
|
847
|
+
ts->clone_i = &std_ts_clone_i;
|
416
848
|
ts->next = &std_next;
|
417
849
|
return ts;
|
418
850
|
}
|
419
851
|
|
420
|
-
|
421
|
-
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
|
852
|
+
TokenStream *mb_standard_tokenizer_create()
|
853
|
+
{
|
854
|
+
TokenStream *ts = ts_create();
|
855
|
+
|
856
|
+
StandardTokenizer *std_tz = ALLOC(StandardTokenizer);
|
857
|
+
std_tz->advance_to_start = &mb_std_advance_to_start;
|
858
|
+
std_tz->get_alpha = &mb_std_get_alpha;
|
859
|
+
std_tz->is_tok_char = &w_std_is_tok_char;
|
860
|
+
std_tz->get_apostrophe = &mb_std_get_apostrophe;
|
861
|
+
|
862
|
+
ts->data = std_tz;
|
863
|
+
ts->destroy = &std_ts_destroy;
|
864
|
+
ts->clone_i = &std_ts_clone_i;
|
865
|
+
ts->next = &std_next;
|
866
|
+
return ts;
|
867
|
+
}
|
427
868
|
|
428
869
|
void filter_reset(TokenStream *ts, char *text)
|
429
870
|
{
|
@@ -432,10 +873,10 @@ void filter_reset(TokenStream *ts, char *text)
|
|
432
873
|
|
433
874
|
void filter_destroy(void *p)
|
434
875
|
{
|
435
|
-
TokenStream *
|
436
|
-
|
437
|
-
if (
|
438
|
-
free(
|
876
|
+
TokenStream *tf = (TokenStream *)p;
|
877
|
+
if (tf->destroy_sub) tf->sub_ts->destroy(tf->sub_ts);
|
878
|
+
if (tf->token != NULL) tk_destroy(tf->token);
|
879
|
+
free(tf);
|
439
880
|
}
|
440
881
|
|
441
882
|
void sf_destroy(void *p)
|
@@ -445,40 +886,109 @@ void sf_destroy(void *p)
|
|
445
886
|
filter_destroy(p);
|
446
887
|
}
|
447
888
|
|
448
|
-
|
889
|
+
void sf_clone_i_i(void *key, void *value, void *arg)
|
890
|
+
{
|
891
|
+
HshTable *wordtable = (HshTable *)arg;
|
892
|
+
char *w = estrdup(key);
|
893
|
+
h_set(wordtable, w, w);
|
894
|
+
}
|
895
|
+
|
896
|
+
/*
 * Clone hook for stop filters: gives new_ts a fresh stop-word table and
 * fills it with a private copy of every word in orig_ts's table.
 */
void sf_clone_i(TokenStream *orig_ts, TokenStream *new_ts)
{
  HshTable *copy = h_new_str(&free, NULL);

  new_ts->data = copy;
  h_each(orig_ts->data, &sf_clone_i_i, copy);
}
|
901
|
+
|
902
|
+
/*
 * Return the next token from the wrapped stream whose text is not in the
 * stop-word table, or NULL once the wrapped stream is exhausted.
 *
 * The returned token's pos_inc is set to one plus the number of stop words
 * skipped before it.
 */
Token *sf_next(TokenStream *tf)
{
  HshTable *stop_words = (HshTable *)tf->data;
  int skipped = 0;
  Token *tk;

  for (tk = tf->sub_ts->next(tf->sub_ts);
       tk != NULL;
       tk = tf->sub_ts->next(tf->sub_ts)) {
    if (h_get(stop_words, tk->text) == NULL) {
      tk->pos_inc = skipped + 1;
      return tk;
    }
    skipped++;
  }
  return NULL;
}
|
460
914
|
|
461
|
-
TokenStream *
|
915
|
+
/*
 * Wrap ts in a stop filter built from the first len entries of words.
 *
 * Each word is duplicated, so the caller keeps ownership of the array and
 * its strings. The filter takes ownership of ts (destroy_sub is set).
 *
 * Returns the new filter stream.
 */
TokenStream *stop_filter_create_with_words_len(TokenStream *ts,
                                               const char **words, int len)
{
  int idx;
  char *copy;
  TokenStream *tf = ALLOC(TokenStream);
  HshTable *wordtable = h_new_str(&free, NULL);

  // the duplicated string is both key and value; only keys are freed
  for (idx = 0; idx < len; idx++) {
    copy = estrdup(words[idx]);
    h_set(wordtable, copy, copy);
  }

  tf->sub_ts      = ts;
  tf->destroy_sub = true;
  tf->data        = wordtable;
  tf->token       = NULL;
  tf->next        = &sf_next;
  tf->reset       = &filter_reset;
  tf->destroy     = &sf_destroy;
  tf->clone_i     = &sf_clone_i;
  return tf;
}
|
936
|
+
|
937
|
+
/*
 * Wrap ts in a stop filter built from a NULL-terminated array of words.
 *
 * Each word is duplicated, so the caller keeps ownership of the array and
 * its strings. The filter takes ownership of ts (destroy_sub is set).
 *
 * Returns the new filter stream.
 */
TokenStream *stop_filter_create_with_words(TokenStream *ts, const char **words)
{
  char *copy;
  TokenStream *tf = ALLOC(TokenStream);
  HshTable *wordtable = h_new_str(&free, NULL);

  // the duplicated string is both key and value; only keys are freed
  for (; *words != NULL; words++) {
    copy = estrdup(*words);
    h_set(wordtable, copy, copy);
  }

  tf->sub_ts      = ts;
  tf->destroy_sub = true;
  tf->data        = wordtable;
  tf->token       = NULL;
  tf->next        = &sf_next;
  tf->reset       = &filter_reset;
  tf->destroy     = &sf_destroy;
  tf->clone_i     = &sf_clone_i;
  return tf;
}
|
477
957
|
|
478
958
|
/*
 * Wrap ts in a stop filter that uses the FULL_ENGLISH_STOP_WORDS list.
 */
TokenStream *stop_filter_create(TokenStream *ts)
{
  TokenStream *filter;

  filter = stop_filter_create_with_words(ts, FULL_ENGLISH_STOP_WORDS);
  return filter;
}
|
962
|
+
|
963
|
+
/*
 * Multibyte lowercase filter: fetch the next token from the wrapped stream
 * and lowercase its text in place, going through wide characters so that
 * towlower() sees whole characters.
 *
 * Returns the token, or NULL when the wrapped stream is exhausted. When the
 * text cannot be converted in the current locale the token is returned
 * unchanged.
 */
Token *mb_lcf_next(TokenStream *ts)
{
  wchar_t wbuf[MAX_WORD_SIZE];
  char lowered[MAX_WORD_SIZE];
  size_t wlen;
  size_t blen;
  size_t i;
  Token *tk = ts->sub_ts->next(ts->sub_ts);
  if (tk == NULL) return tk;

  // mbstowcs() returns (size_t)-1 on an invalid sequence and then leaves
  // wbuf unspecified, so bail out before reading it
  wlen = mbstowcs(wbuf, tk->text, MAX_WORD_SIZE);
  if (wlen == (size_t)-1) return tk;

  // no terminator is written when the buffer is filled completely
  if (wlen >= MAX_WORD_SIZE) wlen = MAX_WORD_SIZE - 1;
  wbuf[wlen] = L'\0';

  for (i = 0; i < wlen; i++) {
    wbuf[i] = towlower(wbuf[i]);
  }

  // Convert into a scratch buffer so a failed conversion cannot leave
  // tk->text half written. Limiting the output to MAX_WORD_SIZE - 1 bytes
  // keeps room for the terminator; wcstombs() never stores a partial
  // character.
  blen = wcstombs(lowered, wbuf, MAX_WORD_SIZE - 1);
  if (blen == (size_t)-1) return tk;
  lowered[blen] = '\0';

  memcpy(tk->text, lowered, blen + 1);
  return tk;
}
|
980
|
+
|
981
|
+
/*
 * Wrap ts in a multibyte-aware lowercase filter (see mb_lcf_next()).
 *
 * The filter takes ownership of ts (destroy_sub is set) and has no clone
 * hook of its own.
 */
TokenStream *mb_lowercase_filter_create(TokenStream *ts)
{
  TokenStream *filter = ALLOC(TokenStream);

  filter->sub_ts      = ts;
  filter->destroy_sub = true;
  filter->token       = NULL;
  filter->clone_i     = NULL;
  filter->next        = &mb_lcf_next;
  filter->reset       = &filter_reset;
  filter->destroy     = &filter_destroy;
  return filter;
}
|
483
993
|
|
484
994
|
Token *lcf_next(TokenStream *ts)
|
@@ -501,48 +1011,199 @@ TokenStream *lowercase_filter_create(TokenStream *ts)
|
|
501
1011
|
tf->reset = &filter_reset;
|
502
1012
|
tf->destroy = &filter_destroy;
|
503
1013
|
tf->sub_ts = ts;
|
1014
|
+
tf->destroy_sub = true;
|
1015
|
+
tf->clone_i = NULL;
|
504
1016
|
return tf;
|
505
1017
|
}
|
506
1018
|
|
507
|
-
|
1019
|
+
/*
 * State stored in TokenStream.data by the stem filter.
 *
 * The algorithm and encoding names are kept alongside the stemmer so that
 * stemf_clone_i() can create an independent stemmer for a cloned stream.
 */
typedef struct StemFilter {
  struct sb_stemmer *stemmer; // handle created by sb_stemmer_new()
  char *algorithm;            // private copy of the algorithm name, or NULL
  char *charenc;              // private copy of the encoding name, or NULL
} StemFilter;
|
1024
|
+
|
1025
|
+
void stemf_destroy(void *p)
|
508
1026
|
{
|
509
|
-
|
510
|
-
|
511
|
-
|
512
|
-
|
513
|
-
|
514
|
-
|
1027
|
+
TokenStream *ts = (TokenStream *)p;
|
1028
|
+
StemFilter *stemf = (StemFilter *)ts->data;
|
1029
|
+
sb_stemmer_delete(stemf->stemmer);
|
1030
|
+
free(stemf->algorithm);
|
1031
|
+
free(stemf->charenc);
|
1032
|
+
free(stemf);
|
1033
|
+
filter_destroy(ts);
|
515
1034
|
}
|
516
1035
|
|
1036
|
+
/*
 * Stem filter: fetch the next token from the wrapped stream and replace its
 * text with the stem produced by the filter's stemmer.
 *
 * Returns the token, or NULL when the wrapped stream is exhausted. The
 * token is passed through unchanged when no stemmer is available or when
 * stemming fails.
 */
Token *stemf_next(TokenStream *ts)
{
  int len;
  const sb_symbol *stemmed;
  struct sb_stemmer *stemmer = ((StemFilter *)ts->data)->stemmer;
  Token *tk = ts->sub_ts->next(ts->sub_ts);
  if (tk == NULL) return tk;

  // sb_stemmer_new() returns NULL for an unrecognised algorithm or encoding
  if (stemmer == NULL) return tk;

  stemmed = sb_stemmer_stem(stemmer, (sb_symbol *)tk->text,
                            (int)strlen(tk->text));
  // sb_stemmer_stem() returns NULL when it runs out of memory
  if (stemmed == NULL) return tk;

  len = sb_stemmer_length(stemmer);
  // keep room for the terminator in tk->text
  if (len >= MAX_WORD_SIZE) len = MAX_WORD_SIZE - 1;
  memcpy(tk->text, stemmed, len);
  tk->text[len] = '\0';
  return tk;
}
|
517
1050
|
|
518
|
-
|
1051
|
+
/*
 * Clone hook for stem filters: builds a new StemFilter for new_ts with its
 * own stemmer and its own copies of the algorithm and encoding names taken
 * from orig_ts.
 */
void stemf_clone_i(TokenStream *orig_ts, TokenStream *new_ts)
{
  StemFilter *src = (StemFilter *)orig_ts->data;
  StemFilter *dup = ALLOC(StemFilter);

  dup->stemmer = sb_stemmer_new(src->algorithm, src->charenc);

  dup->algorithm = NULL;
  if (src->algorithm) {
    dup->algorithm = estrdup(src->algorithm);
  }

  dup->charenc = NULL;
  if (src->charenc) {
    dup->charenc = estrdup(src->charenc);
  }

  new_ts->data = dup;
}
|
1060
|
+
|
1061
|
+
/*
 * Wrap ts in a stem filter.
 *
 * ts        - stream to wrap; the filter takes ownership (destroy_sub is set)
 * algorithm - algorithm name handed to sb_stemmer_new(); may be NULL
 * charenc   - encoding name handed to sb_stemmer_new(); may be NULL
 *
 * Both names are copied so the filter can be cloned later.
 *
 * NOTE(review): the result of sb_stemmer_new() is stored without a NULL
 * check -- confirm how an unknown algorithm should be reported.
 */
TokenStream *stem_filter_create(TokenStream *ts, const char * algorithm,
                                const char * charenc)
{
  TokenStream *tf = ALLOC(TokenStream);
  StemFilter *state = ALLOC(StemFilter);

  state->stemmer = sb_stemmer_new(algorithm, charenc);

  state->algorithm = NULL;
  if (algorithm) {
    state->algorithm = estrdup(algorithm);
  }

  state->charenc = NULL;
  if (charenc) {
    state->charenc = estrdup(charenc);
  }

  tf->data        = state;
  tf->sub_ts      = ts;
  tf->destroy_sub = true;
  tf->token       = NULL;
  tf->next        = &stemf_next;
  tf->reset       = &filter_reset;
  tf->destroy     = &stemf_destroy;
  tf->clone_i     = &stemf_clone_i;
  return tf;
}
|
1080
|
+
|
1081
|
+
Analyzer *standard_analyzer_create_with_words_len(
|
1082
|
+
const char **words, int len, bool lowercase)
|
1083
|
+
{
|
1084
|
+
TokenStream *ts;
|
1085
|
+
if (lowercase) {
|
1086
|
+
ts = stop_filter_create_with_words_len(
|
524
1087
|
lowercase_filter_create(standard_tokenizer_create()), words, len);
|
525
|
-
|
526
|
-
|
527
|
-
|
1088
|
+
} else {
|
1089
|
+
ts = stop_filter_create_with_words_len(
|
1090
|
+
standard_tokenizer_create(), words, len);
|
1091
|
+
}
|
1092
|
+
return analyzer_create(NULL, ts, NULL, NULL);
|
1093
|
+
}
|
1094
|
+
|
1095
|
+
Analyzer *standard_analyzer_create_with_words(const char **words, bool lowercase)
|
1096
|
+
{
|
1097
|
+
TokenStream *ts;
|
1098
|
+
if (lowercase) {
|
1099
|
+
ts = stop_filter_create_with_words(
|
1100
|
+
lowercase_filter_create(standard_tokenizer_create()), words);
|
1101
|
+
} else {
|
1102
|
+
ts = stop_filter_create_with_words(
|
1103
|
+
standard_tokenizer_create(), words);
|
1104
|
+
}
|
1105
|
+
return analyzer_create(NULL, ts, NULL, NULL);
|
1106
|
+
}
|
1107
|
+
|
1108
|
+
Analyzer *mb_standard_analyzer_create_with_words_len(
|
1109
|
+
const char **words, int len, bool lowercase)
|
1110
|
+
{
|
1111
|
+
TokenStream *ts;
|
1112
|
+
if (lowercase) {
|
1113
|
+
ts = stop_filter_create_with_words_len(
|
1114
|
+
mb_lowercase_filter_create(mb_standard_tokenizer_create()), words, len);
|
1115
|
+
} else {
|
1116
|
+
ts = stop_filter_create_with_words_len(
|
1117
|
+
mb_standard_tokenizer_create(), words, len);
|
1118
|
+
}
|
1119
|
+
return analyzer_create(NULL, ts, NULL, NULL);
|
1120
|
+
}
|
1121
|
+
|
1122
|
+
Analyzer *mb_standard_analyzer_create_with_words(
|
1123
|
+
const char **words, bool lowercase)
|
1124
|
+
{
|
1125
|
+
TokenStream *ts;
|
1126
|
+
if (lowercase) {
|
1127
|
+
ts = stop_filter_create_with_words(
|
1128
|
+
mb_lowercase_filter_create(mb_standard_tokenizer_create()), words);
|
1129
|
+
} else {
|
1130
|
+
ts = stop_filter_create_with_words(mb_standard_tokenizer_create(), words);
|
1131
|
+
}
|
1132
|
+
return analyzer_create(NULL, ts, NULL, NULL);
|
1133
|
+
}
|
1134
|
+
|
1135
|
+
Analyzer *standard_analyzer_create(bool lowercase)
|
1136
|
+
{
|
1137
|
+
return standard_analyzer_create_with_words(FULL_ENGLISH_STOP_WORDS, lowercase);
|
1138
|
+
}
|
1139
|
+
|
1140
|
+
Analyzer *mb_standard_analyzer_create(bool lowercase)
|
1141
|
+
{
|
1142
|
+
return mb_standard_analyzer_create_with_words(FULL_ENGLISH_STOP_WORDS, lowercase);
|
1143
|
+
}
|
1144
|
+
|
1145
|
+
/****************************************************************************
|
1146
|
+
*
|
1147
|
+
* PerFieldAnalyzer
|
1148
|
+
*
|
1149
|
+
****************************************************************************/
|
1150
|
+
|
1151
|
+
typedef struct PerFieldAnalyzer {
|
1152
|
+
HshTable *dict;
|
1153
|
+
Analyzer *def;
|
1154
|
+
bool destroy_subs : 1;
|
1155
|
+
} PerFieldAnalyzer;
|
1156
|
+
|
1157
|
+
void pfa_destroy(void *p)
|
1158
|
+
{
|
1159
|
+
Analyzer *self = (Analyzer *)p;
|
1160
|
+
PerFieldAnalyzer *pfa = (PerFieldAnalyzer *)self->data;
|
1161
|
+
h_destroy(pfa->dict);
|
1162
|
+
|
1163
|
+
if (pfa->destroy_subs) a_destroy(pfa->def);
|
1164
|
+
free(pfa);
|
1165
|
+
free(self);
|
1166
|
+
}
|
1167
|
+
|
1168
|
+
/*
 * get_ts callback for per-field analyzers: picks the analyzer registered
 * for field, falling back to the default, and asks it for a token stream
 * over text.
 */
TokenStream *pfa_get_ts(Analyzer *self, char *field, char *text)
{
  PerFieldAnalyzer *state = (PerFieldAnalyzer *)self->data;
  Analyzer *chosen = h_get(state->dict, field);

  return a_get_ts(chosen != NULL ? chosen : state->def, field, text);
}
|
1175
|
+
|
1176
|
+
void pfa_sub_a_destroy(void *p)
|
1177
|
+
{
|
1178
|
+
Analyzer *a = (Analyzer *)p;
|
1179
|
+
a->destroy(a);
|
1180
|
+
}
|
1181
|
+
|
1182
|
+
/*
 * Register analyzer as the one to use for field.
 *
 * The field name is duplicated for use as the table key; the analyzer
 * pointer is stored as given.
 */
void pfa_add_field(Analyzer *self, char *field, Analyzer *analyzer)
{
  PerFieldAnalyzer *state = (PerFieldAnalyzer *)self->data;
  char *key = estrdup(field);

  h_set(state->dict, key, analyzer);
}
|
529
1187
|
|
530
|
-
Analyzer *
|
1188
|
+
/*
 * Create a per-field analyzer.
 *
 * def          - analyzer used for fields with no specific entry
 * destroy_subs - when true, the field table destroys its analyzers through
 *                pfa_sub_a_destroy() and pfa_destroy() destroys def
 *
 * Returns an Analyzer whose data is the PerFieldAnalyzer state and whose
 * destroy/get_ts callbacks are pfa_destroy() and pfa_get_ts().
 */
Analyzer *per_field_analyzer_create(Analyzer *def, bool destroy_subs)
{
  PerFieldAnalyzer *state = ALLOC(PerFieldAnalyzer);

  state->def = def;
  state->destroy_subs = destroy_subs;

  // keys are always owned by the table; values only when requested
  if (destroy_subs) {
    state->dict = h_new_str(&free, &pfa_sub_a_destroy);
  } else {
    state->dict = h_new_str(&free, NULL);
  }

  return analyzer_create(state, NULL, &pfa_destroy, &pfa_get_ts);
}
|
535
1197
|
|
536
1198
|
#ifdef ALONE
|
537
1199
|
int main(int argc, char **argv)
|
538
1200
|
{
|
539
1201
|
char buf[10000];
|
540
|
-
Analyzer *a = standard_analyzer_create();
|
1202
|
+
Analyzer *a = standard_analyzer_create(true);
|
541
1203
|
TokenStream *ts;
|
542
1204
|
Token *tk;
|
543
1205
|
while (fgets(buf, 9999, stdin) != NULL) {
|
544
1206
|
ts = a->get_ts(a, "hello", buf);
|
545
|
-
ts->pos = 0;
|
546
1207
|
while ((tk = ts->next(ts)) != NULL) {
|
547
1208
|
printf("<%s:%ld:%ld> ", tk->text, tk->start, tk->end);
|
548
1209
|
}
|