ferret 0.2.2 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/ext/Makefile +2 -2
- data/ext/ferret.c +27 -2
- data/ext/ferret.h +59 -16
- data/ext/ferret_ext.so +0 -0
- data/ext/index_io.c +72 -77
- data/ext/priority_queue.c +150 -145
- data/ext/ram_directory.c +47 -42
- data/ext/segment_merge_queue.c +4 -8
- data/ext/segment_term_enum.c +324 -0
- data/ext/similarity.c +59 -0
- data/ext/string_helper.c +2 -2
- data/ext/tags +150 -46
- data/ext/term.c +107 -152
- data/ext/term_buffer.c +105 -174
- data/ext/term_infos_reader.c +54 -0
- data/ext/terminfo.c +160 -0
- data/ext/token.c +93 -0
- data/lib/ferret.rb +1 -1
- data/lib/ferret/analysis/analyzers.rb +18 -0
- data/lib/ferret/analysis/standard_tokenizer.rb +19 -14
- data/lib/ferret/analysis/token.rb +8 -1
- data/lib/ferret/analysis/tokenizers.rb +10 -5
- data/lib/ferret/document/field.rb +33 -11
- data/lib/ferret/index/document_writer.rb +3 -2
- data/lib/ferret/index/field_infos.rb +38 -12
- data/lib/ferret/index/fields_io.rb +10 -4
- data/lib/ferret/index/index.rb +20 -4
- data/lib/ferret/index/index_reader.rb +19 -4
- data/lib/ferret/index/index_writer.rb +1 -1
- data/lib/ferret/index/multi_reader.rb +21 -7
- data/lib/ferret/index/segment_merge_info.rb +24 -22
- data/lib/ferret/index/segment_merge_queue.rb +2 -2
- data/lib/ferret/index/segment_merger.rb +28 -11
- data/lib/ferret/index/segment_reader.rb +19 -4
- data/lib/ferret/index/segment_term_enum.rb +3 -11
- data/lib/ferret/index/term_buffer.rb +13 -16
- data/lib/ferret/index/term_doc_enum.rb +8 -5
- data/lib/ferret/index/term_enum.rb +2 -2
- data/lib/ferret/index/term_info.rb +1 -5
- data/lib/ferret/index/term_infos_io.rb +2 -0
- data/lib/ferret/query_parser/query_parser.tab.rb +7 -7
- data/lib/ferret/search/phrase_scorer.rb +0 -1
- data/lib/ferret/search/similarity.rb +2 -2
- data/lib/ferret/search/term_scorer.rb +2 -2
- data/lib/ferret/store/directory.rb +2 -0
- data/lib/ferret/store/fs_store.rb +16 -3
- data/lib/ferret/store/ram_store.rb +2 -2
- data/test/unit/document/tc_field.rb +9 -0
- data/test/unit/index/tc_field_infos.rb +29 -21
- data/test/unit/index/tc_index.rb +44 -7
- data/test/unit/index/tc_term_buffer.rb +3 -3
- data/test/unit/index/tc_term_info.rb +1 -1
- data/test/unit/query_parser/tc_query_parser.rb +1 -1
- data/test/unit/search/tc_index_searcher.rb +3 -0
- data/test/unit/store/tc_fs_store.rb +47 -16
- data/test/unit/store/tc_ram_store.rb +1 -1
- metadata +8 -3
@@ -0,0 +1,54 @@
|
|
1
|
+
#include "ferret.h"
|
2
|
+
|
3
|
+
static ID frt_id_index_terms;
|
4
|
+
/****************************************************************************
|
5
|
+
*
|
6
|
+
* TermInfosReader Methods
|
7
|
+
*
|
8
|
+
****************************************************************************/
|
9
|
+
|
10
|
+
static VALUE
|
11
|
+
frt_tir_get_index_offset(VALUE self, VALUE rterm)
|
12
|
+
{
|
13
|
+
VALUE index_terms = rb_ivar_get(self, frt_id_index_terms);
|
14
|
+
|
15
|
+
register int lo = 0; // binary search @index_terms[]
|
16
|
+
register int hi = RARRAY(index_terms)->len - 1;
|
17
|
+
register int mid, delta;
|
18
|
+
|
19
|
+
Term *term, *tmp_term;
|
20
|
+
Data_Get_Struct(rterm, Term, term);
|
21
|
+
|
22
|
+
while (hi >= lo) {
|
23
|
+
mid = (lo + hi) >> 1;
|
24
|
+
|
25
|
+
Data_Get_Struct(RARRAY(index_terms)->ptr[mid], Term, tmp_term);
|
26
|
+
delta = frt_term_cmp(term, tmp_term);
|
27
|
+
if (delta < 0) {
|
28
|
+
hi = mid - 1;
|
29
|
+
} else if (delta > 0) {
|
30
|
+
lo = mid + 1;
|
31
|
+
} else {
|
32
|
+
return INT2FIX(mid);
|
33
|
+
}
|
34
|
+
}
|
35
|
+
return INT2FIX(hi);
|
36
|
+
}
|
37
|
+
|
38
|
+
/****************************************************************************
|
39
|
+
*
|
40
|
+
* Init Function
|
41
|
+
*
|
42
|
+
****************************************************************************/
|
43
|
+
|
44
|
+
void
|
45
|
+
Init_term_infos_reader(void)
|
46
|
+
{
|
47
|
+
/* IDs */
|
48
|
+
frt_id_index_terms = rb_intern("@index_terms");
|
49
|
+
|
50
|
+
/* TermInfosReader */
|
51
|
+
cTermInfosReader = rb_define_class_under(mIndex, "TermInfosReader", rb_cObject);
|
52
|
+
|
53
|
+
rb_define_method(cTermInfosReader, "get_index_offset", frt_tir_get_index_offset, 1);
|
54
|
+
}
|
data/ext/terminfo.c
ADDED
@@ -0,0 +1,160 @@
|
|
1
|
+
#include "ferret.h"
|
2
|
+
|
3
|
+
|
4
|
+
/****************************************************************************
|
5
|
+
*
|
6
|
+
* TermInfo Methods
|
7
|
+
*
|
8
|
+
****************************************************************************/
|
9
|
+
|
10
|
+
/*
 * Free function handed to Data_Wrap_Struct for TermInfo objects.
 *
 * The struct is obtained through Ruby's ALLOC (xmalloc), so it is released
 * with the matching xfree rather than the C library's free().
 */
void
frt_ti_free(void *p)
{
  xfree(p);
}
|
15
|
+
|
16
|
+
static VALUE
|
17
|
+
frt_ti_alloc(VALUE klass)
|
18
|
+
{
|
19
|
+
TermInfo *ti = (TermInfo *)ALLOC(TermInfo);
|
20
|
+
VALUE rbuffer = Data_Wrap_Struct(klass, NULL, frt_ti_free, ti);
|
21
|
+
return rbuffer;
|
22
|
+
}
|
23
|
+
|
24
|
+
#define GET_TI TermInfo *ti; Data_Get_Struct(self, TermInfo, ti)
|
25
|
+
inline VALUE
|
26
|
+
frt_ti_set(int argc, VALUE *argv, VALUE self)
|
27
|
+
{
|
28
|
+
VALUE df, fp, pp, so;
|
29
|
+
GET_TI;
|
30
|
+
MEMZERO(ti, TermInfo, 1);
|
31
|
+
rb_scan_args(argc, argv, "04", &df, &fp, &pp, &so);
|
32
|
+
switch (argc) {
|
33
|
+
case 4:
|
34
|
+
ti->skip_offset = FIX2INT(so);
|
35
|
+
case 3:
|
36
|
+
ti->prox_pointer = FIX2INT(pp);
|
37
|
+
case 2:
|
38
|
+
ti->freq_pointer = FIX2INT(fp);
|
39
|
+
case 1:
|
40
|
+
ti->doc_freq = FIX2INT(df);
|
41
|
+
case 0:
|
42
|
+
break;
|
43
|
+
}
|
44
|
+
return Qnil;
|
45
|
+
}
|
46
|
+
|
47
|
+
static VALUE
|
48
|
+
frt_ti_init(int argc, VALUE *argv, VALUE self)
|
49
|
+
{
|
50
|
+
frt_ti_set(argc, argv, self);
|
51
|
+
return self;
|
52
|
+
}
|
53
|
+
|
54
|
+
static VALUE
|
55
|
+
frt_ti_init_copy(VALUE self, VALUE rother)
|
56
|
+
{
|
57
|
+
TermInfo *other_ti;
|
58
|
+
GET_TI;
|
59
|
+
Data_Get_Struct(rother, TermInfo, other_ti);
|
60
|
+
MEMCPY(ti, other_ti, TermInfo, 1);
|
61
|
+
return self;
|
62
|
+
}
|
63
|
+
|
64
|
+
static VALUE
|
65
|
+
frt_ti_eql(VALUE self, VALUE rother)
|
66
|
+
{
|
67
|
+
TermInfo *other_ti;
|
68
|
+
GET_TI;
|
69
|
+
if (NIL_P(rother)) return Qfalse;
|
70
|
+
Data_Get_Struct(rother, TermInfo, other_ti);
|
71
|
+
return (MEMCMP(ti, other_ti, TermInfo, 1) == 0) ? Qtrue : Qfalse;
|
72
|
+
}
|
73
|
+
|
74
|
+
static VALUE
|
75
|
+
frt_ti_get_df(VALUE self)
|
76
|
+
{
|
77
|
+
GET_TI;
|
78
|
+
return INT2FIX(ti->doc_freq);
|
79
|
+
}
|
80
|
+
|
81
|
+
static VALUE
|
82
|
+
frt_ti_get_fp(VALUE self)
|
83
|
+
{
|
84
|
+
GET_TI;
|
85
|
+
return INT2FIX(ti->freq_pointer);
|
86
|
+
}
|
87
|
+
|
88
|
+
static VALUE
|
89
|
+
frt_ti_get_pp(VALUE self)
|
90
|
+
{
|
91
|
+
GET_TI;
|
92
|
+
return INT2FIX(ti->prox_pointer);
|
93
|
+
}
|
94
|
+
|
95
|
+
static VALUE
|
96
|
+
frt_ti_get_so(VALUE self)
|
97
|
+
{
|
98
|
+
GET_TI;
|
99
|
+
return INT2FIX(ti->skip_offset);
|
100
|
+
}
|
101
|
+
|
102
|
+
static VALUE
|
103
|
+
frt_ti_set_df(VALUE self, VALUE val)
|
104
|
+
{
|
105
|
+
GET_TI;
|
106
|
+
ti->doc_freq = FIX2INT(val);
|
107
|
+
return Qnil;
|
108
|
+
}
|
109
|
+
|
110
|
+
static VALUE
|
111
|
+
frt_ti_set_fp(VALUE self, VALUE val)
|
112
|
+
{
|
113
|
+
GET_TI;
|
114
|
+
ti->freq_pointer = FIX2INT(val);
|
115
|
+
return Qnil;
|
116
|
+
}
|
117
|
+
|
118
|
+
static VALUE
|
119
|
+
frt_ti_set_pp(VALUE self, VALUE val)
|
120
|
+
{
|
121
|
+
GET_TI;
|
122
|
+
ti->prox_pointer = FIX2INT(val);
|
123
|
+
return Qnil;
|
124
|
+
}
|
125
|
+
|
126
|
+
static VALUE
|
127
|
+
frt_ti_set_so(VALUE self, VALUE val)
|
128
|
+
{
|
129
|
+
GET_TI;
|
130
|
+
ti->skip_offset = FIX2INT(val);
|
131
|
+
return Qnil;
|
132
|
+
}
|
133
|
+
|
134
|
+
/****************************************************************************
|
135
|
+
*
|
136
|
+
* Init Function
|
137
|
+
*
|
138
|
+
****************************************************************************/
|
139
|
+
|
140
|
+
void
|
141
|
+
Init_term_info(void)
|
142
|
+
{
|
143
|
+
/* TermInfo */
|
144
|
+
cTermInfo = rb_define_class_under(mIndex, "TermInfo", rb_cObject);
|
145
|
+
rb_define_alloc_func(cTermInfo, frt_ti_alloc);
|
146
|
+
|
147
|
+
rb_define_method(cTermInfo, "initialize", frt_ti_init, -1);
|
148
|
+
rb_define_method(cTermInfo, "set_values!", frt_ti_set, -1);
|
149
|
+
rb_define_method(cTermInfo, "initialize_copy", frt_ti_init_copy, 1);
|
150
|
+
rb_define_method(cTermInfo, "set!", frt_ti_init_copy, 1);
|
151
|
+
rb_define_method(cTermInfo, "==", frt_ti_eql, 1);
|
152
|
+
rb_define_method(cTermInfo, "doc_freq", frt_ti_get_df, 0);
|
153
|
+
rb_define_method(cTermInfo, "doc_freq=", frt_ti_set_df, 1);
|
154
|
+
rb_define_method(cTermInfo, "freq_pointer", frt_ti_get_fp, 0);
|
155
|
+
rb_define_method(cTermInfo, "freq_pointer=", frt_ti_set_fp, 1);
|
156
|
+
rb_define_method(cTermInfo, "prox_pointer", frt_ti_get_pp, 0);
|
157
|
+
rb_define_method(cTermInfo, "prox_pointer=", frt_ti_set_pp, 1);
|
158
|
+
rb_define_method(cTermInfo, "skip_offset", frt_ti_get_so, 0);
|
159
|
+
rb_define_method(cTermInfo, "skip_offset=", frt_ti_set_so, 1);
|
160
|
+
}
|
data/ext/token.c
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
#include "ferret.h"
|
2
|
+
|
3
|
+
/****************************************************************************
|
4
|
+
*
|
5
|
+
* Token Methods
|
6
|
+
*
|
7
|
+
****************************************************************************/
|
8
|
+
|
9
|
+
ID id_tk_text, id_tk_pos_inc, id_tk_start_offset, id_tk_end_offset, id_tk_type;
|
10
|
+
ID id_tk_pos_inc_set;
|
11
|
+
|
12
|
+
static VALUE
|
13
|
+
frt_token_pos_inc (VALUE self, VALUE pI)
|
14
|
+
{
|
15
|
+
if(FIX2INT(pI) < 0)
|
16
|
+
rb_raise(rb_eArgError, "position_increment < 0");
|
17
|
+
rb_ivar_set(self, id_tk_pos_inc, pI);
|
18
|
+
return self;
|
19
|
+
}
|
20
|
+
|
21
|
+
static VALUE
|
22
|
+
frt_token_init(int argc, VALUE *argv, VALUE self)
|
23
|
+
{
|
24
|
+
VALUE text, start_offset, end_offset, type, pos_inc;
|
25
|
+
rb_scan_args(argc, argv, "32", &text,
|
26
|
+
&start_offset, &end_offset, &type, &pos_inc);
|
27
|
+
rb_ivar_set(self, id_tk_text, text);
|
28
|
+
rb_ivar_set(self, id_tk_start_offset, start_offset);
|
29
|
+
rb_ivar_set(self, id_tk_end_offset, end_offset);
|
30
|
+
if (argc < 4) {
|
31
|
+
rb_ivar_set(self, id_tk_type, rb_str_new("word", 4));
|
32
|
+
} else {
|
33
|
+
rb_ivar_set(self, id_tk_type, type);
|
34
|
+
}
|
35
|
+
if (argc < 5) {
|
36
|
+
rb_ivar_set(self, id_tk_pos_inc, INT2FIX(1));
|
37
|
+
} else {
|
38
|
+
rb_ivar_set(self, id_tk_pos_inc, pos_inc);
|
39
|
+
}
|
40
|
+
return self;
|
41
|
+
}
|
42
|
+
|
43
|
+
static VALUE
|
44
|
+
frt_token_eql(VALUE self, VALUE other)
|
45
|
+
{
|
46
|
+
VALUE rself_text, rother_text;
|
47
|
+
char *self_text, *other_text;
|
48
|
+
if (!rb_respond_to(other, id_tk_pos_inc_set))
|
49
|
+
return Qfalse;
|
50
|
+
rself_text = rb_ivar_get(self, id_tk_text);
|
51
|
+
rother_text = rb_ivar_get(other, id_tk_text);
|
52
|
+
self_text = StringValuePtr(rself_text);
|
53
|
+
other_text = StringValuePtr(rother_text);
|
54
|
+
if (rb_ivar_get(self, id_tk_start_offset) == rb_ivar_get(other, id_tk_start_offset) &&
|
55
|
+
rb_ivar_get(self, id_tk_end_offset) == rb_ivar_get(other, id_tk_end_offset) &&
|
56
|
+
(strcmp(self_text, other_text) == 0))
|
57
|
+
return Qtrue;
|
58
|
+
else
|
59
|
+
return Qfalse;
|
60
|
+
}
|
61
|
+
|
62
|
+
/****************************************************************************
|
63
|
+
*
|
64
|
+
* Init Function
|
65
|
+
*
|
66
|
+
****************************************************************************/
|
67
|
+
|
68
|
+
void
|
69
|
+
Init_token(void)
|
70
|
+
{
|
71
|
+
/* IDs */
|
72
|
+
id_tk_text = rb_intern("@term_text");
|
73
|
+
id_tk_start_offset = rb_intern("@start_offset");
|
74
|
+
id_tk_end_offset = rb_intern("@end_offset");
|
75
|
+
id_tk_type = rb_intern("@type");
|
76
|
+
id_tk_pos_inc = rb_intern("@position_increment");
|
77
|
+
id_tk_pos_inc_set = rb_intern("position_increment=");
|
78
|
+
|
79
|
+
|
80
|
+
/* IndexWriter */
|
81
|
+
cToken = rb_define_class_under(mAnalysis, "Token", rb_cObject);
|
82
|
+
|
83
|
+
rb_define_method(cToken, "initialize", frt_token_init, -1);
|
84
|
+
rb_define_method(cToken, "position_increment=", frt_token_pos_inc, 1);
|
85
|
+
rb_define_method(cToken, "==", frt_token_eql, 1);
|
86
|
+
rb_define_method(cToken, "eql", frt_token_eql, 1);
|
87
|
+
|
88
|
+
rb_define_attr(cToken, "term_text", 1, 1);
|
89
|
+
rb_define_attr(cToken, "position_increment", 1, 0);
|
90
|
+
rb_define_attr(cToken, "start_offset", 1, 0);
|
91
|
+
rb_define_attr(cToken, "end_offset", 1, 0);
|
92
|
+
rb_define_attr(cToken, "type", 1, 1);
|
93
|
+
}
|
data/lib/ferret.rb
CHANGED
@@ -17,6 +17,24 @@ module Ferret::Analysis
|
|
17
17
|
def token_stream(field, string)
|
18
18
|
return LowerCaseTokenizer.new(string)
|
19
19
|
end
|
20
|
+
|
21
|
+
# Invoked before indexing a Field instance if
|
22
|
+
# terms have already been added to that field. This allows custom
|
23
|
+
# analyzers to place an automatic position increment gap between
|
24
|
+
# Field instances using the same field name. The default
|
25
|
+
# position increment gap is 0. With a 0 position increment gap and
|
26
|
+
# the typical default token position increment of 1, all terms in a field,
|
27
|
+
# including across Field instances, are in successive positions, allowing
|
28
|
+
# exact PhraseQuery matches, for instance, across Field instance boundaries.
|
29
|
+
#
|
30
|
+
# field_name:: Field name being indexed.
|
31
|
+
# position_increment_gap:: added to the next token emitted from
|
32
|
+
# #token_stream(field, string)
|
33
|
+
#
|
34
|
+
def position_increment_gap(field_name)
  # Default gap: none, regardless of the field name.
  0
end
|
37
|
+
|
20
38
|
end
|
21
39
|
|
22
40
|
# An Analyzer that uses WhiteSpaceTokenizer.
|
@@ -18,7 +18,21 @@ module Ferret::Analysis
|
|
18
18
|
ACRONYM = /#{ALPHA}\.(#{ALPHA}\.)+/
|
19
19
|
P = /[_\/.,-]/
|
20
20
|
HASDIGIT = /\w*\d\w*/
|
21
|
+
TOKEN_RE = /[[:alpha:]]+(('[[:alpha:]]+)+
|
22
|
+
|\.([[:alpha:]]\.)+
|
23
|
+
|(@|\&)\w+([-.]\w+)*
|
24
|
+
)
|
25
|
+
|\w+(([\-._]\w+)*\@\w+([-.]\w+)+
|
26
|
+
|#{P}#{HASDIGIT}(#{P}\w+#{P}#{HASDIGIT})*(#{P}\w+)?
|
27
|
+
|(\.\w+)+
|
28
|
+
|
|
29
|
+
)
|
30
|
+
/x
|
21
31
|
|
32
|
+
ACRONYM_WORD = /^#{ACRONYM}$/
|
33
|
+
APOSTROPHE_WORD = /^#{APOSTROPHE}$/
|
34
|
+
DOT = /\./
|
35
|
+
APOSTROPHE_S = /'[sS]$/
|
22
36
|
protected
|
23
37
|
|
24
38
|
# Collects only characters which are not spaces, tabs or carriage returns
|
@@ -27,24 +41,15 @@ module Ferret::Analysis
|
|
27
41
|
# This is a simplified version of the original Lucene standard
|
28
42
|
# tokenizer. I think it works better. I hope so anyway. Any way to
|
29
43
|
# do this more neatly?
|
30
|
-
|
31
|
-
|\.([[:alpha:]]\.)+
|
32
|
-
|(@|\&)\w+([-.]\w+)*
|
33
|
-
)
|
34
|
-
|\w+(([\-._]\w+)*\@\w+([-.]\w+)+
|
35
|
-
|#{P}#{HASDIGIT}(#{P}\w+#{P}#{HASDIGIT})*(#{P}\w+)?
|
36
|
-
|(\.\w+)+
|
37
|
-
|
|
38
|
-
)
|
39
|
-
/x
|
44
|
+
TOKEN_RE
|
40
45
|
end
|
41
46
|
|
42
47
|
# stem the 's and remove the '.'s from acronyms
|
43
48
|
def normalize(str)
|
44
|
-
if str =~
|
45
|
-
str.gsub!(
|
46
|
-
elsif str =~
|
47
|
-
str.gsub!(
|
49
|
+
if str =~ ACRONYM_WORD
|
50
|
+
str.gsub!(DOT, '')
|
51
|
+
elsif str =~ APOSTROPHE_WORD
|
52
|
+
str.gsub!(APOSTROPHE_S, '')
|
48
53
|
end
|
49
54
|
str
|
50
55
|
end
|
@@ -35,9 +35,16 @@ module Ferret::Analysis
|
|
35
35
|
@position_increment = pos_inc
|
36
36
|
end
|
37
37
|
|
38
|
+
# Overwrites the token's text and offsets in place and returns the token
# itself, so the call can be chained or returned directly.
#
# txt:: new term text
# so::  new start offset
# eo::  new end offset
def set!(txt, so, eo)
  @term_text, @start_offset, @end_offset = txt, so, eo
  self
end
|
44
|
+
|
38
45
|
def eql?(o)
|
39
46
|
return (o.instance_of?(Token) and @start_offset == o.start_offset and
|
40
|
-
@end_offset == o.end_offset and @term_text
|
47
|
+
@end_offset == o.end_offset and @term_text == o.term_text)
|
41
48
|
end
|
42
49
|
alias :== :eql?
|
43
50
|
|
@@ -36,6 +36,7 @@ module Ferret::Analysis
|
|
36
36
|
# input:: must have a read(count) method which returns an array or string
|
37
37
|
# of _count_ chars.
|
38
38
|
def initialize(input)
|
39
|
+
#@token_buffer = Token.new("", 0, 0)
|
39
40
|
if input.is_a? String
|
40
41
|
@ss = StringScanner.new(input)
|
41
42
|
else
|
@@ -53,6 +54,7 @@ module Ferret::Analysis
|
|
53
54
|
return nil
|
54
55
|
end
|
55
56
|
|
57
|
+
#return @token_buffer.set!(normalize(term), term_start, term_end)
|
56
58
|
return Token.new(normalize(term), term_start, term_end)
|
57
59
|
end
|
58
60
|
|
@@ -62,8 +64,9 @@ module Ferret::Analysis
|
|
62
64
|
|
63
65
|
protected
|
64
66
|
# returns the regular expression used to find the next token
|
67
|
+
TOKEN_RE = /[[:alpha:]]+/
|
65
68
|
def token_re
|
66
|
-
|
69
|
+
TOKEN_RE
|
67
70
|
end
|
68
71
|
|
69
72
|
# Called on each token to normalize it before it is added to the
|
@@ -80,8 +83,9 @@ module Ferret::Analysis
|
|
80
83
|
protected
|
81
84
|
# Collects only characters which satisfy the regular expression
|
82
85
|
# _/[[:alpha:]]+/_.
|
83
|
-
|
84
|
-
|
86
|
+
TOKEN_RE = /[[:alpha:]]+/
|
87
|
+
def token_re
|
88
|
+
TOKEN_RE
|
85
89
|
end
|
86
90
|
end
|
87
91
|
|
@@ -100,8 +104,9 @@ module Ferret::Analysis
|
|
100
104
|
class WhiteSpaceTokenizer < RegExpTokenizer
|
101
105
|
protected
|
102
106
|
# Collects only characters which are not spaces, tabs or carriage returns
|
103
|
-
|
104
|
-
|
107
|
+
TOKEN_RE = /\S+/
|
108
|
+
def token_re
|
109
|
+
TOKEN_RE
|
105
110
|
end
|
106
111
|
end
|
107
112
|
end
|