ferret 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ext/Makefile +2 -2
- data/ext/ferret.c +27 -2
- data/ext/ferret.h +59 -16
- data/ext/ferret_ext.so +0 -0
- data/ext/index_io.c +72 -77
- data/ext/priority_queue.c +150 -145
- data/ext/ram_directory.c +47 -42
- data/ext/segment_merge_queue.c +4 -8
- data/ext/segment_term_enum.c +324 -0
- data/ext/similarity.c +59 -0
- data/ext/string_helper.c +2 -2
- data/ext/tags +150 -46
- data/ext/term.c +107 -152
- data/ext/term_buffer.c +105 -174
- data/ext/term_infos_reader.c +54 -0
- data/ext/terminfo.c +160 -0
- data/ext/token.c +93 -0
- data/lib/ferret.rb +1 -1
- data/lib/ferret/analysis/analyzers.rb +18 -0
- data/lib/ferret/analysis/standard_tokenizer.rb +19 -14
- data/lib/ferret/analysis/token.rb +8 -1
- data/lib/ferret/analysis/tokenizers.rb +10 -5
- data/lib/ferret/document/field.rb +33 -11
- data/lib/ferret/index/document_writer.rb +3 -2
- data/lib/ferret/index/field_infos.rb +38 -12
- data/lib/ferret/index/fields_io.rb +10 -4
- data/lib/ferret/index/index.rb +20 -4
- data/lib/ferret/index/index_reader.rb +19 -4
- data/lib/ferret/index/index_writer.rb +1 -1
- data/lib/ferret/index/multi_reader.rb +21 -7
- data/lib/ferret/index/segment_merge_info.rb +24 -22
- data/lib/ferret/index/segment_merge_queue.rb +2 -2
- data/lib/ferret/index/segment_merger.rb +28 -11
- data/lib/ferret/index/segment_reader.rb +19 -4
- data/lib/ferret/index/segment_term_enum.rb +3 -11
- data/lib/ferret/index/term_buffer.rb +13 -16
- data/lib/ferret/index/term_doc_enum.rb +8 -5
- data/lib/ferret/index/term_enum.rb +2 -2
- data/lib/ferret/index/term_info.rb +1 -5
- data/lib/ferret/index/term_infos_io.rb +2 -0
- data/lib/ferret/query_parser/query_parser.tab.rb +7 -7
- data/lib/ferret/search/phrase_scorer.rb +0 -1
- data/lib/ferret/search/similarity.rb +2 -2
- data/lib/ferret/search/term_scorer.rb +2 -2
- data/lib/ferret/store/directory.rb +2 -0
- data/lib/ferret/store/fs_store.rb +16 -3
- data/lib/ferret/store/ram_store.rb +2 -2
- data/test/unit/document/tc_field.rb +9 -0
- data/test/unit/index/tc_field_infos.rb +29 -21
- data/test/unit/index/tc_index.rb +44 -7
- data/test/unit/index/tc_term_buffer.rb +3 -3
- data/test/unit/index/tc_term_info.rb +1 -1
- data/test/unit/query_parser/tc_query_parser.rb +1 -1
- data/test/unit/search/tc_index_searcher.rb +3 -0
- data/test/unit/store/tc_fs_store.rb +47 -16
- data/test/unit/store/tc_ram_store.rb +1 -1
- metadata +8 -3
@@ -0,0 +1,54 @@
|
|
1
|
+
#include "ferret.h"
|
2
|
+
|
3
|
+
static ID frt_id_index_terms;
|
4
|
+
/****************************************************************************
|
5
|
+
*
|
6
|
+
* TermInfosReader Methods
|
7
|
+
*
|
8
|
+
****************************************************************************/
|
9
|
+
|
10
|
+
static VALUE
|
11
|
+
frt_tir_get_index_offset(VALUE self, VALUE rterm)
|
12
|
+
{
|
13
|
+
VALUE index_terms = rb_ivar_get(self, frt_id_index_terms);
|
14
|
+
|
15
|
+
register int lo = 0; // binary search @index_terms[]
|
16
|
+
register int hi = RARRAY(index_terms)->len - 1;
|
17
|
+
register int mid, delta;
|
18
|
+
|
19
|
+
Term *term, *tmp_term;
|
20
|
+
Data_Get_Struct(rterm, Term, term);
|
21
|
+
|
22
|
+
while (hi >= lo) {
|
23
|
+
mid = (lo + hi) >> 1;
|
24
|
+
|
25
|
+
Data_Get_Struct(RARRAY(index_terms)->ptr[mid], Term, tmp_term);
|
26
|
+
delta = frt_term_cmp(term, tmp_term);
|
27
|
+
if (delta < 0) {
|
28
|
+
hi = mid - 1;
|
29
|
+
} else if (delta > 0) {
|
30
|
+
lo = mid + 1;
|
31
|
+
} else {
|
32
|
+
return INT2FIX(mid);
|
33
|
+
}
|
34
|
+
}
|
35
|
+
return INT2FIX(hi);
|
36
|
+
}
|
37
|
+
|
38
|
+
/****************************************************************************
|
39
|
+
*
|
40
|
+
* Init Function
|
41
|
+
*
|
42
|
+
****************************************************************************/
|
43
|
+
|
44
|
+
void
|
45
|
+
Init_term_infos_reader(void)
|
46
|
+
{
|
47
|
+
/* IDs */
|
48
|
+
frt_id_index_terms = rb_intern("@index_terms");
|
49
|
+
|
50
|
+
/* TermInfosReader */
|
51
|
+
cTermInfosReader = rb_define_class_under(mIndex, "TermInfosReader", rb_cObject);
|
52
|
+
|
53
|
+
rb_define_method(cTermInfosReader, "get_index_offset", frt_tir_get_index_offset, 1);
|
54
|
+
}
|
data/ext/terminfo.c
ADDED
@@ -0,0 +1,160 @@
|
|
1
|
+
#include "ferret.h"
|
2
|
+
|
3
|
+
|
4
|
+
/****************************************************************************
|
5
|
+
*
|
6
|
+
* TermInfo Methods
|
7
|
+
*
|
8
|
+
****************************************************************************/
|
9
|
+
|
10
|
+
/*
 * Free callback for wrapped TermInfo structs.
 *
 * p - pointer previously obtained from ALLOC(TermInfo).
 *
 * The struct is allocated through Ruby's allocator (ALLOC -> xmalloc), so it
 * is released with the matching xfree rather than the C library's free().
 */
void
frt_ti_free(void *p)
{
  xfree(p);
}
|
15
|
+
|
16
|
+
static VALUE
|
17
|
+
frt_ti_alloc(VALUE klass)
|
18
|
+
{
|
19
|
+
TermInfo *ti = (TermInfo *)ALLOC(TermInfo);
|
20
|
+
VALUE rbuffer = Data_Wrap_Struct(klass, NULL, frt_ti_free, ti);
|
21
|
+
return rbuffer;
|
22
|
+
}
|
23
|
+
|
24
|
+
/* Declares a local `ti` and points it at the TermInfo struct wrapped by `self`. */
#define GET_TI TermInfo *ti; Data_Get_Struct(self, TermInfo, ti)
|
25
|
+
inline VALUE
|
26
|
+
frt_ti_set(int argc, VALUE *argv, VALUE self)
|
27
|
+
{
|
28
|
+
VALUE df, fp, pp, so;
|
29
|
+
GET_TI;
|
30
|
+
MEMZERO(ti, TermInfo, 1);
|
31
|
+
rb_scan_args(argc, argv, "04", &df, &fp, &pp, &so);
|
32
|
+
switch (argc) {
|
33
|
+
case 4:
|
34
|
+
ti->skip_offset = FIX2INT(so);
|
35
|
+
case 3:
|
36
|
+
ti->prox_pointer = FIX2INT(pp);
|
37
|
+
case 2:
|
38
|
+
ti->freq_pointer = FIX2INT(fp);
|
39
|
+
case 1:
|
40
|
+
ti->doc_freq = FIX2INT(df);
|
41
|
+
case 0:
|
42
|
+
break;
|
43
|
+
}
|
44
|
+
return Qnil;
|
45
|
+
}
|
46
|
+
|
47
|
+
static VALUE
|
48
|
+
frt_ti_init(int argc, VALUE *argv, VALUE self)
|
49
|
+
{
|
50
|
+
frt_ti_set(argc, argv, self);
|
51
|
+
return self;
|
52
|
+
}
|
53
|
+
|
54
|
+
static VALUE
|
55
|
+
frt_ti_init_copy(VALUE self, VALUE rother)
|
56
|
+
{
|
57
|
+
TermInfo *other_ti;
|
58
|
+
GET_TI;
|
59
|
+
Data_Get_Struct(rother, TermInfo, other_ti);
|
60
|
+
MEMCPY(ti, other_ti, TermInfo, 1);
|
61
|
+
return self;
|
62
|
+
}
|
63
|
+
|
64
|
+
static VALUE
|
65
|
+
frt_ti_eql(VALUE self, VALUE rother)
|
66
|
+
{
|
67
|
+
TermInfo *other_ti;
|
68
|
+
GET_TI;
|
69
|
+
if (NIL_P(rother)) return Qfalse;
|
70
|
+
Data_Get_Struct(rother, TermInfo, other_ti);
|
71
|
+
return (MEMCMP(ti, other_ti, TermInfo, 1) == 0) ? Qtrue : Qfalse;
|
72
|
+
}
|
73
|
+
|
74
|
+
static VALUE
|
75
|
+
frt_ti_get_df(VALUE self)
|
76
|
+
{
|
77
|
+
GET_TI;
|
78
|
+
return INT2FIX(ti->doc_freq);
|
79
|
+
}
|
80
|
+
|
81
|
+
static VALUE
|
82
|
+
frt_ti_get_fp(VALUE self)
|
83
|
+
{
|
84
|
+
GET_TI;
|
85
|
+
return INT2FIX(ti->freq_pointer);
|
86
|
+
}
|
87
|
+
|
88
|
+
static VALUE
|
89
|
+
frt_ti_get_pp(VALUE self)
|
90
|
+
{
|
91
|
+
GET_TI;
|
92
|
+
return INT2FIX(ti->prox_pointer);
|
93
|
+
}
|
94
|
+
|
95
|
+
static VALUE
|
96
|
+
frt_ti_get_so(VALUE self)
|
97
|
+
{
|
98
|
+
GET_TI;
|
99
|
+
return INT2FIX(ti->skip_offset);
|
100
|
+
}
|
101
|
+
|
102
|
+
static VALUE
|
103
|
+
frt_ti_set_df(VALUE self, VALUE val)
|
104
|
+
{
|
105
|
+
GET_TI;
|
106
|
+
ti->doc_freq = FIX2INT(val);
|
107
|
+
return Qnil;
|
108
|
+
}
|
109
|
+
|
110
|
+
static VALUE
|
111
|
+
frt_ti_set_fp(VALUE self, VALUE val)
|
112
|
+
{
|
113
|
+
GET_TI;
|
114
|
+
ti->freq_pointer = FIX2INT(val);
|
115
|
+
return Qnil;
|
116
|
+
}
|
117
|
+
|
118
|
+
static VALUE
|
119
|
+
frt_ti_set_pp(VALUE self, VALUE val)
|
120
|
+
{
|
121
|
+
GET_TI;
|
122
|
+
ti->prox_pointer = FIX2INT(val);
|
123
|
+
return Qnil;
|
124
|
+
}
|
125
|
+
|
126
|
+
static VALUE
|
127
|
+
frt_ti_set_so(VALUE self, VALUE val)
|
128
|
+
{
|
129
|
+
GET_TI;
|
130
|
+
ti->skip_offset = FIX2INT(val);
|
131
|
+
return Qnil;
|
132
|
+
}
|
133
|
+
|
134
|
+
/****************************************************************************
|
135
|
+
*
|
136
|
+
* Init Function
|
137
|
+
*
|
138
|
+
****************************************************************************/
|
139
|
+
|
140
|
+
void
|
141
|
+
Init_term_info(void)
|
142
|
+
{
|
143
|
+
/* TermInfo */
|
144
|
+
cTermInfo = rb_define_class_under(mIndex, "TermInfo", rb_cObject);
|
145
|
+
rb_define_alloc_func(cTermInfo, frt_ti_alloc);
|
146
|
+
|
147
|
+
rb_define_method(cTermInfo, "initialize", frt_ti_init, -1);
|
148
|
+
rb_define_method(cTermInfo, "set_values!", frt_ti_set, -1);
|
149
|
+
rb_define_method(cTermInfo, "initialize_copy", frt_ti_init_copy, 1);
|
150
|
+
rb_define_method(cTermInfo, "set!", frt_ti_init_copy, 1);
|
151
|
+
rb_define_method(cTermInfo, "==", frt_ti_eql, 1);
|
152
|
+
rb_define_method(cTermInfo, "doc_freq", frt_ti_get_df, 0);
|
153
|
+
rb_define_method(cTermInfo, "doc_freq=", frt_ti_set_df, 1);
|
154
|
+
rb_define_method(cTermInfo, "freq_pointer", frt_ti_get_fp, 0);
|
155
|
+
rb_define_method(cTermInfo, "freq_pointer=", frt_ti_set_fp, 1);
|
156
|
+
rb_define_method(cTermInfo, "prox_pointer", frt_ti_get_pp, 0);
|
157
|
+
rb_define_method(cTermInfo, "prox_pointer=", frt_ti_set_pp, 1);
|
158
|
+
rb_define_method(cTermInfo, "skip_offset", frt_ti_get_so, 0);
|
159
|
+
rb_define_method(cTermInfo, "skip_offset=", frt_ti_set_so, 1);
|
160
|
+
}
|
data/ext/token.c
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
#include "ferret.h"
|
2
|
+
|
3
|
+
/****************************************************************************
|
4
|
+
*
|
5
|
+
* Token Methods
|
6
|
+
*
|
7
|
+
****************************************************************************/
|
8
|
+
|
9
|
+
ID id_tk_text, id_tk_pos_inc, id_tk_start_offset, id_tk_end_offset, id_tk_type;
|
10
|
+
ID id_tk_pos_inc_set;
|
11
|
+
|
12
|
+
static VALUE
|
13
|
+
frt_token_pos_inc (VALUE self, VALUE pI)
|
14
|
+
{
|
15
|
+
if(FIX2INT(pI) < 0)
|
16
|
+
rb_raise(rb_eArgError, "position_increment < 0");
|
17
|
+
rb_ivar_set(self, id_tk_pos_inc, pI);
|
18
|
+
return self;
|
19
|
+
}
|
20
|
+
|
21
|
+
static VALUE
|
22
|
+
frt_token_init(int argc, VALUE *argv, VALUE self)
|
23
|
+
{
|
24
|
+
VALUE text, start_offset, end_offset, type, pos_inc;
|
25
|
+
rb_scan_args(argc, argv, "32", &text,
|
26
|
+
&start_offset, &end_offset, &type, &pos_inc);
|
27
|
+
rb_ivar_set(self, id_tk_text, text);
|
28
|
+
rb_ivar_set(self, id_tk_start_offset, start_offset);
|
29
|
+
rb_ivar_set(self, id_tk_end_offset, end_offset);
|
30
|
+
if (argc < 4) {
|
31
|
+
rb_ivar_set(self, id_tk_type, rb_str_new("word", 4));
|
32
|
+
} else {
|
33
|
+
rb_ivar_set(self, id_tk_type, type);
|
34
|
+
}
|
35
|
+
if (argc < 5) {
|
36
|
+
rb_ivar_set(self, id_tk_pos_inc, INT2FIX(1));
|
37
|
+
} else {
|
38
|
+
rb_ivar_set(self, id_tk_pos_inc, pos_inc);
|
39
|
+
}
|
40
|
+
return self;
|
41
|
+
}
|
42
|
+
|
43
|
+
static VALUE
|
44
|
+
frt_token_eql(VALUE self, VALUE other)
|
45
|
+
{
|
46
|
+
VALUE rself_text, rother_text;
|
47
|
+
char *self_text, *other_text;
|
48
|
+
if (!rb_respond_to(other, id_tk_pos_inc_set))
|
49
|
+
return Qfalse;
|
50
|
+
rself_text = rb_ivar_get(self, id_tk_text);
|
51
|
+
rother_text = rb_ivar_get(other, id_tk_text);
|
52
|
+
self_text = StringValuePtr(rself_text);
|
53
|
+
other_text = StringValuePtr(rother_text);
|
54
|
+
if (rb_ivar_get(self, id_tk_start_offset) == rb_ivar_get(other, id_tk_start_offset) &&
|
55
|
+
rb_ivar_get(self, id_tk_end_offset) == rb_ivar_get(other, id_tk_end_offset) &&
|
56
|
+
(strcmp(self_text, other_text) == 0))
|
57
|
+
return Qtrue;
|
58
|
+
else
|
59
|
+
return Qfalse;
|
60
|
+
}
|
61
|
+
|
62
|
+
/****************************************************************************
|
63
|
+
*
|
64
|
+
* Init Function
|
65
|
+
*
|
66
|
+
****************************************************************************/
|
67
|
+
|
68
|
+
void
|
69
|
+
Init_token(void)
|
70
|
+
{
|
71
|
+
/* IDs */
|
72
|
+
id_tk_text = rb_intern("@term_text");
|
73
|
+
id_tk_start_offset = rb_intern("@start_offset");
|
74
|
+
id_tk_end_offset = rb_intern("@end_offset");
|
75
|
+
id_tk_type = rb_intern("@type");
|
76
|
+
id_tk_pos_inc = rb_intern("@position_increment");
|
77
|
+
id_tk_pos_inc_set = rb_intern("position_increment=");
|
78
|
+
|
79
|
+
|
80
|
+
/* IndexWriter */
|
81
|
+
cToken = rb_define_class_under(mAnalysis, "Token", rb_cObject);
|
82
|
+
|
83
|
+
rb_define_method(cToken, "initialize", frt_token_init, -1);
|
84
|
+
rb_define_method(cToken, "position_increment=", frt_token_pos_inc, 1);
|
85
|
+
rb_define_method(cToken, "==", frt_token_eql, 1);
|
86
|
+
rb_define_method(cToken, "eql", frt_token_eql, 1);
|
87
|
+
|
88
|
+
rb_define_attr(cToken, "term_text", 1, 1);
|
89
|
+
rb_define_attr(cToken, "position_increment", 1, 0);
|
90
|
+
rb_define_attr(cToken, "start_offset", 1, 0);
|
91
|
+
rb_define_attr(cToken, "end_offset", 1, 0);
|
92
|
+
rb_define_attr(cToken, "type", 1, 1);
|
93
|
+
}
|
data/lib/ferret.rb
CHANGED
@@ -17,6 +17,24 @@ module Ferret::Analysis
|
|
17
17
|
def token_stream(field, string)
|
18
18
|
return LowerCaseTokenizer.new(string)
|
19
19
|
end
|
20
|
+
|
21
|
+
# Invoked before indexing a Field instance if
|
22
|
+
# terms have already been added to that field. This allows custom
|
23
|
+
# analyzers to place an automatic position increment gap between
|
24
|
+
# Field instances using the same field name. The default
|
25
|
+
# position increment gap is 0. With a 0 position increment gap and
|
26
|
+
# the typical default token position increment of 1, all terms in a field,
|
27
|
+
# including across Field instances, are in successive positions, allowing
|
28
|
+
# exact PhraseQuery matches, for instance, across Field instance boundaries.
|
29
|
+
#
|
30
|
+
# field_name:: Field name being indexed.
|
31
|
+
# position_increment_gap:: added to the next token emitted from
|
32
|
+
# #token_stream(String,Reader)
|
33
|
+
#
|
34
|
+
def position_increment_gap(field_name)
  # This implementation ignores field_name and always reports a gap of 0.
  0
end
|
37
|
+
|
20
38
|
end
|
21
39
|
|
22
40
|
# An Analyzer that uses WhiteSpaceTokenizer.
|
@@ -18,7 +18,21 @@ module Ferret::Analysis
|
|
18
18
|
ACRONYM = /#{ALPHA}\.(#{ALPHA}\.)+/
|
19
19
|
P = /[_\/.,-]/
|
20
20
|
HASDIGIT = /\w*\d\w*/
|
21
|
+
TOKEN_RE = /[[:alpha:]]+(('[[:alpha:]]+)+
|
22
|
+
|\.([[:alpha:]]\.)+
|
23
|
+
|(@|\&)\w+([-.]\w+)*
|
24
|
+
)
|
25
|
+
|\w+(([\-._]\w+)*\@\w+([-.]\w+)+
|
26
|
+
|#{P}#{HASDIGIT}(#{P}\w+#{P}#{HASDIGIT})*(#{P}\w+)?
|
27
|
+
|(\.\w+)+
|
28
|
+
|
|
29
|
+
)
|
30
|
+
/x
|
21
31
|
|
32
|
+
ACRONYM_WORD = /^#{ACRONYM}$/
|
33
|
+
APOSTROPHE_WORD = /^#{APOSTROPHE}$/
|
34
|
+
DOT = /\./
|
35
|
+
APOSTROPHE_S = /'[sS]$/
|
22
36
|
protected
|
23
37
|
|
24
38
|
# Collects only characters which are not spaces, tabs or carriage returns
|
@@ -27,24 +41,15 @@ module Ferret::Analysis
|
|
27
41
|
# This is a simplified version of the original Lucene standard
|
28
42
|
# tokenizer. I think it works better. I hope so anyway. Any way to
|
29
43
|
# do this more neatly?
|
30
|
-
|
31
|
-
|\.([[:alpha:]]\.)+
|
32
|
-
|(@|\&)\w+([-.]\w+)*
|
33
|
-
)
|
34
|
-
|\w+(([\-._]\w+)*\@\w+([-.]\w+)+
|
35
|
-
|#{P}#{HASDIGIT}(#{P}\w+#{P}#{HASDIGIT})*(#{P}\w+)?
|
36
|
-
|(\.\w+)+
|
37
|
-
|
|
38
|
-
)
|
39
|
-
/x
|
44
|
+
TOKEN_RE
|
40
45
|
end
|
41
46
|
|
42
47
|
# stem the 's and remove the '.'s from acronyms
|
43
48
|
def normalize(str)
|
44
|
-
if str =~
|
45
|
-
str.gsub!(
|
46
|
-
elsif str =~
|
47
|
-
str.gsub!(
|
49
|
+
if str =~ ACRONYM_WORD
|
50
|
+
str.gsub!(DOT, '')
|
51
|
+
elsif str =~ APOSTROPHE_WORD
|
52
|
+
str.gsub!(APOSTROPHE_S, '')
|
48
53
|
end
|
49
54
|
str
|
50
55
|
end
|
@@ -35,9 +35,16 @@ module Ferret::Analysis
|
|
35
35
|
@position_increment = pos_inc
|
36
36
|
end
|
37
37
|
|
38
|
+
# Overwrites the term text and both offsets in one call.
#
# txt:: new value for @term_text
# so::  new value for @start_offset
# eo::  new value for @end_offset
#
# Returns self.
def set!(txt, so, eo)
  @term_text, @start_offset, @end_offset = txt, so, eo
  self
end
|
44
|
+
|
38
45
|
def eql?(o)
|
39
46
|
return (o.instance_of?(Token) and @start_offset == o.start_offset and
|
40
|
-
@end_offset == o.end_offset and @term_text
|
47
|
+
@end_offset == o.end_offset and @term_text == o.term_text)
|
41
48
|
end
|
42
49
|
alias :== :eql?
|
43
50
|
|
@@ -36,6 +36,7 @@ module Ferret::Analysis
|
|
36
36
|
# input:: must have a read(count) method which returns an array or string
|
37
37
|
# of _count_ chars.
|
38
38
|
def initialize(input)
|
39
|
+
#@token_buffer = Token.new("", 0, 0)
|
39
40
|
if input.is_a? String
|
40
41
|
@ss = StringScanner.new(input)
|
41
42
|
else
|
@@ -53,6 +54,7 @@ module Ferret::Analysis
|
|
53
54
|
return nil
|
54
55
|
end
|
55
56
|
|
57
|
+
#return @token_buffer.set!(normalize(term), term_start, term_end)
|
56
58
|
return Token.new(normalize(term), term_start, term_end)
|
57
59
|
end
|
58
60
|
|
@@ -62,8 +64,9 @@ module Ferret::Analysis
|
|
62
64
|
|
63
65
|
protected
|
64
66
|
# returns the regular expression used to find the next token
|
67
|
+
TOKEN_RE = /[[:alpha:]]+/
|
65
68
|
def token_re
|
66
|
-
|
69
|
+
TOKEN_RE
|
67
70
|
end
|
68
71
|
|
69
72
|
# Called on each token to normalize it before it is added to the
|
@@ -80,8 +83,9 @@ module Ferret::Analysis
|
|
80
83
|
protected
|
81
84
|
# Collects only characters which satisfy the regular expression
|
82
85
|
# _/[[:alpha:]]+/_.
|
83
|
-
|
84
|
-
|
86
|
+
TOKEN_RE = /[[:alpha:]]+/
|
87
|
+
def token_re
|
88
|
+
TOKEN_RE
|
85
89
|
end
|
86
90
|
end
|
87
91
|
|
@@ -100,8 +104,9 @@ module Ferret::Analysis
|
|
100
104
|
class WhiteSpaceTokenizer < RegExpTokenizer
|
101
105
|
protected
|
102
106
|
# Collects only characters which are not spaces, tabs or carriage returns
|
103
|
-
|
104
|
-
|
107
|
+
TOKEN_RE = /\S+/
|
108
|
+
def token_re
|
109
|
+
TOKEN_RE
|
105
110
|
end
|
106
111
|
end
|
107
112
|
end
|