ferret 0.11.4 → 0.11.5
- data/Rakefile +1 -0
- data/TUTORIAL +3 -3
- data/ext/analysis.c +12 -9
- data/ext/array.c +10 -10
- data/ext/array.h +8 -1
- data/ext/bitvector.c +2 -2
- data/ext/except.c +1 -1
- data/ext/ferret.c +2 -2
- data/ext/ferret.h +1 -1
- data/ext/fs_store.c +13 -2
- data/ext/global.c +4 -4
- data/ext/global.h +6 -0
- data/ext/hash.c +1 -1
- data/ext/helper.c +1 -1
- data/ext/helper.h +1 -1
- data/ext/index.c +48 -22
- data/ext/index.h +17 -16
- data/ext/mempool.c +4 -1
- data/ext/mempool.h +1 -1
- data/ext/multimapper.c +2 -2
- data/ext/q_fuzzy.c +2 -2
- data/ext/q_multi_term.c +2 -2
- data/ext/q_parser.c +39 -8
- data/ext/q_range.c +32 -1
- data/ext/r_analysis.c +66 -28
- data/ext/r_index.c +18 -19
- data/ext/r_qparser.c +21 -6
- data/ext/r_search.c +74 -49
- data/ext/r_store.c +1 -1
- data/ext/r_utils.c +17 -17
- data/ext/search.c +10 -5
- data/ext/search.h +3 -1
- data/ext/sort.c +2 -2
- data/ext/stopwords.c +23 -34
- data/ext/store.c +9 -9
- data/ext/store.h +5 -4
- data/lib/ferret/document.rb +2 -2
- data/lib/ferret/field_infos.rb +37 -35
- data/lib/ferret/index.rb +16 -6
- data/lib/ferret/number_tools.rb +2 -2
- data/lib/ferret_version.rb +1 -1
- data/test/unit/analysis/tc_token_stream.rb +40 -0
- data/test/unit/index/tc_index.rb +64 -101
- data/test/unit/index/tc_index_reader.rb +13 -0
- data/test/unit/largefile/tc_largefile.rb +46 -0
- data/test/unit/query_parser/tc_query_parser.rb +17 -1
- data/test/unit/search/tc_multiple_search_requests.rb +58 -0
- data/test/unit/search/tm_searcher.rb +27 -1
- data/test/unit/ts_largefile.rb +4 -0
- metadata +147 -144
data/ext/index.h
CHANGED
@@ -65,24 +65,24 @@ extern HashTable *co_hash_create();
 
 enum StoreValues
 {
-    STORE_NO = 0,
-    STORE_YES = 1,
+    STORE_NO = 0,
+    STORE_YES = 1,
     STORE_COMPRESS = 2
 };
 
 enum IndexValues
 {
-    INDEX_NO = 0,
-
-
-
-
+    INDEX_NO = 0,
+    INDEX_UNTOKENIZED = 1,
+    INDEX_YES = 3,
+    INDEX_UNTOKENIZED_OMIT_NORMS = 5,
+    INDEX_YES_OMIT_NORMS = 7
 };
 
 enum TermVectorValues
 {
-    TERM_VECTOR_NO = 0,
-    TERM_VECTOR_YES = 1,
+    TERM_VECTOR_NO = 0,
+    TERM_VECTOR_YES = 1,
     TERM_VECTOR_WITH_POSITIONS = 3,
     TERM_VECTOR_WITH_OFFSETS = 5,
     TERM_VECTOR_WITH_POSITIONS_OFFSETS = 7
@@ -374,7 +374,7 @@ typedef struct TermInfosWriter
 
 extern TermInfosWriter *tiw_open(Store *store,
                                  const char *segment,
-                                 int index_interval,
+                                 int index_interval,
                                  int skip_interval);
 extern void tiw_start_field(TermInfosWriter *tiw, int field_num);
 extern void tiw_add(TermInfosWriter *tiw,
@@ -456,11 +456,11 @@ extern TermDocEnum *mtdpe_new(IndexReader *ir, int field_num, char **terms,
 
 typedef struct Offset
 {
-
-
+    off_t start;
+    off_t end;
 } Offset;
 
-extern Offset *offset_new(
+extern Offset *offset_new(off_t start, off_t end);
 
 /****************************************************************************
  *
@@ -488,7 +488,7 @@ typedef struct Posting
     struct Posting *next;
 } Posting;
 
-extern
+extern Posting *p_new(MemoryPool *mp, int doc_num, int pos);
 
 /****************************************************************************
  *
@@ -617,7 +617,7 @@ extern TermVector *tvr_get_field_tv(TermVectorsReader *tvr,
 /* * * LazyDocField * * */
 typedef struct LazyDocFieldData
 {
-
+    off_t start;
     int length;
     char *text;
 } LazyDocFieldData;
@@ -706,7 +706,7 @@ extern void fw_write_tv_index(FieldsWriter *fw);
 * A utility class (used by both IndexReader and IndexWriter) to keep track of
 * files that need to be deleted because they are no longer referenced by the
 * index.
-*
+*
 ****************************************************************************/
 
 struct Deleter
@@ -760,6 +760,7 @@ struct IndexReader
     void (*delete_doc_i)(IndexReader *ir, int doc_num);
     void (*undelete_all_i)(IndexReader *ir);
     void (*set_deleter_i)(IndexReader *ir, Deleter *dlr);
+    bool (*is_latest_i)(IndexReader *ir);
     void (*commit_i)(IndexReader *ir);
     void (*close_i)(IndexReader *ir);
     int ref_cnt;
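The renumbered IndexValues constants read like bit flags: the 0/1/3/5/7 values suggest bit 0 marks a field as indexed, bit 1 as tokenized, and bit 2 as omitting norms, so INDEX_YES = 3 would mean indexed-and-tokenized. That bit reading is an inference from the values, not stated in the header; a minimal C sketch to check it:

    #include <stdio.h>

    enum IndexValues /* values as declared in data/ext/index.h above */
    {
        INDEX_NO = 0,
        INDEX_UNTOKENIZED = 1,
        INDEX_YES = 3,
        INDEX_UNTOKENIZED_OMIT_NORMS = 5,
        INDEX_YES_OMIT_NORMS = 7
    };

    int main(void)
    {
        /* Assumed layout: bit 0 = indexed, bit 1 = tokenized, bit 2 = omit norms */
        int v = INDEX_UNTOKENIZED_OMIT_NORMS;
        printf("indexed=%d tokenized=%d omit_norms=%d\n",
               (v & 1) != 0, (v & 2) != 0, (v & 4) != 0);
        return 0; /* prints indexed=1 tokenized=0 omit_norms=1 */
    }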
data/ext/mempool.c
CHANGED
@@ -21,10 +21,13 @@ MemoryPool *mp_new()
     return mp_new_capa(MP_BUF_SIZE, MP_INIT_CAPA);
 }
 
-
+INLINE void *mp_alloc(MemoryPool *mp, int size)
 {
     char *p;
     p = mp->curr_buffer + mp->pointer;
+#if defined POSH_OS_SOLARIS || defined POSH_OS_SUNOS
+    size = (((size - 1) >> 3) + 1) << 3;
+#endif
     mp->pointer += size;
 
     if (mp->pointer > mp->chunk_size) {
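The new #if block rounds each allocation size up to the next multiple of 8 on Solaris/SunOS, presumably because SPARC traps on misaligned loads of doubles and pointers handed out from the pool. A standalone sketch of the rounding expression (assuming 8-byte alignment is the target, as the shift by 3 implies):

    #include <stdio.h>

    /* Round size up to the next multiple of 8, as mp_alloc now does on
     * Solaris/SunOS: (size - 1) >> 3 counts whole 8-byte units after
     * borrowing one, +1 and << 3 scale back up. */
    static int align8(int size)
    {
        return (((size - 1) >> 3) + 1) << 3;
    }

    int main(void)
    {
        int sizes[] = { 1, 7, 8, 9, 17 };
        int i;
        for (i = 0; i < 5; i++) {
            printf("%2d -> %2d\n", sizes[i], align8(sizes[i]));
        }
        return 0; /* prints 8, 8, 8, 16, 24 */
    }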
data/ext/mempool.h
CHANGED
@@ -16,7 +16,7 @@ typedef struct MemoryPool {
|
|
16
16
|
|
17
17
|
extern MemoryPool *mp_new();
|
18
18
|
extern MemoryPool *mp_new_capa(int chunk_size, int init_capa);
|
19
|
-
extern
|
19
|
+
extern INLINE void *mp_alloc(MemoryPool *mp, int size);
|
20
20
|
extern void mp_reset(MemoryPool *mp);
|
21
21
|
extern void mp_destroy(MemoryPool *mp);
|
22
22
|
extern char *mp_strdup(MemoryPool *mp, const char *str);
|
data/ext/multimapper.c
CHANGED
@@ -121,7 +121,7 @@ MultiMapper *mulmap_new()
     return self;
 }
 
-static
+static INLINE void mulmap_free_dstates(MultiMapper *self)
 {
     if (self->d_size > 0) {
         int i;
@@ -151,7 +151,7 @@ void mulmap_add_mapping(MultiMapper *self, const char *pattern, const char *rep)
 }
 
 
-static
+static INLINE void mulmap_bv_set_states(BitVector *bv, int *states, int cnt)
 {
     int i;
     for (i = cnt - 1; i >= 0; i--) {
data/ext/q_fuzzy.c
CHANGED
@@ -11,7 +11,7 @@
  *
  ****************************************************************************/
 
-static
+static INLINE int fuzq_calculate_max_distance(FuzzyQuery *fuzq, int m)
 {
     return (int)((1.0 - fuzq->min_sim) * (MIN(fuzq->text_len, m) + fuzq->pre_len));
 }
@@ -24,7 +24,7 @@ static void fuzq_initialize_max_distances(FuzzyQuery *fuzq)
     }
 }
 
-static
+static INLINE int fuzq_get_max_distance(FuzzyQuery *fuzq, int m)
 {
     return (m < TYPICAL_LONGEST_WORD) ? fuzq->max_distances[m]
                                       : fuzq_calculate_max_distance(fuzq, m);
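fuzq_calculate_max_distance computes the largest edit distance a term may have from the query text and still match, given the query's minimum similarity. A quick check of the same arithmetic with illustrative numbers (the min_sim, text_len and pre_len values here are made up, not Ferret defaults):

    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    /* Same formula as fuzq_calculate_max_distance: the allowed edit
     * distance grows with the compared length m and shrinks as min_sim
     * approaches 1.0. */
    static int max_distance(double min_sim, int text_len, int pre_len, int m)
    {
        return (int)((1.0 - min_sim) * (MIN(text_len, m) + pre_len));
    }

    int main(void)
    {
        /* A 6-letter query term, no required prefix, against a
         * 6-letter candidate term: */
        printf("%d\n", max_distance(0.5, 6, 0, 6)); /* 3 edits allowed */
        printf("%d\n", max_distance(0.8, 6, 0, 6)); /* 1 edit allowed  */
        return 0;
    }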
data/ext/q_multi_term.c
CHANGED
@@ -236,7 +236,7 @@ static bool multi_tsc_advance_to(Scorer *self, int target_doc_num)
     return (pq_top(tdew_pq) == NULL) ? false : true;
 }
 
-static
+static INLINE bool multi_tsc_skip_to(Scorer *self, int target_doc_num)
 {
     return multi_tsc_advance_to(self, target_doc_num) && multi_tsc_next(self);
 }
@@ -661,7 +661,7 @@ Query *multi_tq_new(const char *field)
 
 void multi_tq_add_term_boost(Query *self, const char *term, float boost)
 {
-    if (boost > MTQ(self)->min_boost) {
+    if (boost > MTQ(self)->min_boost && term && term[0]) {
         BoostedTerm *bt = boosted_term_new(term, boost);
         PriorityQueue *bt_pq = MTQ(self)->boosted_terms;
         pq_insert(bt_pq, bt);
data/ext/q_parser.c
CHANGED
@@ -147,7 +147,7 @@ typedef union YYSTYPE
     Phrase *phrase;
     char *str;
 }
-/* Line
+/* Line 187 of yacc.c. */
 #line 152 "y.tab.c"
 YYSTYPE;
 # define yystype YYSTYPE /* obsolescent; will be withdrawn */
@@ -2061,12 +2061,14 @@ get_word_done:
      * just checks for all of them. */
     *bufp = '\0';
     len = (int)(bufp - buf);
-    if (
-    if (
-
-
+    if (qp->use_keywords) {
+        if (len == 3) {
+            if (buf[0] == 'A' && buf[1] == 'N' && buf[2] == 'D') return AND;
+            if (buf[0] == 'N' && buf[1] == 'O' && buf[2] == 'T') return NOT;
+            if (buf[0] == 'R' && buf[1] == 'E' && buf[2] == 'Q') return REQ;
+        }
+        if (len == 2 && buf[0] == 'O' && buf[1] == 'R') return OR;
     }
-    if (len == 2 && buf[0] == 'O' && buf[1] == 'R') return OR;
 
     /* found a word so return it. */
     lvalp->str = buf;
@@ -2489,9 +2491,37 @@ static Query *get_phrase_query(QParser *qp, char *field,
     }
     else {
         int i;
-
+        int term_cnt = 0;
+        Token *token;
+        char *last_word = NULL;
+
         for (i = 0; i < word_count; i++) {
-
+            token = ts_next(get_cached_ts(qp, field, words[i]));
+            free(words[i]);
+            if (token) {
+                last_word = words[i] = estrdup(token->text);
+                ++term_cnt;
+            }
+            else {
+                words[i] = estrdup("");
+            }
+        }
+
+        switch (term_cnt) {
+            case 0:
+                q = bq_new(false);
+                break;
+            case 1:
+                q = tq_new(field, last_word);
+                break;
+            default:
+                q = multi_tq_new_conf(field, term_cnt, 0.0);
+                for (i = 0; i < word_count; i++) {
+                    if (words[i][0]) {
+                        multi_tq_add_term(q, words[i]);
+                    }
+                }
+                break;
         }
     }
 }
@@ -2620,6 +2650,7 @@ QParser *qp_new(HashSet *all_fields, HashSet *def_fields,
     self->max_clauses = QP_MAX_CLAUSES;
     self->handle_parse_errors = false;
     self->allow_any_fields = false;
+    self->use_keywords = true;
     self->def_slop = 0;
     self->fields_buf = hs_new_str(NULL);
     self->all_fields = all_fields;
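The restructured scanner only recognizes AND, OR, NOT and REQ as query operators when the parser's new use_keywords flag is set (it defaults to true in qp_new, per the last hunk). A standalone sketch of the same comparison logic, outside the generated parser:

    #include <stdio.h>

    enum { WORD, AND, OR, NOT, REQ };

    /* Mirrors the new scanner logic: keywords are matched by length and
     * exact bytes, and only when use_keywords is on. */
    static int token_type(const char *buf, int len, int use_keywords)
    {
        if (use_keywords) {
            if (len == 3) {
                if (buf[0] == 'A' && buf[1] == 'N' && buf[2] == 'D') return AND;
                if (buf[0] == 'N' && buf[1] == 'O' && buf[2] == 'T') return NOT;
                if (buf[0] == 'R' && buf[1] == 'E' && buf[2] == 'Q') return REQ;
            }
            if (len == 2 && buf[0] == 'O' && buf[1] == 'R') return OR;
        }
        return WORD; /* with use_keywords off, "AND" is just a term */
    }

    int main(void)
    {
        printf("%d %d\n", token_type("AND", 3, 1),
                          token_type("AND", 3, 0)); /* prints 1 0 */
        return 0;
    }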
data/ext/q_range.c
CHANGED
@@ -269,13 +269,44 @@ static void rq_destroy(Query *self)
     q_destroy_i(self);
 }
 
+static MatchVector *rq_get_matchv_i(Query *self, MatchVector *mv,
+                                    TermVector *tv)
+{
+    Range *range = RQ(((ConstantScoreQuery *)self)->original)->range;
+    if (strcmp(tv->field, range->field) == 0) {
+        int i, j;
+        char *upper_text = range->upper_term;
+        char *lower_text = range->lower_term;
+        int upper_limit = range->include_upper ? 1 : 0;
+        int lower_limit = range->include_lower ? 1 : 0;
+
+        for (i = tv->term_cnt - 1; i >= 0; i--) {
+            TVTerm *tv_term = &(tv->terms[i]);
+            char *text = tv_term->text;
+            if ((!upper_text || strcmp(text, upper_text) < upper_limit) &&
+                (!lower_text || strcmp(lower_text, text) < lower_limit)) {
+
+                for (j = 0; j < tv_term->freq; j++) {
+                    int pos = tv_term->positions[j];
+                    matchv_add(mv, pos, pos);
+                }
+            }
+        }
+    }
+    return mv;
+}
+
 static Query *rq_rewrite(Query *self, IndexReader *ir)
 {
+    Query *csq;
     Range *r = RQ(self)->range;
     Filter *filter = rfilt_new(r->field, r->lower_term, r->upper_term,
                                r->include_lower, r->include_upper);
     (void)ir;
-
+    csq = csq_new_nr(filter);
+    ((ConstantScoreQuery *)csq)->original = self;
+    csq->get_matchv_i = &rq_get_matchv_i;
+    return (Query *)csq;
 }
 
 static unsigned long rq_hash(Query *self)
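The new highlighting hook decides whether a term falls inside the range with a single strcmp per bound: since strcmp returns zero on equality and at least 1 when the first argument sorts later, comparing the result to 1 (inclusive bound) versus 0 (exclusive bound) folds the <= / < distinction into data. A small demonstration of that trick (the field values are made up):

    #include <stdio.h>
    #include <string.h>

    /* strcmp(text, upper) < limit means: text < upper when limit is 0
     * (exclusive bound), text <= upper when limit is 1 (inclusive) -
     * the same comparison rq_get_matchv_i uses for both bounds. */
    static int within_upper(const char *text, const char *upper,
                            int include_upper)
    {
        return strcmp(text, upper) < (include_upper ? 1 : 0);
    }

    int main(void)
    {
        printf("%d\n", within_upper("banana", "banana", 1)); /* 1: inclusive */
        printf("%d\n", within_upper("banana", "banana", 0)); /* 0: exclusive */
        printf("%d\n", within_upper("apple", "banana", 0));  /* 1 */
        return 0;
    }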
data/ext/r_analysis.c
CHANGED
@@ -150,7 +150,7 @@ frt_set_token(Token *tk, VALUE rt)
 * values as needed. For example, if you have a stop word filter you will be
 * skipping tokens. Let's say you have the stop words "the" and "and" and you
 * parse the title "The Old Man and the Sea". The terms "Old", "Man" and
-* "Sea" will have the position
+* "Sea" will have the position increments 2, 1 and 3 respectively.
 *
 * Another reason you might want to vary the position increment is if you are
 * adding synonyms to the index. For example let's say you have the synonym
@@ -424,7 +424,7 @@ get_rb_token_stream(TokenStream *ts)
     return rts;
 }
 
-static
+static INLINE VALUE
 get_wrapped_ts(VALUE self, VALUE rstr, TokenStream *ts)
 {
     StringValue(rstr);
@@ -811,7 +811,7 @@ frt_a_letter_tokenizer_init(VALUE self, VALUE rstr)
 * LetterTokenizer.new(lower = true) -> tokenizer
 *
 * Create a new LetterTokenizer which optionally downcases tokens. Downcasing
-* is done according the
+* is done according the current locale.
 *
 * lower:: set to false if you don't wish to downcase tokens
 */
@@ -842,7 +842,7 @@ frt_a_whitespace_tokenizer_init(VALUE self, VALUE rstr)
 * WhiteSpaceTokenizer.new(lower = true) -> tokenizer
 *
 * Create a new WhiteSpaceTokenizer which optionally downcases tokens.
-* Downcasing is done according the
+* Downcasing is done according the current locale.
 *
 * lower:: set to false if you don't wish to downcase tokens
 */
@@ -873,7 +873,7 @@ frt_a_standard_tokenizer_init(VALUE self, VALUE rstr)
 * StandardTokenizer.new(lower = true) -> tokenizer
 *
 * Create a new StandardTokenizer which optionally downcases tokens.
-* Downcasing is done according the
+* Downcasing is done according the current locale.
 *
 * lower:: set to false if you don't wish to downcase tokens
 */
@@ -896,7 +896,7 @@ frt_standard_tokenizer_init(VALUE self, VALUE rstr)
 * AsciiLowerCaseFilter.new(token_stream) -> token_stream
 *
 * Create an AsciiLowerCaseFilter which normalizes a token's text to
-* lowercase but only for
+* lowercase but only for ASCII characters. For other characters use
 * LowerCaseFilter.
 */
 static VALUE
@@ -990,7 +990,7 @@ frt_stop_filter_init(int argc, VALUE *argv, VALUE self)
     return self;
 }
 
-static
+static INLINE void frt_add_mapping_i(TokenStream *mf, VALUE from, char *to)
 {
     switch (TYPE(from)) {
         case T_STRING:
@@ -1046,8 +1046,8 @@ static int frt_add_mappings_i(VALUE key, VALUE value, VALUE arg)
 * MappingFilter.new(token_stream, mapping) -> token_stream
 *
 * Create an MappingFilter which maps strings in tokens. This is usually used
-* to map UTF-8 characters to
-* better
+* to map UTF-8 characters to ASCII characters for easier searching and
+* better search recall. The mapping is compiled into a Deterministic Finite
 * Automata so it is super fast. This Filter can therefor be used for
 * indexing very large datasets. Currently regular expressions are not
 * supported. If you are really interested in the feature, please contact me
@@ -1087,7 +1087,7 @@ frt_mapping_filter_init(VALUE self, VALUE rsub_ts, VALUE mapping)
 *   algorithm="english",
 *   encoding="UTF-8") -> token_stream
 *
-* Create an StemFilter which uses a snowball stemmer (
+* Create an StemFilter which uses a snowball stemmer (thank you Martin
 * Porter) to stem words. You can optionally specify the algorithm (default:
 * "english") and encoding (default: "UTF-8").
 *
@@ -1193,6 +1193,16 @@ frt_get_analyzer(Analyzer *a)
     return self;
 }
 
+INLINE VALUE
+get_rb_ts_from_a(Analyzer *a, VALUE rfield, VALUE rstring)
+{
+    TokenStream *ts = a_get_ts(a, frt_field(rfield), rs2s(rstring));
+
+    /* Make sure that there is no entry already */
+    object_set(&ts->text, rstring);
+    return get_rb_token_stream(ts);
+}
+
 /*
 * call-seq:
 *     analyzer.token_stream(field_name, input) -> token_stream
@@ -1209,17 +1219,12 @@ frt_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
 {
     /* NOTE: Any changes made to this method may also need to be applied to
      * frt_re_analyzer_token_stream */
-    TokenStream *ts;
     Analyzer *a;
     GET_A(a, self);
 
     StringValue(rstring);
 
-
-
-    /* Make sure that there is no entry already */
-    object_set(&ts->text, rstring);
-    return get_rb_token_stream(ts);
+    return get_rb_ts_from_a(a, rfield, rstring);
 }
 
 #define GET_LOWER(dflt) \
@@ -1234,7 +1239,7 @@ lower = (argc ? RTEST(rlower) : dflt)
 *
 * Create a new AsciiWhiteSpaceAnalyzer which downcases tokens by default
 * but can optionally leave case as is. Lowercasing will only be done to
-*
+* ASCII characters.
 *
 * lower:: set to false if you don't want the field's tokens to be downcased
 */
@@ -1279,7 +1284,7 @@ frt_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
 *
 * Create a new AsciiWhiteSpaceAnalyzer which downcases tokens by default
 * but can optionally leave case as is. Lowercasing will only be done to
-*
+* ASCII characters.
 *
 * lower:: set to false if you don't want the field's tokens to be downcased
 */
@@ -1457,6 +1462,37 @@ frt_per_field_analyzer_add_field(VALUE self, VALUE rfield, VALUE ranalyzer)
     return self;
 }
 
+/*
+* call-seq:
+*     analyzer.token_stream(field_name, input) -> token_stream
+*
+* Create a new TokenStream to tokenize +input+. The TokenStream created will
+* also depend on the +field_name+ in the case of the PerFieldAnalyzer.
+*
+* field_name:: name of the field to be tokenized
+* input::      data from the field to be tokenized
+*/
+static VALUE
+frt_pfa_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
+{
+    Analyzer *pfa, *a;
+    char *field = frt_field(rfield);
+    GET_A(pfa, self);
+
+    StringValue(rstring);
+    a = (Analyzer *)h_get(PFA(pfa)->dict, field);
+    if (a == NULL) {
+        a = PFA(pfa)->default_a;
+    }
+    if (a->get_ts == cwa_get_ts) {
+        return rb_funcall(CWA(a)->ranalyzer, id_token_stream, 2,
+                          ID2SYM(rb_intern(field)), rb_str_new2(rs2s(rstring)));
+    }
+    else {
+        return get_rb_ts_from_a(a, rfield, rstring);
+    }
+}
+
 /*** RegExpAnalyzer ***/
 
 static void
@@ -1585,7 +1621,7 @@ static VALUE frt_set_locale(VALUE self, VALUE locale)
 *
 * == Summary
 *
-* A Token is an
+* A Token is an occurrence of a term from the text of a field. It consists
 * of a term's text and the start and end offset of the term in the text of
 * the field;
 *
@@ -1648,7 +1684,7 @@ static void Init_TokenStream(void)
 /*
 * Document-class: Ferret::Analysis::AsciiLetterTokenizer
 *
-* A LetterTokenizer is a tokenizer that divides text at non-
+* A LetterTokenizer is a tokenizer that divides text at non-ASCII letters.
 * That is to say, it defines tokens as maximal strings of adjacent letters,
 * as defined by the regular expression _/[A-Za-z]+/_.
 *
@@ -1781,7 +1817,7 @@ static void Init_StandardTokenizer(void)
 * Document-class: Ferret::Analysis::RegExpTokenizer
 *
 * A tokenizer that recognizes tokens based on a regular expression passed to
-* the
+* the constructor. Most possible tokenizers can be created using this class.
 *
 * === Example
 *
@@ -1817,7 +1853,7 @@ static void Init_RegExpTokenizer(void)
 * Document-class: Ferret::Analysis::AsciiLowerCaseFilter
 *
 * AsciiLowerCaseFilter normalizes a token's text to lowercase but only for
-*
+* ASCII characters. For other characters use LowerCaseFilter.
 *
 * === Example
 *
@@ -1881,7 +1917,7 @@ static void Init_HyphenFilter(void)
 * Document-class: Ferret::Analysis::MappingFilter
 *
 * A MappingFilter maps strings in tokens. This is usually used to map UTF-8
-* characters to
+* characters to ASCII characters for easier searching and better search
 * recall. The mapping is compiled into a Deterministic Finite Automata so it
 * is super fast. This Filter can therefor be used for indexing very large
 * datasets. Currently regular expressions are not supported. If you are
@@ -2020,7 +2056,7 @@ static void Init_StemFilter(void)
 * a policy for extracting index terms from text.
 *
 * Typical implementations first build a Tokenizer, which breaks the stream
-* of characters from the Reader into raw Tokens. One or more
+* of characters from the Reader into raw Tokens. One or more TokenFilters
 * may then be applied to the output of the Tokenizer.
 *
 * The default Analyzer just creates a LowerCaseTokenizer which converts
@@ -2057,7 +2093,7 @@ static void Init_Analyzer(void)
 * == Summary
 *
 * An AsciiLetterAnalyzer creates a TokenStream that splits the input up into
-* maximal strings of
+* maximal strings of ASCII characters. If implemented in Ruby it would look
 * like;
 *
 *   class AsciiLetterAnalyzer
@@ -2075,7 +2111,7 @@ static void Init_Analyzer(void)
 *   end
 *
 * As you can see it makes use of the AsciiLetterTokenizer and
-* AsciiLowerCaseFilter. Note that this tokenizer won't recognize non-
+* AsciiLowerCaseFilter. Note that this tokenizer won't recognize non-ASCII
 * characters so you should use the LetterAnalyzer is you want to analyze
 * multi-byte data like "UTF-8".
 */
@@ -2194,7 +2230,7 @@ static void Init_WhiteSpaceAnalyzer(void)
 * == Summary
 *
 * The AsciiStandardAnalyzer is the most advanced of the available
-*
+* ASCII-analyzers. If it were implemented in Ruby it would look like this;
 *
 *   class AsciiStandardAnalyzer
 *     def initialize(stop_words = FULL_ENGLISH_STOP_WORDS, lower = true)
@@ -2212,7 +2248,7 @@ static void Init_WhiteSpaceAnalyzer(void)
 *
 * As you can see it makes use of the AsciiStandardTokenizer and you can also
 * add your own list of stop-words if you wish. Note that this tokenizer
-* won't recognize non-
+* won't recognize non-ASCII characters so you should use the
 * StandardAnalyzer is you want to analyze multi-byte data like "UTF-8".
 */
 static void Init_AsciiStandardAnalyzer(void)
@@ -2292,6 +2328,8 @@ static void Init_PerFieldAnalyzer(void)
                      frt_per_field_analyzer_add_field, 2);
     rb_define_method(cPerFieldAnalyzer, "[]=",
                      frt_per_field_analyzer_add_field, 2);
+    rb_define_method(cPerFieldAnalyzer, "token_stream",
+                     frt_pfa_analyzer_token_stream, 2);
 }
 
 /*
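The repaired Token doc-comment above spells out the example: with the stop words "the" and "and" removed from "The Old Man and the Sea", the surviving terms "Old", "Man" and "Sea" carry position increments 2, 1 and 3. A sketch of how a stop filter would derive those increments (the word list and helper are illustrative, not Ferret's API):

    #include <stdio.h>
    #include <string.h>

    /* Illustrative stop-word check, just "the" and "and". */
    static int is_stop(const char *w)
    {
        return strcmp(w, "the") == 0 || strcmp(w, "The") == 0
            || strcmp(w, "and") == 0;
    }

    int main(void)
    {
        const char *words[] = { "The", "Old", "Man", "and", "the", "Sea" };
        int i, inc = 0;

        /* Each skipped stop word adds one to the next emitted token's
         * increment, reproducing the 2, 1, 3 from the documentation. */
        for (i = 0; i < 6; i++) {
            inc++;
            if (is_stop(words[i])) continue;
            printf("%s: pos_inc=%d\n", words[i], inc);
            inc = 0;
        }
        return 0;
    }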
|