ferret 0.11.4 → 0.11.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +1 -0
- data/TUTORIAL +3 -3
- data/ext/analysis.c +12 -9
- data/ext/array.c +10 -10
- data/ext/array.h +8 -1
- data/ext/bitvector.c +2 -2
- data/ext/except.c +1 -1
- data/ext/ferret.c +2 -2
- data/ext/ferret.h +1 -1
- data/ext/fs_store.c +13 -2
- data/ext/global.c +4 -4
- data/ext/global.h +6 -0
- data/ext/hash.c +1 -1
- data/ext/helper.c +1 -1
- data/ext/helper.h +1 -1
- data/ext/index.c +48 -22
- data/ext/index.h +17 -16
- data/ext/mempool.c +4 -1
- data/ext/mempool.h +1 -1
- data/ext/multimapper.c +2 -2
- data/ext/q_fuzzy.c +2 -2
- data/ext/q_multi_term.c +2 -2
- data/ext/q_parser.c +39 -8
- data/ext/q_range.c +32 -1
- data/ext/r_analysis.c +66 -28
- data/ext/r_index.c +18 -19
- data/ext/r_qparser.c +21 -6
- data/ext/r_search.c +74 -49
- data/ext/r_store.c +1 -1
- data/ext/r_utils.c +17 -17
- data/ext/search.c +10 -5
- data/ext/search.h +3 -1
- data/ext/sort.c +2 -2
- data/ext/stopwords.c +23 -34
- data/ext/store.c +9 -9
- data/ext/store.h +5 -4
- data/lib/ferret/document.rb +2 -2
- data/lib/ferret/field_infos.rb +37 -35
- data/lib/ferret/index.rb +16 -6
- data/lib/ferret/number_tools.rb +2 -2
- data/lib/ferret_version.rb +1 -1
- data/test/unit/analysis/tc_token_stream.rb +40 -0
- data/test/unit/index/tc_index.rb +64 -101
- data/test/unit/index/tc_index_reader.rb +13 -0
- data/test/unit/largefile/tc_largefile.rb +46 -0
- data/test/unit/query_parser/tc_query_parser.rb +17 -1
- data/test/unit/search/tc_multiple_search_requests.rb +58 -0
- data/test/unit/search/tm_searcher.rb +27 -1
- data/test/unit/ts_largefile.rb +4 -0
- metadata +147 -144
data/ext/index.h
CHANGED
@@ -65,24 +65,24 @@ extern HashTable *co_hash_create();
 
 enum StoreValues
 {
-    STORE_NO = 0,
-    STORE_YES = 1,
+    STORE_NO = 0,
+    STORE_YES = 1,
     STORE_COMPRESS = 2
 };
 
 enum IndexValues
 {
-    INDEX_NO = 0,
-
-
-
-
+    INDEX_NO = 0,
+    INDEX_UNTOKENIZED = 1,
+    INDEX_YES = 3,
+    INDEX_UNTOKENIZED_OMIT_NORMS = 5,
+    INDEX_YES_OMIT_NORMS = 7
 };
 
 enum TermVectorValues
 {
-    TERM_VECTOR_NO = 0,
-    TERM_VECTOR_YES = 1,
+    TERM_VECTOR_NO = 0,
+    TERM_VECTOR_YES = 1,
     TERM_VECTOR_WITH_POSITIONS = 3,
     TERM_VECTOR_WITH_OFFSETS = 5,
     TERM_VECTOR_WITH_POSITIONS_OFFSETS = 7
@@ -374,7 +374,7 @@ typedef struct TermInfosWriter
 
 extern TermInfosWriter *tiw_open(Store *store,
                                  const char *segment,
-                                 int index_interval,
+                                 int index_interval,
                                  int skip_interval);
 extern void tiw_start_field(TermInfosWriter *tiw, int field_num);
 extern void tiw_add(TermInfosWriter *tiw,
@@ -456,11 +456,11 @@ extern TermDocEnum *mtdpe_new(IndexReader *ir, int field_num, char **terms,
 
 typedef struct Offset
 {
-
-
+    off_t start;
+    off_t end;
 } Offset;
 
-extern Offset *offset_new(
+extern Offset *offset_new(off_t start, off_t end);
 
 /****************************************************************************
  *
@@ -488,7 +488,7 @@ typedef struct Posting
     struct Posting *next;
 } Posting;
 
-extern
+extern Posting *p_new(MemoryPool *mp, int doc_num, int pos);
 
 /****************************************************************************
  *
@@ -617,7 +617,7 @@ extern TermVector *tvr_get_field_tv(TermVectorsReader *tvr,
 /* * * LazyDocField * * */
 typedef struct LazyDocFieldData
 {
-
+    off_t start;
     int length;
     char *text;
 } LazyDocFieldData;
@@ -706,7 +706,7 @@ extern void fw_write_tv_index(FieldsWriter *fw);
 * A utility class (used by both IndexReader and IndexWriter) to keep track of
 * files that need to be deleted because they are no longer referenced by the
 * index.
-*
+*
 ****************************************************************************/
 
 struct Deleter
@@ -760,6 +760,7 @@ struct IndexReader
     void (*delete_doc_i)(IndexReader *ir, int doc_num);
     void (*undelete_all_i)(IndexReader *ir);
     void (*set_deleter_i)(IndexReader *ir, Deleter *dlr);
+    bool (*is_latest_i)(IndexReader *ir);
    void (*commit_i)(IndexReader *ir);
    void (*close_i)(IndexReader *ir);
    int ref_cnt;
data/ext/mempool.c
CHANGED
@@ -21,10 +21,13 @@ MemoryPool *mp_new()
     return mp_new_capa(MP_BUF_SIZE, MP_INIT_CAPA);
 }
 
-
+INLINE void *mp_alloc(MemoryPool *mp, int size)
 {
     char *p;
     p = mp->curr_buffer + mp->pointer;
+#if defined POSH_OS_SOLARIS || defined POSH_OS_SUNOS
+    size = (((size - 1) >> 3) + 1) << 3;
+#endif
     mp->pointer += size;
 
     if (mp->pointer > mp->chunk_size) {
data/ext/mempool.h
CHANGED
@@ -16,7 +16,7 @@ typedef struct MemoryPool {
 
 extern MemoryPool *mp_new();
 extern MemoryPool *mp_new_capa(int chunk_size, int init_capa);
-extern
+extern INLINE void *mp_alloc(MemoryPool *mp, int size);
 extern void mp_reset(MemoryPool *mp);
 extern void mp_destroy(MemoryPool *mp);
 extern char *mp_strdup(MemoryPool *mp, const char *str);
data/ext/multimapper.c
CHANGED
@@ -121,7 +121,7 @@ MultiMapper *mulmap_new()
     return self;
 }
 
-static
+static INLINE void mulmap_free_dstates(MultiMapper *self)
 {
     if (self->d_size > 0) {
         int i;
@@ -151,7 +151,7 @@ void mulmap_add_mapping(MultiMapper *self, const char *pattern, const char *rep)
 }
 
 
-static
+static INLINE void mulmap_bv_set_states(BitVector *bv, int *states, int cnt)
 {
     int i;
     for (i = cnt - 1; i >= 0; i--) {
data/ext/q_fuzzy.c
CHANGED
@@ -11,7 +11,7 @@
  *
  ****************************************************************************/
 
-static
+static INLINE int fuzq_calculate_max_distance(FuzzyQuery *fuzq, int m)
 {
     return (int)((1.0 - fuzq->min_sim) * (MIN(fuzq->text_len, m) + fuzq->pre_len));
 }
@@ -24,7 +24,7 @@ static void fuzq_initialize_max_distances(FuzzyQuery *fuzq)
     }
 }
 
-static
+static INLINE int fuzq_get_max_distance(FuzzyQuery *fuzq, int m)
 {
     return (m < TYPICAL_LONGEST_WORD) ? fuzq->max_distances[m]
                                       : fuzq_calculate_max_distance(fuzq, m);
data/ext/q_multi_term.c
CHANGED
@@ -236,7 +236,7 @@ static bool multi_tsc_advance_to(Scorer *self, int target_doc_num)
     return (pq_top(tdew_pq) == NULL) ? false : true;
 }
 
-static
+static INLINE bool multi_tsc_skip_to(Scorer *self, int target_doc_num)
 {
     return multi_tsc_advance_to(self, target_doc_num) && multi_tsc_next(self);
 }
@@ -661,7 +661,7 @@ Query *multi_tq_new(const char *field)
 
 void multi_tq_add_term_boost(Query *self, const char *term, float boost)
 {
-    if (boost > MTQ(self)->min_boost) {
+    if (boost > MTQ(self)->min_boost && term && term[0]) {
         BoostedTerm *bt = boosted_term_new(term, boost);
         PriorityQueue *bt_pq = MTQ(self)->boosted_terms;
         pq_insert(bt_pq, bt);
data/ext/q_parser.c
CHANGED
@@ -147,7 +147,7 @@ typedef union YYSTYPE
     Phrase *phrase;
     char *str;
 }
-/* Line
+/* Line 187 of yacc.c. */
 #line 152 "y.tab.c"
 	YYSTYPE;
 # define yystype YYSTYPE /* obsolescent; will be withdrawn */
@@ -2061,12 +2061,14 @@ get_word_done:
      * just checks for all of them. */
     *bufp = '\0';
     len = (int)(bufp - buf);
-    if (
-    if (
-
-
+    if (qp->use_keywords) {
+        if (len == 3) {
+            if (buf[0] == 'A' && buf[1] == 'N' && buf[2] == 'D') return AND;
+            if (buf[0] == 'N' && buf[1] == 'O' && buf[2] == 'T') return NOT;
+            if (buf[0] == 'R' && buf[1] == 'E' && buf[2] == 'Q') return REQ;
+        }
+        if (len == 2 && buf[0] == 'O' && buf[1] == 'R') return OR;
     }
-    if (len == 2 && buf[0] == 'O' && buf[1] == 'R') return OR;
 
     /* found a word so return it. */
     lvalp->str = buf;
@@ -2489,9 +2491,37 @@ static Query *get_phrase_query(QParser *qp, char *field,
     }
     else {
         int i;
-
+        int term_cnt = 0;
+        Token *token;
+        char *last_word = NULL;
+
         for (i = 0; i < word_count; i++) {
-
+            token = ts_next(get_cached_ts(qp, field, words[i]));
+            free(words[i]);
+            if (token) {
+                last_word = words[i] = estrdup(token->text);
+                ++term_cnt;
+            }
+            else {
+                words[i] = estrdup("");
+            }
+        }
+
+        switch (term_cnt) {
+            case 0:
+                q = bq_new(false);
+                break;
+            case 1:
+                q = tq_new(field, last_word);
+                break;
+            default:
+                q = multi_tq_new_conf(field, term_cnt, 0.0);
+                for (i = 0; i < word_count; i++) {
+                    if (words[i][0]) {
+                        multi_tq_add_term(q, words[i]);
+                    }
+                }
+                break;
         }
     }
 }
@@ -2620,6 +2650,7 @@ QParser *qp_new(HashSet *all_fields, HashSet *def_fields,
     self->max_clauses = QP_MAX_CLAUSES;
     self->handle_parse_errors = false;
     self->allow_any_fields = false;
+    self->use_keywords = true;
     self->def_slop = 0;
     self->fields_buf = hs_new_str(NULL);
     self->all_fields = all_fields;
data/ext/q_range.c
CHANGED
@@ -269,13 +269,44 @@ static void rq_destroy(Query *self)
     q_destroy_i(self);
 }
 
+static MatchVector *rq_get_matchv_i(Query *self, MatchVector *mv,
+                                    TermVector *tv)
+{
+    Range *range = RQ(((ConstantScoreQuery *)self)->original)->range;
+    if (strcmp(tv->field, range->field) == 0) {
+        int i, j;
+        char *upper_text = range->upper_term;
+        char *lower_text = range->lower_term;
+        int upper_limit = range->include_upper ? 1 : 0;
+        int lower_limit = range->include_lower ? 1 : 0;
+
+        for (i = tv->term_cnt - 1; i >= 0; i--) {
+            TVTerm *tv_term = &(tv->terms[i]);
+            char *text = tv_term->text;
+            if ((!upper_text || strcmp(text, upper_text) < upper_limit) &&
+                (!lower_text || strcmp(lower_text, text) < lower_limit)) {
+
+                for (j = 0; j < tv_term->freq; j++) {
+                    int pos = tv_term->positions[j];
+                    matchv_add(mv, pos, pos);
+                }
+            }
+        }
+    }
+    return mv;
+}
+
 static Query *rq_rewrite(Query *self, IndexReader *ir)
 {
+    Query *csq;
     Range *r = RQ(self)->range;
     Filter *filter = rfilt_new(r->field, r->lower_term, r->upper_term,
                                r->include_lower, r->include_upper);
     (void)ir;
-
+    csq = csq_new_nr(filter);
+    ((ConstantScoreQuery *)csq)->original = self;
+    csq->get_matchv_i = &rq_get_matchv_i;
+    return (Query *)csq;
 }
 
 static unsigned long rq_hash(Query *self)
data/ext/r_analysis.c
CHANGED
@@ -150,7 +150,7 @@ frt_set_token(Token *tk, VALUE rt)
 * values as needed. For example, if you have a stop word filter you will be
 * skipping tokens. Let's say you have the stop words "the" and "and" and you
 * parse the title "The Old Man and the Sea". The terms "Old", "Man" and
-* "Sea" will have the position
+* "Sea" will have the position increments 2, 1 and 3 respectively.
 *
 * Another reason you might want to vary the position increment is if you are
 * adding synonyms to the index. For example let's say you have the synonym
@@ -424,7 +424,7 @@ get_rb_token_stream(TokenStream *ts)
     return rts;
 }
 
-static
+static INLINE VALUE
 get_wrapped_ts(VALUE self, VALUE rstr, TokenStream *ts)
 {
     StringValue(rstr);
@@ -811,7 +811,7 @@ frt_a_letter_tokenizer_init(VALUE self, VALUE rstr)
 *    LetterTokenizer.new(lower = true) -> tokenizer
 *
 * Create a new LetterTokenizer which optionally downcases tokens. Downcasing
-* is done according the
+* is done according the current locale.
 *
 * lower:: set to false if you don't wish to downcase tokens
 */
@@ -842,7 +842,7 @@ frt_a_whitespace_tokenizer_init(VALUE self, VALUE rstr)
 *    WhiteSpaceTokenizer.new(lower = true) -> tokenizer
 *
 * Create a new WhiteSpaceTokenizer which optionally downcases tokens.
-* Downcasing is done according the
+* Downcasing is done according the current locale.
 *
 * lower:: set to false if you don't wish to downcase tokens
 */
@@ -873,7 +873,7 @@ frt_a_standard_tokenizer_init(VALUE self, VALUE rstr)
 *    StandardTokenizer.new(lower = true) -> tokenizer
 *
 * Create a new StandardTokenizer which optionally downcases tokens.
-* Downcasing is done according the
+* Downcasing is done according the current locale.
 *
 * lower:: set to false if you don't wish to downcase tokens
 */
@@ -896,7 +896,7 @@ frt_standard_tokenizer_init(VALUE self, VALUE rstr)
 *    AsciiLowerCaseFilter.new(token_stream) -> token_stream
 *
 * Create an AsciiLowerCaseFilter which normalizes a token's text to
-* lowercase but only for
+* lowercase but only for ASCII characters. For other characters use
 * LowerCaseFilter.
 */
 static VALUE
@@ -990,7 +990,7 @@ frt_stop_filter_init(int argc, VALUE *argv, VALUE self)
     return self;
 }
 
-static
+static INLINE void frt_add_mapping_i(TokenStream *mf, VALUE from, char *to)
 {
     switch (TYPE(from)) {
         case T_STRING:
@@ -1046,8 +1046,8 @@ static int frt_add_mappings_i(VALUE key, VALUE value, VALUE arg)
 *    MappingFilter.new(token_stream, mapping) -> token_stream
 *
 * Create an MappingFilter which maps strings in tokens. This is usually used
-* to map UTF-8 characters to
-* better
+* to map UTF-8 characters to ASCII characters for easier searching and
+* better search recall. The mapping is compiled into a Deterministic Finite
 * Automata so it is super fast. This Filter can therefor be used for
 * indexing very large datasets. Currently regular expressions are not
 * supported. If you are really interested in the feature, please contact me
@@ -1087,7 +1087,7 @@ frt_mapping_filter_init(VALUE self, VALUE rsub_ts, VALUE mapping)
 *                   algorithm="english",
 *                   encoding="UTF-8") -> token_stream
 *
-* Create an StemFilter which uses a snowball stemmer (
+* Create an StemFilter which uses a snowball stemmer (thank you Martin
 * Porter) to stem words. You can optionally specify the algorithm (default:
 * "english") and encoding (default: "UTF-8").
 *
@@ -1193,6 +1193,16 @@ frt_get_analyzer(Analyzer *a)
     return self;
 }
 
+INLINE VALUE
+get_rb_ts_from_a(Analyzer *a, VALUE rfield, VALUE rstring)
+{
+    TokenStream *ts = a_get_ts(a, frt_field(rfield), rs2s(rstring));
+
+    /* Make sure that there is no entry already */
+    object_set(&ts->text, rstring);
+    return get_rb_token_stream(ts);
+}
+
 /*
 * call-seq:
 *    analyzer.token_stream(field_name, input) -> token_stream
@@ -1209,17 +1219,12 @@ frt_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
 {
     /* NOTE: Any changes made to this method may also need to be applied to
      * frt_re_analyzer_token_stream */
-    TokenStream *ts;
     Analyzer *a;
     GET_A(a, self);
 
     StringValue(rstring);
 
-
-
-    /* Make sure that there is no entry already */
-    object_set(&ts->text, rstring);
-    return get_rb_token_stream(ts);
+    return get_rb_ts_from_a(a, rfield, rstring);
 }
 
 #define GET_LOWER(dflt) \
@@ -1234,7 +1239,7 @@ lower = (argc ? RTEST(rlower) : dflt)
 *
 * Create a new AsciiWhiteSpaceAnalyzer which downcases tokens by default
 * but can optionally leave case as is. Lowercasing will only be done to
-*
+* ASCII characters.
 *
 * lower:: set to false if you don't want the field's tokens to be downcased
 */
@@ -1279,7 +1284,7 @@ frt_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
 *
 * Create a new AsciiWhiteSpaceAnalyzer which downcases tokens by default
 * but can optionally leave case as is. Lowercasing will only be done to
-*
+* ASCII characters.
 *
 * lower:: set to false if you don't want the field's tokens to be downcased
 */
@@ -1457,6 +1462,37 @@ frt_per_field_analyzer_add_field(VALUE self, VALUE rfield, VALUE ranalyzer)
     return self;
 }
 
+/*
+ * call-seq:
+ *    analyzer.token_stream(field_name, input) -> token_stream
+ *
+ * Create a new TokenStream to tokenize +input+. The TokenStream created will
+ * also depend on the +field_name+ in the case of the PerFieldAnalyzer.
+ *
+ * field_name:: name of the field to be tokenized
+ * input::      data from the field to be tokenized
+ */
+static VALUE
+frt_pfa_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
+{
+    Analyzer *pfa, *a;
+    char *field = frt_field(rfield);
+    GET_A(pfa, self);
+
+    StringValue(rstring);
+    a = (Analyzer *)h_get(PFA(pfa)->dict, field);
+    if (a == NULL) {
+        a = PFA(pfa)->default_a;
+    }
+    if (a->get_ts == cwa_get_ts) {
+        return rb_funcall(CWA(a)->ranalyzer, id_token_stream, 2,
+                          ID2SYM(rb_intern(field)), rb_str_new2(rs2s(rstring)));
+    }
+    else {
+        return get_rb_ts_from_a(a, rfield, rstring);
+    }
+}
+
 /*** RegExpAnalyzer ***/
 
 static void
@@ -1585,7 +1621,7 @@ static VALUE frt_set_locale(VALUE self, VALUE locale)
 *
 * == Summary
 *
-* A Token is an
+* A Token is an occurrence of a term from the text of a field. It consists
 * of a term's text and the start and end offset of the term in the text of
 * the field;
 *
@@ -1648,7 +1684,7 @@ static void Init_TokenStream(void)
 /*
 * Document-class: Ferret::Analysis::AsciiLetterTokenizer
 *
-* A LetterTokenizer is a tokenizer that divides text at non-
+* A LetterTokenizer is a tokenizer that divides text at non-ASCII letters.
 * That is to say, it defines tokens as maximal strings of adjacent letters,
 * as defined by the regular expression _/[A-Za-z]+/_.
 *
@@ -1781,7 +1817,7 @@ static void Init_StandardTokenizer(void)
 * Document-class: Ferret::Analysis::RegExpTokenizer
 *
 * A tokenizer that recognizes tokens based on a regular expression passed to
-* the
+* the constructor. Most possible tokenizers can be created using this class.
 *
 * === Example
 *
@@ -1817,7 +1853,7 @@ static void Init_RegExpTokenizer(void)
 * Document-class: Ferret::Analysis::AsciiLowerCaseFilter
 *
 * AsciiLowerCaseFilter normalizes a token's text to lowercase but only for
-*
+* ASCII characters. For other characters use LowerCaseFilter.
 *
 * === Example
 *
@@ -1881,7 +1917,7 @@ static void Init_HyphenFilter(void)
 * Document-class: Ferret::Analysis::MappingFilter
 *
 * A MappingFilter maps strings in tokens. This is usually used to map UTF-8
-* characters to
+* characters to ASCII characters for easier searching and better search
 * recall. The mapping is compiled into a Deterministic Finite Automata so it
 * is super fast. This Filter can therefor be used for indexing very large
 * datasets. Currently regular expressions are not supported. If you are
@@ -2020,7 +2056,7 @@ static void Init_StemFilter(void)
 * a policy for extracting index terms from text.
 *
 * Typical implementations first build a Tokenizer, which breaks the stream
-* of characters from the Reader into raw Tokens. One or more
+* of characters from the Reader into raw Tokens. One or more TokenFilters
 * may then be applied to the output of the Tokenizer.
 *
 * The default Analyzer just creates a LowerCaseTokenizer which converts
@@ -2057,7 +2093,7 @@ static void Init_Analyzer(void)
 * == Summary
 *
 * An AsciiLetterAnalyzer creates a TokenStream that splits the input up into
-* maximal strings of
+* maximal strings of ASCII characters. If implemented in Ruby it would look
 * like;
 *
 *   class AsciiLetterAnalyzer
@@ -2075,7 +2111,7 @@ static void Init_Analyzer(void)
 *   end
 *
 * As you can see it makes use of the AsciiLetterTokenizer and
-* AsciiLowerCaseFilter. Note that this tokenizer won't recognize non-
+* AsciiLowerCaseFilter. Note that this tokenizer won't recognize non-ASCII
 * characters so you should use the LetterAnalyzer is you want to analyze
 * multi-byte data like "UTF-8".
 */
@@ -2194,7 +2230,7 @@ static void Init_WhiteSpaceAnalyzer(void)
 * == Summary
 *
 * The AsciiStandardAnalyzer is the most advanced of the available
-*
+* ASCII-analyzers. If it were implemented in Ruby it would look like this;
 *
 *   class AsciiStandardAnalyzer
 *     def initialize(stop_words = FULL_ENGLISH_STOP_WORDS, lower = true)
@@ -2212,7 +2248,7 @@ static void Init_WhiteSpaceAnalyzer(void)
 *
 * As you can see it makes use of the AsciiStandardTokenizer and you can also
 * add your own list of stop-words if you wish. Note that this tokenizer
-* won't recognize non-
+* won't recognize non-ASCII characters so you should use the
 * StandardAnalyzer is you want to analyze multi-byte data like "UTF-8".
 */
 static void Init_AsciiStandardAnalyzer(void)
@@ -2292,6 +2328,8 @@ static void Init_PerFieldAnalyzer(void)
                      frt_per_field_analyzer_add_field, 2);
     rb_define_method(cPerFieldAnalyzer, "[]=",
                      frt_per_field_analyzer_add_field, 2);
+    rb_define_method(cPerFieldAnalyzer, "token_stream",
+                     frt_pfa_analyzer_token_stream, 2);
 }
 
 /*
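Note: the new frt_pfa_analyzer_token_stream registered above gives PerFieldAnalyzer its own token_stream method that dispatches on the field name instead of always using the default analyzer. A small usage sketch under that assumption (field names and sample text are illustrative):

  require 'ferret'
  include Ferret::Analysis

  pfa = PerFieldAnalyzer.new(StandardAnalyzer.new)
  pfa[:tag] = WhiteSpaceAnalyzer.new(false)   # the []= method defined above

  # token_stream now picks the analyzer registered for :tag, so this text is
  # split on whitespace only; other fields fall back to the StandardAnalyzer.
  ts = pfa.token_stream(:tag, "Ruby-Lang Rocks")
  while token = ts.next
    puts token.text
  end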
|