ferret 0.11.4 → 0.11.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +1 -0
- data/TUTORIAL +3 -3
- data/ext/analysis.c +12 -9
- data/ext/array.c +10 -10
- data/ext/array.h +8 -1
- data/ext/bitvector.c +2 -2
- data/ext/except.c +1 -1
- data/ext/ferret.c +2 -2
- data/ext/ferret.h +1 -1
- data/ext/fs_store.c +13 -2
- data/ext/global.c +4 -4
- data/ext/global.h +6 -0
- data/ext/hash.c +1 -1
- data/ext/helper.c +1 -1
- data/ext/helper.h +1 -1
- data/ext/index.c +48 -22
- data/ext/index.h +17 -16
- data/ext/mempool.c +4 -1
- data/ext/mempool.h +1 -1
- data/ext/multimapper.c +2 -2
- data/ext/q_fuzzy.c +2 -2
- data/ext/q_multi_term.c +2 -2
- data/ext/q_parser.c +39 -8
- data/ext/q_range.c +32 -1
- data/ext/r_analysis.c +66 -28
- data/ext/r_index.c +18 -19
- data/ext/r_qparser.c +21 -6
- data/ext/r_search.c +74 -49
- data/ext/r_store.c +1 -1
- data/ext/r_utils.c +17 -17
- data/ext/search.c +10 -5
- data/ext/search.h +3 -1
- data/ext/sort.c +2 -2
- data/ext/stopwords.c +23 -34
- data/ext/store.c +9 -9
- data/ext/store.h +5 -4
- data/lib/ferret/document.rb +2 -2
- data/lib/ferret/field_infos.rb +37 -35
- data/lib/ferret/index.rb +16 -6
- data/lib/ferret/number_tools.rb +2 -2
- data/lib/ferret_version.rb +1 -1
- data/test/unit/analysis/tc_token_stream.rb +40 -0
- data/test/unit/index/tc_index.rb +64 -101
- data/test/unit/index/tc_index_reader.rb +13 -0
- data/test/unit/largefile/tc_largefile.rb +46 -0
- data/test/unit/query_parser/tc_query_parser.rb +17 -1
- data/test/unit/search/tc_multiple_search_requests.rb +58 -0
- data/test/unit/search/tm_searcher.rb +27 -1
- data/test/unit/ts_largefile.rb +4 -0
- metadata +147 -144
    
data/ext/index.h
CHANGED

@@ -65,24 +65,24 @@ extern HashTable *co_hash_create();
 
 enum StoreValues
 {
-    STORE_NO = 0,
-    STORE_YES = 1,
+    STORE_NO = 0,
+    STORE_YES = 1,
     STORE_COMPRESS = 2
 };
 
 enum IndexValues
 {
-    INDEX_NO = 0,
-    INDEX_UNTOKENIZED = 1,
-    INDEX_YES = 3,
-    INDEX_UNTOKENIZED_OMIT_NORMS = 5,
-    INDEX_YES_OMIT_NORMS = 7
+    INDEX_NO = 0,
+    INDEX_UNTOKENIZED = 1,
+    INDEX_YES = 3,
+    INDEX_UNTOKENIZED_OMIT_NORMS = 5,
+    INDEX_YES_OMIT_NORMS = 7
 };
 
 enum TermVectorValues
 {
-    TERM_VECTOR_NO = 0,
-    TERM_VECTOR_YES = 1,
+    TERM_VECTOR_NO = 0,
+    TERM_VECTOR_YES = 1,
     TERM_VECTOR_WITH_POSITIONS = 3,
     TERM_VECTOR_WITH_OFFSETS = 5,
     TERM_VECTOR_WITH_POSITIONS_OFFSETS = 7
@@ -374,7 +374,7 @@ typedef struct TermInfosWriter
 
 extern TermInfosWriter *tiw_open(Store *store,
                                  const char *segment,
-                                 int index_interval,
+                                 int index_interval,
                                  int skip_interval);
 extern void tiw_start_field(TermInfosWriter *tiw, int field_num);
 extern void tiw_add(TermInfosWriter *tiw,
@@ -456,11 +456,11 @@ extern TermDocEnum *mtdpe_new(IndexReader *ir, int field_num, char **terms,
 
 typedef struct Offset
 {
-
-
+    off_t start;
+    off_t end;
 } Offset;
 
-extern Offset *offset_new(
+extern Offset *offset_new(off_t start, off_t end);
 
 /****************************************************************************
  *
@@ -488,7 +488,7 @@ typedef struct Posting
     struct Posting *next;
 } Posting;
 
-extern
+extern Posting *p_new(MemoryPool *mp, int doc_num, int pos);
 
 /****************************************************************************
 *
@@ -617,7 +617,7 @@ extern TermVector *tvr_get_field_tv(TermVectorsReader *tvr,
 /* * * LazyDocField * * */
 typedef struct LazyDocFieldData
 {
-
+    off_t start;
     int   length;
     char *text;
 } LazyDocFieldData;
@@ -706,7 +706,7 @@ extern void fw_write_tv_index(FieldsWriter *fw);
  * A utility class (used by both IndexReader and IndexWriter) to keep track of
  * files that need to be deleted because they are no longer referenced by the
  * index.
- *
+ *
  ****************************************************************************/
 
 struct Deleter
@@ -760,6 +760,7 @@ struct IndexReader
     void          (*delete_doc_i)(IndexReader *ir, int doc_num);
     void          (*undelete_all_i)(IndexReader *ir);
     void          (*set_deleter_i)(IndexReader *ir, Deleter *dlr);
+    bool          (*is_latest_i)(IndexReader *ir);
     void          (*commit_i)(IndexReader *ir);
     void          (*close_i)(IndexReader *ir);
     int           ref_cnt;
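
Note on the IndexValues enum above: the values appear to pack flags into the low bits (bit 0: indexed, bit 1: tokenized, bit 2: norms omitted), which would explain why INDEX_YES is 3 and the *_OMIT_NORMS variants are 5 and 7; TermVectorValues looks analogous (positions and offsets as bits 1 and 2). A minimal standalone sketch under that assumption (describe() is illustrative only, not part of Ferret's API):

    #include <stdio.h>

    /* Values copied from data/ext/index.h (0.11.5). */
    enum IndexValues
    {
        INDEX_NO = 0,
        INDEX_UNTOKENIZED = 1,
        INDEX_YES = 3,
        INDEX_UNTOKENIZED_OMIT_NORMS = 5,
        INDEX_YES_OMIT_NORMS = 7
    };

    /* Assumed decoding of the bit pattern: 1 = indexed, 2 = tokenized,
     * 4 = norms omitted. */
    static void describe(int v)
    {
        printf("%d: indexed=%d tokenized=%d omit_norms=%d\n",
               v, (v & 1) != 0, (v & 2) != 0, (v & 4) != 0);
    }

    int main(void)
    {
        describe(INDEX_NO);                     /* 0 -> nothing set             */
        describe(INDEX_YES);                    /* 3 -> indexed, tokenized      */
        describe(INDEX_UNTOKENIZED_OMIT_NORMS); /* 5 -> indexed, norms omitted  */
        describe(INDEX_YES_OMIT_NORMS);         /* 7 -> all three bits set      */
        return 0;
    }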
    
data/ext/mempool.c
CHANGED

@@ -21,10 +21,13 @@ MemoryPool *mp_new()
     return mp_new_capa(MP_BUF_SIZE, MP_INIT_CAPA);
 }
 
-
+INLINE void *mp_alloc(MemoryPool *mp, int size)
 {
     char *p;
     p = mp->curr_buffer + mp->pointer;
+#if defined POSH_OS_SOLARIS || defined POSH_OS_SUNOS
+    size = (((size - 1) >> 3) + 1) << 3;
+#endif
     mp->pointer += size;
 
     if (mp->pointer > mp->chunk_size) {
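
Note on the mp_alloc() change above: on Solaris/SunOS the requested size is first rounded up to the next multiple of 8 before the pool pointer is advanced, presumably to keep pool allocations 8-byte aligned on platforms where misaligned access faults. A small standalone demonstration of that rounding expression (round_up8() is an illustrative helper, not Ferret code):

    #include <assert.h>
    #include <stdio.h>

    /* Same expression as the new Solaris/SunOS branch in mp_alloc():
     * round size up to the next multiple of 8. */
    static int round_up8(int size)
    {
        return (((size - 1) >> 3) + 1) << 3;
    }

    int main(void)
    {
        int i;
        for (i = 1; i <= 16; i++) {
            printf("%2d -> %2d\n", i, round_up8(i));
        }
        assert(round_up8(1) == 8 && round_up8(8) == 8 && round_up8(9) == 16);
        return 0;
    }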
    
data/ext/mempool.h
CHANGED

@@ -16,7 +16,7 @@ typedef struct MemoryPool {
 
 extern MemoryPool *mp_new();
 extern MemoryPool *mp_new_capa(int chunk_size, int init_capa);
-extern
+extern INLINE void *mp_alloc(MemoryPool *mp, int size);
 extern void mp_reset(MemoryPool *mp);
 extern void mp_destroy(MemoryPool *mp);
 extern char *mp_strdup(MemoryPool *mp, const char *str);
    
data/ext/multimapper.c
CHANGED

@@ -121,7 +121,7 @@ MultiMapper *mulmap_new()
     return self;
 }
 
-static
+static INLINE void mulmap_free_dstates(MultiMapper *self)
 {
     if (self->d_size > 0) {
         int i;
@@ -151,7 +151,7 @@ void mulmap_add_mapping(MultiMapper *self, const char *pattern, const char *rep)
 }
 
 
-static
+static INLINE void mulmap_bv_set_states(BitVector *bv, int *states, int cnt)
 {
     int i;
     for (i = cnt - 1; i >= 0; i--) {
    
data/ext/q_fuzzy.c
CHANGED

@@ -11,7 +11,7 @@
  *
  ****************************************************************************/
 
-static
+static INLINE int fuzq_calculate_max_distance(FuzzyQuery *fuzq, int m)
 {
     return (int)((1.0 - fuzq->min_sim) * (MIN(fuzq->text_len, m) + fuzq->pre_len));
 }
@@ -24,7 +24,7 @@ static void fuzq_initialize_max_distances(FuzzyQuery *fuzq)
     }
 }
 
-static
+static INLINE int fuzq_get_max_distance(FuzzyQuery *fuzq, int m)
 {
     return (m < TYPICAL_LONGEST_WORD) ? fuzq->max_distances[m]
         : fuzq_calculate_max_distance(fuzq, m);
    
data/ext/q_multi_term.c
CHANGED

@@ -236,7 +236,7 @@ static bool multi_tsc_advance_to(Scorer *self, int target_doc_num)
     return (pq_top(tdew_pq) == NULL) ? false : true;
 }
 
-static
+static INLINE bool multi_tsc_skip_to(Scorer *self, int target_doc_num)
 {
     return multi_tsc_advance_to(self, target_doc_num) && multi_tsc_next(self);
 }
@@ -661,7 +661,7 @@ Query *multi_tq_new(const char *field)
 
 void multi_tq_add_term_boost(Query *self, const char *term, float boost)
 {
-    if (boost > MTQ(self)->min_boost) {
+    if (boost > MTQ(self)->min_boost && term && term[0]) {
         BoostedTerm *bt = boosted_term_new(term, boost);
         PriorityQueue *bt_pq = MTQ(self)->boosted_terms;
         pq_insert(bt_pq, bt);
    
data/ext/q_parser.c
CHANGED

@@ -147,7 +147,7 @@ typedef union YYSTYPE
     Phrase *phrase;
     char *str;
 }
-/* Line
+/* Line 187 of yacc.c.  */
 #line 152 "y.tab.c"
 	YYSTYPE;
 # define yystype YYSTYPE /* obsolescent; will be withdrawn */
@@ -2061,12 +2061,14 @@ get_word_done:
      * just checks for all of them. */
     *bufp = '\0';
     len = (int)(bufp - buf);
-    if (
-        if (
-
-
+    if (qp->use_keywords) {
+        if (len == 3) {
+            if (buf[0] == 'A' && buf[1] == 'N' && buf[2] == 'D') return AND;
+            if (buf[0] == 'N' && buf[1] == 'O' && buf[2] == 'T') return NOT;
+            if (buf[0] == 'R' && buf[1] == 'E' && buf[2] == 'Q') return REQ;
+        }
+        if (len == 2 && buf[0] == 'O' && buf[1] == 'R') return OR;
     }
-    if (len == 2 && buf[0] == 'O' && buf[1] == 'R') return OR;
 
     /* found a word so return it. */
     lvalp->str = buf;
@@ -2489,9 +2491,37 @@ static Query *get_phrase_query(QParser *qp, char *field,
         }
         else {
             int i;
-
+            int term_cnt = 0;
+            Token *token;
+            char *last_word = NULL;
+
             for (i = 0; i < word_count; i++) {
-
+                token = ts_next(get_cached_ts(qp, field, words[i]));
+                free(words[i]);
+                if (token) {
+                    last_word = words[i] = estrdup(token->text);
+                    ++term_cnt;
+                }
+                else {
+                    words[i] = estrdup("");
+                }
+            }
+
+            switch (term_cnt) {
+                case 0:
+                    q = bq_new(false);
+                    break;
+                case 1:
+                    q = tq_new(field, last_word);
+                    break;
+                default:
+                    q = multi_tq_new_conf(field, term_cnt, 0.0);
+                    for (i = 0; i < word_count; i++) {
+                        if (words[i][0]) {
+                            multi_tq_add_term(q, words[i]);
+                        }
+                    }
+                    break;
             }
         }
     }
@@ -2620,6 +2650,7 @@ QParser *qp_new(HashSet *all_fields, HashSet *def_fields,
     self->max_clauses = QP_MAX_CLAUSES;
     self->handle_parse_errors = false;
     self->allow_any_fields = false;
+    self->use_keywords = true;
     self->def_slop = 0;
     self->fields_buf = hs_new_str(NULL);
     self->all_fields = all_fields;
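
Note on the q_parser.c changes above: keyword recognition (AND, OR, NOT, REQ) is now gated behind the parser's new use_keywords flag (defaulting to true in qp_new()), and a phrase whose analyzed words collapse to zero or one real terms now degrades to an empty boolean query or a single term query instead of a multi-term query. A standalone sketch of the gated keyword check (the TOK_* constants are placeholders for the real yacc token values):

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    /* Placeholder token codes; the real AND/OR/NOT/REQ values come from the
     * generated yacc parser. */
    enum { TOK_WORD, TOK_AND, TOK_OR, TOK_NOT, TOK_REQ };

    /* Mirrors the shape of the new check in the lexer: keywords are only
     * special when use_keywords is set, otherwise they fall through as words. */
    static int classify(const char *buf, bool use_keywords)
    {
        int len = (int)strlen(buf);
        if (use_keywords) {
            if (len == 3) {
                if (buf[0] == 'A' && buf[1] == 'N' && buf[2] == 'D') return TOK_AND;
                if (buf[0] == 'N' && buf[1] == 'O' && buf[2] == 'T') return TOK_NOT;
                if (buf[0] == 'R' && buf[1] == 'E' && buf[2] == 'Q') return TOK_REQ;
            }
            if (len == 2 && buf[0] == 'O' && buf[1] == 'R') return TOK_OR;
        }
        return TOK_WORD;
    }

    int main(void)
    {
        printf("%d %d\n", classify("AND", true), classify("AND", false)); /* 1 0 */
        printf("%d %d\n", classify("REQ", true), classify("req", true));  /* 4 0 */
        return 0;
    }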
    
data/ext/q_range.c
CHANGED

@@ -269,13 +269,44 @@ static void rq_destroy(Query *self)
     q_destroy_i(self);
 }
 
+static MatchVector *rq_get_matchv_i(Query *self, MatchVector *mv,
+                                    TermVector *tv)
+{
+    Range *range = RQ(((ConstantScoreQuery *)self)->original)->range;
+    if (strcmp(tv->field, range->field) == 0) {
+        int i, j;
+        char *upper_text = range->upper_term;
+        char *lower_text = range->lower_term;
+        int upper_limit = range->include_upper ? 1 : 0;
+        int lower_limit = range->include_lower ? 1 : 0;
+
+        for (i = tv->term_cnt - 1; i >= 0; i--) {
+            TVTerm *tv_term = &(tv->terms[i]);
+            char *text = tv_term->text;
+            if ((!upper_text || strcmp(text, upper_text) < upper_limit) &&
+                (!lower_text || strcmp(lower_text, text) < lower_limit)) {
+
+                for (j = 0; j < tv_term->freq; j++) {
+                    int pos = tv_term->positions[j];
+                    matchv_add(mv, pos, pos);
+                }
+            }
+        }
+    }
+    return mv;
+}
+
 static Query *rq_rewrite(Query *self, IndexReader *ir)
 {
+    Query *csq;
     Range *r = RQ(self)->range;
     Filter *filter = rfilt_new(r->field, r->lower_term, r->upper_term,
                                r->include_lower, r->include_upper);
     (void)ir;
-
+    csq = csq_new_nr(filter);
+    ((ConstantScoreQuery *)csq)->original = self;
+    csq->get_matchv_i = &rq_get_matchv_i;
+    return (Query *)csq;
 }
 
 static unsigned long rq_hash(Query *self)
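
Note on the new rq_get_matchv_i() above (term highlighting for rewritten range queries): whether a term falls inside the range is decided with strcmp(text, bound) < limit, where limit is 1 for an inclusive bound (so equality passes) and 0 for an exclusive one, and a NULL bound means the range is open on that side. A self-contained demonstration of that comparison (in_range() is illustrative, not Ferret's function):

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    /* Same comparison shape as rq_get_matchv_i(): a NULL bound means
     * "unbounded"; limit is 1 when the bound is inclusive, 0 when exclusive. */
    static bool in_range(const char *text,
                         const char *lower, int lower_limit,
                         const char *upper, int upper_limit)
    {
        return (!upper || strcmp(text, upper) < upper_limit) &&
               (!lower || strcmp(lower, text) < lower_limit);
    }

    int main(void)
    {
        /* [aaa, ccc] inclusive on both ends */
        printf("%d\n", in_range("ccc", "aaa", 1, "ccc", 1)); /* 1 */
        /* [aaa, ccc) exclusive upper bound */
        printf("%d\n", in_range("ccc", "aaa", 1, "ccc", 0)); /* 0 */
        return 0;
    }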
    
data/ext/r_analysis.c
CHANGED

@@ -150,7 +150,7 @@ frt_set_token(Token *tk, VALUE rt)
  *  values as needed.  For example, if you have a stop word filter you will be
  *  skipping tokens. Let's say you have the stop words "the" and "and" and you
  *  parse the title "The Old Man and the Sea". The terms "Old", "Man" and
- *  "Sea" will have the position
+ *  "Sea" will have the position increments 2, 1 and 3 respectively.
  *
  *  Another reason you might want to vary the position increment is if you are
  *  adding synonyms to the index. For example let's say you have the synonym
@@ -424,7 +424,7 @@ get_rb_token_stream(TokenStream *ts)
     return rts;
 }
 
-static
+static INLINE VALUE
 get_wrapped_ts(VALUE self, VALUE rstr, TokenStream *ts)
 {
     StringValue(rstr);
@@ -811,7 +811,7 @@ frt_a_letter_tokenizer_init(VALUE self, VALUE rstr)
  *     LetterTokenizer.new(lower = true) -> tokenizer
  *
  *  Create a new LetterTokenizer which optionally downcases tokens. Downcasing
- *  is done according the
+ *  is done according the current locale.
  *
  *  lower:: set to false if you don't wish to downcase tokens
  */
@@ -842,7 +842,7 @@ frt_a_whitespace_tokenizer_init(VALUE self, VALUE rstr)
  *     WhiteSpaceTokenizer.new(lower = true) -> tokenizer
  *
  *  Create a new WhiteSpaceTokenizer which optionally downcases tokens.
- *  Downcasing is done according the
+ *  Downcasing is done according the current locale.
 *
 *  lower:: set to false if you don't wish to downcase tokens
 */
@@ -873,7 +873,7 @@ frt_a_standard_tokenizer_init(VALUE self, VALUE rstr)
  *     StandardTokenizer.new(lower = true) -> tokenizer
  *
  *  Create a new StandardTokenizer which optionally downcases tokens.
- *  Downcasing is done according the
+ *  Downcasing is done according the current locale.
 *
 *  lower:: set to false if you don't wish to downcase tokens
 */
@@ -896,7 +896,7 @@ frt_standard_tokenizer_init(VALUE self, VALUE rstr)
  *     AsciiLowerCaseFilter.new(token_stream) -> token_stream
  *
  *  Create an AsciiLowerCaseFilter which normalizes a token's text to
- *  lowercase but only for
+ *  lowercase but only for ASCII characters. For other characters use
  *  LowerCaseFilter.
  */
 static VALUE
@@ -990,7 +990,7 @@ frt_stop_filter_init(int argc, VALUE *argv, VALUE self)
     return self;
 }
 
-static
+static INLINE void frt_add_mapping_i(TokenStream *mf, VALUE from, char *to)
 {
     switch (TYPE(from)) {
         case T_STRING:
@@ -1046,8 +1046,8 @@ static int frt_add_mappings_i(VALUE key, VALUE value, VALUE arg)
  *     MappingFilter.new(token_stream, mapping) -> token_stream
  *
  *  Create an MappingFilter which maps strings in tokens. This is usually used
- *  to map UTF-8 characters to
- *  better
+ *  to map UTF-8 characters to ASCII characters for easier searching and
+ *  better search recall. The mapping is compiled into a Deterministic Finite
  *  Automata so it is super fast. This Filter can therefor be used for
  *  indexing very large datasets. Currently regular expressions are not
  *  supported. If you are really interested in the feature, please contact me
@@ -1087,7 +1087,7 @@ frt_mapping_filter_init(VALUE self, VALUE rsub_ts, VALUE mapping)
  *                    algorithm="english",
  *                    encoding="UTF-8") -> token_stream
  *
- *  Create an StemFilter which uses a snowball stemmer (
+ *  Create an StemFilter which uses a snowball stemmer (thank you Martin
  *  Porter) to stem words. You can optionally specify the algorithm (default:
  *  "english") and encoding (default: "UTF-8").
 *
@@ -1193,6 +1193,16 @@ frt_get_analyzer(Analyzer *a)
     return self;
 }
 
+INLINE VALUE
+get_rb_ts_from_a(Analyzer *a, VALUE rfield, VALUE rstring)
+{
+    TokenStream *ts = a_get_ts(a, frt_field(rfield), rs2s(rstring));
+
+    /* Make sure that there is no entry already */
+    object_set(&ts->text, rstring);
+    return get_rb_token_stream(ts);
+}
+
 /*
  *  call-seq:
  *     analyzer.token_stream(field_name, input) -> token_stream
@@ -1209,17 +1219,12 @@ frt_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
 {
     /* NOTE: Any changes made to this method may also need to be applied to
      * frt_re_analyzer_token_stream */
-    TokenStream *ts;
     Analyzer *a;
     GET_A(a, self);
 
     StringValue(rstring);
 
-
-
-    /* Make sure that there is no entry already */
-    object_set(&ts->text, rstring);
-    return get_rb_token_stream(ts);
+    return get_rb_ts_from_a(a, rfield, rstring);
 }
 
 #define GET_LOWER(dflt) \
@@ -1234,7 +1239,7 @@ lower = (argc ? RTEST(rlower) : dflt)
 *
 *  Create a new AsciiWhiteSpaceAnalyzer which downcases tokens by default
 *  but can optionally leave case as is. Lowercasing will only be done to
- *
+ *  ASCII characters.
 *
 *  lower:: set to false if you don't want the field's tokens to be downcased
 */
@@ -1279,7 +1284,7 @@ frt_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
 *
 *  Create a new AsciiWhiteSpaceAnalyzer which downcases tokens by default
 *  but can optionally leave case as is. Lowercasing will only be done to
- *
+ *  ASCII characters.
 *
 *  lower:: set to false if you don't want the field's tokens to be downcased
 */
@@ -1457,6 +1462,37 @@ frt_per_field_analyzer_add_field(VALUE self, VALUE rfield, VALUE ranalyzer)
     return self;
 }
 
+/*
+ *  call-seq:
+ *     analyzer.token_stream(field_name, input) -> token_stream
+ *
+ *  Create a new TokenStream to tokenize +input+. The TokenStream created will
+ *  also depend on the +field_name+ in the case of the PerFieldAnalyzer.
+ *
+ *  field_name:: name of the field to be tokenized
+ *  input::      data from the field to be tokenized
+ */
+static VALUE
+frt_pfa_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
+{
+    Analyzer *pfa, *a;
+    char *field = frt_field(rfield);
+    GET_A(pfa, self);
+
+    StringValue(rstring);
+    a = (Analyzer *)h_get(PFA(pfa)->dict, field);
+    if (a == NULL) {
+        a = PFA(pfa)->default_a;
+    }
+    if (a->get_ts == cwa_get_ts) {
+        return rb_funcall(CWA(a)->ranalyzer, id_token_stream, 2,
+                          ID2SYM(rb_intern(field)), rb_str_new2(rs2s(rstring)));
+    }
+    else {
+        return get_rb_ts_from_a(a, rfield, rstring);
+    }
+}
+
 /*** RegExpAnalyzer ***/
 
 static void
@@ -1585,7 +1621,7 @@ static VALUE frt_set_locale(VALUE self, VALUE locale)
 *
 *  == Summary
 *
- *  A Token is an
+ *  A Token is an occurrence of a term from the text of a field.  It consists
 *  of a term's text and the start and end offset of the term in the text of
 *  the field;
 *
@@ -1648,7 +1684,7 @@ static void Init_TokenStream(void)
 /*
 *  Document-class: Ferret::Analysis::AsciiLetterTokenizer
 *
- *  A LetterTokenizer is a tokenizer that divides text at non-
+ *  A LetterTokenizer is a tokenizer that divides text at non-ASCII letters.
 *  That is to say, it defines tokens as maximal strings of adjacent letters,
 *  as defined by the regular expression _/[A-Za-z]+/_.
 *
@@ -1781,7 +1817,7 @@ static void Init_StandardTokenizer(void)
 *  Document-class: Ferret::Analysis::RegExpTokenizer
 *
 *  A tokenizer that recognizes tokens based on a regular expression passed to
- *  the
+ *  the constructor. Most possible tokenizers can be created using this class.
 *
 *  === Example
 *
@@ -1817,7 +1853,7 @@ static void Init_RegExpTokenizer(void)
 *  Document-class: Ferret::Analysis::AsciiLowerCaseFilter
 *
 *  AsciiLowerCaseFilter normalizes a token's text to lowercase but only for
- *
+ *  ASCII characters. For other characters use LowerCaseFilter.
 *
 *  === Example
 *
@@ -1881,7 +1917,7 @@ static void Init_HyphenFilter(void)
 *  Document-class: Ferret::Analysis::MappingFilter
 *
 *  A MappingFilter maps strings in tokens. This is usually used to map UTF-8
- *  characters to
+ *  characters to ASCII characters for easier searching and better search
 *  recall. The mapping is compiled into a Deterministic Finite Automata so it
 *  is super fast. This Filter can therefor be used for indexing very large
 *  datasets. Currently regular expressions are not supported. If you are
@@ -2020,7 +2056,7 @@ static void Init_StemFilter(void)
 *  a policy for extracting index terms from text.
 *
 *  Typical implementations first build a Tokenizer, which breaks the stream
- *  of characters from the Reader into raw Tokens. One or more
+ *  of characters from the Reader into raw Tokens. One or more TokenFilters
 *  may then be applied to the output of the Tokenizer.
 *
 *  The default Analyzer just creates a LowerCaseTokenizer which converts
@@ -2057,7 +2093,7 @@ static void Init_Analyzer(void)
 *  == Summary
 *
 *  An AsciiLetterAnalyzer creates a TokenStream that splits the input up into
- *  maximal strings of
+ *  maximal strings of ASCII characters. If implemented in Ruby it would look
 *  like;
 *
 *    class AsciiLetterAnalyzer
@@ -2075,7 +2111,7 @@ static void Init_Analyzer(void)
 *    end
 *
 *  As you can see it makes use of the AsciiLetterTokenizer and
- *  AsciiLowerCaseFilter. Note that this tokenizer won't recognize non-
+ *  AsciiLowerCaseFilter. Note that this tokenizer won't recognize non-ASCII
 *  characters so you should use the LetterAnalyzer is you want to analyze
 *  multi-byte data like "UTF-8".
 */
@@ -2194,7 +2230,7 @@ static void Init_WhiteSpaceAnalyzer(void)
 *  == Summary
 *
 *  The AsciiStandardAnalyzer is the most advanced of the available
- *
+ *  ASCII-analyzers. If it were implemented in Ruby it would look like this;
 *
 *    class AsciiStandardAnalyzer
 *      def initialize(stop_words = FULL_ENGLISH_STOP_WORDS, lower = true)
@@ -2212,7 +2248,7 @@ static void Init_WhiteSpaceAnalyzer(void)
 *
 *  As you can see it makes use of the AsciiStandardTokenizer and you can also
 *  add your own list of stop-words if you wish. Note that this tokenizer
- *  won't recognize non-
+ *  won't recognize non-ASCII characters so you should use the
 *  StandardAnalyzer is you want to analyze multi-byte data like "UTF-8".
 */
 static void Init_AsciiStandardAnalyzer(void)
@@ -2292,6 +2328,8 @@ static void Init_PerFieldAnalyzer(void)
                      frt_per_field_analyzer_add_field, 2);
     rb_define_method(cPerFieldAnalyzer, "[]=",
                      frt_per_field_analyzer_add_field, 2);
+    rb_define_method(cPerFieldAnalyzer, "token_stream",
+                     frt_pfa_analyzer_token_stream, 2);
 }
 
 /*
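
Note on the new frt_pfa_analyzer_token_stream() above: PerFieldAnalyzer#token_stream now looks the analyzer up by field and falls back to the default analyzer when the field has no entry (with a separate path for analyzers implemented in Ruby). A standalone sketch of that lookup-with-fallback shape (the table, lookup() and the tiny tokenizer stubs are illustrative, not Ferret code):

    #include <stdio.h>
    #include <string.h>

    /* Illustrative stand-ins for per-field token streams: each just reports
     * how it would treat the text. */
    static void keyword_ts(const char *text)  { printf("untokenized: %s\n", text); }
    static void downcase_ts(const char *text) { printf("lower-cased: %s\n", text); }

    typedef void (*ts_fn)(const char *text);

    struct field_analyzer { const char *field; ts_fn fn; };

    /* Same shape as the dispatch in frt_pfa_analyzer_token_stream(): use the
     * per-field entry if present, otherwise the default analyzer. */
    static ts_fn lookup(const struct field_analyzer *table, int n,
                        const char *field, ts_fn dflt)
    {
        int i;
        for (i = 0; i < n; i++) {
            if (strcmp(table[i].field, field) == 0) {
                return table[i].fn;
            }
        }
        return dflt;
    }

    int main(void)
    {
        struct field_analyzer table[] = { { "id", keyword_ts } };

        lookup(table, 1, "id", downcase_ts)("X12");        /* per-field entry  */
        lookup(table, 1, "title", downcase_ts)("The Sea"); /* default fallback */
        return 0;
    }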