ferret 0.11.4 → 0.11.5
- data/Rakefile +1 -0
- data/TUTORIAL +3 -3
- data/ext/analysis.c +12 -9
- data/ext/array.c +10 -10
- data/ext/array.h +8 -1
- data/ext/bitvector.c +2 -2
- data/ext/except.c +1 -1
- data/ext/ferret.c +2 -2
- data/ext/ferret.h +1 -1
- data/ext/fs_store.c +13 -2
- data/ext/global.c +4 -4
- data/ext/global.h +6 -0
- data/ext/hash.c +1 -1
- data/ext/helper.c +1 -1
- data/ext/helper.h +1 -1
- data/ext/index.c +48 -22
- data/ext/index.h +17 -16
- data/ext/mempool.c +4 -1
- data/ext/mempool.h +1 -1
- data/ext/multimapper.c +2 -2
- data/ext/q_fuzzy.c +2 -2
- data/ext/q_multi_term.c +2 -2
- data/ext/q_parser.c +39 -8
- data/ext/q_range.c +32 -1
- data/ext/r_analysis.c +66 -28
- data/ext/r_index.c +18 -19
- data/ext/r_qparser.c +21 -6
- data/ext/r_search.c +74 -49
- data/ext/r_store.c +1 -1
- data/ext/r_utils.c +17 -17
- data/ext/search.c +10 -5
- data/ext/search.h +3 -1
- data/ext/sort.c +2 -2
- data/ext/stopwords.c +23 -34
- data/ext/store.c +9 -9
- data/ext/store.h +5 -4
- data/lib/ferret/document.rb +2 -2
- data/lib/ferret/field_infos.rb +37 -35
- data/lib/ferret/index.rb +16 -6
- data/lib/ferret/number_tools.rb +2 -2
- data/lib/ferret_version.rb +1 -1
- data/test/unit/analysis/tc_token_stream.rb +40 -0
- data/test/unit/index/tc_index.rb +64 -101
- data/test/unit/index/tc_index_reader.rb +13 -0
- data/test/unit/largefile/tc_largefile.rb +46 -0
- data/test/unit/query_parser/tc_query_parser.rb +17 -1
- data/test/unit/search/tc_multiple_search_requests.rb +58 -0
- data/test/unit/search/tm_searcher.rb +27 -1
- data/test/unit/ts_largefile.rb +4 -0
- metadata +147 -144
data/ext/index.h
CHANGED
@@ -65,24 +65,24 @@ extern HashTable *co_hash_create();
 
 enum StoreValues
 {
-    STORE_NO = 0,
-    STORE_YES = 1,
+    STORE_NO = 0,
+    STORE_YES = 1,
     STORE_COMPRESS = 2
 };
 
 enum IndexValues
 {
-    INDEX_NO = 0,
-
-
-
-
+    INDEX_NO = 0,
+    INDEX_UNTOKENIZED = 1,
+    INDEX_YES = 3,
+    INDEX_UNTOKENIZED_OMIT_NORMS = 5,
+    INDEX_YES_OMIT_NORMS = 7
 };
 
 enum TermVectorValues
 {
-    TERM_VECTOR_NO = 0,
-    TERM_VECTOR_YES = 1,
+    TERM_VECTOR_NO = 0,
+    TERM_VECTOR_YES = 1,
     TERM_VECTOR_WITH_POSITIONS = 3,
     TERM_VECTOR_WITH_OFFSETS = 5,
     TERM_VECTOR_WITH_POSITIONS_OFFSETS = 7
@@ -374,7 +374,7 @@ typedef struct TermInfosWriter
 
 extern TermInfosWriter *tiw_open(Store *store,
                                  const char *segment,
-                                 int index_interval,
+                                 int index_interval,
                                  int skip_interval);
 extern void tiw_start_field(TermInfosWriter *tiw, int field_num);
 extern void tiw_add(TermInfosWriter *tiw,
@@ -456,11 +456,11 @@ extern TermDocEnum *mtdpe_new(IndexReader *ir, int field_num, char **terms,
 
 typedef struct Offset
 {
-
-
+    off_t start;
+    off_t end;
 } Offset;
 
-extern Offset *offset_new(
+extern Offset *offset_new(off_t start, off_t end);
 
 /****************************************************************************
  *
@@ -488,7 +488,7 @@ typedef struct Posting
     struct Posting *next;
 } Posting;
 
-extern
+extern Posting *p_new(MemoryPool *mp, int doc_num, int pos);
 
 /****************************************************************************
  *
@@ -617,7 +617,7 @@ extern TermVector *tvr_get_field_tv(TermVectorsReader *tvr,
 /* * * LazyDocField * * */
 typedef struct LazyDocFieldData
 {
-
+    off_t start;
     int length;
     char *text;
 } LazyDocFieldData;
@@ -706,7 +706,7 @@ extern void fw_write_tv_index(FieldsWriter *fw);
 * A utility class (used by both IndexReader and IndexWriter) to keep track of
 * files that need to be deleted because they are no longer referenced by the
 * index.
-*
+*
 ****************************************************************************/
 
 struct Deleter
@@ -760,6 +760,7 @@ struct IndexReader
     void (*delete_doc_i)(IndexReader *ir, int doc_num);
     void (*undelete_all_i)(IndexReader *ir);
     void (*set_deleter_i)(IndexReader *ir, Deleter *dlr);
+    bool (*is_latest_i)(IndexReader *ir);
     void (*commit_i)(IndexReader *ir);
     void (*close_i)(IndexReader *ir);
     int ref_cnt;
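The renumbered IndexValues constants read like bit flags: the 0/1/3/5/7 values suggest bit 0 marks a field as indexed, bit 1 as tokenized, and bit 2 as omitting norms, so INDEX_YES = 3 would mean indexed-and-tokenized. That bit reading is an inference from the values, not stated in the header; a minimal C sketch to check it:

    #include <stdio.h>

    enum IndexValues /* values as declared in data/ext/index.h above */
    {
        INDEX_NO = 0,
        INDEX_UNTOKENIZED = 1,
        INDEX_YES = 3,
        INDEX_UNTOKENIZED_OMIT_NORMS = 5,
        INDEX_YES_OMIT_NORMS = 7
    };

    int main(void)
    {
        /* Assumed layout: bit 0 = indexed, bit 1 = tokenized, bit 2 = omit norms */
        int v = INDEX_UNTOKENIZED_OMIT_NORMS;
        printf("indexed=%d tokenized=%d omit_norms=%d\n",
               (v & 1) != 0, (v & 2) != 0, (v & 4) != 0);
        return 0; /* prints indexed=1 tokenized=0 omit_norms=1 */
    }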
data/ext/mempool.c
CHANGED
@@ -21,10 +21,13 @@ MemoryPool *mp_new()
     return mp_new_capa(MP_BUF_SIZE, MP_INIT_CAPA);
 }
 
-
+INLINE void *mp_alloc(MemoryPool *mp, int size)
 {
     char *p;
     p = mp->curr_buffer + mp->pointer;
+#if defined POSH_OS_SOLARIS || defined POSH_OS_SUNOS
+    size = (((size - 1) >> 3) + 1) << 3;
+#endif
     mp->pointer += size;
 
     if (mp->pointer > mp->chunk_size) {
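The new #if block rounds each allocation size up to the next multiple of 8 on Solaris/SunOS, presumably because SPARC traps on misaligned loads of doubles and pointers handed out from the pool. A standalone sketch of the rounding expression (assuming 8-byte alignment is the target, as the shift by 3 implies):

    #include <stdio.h>

    /* Round size up to the next multiple of 8, as mp_alloc now does on
     * Solaris/SunOS: (size - 1) >> 3 counts whole 8-byte units after
     * borrowing one, +1 and << 3 scale back up. */
    static int align8(int size)
    {
        return (((size - 1) >> 3) + 1) << 3;
    }

    int main(void)
    {
        int sizes[] = { 1, 7, 8, 9, 17 };
        int i;
        for (i = 0; i < 5; i++) {
            printf("%2d -> %2d\n", sizes[i], align8(sizes[i]));
        }
        return 0; /* prints 8, 8, 8, 16, 24 */
    }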
data/ext/mempool.h
CHANGED
@@ -16,7 +16,7 @@ typedef struct MemoryPool {
|
|
16
16
|
|
17
17
|
extern MemoryPool *mp_new();
|
18
18
|
extern MemoryPool *mp_new_capa(int chunk_size, int init_capa);
|
19
|
-
extern
|
19
|
+
extern INLINE void *mp_alloc(MemoryPool *mp, int size);
|
20
20
|
extern void mp_reset(MemoryPool *mp);
|
21
21
|
extern void mp_destroy(MemoryPool *mp);
|
22
22
|
extern char *mp_strdup(MemoryPool *mp, const char *str);
|
data/ext/multimapper.c
CHANGED
@@ -121,7 +121,7 @@ MultiMapper *mulmap_new()
     return self;
 }
 
-static
+static INLINE void mulmap_free_dstates(MultiMapper *self)
 {
     if (self->d_size > 0) {
         int i;
@@ -151,7 +151,7 @@ void mulmap_add_mapping(MultiMapper *self, const char *pattern, const char *rep)
 }
 
 
-static
+static INLINE void mulmap_bv_set_states(BitVector *bv, int *states, int cnt)
 {
     int i;
     for (i = cnt - 1; i >= 0; i--) {
data/ext/q_fuzzy.c
CHANGED
@@ -11,7 +11,7 @@
  *
  ****************************************************************************/
 
-static
+static INLINE int fuzq_calculate_max_distance(FuzzyQuery *fuzq, int m)
 {
     return (int)((1.0 - fuzq->min_sim) * (MIN(fuzq->text_len, m) + fuzq->pre_len));
 }
@@ -24,7 +24,7 @@ static void fuzq_initialize_max_distances(FuzzyQuery *fuzq)
     }
 }
 
-static
+static INLINE int fuzq_get_max_distance(FuzzyQuery *fuzq, int m)
 {
     return (m < TYPICAL_LONGEST_WORD) ? fuzq->max_distances[m]
                                       : fuzq_calculate_max_distance(fuzq, m);
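fuzq_calculate_max_distance computes the largest edit distance a term may have from the query text and still match, given the query's minimum similarity. A quick check of the same arithmetic with illustrative numbers (the min_sim, text_len and pre_len values here are made up, not Ferret defaults):

    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    /* Same formula as fuzq_calculate_max_distance: the allowed edit
     * distance grows with the compared length m and shrinks as min_sim
     * approaches 1.0. */
    static int max_distance(double min_sim, int text_len, int pre_len, int m)
    {
        return (int)((1.0 - min_sim) * (MIN(text_len, m) + pre_len));
    }

    int main(void)
    {
        /* A 6-letter query term, no required prefix, against a
         * 6-letter candidate term: */
        printf("%d\n", max_distance(0.5, 6, 0, 6)); /* 3 edits allowed */
        printf("%d\n", max_distance(0.8, 6, 0, 6)); /* 1 edit allowed  */
        return 0;
    }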
data/ext/q_multi_term.c
CHANGED
@@ -236,7 +236,7 @@ static bool multi_tsc_advance_to(Scorer *self, int target_doc_num)
     return (pq_top(tdew_pq) == NULL) ? false : true;
 }
 
-static
+static INLINE bool multi_tsc_skip_to(Scorer *self, int target_doc_num)
 {
     return multi_tsc_advance_to(self, target_doc_num) && multi_tsc_next(self);
 }
@@ -661,7 +661,7 @@ Query *multi_tq_new(const char *field)
 
 void multi_tq_add_term_boost(Query *self, const char *term, float boost)
 {
-    if (boost > MTQ(self)->min_boost) {
+    if (boost > MTQ(self)->min_boost && term && term[0]) {
         BoostedTerm *bt = boosted_term_new(term, boost);
         PriorityQueue *bt_pq = MTQ(self)->boosted_terms;
         pq_insert(bt_pq, bt);
data/ext/q_parser.c
CHANGED
@@ -147,7 +147,7 @@ typedef union YYSTYPE
     Phrase *phrase;
     char *str;
 }
-/* Line
+/* Line 187 of yacc.c. */
 #line 152 "y.tab.c"
 YYSTYPE;
 # define yystype YYSTYPE /* obsolescent; will be withdrawn */
@@ -2061,12 +2061,14 @@ get_word_done:
      * just checks for all of them. */
     *bufp = '\0';
     len = (int)(bufp - buf);
-    if (
-    if (
-
-
+    if (qp->use_keywords) {
+        if (len == 3) {
+            if (buf[0] == 'A' && buf[1] == 'N' && buf[2] == 'D') return AND;
+            if (buf[0] == 'N' && buf[1] == 'O' && buf[2] == 'T') return NOT;
+            if (buf[0] == 'R' && buf[1] == 'E' && buf[2] == 'Q') return REQ;
+        }
+        if (len == 2 && buf[0] == 'O' && buf[1] == 'R') return OR;
     }
-    if (len == 2 && buf[0] == 'O' && buf[1] == 'R') return OR;
 
     /* found a word so return it. */
     lvalp->str = buf;
@@ -2489,9 +2491,37 @@ static Query *get_phrase_query(QParser *qp, char *field,
     }
     else {
         int i;
-
+        int term_cnt = 0;
+        Token *token;
+        char *last_word = NULL;
+
         for (i = 0; i < word_count; i++) {
-
+            token = ts_next(get_cached_ts(qp, field, words[i]));
+            free(words[i]);
+            if (token) {
+                last_word = words[i] = estrdup(token->text);
+                ++term_cnt;
+            }
+            else {
+                words[i] = estrdup("");
+            }
+        }
+
+        switch (term_cnt) {
+            case 0:
+                q = bq_new(false);
+                break;
+            case 1:
+                q = tq_new(field, last_word);
+                break;
+            default:
+                q = multi_tq_new_conf(field, term_cnt, 0.0);
+                for (i = 0; i < word_count; i++) {
+                    if (words[i][0]) {
+                        multi_tq_add_term(q, words[i]);
+                    }
+                }
+                break;
         }
     }
 }
@@ -2620,6 +2650,7 @@ QParser *qp_new(HashSet *all_fields, HashSet *def_fields,
     self->max_clauses = QP_MAX_CLAUSES;
     self->handle_parse_errors = false;
     self->allow_any_fields = false;
+    self->use_keywords = true;
     self->def_slop = 0;
     self->fields_buf = hs_new_str(NULL);
     self->all_fields = all_fields;
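The restructured scanner only recognizes AND, OR, NOT and REQ as query operators when the parser's new use_keywords flag is set (it defaults to true in qp_new, per the last hunk). A standalone sketch of the same comparison logic, outside the generated parser:

    #include <stdio.h>

    enum { WORD, AND, OR, NOT, REQ };

    /* Mirrors the new scanner logic: keywords are matched by length and
     * exact bytes, and only when use_keywords is on. */
    static int token_type(const char *buf, int len, int use_keywords)
    {
        if (use_keywords) {
            if (len == 3) {
                if (buf[0] == 'A' && buf[1] == 'N' && buf[2] == 'D') return AND;
                if (buf[0] == 'N' && buf[1] == 'O' && buf[2] == 'T') return NOT;
                if (buf[0] == 'R' && buf[1] == 'E' && buf[2] == 'Q') return REQ;
            }
            if (len == 2 && buf[0] == 'O' && buf[1] == 'R') return OR;
        }
        return WORD; /* with use_keywords off, "AND" is just a term */
    }

    int main(void)
    {
        printf("%d %d\n", token_type("AND", 3, 1),
                          token_type("AND", 3, 0)); /* prints 1 0 */
        return 0;
    }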
data/ext/q_range.c
CHANGED
@@ -269,13 +269,44 @@ static void rq_destroy(Query *self)
     q_destroy_i(self);
 }
 
+static MatchVector *rq_get_matchv_i(Query *self, MatchVector *mv,
+                                    TermVector *tv)
+{
+    Range *range = RQ(((ConstantScoreQuery *)self)->original)->range;
+    if (strcmp(tv->field, range->field) == 0) {
+        int i, j;
+        char *upper_text = range->upper_term;
+        char *lower_text = range->lower_term;
+        int upper_limit = range->include_upper ? 1 : 0;
+        int lower_limit = range->include_lower ? 1 : 0;
+
+        for (i = tv->term_cnt - 1; i >= 0; i--) {
+            TVTerm *tv_term = &(tv->terms[i]);
+            char *text = tv_term->text;
+            if ((!upper_text || strcmp(text, upper_text) < upper_limit) &&
+                (!lower_text || strcmp(lower_text, text) < lower_limit)) {
+
+                for (j = 0; j < tv_term->freq; j++) {
+                    int pos = tv_term->positions[j];
+                    matchv_add(mv, pos, pos);
+                }
+            }
+        }
+    }
+    return mv;
+}
+
 static Query *rq_rewrite(Query *self, IndexReader *ir)
 {
+    Query *csq;
     Range *r = RQ(self)->range;
     Filter *filter = rfilt_new(r->field, r->lower_term, r->upper_term,
                                r->include_lower, r->include_upper);
     (void)ir;
-
+    csq = csq_new_nr(filter);
+    ((ConstantScoreQuery *)csq)->original = self;
+    csq->get_matchv_i = &rq_get_matchv_i;
+    return (Query *)csq;
 }
 
 static unsigned long rq_hash(Query *self)
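The new highlighting hook decides whether a term falls inside the range with a single strcmp per bound: since strcmp returns zero on equality and at least 1 when the first argument sorts later, comparing the result to 1 (inclusive bound) versus 0 (exclusive bound) folds the <= / < distinction into data. A small demonstration of that trick (the field values are made up):

    #include <stdio.h>
    #include <string.h>

    /* strcmp(text, upper) < limit means: text < upper when limit is 0
     * (exclusive bound), text <= upper when limit is 1 (inclusive) -
     * the same comparison rq_get_matchv_i uses for both bounds. */
    static int within_upper(const char *text, const char *upper,
                            int include_upper)
    {
        return strcmp(text, upper) < (include_upper ? 1 : 0);
    }

    int main(void)
    {
        printf("%d\n", within_upper("banana", "banana", 1)); /* 1: inclusive */
        printf("%d\n", within_upper("banana", "banana", 0)); /* 0: exclusive */
        printf("%d\n", within_upper("apple", "banana", 0));  /* 1 */
        return 0;
    }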
data/ext/r_analysis.c
CHANGED
@@ -150,7 +150,7 @@ frt_set_token(Token *tk, VALUE rt)
 * values as needed. For example, if you have a stop word filter you will be
 * skipping tokens. Let's say you have the stop words "the" and "and" and you
 * parse the title "The Old Man and the Sea". The terms "Old", "Man" and
-* "Sea" will have the position
+* "Sea" will have the position increments 2, 1 and 3 respectively.
 *
 * Another reason you might want to vary the position increment is if you are
 * adding synonyms to the index. For example let's say you have the synonym
@@ -424,7 +424,7 @@ get_rb_token_stream(TokenStream *ts)
     return rts;
 }
 
-static
+static INLINE VALUE
 get_wrapped_ts(VALUE self, VALUE rstr, TokenStream *ts)
 {
     StringValue(rstr);
@@ -811,7 +811,7 @@ frt_a_letter_tokenizer_init(VALUE self, VALUE rstr)
 * LetterTokenizer.new(lower = true) -> tokenizer
 *
 * Create a new LetterTokenizer which optionally downcases tokens. Downcasing
-* is done according the
+* is done according the current locale.
 *
 * lower:: set to false if you don't wish to downcase tokens
 */
@@ -842,7 +842,7 @@ frt_a_whitespace_tokenizer_init(VALUE self, VALUE rstr)
 * WhiteSpaceTokenizer.new(lower = true) -> tokenizer
 *
 * Create a new WhiteSpaceTokenizer which optionally downcases tokens.
-* Downcasing is done according the
+* Downcasing is done according the current locale.
 *
 * lower:: set to false if you don't wish to downcase tokens
 */
@@ -873,7 +873,7 @@ frt_a_standard_tokenizer_init(VALUE self, VALUE rstr)
 * StandardTokenizer.new(lower = true) -> tokenizer
 *
 * Create a new StandardTokenizer which optionally downcases tokens.
-* Downcasing is done according the
+* Downcasing is done according the current locale.
 *
 * lower:: set to false if you don't wish to downcase tokens
 */
@@ -896,7 +896,7 @@ frt_standard_tokenizer_init(VALUE self, VALUE rstr)
 * AsciiLowerCaseFilter.new(token_stream) -> token_stream
 *
 * Create an AsciiLowerCaseFilter which normalizes a token's text to
-* lowercase but only for
+* lowercase but only for ASCII characters. For other characters use
 * LowerCaseFilter.
 */
 static VALUE
@@ -990,7 +990,7 @@ frt_stop_filter_init(int argc, VALUE *argv, VALUE self)
     return self;
 }
 
-static
+static INLINE void frt_add_mapping_i(TokenStream *mf, VALUE from, char *to)
 {
     switch (TYPE(from)) {
         case T_STRING:
@@ -1046,8 +1046,8 @@ static int frt_add_mappings_i(VALUE key, VALUE value, VALUE arg)
 * MappingFilter.new(token_stream, mapping) -> token_stream
 *
 * Create an MappingFilter which maps strings in tokens. This is usually used
-* to map UTF-8 characters to
-* better
+* to map UTF-8 characters to ASCII characters for easier searching and
+* better search recall. The mapping is compiled into a Deterministic Finite
 * Automata so it is super fast. This Filter can therefor be used for
 * indexing very large datasets. Currently regular expressions are not
 * supported. If you are really interested in the feature, please contact me
@@ -1087,7 +1087,7 @@ frt_mapping_filter_init(VALUE self, VALUE rsub_ts, VALUE mapping)
 *   algorithm="english",
 *   encoding="UTF-8") -> token_stream
 *
-* Create an StemFilter which uses a snowball stemmer (
+* Create an StemFilter which uses a snowball stemmer (thank you Martin
 * Porter) to stem words. You can optionally specify the algorithm (default:
 * "english") and encoding (default: "UTF-8").
 *
@@ -1193,6 +1193,16 @@ frt_get_analyzer(Analyzer *a)
     return self;
 }
 
+INLINE VALUE
+get_rb_ts_from_a(Analyzer *a, VALUE rfield, VALUE rstring)
+{
+    TokenStream *ts = a_get_ts(a, frt_field(rfield), rs2s(rstring));
+
+    /* Make sure that there is no entry already */
+    object_set(&ts->text, rstring);
+    return get_rb_token_stream(ts);
+}
+
 /*
 * call-seq:
 *     analyzer.token_stream(field_name, input) -> token_stream
@@ -1209,17 +1219,12 @@ frt_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
 {
     /* NOTE: Any changes made to this method may also need to be applied to
      * frt_re_analyzer_token_stream */
-    TokenStream *ts;
     Analyzer *a;
     GET_A(a, self);
 
     StringValue(rstring);
 
-
-
-    /* Make sure that there is no entry already */
-    object_set(&ts->text, rstring);
-    return get_rb_token_stream(ts);
+    return get_rb_ts_from_a(a, rfield, rstring);
 }
 
 #define GET_LOWER(dflt) \
@@ -1234,7 +1239,7 @@ lower = (argc ? RTEST(rlower) : dflt)
 *
 * Create a new AsciiWhiteSpaceAnalyzer which downcases tokens by default
 * but can optionally leave case as is. Lowercasing will only be done to
-*
+* ASCII characters.
 *
 * lower:: set to false if you don't want the field's tokens to be downcased
 */
@@ -1279,7 +1284,7 @@ frt_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
 *
 * Create a new AsciiWhiteSpaceAnalyzer which downcases tokens by default
 * but can optionally leave case as is. Lowercasing will only be done to
-*
+* ASCII characters.
 *
 * lower:: set to false if you don't want the field's tokens to be downcased
 */
@@ -1457,6 +1462,37 @@ frt_per_field_analyzer_add_field(VALUE self, VALUE rfield, VALUE ranalyzer)
     return self;
 }
 
+/*
+* call-seq:
+*     analyzer.token_stream(field_name, input) -> token_stream
+*
+* Create a new TokenStream to tokenize +input+. The TokenStream created will
+* also depend on the +field_name+ in the case of the PerFieldAnalyzer.
+*
+* field_name:: name of the field to be tokenized
+* input::      data from the field to be tokenized
+*/
+static VALUE
+frt_pfa_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
+{
+    Analyzer *pfa, *a;
+    char *field = frt_field(rfield);
+    GET_A(pfa, self);
+
+    StringValue(rstring);
+    a = (Analyzer *)h_get(PFA(pfa)->dict, field);
+    if (a == NULL) {
+        a = PFA(pfa)->default_a;
+    }
+    if (a->get_ts == cwa_get_ts) {
+        return rb_funcall(CWA(a)->ranalyzer, id_token_stream, 2,
+                          ID2SYM(rb_intern(field)), rb_str_new2(rs2s(rstring)));
+    }
+    else {
+        return get_rb_ts_from_a(a, rfield, rstring);
+    }
+}
+
 /*** RegExpAnalyzer ***/
 
 static void
@@ -1585,7 +1621,7 @@ static VALUE frt_set_locale(VALUE self, VALUE locale)
 *
 * == Summary
 *
-* A Token is an
+* A Token is an occurrence of a term from the text of a field. It consists
 * of a term's text and the start and end offset of the term in the text of
 * the field;
 *
@@ -1648,7 +1684,7 @@ static void Init_TokenStream(void)
 /*
 * Document-class: Ferret::Analysis::AsciiLetterTokenizer
 *
-* A LetterTokenizer is a tokenizer that divides text at non-
+* A LetterTokenizer is a tokenizer that divides text at non-ASCII letters.
 * That is to say, it defines tokens as maximal strings of adjacent letters,
 * as defined by the regular expression _/[A-Za-z]+/_.
 *
@@ -1781,7 +1817,7 @@ static void Init_StandardTokenizer(void)
 * Document-class: Ferret::Analysis::RegExpTokenizer
 *
 * A tokenizer that recognizes tokens based on a regular expression passed to
-* the
+* the constructor. Most possible tokenizers can be created using this class.
 *
 * === Example
 *
@@ -1817,7 +1853,7 @@ static void Init_RegExpTokenizer(void)
 * Document-class: Ferret::Analysis::AsciiLowerCaseFilter
 *
 * AsciiLowerCaseFilter normalizes a token's text to lowercase but only for
-*
+* ASCII characters. For other characters use LowerCaseFilter.
 *
 * === Example
 *
@@ -1881,7 +1917,7 @@ static void Init_HyphenFilter(void)
 * Document-class: Ferret::Analysis::MappingFilter
 *
 * A MappingFilter maps strings in tokens. This is usually used to map UTF-8
-* characters to
+* characters to ASCII characters for easier searching and better search
 * recall. The mapping is compiled into a Deterministic Finite Automata so it
 * is super fast. This Filter can therefor be used for indexing very large
 * datasets. Currently regular expressions are not supported. If you are
@@ -2020,7 +2056,7 @@ static void Init_StemFilter(void)
 * a policy for extracting index terms from text.
 *
 * Typical implementations first build a Tokenizer, which breaks the stream
-* of characters from the Reader into raw Tokens. One or more
+* of characters from the Reader into raw Tokens. One or more TokenFilters
 * may then be applied to the output of the Tokenizer.
 *
 * The default Analyzer just creates a LowerCaseTokenizer which converts
@@ -2057,7 +2093,7 @@ static void Init_Analyzer(void)
 * == Summary
 *
 * An AsciiLetterAnalyzer creates a TokenStream that splits the input up into
-* maximal strings of
+* maximal strings of ASCII characters. If implemented in Ruby it would look
 * like;
 *
 *   class AsciiLetterAnalyzer
@@ -2075,7 +2111,7 @@ static void Init_Analyzer(void)
 *   end
 *
 * As you can see it makes use of the AsciiLetterTokenizer and
-* AsciiLowerCaseFilter. Note that this tokenizer won't recognize non-
+* AsciiLowerCaseFilter. Note that this tokenizer won't recognize non-ASCII
 * characters so you should use the LetterAnalyzer is you want to analyze
 * multi-byte data like "UTF-8".
 */
@@ -2194,7 +2230,7 @@ static void Init_WhiteSpaceAnalyzer(void)
 * == Summary
 *
 * The AsciiStandardAnalyzer is the most advanced of the available
-*
+* ASCII-analyzers. If it were implemented in Ruby it would look like this;
 *
 *   class AsciiStandardAnalyzer
 *     def initialize(stop_words = FULL_ENGLISH_STOP_WORDS, lower = true)
@@ -2212,7 +2248,7 @@ static void Init_WhiteSpaceAnalyzer(void)
 *
 * As you can see it makes use of the AsciiStandardTokenizer and you can also
 * add your own list of stop-words if you wish. Note that this tokenizer
-* won't recognize non-
+* won't recognize non-ASCII characters so you should use the
 * StandardAnalyzer is you want to analyze multi-byte data like "UTF-8".
 */
 static void Init_AsciiStandardAnalyzer(void)
@@ -2292,6 +2328,8 @@ static void Init_PerFieldAnalyzer(void)
                      frt_per_field_analyzer_add_field, 2);
     rb_define_method(cPerFieldAnalyzer, "[]=",
                      frt_per_field_analyzer_add_field, 2);
+    rb_define_method(cPerFieldAnalyzer, "token_stream",
+                     frt_pfa_analyzer_token_stream, 2);
 }
 
 /*
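The repaired Token doc-comment above spells out the example: with the stop words "the" and "and" removed from "The Old Man and the Sea", the surviving terms "Old", "Man" and "Sea" carry position increments 2, 1 and 3. A sketch of how a stop filter would derive those increments (the word list and helper are illustrative, not Ferret's API):

    #include <stdio.h>
    #include <string.h>

    /* Illustrative stop-word check, just "the" and "and". */
    static int is_stop(const char *w)
    {
        return strcmp(w, "the") == 0 || strcmp(w, "The") == 0
            || strcmp(w, "and") == 0;
    }

    int main(void)
    {
        const char *words[] = { "The", "Old", "Man", "and", "the", "Sea" };
        int i, inc = 0;

        /* Each skipped stop word adds one to the next emitted token's
         * increment, reproducing the 2, 1, 3 from the documentation. */
        for (i = 0; i < 6; i++) {
            inc++;
            if (is_stop(words[i])) continue;
            printf("%s: pos_inc=%d\n", words[i], inc);
            inc = 0;
        }
        return 0;
    }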
|