ferret 0.11.4 → 0.11.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. data/Rakefile +1 -0
  2. data/TUTORIAL +3 -3
  3. data/ext/analysis.c +12 -9
  4. data/ext/array.c +10 -10
  5. data/ext/array.h +8 -1
  6. data/ext/bitvector.c +2 -2
  7. data/ext/except.c +1 -1
  8. data/ext/ferret.c +2 -2
  9. data/ext/ferret.h +1 -1
  10. data/ext/fs_store.c +13 -2
  11. data/ext/global.c +4 -4
  12. data/ext/global.h +6 -0
  13. data/ext/hash.c +1 -1
  14. data/ext/helper.c +1 -1
  15. data/ext/helper.h +1 -1
  16. data/ext/index.c +48 -22
  17. data/ext/index.h +17 -16
  18. data/ext/mempool.c +4 -1
  19. data/ext/mempool.h +1 -1
  20. data/ext/multimapper.c +2 -2
  21. data/ext/q_fuzzy.c +2 -2
  22. data/ext/q_multi_term.c +2 -2
  23. data/ext/q_parser.c +39 -8
  24. data/ext/q_range.c +32 -1
  25. data/ext/r_analysis.c +66 -28
  26. data/ext/r_index.c +18 -19
  27. data/ext/r_qparser.c +21 -6
  28. data/ext/r_search.c +74 -49
  29. data/ext/r_store.c +1 -1
  30. data/ext/r_utils.c +17 -17
  31. data/ext/search.c +10 -5
  32. data/ext/search.h +3 -1
  33. data/ext/sort.c +2 -2
  34. data/ext/stopwords.c +23 -34
  35. data/ext/store.c +9 -9
  36. data/ext/store.h +5 -4
  37. data/lib/ferret/document.rb +2 -2
  38. data/lib/ferret/field_infos.rb +37 -35
  39. data/lib/ferret/index.rb +16 -6
  40. data/lib/ferret/number_tools.rb +2 -2
  41. data/lib/ferret_version.rb +1 -1
  42. data/test/unit/analysis/tc_token_stream.rb +40 -0
  43. data/test/unit/index/tc_index.rb +64 -101
  44. data/test/unit/index/tc_index_reader.rb +13 -0
  45. data/test/unit/largefile/tc_largefile.rb +46 -0
  46. data/test/unit/query_parser/tc_query_parser.rb +17 -1
  47. data/test/unit/search/tc_multiple_search_requests.rb +58 -0
  48. data/test/unit/search/tm_searcher.rb +27 -1
  49. data/test/unit/ts_largefile.rb +4 -0
  50. metadata +147 -144
data/ext/index.h CHANGED
@@ -65,24 +65,24 @@ extern HashTable *co_hash_create();
 
  enum StoreValues
  {
- STORE_NO = 0,
- STORE_YES = 1,
+ STORE_NO = 0,
+ STORE_YES = 1,
  STORE_COMPRESS = 2
  };
 
  enum IndexValues
  {
- INDEX_NO = 0,
- INDEX_YES = 1,
- INDEX_UNTOKENIZED = 3,
- INDEX_YES_OMIT_NORMS = 5,
- INDEX_UNTOKENIZED_OMIT_NORMS = 7
+ INDEX_NO = 0,
+ INDEX_UNTOKENIZED = 1,
+ INDEX_YES = 3,
+ INDEX_UNTOKENIZED_OMIT_NORMS = 5,
+ INDEX_YES_OMIT_NORMS = 7
  };
 
  enum TermVectorValues
  {
- TERM_VECTOR_NO = 0,
- TERM_VECTOR_YES = 1,
+ TERM_VECTOR_NO = 0,
+ TERM_VECTOR_YES = 1,
  TERM_VECTOR_WITH_POSITIONS = 3,
  TERM_VECTOR_WITH_OFFSETS = 5,
  TERM_VECTOR_WITH_POSITIONS_OFFSETS = 7
@@ -374,7 +374,7 @@ typedef struct TermInfosWriter
 
  extern TermInfosWriter *tiw_open(Store *store,
  const char *segment,
- int index_interval,
+ int index_interval,
  int skip_interval);
  extern void tiw_start_field(TermInfosWriter *tiw, int field_num);
  extern void tiw_add(TermInfosWriter *tiw,
@@ -456,11 +456,11 @@ extern TermDocEnum *mtdpe_new(IndexReader *ir, int field_num, char **terms,
 
  typedef struct Offset
  {
- int start;
- int end;
+ off_t start;
+ off_t end;
  } Offset;
 
- extern Offset *offset_new(int start, int end);
+ extern Offset *offset_new(off_t start, off_t end);
 
  /****************************************************************************
  *
@@ -488,7 +488,7 @@ typedef struct Posting
  struct Posting *next;
  } Posting;
 
- extern __inline Posting *p_new(MemoryPool *mp, int doc_num, int pos);
+ extern Posting *p_new(MemoryPool *mp, int doc_num, int pos);
 
  /****************************************************************************
  *
@@ -617,7 +617,7 @@ extern TermVector *tvr_get_field_tv(TermVectorsReader *tvr,
  /* * * LazyDocField * * */
  typedef struct LazyDocFieldData
  {
- int start;
+ off_t start;
  int length;
  char *text;
  } LazyDocFieldData;
@@ -706,7 +706,7 @@ extern void fw_write_tv_index(FieldsWriter *fw);
  * A utility class (used by both IndexReader and IndexWriter) to keep track of
  * files that need to be deleted because they are no longer referenced by the
  * index.
- *
+ *
  ****************************************************************************/
 
  struct Deleter
@@ -760,6 +760,7 @@ struct IndexReader
  void (*delete_doc_i)(IndexReader *ir, int doc_num);
  void (*undelete_all_i)(IndexReader *ir);
  void (*set_deleter_i)(IndexReader *ir, Deleter *dlr);
+ bool (*is_latest_i)(IndexReader *ir);
  void (*commit_i)(IndexReader *ir);
  void (*close_i)(IndexReader *ir);
  int ref_cnt;
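
Note on this file: the IndexValues constants are renumbered (INDEX_UNTOKENIZED and INDEX_YES swap encodings), Offset and LazyDocFieldData move from int to off_t so very large index files can be addressed (see the new data/test/unit/largefile/tc_largefile.rb), and IndexReader gains an is_latest_i hook. These enums back the per-field options exposed through Ferret's Ruby API; the sketch below is a hedged illustration of that Ruby-level FieldInfos interface, not code from the diff, and the field names are made up.

  require 'ferret'

  # Illustrative only: per-field store/index/term-vector options that map onto
  # the StoreValues/IndexValues/TermVectorValues enums above
  # (data/lib/ferret/field_infos.rb also changed in this release).
  field_infos = Ferret::Index::FieldInfos.new(:store => :no,
                                              :index => :yes,
                                              :term_vector => :no)
  field_infos.add_field(:title, :store => :yes, :index => :untokenized)
  field_infos.add_field(:content, :store => :yes,
                        :term_vector => :with_positions_offsets)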
data/ext/mempool.c CHANGED
@@ -21,10 +21,13 @@ MemoryPool *mp_new()
  return mp_new_capa(MP_BUF_SIZE, MP_INIT_CAPA);
  }
 
- __inline void *mp_alloc(MemoryPool *mp, int size)
+ INLINE void *mp_alloc(MemoryPool *mp, int size)
  {
  char *p;
  p = mp->curr_buffer + mp->pointer;
+ #if defined POSH_OS_SOLARIS || defined POSH_OS_SUNOS
+ size = (((size - 1) >> 3) + 1) << 3;
+ #endif
  mp->pointer += size;
 
  if (mp->pointer > mp->chunk_size) {
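
The new Solaris/SunOS branch in mp_alloc() rounds every request up to the next multiple of 8 so that memory-pool allocations stay 8-byte aligned on those platforms. The same arithmetic, reproduced in Ruby purely as an illustration:

  # Round a requested size up to the next multiple of 8, as the Solaris/SunOS
  # branch of mp_alloc() above does.
  def round_up_to_8(size)
    (((size - 1) >> 3) + 1) << 3
  end

  [1, 7, 8, 9, 20].map { |n| [n, round_up_to_8(n)] }
  # => [[1, 8], [7, 8], [8, 8], [9, 16], [20, 24]]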
data/ext/mempool.h CHANGED
@@ -16,7 +16,7 @@ typedef struct MemoryPool {
 
  extern MemoryPool *mp_new();
  extern MemoryPool *mp_new_capa(int chunk_size, int init_capa);
- extern __inline void *mp_alloc(MemoryPool *mp, int size);
+ extern INLINE void *mp_alloc(MemoryPool *mp, int size);
  extern void mp_reset(MemoryPool *mp);
  extern void mp_destroy(MemoryPool *mp);
  extern char *mp_strdup(MemoryPool *mp, const char *str);
data/ext/multimapper.c CHANGED
@@ -121,7 +121,7 @@ MultiMapper *mulmap_new()
  return self;
  }
 
- static __inline void mulmap_free_dstates(MultiMapper *self)
+ static INLINE void mulmap_free_dstates(MultiMapper *self)
  {
  if (self->d_size > 0) {
  int i;
@@ -151,7 +151,7 @@ void mulmap_add_mapping(MultiMapper *self, const char *pattern, const char *rep)
  }
 
 
- static __inline void mulmap_bv_set_states(BitVector *bv, int *states, int cnt)
+ static INLINE void mulmap_bv_set_states(BitVector *bv, int *states, int cnt)
  {
  int i;
  for (i = cnt - 1; i >= 0; i--) {
data/ext/q_fuzzy.c CHANGED
@@ -11,7 +11,7 @@
  *
  ****************************************************************************/
 
- static __inline int fuzq_calculate_max_distance(FuzzyQuery *fuzq, int m)
+ static INLINE int fuzq_calculate_max_distance(FuzzyQuery *fuzq, int m)
  {
  return (int)((1.0 - fuzq->min_sim) * (MIN(fuzq->text_len, m) + fuzq->pre_len));
  }
@@ -24,7 +24,7 @@ static void fuzq_initialize_max_distances(FuzzyQuery *fuzq)
  }
  }
 
- static __inline int fuzq_get_max_distance(FuzzyQuery *fuzq, int m)
+ static INLINE int fuzq_get_max_distance(FuzzyQuery *fuzq, int m)
  {
  return (m < TYPICAL_LONGEST_WORD) ? fuzq->max_distances[m]
  : fuzq_calculate_max_distance(fuzq, m);
data/ext/q_multi_term.c CHANGED
@@ -236,7 +236,7 @@ static bool multi_tsc_advance_to(Scorer *self, int target_doc_num)
  return (pq_top(tdew_pq) == NULL) ? false : true;
  }
 
- static __inline bool multi_tsc_skip_to(Scorer *self, int target_doc_num)
+ static INLINE bool multi_tsc_skip_to(Scorer *self, int target_doc_num)
  {
  return multi_tsc_advance_to(self, target_doc_num) && multi_tsc_next(self);
  }
@@ -661,7 +661,7 @@ Query *multi_tq_new(const char *field)
 
  void multi_tq_add_term_boost(Query *self, const char *term, float boost)
  {
- if (boost > MTQ(self)->min_boost) {
+ if (boost > MTQ(self)->min_boost && term && term[0]) {
  BoostedTerm *bt = boosted_term_new(term, boost);
  PriorityQueue *bt_pq = MTQ(self)->boosted_terms;
  pq_insert(bt_pq, bt);
data/ext/q_parser.c CHANGED
@@ -147,7 +147,7 @@ typedef union YYSTYPE
  Phrase *phrase;
  char *str;
  }
- /* Line 193 of yacc.c. */
+ /* Line 187 of yacc.c. */
  #line 152 "y.tab.c"
  YYSTYPE;
  # define yystype YYSTYPE /* obsolescent; will be withdrawn */
@@ -2061,12 +2061,14 @@ get_word_done:
  * just checks for all of them. */
  *bufp = '\0';
  len = (int)(bufp - buf);
- if (len == 3) {
- if (buf[0] == 'A' && buf[1] == 'N' && buf[2] == 'D') return AND;
- if (buf[0] == 'N' && buf[1] == 'O' && buf[2] == 'T') return NOT;
- if (buf[0] == 'R' && buf[1] == 'E' && buf[2] == 'Q') return REQ;
+ if (qp->use_keywords) {
+ if (len == 3) {
+ if (buf[0] == 'A' && buf[1] == 'N' && buf[2] == 'D') return AND;
+ if (buf[0] == 'N' && buf[1] == 'O' && buf[2] == 'T') return NOT;
+ if (buf[0] == 'R' && buf[1] == 'E' && buf[2] == 'Q') return REQ;
+ }
+ if (len == 2 && buf[0] == 'O' && buf[1] == 'R') return OR;
  }
- if (len == 2 && buf[0] == 'O' && buf[1] == 'R') return OR;
 
  /* found a word so return it. */
  lvalp->str = buf;
@@ -2489,9 +2491,37 @@ static Query *get_phrase_query(QParser *qp, char *field,
  }
  else {
  int i;
- q = bq_new_max(false, qp->max_clauses);
+ int term_cnt = 0;
+ Token *token;
+ char *last_word = NULL;
+
  for (i = 0; i < word_count; i++) {
- bq_add_query_nr(q, get_term_q(qp, field, words[i]), BC_SHOULD);
+ token = ts_next(get_cached_ts(qp, field, words[i]));
+ free(words[i]);
+ if (token) {
+ last_word = words[i] = estrdup(token->text);
+ ++term_cnt;
+ }
+ else {
+ words[i] = estrdup("");
+ }
+ }
+
+ switch (term_cnt) {
+ case 0:
+ q = bq_new(false);
+ break;
+ case 1:
+ q = tq_new(field, last_word);
+ break;
+ default:
+ q = multi_tq_new_conf(field, term_cnt, 0.0);
+ for (i = 0; i < word_count; i++) {
+ if (words[i][0]) {
+ multi_tq_add_term(q, words[i]);
+ }
+ }
+ break;
  }
  }
  }
@@ -2620,6 +2650,7 @@ QParser *qp_new(HashSet *all_fields, HashSet *def_fields,
  self->max_clauses = QP_MAX_CLAUSES;
  self->handle_parse_errors = false;
  self->allow_any_fields = false;
+ self->use_keywords = true;
  self->def_slop = 0;
  self->fields_buf = hs_new_str(NULL);
  self->all_fields = all_fields;
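
Two behavioural changes are visible here: the lexer only returns AND/OR/NOT/REQ as operators when qp->use_keywords is set (it defaults to true in qp_new()), and a phrase whose analyzed terms collapse to a single position is now built as a term or multi-term query rather than a BooleanQuery of SHOULD clauses. Assuming the new flag is surfaced to Ruby as the :use_keywords option (data/ext/r_qparser.c also changed in this release), a hedged sketch of its effect:

  require 'ferret'

  # Illustrative only: with keywords disabled, AND/OR/NOT/REQ are treated as
  # ordinary search terms instead of boolean operators.
  parser = Ferret::QueryParser.new(:fields => [:content],
                                   :use_keywords => false)
  query = parser.parse('to be AND not to be')
  puts query.to_s(:content)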
data/ext/q_range.c CHANGED
@@ -269,13 +269,44 @@ static void rq_destroy(Query *self)
  q_destroy_i(self);
  }
 
+ static MatchVector *rq_get_matchv_i(Query *self, MatchVector *mv,
+ TermVector *tv)
+ {
+ Range *range = RQ(((ConstantScoreQuery *)self)->original)->range;
+ if (strcmp(tv->field, range->field) == 0) {
+ int i, j;
+ char *upper_text = range->upper_term;
+ char *lower_text = range->lower_term;
+ int upper_limit = range->include_upper ? 1 : 0;
+ int lower_limit = range->include_lower ? 1 : 0;
+
+ for (i = tv->term_cnt - 1; i >= 0; i--) {
+ TVTerm *tv_term = &(tv->terms[i]);
+ char *text = tv_term->text;
+ if ((!upper_text || strcmp(text, upper_text) < upper_limit) &&
+ (!lower_text || strcmp(lower_text, text) < lower_limit)) {
+
+ for (j = 0; j < tv_term->freq; j++) {
+ int pos = tv_term->positions[j];
+ matchv_add(mv, pos, pos);
+ }
+ }
+ }
+ }
+ return mv;
+ }
+
  static Query *rq_rewrite(Query *self, IndexReader *ir)
  {
+ Query *csq;
  Range *r = RQ(self)->range;
  Filter *filter = rfilt_new(r->field, r->lower_term, r->upper_term,
  r->include_lower, r->include_upper);
  (void)ir;
- return csq_new_nr(filter);
+ csq = csq_new_nr(filter);
+ ((ConstantScoreQuery *)csq)->original = self;
+ csq->get_matchv_i = &rq_get_matchv_i;
+ return (Query *)csq;
  }
 
  static unsigned long rq_hash(Query *self)
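
rq_rewrite() now hands the rewritten ConstantScoreQuery a pointer back to the original RangeQuery and a get_matchv_i implementation, so terms that fall inside the range can be reported for excerpt highlighting. A hedged Ruby sketch, assuming default index settings (term vectors with positions and offsets); the data, range bounds and tags are illustrative only:

  require 'ferret'

  index = Ferret::I.new(:default_field => :content)
  index << {:content => "prices run from 0042 to 0099 dollars"}

  # Range query terms can now contribute to highlighting.
  puts index.highlight("content:[0040 0100]", 0,
                       :field => :content,
                       :pre_tag => "<b>", :post_tag => "</b>")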
data/ext/r_analysis.c CHANGED
@@ -150,7 +150,7 @@ frt_set_token(Token *tk, VALUE rt)
  * values as needed. For example, if you have a stop word filter you will be
  * skipping tokens. Let's say you have the stop words "the" and "and" and you
  * parse the title "The Old Man and the Sea". The terms "Old", "Man" and
- * "Sea" will have the position incerements 2, 1 and 3 respectively.
+ * "Sea" will have the position increments 2, 1 and 3 respectively.
  *
  * Another reason you might want to vary the position increment is if you are
  * adding synonyms to the index. For example let's say you have the synonym
@@ -424,7 +424,7 @@ get_rb_token_stream(TokenStream *ts)
  return rts;
  }
 
- static inline VALUE
+ static INLINE VALUE
  get_wrapped_ts(VALUE self, VALUE rstr, TokenStream *ts)
  {
  StringValue(rstr);
@@ -811,7 +811,7 @@ frt_a_letter_tokenizer_init(VALUE self, VALUE rstr)
  * LetterTokenizer.new(lower = true) -> tokenizer
  *
  * Create a new LetterTokenizer which optionally downcases tokens. Downcasing
- * is done according the the current locale.
+ * is done according the current locale.
  *
  * lower:: set to false if you don't wish to downcase tokens
  */
@@ -842,7 +842,7 @@ frt_a_whitespace_tokenizer_init(VALUE self, VALUE rstr)
  * WhiteSpaceTokenizer.new(lower = true) -> tokenizer
  *
  * Create a new WhiteSpaceTokenizer which optionally downcases tokens.
- * Downcasing is done according the the current locale.
+ * Downcasing is done according the current locale.
  *
  * lower:: set to false if you don't wish to downcase tokens
  */
@@ -873,7 +873,7 @@ frt_a_standard_tokenizer_init(VALUE self, VALUE rstr)
  * StandardTokenizer.new(lower = true) -> tokenizer
  *
  * Create a new StandardTokenizer which optionally downcases tokens.
- * Downcasing is done according the the current locale.
+ * Downcasing is done according the current locale.
  *
  * lower:: set to false if you don't wish to downcase tokens
  */
@@ -896,7 +896,7 @@ frt_standard_tokenizer_init(VALUE self, VALUE rstr)
  * AsciiLowerCaseFilter.new(token_stream) -> token_stream
  *
  * Create an AsciiLowerCaseFilter which normalizes a token's text to
- * lowercase but only for Ascii characters. For other characters use
+ * lowercase but only for ASCII characters. For other characters use
  * LowerCaseFilter.
  */
  static VALUE
@@ -990,7 +990,7 @@ frt_stop_filter_init(int argc, VALUE *argv, VALUE self)
  return self;
  }
 
- static __inline void frt_add_mapping_i(TokenStream *mf, VALUE from, char *to)
+ static INLINE void frt_add_mapping_i(TokenStream *mf, VALUE from, char *to)
  {
  switch (TYPE(from)) {
  case T_STRING:
@@ -1046,8 +1046,8 @@ static int frt_add_mappings_i(VALUE key, VALUE value, VALUE arg)
  * MappingFilter.new(token_stream, mapping) -> token_stream
  *
  * Create an MappingFilter which maps strings in tokens. This is usually used
- * to map UTF-8 characters to ascii characters for easier searching and
- * better searche recall. The mapping is compiled into a Deterministic Finite
+ * to map UTF-8 characters to ASCII characters for easier searching and
+ * better search recall. The mapping is compiled into a Deterministic Finite
  * Automata so it is super fast. This Filter can therefor be used for
  * indexing very large datasets. Currently regular expressions are not
  * supported. If you are really interested in the feature, please contact me
@@ -1087,7 +1087,7 @@ frt_mapping_filter_init(VALUE self, VALUE rsub_ts, VALUE mapping)
  * algorithm="english",
  * encoding="UTF-8") -> token_stream
  *
- * Create an StemFilter which uses a snowball stemmer (thankyou Martin
+ * Create an StemFilter which uses a snowball stemmer (thank you Martin
  * Porter) to stem words. You can optionally specify the algorithm (default:
  * "english") and encoding (default: "UTF-8").
  *
@@ -1193,6 +1193,16 @@ frt_get_analyzer(Analyzer *a)
  return self;
  }
 
+ INLINE VALUE
+ get_rb_ts_from_a(Analyzer *a, VALUE rfield, VALUE rstring)
+ {
+ TokenStream *ts = a_get_ts(a, frt_field(rfield), rs2s(rstring));
+
+ /* Make sure that there is no entry already */
+ object_set(&ts->text, rstring);
+ return get_rb_token_stream(ts);
+ }
+
  /*
  * call-seq:
  * analyzer.token_stream(field_name, input) -> token_stream
@@ -1209,17 +1219,12 @@ frt_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
  {
  /* NOTE: Any changes made to this method may also need to be applied to
  * frt_re_analyzer_token_stream */
- TokenStream *ts;
  Analyzer *a;
  GET_A(a, self);
 
  StringValue(rstring);
 
- ts = a_get_ts(a, frt_field(rfield), rs2s(rstring));
-
- /* Make sure that there is no entry already */
- object_set(&ts->text, rstring);
- return get_rb_token_stream(ts);
+ return get_rb_ts_from_a(a, rfield, rstring);
  }
 
  #define GET_LOWER(dflt) \
@@ -1234,7 +1239,7 @@ lower = (argc ? RTEST(rlower) : dflt)
  *
  * Create a new AsciiWhiteSpaceAnalyzer which downcases tokens by default
  * but can optionally leave case as is. Lowercasing will only be done to
- * ascii characters.
+ * ASCII characters.
  *
  * lower:: set to false if you don't want the field's tokens to be downcased
  */
@@ -1279,7 +1284,7 @@ frt_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
  *
  * Create a new AsciiWhiteSpaceAnalyzer which downcases tokens by default
  * but can optionally leave case as is. Lowercasing will only be done to
- * ascii characters.
+ * ASCII characters.
  *
  * lower:: set to false if you don't want the field's tokens to be downcased
  */
@@ -1457,6 +1462,37 @@ frt_per_field_analyzer_add_field(VALUE self, VALUE rfield, VALUE ranalyzer)
  return self;
  }
 
+ /*
+ * call-seq:
+ * analyzer.token_stream(field_name, input) -> token_stream
+ *
+ * Create a new TokenStream to tokenize +input+. The TokenStream created will
+ * also depend on the +field_name+ in the case of the PerFieldAnalyzer.
+ *
+ * field_name:: name of the field to be tokenized
+ * input:: data from the field to be tokenized
+ */
+ static VALUE
+ frt_pfa_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
+ {
+ Analyzer *pfa, *a;
+ char *field = frt_field(rfield);
+ GET_A(pfa, self);
+
+ StringValue(rstring);
+ a = (Analyzer *)h_get(PFA(pfa)->dict, field);
+ if (a == NULL) {
+ a = PFA(pfa)->default_a;
+ }
+ if (a->get_ts == cwa_get_ts) {
+ return rb_funcall(CWA(a)->ranalyzer, id_token_stream, 2,
+ ID2SYM(rb_intern(field)), rb_str_new2(rs2s(rstring)));
+ }
+ else {
+ return get_rb_ts_from_a(a, rfield, rstring);
+ }
+ }
+
  /*** RegExpAnalyzer ***/
 
  static void
@@ -1585,7 +1621,7 @@ static VALUE frt_set_locale(VALUE self, VALUE locale)
  *
  * == Summary
  *
- * A Token is an occurence of a term from the text of a field. It consists
+ * A Token is an occurrence of a term from the text of a field. It consists
  * of a term's text and the start and end offset of the term in the text of
  * the field;
  *
@@ -1648,7 +1684,7 @@ static void Init_TokenStream(void)
  /*
  * Document-class: Ferret::Analysis::AsciiLetterTokenizer
  *
- * A LetterTokenizer is a tokenizer that divides text at non-ascii letters.
+ * A LetterTokenizer is a tokenizer that divides text at non-ASCII letters.
  * That is to say, it defines tokens as maximal strings of adjacent letters,
  * as defined by the regular expression _/[A-Za-z]+/_.
  *
@@ -1781,7 +1817,7 @@ static void Init_StandardTokenizer(void)
  * Document-class: Ferret::Analysis::RegExpTokenizer
  *
  * A tokenizer that recognizes tokens based on a regular expression passed to
- * the contructor. Most possible tokenizers can be created using this class.
+ * the constructor. Most possible tokenizers can be created using this class.
  *
  * === Example
  *
@@ -1817,7 +1853,7 @@ static void Init_RegExpTokenizer(void)
  * Document-class: Ferret::Analysis::AsciiLowerCaseFilter
  *
  * AsciiLowerCaseFilter normalizes a token's text to lowercase but only for
- * Ascii characters. For other characters use LowerCaseFilter.
+ * ASCII characters. For other characters use LowerCaseFilter.
  *
  * === Example
  *
@@ -1881,7 +1917,7 @@ static void Init_HyphenFilter(void)
  * Document-class: Ferret::Analysis::MappingFilter
  *
  * A MappingFilter maps strings in tokens. This is usually used to map UTF-8
- * characters to ascii characters for easier searching and better searche
+ * characters to ASCII characters for easier searching and better search
  * recall. The mapping is compiled into a Deterministic Finite Automata so it
  * is super fast. This Filter can therefor be used for indexing very large
  * datasets. Currently regular expressions are not supported. If you are
@@ -2020,7 +2056,7 @@ static void Init_StemFilter(void)
  * a policy for extracting index terms from text.
  *
  * Typical implementations first build a Tokenizer, which breaks the stream
- * of characters from the Reader into raw Tokens. One or more TokenFilter s
+ * of characters from the Reader into raw Tokens. One or more TokenFilters
  * may then be applied to the output of the Tokenizer.
  *
  * The default Analyzer just creates a LowerCaseTokenizer which converts
@@ -2057,7 +2093,7 @@ static void Init_Analyzer(void)
  * == Summary
  *
  * An AsciiLetterAnalyzer creates a TokenStream that splits the input up into
- * maximal strings of Ascii characters. If implemented in Ruby it would look
+ * maximal strings of ASCII characters. If implemented in Ruby it would look
  * like;
  *
  * class AsciiLetterAnalyzer
@@ -2075,7 +2111,7 @@ static void Init_Analyzer(void)
  * end
  *
  * As you can see it makes use of the AsciiLetterTokenizer and
- * AsciiLowerCaseFilter. Note that this tokenizer won't recognize non-ascii
+ * AsciiLowerCaseFilter. Note that this tokenizer won't recognize non-ASCII
  * characters so you should use the LetterAnalyzer is you want to analyze
  * multi-byte data like "UTF-8".
  */
@@ -2194,7 +2230,7 @@ static void Init_WhiteSpaceAnalyzer(void)
  * == Summary
  *
  * The AsciiStandardAnalyzer is the most advanced of the available
- * ascii-analyzers. If it were implemented in Ruby it would look like this;
+ * ASCII-analyzers. If it were implemented in Ruby it would look like this;
  *
  * class AsciiStandardAnalyzer
  * def initialize(stop_words = FULL_ENGLISH_STOP_WORDS, lower = true)
@@ -2212,7 +2248,7 @@ static void Init_WhiteSpaceAnalyzer(void)
  *
  * As you can see it makes use of the AsciiStandardTokenizer and you can also
  * add your own list of stop-words if you wish. Note that this tokenizer
- * won't recognize non-ascii characters so you should use the
+ * won't recognize non-ASCII characters so you should use the
  * StandardAnalyzer is you want to analyze multi-byte data like "UTF-8".
  */
  static void Init_AsciiStandardAnalyzer(void)
@@ -2292,6 +2328,8 @@ static void Init_PerFieldAnalyzer(void)
  frt_per_field_analyzer_add_field, 2);
  rb_define_method(cPerFieldAnalyzer, "[]=",
  frt_per_field_analyzer_add_field, 2);
+ rb_define_method(cPerFieldAnalyzer, "token_stream",
+ frt_pfa_analyzer_token_stream, 2);
  }
 
  /*
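
Besides the INLINE and spelling clean-ups, the substantive change in this file is that PerFieldAnalyzer now defines its own token_stream method, dispatching to the analyzer registered for the requested field (or to the default analyzer), including Ruby-implemented analyzers via rb_funcall. A short usage sketch; the field name and sample text are made up:

  require 'ferret'
  include Ferret::Analysis

  # PerFieldAnalyzer responds to #token_stream directly after this release,
  # using whichever analyzer is registered for the field.
  pfa = PerFieldAnalyzer.new(StandardAnalyzer.new)
  pfa[:tags] = WhiteSpaceAnalyzer.new(false)   # keep case for the :tags field

  ts = pfa.token_stream(:tags, "Ruby FullTextSearch Ferret")
  while token = ts.next
    puts token.text
  end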