ferret 0.11.4 → 0.11.5

Files changed (50)
  1. data/Rakefile +1 -0
  2. data/TUTORIAL +3 -3
  3. data/ext/analysis.c +12 -9
  4. data/ext/array.c +10 -10
  5. data/ext/array.h +8 -1
  6. data/ext/bitvector.c +2 -2
  7. data/ext/except.c +1 -1
  8. data/ext/ferret.c +2 -2
  9. data/ext/ferret.h +1 -1
  10. data/ext/fs_store.c +13 -2
  11. data/ext/global.c +4 -4
  12. data/ext/global.h +6 -0
  13. data/ext/hash.c +1 -1
  14. data/ext/helper.c +1 -1
  15. data/ext/helper.h +1 -1
  16. data/ext/index.c +48 -22
  17. data/ext/index.h +17 -16
  18. data/ext/mempool.c +4 -1
  19. data/ext/mempool.h +1 -1
  20. data/ext/multimapper.c +2 -2
  21. data/ext/q_fuzzy.c +2 -2
  22. data/ext/q_multi_term.c +2 -2
  23. data/ext/q_parser.c +39 -8
  24. data/ext/q_range.c +32 -1
  25. data/ext/r_analysis.c +66 -28
  26. data/ext/r_index.c +18 -19
  27. data/ext/r_qparser.c +21 -6
  28. data/ext/r_search.c +74 -49
  29. data/ext/r_store.c +1 -1
  30. data/ext/r_utils.c +17 -17
  31. data/ext/search.c +10 -5
  32. data/ext/search.h +3 -1
  33. data/ext/sort.c +2 -2
  34. data/ext/stopwords.c +23 -34
  35. data/ext/store.c +9 -9
  36. data/ext/store.h +5 -4
  37. data/lib/ferret/document.rb +2 -2
  38. data/lib/ferret/field_infos.rb +37 -35
  39. data/lib/ferret/index.rb +16 -6
  40. data/lib/ferret/number_tools.rb +2 -2
  41. data/lib/ferret_version.rb +1 -1
  42. data/test/unit/analysis/tc_token_stream.rb +40 -0
  43. data/test/unit/index/tc_index.rb +64 -101
  44. data/test/unit/index/tc_index_reader.rb +13 -0
  45. data/test/unit/largefile/tc_largefile.rb +46 -0
  46. data/test/unit/query_parser/tc_query_parser.rb +17 -1
  47. data/test/unit/search/tc_multiple_search_requests.rb +58 -0
  48. data/test/unit/search/tm_searcher.rb +27 -1
  49. data/test/unit/ts_largefile.rb +4 -0
  50. metadata +147 -144
data/ext/index.h CHANGED
@@ -65,24 +65,24 @@ extern HashTable *co_hash_create();
 
 enum StoreValues
 {
-    STORE_NO = 0,
-    STORE_YES = 1,
+    STORE_NO = 0,
+    STORE_YES = 1,
     STORE_COMPRESS = 2
 };
 
 enum IndexValues
 {
-    INDEX_NO = 0,
-    INDEX_YES = 1,
-    INDEX_UNTOKENIZED = 3,
-    INDEX_YES_OMIT_NORMS = 5,
-    INDEX_UNTOKENIZED_OMIT_NORMS = 7
+    INDEX_NO = 0,
+    INDEX_UNTOKENIZED = 1,
+    INDEX_YES = 3,
+    INDEX_UNTOKENIZED_OMIT_NORMS = 5,
+    INDEX_YES_OMIT_NORMS = 7
 };
 
 enum TermVectorValues
 {
-    TERM_VECTOR_NO = 0,
-    TERM_VECTOR_YES = 1,
+    TERM_VECTOR_NO = 0,
+    TERM_VECTOR_YES = 1,
     TERM_VECTOR_WITH_POSITIONS = 3,
     TERM_VECTOR_WITH_OFFSETS = 5,
     TERM_VECTOR_WITH_POSITIONS_OFFSETS = 7
@@ -374,7 +374,7 @@ typedef struct TermInfosWriter
 
 extern TermInfosWriter *tiw_open(Store *store,
                                  const char *segment,
-                                 int index_interval,
+                                 int index_interval,
                                  int skip_interval);
 extern void tiw_start_field(TermInfosWriter *tiw, int field_num);
 extern void tiw_add(TermInfosWriter *tiw,
@@ -456,11 +456,11 @@ extern TermDocEnum *mtdpe_new(IndexReader *ir, int field_num, char **terms,
 
 typedef struct Offset
 {
-    int start;
-    int end;
+    off_t start;
+    off_t end;
 } Offset;
 
-extern Offset *offset_new(int start, int end);
+extern Offset *offset_new(off_t start, off_t end);
 
 /****************************************************************************
  *
@@ -488,7 +488,7 @@ typedef struct Posting
     struct Posting *next;
 } Posting;
 
-extern __inline Posting *p_new(MemoryPool *mp, int doc_num, int pos);
+extern Posting *p_new(MemoryPool *mp, int doc_num, int pos);
 
 /****************************************************************************
  *
@@ -617,7 +617,7 @@ extern TermVector *tvr_get_field_tv(TermVectorsReader *tvr,
 /* * * LazyDocField * * */
 typedef struct LazyDocFieldData
 {
-    int start;
+    off_t start;
     int length;
     char *text;
 } LazyDocFieldData;
@@ -706,7 +706,7 @@ extern void fw_write_tv_index(FieldsWriter *fw);
  * A utility class (used by both IndexReader and IndexWriter) to keep track of
 * files that need to be deleted because they are no longer referenced by the
 * index.
- *
+ *
 ****************************************************************************/
 
 struct Deleter
@@ -760,6 +760,7 @@ struct IndexReader
     void (*delete_doc_i)(IndexReader *ir, int doc_num);
     void (*undelete_all_i)(IndexReader *ir);
     void (*set_deleter_i)(IndexReader *ir, Deleter *dlr);
+    bool (*is_latest_i)(IndexReader *ir);
     void (*commit_i)(IndexReader *ir);
     void (*close_i)(IndexReader *ir);
     int ref_cnt;
data/ext/mempool.c CHANGED
@@ -21,10 +21,13 @@ MemoryPool *mp_new()
     return mp_new_capa(MP_BUF_SIZE, MP_INIT_CAPA);
 }
 
-__inline void *mp_alloc(MemoryPool *mp, int size)
+INLINE void *mp_alloc(MemoryPool *mp, int size)
 {
     char *p;
     p = mp->curr_buffer + mp->pointer;
+#if defined POSH_OS_SOLARIS || defined POSH_OS_SUNOS
+    size = (((size - 1) >> 3) + 1) << 3;
+#endif
     mp->pointer += size;
 
     if (mp->pointer > mp->chunk_size) {
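Note on the mp_alloc change above: the new Solaris/SunOS branch rounds each requested size up to the next multiple of 8 before advancing the pool pointer, presumably so pooled objects stay 8-byte aligned on SPARC, where misaligned loads fault (that motivation is our reading, not stated in the diff). A minimal sketch of the arithmetic in Ruby, with a helper name of our own:

    # Mirrors size = (((size - 1) >> 3) + 1) << 3 from mp_alloc:
    # rounds a positive size up to the next multiple of 8.
    def round_up_to_8(size)
      (((size - 1) >> 3) + 1) << 3
    end

    (1..17).each { |n| print round_up_to_8(n), ' ' }
    # => 8 8 8 8 8 8 8 8 16 16 16 16 16 16 16 16 24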
data/ext/mempool.h CHANGED
@@ -16,7 +16,7 @@ typedef struct MemoryPool {
 
 extern MemoryPool *mp_new();
 extern MemoryPool *mp_new_capa(int chunk_size, int init_capa);
-extern __inline void *mp_alloc(MemoryPool *mp, int size);
+extern INLINE void *mp_alloc(MemoryPool *mp, int size);
 extern void mp_reset(MemoryPool *mp);
 extern void mp_destroy(MemoryPool *mp);
 extern char *mp_strdup(MemoryPool *mp, const char *str);
data/ext/multimapper.c CHANGED
@@ -121,7 +121,7 @@ MultiMapper *mulmap_new()
     return self;
 }
 
-static __inline void mulmap_free_dstates(MultiMapper *self)
+static INLINE void mulmap_free_dstates(MultiMapper *self)
 {
     if (self->d_size > 0) {
         int i;
@@ -151,7 +151,7 @@ void mulmap_add_mapping(MultiMapper *self, const char *pattern, const char *rep)
 }
 
 
-static __inline void mulmap_bv_set_states(BitVector *bv, int *states, int cnt)
+static INLINE void mulmap_bv_set_states(BitVector *bv, int *states, int cnt)
 {
     int i;
     for (i = cnt - 1; i >= 0; i--) {
data/ext/q_fuzzy.c CHANGED
@@ -11,7 +11,7 @@
  *
 ****************************************************************************/
 
-static __inline int fuzq_calculate_max_distance(FuzzyQuery *fuzq, int m)
+static INLINE int fuzq_calculate_max_distance(FuzzyQuery *fuzq, int m)
 {
     return (int)((1.0 - fuzq->min_sim) * (MIN(fuzq->text_len, m) + fuzq->pre_len));
 }
@@ -24,7 +24,7 @@ static void fuzq_initialize_max_distances(FuzzyQuery *fuzq)
     }
 }
 
-static __inline int fuzq_get_max_distance(FuzzyQuery *fuzq, int m)
+static INLINE int fuzq_get_max_distance(FuzzyQuery *fuzq, int m)
 {
     return (m < TYPICAL_LONGEST_WORD) ? fuzq->max_distances[m]
                                       : fuzq_calculate_max_distance(fuzq, m);
data/ext/q_multi_term.c CHANGED
@@ -236,7 +236,7 @@ static bool multi_tsc_advance_to(Scorer *self, int target_doc_num)
     return (pq_top(tdew_pq) == NULL) ? false : true;
 }
 
-static __inline bool multi_tsc_skip_to(Scorer *self, int target_doc_num)
+static INLINE bool multi_tsc_skip_to(Scorer *self, int target_doc_num)
 {
     return multi_tsc_advance_to(self, target_doc_num) && multi_tsc_next(self);
 }
@@ -661,7 +661,7 @@ Query *multi_tq_new(const char *field)
 
 void multi_tq_add_term_boost(Query *self, const char *term, float boost)
 {
-    if (boost > MTQ(self)->min_boost) {
+    if (boost > MTQ(self)->min_boost && term && term[0]) {
         BoostedTerm *bt = boosted_term_new(term, boost);
         PriorityQueue *bt_pq = MTQ(self)->boosted_terms;
         pq_insert(bt_pq, bt);
data/ext/q_parser.c CHANGED
@@ -147,7 +147,7 @@ typedef union YYSTYPE
     Phrase *phrase;
     char *str;
 }
-/* Line 193 of yacc.c. */
+/* Line 187 of yacc.c. */
 #line 152 "y.tab.c"
 YYSTYPE;
 # define yystype YYSTYPE /* obsolescent; will be withdrawn */
@@ -2061,12 +2061,14 @@ get_word_done:
      * just checks for all of them. */
     *bufp = '\0';
     len = (int)(bufp - buf);
-    if (len == 3) {
-        if (buf[0] == 'A' && buf[1] == 'N' && buf[2] == 'D') return AND;
-        if (buf[0] == 'N' && buf[1] == 'O' && buf[2] == 'T') return NOT;
-        if (buf[0] == 'R' && buf[1] == 'E' && buf[2] == 'Q') return REQ;
+    if (qp->use_keywords) {
+        if (len == 3) {
+            if (buf[0] == 'A' && buf[1] == 'N' && buf[2] == 'D') return AND;
+            if (buf[0] == 'N' && buf[1] == 'O' && buf[2] == 'T') return NOT;
+            if (buf[0] == 'R' && buf[1] == 'E' && buf[2] == 'Q') return REQ;
+        }
+        if (len == 2 && buf[0] == 'O' && buf[1] == 'R') return OR;
     }
-    if (len == 2 && buf[0] == 'O' && buf[1] == 'R') return OR;
 
     /* found a word so return it. */
     lvalp->str = buf;
@@ -2489,9 +2491,37 @@ static Query *get_phrase_query(QParser *qp, char *field,
         }
         else {
             int i;
-            q = bq_new_max(false, qp->max_clauses);
+            int term_cnt = 0;
+            Token *token;
+            char *last_word = NULL;
+
             for (i = 0; i < word_count; i++) {
-                bq_add_query_nr(q, get_term_q(qp, field, words[i]), BC_SHOULD);
+                token = ts_next(get_cached_ts(qp, field, words[i]));
+                free(words[i]);
+                if (token) {
+                    last_word = words[i] = estrdup(token->text);
+                    ++term_cnt;
+                }
+                else {
+                    words[i] = estrdup("");
+                }
+            }
+
+            switch (term_cnt) {
+                case 0:
+                    q = bq_new(false);
+                    break;
+                case 1:
+                    q = tq_new(field, last_word);
+                    break;
+                default:
+                    q = multi_tq_new_conf(field, term_cnt, 0.0);
+                    for (i = 0; i < word_count; i++) {
+                        if (words[i][0]) {
+                            multi_tq_add_term(q, words[i]);
+                        }
+                    }
+                    break;
             }
         }
     }
@@ -2620,6 +2650,7 @@ QParser *qp_new(HashSet *all_fields, HashSet *def_fields,
     self->max_clauses = QP_MAX_CLAUSES;
     self->handle_parse_errors = false;
     self->allow_any_fields = false;
+    self->use_keywords = true;
     self->def_slop = 0;
     self->fields_buf = hs_new_str(NULL);
     self->all_fields = all_fields;
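Two behavioural changes in q_parser.c are worth calling out. First, the AND/OR/NOT/REQ keywords are now only special-cased when the new qp->use_keywords flag is set (qp_new defaults it to true). Second, get_phrase_query runs each phrase word through the field's cached analyzer and builds a term or multi-term query from the surviving tokens, so a phrase made up entirely of stop words no longer produces a broken query. A hedged Ruby sketch of the first change, assuming the flag is exposed to Ruby as the :use_keywords QueryParser option (the r_qparser.c changes in this release point that way, but the option name here is an assumption):

    require 'ferret'

    # :use_keywords is assumed to map onto the new qp->use_keywords flag;
    # with it turned off, "and"/"or"/"not" parse as ordinary terms.
    parser = Ferret::QueryParser.new(:fields       => [:title],
                                     :use_keywords => false)
    puts parser.parse('rock and roll')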
data/ext/q_range.c CHANGED
@@ -269,13 +269,44 @@ static void rq_destroy(Query *self)
     q_destroy_i(self);
 }
 
+static MatchVector *rq_get_matchv_i(Query *self, MatchVector *mv,
+                                    TermVector *tv)
+{
+    Range *range = RQ(((ConstantScoreQuery *)self)->original)->range;
+    if (strcmp(tv->field, range->field) == 0) {
+        int i, j;
+        char *upper_text = range->upper_term;
+        char *lower_text = range->lower_term;
+        int upper_limit = range->include_upper ? 1 : 0;
+        int lower_limit = range->include_lower ? 1 : 0;
+
+        for (i = tv->term_cnt - 1; i >= 0; i--) {
+            TVTerm *tv_term = &(tv->terms[i]);
+            char *text = tv_term->text;
+            if ((!upper_text || strcmp(text, upper_text) < upper_limit) &&
+                (!lower_text || strcmp(lower_text, text) < lower_limit)) {
+
+                for (j = 0; j < tv_term->freq; j++) {
+                    int pos = tv_term->positions[j];
+                    matchv_add(mv, pos, pos);
+                }
+            }
+        }
+    }
+    return mv;
+}
+
 static Query *rq_rewrite(Query *self, IndexReader *ir)
 {
+    Query *csq;
     Range *r = RQ(self)->range;
     Filter *filter = rfilt_new(r->field, r->lower_term, r->upper_term,
                                r->include_lower, r->include_upper);
     (void)ir;
-    return csq_new_nr(filter);
+    csq = csq_new_nr(filter);
+    ((ConstantScoreQuery *)csq)->original = self;
+    csq->get_matchv_i = &rq_get_matchv_i;
+    return (Query *)csq;
 }
 
 static unsigned long rq_hash(Query *self)
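The q_range.c addition gives range queries a get_matchv_i implementation: the rewritten ConstantScoreQuery keeps a pointer back to the original RangeQuery, and rq_get_matchv_i walks the field's term vector, recording the positions of terms that fall inside the range. In practice this is what lets range queries participate in highlighting. A hedged Ruby sketch (the field name and documents are invented; it assumes the default field settings, which store term vectors with positions, and the existing Index#highlight API):

    require 'ferret'

    index = Ferret::I.new(:default_field => :day)
    %w(20070801 20070815 20070901).each { |d| index << {:day => d} }

    query = 'day:[20070810 20070831]'
    index.search_each(query) do |doc_id, score|
      # With this change the matching range terms get marked up
      puts index.highlight(query, doc_id, :field => :day,
                           :pre_tag => '<b>', :post_tag => '</b>')
    end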
data/ext/r_analysis.c CHANGED
@@ -150,7 +150,7 @@ frt_set_token(Token *tk, VALUE rt)
  * values as needed. For example, if you have a stop word filter you will be
  * skipping tokens. Let's say you have the stop words "the" and "and" and you
  * parse the title "The Old Man and the Sea". The terms "Old", "Man" and
- * "Sea" will have the position incerements 2, 1 and 3 respectively.
+ * "Sea" will have the position increments 2, 1 and 3 respectively.
  *
  * Another reason you might want to vary the position increment is if you are
  * adding synonyms to the index. For example let's say you have the synonym
@@ -424,7 +424,7 @@ get_rb_token_stream(TokenStream *ts)
     return rts;
 }
 
-static inline VALUE
+static INLINE VALUE
 get_wrapped_ts(VALUE self, VALUE rstr, TokenStream *ts)
 {
     StringValue(rstr);
@@ -811,7 +811,7 @@ frt_a_letter_tokenizer_init(VALUE self, VALUE rstr)
  *     LetterTokenizer.new(lower = true) -> tokenizer
  *
  * Create a new LetterTokenizer which optionally downcases tokens. Downcasing
- * is done according the the current locale.
+ * is done according the current locale.
  *
  * lower:: set to false if you don't wish to downcase tokens
  */
@@ -842,7 +842,7 @@ frt_a_whitespace_tokenizer_init(VALUE self, VALUE rstr)
  *     WhiteSpaceTokenizer.new(lower = true) -> tokenizer
  *
  * Create a new WhiteSpaceTokenizer which optionally downcases tokens.
- * Downcasing is done according the the current locale.
+ * Downcasing is done according the current locale.
  *
  * lower:: set to false if you don't wish to downcase tokens
  */
@@ -873,7 +873,7 @@ frt_a_standard_tokenizer_init(VALUE self, VALUE rstr)
  *     StandardTokenizer.new(lower = true) -> tokenizer
  *
  * Create a new StandardTokenizer which optionally downcases tokens.
- * Downcasing is done according the the current locale.
+ * Downcasing is done according the current locale.
 *
 * lower:: set to false if you don't wish to downcase tokens
 */
@@ -896,7 +896,7 @@ frt_standard_tokenizer_init(VALUE self, VALUE rstr)
  *     AsciiLowerCaseFilter.new(token_stream) -> token_stream
  *
  * Create an AsciiLowerCaseFilter which normalizes a token's text to
- * lowercase but only for Ascii characters. For other characters use
+ * lowercase but only for ASCII characters. For other characters use
 * LowerCaseFilter.
 */
 static VALUE
@@ -990,7 +990,7 @@ frt_stop_filter_init(int argc, VALUE *argv, VALUE self)
     return self;
 }
 
-static __inline void frt_add_mapping_i(TokenStream *mf, VALUE from, char *to)
+static INLINE void frt_add_mapping_i(TokenStream *mf, VALUE from, char *to)
 {
     switch (TYPE(from)) {
         case T_STRING:
@@ -1046,8 +1046,8 @@ static int frt_add_mappings_i(VALUE key, VALUE value, VALUE arg)
  *     MappingFilter.new(token_stream, mapping) -> token_stream
  *
  * Create an MappingFilter which maps strings in tokens. This is usually used
- * to map UTF-8 characters to ascii characters for easier searching and
- * better searche recall. The mapping is compiled into a Deterministic Finite
+ * to map UTF-8 characters to ASCII characters for easier searching and
+ * better search recall. The mapping is compiled into a Deterministic Finite
 * Automata so it is super fast. This Filter can therefor be used for
 * indexing very large datasets. Currently regular expressions are not
 * supported. If you are really interested in the feature, please contact me
@@ -1087,7 +1087,7 @@ frt_mapping_filter_init(VALUE self, VALUE rsub_ts, VALUE mapping)
 *               algorithm="english",
 *               encoding="UTF-8") -> token_stream
 *
- * Create an StemFilter which uses a snowball stemmer (thankyou Martin
+ * Create an StemFilter which uses a snowball stemmer (thank you Martin
 * Porter) to stem words. You can optionally specify the algorithm (default:
 * "english") and encoding (default: "UTF-8").
 *
@@ -1193,6 +1193,16 @@ frt_get_analyzer(Analyzer *a)
     return self;
 }
 
+INLINE VALUE
+get_rb_ts_from_a(Analyzer *a, VALUE rfield, VALUE rstring)
+{
+    TokenStream *ts = a_get_ts(a, frt_field(rfield), rs2s(rstring));
+
+    /* Make sure that there is no entry already */
+    object_set(&ts->text, rstring);
+    return get_rb_token_stream(ts);
+}
+
 /*
  * call-seq:
  *     analyzer.token_stream(field_name, input) -> token_stream
@@ -1209,17 +1219,12 @@ frt_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
 {
     /* NOTE: Any changes made to this method may also need to be applied to
      * frt_re_analyzer_token_stream */
-    TokenStream *ts;
     Analyzer *a;
     GET_A(a, self);
 
     StringValue(rstring);
 
-    ts = a_get_ts(a, frt_field(rfield), rs2s(rstring));
-
-    /* Make sure that there is no entry already */
-    object_set(&ts->text, rstring);
-    return get_rb_token_stream(ts);
+    return get_rb_ts_from_a(a, rfield, rstring);
 }
 
 #define GET_LOWER(dflt) \
@@ -1234,7 +1239,7 @@ lower = (argc ? RTEST(rlower) : dflt)
 *
 * Create a new AsciiWhiteSpaceAnalyzer which downcases tokens by default
 * but can optionally leave case as is. Lowercasing will only be done to
- * ascii characters.
+ * ASCII characters.
 *
 * lower:: set to false if you don't want the field's tokens to be downcased
 */
@@ -1279,7 +1284,7 @@ frt_white_space_analyzer_init(int argc, VALUE *argv, VALUE self)
 *
 * Create a new AsciiWhiteSpaceAnalyzer which downcases tokens by default
 * but can optionally leave case as is. Lowercasing will only be done to
- * ascii characters.
+ * ASCII characters.
 *
 * lower:: set to false if you don't want the field's tokens to be downcased
 */
@@ -1457,6 +1462,37 @@ frt_per_field_analyzer_add_field(VALUE self, VALUE rfield, VALUE ranalyzer)
     return self;
 }
 
+/*
+ * call-seq:
+ *     analyzer.token_stream(field_name, input) -> token_stream
+ *
+ * Create a new TokenStream to tokenize +input+. The TokenStream created will
+ * also depend on the +field_name+ in the case of the PerFieldAnalyzer.
+ *
+ * field_name:: name of the field to be tokenized
+ * input::      data from the field to be tokenized
+ */
+static VALUE
+frt_pfa_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rstring)
+{
+    Analyzer *pfa, *a;
+    char *field = frt_field(rfield);
+    GET_A(pfa, self);
+
+    StringValue(rstring);
+    a = (Analyzer *)h_get(PFA(pfa)->dict, field);
+    if (a == NULL) {
+        a = PFA(pfa)->default_a;
+    }
+    if (a->get_ts == cwa_get_ts) {
+        return rb_funcall(CWA(a)->ranalyzer, id_token_stream, 2,
+                          ID2SYM(rb_intern(field)), rb_str_new2(rs2s(rstring)));
+    }
+    else {
+        return get_rb_ts_from_a(a, rfield, rstring);
+    }
+}
+
 /*** RegExpAnalyzer ***/
 
 static void
@@ -1585,7 +1621,7 @@ static VALUE frt_set_locale(VALUE self, VALUE locale)
 *
 * == Summary
 *
- * A Token is an occurence of a term from the text of a field. It consists
+ * A Token is an occurrence of a term from the text of a field. It consists
 * of a term's text and the start and end offset of the term in the text of
 * the field;
 *
@@ -1648,7 +1684,7 @@ static void Init_TokenStream(void)
 /*
 * Document-class: Ferret::Analysis::AsciiLetterTokenizer
 *
- * A LetterTokenizer is a tokenizer that divides text at non-ascii letters.
+ * A LetterTokenizer is a tokenizer that divides text at non-ASCII letters.
 * That is to say, it defines tokens as maximal strings of adjacent letters,
 * as defined by the regular expression _/[A-Za-z]+/_.
 *
@@ -1781,7 +1817,7 @@ static void Init_StandardTokenizer(void)
 * Document-class: Ferret::Analysis::RegExpTokenizer
 *
 * A tokenizer that recognizes tokens based on a regular expression passed to
- * the contructor. Most possible tokenizers can be created using this class.
+ * the constructor. Most possible tokenizers can be created using this class.
 *
 * === Example
 *
@@ -1817,7 +1853,7 @@ static void Init_RegExpTokenizer(void)
 * Document-class: Ferret::Analysis::AsciiLowerCaseFilter
 *
 * AsciiLowerCaseFilter normalizes a token's text to lowercase but only for
- * Ascii characters. For other characters use LowerCaseFilter.
+ * ASCII characters. For other characters use LowerCaseFilter.
 *
 * === Example
 *
@@ -1881,7 +1917,7 @@ static void Init_HyphenFilter(void)
 * Document-class: Ferret::Analysis::MappingFilter
 *
 * A MappingFilter maps strings in tokens. This is usually used to map UTF-8
- * characters to ascii characters for easier searching and better searche
+ * characters to ASCII characters for easier searching and better search
 * recall. The mapping is compiled into a Deterministic Finite Automata so it
 * is super fast. This Filter can therefor be used for indexing very large
 * datasets. Currently regular expressions are not
@@ -2020,7 +2056,7 @@ static void Init_StemFilter(void)
 * a policy for extracting index terms from text.
 *
 * Typical implementations first build a Tokenizer, which breaks the stream
- * of characters from the Reader into raw Tokens. One or more TokenFilter s
+ * of characters from the Reader into raw Tokens. One or more TokenFilters
 * may then be applied to the output of the Tokenizer.
 *
 * The default Analyzer just creates a LowerCaseTokenizer which converts
@@ -2057,7 +2093,7 @@ static void Init_Analyzer(void)
 * == Summary
 *
 * An AsciiLetterAnalyzer creates a TokenStream that splits the input up into
- * maximal strings of Ascii characters. If implemented in Ruby it would look
+ * maximal strings of ASCII characters. If implemented in Ruby it would look
 * like;
 *
 *   class AsciiLetterAnalyzer
@@ -2075,7 +2111,7 @@ static void Init_Analyzer(void)
 *   end
 *
 * As you can see it makes use of the AsciiLetterTokenizer and
- * AsciiLowerCaseFilter. Note that this tokenizer won't recognize non-ascii
+ * AsciiLowerCaseFilter. Note that this tokenizer won't recognize non-ASCII
 * characters so you should use the LetterAnalyzer is you want to analyze
 * multi-byte data like "UTF-8".
 */
@@ -2194,7 +2230,7 @@ static void Init_WhiteSpaceAnalyzer(void)
 * == Summary
 *
 * The AsciiStandardAnalyzer is the most advanced of the available
- * ascii-analyzers. If it were implemented in Ruby it would look like this;
+ * ASCII-analyzers. If it were implemented in Ruby it would look like this;
 *
 *   class AsciiStandardAnalyzer
 *     def initialize(stop_words = FULL_ENGLISH_STOP_WORDS, lower = true)
@@ -2212,7 +2248,7 @@ static void Init_WhiteSpaceAnalyzer(void)
 *
 * As you can see it makes use of the AsciiStandardTokenizer and you can also
 * add your own list of stop-words if you wish. Note that this tokenizer
- * won't recognize non-ascii characters so you should use the
+ * won't recognize non-ASCII characters so you should use the
 * StandardAnalyzer is you want to analyze multi-byte data like "UTF-8".
 */
 static void Init_AsciiStandardAnalyzer(void)
@@ -2292,6 +2328,8 @@ static void Init_PerFieldAnalyzer(void)
                      frt_per_field_analyzer_add_field, 2);
     rb_define_method(cPerFieldAnalyzer, "[]=",
                      frt_per_field_analyzer_add_field, 2);
+    rb_define_method(cPerFieldAnalyzer, "token_stream",
+                     frt_pfa_analyzer_token_stream, 2);
 }
 
 /*
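The last hunk wires frt_pfa_analyzer_token_stream up as PerFieldAnalyzer#token_stream, so the analyzer registered for a field (or a wrapped Ruby analyzer, via id_token_stream) is now consulted instead of the generic Analyzer fallback. A short Ruby sketch of the behaviour this enables (the field names and input are invented):

    require 'ferret'
    include Ferret::Analysis

    pfa = PerFieldAnalyzer.new(StandardAnalyzer.new)
    pfa[:code] = WhiteSpaceAnalyzer.new(false)   # keep case for :code

    # token_stream now resolves the analyzer registered for :code
    ts = pfa.token_stream(:code, 'Foo_Bar BAZ 42')
    while token = ts.next
      puts token.text                            # Foo_Bar / BAZ / 42
    end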