ferret 0.11.4 → 0.11.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. data/Rakefile +1 -0
  2. data/TUTORIAL +3 -3
  3. data/ext/analysis.c +12 -9
  4. data/ext/array.c +10 -10
  5. data/ext/array.h +8 -1
  6. data/ext/bitvector.c +2 -2
  7. data/ext/except.c +1 -1
  8. data/ext/ferret.c +2 -2
  9. data/ext/ferret.h +1 -1
  10. data/ext/fs_store.c +13 -2
  11. data/ext/global.c +4 -4
  12. data/ext/global.h +6 -0
  13. data/ext/hash.c +1 -1
  14. data/ext/helper.c +1 -1
  15. data/ext/helper.h +1 -1
  16. data/ext/index.c +48 -22
  17. data/ext/index.h +17 -16
  18. data/ext/mempool.c +4 -1
  19. data/ext/mempool.h +1 -1
  20. data/ext/multimapper.c +2 -2
  21. data/ext/q_fuzzy.c +2 -2
  22. data/ext/q_multi_term.c +2 -2
  23. data/ext/q_parser.c +39 -8
  24. data/ext/q_range.c +32 -1
  25. data/ext/r_analysis.c +66 -28
  26. data/ext/r_index.c +18 -19
  27. data/ext/r_qparser.c +21 -6
  28. data/ext/r_search.c +74 -49
  29. data/ext/r_store.c +1 -1
  30. data/ext/r_utils.c +17 -17
  31. data/ext/search.c +10 -5
  32. data/ext/search.h +3 -1
  33. data/ext/sort.c +2 -2
  34. data/ext/stopwords.c +23 -34
  35. data/ext/store.c +9 -9
  36. data/ext/store.h +5 -4
  37. data/lib/ferret/document.rb +2 -2
  38. data/lib/ferret/field_infos.rb +37 -35
  39. data/lib/ferret/index.rb +16 -6
  40. data/lib/ferret/number_tools.rb +2 -2
  41. data/lib/ferret_version.rb +1 -1
  42. data/test/unit/analysis/tc_token_stream.rb +40 -0
  43. data/test/unit/index/tc_index.rb +64 -101
  44. data/test/unit/index/tc_index_reader.rb +13 -0
  45. data/test/unit/largefile/tc_largefile.rb +46 -0
  46. data/test/unit/query_parser/tc_query_parser.rb +17 -1
  47. data/test/unit/search/tc_multiple_search_requests.rb +58 -0
  48. data/test/unit/search/tm_searcher.rb +27 -1
  49. data/test/unit/ts_largefile.rb +4 -0
  50. metadata +147 -144
data/ext/r_index.c CHANGED
@@ -274,7 +274,7 @@ frt_fi_is_tokenized(VALUE self)
274
274
  * used to store the field boosts for an indexed field. If you do not boost
275
275
  * any fields, and you can live without scoring based on field length then
276
276
  * you can omit the norms file. This will give the index a slight performance
277
- * boost and it will use less memory, escpecially for indexes which have a
277
+ * boost and it will use less memory, especially for indexes which have a
278
278
  * large number of documents.
279
279
  */
280
280
  static VALUE
@@ -623,7 +623,7 @@ frt_fis_create_index(VALUE self, VALUE rdir)
623
623
  * call-seq:
624
624
  * fis.fields -> symbol array
625
625
  *
626
- * Return a list of the field names (as symbols) of all the fieldcs in the
626
+ * Return a list of the field names (as symbols) of all the fields in the
627
627
  * index.
628
628
  */
629
629
  static VALUE
@@ -1415,7 +1415,7 @@ frt_iw_init(int argc, VALUE *argv, VALUE self)
1415
1415
  * iw.doc_count -> number
1416
1416
  *
1417
1417
  * Returns the number of documents in the Index. Note that deletions won't be
1418
- * taken into account until the IndexWriter has been commited.
1418
+ * taken into account until the IndexWriter has been committed.
1419
1419
  */
1420
1420
  static VALUE
1421
1421
  frt_iw_get_doc_count(VALUE self)
@@ -1660,7 +1660,7 @@ frt_iw_get_analyzer(VALUE self)
1660
1660
  *
1661
1661
  * Set the Analyzer for this IndexWriter. This is useful if you need to
1662
1662
  * change the analyzer for a special document. It is risky though as the
1663
- * same anlyzer will be used for all documents during search.
1663
+ * same analyzer will be used for all documents during search.
1664
1664
  */
1665
1665
  static VALUE
1666
1666
  frt_iw_set_analyzer(VALUE self, VALUE ranalyzer)
@@ -2191,7 +2191,7 @@ frt_ir_init(VALUE self, VALUE rdir)
2191
2191
  *
2192
2192
  * Expert: change the boost value for a +field+ in document at +doc_id+.
2193
2193
  * +val+ should be an integer in the range 0..255 which corresponds to an
2194
- * encoced float value.
2194
+ * encoded float value.
2195
2195
  */
2196
2196
  static VALUE
2197
2197
  frt_ir_set_norm(VALUE self, VALUE rdoc_id, VALUE rfield, VALUE rval)
@@ -2267,7 +2267,7 @@ frt_ir_commit(VALUE self)
2267
2267
  * index_reader.close -> index_reader
2268
2268
  *
2269
2269
  * Close the IndexReader. This method also commits any deletions made by this
2270
- * IndexReader. Thise method will be called explicitly by the garbage
2270
+ * IndexReader. This method will be called automatically by the garbage
2271
2271
  * collector but you should call it explicitly to commit any changes as soon
2272
2272
  * as possible and to close any locks held by the object to prevent locking
2273
2273
  * errors.
@@ -2286,7 +2286,7 @@ frt_ir_close(VALUE self)
2286
2286
  * call-seq:
2287
2287
  * index_reader.has_deletions? -> bool
2288
2288
  *
2289
- * Return true if the index has any deletions, either uncommited by this
2289
+ * Return true if the index has any deletions, either uncommitted by this
2290
2290
  * IndexReader or committed by any other IndexReader.
2291
2291
  */
2292
2292
  static VALUE
@@ -2329,7 +2329,7 @@ frt_ir_is_deleted(VALUE self, VALUE rdoc_id)
2329
2329
  * call-seq:
2330
2330
  * index_reader.max_doc -> number
2331
2331
  *
2332
- * Returns 1 + the maximum document id in the index. It is the the
2332
+ * Returns 1 + the maximum document id in the index. It is the
2333
2333
  * document_id that will be used by the next document added to the index. If
2334
2334
  * there are no deletions, this number also refers to the number of documents
2335
2335
  * in the index.
@@ -2361,7 +2361,7 @@ frt_ir_num_docs(VALUE self)
2361
2361
  * index_reader.undelete_all -> index_reader
2362
2362
  *
2363
2363
  * Undelete all deleted documents in the index. This is kind of like a
2364
- * rollback feature. Not that once an index is commited or a merge happens
2364
+ * rollback feature. Note that once an index is committed or a merge happens
2365
2365
  * during index, deletions will be committed and undelete_all will have no
2366
2366
  * effect on these documents.
2367
2367
  */
@@ -2434,7 +2434,6 @@ frt_ir_get_doc(int argc, VALUE *argv, VALUE self)
2434
2434
  len = FIX2LONG(arg2);
2435
2435
  return frt_get_doc_range(ir, pos, len, max);
2436
2436
  }
2437
- return Qnil;
2438
2437
  }
2439
2438
 
2440
2439
  /*
@@ -2713,7 +2712,7 @@ frt_ir_version(VALUE self)
2713
2712
  *
2714
2713
  * == Summary
2715
2714
  *
2716
- * The FieldInfo class is the field descripter for the index. It specifies
2715
+ * The FieldInfo class is the field descriptor for the index. It specifies
2717
2716
  * whether a field is compressed or not or whether it should be indexed and
2718
2717
  * tokenized. Every field has a name which must be a symbol. There are three
2719
2718
  * properties that you can set, +:store+, +:index+ and +:term_vector+. You
@@ -2740,7 +2739,7 @@ frt_ir_version(VALUE self)
2740
2739
  * be indexed to be store in the Ferret index. You may want to use the index
2741
2740
  * as a simple database and store things like images or MP3s in the index. By
2742
2741
  * default each field is indexed and tokenized (split into tokens) (+:yes+).
2743
- * If you don't want to index the field use +:no+. If you wan the field
2742
+ * If you don't want to index the field use +:no+. If you want the field
2744
2743
  * indexed but not tokenized, use +:untokenized+. Do this for the fields you
2745
2744
  * wish to sort by. There are two other values for +:index+; +:omit_norms+
2746
2745
  * and +:untokenized_omit_norms+. These values correspond to +:yes+ and
@@ -2754,7 +2753,7 @@ frt_ir_version(VALUE self)
2754
2753
  * or not you would like to store term-vectors. The available options are
2755
2754
  * +:no+, +:yes+, +:with_positions+, +:with_offsets+ and
2756
2755
  * +:with_positions_offsets+. Note that you need to store the positions to
2757
- * asscociate offsets with individual terms in the term_vector.
2756
+ * associate offsets with individual terms in the term_vector.
2758
2757
  *
2759
2758
  * == Property Table
2760
2759
  *
@@ -2946,7 +2945,7 @@ Init_FieldInfos(void)
2946
2945
  *
2947
2946
  * te = index_reader.terms(:content)
2948
2947
  *
2949
- * te.each {|term, doc_freq| puts "#{term} occured #{doc_freq} times" }
2948
+ * te.each {|term, doc_freq| puts "#{term} occurred #{doc_freq} times" }
2950
2949
  *
2951
2950
  * # or you could do it like this;
2952
2951
  * te = index_reader.terms(:content)
@@ -3093,7 +3092,7 @@ Init_TVTerm(void)
3093
3092
  * highlight search matches in results. This is all done internally so you
3094
3093
  * won't need to worry about the TermVector object. There are some other
3095
3094
  * reasons you may want to use the TermVectors object however. For example,
3096
- * you may wish to see which terms are the most commonly occuring terms in a
3095
+ * you may wish to see which terms are the most commonly occurring terms in a
3097
3096
  * document to implement a MoreLikeThis search.
3098
3097
  *
3099
3098
  * == Example
@@ -3112,7 +3111,7 @@ Init_TVTerm(void)
3112
3111
  * +positions+ and +offsets+ can be +nil+ depending on what you set the
3113
3112
  * +:term_vector+ to when you set the FieldInfo object for the field. Note in
3114
3113
  * particular that you need to store both positions and offsets if you want
3115
- * to asscociate offsets with particular terms.
3114
+ * to associate offsets with particular terms.
3116
3115
  */
3117
3116
  static void
3118
3117
  Init_TermVector(void)
@@ -3136,7 +3135,7 @@ Init_TermVector(void)
3136
3135
  * == Summary
3137
3136
  *
3138
3137
  * The IndexWriter is the class used to add documents to an index. You can
3139
- * also delete docuements from the index using this class. The indexing
3138
+ * also delete documents from the index using this class. The indexing
3140
3139
  * process is highly customizable and the IndexWriter has the following
3141
3140
  * parameters;
3142
3141
  *
@@ -3212,7 +3211,7 @@ Init_TermVector(void)
3212
3211
  * documents).
3213
3212
  * max_field_length:: Default: 10000. The maximum number of terms added to
3214
3213
  * a single field. This can be useful to protect the
3215
- * indexer when indexing documents fromt the web for
3214
+ * indexer when indexing documents from the web for
3216
3215
  * example. Usually the most important terms will occur
3217
3216
  * early on in a document so you can often safely
3218
3217
  * ignore the terms in a field after a certain number
@@ -3221,7 +3220,7 @@ Init_TermVector(void)
3221
3220
  * first 1000 terms in a field. On the other hand, if
3222
3221
  * you want to be more thorough and you are indexing
3223
3222
  * documents from your file-system you may set this
3224
- * paramter to Ferret::FIX_INT_MAX.
3223
+ * parameter to Ferret::FIX_INT_MAX.
3225
3224
  * use_compound_file:: Default: true. Uses a compound file to store the
3226
3225
  * index. This prevents an error being raised for
3227
3226
  * having too many files open at the same time. The
data/ext/r_qparser.c CHANGED
@@ -16,6 +16,7 @@ static VALUE sym_default_slop;
16
16
  static VALUE sym_handle_parse_errors;
17
17
  static VALUE sym_clean_string;
18
18
  static VALUE sym_max_clauses;
19
+ static VALUE sym_use_keywords;
19
20
 
20
21
  extern VALUE frt_get_analyzer(Analyzer *a);
21
22
  extern VALUE frt_get_q(Query *q);
@@ -116,11 +117,20 @@ frt_get_fields(VALUE rfields)
116
117
  * of terms allowed in multi, prefix, wild-card or
117
118
  * fuzzy queries when those queries are generated by
118
119
  * rewriting other queries
120
+ * :use_keywords: Default: true. By default AND, OR, NOT and REQ are
121
+ * keywords used by the query parser. Sometimes this
122
+ * is undesirable. For example, if your application
123
+ * allows searching for US states by their
124
+ * abbreviation, then OR will be a common query
125
+ * string. By setting :use_keywords to false, OR will
126
+ * no longer be a keyword allowing searches for the
127
+ * state of Oregon. You will still be able to use
128
+ * boolean queries by using the + and - characters.
119
129
  */
120
130
  static VALUE
121
131
  frt_qp_init(int argc, VALUE *argv, VALUE self)
122
132
  {
123
- VALUE roptions;
133
+ VALUE roptions = Qnil;
124
134
  VALUE rval;
125
135
  Analyzer *analyzer = NULL;
126
136
  bool has_options = false;
@@ -150,6 +160,7 @@ frt_qp_init(int argc, VALUE *argv, VALUE self)
150
160
  }
151
161
  } else {
152
162
  def_fields = frt_get_fields(roptions);
163
+ roptions = Qnil;
153
164
  }
154
165
  }
155
166
  if (all_fields == NULL) {
@@ -165,7 +176,7 @@ frt_qp_init(int argc, VALUE *argv, VALUE self)
165
176
  qp->clean_str = true;
166
177
  qp->handle_parse_errors = true;
167
178
  /* handle options */
168
- if (argc > 0) {
179
+ if (roptions != Qnil) {
169
180
  if (Qnil != (rval = rb_hash_aref(roptions, sym_handle_parse_errors))) {
170
181
  qp->handle_parse_errors = RTEST(rval);
171
182
  }
@@ -187,6 +198,9 @@ frt_qp_init(int argc, VALUE *argv, VALUE self)
187
198
  if (Qnil != (rval = rb_hash_aref(roptions, sym_max_clauses))) {
188
199
  qp->max_clauses = FIX2INT(rval);
189
200
  }
201
+ if (Qnil != (rval = rb_hash_aref(roptions, sym_use_keywords))) {
202
+ qp->use_keywords = RTEST(rval);
203
+ }
190
204
  }
191
205
  Frt_Wrap_Struct(self, frt_qp_mark, frt_qp_free, qp);
192
206
  object_add(qp, self);
@@ -493,8 +507,8 @@ Init_QueryParseException(void)
493
507
  * === WildQuery
494
508
  *
495
509
  * A wild query is a query using the pattern matching characters * and ?. *
496
- * matchs 0 or more characters while ? matchs a single character. This type
497
- * of query can be really useful for matching heirarchical categories for
510
+ * matches 0 or more characters while ? matches a single character. This type
511
+ * of query can be really useful for matching hierarchical categories for
498
512
  * example. Let's say we had this structure;
499
513
  *
500
514
  * /sport/skiing
@@ -514,7 +528,7 @@ Init_QueryParseException(void)
514
528
  * the wild characters at the beginning of the query as it'll have to iterate
515
529
  * through every term in that field. Having said that, some fields like the
516
530
  * category field above will only have a small number of distinct fields so
517
- * this could be ok.
531
+ * this could be okay.
518
532
  *
519
533
  * === FuzzyQuery
520
534
  *
@@ -531,7 +545,7 @@ Init_QueryParseException(void)
531
545
  * 'content:Ostralya~0.4'
532
546
  *
533
547
  * Note that this query can be quite expensive. If you'd like to use this
534
- * query, you may want to set a mininum prefix length in the FuzzyQuery
548
+ * query, you may want to set a minimum prefix length in the FuzzyQuery
535
549
  * class. This can substantially reduce the number of terms that the query
536
550
  * will iterate over.
537
551
  *
@@ -551,6 +565,7 @@ Init_QueryParser(void)
551
565
  sym_handle_parse_errors = ID2SYM(rb_intern("handle_parse_errors"));
552
566
  sym_clean_string = ID2SYM(rb_intern("clean_string"));
553
567
  sym_max_clauses = ID2SYM(rb_intern("max_clauses"));
568
+ sym_use_keywords = ID2SYM(rb_intern("use_keywords"));
554
569
 
555
570
  /* QueryParser */
556
571
  cQueryParser = rb_define_class_under(mFerret, "QueryParser", rb_cObject);
data/ext/r_search.c CHANGED
@@ -179,7 +179,7 @@ frt_get_td(TopDocs *td, VALUE rsearcher)
179
179
  * call-seq:
180
180
  * top_doc.to_s(field = :id) -> string
181
181
  *
182
- * Returns a string represention of the top_doc in readable format.
182
+ * Returns a string representation of the top_doc in readable format.
183
183
  */
184
184
  static VALUE
185
185
  frt_td_to_s(int argc, VALUE *argv, VALUE self)
@@ -197,7 +197,7 @@ frt_td_to_s(int argc, VALUE *argv, VALUE self)
197
197
  field = frt_field(argv[0]);
198
198
  }
199
199
 
200
- sprintf(s, "TopDocs: total_hits = %d, max_score = %f [\n",
200
+ sprintf(s, "TopDocs: total_hits = %ld, max_score = %f [\n",
201
201
  FIX2INT(rb_funcall(self, id_total_hits, 0)),
202
202
  NUM2DBL(rb_funcall(self, id_max_score, 0)));
203
203
  s += strlen(s);
@@ -224,7 +224,7 @@ frt_td_to_s(int argc, VALUE *argv, VALUE self)
224
224
  return rstr;
225
225
  }
226
226
 
227
- __inline char *
227
+ static INLINE char *
228
228
  frt_lzd_load_to_json(LazyDoc *lzd, char **str, char *s, int *slen)
229
229
  {
230
230
  int i, j;
@@ -270,7 +270,7 @@ frt_lzd_load_to_json(LazyDoc *lzd, char **str, char *s, int *slen)
270
270
  * call-seq:
271
271
  * top_doc.to_json() -> string
272
272
  *
273
- * Returns a json represention of the top_doc.
273
+ * Returns a json representation of the top_doc.
274
274
  */
275
275
  static VALUE
276
276
  frt_td_to_json(VALUE self)
@@ -318,7 +318,7 @@ frt_td_to_json(VALUE self)
318
318
  * call-seq:
319
319
  * explanation.to_s -> string
320
320
  *
321
- * Returns a string represention of the explantion in readable format.
321
+ * Returns a string representation of the explanation in readable format.
322
322
  */
323
323
  static VALUE
324
324
  frt_expl_to_s(VALUE self)
@@ -334,7 +334,7 @@ frt_expl_to_s(VALUE self)
334
334
  * call-seq:
335
335
  * explanation.to_html -> string
336
336
  *
337
- * Returns an html represention of the explantion in readable format.
337
+ * Returns an html representation of the explanation in readable format.
338
338
  */
339
339
  static VALUE
340
340
  frt_expl_to_html(VALUE self)
@@ -403,7 +403,7 @@ frt_q_to_s(int argc, VALUE *argv, VALUE self)
403
403
  * call-seq:
404
404
  * query.boost
405
405
  *
406
- * Returns the queries boost value. See the Query desription for more
406
+ * Returns the query's boost value. See the Query description for more
407
407
  * information on Query boosts.
408
408
  */
409
409
  static VALUE
@@ -417,7 +417,7 @@ frt_q_get_boost(VALUE self)
417
417
  * call-seq:
418
418
  * query.boost = boost -> boost
419
419
  *
420
- * Set the boost for a query. See the Query desription for more information
420
+ * Set the boost for a query. See the Query description for more information
421
421
  * on Query boosts.
422
422
  */
423
423
  static VALUE
@@ -582,7 +582,7 @@ static VALUE
582
582
  frt_tq_init(VALUE self, VALUE rfield, VALUE rterm)
583
583
  {
584
584
  char *field = frt_field(rfield);
585
- char *term = StringValuePtr(rterm);
585
+ char *term = rs2s(rb_obj_as_string(rterm));
586
586
  Query *q = tq_new(field, term);
587
587
  Frt_Wrap_Struct(self, NULL, &frt_q_free, q);
588
588
  object_add(q, self);
@@ -795,7 +795,7 @@ frt_bc_init(int argc, VALUE *argv, VALUE self)
795
795
  * call-seq:
796
796
  * clause.query -> query
797
797
  *
798
- * Returnt the query object wrapped by this BooleanClause.
798
+ * Return the query object wrapped by this BooleanClause.
799
799
  */
800
800
  static VALUE
801
801
  frt_bc_get_query(VALUE self)
@@ -921,7 +921,7 @@ frt_bq_mark(void *p)
921
921
  * BooleanQuery.new(coord_disable = false)
922
922
  *
923
923
  * Create a new BooleanQuery. If you don't care about the scores of the
924
- * sub-queries added the the query (as would be the case for many
924
+ * sub-queries added to the query (as would be the case for many
925
925
  * automatically generated queries) you can disable the coord_factor of the
926
926
  * score. This will slightly improve performance for the query. Usually you
927
927
  * should leave this parameter as is.
@@ -1309,7 +1309,7 @@ frt_wcq_init(int argc, VALUE *argv, VALUE self)
1309
1309
  * distance is measured. This parameter is used to improve
1310
1310
  * performance. With a +:prefix_length+ of 0, all terms in
1311
1311
  * the index must be checked which can be quite a
1312
- * performance hit. By setting theprefix length to a
1312
+ * performance hit. By setting the prefix length to a
1313
1313
  * larger number you minimize the number of terms that need
1314
1314
  * to be checked. Even 1 will cut down the work by a
1315
1315
  * factor of about 26 depending on your character set and
@@ -1501,7 +1501,7 @@ frt_maq_init(VALUE self)
1501
1501
  * ConstantScoreQuery.new(filter) -> query
1502
1502
  *
1503
1503
  * Create a ConstantScoreQuery which uses +filter+ to match documents giving
1504
- * each document a consant score.
1504
+ * each document a constant score.
1505
1505
  */
1506
1506
  static VALUE
1507
1507
  frt_csq_init(VALUE self, VALUE rfilter)
@@ -1688,7 +1688,7 @@ frt_spannq_mark(void *p)
1688
1688
  * :slop:: Default: 0. Works exactly like a PhraseQuery slop. It is the
1689
1689
  * amount of slop allowed in the match (the term edit distance
1690
1690
  * allowed in the match).
1691
- * :in_order:: Defualt: false. Specifies whether or not the matches have to
1691
+ * :in_order:: Default: false. Specifies whether or not the matches have to
1692
1692
  * occur in the order they were added to the query. When slop is
1693
1693
  * set to 0, this parameter will make no difference.
1694
1694
  */
@@ -1862,7 +1862,7 @@ frt_f_free(void *p)
1862
1862
  * call-seq:
1863
1863
  * filter.to_s -> string
1864
1864
  *
1865
- * Return a human readable string represting the Filter object that the
1865
+ * Return a human readable string representing the Filter object that the
1866
1866
  * method was called on.
1867
1867
  */
1868
1868
  static VALUE
@@ -2415,7 +2415,7 @@ frt_sea_doc(VALUE self, VALUE rdoc_id)
2415
2415
  * call-seq:
2416
2416
  * searcher.max_doc -> number
2417
2417
  *
2418
- * Returns 1 + the maximum document id in the index. It is the the
2418
+ * Returns 1 + the maximum document id in the index. It is the
2419
2419
  * document_id that will be used by the next document added to the index. If
2420
2420
  * there are no deletions, this number also refers to the number of documents
2421
2421
  * in the index.
@@ -2555,8 +2555,13 @@ frt_sea_search_internal(Query *query, VALUE roptions, Searcher *sea)
2555
2555
  * :sort:: A Sort object or sort string describing how the field
2556
2556
  * should be sorted. A sort string is made up of field names
2557
2557
  * which cannot contain spaces and the word "DESC" if you
2558
- * want the field reversed, all seperated by commas. For
2559
- * example; "rating DESC, author, title"
2558
+ * want the field reversed, all separated by commas. For
2559
+ * example; "rating DESC, author, title". Note that Ferret
2560
+ * will try to determine a field's type by looking at the
2561
+ * first term in the index and seeing if it can be parsed as
2562
+ * an integer or a float. Keep this in mind as you may need
2563
+ * to specify a field's type to sort it correctly. For more
2564
+ * on this, see the documentation for SortField
2560
2565
  * :filter:: a Filter object to filter the search results with
2561
2566
  * :filter_proc:: a filter Proc is a Proc which takes the doc_id, the score
2562
2567
  * and the Searcher object as its parameters and returns a
@@ -2602,8 +2607,13 @@ frt_sea_search(int argc, VALUE *argv, VALUE self)
2602
2607
  * :sort:: A Sort object or sort string describing how the field
2603
2608
  * should be sorted. A sort string is made up of field names
2604
2609
  * which cannot contain spaces and the word "DESC" if you
2605
- * want the field reversed, all seperated by commas. For
2606
- * example; "rating DESC, author, title"
2610
+ * want the field reversed, all separated by commas. For
2611
+ * example; "rating DESC, author, title". Note that Ferret
2612
+ * will try to determine a field's type by looking at the
2613
+ * first term in the index and seeing if it can be parsed as
2614
+ * an integer or a float. Keep this in mind as you may need
2615
+ * to specify a field's type to sort it correctly. For more
2616
+ * on this, see the documentation for SortField
2607
2617
  * :filter:: a Filter object to filter the search results with
2608
2618
  * :filter_proc:: a filter Proc is a Proc which takes the doc_id, the score
2609
2619
  * and the Searcher object as its parameters and returns a
@@ -2685,7 +2695,7 @@ frt_sea_explain(VALUE self, VALUE rquery, VALUE rdoc_id)
2685
2695
  * :ellipsis:: Default: "...". This is the string that is appended at
2686
2696
  * the beginning and end of excerpts (unless the excerpt
2687
2697
  * hits the start or end of the field). You'll probably
2688
- * want to change this so a Unicode elipsis character.
2698
+ * want to change this to a Unicode ellipsis character.
2689
2699
  */
2690
2700
  static VALUE
2691
2701
  frt_sea_highlight(int argc, VALUE *argv, VALUE self)
@@ -2702,26 +2712,31 @@ frt_sea_highlight(int argc, VALUE *argv, VALUE self)
2702
2712
 
2703
2713
  rb_scan_args(argc, argv, "31", &rquery, &rdoc_id, &rfield, &roptions);
2704
2714
  Data_Get_Struct(rquery, Query, query);
2705
- if (Qnil != (v = rb_hash_aref(roptions, sym_num_excerpts))) {
2706
- num_excerpts = FIX2INT(v);
2707
- }
2708
- if (Qnil != (v = rb_hash_aref(roptions, sym_excerpt_length))) {
2709
- if (v == sym_all) {
2710
- num_excerpts = 1;
2711
- excerpt_length = INT_MAX/2;
2715
+ if (argc > 3) {
2716
+ if (TYPE(roptions) != T_HASH) {
2717
+ rb_raise(rb_eArgError, "The fourth argument to Searcher#highlight must be a hash");
2712
2718
  }
2713
- else {
2714
- excerpt_length = FIX2INT(v);
2719
+ if (Qnil != (v = rb_hash_aref(roptions, sym_num_excerpts))) {
2720
+ num_excerpts = FIX2INT(v);
2721
+ }
2722
+ if (Qnil != (v = rb_hash_aref(roptions, sym_excerpt_length))) {
2723
+ if (v == sym_all) {
2724
+ num_excerpts = 1;
2725
+ excerpt_length = INT_MAX/2;
2726
+ }
2727
+ else {
2728
+ excerpt_length = FIX2INT(v);
2729
+ }
2730
+ }
2731
+ if (Qnil != (v = rb_hash_aref(roptions, sym_pre_tag))) {
2732
+ pre_tag = rs2s(rb_obj_as_string(v));
2733
+ }
2734
+ if (Qnil != (v = rb_hash_aref(roptions, sym_post_tag))) {
2735
+ post_tag = rs2s(rb_obj_as_string(v));
2736
+ }
2737
+ if (Qnil != (v = rb_hash_aref(roptions, sym_ellipsis))) {
2738
+ ellipsis = rs2s(rb_obj_as_string(v));
2715
2739
  }
2716
- }
2717
- if (Qnil != (v = rb_hash_aref(roptions, sym_pre_tag))) {
2718
- pre_tag = rs2s(rb_obj_as_string(v));
2719
- }
2720
- if (Qnil != (v = rb_hash_aref(roptions, sym_post_tag))) {
2721
- post_tag = rs2s(rb_obj_as_string(v));
2722
- }
2723
- if (Qnil != (v = rb_hash_aref(roptions, sym_ellipsis))) {
2724
- ellipsis = rs2s(rb_obj_as_string(v));
2725
2740
  }
2726
2741
 
2727
2742
  if ((excerpts = searcher_highlight(sea,
@@ -2771,7 +2786,7 @@ frt_sea_mark(void *p)
2771
2786
  * Searcher.new(obj) -> Searcher
2772
2787
  *
2773
2788
  * Create a new Searcher object. +dir+ can either be a string path to an
2774
- * index directory on the file-sytem, an actual Ferret::Store::Directory
2789
+ * index directory on the file-system, an actual Ferret::Store::Directory
2775
2790
  * object or a Ferret::Index::IndexReader. You should use the IndexReader for
2776
2791
  * searching multiple indexes. Just open the IndexReader on multiple
2777
2792
  * directories.
@@ -2898,7 +2913,7 @@ cTopDocs = rb_define_class_under(mSearch, "TopDocs", rb_cObject);
2898
2913
  * document id of the document that matches along with the score for the
2899
2914
  * match. The score is a positive Float value. The score contained in a hit
2900
2915
  * is not normalized so it can be greater than 1.0. To normalize scores to
2901
- * the range 0.0..1.0 devide the scores by TopDocs#max_score.
2916
+ * the range 0.0..1.0 divide the scores by TopDocs#max_score.
2902
2917
  */
2903
2918
  static void
2904
2919
  Init_Hit(void)
@@ -3546,7 +3561,7 @@ Init_SpanPrefixQuery(void)
3546
3561
  *
3547
3562
  * == Summary
3548
3563
  *
3549
- * A SpanFirstQuery resticts a query to search in the first +end+ bytes of a
3564
+ * A SpanFirstQuery restricts a query to search in the first +end+ bytes of a
3550
3565
  * field. This is useful since often the most important information in a
3551
3566
  * document is at the start of the document.
3552
3567
  *
@@ -3577,7 +3592,7 @@ Init_SpanFirstQuery(void)
3577
3592
  *
3578
3593
  * A SpanNearQuery is like a combination between a PhraseQuery and a
3579
3594
  * BooleanQuery. It matches sub-SpanQueries which are added as clauses but
3580
- * those clauses must occur within a +slop+ edit distance of eachother. You
3595
+ * those clauses must occur within a +slop+ edit distance of each other. You
3581
3596
  * can also specify that clauses must occur +in_order+.
3582
3597
  *
3583
3598
  * == Example
@@ -3801,7 +3816,7 @@ Init_QueryFilter(void)
3801
3816
  * A Filter is used to filter query results. It is usually passed to one of
3802
3817
  * Searcher's search methods however it can also be used inside a
3803
3818
  * ConstantScoreQuery or a FilteredQuery. To implement your own Filter you
3804
- * must implement the methoed #get_bitvector(index_reader) which returns a
3819
+ * must implement the method #get_bitvector(index_reader) which returns a
3805
3820
  * BitVector with set bits corresponding to documents that are allowed by
3806
3821
  * this Filter.
3807
3822
  *
@@ -3839,16 +3854,23 @@ Init_Filter(void)
3839
3854
  * The type of the SortField is set by passing it as a parameter to the
3840
3855
  * constructor. The +:auto+ type specifies that the SortField should detect
3841
3856
  * the sort type by looking at the data in the field. This is the default
3842
- * type. Care should be taken however when using the :auto sort-type since
3843
- * numbers will occur before other strings in the index so if you are sorting
3844
- * a field with both numbers and strings (like a title field which might have
3845
- * "24" and "Prison Break") then the sort_field will think it is sorting
3846
- * integers when it really should sort by string.
3857
+ * :type value although it is recommended that you explicitly specify the
3858
+ * field's type.
3847
3859
  *
3848
3860
  * == Example
3849
3861
  *
3850
3862
  * title_sf = SortField.new(:title, :type => :string)
3851
3863
  * rating_sf = SortField.new(:rating, :type => :float, :reverse => true)
3864
+ *
3865
+ *
3866
+ * Note 1: Care should be taken when using the :auto sort-type since numbers
3867
+ * will occur before other strings in the index so if you are sorting a field
3868
+ * with both numbers and strings (like a title field which might have "24"
3869
+ * and "Prison Break") then the sort_field will think it is sorting integers
3870
+ * when it really should be sorting strings.
3871
+ *
3872
+ * Note 2: When sorting by integer, integers are only 4 bytes so anything
3873
+ * larger will cause strange sorting behaviour.
3852
3874
  */
3853
3875
  static void
3854
3876
  Init_SortField(void)
@@ -3923,6 +3945,9 @@ Init_SortField(void)
3923
3945
  * sf_rating = SortField.new(:rating, :type => :float, :reverse => true)
3924
3946
  * sf_title = SortField.new(:title, :type => :string)
3925
3947
  * sort = Sort.new([sf_rating, sf_title])
3948
+ *
3949
+ * Remember that the :type parameter for SortField is set to :auto by default
3950
+ * but I strongly recommend you specify a :type value.
3926
3951
  */
3927
3952
  static void
3928
3953
  Init_Sort(void)