ferret 0.11.4 → 0.11.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (50) hide show
  1. data/Rakefile +1 -0
  2. data/TUTORIAL +3 -3
  3. data/ext/analysis.c +12 -9
  4. data/ext/array.c +10 -10
  5. data/ext/array.h +8 -1
  6. data/ext/bitvector.c +2 -2
  7. data/ext/except.c +1 -1
  8. data/ext/ferret.c +2 -2
  9. data/ext/ferret.h +1 -1
  10. data/ext/fs_store.c +13 -2
  11. data/ext/global.c +4 -4
  12. data/ext/global.h +6 -0
  13. data/ext/hash.c +1 -1
  14. data/ext/helper.c +1 -1
  15. data/ext/helper.h +1 -1
  16. data/ext/index.c +48 -22
  17. data/ext/index.h +17 -16
  18. data/ext/mempool.c +4 -1
  19. data/ext/mempool.h +1 -1
  20. data/ext/multimapper.c +2 -2
  21. data/ext/q_fuzzy.c +2 -2
  22. data/ext/q_multi_term.c +2 -2
  23. data/ext/q_parser.c +39 -8
  24. data/ext/q_range.c +32 -1
  25. data/ext/r_analysis.c +66 -28
  26. data/ext/r_index.c +18 -19
  27. data/ext/r_qparser.c +21 -6
  28. data/ext/r_search.c +74 -49
  29. data/ext/r_store.c +1 -1
  30. data/ext/r_utils.c +17 -17
  31. data/ext/search.c +10 -5
  32. data/ext/search.h +3 -1
  33. data/ext/sort.c +2 -2
  34. data/ext/stopwords.c +23 -34
  35. data/ext/store.c +9 -9
  36. data/ext/store.h +5 -4
  37. data/lib/ferret/document.rb +2 -2
  38. data/lib/ferret/field_infos.rb +37 -35
  39. data/lib/ferret/index.rb +16 -6
  40. data/lib/ferret/number_tools.rb +2 -2
  41. data/lib/ferret_version.rb +1 -1
  42. data/test/unit/analysis/tc_token_stream.rb +40 -0
  43. data/test/unit/index/tc_index.rb +64 -101
  44. data/test/unit/index/tc_index_reader.rb +13 -0
  45. data/test/unit/largefile/tc_largefile.rb +46 -0
  46. data/test/unit/query_parser/tc_query_parser.rb +17 -1
  47. data/test/unit/search/tc_multiple_search_requests.rb +58 -0
  48. data/test/unit/search/tm_searcher.rb +27 -1
  49. data/test/unit/ts_largefile.rb +4 -0
  50. metadata +147 -144
data/ext/r_index.c CHANGED
@@ -274,7 +274,7 @@ frt_fi_is_tokenized(VALUE self)
274
274
  * used to store the field boosts for an indexed field. If you do not boost
275
275
  * any fields, and you can live without scoring based on field length then
276
276
  * you can omit the norms file. This will give the index a slight performance
277
- * boost and it will use less memory, escpecially for indexes which have a
277
+ * boost and it will use less memory, especially for indexes which have a
278
278
  * large number of documents.
279
279
  */
280
280
  static VALUE
@@ -623,7 +623,7 @@ frt_fis_create_index(VALUE self, VALUE rdir)
623
623
  * call-seq:
624
624
  * fis.fields -> symbol array
625
625
  *
626
- * Return a list of the field names (as symbols) of all the fieldcs in the
626
+ * Return a list of the field names (as symbols) of all the fields in the
627
627
  * index.
628
628
  */
629
629
  static VALUE
@@ -1415,7 +1415,7 @@ frt_iw_init(int argc, VALUE *argv, VALUE self)
1415
1415
  * iw.doc_count -> number
1416
1416
  *
1417
1417
  * Returns the number of documents in the Index. Note that deletions won't be
1418
- * taken into account until the IndexWriter has been commited.
1418
+ * taken into account until the IndexWriter has been committed.
1419
1419
  */
1420
1420
  static VALUE
1421
1421
  frt_iw_get_doc_count(VALUE self)
@@ -1660,7 +1660,7 @@ frt_iw_get_analyzer(VALUE self)
1660
1660
  *
1661
1661
  * Set the Analyzer for this IndexWriter. This is useful if you need to
1662
1662
  * change the analyzer for a special document. It is risky though as the
1663
- * same anlyzer will be used for all documents during search.
1663
+ * same analyzer will be used for all documents during search.
1664
1664
  */
1665
1665
  static VALUE
1666
1666
  frt_iw_set_analyzer(VALUE self, VALUE ranalyzer)
@@ -2191,7 +2191,7 @@ frt_ir_init(VALUE self, VALUE rdir)
2191
2191
  *
2192
2192
  * Expert: change the boost value for a +field+ in document at +doc_id+.
2193
2193
  * +val+ should be an integer in the range 0..255 which corresponds to an
2194
- * encoced float value.
2194
+ * encoded float value.
2195
2195
  */
2196
2196
  static VALUE
2197
2197
  frt_ir_set_norm(VALUE self, VALUE rdoc_id, VALUE rfield, VALUE rval)
@@ -2267,7 +2267,7 @@ frt_ir_commit(VALUE self)
2267
2267
  * index_reader.close -> index_reader
2268
2268
  *
2269
2269
  * Close the IndexReader. This method also commits any deletions made by this
2270
- * IndexReader. Thise method will be called explicitly by the garbage
2270
+ * IndexReader. This method will be called implicitly by the garbage
2271
2271
  * collector but you should call it explicitly to commit any changes as soon
2272
2272
  * as possible and to close any locks held by the object to prevent locking
2273
2273
  * errors.
@@ -2286,7 +2286,7 @@ frt_ir_close(VALUE self)
2286
2286
  * call-seq:
2287
2287
  * index_reader.has_deletions? -> bool
2288
2288
  *
2289
- * Return true if the index has any deletions, either uncommited by this
2289
+ * Return true if the index has any deletions, either uncommitted by this
2290
2290
  * IndexReader or committed by any other IndexReader.
2291
2291
  */
2292
2292
  static VALUE
@@ -2329,7 +2329,7 @@ frt_ir_is_deleted(VALUE self, VALUE rdoc_id)
2329
2329
  * call-seq:
2330
2330
  * index_reader.max_doc -> number
2331
2331
  *
2332
- * Returns 1 + the maximum document id in the index. It is the the
2332
+ * Returns 1 + the maximum document id in the index. It is the
2333
2333
  * document_id that will be used by the next document added to the index. If
2334
2334
  * there are no deletions, this number also refers to the number of documents
2335
2335
  * in the index.
@@ -2361,7 +2361,7 @@ frt_ir_num_docs(VALUE self)
2361
2361
  * index_reader.undelete_all -> index_reader
2362
2362
  *
2363
2363
  * Undelete all deleted documents in the index. This is kind of like a
2364
- * rollback feature. Not that once an index is commited or a merge happens
2364
+ * rollback feature. Note that once an index is committed or a merge happens
2365
2365
  * during indexing, deletions will be committed and undelete_all will have no
2366
2366
  * effect on these documents.
2367
2367
  */
@@ -2434,7 +2434,6 @@ frt_ir_get_doc(int argc, VALUE *argv, VALUE self)
2434
2434
  len = FIX2LONG(arg2);
2435
2435
  return frt_get_doc_range(ir, pos, len, max);
2436
2436
  }
2437
- return Qnil;
2438
2437
  }
2439
2438
 
2440
2439
  /*
@@ -2713,7 +2712,7 @@ frt_ir_version(VALUE self)
2713
2712
  *
2714
2713
  * == Summary
2715
2714
  *
2716
- * The FieldInfo class is the field descripter for the index. It specifies
2715
+ * The FieldInfo class is the field descriptor for the index. It specifies
2717
2716
  * whether a field is compressed or not or whether it should be indexed and
2718
2717
  * tokenized. Every field has a name which must be a symbol. There are three
2719
2718
  * properties that you can set, +:store+, +:index+ and +:term_vector+. You
@@ -2740,7 +2739,7 @@ frt_ir_version(VALUE self)
2740
2739
  * be indexed to be stored in the Ferret index. You may want to use the index
2741
2740
  * as a simple database and store things like images or MP3s in the index. By
2742
2741
  * default each field is indexed and tokenized (split into tokens) (+:yes+).
2743
- * If you don't want to index the field use +:no+. If you wan the field
2742
+ * If you don't want to index the field use +:no+. If you want the field
2744
2743
  * indexed but not tokenized, use +:untokenized+. Do this for the fields you
2745
2744
  * wish to sort by. There are two other values for +:index+; +:omit_norms+
2746
2745
  * and +:untokenized_omit_norms+. These values correspond to +:yes+ and
@@ -2754,7 +2753,7 @@ frt_ir_version(VALUE self)
2754
2753
  * or not you would like to store term-vectors. The available options are
2755
2754
  * +:no+, +:yes+, +:with_positions+, +:with_offsets+ and
2756
2755
  * +:with_positions_offsets+. Note that you need to store the positions to
2757
- * asscociate offsets with individual terms in the term_vector.
2756
+ * associate offsets with individual terms in the term_vector.
2758
2757
  *
2759
2758
  * == Property Table
2760
2759
  *
@@ -2946,7 +2945,7 @@ Init_FieldInfos(void)
2946
2945
  *
2947
2946
  * te = index_reader.terms(:content)
2948
2947
  *
2949
- * te.each {|term, doc_freq| puts "#{term} occured #{doc_freq} times" }
2948
+ * te.each {|term, doc_freq| puts "#{term} occurred #{doc_freq} times" }
2950
2949
  *
2951
2950
  * # or you could do it like this;
2952
2951
  * te = index_reader.terms(:content)
@@ -3093,7 +3092,7 @@ Init_TVTerm(void)
3093
3092
  * highlight search matches in results. This is all done internally so you
3094
3093
  * won't need to worry about the TermVector object. There are some other
3095
3094
  * reasons you may want to use the TermVectors object however. For example,
3096
- * you may wish to see which terms are the most commonly occuring terms in a
3095
+ * you may wish to see which terms are the most commonly occurring terms in a
3097
3096
  * document to implement a MoreLikeThis search.
3098
3097
  *
3099
3098
  * == Example
@@ -3112,7 +3111,7 @@ Init_TVTerm(void)
3112
3111
  * +positions+ and +offsets+ can be +nil+ depending on what you set the
3113
3112
  * +:term_vector+ to when you set the FieldInfo object for the field. Note in
3114
3113
  * particular that you need to store both positions and offsets if you want
3115
- * to asscociate offsets with particular terms.
3114
+ * to associate offsets with particular terms.
3116
3115
  */
3117
3116
  static void
3118
3117
  Init_TermVector(void)
@@ -3136,7 +3135,7 @@ Init_TermVector(void)
3136
3135
  * == Summary
3137
3136
  *
3138
3137
  * The IndexWriter is the class used to add documents to an index. You can
3139
- * also delete docuements from the index using this class. The indexing
3138
+ * also delete documents from the index using this class. The indexing
3140
3139
  * process is highly customizable and the IndexWriter has the following
3141
3140
  * parameters;
3142
3141
  *
@@ -3212,7 +3211,7 @@ Init_TermVector(void)
3212
3211
  * documents).
3213
3212
  * max_field_length:: Default: 10000. The maximum number of terms added to
3214
3213
  * a single field. This can be useful to protect the
3215
- * indexer when indexing documents fromt the web for
3214
+ * indexer when indexing documents from the web for
3216
3215
  * example. Usually the most important terms will occur
3217
3216
  * early on in a document so you can often safely
3218
3217
  * ignore the terms in a field after a certain number
@@ -3221,7 +3220,7 @@ Init_TermVector(void)
3221
3220
  * first 1000 terms in a field. On the other hand, if
3222
3221
  * you want to be more thorough and you are indexing
3223
3222
  * documents from your file-system you may set this
3224
- * paramter to Ferret::FIX_INT_MAX.
3223
+ * parameter to Ferret::FIX_INT_MAX.
3225
3224
  * use_compound_file:: Default: true. Uses a compound file to store the
3226
3225
  * index. This prevents an error being raised for
3227
3226
  * having too many files open at the same time. The
data/ext/r_qparser.c CHANGED
@@ -16,6 +16,7 @@ static VALUE sym_default_slop;
16
16
  static VALUE sym_handle_parse_errors;
17
17
  static VALUE sym_clean_string;
18
18
  static VALUE sym_max_clauses;
19
+ static VALUE sym_use_keywords;
19
20
 
20
21
  extern VALUE frt_get_analyzer(Analyzer *a);
21
22
  extern VALUE frt_get_q(Query *q);
@@ -116,11 +117,20 @@ frt_get_fields(VALUE rfields)
116
117
  * of terms allowed in multi, prefix, wild-card or
117
118
  * fuzzy queries when those queries are generated by
118
119
  * rewriting other queries
120
+ * :use_keywords: Default: true. By default AND, OR, NOT and REQ are
121
+ * keywords used by the query parser. Sometimes this
122
+ * is undesirable. For example, if your application
123
+ * allows searching for US states by their
124
+ * abbreviation, then OR will be a common query
125
+ * string. By setting :use_keywords to false, OR will
126
+ * no longer be a keyword, allowing searches for the
127
+ * state of Oregon. You will still be able to use
128
+ * boolean queries by using the + and - characters.
119
129
  */
120
130
  static VALUE
121
131
  frt_qp_init(int argc, VALUE *argv, VALUE self)
122
132
  {
123
- VALUE roptions;
133
+ VALUE roptions = Qnil;
124
134
  VALUE rval;
125
135
  Analyzer *analyzer = NULL;
126
136
  bool has_options = false;
@@ -150,6 +160,7 @@ frt_qp_init(int argc, VALUE *argv, VALUE self)
150
160
  }
151
161
  } else {
152
162
  def_fields = frt_get_fields(roptions);
163
+ roptions = Qnil;
153
164
  }
154
165
  }
155
166
  if (all_fields == NULL) {
@@ -165,7 +176,7 @@ frt_qp_init(int argc, VALUE *argv, VALUE self)
165
176
  qp->clean_str = true;
166
177
  qp->handle_parse_errors = true;
167
178
  /* handle options */
168
- if (argc > 0) {
179
+ if (roptions != Qnil) {
169
180
  if (Qnil != (rval = rb_hash_aref(roptions, sym_handle_parse_errors))) {
170
181
  qp->handle_parse_errors = RTEST(rval);
171
182
  }
@@ -187,6 +198,9 @@ frt_qp_init(int argc, VALUE *argv, VALUE self)
187
198
  if (Qnil != (rval = rb_hash_aref(roptions, sym_max_clauses))) {
188
199
  qp->max_clauses = FIX2INT(rval);
189
200
  }
201
+ if (Qnil != (rval = rb_hash_aref(roptions, sym_use_keywords))) {
202
+ qp->use_keywords = RTEST(rval);
203
+ }
190
204
  }
191
205
  Frt_Wrap_Struct(self, frt_qp_mark, frt_qp_free, qp);
192
206
  object_add(qp, self);
@@ -493,8 +507,8 @@ Init_QueryParseException(void)
493
507
  * === WildQuery
494
508
  *
495
509
  * A wild query is a query using the pattern matching characters * and ?. *
496
- * matchs 0 or more characters while ? matchs a single character. This type
497
- * of query can be really useful for matching heirarchical categories for
510
+ * matches 0 or more characters while ? matches a single character. This type
511
+ * of query can be really useful for matching hierarchical categories for
498
512
  * example. Let's say we had this structure;
499
513
  *
500
514
  * /sport/skiing
@@ -514,7 +528,7 @@ Init_QueryParseException(void)
514
528
  * the wild characters at the beginning of the query as it'll have to iterate
515
529
  * through every term in that field. Having said that, some fields like the
516
530
  * category field above will only have a small number of distinct fields so
517
- * this could be ok.
531
+ * this could be okay.
518
532
  *
519
533
  * === FuzzyQuery
520
534
  *
@@ -531,7 +545,7 @@ Init_QueryParseException(void)
531
545
  * 'content:Ostralya~0.4'
532
546
  *
533
547
  * Note that this query can be quite expensive. If you'd like to use this
534
- * query, you may want to set a mininum prefix length in the FuzzyQuery
548
+ * query, you may want to set a minimum prefix length in the FuzzyQuery
535
549
  * class. This can substantially reduce the number of terms that the query
536
550
  * will iterate over.
537
551
  *
@@ -551,6 +565,7 @@ Init_QueryParser(void)
551
565
  sym_handle_parse_errors = ID2SYM(rb_intern("handle_parse_errors"));
552
566
  sym_clean_string = ID2SYM(rb_intern("clean_string"));
553
567
  sym_max_clauses = ID2SYM(rb_intern("max_clauses"));
568
+ sym_use_keywords = ID2SYM(rb_intern("use_keywords"));
554
569
 
555
570
  /* QueryParser */
556
571
  cQueryParser = rb_define_class_under(mFerret, "QueryParser", rb_cObject);
data/ext/r_search.c CHANGED
@@ -179,7 +179,7 @@ frt_get_td(TopDocs *td, VALUE rsearcher)
179
179
  * call-seq:
180
180
  * top_doc.to_s(field = :id) -> string
181
181
  *
182
- * Returns a string represention of the top_doc in readable format.
182
+ * Returns a string representation of the top_doc in readable format.
183
183
  */
184
184
  static VALUE
185
185
  frt_td_to_s(int argc, VALUE *argv, VALUE self)
@@ -197,7 +197,7 @@ frt_td_to_s(int argc, VALUE *argv, VALUE self)
197
197
  field = frt_field(argv[0]);
198
198
  }
199
199
 
200
- sprintf(s, "TopDocs: total_hits = %d, max_score = %f [\n",
200
+ sprintf(s, "TopDocs: total_hits = %ld, max_score = %f [\n",
201
201
  FIX2INT(rb_funcall(self, id_total_hits, 0)),
202
202
  NUM2DBL(rb_funcall(self, id_max_score, 0)));
203
203
  s += strlen(s);
@@ -224,7 +224,7 @@ frt_td_to_s(int argc, VALUE *argv, VALUE self)
224
224
  return rstr;
225
225
  }
226
226
 
227
- __inline char *
227
+ static INLINE char *
228
228
  frt_lzd_load_to_json(LazyDoc *lzd, char **str, char *s, int *slen)
229
229
  {
230
230
  int i, j;
@@ -270,7 +270,7 @@ frt_lzd_load_to_json(LazyDoc *lzd, char **str, char *s, int *slen)
270
270
  * call-seq:
271
271
  * top_doc.to_json() -> string
272
272
  *
273
- * Returns a json represention of the top_doc.
273
+ * Returns a json representation of the top_doc.
274
274
  */
275
275
  static VALUE
276
276
  frt_td_to_json(VALUE self)
@@ -318,7 +318,7 @@ frt_td_to_json(VALUE self)
318
318
  * call-seq:
319
319
  * explanation.to_s -> string
320
320
  *
321
- * Returns a string represention of the explantion in readable format.
321
+ * Returns a string representation of the explanation in readable format.
322
322
  */
323
323
  static VALUE
324
324
  frt_expl_to_s(VALUE self)
@@ -334,7 +334,7 @@ frt_expl_to_s(VALUE self)
334
334
  * call-seq:
335
335
  * explanation.to_html -> string
336
336
  *
337
- * Returns an html represention of the explantion in readable format.
337
+ * Returns an html representation of the explanation in readable format.
338
338
  */
339
339
  static VALUE
340
340
  frt_expl_to_html(VALUE self)
@@ -403,7 +403,7 @@ frt_q_to_s(int argc, VALUE *argv, VALUE self)
403
403
  * call-seq:
404
404
  * query.boost
405
405
  *
406
- * Returns the queries boost value. See the Query desription for more
406
+ * Returns the query's boost value. See the Query description for more
407
407
  * information on Query boosts.
408
408
  */
409
409
  static VALUE
@@ -417,7 +417,7 @@ frt_q_get_boost(VALUE self)
417
417
  * call-seq:
418
418
  * query.boost = boost -> boost
419
419
  *
420
- * Set the boost for a query. See the Query desription for more information
420
+ * Set the boost for a query. See the Query description for more information
421
421
  * on Query boosts.
422
422
  */
423
423
  static VALUE
@@ -582,7 +582,7 @@ static VALUE
582
582
  frt_tq_init(VALUE self, VALUE rfield, VALUE rterm)
583
583
  {
584
584
  char *field = frt_field(rfield);
585
- char *term = StringValuePtr(rterm);
585
+ char *term = rs2s(rb_obj_as_string(rterm));
586
586
  Query *q = tq_new(field, term);
587
587
  Frt_Wrap_Struct(self, NULL, &frt_q_free, q);
588
588
  object_add(q, self);
@@ -795,7 +795,7 @@ frt_bc_init(int argc, VALUE *argv, VALUE self)
795
795
  * call-seq:
796
796
  * clause.query -> query
797
797
  *
798
- * Returnt the query object wrapped by this BooleanClause.
798
+ * Return the query object wrapped by this BooleanClause.
799
799
  */
800
800
  static VALUE
801
801
  frt_bc_get_query(VALUE self)
@@ -921,7 +921,7 @@ frt_bq_mark(void *p)
921
921
  * BooleanQuery.new(coord_disable = false)
922
922
  *
923
923
  * Create a new BooleanQuery. If you don't care about the scores of the
924
- * sub-queries added the the query (as would be the case for many
924
+ * sub-queries added to the query (as would be the case for many
925
925
  * automatically generated queries) you can disable the coord_factor of the
926
926
  * score. This will slightly improve performance for the query. Usually you
927
927
  * should leave this parameter as is.
@@ -1309,7 +1309,7 @@ frt_wcq_init(int argc, VALUE *argv, VALUE self)
1309
1309
  * distance is measured. This parameter is used to improve
1310
1310
  * performance. With a +:prefix_length+ of 0, all terms in
1311
1311
  * the index must be checked which can be quite a
1312
- * performance hit. By setting theprefix length to a
1312
+ * performance hit. By setting the prefix length to a
1313
1313
  * larger number you minimize the number of terms that need
1314
1314
  * to be checked. Even 1 will cut down the work by a
1315
1315
  * factor of about 26 depending on your character set and
@@ -1501,7 +1501,7 @@ frt_maq_init(VALUE self)
1501
1501
  * ConstantScoreQuery.new(filter) -> query
1502
1502
  *
1503
1503
  * Create a ConstantScoreQuery which uses +filter+ to match documents giving
1504
- * each document a consant score.
1504
+ * each document a constant score.
1505
1505
  */
1506
1506
  static VALUE
1507
1507
  frt_csq_init(VALUE self, VALUE rfilter)
@@ -1688,7 +1688,7 @@ frt_spannq_mark(void *p)
1688
1688
  * :slop:: Default: 0. Works exactly like a PhraseQuery slop. It is the
1689
1689
  * amount of slop allowed in the match (the term edit distance
1690
1690
  * allowed in the match).
1691
- * :in_order:: Defualt: false. Specifies whether or not the matches have to
1691
+ * :in_order:: Default: false. Specifies whether or not the matches have to
1692
1692
  * occur in the order they were added to the query. When slop is
1693
1693
  * set to 0, this parameter will make no difference.
1694
1694
  */
@@ -1862,7 +1862,7 @@ frt_f_free(void *p)
1862
1862
  * call-seq:
1863
1863
  * filter.to_s -> string
1864
1864
  *
1865
- * Return a human readable string represting the Filter object that the
1865
+ * Return a human readable string representing the Filter object that the
1866
1866
  * method was called on.
1867
1867
  */
1868
1868
  static VALUE
@@ -2415,7 +2415,7 @@ frt_sea_doc(VALUE self, VALUE rdoc_id)
2415
2415
  * call-seq:
2416
2416
  * searcher.max_doc -> number
2417
2417
  *
2418
- * Returns 1 + the maximum document id in the index. It is the the
2418
+ * Returns 1 + the maximum document id in the index. It is the
2419
2419
  * document_id that will be used by the next document added to the index. If
2420
2420
  * there are no deletions, this number also refers to the number of documents
2421
2421
  * in the index.
@@ -2555,8 +2555,13 @@ frt_sea_search_internal(Query *query, VALUE roptions, Searcher *sea)
2555
2555
  * :sort:: A Sort object or sort string describing how the field
2556
2556
  * should be sorted. A sort string is made up of field names
2557
2557
  * which cannot contain spaces and the word "DESC" if you
2558
- * want the field reversed, all seperated by commas. For
2559
- * example; "rating DESC, author, title"
2558
+ * want the field reversed, all separated by commas. For
2559
+ * example; "rating DESC, author, title". Note that Ferret
2560
+ * will try to determine a field's type by looking at the
2561
+ * first term in the index and seeing if it can be parsed as
2562
+ * an integer or a float. Keep this in mind as you may need
2563
+ * to specify a field's type to sort it correctly. For more
2564
+ * on this, see the documentation for SortField.
2560
2565
  * :filter:: a Filter object to filter the search results with
2561
2566
  * :filter_proc:: a filter Proc is a Proc which takes the doc_id, the score
2562
2567
  * and the Searcher object as its parameters and returns a
@@ -2602,8 +2607,13 @@ frt_sea_search(int argc, VALUE *argv, VALUE self)
2602
2607
  * :sort:: A Sort object or sort string describing how the field
2603
2608
  * should be sorted. A sort string is made up of field names
2604
2609
  * which cannot contain spaces and the word "DESC" if you
2605
- * want the field reversed, all seperated by commas. For
2606
- * example; "rating DESC, author, title"
2610
+ * want the field reversed, all separated by commas. For
2611
+ * example; "rating DESC, author, title". Note that Ferret
2612
+ * will try to determine a field's type by looking at the
2613
+ * first term in the index and seeing if it can be parsed as
2614
+ * an integer or a float. Keep this in mind as you may need
2615
+ * to specify a field's type to sort it correctly. For more
2616
+ * on this, see the documentation for SortField.
2607
2617
  * :filter:: a Filter object to filter the search results with
2608
2618
  * :filter_proc:: a filter Proc is a Proc which takes the doc_id, the score
2609
2619
  * and the Searcher object as its parameters and returns a
@@ -2685,7 +2695,7 @@ frt_sea_explain(VALUE self, VALUE rquery, VALUE rdoc_id)
2685
2695
  * :ellipsis:: Default: "...". This is the string that is appended at
2686
2696
  * the beginning and end of excerpts (unless the excerpt
2687
2697
  * hits the start or end of the field). You'll probably
2688
- * want to change this so a Unicode elipsis character.
2698
+ * want to change this to a Unicode ellipsis character.
2689
2699
  */
2690
2700
  static VALUE
2691
2701
  frt_sea_highlight(int argc, VALUE *argv, VALUE self)
@@ -2702,26 +2712,31 @@ frt_sea_highlight(int argc, VALUE *argv, VALUE self)
2702
2712
 
2703
2713
  rb_scan_args(argc, argv, "31", &rquery, &rdoc_id, &rfield, &roptions);
2704
2714
  Data_Get_Struct(rquery, Query, query);
2705
- if (Qnil != (v = rb_hash_aref(roptions, sym_num_excerpts))) {
2706
- num_excerpts = FIX2INT(v);
2707
- }
2708
- if (Qnil != (v = rb_hash_aref(roptions, sym_excerpt_length))) {
2709
- if (v == sym_all) {
2710
- num_excerpts = 1;
2711
- excerpt_length = INT_MAX/2;
2715
+ if (argc > 3) {
2716
+ if (TYPE(roptions) != T_HASH) {
2717
+ rb_raise(rb_eArgError, "The fourth argument to Searcher#highlight must be a hash");
2712
2718
  }
2713
- else {
2714
- excerpt_length = FIX2INT(v);
2719
+ if (Qnil != (v = rb_hash_aref(roptions, sym_num_excerpts))) {
2720
+ num_excerpts = FIX2INT(v);
2721
+ }
2722
+ if (Qnil != (v = rb_hash_aref(roptions, sym_excerpt_length))) {
2723
+ if (v == sym_all) {
2724
+ num_excerpts = 1;
2725
+ excerpt_length = INT_MAX/2;
2726
+ }
2727
+ else {
2728
+ excerpt_length = FIX2INT(v);
2729
+ }
2730
+ }
2731
+ if (Qnil != (v = rb_hash_aref(roptions, sym_pre_tag))) {
2732
+ pre_tag = rs2s(rb_obj_as_string(v));
2733
+ }
2734
+ if (Qnil != (v = rb_hash_aref(roptions, sym_post_tag))) {
2735
+ post_tag = rs2s(rb_obj_as_string(v));
2736
+ }
2737
+ if (Qnil != (v = rb_hash_aref(roptions, sym_ellipsis))) {
2738
+ ellipsis = rs2s(rb_obj_as_string(v));
2715
2739
  }
2716
- }
2717
- if (Qnil != (v = rb_hash_aref(roptions, sym_pre_tag))) {
2718
- pre_tag = rs2s(rb_obj_as_string(v));
2719
- }
2720
- if (Qnil != (v = rb_hash_aref(roptions, sym_post_tag))) {
2721
- post_tag = rs2s(rb_obj_as_string(v));
2722
- }
2723
- if (Qnil != (v = rb_hash_aref(roptions, sym_ellipsis))) {
2724
- ellipsis = rs2s(rb_obj_as_string(v));
2725
2740
  }
2726
2741
 
2727
2742
  if ((excerpts = searcher_highlight(sea,
@@ -2771,7 +2786,7 @@ frt_sea_mark(void *p)
2771
2786
  * Searcher.new(obj) -> Searcher
2772
2787
  *
2773
2788
  * Create a new Searcher object. +dir+ can either be a string path to an
2774
- * index directory on the file-sytem, an actual Ferret::Store::Directory
2789
+ * index directory on the file-system, an actual Ferret::Store::Directory
2775
2790
  * object or a Ferret::Index::IndexReader. You should use the IndexReader for
2776
2791
  * searching multiple indexes. Just open the IndexReader on multiple
2777
2792
  * directories.
@@ -2898,7 +2913,7 @@ cTopDocs = rb_define_class_under(mSearch, "TopDocs", rb_cObject);
2898
2913
  * document id of the document that matches along with the score for the
2899
2914
  * match. The score is a positive Float value. The score contained in a hit
2900
2915
  * is not normalized so it can be greater than 1.0. To normalize scores to
2901
- * the range 0.0..1.0 devide the scores by TopDocs#max_score.
2916
+ * the range 0.0..1.0 divide the scores by TopDocs#max_score.
2902
2917
  */
2903
2918
  static void
2904
2919
  Init_Hit(void)
@@ -3546,7 +3561,7 @@ Init_SpanPrefixQuery(void)
3546
3561
  *
3547
3562
  * == Summary
3548
3563
  *
3549
- * A SpanFirstQuery resticts a query to search in the first +end+ bytes of a
3564
+ * A SpanFirstQuery restricts a query to search in the first +end+ bytes of a
3550
3565
  * field. This is useful since often the most important information in a
3551
3566
  * document is at the start of the document.
3552
3567
  *
@@ -3577,7 +3592,7 @@ Init_SpanFirstQuery(void)
3577
3592
  *
3578
3593
  * A SpanNearQuery is like a combination between a PhraseQuery and a
3579
3594
  * BooleanQuery. It matches sub-SpanQueries which are added as clauses but
3580
- * those clauses must occur within a +slop+ edit distance of eachother. You
3595
+ * those clauses must occur within a +slop+ edit distance of each other. You
3581
3596
  * can also specify that clauses must occur +in_order+.
3582
3597
  *
3583
3598
  * == Example
@@ -3801,7 +3816,7 @@ Init_QueryFilter(void)
3801
3816
  * A Filter is used to filter query results. It is usually passed to one of
3802
3817
  * Searcher's search methods however it can also be used inside a
3803
3818
  * ConstantScoreQuery or a FilteredQuery. To implement your own Filter you
3804
- * must implement the methoed #get_bitvector(index_reader) which returns a
3819
+ * must implement the method #get_bitvector(index_reader) which returns a
3805
3820
  * BitVector with set bits corresponding to documents that are allowed by
3806
3821
  * this Filter.
3807
3822
  *
@@ -3839,16 +3854,23 @@ Init_Filter(void)
3839
3854
  * The type of the SortField is set by passing it as a parameter to the
3840
3855
  * constructor. The +:auto+ type specifies that the SortField should detect
3841
3856
  * the sort type by looking at the data in the field. This is the default
3842
- * type. Care should be taken however when using the :auto sort-type since
3843
- * numbers will occur before other strings in the index so if you are sorting
3844
- * a field with both numbers and strings (like a title field which might have
3845
- * "24" and "Prison Break") then the sort_field will think it is sorting
3846
- * integers when it really should sort by string.
3857
+ * :type value although it is recommended that you explicitly specify the
3858
+ * field's type.
3847
3859
  *
3848
3860
  * == Example
3849
3861
  *
3850
3862
  * title_sf = SortField.new(:title, :type => :string)
3851
3863
  * rating_sf = SortField.new(:rating, :type => :float, :reverse => true)
3864
+ *
3865
+ *
3866
+ * Note 1: Care should be taken when using the :auto sort-type since numbers
3867
+ * will occur before other strings in the index so if you are sorting a field
3868
+ * with both numbers and strings (like a title field which might have "24"
3869
+ * and "Prison Break") then the sort_field will think it is sorting integers
3870
+ * when it really should be sorting strings.
3871
+ *
3872
+ * Note 2: When sorting by integer, integers are only 4 bytes so anything
3873
+ * larger will cause strange sorting behaviour.
3852
3874
  */
3853
3875
  static void
3854
3876
  Init_SortField(void)
@@ -3923,6 +3945,9 @@ Init_SortField(void)
3923
3945
  * sf_rating = SortField.new(:rating, :type => :float, :reverse => true)
3924
3946
  * sf_title = SortField.new(:title, :type => :string)
3925
3947
  * sort = Sort.new([sf_rating, sf_title])
3948
+ *
3949
+ * Remember that the :type parameter for SortField is set to :auto by default
3950
+ * but I strongly recommend you specify a :type value.
3926
3951
  */
3927
3952
  static void
3928
3953
  Init_Sort(void)