ferret 0.11.4 → 0.11.5
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +1 -0
- data/TUTORIAL +3 -3
- data/ext/analysis.c +12 -9
- data/ext/array.c +10 -10
- data/ext/array.h +8 -1
- data/ext/bitvector.c +2 -2
- data/ext/except.c +1 -1
- data/ext/ferret.c +2 -2
- data/ext/ferret.h +1 -1
- data/ext/fs_store.c +13 -2
- data/ext/global.c +4 -4
- data/ext/global.h +6 -0
- data/ext/hash.c +1 -1
- data/ext/helper.c +1 -1
- data/ext/helper.h +1 -1
- data/ext/index.c +48 -22
- data/ext/index.h +17 -16
- data/ext/mempool.c +4 -1
- data/ext/mempool.h +1 -1
- data/ext/multimapper.c +2 -2
- data/ext/q_fuzzy.c +2 -2
- data/ext/q_multi_term.c +2 -2
- data/ext/q_parser.c +39 -8
- data/ext/q_range.c +32 -1
- data/ext/r_analysis.c +66 -28
- data/ext/r_index.c +18 -19
- data/ext/r_qparser.c +21 -6
- data/ext/r_search.c +74 -49
- data/ext/r_store.c +1 -1
- data/ext/r_utils.c +17 -17
- data/ext/search.c +10 -5
- data/ext/search.h +3 -1
- data/ext/sort.c +2 -2
- data/ext/stopwords.c +23 -34
- data/ext/store.c +9 -9
- data/ext/store.h +5 -4
- data/lib/ferret/document.rb +2 -2
- data/lib/ferret/field_infos.rb +37 -35
- data/lib/ferret/index.rb +16 -6
- data/lib/ferret/number_tools.rb +2 -2
- data/lib/ferret_version.rb +1 -1
- data/test/unit/analysis/tc_token_stream.rb +40 -0
- data/test/unit/index/tc_index.rb +64 -101
- data/test/unit/index/tc_index_reader.rb +13 -0
- data/test/unit/largefile/tc_largefile.rb +46 -0
- data/test/unit/query_parser/tc_query_parser.rb +17 -1
- data/test/unit/search/tc_multiple_search_requests.rb +58 -0
- data/test/unit/search/tm_searcher.rb +27 -1
- data/test/unit/ts_largefile.rb +4 -0
- metadata +147 -144
data/ext/r_index.c
CHANGED
@@ -274,7 +274,7 @@ frt_fi_is_tokenized(VALUE self)
|
|
274
274
|
* used to store the field boosts for an indexed field. If you do not boost
|
275
275
|
* any fields, and you can live without scoring based on field length then
|
276
276
|
* you can omit the norms file. This will give the index a slight performance
|
277
|
-
* boost and it will use less memory,
|
277
|
+
* boost and it will use less memory, especially for indexes which have a
|
278
278
|
* large number of documents.
|
279
279
|
*/
|
280
280
|
static VALUE
|
@@ -623,7 +623,7 @@ frt_fis_create_index(VALUE self, VALUE rdir)
|
|
623
623
|
* call-seq:
|
624
624
|
* fis.fields -> symbol array
|
625
625
|
*
|
626
|
-
* Return a list of the field names (as symbols) of all the
|
626
|
+
* Return a list of the field names (as symbols) of all the fields in the
|
627
627
|
* index.
|
628
628
|
*/
|
629
629
|
static VALUE
|
@@ -1415,7 +1415,7 @@ frt_iw_init(int argc, VALUE *argv, VALUE self)
|
|
1415
1415
|
* iw.doc_count -> number
|
1416
1416
|
*
|
1417
1417
|
* Returns the number of documents in the Index. Note that deletions won't be
|
1418
|
-
* taken into account until the IndexWriter has been
|
1418
|
+
* taken into account until the IndexWriter has been committed.
|
1419
1419
|
*/
|
1420
1420
|
static VALUE
|
1421
1421
|
frt_iw_get_doc_count(VALUE self)
|
@@ -1660,7 +1660,7 @@ frt_iw_get_analyzer(VALUE self)
|
|
1660
1660
|
*
|
1661
1661
|
* Set the Analyzer for this IndexWriter. This is useful if you need to
|
1662
1662
|
* change the analyzer for a special document. It is risky though as the
|
1663
|
-
* same
|
1663
|
+
* same analyzer will be used for all documents during search.
|
1664
1664
|
*/
|
1665
1665
|
static VALUE
|
1666
1666
|
frt_iw_set_analyzer(VALUE self, VALUE ranalyzer)
|
@@ -2191,7 +2191,7 @@ frt_ir_init(VALUE self, VALUE rdir)
|
|
2191
2191
|
*
|
2192
2192
|
* Expert: change the boost value for a +field+ in document at +doc_id+.
|
2193
2193
|
* +val+ should be an integer in the range 0..255 which corresponds to an
|
2194
|
-
*
|
2194
|
+
* encoded float value.
|
2195
2195
|
*/
|
2196
2196
|
static VALUE
|
2197
2197
|
frt_ir_set_norm(VALUE self, VALUE rdoc_id, VALUE rfield, VALUE rval)
|
@@ -2267,7 +2267,7 @@ frt_ir_commit(VALUE self)
|
|
2267
2267
|
* index_reader.close -> index_reader
|
2268
2268
|
*
|
2269
2269
|
* Close the IndexReader. This method also commits any deletions made by this
|
2270
|
-
* IndexReader.
|
2270
|
+
* IndexReader. This method will be called implicitly by the garbage
|
2271
2271
|
* collector but you should call it explicitly to commit any changes as soon
|
2272
2272
|
* as possible and to close any locks held by the object to prevent locking
|
2273
2273
|
* errors.
|
@@ -2286,7 +2286,7 @@ frt_ir_close(VALUE self)
|
|
2286
2286
|
* call-seq:
|
2287
2287
|
* index_reader.has_deletions? -> bool
|
2288
2288
|
*
|
2289
|
-
* Return true if the index has any deletions, either
|
2289
|
+
* Return true if the index has any deletions, either uncommitted by this
|
2290
2290
|
* IndexReader or committed by any other IndexReader.
|
2291
2291
|
*/
|
2292
2292
|
static VALUE
|
@@ -2329,7 +2329,7 @@ frt_ir_is_deleted(VALUE self, VALUE rdoc_id)
|
|
2329
2329
|
* call-seq:
|
2330
2330
|
* index_reader.max_doc -> number
|
2331
2331
|
*
|
2332
|
-
* Returns 1 + the maximum document id in the index. It is the
|
2332
|
+
* Returns 1 + the maximum document id in the index. It is the
|
2333
2333
|
* document_id that will be used by the next document added to the index. If
|
2334
2334
|
* there are no deletions, this number also refers to the number of documents
|
2335
2335
|
* in the index.
|
@@ -2361,7 +2361,7 @@ frt_ir_num_docs(VALUE self)
|
|
2361
2361
|
* index_reader.undelete_all -> index_reader
|
2362
2362
|
*
|
2363
2363
|
* Undelete all deleted documents in the index. This is kind of like a
|
2364
|
-
* rollback feature. Not that once an index is
|
2364
|
+
* rollback feature. Note that once an index is committed or a merge happens
|
2365
2365
|
* during indexing, deletions will be committed and undelete_all will have no
|
2366
2366
|
* effect on these documents.
|
2367
2367
|
*/
|
@@ -2434,7 +2434,6 @@ frt_ir_get_doc(int argc, VALUE *argv, VALUE self)
|
|
2434
2434
|
len = FIX2LONG(arg2);
|
2435
2435
|
return frt_get_doc_range(ir, pos, len, max);
|
2436
2436
|
}
|
2437
|
-
return Qnil;
|
2438
2437
|
}
|
2439
2438
|
|
2440
2439
|
/*
|
@@ -2713,7 +2712,7 @@ frt_ir_version(VALUE self)
|
|
2713
2712
|
*
|
2714
2713
|
* == Summary
|
2715
2714
|
*
|
2716
|
-
* The FieldInfo class is the field
|
2715
|
+
* The FieldInfo class is the field descriptor for the index. It specifies
|
2717
2716
|
* whether a field is compressed or not or whether it should be indexed and
|
2718
2717
|
* tokenized. Every field has a name which must be a symbol. There are three
|
2719
2718
|
* properties that you can set, +:store+, +:index+ and +:term_vector+. You
|
@@ -2740,7 +2739,7 @@ frt_ir_version(VALUE self)
|
|
2740
2739
|
* be indexed to be stored in the Ferret index. You may want to use the index
|
2741
2740
|
* as a simple database and store things like images or MP3s in the index. By
|
2742
2741
|
* default each field is indexed and tokenized (split into tokens) (+:yes+).
|
2743
|
-
* If you don't want to index the field use +:no+. If you
|
2742
|
+
* If you don't want to index the field use +:no+. If you want the field
|
2744
2743
|
* indexed but not tokenized, use +:untokenized+. Do this for the fields you
|
2745
2744
|
* wish to sort by. There are two other values for +:index+; +:omit_norms+
|
2746
2745
|
* and +:untokenized_omit_norms+. These values correspond to +:yes+ and
|
@@ -2754,7 +2753,7 @@ frt_ir_version(VALUE self)
|
|
2754
2753
|
* or not you would like to store term-vectors. The available options are
|
2755
2754
|
* +:no+, +:yes+, +:with_positions+, +:with_offsets+ and
|
2756
2755
|
* +:with_positions_offsets+. Note that you need to store the positions to
|
2757
|
-
*
|
2756
|
+
* associate offsets with individual terms in the term_vector.
|
2758
2757
|
*
|
2759
2758
|
* == Property Table
|
2760
2759
|
*
|
@@ -2946,7 +2945,7 @@ Init_FieldInfos(void)
|
|
2946
2945
|
*
|
2947
2946
|
* te = index_reader.terms(:content)
|
2948
2947
|
*
|
2949
|
-
* te.each {|term, doc_freq| puts "#{term}
|
2948
|
+
* te.each {|term, doc_freq| puts "#{term} occurred #{doc_freq} times" }
|
2950
2949
|
*
|
2951
2950
|
* # or you could do it like this;
|
2952
2951
|
* te = index_reader.terms(:content)
|
@@ -3093,7 +3092,7 @@ Init_TVTerm(void)
|
|
3093
3092
|
* highlight search matches in results. This is all done internally so you
|
3094
3093
|
* won't need to worry about the TermVector object. There are some other
|
3095
3094
|
* reasons you may want to use the TermVectors object however. For example,
|
3096
|
-
* you may wish to see which terms are the most commonly
|
3095
|
+
* you may wish to see which terms are the most commonly occurring terms in a
|
3097
3096
|
* document to implement a MoreLikeThis search.
|
3098
3097
|
*
|
3099
3098
|
* == Example
|
@@ -3112,7 +3111,7 @@ Init_TVTerm(void)
|
|
3112
3111
|
* +positions+ and +offsets+ can be +nil+ depending on what you set the
|
3113
3112
|
* +:term_vector+ to when you set the FieldInfo object for the field. Note in
|
3114
3113
|
* particular that you need to store both positions and offsets if you want
|
3115
|
-
* to
|
3114
|
+
* to associate offsets with particular terms.
|
3116
3115
|
*/
|
3117
3116
|
static void
|
3118
3117
|
Init_TermVector(void)
|
@@ -3136,7 +3135,7 @@ Init_TermVector(void)
|
|
3136
3135
|
* == Summary
|
3137
3136
|
*
|
3138
3137
|
* The IndexWriter is the class used to add documents to an index. You can
|
3139
|
-
* also delete
|
3138
|
+
* also delete documents from the index using this class. The indexing
|
3140
3139
|
* process is highly customizable and the IndexWriter has the following
|
3141
3140
|
* parameters;
|
3142
3141
|
*
|
@@ -3212,7 +3211,7 @@ Init_TermVector(void)
|
|
3212
3211
|
* documents).
|
3213
3212
|
* max_field_length:: Default: 10000. The maximum number of terms added to
|
3214
3213
|
* a single field. This can be useful to protect the
|
3215
|
-
* indexer when indexing documents
|
3214
|
+
* indexer when indexing documents from the web for
|
3216
3215
|
* example. Usually the most important terms will occur
|
3217
3216
|
* early on in a document so you can often safely
|
3218
3217
|
* ignore the terms in a field after a certain number
|
@@ -3221,7 +3220,7 @@ Init_TermVector(void)
|
|
3221
3220
|
* first 1000 terms in a field. On the other hand, if
|
3222
3221
|
* you want to be more thorough and you are indexing
|
3223
3222
|
* documents from your file-system you may set this
|
3224
|
-
*
|
3223
|
+
* parameter to Ferret::FIX_INT_MAX.
|
3225
3224
|
* use_compound_file:: Default: true. Uses a compound file to store the
|
3226
3225
|
* index. This prevents an error being raised for
|
3227
3226
|
* having too many files open at the same time. The
|
data/ext/r_qparser.c
CHANGED
@@ -16,6 +16,7 @@ static VALUE sym_default_slop;
|
|
16
16
|
static VALUE sym_handle_parse_errors;
|
17
17
|
static VALUE sym_clean_string;
|
18
18
|
static VALUE sym_max_clauses;
|
19
|
+
static VALUE sym_use_keywords;
|
19
20
|
|
20
21
|
extern VALUE frt_get_analyzer(Analyzer *a);
|
21
22
|
extern VALUE frt_get_q(Query *q);
|
@@ -116,11 +117,20 @@ frt_get_fields(VALUE rfields)
|
|
116
117
|
* of terms allowed in multi, prefix, wild-card or
|
117
118
|
* fuzzy queries when those queries are generated by
|
118
119
|
* rewriting other queries
|
120
|
+
* :use_keywords: Default: true. By default AND, OR, NOT and REQ are
|
121
|
+
* keywords used by the query parser. Sometimes this
|
122
|
+
* is undesirable. For example, if your application
|
123
|
+
* allows searching for US states by their
|
124
|
+
* abbreviation, then OR will be a common query
|
125
|
+
* string. By setting :use_keywords to false, OR will
|
126
|
+
* no longer be a keyword allowing searches for the
|
127
|
+
* state of Oregon. You will still be able to use
|
128
|
+
* boolean queries by using the + and - characters.
|
119
129
|
*/
|
120
130
|
static VALUE
|
121
131
|
frt_qp_init(int argc, VALUE *argv, VALUE self)
|
122
132
|
{
|
123
|
-
VALUE roptions;
|
133
|
+
VALUE roptions = Qnil;
|
124
134
|
VALUE rval;
|
125
135
|
Analyzer *analyzer = NULL;
|
126
136
|
bool has_options = false;
|
@@ -150,6 +160,7 @@ frt_qp_init(int argc, VALUE *argv, VALUE self)
|
|
150
160
|
}
|
151
161
|
} else {
|
152
162
|
def_fields = frt_get_fields(roptions);
|
163
|
+
roptions = Qnil;
|
153
164
|
}
|
154
165
|
}
|
155
166
|
if (all_fields == NULL) {
|
@@ -165,7 +176,7 @@ frt_qp_init(int argc, VALUE *argv, VALUE self)
|
|
165
176
|
qp->clean_str = true;
|
166
177
|
qp->handle_parse_errors = true;
|
167
178
|
/* handle options */
|
168
|
-
if (
|
179
|
+
if (roptions != Qnil) {
|
169
180
|
if (Qnil != (rval = rb_hash_aref(roptions, sym_handle_parse_errors))) {
|
170
181
|
qp->handle_parse_errors = RTEST(rval);
|
171
182
|
}
|
@@ -187,6 +198,9 @@ frt_qp_init(int argc, VALUE *argv, VALUE self)
|
|
187
198
|
if (Qnil != (rval = rb_hash_aref(roptions, sym_max_clauses))) {
|
188
199
|
qp->max_clauses = FIX2INT(rval);
|
189
200
|
}
|
201
|
+
if (Qnil != (rval = rb_hash_aref(roptions, sym_use_keywords))) {
|
202
|
+
qp->use_keywords = RTEST(rval);
|
203
|
+
}
|
190
204
|
}
|
191
205
|
Frt_Wrap_Struct(self, frt_qp_mark, frt_qp_free, qp);
|
192
206
|
object_add(qp, self);
|
@@ -493,8 +507,8 @@ Init_QueryParseException(void)
|
|
493
507
|
* === WildQuery
|
494
508
|
*
|
495
509
|
* A wild query is a query using the pattern matching characters * and ?. *
|
496
|
-
*
|
497
|
-
* of query can be really useful for matching
|
510
|
+
* matches 0 or more characters while ? matches a single character. This type
|
511
|
+
* of query can be really useful for matching hierarchical categories for
|
498
512
|
* example. Let's say we had this structure;
|
499
513
|
*
|
500
514
|
* /sport/skiing
|
@@ -514,7 +528,7 @@ Init_QueryParseException(void)
|
|
514
528
|
* the wild characters at the beginning of the query as it'll have to iterate
|
515
529
|
* through every term in that field. Having said that, some fields like the
|
516
530
|
* category field above will only have a small number of distinct terms so
|
517
|
-
* this could be
|
531
|
+
* this could be okay.
|
518
532
|
*
|
519
533
|
* === FuzzyQuery
|
520
534
|
*
|
@@ -531,7 +545,7 @@ Init_QueryParseException(void)
|
|
531
545
|
* 'content:Ostralya~0.4'
|
532
546
|
*
|
533
547
|
* Note that this query can be quite expensive. If you'd like to use this
|
534
|
-
* query, you may want to set a
|
548
|
+
* query, you may want to set a minimum prefix length in the FuzzyQuery
|
535
549
|
* class. This can substantially reduce the number of terms that the query
|
536
550
|
* will iterate over.
|
537
551
|
*
|
@@ -551,6 +565,7 @@ Init_QueryParser(void)
|
|
551
565
|
sym_handle_parse_errors = ID2SYM(rb_intern("handle_parse_errors"));
|
552
566
|
sym_clean_string = ID2SYM(rb_intern("clean_string"));
|
553
567
|
sym_max_clauses = ID2SYM(rb_intern("max_clauses"));
|
568
|
+
sym_use_keywords = ID2SYM(rb_intern("use_keywords"));
|
554
569
|
|
555
570
|
/* QueryParser */
|
556
571
|
cQueryParser = rb_define_class_under(mFerret, "QueryParser", rb_cObject);
|
data/ext/r_search.c
CHANGED
@@ -179,7 +179,7 @@ frt_get_td(TopDocs *td, VALUE rsearcher)
|
|
179
179
|
* call-seq:
|
180
180
|
* top_doc.to_s(field = :id) -> string
|
181
181
|
*
|
182
|
-
* Returns a string
|
182
|
+
* Returns a string representation of the top_doc in readable format.
|
183
183
|
*/
|
184
184
|
static VALUE
|
185
185
|
frt_td_to_s(int argc, VALUE *argv, VALUE self)
|
@@ -197,7 +197,7 @@ frt_td_to_s(int argc, VALUE *argv, VALUE self)
|
|
197
197
|
field = frt_field(argv[0]);
|
198
198
|
}
|
199
199
|
|
200
|
-
sprintf(s, "TopDocs: total_hits = %
|
200
|
+
sprintf(s, "TopDocs: total_hits = %ld, max_score = %f [\n",
|
201
201
|
FIX2INT(rb_funcall(self, id_total_hits, 0)),
|
202
202
|
NUM2DBL(rb_funcall(self, id_max_score, 0)));
|
203
203
|
s += strlen(s);
|
@@ -224,7 +224,7 @@ frt_td_to_s(int argc, VALUE *argv, VALUE self)
|
|
224
224
|
return rstr;
|
225
225
|
}
|
226
226
|
|
227
|
-
|
227
|
+
static INLINE char *
|
228
228
|
frt_lzd_load_to_json(LazyDoc *lzd, char **str, char *s, int *slen)
|
229
229
|
{
|
230
230
|
int i, j;
|
@@ -270,7 +270,7 @@ frt_lzd_load_to_json(LazyDoc *lzd, char **str, char *s, int *slen)
|
|
270
270
|
* call-seq:
|
271
271
|
* top_doc.to_json() -> string
|
272
272
|
*
|
273
|
-
* Returns a json
|
273
|
+
* Returns a json representation of the top_doc.
|
274
274
|
*/
|
275
275
|
static VALUE
|
276
276
|
frt_td_to_json(VALUE self)
|
@@ -318,7 +318,7 @@ frt_td_to_json(VALUE self)
|
|
318
318
|
* call-seq:
|
319
319
|
* explanation.to_s -> string
|
320
320
|
*
|
321
|
-
* Returns a string
|
321
|
+
* Returns a string representation of the explanation in readable format.
|
322
322
|
*/
|
323
323
|
static VALUE
|
324
324
|
frt_expl_to_s(VALUE self)
|
@@ -334,7 +334,7 @@ frt_expl_to_s(VALUE self)
|
|
334
334
|
* call-seq:
|
335
335
|
* explanation.to_html -> string
|
336
336
|
*
|
337
|
-
* Returns an html
|
337
|
+
* Returns an html representation of the explanation in readable format.
|
338
338
|
*/
|
339
339
|
static VALUE
|
340
340
|
frt_expl_to_html(VALUE self)
|
@@ -403,7 +403,7 @@ frt_q_to_s(int argc, VALUE *argv, VALUE self)
|
|
403
403
|
* call-seq:
|
404
404
|
* query.boost
|
405
405
|
*
|
406
|
-
* Returns the queries boost value. See the Query
|
406
|
+
* Returns the query's boost value. See the Query description for more
|
407
407
|
* information on Query boosts.
|
408
408
|
*/
|
409
409
|
static VALUE
|
@@ -417,7 +417,7 @@ frt_q_get_boost(VALUE self)
|
|
417
417
|
* call-seq:
|
418
418
|
* query.boost = boost -> boost
|
419
419
|
*
|
420
|
-
* Set the boost for a query. See the Query
|
420
|
+
* Set the boost for a query. See the Query description for more information
|
421
421
|
* on Query boosts.
|
422
422
|
*/
|
423
423
|
static VALUE
|
@@ -582,7 +582,7 @@ static VALUE
|
|
582
582
|
frt_tq_init(VALUE self, VALUE rfield, VALUE rterm)
|
583
583
|
{
|
584
584
|
char *field = frt_field(rfield);
|
585
|
-
char *term =
|
585
|
+
char *term = rs2s(rb_obj_as_string(rterm));
|
586
586
|
Query *q = tq_new(field, term);
|
587
587
|
Frt_Wrap_Struct(self, NULL, &frt_q_free, q);
|
588
588
|
object_add(q, self);
|
@@ -795,7 +795,7 @@ frt_bc_init(int argc, VALUE *argv, VALUE self)
|
|
795
795
|
* call-seq:
|
796
796
|
* clause.query -> query
|
797
797
|
*
|
798
|
-
*
|
798
|
+
* Return the query object wrapped by this BooleanClause.
|
799
799
|
*/
|
800
800
|
static VALUE
|
801
801
|
frt_bc_get_query(VALUE self)
|
@@ -921,7 +921,7 @@ frt_bq_mark(void *p)
|
|
921
921
|
* BooleanQuery.new(coord_disable = false)
|
922
922
|
*
|
923
923
|
* Create a new BooleanQuery. If you don't care about the scores of the
|
924
|
-
* sub-queries added
|
924
|
+
* sub-queries added to the query (as would be the case for many
|
925
925
|
* automatically generated queries) you can disable the coord_factor of the
|
926
926
|
* score. This will slightly improve performance for the query. Usually you
|
927
927
|
* should leave this parameter as is.
|
@@ -1309,7 +1309,7 @@ frt_wcq_init(int argc, VALUE *argv, VALUE self)
|
|
1309
1309
|
* distance is measured. This parameter is used to improve
|
1310
1310
|
* performance. With a +:prefix_length+ of 0, all terms in
|
1311
1311
|
* the index must be checked which can be quite a
|
1312
|
-
* performance hit. By setting
|
1312
|
+
* performance hit. By setting the prefix length to a
|
1313
1313
|
* larger number you minimize the number of terms that need
|
1314
1314
|
* to be checked. Even 1 will cut down the work by a
|
1315
1315
|
* factor of about 26 depending on your character set and
|
@@ -1501,7 +1501,7 @@ frt_maq_init(VALUE self)
|
|
1501
1501
|
* ConstantScoreQuery.new(filter) -> query
|
1502
1502
|
*
|
1503
1503
|
* Create a ConstantScoreQuery which uses +filter+ to match documents giving
|
1504
|
-
* each document a
|
1504
|
+
* each document a constant score.
|
1505
1505
|
*/
|
1506
1506
|
static VALUE
|
1507
1507
|
frt_csq_init(VALUE self, VALUE rfilter)
|
@@ -1688,7 +1688,7 @@ frt_spannq_mark(void *p)
|
|
1688
1688
|
* :slop:: Default: 0. Works exactly like a PhraseQuery slop. It is the
|
1689
1689
|
* amount of slop allowed in the match (the term edit distance
|
1690
1690
|
* allowed in the match).
|
1691
|
-
* :in_order::
|
1691
|
+
* :in_order:: Default: false. Specifies whether or not the matches have to
|
1692
1692
|
* occur in the order they were added to the query. When slop is
|
1693
1693
|
* set to 0, this parameter will make no difference.
|
1694
1694
|
*/
|
@@ -1862,7 +1862,7 @@ frt_f_free(void *p)
|
|
1862
1862
|
* call-seq:
|
1863
1863
|
* filter.to_s -> string
|
1864
1864
|
*
|
1865
|
-
* Return a human readable string
|
1865
|
+
* Return a human readable string representing the Filter object that the
|
1866
1866
|
* method was called on.
|
1867
1867
|
*/
|
1868
1868
|
static VALUE
|
@@ -2415,7 +2415,7 @@ frt_sea_doc(VALUE self, VALUE rdoc_id)
|
|
2415
2415
|
* call-seq:
|
2416
2416
|
* searcher.max_doc -> number
|
2417
2417
|
*
|
2418
|
-
* Returns 1 + the maximum document id in the index. It is the
|
2418
|
+
* Returns 1 + the maximum document id in the index. It is the
|
2419
2419
|
* document_id that will be used by the next document added to the index. If
|
2420
2420
|
* there are no deletions, this number also refers to the number of documents
|
2421
2421
|
* in the index.
|
@@ -2555,8 +2555,13 @@ frt_sea_search_internal(Query *query, VALUE roptions, Searcher *sea)
|
|
2555
2555
|
* :sort:: A Sort object or sort string describing how the field
|
2556
2556
|
* should be sorted. A sort string is made up of field names
|
2557
2557
|
* which cannot contain spaces and the word "DESC" if you
|
2558
|
-
* want the field reversed, all
|
2559
|
-
* example; "rating DESC, author, title"
|
2558
|
+
* want the field reversed, all separated by commas. For
|
2559
|
+
* example; "rating DESC, author, title". Note that Ferret
|
2560
|
+
* will try to determine a field's type by looking at the
|
2561
|
+
* first term in the index and seeing if it can be parsed as
|
2562
|
+
* an integer or a float. Keep this in mind as you may need
|
2563
|
+
* to specify a field's type to sort it correctly. For more
|
2564
|
+
* on this, see the documentation for SortField.
|
2560
2565
|
* :filter:: a Filter object to filter the search results with
|
2561
2566
|
* :filter_proc:: a filter Proc is a Proc which takes the doc_id, the score
|
2562
2567
|
* and the Searcher object as its parameters and returns a
|
@@ -2602,8 +2607,13 @@ frt_sea_search(int argc, VALUE *argv, VALUE self)
|
|
2602
2607
|
* :sort:: A Sort object or sort string describing how the field
|
2603
2608
|
* should be sorted. A sort string is made up of field names
|
2604
2609
|
* which cannot contain spaces and the word "DESC" if you
|
2605
|
-
* want the field reversed, all
|
2606
|
-
* example; "rating DESC, author, title"
|
2610
|
+
* want the field reversed, all separated by commas. For
|
2611
|
+
* example; "rating DESC, author, title". Note that Ferret
|
2612
|
+
* will try to determine a field's type by looking at the
|
2613
|
+
* first term in the index and seeing if it can be parsed as
|
2614
|
+
* an integer or a float. Keep this in mind as you may need
|
2615
|
+
* to specify a field's type to sort it correctly. For more
|
2616
|
+
* on this, see the documentation for SortField.
|
2607
2617
|
* :filter:: a Filter object to filter the search results with
|
2608
2618
|
* :filter_proc:: a filter Proc is a Proc which takes the doc_id, the score
|
2609
2619
|
* and the Searcher object as its parameters and returns a
|
@@ -2685,7 +2695,7 @@ frt_sea_explain(VALUE self, VALUE rquery, VALUE rdoc_id)
|
|
2685
2695
|
* :ellipsis:: Default: "...". This is the string that is appended at
|
2686
2696
|
* the beginning and end of excerpts (unless the excerpt
|
2687
2697
|
* hits the start or end of the field). You'll probably
|
2688
|
-
* want to change this so a Unicode
|
2698
|
+
* want to change this to a Unicode ellipsis character.
|
2689
2699
|
*/
|
2690
2700
|
static VALUE
|
2691
2701
|
frt_sea_highlight(int argc, VALUE *argv, VALUE self)
|
@@ -2702,26 +2712,31 @@ frt_sea_highlight(int argc, VALUE *argv, VALUE self)
|
|
2702
2712
|
|
2703
2713
|
rb_scan_args(argc, argv, "31", &rquery, &rdoc_id, &rfield, &roptions);
|
2704
2714
|
Data_Get_Struct(rquery, Query, query);
|
2705
|
-
if (
|
2706
|
-
|
2707
|
-
|
2708
|
-
if (Qnil != (v = rb_hash_aref(roptions, sym_excerpt_length))) {
|
2709
|
-
if (v == sym_all) {
|
2710
|
-
num_excerpts = 1;
|
2711
|
-
excerpt_length = INT_MAX/2;
|
2715
|
+
if (argc > 3) {
|
2716
|
+
if (TYPE(roptions) != T_HASH) {
|
2717
|
+
rb_raise(rb_eArgError, "The fourth argument to Searcher#highlight must be a hash");
|
2712
2718
|
}
|
2713
|
-
|
2714
|
-
|
2719
|
+
if (Qnil != (v = rb_hash_aref(roptions, sym_num_excerpts))) {
|
2720
|
+
num_excerpts = FIX2INT(v);
|
2721
|
+
}
|
2722
|
+
if (Qnil != (v = rb_hash_aref(roptions, sym_excerpt_length))) {
|
2723
|
+
if (v == sym_all) {
|
2724
|
+
num_excerpts = 1;
|
2725
|
+
excerpt_length = INT_MAX/2;
|
2726
|
+
}
|
2727
|
+
else {
|
2728
|
+
excerpt_length = FIX2INT(v);
|
2729
|
+
}
|
2730
|
+
}
|
2731
|
+
if (Qnil != (v = rb_hash_aref(roptions, sym_pre_tag))) {
|
2732
|
+
pre_tag = rs2s(rb_obj_as_string(v));
|
2733
|
+
}
|
2734
|
+
if (Qnil != (v = rb_hash_aref(roptions, sym_post_tag))) {
|
2735
|
+
post_tag = rs2s(rb_obj_as_string(v));
|
2736
|
+
}
|
2737
|
+
if (Qnil != (v = rb_hash_aref(roptions, sym_ellipsis))) {
|
2738
|
+
ellipsis = rs2s(rb_obj_as_string(v));
|
2715
2739
|
}
|
2716
|
-
}
|
2717
|
-
if (Qnil != (v = rb_hash_aref(roptions, sym_pre_tag))) {
|
2718
|
-
pre_tag = rs2s(rb_obj_as_string(v));
|
2719
|
-
}
|
2720
|
-
if (Qnil != (v = rb_hash_aref(roptions, sym_post_tag))) {
|
2721
|
-
post_tag = rs2s(rb_obj_as_string(v));
|
2722
|
-
}
|
2723
|
-
if (Qnil != (v = rb_hash_aref(roptions, sym_ellipsis))) {
|
2724
|
-
ellipsis = rs2s(rb_obj_as_string(v));
|
2725
2740
|
}
|
2726
2741
|
|
2727
2742
|
if ((excerpts = searcher_highlight(sea,
|
@@ -2771,7 +2786,7 @@ frt_sea_mark(void *p)
|
|
2771
2786
|
* Searcher.new(obj) -> Searcher
|
2772
2787
|
*
|
2773
2788
|
* Create a new Searcher object. +dir+ can either be a string path to an
|
2774
|
-
* index directory on the file-
|
2789
|
+
* index directory on the file-system, an actual Ferret::Store::Directory
|
2775
2790
|
* object or a Ferret::Index::IndexReader. You should use the IndexReader for
|
2776
2791
|
* searching multiple indexes. Just open the IndexReader on multiple
|
2777
2792
|
* directories.
|
@@ -2898,7 +2913,7 @@ cTopDocs = rb_define_class_under(mSearch, "TopDocs", rb_cObject);
|
|
2898
2913
|
* document id of the document that matches along with the score for the
|
2899
2914
|
* match. The score is a positive Float value. The score contained in a hit
|
2900
2915
|
* is not normalized so it can be greater than 1.0. To normalize scores to
|
2901
|
-
* the range 0.0..1.0
|
2916
|
+
* the range 0.0..1.0 divide the scores by TopDocs#max_score.
|
2902
2917
|
*/
|
2903
2918
|
static void
|
2904
2919
|
Init_Hit(void)
|
@@ -3546,7 +3561,7 @@ Init_SpanPrefixQuery(void)
|
|
3546
3561
|
*
|
3547
3562
|
* == Summary
|
3548
3563
|
*
|
3549
|
-
* A SpanFirstQuery
|
3564
|
+
* A SpanFirstQuery restricts a query to search in the first +end+ bytes of a
|
3550
3565
|
* field. This is useful since often the most important information in a
|
3551
3566
|
* document is at the start of the document.
|
3552
3567
|
*
|
@@ -3577,7 +3592,7 @@ Init_SpanFirstQuery(void)
|
|
3577
3592
|
*
|
3578
3593
|
* A SpanNearQuery is like a combination between a PhraseQuery and a
|
3579
3594
|
* BooleanQuery. It matches sub-SpanQueries which are added as clauses but
|
3580
|
-
* those clauses must occur within a +slop+ edit distance of
|
3595
|
+
* those clauses must occur within a +slop+ edit distance of each other. You
|
3581
3596
|
* can also specify that clauses must occur +in_order+.
|
3582
3597
|
*
|
3583
3598
|
* == Example
|
@@ -3801,7 +3816,7 @@ Init_QueryFilter(void)
|
|
3801
3816
|
* A Filter is used to filter query results. It is usually passed to one of
|
3802
3817
|
* Searcher's search methods however it can also be used inside a
|
3803
3818
|
* ConstantScoreQuery or a FilteredQuery. To implement your own Filter you
|
3804
|
-
* must implement the
|
3819
|
+
* must implement the method #get_bitvector(index_reader) which returns a
|
3805
3820
|
* BitVector with set bits corresponding to documents that are allowed by
|
3806
3821
|
* this Filter.
|
3807
3822
|
*
|
@@ -3839,16 +3854,23 @@ Init_Filter(void)
|
|
3839
3854
|
* The type of the SortField is set by passing it as a parameter to the
|
3840
3855
|
* constructor. The +:auto+ type specifies that the SortField should detect
|
3841
3856
|
* the sort type by looking at the data in the field. This is the default
|
3842
|
-
* type
|
3843
|
-
*
|
3844
|
-
* a field with both numbers and strings (like a title field which might have
|
3845
|
-
* "24" and "Prison Break") then the sort_field will think it is sorting
|
3846
|
-
* integers when it really should sort by string.
|
3857
|
+
* :type value although it is recommended that you explicitly specify the
|
3858
|
+
* field's type.
|
3847
3859
|
*
|
3848
3860
|
* == Example
|
3849
3861
|
*
|
3850
3862
|
* title_sf = SortField.new(:title, :type => :string)
|
3851
3863
|
* rating_sf = SortField.new(:rating, :type => :float, :reverse => true)
|
3864
|
+
*
|
3865
|
+
*
|
3866
|
+
* Note 1: Care should be taken when using the :auto sort-type since numbers
|
3867
|
+
* will occur before other strings in the index so if you are sorting a field
|
3868
|
+
* with both numbers and strings (like a title field which might have "24"
|
3869
|
+
* and "Prison Break") then the sort_field will think it is sorting integers
|
3870
|
+
* when it really should be sorting strings.
|
3871
|
+
*
|
3872
|
+
* Note 2: When sorting by integer, integers are only 4 bytes so anything
|
3873
|
+
* larger will cause strange sorting behaviour.
|
3852
3874
|
*/
|
3853
3875
|
static void
|
3854
3876
|
Init_SortField(void)
|
@@ -3923,6 +3945,9 @@ Init_SortField(void)
|
|
3923
3945
|
* sf_rating = SortField.new(:rating, :type => :float, :reverse => true)
|
3924
3946
|
* sf_title = SortField.new(:title, :type => :string)
|
3925
3947
|
* sort = Sort.new([sf_rating, sf_title])
|
3948
|
+
*
|
3949
|
+
* Remember that the :type parameter for SortField is set to :auto by default
|
3950
|
+
* but I strongly recommend you specify a :type value.
|
3926
3951
|
*/
|
3927
3952
|
static void
|
3928
3953
|
Init_Sort(void)
|