ferret 0.11.4 → 0.11.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +1 -0
- data/TUTORIAL +3 -3
- data/ext/analysis.c +12 -9
- data/ext/array.c +10 -10
- data/ext/array.h +8 -1
- data/ext/bitvector.c +2 -2
- data/ext/except.c +1 -1
- data/ext/ferret.c +2 -2
- data/ext/ferret.h +1 -1
- data/ext/fs_store.c +13 -2
- data/ext/global.c +4 -4
- data/ext/global.h +6 -0
- data/ext/hash.c +1 -1
- data/ext/helper.c +1 -1
- data/ext/helper.h +1 -1
- data/ext/index.c +48 -22
- data/ext/index.h +17 -16
- data/ext/mempool.c +4 -1
- data/ext/mempool.h +1 -1
- data/ext/multimapper.c +2 -2
- data/ext/q_fuzzy.c +2 -2
- data/ext/q_multi_term.c +2 -2
- data/ext/q_parser.c +39 -8
- data/ext/q_range.c +32 -1
- data/ext/r_analysis.c +66 -28
- data/ext/r_index.c +18 -19
- data/ext/r_qparser.c +21 -6
- data/ext/r_search.c +74 -49
- data/ext/r_store.c +1 -1
- data/ext/r_utils.c +17 -17
- data/ext/search.c +10 -5
- data/ext/search.h +3 -1
- data/ext/sort.c +2 -2
- data/ext/stopwords.c +23 -34
- data/ext/store.c +9 -9
- data/ext/store.h +5 -4
- data/lib/ferret/document.rb +2 -2
- data/lib/ferret/field_infos.rb +37 -35
- data/lib/ferret/index.rb +16 -6
- data/lib/ferret/number_tools.rb +2 -2
- data/lib/ferret_version.rb +1 -1
- data/test/unit/analysis/tc_token_stream.rb +40 -0
- data/test/unit/index/tc_index.rb +64 -101
- data/test/unit/index/tc_index_reader.rb +13 -0
- data/test/unit/largefile/tc_largefile.rb +46 -0
- data/test/unit/query_parser/tc_query_parser.rb +17 -1
- data/test/unit/search/tc_multiple_search_requests.rb +58 -0
- data/test/unit/search/tm_searcher.rb +27 -1
- data/test/unit/ts_largefile.rb +4 -0
- metadata +147 -144
data/ext/r_index.c
CHANGED
@@ -274,7 +274,7 @@ frt_fi_is_tokenized(VALUE self)
|
|
274
274
|
* used to store the field boosts for an indexed field. If you do not boost
|
275
275
|
* any fields, and you can live without scoring based on field length then
|
276
276
|
* you can omit the norms file. This will give the index a slight performance
|
277
|
-
* boost and it will use less memory,
|
277
|
+
* boost and it will use less memory, especially for indexes which have a
|
278
278
|
* large number of documents.
|
279
279
|
*/
|
280
280
|
static VALUE
|
@@ -623,7 +623,7 @@ frt_fis_create_index(VALUE self, VALUE rdir)
|
|
623
623
|
* call-seq:
|
624
624
|
* fis.fields -> symbol array
|
625
625
|
*
|
626
|
-
* Return a list of the field names (as symbols) of all the
|
626
|
+
* Return a list of the field names (as symbols) of all the fields in the
|
627
627
|
* index.
|
628
628
|
*/
|
629
629
|
static VALUE
|
@@ -1415,7 +1415,7 @@ frt_iw_init(int argc, VALUE *argv, VALUE self)
|
|
1415
1415
|
* iw.doc_count -> number
|
1416
1416
|
*
|
1417
1417
|
* Returns the number of documents in the Index. Note that deletions won't be
|
1418
|
-
* taken into account until the IndexWriter has been
|
1418
|
+
* taken into account until the IndexWriter has been committed.
|
1419
1419
|
*/
|
1420
1420
|
static VALUE
|
1421
1421
|
frt_iw_get_doc_count(VALUE self)
|
@@ -1660,7 +1660,7 @@ frt_iw_get_analyzer(VALUE self)
|
|
1660
1660
|
*
|
1661
1661
|
* Set the Analyzer for this IndexWriter. This is useful if you need to
|
1662
1662
|
* change the analyzer for a special document. It is risky though as the
|
1663
|
-
* same
|
1663
|
+
* same analyzer will be used for all documents during search.
|
1664
1664
|
*/
|
1665
1665
|
static VALUE
|
1666
1666
|
frt_iw_set_analyzer(VALUE self, VALUE ranalyzer)
|
@@ -2191,7 +2191,7 @@ frt_ir_init(VALUE self, VALUE rdir)
|
|
2191
2191
|
*
|
2192
2192
|
* Expert: change the boost value for a +field+ in document at +doc_id+.
|
2193
2193
|
* +val+ should be an integer in the range 0..255 which corresponds to an
|
2194
|
-
*
|
2194
|
+
* encoded float value.
|
2195
2195
|
*/
|
2196
2196
|
static VALUE
|
2197
2197
|
frt_ir_set_norm(VALUE self, VALUE rdoc_id, VALUE rfield, VALUE rval)
|
@@ -2267,7 +2267,7 @@ frt_ir_commit(VALUE self)
|
|
2267
2267
|
* index_reader.close -> index_reader
|
2268
2268
|
*
|
2269
2269
|
* Close the IndexReader. This method also commits any deletions made by this
|
2270
|
-
* IndexReader.
|
2270
|
+
* IndexReader. This method will be called implicitly by the garbage
|
2271
2271
|
* collector but you should call it explicitly to commit any changes as soon
|
2272
2272
|
* as possible and to close any locks held by the object to prevent locking
|
2273
2273
|
* errors.
|
@@ -2286,7 +2286,7 @@ frt_ir_close(VALUE self)
|
|
2286
2286
|
* call-seq:
|
2287
2287
|
* index_reader.has_deletions? -> bool
|
2288
2288
|
*
|
2289
|
-
* Return true if the index has any deletions, either
|
2289
|
+
* Return true if the index has any deletions, either uncommitted by this
|
2290
2290
|
* IndexReader or committed by any other IndexReader.
|
2291
2291
|
*/
|
2292
2292
|
static VALUE
|
@@ -2329,7 +2329,7 @@ frt_ir_is_deleted(VALUE self, VALUE rdoc_id)
|
|
2329
2329
|
* call-seq:
|
2330
2330
|
* index_reader.max_doc -> number
|
2331
2331
|
*
|
2332
|
-
* Returns 1 + the maximum document id in the index. It is the
|
2332
|
+
* Returns 1 + the maximum document id in the index. It is the
|
2333
2333
|
* document_id that will be used by the next document added to the index. If
|
2334
2334
|
* there are no deletions, this number also refers to the number of documents
|
2335
2335
|
* in the index.
|
@@ -2361,7 +2361,7 @@ frt_ir_num_docs(VALUE self)
|
|
2361
2361
|
* index_reader.undelete_all -> index_reader
|
2362
2362
|
*
|
2363
2363
|
* Undelete all deleted documents in the index. This is kind of like a
|
2364
|
-
* rollback feature. Not that once an index is
|
2364
|
+
* rollback feature. Note that once an index is committed or a merge happens
|
2365
2365
|
* during index, deletions will be committed and undelete_all will have no
|
2366
2366
|
* effect on these documents.
|
2367
2367
|
*/
|
@@ -2434,7 +2434,6 @@ frt_ir_get_doc(int argc, VALUE *argv, VALUE self)
|
|
2434
2434
|
len = FIX2LONG(arg2);
|
2435
2435
|
return frt_get_doc_range(ir, pos, len, max);
|
2436
2436
|
}
|
2437
|
-
return Qnil;
|
2438
2437
|
}
|
2439
2438
|
|
2440
2439
|
/*
|
@@ -2713,7 +2712,7 @@ frt_ir_version(VALUE self)
|
|
2713
2712
|
*
|
2714
2713
|
* == Summary
|
2715
2714
|
*
|
2716
|
-
* The FieldInfo class is the field
|
2715
|
+
* The FieldInfo class is the field descriptor for the index. It specifies
|
2717
2716
|
* whether a field is compressed or not or whether it should be indexed and
|
2718
2717
|
* tokenized. Every field has a name which must be a symbol. There are three
|
2719
2718
|
* properties that you can set, +:store+, +:index+ and +:term_vector+. You
|
@@ -2740,7 +2739,7 @@ frt_ir_version(VALUE self)
|
|
2740
2739
|
* be indexed to be store in the Ferret index. You may want to use the index
|
2741
2740
|
* as a simple database and store things like images or MP3s in the index. By
|
2742
2741
|
* default each field is indexed and tokenized (split into tokens) (+:yes+).
|
2743
|
-
* If you don't want to index the field use +:no+. If you
|
2742
|
+
* If you don't want to index the field use +:no+. If you want the field
|
2744
2743
|
* indexed but not tokenized, use +:untokenized+. Do this for the fields you
|
2745
2744
|
* wish to sort by. There are two other values for +:index+; +:omit_norms+
|
2746
2745
|
* and +:untokenized_omit_norms+. These values correspond to +:yes+ and
|
@@ -2754,7 +2753,7 @@ frt_ir_version(VALUE self)
|
|
2754
2753
|
* or not you would like to store term-vectors. The available options are
|
2755
2754
|
* +:no+, +:yes+, +:with_positions+, +:with_offsets+ and
|
2756
2755
|
* +:with_positions_offsets+. Note that you need to store the positions to
|
2757
|
-
*
|
2756
|
+
* associate offsets with individual terms in the term_vector.
|
2758
2757
|
*
|
2759
2758
|
* == Property Table
|
2760
2759
|
*
|
@@ -2946,7 +2945,7 @@ Init_FieldInfos(void)
|
|
2946
2945
|
*
|
2947
2946
|
* te = index_reader.terms(:content)
|
2948
2947
|
*
|
2949
|
-
* te.each {|term, doc_freq| puts "#{term}
|
2948
|
+
* te.each {|term, doc_freq| puts "#{term} occurred #{doc_freq} times" }
|
2950
2949
|
*
|
2951
2950
|
* # or you could do it like this;
|
2952
2951
|
* te = index_reader.terms(:content)
|
@@ -3093,7 +3092,7 @@ Init_TVTerm(void)
|
|
3093
3092
|
* highlight search matches in results. This is all done internally so you
|
3094
3093
|
* won't need to worry about the TermVector object. There are some other
|
3095
3094
|
* reasons you may want to use the TermVectors object however. For example,
|
3096
|
-
* you may wish to see which terms are the most commonly
|
3095
|
+
* you may wish to see which terms are the most commonly occurring terms in a
|
3097
3096
|
* document to implement a MoreLikeThis search.
|
3098
3097
|
*
|
3099
3098
|
* == Example
|
@@ -3112,7 +3111,7 @@ Init_TVTerm(void)
|
|
3112
3111
|
* +positions+ and +offsets+ can be +nil+ depending on what you set the
|
3113
3112
|
* +:term_vector+ to when you set the FieldInfo object for the field. Note in
|
3114
3113
|
* particular that you need to store both positions and offsets if you want
|
3115
|
-
* to
|
3114
|
+
* to associate offsets with particular terms.
|
3116
3115
|
*/
|
3117
3116
|
static void
|
3118
3117
|
Init_TermVector(void)
|
@@ -3136,7 +3135,7 @@ Init_TermVector(void)
|
|
3136
3135
|
* == Summary
|
3137
3136
|
*
|
3138
3137
|
* The IndexWriter is the class used to add documents to an index. You can
|
3139
|
-
* also delete
|
3138
|
+
* also delete documents from the index using this class. The indexing
|
3140
3139
|
* process is highly customizable and the IndexWriter has the following
|
3141
3140
|
* parameters;
|
3142
3141
|
*
|
@@ -3212,7 +3211,7 @@ Init_TermVector(void)
|
|
3212
3211
|
* documents).
|
3213
3212
|
* max_field_length:: Default: 10000. The maximum number of terms added to
|
3214
3213
|
* a single field. This can be useful to protect the
|
3215
|
-
* indexer when indexing documents
|
3214
|
+
* indexer when indexing documents from the web for
|
3216
3215
|
* example. Usually the most important terms will occur
|
3217
3216
|
* early on in a document so you can often safely
|
3218
3217
|
* ignore the terms in a field after a certain number
|
@@ -3221,7 +3220,7 @@ Init_TermVector(void)
|
|
3221
3220
|
* first 1000 terms in a field. On the other hand, if
|
3222
3221
|
* you want to be more thorough and you are indexing
|
3223
3222
|
* documents from your file-system you may set this
|
3224
|
-
*
|
3223
|
+
* parameter to Ferret::FIX_INT_MAX.
|
3225
3224
|
* use_compound_file:: Default: true. Uses a compound file to store the
|
3226
3225
|
* index. This prevents an error being raised for
|
3227
3226
|
* having too many files open at the same time. The
|
data/ext/r_qparser.c
CHANGED
@@ -16,6 +16,7 @@ static VALUE sym_default_slop;
|
|
16
16
|
static VALUE sym_handle_parse_errors;
|
17
17
|
static VALUE sym_clean_string;
|
18
18
|
static VALUE sym_max_clauses;
|
19
|
+
static VALUE sym_use_keywords;
|
19
20
|
|
20
21
|
extern VALUE frt_get_analyzer(Analyzer *a);
|
21
22
|
extern VALUE frt_get_q(Query *q);
|
@@ -116,11 +117,20 @@ frt_get_fields(VALUE rfields)
|
|
116
117
|
* of terms allowed in multi, prefix, wild-card or
|
117
118
|
* fuzzy queries when those queries are generated by
|
118
119
|
* rewriting other queries
|
120
|
+
* :use_keywords: Default: true. By default AND, OR, NOT and REQ are
|
121
|
+
* keywords used by the query parser. Sometimes this
|
122
|
+
* is undesirable. For example, if your application
|
123
|
+
* allows searching for US states by their
|
124
|
+
* abbreviation, then OR will be a common query
|
125
|
+
* string. By setting :use_keywords to false, OR will
|
126
|
+
* no longer be a keyword allowing searches for the
|
127
|
+
* state of Oregon. You will still be able to use
|
128
|
+
* boolean queries by using the + and - characters.
|
119
129
|
*/
|
120
130
|
static VALUE
|
121
131
|
frt_qp_init(int argc, VALUE *argv, VALUE self)
|
122
132
|
{
|
123
|
-
VALUE roptions;
|
133
|
+
VALUE roptions = Qnil;
|
124
134
|
VALUE rval;
|
125
135
|
Analyzer *analyzer = NULL;
|
126
136
|
bool has_options = false;
|
@@ -150,6 +160,7 @@ frt_qp_init(int argc, VALUE *argv, VALUE self)
|
|
150
160
|
}
|
151
161
|
} else {
|
152
162
|
def_fields = frt_get_fields(roptions);
|
163
|
+
roptions = Qnil;
|
153
164
|
}
|
154
165
|
}
|
155
166
|
if (all_fields == NULL) {
|
@@ -165,7 +176,7 @@ frt_qp_init(int argc, VALUE *argv, VALUE self)
|
|
165
176
|
qp->clean_str = true;
|
166
177
|
qp->handle_parse_errors = true;
|
167
178
|
/* handle options */
|
168
|
-
if (
|
179
|
+
if (roptions != Qnil) {
|
169
180
|
if (Qnil != (rval = rb_hash_aref(roptions, sym_handle_parse_errors))) {
|
170
181
|
qp->handle_parse_errors = RTEST(rval);
|
171
182
|
}
|
@@ -187,6 +198,9 @@ frt_qp_init(int argc, VALUE *argv, VALUE self)
|
|
187
198
|
if (Qnil != (rval = rb_hash_aref(roptions, sym_max_clauses))) {
|
188
199
|
qp->max_clauses = FIX2INT(rval);
|
189
200
|
}
|
201
|
+
if (Qnil != (rval = rb_hash_aref(roptions, sym_use_keywords))) {
|
202
|
+
qp->use_keywords = RTEST(rval);
|
203
|
+
}
|
190
204
|
}
|
191
205
|
Frt_Wrap_Struct(self, frt_qp_mark, frt_qp_free, qp);
|
192
206
|
object_add(qp, self);
|
@@ -493,8 +507,8 @@ Init_QueryParseException(void)
|
|
493
507
|
* === WildQuery
|
494
508
|
*
|
495
509
|
* A wild query is a query using the pattern matching characters * and ?. *
|
496
|
-
*
|
497
|
-
* of query can be really useful for matching
|
510
|
+
* matches 0 or more characters while ? matches a single character. This type
|
511
|
+
* of query can be really useful for matching hierarchical categories for
|
498
512
|
* example. Let's say we had this structure;
|
499
513
|
*
|
500
514
|
* /sport/skiing
|
@@ -514,7 +528,7 @@ Init_QueryParseException(void)
|
|
514
528
|
* the wild characters at the beginning of the query as it'll have to iterate
|
515
529
|
* through every term in that field. Having said that, some fields like the
|
516
530
|
* category field above will only have a small number of distinct fields so
|
517
|
-
* this could be
|
531
|
+
* this could be okay.
|
518
532
|
*
|
519
533
|
* === FuzzyQuery
|
520
534
|
*
|
@@ -531,7 +545,7 @@ Init_QueryParseException(void)
|
|
531
545
|
* 'content:Ostralya~0.4'
|
532
546
|
*
|
533
547
|
* Note that this query can be quite expensive. If you'd like to use this
|
534
|
-
* query, you may want to set a
|
548
|
+
* query, you may want to set a minimum prefix length in the FuzzyQuery
|
535
549
|
* class. This can substantially reduce the number of terms that the query
|
536
550
|
* will iterate over.
|
537
551
|
*
|
@@ -551,6 +565,7 @@ Init_QueryParser(void)
|
|
551
565
|
sym_handle_parse_errors = ID2SYM(rb_intern("handle_parse_errors"));
|
552
566
|
sym_clean_string = ID2SYM(rb_intern("clean_string"));
|
553
567
|
sym_max_clauses = ID2SYM(rb_intern("max_clauses"));
|
568
|
+
sym_use_keywords = ID2SYM(rb_intern("use_keywords"));
|
554
569
|
|
555
570
|
/* QueryParser */
|
556
571
|
cQueryParser = rb_define_class_under(mFerret, "QueryParser", rb_cObject);
|
data/ext/r_search.c
CHANGED
@@ -179,7 +179,7 @@ frt_get_td(TopDocs *td, VALUE rsearcher)
|
|
179
179
|
* call-seq:
|
180
180
|
* top_doc.to_s(field = :id) -> string
|
181
181
|
*
|
182
|
-
* Returns a string
|
182
|
+
* Returns a string representation of the top_doc in readable format.
|
183
183
|
*/
|
184
184
|
static VALUE
|
185
185
|
frt_td_to_s(int argc, VALUE *argv, VALUE self)
|
@@ -197,7 +197,7 @@ frt_td_to_s(int argc, VALUE *argv, VALUE self)
|
|
197
197
|
field = frt_field(argv[0]);
|
198
198
|
}
|
199
199
|
|
200
|
-
sprintf(s, "TopDocs: total_hits = %
|
200
|
+
sprintf(s, "TopDocs: total_hits = %ld, max_score = %f [\n",
|
201
201
|
FIX2INT(rb_funcall(self, id_total_hits, 0)),
|
202
202
|
NUM2DBL(rb_funcall(self, id_max_score, 0)));
|
203
203
|
s += strlen(s);
|
@@ -224,7 +224,7 @@ frt_td_to_s(int argc, VALUE *argv, VALUE self)
|
|
224
224
|
return rstr;
|
225
225
|
}
|
226
226
|
|
227
|
-
|
227
|
+
static INLINE char *
|
228
228
|
frt_lzd_load_to_json(LazyDoc *lzd, char **str, char *s, int *slen)
|
229
229
|
{
|
230
230
|
int i, j;
|
@@ -270,7 +270,7 @@ frt_lzd_load_to_json(LazyDoc *lzd, char **str, char *s, int *slen)
|
|
270
270
|
* call-seq:
|
271
271
|
* top_doc.to_json() -> string
|
272
272
|
*
|
273
|
-
* Returns a json
|
273
|
+
* Returns a json representation of the top_doc.
|
274
274
|
*/
|
275
275
|
static VALUE
|
276
276
|
frt_td_to_json(VALUE self)
|
@@ -318,7 +318,7 @@ frt_td_to_json(VALUE self)
|
|
318
318
|
* call-seq:
|
319
319
|
* explanation.to_s -> string
|
320
320
|
*
|
321
|
-
* Returns a string
|
321
|
+
* Returns a string representation of the explanation in readable format.
|
322
322
|
*/
|
323
323
|
static VALUE
|
324
324
|
frt_expl_to_s(VALUE self)
|
@@ -334,7 +334,7 @@ frt_expl_to_s(VALUE self)
|
|
334
334
|
* call-seq:
|
335
335
|
* explanation.to_html -> string
|
336
336
|
*
|
337
|
-
* Returns an html
|
337
|
+
* Returns an html representation of the explanation in readable format.
|
338
338
|
*/
|
339
339
|
static VALUE
|
340
340
|
frt_expl_to_html(VALUE self)
|
@@ -403,7 +403,7 @@ frt_q_to_s(int argc, VALUE *argv, VALUE self)
|
|
403
403
|
* call-seq:
|
404
404
|
* query.boost
|
405
405
|
*
|
406
|
-
* Returns the queries boost value. See the Query
|
406
|
+
* Returns the query's boost value. See the Query description for more
|
407
407
|
* information on Query boosts.
|
408
408
|
*/
|
409
409
|
static VALUE
|
@@ -417,7 +417,7 @@ frt_q_get_boost(VALUE self)
|
|
417
417
|
* call-seq:
|
418
418
|
* query.boost = boost -> boost
|
419
419
|
*
|
420
|
-
* Set the boost for a query. See the Query
|
420
|
+
* Set the boost for a query. See the Query description for more information
|
421
421
|
* on Query boosts.
|
422
422
|
*/
|
423
423
|
static VALUE
|
@@ -582,7 +582,7 @@ static VALUE
|
|
582
582
|
frt_tq_init(VALUE self, VALUE rfield, VALUE rterm)
|
583
583
|
{
|
584
584
|
char *field = frt_field(rfield);
|
585
|
-
char *term =
|
585
|
+
char *term = rs2s(rb_obj_as_string(rterm));
|
586
586
|
Query *q = tq_new(field, term);
|
587
587
|
Frt_Wrap_Struct(self, NULL, &frt_q_free, q);
|
588
588
|
object_add(q, self);
|
@@ -795,7 +795,7 @@ frt_bc_init(int argc, VALUE *argv, VALUE self)
|
|
795
795
|
* call-seq:
|
796
796
|
* clause.query -> query
|
797
797
|
*
|
798
|
-
*
|
798
|
+
* Return the query object wrapped by this BooleanClause.
|
799
799
|
*/
|
800
800
|
static VALUE
|
801
801
|
frt_bc_get_query(VALUE self)
|
@@ -921,7 +921,7 @@ frt_bq_mark(void *p)
|
|
921
921
|
* BooleanQuery.new(coord_disable = false)
|
922
922
|
*
|
923
923
|
* Create a new BooleanQuery. If you don't care about the scores of the
|
924
|
-
* sub-queries added
|
924
|
+
* sub-queries added to the query (as would be the case for many
|
925
925
|
* automatically generated queries) you can disable the coord_factor of the
|
926
926
|
* score. This will slightly improve performance for the query. Usually you
|
927
927
|
* should leave this parameter as is.
|
@@ -1309,7 +1309,7 @@ frt_wcq_init(int argc, VALUE *argv, VALUE self)
|
|
1309
1309
|
* distance is measured. This parameter is used to improve
|
1310
1310
|
* performance. With a +:prefix_length+ of 0, all terms in
|
1311
1311
|
* the index must be checked which can be quite a
|
1312
|
-
* performance hit. By setting
|
1312
|
+
* performance hit. By setting the prefix length to a
|
1313
1313
|
* larger number you minimize the number of terms that need
|
1314
1314
|
* to be checked. Even 1 will cut down the work by a
|
1315
1315
|
* factor of about 26 depending on your character set and
|
@@ -1501,7 +1501,7 @@ frt_maq_init(VALUE self)
|
|
1501
1501
|
* ConstantScoreQuery.new(filter) -> query
|
1502
1502
|
*
|
1503
1503
|
* Create a ConstantScoreQuery which uses +filter+ to match documents giving
|
1504
|
-
* each document a
|
1504
|
+
* each document a constant score.
|
1505
1505
|
*/
|
1506
1506
|
static VALUE
|
1507
1507
|
frt_csq_init(VALUE self, VALUE rfilter)
|
@@ -1688,7 +1688,7 @@ frt_spannq_mark(void *p)
|
|
1688
1688
|
* :slop:: Default: 0. Works exactly like a PhraseQuery slop. It is the
|
1689
1689
|
* amount of slop allowed in the match (the term edit distance
|
1690
1690
|
* allowed in the match).
|
1691
|
-
* :in_order::
|
1691
|
+
* :in_order:: Default: false. Specifies whether or not the matches have to
|
1692
1692
|
* occur in the order they were added to the query. When slop is
|
1693
1693
|
* set to 0, this parameter will make no difference.
|
1694
1694
|
*/
|
@@ -1862,7 +1862,7 @@ frt_f_free(void *p)
|
|
1862
1862
|
* call-seq:
|
1863
1863
|
* filter.to_s -> string
|
1864
1864
|
*
|
1865
|
-
* Return a human readable string
|
1865
|
+
* Return a human readable string representing the Filter object that the
|
1866
1866
|
* method was called on.
|
1867
1867
|
*/
|
1868
1868
|
static VALUE
|
@@ -2415,7 +2415,7 @@ frt_sea_doc(VALUE self, VALUE rdoc_id)
|
|
2415
2415
|
* call-seq:
|
2416
2416
|
* searcher.max_doc -> number
|
2417
2417
|
*
|
2418
|
-
* Returns 1 + the maximum document id in the index. It is the
|
2418
|
+
* Returns 1 + the maximum document id in the index. It is the
|
2419
2419
|
* document_id that will be used by the next document added to the index. If
|
2420
2420
|
* there are no deletions, this number also refers to the number of documents
|
2421
2421
|
* in the index.
|
@@ -2555,8 +2555,13 @@ frt_sea_search_internal(Query *query, VALUE roptions, Searcher *sea)
|
|
2555
2555
|
* :sort:: A Sort object or sort string describing how the field
|
2556
2556
|
* should be sorted. A sort string is made up of field names
|
2557
2557
|
* which cannot contain spaces and the word "DESC" if you
|
2558
|
-
* want the field reversed, all
|
2559
|
-
* example; "rating DESC, author, title"
|
2558
|
+
* want the field reversed, all separated by commas. For
|
2559
|
+
* example; "rating DESC, author, title". Note that Ferret
|
2560
|
+
* will try to determine a field's type by looking at the
|
2561
|
+
* first term in the index and seeing if it can be parsed as
|
2562
|
+
* an integer or a float. Keep this in mind as you may need
|
2563
|
+
* to specify a field's type to sort it correctly. For more
|
2564
|
+
* on this, see the documentation for SortField
|
2560
2565
|
* :filter:: a Filter object to filter the search results with
|
2561
2566
|
* :filter_proc:: a filter Proc is a Proc which takes the doc_id, the score
|
2562
2567
|
* and the Searcher object as its parameters and returns a
|
@@ -2602,8 +2607,13 @@ frt_sea_search(int argc, VALUE *argv, VALUE self)
|
|
2602
2607
|
* :sort:: A Sort object or sort string describing how the field
|
2603
2608
|
* should be sorted. A sort string is made up of field names
|
2604
2609
|
* which cannot contain spaces and the word "DESC" if you
|
2605
|
-
* want the field reversed, all
|
2606
|
-
* example; "rating DESC, author, title"
|
2610
|
+
* want the field reversed, all separated by commas. For
|
2611
|
+
* example; "rating DESC, author, title". Note that Ferret
|
2612
|
+
* will try to determine a field's type by looking at the
|
2613
|
+
* first term in the index and seeing if it can be parsed as
|
2614
|
+
* an integer or a float. Keep this in mind as you may need
|
2615
|
+
* to specify a field's type to sort it correctly. For more
|
2616
|
+
* on this, see the documentation for SortField
|
2607
2617
|
* :filter:: a Filter object to filter the search results with
|
2608
2618
|
* :filter_proc:: a filter Proc is a Proc which takes the doc_id, the score
|
2609
2619
|
* and the Searcher object as its parameters and returns a
|
@@ -2685,7 +2695,7 @@ frt_sea_explain(VALUE self, VALUE rquery, VALUE rdoc_id)
|
|
2685
2695
|
* :ellipsis:: Default: "...". This is the string that is appended at
|
2686
2696
|
* the beginning and end of excerpts (unless the excerpt
|
2687
2697
|
* hits the start or end of the field). You'll probably
|
2688
|
-
* want to change this so a Unicode
|
2698
|
+
* want to change this to a Unicode ellipsis character.
|
2689
2699
|
*/
|
2690
2700
|
static VALUE
|
2691
2701
|
frt_sea_highlight(int argc, VALUE *argv, VALUE self)
|
@@ -2702,26 +2712,31 @@ frt_sea_highlight(int argc, VALUE *argv, VALUE self)
|
|
2702
2712
|
|
2703
2713
|
rb_scan_args(argc, argv, "31", &rquery, &rdoc_id, &rfield, &roptions);
|
2704
2714
|
Data_Get_Struct(rquery, Query, query);
|
2705
|
-
if (
|
2706
|
-
|
2707
|
-
|
2708
|
-
if (Qnil != (v = rb_hash_aref(roptions, sym_excerpt_length))) {
|
2709
|
-
if (v == sym_all) {
|
2710
|
-
num_excerpts = 1;
|
2711
|
-
excerpt_length = INT_MAX/2;
|
2715
|
+
if (argc > 3) {
|
2716
|
+
if (TYPE(roptions) != T_HASH) {
|
2717
|
+
rb_raise(rb_eArgError, "The fourth argument to Searcher#highlight must be a hash");
|
2712
2718
|
}
|
2713
|
-
|
2714
|
-
|
2719
|
+
if (Qnil != (v = rb_hash_aref(roptions, sym_num_excerpts))) {
|
2720
|
+
num_excerpts = FIX2INT(v);
|
2721
|
+
}
|
2722
|
+
if (Qnil != (v = rb_hash_aref(roptions, sym_excerpt_length))) {
|
2723
|
+
if (v == sym_all) {
|
2724
|
+
num_excerpts = 1;
|
2725
|
+
excerpt_length = INT_MAX/2;
|
2726
|
+
}
|
2727
|
+
else {
|
2728
|
+
excerpt_length = FIX2INT(v);
|
2729
|
+
}
|
2730
|
+
}
|
2731
|
+
if (Qnil != (v = rb_hash_aref(roptions, sym_pre_tag))) {
|
2732
|
+
pre_tag = rs2s(rb_obj_as_string(v));
|
2733
|
+
}
|
2734
|
+
if (Qnil != (v = rb_hash_aref(roptions, sym_post_tag))) {
|
2735
|
+
post_tag = rs2s(rb_obj_as_string(v));
|
2736
|
+
}
|
2737
|
+
if (Qnil != (v = rb_hash_aref(roptions, sym_ellipsis))) {
|
2738
|
+
ellipsis = rs2s(rb_obj_as_string(v));
|
2715
2739
|
}
|
2716
|
-
}
|
2717
|
-
if (Qnil != (v = rb_hash_aref(roptions, sym_pre_tag))) {
|
2718
|
-
pre_tag = rs2s(rb_obj_as_string(v));
|
2719
|
-
}
|
2720
|
-
if (Qnil != (v = rb_hash_aref(roptions, sym_post_tag))) {
|
2721
|
-
post_tag = rs2s(rb_obj_as_string(v));
|
2722
|
-
}
|
2723
|
-
if (Qnil != (v = rb_hash_aref(roptions, sym_ellipsis))) {
|
2724
|
-
ellipsis = rs2s(rb_obj_as_string(v));
|
2725
2740
|
}
|
2726
2741
|
|
2727
2742
|
if ((excerpts = searcher_highlight(sea,
|
@@ -2771,7 +2786,7 @@ frt_sea_mark(void *p)
|
|
2771
2786
|
* Searcher.new(obj) -> Searcher
|
2772
2787
|
*
|
2773
2788
|
* Create a new Searcher object. +dir+ can either be a string path to an
|
2774
|
-
* index directory on the file-
|
2789
|
+
* index directory on the file-system, an actual Ferret::Store::Directory
|
2775
2790
|
* object or a Ferret::Index::IndexReader. You should use the IndexReader for
|
2776
2791
|
* searching multiple indexes. Just open the IndexReader on multiple
|
2777
2792
|
* directories.
|
@@ -2898,7 +2913,7 @@ cTopDocs = rb_define_class_under(mSearch, "TopDocs", rb_cObject);
|
|
2898
2913
|
* document id of the document that matches along with the score for the
|
2899
2914
|
* match. The score is a positive Float value. The score contained in a hit
|
2900
2915
|
* is not normalized so it can be greater than 1.0. To normalize scores to
|
2901
|
-
* the range 0.0..1.0
|
2916
|
+
* the range 0.0..1.0 divide the scores by TopDocs#max_score.
|
2902
2917
|
*/
|
2903
2918
|
static void
|
2904
2919
|
Init_Hit(void)
|
@@ -3546,7 +3561,7 @@ Init_SpanPrefixQuery(void)
|
|
3546
3561
|
*
|
3547
3562
|
* == Summary
|
3548
3563
|
*
|
3549
|
-
* A SpanFirstQuery
|
3564
|
+
* A SpanFirstQuery restricts a query to search in the first +end+ bytes of a
|
3550
3565
|
* field. This is useful since often the most important information in a
|
3551
3566
|
* document is at the start of the document.
|
3552
3567
|
*
|
@@ -3577,7 +3592,7 @@ Init_SpanFirstQuery(void)
|
|
3577
3592
|
*
|
3578
3593
|
* A SpanNearQuery is like a combination between a PhraseQuery and a
|
3579
3594
|
* BooleanQuery. It matches sub-SpanQueries which are added as clauses but
|
3580
|
-
* those clauses must occur within a +slop+ edit distance of
|
3595
|
+
* those clauses must occur within a +slop+ edit distance of each other. You
|
3581
3596
|
* can also specify that clauses must occur +in_order+.
|
3582
3597
|
*
|
3583
3598
|
* == Example
|
@@ -3801,7 +3816,7 @@ Init_QueryFilter(void)
|
|
3801
3816
|
* A Filter is used to filter query results. It is usually passed to one of
|
3802
3817
|
* Searcher's search methods however it can also be used inside a
|
3803
3818
|
* ConstantScoreQuery or a FilteredQuery. To implement your own Filter you
|
3804
|
-
* must implement the
|
3819
|
+
* must implement the method #get_bitvector(index_reader) which returns a
|
3805
3820
|
* BitVector with set bits corresponding to documents that are allowed by
|
3806
3821
|
* this Filter.
|
3807
3822
|
*
|
@@ -3839,16 +3854,23 @@ Init_Filter(void)
|
|
3839
3854
|
* The type of the SortField is set by passing it as a parameter to the
|
3840
3855
|
* constructor. The +:auto+ type specifies that the SortField should detect
|
3841
3856
|
* the sort type by looking at the data in the field. This is the default
|
3842
|
-
* type
|
3843
|
-
*
|
3844
|
-
* a field with both numbers and strings (like a title field which might have
|
3845
|
-
* "24" and "Prison Break") then the sort_field will think it is sorting
|
3846
|
-
* integers when it really should sort by string.
|
3857
|
+
* :type value although it is recommended that you explicitly specify the
|
3858
|
+
* field's type.
|
3847
3859
|
*
|
3848
3860
|
* == Example
|
3849
3861
|
*
|
3850
3862
|
* title_sf = SortField.new(:title, :type => :string)
|
3851
3863
|
* rating_sf = SortField.new(:rating, :type => :float, :reverse => true)
|
3864
|
+
*
|
3865
|
+
*
|
3866
|
+
* Note 1: Care should be taken when using the :auto sort-type since numbers
|
3867
|
+
* will occur before other strings in the index so if you are sorting a field
|
3868
|
+
* with both numbers and strings (like a title field which might have "24"
|
3869
|
+
* and "Prison Break") then the sort_field will think it is sorting integers
|
3870
|
+
* when it really should be sorting strings.
|
3871
|
+
*
|
3872
|
+
* Note 2: When sorting by integer, integers are only 4 bytes so anything
|
3873
|
+
* larger will cause strange sorting behaviour.
|
3852
3874
|
*/
|
3853
3875
|
static void
|
3854
3876
|
Init_SortField(void)
|
@@ -3923,6 +3945,9 @@ Init_SortField(void)
|
|
3923
3945
|
* sf_rating = SortField.new(:rating, :type => :float, :reverse => true)
|
3924
3946
|
* sf_title = SortField.new(:title, :type => :string)
|
3925
3947
|
* sort = Sort.new([sf_rating, sf_title])
|
3948
|
+
*
|
3949
|
+
* Remember that the :type parameter for SortField is set to :auto by default
|
3950
|
+
* but I strongly recommend you specify a :type value.
|
3926
3951
|
*/
|
3927
3952
|
static void
|
3928
3953
|
Init_Sort(void)
|