ferret 0.9.6 → 0.10.0
Sign up to get free protection for your applications and to get access to all the features.
- data/MIT-LICENSE +1 -1
- data/README +12 -24
- data/Rakefile +38 -54
- data/TODO +14 -17
- data/ext/analysis.c +982 -823
- data/ext/analysis.h +133 -76
- data/ext/array.c +96 -58
- data/ext/array.h +40 -13
- data/ext/bitvector.c +476 -118
- data/ext/bitvector.h +264 -22
- data/ext/compound_io.c +217 -229
- data/ext/defines.h +49 -0
- data/ext/document.c +107 -317
- data/ext/document.h +31 -65
- data/ext/except.c +81 -36
- data/ext/except.h +117 -55
- data/ext/extconf.rb +2 -9
- data/ext/ferret.c +211 -104
- data/ext/ferret.h +22 -11
- data/ext/filter.c +97 -82
- data/ext/fs_store.c +348 -367
- data/ext/global.c +226 -188
- data/ext/global.h +44 -26
- data/ext/hash.c +474 -391
- data/ext/hash.h +441 -68
- data/ext/hashset.c +124 -96
- data/ext/hashset.h +169 -20
- data/ext/helper.c +56 -5
- data/ext/helper.h +7 -0
- data/ext/inc/lang.h +29 -49
- data/ext/inc/threading.h +31 -0
- data/ext/ind.c +288 -278
- data/ext/ind.h +68 -0
- data/ext/index.c +5688 -0
- data/ext/index.h +663 -616
- data/ext/lang.h +29 -49
- data/ext/libstemmer.c +3 -3
- data/ext/mem_pool.c +84 -0
- data/ext/mem_pool.h +35 -0
- data/ext/posh.c +1006 -0
- data/ext/posh.h +1007 -0
- data/ext/priorityqueue.c +117 -194
- data/ext/priorityqueue.h +135 -39
- data/ext/q_boolean.c +1305 -1108
- data/ext/q_const_score.c +106 -93
- data/ext/q_filtered_query.c +138 -135
- data/ext/q_fuzzy.c +206 -242
- data/ext/q_match_all.c +94 -80
- data/ext/q_multi_term.c +663 -0
- data/ext/q_parser.c +667 -593
- data/ext/q_phrase.c +992 -555
- data/ext/q_prefix.c +72 -61
- data/ext/q_range.c +235 -210
- data/ext/q_span.c +1480 -1166
- data/ext/q_term.c +273 -246
- data/ext/q_wildcard.c +127 -114
- data/ext/r_analysis.c +1720 -711
- data/ext/r_index.c +3049 -0
- data/ext/r_qparser.c +433 -146
- data/ext/r_search.c +2934 -1993
- data/ext/r_store.c +372 -143
- data/ext/r_utils.c +941 -0
- data/ext/ram_store.c +330 -326
- data/ext/search.c +1291 -668
- data/ext/search.h +403 -702
- data/ext/similarity.c +91 -113
- data/ext/similarity.h +45 -30
- data/ext/sort.c +721 -484
- data/ext/stopwords.c +361 -273
- data/ext/store.c +556 -58
- data/ext/store.h +706 -126
- data/ext/tags +3578 -2780
- data/ext/term_vectors.c +352 -0
- data/ext/threading.h +31 -0
- data/ext/win32.h +54 -0
- data/lib/ferret.rb +5 -17
- data/lib/ferret/document.rb +130 -2
- data/lib/ferret/index.rb +577 -26
- data/lib/ferret/number_tools.rb +157 -0
- data/lib/ferret_version.rb +3 -0
- data/test/test_helper.rb +5 -13
- data/test/unit/analysis/tc_analyzer.rb +513 -1
- data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
- data/test/unit/index/tc_index.rb +183 -240
- data/test/unit/index/tc_index_reader.rb +312 -479
- data/test/unit/index/tc_index_writer.rb +397 -13
- data/test/unit/index/th_doc.rb +269 -206
- data/test/unit/query_parser/tc_query_parser.rb +40 -33
- data/test/unit/search/tc_filter.rb +59 -71
- data/test/unit/search/tc_fuzzy_query.rb +24 -16
- data/test/unit/search/tc_index_searcher.rb +23 -201
- data/test/unit/search/tc_multi_searcher.rb +78 -226
- data/test/unit/search/tc_search_and_sort.rb +93 -81
- data/test/unit/search/tc_sort.rb +23 -23
- data/test/unit/search/tc_sort_field.rb +7 -7
- data/test/unit/search/tc_spans.rb +51 -47
- data/test/unit/search/tm_searcher.rb +339 -0
- data/test/unit/store/tc_fs_store.rb +1 -1
- data/test/unit/store/tm_store_lock.rb +3 -3
- data/test/unit/tc_document.rb +81 -0
- data/test/unit/ts_analysis.rb +1 -1
- data/test/unit/ts_utils.rb +1 -1
- data/test/unit/utils/tc_bit_vector.rb +288 -0
- data/test/unit/utils/tc_number_tools.rb +117 -0
- data/test/unit/utils/tc_priority_queue.rb +106 -0
- metadata +140 -301
- data/CHANGELOG +0 -9
- data/ext/dummy.exe +0 -0
- data/ext/field.c +0 -408
- data/ext/frtio.h +0 -13
- data/ext/inc/except.h +0 -90
- data/ext/index_io.c +0 -382
- data/ext/index_rw.c +0 -2658
- data/ext/lang.c +0 -41
- data/ext/nix_io.c +0 -134
- data/ext/q_multi_phrase.c +0 -380
- data/ext/r_doc.c +0 -582
- data/ext/r_index_io.c +0 -1021
- data/ext/r_term.c +0 -219
- data/ext/term.c +0 -820
- data/ext/termdocs.c +0 -611
- data/ext/vector.c +0 -637
- data/ext/w32_io.c +0 -150
- data/lib/ferret/analysis.rb +0 -11
- data/lib/ferret/analysis/analyzers.rb +0 -112
- data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
- data/lib/ferret/analysis/token.rb +0 -100
- data/lib/ferret/analysis/token_filters.rb +0 -86
- data/lib/ferret/analysis/token_stream.rb +0 -26
- data/lib/ferret/analysis/tokenizers.rb +0 -112
- data/lib/ferret/analysis/word_list_loader.rb +0 -27
- data/lib/ferret/document/document.rb +0 -152
- data/lib/ferret/document/field.rb +0 -312
- data/lib/ferret/index/compound_file_io.rb +0 -338
- data/lib/ferret/index/document_writer.rb +0 -289
- data/lib/ferret/index/field_infos.rb +0 -279
- data/lib/ferret/index/fields_io.rb +0 -181
- data/lib/ferret/index/index.rb +0 -675
- data/lib/ferret/index/index_file_names.rb +0 -33
- data/lib/ferret/index/index_reader.rb +0 -503
- data/lib/ferret/index/index_writer.rb +0 -534
- data/lib/ferret/index/multi_reader.rb +0 -377
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
- data/lib/ferret/index/segment_infos.rb +0 -130
- data/lib/ferret/index/segment_merge_info.rb +0 -49
- data/lib/ferret/index/segment_merge_queue.rb +0 -16
- data/lib/ferret/index/segment_merger.rb +0 -358
- data/lib/ferret/index/segment_reader.rb +0 -412
- data/lib/ferret/index/segment_term_enum.rb +0 -169
- data/lib/ferret/index/segment_term_vector.rb +0 -58
- data/lib/ferret/index/term.rb +0 -53
- data/lib/ferret/index/term_buffer.rb +0 -83
- data/lib/ferret/index/term_doc_enum.rb +0 -291
- data/lib/ferret/index/term_enum.rb +0 -52
- data/lib/ferret/index/term_info.rb +0 -37
- data/lib/ferret/index/term_infos_io.rb +0 -321
- data/lib/ferret/index/term_vector_offset_info.rb +0 -20
- data/lib/ferret/index/term_vectors_io.rb +0 -553
- data/lib/ferret/query_parser.rb +0 -312
- data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
- data/lib/ferret/search.rb +0 -50
- data/lib/ferret/search/boolean_clause.rb +0 -100
- data/lib/ferret/search/boolean_query.rb +0 -299
- data/lib/ferret/search/boolean_scorer.rb +0 -294
- data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
- data/lib/ferret/search/conjunction_scorer.rb +0 -99
- data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
- data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
- data/lib/ferret/search/explanation.rb +0 -41
- data/lib/ferret/search/field_cache.rb +0 -215
- data/lib/ferret/search/field_doc.rb +0 -31
- data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
- data/lib/ferret/search/filter.rb +0 -11
- data/lib/ferret/search/filtered_query.rb +0 -130
- data/lib/ferret/search/filtered_term_enum.rb +0 -79
- data/lib/ferret/search/fuzzy_query.rb +0 -154
- data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
- data/lib/ferret/search/hit_collector.rb +0 -34
- data/lib/ferret/search/hit_queue.rb +0 -11
- data/lib/ferret/search/index_searcher.rb +0 -200
- data/lib/ferret/search/match_all_query.rb +0 -104
- data/lib/ferret/search/multi_phrase_query.rb +0 -216
- data/lib/ferret/search/multi_searcher.rb +0 -261
- data/lib/ferret/search/multi_term_query.rb +0 -65
- data/lib/ferret/search/non_matching_scorer.rb +0 -22
- data/lib/ferret/search/phrase_positions.rb +0 -55
- data/lib/ferret/search/phrase_query.rb +0 -214
- data/lib/ferret/search/phrase_scorer.rb +0 -152
- data/lib/ferret/search/prefix_query.rb +0 -54
- data/lib/ferret/search/query.rb +0 -140
- data/lib/ferret/search/query_filter.rb +0 -51
- data/lib/ferret/search/range_filter.rb +0 -103
- data/lib/ferret/search/range_query.rb +0 -139
- data/lib/ferret/search/req_excl_scorer.rb +0 -125
- data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
- data/lib/ferret/search/score_doc.rb +0 -38
- data/lib/ferret/search/score_doc_comparator.rb +0 -114
- data/lib/ferret/search/scorer.rb +0 -91
- data/lib/ferret/search/similarity.rb +0 -278
- data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
- data/lib/ferret/search/sort.rb +0 -112
- data/lib/ferret/search/sort_comparator.rb +0 -60
- data/lib/ferret/search/sort_field.rb +0 -91
- data/lib/ferret/search/spans.rb +0 -12
- data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
- data/lib/ferret/search/spans/span_first_query.rb +0 -79
- data/lib/ferret/search/spans/span_near_query.rb +0 -108
- data/lib/ferret/search/spans/span_not_query.rb +0 -130
- data/lib/ferret/search/spans/span_or_query.rb +0 -176
- data/lib/ferret/search/spans/span_query.rb +0 -25
- data/lib/ferret/search/spans/span_scorer.rb +0 -74
- data/lib/ferret/search/spans/span_term_query.rb +0 -105
- data/lib/ferret/search/spans/span_weight.rb +0 -84
- data/lib/ferret/search/spans/spans_enum.rb +0 -44
- data/lib/ferret/search/term_query.rb +0 -128
- data/lib/ferret/search/term_scorer.rb +0 -183
- data/lib/ferret/search/top_docs.rb +0 -36
- data/lib/ferret/search/top_field_docs.rb +0 -17
- data/lib/ferret/search/weight.rb +0 -54
- data/lib/ferret/search/wildcard_query.rb +0 -26
- data/lib/ferret/search/wildcard_term_enum.rb +0 -61
- data/lib/ferret/stemmers.rb +0 -1
- data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
- data/lib/ferret/store.rb +0 -5
- data/lib/ferret/store/buffered_index_io.rb +0 -190
- data/lib/ferret/store/directory.rb +0 -141
- data/lib/ferret/store/fs_store.rb +0 -381
- data/lib/ferret/store/index_io.rb +0 -245
- data/lib/ferret/store/ram_store.rb +0 -286
- data/lib/ferret/utils.rb +0 -8
- data/lib/ferret/utils/bit_vector.rb +0 -123
- data/lib/ferret/utils/date_tools.rb +0 -138
- data/lib/ferret/utils/number_tools.rb +0 -91
- data/lib/ferret/utils/parameter.rb +0 -41
- data/lib/ferret/utils/priority_queue.rb +0 -120
- data/lib/ferret/utils/string_helper.rb +0 -47
- data/lib/ferret/utils/thread_local.rb +0 -28
- data/lib/ferret/utils/weak_key_hash.rb +0 -60
- data/lib/rferret.rb +0 -37
- data/rake_utils/code_statistics.rb +0 -106
- data/test/benchmark/tb_ram_store.rb +0 -76
- data/test/benchmark/tb_rw_vint.rb +0 -26
- data/test/functional/thread_safety_index_test.rb +0 -81
- data/test/functional/thread_safety_test.rb +0 -137
- data/test/longrunning/tc_numbertools.rb +0 -60
- data/test/longrunning/tm_store.rb +0 -19
- data/test/unit/analysis/ctc_analyzer.rb +0 -532
- data/test/unit/analysis/data/wordfile +0 -6
- data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
- data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
- data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
- data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_stop_filter.rb +0 -14
- data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
- data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
- data/test/unit/analysis/tc_token.rb +0 -25
- data/test/unit/document/rtc_field.rb +0 -28
- data/test/unit/document/tc_document.rb +0 -47
- data/test/unit/document/tc_field.rb +0 -98
- data/test/unit/index/rtc_compound_file_io.rb +0 -107
- data/test/unit/index/rtc_field_infos.rb +0 -127
- data/test/unit/index/rtc_fields_io.rb +0 -167
- data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
- data/test/unit/index/rtc_segment_infos.rb +0 -74
- data/test/unit/index/rtc_segment_term_docs.rb +0 -17
- data/test/unit/index/rtc_segment_term_enum.rb +0 -60
- data/test/unit/index/rtc_segment_term_vector.rb +0 -71
- data/test/unit/index/rtc_term_buffer.rb +0 -57
- data/test/unit/index/rtc_term_info.rb +0 -19
- data/test/unit/index/rtc_term_infos_io.rb +0 -192
- data/test/unit/index/rtc_term_vectors_io.rb +0 -108
- data/test/unit/index/tc_term.rb +0 -27
- data/test/unit/index/tc_term_voi.rb +0 -18
- data/test/unit/search/rtc_similarity.rb +0 -37
- data/test/unit/search/rtc_sort_field.rb +0 -14
- data/test/unit/search/tc_multi_searcher2.rb +0 -126
- data/test/unit/store/rtc_fs_store.rb +0 -62
- data/test/unit/store/rtc_ram_store.rb +0 -15
- data/test/unit/store/rtm_store.rb +0 -150
- data/test/unit/store/rtm_store_lock.rb +0 -2
- data/test/unit/ts_document.rb +0 -2
- data/test/unit/utils/rtc_bit_vector.rb +0 -73
- data/test/unit/utils/rtc_date_tools.rb +0 -50
- data/test/unit/utils/rtc_number_tools.rb +0 -59
- data/test/unit/utils/rtc_parameter.rb +0 -40
- data/test/unit/utils/rtc_priority_queue.rb +0 -62
- data/test/unit/utils/rtc_string_helper.rb +0 -21
- data/test/unit/utils/rtc_thread.rb +0 -61
- data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
- data/test/utils/number_to_spoken.rb +0 -132
data/ext/search.c
CHANGED
@@ -1,8 +1,6 @@
|
|
1
1
|
#include <string.h>
|
2
2
|
#include "search.h"
|
3
|
-
|
4
|
-
static char * const NUM_DOCS_ARG_ERROR_MSG = "num_docs must be > 0 to run a search";
|
5
|
-
static char * const FIRST_DOC_ARG_ERROR_MSG = "first_doc must be >= 0 to run a search";
|
3
|
+
#include "array.h"
|
6
4
|
|
7
5
|
/***************************************************************************
|
8
6
|
*
|
@@ -10,67 +8,65 @@ static char * const FIRST_DOC_ARG_ERROR_MSG = "first_doc must be >= 0 to run a s
|
|
10
8
|
*
|
11
9
|
***************************************************************************/
|
12
10
|
|
13
|
-
Explanation *
|
11
|
+
Explanation *expl_new(float value, const char *description, ...)
|
14
12
|
{
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
13
|
+
Explanation *expl = ALLOC(Explanation);
|
14
|
+
|
15
|
+
va_list args;
|
16
|
+
va_start(args, description);
|
17
|
+
expl->description = vstrfmt(description, args);
|
18
|
+
va_end(args);
|
19
|
+
|
20
|
+
expl->value = value;
|
21
|
+
expl->details = ary_new_type_capa(Explanation *,
|
22
|
+
EXPLANATION_DETAILS_START_SIZE);
|
23
|
+
return expl;
|
22
24
|
}
|
23
25
|
|
24
|
-
void
|
26
|
+
void expl_destroy(Explanation *expl)
|
25
27
|
{
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
expl_destoy(expl->details[i]);
|
30
|
-
}
|
31
|
-
free(expl->details);
|
32
|
-
free(expl->description);
|
33
|
-
free(expl);
|
28
|
+
ary_destroy((void **)expl->details, (free_ft)expl_destroy);
|
29
|
+
free(expl->description);
|
30
|
+
free(expl);
|
34
31
|
}
|
35
32
|
|
36
|
-
Explanation *expl_add_detail(Explanation *
|
33
|
+
Explanation *expl_add_detail(Explanation *expl, Explanation *detail)
|
37
34
|
{
|
38
|
-
|
39
|
-
|
40
|
-
REALLOC_N(self->details, Explanation *, self->dcapa);
|
41
|
-
}
|
42
|
-
self->details[self->dcnt] = detail;
|
43
|
-
self->dcnt++;
|
44
|
-
return self;
|
35
|
+
ary_push(expl->details, detail);
|
36
|
+
return expl;
|
45
37
|
}
|
46
38
|
|
47
|
-
char *
|
39
|
+
char *expl_to_s_depth(Explanation *expl, int depth)
|
48
40
|
{
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
41
|
+
int i;
|
42
|
+
char *buffer = ALLOC_N(char, depth * 2 + 1);
|
43
|
+
const int num_details = ary_size(expl->details);
|
44
|
+
|
45
|
+
memset(buffer, ' ', sizeof(char) * depth * 2);
|
46
|
+
buffer[depth*2] = 0;
|
53
47
|
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
48
|
+
buffer = estrcat(buffer, strfmt("%f = %s\n", expl->value, expl->description));
|
49
|
+
for (i = 0; i < num_details; i++) {
|
50
|
+
buffer = estrcat(buffer, expl_to_s_depth(expl->details[i], depth + 1));
|
51
|
+
}
|
58
52
|
|
59
|
-
|
53
|
+
return buffer;
|
60
54
|
}
|
61
55
|
|
62
|
-
char *expl_to_html(Explanation *
|
56
|
+
char *expl_to_html(Explanation *expl)
|
63
57
|
{
|
64
|
-
|
65
|
-
|
66
|
-
|
58
|
+
int i;
|
59
|
+
char *buffer;
|
60
|
+
const int num_details = ary_size(expl->details);
|
61
|
+
|
62
|
+
buffer = strfmt("<ul>\n<li>%f = %s</li>\n", expl->value, expl->description);
|
67
63
|
|
68
|
-
|
69
|
-
|
70
|
-
|
64
|
+
for (i = 0; i < num_details; i++) {
|
65
|
+
estrcat(buffer, expl_to_html(expl->details[i]));
|
66
|
+
}
|
71
67
|
|
72
|
-
|
73
|
-
|
68
|
+
REALLOC_N(buffer, char, strlen(buffer) + 10);
|
69
|
+
return strcat(buffer, "</ul>\n");
|
74
70
|
}
|
75
71
|
|
76
72
|
/***************************************************************************
|
@@ -79,88 +75,104 @@ char *expl_to_html(Explanation *self)
|
|
79
75
|
*
|
80
76
|
***************************************************************************/
|
81
77
|
|
82
|
-
bool hit_less_than(
|
78
|
+
static bool hit_less_than(const Hit *hit1, const Hit *hit2)
|
83
79
|
{
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
80
|
+
if (hit1->score == hit2->score) {
|
81
|
+
return hit1->doc > hit2->doc;
|
82
|
+
}
|
83
|
+
else {
|
84
|
+
return hit1->score < hit1->score;
|
85
|
+
}
|
89
86
|
}
|
90
87
|
|
91
|
-
|
88
|
+
static bool hit_lt(Hit *hit1, Hit *hit2)
|
92
89
|
{
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
90
|
+
if (hit1->score == hit2->score) {
|
91
|
+
return hit1->doc > hit2->doc;
|
92
|
+
}
|
93
|
+
else {
|
94
|
+
return hit1->score < hit2->score;
|
95
|
+
}
|
98
96
|
}
|
99
97
|
|
100
|
-
void hit_pq_down(PriorityQueue *pq)
|
98
|
+
static void hit_pq_down(PriorityQueue *pq)
|
101
99
|
{
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
100
|
+
register int i = 1;
|
101
|
+
register int j = 2; /* i << 1; */
|
102
|
+
register int k = 3; /* j + 1; */
|
103
|
+
Hit **heap = (Hit **)pq->heap;
|
104
|
+
Hit *node = heap[i]; /* save top node */
|
107
105
|
|
108
|
-
|
109
|
-
|
106
|
+
if ((k <= pq->size) && hit_lt(heap[k], heap[j])) {
|
107
|
+
j = k;
|
108
|
+
}
|
110
109
|
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
110
|
+
while ((j <= pq->size) && hit_lt(heap[j], node)) {
|
111
|
+
heap[i] = heap[j]; /* shift up child */
|
112
|
+
i = j;
|
113
|
+
j = i << 1;
|
114
|
+
k = j + 1;
|
115
|
+
if ((k <= pq->size) && hit_lt(heap[k], heap[j])) {
|
116
|
+
j = k;
|
117
|
+
}
|
118
|
+
}
|
119
|
+
heap[i] = node;
|
120
120
|
}
|
121
121
|
|
122
|
-
Hit *hit_pq_pop(PriorityQueue *pq)
|
122
|
+
static Hit *hit_pq_pop(PriorityQueue *pq)
|
123
123
|
{
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
124
|
+
if (pq->size > 0) {
|
125
|
+
Hit *result = (Hit *)pq->heap[1]; /* save first value */
|
126
|
+
pq->heap[1] = pq->heap[pq->size]; /* move last to first */
|
127
|
+
pq->heap[pq->size] = NULL;
|
128
|
+
pq->size--;
|
129
|
+
hit_pq_down(pq); /* adjust heap */
|
130
|
+
return result;
|
131
|
+
}
|
132
|
+
else {
|
133
|
+
return NULL;
|
134
|
+
}
|
134
135
|
}
|
135
136
|
|
136
|
-
|
137
|
+
static void hit_pq_up(PriorityQueue *pq)
|
137
138
|
{
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
139
|
+
Hit **heap = (Hit **)pq->heap;
|
140
|
+
Hit *node;
|
141
|
+
int i = pq->size;
|
142
|
+
int j = i >> 1;
|
143
|
+
node = heap[i];
|
144
|
+
|
145
|
+
while ((j > 0) && hit_lt(node, heap[j])) {
|
146
|
+
heap[i] = heap[j];
|
147
|
+
i = j;
|
148
|
+
j = j >> 1;
|
149
|
+
}
|
150
|
+
heap[i] = node;
|
151
|
+
}
|
143
152
|
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
153
|
+
static void hit_pq_insert(PriorityQueue *pq, Hit *hit)
|
154
|
+
{
|
155
|
+
if (pq->size < pq->capa) {
|
156
|
+
Hit *new_hit = ALLOC(Hit);
|
157
|
+
memcpy(new_hit, hit, sizeof(Hit));
|
158
|
+
pq->size++;
|
159
|
+
if (pq->size >= pq->mem_capa) {
|
160
|
+
pq->mem_capa <<= 1;
|
161
|
+
REALLOC_N(pq->heap, void *, pq->mem_capa);
|
162
|
+
}
|
163
|
+
pq->heap[pq->size] = new_hit;
|
164
|
+
hit_pq_up(pq);
|
165
|
+
}
|
166
|
+
else if (pq->size > 0 && hit_lt((Hit *)pq->heap[1], hit)) {
|
167
|
+
memcpy(pq->heap[1], hit, sizeof(Hit));
|
168
|
+
hit_pq_down(pq);
|
169
|
+
}
|
150
170
|
}
|
151
171
|
|
152
|
-
void
|
172
|
+
static void hit_pq_multi_insert(PriorityQueue *pq, Hit *hit)
|
153
173
|
{
|
154
|
-
|
155
|
-
|
156
|
-
memcpy(new_hit, hit, sizeof(Hit));
|
157
|
-
pq->count++;
|
158
|
-
pq->heap[pq->count] = new_hit;
|
159
|
-
hit_pq_up(pq);
|
160
|
-
} else if (pq->count > 0 && hit_lt((Hit *)pq->heap[1], hit)) {
|
161
|
-
memcpy(pq->heap[1], hit, sizeof(Hit));
|
162
|
-
hit_pq_down(pq);
|
163
|
-
}
|
174
|
+
hit_pq_insert(pq, hit);
|
175
|
+
free(hit);
|
164
176
|
}
|
165
177
|
|
166
178
|
/***************************************************************************
|
@@ -169,35 +181,38 @@ void hit_pq_insert(PriorityQueue *pq, Hit *hit)
|
|
169
181
|
*
|
170
182
|
***************************************************************************/
|
171
183
|
|
172
|
-
TopDocs *
|
184
|
+
TopDocs *td_new(int total_hits, int size, Hit **hits, float max_score)
|
173
185
|
{
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
186
|
+
TopDocs *td = ALLOC(TopDocs);
|
187
|
+
td->total_hits = total_hits;
|
188
|
+
td->size = size;
|
189
|
+
td->hits = hits;
|
190
|
+
td->max_score = max_score;
|
191
|
+
return td;
|
179
192
|
}
|
180
193
|
|
181
194
|
void td_destroy(TopDocs *td)
|
182
195
|
{
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
196
|
+
int i;
|
197
|
+
|
198
|
+
for (i = 0; i < td->size; i++) {
|
199
|
+
free(td->hits[i]);
|
200
|
+
}
|
201
|
+
free(td->hits);
|
202
|
+
free(td);
|
189
203
|
}
|
190
204
|
|
191
205
|
char *td_to_s(TopDocs *td)
|
192
206
|
{
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
207
|
+
int i;
|
208
|
+
Hit *hit;
|
209
|
+
char *buffer = strfmt("%d hits sorted by <score, doc_num>\n",
|
210
|
+
td->total_hits);
|
211
|
+
for (i = 0; i < td->size; i++) {
|
212
|
+
hit = td->hits[i];
|
213
|
+
estrcat(buffer, strfmt("\t%d:%f\n", hit->doc, hit->score));
|
214
|
+
}
|
215
|
+
return buffer;
|
201
216
|
}
|
202
217
|
|
203
218
|
/***************************************************************************
|
@@ -208,44 +223,50 @@ char *td_to_s(TopDocs *td)
|
|
208
223
|
|
209
224
|
Query *w_get_query(Weight *self)
|
210
225
|
{
|
211
|
-
|
226
|
+
return self->query;
|
212
227
|
}
|
213
228
|
|
214
229
|
float w_get_value(Weight *self)
|
215
230
|
{
|
216
|
-
|
231
|
+
return self->value;
|
217
232
|
}
|
218
233
|
|
219
234
|
float w_sum_of_squared_weights(Weight *self)
|
220
235
|
{
|
221
|
-
|
222
|
-
|
236
|
+
self->qweight = self->idf * self->query->boost;
|
237
|
+
return self->qweight * self->qweight; /* square it */
|
223
238
|
}
|
224
239
|
|
225
240
|
void w_normalize(Weight *self, float normalization_factor)
|
226
241
|
{
|
227
|
-
|
228
|
-
|
229
|
-
|
242
|
+
self->qnorm = normalization_factor;
|
243
|
+
self->qweight *= normalization_factor; /* normalize query weight */
|
244
|
+
self->value = self->qweight * self->idf;/* idf for document */
|
230
245
|
}
|
231
246
|
|
232
247
|
void w_destroy(Weight *self)
|
233
248
|
{
|
234
|
-
|
235
|
-
|
249
|
+
q_deref(self->query);
|
250
|
+
free(self);
|
236
251
|
}
|
237
252
|
|
238
|
-
Weight *w_create(Query *query)
|
253
|
+
Weight *w_create(size_t size, Query *query)
|
239
254
|
{
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
255
|
+
Weight *self = (Weight *)ecalloc(size);
|
256
|
+
#ifdef DEBUG
|
257
|
+
if (size < sizeof(Weight)) {
|
258
|
+
RAISE(ERROR, "size of weight <%d> should be at least <%d>",
|
259
|
+
(int)size, (int)sizeof(Weight));
|
260
|
+
}
|
261
|
+
#endif
|
262
|
+
REF(query);
|
263
|
+
self->query = query;
|
264
|
+
self->get_query = &w_get_query;
|
265
|
+
self->get_value = &w_get_value;
|
266
|
+
self->normalize = &w_normalize;
|
267
|
+
self->destroy = &w_destroy;
|
268
|
+
self->sum_of_squared_weights = &w_sum_of_squared_weights;
|
269
|
+
return self;
|
249
270
|
}
|
250
271
|
|
251
272
|
/***************************************************************************
|
@@ -254,128 +275,181 @@ Weight *w_create(Query *query)
|
|
254
275
|
*
|
255
276
|
***************************************************************************/
|
256
277
|
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
278
|
+
static const char *QUERY_NAMES[] = {
|
279
|
+
"TermQuery",
|
280
|
+
"MultiTermQuery",
|
281
|
+
"BooleanQuery",
|
282
|
+
"PhraseQuery",
|
283
|
+
"MultiPhraseQuery",
|
284
|
+
"ConstantScoreQuery",
|
285
|
+
"FilteredQuery",
|
286
|
+
"MatchAllQuery",
|
287
|
+
"RangeQuery",
|
288
|
+
"WildCardQuery",
|
289
|
+
"FuzzyQuery",
|
290
|
+
"PrefixQuery",
|
291
|
+
"SpanTermQuery",
|
292
|
+
"SpanFirstQuery",
|
293
|
+
"SpanOrQuery",
|
294
|
+
"SpanNotQuery",
|
295
|
+
"SpanNearQuery"
|
296
|
+
};
|
261
297
|
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
298
|
+
static const char *UNKNOWN_QUERY_NAME = "UnkownQuery";
|
299
|
+
|
300
|
+
const char *q_get_query_name(enum QUERY_TYPE type) {
|
301
|
+
if (type >= NELEMS(QUERY_NAMES)) {
|
302
|
+
return UNKNOWN_QUERY_NAME;
|
303
|
+
}
|
304
|
+
else {
|
305
|
+
return QUERY_NAMES[type];
|
306
|
+
}
|
266
307
|
}
|
267
308
|
|
268
|
-
|
309
|
+
static Query *q_rewrite(Query *self, IndexReader *ir)
|
269
310
|
{
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
Similarity *sim = query->get_similarity(query, searcher);
|
274
|
-
float norm = sim_query_norm(sim, sum);
|
275
|
-
q_deref(query);
|
276
|
-
|
277
|
-
weight->normalize(weight, norm);
|
278
|
-
return self->weight = weight;
|
311
|
+
(void)ir;
|
312
|
+
self->ref_cnt++;
|
313
|
+
return self;
|
279
314
|
}
|
280
315
|
|
281
|
-
|
316
|
+
static void q_extract_terms(Query *self, HashSet *terms)
|
282
317
|
{
|
283
|
-
|
284
|
-
|
318
|
+
/* do nothing by default */
|
319
|
+
(void)self;
|
320
|
+
(void)terms;
|
285
321
|
}
|
286
322
|
|
287
|
-
|
323
|
+
Similarity *q_get_similarity_i(Query *self, Searcher *searcher)
|
288
324
|
{
|
289
|
-
|
325
|
+
(void)self;
|
326
|
+
return searcher->get_similarity(searcher);
|
290
327
|
}
|
291
328
|
|
292
|
-
void
|
329
|
+
void q_destroy_i(Query *self)
|
293
330
|
{
|
294
|
-
|
331
|
+
free(self);
|
295
332
|
}
|
296
333
|
|
297
334
|
void q_deref(Query *self)
|
298
335
|
{
|
299
|
-
|
300
|
-
|
301
|
-
|
336
|
+
if (--(self->ref_cnt) == 0) {
|
337
|
+
self->destroy_i(self);
|
338
|
+
}
|
302
339
|
}
|
303
340
|
|
304
|
-
Query *
|
341
|
+
Weight *q_create_weight_unsup(Query *self, Searcher *searcher)
|
305
342
|
{
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
self->get_similarity = &q_get_similarity_i;
|
312
|
-
self->extract_terms = &q_extract_terms;
|
313
|
-
self->weight = NULL;
|
314
|
-
self->ref_cnt = 1;
|
315
|
-
return self;
|
343
|
+
(void)self;
|
344
|
+
(void)searcher;
|
345
|
+
RAISE(UNSUPPORTED_ERROR,
|
346
|
+
"Create weight is unsupported for this type of query");
|
347
|
+
return NULL;
|
316
348
|
}
|
317
349
|
|
318
|
-
|
350
|
+
Weight *q_weight(Query *self, Searcher *searcher)
|
319
351
|
{
|
320
|
-
|
321
|
-
|
352
|
+
Query *query = searcher->rewrite(searcher, self);
|
353
|
+
Weight *weight = query->create_weight_i(query, searcher);
|
354
|
+
float sum = weight->sum_of_squared_weights(weight);
|
355
|
+
Similarity *sim = query->get_similarity(query, searcher);
|
356
|
+
float norm = sim_query_norm(sim, sum);
|
357
|
+
q_deref(query);
|
322
358
|
|
323
|
-
|
324
|
-
|
325
|
-
return (self == o) || ((self->type == o->type) &&
|
326
|
-
(self->boost == o->boost) &&
|
327
|
-
self->eq(self, o));
|
359
|
+
weight->normalize(weight, norm);
|
360
|
+
return self->weight = weight;
|
328
361
|
}
|
329
362
|
|
363
|
+
#define BQ(query) ((BooleanQuery *)(query))
|
330
364
|
Query *q_combine(Query **queries, int q_cnt)
|
331
365
|
{
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
366
|
+
int i;
|
367
|
+
Query *q, *ret_q;
|
368
|
+
HashSet *uniques = hs_new((hash_ft)&q_hash, (eq_ft)&q_eq, NULL);
|
369
|
+
|
370
|
+
for (i = 0; i < q_cnt; i++) {
|
371
|
+
q = queries[i];
|
372
|
+
if (q->type == BOOLEAN_QUERY) {
|
373
|
+
int j;
|
374
|
+
bool splittable = true;
|
375
|
+
if (BQ(q)->coord_disabled == false) {
|
376
|
+
splittable = false;
|
377
|
+
}
|
378
|
+
else {
|
379
|
+
for (j = 0; j < BQ(q)->clause_cnt; j++) {
|
380
|
+
if (BQ(q)->clauses[j]->occur != BC_SHOULD) {
|
381
|
+
splittable = false;
|
382
|
+
break;
|
383
|
+
}
|
384
|
+
}
|
385
|
+
}
|
386
|
+
if (splittable) {
|
387
|
+
for (j = 0; j < BQ(q)->clause_cnt; j++) {
|
388
|
+
Query *sub_q = BQ(q)->clauses[j]->query;
|
389
|
+
hs_add(uniques, sub_q);
|
390
|
+
}
|
391
|
+
}
|
392
|
+
else {
|
393
|
+
hs_add(uniques, q);
|
394
|
+
}
|
351
395
|
}
|
352
|
-
|
353
|
-
|
354
|
-
for (j = 0; j < bq->clause_cnt; j++) {
|
355
|
-
q = bq->clauses[j]->query;
|
356
|
-
hs_add(uniques, q);
|
396
|
+
else {
|
397
|
+
hs_add(uniques, q);
|
357
398
|
}
|
358
|
-
} else {
|
359
|
-
hs_add(uniques, q);
|
360
|
-
}
|
361
|
-
} else {
|
362
|
-
hs_add(uniques, q);
|
363
399
|
}
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
400
|
+
if (uniques->size == 1) {
|
401
|
+
ret_q = (Query *)uniques->elems[0];
|
402
|
+
REF(ret_q);
|
403
|
+
}
|
404
|
+
else {
|
405
|
+
ret_q = bq_new(true);
|
406
|
+
for (i = 0; i < uniques->size; i++) {
|
407
|
+
q = (Query *)uniques->elems[i];
|
408
|
+
bq_add_query(ret_q, q, BC_SHOULD);
|
409
|
+
}
|
374
410
|
}
|
375
|
-
|
376
|
-
hs_destroy(uniques);
|
411
|
+
hs_destroy(uniques);
|
377
412
|
|
378
|
-
|
413
|
+
return ret_q;
|
414
|
+
}
|
415
|
+
|
416
|
+
ulong q_hash(Query *self)
|
417
|
+
{
|
418
|
+
return (self->hash(self) << 5) | self->type;
|
419
|
+
}
|
420
|
+
|
421
|
+
int q_eq(Query *self, Query *o)
|
422
|
+
{
|
423
|
+
return (self == o)
|
424
|
+
|| ((self->type == o->type)
|
425
|
+
&& (self->boost == o->boost)
|
426
|
+
&& self->eq(self, o));
|
427
|
+
}
|
428
|
+
|
429
|
+
static MatchVector *q_get_matchv_i(Query *self, MatchVector *mv, TermVector *tv)
|
430
|
+
{
|
431
|
+
/* be default we don't add any matches */
|
432
|
+
(void)self; (void)tv;
|
433
|
+
return mv;
|
434
|
+
}
|
435
|
+
|
436
|
+
Query *q_create(size_t size)
|
437
|
+
{
|
438
|
+
Query *self = (Query *)ecalloc(size);
|
439
|
+
#ifdef DEBUG
|
440
|
+
if (size < sizeof(Query)) {
|
441
|
+
RAISE(ERROR, "Size of a query <%d> should never be smaller than the "
|
442
|
+
"size of a Query struct <%d>", (int)size, (int)sizeof(Query));
|
443
|
+
}
|
444
|
+
#endif
|
445
|
+
self->boost = 1.0;
|
446
|
+
self->rewrite = &q_rewrite;
|
447
|
+
self->get_similarity = &q_get_similarity_i;
|
448
|
+
self->extract_terms = &q_extract_terms;
|
449
|
+
self->get_matchv_i = &q_get_matchv_i;
|
450
|
+
self->weight = NULL;
|
451
|
+
self->ref_cnt = 1;
|
452
|
+
return self;
|
379
453
|
}
|
380
454
|
|
381
455
|
/***************************************************************************
|
@@ -384,36 +458,154 @@ Query *q_combine(Query **queries, int q_cnt)
|
|
384
458
|
*
|
385
459
|
***************************************************************************/
|
386
460
|
|
387
|
-
void scorer_destroy_i(Scorer *
|
461
|
+
void scorer_destroy_i(Scorer *scorer)
|
388
462
|
{
|
389
|
-
|
390
|
-
free(self);
|
463
|
+
free(scorer);
|
391
464
|
}
|
392
465
|
|
393
|
-
Scorer *scorer_create(Similarity *similarity)
|
466
|
+
Scorer *scorer_create(size_t size, Similarity *similarity)
|
394
467
|
{
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
|
399
|
-
|
468
|
+
Scorer *self = (Scorer *)ecalloc(size);
|
469
|
+
#ifdef DEBUG
|
470
|
+
if (size < sizeof(Scorer)) {
|
471
|
+
RAISE(ERROR, "size of scorer <%d> should be at least <%d>",
|
472
|
+
(int)size, (int)sizeof(Scorer));
|
473
|
+
}
|
474
|
+
#endif
|
475
|
+
self->destroy = &scorer_destroy_i;
|
476
|
+
self->similarity = similarity;
|
477
|
+
return self;
|
400
478
|
}
|
401
479
|
|
402
480
|
bool scorer_less_than(void *p1, void *p2)
|
403
481
|
{
|
404
|
-
|
405
|
-
|
406
|
-
|
482
|
+
Scorer *s1 = (Scorer *)p1;
|
483
|
+
Scorer *s2 = (Scorer *)p2;
|
484
|
+
return s1->score(s1) < s2->score(s2);
|
407
485
|
}
|
408
486
|
|
409
|
-
bool scorer_doc_less_than(
|
487
|
+
bool scorer_doc_less_than(const Scorer *s1, const Scorer *s2)
|
410
488
|
{
|
411
|
-
|
489
|
+
return s1->doc < s2->doc;
|
412
490
|
}
|
413
491
|
|
414
492
|
int scorer_doc_cmp(const void *p1, const void *p2)
|
415
493
|
{
|
416
|
-
|
494
|
+
return (*(Scorer **)p1)->doc - (*(Scorer **)p2)->doc;
|
495
|
+
}
|
496
|
+
|
497
|
+
/***************************************************************************
|
498
|
+
*
|
499
|
+
* Highlighter
|
500
|
+
*
|
501
|
+
***************************************************************************/
|
502
|
+
|
503
|
+
/* ** MatchRange ** */
|
504
|
+
static int match_range_cmp(const void *p1, const void *p2)
|
505
|
+
{
|
506
|
+
int diff = ((MatchRange *)p1)->start - ((MatchRange *)p2)->start;
|
507
|
+
if (diff != 0) {
|
508
|
+
return diff;
|
509
|
+
}
|
510
|
+
else {
|
511
|
+
return ((MatchRange *)p2)->end - ((MatchRange *)p1)->end;
|
512
|
+
}
|
513
|
+
}
|
514
|
+
|
515
|
+
|
516
|
+
|
517
|
+
/* ** MatchVector ** */
|
518
|
+
MatchVector *matchv_new()
|
519
|
+
{
|
520
|
+
MatchVector *matchv = ALLOC(MatchVector);
|
521
|
+
|
522
|
+
matchv->size = 0;
|
523
|
+
matchv->capa = MATCH_VECTOR_INIT_CAPA;
|
524
|
+
matchv->matches = ALLOC_N(MatchRange, MATCH_VECTOR_INIT_CAPA);
|
525
|
+
|
526
|
+
return matchv;
|
527
|
+
}
|
528
|
+
|
529
|
+
MatchVector *matchv_add(MatchVector *self, int start, int end)
|
530
|
+
{
|
531
|
+
if (self->size >= self->capa) {
|
532
|
+
self->capa <<= 1;
|
533
|
+
REALLOC_N(self->matches, MatchRange, self->capa);
|
534
|
+
}
|
535
|
+
self->matches[self->size].start = start;
|
536
|
+
self->matches[self->size].end = end;
|
537
|
+
self->matches[self->size++].score = 1.0;
|
538
|
+
return self;
|
539
|
+
}
|
540
|
+
|
541
|
+
MatchVector *matchv_sort(MatchVector *self)
|
542
|
+
{
|
543
|
+
qsort(self->matches, self->size, sizeof(MatchRange), &match_range_cmp);
|
544
|
+
return self;
|
545
|
+
}
|
546
|
+
|
547
|
+
MatchVector *matchv_compact(MatchVector *self)
|
548
|
+
{
|
549
|
+
int left, right;
|
550
|
+
matchv_sort(self);
|
551
|
+
for (right = left = 0; right < self->size; right++) {
|
552
|
+
/* Note the end + 1. This compacts the ranges 3:5 and 6:8 into 3:8 */
|
553
|
+
if (self->matches[right].start > self->matches[left].end + 1) {
|
554
|
+
left++;
|
555
|
+
self->matches[left].start = self->matches[right].start;
|
556
|
+
self->matches[left].end = self->matches[right].end;
|
557
|
+
self->matches[left].score = self->matches[right].score;
|
558
|
+
}
|
559
|
+
else if (self->matches[right].end > self->matches[left].end) {
|
560
|
+
self->matches[left].end = self->matches[right].end;
|
561
|
+
}
|
562
|
+
else {
|
563
|
+
self->matches[left].score += self->matches[right].score;
|
564
|
+
}
|
565
|
+
}
|
566
|
+
self->size = left + 1;
|
567
|
+
return self;
|
568
|
+
}
|
569
|
+
|
570
|
+
MatchVector *matchv_compact_with_breaks(MatchVector *self)
|
571
|
+
{
|
572
|
+
int left, right;
|
573
|
+
matchv_sort(self);
|
574
|
+
for (right = left = 0; right < self->size; right++) {
|
575
|
+
/* Note: no end + 1. Unlike matchv_compact, this won't compact ranges 3:5 and 6:8 */
|
576
|
+
if (self->matches[right].start > self->matches[left].end) {
|
577
|
+
left++;
|
578
|
+
self->matches[left].start = self->matches[right].start;
|
579
|
+
self->matches[left].end = self->matches[right].end;
|
580
|
+
self->matches[left].score = self->matches[right].score;
|
581
|
+
}
|
582
|
+
else if (self->matches[right].end > self->matches[left].end) {
|
583
|
+
self->matches[left].end = self->matches[right].end;
|
584
|
+
self->matches[left].score += self->matches[right].score;
|
585
|
+
}
|
586
|
+
else if (right > left) {
|
587
|
+
self->matches[left].score += self->matches[right].score;
|
588
|
+
}
|
589
|
+
}
|
590
|
+
self->size = left + 1;
|
591
|
+
return self;
|
592
|
+
}
|
593
|
+
|
594
|
+
|
595
|
+
static MatchVector *matchv_set_offsets(MatchVector *mv, Offset *offsets)
|
596
|
+
{
|
597
|
+
int i;
|
598
|
+
for (i = 0; i < mv->size; i++) {
|
599
|
+
mv->matches[i].start_offset = offsets[mv->matches[i].start].start;
|
600
|
+
mv->matches[i].end_offset = offsets[mv->matches[i].end].end;
|
601
|
+
}
|
602
|
+
return mv;
|
603
|
+
}
|
604
|
+
|
605
|
+
void matchv_destroy(MatchVector *self)
|
606
|
+
{
|
607
|
+
free(self->matches);
|
608
|
+
free(self);
|
417
609
|
}
|
418
610
|
|
419
611
|
/***************************************************************************
|
@@ -422,211 +614,541 @@ int scorer_doc_cmp(const void *p1, const void *p2)
|
|
422
614
|
*
|
423
615
|
***************************************************************************/
|
424
616
|
|
425
|
-
|
617
|
+
MatchVector *searcher_get_match_vector(Searcher *self,
|
618
|
+
Query *query,
|
619
|
+
const int doc_num,
|
620
|
+
const char *field)
|
621
|
+
{
|
622
|
+
MatchVector *mv = matchv_new();
|
623
|
+
Query *rewritten_query = self->rewrite(self, query);
|
624
|
+
TermVector *tv = self->get_term_vector(self, doc_num, field);
|
625
|
+
if (tv && tv->term_cnt > 0 && tv->terms[0].positions != NULL) {
|
626
|
+
mv = rewritten_query->get_matchv_i(rewritten_query, mv, tv);
|
627
|
+
tv_destroy(tv);
|
628
|
+
}
|
629
|
+
q_deref(rewritten_query);
|
630
|
+
return mv;
|
631
|
+
}
|
632
|
+
|
633
|
+
typedef struct Excerpt
|
426
634
|
{
|
427
|
-
|
635
|
+
int start;
|
636
|
+
int end;
|
637
|
+
int start_pos;
|
638
|
+
int end_pos;
|
639
|
+
int start_offset;
|
640
|
+
int end_offset;
|
641
|
+
double score;
|
642
|
+
} Excerpt;
|
643
|
+
|
644
|
+
/*
|
645
|
+
static int excerpt_cmp(const void *p1, const void *p2)
|
646
|
+
{
|
647
|
+
double score1 = (*((Excerpt **)p1))->score;
|
648
|
+
double score2 = (*((Excerpt **)p2))->score;
|
649
|
+
if (score1 > score2) return 1;
|
650
|
+
if (score1 < score2) return -1;
|
651
|
+
return 0;
|
428
652
|
}
|
653
|
+
*/
|
429
654
|
|
430
|
-
static int
|
655
|
+
static int excerpt_start_cmp(const void *p1, const void *p2)
|
431
656
|
{
|
432
|
-
|
433
|
-
|
657
|
+
return (*((Excerpt **)p1))->start - (*((Excerpt **)p2))->start;
|
658
|
+
}
|
434
659
|
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
return freqs;
|
660
|
+
static int excerpt_lt(Excerpt *e1, Excerpt *e2)
|
661
|
+
{
|
662
|
+
return e1->score > e2->score; /* want the highest score at top */
|
439
663
|
}
|
440
664
|
|
441
|
-
static
|
665
|
+
static Excerpt *excerpt_new(int start, int end, double score)
|
442
666
|
{
|
443
|
-
|
444
|
-
|
667
|
+
Excerpt *excerpt = ALLOC_AND_ZERO(Excerpt);
|
668
|
+
excerpt->start = start;
|
669
|
+
excerpt->end = end;
|
670
|
+
excerpt->score = score;
|
671
|
+
return excerpt;
|
672
|
+
}
|
445
673
|
|
446
|
-
|
447
|
-
|
448
|
-
|
674
|
+
static Excerpt *excerpt_recalc_score(Excerpt *e, MatchVector *mv)
|
675
|
+
{
|
676
|
+
int i;
|
677
|
+
double score = 0.0;
|
678
|
+
for (i = e->start; i <= e->end; i++) {
|
679
|
+
score += mv->matches[i].score;
|
680
|
+
}
|
681
|
+
e->score = score;
|
682
|
+
return e;
|
683
|
+
}
|
449
684
|
|
450
|
-
|
685
|
+
/* expand an excerpt to its largest possible size */
|
686
|
+
static Excerpt *excerpt_expand(Excerpt *e, const int len, TermVector *tv)
|
687
|
+
{
|
688
|
+
Offset *offsets = tv->offsets;
|
689
|
+
int offset_cnt = tv->offset_cnt;
|
690
|
+
bool did_expansion = true;
|
691
|
+
int i;
|
692
|
+
/* fill in skipped offsets */
|
693
|
+
for (i = 1; i < offset_cnt; i++) {
|
694
|
+
if (offsets[i].start == 0) {
|
695
|
+
offsets[i].start = offsets[i-1].start;
|
696
|
+
}
|
697
|
+
if (offsets[i].end == 0) {
|
698
|
+
offsets[i].end = offsets[i-1].end;
|
699
|
+
}
|
700
|
+
}
|
701
|
+
|
702
|
+
while (did_expansion) {
|
703
|
+
did_expansion = false;
|
704
|
+
if (e->start_pos > 0
|
705
|
+
&& (e->end_offset - offsets[e->start_pos - 1].start) < len) {
|
706
|
+
e->start_pos--;
|
707
|
+
e->start_offset = offsets[e->start_pos].start;
|
708
|
+
did_expansion = true;
|
709
|
+
}
|
710
|
+
if (e->end_pos < (offset_cnt - 1)
|
711
|
+
&& (offsets[e->end_pos + 1].end - e->start_offset) < len) {
|
712
|
+
e->end_pos++;
|
713
|
+
e->end_offset = offsets[e->end_pos].end;
|
714
|
+
did_expansion = true;
|
715
|
+
}
|
716
|
+
}
|
717
|
+
return e;
|
718
|
+
}
|
719
|
+
|
720
|
+
static char *excerpt_get_str(Excerpt *e, MatchVector *mv,
|
721
|
+
LazyDocField *lazy_df,
|
722
|
+
const char *pre_tag,
|
723
|
+
const char *post_tag,
|
724
|
+
const char *ellipsis)
|
725
|
+
{
|
726
|
+
int i, len;
|
727
|
+
int last_offset = e->start_offset;
|
728
|
+
const int num_matches = e->end - e->start + 1;
|
729
|
+
const int pre_tag_len = (int)strlen(pre_tag);
|
730
|
+
const int post_tag_len = (int)strlen(post_tag);
|
731
|
+
const int ellipsis_len = (int)strlen(ellipsis);
|
732
|
+
char *excerpt_str = ALLOC_N(char,
|
733
|
+
10 + e->end_offset - e->start_offset
|
734
|
+
+ (num_matches * (pre_tag_len + post_tag_len))
|
735
|
+
+ (2 * ellipsis_len));
|
736
|
+
char *e_ptr = excerpt_str;
|
737
|
+
if (e->start_offset > 0) {
|
738
|
+
memcpy(e_ptr, ellipsis, ellipsis_len);
|
739
|
+
e_ptr += ellipsis_len;
|
740
|
+
}
|
741
|
+
for (i = e->start; i <= e->end; i++) {
|
742
|
+
MatchRange *mr = mv->matches + i;
|
743
|
+
len = mr->start_offset - last_offset;
|
744
|
+
lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
|
745
|
+
e_ptr += len;
|
746
|
+
memcpy(e_ptr, pre_tag, pre_tag_len);
|
747
|
+
e_ptr += pre_tag_len;
|
748
|
+
len = mr->end_offset - mr->start_offset;
|
749
|
+
lazy_df_get_bytes(lazy_df, e_ptr, mr->start_offset, len);
|
750
|
+
e_ptr += len;
|
751
|
+
memcpy(e_ptr, post_tag, post_tag_len);
|
752
|
+
e_ptr += post_tag_len;
|
753
|
+
last_offset = mr->end_offset;
|
754
|
+
}
|
755
|
+
len = e->end_offset - last_offset;
|
756
|
+
lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
|
757
|
+
e_ptr += len;
|
758
|
+
if (e->end_offset < lazy_df->len) {
|
759
|
+
memcpy(e_ptr, ellipsis, ellipsis_len);
|
760
|
+
e_ptr += ellipsis_len;
|
761
|
+
}
|
762
|
+
*e_ptr = '\0';
|
763
|
+
return excerpt_str;
|
764
|
+
}
|
765
|
+
|
766
|
+
char **searcher_highlight(Searcher *self,
|
767
|
+
Query *query,
|
768
|
+
const int doc_num,
|
769
|
+
const char *field,
|
770
|
+
const int excerpt_len,
|
771
|
+
const int num_excerpts,
|
772
|
+
const char *pre_tag,
|
773
|
+
const char *post_tag,
|
774
|
+
const char *ellipsis)
|
775
|
+
{
|
776
|
+
char **excerpt_strs = NULL;
|
777
|
+
TermVector *tv = self->get_term_vector(self, doc_num, field);
|
778
|
+
LazyDoc *lazy_doc = self->get_lazy_doc(self, doc_num);
|
779
|
+
LazyDocField *lazy_df = NULL;
|
780
|
+
if (lazy_doc) {
|
781
|
+
lazy_df = h_get(lazy_doc->field_dict, field);
|
782
|
+
}
|
783
|
+
if (tv && lazy_df && tv->term_cnt > 0 && tv->terms[0].positions != NULL
|
784
|
+
&& tv->offsets != NULL) {
|
785
|
+
MatchVector *mv;
|
786
|
+
query = self->rewrite(self, query);
|
787
|
+
mv = query->get_matchv_i(query, matchv_new(), tv);
|
788
|
+
if (mv->size > 0) {
|
789
|
+
Excerpt **excerpts = ALLOC_AND_ZERO_N(Excerpt *, num_excerpts);
|
790
|
+
int e_start, e_end, i, j;
|
791
|
+
MatchRange *matches = mv->matches;
|
792
|
+
double running_score = 0.0;
|
793
|
+
Offset *offsets = tv->offsets;
|
794
|
+
PriorityQueue *excerpt_pq;
|
795
|
+
|
796
|
+
matchv_compact_with_breaks(mv);
|
797
|
+
matchv_set_offsets(mv, offsets);
|
798
|
+
excerpt_pq = pq_new(mv->size, (lt_ft)&excerpt_lt, &free);
|
799
|
+
/* add all possible excerpts to the priority queue */
|
800
|
+
|
801
|
+
for (e_start = 0, e_end = 1; e_start < mv->size; e_start++) {
|
802
|
+
const int start_offset = matches[e_start].start_offset;
|
803
|
+
if (e_start >= e_end) {
|
804
|
+
e_end = e_start + 1;
|
805
|
+
}
|
806
|
+
running_score += matches[e_start].score;
|
807
|
+
while (e_end < mv->size && (matches[e_end].end_offset
|
808
|
+
<= start_offset + excerpt_len)) {
|
809
|
+
running_score += matches[e_end].score;
|
810
|
+
e_end++;
|
811
|
+
}
|
812
|
+
pq_push(excerpt_pq,
|
813
|
+
excerpt_new(e_start, e_end - 1, running_score));
|
814
|
+
/* - 0.1 so that earlier matches take priority */
|
815
|
+
running_score -= matches[e_start].score;
|
816
|
+
}
|
817
|
+
|
818
|
+
for (i = 0; i < num_excerpts && excerpt_pq->size > 0; i++) {
|
819
|
+
excerpts[i] = pq_pop(excerpt_pq);
|
820
|
+
if (i < num_excerpts - 1) {
|
821
|
+
/* set the scores of match ranges already included to 0 */
|
822
|
+
Excerpt *e = excerpts[i];
|
823
|
+
for (j = e->start; j <= e->end; j++) {
|
824
|
+
matches[j].score = 0.0;
|
825
|
+
}
|
826
|
+
e = NULL;
|
827
|
+
while (e != (Excerpt *)pq_top(excerpt_pq)) {
|
828
|
+
e = pq_top(excerpt_pq);
|
829
|
+
excerpt_recalc_score(e, mv);
|
830
|
+
pq_down(excerpt_pq);
|
831
|
+
}
|
832
|
+
}
|
833
|
+
}
|
834
|
+
|
835
|
+
qsort(excerpts, i, sizeof(Excerpt *), &excerpt_start_cmp);
|
836
|
+
for (j = 0; j < i; j++) {
|
837
|
+
Excerpt *e = excerpts[j];
|
838
|
+
e->start_pos = matches[e->start].start;
|
839
|
+
e->end_pos = matches[e->end].end;
|
840
|
+
e->start_offset = offsets[e->start_pos].start;
|
841
|
+
e->end_offset = offsets[e->end_pos].end;
|
842
|
+
}
|
843
|
+
|
844
|
+
if (i < num_excerpts) {
|
845
|
+
const int diff = num_excerpts - i;
|
846
|
+
memmove(excerpts + (diff), excerpts,
|
847
|
+
i * sizeof(Excerpt *));
|
848
|
+
for (j = 0; j < diff; j++) {
|
849
|
+
/* these new excerpts will grow into one long excerpt at
|
850
|
+
* the start */
|
851
|
+
excerpts[j] = ALLOC_AND_ZERO(Excerpt);
|
852
|
+
excerpts[j]->end = -1;
|
853
|
+
}
|
854
|
+
}
|
855
|
+
|
856
|
+
excerpt_strs = ary_new_type_capa(char *, num_excerpts);
|
857
|
+
/* merge excerpts where possible */
|
858
|
+
for (i = 0; i < num_excerpts;) {
|
859
|
+
Excerpt *ei = excerpts[i];
|
860
|
+
int merged = 1; /* 1 means a single excerpt, ie no merges */
|
861
|
+
for (j = i + 1; j < num_excerpts; j++) {
|
862
|
+
Excerpt *ej = excerpts[j];
|
863
|
+
if ((ej->end_offset - ei->start_offset)
|
864
|
+
< (j - i + 1) * excerpt_len) {
|
865
|
+
ei->end = ej->end;
|
866
|
+
ei->end_pos = ej->end_pos;
|
867
|
+
ei->end_offset = ej->end_offset;
|
868
|
+
merged = j - i + 1;
|
869
|
+
}
|
870
|
+
}
|
871
|
+
excerpt_expand(ei, merged * excerpt_len, tv);
|
872
|
+
ary_push(excerpt_strs,
|
873
|
+
excerpt_get_str(ei, mv, lazy_df,
|
874
|
+
pre_tag, post_tag, ellipsis));
|
875
|
+
i += merged;
|
876
|
+
}
|
877
|
+
for (i = 0; i < num_excerpts; i++) {
|
878
|
+
free(excerpts[i]);
|
879
|
+
}
|
880
|
+
free(excerpts);
|
881
|
+
pq_destroy(excerpt_pq);
|
882
|
+
matchv_destroy(mv);
|
883
|
+
}
|
884
|
+
q_deref(query);
|
885
|
+
}
|
886
|
+
if (tv) tv_destroy(tv);
|
887
|
+
if (lazy_doc) lazy_doc_close(lazy_doc);
|
888
|
+
return excerpt_strs;
|
451
889
|
}
|
452
890
|
|
891
|
+
static Weight *sea_create_weight(Searcher *self, Query *query)
|
892
|
+
{
|
893
|
+
return q_weight(query, self);
|
894
|
+
}
|
453
895
|
|
454
|
-
static
|
896
|
+
static void sea_check_args(int num_docs, int first_doc)
|
455
897
|
{
|
456
|
-
|
898
|
+
if (num_docs <= 0) {
|
899
|
+
RAISE(ARG_ERROR, ":num_docs was set to %d but should be greater "
|
900
|
+
"than 0 : %d <= 0", num_docs, num_docs);
|
901
|
+
}
|
902
|
+
|
903
|
+
if (first_doc < 0) {
|
904
|
+
RAISE(ARG_ERROR, ":first_doc was set to %d but should be greater "
|
905
|
+
"than or equal to 0 : %d < 0", first_doc, first_doc);
|
906
|
+
}
|
457
907
|
}
|
458
908
|
|
459
|
-
static
|
909
|
+
static Similarity *sea_get_similarity(Searcher *self)
|
460
910
|
{
|
461
|
-
|
911
|
+
return self->similarity;
|
462
912
|
}
|
463
913
|
|
464
|
-
|
914
|
+
/***************************************************************************
|
915
|
+
*
|
916
|
+
* IndexSearcher
|
917
|
+
*
|
918
|
+
***************************************************************************/
|
919
|
+
|
920
|
+
#define ISEA(searcher) ((IndexSearcher *)(searcher))
|
921
|
+
|
922
|
+
int isea_doc_freq(Searcher *self, const char *field, const char *term)
|
465
923
|
{
|
466
|
-
|
924
|
+
return ir_doc_freq(ISEA(self)->ir, field, term);
|
467
925
|
}
|
468
926
|
|
469
|
-
static
|
470
|
-
int num_docs, Filter *filter, Sort *sort)
|
927
|
+
static Document *isea_get_doc(Searcher *self, int doc_num)
|
471
928
|
{
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
Scorer *scorer;
|
476
|
-
Hit **score_docs = NULL;
|
477
|
-
Hit hit;
|
478
|
-
int total_hits = 0;
|
479
|
-
float score;
|
480
|
-
BitVector *bits = (filter ? filter->get_bv(filter, self->ir) : NULL);
|
481
|
-
Hit *(*hq_pop)(PriorityQueue *pq);
|
482
|
-
void (*hq_insert)(PriorityQueue *pq, Hit *hit);
|
483
|
-
void (*hq_destroy)(PriorityQueue *self);
|
484
|
-
PriorityQueue *hq;
|
929
|
+
IndexReader *ir = ISEA(self)->ir;
|
930
|
+
return ir->get_doc(ir, doc_num);
|
931
|
+
}
|
485
932
|
|
933
|
+
static LazyDoc *isea_get_lazy_doc(Searcher *self, int doc_num)
|
934
|
+
{
|
935
|
+
IndexReader *ir = ISEA(self)->ir;
|
936
|
+
return ir->get_lazy_doc(ir, doc_num);
|
937
|
+
}
|
486
938
|
|
487
|
-
|
488
|
-
|
939
|
+
static int isea_max_doc(Searcher *self)
|
940
|
+
{
|
941
|
+
IndexReader *ir = ISEA(self)->ir;
|
942
|
+
return ir->max_doc(ir);
|
943
|
+
}
|
489
944
|
|
490
|
-
|
491
|
-
|
945
|
+
#define IS_FILTERED(bits, filter_func, scorer, searcher) \
|
946
|
+
((bits && !bv_get(bits, scorer->doc))\
|
947
|
+
|| (filter_func \
|
948
|
+
&& !filter_func(scorer->doc, scorer->score(scorer), searcher)))
|
492
949
|
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
|
499
|
-
|
950
|
+
static TopDocs *isea_search_w(Searcher *self,
|
951
|
+
Weight *weight,
|
952
|
+
int first_doc,
|
953
|
+
int num_docs,
|
954
|
+
Filter *filter,
|
955
|
+
Sort *sort,
|
956
|
+
filter_ft filter_func,
|
957
|
+
bool load_fields)
|
958
|
+
{
|
959
|
+
int max_size = first_doc + num_docs;
|
960
|
+
int i;
|
961
|
+
Scorer *scorer;
|
962
|
+
Hit **score_docs = NULL;
|
963
|
+
Hit hit;
|
964
|
+
int total_hits = 0;
|
965
|
+
float score, max_score = 0.0;
|
966
|
+
BitVector *bits = (filter
|
967
|
+
? filt_get_bv(filter, ISEA(self)->ir)
|
968
|
+
: NULL);
|
969
|
+
Hit *(*hq_pop)(PriorityQueue *pq);
|
970
|
+
void (*hq_insert)(PriorityQueue *pq, Hit *hit);
|
971
|
+
void (*hq_destroy)(PriorityQueue *self);
|
972
|
+
PriorityQueue *hq;
|
500
973
|
|
501
|
-
|
502
|
-
hq = fshq_pq_create(max_size, sort, self->ir);
|
503
|
-
hq_pop = &fshq_pq_pop;
|
504
|
-
hq_insert = &fshq_pq_insert;
|
505
|
-
hq_destroy = &fshq_pq_destroy;
|
506
|
-
} else {
|
507
|
-
hq = pq_create(max_size, &hit_less_than);
|
508
|
-
hq_pop = &hit_pq_pop;
|
509
|
-
hq_insert = &hit_pq_insert;
|
510
|
-
hq_destroy = &pq_destroy;
|
511
|
-
}
|
974
|
+
sea_check_args(num_docs, first_doc);
|
512
975
|
|
513
|
-
|
514
|
-
if (
|
515
|
-
|
516
|
-
|
517
|
-
hit.doc = scorer->doc; hit.score = score;
|
518
|
-
hq_insert(hq, &hit);
|
519
|
-
}
|
520
|
-
scorer->destroy(scorer);
|
521
|
-
weight->destroy(weight);
|
976
|
+
scorer = weight->scorer(weight, ISEA(self)->ir);
|
977
|
+
if (!scorer) {
|
978
|
+
return td_new(0, 0, NULL, 0.0);
|
979
|
+
}
|
522
980
|
|
523
|
-
|
524
|
-
|
525
|
-
|
981
|
+
if (sort) {
|
982
|
+
hq = fshq_pq_new(max_size, sort, ISEA(self)->ir);
|
983
|
+
hq_insert = &fshq_pq_insert;
|
984
|
+
hq_destroy = &fshq_pq_destroy;
|
985
|
+
if (load_fields) {
|
986
|
+
hq_pop = &fshq_pq_pop_fd;
|
987
|
+
}
|
988
|
+
else {
|
989
|
+
hq_pop = &fshq_pq_pop;
|
990
|
+
}
|
991
|
+
}
|
992
|
+
else {
|
993
|
+
hq = pq_new(max_size, (lt_ft)&hit_less_than, &free);
|
994
|
+
hq_pop = &hit_pq_pop;
|
995
|
+
hq_insert = &hit_pq_insert;
|
996
|
+
hq_destroy = &pq_destroy;
|
526
997
|
}
|
527
|
-
|
528
|
-
|
529
|
-
|
530
|
-
|
531
|
-
|
998
|
+
|
999
|
+
while (scorer->next(scorer)) {
|
1000
|
+
if (IS_FILTERED(bits, filter_func, scorer, self)) {
|
1001
|
+
continue;
|
1002
|
+
}
|
1003
|
+
total_hits++;
|
1004
|
+
score = scorer->score(scorer);
|
1005
|
+
if (score > max_score) max_score = score;
|
1006
|
+
hit.doc = scorer->doc; hit.score = score;
|
1007
|
+
hq_insert(hq, &hit);
|
532
1008
|
}
|
533
|
-
|
534
|
-
num_docs = 0;
|
535
|
-
}
|
536
|
-
pq_clear(hq);
|
537
|
-
hq_destroy(hq);
|
1009
|
+
scorer->destroy(scorer);
|
538
1010
|
|
539
|
-
|
540
|
-
|
1011
|
+
if (hq->size > first_doc) {
|
1012
|
+
if ((hq->size - first_doc) < num_docs) {
|
1013
|
+
num_docs = hq->size - first_doc;
|
1014
|
+
}
|
1015
|
+
score_docs = ALLOC_N(Hit *, num_docs);
|
1016
|
+
for (i = num_docs - 1; i >= 0; i--) {
|
1017
|
+
score_docs[i] = hq_pop(hq);
|
1018
|
+
/*
|
1019
|
+
hit = score_docs[i] = pq_pop(hq);
|
1020
|
+
printf("hit = %d-->%f\n", hit->doc, hit->score);
|
1021
|
+
*/
|
1022
|
+
}
|
1023
|
+
}
|
1024
|
+
else {
|
1025
|
+
num_docs = 0;
|
1026
|
+
}
|
1027
|
+
pq_clear(hq);
|
1028
|
+
hq_destroy(hq);
|
1029
|
+
|
1030
|
+
return td_new(total_hits, num_docs, score_docs, max_score);
|
1031
|
+
}
|
1032
|
+
|
1033
|
+
static TopDocs *isea_search(Searcher *self,
|
1034
|
+
Query *query,
|
1035
|
+
int first_doc,
|
1036
|
+
int num_docs,
|
1037
|
+
Filter *filter,
|
1038
|
+
Sort *sort,
|
1039
|
+
filter_ft filter_func,
|
1040
|
+
bool load_fields)
|
1041
|
+
{
|
1042
|
+
TopDocs *td;
|
1043
|
+
Weight *weight = q_weight(query, self);
|
1044
|
+
td = isea_search_w(self, weight, first_doc, num_docs, filter,
|
1045
|
+
sort, filter_func, load_fields);
|
1046
|
+
weight->destroy(weight);
|
1047
|
+
return td;
|
541
1048
|
}
|
542
1049
|
|
543
|
-
static void
|
544
|
-
|
1050
|
+
static void isea_search_each_w(Searcher *self, Weight *weight, Filter *filter,
|
1051
|
+
filter_ft filter_func,
|
1052
|
+
void (*fn)(Searcher *, int, float, void *),
|
1053
|
+
void *arg)
|
545
1054
|
{
|
546
|
-
|
547
|
-
|
1055
|
+
Scorer *scorer;
|
1056
|
+
BitVector *bits = (filter
|
1057
|
+
? filt_get_bv(filter, ISEA(self)->ir)
|
1058
|
+
: NULL);
|
548
1059
|
|
549
|
-
|
550
|
-
|
551
|
-
|
552
|
-
|
553
|
-
}
|
1060
|
+
scorer = weight->scorer(weight, ISEA(self)->ir);
|
1061
|
+
if (!scorer) {
|
1062
|
+
return;
|
1063
|
+
}
|
554
1064
|
|
555
|
-
|
556
|
-
|
557
|
-
|
558
|
-
|
559
|
-
|
1065
|
+
while (scorer->next(scorer)) {
|
1066
|
+
if (IS_FILTERED(bits, filter_func, scorer, self)) {
|
1067
|
+
continue;
|
1068
|
+
}
|
1069
|
+
fn(self, scorer->doc, scorer->score(scorer), arg);
|
1070
|
+
}
|
1071
|
+
scorer->destroy(scorer);
|
560
1072
|
}
|
561
1073
|
|
562
|
-
static void
|
563
|
-
|
1074
|
+
static void isea_search_each(Searcher *self, Query *query, Filter *filter,
|
1075
|
+
filter_ft filter_func,
|
1076
|
+
void (*fn)(Searcher *, int, float, void *),
|
1077
|
+
void *arg)
|
564
1078
|
{
|
565
|
-
|
566
|
-
|
567
|
-
|
1079
|
+
Weight *weight = q_weight(query, self);
|
1080
|
+
isea_search_each_w(self, weight, filter, filter_func, fn, arg);
|
1081
|
+
weight->destroy(weight);
|
568
1082
|
}
|
569
1083
|
|
570
|
-
static Query *
|
1084
|
+
static Query *isea_rewrite(Searcher *self, Query *original)
|
571
1085
|
{
|
572
|
-
|
573
|
-
|
574
|
-
|
575
|
-
|
576
|
-
|
577
|
-
|
578
|
-
|
579
|
-
|
580
|
-
|
581
|
-
|
1086
|
+
int q_is_destroyed = false;
|
1087
|
+
Query *query = original;
|
1088
|
+
Query *rewritten_query = query->rewrite(query, ISEA(self)->ir);
|
1089
|
+
while (q_is_destroyed || (query != rewritten_query)) {
|
1090
|
+
query = rewritten_query;
|
1091
|
+
rewritten_query = query->rewrite(query, ISEA(self)->ir);
|
1092
|
+
q_is_destroyed = (query->ref_cnt <= 1);
|
1093
|
+
q_deref(query); /* destroy intermediate queries */
|
1094
|
+
}
|
1095
|
+
return query;
|
582
1096
|
}
|
583
1097
|
|
584
|
-
static Explanation *
|
1098
|
+
static Explanation *isea_explain(Searcher *self, Query *query, int doc_num)
|
585
1099
|
{
|
586
|
-
|
587
|
-
|
588
|
-
|
589
|
-
|
1100
|
+
Weight *weight = q_weight(query, self);
|
1101
|
+
Explanation *e = weight->explain(weight, ISEA(self)->ir, doc_num);
|
1102
|
+
weight->destroy(weight);
|
1103
|
+
return e;
|
590
1104
|
}
|
591
1105
|
|
592
|
-
static Explanation *
|
1106
|
+
static Explanation *isea_explain_w(Searcher *self, Weight *w, int doc_num)
|
593
1107
|
{
|
594
|
-
|
1108
|
+
return w->explain(w, ISEA(self)->ir, doc_num);
|
595
1109
|
}
|
596
1110
|
|
597
|
-
static
|
1111
|
+
static TermVector *isea_get_term_vector(Searcher *self,
|
1112
|
+
const int doc_num,
|
1113
|
+
const char *field)
|
598
1114
|
{
|
599
|
-
|
1115
|
+
IndexReader *ir = ISEA(self)->ir;
|
1116
|
+
return ir->term_vector(ir, doc_num, field);
|
600
1117
|
}
|
601
1118
|
|
602
|
-
static void
|
1119
|
+
static void isea_close(Searcher *self)
|
603
1120
|
{
|
604
|
-
|
605
|
-
|
606
|
-
|
607
|
-
|
1121
|
+
if (ISEA(self)->ir && ISEA(self)->close_ir) {
|
1122
|
+
ir_close(ISEA(self)->ir);
|
1123
|
+
}
|
1124
|
+
free(self);
|
608
1125
|
}
|
609
1126
|
|
610
|
-
Searcher *
|
1127
|
+
Searcher *isea_new(IndexReader *ir)
|
611
1128
|
{
|
612
|
-
|
613
|
-
|
614
|
-
|
615
|
-
|
616
|
-
|
617
|
-
|
618
|
-
|
619
|
-
|
620
|
-
|
621
|
-
|
622
|
-
|
623
|
-
|
624
|
-
|
625
|
-
|
626
|
-
|
627
|
-
|
628
|
-
|
629
|
-
|
1129
|
+
Searcher *self = (Searcher *)ecalloc(sizeof(IndexSearcher));
|
1130
|
+
|
1131
|
+
ISEA(self)->ir = ir;
|
1132
|
+
ISEA(self)->close_ir = true;
|
1133
|
+
|
1134
|
+
self->similarity = sim_create_default();
|
1135
|
+
self->doc_freq = &isea_doc_freq;
|
1136
|
+
self->get_doc = &isea_get_doc;
|
1137
|
+
self->get_lazy_doc = &isea_get_lazy_doc;
|
1138
|
+
self->max_doc = &isea_max_doc;
|
1139
|
+
self->create_weight = &sea_create_weight;
|
1140
|
+
self->search = &isea_search;
|
1141
|
+
self->search_w = &isea_search_w;
|
1142
|
+
self->search_each = &isea_search_each;
|
1143
|
+
self->search_each_w = &isea_search_each_w;
|
1144
|
+
self->rewrite = &isea_rewrite;
|
1145
|
+
self->explain = &isea_explain;
|
1146
|
+
self->explain_w = &isea_explain_w;
|
1147
|
+
self->get_term_vector = &isea_get_term_vector;
|
1148
|
+
self->get_similarity = &sea_get_similarity;
|
1149
|
+
self->close = &isea_close;
|
1150
|
+
|
1151
|
+
return self;
|
630
1152
|
}
|
631
1153
|
|
632
1154
|
/***************************************************************************
|
@@ -635,109 +1157,144 @@ Searcher *sea_create(IndexReader *ir)
|
|
635
1157
|
*
|
636
1158
|
***************************************************************************/
|
637
1159
|
|
638
|
-
|
639
|
-
|
640
|
-
|
1160
|
+
#define CDFSEA(searcher) ((CachedDFSearcher *)(searcher))
|
1161
|
+
typedef struct CachedDFSearcher
|
1162
|
+
{
|
1163
|
+
Searcher super;
|
1164
|
+
HashTable *df_map;
|
1165
|
+
int max_doc;
|
641
1166
|
} CachedDFSearcher;
|
642
1167
|
|
643
|
-
static int cdfsea_doc_freq(Searcher *self,
|
1168
|
+
static int cdfsea_doc_freq(Searcher *self, const char *field, const char *text)
|
644
1169
|
{
|
645
|
-
|
646
|
-
|
1170
|
+
Term term;
|
1171
|
+
int *df;
|
1172
|
+
term.field = (char *)field;
|
1173
|
+
term.text = (char *)text;
|
1174
|
+
df = (int *)h_get(CDFSEA(self)->df_map, &term);
|
1175
|
+
return df ? *df : 0;
|
647
1176
|
}
|
648
1177
|
|
649
1178
|
static Document *cdfsea_get_doc(Searcher *self, int doc_num)
|
650
1179
|
{
|
651
|
-
|
652
|
-
|
1180
|
+
(void)self; (void)doc_num;
|
1181
|
+
RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
|
1182
|
+
return NULL;
|
653
1183
|
}
|
654
1184
|
|
655
1185
|
static int cdfsea_max_doc(Searcher *self)
|
656
1186
|
{
|
657
|
-
|
1187
|
+
(void)self;
|
1188
|
+
return CDFSEA(self)->max_doc;
|
658
1189
|
}
|
659
1190
|
|
660
1191
|
static Weight *cdfsea_create_weight(Searcher *self, Query *query)
|
661
1192
|
{
|
662
|
-
|
663
|
-
|
1193
|
+
(void)self; (void)query;
|
1194
|
+
RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
|
1195
|
+
return NULL;
|
1196
|
+
}
|
1197
|
+
|
1198
|
+
static TopDocs *cdfsea_search_w(Searcher *self, Weight *w, int fd, int nd,
|
1199
|
+
Filter *f, Sort *s, filter_ft ff, bool load)
|
1200
|
+
{
|
1201
|
+
(void)self; (void)w; (void)fd; (void)nd;
|
1202
|
+
(void)f; (void)s; (void)ff, (void)load;
|
1203
|
+
RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
|
1204
|
+
return NULL;
|
664
1205
|
}
|
665
1206
|
|
666
|
-
static TopDocs *cdfsea_search(Searcher *self, Query *
|
667
|
-
|
1207
|
+
static TopDocs *cdfsea_search(Searcher *self, Query *q, int fd, int nd,
|
1208
|
+
Filter *f, Sort *s, filter_ft ff, bool load)
|
668
1209
|
{
|
669
|
-
|
670
|
-
|
1210
|
+
(void)self; (void)q; (void)fd; (void)nd;
|
1211
|
+
(void)f; (void)s; (void)ff, (void)load;
|
1212
|
+
RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
|
1213
|
+
return NULL;
|
671
1214
|
}
|
672
1215
|
|
673
1216
|
static void cdfsea_search_each(Searcher *self, Query *query, Filter *filter,
|
674
|
-
|
1217
|
+
filter_ft ff,
|
1218
|
+
void (*fn)(Searcher *, int, float, void *),
|
1219
|
+
void *arg)
|
675
1220
|
{
|
676
|
-
|
1221
|
+
(void)self; (void)query; (void)filter; (void)ff; (void)fn; (void)arg;
|
1222
|
+
RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
|
677
1223
|
}
|
678
1224
|
|
679
1225
|
static void cdfsea_search_each_w(Searcher *self, Weight *w, Filter *filter,
|
680
|
-
|
1226
|
+
filter_ft ff,
|
1227
|
+
void (*fn)(Searcher *, int, float, void *),
|
1228
|
+
void *arg)
|
681
1229
|
{
|
682
|
-
|
1230
|
+
(void)self; (void)w; (void)filter; (void)ff; (void)fn; (void)arg;
|
1231
|
+
RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
|
683
1232
|
}
|
684
1233
|
|
685
1234
|
static Query *cdfsea_rewrite(Searcher *self, Query *original)
|
686
1235
|
{
|
687
|
-
|
688
|
-
|
1236
|
+
(void)self;
|
1237
|
+
original->ref_cnt++;
|
1238
|
+
return original;
|
689
1239
|
}
|
690
1240
|
|
691
1241
|
static Explanation *cdfsea_explain(Searcher *self, Query *query, int doc_num)
|
692
1242
|
{
|
693
|
-
|
694
|
-
|
1243
|
+
(void)self; (void)query; (void)doc_num;
|
1244
|
+
RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
|
1245
|
+
return NULL;
|
695
1246
|
}
|
696
1247
|
|
697
1248
|
static Explanation *cdfsea_explain_w(Searcher *self, Weight *w, int doc_num)
|
698
1249
|
{
|
699
|
-
|
700
|
-
|
1250
|
+
(void)self; (void)w; (void)doc_num;
|
1251
|
+
RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
|
1252
|
+
return NULL;
|
1253
|
+
}
|
1254
|
+
|
1255
|
+
static TermVector *cdfsea_get_term_vector(Searcher *self, const int doc_num,
|
1256
|
+
const char *field)
|
1257
|
+
{
|
1258
|
+
(void)self; (void)doc_num; (void)field;
|
1259
|
+
RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
|
1260
|
+
return NULL;
|
701
1261
|
}
|
702
1262
|
|
703
1263
|
static Similarity *cdfsea_get_similarity(Searcher *self)
|
704
1264
|
{
|
705
|
-
|
706
|
-
|
1265
|
+
(void)self;
|
1266
|
+
RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
|
1267
|
+
return NULL;
|
707
1268
|
}
|
708
1269
|
|
709
1270
|
static void cdfsea_close(Searcher *self)
|
710
1271
|
{
|
711
|
-
|
712
|
-
|
713
|
-
free(cdfsea);
|
714
|
-
free(self);
|
1272
|
+
h_destroy(CDFSEA(self)->df_map);
|
1273
|
+
free(self);
|
715
1274
|
}
|
716
1275
|
|
717
|
-
Searcher *
|
1276
|
+
static Searcher *cdfsea_new(HashTable *df_map, int max_doc)
|
718
1277
|
{
|
719
|
-
|
1278
|
+
Searcher *self = (Searcher *)ecalloc(sizeof(CachedDFSearcher));
|
720
1279
|
|
721
|
-
|
1280
|
+
CDFSEA(self)->df_map = df_map;
|
1281
|
+
CDFSEA(self)->max_doc = max_doc;
|
722
1282
|
|
723
|
-
|
724
|
-
|
725
|
-
|
726
|
-
|
727
|
-
|
728
|
-
|
729
|
-
|
730
|
-
|
731
|
-
|
732
|
-
|
733
|
-
|
734
|
-
|
735
|
-
|
736
|
-
|
737
|
-
|
738
|
-
self->get_similarity = &cdfsea_get_similarity;
|
739
|
-
self->close = &cdfsea_close;
|
740
|
-
return self;
|
1283
|
+
self->doc_freq = &cdfsea_doc_freq;
|
1284
|
+
self->get_doc = &cdfsea_get_doc;
|
1285
|
+
self->max_doc = &cdfsea_max_doc;
|
1286
|
+
self->create_weight = &cdfsea_create_weight;
|
1287
|
+
self->search = &cdfsea_search;
|
1288
|
+
self->search_w = &cdfsea_search_w;
|
1289
|
+
self->search_each = &cdfsea_search_each;
|
1290
|
+
self->search_each_w = &cdfsea_search_each_w;
|
1291
|
+
self->rewrite = &cdfsea_rewrite;
|
1292
|
+
self->explain = &cdfsea_explain;
|
1293
|
+
self->explain_w = &cdfsea_explain_w;
|
1294
|
+
self->get_term_vector = &cdfsea_get_term_vector;
|
1295
|
+
self->get_similarity = &cdfsea_get_similarity;
|
1296
|
+
self->close = &cdfsea_close;
|
1297
|
+
return self;
|
741
1298
|
}
|
742
1299
|
|
743
1300
|
/***************************************************************************
|
@@ -746,301 +1303,367 @@ Searcher *cdfsea_create(HshTable *df_map, int max_doc)
|
|
746
1303
|
*
|
747
1304
|
***************************************************************************/
|
748
1305
|
|
1306
|
+
#define MSEA(searcher) ((MultiSearcher *)(searcher))
|
749
1307
|
static inline int msea_get_searcher_index(Searcher *self, int n)
|
750
1308
|
{
|
751
|
-
|
752
|
-
|
753
|
-
|
754
|
-
|
1309
|
+
MultiSearcher *msea = MSEA(self);
|
1310
|
+
int lo = 0; /* search starts array */
|
1311
|
+
int hi = msea->s_cnt - 1; /* for 1st element < n, return its index */
|
1312
|
+
int mid, mid_val;
|
755
1313
|
|
756
|
-
|
757
|
-
|
758
|
-
|
759
|
-
|
760
|
-
|
761
|
-
|
762
|
-
|
763
|
-
|
764
|
-
|
765
|
-
|
766
|
-
|
767
|
-
|
1314
|
+
while (hi >= lo) {
|
1315
|
+
mid = (lo + hi) >> 1;
|
1316
|
+
mid_val = msea->starts[mid];
|
1317
|
+
if (n < mid_val) {
|
1318
|
+
hi = mid - 1;
|
1319
|
+
}
|
1320
|
+
else if (n > mid_val) {
|
1321
|
+
lo = mid + 1;
|
1322
|
+
}
|
1323
|
+
else { /* found a match */
|
1324
|
+
while (((mid+1) < msea->s_cnt)
|
1325
|
+
&& (msea->starts[mid+1] == mid_val)) {
|
1326
|
+
mid++; /* scan to last match */
|
1327
|
+
}
|
1328
|
+
return mid;
|
1329
|
+
}
|
768
1330
|
}
|
769
|
-
|
770
|
-
return hi;
|
1331
|
+
return hi;
|
771
1332
|
}
|
772
1333
|
|
773
|
-
static int msea_doc_freq(Searcher *self,
|
1334
|
+
static int msea_doc_freq(Searcher *self, const char *field, const char *term)
|
774
1335
|
{
|
775
|
-
|
776
|
-
|
777
|
-
|
778
|
-
|
779
|
-
|
780
|
-
|
781
|
-
|
782
|
-
}
|
1336
|
+
int i;
|
1337
|
+
int doc_freq = 0;
|
1338
|
+
MultiSearcher *msea = MSEA(self);
|
1339
|
+
for (i = 0; i < msea->s_cnt; i++) {
|
1340
|
+
Searcher *s = msea->searchers[i];
|
1341
|
+
doc_freq += s->doc_freq(s, field, term);
|
1342
|
+
}
|
783
1343
|
|
784
|
-
|
1344
|
+
return doc_freq;
|
785
1345
|
}
|
786
1346
|
|
787
1347
|
static Document *msea_get_doc(Searcher *self, int doc_num)
|
788
1348
|
{
|
789
|
-
|
790
|
-
|
791
|
-
|
792
|
-
|
1349
|
+
MultiSearcher *msea = MSEA(self);
|
1350
|
+
int i = msea_get_searcher_index(self, doc_num);
|
1351
|
+
Searcher *s = msea->searchers[i];
|
1352
|
+
return s->get_doc(s, doc_num - msea->starts[i]);
|
1353
|
+
}
|
1354
|
+
|
1355
|
+
static LazyDoc *msea_get_lazy_doc(Searcher *self, int doc_num)
|
1356
|
+
{
|
1357
|
+
MultiSearcher *msea = MSEA(self);
|
1358
|
+
int i = msea_get_searcher_index(self, doc_num);
|
1359
|
+
Searcher *s = msea->searchers[i];
|
1360
|
+
return s->get_lazy_doc(s, doc_num - msea->starts[i]);
|
793
1361
|
}
|
794
1362
|
|
795
1363
|
static int msea_max_doc(Searcher *self)
|
796
1364
|
{
|
797
|
-
|
1365
|
+
return MSEA(self)->max_doc;
|
1366
|
+
}
|
1367
|
+
|
1368
|
+
static int *msea_get_doc_freqs(Searcher *self, HashSet *terms)
|
1369
|
+
{
|
1370
|
+
int i;
|
1371
|
+
const int num_terms = terms->size;
|
1372
|
+
int *doc_freqs = ALLOC_N(int, num_terms);
|
1373
|
+
for (i = 0; i < num_terms; i++) {
|
1374
|
+
Term *t = (Term *)terms->elems[i];
|
1375
|
+
doc_freqs[i] = msea_doc_freq(self, t->field, t->text);
|
1376
|
+
}
|
1377
|
+
return doc_freqs;
|
798
1378
|
}
|
799
1379
|
|
800
1380
|
static Weight *msea_create_weight(Searcher *self, Query *query)
|
801
1381
|
{
|
802
|
-
|
803
|
-
|
804
|
-
|
805
|
-
|
806
|
-
|
807
|
-
|
808
|
-
|
809
|
-
rq->extract_terms(rq, terms);
|
810
|
-
dfs = self->doc_freqs(self, (Term **)terms->elems, terms->size);
|
1382
|
+
int i, *doc_freqs;
|
1383
|
+
Searcher *cdfsea;
|
1384
|
+
Weight *w;
|
1385
|
+
HashTable *df_map = h_new((hash_ft)&term_hash, (eq_ft)&term_eq,
|
1386
|
+
(free_ft)NULL, free);
|
1387
|
+
Query *rewritten_query = self->rewrite(self, query);
|
1388
|
+
HashSet *terms = term_set_new();
|
811
1389
|
|
812
|
-
|
813
|
-
|
814
|
-
}
|
815
|
-
/* don't destroy the individual terms, only the HashSet */
|
816
|
-
hs_destroy(terms);
|
817
|
-
free(dfs);
|
1390
|
+
rewritten_query->extract_terms(rewritten_query, terms);
|
1391
|
+
doc_freqs = msea_get_doc_freqs(self, terms);
|
818
1392
|
|
819
|
-
|
1393
|
+
for (i = 0; i < terms->size; i++) {
|
1394
|
+
h_set(df_map, terms->elems[i], imalloc(doc_freqs[i]));
|
1395
|
+
}
|
1396
|
+
hs_destroy(terms);
|
1397
|
+
free(doc_freqs);
|
820
1398
|
|
821
|
-
|
822
|
-
q_deref(rq);
|
823
|
-
cdfsea->close(cdfsea);
|
1399
|
+
cdfsea = cdfsea_new(df_map, MSEA(self)->max_doc);
|
824
1400
|
|
825
|
-
|
1401
|
+
w = q_weight(rewritten_query, cdfsea);
|
1402
|
+
q_deref(rewritten_query);
|
1403
|
+
cdfsea->close(cdfsea);
|
1404
|
+
|
1405
|
+
return w;
|
826
1406
|
}
|
827
1407
|
|
828
1408
|
struct MultiSearchEachArg {
|
829
|
-
|
830
|
-
|
831
|
-
|
1409
|
+
int start;
|
1410
|
+
void *arg;
|
1411
|
+
void (*fn)(Searcher *, int, float, void *);
|
832
1412
|
};
|
833
1413
|
|
834
1414
|
void msea_search_each_i(Searcher *self, int doc_num, float score, void *arg)
|
835
1415
|
{
|
836
|
-
|
1416
|
+
struct MultiSearchEachArg *mse_arg = (struct MultiSearchEachArg *)arg;
|
837
1417
|
|
838
|
-
|
1418
|
+
mse_arg->fn(self, doc_num + mse_arg->start, score, mse_arg->arg);
|
839
1419
|
}
|
840
1420
|
|
841
1421
|
static void msea_search_each_w(Searcher *self, Weight *w, Filter *filter,
|
842
|
-
|
1422
|
+
filter_ft filter_func,
|
1423
|
+
void (*fn)(Searcher *, int, float, void *),
|
1424
|
+
void *arg)
|
843
1425
|
{
|
844
|
-
|
845
|
-
|
846
|
-
|
847
|
-
|
1426
|
+
int i;
|
1427
|
+
struct MultiSearchEachArg mse_arg;
|
1428
|
+
MultiSearcher *msea = MSEA(self);
|
1429
|
+
Searcher *s;
|
848
1430
|
|
849
|
-
|
850
|
-
|
851
|
-
|
852
|
-
|
853
|
-
|
854
|
-
|
855
|
-
|
1431
|
+
mse_arg.fn = fn;
|
1432
|
+
mse_arg.arg = arg;
|
1433
|
+
for (i = 0; i < msea->s_cnt; i++) {
|
1434
|
+
s = msea->searchers[i];
|
1435
|
+
mse_arg.start = msea->starts[i];
|
1436
|
+
s->search_each_w(s, w, filter, filter_func,
|
1437
|
+
&msea_search_each_i, &mse_arg);
|
1438
|
+
}
|
856
1439
|
}
|
857
1440
|
|
858
1441
|
static void msea_search_each(Searcher *self, Query *query, Filter *filter,
|
859
|
-
|
1442
|
+
filter_ft filter_func,
|
1443
|
+
void (*fn)(Searcher *, int, float, void *), void *arg)
|
860
1444
|
{
|
861
|
-
|
862
|
-
|
863
|
-
|
1445
|
+
Weight *w = q_weight(query, self);
|
1446
|
+
msea_search_each_w(self, w, filter, filter_func, fn, arg);
|
1447
|
+
w->destroy(w);
|
864
1448
|
}
|
865
1449
|
|
866
1450
|
struct MultiSearchArg {
|
867
|
-
|
868
|
-
|
869
|
-
|
1451
|
+
int total_hits, max_size;
|
1452
|
+
PriorityQueue *hq;
|
1453
|
+
void (*hq_insert)(PriorityQueue *pq, Hit *hit);
|
870
1454
|
};
|
871
1455
|
|
872
1456
|
void msea_search_i(Searcher *self, int doc_num, float score, void *arg)
|
873
1457
|
{
|
874
|
-
|
875
|
-
|
876
|
-
|
877
|
-
|
878
|
-
|
879
|
-
|
880
|
-
|
881
|
-
|
882
|
-
|
883
|
-
|
884
|
-
|
885
|
-
|
886
|
-
|
887
|
-
|
888
|
-
|
889
|
-
|
890
|
-
|
891
|
-
|
892
|
-
|
893
|
-
|
894
|
-
|
895
|
-
|
896
|
-
|
897
|
-
|
898
|
-
|
899
|
-
|
900
|
-
|
901
|
-
|
902
|
-
|
903
|
-
|
904
|
-
|
905
|
-
|
906
|
-
|
907
|
-
|
908
|
-
|
909
|
-
|
910
|
-
|
911
|
-
|
912
|
-
|
913
|
-
|
914
|
-
|
915
|
-
}
|
916
|
-
|
917
|
-
|
918
|
-
ms_arg.hq = hq;
|
919
|
-
ms_arg.total_hits = 0;
|
920
|
-
ms_arg.max_size = max_size;
|
921
|
-
ms_arg.hq_insert = hq_insert;
|
922
|
-
|
923
|
-
msea_search_each_w(self, weight, filter, msea_search_i, &ms_arg);
|
1458
|
+
struct MultiSearchArg *ms_arg = (struct MultiSearchArg *)arg;
|
1459
|
+
Hit hit;
|
1460
|
+
(void)self;
|
1461
|
+
|
1462
|
+
ms_arg->total_hits++;
|
1463
|
+
hit.doc = doc_num;
|
1464
|
+
hit.score = score;
|
1465
|
+
ms_arg->hq_insert(ms_arg->hq, &hit);
|
1466
|
+
}
|
1467
|
+
|
1468
|
+
static TopDocs *msea_search_w(Searcher *self,
|
1469
|
+
Weight *weight,
|
1470
|
+
int first_doc,
|
1471
|
+
int num_docs,
|
1472
|
+
Filter *filter,
|
1473
|
+
Sort *sort,
|
1474
|
+
filter_ft filter_func,
|
1475
|
+
bool load_fields)
|
1476
|
+
{
|
1477
|
+
int max_size = first_doc + num_docs;
|
1478
|
+
int i;
|
1479
|
+
int total_hits = 0;
|
1480
|
+
Hit **score_docs = NULL;
|
1481
|
+
Hit *(*hq_pop)(PriorityQueue *pq);
|
1482
|
+
void (*hq_insert)(PriorityQueue *pq, Hit *hit);
|
1483
|
+
PriorityQueue *hq;
|
1484
|
+
float max_score = 0.0;
|
1485
|
+
(void)load_fields; /* does it automatically */
|
1486
|
+
|
1487
|
+
sea_check_args(num_docs, first_doc);
|
1488
|
+
|
1489
|
+
if (sort) {
|
1490
|
+
hq = pq_new(max_size, (lt_ft)fdshq_lt, &free);
|
1491
|
+
hq_insert = (void (*)(PriorityQueue *pq, Hit *hit))&pq_insert;
|
1492
|
+
hq_pop = (Hit *(*)(PriorityQueue *pq))&pq_pop;
|
1493
|
+
}
|
1494
|
+
else {
|
1495
|
+
hq = pq_new(max_size, (lt_ft)&hit_less_than, &free);
|
1496
|
+
hq_insert = &hit_pq_multi_insert;
|
1497
|
+
hq_pop = &hit_pq_pop;
|
1498
|
+
}
|
924
1499
|
|
925
|
-
|
1500
|
+
/*if (sort) printf("sort = %s\n", sort_to_s(sort)); */
|
1501
|
+
for (i = 0; i < MSEA(self)->s_cnt; i++) {
|
1502
|
+
Searcher *s = MSEA(self)->searchers[i];
|
1503
|
+
TopDocs *td = s->search_w(s, weight, 0, max_size,
|
1504
|
+
filter, sort, filter_func, true);
|
1505
|
+
/*if (sort) printf("sort = %s\n", sort_to_s(sort)); */
|
1506
|
+
if (td->size > 0) {
|
1507
|
+
/*printf("td->size = %d %d\n", td->size, num_docs); */
|
1508
|
+
int j;
|
1509
|
+
int start = MSEA(self)->starts[i];
|
1510
|
+
for (j = 0; j < td->size; j++) {
|
1511
|
+
Hit *hit = td->hits[j];
|
1512
|
+
hit->doc += start;
|
1513
|
+
/*
|
1514
|
+
printf("adding hit = %d:%f\n", hit->doc, hit->score);
|
1515
|
+
*/
|
1516
|
+
hq_insert(hq, hit);
|
1517
|
+
}
|
1518
|
+
td->size = 0;
|
1519
|
+
if (td->max_score > max_score) max_score = td->max_score;
|
1520
|
+
}
|
1521
|
+
total_hits += td->total_hits;
|
1522
|
+
td_destroy(td);
|
1523
|
+
}
|
926
1524
|
|
927
|
-
|
928
|
-
|
929
|
-
|
1525
|
+
if (hq->size > first_doc) {
|
1526
|
+
if ((hq->size - first_doc) < num_docs) {
|
1527
|
+
num_docs = hq->size - first_doc;
|
1528
|
+
}
|
1529
|
+
score_docs = ALLOC_N(Hit *, num_docs);
|
1530
|
+
for (i = num_docs - 1; i >= 0; i--) {
|
1531
|
+
score_docs[i] = hq_pop(hq);
|
1532
|
+
/*
|
1533
|
+
Hit *hit = score_docs[i] = hq_pop(hq);
|
1534
|
+
printf("popped hit = %d-->%f\n", hit->doc, hit->score);
|
1535
|
+
*/
|
1536
|
+
}
|
930
1537
|
}
|
931
|
-
|
932
|
-
|
933
|
-
score_docs[i] = hq_pop(hq);
|
934
|
-
//hit = score_docs[i] = pq_pop(hq);
|
935
|
-
//printf("hit = %d-->%f\n", hit->doc, hit->score);
|
1538
|
+
else {
|
1539
|
+
num_docs = 0;
|
936
1540
|
}
|
937
|
-
|
938
|
-
|
939
|
-
|
940
|
-
|
941
|
-
|
1541
|
+
pq_clear(hq);
|
1542
|
+
pq_destroy(hq);
|
1543
|
+
|
1544
|
+
return td_new(total_hits, num_docs, score_docs, max_score);
|
1545
|
+
}
|
942
1546
|
|
943
|
-
|
944
|
-
|
1547
|
+
static TopDocs *msea_search(Searcher *self,
|
1548
|
+
Query *query,
|
1549
|
+
int first_doc,
|
1550
|
+
int num_docs,
|
1551
|
+
Filter *filter,
|
1552
|
+
Sort *sort,
|
1553
|
+
filter_ft filter_func,
|
1554
|
+
bool load_fields)
|
1555
|
+
{
|
1556
|
+
TopDocs *td;
|
1557
|
+
Weight *weight = q_weight(query, self);
|
1558
|
+
td = msea_search_w(self, weight, first_doc, num_docs, filter,
|
1559
|
+
sort, filter_func, load_fields);
|
1560
|
+
weight->destroy(weight);
|
1561
|
+
return td;
|
945
1562
|
}
|
946
1563
|
|
947
1564
|
static Query *msea_rewrite(Searcher *self, Query *original)
|
948
1565
|
{
|
949
|
-
|
950
|
-
|
951
|
-
|
952
|
-
|
1566
|
+
int i;
|
1567
|
+
Searcher *s;
|
1568
|
+
MultiSearcher *msea = MSEA(self);
|
1569
|
+
Query **queries = ALLOC_N(Query *, msea->s_cnt), *rewritten;
|
953
1570
|
|
954
|
-
|
955
|
-
|
956
|
-
|
957
|
-
|
958
|
-
|
1571
|
+
for (i = 0; i < msea->s_cnt; i++) {
|
1572
|
+
s = msea->searchers[i];
|
1573
|
+
queries[i] = s->rewrite(s, original);
|
1574
|
+
}
|
1575
|
+
rewritten = q_combine(queries, msea->s_cnt);
|
959
1576
|
|
960
|
-
|
961
|
-
|
962
|
-
|
963
|
-
|
964
|
-
|
1577
|
+
for (i = 0; i < msea->s_cnt; i++) {
|
1578
|
+
q_deref(queries[i]);
|
1579
|
+
}
|
1580
|
+
free(queries);
|
1581
|
+
return rewritten;
|
965
1582
|
}
|
966
1583
|
|
967
1584
|
static Explanation *msea_explain(Searcher *self, Query *query, int doc_num)
|
968
1585
|
{
|
969
|
-
|
970
|
-
|
971
|
-
|
972
|
-
|
973
|
-
|
974
|
-
|
975
|
-
|
1586
|
+
MultiSearcher *msea = MSEA(self);
|
1587
|
+
int i = msea_get_searcher_index(self, doc_num);
|
1588
|
+
Weight *w = q_weight(query, self);
|
1589
|
+
Searcher *s = msea->searchers[i];
|
1590
|
+
Explanation *e = s->explain_w(s, w, doc_num - msea->starts[i]);
|
1591
|
+
w->destroy(w);
|
1592
|
+
return e;
|
976
1593
|
}
|
977
1594
|
|
978
1595
|
static Explanation *msea_explain_w(Searcher *self, Weight *w, int doc_num)
|
979
1596
|
{
|
980
|
-
|
981
|
-
|
982
|
-
|
983
|
-
|
984
|
-
|
1597
|
+
MultiSearcher *msea = MSEA(self);
|
1598
|
+
int i = msea_get_searcher_index(self, doc_num);
|
1599
|
+
Searcher *s = msea->searchers[i];
|
1600
|
+
Explanation *e = s->explain_w(s, w, doc_num - msea->starts[i]);
|
1601
|
+
return e;
|
1602
|
+
}
|
1603
|
+
|
1604
|
+
static TermVector *msea_get_term_vector(Searcher *self, const int doc_num,
|
1605
|
+
const char *field)
|
1606
|
+
{
|
1607
|
+
MultiSearcher *msea = MSEA(self);
|
1608
|
+
int i = msea_get_searcher_index(self, doc_num);
|
1609
|
+
Searcher *s = msea->searchers[i];
|
1610
|
+
return s->get_term_vector(s, doc_num - msea->starts[i],
|
1611
|
+
field);
|
985
1612
|
}
|
986
1613
|
|
987
1614
|
static Similarity *msea_get_similarity(Searcher *self)
|
988
1615
|
{
|
989
|
-
|
1616
|
+
return self->similarity;
|
990
1617
|
}
|
991
1618
|
|
992
1619
|
static void msea_close(Searcher *self)
|
993
1620
|
{
|
994
|
-
|
995
|
-
|
996
|
-
|
997
|
-
|
998
|
-
|
999
|
-
|
1000
|
-
|
1621
|
+
int i;
|
1622
|
+
Searcher *s;
|
1623
|
+
MultiSearcher *msea = MSEA(self);
|
1624
|
+
if (msea->close_subs) {
|
1625
|
+
for (i = 0; i < msea->s_cnt; i++) {
|
1626
|
+
s = msea->searchers[i];
|
1627
|
+
s->close(s);
|
1628
|
+
}
|
1629
|
+
free(msea->searchers);
|
1001
1630
|
}
|
1002
|
-
free(msea->
|
1003
|
-
|
1004
|
-
free(msea->starts);
|
1005
|
-
free(msea);
|
1006
|
-
free(self);
|
1631
|
+
free(msea->starts);
|
1632
|
+
free(self);
|
1007
1633
|
}
|
1008
1634
|
|
1009
|
-
Searcher *
|
1635
|
+
Searcher *msea_new(Searcher **searchers, int s_cnt, bool close_subs)
|
1010
1636
|
{
|
1011
|
-
|
1012
|
-
|
1013
|
-
|
1014
|
-
|
1015
|
-
|
1016
|
-
|
1017
|
-
|
1637
|
+
int i, max_doc = 0;
|
1638
|
+
Searcher *self = (Searcher *)ecalloc(sizeof(MultiSearcher));
|
1639
|
+
int *starts = ALLOC_N(int, s_cnt + 1);
|
1640
|
+
for (i = 0; i < s_cnt; i++) {
|
1641
|
+
starts[i] = max_doc;
|
1642
|
+
max_doc += searchers[i]->max_doc(searchers[i]);
|
1643
|
+
}
|
1018
1644
|
starts[i] = max_doc;
|
1019
|
-
|
1020
|
-
|
1021
|
-
|
1022
|
-
|
1023
|
-
|
1024
|
-
|
1025
|
-
|
1026
|
-
|
1027
|
-
|
1028
|
-
|
1029
|
-
|
1030
|
-
|
1031
|
-
|
1032
|
-
|
1033
|
-
|
1034
|
-
|
1035
|
-
|
1036
|
-
|
1037
|
-
|
1038
|
-
|
1039
|
-
|
1040
|
-
|
1041
|
-
|
1042
|
-
|
1043
|
-
self->get_similarity = &msea_get_similarity;
|
1044
|
-
self->close = &msea_close;
|
1045
|
-
return self;
|
1645
|
+
|
1646
|
+
MSEA(self)->s_cnt = s_cnt;
|
1647
|
+
MSEA(self)->searchers = searchers;
|
1648
|
+
MSEA(self)->starts = starts;
|
1649
|
+
MSEA(self)->max_doc = max_doc;
|
1650
|
+
MSEA(self)->close_subs = close_subs;
|
1651
|
+
|
1652
|
+
self->similarity = sim_create_default();
|
1653
|
+
self->doc_freq = &msea_doc_freq;
|
1654
|
+
self->get_doc = &msea_get_doc;
|
1655
|
+
self->get_lazy_doc = &msea_get_lazy_doc;
|
1656
|
+
self->max_doc = &msea_max_doc;
|
1657
|
+
self->create_weight = &msea_create_weight;
|
1658
|
+
self->search = &msea_search;
|
1659
|
+
self->search_w = &msea_search_w;
|
1660
|
+
self->search_each = &msea_search_each;
|
1661
|
+
self->search_each_w = &msea_search_each_w;
|
1662
|
+
self->rewrite = &msea_rewrite;
|
1663
|
+
self->explain = &msea_explain;
|
1664
|
+
self->explain_w = &msea_explain_w;
|
1665
|
+
self->get_term_vector = &msea_get_term_vector;
|
1666
|
+
self->get_similarity = &msea_get_similarity;
|
1667
|
+
self->close = &msea_close;
|
1668
|
+
return self;
|
1046
1669
|
}
|