ferret 0.9.6 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +1 -1
- data/README +12 -24
- data/Rakefile +38 -54
- data/TODO +14 -17
- data/ext/analysis.c +982 -823
- data/ext/analysis.h +133 -76
- data/ext/array.c +96 -58
- data/ext/array.h +40 -13
- data/ext/bitvector.c +476 -118
- data/ext/bitvector.h +264 -22
- data/ext/compound_io.c +217 -229
- data/ext/defines.h +49 -0
- data/ext/document.c +107 -317
- data/ext/document.h +31 -65
- data/ext/except.c +81 -36
- data/ext/except.h +117 -55
- data/ext/extconf.rb +2 -9
- data/ext/ferret.c +211 -104
- data/ext/ferret.h +22 -11
- data/ext/filter.c +97 -82
- data/ext/fs_store.c +348 -367
- data/ext/global.c +226 -188
- data/ext/global.h +44 -26
- data/ext/hash.c +474 -391
- data/ext/hash.h +441 -68
- data/ext/hashset.c +124 -96
- data/ext/hashset.h +169 -20
- data/ext/helper.c +56 -5
- data/ext/helper.h +7 -0
- data/ext/inc/lang.h +29 -49
- data/ext/inc/threading.h +31 -0
- data/ext/ind.c +288 -278
- data/ext/ind.h +68 -0
- data/ext/index.c +5688 -0
- data/ext/index.h +663 -616
- data/ext/lang.h +29 -49
- data/ext/libstemmer.c +3 -3
- data/ext/mem_pool.c +84 -0
- data/ext/mem_pool.h +35 -0
- data/ext/posh.c +1006 -0
- data/ext/posh.h +1007 -0
- data/ext/priorityqueue.c +117 -194
- data/ext/priorityqueue.h +135 -39
- data/ext/q_boolean.c +1305 -1108
- data/ext/q_const_score.c +106 -93
- data/ext/q_filtered_query.c +138 -135
- data/ext/q_fuzzy.c +206 -242
- data/ext/q_match_all.c +94 -80
- data/ext/q_multi_term.c +663 -0
- data/ext/q_parser.c +667 -593
- data/ext/q_phrase.c +992 -555
- data/ext/q_prefix.c +72 -61
- data/ext/q_range.c +235 -210
- data/ext/q_span.c +1480 -1166
- data/ext/q_term.c +273 -246
- data/ext/q_wildcard.c +127 -114
- data/ext/r_analysis.c +1720 -711
- data/ext/r_index.c +3049 -0
- data/ext/r_qparser.c +433 -146
- data/ext/r_search.c +2934 -1993
- data/ext/r_store.c +372 -143
- data/ext/r_utils.c +941 -0
- data/ext/ram_store.c +330 -326
- data/ext/search.c +1291 -668
- data/ext/search.h +403 -702
- data/ext/similarity.c +91 -113
- data/ext/similarity.h +45 -30
- data/ext/sort.c +721 -484
- data/ext/stopwords.c +361 -273
- data/ext/store.c +556 -58
- data/ext/store.h +706 -126
- data/ext/tags +3578 -2780
- data/ext/term_vectors.c +352 -0
- data/ext/threading.h +31 -0
- data/ext/win32.h +54 -0
- data/lib/ferret.rb +5 -17
- data/lib/ferret/document.rb +130 -2
- data/lib/ferret/index.rb +577 -26
- data/lib/ferret/number_tools.rb +157 -0
- data/lib/ferret_version.rb +3 -0
- data/test/test_helper.rb +5 -13
- data/test/unit/analysis/tc_analyzer.rb +513 -1
- data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
- data/test/unit/index/tc_index.rb +183 -240
- data/test/unit/index/tc_index_reader.rb +312 -479
- data/test/unit/index/tc_index_writer.rb +397 -13
- data/test/unit/index/th_doc.rb +269 -206
- data/test/unit/query_parser/tc_query_parser.rb +40 -33
- data/test/unit/search/tc_filter.rb +59 -71
- data/test/unit/search/tc_fuzzy_query.rb +24 -16
- data/test/unit/search/tc_index_searcher.rb +23 -201
- data/test/unit/search/tc_multi_searcher.rb +78 -226
- data/test/unit/search/tc_search_and_sort.rb +93 -81
- data/test/unit/search/tc_sort.rb +23 -23
- data/test/unit/search/tc_sort_field.rb +7 -7
- data/test/unit/search/tc_spans.rb +51 -47
- data/test/unit/search/tm_searcher.rb +339 -0
- data/test/unit/store/tc_fs_store.rb +1 -1
- data/test/unit/store/tm_store_lock.rb +3 -3
- data/test/unit/tc_document.rb +81 -0
- data/test/unit/ts_analysis.rb +1 -1
- data/test/unit/ts_utils.rb +1 -1
- data/test/unit/utils/tc_bit_vector.rb +288 -0
- data/test/unit/utils/tc_number_tools.rb +117 -0
- data/test/unit/utils/tc_priority_queue.rb +106 -0
- metadata +140 -301
- data/CHANGELOG +0 -9
- data/ext/dummy.exe +0 -0
- data/ext/field.c +0 -408
- data/ext/frtio.h +0 -13
- data/ext/inc/except.h +0 -90
- data/ext/index_io.c +0 -382
- data/ext/index_rw.c +0 -2658
- data/ext/lang.c +0 -41
- data/ext/nix_io.c +0 -134
- data/ext/q_multi_phrase.c +0 -380
- data/ext/r_doc.c +0 -582
- data/ext/r_index_io.c +0 -1021
- data/ext/r_term.c +0 -219
- data/ext/term.c +0 -820
- data/ext/termdocs.c +0 -611
- data/ext/vector.c +0 -637
- data/ext/w32_io.c +0 -150
- data/lib/ferret/analysis.rb +0 -11
- data/lib/ferret/analysis/analyzers.rb +0 -112
- data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
- data/lib/ferret/analysis/token.rb +0 -100
- data/lib/ferret/analysis/token_filters.rb +0 -86
- data/lib/ferret/analysis/token_stream.rb +0 -26
- data/lib/ferret/analysis/tokenizers.rb +0 -112
- data/lib/ferret/analysis/word_list_loader.rb +0 -27
- data/lib/ferret/document/document.rb +0 -152
- data/lib/ferret/document/field.rb +0 -312
- data/lib/ferret/index/compound_file_io.rb +0 -338
- data/lib/ferret/index/document_writer.rb +0 -289
- data/lib/ferret/index/field_infos.rb +0 -279
- data/lib/ferret/index/fields_io.rb +0 -181
- data/lib/ferret/index/index.rb +0 -675
- data/lib/ferret/index/index_file_names.rb +0 -33
- data/lib/ferret/index/index_reader.rb +0 -503
- data/lib/ferret/index/index_writer.rb +0 -534
- data/lib/ferret/index/multi_reader.rb +0 -377
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
- data/lib/ferret/index/segment_infos.rb +0 -130
- data/lib/ferret/index/segment_merge_info.rb +0 -49
- data/lib/ferret/index/segment_merge_queue.rb +0 -16
- data/lib/ferret/index/segment_merger.rb +0 -358
- data/lib/ferret/index/segment_reader.rb +0 -412
- data/lib/ferret/index/segment_term_enum.rb +0 -169
- data/lib/ferret/index/segment_term_vector.rb +0 -58
- data/lib/ferret/index/term.rb +0 -53
- data/lib/ferret/index/term_buffer.rb +0 -83
- data/lib/ferret/index/term_doc_enum.rb +0 -291
- data/lib/ferret/index/term_enum.rb +0 -52
- data/lib/ferret/index/term_info.rb +0 -37
- data/lib/ferret/index/term_infos_io.rb +0 -321
- data/lib/ferret/index/term_vector_offset_info.rb +0 -20
- data/lib/ferret/index/term_vectors_io.rb +0 -553
- data/lib/ferret/query_parser.rb +0 -312
- data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
- data/lib/ferret/search.rb +0 -50
- data/lib/ferret/search/boolean_clause.rb +0 -100
- data/lib/ferret/search/boolean_query.rb +0 -299
- data/lib/ferret/search/boolean_scorer.rb +0 -294
- data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
- data/lib/ferret/search/conjunction_scorer.rb +0 -99
- data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
- data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
- data/lib/ferret/search/explanation.rb +0 -41
- data/lib/ferret/search/field_cache.rb +0 -215
- data/lib/ferret/search/field_doc.rb +0 -31
- data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
- data/lib/ferret/search/filter.rb +0 -11
- data/lib/ferret/search/filtered_query.rb +0 -130
- data/lib/ferret/search/filtered_term_enum.rb +0 -79
- data/lib/ferret/search/fuzzy_query.rb +0 -154
- data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
- data/lib/ferret/search/hit_collector.rb +0 -34
- data/lib/ferret/search/hit_queue.rb +0 -11
- data/lib/ferret/search/index_searcher.rb +0 -200
- data/lib/ferret/search/match_all_query.rb +0 -104
- data/lib/ferret/search/multi_phrase_query.rb +0 -216
- data/lib/ferret/search/multi_searcher.rb +0 -261
- data/lib/ferret/search/multi_term_query.rb +0 -65
- data/lib/ferret/search/non_matching_scorer.rb +0 -22
- data/lib/ferret/search/phrase_positions.rb +0 -55
- data/lib/ferret/search/phrase_query.rb +0 -214
- data/lib/ferret/search/phrase_scorer.rb +0 -152
- data/lib/ferret/search/prefix_query.rb +0 -54
- data/lib/ferret/search/query.rb +0 -140
- data/lib/ferret/search/query_filter.rb +0 -51
- data/lib/ferret/search/range_filter.rb +0 -103
- data/lib/ferret/search/range_query.rb +0 -139
- data/lib/ferret/search/req_excl_scorer.rb +0 -125
- data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
- data/lib/ferret/search/score_doc.rb +0 -38
- data/lib/ferret/search/score_doc_comparator.rb +0 -114
- data/lib/ferret/search/scorer.rb +0 -91
- data/lib/ferret/search/similarity.rb +0 -278
- data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
- data/lib/ferret/search/sort.rb +0 -112
- data/lib/ferret/search/sort_comparator.rb +0 -60
- data/lib/ferret/search/sort_field.rb +0 -91
- data/lib/ferret/search/spans.rb +0 -12
- data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
- data/lib/ferret/search/spans/span_first_query.rb +0 -79
- data/lib/ferret/search/spans/span_near_query.rb +0 -108
- data/lib/ferret/search/spans/span_not_query.rb +0 -130
- data/lib/ferret/search/spans/span_or_query.rb +0 -176
- data/lib/ferret/search/spans/span_query.rb +0 -25
- data/lib/ferret/search/spans/span_scorer.rb +0 -74
- data/lib/ferret/search/spans/span_term_query.rb +0 -105
- data/lib/ferret/search/spans/span_weight.rb +0 -84
- data/lib/ferret/search/spans/spans_enum.rb +0 -44
- data/lib/ferret/search/term_query.rb +0 -128
- data/lib/ferret/search/term_scorer.rb +0 -183
- data/lib/ferret/search/top_docs.rb +0 -36
- data/lib/ferret/search/top_field_docs.rb +0 -17
- data/lib/ferret/search/weight.rb +0 -54
- data/lib/ferret/search/wildcard_query.rb +0 -26
- data/lib/ferret/search/wildcard_term_enum.rb +0 -61
- data/lib/ferret/stemmers.rb +0 -1
- data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
- data/lib/ferret/store.rb +0 -5
- data/lib/ferret/store/buffered_index_io.rb +0 -190
- data/lib/ferret/store/directory.rb +0 -141
- data/lib/ferret/store/fs_store.rb +0 -381
- data/lib/ferret/store/index_io.rb +0 -245
- data/lib/ferret/store/ram_store.rb +0 -286
- data/lib/ferret/utils.rb +0 -8
- data/lib/ferret/utils/bit_vector.rb +0 -123
- data/lib/ferret/utils/date_tools.rb +0 -138
- data/lib/ferret/utils/number_tools.rb +0 -91
- data/lib/ferret/utils/parameter.rb +0 -41
- data/lib/ferret/utils/priority_queue.rb +0 -120
- data/lib/ferret/utils/string_helper.rb +0 -47
- data/lib/ferret/utils/thread_local.rb +0 -28
- data/lib/ferret/utils/weak_key_hash.rb +0 -60
- data/lib/rferret.rb +0 -37
- data/rake_utils/code_statistics.rb +0 -106
- data/test/benchmark/tb_ram_store.rb +0 -76
- data/test/benchmark/tb_rw_vint.rb +0 -26
- data/test/functional/thread_safety_index_test.rb +0 -81
- data/test/functional/thread_safety_test.rb +0 -137
- data/test/longrunning/tc_numbertools.rb +0 -60
- data/test/longrunning/tm_store.rb +0 -19
- data/test/unit/analysis/ctc_analyzer.rb +0 -532
- data/test/unit/analysis/data/wordfile +0 -6
- data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
- data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
- data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
- data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_stop_filter.rb +0 -14
- data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
- data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
- data/test/unit/analysis/tc_token.rb +0 -25
- data/test/unit/document/rtc_field.rb +0 -28
- data/test/unit/document/tc_document.rb +0 -47
- data/test/unit/document/tc_field.rb +0 -98
- data/test/unit/index/rtc_compound_file_io.rb +0 -107
- data/test/unit/index/rtc_field_infos.rb +0 -127
- data/test/unit/index/rtc_fields_io.rb +0 -167
- data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
- data/test/unit/index/rtc_segment_infos.rb +0 -74
- data/test/unit/index/rtc_segment_term_docs.rb +0 -17
- data/test/unit/index/rtc_segment_term_enum.rb +0 -60
- data/test/unit/index/rtc_segment_term_vector.rb +0 -71
- data/test/unit/index/rtc_term_buffer.rb +0 -57
- data/test/unit/index/rtc_term_info.rb +0 -19
- data/test/unit/index/rtc_term_infos_io.rb +0 -192
- data/test/unit/index/rtc_term_vectors_io.rb +0 -108
- data/test/unit/index/tc_term.rb +0 -27
- data/test/unit/index/tc_term_voi.rb +0 -18
- data/test/unit/search/rtc_similarity.rb +0 -37
- data/test/unit/search/rtc_sort_field.rb +0 -14
- data/test/unit/search/tc_multi_searcher2.rb +0 -126
- data/test/unit/store/rtc_fs_store.rb +0 -62
- data/test/unit/store/rtc_ram_store.rb +0 -15
- data/test/unit/store/rtm_store.rb +0 -150
- data/test/unit/store/rtm_store_lock.rb +0 -2
- data/test/unit/ts_document.rb +0 -2
- data/test/unit/utils/rtc_bit_vector.rb +0 -73
- data/test/unit/utils/rtc_date_tools.rb +0 -50
- data/test/unit/utils/rtc_number_tools.rb +0 -59
- data/test/unit/utils/rtc_parameter.rb +0 -40
- data/test/unit/utils/rtc_priority_queue.rb +0 -62
- data/test/unit/utils/rtc_string_helper.rb +0 -21
- data/test/unit/utils/rtc_thread.rb +0 -61
- data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
- data/test/utils/number_to_spoken.rb +0 -132
data/ext/search.c
CHANGED
@@ -1,8 +1,6 @@
|
|
1
1
|
#include <string.h>
|
2
2
|
#include "search.h"
|
3
|
-
|
4
|
-
static char * const NUM_DOCS_ARG_ERROR_MSG = "num_docs must be > 0 to run a search";
|
5
|
-
static char * const FIRST_DOC_ARG_ERROR_MSG = "first_doc must be >= 0 to run a search";
|
3
|
+
#include "array.h"
|
6
4
|
|
7
5
|
/***************************************************************************
|
8
6
|
*
|
@@ -10,67 +8,65 @@ static char * const FIRST_DOC_ARG_ERROR_MSG = "first_doc must be >= 0 to run a s
|
|
10
8
|
*
|
11
9
|
***************************************************************************/
|
12
10
|
|
13
|
-
Explanation *
|
11
|
+
Explanation *expl_new(float value, const char *description, ...)
|
14
12
|
{
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
13
|
+
Explanation *expl = ALLOC(Explanation);
|
14
|
+
|
15
|
+
va_list args;
|
16
|
+
va_start(args, description);
|
17
|
+
expl->description = vstrfmt(description, args);
|
18
|
+
va_end(args);
|
19
|
+
|
20
|
+
expl->value = value;
|
21
|
+
expl->details = ary_new_type_capa(Explanation *,
|
22
|
+
EXPLANATION_DETAILS_START_SIZE);
|
23
|
+
return expl;
|
22
24
|
}
|
23
25
|
|
24
|
-
void
|
26
|
+
void expl_destroy(Explanation *expl)
|
25
27
|
{
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
expl_destoy(expl->details[i]);
|
30
|
-
}
|
31
|
-
free(expl->details);
|
32
|
-
free(expl->description);
|
33
|
-
free(expl);
|
28
|
+
ary_destroy((void **)expl->details, (free_ft)expl_destroy);
|
29
|
+
free(expl->description);
|
30
|
+
free(expl);
|
34
31
|
}
|
35
32
|
|
36
|
-
Explanation *expl_add_detail(Explanation *
|
33
|
+
Explanation *expl_add_detail(Explanation *expl, Explanation *detail)
|
37
34
|
{
|
38
|
-
|
39
|
-
|
40
|
-
REALLOC_N(self->details, Explanation *, self->dcapa);
|
41
|
-
}
|
42
|
-
self->details[self->dcnt] = detail;
|
43
|
-
self->dcnt++;
|
44
|
-
return self;
|
35
|
+
ary_push(expl->details, detail);
|
36
|
+
return expl;
|
45
37
|
}
|
46
38
|
|
47
|
-
char *
|
39
|
+
char *expl_to_s_depth(Explanation *expl, int depth)
|
48
40
|
{
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
41
|
+
int i;
|
42
|
+
char *buffer = ALLOC_N(char, depth * 2 + 1);
|
43
|
+
const int num_details = ary_size(expl->details);
|
44
|
+
|
45
|
+
memset(buffer, ' ', sizeof(char) * depth * 2);
|
46
|
+
buffer[depth*2] = 0;
|
53
47
|
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
48
|
+
buffer = estrcat(buffer, strfmt("%f = %s\n", expl->value, expl->description));
|
49
|
+
for (i = 0; i < num_details; i++) {
|
50
|
+
buffer = estrcat(buffer, expl_to_s_depth(expl->details[i], depth + 1));
|
51
|
+
}
|
58
52
|
|
59
|
-
|
53
|
+
return buffer;
|
60
54
|
}
|
61
55
|
|
62
|
-
char *expl_to_html(Explanation *
|
56
|
+
char *expl_to_html(Explanation *expl)
|
63
57
|
{
|
64
|
-
|
65
|
-
|
66
|
-
|
58
|
+
int i;
|
59
|
+
char *buffer;
|
60
|
+
const int num_details = ary_size(expl->details);
|
61
|
+
|
62
|
+
buffer = strfmt("<ul>\n<li>%f = %s</li>\n", expl->value, expl->description);
|
67
63
|
|
68
|
-
|
69
|
-
|
70
|
-
|
64
|
+
for (i = 0; i < num_details; i++) {
|
65
|
+
estrcat(buffer, expl_to_html(expl->details[i]));
|
66
|
+
}
|
71
67
|
|
72
|
-
|
73
|
-
|
68
|
+
REALLOC_N(buffer, char, strlen(buffer) + 10);
|
69
|
+
return strcat(buffer, "</ul>\n");
|
74
70
|
}
|
75
71
|
|
76
72
|
/***************************************************************************
|
@@ -79,88 +75,104 @@ char *expl_to_html(Explanation *self)
|
|
79
75
|
*
|
80
76
|
***************************************************************************/
|
81
77
|
|
82
|
-
bool hit_less_than(
|
78
|
+
static bool hit_less_than(const Hit *hit1, const Hit *hit2)
|
83
79
|
{
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
80
|
+
if (hit1->score == hit2->score) {
|
81
|
+
return hit1->doc > hit2->doc;
|
82
|
+
}
|
83
|
+
else {
|
84
|
+
return hit1->score < hit1->score;
|
85
|
+
}
|
89
86
|
}
|
90
87
|
|
91
|
-
|
88
|
+
static bool hit_lt(Hit *hit1, Hit *hit2)
|
92
89
|
{
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
90
|
+
if (hit1->score == hit2->score) {
|
91
|
+
return hit1->doc > hit2->doc;
|
92
|
+
}
|
93
|
+
else {
|
94
|
+
return hit1->score < hit2->score;
|
95
|
+
}
|
98
96
|
}
|
99
97
|
|
100
|
-
void hit_pq_down(PriorityQueue *pq)
|
98
|
+
static void hit_pq_down(PriorityQueue *pq)
|
101
99
|
{
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
100
|
+
register int i = 1;
|
101
|
+
register int j = 2; /* i << 1; */
|
102
|
+
register int k = 3; /* j + 1; */
|
103
|
+
Hit **heap = (Hit **)pq->heap;
|
104
|
+
Hit *node = heap[i]; /* save top node */
|
107
105
|
|
108
|
-
|
109
|
-
|
106
|
+
if ((k <= pq->size) && hit_lt(heap[k], heap[j])) {
|
107
|
+
j = k;
|
108
|
+
}
|
110
109
|
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
110
|
+
while ((j <= pq->size) && hit_lt(heap[j], node)) {
|
111
|
+
heap[i] = heap[j]; /* shift up child */
|
112
|
+
i = j;
|
113
|
+
j = i << 1;
|
114
|
+
k = j + 1;
|
115
|
+
if ((k <= pq->size) && hit_lt(heap[k], heap[j])) {
|
116
|
+
j = k;
|
117
|
+
}
|
118
|
+
}
|
119
|
+
heap[i] = node;
|
120
120
|
}
|
121
121
|
|
122
|
-
Hit *hit_pq_pop(PriorityQueue *pq)
|
122
|
+
static Hit *hit_pq_pop(PriorityQueue *pq)
|
123
123
|
{
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
124
|
+
if (pq->size > 0) {
|
125
|
+
Hit *result = (Hit *)pq->heap[1]; /* save first value */
|
126
|
+
pq->heap[1] = pq->heap[pq->size]; /* move last to first */
|
127
|
+
pq->heap[pq->size] = NULL;
|
128
|
+
pq->size--;
|
129
|
+
hit_pq_down(pq); /* adjust heap */
|
130
|
+
return result;
|
131
|
+
}
|
132
|
+
else {
|
133
|
+
return NULL;
|
134
|
+
}
|
134
135
|
}
|
135
136
|
|
136
|
-
|
137
|
+
static void hit_pq_up(PriorityQueue *pq)
|
137
138
|
{
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
139
|
+
Hit **heap = (Hit **)pq->heap;
|
140
|
+
Hit *node;
|
141
|
+
int i = pq->size;
|
142
|
+
int j = i >> 1;
|
143
|
+
node = heap[i];
|
144
|
+
|
145
|
+
while ((j > 0) && hit_lt(node, heap[j])) {
|
146
|
+
heap[i] = heap[j];
|
147
|
+
i = j;
|
148
|
+
j = j >> 1;
|
149
|
+
}
|
150
|
+
heap[i] = node;
|
151
|
+
}
|
143
152
|
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
153
|
+
static void hit_pq_insert(PriorityQueue *pq, Hit *hit)
|
154
|
+
{
|
155
|
+
if (pq->size < pq->capa) {
|
156
|
+
Hit *new_hit = ALLOC(Hit);
|
157
|
+
memcpy(new_hit, hit, sizeof(Hit));
|
158
|
+
pq->size++;
|
159
|
+
if (pq->size >= pq->mem_capa) {
|
160
|
+
pq->mem_capa <<= 1;
|
161
|
+
REALLOC_N(pq->heap, void *, pq->mem_capa);
|
162
|
+
}
|
163
|
+
pq->heap[pq->size] = new_hit;
|
164
|
+
hit_pq_up(pq);
|
165
|
+
}
|
166
|
+
else if (pq->size > 0 && hit_lt((Hit *)pq->heap[1], hit)) {
|
167
|
+
memcpy(pq->heap[1], hit, sizeof(Hit));
|
168
|
+
hit_pq_down(pq);
|
169
|
+
}
|
150
170
|
}
|
151
171
|
|
152
|
-
void
|
172
|
+
static void hit_pq_multi_insert(PriorityQueue *pq, Hit *hit)
|
153
173
|
{
|
154
|
-
|
155
|
-
|
156
|
-
memcpy(new_hit, hit, sizeof(Hit));
|
157
|
-
pq->count++;
|
158
|
-
pq->heap[pq->count] = new_hit;
|
159
|
-
hit_pq_up(pq);
|
160
|
-
} else if (pq->count > 0 && hit_lt((Hit *)pq->heap[1], hit)) {
|
161
|
-
memcpy(pq->heap[1], hit, sizeof(Hit));
|
162
|
-
hit_pq_down(pq);
|
163
|
-
}
|
174
|
+
hit_pq_insert(pq, hit);
|
175
|
+
free(hit);
|
164
176
|
}
|
165
177
|
|
166
178
|
/***************************************************************************
|
@@ -169,35 +181,38 @@ void hit_pq_insert(PriorityQueue *pq, Hit *hit)
|
|
169
181
|
*
|
170
182
|
***************************************************************************/
|
171
183
|
|
172
|
-
TopDocs *
|
184
|
+
TopDocs *td_new(int total_hits, int size, Hit **hits, float max_score)
|
173
185
|
{
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
186
|
+
TopDocs *td = ALLOC(TopDocs);
|
187
|
+
td->total_hits = total_hits;
|
188
|
+
td->size = size;
|
189
|
+
td->hits = hits;
|
190
|
+
td->max_score = max_score;
|
191
|
+
return td;
|
179
192
|
}
|
180
193
|
|
181
194
|
void td_destroy(TopDocs *td)
|
182
195
|
{
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
196
|
+
int i;
|
197
|
+
|
198
|
+
for (i = 0; i < td->size; i++) {
|
199
|
+
free(td->hits[i]);
|
200
|
+
}
|
201
|
+
free(td->hits);
|
202
|
+
free(td);
|
189
203
|
}
|
190
204
|
|
191
205
|
char *td_to_s(TopDocs *td)
|
192
206
|
{
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
207
|
+
int i;
|
208
|
+
Hit *hit;
|
209
|
+
char *buffer = strfmt("%d hits sorted by <score, doc_num>\n",
|
210
|
+
td->total_hits);
|
211
|
+
for (i = 0; i < td->size; i++) {
|
212
|
+
hit = td->hits[i];
|
213
|
+
estrcat(buffer, strfmt("\t%d:%f\n", hit->doc, hit->score));
|
214
|
+
}
|
215
|
+
return buffer;
|
201
216
|
}
|
202
217
|
|
203
218
|
/***************************************************************************
|
@@ -208,44 +223,50 @@ char *td_to_s(TopDocs *td)
|
|
208
223
|
|
209
224
|
Query *w_get_query(Weight *self)
|
210
225
|
{
|
211
|
-
|
226
|
+
return self->query;
|
212
227
|
}
|
213
228
|
|
214
229
|
float w_get_value(Weight *self)
|
215
230
|
{
|
216
|
-
|
231
|
+
return self->value;
|
217
232
|
}
|
218
233
|
|
219
234
|
float w_sum_of_squared_weights(Weight *self)
|
220
235
|
{
|
221
|
-
|
222
|
-
|
236
|
+
self->qweight = self->idf * self->query->boost;
|
237
|
+
return self->qweight * self->qweight; /* square it */
|
223
238
|
}
|
224
239
|
|
225
240
|
void w_normalize(Weight *self, float normalization_factor)
|
226
241
|
{
|
227
|
-
|
228
|
-
|
229
|
-
|
242
|
+
self->qnorm = normalization_factor;
|
243
|
+
self->qweight *= normalization_factor; /* normalize query weight */
|
244
|
+
self->value = self->qweight * self->idf;/* idf for document */
|
230
245
|
}
|
231
246
|
|
232
247
|
void w_destroy(Weight *self)
|
233
248
|
{
|
234
|
-
|
235
|
-
|
249
|
+
q_deref(self->query);
|
250
|
+
free(self);
|
236
251
|
}
|
237
252
|
|
238
|
-
Weight *w_create(Query *query)
|
253
|
+
Weight *w_create(size_t size, Query *query)
|
239
254
|
{
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
255
|
+
Weight *self = (Weight *)ecalloc(size);
|
256
|
+
#ifdef DEBUG
|
257
|
+
if (size < sizeof(Weight)) {
|
258
|
+
RAISE(ERROR, "size of weight <%d> should be at least <%d>",
|
259
|
+
(int)size, (int)sizeof(Weight));
|
260
|
+
}
|
261
|
+
#endif
|
262
|
+
REF(query);
|
263
|
+
self->query = query;
|
264
|
+
self->get_query = &w_get_query;
|
265
|
+
self->get_value = &w_get_value;
|
266
|
+
self->normalize = &w_normalize;
|
267
|
+
self->destroy = &w_destroy;
|
268
|
+
self->sum_of_squared_weights = &w_sum_of_squared_weights;
|
269
|
+
return self;
|
249
270
|
}
|
250
271
|
|
251
272
|
/***************************************************************************
|
@@ -254,128 +275,181 @@ Weight *w_create(Query *query)
|
|
254
275
|
*
|
255
276
|
***************************************************************************/
|
256
277
|
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
278
|
+
static const char *QUERY_NAMES[] = {
|
279
|
+
"TermQuery",
|
280
|
+
"MultiTermQuery",
|
281
|
+
"BooleanQuery",
|
282
|
+
"PhraseQuery",
|
283
|
+
"MultiPhraseQuery",
|
284
|
+
"ConstantScoreQuery",
|
285
|
+
"FilteredQuery",
|
286
|
+
"MatchAllQuery",
|
287
|
+
"RangeQuery",
|
288
|
+
"WildCardQuery",
|
289
|
+
"FuzzyQuery",
|
290
|
+
"PrefixQuery",
|
291
|
+
"SpanTermQuery",
|
292
|
+
"SpanFirstQuery",
|
293
|
+
"SpanOrQuery",
|
294
|
+
"SpanNotQuery",
|
295
|
+
"SpanNearQuery"
|
296
|
+
};
|
261
297
|
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
298
|
+
static const char *UNKNOWN_QUERY_NAME = "UnkownQuery";
|
299
|
+
|
300
|
+
const char *q_get_query_name(enum QUERY_TYPE type) {
|
301
|
+
if (type >= NELEMS(QUERY_NAMES)) {
|
302
|
+
return UNKNOWN_QUERY_NAME;
|
303
|
+
}
|
304
|
+
else {
|
305
|
+
return QUERY_NAMES[type];
|
306
|
+
}
|
266
307
|
}
|
267
308
|
|
268
|
-
|
309
|
+
static Query *q_rewrite(Query *self, IndexReader *ir)
|
269
310
|
{
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
Similarity *sim = query->get_similarity(query, searcher);
|
274
|
-
float norm = sim_query_norm(sim, sum);
|
275
|
-
q_deref(query);
|
276
|
-
|
277
|
-
weight->normalize(weight, norm);
|
278
|
-
return self->weight = weight;
|
311
|
+
(void)ir;
|
312
|
+
self->ref_cnt++;
|
313
|
+
return self;
|
279
314
|
}
|
280
315
|
|
281
|
-
|
316
|
+
static void q_extract_terms(Query *self, HashSet *terms)
|
282
317
|
{
|
283
|
-
|
284
|
-
|
318
|
+
/* do nothing by default */
|
319
|
+
(void)self;
|
320
|
+
(void)terms;
|
285
321
|
}
|
286
322
|
|
287
|
-
|
323
|
+
Similarity *q_get_similarity_i(Query *self, Searcher *searcher)
|
288
324
|
{
|
289
|
-
|
325
|
+
(void)self;
|
326
|
+
return searcher->get_similarity(searcher);
|
290
327
|
}
|
291
328
|
|
292
|
-
void
|
329
|
+
void q_destroy_i(Query *self)
|
293
330
|
{
|
294
|
-
|
331
|
+
free(self);
|
295
332
|
}
|
296
333
|
|
297
334
|
void q_deref(Query *self)
|
298
335
|
{
|
299
|
-
|
300
|
-
|
301
|
-
|
336
|
+
if (--(self->ref_cnt) == 0) {
|
337
|
+
self->destroy_i(self);
|
338
|
+
}
|
302
339
|
}
|
303
340
|
|
304
|
-
Query *
|
341
|
+
Weight *q_create_weight_unsup(Query *self, Searcher *searcher)
|
305
342
|
{
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
self->get_similarity = &q_get_similarity_i;
|
312
|
-
self->extract_terms = &q_extract_terms;
|
313
|
-
self->weight = NULL;
|
314
|
-
self->ref_cnt = 1;
|
315
|
-
return self;
|
343
|
+
(void)self;
|
344
|
+
(void)searcher;
|
345
|
+
RAISE(UNSUPPORTED_ERROR,
|
346
|
+
"Create weight is unsupported for this type of query");
|
347
|
+
return NULL;
|
316
348
|
}
|
317
349
|
|
318
|
-
|
350
|
+
Weight *q_weight(Query *self, Searcher *searcher)
|
319
351
|
{
|
320
|
-
|
321
|
-
|
352
|
+
Query *query = searcher->rewrite(searcher, self);
|
353
|
+
Weight *weight = query->create_weight_i(query, searcher);
|
354
|
+
float sum = weight->sum_of_squared_weights(weight);
|
355
|
+
Similarity *sim = query->get_similarity(query, searcher);
|
356
|
+
float norm = sim_query_norm(sim, sum);
|
357
|
+
q_deref(query);
|
322
358
|
|
323
|
-
|
324
|
-
|
325
|
-
return (self == o) || ((self->type == o->type) &&
|
326
|
-
(self->boost == o->boost) &&
|
327
|
-
self->eq(self, o));
|
359
|
+
weight->normalize(weight, norm);
|
360
|
+
return self->weight = weight;
|
328
361
|
}
|
329
362
|
|
363
|
+
#define BQ(query) ((BooleanQuery *)(query))
|
330
364
|
Query *q_combine(Query **queries, int q_cnt)
|
331
365
|
{
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
366
|
+
int i;
|
367
|
+
Query *q, *ret_q;
|
368
|
+
HashSet *uniques = hs_new((hash_ft)&q_hash, (eq_ft)&q_eq, NULL);
|
369
|
+
|
370
|
+
for (i = 0; i < q_cnt; i++) {
|
371
|
+
q = queries[i];
|
372
|
+
if (q->type == BOOLEAN_QUERY) {
|
373
|
+
int j;
|
374
|
+
bool splittable = true;
|
375
|
+
if (BQ(q)->coord_disabled == false) {
|
376
|
+
splittable = false;
|
377
|
+
}
|
378
|
+
else {
|
379
|
+
for (j = 0; j < BQ(q)->clause_cnt; j++) {
|
380
|
+
if (BQ(q)->clauses[j]->occur != BC_SHOULD) {
|
381
|
+
splittable = false;
|
382
|
+
break;
|
383
|
+
}
|
384
|
+
}
|
385
|
+
}
|
386
|
+
if (splittable) {
|
387
|
+
for (j = 0; j < BQ(q)->clause_cnt; j++) {
|
388
|
+
Query *sub_q = BQ(q)->clauses[j]->query;
|
389
|
+
hs_add(uniques, sub_q);
|
390
|
+
}
|
391
|
+
}
|
392
|
+
else {
|
393
|
+
hs_add(uniques, q);
|
394
|
+
}
|
351
395
|
}
|
352
|
-
|
353
|
-
|
354
|
-
for (j = 0; j < bq->clause_cnt; j++) {
|
355
|
-
q = bq->clauses[j]->query;
|
356
|
-
hs_add(uniques, q);
|
396
|
+
else {
|
397
|
+
hs_add(uniques, q);
|
357
398
|
}
|
358
|
-
} else {
|
359
|
-
hs_add(uniques, q);
|
360
|
-
}
|
361
|
-
} else {
|
362
|
-
hs_add(uniques, q);
|
363
399
|
}
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
400
|
+
if (uniques->size == 1) {
|
401
|
+
ret_q = (Query *)uniques->elems[0];
|
402
|
+
REF(ret_q);
|
403
|
+
}
|
404
|
+
else {
|
405
|
+
ret_q = bq_new(true);
|
406
|
+
for (i = 0; i < uniques->size; i++) {
|
407
|
+
q = (Query *)uniques->elems[i];
|
408
|
+
bq_add_query(ret_q, q, BC_SHOULD);
|
409
|
+
}
|
374
410
|
}
|
375
|
-
|
376
|
-
hs_destroy(uniques);
|
411
|
+
hs_destroy(uniques);
|
377
412
|
|
378
|
-
|
413
|
+
return ret_q;
|
414
|
+
}
|
415
|
+
|
416
|
+
ulong q_hash(Query *self)
|
417
|
+
{
|
418
|
+
return (self->hash(self) << 5) | self->type;
|
419
|
+
}
|
420
|
+
|
421
|
+
int q_eq(Query *self, Query *o)
|
422
|
+
{
|
423
|
+
return (self == o)
|
424
|
+
|| ((self->type == o->type)
|
425
|
+
&& (self->boost == o->boost)
|
426
|
+
&& self->eq(self, o));
|
427
|
+
}
|
428
|
+
|
429
|
+
static MatchVector *q_get_matchv_i(Query *self, MatchVector *mv, TermVector *tv)
|
430
|
+
{
|
431
|
+
/* be default we don't add any matches */
|
432
|
+
(void)self; (void)tv;
|
433
|
+
return mv;
|
434
|
+
}
|
435
|
+
|
436
|
+
Query *q_create(size_t size)
|
437
|
+
{
|
438
|
+
Query *self = (Query *)ecalloc(size);
|
439
|
+
#ifdef DEBUG
|
440
|
+
if (size < sizeof(Query)) {
|
441
|
+
RAISE(ERROR, "Size of a query <%d> should never be smaller than the "
|
442
|
+
"size of a Query struct <%d>", (int)size, (int)sizeof(Query));
|
443
|
+
}
|
444
|
+
#endif
|
445
|
+
self->boost = 1.0;
|
446
|
+
self->rewrite = &q_rewrite;
|
447
|
+
self->get_similarity = &q_get_similarity_i;
|
448
|
+
self->extract_terms = &q_extract_terms;
|
449
|
+
self->get_matchv_i = &q_get_matchv_i;
|
450
|
+
self->weight = NULL;
|
451
|
+
self->ref_cnt = 1;
|
452
|
+
return self;
|
379
453
|
}
|
380
454
|
|
381
455
|
/***************************************************************************
|
@@ -384,36 +458,154 @@ Query *q_combine(Query **queries, int q_cnt)
|
|
384
458
|
*
|
385
459
|
***************************************************************************/
|
386
460
|
|
387
|
-
void scorer_destroy_i(Scorer *
|
461
|
+
void scorer_destroy_i(Scorer *scorer)
|
388
462
|
{
|
389
|
-
|
390
|
-
free(self);
|
463
|
+
free(scorer);
|
391
464
|
}
|
392
465
|
|
393
|
-
Scorer *scorer_create(Similarity *similarity)
|
466
|
+
Scorer *scorer_create(size_t size, Similarity *similarity)
|
394
467
|
{
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
|
399
|
-
|
468
|
+
Scorer *self = (Scorer *)ecalloc(size);
|
469
|
+
#ifdef DEBUG
|
470
|
+
if (size < sizeof(Scorer)) {
|
471
|
+
RAISE(ERROR, "size of scorer <%d> should be at least <%d>",
|
472
|
+
(int)size, (int)sizeof(Scorer));
|
473
|
+
}
|
474
|
+
#endif
|
475
|
+
self->destroy = &scorer_destroy_i;
|
476
|
+
self->similarity = similarity;
|
477
|
+
return self;
|
400
478
|
}
|
401
479
|
|
402
480
|
bool scorer_less_than(void *p1, void *p2)
|
403
481
|
{
|
404
|
-
|
405
|
-
|
406
|
-
|
482
|
+
Scorer *s1 = (Scorer *)p1;
|
483
|
+
Scorer *s2 = (Scorer *)p2;
|
484
|
+
return s1->score(s1) < s2->score(s2);
|
407
485
|
}
|
408
486
|
|
409
|
-
bool scorer_doc_less_than(
|
487
|
+
bool scorer_doc_less_than(const Scorer *s1, const Scorer *s2)
|
410
488
|
{
|
411
|
-
|
489
|
+
return s1->doc < s2->doc;
|
412
490
|
}
|
413
491
|
|
414
492
|
int scorer_doc_cmp(const void *p1, const void *p2)
|
415
493
|
{
|
416
|
-
|
494
|
+
return (*(Scorer **)p1)->doc - (*(Scorer **)p2)->doc;
|
495
|
+
}
|
496
|
+
|
497
|
+
/***************************************************************************
|
498
|
+
*
|
499
|
+
* Highlighter
|
500
|
+
*
|
501
|
+
***************************************************************************/
|
502
|
+
|
503
|
+
/* ** MatchRange ** */
|
504
|
+
static int match_range_cmp(const void *p1, const void *p2)
|
505
|
+
{
|
506
|
+
int diff = ((MatchRange *)p1)->start - ((MatchRange *)p2)->start;
|
507
|
+
if (diff != 0) {
|
508
|
+
return diff;
|
509
|
+
}
|
510
|
+
else {
|
511
|
+
return ((MatchRange *)p2)->end - ((MatchRange *)p1)->end;
|
512
|
+
}
|
513
|
+
}
|
514
|
+
|
515
|
+
|
516
|
+
|
517
|
+
/* ** MatchVector ** */
|
518
|
+
MatchVector *matchv_new()
|
519
|
+
{
|
520
|
+
MatchVector *matchv = ALLOC(MatchVector);
|
521
|
+
|
522
|
+
matchv->size = 0;
|
523
|
+
matchv->capa = MATCH_VECTOR_INIT_CAPA;
|
524
|
+
matchv->matches = ALLOC_N(MatchRange, MATCH_VECTOR_INIT_CAPA);
|
525
|
+
|
526
|
+
return matchv;
|
527
|
+
}
|
528
|
+
|
529
|
+
/*
 * Append the range start..end to the vector with an initial score of 1.0,
 * doubling the capacity first when the vector is full.
 *
 * Returns self so calls can be chained.
 */
MatchVector *matchv_add(MatchVector *self, int start, int end)
{
    MatchRange *slot;

    if (self->size >= self->capa) {
        self->capa <<= 1;
        REALLOC_N(self->matches, MatchRange, self->capa);
    }

    /* take the slot address only after any reallocation */
    slot = &self->matches[self->size];
    slot->start = start;
    slot->end = end;
    slot->score = 1.0;
    self->size++;
    return self;
}
|
540
|
+
|
541
|
+
/*
 * Sort the ranges in place using match_range_cmp (by start, then by
 * descending end). Returns self.
 */
MatchVector *matchv_sort(MatchVector *self)
{
    qsort(self->matches, self->size, sizeof(MatchRange), &match_range_cmp);

    return self;
}
|
546
|
+
|
547
|
+
/*
 * Sort the vector and merge ranges that overlap or are directly adjacent,
 * in place. self->size is reduced to the number of ranges left.
 *
 * A range that lies completely inside the current merged range adds its
 * score to it. A range that only extends the current range's end does not
 * contribute its score.
 * NOTE(review): matchv_compact_with_breaks does add the score in the
 * extending case; confirm whether the difference is intended.
 *
 * Returns self.
 */
MatchVector *matchv_compact(MatchVector *self)
{
    int left, right;
    MatchRange *m;

    /* An empty vector stays empty; without this guard the final
     * "size = left + 1" would report one (uninitialised) range. */
    if (self->size <= 0) {
        return self;
    }

    matchv_sort(self);
    m = self->matches;

    /* m[0] seeds the first merged range, so scanning starts at 1. Starting
     * at 0 would compare m[0] with itself and double its score. */
    for (left = 0, right = 1; right < self->size; right++) {
        /* Note the end + 1. This compacts a range 3:5 and 6:8 into 3:8 */
        if (m[right].start > m[left].end + 1) {
            left++;
            m[left].start = m[right].start;
            m[left].end = m[right].end;
            m[left].score = m[right].score;
        }
        else if (m[right].end > m[left].end) {
            m[left].end = m[right].end;
        }
        else {
            m[left].score += m[right].score;
        }
    }
    self->size = left + 1;
    return self;
}
|
569
|
+
|
570
|
+
/*
 * Sort the vector and merge overlapping ranges in place, keeping adjacent
 * ranges (e.g. 3:5 and 6:8) separate. Every range merged into another adds
 * its score to the surviving range. self->size is reduced to the number of
 * ranges left.
 *
 * Returns self.
 */
MatchVector *matchv_compact_with_breaks(MatchVector *self)
{
    int left, right;

    /* An empty vector stays empty; without this guard the final
     * "size = left + 1" would report one (uninitialised) range. */
    if (self->size <= 0) {
        return self;
    }

    matchv_sort(self);
    for (right = left = 0; right < self->size; right++) {
        /* Note: no end + 1. Unlike above won't compact ranges 3:5 and 6:8 */
        if (self->matches[right].start > self->matches[left].end) {
            left++;
            self->matches[left].start = self->matches[right].start;
            self->matches[left].end = self->matches[right].end;
            self->matches[left].score = self->matches[right].score;
        }
        else if (self->matches[right].end > self->matches[left].end) {
            self->matches[left].end = self->matches[right].end;
            self->matches[left].score += self->matches[right].score;
        }
        else if (right > left) {
            /* right == left means the range is being compared with itself;
             * skipping that case avoids counting its score twice */
            self->matches[left].score += self->matches[right].score;
        }
    }
    self->size = left + 1;
    return self;
}
|
593
|
+
|
594
|
+
|
595
|
+
/*
 * Fill in start_offset / end_offset of every range by looking up the
 * range's start and end in the +offsets+ table (indexed by the values stored
 * in MatchRange.start / MatchRange.end).
 *
 * Returns mv.
 */
static MatchVector *matchv_set_offsets(MatchVector *mv, Offset *offsets)
{
    int idx;

    for (idx = 0; idx < mv->size; idx++) {
        MatchRange *mr = &mv->matches[idx];

        mr->start_offset = offsets[mr->start].start;
        mr->end_offset = offsets[mr->end].end;
    }
    return mv;
}
|
604
|
+
|
605
|
+
/*
 * Free a MatchVector: first its range array, then the vector itself.
 */
void matchv_destroy(MatchVector *self)
{
    free(self->matches);
    free(self);
}
|
418
610
|
|
419
611
|
/***************************************************************************
|
@@ -422,211 +614,541 @@ int scorer_doc_cmp(const void *p1, const void *p2)
|
|
422
614
|
*
|
423
615
|
***************************************************************************/
|
424
616
|
|
425
|
-
|
617
|
+
MatchVector *searcher_get_match_vector(Searcher *self,
|
618
|
+
Query *query,
|
619
|
+
const int doc_num,
|
620
|
+
const char *field)
|
621
|
+
{
|
622
|
+
MatchVector *mv = matchv_new();
|
623
|
+
Query *rewritten_query = self->rewrite(self, query);
|
624
|
+
TermVector *tv = self->get_term_vector(self, doc_num, field);
|
625
|
+
if (tv && tv->term_cnt > 0 && tv->terms[0].positions != NULL) {
|
626
|
+
mv = rewritten_query->get_matchv_i(rewritten_query, mv, tv);
|
627
|
+
tv_destroy(tv);
|
628
|
+
}
|
629
|
+
q_deref(rewritten_query);
|
630
|
+
return mv;
|
631
|
+
}
|
632
|
+
|
633
|
+
/*
 * A candidate highlight excerpt: a run of consecutive match ranges plus the
 * positions and offsets the excerpt currently covers.
 */
typedef struct Excerpt
{
    int start;          /* index of the first covered entry in mv->matches */
    int end;            /* index of the last covered entry in mv->matches */
    int start_pos;      /* index into the term vector's offsets table */
    int end_pos;        /* index into the term vector's offsets table */
    int start_offset;   /* offsets[start_pos].start */
    int end_offset;     /* offsets[end_pos].end */
    double score;       /* sum of the scores of the covered match ranges */
} Excerpt;
|
643
|
+
|
644
|
+
/*
|
645
|
+
static int excerpt_cmp(const void *p1, const void *p2)
|
646
|
+
{
|
647
|
+
double score1 = (*((Excerpt **)p1))->score;
|
648
|
+
double score2 = (*((Excerpt **)p2))->score;
|
649
|
+
if (score1 > score2) return 1;
|
650
|
+
if (score1 < score2) return -1;
|
651
|
+
return 0;
|
428
652
|
}
|
653
|
+
*/
|
429
654
|
|
430
|
-
static int
|
655
|
+
static int excerpt_start_cmp(const void *p1, const void *p2)
|
431
656
|
{
|
432
|
-
|
433
|
-
|
657
|
+
return (*((Excerpt **)p1))->start - (*((Excerpt **)p2))->start;
|
658
|
+
}
|
434
659
|
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
return freqs;
|
660
|
+
/*
 * Priority-queue ordering for excerpts. Deliberately inverted (true when e1
 * scores HIGHER than e2) so the best-scoring excerpt ends up on top.
 */
static int excerpt_lt(Excerpt *e1, Excerpt *e2)
{
    return (e1->score > e2->score);
}
|
440
664
|
|
441
|
-
static
|
665
|
+
static Excerpt *excerpt_new(int start, int end, double score)
|
442
666
|
{
|
443
|
-
|
444
|
-
|
667
|
+
Excerpt *excerpt = ALLOC_AND_ZERO(Excerpt);
|
668
|
+
excerpt->start = start;
|
669
|
+
excerpt->end = end;
|
670
|
+
excerpt->score = score;
|
671
|
+
return excerpt;
|
672
|
+
}
|
445
673
|
|
446
|
-
|
447
|
-
|
448
|
-
|
674
|
+
/*
 * Recompute e->score as the sum of the current scores of the match ranges
 * e->start..e->end (inclusive) in +mv+. Returns e.
 */
static Excerpt *excerpt_recalc_score(Excerpt *e, MatchVector *mv)
{
    double total = 0.0;
    int idx = e->start;

    while (idx <= e->end) {
        total += mv->matches[idx].score;
        idx++;
    }
    e->score = total;
    return e;
}
|
449
684
|
|
450
|
-
|
685
|
+
/* expand an excerpt to it's largest possible size */
|
686
|
+
static Excerpt *excerpt_expand(Excerpt *e, const int len, TermVector *tv)
|
687
|
+
{
|
688
|
+
Offset *offsets = tv->offsets;
|
689
|
+
int offset_cnt = tv->offset_cnt;
|
690
|
+
bool did_expansion = true;
|
691
|
+
int i;
|
692
|
+
/* fill in skipped offsets */
|
693
|
+
for (i = 1; i < offset_cnt; i++) {
|
694
|
+
if (offsets[i].start == 0) {
|
695
|
+
offsets[i].start = offsets[i-1].start;
|
696
|
+
}
|
697
|
+
if (offsets[i].end == 0) {
|
698
|
+
offsets[i].end = offsets[i-1].end;
|
699
|
+
}
|
700
|
+
}
|
701
|
+
|
702
|
+
while (did_expansion) {
|
703
|
+
did_expansion = false;
|
704
|
+
if (e->start_pos > 0
|
705
|
+
&& (e->end_offset - offsets[e->start_pos - 1].start) < len) {
|
706
|
+
e->start_pos--;
|
707
|
+
e->start_offset = offsets[e->start_pos].start;
|
708
|
+
did_expansion = true;
|
709
|
+
}
|
710
|
+
if (e->end_pos < (offset_cnt - 1)
|
711
|
+
&& (offsets[e->end_pos + 1].end - e->start_offset) < len) {
|
712
|
+
e->end_pos++;
|
713
|
+
e->end_offset = offsets[e->end_pos].end;
|
714
|
+
did_expansion = true;
|
715
|
+
}
|
716
|
+
}
|
717
|
+
return e;
|
718
|
+
}
|
719
|
+
|
720
|
+
static char *excerpt_get_str(Excerpt *e, MatchVector *mv,
|
721
|
+
LazyDocField *lazy_df,
|
722
|
+
const char *pre_tag,
|
723
|
+
const char *post_tag,
|
724
|
+
const char *ellipsis)
|
725
|
+
{
|
726
|
+
int i, len;
|
727
|
+
int last_offset = e->start_offset;
|
728
|
+
const int num_matches = e->end - e->start + 1;
|
729
|
+
const int pre_tag_len = (int)strlen(pre_tag);
|
730
|
+
const int post_tag_len = (int)strlen(post_tag);
|
731
|
+
const int ellipsis_len = (int)strlen(ellipsis);
|
732
|
+
char *excerpt_str = ALLOC_N(char,
|
733
|
+
10 + e->end_offset - e->start_offset
|
734
|
+
+ (num_matches * (pre_tag_len + post_tag_len))
|
735
|
+
+ (2 * ellipsis_len));
|
736
|
+
char *e_ptr = excerpt_str;
|
737
|
+
if (e->start_offset > 0) {
|
738
|
+
memcpy(e_ptr, ellipsis, ellipsis_len);
|
739
|
+
e_ptr += ellipsis_len;
|
740
|
+
}
|
741
|
+
for (i = e->start; i <= e->end; i++) {
|
742
|
+
MatchRange *mr = mv->matches + i;
|
743
|
+
len = mr->start_offset - last_offset;
|
744
|
+
lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
|
745
|
+
e_ptr += len;
|
746
|
+
memcpy(e_ptr, pre_tag, pre_tag_len);
|
747
|
+
e_ptr += pre_tag_len;
|
748
|
+
len = mr->end_offset - mr->start_offset;
|
749
|
+
lazy_df_get_bytes(lazy_df, e_ptr, mr->start_offset, len);
|
750
|
+
e_ptr += len;
|
751
|
+
memcpy(e_ptr, post_tag, post_tag_len);
|
752
|
+
e_ptr += post_tag_len;
|
753
|
+
last_offset = mr->end_offset;
|
754
|
+
}
|
755
|
+
len = e->end_offset - last_offset;
|
756
|
+
lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
|
757
|
+
e_ptr += len;
|
758
|
+
if (e->end_offset < lazy_df->len) {
|
759
|
+
memcpy(e_ptr, ellipsis, ellipsis_len);
|
760
|
+
e_ptr += ellipsis_len;
|
761
|
+
}
|
762
|
+
*e_ptr = '\0';
|
763
|
+
return excerpt_str;
|
764
|
+
}
|
765
|
+
|
766
|
+
char **searcher_highlight(Searcher *self,
|
767
|
+
Query *query,
|
768
|
+
const int doc_num,
|
769
|
+
const char *field,
|
770
|
+
const int excerpt_len,
|
771
|
+
const int num_excerpts,
|
772
|
+
const char *pre_tag,
|
773
|
+
const char *post_tag,
|
774
|
+
const char *ellipsis)
|
775
|
+
{
|
776
|
+
char **excerpt_strs = NULL;
|
777
|
+
TermVector *tv = self->get_term_vector(self, doc_num, field);
|
778
|
+
LazyDoc *lazy_doc = self->get_lazy_doc(self, doc_num);
|
779
|
+
LazyDocField *lazy_df = NULL;
|
780
|
+
if (lazy_doc) {
|
781
|
+
lazy_df = h_get(lazy_doc->field_dict, field);
|
782
|
+
}
|
783
|
+
if (tv && lazy_df && tv->term_cnt > 0 && tv->terms[0].positions != NULL
|
784
|
+
&& tv->offsets != NULL) {
|
785
|
+
MatchVector *mv;
|
786
|
+
query = self->rewrite(self, query);
|
787
|
+
mv = query->get_matchv_i(query, matchv_new(), tv);
|
788
|
+
if (mv->size > 0) {
|
789
|
+
Excerpt **excerpts = ALLOC_AND_ZERO_N(Excerpt *, num_excerpts);
|
790
|
+
int e_start, e_end, i, j;
|
791
|
+
MatchRange *matches = mv->matches;
|
792
|
+
double running_score = 0.0;
|
793
|
+
Offset *offsets = tv->offsets;
|
794
|
+
PriorityQueue *excerpt_pq;
|
795
|
+
|
796
|
+
matchv_compact_with_breaks(mv);
|
797
|
+
matchv_set_offsets(mv, offsets);
|
798
|
+
excerpt_pq = pq_new(mv->size, (lt_ft)&excerpt_lt, &free);
|
799
|
+
/* add all possible excerpts to the priority queue */
|
800
|
+
|
801
|
+
for (e_start = 0, e_end = 1; e_start < mv->size; e_start++) {
|
802
|
+
const int start_offset = matches[e_start].start_offset;
|
803
|
+
if (e_start >= e_end) {
|
804
|
+
e_end = e_start + 1;
|
805
|
+
}
|
806
|
+
running_score += matches[e_start].score;
|
807
|
+
while (e_end < mv->size && (matches[e_end].end_offset
|
808
|
+
<= start_offset + excerpt_len)) {
|
809
|
+
running_score += matches[e_end].score;
|
810
|
+
e_end++;
|
811
|
+
}
|
812
|
+
pq_push(excerpt_pq,
|
813
|
+
excerpt_new(e_start, e_end - 1, running_score));
|
814
|
+
/* - 0.1 so that earlier matches take priority */
|
815
|
+
running_score -= matches[e_start].score;
|
816
|
+
}
|
817
|
+
|
818
|
+
for (i = 0; i < num_excerpts && excerpt_pq->size > 0; i++) {
|
819
|
+
excerpts[i] = pq_pop(excerpt_pq);
|
820
|
+
if (i < num_excerpts - 1) {
|
821
|
+
/* set match ranges alread included to 0 */
|
822
|
+
Excerpt *e = excerpts[i];
|
823
|
+
for (j = e->start; j <= e->end; j++) {
|
824
|
+
matches[j].score = 0.0;
|
825
|
+
}
|
826
|
+
e = NULL;
|
827
|
+
while (e != (Excerpt *)pq_top(excerpt_pq)) {
|
828
|
+
e = pq_top(excerpt_pq);
|
829
|
+
excerpt_recalc_score(e, mv);
|
830
|
+
pq_down(excerpt_pq);
|
831
|
+
}
|
832
|
+
}
|
833
|
+
}
|
834
|
+
|
835
|
+
qsort(excerpts, i, sizeof(Excerpt *), &excerpt_start_cmp);
|
836
|
+
for (j = 0; j < i; j++) {
|
837
|
+
Excerpt *e = excerpts[j];
|
838
|
+
e->start_pos = matches[e->start].start;
|
839
|
+
e->end_pos = matches[e->end].end;
|
840
|
+
e->start_offset = offsets[e->start_pos].start;
|
841
|
+
e->end_offset = offsets[e->end_pos].end;
|
842
|
+
}
|
843
|
+
|
844
|
+
if (i < num_excerpts) {
|
845
|
+
const int diff = num_excerpts - i;
|
846
|
+
memmove(excerpts + (diff), excerpts,
|
847
|
+
i * sizeof(Excerpt *));
|
848
|
+
for (j = 0; j < diff; j++) {
|
849
|
+
/* these new excerpts will grow into one long excerpt at
|
850
|
+
* the start */
|
851
|
+
excerpts[j] = ALLOC_AND_ZERO(Excerpt);
|
852
|
+
excerpts[j]->end = -1;
|
853
|
+
}
|
854
|
+
}
|
855
|
+
|
856
|
+
excerpt_strs = ary_new_type_capa(char *, num_excerpts);
|
857
|
+
/* merge excerpts where possible */
|
858
|
+
for (i = 0; i < num_excerpts;) {
|
859
|
+
Excerpt *ei = excerpts[i];
|
860
|
+
int merged = 1; /* 1 means a single excerpt, ie no merges */
|
861
|
+
for (j = i + 1; j < num_excerpts; j++) {
|
862
|
+
Excerpt *ej = excerpts[j];
|
863
|
+
if ((ej->end_offset - ei->start_offset)
|
864
|
+
< (j - i + 1) * excerpt_len) {
|
865
|
+
ei->end = ej->end;
|
866
|
+
ei->end_pos = ej->end_pos;
|
867
|
+
ei->end_offset = ej->end_offset;
|
868
|
+
merged = j - i + 1;
|
869
|
+
}
|
870
|
+
}
|
871
|
+
excerpt_expand(ei, merged * excerpt_len, tv);
|
872
|
+
ary_push(excerpt_strs,
|
873
|
+
excerpt_get_str(ei, mv, lazy_df,
|
874
|
+
pre_tag, post_tag, ellipsis));
|
875
|
+
i += merged;
|
876
|
+
}
|
877
|
+
for (i = 0; i < num_excerpts; i++) {
|
878
|
+
free(excerpts[i]);
|
879
|
+
}
|
880
|
+
free(excerpts);
|
881
|
+
pq_destroy(excerpt_pq);
|
882
|
+
matchv_destroy(mv);
|
883
|
+
}
|
884
|
+
q_deref(query);
|
885
|
+
}
|
886
|
+
if (tv) tv_destroy(tv);
|
887
|
+
if (lazy_doc) lazy_doc_close(lazy_doc);
|
888
|
+
return excerpt_strs;
|
451
889
|
}
|
452
890
|
|
891
|
+
/*
 * Default create_weight callback: delegates to q_weight() for this
 * query / searcher pair.
 */
static Weight *sea_create_weight(Searcher *self, Query *query)
{
    Weight *weight = q_weight(query, self);

    return weight;
}
|
453
895
|
|
454
|
-
static
|
896
|
+
static void sea_check_args(int num_docs, int first_doc)
|
455
897
|
{
|
456
|
-
|
898
|
+
if (num_docs <= 0) {
|
899
|
+
RAISE(ARG_ERROR, ":num_docs was set to %d but should be greater "
|
900
|
+
"than 0 : %d <= 0", num_docs, num_docs);
|
901
|
+
}
|
902
|
+
|
903
|
+
if (first_doc < 0) {
|
904
|
+
RAISE(ARG_ERROR, ":first_doc was set to %d but should be greater "
|
905
|
+
"than or equal to 0 : %d < 0", first_doc, first_doc);
|
906
|
+
}
|
457
907
|
}
|
458
908
|
|
459
|
-
static
|
909
|
+
/*
 * Default get_similarity callback: returns the similarity stored in the
 * searcher.
 */
static Similarity *sea_get_similarity(Searcher *self)
{
    Similarity *sim = self->similarity;

    return sim;
}
|
463
913
|
|
464
|
-
|
914
|
+
/***************************************************************************
|
915
|
+
*
|
916
|
+
* IndexSearcher
|
917
|
+
*
|
918
|
+
***************************************************************************/
|
919
|
+
|
920
|
+
/* Cast a generic Searcher pointer to IndexSearcher * (unchecked). */
#define ISEA(searcher) ((IndexSearcher *)(searcher))
|
921
|
+
|
922
|
+
/*
 * doc_freq callback of IndexSearcher: forwards to ir_doc_freq() on the
 * searcher's index reader.
 */
int isea_doc_freq(Searcher *self, const char *field, const char *term)
{
    IndexReader *ir = ISEA(self)->ir;

    return ir_doc_freq(ir, field, term);
}
|
468
926
|
|
469
|
-
static
|
470
|
-
int num_docs, Filter *filter, Sort *sort)
|
927
|
+
/*
 * get_doc callback of IndexSearcher: fetches document +doc_num+ through the
 * index reader's get_doc method.
 */
static Document *isea_get_doc(Searcher *self, int doc_num)
{
    IndexReader *reader = ISEA(self)->ir;

    return reader->get_doc(reader, doc_num);
}
|
485
932
|
|
933
|
+
/*
 * get_lazy_doc callback of IndexSearcher: fetches document +doc_num+
 * through the index reader's get_lazy_doc method.
 */
static LazyDoc *isea_get_lazy_doc(Searcher *self, int doc_num)
{
    IndexReader *reader = ISEA(self)->ir;

    return reader->get_lazy_doc(reader, doc_num);
}
|
486
938
|
|
487
|
-
|
488
|
-
|
939
|
+
/*
 * max_doc callback of IndexSearcher: returns the index reader's max_doc
 * value.
 */
static int isea_max_doc(Searcher *self)
{
    IndexReader *reader = ISEA(self)->ir;

    return reader->max_doc(reader);
}
|
489
944
|
|
490
|
-
|
491
|
-
|
945
|
+
/*
 * True when the scorer's current document must be skipped: either the
 * filter bit vector is present and the document's bit is not set, or the
 * filter function is present and returns false for the document.
 *
 * Every parameter is parenthesised so that any expression can be passed
 * safely. The arguments are evaluated more than once, so they must be free
 * of side effects.
 */
#define IS_FILTERED(bits, filter_func, scorer, searcher) \
    (((bits) && !bv_get((bits), (scorer)->doc)) \
     || ((filter_func) \
         && !(filter_func)((scorer)->doc, (scorer)->score(scorer), (searcher))))
|
492
949
|
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
|
499
|
-
|
950
|
+
static TopDocs *isea_search_w(Searcher *self,
|
951
|
+
Weight *weight,
|
952
|
+
int first_doc,
|
953
|
+
int num_docs,
|
954
|
+
Filter *filter,
|
955
|
+
Sort *sort,
|
956
|
+
filter_ft filter_func,
|
957
|
+
bool load_fields)
|
958
|
+
{
|
959
|
+
int max_size = first_doc + num_docs;
|
960
|
+
int i;
|
961
|
+
Scorer *scorer;
|
962
|
+
Hit **score_docs = NULL;
|
963
|
+
Hit hit;
|
964
|
+
int total_hits = 0;
|
965
|
+
float score, max_score = 0.0;
|
966
|
+
BitVector *bits = (filter
|
967
|
+
? filt_get_bv(filter, ISEA(self)->ir)
|
968
|
+
: NULL);
|
969
|
+
Hit *(*hq_pop)(PriorityQueue *pq);
|
970
|
+
void (*hq_insert)(PriorityQueue *pq, Hit *hit);
|
971
|
+
void (*hq_destroy)(PriorityQueue *self);
|
972
|
+
PriorityQueue *hq;
|
500
973
|
|
501
|
-
|
502
|
-
hq = fshq_pq_create(max_size, sort, self->ir);
|
503
|
-
hq_pop = &fshq_pq_pop;
|
504
|
-
hq_insert = &fshq_pq_insert;
|
505
|
-
hq_destroy = &fshq_pq_destroy;
|
506
|
-
} else {
|
507
|
-
hq = pq_create(max_size, &hit_less_than);
|
508
|
-
hq_pop = &hit_pq_pop;
|
509
|
-
hq_insert = &hit_pq_insert;
|
510
|
-
hq_destroy = &pq_destroy;
|
511
|
-
}
|
974
|
+
sea_check_args(num_docs, first_doc);
|
512
975
|
|
513
|
-
|
514
|
-
if (
|
515
|
-
|
516
|
-
|
517
|
-
hit.doc = scorer->doc; hit.score = score;
|
518
|
-
hq_insert(hq, &hit);
|
519
|
-
}
|
520
|
-
scorer->destroy(scorer);
|
521
|
-
weight->destroy(weight);
|
976
|
+
scorer = weight->scorer(weight, ISEA(self)->ir);
|
977
|
+
if (!scorer) {
|
978
|
+
return td_new(0, 0, NULL, 0.0);
|
979
|
+
}
|
522
980
|
|
523
|
-
|
524
|
-
|
525
|
-
|
981
|
+
if (sort) {
|
982
|
+
hq = fshq_pq_new(max_size, sort, ISEA(self)->ir);
|
983
|
+
hq_insert = &fshq_pq_insert;
|
984
|
+
hq_destroy = &fshq_pq_destroy;
|
985
|
+
if (load_fields) {
|
986
|
+
hq_pop = &fshq_pq_pop_fd;
|
987
|
+
}
|
988
|
+
else {
|
989
|
+
hq_pop = &fshq_pq_pop;
|
990
|
+
}
|
991
|
+
}
|
992
|
+
else {
|
993
|
+
hq = pq_new(max_size, (lt_ft)&hit_less_than, &free);
|
994
|
+
hq_pop = &hit_pq_pop;
|
995
|
+
hq_insert = &hit_pq_insert;
|
996
|
+
hq_destroy = &pq_destroy;
|
526
997
|
}
|
527
|
-
|
528
|
-
|
529
|
-
|
530
|
-
|
531
|
-
|
998
|
+
|
999
|
+
while (scorer->next(scorer)) {
|
1000
|
+
if (IS_FILTERED(bits, filter_func, scorer, self)) {
|
1001
|
+
continue;
|
1002
|
+
}
|
1003
|
+
total_hits++;
|
1004
|
+
score = scorer->score(scorer);
|
1005
|
+
if (score > max_score) max_score = score;
|
1006
|
+
hit.doc = scorer->doc; hit.score = score;
|
1007
|
+
hq_insert(hq, &hit);
|
532
1008
|
}
|
533
|
-
|
534
|
-
num_docs = 0;
|
535
|
-
}
|
536
|
-
pq_clear(hq);
|
537
|
-
hq_destroy(hq);
|
1009
|
+
scorer->destroy(scorer);
|
538
1010
|
|
539
|
-
|
540
|
-
|
1011
|
+
if (hq->size > first_doc) {
|
1012
|
+
if ((hq->size - first_doc) < num_docs) {
|
1013
|
+
num_docs = hq->size - first_doc;
|
1014
|
+
}
|
1015
|
+
score_docs = ALLOC_N(Hit *, num_docs);
|
1016
|
+
for (i = num_docs - 1; i >= 0; i--) {
|
1017
|
+
score_docs[i] = hq_pop(hq);
|
1018
|
+
/*
|
1019
|
+
hit = score_docs[i] = pq_pop(hq);
|
1020
|
+
printf("hit = %d-->%f\n", hit->doc, hit->score);
|
1021
|
+
*/
|
1022
|
+
}
|
1023
|
+
}
|
1024
|
+
else {
|
1025
|
+
num_docs = 0;
|
1026
|
+
}
|
1027
|
+
pq_clear(hq);
|
1028
|
+
hq_destroy(hq);
|
1029
|
+
|
1030
|
+
return td_new(total_hits, num_docs, score_docs, max_score);
|
1031
|
+
}
|
1032
|
+
|
1033
|
+
static TopDocs *isea_search(Searcher *self,
|
1034
|
+
Query *query,
|
1035
|
+
int first_doc,
|
1036
|
+
int num_docs,
|
1037
|
+
Filter *filter,
|
1038
|
+
Sort *sort,
|
1039
|
+
filter_ft filter_func,
|
1040
|
+
bool load_fields)
|
1041
|
+
{
|
1042
|
+
TopDocs *td;
|
1043
|
+
Weight *weight = q_weight(query, self);
|
1044
|
+
td = isea_search_w(self, weight, first_doc, num_docs, filter,
|
1045
|
+
sort, filter_func, load_fields);
|
1046
|
+
weight->destroy(weight);
|
1047
|
+
return td;
|
541
1048
|
}
|
542
1049
|
|
543
|
-
static void
|
544
|
-
|
1050
|
+
static void isea_search_each_w(Searcher *self, Weight *weight, Filter *filter,
|
1051
|
+
filter_ft filter_func,
|
1052
|
+
void (*fn)(Searcher *, int, float, void *),
|
1053
|
+
void *arg)
|
545
1054
|
{
|
546
|
-
|
547
|
-
|
1055
|
+
Scorer *scorer;
|
1056
|
+
BitVector *bits = (filter
|
1057
|
+
? filt_get_bv(filter, ISEA(self)->ir)
|
1058
|
+
: NULL);
|
548
1059
|
|
549
|
-
|
550
|
-
|
551
|
-
|
552
|
-
|
553
|
-
}
|
1060
|
+
scorer = weight->scorer(weight, ISEA(self)->ir);
|
1061
|
+
if (!scorer) {
|
1062
|
+
return;
|
1063
|
+
}
|
554
1064
|
|
555
|
-
|
556
|
-
|
557
|
-
|
558
|
-
|
559
|
-
|
1065
|
+
while (scorer->next(scorer)) {
|
1066
|
+
if (IS_FILTERED(bits, filter_func, scorer, self)) {
|
1067
|
+
continue;
|
1068
|
+
}
|
1069
|
+
fn(self, scorer->doc, scorer->score(scorer), arg);
|
1070
|
+
}
|
1071
|
+
scorer->destroy(scorer);
|
560
1072
|
}
|
561
1073
|
|
562
|
-
static void
|
563
|
-
|
1074
|
+
static void isea_search_each(Searcher *self, Query *query, Filter *filter,
|
1075
|
+
filter_ft filter_func,
|
1076
|
+
void (*fn)(Searcher *, int, float, void *),
|
1077
|
+
void *arg)
|
564
1078
|
{
|
565
|
-
|
566
|
-
|
567
|
-
|
1079
|
+
Weight *weight = q_weight(query, self);
|
1080
|
+
isea_search_each_w(self, weight, filter, filter_func, fn, arg);
|
1081
|
+
weight->destroy(weight);
|
568
1082
|
}
|
569
1083
|
|
570
|
-
static Query *
|
1084
|
+
/*
 * rewrite callback of IndexSearcher: rewrites +original+ against the index
 * reader repeatedly until a rewrite returns the query it was given.
 *
 * Each intermediate query is dereferenced once after it has been rewritten.
 * The loop also keeps going while the query just dereferenced had a
 * reference count of 1 or less before the q_deref call.
 * NOTE(review): presumably q_deref frees the query at that point, which is
 * why the pointer comparison alone cannot be trusted then; confirm against
 * q_deref.
 *
 * Returns the final rewritten query.
 */
static Query *isea_rewrite(Searcher *self, Query *original)
{
    int was_destroyed = false;
    Query *current = original;
    Query *next = current->rewrite(current, ISEA(self)->ir);

    while (was_destroyed || (current != next)) {
        current = next;
        next = current->rewrite(current, ISEA(self)->ir);
        /* read ref_cnt before the deref below */
        was_destroyed = (current->ref_cnt <= 1);
        q_deref(current); /* destroy intermediate queries */
    }
    return current;
}
|
583
1097
|
|
584
|
-
static Explanation *
|
1098
|
+
/*
 * explain callback of IndexSearcher: builds the weight for +query+, asks it
 * to explain document +doc_num+ and destroys the weight again.
 */
static Explanation *isea_explain(Searcher *self, Query *query, int doc_num)
{
    Weight *query_weight = q_weight(query, self);
    Explanation *expl = query_weight->explain(query_weight, ISEA(self)->ir,
                                              doc_num);

    query_weight->destroy(query_weight);
    return expl;
}
|
591
1105
|
|
592
|
-
static Explanation *
|
1106
|
+
/*
 * explain_w callback of IndexSearcher: asks the given weight to explain
 * document +doc_num+. The weight is not destroyed.
 */
static Explanation *isea_explain_w(Searcher *self, Weight *w, int doc_num)
{
    IndexReader *reader = ISEA(self)->ir;

    return w->explain(w, reader, doc_num);
}
|
596
1110
|
|
597
|
-
static
|
1111
|
+
/*
 * get_term_vector callback of IndexSearcher: fetches the term vector of
 * +field+ in document +doc_num+ through the index reader.
 */
static TermVector *isea_get_term_vector(Searcher *self,
                                        const int doc_num,
                                        const char *field)
{
    IndexReader *reader = ISEA(self)->ir;

    return reader->term_vector(reader, doc_num, field);
}
|
601
1118
|
|
602
|
-
static void
|
1119
|
+
/*
 * close callback of IndexSearcher: closes the index reader when one is set
 * and close_ir is true, then frees the searcher.
 */
static void isea_close(Searcher *self)
{
    IndexSearcher *isea = ISEA(self);

    if (isea->ir && isea->close_ir) {
        ir_close(isea->ir);
    }
    free(self);
}
|
609
1126
|
|
610
|
-
Searcher *
|
1127
|
+
/*
 * Create an IndexSearcher over +ir+.
 *
 * close_ir is set to true, so isea_close() will also close the reader. The
 * searcher gets the default similarity from sim_create_default().
 *
 * Returns the searcher as a generic Searcher pointer.
 */
Searcher *isea_new(IndexReader *ir)
{
    Searcher *self = (Searcher *)ecalloc(sizeof(IndexSearcher));

    /* IndexSearcher specific state */
    ISEA(self)->ir = ir;
    ISEA(self)->close_ir = true;

    self->similarity = sim_create_default();

    /* document access */
    self->doc_freq = &isea_doc_freq;
    self->get_doc = &isea_get_doc;
    self->get_lazy_doc = &isea_get_lazy_doc;
    self->max_doc = &isea_max_doc;
    self->get_term_vector = &isea_get_term_vector;

    /* searching */
    self->create_weight = &sea_create_weight;
    self->search = &isea_search;
    self->search_w = &isea_search_w;
    self->search_each = &isea_search_each;
    self->search_each_w = &isea_search_each_w;
    self->rewrite = &isea_rewrite;
    self->explain = &isea_explain;
    self->explain_w = &isea_explain_w;

    /* miscellaneous */
    self->get_similarity = &sea_get_similarity;
    self->close = &isea_close;

    return self;
}
|
631
1153
|
|
632
1154
|
/***************************************************************************
|
@@ -635,109 +1157,144 @@ Searcher *sea_create(IndexReader *ir)
|
|
635
1157
|
*
|
636
1158
|
***************************************************************************/
|
637
1159
|
|
638
|
-
|
639
|
-
|
640
|
-
|
1160
|
+
#define CDFSEA(searcher) ((CachedDFSearcher *)(searcher))
|
1161
|
+
typedef struct CachedDFSearcher
|
1162
|
+
{
|
1163
|
+
Searcher super;
|
1164
|
+
HashTable *df_map;
|
1165
|
+
int max_doc;
|
641
1166
|
} CachedDFSearcher;
|
642
1167
|
|
643
|
-
static int cdfsea_doc_freq(Searcher *self,
|
1168
|
+
/*
 * doc_freq callback of CachedDFSearcher: looks up the (field, text) pair in
 * df_map and returns the stored frequency, or 0 when there is no entry.
 */
static int cdfsea_doc_freq(Searcher *self, const char *field, const char *text)
{
    Term key;
    int *cached;

    /* the key is only used for the lookup; the strings are not modified */
    key.field = (char *)field;
    key.text = (char *)text;

    cached = (int *)h_get(CDFSEA(self)->df_map, &key);
    if (!cached) {
        return 0;
    }
    return *cached;
}
|
648
1177
|
|
649
1178
|
/*
 * get_doc is not supported by CachedDFSearcher: always raises
 * UNSUPPORTED_ERROR.
 */
static Document *cdfsea_get_doc(Searcher *self, int doc_num)
{
    (void)self;
    (void)doc_num;
    RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
    return NULL;
}
|
654
1184
|
|
655
1185
|
/*
 * max_doc callback of CachedDFSearcher: returns the max_doc value stored at
 * construction time.
 */
static int cdfsea_max_doc(Searcher *self)
{
    /* self is used here, so the former "(void)self;" marker was dropped */
    return CDFSEA(self)->max_doc;
}
|
659
1190
|
|
660
1191
|
/*
 * create_weight is not supported by CachedDFSearcher: always raises
 * UNSUPPORTED_ERROR.
 */
static Weight *cdfsea_create_weight(Searcher *self, Query *query)
{
    (void)self;
    (void)query;
    RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
    return NULL;
}
|
1197
|
+
|
1198
|
+
static TopDocs *cdfsea_search_w(Searcher *self, Weight *w, int fd, int nd,
|
1199
|
+
Filter *f, Sort *s, filter_ft ff, bool load)
|
1200
|
+
{
|
1201
|
+
(void)self; (void)w; (void)fd; (void)nd;
|
1202
|
+
(void)f; (void)s; (void)ff, (void)load;
|
1203
|
+
RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
|
1204
|
+
return NULL;
|
664
1205
|
}
|
665
1206
|
|
666
|
-
static TopDocs *cdfsea_search(Searcher *self, Query *
|
667
|
-
|
1207
|
+
static TopDocs *cdfsea_search(Searcher *self, Query *q, int fd, int nd,
|
1208
|
+
Filter *f, Sort *s, filter_ft ff, bool load)
|
668
1209
|
{
|
669
|
-
|
670
|
-
|
1210
|
+
(void)self; (void)q; (void)fd; (void)nd;
|
1211
|
+
(void)f; (void)s; (void)ff, (void)load;
|
1212
|
+
RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
|
1213
|
+
return NULL;
|
671
1214
|
}
|
672
1215
|
|
673
1216
|
static void cdfsea_search_each(Searcher *self, Query *query, Filter *filter,
|
674
|
-
|
1217
|
+
filter_ft ff,
|
1218
|
+
void (*fn)(Searcher *, int, float, void *),
|
1219
|
+
void *arg)
|
675
1220
|
{
|
676
|
-
|
1221
|
+
(void)self; (void)query; (void)filter; (void)ff; (void)fn; (void)arg;
|
1222
|
+
RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
|
677
1223
|
}
|
678
1224
|
|
679
1225
|
static void cdfsea_search_each_w(Searcher *self, Weight *w, Filter *filter,
|
680
|
-
|
1226
|
+
filter_ft ff,
|
1227
|
+
void (*fn)(Searcher *, int, float, void *),
|
1228
|
+
void *arg)
|
681
1229
|
{
|
682
|
-
|
1230
|
+
(void)self; (void)w; (void)filter; (void)ff; (void)fn; (void)arg;
|
1231
|
+
RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
|
683
1232
|
}
|
684
1233
|
|
685
1234
|
/*
 * rewrite callback of CachedDFSearcher: no rewriting is done; the original
 * query is returned with its reference count increased by one.
 */
static Query *cdfsea_rewrite(Searcher *self, Query *original)
{
    (void)self;
    original->ref_cnt += 1;
    return original;
}
|
690
1240
|
|
691
1241
|
/*
 * explain is not supported by CachedDFSearcher: always raises
 * UNSUPPORTED_ERROR.
 */
static Explanation *cdfsea_explain(Searcher *self, Query *query, int doc_num)
{
    (void)self;
    (void)query;
    (void)doc_num;
    RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
    return NULL;
}
|
696
1247
|
|
697
1248
|
/*
 * explain_w is not supported by CachedDFSearcher: always raises
 * UNSUPPORTED_ERROR.
 */
static Explanation *cdfsea_explain_w(Searcher *self, Weight *w, int doc_num)
{
    (void)self;
    (void)w;
    (void)doc_num;
    RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
    return NULL;
}
|
1254
|
+
|
1255
|
+
/*
 * get_term_vector is not supported by CachedDFSearcher: always raises
 * UNSUPPORTED_ERROR.
 */
static TermVector *cdfsea_get_term_vector(Searcher *self, const int doc_num,
                                          const char *field)
{
    (void)self;
    (void)doc_num;
    (void)field;
    RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
    return NULL;
}
|
702
1262
|
|
703
1263
|
/*
 * get_similarity is not supported by CachedDFSearcher: always raises
 * UNSUPPORTED_ERROR.
 */
static Similarity *cdfsea_get_similarity(Searcher *self)
{
    (void)self;
    RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);

    return NULL;
}
|
708
1269
|
|
709
1270
|
/*
 * close callback of CachedDFSearcher: destroys the document-frequency map
 * with h_destroy(), then frees the searcher.
 */
static void cdfsea_close(Searcher *self)
{
    HashTable *df_map = CDFSEA(self)->df_map;

    h_destroy(df_map);
    free(self);
}
|
716
1275
|
|
717
|
-
Searcher *
|
1276
|
+
/*
 * Create a CachedDFSearcher around a document-frequency map and a max_doc
 * value. cdfsea_close() destroys +df_map+ together with the searcher.
 *
 * Returns the searcher as a generic Searcher pointer.
 */
static Searcher *cdfsea_new(HashTable *df_map, int max_doc)
{
    Searcher *self = (Searcher *)ecalloc(sizeof(CachedDFSearcher));

    /* cached statistics */
    CDFSEA(self)->max_doc = max_doc;
    CDFSEA(self)->df_map = df_map;

    /* callbacks backed by the cached statistics */
    self->doc_freq = &cdfsea_doc_freq;
    self->max_doc = &cdfsea_max_doc;
    self->rewrite = &cdfsea_rewrite;
    self->close = &cdfsea_close;

    /* callbacks that raise UNSUPPORTED_ERROR */
    self->get_doc = &cdfsea_get_doc;
    self->create_weight = &cdfsea_create_weight;
    self->search = &cdfsea_search;
    self->search_w = &cdfsea_search_w;
    self->search_each = &cdfsea_search_each;
    self->search_each_w = &cdfsea_search_each_w;
    self->explain = &cdfsea_explain;
    self->explain_w = &cdfsea_explain_w;
    self->get_term_vector = &cdfsea_get_term_vector;
    self->get_similarity = &cdfsea_get_similarity;

    return self;
}
|
742
1299
|
|
743
1300
|
/***************************************************************************
|
@@ -746,301 +1303,367 @@ Searcher *cdfsea_create(HshTable *df_map, int max_doc)
|
|
746
1303
|
*
|
747
1304
|
***************************************************************************/
|
748
1305
|
|
1306
|
+
#define MSEA(searcher) ((MultiSearcher *)(searcher))
|
749
1307
|
static inline int msea_get_searcher_index(Searcher *self, int n)
|
750
1308
|
{
|
751
|
-
|
752
|
-
|
753
|
-
|
754
|
-
|
1309
|
+
MultiSearcher *msea = MSEA(self);
|
1310
|
+
int lo = 0; /* search starts array */
|
1311
|
+
int hi = msea->s_cnt - 1; /* for 1st element < n, return its index */
|
1312
|
+
int mid, mid_val;
|
755
1313
|
|
756
|
-
|
757
|
-
|
758
|
-
|
759
|
-
|
760
|
-
|
761
|
-
|
762
|
-
|
763
|
-
|
764
|
-
|
765
|
-
|
766
|
-
|
767
|
-
|
1314
|
+
while (hi >= lo) {
|
1315
|
+
mid = (lo + hi) >> 1;
|
1316
|
+
mid_val = msea->starts[mid];
|
1317
|
+
if (n < mid_val) {
|
1318
|
+
hi = mid - 1;
|
1319
|
+
}
|
1320
|
+
else if (n > mid_val) {
|
1321
|
+
lo = mid + 1;
|
1322
|
+
}
|
1323
|
+
else { /* found a match */
|
1324
|
+
while (((mid+1) < msea->s_cnt)
|
1325
|
+
&& (msea->starts[mid+1] == mid_val)) {
|
1326
|
+
mid++; /* scan to last match */
|
1327
|
+
}
|
1328
|
+
return mid;
|
1329
|
+
}
|
768
1330
|
}
|
769
|
-
|
770
|
-
return hi;
|
1331
|
+
return hi;
|
771
1332
|
}
|
772
1333
|
|
773
|
-
static int msea_doc_freq(Searcher *self,
|
1334
|
+
static int msea_doc_freq(Searcher *self, const char *field, const char *term)
|
774
1335
|
{
|
775
|
-
|
776
|
-
|
777
|
-
|
778
|
-
|
779
|
-
|
780
|
-
|
781
|
-
|
782
|
-
}
|
1336
|
+
int i;
|
1337
|
+
int doc_freq = 0;
|
1338
|
+
MultiSearcher *msea = MSEA(self);
|
1339
|
+
for (i = 0; i < msea->s_cnt; i++) {
|
1340
|
+
Searcher *s = msea->searchers[i];
|
1341
|
+
doc_freq += s->doc_freq(s, field, term);
|
1342
|
+
}
|
783
1343
|
|
784
|
-
|
1344
|
+
return doc_freq;
|
785
1345
|
}
|
786
1346
|
|
787
1347
|
static Document *msea_get_doc(Searcher *self, int doc_num)
|
788
1348
|
{
|
789
|
-
|
790
|
-
|
791
|
-
|
792
|
-
|
1349
|
+
MultiSearcher *msea = MSEA(self);
|
1350
|
+
int i = msea_get_searcher_index(self, doc_num);
|
1351
|
+
Searcher *s = msea->searchers[i];
|
1352
|
+
return s->get_doc(s, doc_num - msea->starts[i]);
|
1353
|
+
}
|
1354
|
+
|
1355
|
+
static LazyDoc *msea_get_lazy_doc(Searcher *self, int doc_num)
|
1356
|
+
{
|
1357
|
+
MultiSearcher *msea = MSEA(self);
|
1358
|
+
int i = msea_get_searcher_index(self, doc_num);
|
1359
|
+
Searcher *s = msea->searchers[i];
|
1360
|
+
return s->get_lazy_doc(s, doc_num - msea->starts[i]);
|
793
1361
|
}
|
794
1362
|
|
795
1363
|
static int msea_max_doc(Searcher *self)
|
796
1364
|
{
|
797
|
-
|
1365
|
+
return MSEA(self)->max_doc;
|
1366
|
+
}
|
1367
|
+
|
1368
|
+
static int *msea_get_doc_freqs(Searcher *self, HashSet *terms)
|
1369
|
+
{
|
1370
|
+
int i;
|
1371
|
+
const int num_terms = terms->size;
|
1372
|
+
int *doc_freqs = ALLOC_N(int, num_terms);
|
1373
|
+
for (i = 0; i < num_terms; i++) {
|
1374
|
+
Term *t = (Term *)terms->elems[i];
|
1375
|
+
doc_freqs[i] = msea_doc_freq(self, t->field, t->text);
|
1376
|
+
}
|
1377
|
+
return doc_freqs;
|
798
1378
|
}
|
799
1379
|
|
800
1380
|
static Weight *msea_create_weight(Searcher *self, Query *query)
|
801
1381
|
{
|
802
|
-
|
803
|
-
|
804
|
-
|
805
|
-
|
806
|
-
|
807
|
-
|
808
|
-
|
809
|
-
rq->extract_terms(rq, terms);
|
810
|
-
dfs = self->doc_freqs(self, (Term **)terms->elems, terms->size);
|
1382
|
+
int i, *doc_freqs;
|
1383
|
+
Searcher *cdfsea;
|
1384
|
+
Weight *w;
|
1385
|
+
HashTable *df_map = h_new((hash_ft)&term_hash, (eq_ft)&term_eq,
|
1386
|
+
(free_ft)NULL, free);
|
1387
|
+
Query *rewritten_query = self->rewrite(self, query);
|
1388
|
+
HashSet *terms = term_set_new();
|
811
1389
|
|
812
|
-
|
813
|
-
|
814
|
-
}
|
815
|
-
/* don't destroy the individual terms, only the HashSet */
|
816
|
-
hs_destroy(terms);
|
817
|
-
free(dfs);
|
1390
|
+
rewritten_query->extract_terms(rewritten_query, terms);
|
1391
|
+
doc_freqs = msea_get_doc_freqs(self, terms);
|
818
1392
|
|
819
|
-
|
1393
|
+
for (i = 0; i < terms->size; i++) {
|
1394
|
+
h_set(df_map, terms->elems[i], imalloc(doc_freqs[i]));
|
1395
|
+
}
|
1396
|
+
hs_destroy(terms);
|
1397
|
+
free(doc_freqs);
|
820
1398
|
|
821
|
-
|
822
|
-
q_deref(rq);
|
823
|
-
cdfsea->close(cdfsea);
|
1399
|
+
cdfsea = cdfsea_new(df_map, MSEA(self)->max_doc);
|
824
1400
|
|
825
|
-
|
1401
|
+
w = q_weight(rewritten_query, cdfsea);
|
1402
|
+
q_deref(rewritten_query);
|
1403
|
+
cdfsea->close(cdfsea);
|
1404
|
+
|
1405
|
+
return w;
|
826
1406
|
}
|
827
1407
|
|
828
1408
|
struct MultiSearchEachArg {
|
829
|
-
|
830
|
-
|
831
|
-
|
1409
|
+
int start;
|
1410
|
+
void *arg;
|
1411
|
+
void (*fn)(Searcher *, int, float, void *);
|
832
1412
|
};
|
833
1413
|
|
834
1414
|
void msea_search_each_i(Searcher *self, int doc_num, float score, void *arg)
|
835
1415
|
{
|
836
|
-
|
1416
|
+
struct MultiSearchEachArg *mse_arg = (struct MultiSearchEachArg *)arg;
|
837
1417
|
|
838
|
-
|
1418
|
+
mse_arg->fn(self, doc_num + mse_arg->start, score, mse_arg->arg);
|
839
1419
|
}
|
840
1420
|
|
841
1421
|
static void msea_search_each_w(Searcher *self, Weight *w, Filter *filter,
|
842
|
-
|
1422
|
+
filter_ft filter_func,
|
1423
|
+
void (*fn)(Searcher *, int, float, void *),
|
1424
|
+
void *arg)
|
843
1425
|
{
|
844
|
-
|
845
|
-
|
846
|
-
|
847
|
-
|
1426
|
+
int i;
|
1427
|
+
struct MultiSearchEachArg mse_arg;
|
1428
|
+
MultiSearcher *msea = MSEA(self);
|
1429
|
+
Searcher *s;
|
848
1430
|
|
849
|
-
|
850
|
-
|
851
|
-
|
852
|
-
|
853
|
-
|
854
|
-
|
855
|
-
|
1431
|
+
mse_arg.fn = fn;
|
1432
|
+
mse_arg.arg = arg;
|
1433
|
+
for (i = 0; i < msea->s_cnt; i++) {
|
1434
|
+
s = msea->searchers[i];
|
1435
|
+
mse_arg.start = msea->starts[i];
|
1436
|
+
s->search_each_w(s, w, filter, filter_func,
|
1437
|
+
&msea_search_each_i, &mse_arg);
|
1438
|
+
}
|
856
1439
|
}
|
857
1440
|
|
858
1441
|
static void msea_search_each(Searcher *self, Query *query, Filter *filter,
|
859
|
-
|
1442
|
+
filter_ft filter_func,
|
1443
|
+
void (*fn)(Searcher *, int, float, void *), void *arg)
|
860
1444
|
{
|
861
|
-
|
862
|
-
|
863
|
-
|
1445
|
+
Weight *w = q_weight(query, self);
|
1446
|
+
msea_search_each_w(self, w, filter, filter_func, fn, arg);
|
1447
|
+
w->destroy(w);
|
864
1448
|
}
|
865
1449
|
|
866
1450
|
struct MultiSearchArg {
|
867
|
-
|
868
|
-
|
869
|
-
|
1451
|
+
int total_hits, max_size;
|
1452
|
+
PriorityQueue *hq;
|
1453
|
+
void (*hq_insert)(PriorityQueue *pq, Hit *hit);
|
870
1454
|
};
|
871
1455
|
|
872
1456
|
void msea_search_i(Searcher *self, int doc_num, float score, void *arg)
|
873
1457
|
{
|
874
|
-
|
875
|
-
|
876
|
-
|
877
|
-
|
878
|
-
|
879
|
-
|
880
|
-
|
881
|
-
|
882
|
-
|
883
|
-
|
884
|
-
|
885
|
-
|
886
|
-
|
887
|
-
|
888
|
-
|
889
|
-
|
890
|
-
|
891
|
-
|
892
|
-
|
893
|
-
|
894
|
-
|
895
|
-
|
896
|
-
|
897
|
-
|
898
|
-
|
899
|
-
|
900
|
-
|
901
|
-
|
902
|
-
|
903
|
-
|
904
|
-
|
905
|
-
|
906
|
-
|
907
|
-
|
908
|
-
|
909
|
-
|
910
|
-
|
911
|
-
|
912
|
-
|
913
|
-
|
914
|
-
|
915
|
-
}
|
916
|
-
|
917
|
-
|
918
|
-
ms_arg.hq = hq;
|
919
|
-
ms_arg.total_hits = 0;
|
920
|
-
ms_arg.max_size = max_size;
|
921
|
-
ms_arg.hq_insert = hq_insert;
|
922
|
-
|
923
|
-
msea_search_each_w(self, weight, filter, msea_search_i, &ms_arg);
|
1458
|
+
struct MultiSearchArg *ms_arg = (struct MultiSearchArg *)arg;
|
1459
|
+
Hit hit;
|
1460
|
+
(void)self;
|
1461
|
+
|
1462
|
+
ms_arg->total_hits++;
|
1463
|
+
hit.doc = doc_num;
|
1464
|
+
hit.score = score;
|
1465
|
+
ms_arg->hq_insert(ms_arg->hq, &hit);
|
1466
|
+
}
|
1467
|
+
|
1468
|
+
static TopDocs *msea_search_w(Searcher *self,
|
1469
|
+
Weight *weight,
|
1470
|
+
int first_doc,
|
1471
|
+
int num_docs,
|
1472
|
+
Filter *filter,
|
1473
|
+
Sort *sort,
|
1474
|
+
filter_ft filter_func,
|
1475
|
+
bool load_fields)
|
1476
|
+
{
|
1477
|
+
int max_size = first_doc + num_docs;
|
1478
|
+
int i;
|
1479
|
+
int total_hits = 0;
|
1480
|
+
Hit **score_docs = NULL;
|
1481
|
+
Hit *(*hq_pop)(PriorityQueue *pq);
|
1482
|
+
void (*hq_insert)(PriorityQueue *pq, Hit *hit);
|
1483
|
+
PriorityQueue *hq;
|
1484
|
+
float max_score = 0.0;
|
1485
|
+
(void)load_fields; /* does it automatically */
|
1486
|
+
|
1487
|
+
sea_check_args(num_docs, first_doc);
|
1488
|
+
|
1489
|
+
if (sort) {
|
1490
|
+
hq = pq_new(max_size, (lt_ft)fdshq_lt, &free);
|
1491
|
+
hq_insert = (void (*)(PriorityQueue *pq, Hit *hit))&pq_insert;
|
1492
|
+
hq_pop = (Hit *(*)(PriorityQueue *pq))&pq_pop;
|
1493
|
+
}
|
1494
|
+
else {
|
1495
|
+
hq = pq_new(max_size, (lt_ft)&hit_less_than, &free);
|
1496
|
+
hq_insert = &hit_pq_multi_insert;
|
1497
|
+
hq_pop = &hit_pq_pop;
|
1498
|
+
}
|
924
1499
|
|
925
|
-
|
1500
|
+
/*if (sort) printf("sort = %s\n", sort_to_s(sort)); */
|
1501
|
+
for (i = 0; i < MSEA(self)->s_cnt; i++) {
|
1502
|
+
Searcher *s = MSEA(self)->searchers[i];
|
1503
|
+
TopDocs *td = s->search_w(s, weight, 0, max_size,
|
1504
|
+
filter, sort, filter_func, true);
|
1505
|
+
/*if (sort) printf("sort = %s\n", sort_to_s(sort)); */
|
1506
|
+
if (td->size > 0) {
|
1507
|
+
/*printf("td->size = %d %d\n", td->size, num_docs); */
|
1508
|
+
int j;
|
1509
|
+
int start = MSEA(self)->starts[i];
|
1510
|
+
for (j = 0; j < td->size; j++) {
|
1511
|
+
Hit *hit = td->hits[j];
|
1512
|
+
hit->doc += start;
|
1513
|
+
/*
|
1514
|
+
printf("adding hit = %d:%f\n", hit->doc, hit->score);
|
1515
|
+
*/
|
1516
|
+
hq_insert(hq, hit);
|
1517
|
+
}
|
1518
|
+
td->size = 0;
|
1519
|
+
if (td->max_score > max_score) max_score = td->max_score;
|
1520
|
+
}
|
1521
|
+
total_hits += td->total_hits;
|
1522
|
+
td_destroy(td);
|
1523
|
+
}
|
926
1524
|
|
927
|
-
|
928
|
-
|
929
|
-
|
1525
|
+
if (hq->size > first_doc) {
|
1526
|
+
if ((hq->size - first_doc) < num_docs) {
|
1527
|
+
num_docs = hq->size - first_doc;
|
1528
|
+
}
|
1529
|
+
score_docs = ALLOC_N(Hit *, num_docs);
|
1530
|
+
for (i = num_docs - 1; i >= 0; i--) {
|
1531
|
+
score_docs[i] = hq_pop(hq);
|
1532
|
+
/*
|
1533
|
+
Hit *hit = score_docs[i] = hq_pop(hq);
|
1534
|
+
printf("popped hit = %d-->%f\n", hit->doc, hit->score);
|
1535
|
+
*/
|
1536
|
+
}
|
930
1537
|
}
|
931
|
-
|
932
|
-
|
933
|
-
score_docs[i] = hq_pop(hq);
|
934
|
-
//hit = score_docs[i] = pq_pop(hq);
|
935
|
-
//printf("hit = %d-->%f\n", hit->doc, hit->score);
|
1538
|
+
else {
|
1539
|
+
num_docs = 0;
|
936
1540
|
}
|
937
|
-
|
938
|
-
|
939
|
-
|
940
|
-
|
941
|
-
|
1541
|
+
pq_clear(hq);
|
1542
|
+
pq_destroy(hq);
|
1543
|
+
|
1544
|
+
return td_new(total_hits, num_docs, score_docs, max_score);
|
1545
|
+
}
|
942
1546
|
|
943
|
-
|
944
|
-
|
1547
|
+
static TopDocs *msea_search(Searcher *self,
|
1548
|
+
Query *query,
|
1549
|
+
int first_doc,
|
1550
|
+
int num_docs,
|
1551
|
+
Filter *filter,
|
1552
|
+
Sort *sort,
|
1553
|
+
filter_ft filter_func,
|
1554
|
+
bool load_fields)
|
1555
|
+
{
|
1556
|
+
TopDocs *td;
|
1557
|
+
Weight *weight = q_weight(query, self);
|
1558
|
+
td = msea_search_w(self, weight, first_doc, num_docs, filter,
|
1559
|
+
sort, filter_func, load_fields);
|
1560
|
+
weight->destroy(weight);
|
1561
|
+
return td;
|
945
1562
|
}
|
946
1563
|
|
947
1564
|
static Query *msea_rewrite(Searcher *self, Query *original)
|
948
1565
|
{
|
949
|
-
|
950
|
-
|
951
|
-
|
952
|
-
|
1566
|
+
int i;
|
1567
|
+
Searcher *s;
|
1568
|
+
MultiSearcher *msea = MSEA(self);
|
1569
|
+
Query **queries = ALLOC_N(Query *, msea->s_cnt), *rewritten;
|
953
1570
|
|
954
|
-
|
955
|
-
|
956
|
-
|
957
|
-
|
958
|
-
|
1571
|
+
for (i = 0; i < msea->s_cnt; i++) {
|
1572
|
+
s = msea->searchers[i];
|
1573
|
+
queries[i] = s->rewrite(s, original);
|
1574
|
+
}
|
1575
|
+
rewritten = q_combine(queries, msea->s_cnt);
|
959
1576
|
|
960
|
-
|
961
|
-
|
962
|
-
|
963
|
-
|
964
|
-
|
1577
|
+
for (i = 0; i < msea->s_cnt; i++) {
|
1578
|
+
q_deref(queries[i]);
|
1579
|
+
}
|
1580
|
+
free(queries);
|
1581
|
+
return rewritten;
|
965
1582
|
}
|
966
1583
|
|
967
1584
|
static Explanation *msea_explain(Searcher *self, Query *query, int doc_num)
|
968
1585
|
{
|
969
|
-
|
970
|
-
|
971
|
-
|
972
|
-
|
973
|
-
|
974
|
-
|
975
|
-
|
1586
|
+
MultiSearcher *msea = MSEA(self);
|
1587
|
+
int i = msea_get_searcher_index(self, doc_num);
|
1588
|
+
Weight *w = q_weight(query, self);
|
1589
|
+
Searcher *s = msea->searchers[i];
|
1590
|
+
Explanation *e = s->explain_w(s, w, doc_num - msea->starts[i]);
|
1591
|
+
w->destroy(w);
|
1592
|
+
return e;
|
976
1593
|
}
|
977
1594
|
|
978
1595
|
static Explanation *msea_explain_w(Searcher *self, Weight *w, int doc_num)
|
979
1596
|
{
|
980
|
-
|
981
|
-
|
982
|
-
|
983
|
-
|
984
|
-
|
1597
|
+
MultiSearcher *msea = MSEA(self);
|
1598
|
+
int i = msea_get_searcher_index(self, doc_num);
|
1599
|
+
Searcher *s = msea->searchers[i];
|
1600
|
+
Explanation *e = s->explain_w(s, w, doc_num - msea->starts[i]);
|
1601
|
+
return e;
|
1602
|
+
}
|
1603
|
+
|
1604
|
+
static TermVector *msea_get_term_vector(Searcher *self, const int doc_num,
|
1605
|
+
const char *field)
|
1606
|
+
{
|
1607
|
+
MultiSearcher *msea = MSEA(self);
|
1608
|
+
int i = msea_get_searcher_index(self, doc_num);
|
1609
|
+
Searcher *s = msea->searchers[i];
|
1610
|
+
return s->get_term_vector(s, doc_num - msea->starts[i],
|
1611
|
+
field);
|
985
1612
|
}
|
986
1613
|
|
987
1614
|
static Similarity *msea_get_similarity(Searcher *self)
|
988
1615
|
{
|
989
|
-
|
1616
|
+
return self->similarity;
|
990
1617
|
}
|
991
1618
|
|
992
1619
|
static void msea_close(Searcher *self)
|
993
1620
|
{
|
994
|
-
|
995
|
-
|
996
|
-
|
997
|
-
|
998
|
-
|
999
|
-
|
1000
|
-
|
1621
|
+
int i;
|
1622
|
+
Searcher *s;
|
1623
|
+
MultiSearcher *msea = MSEA(self);
|
1624
|
+
if (msea->close_subs) {
|
1625
|
+
for (i = 0; i < msea->s_cnt; i++) {
|
1626
|
+
s = msea->searchers[i];
|
1627
|
+
s->close(s);
|
1628
|
+
}
|
1629
|
+
free(msea->searchers);
|
1001
1630
|
}
|
1002
|
-
free(msea->
|
1003
|
-
|
1004
|
-
free(msea->starts);
|
1005
|
-
free(msea);
|
1006
|
-
free(self);
|
1631
|
+
free(msea->starts);
|
1632
|
+
free(self);
|
1007
1633
|
}
|
1008
1634
|
|
1009
|
-
Searcher *
|
1635
|
+
Searcher *msea_new(Searcher **searchers, int s_cnt, bool close_subs)
|
1010
1636
|
{
|
1011
|
-
|
1012
|
-
|
1013
|
-
|
1014
|
-
|
1015
|
-
|
1016
|
-
|
1017
|
-
|
1637
|
+
int i, max_doc = 0;
|
1638
|
+
Searcher *self = (Searcher *)ecalloc(sizeof(MultiSearcher));
|
1639
|
+
int *starts = ALLOC_N(int, s_cnt + 1);
|
1640
|
+
for (i = 0; i < s_cnt; i++) {
|
1641
|
+
starts[i] = max_doc;
|
1642
|
+
max_doc += searchers[i]->max_doc(searchers[i]);
|
1643
|
+
}
|
1018
1644
|
starts[i] = max_doc;
|
1019
|
-
|
1020
|
-
|
1021
|
-
|
1022
|
-
|
1023
|
-
|
1024
|
-
|
1025
|
-
|
1026
|
-
|
1027
|
-
|
1028
|
-
|
1029
|
-
|
1030
|
-
|
1031
|
-
|
1032
|
-
|
1033
|
-
|
1034
|
-
|
1035
|
-
|
1036
|
-
|
1037
|
-
|
1038
|
-
|
1039
|
-
|
1040
|
-
|
1041
|
-
|
1042
|
-
|
1043
|
-
self->get_similarity = &msea_get_similarity;
|
1044
|
-
self->close = &msea_close;
|
1045
|
-
return self;
|
1645
|
+
|
1646
|
+
MSEA(self)->s_cnt = s_cnt;
|
1647
|
+
MSEA(self)->searchers = searchers;
|
1648
|
+
MSEA(self)->starts = starts;
|
1649
|
+
MSEA(self)->max_doc = max_doc;
|
1650
|
+
MSEA(self)->close_subs = close_subs;
|
1651
|
+
|
1652
|
+
self->similarity = sim_create_default();
|
1653
|
+
self->doc_freq = &msea_doc_freq;
|
1654
|
+
self->get_doc = &msea_get_doc;
|
1655
|
+
self->get_lazy_doc = &msea_get_lazy_doc;
|
1656
|
+
self->max_doc = &msea_max_doc;
|
1657
|
+
self->create_weight = &msea_create_weight;
|
1658
|
+
self->search = &msea_search;
|
1659
|
+
self->search_w = &msea_search_w;
|
1660
|
+
self->search_each = &msea_search_each;
|
1661
|
+
self->search_each_w = &msea_search_each_w;
|
1662
|
+
self->rewrite = &msea_rewrite;
|
1663
|
+
self->explain = &msea_explain;
|
1664
|
+
self->explain_w = &msea_explain_w;
|
1665
|
+
self->get_term_vector = &msea_get_term_vector;
|
1666
|
+
self->get_similarity = &msea_get_similarity;
|
1667
|
+
self->close = &msea_close;
|
1668
|
+
return self;
|
1046
1669
|
}
|