ferret 0.9.6 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +1 -1
- data/README +12 -24
- data/Rakefile +38 -54
- data/TODO +14 -17
- data/ext/analysis.c +982 -823
- data/ext/analysis.h +133 -76
- data/ext/array.c +96 -58
- data/ext/array.h +40 -13
- data/ext/bitvector.c +476 -118
- data/ext/bitvector.h +264 -22
- data/ext/compound_io.c +217 -229
- data/ext/defines.h +49 -0
- data/ext/document.c +107 -317
- data/ext/document.h +31 -65
- data/ext/except.c +81 -36
- data/ext/except.h +117 -55
- data/ext/extconf.rb +2 -9
- data/ext/ferret.c +211 -104
- data/ext/ferret.h +22 -11
- data/ext/filter.c +97 -82
- data/ext/fs_store.c +348 -367
- data/ext/global.c +226 -188
- data/ext/global.h +44 -26
- data/ext/hash.c +474 -391
- data/ext/hash.h +441 -68
- data/ext/hashset.c +124 -96
- data/ext/hashset.h +169 -20
- data/ext/helper.c +56 -5
- data/ext/helper.h +7 -0
- data/ext/inc/lang.h +29 -49
- data/ext/inc/threading.h +31 -0
- data/ext/ind.c +288 -278
- data/ext/ind.h +68 -0
- data/ext/index.c +5688 -0
- data/ext/index.h +663 -616
- data/ext/lang.h +29 -49
- data/ext/libstemmer.c +3 -3
- data/ext/mem_pool.c +84 -0
- data/ext/mem_pool.h +35 -0
- data/ext/posh.c +1006 -0
- data/ext/posh.h +1007 -0
- data/ext/priorityqueue.c +117 -194
- data/ext/priorityqueue.h +135 -39
- data/ext/q_boolean.c +1305 -1108
- data/ext/q_const_score.c +106 -93
- data/ext/q_filtered_query.c +138 -135
- data/ext/q_fuzzy.c +206 -242
- data/ext/q_match_all.c +94 -80
- data/ext/q_multi_term.c +663 -0
- data/ext/q_parser.c +667 -593
- data/ext/q_phrase.c +992 -555
- data/ext/q_prefix.c +72 -61
- data/ext/q_range.c +235 -210
- data/ext/q_span.c +1480 -1166
- data/ext/q_term.c +273 -246
- data/ext/q_wildcard.c +127 -114
- data/ext/r_analysis.c +1720 -711
- data/ext/r_index.c +3049 -0
- data/ext/r_qparser.c +433 -146
- data/ext/r_search.c +2934 -1993
- data/ext/r_store.c +372 -143
- data/ext/r_utils.c +941 -0
- data/ext/ram_store.c +330 -326
- data/ext/search.c +1291 -668
- data/ext/search.h +403 -702
- data/ext/similarity.c +91 -113
- data/ext/similarity.h +45 -30
- data/ext/sort.c +721 -484
- data/ext/stopwords.c +361 -273
- data/ext/store.c +556 -58
- data/ext/store.h +706 -126
- data/ext/tags +3578 -2780
- data/ext/term_vectors.c +352 -0
- data/ext/threading.h +31 -0
- data/ext/win32.h +54 -0
- data/lib/ferret.rb +5 -17
- data/lib/ferret/document.rb +130 -2
- data/lib/ferret/index.rb +577 -26
- data/lib/ferret/number_tools.rb +157 -0
- data/lib/ferret_version.rb +3 -0
- data/test/test_helper.rb +5 -13
- data/test/unit/analysis/tc_analyzer.rb +513 -1
- data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
- data/test/unit/index/tc_index.rb +183 -240
- data/test/unit/index/tc_index_reader.rb +312 -479
- data/test/unit/index/tc_index_writer.rb +397 -13
- data/test/unit/index/th_doc.rb +269 -206
- data/test/unit/query_parser/tc_query_parser.rb +40 -33
- data/test/unit/search/tc_filter.rb +59 -71
- data/test/unit/search/tc_fuzzy_query.rb +24 -16
- data/test/unit/search/tc_index_searcher.rb +23 -201
- data/test/unit/search/tc_multi_searcher.rb +78 -226
- data/test/unit/search/tc_search_and_sort.rb +93 -81
- data/test/unit/search/tc_sort.rb +23 -23
- data/test/unit/search/tc_sort_field.rb +7 -7
- data/test/unit/search/tc_spans.rb +51 -47
- data/test/unit/search/tm_searcher.rb +339 -0
- data/test/unit/store/tc_fs_store.rb +1 -1
- data/test/unit/store/tm_store_lock.rb +3 -3
- data/test/unit/tc_document.rb +81 -0
- data/test/unit/ts_analysis.rb +1 -1
- data/test/unit/ts_utils.rb +1 -1
- data/test/unit/utils/tc_bit_vector.rb +288 -0
- data/test/unit/utils/tc_number_tools.rb +117 -0
- data/test/unit/utils/tc_priority_queue.rb +106 -0
- metadata +140 -301
- data/CHANGELOG +0 -9
- data/ext/dummy.exe +0 -0
- data/ext/field.c +0 -408
- data/ext/frtio.h +0 -13
- data/ext/inc/except.h +0 -90
- data/ext/index_io.c +0 -382
- data/ext/index_rw.c +0 -2658
- data/ext/lang.c +0 -41
- data/ext/nix_io.c +0 -134
- data/ext/q_multi_phrase.c +0 -380
- data/ext/r_doc.c +0 -582
- data/ext/r_index_io.c +0 -1021
- data/ext/r_term.c +0 -219
- data/ext/term.c +0 -820
- data/ext/termdocs.c +0 -611
- data/ext/vector.c +0 -637
- data/ext/w32_io.c +0 -150
- data/lib/ferret/analysis.rb +0 -11
- data/lib/ferret/analysis/analyzers.rb +0 -112
- data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
- data/lib/ferret/analysis/token.rb +0 -100
- data/lib/ferret/analysis/token_filters.rb +0 -86
- data/lib/ferret/analysis/token_stream.rb +0 -26
- data/lib/ferret/analysis/tokenizers.rb +0 -112
- data/lib/ferret/analysis/word_list_loader.rb +0 -27
- data/lib/ferret/document/document.rb +0 -152
- data/lib/ferret/document/field.rb +0 -312
- data/lib/ferret/index/compound_file_io.rb +0 -338
- data/lib/ferret/index/document_writer.rb +0 -289
- data/lib/ferret/index/field_infos.rb +0 -279
- data/lib/ferret/index/fields_io.rb +0 -181
- data/lib/ferret/index/index.rb +0 -675
- data/lib/ferret/index/index_file_names.rb +0 -33
- data/lib/ferret/index/index_reader.rb +0 -503
- data/lib/ferret/index/index_writer.rb +0 -534
- data/lib/ferret/index/multi_reader.rb +0 -377
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
- data/lib/ferret/index/segment_infos.rb +0 -130
- data/lib/ferret/index/segment_merge_info.rb +0 -49
- data/lib/ferret/index/segment_merge_queue.rb +0 -16
- data/lib/ferret/index/segment_merger.rb +0 -358
- data/lib/ferret/index/segment_reader.rb +0 -412
- data/lib/ferret/index/segment_term_enum.rb +0 -169
- data/lib/ferret/index/segment_term_vector.rb +0 -58
- data/lib/ferret/index/term.rb +0 -53
- data/lib/ferret/index/term_buffer.rb +0 -83
- data/lib/ferret/index/term_doc_enum.rb +0 -291
- data/lib/ferret/index/term_enum.rb +0 -52
- data/lib/ferret/index/term_info.rb +0 -37
- data/lib/ferret/index/term_infos_io.rb +0 -321
- data/lib/ferret/index/term_vector_offset_info.rb +0 -20
- data/lib/ferret/index/term_vectors_io.rb +0 -553
- data/lib/ferret/query_parser.rb +0 -312
- data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
- data/lib/ferret/search.rb +0 -50
- data/lib/ferret/search/boolean_clause.rb +0 -100
- data/lib/ferret/search/boolean_query.rb +0 -299
- data/lib/ferret/search/boolean_scorer.rb +0 -294
- data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
- data/lib/ferret/search/conjunction_scorer.rb +0 -99
- data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
- data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
- data/lib/ferret/search/explanation.rb +0 -41
- data/lib/ferret/search/field_cache.rb +0 -215
- data/lib/ferret/search/field_doc.rb +0 -31
- data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
- data/lib/ferret/search/filter.rb +0 -11
- data/lib/ferret/search/filtered_query.rb +0 -130
- data/lib/ferret/search/filtered_term_enum.rb +0 -79
- data/lib/ferret/search/fuzzy_query.rb +0 -154
- data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
- data/lib/ferret/search/hit_collector.rb +0 -34
- data/lib/ferret/search/hit_queue.rb +0 -11
- data/lib/ferret/search/index_searcher.rb +0 -200
- data/lib/ferret/search/match_all_query.rb +0 -104
- data/lib/ferret/search/multi_phrase_query.rb +0 -216
- data/lib/ferret/search/multi_searcher.rb +0 -261
- data/lib/ferret/search/multi_term_query.rb +0 -65
- data/lib/ferret/search/non_matching_scorer.rb +0 -22
- data/lib/ferret/search/phrase_positions.rb +0 -55
- data/lib/ferret/search/phrase_query.rb +0 -214
- data/lib/ferret/search/phrase_scorer.rb +0 -152
- data/lib/ferret/search/prefix_query.rb +0 -54
- data/lib/ferret/search/query.rb +0 -140
- data/lib/ferret/search/query_filter.rb +0 -51
- data/lib/ferret/search/range_filter.rb +0 -103
- data/lib/ferret/search/range_query.rb +0 -139
- data/lib/ferret/search/req_excl_scorer.rb +0 -125
- data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
- data/lib/ferret/search/score_doc.rb +0 -38
- data/lib/ferret/search/score_doc_comparator.rb +0 -114
- data/lib/ferret/search/scorer.rb +0 -91
- data/lib/ferret/search/similarity.rb +0 -278
- data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
- data/lib/ferret/search/sort.rb +0 -112
- data/lib/ferret/search/sort_comparator.rb +0 -60
- data/lib/ferret/search/sort_field.rb +0 -91
- data/lib/ferret/search/spans.rb +0 -12
- data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
- data/lib/ferret/search/spans/span_first_query.rb +0 -79
- data/lib/ferret/search/spans/span_near_query.rb +0 -108
- data/lib/ferret/search/spans/span_not_query.rb +0 -130
- data/lib/ferret/search/spans/span_or_query.rb +0 -176
- data/lib/ferret/search/spans/span_query.rb +0 -25
- data/lib/ferret/search/spans/span_scorer.rb +0 -74
- data/lib/ferret/search/spans/span_term_query.rb +0 -105
- data/lib/ferret/search/spans/span_weight.rb +0 -84
- data/lib/ferret/search/spans/spans_enum.rb +0 -44
- data/lib/ferret/search/term_query.rb +0 -128
- data/lib/ferret/search/term_scorer.rb +0 -183
- data/lib/ferret/search/top_docs.rb +0 -36
- data/lib/ferret/search/top_field_docs.rb +0 -17
- data/lib/ferret/search/weight.rb +0 -54
- data/lib/ferret/search/wildcard_query.rb +0 -26
- data/lib/ferret/search/wildcard_term_enum.rb +0 -61
- data/lib/ferret/stemmers.rb +0 -1
- data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
- data/lib/ferret/store.rb +0 -5
- data/lib/ferret/store/buffered_index_io.rb +0 -190
- data/lib/ferret/store/directory.rb +0 -141
- data/lib/ferret/store/fs_store.rb +0 -381
- data/lib/ferret/store/index_io.rb +0 -245
- data/lib/ferret/store/ram_store.rb +0 -286
- data/lib/ferret/utils.rb +0 -8
- data/lib/ferret/utils/bit_vector.rb +0 -123
- data/lib/ferret/utils/date_tools.rb +0 -138
- data/lib/ferret/utils/number_tools.rb +0 -91
- data/lib/ferret/utils/parameter.rb +0 -41
- data/lib/ferret/utils/priority_queue.rb +0 -120
- data/lib/ferret/utils/string_helper.rb +0 -47
- data/lib/ferret/utils/thread_local.rb +0 -28
- data/lib/ferret/utils/weak_key_hash.rb +0 -60
- data/lib/rferret.rb +0 -37
- data/rake_utils/code_statistics.rb +0 -106
- data/test/benchmark/tb_ram_store.rb +0 -76
- data/test/benchmark/tb_rw_vint.rb +0 -26
- data/test/functional/thread_safety_index_test.rb +0 -81
- data/test/functional/thread_safety_test.rb +0 -137
- data/test/longrunning/tc_numbertools.rb +0 -60
- data/test/longrunning/tm_store.rb +0 -19
- data/test/unit/analysis/ctc_analyzer.rb +0 -532
- data/test/unit/analysis/data/wordfile +0 -6
- data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
- data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
- data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
- data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_stop_filter.rb +0 -14
- data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
- data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
- data/test/unit/analysis/tc_token.rb +0 -25
- data/test/unit/document/rtc_field.rb +0 -28
- data/test/unit/document/tc_document.rb +0 -47
- data/test/unit/document/tc_field.rb +0 -98
- data/test/unit/index/rtc_compound_file_io.rb +0 -107
- data/test/unit/index/rtc_field_infos.rb +0 -127
- data/test/unit/index/rtc_fields_io.rb +0 -167
- data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
- data/test/unit/index/rtc_segment_infos.rb +0 -74
- data/test/unit/index/rtc_segment_term_docs.rb +0 -17
- data/test/unit/index/rtc_segment_term_enum.rb +0 -60
- data/test/unit/index/rtc_segment_term_vector.rb +0 -71
- data/test/unit/index/rtc_term_buffer.rb +0 -57
- data/test/unit/index/rtc_term_info.rb +0 -19
- data/test/unit/index/rtc_term_infos_io.rb +0 -192
- data/test/unit/index/rtc_term_vectors_io.rb +0 -108
- data/test/unit/index/tc_term.rb +0 -27
- data/test/unit/index/tc_term_voi.rb +0 -18
- data/test/unit/search/rtc_similarity.rb +0 -37
- data/test/unit/search/rtc_sort_field.rb +0 -14
- data/test/unit/search/tc_multi_searcher2.rb +0 -126
- data/test/unit/store/rtc_fs_store.rb +0 -62
- data/test/unit/store/rtc_ram_store.rb +0 -15
- data/test/unit/store/rtm_store.rb +0 -150
- data/test/unit/store/rtm_store_lock.rb +0 -2
- data/test/unit/ts_document.rb +0 -2
- data/test/unit/utils/rtc_bit_vector.rb +0 -73
- data/test/unit/utils/rtc_date_tools.rb +0 -50
- data/test/unit/utils/rtc_number_tools.rb +0 -59
- data/test/unit/utils/rtc_parameter.rb +0 -40
- data/test/unit/utils/rtc_priority_queue.rb +0 -62
- data/test/unit/utils/rtc_string_helper.rb +0 -21
- data/test/unit/utils/rtc_thread.rb +0 -61
- data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
- data/test/utils/number_to_spoken.rb +0 -132
data/ext/similarity.c
CHANGED
@@ -1,172 +1,150 @@
|
|
1
|
-
#include
|
2
|
-
#include
|
1
|
+
#include "similarity.h"
|
2
|
+
#include "search.h"
|
3
|
+
#include "array.h"
|
4
|
+
#include "helper.h"
|
3
5
|
#include <math.h>
|
6
|
+
#include <stdlib.h>
|
7
|
+
#include <string.h>
|
4
8
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
char *p;
|
11
|
-
|
12
|
-
if (init) return;
|
13
|
-
init = 1;
|
14
|
-
p = (char*)&init;
|
15
|
-
|
16
|
-
if (p[0]) {
|
17
|
-
low_bit = 0;
|
18
|
-
low_mid_bit = 1;
|
19
|
-
high_mid_bit = 2;
|
20
|
-
high_bit = 3;
|
21
|
-
} else {
|
22
|
-
low_bit = 3;
|
23
|
-
low_mid_bit = 2;
|
24
|
-
high_mid_bit = 1;
|
25
|
-
high_bit = 0;
|
26
|
-
}
|
27
|
-
}
|
9
|
+
/****************************************************************************
|
10
|
+
*
|
11
|
+
* Term
|
12
|
+
*
|
13
|
+
****************************************************************************/
|
28
14
|
|
29
|
-
|
15
|
+
Term *term_new(const char *field, const char *text)
|
30
16
|
{
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
int mantissa = b & 0x07; // 0x07 = 7 = 0b00000111
|
36
|
-
int exponent = (b >> 3) & 0x1F; // 0x1f = 31 = 0b00011111
|
37
|
-
|
38
|
-
if (!low_mid_bit) setup_endian();
|
39
|
-
flt[low_bit] = flt[low_mid_bit] = 0;
|
40
|
-
flt[high_mid_bit] = mantissa << 5;
|
41
|
-
flt[high_bit] = exponent + 48;
|
42
|
-
return *((float *)flt);
|
43
|
-
}
|
17
|
+
Term *t = ALLOC(Term);
|
18
|
+
t->field = estrdup(field);
|
19
|
+
t->text = estrdup(text);
|
20
|
+
return t;
|
44
21
|
}
|
45
22
|
|
46
|
-
|
23
|
+
void term_destroy(Term *self)
|
47
24
|
{
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
int mantissa = (bits[high_mid_bit] & 0xEf) >> 5;
|
53
|
-
int exponent = (bits[high_bit] - 48);
|
54
|
-
|
55
|
-
if (exponent > 0x1f) {
|
56
|
-
exponent = 0x1f; // 0x1f = 31 = 0b00011111
|
57
|
-
mantissa = 0x07; // 0x07 = 7 = 0b00000111
|
58
|
-
}
|
25
|
+
free(self->text);
|
26
|
+
free(self->field);
|
27
|
+
free(self);
|
28
|
+
}
|
59
29
|
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
30
|
+
int term_eq(const void *t1, const void *t2)
|
31
|
+
{
|
32
|
+
return (strcmp(((Term *)t1)->text, ((Term *)t2)->text)) == 0 &&
|
33
|
+
(strcmp(((Term *)t1)->field, ((Term *)t2)->field) == 0);
|
34
|
+
}
|
64
35
|
|
65
|
-
|
66
|
-
|
36
|
+
ulong term_hash(const void *t)
|
37
|
+
{
|
38
|
+
return str_hash(((Term *)t)->text) * str_hash(((Term *)t)->field);
|
67
39
|
}
|
68
40
|
|
69
|
-
|
41
|
+
/****************************************************************************
|
42
|
+
*
|
43
|
+
* Similarity
|
44
|
+
*
|
45
|
+
****************************************************************************/
|
46
|
+
|
47
|
+
float simdef_length_norm(Similarity *s, const char *field, int num_terms)
|
70
48
|
{
|
71
|
-
|
49
|
+
(void)s;
|
50
|
+
(void)field;
|
51
|
+
return (float)(1.0 / sqrt(num_terms));
|
72
52
|
}
|
73
53
|
|
74
54
|
float simdef_query_norm(struct Similarity *s, float sum_of_squared_weights)
|
75
55
|
{
|
76
|
-
|
56
|
+
(void)s;
|
57
|
+
return (float)(1.0 / sqrt(sum_of_squared_weights));
|
77
58
|
}
|
78
59
|
|
79
60
|
float simdef_tf(struct Similarity *s, float freq)
|
80
61
|
{
|
81
|
-
|
62
|
+
(void)s;
|
63
|
+
return (float)sqrt(freq);
|
82
64
|
}
|
83
65
|
|
84
66
|
float simdef_sloppy_freq(struct Similarity *s, int distance)
|
85
67
|
{
|
86
|
-
|
68
|
+
(void)s;
|
69
|
+
return (float)(1.0 / (double)(distance + 1));
|
87
70
|
}
|
88
71
|
|
89
|
-
float simdef_idf_term(struct Similarity *s,
|
72
|
+
float simdef_idf_term(struct Similarity *s, const char *field, char *term,
|
73
|
+
Searcher *searcher)
|
90
74
|
{
|
91
|
-
|
75
|
+
return s->idf(s, searcher->doc_freq(searcher, field, term),
|
76
|
+
searcher->max_doc(searcher));
|
92
77
|
}
|
93
78
|
|
94
|
-
float simdef_idf_phrase(struct Similarity *s,
|
79
|
+
float simdef_idf_phrase(struct Similarity *s, const char *field,
|
80
|
+
PhrasePosition *positions,
|
81
|
+
int pp_cnt, Searcher *searcher)
|
95
82
|
{
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
83
|
+
float idf = 0.0;
|
84
|
+
int i, j;
|
85
|
+
for (i = 0; i < pp_cnt; i++) {
|
86
|
+
char **terms = positions[i].terms;
|
87
|
+
for (j = ary_size(terms) - 1; j >= 0; j--) {
|
88
|
+
idf += sim_idf_term(s, field, terms[j], searcher);
|
89
|
+
}
|
90
|
+
}
|
91
|
+
return idf;
|
102
92
|
}
|
103
93
|
|
104
94
|
float simdef_idf(struct Similarity *s, int doc_freq, int num_docs)
|
105
95
|
{
|
106
|
-
|
96
|
+
(void)s;
|
97
|
+
return (float)(log((float)num_docs/(float)(doc_freq+1)) + 1.0);
|
107
98
|
}
|
108
99
|
|
109
100
|
float simdef_coord(struct Similarity *s, int overlap, int max_overlap)
|
110
101
|
{
|
111
|
-
|
102
|
+
(void)s;
|
103
|
+
return (float)((double)overlap / (double)max_overlap);
|
112
104
|
}
|
113
105
|
|
114
106
|
float simdef_decode_norm(struct Similarity *s, uchar b)
|
115
107
|
{
|
116
|
-
|
108
|
+
return s->norm_table[b];
|
117
109
|
}
|
118
110
|
|
119
111
|
uchar simdef_encode_norm(struct Similarity *s, float f)
|
120
112
|
{
|
121
|
-
|
113
|
+
(void)s;
|
114
|
+
return float2byte(f);
|
122
115
|
}
|
123
116
|
|
124
117
|
void simdef_destroy(Similarity *s)
|
125
118
|
{
|
126
|
-
|
119
|
+
(void)s;
|
120
|
+
/* nothing to do here */
|
127
121
|
}
|
128
122
|
|
129
|
-
#ifdef WIN32
|
130
123
|
static Similarity default_similarity = {
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
124
|
+
NULL,
|
125
|
+
{0},
|
126
|
+
&simdef_length_norm,
|
127
|
+
&simdef_query_norm,
|
128
|
+
&simdef_tf,
|
129
|
+
&simdef_sloppy_freq,
|
130
|
+
&simdef_idf_term,
|
131
|
+
&simdef_idf_phrase,
|
132
|
+
&simdef_idf,
|
133
|
+
&simdef_coord,
|
134
|
+
&simdef_decode_norm,
|
135
|
+
&simdef_encode_norm,
|
136
|
+
&simdef_destroy
|
144
137
|
};
|
145
|
-
#else
|
146
|
-
static Similarity default_similarity = {
|
147
|
-
data:NULL,
|
148
|
-
length_norm:&simdef_length_norm,
|
149
|
-
query_norm:&simdef_query_norm,
|
150
|
-
tf:&simdef_tf,
|
151
|
-
sloppy_freq:&simdef_sloppy_freq,
|
152
|
-
idf_term:&simdef_idf_term,
|
153
|
-
idf_phrase:&simdef_idf_phrase,
|
154
|
-
idf:&simdef_idf,
|
155
|
-
coord:&simdef_coord,
|
156
|
-
decode_norm:&simdef_decode_norm,
|
157
|
-
encode_norm:&simdef_encode_norm,
|
158
|
-
destroy:&simdef_destroy
|
159
|
-
};
|
160
|
-
#endif
|
161
138
|
|
162
139
|
Similarity *sim_create_default()
|
163
140
|
{
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
141
|
+
int i;
|
142
|
+
if (!default_similarity.data) {
|
143
|
+
for (i = 0; i < 256; i++) {
|
144
|
+
default_similarity.norm_table[i] = byte2float((unsigned char)i);
|
145
|
+
}
|
168
146
|
|
169
|
-
|
170
|
-
|
171
|
-
|
147
|
+
default_similarity.data = &default_similarity;
|
148
|
+
}
|
149
|
+
return &default_similarity;
|
172
150
|
}
|
data/ext/similarity.h
CHANGED
@@ -9,18 +9,31 @@ typedef struct Searcher Searcher;
|
|
9
9
|
*
|
10
10
|
****************************************************************************/
|
11
11
|
|
12
|
-
|
13
|
-
|
14
|
-
|
12
|
+
#define term_set_new() \
|
13
|
+
hs_new((hash_ft)&term_hash, (eq_ft)&term_eq, (free_ft)&term_destroy)
|
14
|
+
|
15
|
+
typedef struct Term
|
16
|
+
{
|
17
|
+
char *field;
|
18
|
+
char *text;
|
15
19
|
} Term;
|
16
20
|
|
17
|
-
Term *
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
21
|
+
extern Term *term_new(const char *field, const char *text);
|
22
|
+
extern void term_destroy(Term *self);
|
23
|
+
extern int term_eq(const void *t1, const void *t2);
|
24
|
+
extern unsigned long term_hash(const void *t);
|
25
|
+
|
26
|
+
/***************************************************************************
|
27
|
+
*
|
28
|
+
* PhrasePosition
|
29
|
+
*
|
30
|
+
***************************************************************************/
|
31
|
+
|
32
|
+
typedef struct PhrasePosition
|
33
|
+
{
|
34
|
+
int pos;
|
35
|
+
char **terms;
|
36
|
+
} PhrasePosition;
|
24
37
|
|
25
38
|
/***************************************************************************
|
26
39
|
*
|
@@ -30,38 +43,40 @@ char *term_to_s(Term *term);
|
|
30
43
|
|
31
44
|
typedef struct Similarity Similarity;
|
32
45
|
|
33
|
-
struct Similarity
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
46
|
+
struct Similarity
|
47
|
+
{
|
48
|
+
void *data;
|
49
|
+
float norm_table[256];
|
50
|
+
float (*length_norm)(Similarity *self, const char *field, int num_terms);
|
51
|
+
float (*query_norm)(Similarity *self, float sum_of_squared_weights);
|
52
|
+
float (*tf)(Similarity *self, float freq);
|
53
|
+
float (*sloppy_freq)(Similarity *self, int distance);
|
54
|
+
float (*idf_term)(Similarity *self, const char *field, char *term,
|
55
|
+
Searcher *searcher);
|
56
|
+
float (*idf_phrase)(Similarity *self, const char *field,
|
57
|
+
PhrasePosition *positions,
|
58
|
+
int pp_cnt, Searcher *searcher);
|
59
|
+
float (*idf)(Similarity *self, int doc_freq, int num_docs);
|
60
|
+
float (*coord)(Similarity *self, int overlap, int max_overlap);
|
61
|
+
float (*decode_norm)(Similarity *self, unsigned char b);
|
62
|
+
unsigned char (*encode_norm)(Similarity *self, float f);
|
63
|
+
void (*destroy)(Similarity *self);
|
48
64
|
};
|
49
65
|
|
50
66
|
#define sim_length_norm(msim, field, num_terms) msim->length_norm(msim, field, num_terms)
|
51
67
|
#define sim_query_norm(msim, sosw) msim->query_norm(msim, sosw)
|
52
68
|
#define sim_tf(msim, freq) msim->tf(msim, freq)
|
53
69
|
#define sim_sloppy_freq(msim, distance) msim->sloppy_freq(msim, distance)
|
54
|
-
#define sim_idf_term(msim,
|
55
|
-
|
70
|
+
#define sim_idf_term(msim, field, term, searcher)\
|
71
|
+
msim->idf_term(msim, field, term, searcher)
|
72
|
+
#define sim_idf_phrase(msim, field, positions, pos_cnt, searcher)\
|
73
|
+
msim->idf_phrase(msim, field, positions, pos_cnt, searcher)
|
56
74
|
#define sim_idf(msim, doc_freq, num_docs) msim->idf(msim, doc_freq, num_docs)
|
57
75
|
#define sim_coord(msim, overlap, max_overlap) msim->coord(msim, overlap, max_overlap)
|
58
76
|
#define sim_decode_norm(msim, b) msim->decode_norm(msim, b)
|
59
77
|
#define sim_encode_norm(msim, f) msim->encode_norm(msim, f)
|
60
78
|
#define sim_destroy(msim) msim->destroy(msim)
|
61
79
|
|
62
|
-
float byte_to_float(uchar b);
|
63
|
-
uchar float_to_byte(float f);
|
64
|
-
|
65
80
|
Similarity *sim_create_default();
|
66
81
|
|
67
82
|
#endif
|
data/ext/sort.c
CHANGED
@@ -2,94 +2,96 @@
|
|
2
2
|
#include "search.h"
|
3
3
|
#include "index.h"
|
4
4
|
|
5
|
-
static char * const NO_TERM_ERROR_MSG = "no terms in field to sort by";
|
6
|
-
|
7
5
|
/***************************************************************************
|
8
6
|
*
|
9
7
|
* SortField
|
10
8
|
*
|
11
9
|
***************************************************************************/
|
12
10
|
|
13
|
-
|
11
|
+
ulong sort_field_hash(const void *p)
|
14
12
|
{
|
15
|
-
|
16
|
-
|
13
|
+
SortField *self = (SortField *)p;
|
14
|
+
return str_hash(self->field) ^ (self->type*37);
|
17
15
|
}
|
18
16
|
|
19
17
|
int sort_field_eq(const void *p1, const void *p2)
|
20
18
|
{
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
19
|
+
SortField *key1 = (SortField *)p1;
|
20
|
+
SortField *key2 = (SortField *)p2;
|
21
|
+
return (strcmp(key1->field, key2->field) == 0)
|
22
|
+
&& key1->type == key2->type;
|
23
|
+
}
|
24
|
+
|
25
|
+
static int sort_field_cache_eq(const void *p1, const void *p2)
|
26
|
+
{
|
27
|
+
SortField *key1 = (SortField *)p1;
|
28
|
+
SortField *key2 = (SortField *)p2;
|
29
|
+
int equal = (strcmp(key1->field, key2->field) == 0)
|
30
|
+
&& key1->type == key2->type;
|
31
|
+
|
32
|
+
return equal;
|
33
|
+
}
|
34
|
+
|
35
|
+
static SortField *sort_field_clone(SortField *self)
|
36
|
+
{
|
37
|
+
SortField *clone = ALLOC(SortField);
|
38
|
+
memcpy(clone, self, sizeof(SortField));
|
39
|
+
mutex_init(&clone->mutex, NULL);
|
40
|
+
clone->field = estrdup(self->field);
|
41
|
+
return clone;
|
42
|
+
}
|
43
|
+
|
44
|
+
static SortField *sort_field_alloc(char *field, int type, bool reverse)
|
45
|
+
{
|
46
|
+
SortField *self = ALLOC(SortField);
|
47
|
+
mutex_init(&self->mutex, NULL);
|
48
|
+
self->field = field ? estrdup(field) : NULL;
|
49
|
+
self->type = type;
|
50
|
+
self->reverse = reverse;
|
51
|
+
self->index = NULL;
|
52
|
+
self->destroy_index = &free;
|
53
|
+
self->compare = NULL;
|
54
|
+
return self;
|
55
|
+
}
|
56
|
+
|
57
|
+
SortField *sort_field_new(char *field, enum SORT_TYPE type, bool reverse)
|
58
|
+
{
|
59
|
+
SortField *sf = NULL;
|
60
|
+
switch (type) {
|
61
|
+
case SORT_TYPE_SCORE:
|
62
|
+
sf = sort_field_score_new(reverse);
|
63
|
+
break;
|
64
|
+
case SORT_TYPE_DOC:
|
65
|
+
sf = sort_field_doc_new(reverse);
|
66
|
+
break;
|
67
|
+
case SORT_TYPE_BYTE:
|
68
|
+
sf = sort_field_byte_new(field, reverse);
|
69
|
+
break;
|
70
|
+
case SORT_TYPE_INTEGER:
|
71
|
+
sf = sort_field_int_new(field, reverse);
|
72
|
+
break;
|
73
|
+
case SORT_TYPE_FLOAT:
|
74
|
+
sf = sort_field_float_new(field, reverse);
|
75
|
+
break;
|
76
|
+
case SORT_TYPE_STRING:
|
77
|
+
sf = sort_field_string_new(field, reverse);
|
78
|
+
break;
|
79
|
+
case SORT_TYPE_AUTO:
|
80
|
+
sf = sort_field_auto_new(field, reverse);
|
81
|
+
break;
|
82
|
+
}
|
83
|
+
return sf;
|
82
84
|
}
|
83
85
|
|
84
86
|
void sort_field_destroy(void *p)
|
85
87
|
{
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
88
|
+
SortField *self = (SortField *)p;
|
89
|
+
if (self->index) {
|
90
|
+
self->destroy_index(self->index);
|
91
|
+
}
|
92
|
+
free(self->field);
|
93
|
+
mutex_destroy(&self->mutex);
|
94
|
+
free(p);
|
93
95
|
}
|
94
96
|
|
95
97
|
/*
|
@@ -97,210 +99,304 @@ void sort_field_destroy(void *p)
|
|
97
99
|
*/
|
98
100
|
char *sort_field_to_s(SortField *self)
|
99
101
|
{
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
102
|
+
char *str;
|
103
|
+
char *type = NULL;
|
104
|
+
switch (self->type) {
|
105
|
+
case SORT_TYPE_SCORE:
|
106
|
+
type = "<SCORE>";
|
107
|
+
break;
|
108
|
+
case SORT_TYPE_DOC:
|
109
|
+
type = "<DOC>";
|
110
|
+
break;
|
111
|
+
case SORT_TYPE_BYTE:
|
112
|
+
type = "<byte>";
|
113
|
+
break;
|
114
|
+
case SORT_TYPE_INTEGER:
|
115
|
+
type = "<integer>";
|
116
|
+
break;
|
117
|
+
case SORT_TYPE_FLOAT:
|
118
|
+
type = "<float>";
|
119
|
+
break;
|
120
|
+
case SORT_TYPE_STRING:
|
121
|
+
type = "<string>";
|
122
|
+
break;
|
123
|
+
case SORT_TYPE_AUTO:
|
124
|
+
type = "<auto>";
|
125
|
+
break;
|
126
|
+
}
|
127
|
+
if (self->field) {
|
128
|
+
str = ALLOC_N(char, 10 + strlen(self->field) + strlen(type));
|
129
|
+
sprintf(str, "%s:%s%s", self->field, type, (self->reverse ? "!" : ""));
|
130
|
+
} else {
|
131
|
+
str = ALLOC_N(char, 10 + strlen(type));
|
132
|
+
sprintf(str, "%s%s", type, (self->reverse ? "!" : ""));
|
133
|
+
}
|
134
|
+
return str;
|
130
135
|
}
|
131
136
|
|
132
137
|
/***************************************************************************
|
133
138
|
* ScoreSortField
|
134
139
|
***************************************************************************/
|
135
140
|
|
141
|
+
void sf_score_get_val(void *index, Hit *hit, Comparable *comparable)
|
142
|
+
{
|
143
|
+
(void)index;
|
144
|
+
comparable->val.f = hit->score;
|
145
|
+
}
|
146
|
+
|
136
147
|
int sf_score_compare(void *index_ptr, Hit *hit2, Hit *hit1)
|
137
148
|
{
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
149
|
+
float val1 = hit1->score;
|
150
|
+
float val2 = hit2->score;
|
151
|
+
(void)index_ptr;
|
152
|
+
|
153
|
+
if (val1 > val2) return 1;
|
154
|
+
else if (val1 < val2) return -1;
|
155
|
+
else return 0;
|
143
156
|
}
|
144
157
|
|
145
|
-
SortField *
|
158
|
+
SortField *sort_field_score_new(bool reverse)
|
146
159
|
{
|
147
|
-
|
148
|
-
|
149
|
-
|
160
|
+
SortField *self = sort_field_alloc(NULL, SORT_TYPE_SCORE, reverse);
|
161
|
+
self->compare = &sf_score_compare;
|
162
|
+
self->get_val = &sf_score_get_val;
|
163
|
+
return self;
|
150
164
|
}
|
151
165
|
|
152
|
-
SortField SORT_FIELD_SCORE = {
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
/* compare
|
159
|
-
/*
|
160
|
-
|
161
|
-
|
166
|
+
const SortField SORT_FIELD_SCORE = {
|
167
|
+
MUTEX_INITIALIZER,
|
168
|
+
NULL, /* field */
|
169
|
+
SORT_TYPE_SCORE, /* type */
|
170
|
+
false, /* reverse */
|
171
|
+
NULL, /* index */
|
172
|
+
&sf_score_compare, /* compare */
|
173
|
+
&sf_score_get_val, /* get_val */
|
174
|
+
NULL, /* create_index */
|
175
|
+
NULL, /* destroy_index */
|
176
|
+
NULL, /* handle_term */
|
162
177
|
};
|
163
178
|
|
164
|
-
SortField SORT_FIELD_SCORE_REV = {
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
/* compare
|
171
|
-
/*
|
172
|
-
|
173
|
-
|
179
|
+
const SortField SORT_FIELD_SCORE_REV = {
|
180
|
+
MUTEX_INITIALIZER,
|
181
|
+
NULL, /* field */
|
182
|
+
SORT_TYPE_SCORE, /* type */
|
183
|
+
true, /* reverse */
|
184
|
+
NULL, /* index */
|
185
|
+
&sf_score_compare, /* compare */
|
186
|
+
&sf_score_get_val, /* get_val */
|
187
|
+
NULL, /* create_index */
|
188
|
+
NULL, /* destroy_index */
|
189
|
+
NULL, /* handle_term */
|
174
190
|
};
|
175
191
|
|
176
192
|
/**************************************************************************
|
177
193
|
* DocSortField
|
178
194
|
***************************************************************************/
|
179
195
|
|
196
|
+
void sf_doc_get_val(void *index, Hit *hit, Comparable *comparable)
|
197
|
+
{
|
198
|
+
(void)index;
|
199
|
+
comparable->val.i = hit->doc;
|
200
|
+
}
|
201
|
+
|
180
202
|
int sf_doc_compare(void *index_ptr, Hit *hit1, Hit *hit2)
|
181
203
|
{
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
204
|
+
int val1 = hit1->doc;
|
205
|
+
int val2 = hit2->doc;
|
206
|
+
(void)index_ptr;
|
207
|
+
|
208
|
+
if (val1 > val2) return 1;
|
209
|
+
else if (val1 < val2) return -1;
|
210
|
+
else return 0;
|
187
211
|
}
|
188
212
|
|
189
|
-
SortField *
|
213
|
+
SortField *sort_field_doc_new(bool reverse)
|
190
214
|
{
|
191
|
-
|
192
|
-
|
193
|
-
|
215
|
+
SortField *self = sort_field_alloc(NULL, SORT_TYPE_DOC, reverse);
|
216
|
+
self->compare = &sf_doc_compare;
|
217
|
+
self->get_val = &sf_doc_get_val;
|
218
|
+
return self;
|
194
219
|
}
|
195
220
|
|
196
|
-
SortField SORT_FIELD_DOC = {
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
221
|
+
const SortField SORT_FIELD_DOC = {
|
222
|
+
MUTEX_INITIALIZER,
|
223
|
+
NULL, /* field */
|
224
|
+
SORT_TYPE_DOC, /* type */
|
225
|
+
false, /* reverse */
|
226
|
+
NULL, /* index */
|
227
|
+
&sf_doc_compare, /* compare */
|
228
|
+
&sf_doc_get_val, /* get_val */
|
229
|
+
NULL, /* create_index */
|
230
|
+
NULL, /* destroy_index */
|
231
|
+
NULL, /* handle_term */
|
206
232
|
};
|
207
233
|
|
208
|
-
SortField SORT_FIELD_DOC_REV = {
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
234
|
+
const SortField SORT_FIELD_DOC_REV = {
|
235
|
+
MUTEX_INITIALIZER,
|
236
|
+
NULL, /* field */
|
237
|
+
SORT_TYPE_DOC, /* type */
|
238
|
+
true, /* reverse */
|
239
|
+
NULL, /* index */
|
240
|
+
&sf_doc_compare, /* compare */
|
241
|
+
&sf_doc_get_val, /* get_val */
|
242
|
+
NULL, /* create_index */
|
243
|
+
NULL, /* destroy_index */
|
244
|
+
NULL, /* handle_term */
|
218
245
|
};
|
219
246
|
|
247
|
+
/***************************************************************************
|
248
|
+
* ByteSortField
|
249
|
+
***************************************************************************/
|
250
|
+
|
251
|
+
static void sf_byte_get_val(void *index, Hit *hit, Comparable *comparable)
|
252
|
+
{
|
253
|
+
comparable->val.i = ((int *)index)[hit->doc];
|
254
|
+
}
|
255
|
+
|
256
|
+
static int sf_byte_compare(void *index, Hit *hit1, Hit *hit2)
|
257
|
+
{
|
258
|
+
int val1 = ((int *)index)[hit1->doc];
|
259
|
+
int val2 = ((int *)index)[hit2->doc];
|
260
|
+
if (val1 > val2) return 1;
|
261
|
+
else if (val1 < val2) return -1;
|
262
|
+
else return 0;
|
263
|
+
}
|
264
|
+
|
265
|
+
static void *sf_byte_create_index(int size)
|
266
|
+
{
|
267
|
+
int *index = ALLOC_AND_ZERO_N(int, size + 1);
|
268
|
+
return &index[1];
|
269
|
+
}
|
270
|
+
|
271
|
+
static void sf_byte_destroy_index(void *p)
|
272
|
+
{
|
273
|
+
int *index = (int *)p;
|
274
|
+
free(&index[-1]);
|
275
|
+
}
|
276
|
+
|
277
|
+
static void sf_byte_handle_term(void *index_ptr, TermDocEnum *tde, char *text)
|
278
|
+
{
|
279
|
+
int *index = (int *)index_ptr;
|
280
|
+
int val = index[-1]++;
|
281
|
+
(void)text;
|
282
|
+
while (tde->next(tde)) {
|
283
|
+
index[tde->doc_num(tde)] = val;
|
284
|
+
}
|
285
|
+
}
|
286
|
+
|
287
|
+
static void sort_field_byte_methods(SortField *self)
|
288
|
+
{
|
289
|
+
self->type = SORT_TYPE_BYTE;
|
290
|
+
self->compare = &sf_byte_compare;
|
291
|
+
self->get_val = &sf_byte_get_val;
|
292
|
+
self->create_index = &sf_byte_create_index;
|
293
|
+
self->destroy_index = &sf_byte_destroy_index;
|
294
|
+
self->handle_term = &sf_byte_handle_term;
|
295
|
+
}
|
296
|
+
|
297
|
+
SortField *sort_field_byte_new(char *field, bool reverse)
|
298
|
+
{
|
299
|
+
SortField *self = sort_field_alloc(field, SORT_TYPE_BYTE, reverse);
|
300
|
+
sort_field_byte_methods(self);
|
301
|
+
return self;
|
302
|
+
}
|
303
|
+
|
220
304
|
/***************************************************************************
|
221
305
|
* IntegerSortField
|
222
306
|
***************************************************************************/
|
223
307
|
|
224
|
-
|
308
|
+
void sf_int_get_val(void *index, Hit *hit, Comparable *comparable)
|
225
309
|
{
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
310
|
+
comparable->val.i = ((int *)index)[hit->doc];
|
311
|
+
}
|
312
|
+
|
313
|
+
int sf_int_compare(void *index, Hit *hit1, Hit *hit2)
|
314
|
+
{
|
315
|
+
int val1 = ((int *)index)[hit1->doc];
|
316
|
+
int val2 = ((int *)index)[hit2->doc];
|
317
|
+
if (val1 > val2) return 1;
|
318
|
+
else if (val1 < val2) return -1;
|
319
|
+
else return 0;
|
232
320
|
}
|
233
321
|
|
234
322
|
void *sf_int_create_index(int size)
|
235
323
|
{
|
236
|
-
|
324
|
+
return ALLOC_AND_ZERO_N(int, size);
|
237
325
|
}
|
238
326
|
|
239
327
|
void sf_int_handle_term(void *index_ptr, TermDocEnum *tde, char *text)
|
240
328
|
{
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
329
|
+
int *index = (int *)index_ptr;
|
330
|
+
int val;
|
331
|
+
sscanf(text, "%d", &val);
|
332
|
+
while (tde->next(tde)) {
|
333
|
+
index[tde->doc_num(tde)] = val;
|
334
|
+
}
|
247
335
|
}
|
248
336
|
|
249
337
|
void sort_field_int_methods(SortField *self)
|
250
338
|
{
|
251
|
-
|
252
|
-
|
253
|
-
|
339
|
+
self->type = SORT_TYPE_INTEGER;
|
340
|
+
self->compare = &sf_int_compare;
|
341
|
+
self->get_val = &sf_int_get_val;
|
342
|
+
self->create_index = &sf_int_create_index;
|
343
|
+
self->handle_term = &sf_int_handle_term;
|
254
344
|
}
|
255
345
|
|
256
|
-
SortField *
|
346
|
+
SortField *sort_field_int_new(char *field, bool reverse)
|
257
347
|
{
|
258
|
-
|
259
|
-
|
260
|
-
|
348
|
+
SortField *self = sort_field_alloc(field, SORT_TYPE_INTEGER, reverse);
|
349
|
+
sort_field_int_methods(self);
|
350
|
+
return self;
|
261
351
|
}
|
262
352
|
|
263
353
|
/***************************************************************************
|
264
354
|
* FloatSortField
|
265
355
|
***************************************************************************/
|
266
356
|
|
267
|
-
|
357
|
+
void sf_float_get_val(void *index, Hit *hit, Comparable *comparable)
|
268
358
|
{
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
359
|
+
comparable->val.f = ((float *)index)[hit->doc];
|
360
|
+
}
|
361
|
+
|
362
|
+
int sf_float_compare(void *index, Hit *hit1, Hit *hit2)
|
363
|
+
{
|
364
|
+
float val1 = ((float *)index)[hit1->doc];
|
365
|
+
float val2 = ((float *)index)[hit2->doc];
|
366
|
+
if (val1 > val2) return 1;
|
367
|
+
else if (val1 < val2) return -1;
|
368
|
+
else return 0;
|
275
369
|
}
|
276
370
|
|
277
371
|
void *sf_float_create_index(int size)
|
278
372
|
{
|
279
|
-
|
373
|
+
return ALLOC_AND_ZERO_N(float, size);
|
280
374
|
}
|
281
375
|
|
282
376
|
void sf_float_handle_term(void *index_ptr, TermDocEnum *tde, char *text)
|
283
377
|
{
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
378
|
+
float *index = (float *)index_ptr;
|
379
|
+
float val;
|
380
|
+
sscanf(text, "%g", &val);
|
381
|
+
while (tde->next(tde)) {
|
382
|
+
index[tde->doc_num(tde)] = val;
|
383
|
+
}
|
290
384
|
}
|
291
385
|
|
292
386
|
void sort_field_float_methods(SortField *self)
|
293
387
|
{
|
294
|
-
|
295
|
-
|
296
|
-
|
388
|
+
self->type = SORT_TYPE_FLOAT;
|
389
|
+
self->compare = &sf_float_compare;
|
390
|
+
self->get_val = &sf_float_get_val;
|
391
|
+
self->create_index = &sf_float_create_index;
|
392
|
+
self->handle_term = &sf_float_handle_term;
|
297
393
|
}
|
298
394
|
|
299
|
-
SortField *
|
395
|
+
SortField *sort_field_float_new(char *field, bool reverse)
|
300
396
|
{
|
301
|
-
|
302
|
-
|
303
|
-
|
397
|
+
SortField *self = sort_field_alloc(field, SORT_TYPE_FLOAT, reverse);
|
398
|
+
sort_field_float_methods(self);
|
399
|
+
return self;
|
304
400
|
}
|
305
401
|
|
306
402
|
/***************************************************************************
|
@@ -309,78 +405,99 @@ SortField *sort_field_float_create(char *field, bool reverse)
|
|
309
405
|
|
310
406
|
#define VALUES_ARRAY_START_SIZE 8
|
311
407
|
typedef struct StringIndex {
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
408
|
+
int size;
|
409
|
+
int *index;
|
410
|
+
char **values;
|
411
|
+
int v_size;
|
412
|
+
int v_capa;
|
317
413
|
} StringIndex;
|
318
414
|
|
319
|
-
|
415
|
+
void sf_string_get_val(void *index, Hit *hit, Comparable *comparable)
|
320
416
|
{
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
417
|
+
comparable->val.s
|
418
|
+
= ((StringIndex *)index)->values[
|
419
|
+
((StringIndex *)index)->index[hit->doc]];
|
420
|
+
}
|
421
|
+
|
422
|
+
int sf_string_compare(void *index, Hit *hit1, Hit *hit2)
|
423
|
+
{
|
424
|
+
char *s1 = ((StringIndex *)index)->values[
|
425
|
+
((StringIndex *)index)->index[hit1->doc]];
|
426
|
+
char *s2 = ((StringIndex *)index)->values[
|
427
|
+
((StringIndex *)index)->index[hit2->doc]];
|
428
|
+
|
429
|
+
if (s1 == NULL) return s1 ? -1 : 0;
|
430
|
+
if (s2 == NULL) return 1;
|
431
|
+
|
432
|
+
#ifdef POSH_OS_WIN32
|
433
|
+
return strcmp(s1, s2);
|
434
|
+
#else
|
435
|
+
return strcoll(s1, s2);
|
436
|
+
#endif
|
437
|
+
|
438
|
+
/*
|
439
|
+
* TODO: investigate whether it would be a good idea to presort strings.
|
440
|
+
*
|
441
|
+
int val1 = index->index[hit1->doc];
|
442
|
+
int val2 = index->index[hit2->doc];
|
443
|
+
if (val1 > val2) return 1;
|
444
|
+
else if (val1 < val2) return -1;
|
445
|
+
else return 0;
|
446
|
+
*/
|
331
447
|
}
|
332
448
|
|
333
449
|
void *sf_string_create_index(int size)
|
334
450
|
{
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
return self;
|
451
|
+
StringIndex *self = ALLOC_AND_ZERO(StringIndex);
|
452
|
+
self->size = size;
|
453
|
+
self->index = ALLOC_AND_ZERO_N(int, size);
|
454
|
+
self->v_capa = VALUES_ARRAY_START_SIZE;
|
455
|
+
self->v_size = 1; /* leave the first value as NULL */
|
456
|
+
self->values = ALLOC_AND_ZERO_N(char *, VALUES_ARRAY_START_SIZE);
|
457
|
+
return self;
|
343
458
|
}
|
344
459
|
|
345
460
|
void sf_string_destroy_index(void *p)
|
346
461
|
{
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
462
|
+
StringIndex *self = (StringIndex *)p;
|
463
|
+
int i;
|
464
|
+
free(self->index);
|
465
|
+
for (i = 0; i < self->v_size; i++) {
|
466
|
+
free(self->values[i]);
|
467
|
+
}
|
468
|
+
free(self->values);
|
469
|
+
free(self);
|
355
470
|
}
|
356
471
|
|
357
472
|
void sf_string_handle_term(void *index_ptr, TermDocEnum *tde, char *text)
|
358
473
|
{
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
474
|
+
StringIndex *index = (StringIndex *)index_ptr;
|
475
|
+
if (index->v_size >= index->v_capa) {
|
476
|
+
index->v_capa *= 2;
|
477
|
+
index->values = REALLOC_N(index->values, char *, index->v_capa);
|
478
|
+
}
|
479
|
+
index->values[index->v_size] = estrdup(text);
|
480
|
+
while (tde->next(tde)) {
|
481
|
+
index->index[tde->doc_num(tde)] = index->v_size;
|
482
|
+
}
|
483
|
+
index->v_size++;
|
369
484
|
}
|
370
485
|
|
371
486
|
void sort_field_string_methods(SortField *self)
|
372
487
|
{
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
488
|
+
self->type = SORT_TYPE_STRING;
|
489
|
+
self->compare = &sf_string_compare;
|
490
|
+
self->get_val = &sf_string_get_val;
|
491
|
+
self->create_index = &sf_string_create_index;
|
492
|
+
self->destroy_index = &sf_string_destroy_index;
|
493
|
+
self->handle_term = &sf_string_handle_term;
|
377
494
|
}
|
378
495
|
|
379
|
-
SortField *
|
496
|
+
SortField *sort_field_string_new(char *field, bool reverse)
|
380
497
|
{
|
381
|
-
|
382
|
-
|
383
|
-
|
498
|
+
SortField *self = sort_field_alloc(field, SORT_TYPE_STRING, reverse);
|
499
|
+
sort_field_string_methods(self);
|
500
|
+
return self;
|
384
501
|
}
|
385
502
|
|
386
503
|
/***************************************************************************
|
@@ -389,27 +506,27 @@ SortField *sort_field_string_create(char *field, bool reverse)
|
|
389
506
|
|
390
507
|
void sort_field_auto_evaluate(SortField *sf, char *text)
|
391
508
|
{
|
392
|
-
|
393
|
-
|
394
|
-
|
509
|
+
int int_val;
|
510
|
+
float float_val;
|
511
|
+
int text_len = 0, scan_len = 0;
|
395
512
|
|
396
|
-
|
397
|
-
|
398
|
-
if (scan_len == text_len) {
|
399
|
-
sort_field_int_methods(sf);
|
400
|
-
} else {
|
401
|
-
sscanf(text, "%f%n", &float_val, &scan_len);
|
513
|
+
text_len = (int)strlen(text);
|
514
|
+
sscanf(text, "%d%n", &int_val, &scan_len);
|
402
515
|
if (scan_len == text_len) {
|
403
|
-
|
516
|
+
sort_field_int_methods(sf);
|
404
517
|
} else {
|
405
|
-
|
518
|
+
sscanf(text, "%f%n", &float_val, &scan_len);
|
519
|
+
if (scan_len == text_len) {
|
520
|
+
sort_field_float_methods(sf);
|
521
|
+
} else {
|
522
|
+
sort_field_string_methods(sf);
|
523
|
+
}
|
406
524
|
}
|
407
|
-
}
|
408
525
|
}
|
409
526
|
|
410
|
-
SortField *
|
527
|
+
SortField *sort_field_auto_new(char *field, bool reverse)
|
411
528
|
{
|
412
|
-
|
529
|
+
return sort_field_alloc(field, SORT_TYPE_AUTO, reverse);
|
413
530
|
}
|
414
531
|
|
415
532
|
/***************************************************************************
|
@@ -420,58 +537,60 @@ SortField *sort_field_auto_create(char *field, bool reverse)
|
|
420
537
|
|
421
538
|
void *field_cache_get_index(IndexReader *ir, SortField *sf)
|
422
539
|
{
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
&sort_field_destroy, NULL);
|
436
|
-
}
|
437
|
-
index = h_get(ir->sort_cache, sf);
|
438
|
-
|
439
|
-
if (index == NULL) {
|
440
|
-
length = ir->max_doc(ir);
|
441
|
-
if (length > 0) {
|
442
|
-
TRY
|
443
|
-
tde = ir->term_docs(ir);
|
444
|
-
term.field = field;
|
445
|
-
term.text = "";
|
446
|
-
te = ir->terms_from(ir, &term);
|
447
|
-
if (te->tb_curr == NULL) {
|
448
|
-
RAISE(ARG_ERROR, NO_TERM_ERROR_MSG);
|
449
|
-
}
|
540
|
+
void *index = NULL;
|
541
|
+
int length = 0;
|
542
|
+
TermEnum *volatile te = NULL;
|
543
|
+
TermDocEnum *volatile tde = NULL;
|
544
|
+
SortField *sf_clone;
|
545
|
+
const int field_num = fis_get_field_num(ir->fis, sf->field);
|
546
|
+
|
547
|
+
if (field_num < 0) {
|
548
|
+
RAISE(ARG_ERROR,
|
549
|
+
"Cannot sort by field \"%s\". It doesn't exist in the index.",
|
550
|
+
sf->field);
|
551
|
+
}
|
450
552
|
|
451
|
-
|
452
|
-
|
453
|
-
|
553
|
+
mutex_lock(&sf->mutex);
|
554
|
+
if (!ir->sort_cache) {
|
555
|
+
ir->sort_cache = h_new(&sort_field_hash, &sort_field_cache_eq,
|
556
|
+
&sort_field_destroy, NULL);
|
557
|
+
}
|
454
558
|
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
} while (te->next(te));
|
464
|
-
XFINALLY
|
465
|
-
tde->close(tde);
|
559
|
+
if (sf->type == SORT_TYPE_AUTO) {
|
560
|
+
te = ir->terms(ir, field_num);
|
561
|
+
if (!te->next(te)) {
|
562
|
+
RAISE(ARG_ERROR,
|
563
|
+
"Cannot sort by field \"%s\" as there are no terms "
|
564
|
+
"in that field in the index.", sf->field);
|
565
|
+
}
|
566
|
+
sort_field_auto_evaluate(sf, te->curr_term);
|
466
567
|
te->close(te);
|
467
|
-
XENDTRY
|
468
568
|
}
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
569
|
+
|
570
|
+
index = h_get(ir->sort_cache, sf);
|
571
|
+
|
572
|
+
if (index == NULL) {
|
573
|
+
length = ir->max_doc(ir);
|
574
|
+
if (length > 0) {
|
575
|
+
TRY
|
576
|
+
tde = ir->term_docs(ir);
|
577
|
+
te = ir->terms(ir, field_num);
|
578
|
+
index = sf->create_index(length);
|
579
|
+
while (te->next(te)) {
|
580
|
+
tde->seek_te(tde, te);
|
581
|
+
sf->handle_term(index, tde, te->curr_term);
|
582
|
+
}
|
583
|
+
XFINALLY
|
584
|
+
tde->close(tde);
|
585
|
+
te->close(te);
|
586
|
+
XENDTRY
|
587
|
+
}
|
588
|
+
sf_clone = sort_field_clone(sf);
|
589
|
+
sf_clone->index = index;
|
590
|
+
h_set(ir->sort_cache, sf_clone, index);
|
591
|
+
}
|
592
|
+
mutex_unlock(&sf->mutex);
|
593
|
+
return index;
|
475
594
|
}
|
476
595
|
|
477
596
|
/***************************************************************************
|
@@ -485,19 +604,19 @@ void *field_cache_get_index(IndexReader *ir, SortField *sf)
|
|
485
604
|
***************************************************************************/
|
486
605
|
|
487
606
|
typedef struct Comparator {
|
488
|
-
|
489
|
-
|
490
|
-
|
607
|
+
void *index;
|
608
|
+
bool reverse : 1;
|
609
|
+
int (*compare)(void *index_ptr, Hit *hit1, Hit *hit2);
|
491
610
|
} Comparator;
|
492
611
|
|
493
|
-
Comparator *
|
494
|
-
|
612
|
+
Comparator *comparator_new(void *index, bool reverse,
|
613
|
+
int (*compare)(void *index_ptr, Hit *hit1, Hit *hit2))
|
495
614
|
{
|
496
|
-
|
497
|
-
|
498
|
-
|
499
|
-
|
500
|
-
|
615
|
+
Comparator *self = ALLOC(Comparator);
|
616
|
+
self->index = index;
|
617
|
+
self->reverse = reverse;
|
618
|
+
self->compare = compare;
|
619
|
+
return self;
|
501
620
|
}
|
502
621
|
|
503
622
|
/***************************************************************************
|
@@ -505,164 +624,279 @@ Comparator *comparator_create(void *index, bool reverse,
|
|
505
624
|
***************************************************************************/
|
506
625
|
|
507
626
|
typedef struct Sorter {
|
508
|
-
|
509
|
-
|
627
|
+
Comparator **comparators;
|
628
|
+
int c_cnt;
|
629
|
+
Sort *sort;
|
510
630
|
} Sorter;
|
511
631
|
|
512
632
|
Comparator *sorter_get_comparator(SortField *sf, IndexReader *ir)
|
513
633
|
{
|
514
|
-
|
634
|
+
void *index = NULL;
|
515
635
|
|
516
|
-
|
517
|
-
|
518
|
-
|
519
|
-
|
636
|
+
if (sf->type > SORT_TYPE_DOC) {
|
637
|
+
index = field_cache_get_index(ir, sf);
|
638
|
+
}
|
639
|
+
return comparator_new(index, sf->reverse, sf->compare);
|
520
640
|
}
|
521
641
|
|
522
|
-
void sorter_destroy(
|
642
|
+
void sorter_destroy(Sorter *self)
|
523
643
|
{
|
524
|
-
|
525
|
-
Sorter *self = (Sorter *)p;
|
644
|
+
int i;
|
526
645
|
|
527
|
-
|
528
|
-
|
529
|
-
|
530
|
-
|
531
|
-
|
646
|
+
for (i = 0; i < self->c_cnt; i++) {
|
647
|
+
free(self->comparators[i]);
|
648
|
+
}
|
649
|
+
free(self->comparators);
|
650
|
+
free(self);
|
532
651
|
}
|
533
652
|
|
534
|
-
Sorter *
|
653
|
+
Sorter *sorter_new(Sort *sort)
|
535
654
|
{
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
655
|
+
Sorter *self = ALLOC(Sorter);
|
656
|
+
self->c_cnt = sort->size;
|
657
|
+
self->comparators = ALLOC_AND_ZERO_N(Comparator *, self->c_cnt);
|
658
|
+
self->sort = sort;
|
659
|
+
return self;
|
541
660
|
}
|
542
661
|
|
543
662
|
/***************************************************************************
|
544
663
|
* FieldSortedHitQueue
|
545
664
|
***************************************************************************/
|
546
665
|
|
547
|
-
bool fshq_less_than(void *hit1, void *hit2)
|
666
|
+
bool fshq_less_than(const void *hit1, const void *hit2)
|
548
667
|
{
|
549
|
-
|
550
|
-
|
551
|
-
|
552
|
-
|
553
|
-
|
554
|
-
|
555
|
-
|
668
|
+
int cmp = 0;
|
669
|
+
printf("Whoops, shouldn't call this.\n");
|
670
|
+
if (cmp != 0) {
|
671
|
+
return cmp;
|
672
|
+
} else {
|
673
|
+
return ((Hit *)hit1)->score < ((Hit *)hit2)->score;
|
674
|
+
}
|
556
675
|
}
|
557
676
|
|
558
|
-
inline bool fshq_lt(
|
677
|
+
inline bool fshq_lt(Sorter *sorter, Hit *hit1, Hit *hit2)
|
559
678
|
{
|
560
|
-
|
561
|
-
|
562
|
-
|
563
|
-
|
564
|
-
|
565
|
-
|
566
|
-
|
567
|
-
|
568
|
-
|
679
|
+
Comparator *comp;
|
680
|
+
int diff = 0, i;
|
681
|
+
for (i = 0; i < sorter->c_cnt && diff == 0; i++) {
|
682
|
+
comp = sorter->comparators[i];
|
683
|
+
if (comp->reverse) {
|
684
|
+
diff = comp->compare(comp->index, hit2, hit1);
|
685
|
+
} else {
|
686
|
+
diff = comp->compare(comp->index, hit1, hit2);
|
687
|
+
}
|
569
688
|
}
|
570
|
-
}
|
571
689
|
|
572
|
-
|
573
|
-
|
574
|
-
|
575
|
-
|
576
|
-
|
690
|
+
if (diff != 0) {
|
691
|
+
return diff > 0;
|
692
|
+
} else {
|
693
|
+
return hit1->doc > hit2->doc;
|
694
|
+
}
|
577
695
|
}
|
578
696
|
|
579
697
|
void fshq_pq_down(PriorityQueue *pq)
|
580
698
|
{
|
581
|
-
|
582
|
-
|
583
|
-
|
584
|
-
|
585
|
-
|
699
|
+
register int i = 1;
|
700
|
+
register int j = 2; /* i << 1; */
|
701
|
+
register int k = 3; /* j + 1; */
|
702
|
+
Hit **heap = (Hit **)pq->heap;
|
703
|
+
Hit *node = heap[i]; /* save top node */
|
704
|
+
Sorter *sorter = (Sorter *)heap[0];
|
586
705
|
|
587
|
-
|
588
|
-
|
706
|
+
if ((k <= pq->size) && fshq_lt(sorter, heap[k], heap[j])) {
|
707
|
+
j = k;
|
708
|
+
}
|
589
709
|
|
590
|
-
|
591
|
-
|
592
|
-
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
|
597
|
-
|
598
|
-
|
710
|
+
while ((j <= pq->size) && fshq_lt(sorter, heap[j], node)) {
|
711
|
+
heap[i] = heap[j]; /* shift up child */
|
712
|
+
i = j;
|
713
|
+
j = i << 1;
|
714
|
+
k = j + 1;
|
715
|
+
if ((k <= pq->size) && fshq_lt(sorter, heap[k], heap[j])) {
|
716
|
+
j = k;
|
717
|
+
}
|
718
|
+
}
|
719
|
+
heap[i] = node;
|
599
720
|
}
|
600
721
|
|
601
722
|
Hit *fshq_pq_pop(PriorityQueue *pq)
|
602
723
|
{
|
603
|
-
|
604
|
-
|
605
|
-
|
606
|
-
|
607
|
-
|
608
|
-
|
609
|
-
|
610
|
-
|
611
|
-
|
612
|
-
|
724
|
+
if (pq->size > 0) {
|
725
|
+
Hit *hit = (Hit *)pq->heap[1]; /* save first value */
|
726
|
+
pq->heap[1] = pq->heap[pq->size]; /* move last to first */
|
727
|
+
pq->heap[pq->size] = NULL;
|
728
|
+
pq->size--;
|
729
|
+
fshq_pq_down(pq); /* adjust heap */
|
730
|
+
return hit;
|
731
|
+
} else {
|
732
|
+
return NULL;
|
733
|
+
}
|
613
734
|
}
|
614
735
|
|
615
736
|
inline void fshq_pq_up(PriorityQueue *pq)
|
616
737
|
{
|
617
|
-
|
618
|
-
|
619
|
-
|
620
|
-
|
621
|
-
|
622
|
-
|
623
|
-
|
624
|
-
|
625
|
-
|
626
|
-
|
627
|
-
|
628
|
-
|
738
|
+
Hit **heap = (Hit **)pq->heap;
|
739
|
+
Hit *node;
|
740
|
+
int i = pq->size;
|
741
|
+
int j = i >> 1;
|
742
|
+
Sorter *sorter = (Sorter *)heap[0];
|
743
|
+
node = heap[i];
|
744
|
+
|
745
|
+
while ((j > 0) && fshq_lt(sorter, node, heap[j])) {
|
746
|
+
heap[i] = heap[j];
|
747
|
+
i = j;
|
748
|
+
j = j >> 1;
|
749
|
+
}
|
750
|
+
heap[i] = node;
|
629
751
|
}
|
630
752
|
|
631
753
|
void fshq_pq_insert(PriorityQueue *pq, Hit *hit)
|
632
754
|
{
|
633
|
-
|
634
|
-
|
635
|
-
|
636
|
-
|
637
|
-
|
638
|
-
|
639
|
-
|
640
|
-
|
641
|
-
|
642
|
-
|
643
|
-
|
755
|
+
if (pq->size < pq->capa) {
|
756
|
+
Hit *new_hit = ALLOC(Hit);
|
757
|
+
memcpy(new_hit, hit, sizeof(Hit));
|
758
|
+
pq->size++;
|
759
|
+
if (pq->size >= pq->mem_capa) {
|
760
|
+
pq->mem_capa <<= 1;
|
761
|
+
REALLOC_N(pq->heap, void *, pq->mem_capa);
|
762
|
+
}
|
763
|
+
pq->heap[pq->size] = new_hit;
|
764
|
+
fshq_pq_up(pq);
|
765
|
+
} else if (pq->size > 0
|
766
|
+
&& fshq_lt((Sorter *)pq->heap[0], (Hit *)pq->heap[1], hit)) {
|
767
|
+
memcpy(pq->heap[1], hit, sizeof(Hit));
|
768
|
+
fshq_pq_down(pq);
|
769
|
+
}
|
644
770
|
}
|
645
771
|
|
646
772
|
void fshq_pq_destroy(PriorityQueue *self)
|
647
773
|
{
|
648
|
-
|
649
|
-
|
774
|
+
sorter_destroy(self->heap[0]);
|
775
|
+
pq_destroy(self);
|
776
|
+
}
|
777
|
+
|
778
|
+
PriorityQueue *fshq_pq_new(int size, Sort *sort, IndexReader *ir)
|
779
|
+
{
|
780
|
+
PriorityQueue *self = pq_new(size, &fshq_less_than, &free);
|
781
|
+
int i;
|
782
|
+
Sorter *sorter = sorter_new(sort);
|
783
|
+
SortField *sf;
|
784
|
+
|
785
|
+
for (i = 0; i < sort->size; i++) {
|
786
|
+
sf = sort->sort_fields[i];
|
787
|
+
sorter->comparators[i] = sorter_get_comparator(sf, ir);
|
788
|
+
}
|
789
|
+
self->heap[0] = sorter;
|
790
|
+
|
791
|
+
return self;
|
792
|
+
}
|
793
|
+
|
794
|
+
Hit *fshq_pq_pop_fd(PriorityQueue *pq)
|
795
|
+
{
|
796
|
+
if (pq->size <= 0) {
|
797
|
+
return NULL;
|
798
|
+
}
|
799
|
+
else {
|
800
|
+
int j;
|
801
|
+
Sorter *sorter = (Sorter *)pq->heap[0];
|
802
|
+
const int cmp_cnt = sorter->c_cnt;
|
803
|
+
SortField **sort_fields = sorter->sort->sort_fields;
|
804
|
+
Hit *hit = (Hit *)pq->heap[1]; /* save first value */
|
805
|
+
FieldDoc *field_doc;
|
806
|
+
Comparable *comparables;
|
807
|
+
Comparator **comparators = sorter->comparators;
|
808
|
+
pq->heap[1] = pq->heap[pq->size]; /* move last to first */
|
809
|
+
pq->heap[pq->size] = NULL;
|
810
|
+
pq->size--;
|
811
|
+
fshq_pq_down(pq); /* adjust heap */
|
812
|
+
|
813
|
+
field_doc = (FieldDoc *)emalloc(sizeof(FieldDoc)
|
814
|
+
+ sizeof(Comparable)*cmp_cnt);
|
815
|
+
comparables = field_doc->comparables;
|
816
|
+
memcpy(field_doc, hit, sizeof(Hit));
|
817
|
+
field_doc->size = cmp_cnt;
|
818
|
+
|
819
|
+
for (j = 0; j < cmp_cnt; j++) {
|
820
|
+
SortField *sf = sort_fields[j];
|
821
|
+
Comparator *comparator = comparators[j];
|
822
|
+
sf->get_val(comparator->index, hit, &(comparables[j]));
|
823
|
+
comparables[j].type = sf->type;
|
824
|
+
comparables[j].reverse = comparator->reverse;
|
825
|
+
}
|
826
|
+
free(hit);
|
827
|
+
return (Hit *)field_doc;
|
828
|
+
}
|
650
829
|
}
|
651
830
|
|
652
|
-
|
831
|
+
/***************************************************************************
|
832
|
+
* FieldDoc
|
833
|
+
***************************************************************************/
|
834
|
+
|
835
|
+
void fd_destroy(FieldDoc *fd)
|
653
836
|
{
|
654
|
-
|
655
|
-
|
656
|
-
Sorter *sorter = sorter_create(sort->sf_cnt);
|
657
|
-
SortField *sf;
|
837
|
+
free(fd);
|
838
|
+
}
|
658
839
|
|
659
|
-
|
660
|
-
|
661
|
-
|
662
|
-
|
663
|
-
|
664
|
-
|
665
|
-
|
840
|
+
/***************************************************************************
|
841
|
+
* FieldDocSortedHitQueue
|
842
|
+
***************************************************************************/
|
843
|
+
|
844
|
+
bool fdshq_lt(FieldDoc *fd1, FieldDoc *fd2)
|
845
|
+
{
|
846
|
+
int c = 0, i;
|
847
|
+
Comparable *cmps1 = fd1->comparables;
|
848
|
+
Comparable *cmps2 = fd2->comparables;
|
849
|
+
|
850
|
+
for (i = 0; i < fd1->size && c == 0; i++) {
|
851
|
+
int type = cmps1[i].type;
|
852
|
+
switch (type) {
|
853
|
+
case SORT_TYPE_SCORE:
|
854
|
+
if (cmps1[i].val.f < cmps2[i].val.f) c = 1;
|
855
|
+
if (cmps1[i].val.f > cmps2[i].val.f) c = -1;
|
856
|
+
break;
|
857
|
+
case SORT_TYPE_FLOAT:
|
858
|
+
if (cmps1[i].val.f > cmps2[i].val.f) c = 1;
|
859
|
+
if (cmps1[i].val.f < cmps2[i].val.f) c = -1;
|
860
|
+
break;
|
861
|
+
case SORT_TYPE_DOC:
|
862
|
+
if (fd1->hit.doc > fd2->hit.doc) c = 1;
|
863
|
+
if (fd1->hit.doc < fd2->hit.doc) c = -1;
|
864
|
+
break;
|
865
|
+
case SORT_TYPE_INTEGER:
|
866
|
+
if (cmps1[i].val.i > cmps2[i].val.i) c = 1;
|
867
|
+
if (cmps1[i].val.i < cmps2[i].val.i) c = -1;
|
868
|
+
break;
|
869
|
+
case SORT_TYPE_BYTE:
|
870
|
+
if (cmps1[i].val.i > cmps2[i].val.i) c = 1;
|
871
|
+
if (cmps1[i].val.i < cmps2[i].val.i) c = -1;
|
872
|
+
break;
|
873
|
+
case SORT_TYPE_STRING:
|
874
|
+
do {
|
875
|
+
char *s1 = cmps1[i].val.s;
|
876
|
+
char *s2 = cmps2[i].val.s;
|
877
|
+
if (s1 == NULL) c = s2 ? -1 : 0;
|
878
|
+
else if (s2 == NULL) c = 1;
|
879
|
+
#ifdef POSH_OS_WIN32
|
880
|
+
else c = strcmp(s1, s2);
|
881
|
+
#else
|
882
|
+
else c = strcoll(s1, s2);
|
883
|
+
#endif
|
884
|
+
} while (0);
|
885
|
+
break;
|
886
|
+
default:
|
887
|
+
RAISE(ERROR, "Unknown sort type: %d.", type);
|
888
|
+
break;
|
889
|
+
}
|
890
|
+
if (cmps1[i].reverse) {
|
891
|
+
c = -c;
|
892
|
+
}
|
893
|
+
}
|
894
|
+
if (c == 0) {
|
895
|
+
return fd1->hit.doc > fd2->hit.doc;
|
896
|
+
}
|
897
|
+
else {
|
898
|
+
return c > 0;
|
899
|
+
}
|
666
900
|
}
|
667
901
|
|
668
902
|
/***************************************************************************
|
@@ -671,75 +905,78 @@ PriorityQueue *fshq_pq_create(int size, Sort *sort, IndexReader *ir)
|
|
671
905
|
*
|
672
906
|
***************************************************************************/
|
673
907
|
|
674
|
-
|
908
|
+
#define SORT_INIT_SIZE 4
|
909
|
+
|
910
|
+
Sort *sort_new()
|
675
911
|
{
|
676
|
-
|
677
|
-
|
678
|
-
|
679
|
-
|
680
|
-
|
912
|
+
Sort *self = ALLOC(Sort);
|
913
|
+
self->size = 0;
|
914
|
+
self->capa = SORT_INIT_SIZE;
|
915
|
+
self->sort_fields = ALLOC_N(SortField *, SORT_INIT_SIZE);
|
916
|
+
self->destroy_all = true;
|
917
|
+
self->start = 0;
|
681
918
|
|
682
|
-
|
919
|
+
return self;
|
683
920
|
}
|
684
921
|
|
685
922
|
void sort_clear(Sort *self)
|
686
923
|
{
|
687
|
-
|
688
|
-
|
689
|
-
|
690
|
-
|
924
|
+
int i;
|
925
|
+
if (self->destroy_all) {
|
926
|
+
for (i = 0; i < self->size; i++) {
|
927
|
+
sort_field_destroy(self->sort_fields[i]);
|
928
|
+
}
|
691
929
|
}
|
692
|
-
|
693
|
-
self->sf_cnt = 0;
|
930
|
+
self->size = 0;
|
694
931
|
}
|
695
932
|
|
696
933
|
void sort_destroy(void *p)
|
697
934
|
{
|
698
|
-
|
699
|
-
|
700
|
-
|
701
|
-
|
935
|
+
Sort *self = (Sort *)p;
|
936
|
+
sort_clear(self);
|
937
|
+
free(self->sort_fields);
|
938
|
+
free(self);
|
702
939
|
}
|
703
940
|
|
704
941
|
void sort_add_sort_field(Sort *self, SortField *sf)
|
705
942
|
{
|
706
|
-
|
707
|
-
|
708
|
-
|
709
|
-
|
943
|
+
if (self->size == self->capa) {
|
944
|
+
self->capa <<= 1;
|
945
|
+
REALLOC_N(self->sort_fields, SortField *, self->capa);
|
946
|
+
}
|
710
947
|
|
711
|
-
|
712
|
-
|
948
|
+
self->sort_fields[self->size] = sf;
|
949
|
+
self->size++;
|
713
950
|
}
|
714
951
|
|
715
952
|
char *sort_to_s(Sort *self)
|
716
953
|
{
|
717
|
-
|
718
|
-
|
719
|
-
|
720
|
-
|
721
|
-
|
722
|
-
|
723
|
-
|
724
|
-
|
725
|
-
|
726
|
-
|
727
|
-
|
728
|
-
|
729
|
-
|
730
|
-
|
731
|
-
|
732
|
-
|
733
|
-
|
734
|
-
|
735
|
-
|
736
|
-
|
737
|
-
|
738
|
-
|
739
|
-
|
740
|
-
|
741
|
-
|
742
|
-
|
743
|
-
|
744
|
-
|
954
|
+
int i, len = 20;
|
955
|
+
char *s;
|
956
|
+
char *str;
|
957
|
+
char **sf_strs = ALLOC_N(char *, self->size);
|
958
|
+
|
959
|
+
for (i = 0; i < self->size; i++) {
|
960
|
+
sf_strs[i] = s = sort_field_to_s(self->sort_fields[i]);
|
961
|
+
len += (int)strlen(s) + 2;
|
962
|
+
}
|
963
|
+
|
964
|
+
str = ALLOC_N(char, len);
|
965
|
+
s = "Sort[";
|
966
|
+
len = (int)strlen(s);
|
967
|
+
memcpy(str, s, len);
|
968
|
+
|
969
|
+
s = str + len;
|
970
|
+
for (i = 0; i < self->size; i++) {
|
971
|
+
sprintf(s, "%s, ", sf_strs[i]);
|
972
|
+
s += (int)strlen(s);
|
973
|
+
free(sf_strs[i]);
|
974
|
+
}
|
975
|
+
free(sf_strs);
|
976
|
+
|
977
|
+
if (self->size > 0) {
|
978
|
+
s -= 2;
|
979
|
+
}
|
980
|
+
sprintf(s, "]");
|
981
|
+
return str;
|
745
982
|
}
|