ferret 0.3.2 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +9 -0
- data/Rakefile +51 -25
- data/ext/analysis.c +553 -0
- data/ext/analysis.h +76 -0
- data/ext/array.c +83 -0
- data/ext/array.h +19 -0
- data/ext/bitvector.c +164 -0
- data/ext/bitvector.h +29 -0
- data/ext/compound_io.c +335 -0
- data/ext/document.c +336 -0
- data/ext/document.h +87 -0
- data/ext/ferret.c +88 -47
- data/ext/ferret.h +43 -109
- data/ext/field.c +395 -0
- data/ext/filter.c +103 -0
- data/ext/fs_store.c +352 -0
- data/ext/global.c +219 -0
- data/ext/global.h +73 -0
- data/ext/hash.c +446 -0
- data/ext/hash.h +80 -0
- data/ext/hashset.c +141 -0
- data/ext/hashset.h +37 -0
- data/ext/helper.c +11 -0
- data/ext/helper.h +5 -0
- data/ext/inc/lang.h +41 -0
- data/ext/ind.c +389 -0
- data/ext/index.h +884 -0
- data/ext/index_io.c +269 -415
- data/ext/index_rw.c +2543 -0
- data/ext/lang.c +31 -0
- data/ext/lang.h +41 -0
- data/ext/priorityqueue.c +228 -0
- data/ext/priorityqueue.h +44 -0
- data/ext/q_boolean.c +1331 -0
- data/ext/q_const_score.c +154 -0
- data/ext/q_fuzzy.c +287 -0
- data/ext/q_match_all.c +142 -0
- data/ext/q_multi_phrase.c +343 -0
- data/ext/q_parser.c +2180 -0
- data/ext/q_phrase.c +657 -0
- data/ext/q_prefix.c +75 -0
- data/ext/q_range.c +247 -0
- data/ext/q_span.c +1566 -0
- data/ext/q_term.c +308 -0
- data/ext/q_wildcard.c +146 -0
- data/ext/r_analysis.c +255 -0
- data/ext/r_doc.c +578 -0
- data/ext/r_index_io.c +996 -0
- data/ext/r_qparser.c +158 -0
- data/ext/r_search.c +2321 -0
- data/ext/r_store.c +263 -0
- data/ext/r_term.c +219 -0
- data/ext/ram_store.c +447 -0
- data/ext/search.c +524 -0
- data/ext/search.h +1065 -0
- data/ext/similarity.c +143 -39
- data/ext/sort.c +661 -0
- data/ext/store.c +35 -0
- data/ext/store.h +152 -0
- data/ext/term.c +704 -143
- data/ext/termdocs.c +599 -0
- data/ext/vector.c +594 -0
- data/lib/ferret.rb +9 -10
- data/lib/ferret/analysis/analyzers.rb +2 -2
- data/lib/ferret/analysis/standard_tokenizer.rb +1 -1
- data/lib/ferret/analysis/token.rb +14 -14
- data/lib/ferret/analysis/token_filters.rb +3 -3
- data/lib/ferret/document/field.rb +16 -17
- data/lib/ferret/index/document_writer.rb +4 -4
- data/lib/ferret/index/index.rb +39 -23
- data/lib/ferret/index/index_writer.rb +2 -2
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +1 -8
- data/lib/ferret/index/segment_term_vector.rb +4 -4
- data/lib/ferret/index/term.rb +5 -1
- data/lib/ferret/index/term_vector_offset_info.rb +6 -6
- data/lib/ferret/index/term_vectors_io.rb +5 -5
- data/lib/ferret/query_parser/query_parser.tab.rb +81 -77
- data/lib/ferret/search.rb +1 -1
- data/lib/ferret/search/boolean_query.rb +2 -1
- data/lib/ferret/search/field_sorted_hit_queue.rb +3 -3
- data/lib/ferret/search/fuzzy_query.rb +2 -1
- data/lib/ferret/search/index_searcher.rb +3 -0
- data/lib/ferret/search/{match_all_docs_query.rb → match_all_query.rb} +7 -7
- data/lib/ferret/search/multi_phrase_query.rb +6 -5
- data/lib/ferret/search/phrase_query.rb +3 -6
- data/lib/ferret/search/prefix_query.rb +4 -4
- data/lib/ferret/search/sort.rb +3 -1
- data/lib/ferret/search/sort_field.rb +9 -9
- data/lib/ferret/search/spans/near_spans_enum.rb +1 -1
- data/lib/ferret/search/spans/span_near_query.rb +1 -1
- data/lib/ferret/search/spans/span_weight.rb +1 -1
- data/lib/ferret/search/spans/spans_enum.rb +7 -7
- data/lib/ferret/store/fs_store.rb +10 -6
- data/lib/ferret/store/ram_store.rb +3 -3
- data/lib/rferret.rb +36 -0
- data/test/functional/thread_safety_index_test.rb +2 -2
- data/test/test_helper.rb +16 -2
- data/test/unit/analysis/c_token.rb +25 -0
- data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +1 -1
- data/test/unit/analysis/tc_standard_analyzer.rb +1 -1
- data/test/unit/document/{tc_document.rb → c_document.rb} +0 -0
- data/test/unit/document/c_field.rb +98 -0
- data/test/unit/document/tc_field.rb +0 -66
- data/test/unit/index/{tc_index.rb → c_index.rb} +62 -6
- data/test/unit/index/{tc_index_reader.rb → c_index_reader.rb} +51 -10
- data/test/unit/index/{tc_index_writer.rb → c_index_writer.rb} +0 -4
- data/test/unit/index/{tc_term.rb → c_term.rb} +1 -3
- data/test/unit/index/{tc_term_vector_offset_info.rb → c_term_voi.rb} +5 -5
- data/test/unit/index/tc_segment_term_vector.rb +2 -2
- data/test/unit/index/tc_term_vectors_io.rb +4 -4
- data/test/unit/query_parser/c_query_parser.rb +138 -0
- data/test/unit/search/{tc_filter.rb → c_filter.rb} +24 -24
- data/test/unit/search/{tc_fuzzy_query.rb → c_fuzzy_query.rb} +0 -0
- data/test/unit/search/{tc_index_searcher.rb → c_index_searcher.rb} +9 -26
- data/test/unit/search/{tc_search_and_sort.rb → c_search_and_sort.rb} +15 -15
- data/test/unit/search/{tc_sort.rb → c_sort.rb} +2 -1
- data/test/unit/search/c_sort_field.rb +27 -0
- data/test/unit/search/{tc_spans.rb → c_spans.rb} +0 -0
- data/test/unit/search/tc_sort_field.rb +7 -20
- data/test/unit/store/c_fs_store.rb +76 -0
- data/test/unit/store/c_ram_store.rb +35 -0
- data/test/unit/store/m_store.rb +34 -0
- data/test/unit/store/m_store_lock.rb +68 -0
- data/test/unit/store/tc_fs_store.rb +0 -53
- data/test/unit/store/tc_ram_store.rb +0 -20
- data/test/unit/store/tm_store.rb +0 -30
- data/test/unit/store/tm_store_lock.rb +0 -66
- metadata +84 -31
- data/ext/Makefile +0 -140
- data/ext/ferret_ext.so +0 -0
- data/ext/priority_queue.c +0 -232
- data/ext/ram_directory.c +0 -321
- data/ext/segment_merge_queue.c +0 -37
- data/ext/segment_term_enum.c +0 -326
- data/ext/string_helper.c +0 -42
- data/ext/tags +0 -344
- data/ext/term_buffer.c +0 -230
- data/ext/term_infos_reader.c +0 -54
- data/ext/terminfo.c +0 -160
- data/ext/token.c +0 -93
- data/ext/util.c +0 -12
data/ext/similarity.c
CHANGED
@@ -1,59 +1,163 @@
|
|
1
|
-
#include
|
1
|
+
#include <search.h>
|
2
|
+
#include <global.h>
|
2
3
|
#include <math.h>
|
3
4
|
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
5
|
+
/*
 * Offsets, within a 4-byte value, of its bytes ordered from least to most
 * significant. All four stay 0 until setup_endian() has run.
 */
static int low_bit = 0, low_mid_bit = 0, high_mid_bit = 0, high_bit = 0;

/*
 * Detect the host byte order once and record the matching byte offsets in
 * the four variables above. Later calls return immediately.
 */
static void
setup_endian()
{
    static int init = 0;
    const char *first_byte;
    int little_endian;

    if (init) {
        return;
    }
    init = 1;

    /* init now holds 1, so its lowest-addressed byte is non-zero only when
     * the least significant byte is stored first. */
    first_byte = (char *)&init;
    little_endian = (first_byte[0] != 0);

    low_bit      = little_endian ? 0 : 3;
    low_mid_bit  = little_endian ? 1 : 2;
    high_mid_bit = little_endian ? 2 : 1;
    high_bit     = little_endian ? 3 : 0;
}
|
28
|
+
|
29
|
+
/*
 * Decode a one-byte norm into a float.
 *
 * The low 3 bits of b are the mantissa and the next 5 bits the exponent.
 * The float is assembled byte by byte: the two low bytes are zero, the
 * mantissa goes into the top 3 bits of the high-middle byte and
 * (exponent + 48) into the high byte. A zero byte decodes to 0.0.
 *
 * The bytes are written through a union rather than by casting a char
 * array to float *, which violated the aliasing rules and could be
 * misaligned. Like the original, this assumes a 4-byte float.
 */
float byte_to_float(uchar b)
{
    union {
        float value;
        unsigned char bytes[4];
    } flt;
    int mantissa;
    int exponent;

    if (b == 0)
        return 0.0;
    mantissa = b & 0x07;        /* 0x07 = 7 = 0b00000111 */
    exponent = (b >> 3) & 0x1F; /* 0x1f = 31 = 0b00011111 */

    /* low_mid_bit is non-zero for either byte order once detection ran */
    if (!low_mid_bit) setup_endian();
    flt.bytes[low_bit] = flt.bytes[low_mid_bit] = 0;
    flt.bytes[high_mid_bit] = (unsigned char)(mantissa << 5);
    flt.bytes[high_bit] = (unsigned char)(exponent + 48);
    return flt.value;
}
|
43
|
+
|
44
|
+
uchar float_to_byte(float f)
|
45
|
+
{
|
46
|
+
if (f <= 0.0)
|
47
|
+
return 0;
|
48
|
+
|
49
|
+
char *bits = (char *)&f;
|
50
|
+
int mantissa = (bits[high_mid_bit] & 0xEf) >> 5;
|
51
|
+
int exponent = (bits[high_bit] - 48);
|
52
|
+
|
53
|
+
if (exponent > 0x1f) {
|
54
|
+
exponent = 0x1f; // 0x1f = 31 = 0b00011111
|
55
|
+
mantissa = 0x07; // 0x07 = 7 = 0b00000111
|
56
|
+
}
|
57
|
+
|
58
|
+
if (exponent < 0) {
|
59
|
+
exponent = 0;
|
60
|
+
mantissa = 1;
|
61
|
+
}
|
62
|
+
|
63
|
+
return ((exponent<<3) | mantissa);
|
64
|
+
}
|
65
|
+
|
66
|
+
/*
 * Default length normalisation: 1 / sqrt(num_terms).
 * s and field are not used by this implementation.
 */
float simdef_length_norm(Similarity *s, char *field, int num_terms)
{
    double root = sqrt(num_terms);

    return 1.0 / root;
}
|
70
|
+
|
71
|
+
/*
 * Default query normalisation: 1 / sqrt(sum_of_squared_weights).
 * s is not used by this implementation.
 */
float simdef_query_norm(struct Similarity *s, float sum_of_squared_weights)
{
    double root = sqrt(sum_of_squared_weights);

    return 1.0 / root;
}
|
75
|
+
|
76
|
+
/*
 * Default term-frequency factor: sqrt(freq).
 * s is not used by this implementation.
 */
float simdef_tf(struct Similarity *s, float freq)
{
    double root = sqrt(freq);

    return (float)root;
}
|
80
|
+
|
81
|
+
/*
 * Default sloppy-match factor: 1 / (distance + 1).
 * s is not used by this implementation.
 */
float simdef_sloppy_freq(struct Similarity *s, int distance)
{
    float span = (float)(distance + 1);

    return 1.0 / span;
}
|
23
|
-
*/
|
24
85
|
|
25
|
-
|
26
|
-
frt_dsim_tf(VALUE self, VALUE freq)
|
86
|
+
/*
 * Inverse document frequency of a single term: asks the searcher for the
 * term's document frequency and for its max_doc, then hands both to s->idf.
 */
float simdef_idf_term(struct Similarity *s, Term *term, Searcher *searcher)
{
    /* NOTE(review): assumes doc_freq() and max_doc() return int, matching the
     * int parameters of simdef_idf -- confirm against search.h. */
    int doc_freq = searcher->doc_freq(searcher, term);
    int max_doc = searcher->max_doc(searcher);

    return s->idf(s, doc_freq, max_doc);
}
|
30
90
|
|
31
|
-
|
32
|
-
frt_dsim_idf(VALUE self, VALUE rdoc_freq, VALUE rnum_docs)
|
91
|
+
/*
 * Inverse document frequency of a phrase: the sum of s->idf_term over the
 * first tcnt entries of terms, taken in order. Returns 0.0 when tcnt <= 0.
 */
float simdef_idf_phrase(struct Similarity *s, Term **terms, int tcnt, Searcher *searcher)
{
    float total = 0.0;
    int remaining;

    for (remaining = tcnt; remaining > 0; remaining--) {
        total += s->idf_term(s, *terms++, searcher);
    }
    return total;
}
|
37
100
|
|
38
|
-
|
39
|
-
|
101
|
+
/*
 * Default inverse document frequency: log(num_docs / (doc_freq + 1)) + 1,
 * with the division done in float. s is not used by this implementation.
 */
float simdef_idf(struct Similarity *s, int doc_freq, int num_docs)
{
    float ratio = (float)num_docs / (float)(doc_freq + 1);

    return log(ratio) + 1.0;
}
|
41
105
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
****************************************************************************/
|
106
|
+
/*
 * Default coordination factor: overlap / max_overlap as a float division.
 * s is not used by this implementation.
 */
float simdef_coord(struct Similarity *s, int overlap, int max_overlap)
{
    float matched = (float)overlap;
    float possible = (float)max_overlap;

    return matched / possible;
}
|
47
110
|
|
48
|
-
|
49
|
-
Init_similarity(void)
|
111
|
+
/*
 * Decode a norm byte by looking it up in the similarity's norm_table.
 */
float simdef_decode_norm(struct Similarity *s, uchar b)
{
    return *(s->norm_table + b);
}
|
115
|
+
|
116
|
+
float simdef_encode_norm(struct Similarity *s, float f)
|
117
|
+
{
|
118
|
+
return float_to_byte(f);
|
119
|
+
}
|
120
|
+
|
121
|
+
/*
 * Destroy hook for the default similarity. Intentionally a no-op.
 */
void simdef_destroy(void *p)
{
    (void)p; /* nothing to do here */
}
|
125
|
+
|
126
|
+
static Similarity default_similarity = {
|
127
|
+
data:NULL,
|
128
|
+
length_norm:&simdef_length_norm,
|
129
|
+
query_norm:&simdef_query_norm,
|
130
|
+
tf:&simdef_tf,
|
131
|
+
sloppy_freq:&simdef_sloppy_freq,
|
132
|
+
idf_term:&simdef_idf_term,
|
133
|
+
idf_phrase:&simdef_idf_phrase,
|
134
|
+
idf:&simdef_idf,
|
135
|
+
coord:&simdef_coord,
|
136
|
+
decode_norm:&simdef_decode_norm,
|
137
|
+
encode_norm:&simdef_encode_norm,
|
138
|
+
destroy:&simdef_destroy
|
139
|
+
};
|
140
|
+
Similarity *sim_create_default()
|
141
|
+
{
|
142
|
+
int i;
|
143
|
+
if (!default_similarity.data) {
|
144
|
+
for (i = 0; i < 256; i++)
|
145
|
+
default_similarity.norm_table[i] = byte_to_float(i);
|
146
|
+
|
147
|
+
default_similarity.data = &default_similarity;
|
148
|
+
}
|
149
|
+
return &default_similarity;
|
55
150
|
|
56
|
-
|
57
|
-
|
58
|
-
|
151
|
+
// s->length_norm = &simdef_length_norm;
|
152
|
+
// s->query_norm = &simdef_query_norm;
|
153
|
+
// s->tf = &simdef_tf;
|
154
|
+
// s->sloppy_freq = &simdef_sloppy_freq;
|
155
|
+
// s->idf_term = &simdef_idf_term;
|
156
|
+
// s->idf_phrase = &simdef_idf_phrase;
|
157
|
+
// s->idf = &simdef_idf;
|
158
|
+
// s->coord = &simdef_coord;
|
159
|
+
// s->decode_norm = &simdef_decode_norm;
|
160
|
+
// s->encode_norm = &simdef_encode_norm;
|
161
|
+
// s->destroy = &simdef_destroy;
|
162
|
+
// return s;
|
59
163
|
}
|
data/ext/sort.c
ADDED
@@ -0,0 +1,661 @@
|
|
1
|
+
#include <string.h>
|
2
|
+
#include "search.h"
|
3
|
+
#include "index.h"
|
4
|
+
|
5
|
+
/***************************************************************************
|
6
|
+
*
|
7
|
+
* SortField
|
8
|
+
*
|
9
|
+
***************************************************************************/
|
10
|
+
|
11
|
+
unsigned int sort_field_hash(const void *p)
|
12
|
+
{
|
13
|
+
SortField *self = (SortField *)p;
|
14
|
+
return str_hash(self->field) ^ (self->type*37);
|
15
|
+
}
|
16
|
+
|
17
|
+
int sort_field_eq(const void *p1, const void *p2)
|
18
|
+
{
|
19
|
+
SortField *key1 = (SortField *)p1;
|
20
|
+
SortField *key2 = (SortField *)p2;
|
21
|
+
int equal = (strcmp(key1->field, key2->field) == 0) && key1->type == key2->type;
|
22
|
+
/*
|
23
|
+
* TODO: The could probable be done more cleanly.
|
24
|
+
* If the sort field is an auto field then it was evaluated before it was
|
25
|
+
* entered into the cache so we need to pass the compare function back to
|
26
|
+
* the new sort field.
|
27
|
+
*/
|
28
|
+
if (equal && (key1->type == SORT_TYPE_AUTO)) {
|
29
|
+
key2->compare = key1->compare;
|
30
|
+
}
|
31
|
+
return equal;
|
32
|
+
}
|
33
|
+
|
34
|
+
/*
 * Make a copy of a SortField. All members are copied bitwise, then the clone
 * gets its own freshly initialised mutex and its own duplicate of the field
 * name. The index pointer is shared with the original.
 *
 * A NULL field name stays NULL, matching sort_field_alloc(); previously it
 * was passed straight to estrdup().
 */
SortField *sort_field_clone(SortField *self)
{
    SortField *clone = ALLOC(SortField);

    memcpy(clone, self, sizeof(SortField));
    mutex_init(&clone->mutex, NULL);
    clone->field = self->field ? estrdup(self->field) : NULL;
    return clone;
}
|
42
|
+
|
43
|
+
SortField *sort_field_alloc(char *field, int type, bool reverse)
|
44
|
+
{
|
45
|
+
SortField *self = ALLOC(SortField);
|
46
|
+
mutex_init(&self->mutex, NULL);
|
47
|
+
self->field = field ? estrdup(field) : NULL;
|
48
|
+
self->type = type;
|
49
|
+
self->reverse = reverse;
|
50
|
+
self->index = NULL;
|
51
|
+
self->destroy_index = &free;
|
52
|
+
self->compare = NULL;
|
53
|
+
return self;
|
54
|
+
}
|
55
|
+
|
56
|
+
SortField *sort_field_create(char *field, int type, bool reverse)
|
57
|
+
{
|
58
|
+
SortField *sf = NULL;
|
59
|
+
switch (type) {
|
60
|
+
case SORT_TYPE_SCORE:
|
61
|
+
sf = sort_field_score_create(reverse);
|
62
|
+
break;
|
63
|
+
case SORT_TYPE_DOC:
|
64
|
+
sf = sort_field_doc_create(reverse);
|
65
|
+
break;
|
66
|
+
case SORT_TYPE_INTEGER:
|
67
|
+
sf = sort_field_int_create(field, reverse);
|
68
|
+
break;
|
69
|
+
case SORT_TYPE_FLOAT:
|
70
|
+
sf = sort_field_float_create(field, reverse);
|
71
|
+
break;
|
72
|
+
case SORT_TYPE_STRING:
|
73
|
+
sf = sort_field_string_create(field, reverse);
|
74
|
+
break;
|
75
|
+
case SORT_TYPE_AUTO:
|
76
|
+
sf = sort_field_auto_create(field, reverse);
|
77
|
+
break;
|
78
|
+
}
|
79
|
+
return sf;
|
80
|
+
}
|
81
|
+
|
82
|
+
void sort_field_destroy(void *p)
|
83
|
+
{
|
84
|
+
SortField *self = (SortField *)p;
|
85
|
+
if (self->index) {
|
86
|
+
self->destroy_index(self->index);
|
87
|
+
}
|
88
|
+
free(self->field);
|
89
|
+
mutex_destroy(&self->mutex);
|
90
|
+
free(p);
|
91
|
+
}
|
92
|
+
|
93
|
+
/***************************************************************************
|
94
|
+
* ScoreSortField
|
95
|
+
***************************************************************************/
|
96
|
+
|
97
|
+
/*
 * Three-way comparison of two hits by score. Note the parameter order: the
 * second argument is named hit2 and comes first, so the result is 1 when the
 * *third* argument has the higher score, -1 when lower, 0 otherwise.
 * index_ptr is unused.
 */
int sf_score_compare(void *index_ptr, Hit *hit2, Hit *hit1)
{
    float a = hit1->score;
    float b = hit2->score;

    return (a > b) - (a < b);
}
|
105
|
+
|
106
|
+
SortField *sort_field_score_create(bool reverse)
|
107
|
+
{
|
108
|
+
SortField *self = sort_field_alloc(NULL, SORT_TYPE_SCORE, reverse);
|
109
|
+
self->compare = &sf_score_compare;
|
110
|
+
return self;
|
111
|
+
}
|
112
|
+
|
113
|
+
SortField SORT_FIELD_SCORE = {
|
114
|
+
field:NULL,
|
115
|
+
type:SORT_TYPE_SCORE,
|
116
|
+
reverse:false,
|
117
|
+
index:NULL,
|
118
|
+
compare:&sf_score_compare,
|
119
|
+
create_index:NULL,
|
120
|
+
destroy_index:NULL,
|
121
|
+
handle_term:NULL
|
122
|
+
};
|
123
|
+
|
124
|
+
SortField SORT_FIELD_SCORE_REV = {
|
125
|
+
field:NULL,
|
126
|
+
type:SORT_TYPE_SCORE,
|
127
|
+
reverse:true,
|
128
|
+
index:NULL,
|
129
|
+
compare:&sf_score_compare,
|
130
|
+
create_index:NULL,
|
131
|
+
destroy_index:NULL,
|
132
|
+
handle_term:NULL
|
133
|
+
};
|
134
|
+
|
135
|
+
/**************************************************************************
|
136
|
+
* DocSortField
|
137
|
+
***************************************************************************/
|
138
|
+
|
139
|
+
/*
 * Three-way comparison of two hits by document number: 1 when hit1's doc is
 * larger, -1 when smaller, 0 when equal. index_ptr is unused.
 */
int sf_doc_compare(void *index_ptr, Hit *hit1, Hit *hit2)
{
    int a = hit1->doc;
    int b = hit2->doc;

    return (a > b) - (a < b);
}
|
147
|
+
|
148
|
+
SortField *sort_field_doc_create(bool reverse)
|
149
|
+
{
|
150
|
+
SortField *self = sort_field_alloc(NULL, SORT_TYPE_DOC, reverse);
|
151
|
+
self->compare = &sf_doc_compare;
|
152
|
+
return self;
|
153
|
+
}
|
154
|
+
|
155
|
+
SortField SORT_FIELD_DOC = {
|
156
|
+
field:NULL,
|
157
|
+
type:SORT_TYPE_DOC,
|
158
|
+
reverse:false,
|
159
|
+
index:NULL,
|
160
|
+
compare:&sf_doc_compare,
|
161
|
+
create_index:NULL,
|
162
|
+
destroy_index:NULL,
|
163
|
+
handle_term:NULL
|
164
|
+
};
|
165
|
+
|
166
|
+
SortField SORT_FIELD_DOC_REV = {
|
167
|
+
field:NULL,
|
168
|
+
type:SORT_TYPE_DOC,
|
169
|
+
reverse:true,
|
170
|
+
index:NULL,
|
171
|
+
compare:&sf_doc_compare,
|
172
|
+
create_index:NULL,
|
173
|
+
destroy_index:NULL,
|
174
|
+
handle_term:NULL
|
175
|
+
};
|
176
|
+
|
177
|
+
/***************************************************************************
|
178
|
+
* IntegerSortField
|
179
|
+
***************************************************************************/
|
180
|
+
|
181
|
+
/*
 * Three-way comparison of two hits by the integer stored for their document
 * in the index array: 1, -1 or 0.
 */
int sf_int_compare(void *index_ptr, Hit *hit1, Hit *hit2)
{
    const int *values = (int *)index_ptr;
    int a = values[hit1->doc];
    int b = values[hit2->doc];

    return (a > b) - (a < b);
}
|
190
|
+
|
191
|
+
void *sf_int_create_index(int size)
|
192
|
+
{
|
193
|
+
return ALLOC_N(int, size);
|
194
|
+
}
|
195
|
+
|
196
|
+
/*
 * Parse text as a decimal integer and store it for every document the term
 * enumerator yields.
 *
 * If text does not start with an integer the value 0 is stored; previously
 * the sscanf() result was ignored and an uninitialised value was written.
 */
void sf_int_handle_term(void *index_ptr, TermDocEnum *tde, char *text)
{
    int *index = (int *)index_ptr;
    int val = 0;

    if (sscanf(text, "%d", &val) != 1) {
        val = 0;
    }
    while (tde->next(tde)) {
        index[tde->doc_num(tde)] = val;
    }
}
|
205
|
+
|
206
|
+
/*
 * Install the integer-sort implementations of compare, create_index and
 * handle_term on a SortField. destroy_index is left as it was.
 */
void sort_field_int_methods(SortField *self)
{
    self->handle_term  = sf_int_handle_term;
    self->create_index = sf_int_create_index;
    self->compare      = sf_int_compare;
}
|
212
|
+
|
213
|
+
SortField *sort_field_int_create(char *field, bool reverse)
|
214
|
+
{
|
215
|
+
SortField *self = sort_field_alloc(field, SORT_TYPE_INTEGER, reverse);
|
216
|
+
sort_field_int_methods(self);
|
217
|
+
return self;
|
218
|
+
}
|
219
|
+
|
220
|
+
/***************************************************************************
|
221
|
+
* FloatSortField
|
222
|
+
***************************************************************************/
|
223
|
+
|
224
|
+
/*
 * Three-way comparison of two hits by the float stored for their document in
 * the index array: 1, -1 or 0.
 */
int sf_float_compare(void *index_ptr, Hit *hit1, Hit *hit2)
{
    const float *values = (float *)index_ptr;
    float a = values[hit1->doc];
    float b = values[hit2->doc];

    return (a > b) - (a < b);
}
|
233
|
+
|
234
|
+
void *sf_float_create_index(int size)
|
235
|
+
{
|
236
|
+
return ALLOC_N(float, size);
|
237
|
+
}
|
238
|
+
|
239
|
+
/*
 * Parse text as a float and store it for every document the term enumerator
 * yields.
 *
 * If text does not start with a number the value 0 is stored; previously the
 * sscanf() result was ignored and an uninitialised value was written.
 */
void sf_float_handle_term(void *index_ptr, TermDocEnum *tde, char *text)
{
    float *index = (float *)index_ptr;
    float val = 0.0f;

    if (sscanf(text, "%g", &val) != 1) {
        val = 0.0f;
    }
    while (tde->next(tde)) {
        index[tde->doc_num(tde)] = val;
    }
}
|
248
|
+
|
249
|
+
/*
 * Install the float-sort implementations of compare, create_index and
 * handle_term on a SortField. destroy_index is left as it was.
 */
void sort_field_float_methods(SortField *self)
{
    self->handle_term  = sf_float_handle_term;
    self->create_index = sf_float_create_index;
    self->compare      = sf_float_compare;
}
|
255
|
+
|
256
|
+
SortField *sort_field_float_create(char *field, bool reverse)
|
257
|
+
{
|
258
|
+
SortField *self = sort_field_alloc(field, SORT_TYPE_FLOAT, reverse);
|
259
|
+
sort_field_float_methods(self);
|
260
|
+
return self;
|
261
|
+
}
|
262
|
+
|
263
|
+
/***************************************************************************
|
264
|
+
* StringSortField
|
265
|
+
***************************************************************************/
|
266
|
+
|
267
|
+
/* Initial capacity of StringIndex.values; it is doubled whenever it fills. */
#define VALUES_ARRAY_START_SIZE 8

/* Sort index for string fields: one ordinal per document plus the strings. */
typedef struct StringIndex {
    int size;       /* number of entries in index */
    int *index;     /* per document: position in values of its term */
    char **values;  /* owned copies of the term texts, in the order seen */
    int v_cnt;      /* number of entries of values in use */
    int v_size;     /* allocated capacity of values */
} StringIndex;
|
275
|
+
|
276
|
+
/*
 * Three-way comparison of two hits by the term ordinal stored for their
 * document in a StringIndex: 1, -1 or 0. The strings themselves are not
 * compared.
 */
int sf_string_compare(void *index_ptr, Hit *hit1, Hit *hit2)
{
    const int *ordinals = ((StringIndex *)index_ptr)->index;
    int a = ordinals[hit1->doc];
    int b = ordinals[hit2->doc];

    return (a > b) - (a < b);
}
|
285
|
+
|
286
|
+
void *sf_string_create_index(int size)
|
287
|
+
{
|
288
|
+
StringIndex *self = ALLOC(StringIndex);
|
289
|
+
ZEROSET(self, StringIndex, 1);
|
290
|
+
self->size = size;
|
291
|
+
self->index = ALLOC_N(int, size);
|
292
|
+
ZEROSET(self->index, int, size);
|
293
|
+
self->v_size = VALUES_ARRAY_START_SIZE;
|
294
|
+
self->values = ALLOC_N(char *, VALUES_ARRAY_START_SIZE);
|
295
|
+
return self;
|
296
|
+
}
|
297
|
+
|
298
|
+
void sf_string_destroy_index(void *p)
|
299
|
+
{
|
300
|
+
StringIndex *self = (StringIndex *)p;
|
301
|
+
int i;
|
302
|
+
free(self->index);
|
303
|
+
for (i = 0; i < self->v_cnt; i++) {
|
304
|
+
free(self->values[i]);
|
305
|
+
}
|
306
|
+
free(self->values);
|
307
|
+
free(self);
|
308
|
+
}
|
309
|
+
|
310
|
+
/*
 * Append a copy of text to the StringIndex's values (doubling the array when
 * it is full) and record the new entry's position for every document the
 * term enumerator yields.
 */
void sf_string_handle_term(void *index_ptr, TermDocEnum *tde, char *text)
{
    StringIndex *si = (StringIndex *)index_ptr;
    const int ord = si->v_cnt;

    if (ord >= si->v_size) {
        si->v_size *= 2;
        si->values = REALLOC_N(si->values, char *, si->v_size);
    }
    si->values[ord] = estrdup(text);

    while (tde->next(tde)) {
        si->index[tde->doc_num(tde)] = ord;
    }
    si->v_cnt = ord + 1;
}
|
323
|
+
|
324
|
+
/*
 * Install the string-sort implementations of compare, create_index,
 * destroy_index and handle_term on a SortField.
 */
void sort_field_string_methods(SortField *self)
{
    self->handle_term   = sf_string_handle_term;
    self->destroy_index = sf_string_destroy_index;
    self->create_index  = sf_string_create_index;
    self->compare       = sf_string_compare;
}
|
331
|
+
|
332
|
+
SortField *sort_field_string_create(char *field, bool reverse)
|
333
|
+
{
|
334
|
+
SortField *self = sort_field_alloc(field, SORT_TYPE_STRING, reverse);
|
335
|
+
sort_field_string_methods(self);
|
336
|
+
return self;
|
337
|
+
}
|
338
|
+
|
339
|
+
/***************************************************************************
|
340
|
+
* AutoSortField
|
341
|
+
***************************************************************************/
|
342
|
+
|
343
|
+
/*
 * Decide how an AUTO sort field should sort, based on one sample term text,
 * and install the matching methods on sf:
 *   - integer methods if the whole of text parses as an int,
 *   - float methods if the whole of text parses as a float,
 *   - string methods otherwise.
 *
 * A parse only counts when sscanf() reports a conversion AND consumed the
 * entire text. Previously only the consumed length was checked and it was
 * not reset between attempts, so an empty text (length 0, nothing consumed)
 * was classified as an integer.
 */
void sort_field_auto_evaluate(SortField *sf, char *text)
{
    int int_val;
    float float_val;
    int text_len = (int)strlen(text);
    int scan_len = 0;

    if (sscanf(text, "%d%n", &int_val, &scan_len) == 1
        && scan_len == text_len) {
        sort_field_int_methods(sf);
        return;
    }

    scan_len = 0;
    if (sscanf(text, "%f%n", &float_val, &scan_len) == 1
        && scan_len == text_len) {
        sort_field_float_methods(sf);
        return;
    }

    sort_field_string_methods(sf);
}
|
362
|
+
|
363
|
+
|
364
|
+
SortField *sort_field_auto_create(char *field, bool reverse)
|
365
|
+
{
|
366
|
+
return sort_field_alloc(field, SORT_TYPE_AUTO, reverse);
|
367
|
+
}
|
368
|
+
|
369
|
+
/***************************************************************************
|
370
|
+
*
|
371
|
+
* FieldCache
|
372
|
+
*
|
373
|
+
***************************************************************************/
|
374
|
+
|
375
|
+
/*
 * Return the sort index for sf on reader ir, building and caching it on
 * first use.
 *
 * The reader's sort_cache (created lazily) is keyed by SortField. On a miss
 * and with at least one document, the terms starting at (sf->field, "") are
 * walked while they belong to sf->field; each one is handed to
 * sf->handle_term together with a term-doc enumerator positioned on it. An
 * AUTO sort field is first resolved to int/float/string from the first term.
 * The result is stored in the cache under a clone of sf that owns the index.
 *
 * Returns the index, or NULL when the reader has no documents or no term
 * could be found. The whole operation runs under sf->mutex.
 *
 * The "no terms" error path now closes both enumerators and releases the
 * mutex before reporting, and returns NULL if eprintf() comes back.
 * Previously the lock stayed held, the enumerators leaked, and execution
 * would have continued into a NULL dereference of te->tb_curr.
 */
void *field_cache_get_index(IndexReader *ir, SortField *sf)
{
    void *index = NULL;
    int length = 0;
    Term term;
    TermBuffer *tb;
    TermEnum *te;
    TermDocEnum *tde;
    char *field = sf->field;
    SortField *sf_clone;

    mutex_lock(&sf->mutex);
    if (!ir->sort_cache) {
        ir->sort_cache = h_new(&sort_field_hash, &sort_field_eq,
                               &sort_field_destroy, NULL);
    }
    index = h_get(ir->sort_cache, sf);

    if (index == NULL) {
        length = ir->max_doc(ir);
        if (length > 0) {
            tde = ir->term_docs(ir);
            term.field = field;
            term.text = "";
            te = ir->terms_from(ir, &term);
            if (te->tb_curr == NULL) {
                /* release everything before reporting: eprintf may not
                 * return to this frame */
                tde->close(tde);
                te->close(te);
                mutex_unlock(&sf->mutex);
                eprintf(ARG_ERROR, "no terms in field '%s' to sort by", field);
                return NULL;
            }

            if (sf->type == SORT_TYPE_AUTO) {
                sort_field_auto_evaluate(sf, te->tb_curr->text);
            }

            index = sf->create_index(length);

            do {
                tb = te->tb_curr;
                /* stop at the first term that belongs to another field */
                if (strcmp(tb->field, field) != 0) break;
                term.text = tb->text;
                tde->seek(tde, &term);
                sf->handle_term(index, tde, tb->text);
            } while (te->next(te));
            tde->close(tde);
            te->close(te);
        }
        /* the cache key is a private clone that takes ownership of index */
        sf_clone = sort_field_clone(sf);
        sf_clone->index = index;
        h_set(ir->sort_cache, sf_clone, index);
    }
    mutex_unlock(&sf->mutex);
    return index;
}
|
427
|
+
|
428
|
+
/***************************************************************************
|
429
|
+
*
|
430
|
+
* FieldSortedHitQueue
|
431
|
+
*
|
432
|
+
***************************************************************************/
|
433
|
+
|
434
|
+
/***************************************************************************
|
435
|
+
* Comparator
|
436
|
+
***************************************************************************/
|
437
|
+
|
438
|
+
typedef struct Comparator {
|
439
|
+
void *index;
|
440
|
+
bool reverse : 1;
|
441
|
+
int (*compare)(void *index_ptr, Hit *hit1, Hit *hit2);
|
442
|
+
} Comparator;
|
443
|
+
|
444
|
+
Comparator *comparator_create(void *index, bool reverse,
|
445
|
+
int (*compare)(void *index_ptr, Hit *hit1, Hit *hit2))
|
446
|
+
{
|
447
|
+
Comparator *self = ALLOC(Comparator);
|
448
|
+
self->index = index;
|
449
|
+
self->reverse = reverse;
|
450
|
+
self->compare = compare;
|
451
|
+
return self;
|
452
|
+
}
|
453
|
+
|
454
|
+
/***************************************************************************
|
455
|
+
* Sorter
|
456
|
+
***************************************************************************/
|
457
|
+
|
458
|
+
typedef struct Sorter {
|
459
|
+
Comparator **comparators;
|
460
|
+
int c_cnt;
|
461
|
+
} Sorter;
|
462
|
+
|
463
|
+
/*
 * Build the Comparator for one SortField. Types above SORT_TYPE_DOC need a
 * sort index, fetched from the reader's field cache; score and doc sorts get
 * a NULL index.
 */
Comparator *sorter_get_comparator(SortField *sf, IndexReader *ir)
{
    /* The index must be fetched before sf->compare is read: for an AUTO
     * field the lookup is what installs the compare function. */
    void *index = (sf->type > SORT_TYPE_DOC)
        ? field_cache_get_index(ir, sf)
        : NULL;

    return comparator_create(index, sf->reverse, sf->compare);
}
|
472
|
+
|
473
|
+
void sorter_destroy(void *p)
|
474
|
+
{
|
475
|
+
int i;
|
476
|
+
Sorter *self = (Sorter *)p;
|
477
|
+
|
478
|
+
for (i = 0; i < self->c_cnt; i++) {
|
479
|
+
free(self->comparators[i]);
|
480
|
+
}
|
481
|
+
free(self->comparators);
|
482
|
+
free(self);
|
483
|
+
}
|
484
|
+
|
485
|
+
Sorter *sorter_create(int size)
|
486
|
+
{
|
487
|
+
Sorter *self = ALLOC(Sorter);
|
488
|
+
self->c_cnt = size;
|
489
|
+
self->comparators = ALLOC_N(Comparator *, size);
|
490
|
+
ZEROSET(self->comparators, Comparator *, size);
|
491
|
+
return self;
|
492
|
+
}
|
493
|
+
|
494
|
+
/***************************************************************************
|
495
|
+
* FieldSortedHitQueue
|
496
|
+
***************************************************************************/
|
497
|
+
|
498
|
+
bool fshq_less_than(void *hit1, void *hit2)
|
499
|
+
{
|
500
|
+
printf("Whoops, shouldn't call this.\n");
|
501
|
+
int cmp = 0;
|
502
|
+
if (cmp != 0) {
|
503
|
+
return cmp;
|
504
|
+
} else {
|
505
|
+
return ((Hit *)hit1)->score < ((Hit *)hit2)->score;
|
506
|
+
}
|
507
|
+
}
|
508
|
+
|
509
|
+
/*
 * Ordering predicate of the field-sorted hit queue. sorter_ptr is really the
 * queue's Sorter (kept in heap slot 0, hence the Hit * type).
 *
 * The comparators are tried in order until one reports a difference; a
 * reversed comparator is called with the hits swapped. The result is true
 * when that difference is positive. If every comparator reports equality the
 * hit with the larger document number counts as "less".
 */
inline bool fshq_lt(Hit *sorter_ptr, Hit *hit1, Hit *hit2)
{
    Sorter *sorter = (Sorter *)sorter_ptr;
    int i;

    for (i = 0; i < sorter->c_cnt; i++) {
        Comparator *comp = sorter->comparators[i];
        int diff = comp->reverse
            ? comp->compare(comp->index, hit2, hit1)
            : comp->compare(comp->index, hit1, hit2);

        if (diff != 0) {
            return diff > 0;
        }
    }
    return hit1->doc > hit2->doc;
}
|
529
|
+
|
530
|
+
/*
 * Sift the element in heap slot 1 down to its place. The heap is 1-based;
 * slot 0 holds the Sorter that fshq_lt() needs.
 */
void fshq_pq_down(PriorityQueue *pq)
{
    Hit **heap = (Hit **)pq->heap;
    Hit *sorter = heap[0];
    Hit *node = heap[1];        /* save top node */
    int hole = 1;
    int child = 2;

    for (;;) {
        /* pick whichever of the two children sorts first */
        if ((child + 1 <= pq->count)
            && fshq_lt(sorter, heap[child + 1], heap[child])) {
            child++;
        }
        if ((child > pq->count) || !fshq_lt(sorter, heap[child], node)) {
            break;
        }
        heap[hole] = heap[child];   /* shift up child */
        hole = child;
        child = hole << 1;
    }
    heap[hole] = node;
}
|
551
|
+
|
552
|
+
/*
 * Remove and return the top hit of the queue, or NULL when it is empty.
 * The last element is moved to the top and sifted down.
 */
Hit *fshq_pq_pop(PriorityQueue *pq)
{
    Hit *top;

    if (pq->count <= 0) {
        return NULL;
    }

    top = (Hit *)pq->heap[1];               /* save first value */
    pq->heap[1] = pq->heap[pq->count];      /* move last to first */
    pq->heap[pq->count] = NULL;
    pq->count--;
    fshq_pq_down(pq);                       /* adjust heap */
    return top;
}
|
565
|
+
|
566
|
+
/*
 * Sift the element in the last used heap slot (pq->count) up to its place.
 * The heap is 1-based; slot 0 holds the Sorter that fshq_lt() needs.
 */
inline void fshq_pq_up(PriorityQueue *pq)
{
    Hit **heap = (Hit **)pq->heap;
    int hole = pq->count;
    Hit *node = heap[hole];
    int parent;

    for (parent = hole >> 1;
         (parent > 0) && fshq_lt(heap[0], node, heap[parent]);
         parent >>= 1) {
        heap[hole] = heap[parent];
        hole = parent;
    }
    heap[hole] = node;
}
|
581
|
+
|
582
|
+
/*
 * Add elem to the queue: place it in the next free heap slot and sift it up.
 * No capacity check is made here.
 */
void fshq_pq_push(PriorityQueue *pq, void *elem)
{
    pq->heap[++pq->count] = elem;
    fshq_pq_up(pq);
}
|
588
|
+
|
589
|
+
void fshq_pq_destroy(void *p)
|
590
|
+
{
|
591
|
+
PriorityQueue *self = (PriorityQueue *)p;
|
592
|
+
sorter_destroy(self->heap[0]);
|
593
|
+
pq_destroy(self);
|
594
|
+
}
|
595
|
+
|
596
|
+
/*
 * Create a hit queue of the given size that orders hits according to sort.
 * One Comparator is built per sort field (using reader ir for any sort
 * indexes) and the resulting Sorter is parked in heap slot 0.
 */
PriorityQueue *fshq_pq_create(int size, Sort *sort, IndexReader *ir)
{
    PriorityQueue *pq = pq_create(size, &fshq_less_than);
    Sorter *sorter = sorter_create(sort->sf_cnt);
    int i;

    for (i = 0; i < sort->sf_cnt; i++) {
        sorter->comparators[i] = sorter_get_comparator(sort->sort_fields[i], ir);
    }
    pq->heap[0] = sorter;

    return pq;
}
|
611
|
+
|
612
|
+
/***************************************************************************
|
613
|
+
*
|
614
|
+
* Sort
|
615
|
+
*
|
616
|
+
***************************************************************************/
|
617
|
+
|
618
|
+
Sort *sort_create()
|
619
|
+
{
|
620
|
+
Sort *self = ALLOC(Sort);
|
621
|
+
self->sf_cnt = 0;
|
622
|
+
self->sf_capa = ARRAY_INIT_SIZE;
|
623
|
+
self->sort_fields = ALLOC_N(SortField *, ARRAY_INIT_SIZE);
|
624
|
+
self->destroy_all = true;
|
625
|
+
|
626
|
+
return self;
|
627
|
+
}
|
628
|
+
|
629
|
+
/*
 * Destroy every sort field held by the Sort and reset its count to zero.
 * The array itself and its capacity are kept.
 */
void sort_clear(Sort *self)
{
    int i = 0;

    while (i < self->sf_cnt) {
        sort_field_destroy(self->sort_fields[i]);
        i++;
    }
    self->sf_cnt = 0;
}
|
637
|
+
|
638
|
+
void sort_destroy(void *p)
|
639
|
+
{
|
640
|
+
int i;
|
641
|
+
Sort *self = (Sort *)p;
|
642
|
+
if (self->destroy_all) {
|
643
|
+
for (i = 0; i < self->sf_cnt; i++) {
|
644
|
+
sort_field_destroy(self->sort_fields[i]);
|
645
|
+
}
|
646
|
+
}
|
647
|
+
free(self->sort_fields);
|
648
|
+
free(self);
|
649
|
+
}
|
650
|
+
|
651
|
+
/*
 * Append a sort field to the Sort, doubling the array's capacity first when
 * it is full. The pointer is stored as-is, not copied.
 */
void sort_add_sort_field(Sort *self, SortField *sf)
{
    if (self->sf_cnt == self->sf_capa) {
        self->sf_capa *= 2;
        REALLOC_N(self->sort_fields, SortField *, self->sf_capa);
    }

    self->sort_fields[self->sf_cnt++] = sf;
}
|
661
|
+
|