ferret 0.3.2 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141)
  1. data/CHANGELOG +9 -0
  2. data/Rakefile +51 -25
  3. data/ext/analysis.c +553 -0
  4. data/ext/analysis.h +76 -0
  5. data/ext/array.c +83 -0
  6. data/ext/array.h +19 -0
  7. data/ext/bitvector.c +164 -0
  8. data/ext/bitvector.h +29 -0
  9. data/ext/compound_io.c +335 -0
  10. data/ext/document.c +336 -0
  11. data/ext/document.h +87 -0
  12. data/ext/ferret.c +88 -47
  13. data/ext/ferret.h +43 -109
  14. data/ext/field.c +395 -0
  15. data/ext/filter.c +103 -0
  16. data/ext/fs_store.c +352 -0
  17. data/ext/global.c +219 -0
  18. data/ext/global.h +73 -0
  19. data/ext/hash.c +446 -0
  20. data/ext/hash.h +80 -0
  21. data/ext/hashset.c +141 -0
  22. data/ext/hashset.h +37 -0
  23. data/ext/helper.c +11 -0
  24. data/ext/helper.h +5 -0
  25. data/ext/inc/lang.h +41 -0
  26. data/ext/ind.c +389 -0
  27. data/ext/index.h +884 -0
  28. data/ext/index_io.c +269 -415
  29. data/ext/index_rw.c +2543 -0
  30. data/ext/lang.c +31 -0
  31. data/ext/lang.h +41 -0
  32. data/ext/priorityqueue.c +228 -0
  33. data/ext/priorityqueue.h +44 -0
  34. data/ext/q_boolean.c +1331 -0
  35. data/ext/q_const_score.c +154 -0
  36. data/ext/q_fuzzy.c +287 -0
  37. data/ext/q_match_all.c +142 -0
  38. data/ext/q_multi_phrase.c +343 -0
  39. data/ext/q_parser.c +2180 -0
  40. data/ext/q_phrase.c +657 -0
  41. data/ext/q_prefix.c +75 -0
  42. data/ext/q_range.c +247 -0
  43. data/ext/q_span.c +1566 -0
  44. data/ext/q_term.c +308 -0
  45. data/ext/q_wildcard.c +146 -0
  46. data/ext/r_analysis.c +255 -0
  47. data/ext/r_doc.c +578 -0
  48. data/ext/r_index_io.c +996 -0
  49. data/ext/r_qparser.c +158 -0
  50. data/ext/r_search.c +2321 -0
  51. data/ext/r_store.c +263 -0
  52. data/ext/r_term.c +219 -0
  53. data/ext/ram_store.c +447 -0
  54. data/ext/search.c +524 -0
  55. data/ext/search.h +1065 -0
  56. data/ext/similarity.c +143 -39
  57. data/ext/sort.c +661 -0
  58. data/ext/store.c +35 -0
  59. data/ext/store.h +152 -0
  60. data/ext/term.c +704 -143
  61. data/ext/termdocs.c +599 -0
  62. data/ext/vector.c +594 -0
  63. data/lib/ferret.rb +9 -10
  64. data/lib/ferret/analysis/analyzers.rb +2 -2
  65. data/lib/ferret/analysis/standard_tokenizer.rb +1 -1
  66. data/lib/ferret/analysis/token.rb +14 -14
  67. data/lib/ferret/analysis/token_filters.rb +3 -3
  68. data/lib/ferret/document/field.rb +16 -17
  69. data/lib/ferret/index/document_writer.rb +4 -4
  70. data/lib/ferret/index/index.rb +39 -23
  71. data/lib/ferret/index/index_writer.rb +2 -2
  72. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +1 -8
  73. data/lib/ferret/index/segment_term_vector.rb +4 -4
  74. data/lib/ferret/index/term.rb +5 -1
  75. data/lib/ferret/index/term_vector_offset_info.rb +6 -6
  76. data/lib/ferret/index/term_vectors_io.rb +5 -5
  77. data/lib/ferret/query_parser/query_parser.tab.rb +81 -77
  78. data/lib/ferret/search.rb +1 -1
  79. data/lib/ferret/search/boolean_query.rb +2 -1
  80. data/lib/ferret/search/field_sorted_hit_queue.rb +3 -3
  81. data/lib/ferret/search/fuzzy_query.rb +2 -1
  82. data/lib/ferret/search/index_searcher.rb +3 -0
  83. data/lib/ferret/search/{match_all_docs_query.rb → match_all_query.rb} +7 -7
  84. data/lib/ferret/search/multi_phrase_query.rb +6 -5
  85. data/lib/ferret/search/phrase_query.rb +3 -6
  86. data/lib/ferret/search/prefix_query.rb +4 -4
  87. data/lib/ferret/search/sort.rb +3 -1
  88. data/lib/ferret/search/sort_field.rb +9 -9
  89. data/lib/ferret/search/spans/near_spans_enum.rb +1 -1
  90. data/lib/ferret/search/spans/span_near_query.rb +1 -1
  91. data/lib/ferret/search/spans/span_weight.rb +1 -1
  92. data/lib/ferret/search/spans/spans_enum.rb +7 -7
  93. data/lib/ferret/store/fs_store.rb +10 -6
  94. data/lib/ferret/store/ram_store.rb +3 -3
  95. data/lib/rferret.rb +36 -0
  96. data/test/functional/thread_safety_index_test.rb +2 -2
  97. data/test/test_helper.rb +16 -2
  98. data/test/unit/analysis/c_token.rb +25 -0
  99. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +1 -1
  100. data/test/unit/analysis/tc_standard_analyzer.rb +1 -1
  101. data/test/unit/document/{tc_document.rb → c_document.rb} +0 -0
  102. data/test/unit/document/c_field.rb +98 -0
  103. data/test/unit/document/tc_field.rb +0 -66
  104. data/test/unit/index/{tc_index.rb → c_index.rb} +62 -6
  105. data/test/unit/index/{tc_index_reader.rb → c_index_reader.rb} +51 -10
  106. data/test/unit/index/{tc_index_writer.rb → c_index_writer.rb} +0 -4
  107. data/test/unit/index/{tc_term.rb → c_term.rb} +1 -3
  108. data/test/unit/index/{tc_term_vector_offset_info.rb → c_term_voi.rb} +5 -5
  109. data/test/unit/index/tc_segment_term_vector.rb +2 -2
  110. data/test/unit/index/tc_term_vectors_io.rb +4 -4
  111. data/test/unit/query_parser/c_query_parser.rb +138 -0
  112. data/test/unit/search/{tc_filter.rb → c_filter.rb} +24 -24
  113. data/test/unit/search/{tc_fuzzy_query.rb → c_fuzzy_query.rb} +0 -0
  114. data/test/unit/search/{tc_index_searcher.rb → c_index_searcher.rb} +9 -26
  115. data/test/unit/search/{tc_search_and_sort.rb → c_search_and_sort.rb} +15 -15
  116. data/test/unit/search/{tc_sort.rb → c_sort.rb} +2 -1
  117. data/test/unit/search/c_sort_field.rb +27 -0
  118. data/test/unit/search/{tc_spans.rb → c_spans.rb} +0 -0
  119. data/test/unit/search/tc_sort_field.rb +7 -20
  120. data/test/unit/store/c_fs_store.rb +76 -0
  121. data/test/unit/store/c_ram_store.rb +35 -0
  122. data/test/unit/store/m_store.rb +34 -0
  123. data/test/unit/store/m_store_lock.rb +68 -0
  124. data/test/unit/store/tc_fs_store.rb +0 -53
  125. data/test/unit/store/tc_ram_store.rb +0 -20
  126. data/test/unit/store/tm_store.rb +0 -30
  127. data/test/unit/store/tm_store_lock.rb +0 -66
  128. metadata +84 -31
  129. data/ext/Makefile +0 -140
  130. data/ext/ferret_ext.so +0 -0
  131. data/ext/priority_queue.c +0 -232
  132. data/ext/ram_directory.c +0 -321
  133. data/ext/segment_merge_queue.c +0 -37
  134. data/ext/segment_term_enum.c +0 -326
  135. data/ext/string_helper.c +0 -42
  136. data/ext/tags +0 -344
  137. data/ext/term_buffer.c +0 -230
  138. data/ext/term_infos_reader.c +0 -54
  139. data/ext/terminfo.c +0 -160
  140. data/ext/token.c +0 -93
  141. data/ext/util.c +0 -12
data/ext/similarity.c CHANGED
@@ -1,59 +1,163 @@
1
- #include "ferret.h"
1
+ #include <search.h>
2
+ #include <global.h>
2
3
  #include <math.h>
3
4
 
4
- /****************************************************************************
5
- *
6
- * Similarity Methods
7
- *
8
- ****************************************************************************/
5
+ static int low_bit = 0, low_mid_bit = 0, high_mid_bit = 0, high_bit = 0;
6
+ static void
7
+ setup_endian()
8
+ {
9
+ static int init = 0;
10
+ char *p;
11
+
12
+ if (init) return;
13
+ init = 1;
14
+ p = (char*)&init;
9
15
 
10
- /*
11
- static VALUE
12
- frt_sim_c_byte_to_float(VALUE self, VALUE rbyte)
16
+ if (p[0]) {
17
+ low_bit = 0;
18
+ low_mid_bit = 1;
19
+ high_mid_bit = 2;
20
+ high_bit = 3;
21
+ } else {
22
+ low_bit = 3;
23
+ low_mid_bit = 2;
24
+ high_mid_bit = 1;
25
+ high_bit = 0;
26
+ }
27
+ }
28
+
29
+ float byte_to_float(uchar b)
13
30
  {
14
- int b = FIX2INT(rbyte);
31
+ char flt[4];
15
32
  if (b == 0)
16
- return rb_float_new(0.0);
33
+ return 0.0;
17
34
  int mantissa = b & 0x07; // 0x07 = 7 = 0b00000111
18
35
  int exponent = (b >> 3) & 0x1F; // 0x1f = 31 = 0b00011111
19
- int val = (mantissa << 21) | ((exponent + 48) << 24);
20
- void *tmp = &val;
21
- return rb_float_new(*(float *)tmp);
36
+
37
+ if (!low_mid_bit) setup_endian();
38
+ flt[low_bit] = flt[low_mid_bit] = 0;
39
+ flt[high_mid_bit] = mantissa << 5;
40
+ flt[high_bit] = exponent + 48;
41
+ return *((float *)flt);
42
+ }
43
+
44
+ uchar float_to_byte(float f)
45
+ {
46
+ if (f <= 0.0)
47
+ return 0;
48
+
49
+ char *bits = (char *)&f;
50
+ int mantissa = (bits[high_mid_bit] & 0xEf) >> 5;
51
+ int exponent = (bits[high_bit] - 48);
52
+
53
+ if (exponent > 0x1f) {
54
+ exponent = 0x1f; // 0x1f = 31 = 0b00011111
55
+ mantissa = 0x07; // 0x07 = 7 = 0b00000111
56
+ }
57
+
58
+ if (exponent < 0) {
59
+ exponent = 0;
60
+ mantissa = 1;
61
+ }
62
+
63
+ return ((exponent<<3) | mantissa);
64
+ }
65
+
66
+ float simdef_length_norm(Similarity *s, char *field, int num_terms)
67
+ {
68
+ return 1.0 / sqrt(num_terms);
69
+ }
70
+
71
+ float simdef_query_norm(struct Similarity *s, float sum_of_squared_weights)
72
+ {
73
+ return 1.0 / sqrt(sum_of_squared_weights);
74
+ }
75
+
76
+ float simdef_tf(struct Similarity *s, float freq)
77
+ {
78
+ return sqrt(freq);
79
+ }
80
+
81
+ float simdef_sloppy_freq(struct Similarity *s, int distance)
82
+ {
83
+ return 1.0 / (float)(distance + 1);
22
84
  }
23
- */
24
85
 
25
- static VALUE
26
- frt_dsim_tf(VALUE self, VALUE freq)
86
+ float simdef_idf_term(struct Similarity *s, Term *term, Searcher *searcher)
27
87
  {
28
- return rb_float_new(sqrt(NUM2DBL(freq)));
88
+ return s->idf(s, searcher->doc_freq(searcher, term), searcher->max_doc(searcher));
29
89
  }
30
90
 
31
- static VALUE
32
- frt_dsim_idf(VALUE self, VALUE rdoc_freq, VALUE rnum_docs)
91
+ float simdef_idf_phrase(struct Similarity *s, Term **terms, int tcnt, Searcher *searcher)
33
92
  {
34
- int doc_freq;
35
- int num_docs = FIX2INT(rnum_docs);
36
- if (num_docs == 0) return rb_float_new(0.0);
93
+ float idf = 0.0;
94
+ int i;
95
+ for (i = 0; i < tcnt; i++) {
96
+ idf += s->idf_term(s, terms[i], searcher);
97
+ }
98
+ return idf;
99
+ }
37
100
 
38
- doc_freq = FIX2INT(rdoc_freq);
39
- return rb_float_new(log((double)num_docs/(double)(doc_freq+1)) + 1.0);
101
+ float simdef_idf(struct Similarity *s, int doc_freq, int num_docs)
102
+ {
103
+ return log((float)num_docs/(float)(doc_freq+1)) + 1.0;
40
104
  }
41
105
 
42
- /****************************************************************************
43
- *
44
- * Init Function
45
- *
46
- ****************************************************************************/
106
+ float simdef_coord(struct Similarity *s, int overlap, int max_overlap)
107
+ {
108
+ return (float)overlap / (float)max_overlap;
109
+ }
47
110
 
48
- void
49
- Init_similarity(void)
111
+ float simdef_decode_norm(struct Similarity *s, uchar b)
50
112
  {
51
- VALUE cDefaultSimilarity;
52
- /* Similarity */
53
- cSimilarity = rb_define_class_under(mSearch, "Similarity", rb_cObject);
54
- cDefaultSimilarity = rb_define_class_under(mSearch, "DefaultSimilarity", cSimilarity);
113
+ return s->norm_table[b];
114
+ }
115
+
116
+ float simdef_encode_norm(struct Similarity *s, float f)
117
+ {
118
+ return float_to_byte(f);
119
+ }
120
+
121
+ void simdef_destroy(void *p)
122
+ {
123
+ // nothing to do here;
124
+ }
125
+
126
+ static Similarity default_similarity = {
127
+ data:NULL,
128
+ length_norm:&simdef_length_norm,
129
+ query_norm:&simdef_query_norm,
130
+ tf:&simdef_tf,
131
+ sloppy_freq:&simdef_sloppy_freq,
132
+ idf_term:&simdef_idf_term,
133
+ idf_phrase:&simdef_idf_phrase,
134
+ idf:&simdef_idf,
135
+ coord:&simdef_coord,
136
+ decode_norm:&simdef_decode_norm,
137
+ encode_norm:&simdef_encode_norm,
138
+ destroy:&simdef_destroy
139
+ };
140
+ Similarity *sim_create_default()
141
+ {
142
+ int i;
143
+ if (!default_similarity.data) {
144
+ for (i = 0; i < 256; i++)
145
+ default_similarity.norm_table[i] = byte_to_float(i);
146
+
147
+ default_similarity.data = &default_similarity;
148
+ }
149
+ return &default_similarity;
55
150
 
56
- //rb_define_singleton_method(cSimilarity, "byte_to_float", frt_sim_c_byte_to_float, 1);
57
- rb_define_method(cDefaultSimilarity, "tf", frt_dsim_tf, 1);
58
- rb_define_method(cDefaultSimilarity, "idf", frt_dsim_idf, 2);
151
+ // s->length_norm = &simdef_length_norm;
152
+ // s->query_norm = &simdef_query_norm;
153
+ // s->tf = &simdef_tf;
154
+ // s->sloppy_freq = &simdef_sloppy_freq;
155
+ // s->idf_term = &simdef_idf_term;
156
+ // s->idf_phrase = &simdef_idf_phrase;
157
+ // s->idf = &simdef_idf;
158
+ // s->coord = &simdef_coord;
159
+ // s->decode_norm = &simdef_decode_norm;
160
+ // s->encode_norm = &simdef_encode_norm;
161
+ // s->destroy = &simdef_destroy;
162
+ // return s;
59
163
  }
data/ext/sort.c ADDED
@@ -0,0 +1,661 @@
1
+ #include <string.h>
2
+ #include "search.h"
3
+ #include "index.h"
4
+
5
+ /***************************************************************************
6
+ *
7
+ * SortField
8
+ *
9
+ ***************************************************************************/
10
+
11
+ unsigned int sort_field_hash(const void *p)
12
+ {
13
+ SortField *self = (SortField *)p;
14
+ return str_hash(self->field) ^ (self->type*37);
15
+ }
16
+
17
+ int sort_field_eq(const void *p1, const void *p2)
18
+ {
19
+ SortField *key1 = (SortField *)p1;
20
+ SortField *key2 = (SortField *)p2;
21
+ int equal = (strcmp(key1->field, key2->field) == 0) && key1->type == key2->type;
22
+ /*
23
+ * TODO: The could probable be done more cleanly.
24
+ * If the sort field is an auto field then it was evaluated before it was
25
+ * entered into the cache so we need to pass the compare function back to
26
+ * the new sort field.
27
+ */
28
+ if (equal && (key1->type == SORT_TYPE_AUTO)) {
29
+ key2->compare = key1->compare;
30
+ }
31
+ return equal;
32
+ }
33
+
34
+ SortField *sort_field_clone(SortField *self)
35
+ {
36
+ SortField *clone = ALLOC(SortField);
37
+ memcpy(clone, self, sizeof(SortField));
38
+ mutex_init(&clone->mutex, NULL);
39
+ clone->field = estrdup(self->field);
40
+ return clone;
41
+ }
42
+
43
+ SortField *sort_field_alloc(char *field, int type, bool reverse)
44
+ {
45
+ SortField *self = ALLOC(SortField);
46
+ mutex_init(&self->mutex, NULL);
47
+ self->field = field ? estrdup(field) : NULL;
48
+ self->type = type;
49
+ self->reverse = reverse;
50
+ self->index = NULL;
51
+ self->destroy_index = &free;
52
+ self->compare = NULL;
53
+ return self;
54
+ }
55
+
56
+ SortField *sort_field_create(char *field, int type, bool reverse)
57
+ {
58
+ SortField *sf = NULL;
59
+ switch (type) {
60
+ case SORT_TYPE_SCORE:
61
+ sf = sort_field_score_create(reverse);
62
+ break;
63
+ case SORT_TYPE_DOC:
64
+ sf = sort_field_doc_create(reverse);
65
+ break;
66
+ case SORT_TYPE_INTEGER:
67
+ sf = sort_field_int_create(field, reverse);
68
+ break;
69
+ case SORT_TYPE_FLOAT:
70
+ sf = sort_field_float_create(field, reverse);
71
+ break;
72
+ case SORT_TYPE_STRING:
73
+ sf = sort_field_string_create(field, reverse);
74
+ break;
75
+ case SORT_TYPE_AUTO:
76
+ sf = sort_field_auto_create(field, reverse);
77
+ break;
78
+ }
79
+ return sf;
80
+ }
81
+
82
+ void sort_field_destroy(void *p)
83
+ {
84
+ SortField *self = (SortField *)p;
85
+ if (self->index) {
86
+ self->destroy_index(self->index);
87
+ }
88
+ free(self->field);
89
+ mutex_destroy(&self->mutex);
90
+ free(p);
91
+ }
92
+
93
+ /***************************************************************************
94
+ * ScoreSortField
95
+ ***************************************************************************/
96
+
97
+ int sf_score_compare(void *index_ptr, Hit *hit2, Hit *hit1)
98
+ {
99
+ float val1 = hit1->score;
100
+ float val2 = hit2->score;
101
+ if (val1 > val2) return 1;
102
+ else if (val1 < val2) return -1;
103
+ else return 0;
104
+ }
105
+
106
+ SortField *sort_field_score_create(bool reverse)
107
+ {
108
+ SortField *self = sort_field_alloc(NULL, SORT_TYPE_SCORE, reverse);
109
+ self->compare = &sf_score_compare;
110
+ return self;
111
+ }
112
+
113
+ SortField SORT_FIELD_SCORE = {
114
+ field:NULL,
115
+ type:SORT_TYPE_SCORE,
116
+ reverse:false,
117
+ index:NULL,
118
+ compare:&sf_score_compare,
119
+ create_index:NULL,
120
+ destroy_index:NULL,
121
+ handle_term:NULL
122
+ };
123
+
124
+ SortField SORT_FIELD_SCORE_REV = {
125
+ field:NULL,
126
+ type:SORT_TYPE_SCORE,
127
+ reverse:true,
128
+ index:NULL,
129
+ compare:&sf_score_compare,
130
+ create_index:NULL,
131
+ destroy_index:NULL,
132
+ handle_term:NULL
133
+ };
134
+
135
+ /**************************************************************************
136
+ * DocSortField
137
+ ***************************************************************************/
138
+
139
+ int sf_doc_compare(void *index_ptr, Hit *hit1, Hit *hit2)
140
+ {
141
+ int val1 = hit1->doc;
142
+ int val2 = hit2->doc;
143
+ if (val1 > val2) return 1;
144
+ else if (val1 < val2) return -1;
145
+ else return 0;
146
+ }
147
+
148
+ SortField *sort_field_doc_create(bool reverse)
149
+ {
150
+ SortField *self = sort_field_alloc(NULL, SORT_TYPE_DOC, reverse);
151
+ self->compare = &sf_doc_compare;
152
+ return self;
153
+ }
154
+
155
+ SortField SORT_FIELD_DOC = {
156
+ field:NULL,
157
+ type:SORT_TYPE_DOC,
158
+ reverse:false,
159
+ index:NULL,
160
+ compare:&sf_doc_compare,
161
+ create_index:NULL,
162
+ destroy_index:NULL,
163
+ handle_term:NULL
164
+ };
165
+
166
+ SortField SORT_FIELD_DOC_REV = {
167
+ field:NULL,
168
+ type:SORT_TYPE_DOC,
169
+ reverse:true,
170
+ index:NULL,
171
+ compare:&sf_doc_compare,
172
+ create_index:NULL,
173
+ destroy_index:NULL,
174
+ handle_term:NULL
175
+ };
176
+
177
+ /***************************************************************************
178
+ * IntegerSortField
179
+ ***************************************************************************/
180
+
181
+ int sf_int_compare(void *index_ptr, Hit *hit1, Hit *hit2)
182
+ {
183
+ int *index = (int *)index_ptr;
184
+ int val1 = index[hit1->doc];
185
+ int val2 = index[hit2->doc];
186
+ if (val1 > val2) return 1;
187
+ else if (val1 < val2) return -1;
188
+ else return 0;
189
+ }
190
+
191
+ void *sf_int_create_index(int size)
192
+ {
193
+ return ALLOC_N(int, size);
194
+ }
195
+
196
+ void sf_int_handle_term(void *index_ptr, TermDocEnum *tde, char *text)
197
+ {
198
+ int *index = (int *)index_ptr;
199
+ int val;
200
+ sscanf(text, "%d", &val);
201
+ while (tde->next(tde)) {
202
+ index[tde->doc_num(tde)] = val;
203
+ }
204
+ }
205
+
206
+ void sort_field_int_methods(SortField *self)
207
+ {
208
+ self->compare = &sf_int_compare;
209
+ self->create_index = &sf_int_create_index;
210
+ self->handle_term = &sf_int_handle_term;
211
+ }
212
+
213
+ SortField *sort_field_int_create(char *field, bool reverse)
214
+ {
215
+ SortField *self = sort_field_alloc(field, SORT_TYPE_INTEGER, reverse);
216
+ sort_field_int_methods(self);
217
+ return self;
218
+ }
219
+
220
+ /***************************************************************************
221
+ * FloatSortField
222
+ ***************************************************************************/
223
+
224
+ int sf_float_compare(void *index_ptr, Hit *hit1, Hit *hit2)
225
+ {
226
+ float *index = (float *)index_ptr;
227
+ float val1 = index[hit1->doc];
228
+ float val2 = index[hit2->doc];
229
+ if (val1 > val2) return 1;
230
+ else if (val1 < val2) return -1;
231
+ else return 0;
232
+ }
233
+
234
+ void *sf_float_create_index(int size)
235
+ {
236
+ return ALLOC_N(float, size);
237
+ }
238
+
239
+ void sf_float_handle_term(void *index_ptr, TermDocEnum *tde, char *text)
240
+ {
241
+ float *index = (float *)index_ptr;
242
+ float val;
243
+ sscanf(text, "%g", &val);
244
+ while (tde->next(tde)) {
245
+ index[tde->doc_num(tde)] = val;
246
+ }
247
+ }
248
+
249
+ void sort_field_float_methods(SortField *self)
250
+ {
251
+ self->compare = &sf_float_compare;
252
+ self->create_index = &sf_float_create_index;
253
+ self->handle_term = &sf_float_handle_term;
254
+ }
255
+
256
+ SortField *sort_field_float_create(char *field, bool reverse)
257
+ {
258
+ SortField *self = sort_field_alloc(field, SORT_TYPE_FLOAT, reverse);
259
+ sort_field_float_methods(self);
260
+ return self;
261
+ }
262
+
263
+ /***************************************************************************
264
+ * StringSortField
265
+ ***************************************************************************/
266
+
267
+ #define VALUES_ARRAY_START_SIZE 8
268
+ typedef struct StringIndex {
269
+ int size;
270
+ int *index;
271
+ char **values;
272
+ int v_cnt;
273
+ int v_size;
274
+ } StringIndex;
275
+
276
+ int sf_string_compare(void *index_ptr, Hit *hit1, Hit *hit2)
277
+ {
278
+ StringIndex *index = (StringIndex *)index_ptr;
279
+ int val1 = index->index[hit1->doc];
280
+ int val2 = index->index[hit2->doc];
281
+ if (val1 > val2) return 1;
282
+ else if (val1 < val2) return -1;
283
+ else return 0;
284
+ }
285
+
286
+ void *sf_string_create_index(int size)
287
+ {
288
+ StringIndex *self = ALLOC(StringIndex);
289
+ ZEROSET(self, StringIndex, 1);
290
+ self->size = size;
291
+ self->index = ALLOC_N(int, size);
292
+ ZEROSET(self->index, int, size);
293
+ self->v_size = VALUES_ARRAY_START_SIZE;
294
+ self->values = ALLOC_N(char *, VALUES_ARRAY_START_SIZE);
295
+ return self;
296
+ }
297
+
298
+ void sf_string_destroy_index(void *p)
299
+ {
300
+ StringIndex *self = (StringIndex *)p;
301
+ int i;
302
+ free(self->index);
303
+ for (i = 0; i < self->v_cnt; i++) {
304
+ free(self->values[i]);
305
+ }
306
+ free(self->values);
307
+ free(self);
308
+ }
309
+
310
+ void sf_string_handle_term(void *index_ptr, TermDocEnum *tde, char *text)
311
+ {
312
+ StringIndex *index = (StringIndex *)index_ptr;
313
+ if (index->v_cnt >= index->v_size) {
314
+ index->v_size *= 2;
315
+ index->values = REALLOC_N(index->values, char *, index->v_size);
316
+ }
317
+ index->values[index->v_cnt] = estrdup(text);
318
+ while (tde->next(tde)) {
319
+ index->index[tde->doc_num(tde)] = index->v_cnt;
320
+ }
321
+ index->v_cnt++;
322
+ }
323
+
324
+ void sort_field_string_methods(SortField *self)
325
+ {
326
+ self->compare = &sf_string_compare;
327
+ self->create_index = &sf_string_create_index;
328
+ self->destroy_index = &sf_string_destroy_index;
329
+ self->handle_term = &sf_string_handle_term;
330
+ }
331
+
332
+ SortField *sort_field_string_create(char *field, bool reverse)
333
+ {
334
+ SortField *self = sort_field_alloc(field, SORT_TYPE_STRING, reverse);
335
+ sort_field_string_methods(self);
336
+ return self;
337
+ }
338
+
339
+ /***************************************************************************
340
+ * AutoSortField
341
+ ***************************************************************************/
342
+
343
+ void sort_field_auto_evaluate(SortField *sf, char *text)
344
+ {
345
+ int int_val;
346
+ float float_val;
347
+ int text_len = 0, scan_len = 0;
348
+
349
+ text_len = strlen(text);
350
+ sscanf(text, "%d%n", &int_val, &scan_len);
351
+ if (scan_len == text_len) {
352
+ sort_field_int_methods(sf);
353
+ } else {
354
+ sscanf(text, "%f%n", &float_val, &scan_len);
355
+ if (scan_len == text_len) {
356
+ sort_field_float_methods(sf);
357
+ } else {
358
+ sort_field_string_methods(sf);
359
+ }
360
+ }
361
+ }
362
+
363
+
364
+ SortField *sort_field_auto_create(char *field, bool reverse)
365
+ {
366
+ return sort_field_alloc(field, SORT_TYPE_AUTO, reverse);
367
+ }
368
+
369
+ /***************************************************************************
370
+ *
371
+ * FieldCache
372
+ *
373
+ ***************************************************************************/
374
+
375
+ void *field_cache_get_index(IndexReader *ir, SortField *sf)
376
+ {
377
+ void *index = NULL;
378
+ int length = 0;
379
+ Term term;
380
+ TermBuffer *tb;
381
+ TermEnum *te;
382
+ TermDocEnum *tde;
383
+ char *field = sf->field;
384
+ SortField *sf_clone;
385
+
386
+ mutex_lock(&sf->mutex);
387
+ if (!ir->sort_cache) {
388
+ ir->sort_cache = h_new(&sort_field_hash, &sort_field_eq,
389
+ &sort_field_destroy, NULL);
390
+ }
391
+ index = h_get(ir->sort_cache, sf);
392
+
393
+ if (index == NULL) {
394
+ length = ir->max_doc(ir);
395
+ if (length > 0) {
396
+ tde = ir->term_docs(ir);
397
+ term.field = field;
398
+ term.text = "";
399
+ te = ir->terms_from(ir, &term);
400
+ if (te->tb_curr == NULL) {
401
+ eprintf(ARG_ERROR, "no terms in field '%s' to sort by", field);
402
+ }
403
+
404
+ if (sf->type == SORT_TYPE_AUTO) {
405
+ sort_field_auto_evaluate(sf, te->tb_curr->text);
406
+ }
407
+
408
+ index = sf->create_index(length);
409
+
410
+ do {
411
+ tb = te->tb_curr;
412
+ if (strcmp(tb->field, field) != 0) break;
413
+ term.text = tb->text;
414
+ tde->seek(tde, &term);
415
+ sf->handle_term(index, tde, tb->text);
416
+ } while (te->next(te));
417
+ tde->close(tde);
418
+ te->close(te);
419
+ }
420
+ sf_clone = sort_field_clone(sf);
421
+ sf_clone->index = index;
422
+ h_set(ir->sort_cache, sf_clone, index);
423
+ }
424
+ mutex_unlock(&sf->mutex);
425
+ return index;
426
+ }
427
+
428
+ /***************************************************************************
429
+ *
430
+ * FieldSortedHitQueue
431
+ *
432
+ ***************************************************************************/
433
+
434
+ /***************************************************************************
435
+ * Comparator
436
+ ***************************************************************************/
437
+
438
+ typedef struct Comparator {
439
+ void *index;
440
+ bool reverse : 1;
441
+ int (*compare)(void *index_ptr, Hit *hit1, Hit *hit2);
442
+ } Comparator;
443
+
444
+ Comparator *comparator_create(void *index, bool reverse,
445
+ int (*compare)(void *index_ptr, Hit *hit1, Hit *hit2))
446
+ {
447
+ Comparator *self = ALLOC(Comparator);
448
+ self->index = index;
449
+ self->reverse = reverse;
450
+ self->compare = compare;
451
+ return self;
452
+ }
453
+
454
+ /***************************************************************************
455
+ * Sorter
456
+ ***************************************************************************/
457
+
458
+ typedef struct Sorter {
459
+ Comparator **comparators;
460
+ int c_cnt;
461
+ } Sorter;
462
+
463
+ Comparator *sorter_get_comparator(SortField *sf, IndexReader *ir)
464
+ {
465
+ void *index = NULL;
466
+
467
+ if (sf->type > SORT_TYPE_DOC) {
468
+ index = field_cache_get_index(ir, sf);
469
+ }
470
+ return comparator_create(index, sf->reverse, sf->compare);
471
+ }
472
+
473
+ void sorter_destroy(void *p)
474
+ {
475
+ int i;
476
+ Sorter *self = (Sorter *)p;
477
+
478
+ for (i = 0; i < self->c_cnt; i++) {
479
+ free(self->comparators[i]);
480
+ }
481
+ free(self->comparators);
482
+ free(self);
483
+ }
484
+
485
+ Sorter *sorter_create(int size)
486
+ {
487
+ Sorter *self = ALLOC(Sorter);
488
+ self->c_cnt = size;
489
+ self->comparators = ALLOC_N(Comparator *, size);
490
+ ZEROSET(self->comparators, Comparator *, size);
491
+ return self;
492
+ }
493
+
494
+ /***************************************************************************
495
+ * FieldSortedHitQueue
496
+ ***************************************************************************/
497
+
498
+ bool fshq_less_than(void *hit1, void *hit2)
499
+ {
500
+ printf("Whoops, shouldn't call this.\n");
501
+ int cmp = 0;
502
+ if (cmp != 0) {
503
+ return cmp;
504
+ } else {
505
+ return ((Hit *)hit1)->score < ((Hit *)hit2)->score;
506
+ }
507
+ }
508
+
509
+ inline bool fshq_lt(Hit *sorter_ptr, Hit *hit1, Hit *hit2)
510
+ {
511
+ Sorter *sorter = (Sorter *)sorter_ptr;
512
+ Comparator *comp;
513
+ int diff = 0, i;
514
+ for (i = 0; i < sorter->c_cnt && diff == 0; i++) {
515
+ comp = sorter->comparators[i];
516
+ if (comp->reverse) {
517
+ diff = comp->compare(comp->index, hit2, hit1);
518
+ } else {
519
+ diff = comp->compare(comp->index, hit1, hit2);
520
+ }
521
+ }
522
+
523
+ if (diff != 0) {
524
+ return diff > 0;
525
+ } else {
526
+ return hit1->doc > hit2->doc;
527
+ }
528
+ }
529
+
530
+ void fshq_pq_down(PriorityQueue *pq)
531
+ {
532
+ register int i = 1;
533
+ register int j = 2; //i << 1;
534
+ register int k = 3; //j + 1;
535
+ Hit **heap = (Hit **)pq->heap;
536
+ Hit *node = heap[i]; // save top node
537
+
538
+ if ((k <= pq->count) && fshq_lt(heap[0], heap[k], heap[j]))
539
+ j = k;
540
+
541
+ while ((j <= pq->count) && fshq_lt(heap[0], heap[j], node)) {
542
+ heap[i] = heap[j]; // shift up child
543
+ i = j;
544
+ j = i << 1;
545
+ k = j + 1;
546
+ if ((k <= pq->count) && fshq_lt(heap[0], heap[k], heap[j]))
547
+ j = k;
548
+ }
549
+ heap[i] = node;
550
+ }
551
+
552
+ Hit *fshq_pq_pop(PriorityQueue *pq)
553
+ {
554
+ if (pq->count > 0) {
555
+ Hit *result = (Hit *)pq->heap[1]; // save first value
556
+ pq->heap[1] = pq->heap[pq->count]; // move last to first
557
+ pq->heap[pq->count] = NULL;
558
+ pq->count--;
559
+ fshq_pq_down(pq); // adjust heap
560
+ return result;
561
+ } else {
562
+ return NULL;
563
+ }
564
+ }
565
+
566
+ inline void fshq_pq_up(PriorityQueue *pq)
567
+ {
568
+ int i,j;
569
+ i = pq->count;
570
+ j = i >> 1;
571
+ Hit **heap = (Hit **)pq->heap;
572
+ Hit *node = heap[i];
573
+
574
+ while ((j > 0) && fshq_lt(heap[0], node, heap[j])) {
575
+ heap[i] = heap[j];
576
+ i = j;
577
+ j = j >> 1;
578
+ }
579
+ heap[i] = node;
580
+ }
581
+
582
+ void fshq_pq_push(PriorityQueue *pq, void *elem)
583
+ {
584
+ pq->count++;
585
+ pq->heap[pq->count] = elem;
586
+ fshq_pq_up(pq);
587
+ }
588
+
589
+ void fshq_pq_destroy(void *p)
590
+ {
591
+ PriorityQueue *self = (PriorityQueue *)p;
592
+ sorter_destroy(self->heap[0]);
593
+ pq_destroy(self);
594
+ }
595
+
596
+ PriorityQueue *fshq_pq_create(int size, Sort *sort, IndexReader *ir)
597
+ {
598
+ PriorityQueue *self = pq_create(size, &fshq_less_than);
599
+ int i;
600
+ Sorter *sorter = sorter_create(sort->sf_cnt);
601
+ SortField *sf;
602
+
603
+ for (i = 0; i < sort->sf_cnt; i++) {
604
+ sf = sort->sort_fields[i];
605
+ sorter->comparators[i] = sorter_get_comparator(sf, ir);
606
+ }
607
+ self->heap[0] = sorter;
608
+
609
+ return self;
610
+ }
611
+
612
+ /***************************************************************************
613
+ *
614
+ * Sort
615
+ *
616
+ ***************************************************************************/
617
+
618
+ Sort *sort_create()
619
+ {
620
+ Sort *self = ALLOC(Sort);
621
+ self->sf_cnt = 0;
622
+ self->sf_capa = ARRAY_INIT_SIZE;
623
+ self->sort_fields = ALLOC_N(SortField *, ARRAY_INIT_SIZE);
624
+ self->destroy_all = true;
625
+
626
+ return self;
627
+ }
628
+
629
+ void sort_clear(Sort *self)
630
+ {
631
+ int i;
632
+ for (i = 0; i < self->sf_cnt; i++) {
633
+ sort_field_destroy(self->sort_fields[i]);
634
+ }
635
+ self->sf_cnt = 0;
636
+ }
637
+
638
+ void sort_destroy(void *p)
639
+ {
640
+ int i;
641
+ Sort *self = (Sort *)p;
642
+ if (self->destroy_all) {
643
+ for (i = 0; i < self->sf_cnt; i++) {
644
+ sort_field_destroy(self->sort_fields[i]);
645
+ }
646
+ }
647
+ free(self->sort_fields);
648
+ free(self);
649
+ }
650
+
651
+ void sort_add_sort_field(Sort *self, SortField *sf)
652
+ {
653
+ if (self->sf_cnt == self->sf_capa) {
654
+ self->sf_capa *= 2;
655
+ REALLOC_N(self->sort_fields, SortField *, self->sf_capa);
656
+ }
657
+
658
+ self->sort_fields[self->sf_cnt] = sf;
659
+ self->sf_cnt++;
660
+ }
661
+