ferret 0.3.2 → 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (141) hide show
  1. data/CHANGELOG +9 -0
  2. data/Rakefile +51 -25
  3. data/ext/analysis.c +553 -0
  4. data/ext/analysis.h +76 -0
  5. data/ext/array.c +83 -0
  6. data/ext/array.h +19 -0
  7. data/ext/bitvector.c +164 -0
  8. data/ext/bitvector.h +29 -0
  9. data/ext/compound_io.c +335 -0
  10. data/ext/document.c +336 -0
  11. data/ext/document.h +87 -0
  12. data/ext/ferret.c +88 -47
  13. data/ext/ferret.h +43 -109
  14. data/ext/field.c +395 -0
  15. data/ext/filter.c +103 -0
  16. data/ext/fs_store.c +352 -0
  17. data/ext/global.c +219 -0
  18. data/ext/global.h +73 -0
  19. data/ext/hash.c +446 -0
  20. data/ext/hash.h +80 -0
  21. data/ext/hashset.c +141 -0
  22. data/ext/hashset.h +37 -0
  23. data/ext/helper.c +11 -0
  24. data/ext/helper.h +5 -0
  25. data/ext/inc/lang.h +41 -0
  26. data/ext/ind.c +389 -0
  27. data/ext/index.h +884 -0
  28. data/ext/index_io.c +269 -415
  29. data/ext/index_rw.c +2543 -0
  30. data/ext/lang.c +31 -0
  31. data/ext/lang.h +41 -0
  32. data/ext/priorityqueue.c +228 -0
  33. data/ext/priorityqueue.h +44 -0
  34. data/ext/q_boolean.c +1331 -0
  35. data/ext/q_const_score.c +154 -0
  36. data/ext/q_fuzzy.c +287 -0
  37. data/ext/q_match_all.c +142 -0
  38. data/ext/q_multi_phrase.c +343 -0
  39. data/ext/q_parser.c +2180 -0
  40. data/ext/q_phrase.c +657 -0
  41. data/ext/q_prefix.c +75 -0
  42. data/ext/q_range.c +247 -0
  43. data/ext/q_span.c +1566 -0
  44. data/ext/q_term.c +308 -0
  45. data/ext/q_wildcard.c +146 -0
  46. data/ext/r_analysis.c +255 -0
  47. data/ext/r_doc.c +578 -0
  48. data/ext/r_index_io.c +996 -0
  49. data/ext/r_qparser.c +158 -0
  50. data/ext/r_search.c +2321 -0
  51. data/ext/r_store.c +263 -0
  52. data/ext/r_term.c +219 -0
  53. data/ext/ram_store.c +447 -0
  54. data/ext/search.c +524 -0
  55. data/ext/search.h +1065 -0
  56. data/ext/similarity.c +143 -39
  57. data/ext/sort.c +661 -0
  58. data/ext/store.c +35 -0
  59. data/ext/store.h +152 -0
  60. data/ext/term.c +704 -143
  61. data/ext/termdocs.c +599 -0
  62. data/ext/vector.c +594 -0
  63. data/lib/ferret.rb +9 -10
  64. data/lib/ferret/analysis/analyzers.rb +2 -2
  65. data/lib/ferret/analysis/standard_tokenizer.rb +1 -1
  66. data/lib/ferret/analysis/token.rb +14 -14
  67. data/lib/ferret/analysis/token_filters.rb +3 -3
  68. data/lib/ferret/document/field.rb +16 -17
  69. data/lib/ferret/index/document_writer.rb +4 -4
  70. data/lib/ferret/index/index.rb +39 -23
  71. data/lib/ferret/index/index_writer.rb +2 -2
  72. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +1 -8
  73. data/lib/ferret/index/segment_term_vector.rb +4 -4
  74. data/lib/ferret/index/term.rb +5 -1
  75. data/lib/ferret/index/term_vector_offset_info.rb +6 -6
  76. data/lib/ferret/index/term_vectors_io.rb +5 -5
  77. data/lib/ferret/query_parser/query_parser.tab.rb +81 -77
  78. data/lib/ferret/search.rb +1 -1
  79. data/lib/ferret/search/boolean_query.rb +2 -1
  80. data/lib/ferret/search/field_sorted_hit_queue.rb +3 -3
  81. data/lib/ferret/search/fuzzy_query.rb +2 -1
  82. data/lib/ferret/search/index_searcher.rb +3 -0
  83. data/lib/ferret/search/{match_all_docs_query.rb → match_all_query.rb} +7 -7
  84. data/lib/ferret/search/multi_phrase_query.rb +6 -5
  85. data/lib/ferret/search/phrase_query.rb +3 -6
  86. data/lib/ferret/search/prefix_query.rb +4 -4
  87. data/lib/ferret/search/sort.rb +3 -1
  88. data/lib/ferret/search/sort_field.rb +9 -9
  89. data/lib/ferret/search/spans/near_spans_enum.rb +1 -1
  90. data/lib/ferret/search/spans/span_near_query.rb +1 -1
  91. data/lib/ferret/search/spans/span_weight.rb +1 -1
  92. data/lib/ferret/search/spans/spans_enum.rb +7 -7
  93. data/lib/ferret/store/fs_store.rb +10 -6
  94. data/lib/ferret/store/ram_store.rb +3 -3
  95. data/lib/rferret.rb +36 -0
  96. data/test/functional/thread_safety_index_test.rb +2 -2
  97. data/test/test_helper.rb +16 -2
  98. data/test/unit/analysis/c_token.rb +25 -0
  99. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +1 -1
  100. data/test/unit/analysis/tc_standard_analyzer.rb +1 -1
  101. data/test/unit/document/{tc_document.rb → c_document.rb} +0 -0
  102. data/test/unit/document/c_field.rb +98 -0
  103. data/test/unit/document/tc_field.rb +0 -66
  104. data/test/unit/index/{tc_index.rb → c_index.rb} +62 -6
  105. data/test/unit/index/{tc_index_reader.rb → c_index_reader.rb} +51 -10
  106. data/test/unit/index/{tc_index_writer.rb → c_index_writer.rb} +0 -4
  107. data/test/unit/index/{tc_term.rb → c_term.rb} +1 -3
  108. data/test/unit/index/{tc_term_vector_offset_info.rb → c_term_voi.rb} +5 -5
  109. data/test/unit/index/tc_segment_term_vector.rb +2 -2
  110. data/test/unit/index/tc_term_vectors_io.rb +4 -4
  111. data/test/unit/query_parser/c_query_parser.rb +138 -0
  112. data/test/unit/search/{tc_filter.rb → c_filter.rb} +24 -24
  113. data/test/unit/search/{tc_fuzzy_query.rb → c_fuzzy_query.rb} +0 -0
  114. data/test/unit/search/{tc_index_searcher.rb → c_index_searcher.rb} +9 -26
  115. data/test/unit/search/{tc_search_and_sort.rb → c_search_and_sort.rb} +15 -15
  116. data/test/unit/search/{tc_sort.rb → c_sort.rb} +2 -1
  117. data/test/unit/search/c_sort_field.rb +27 -0
  118. data/test/unit/search/{tc_spans.rb → c_spans.rb} +0 -0
  119. data/test/unit/search/tc_sort_field.rb +7 -20
  120. data/test/unit/store/c_fs_store.rb +76 -0
  121. data/test/unit/store/c_ram_store.rb +35 -0
  122. data/test/unit/store/m_store.rb +34 -0
  123. data/test/unit/store/m_store_lock.rb +68 -0
  124. data/test/unit/store/tc_fs_store.rb +0 -53
  125. data/test/unit/store/tc_ram_store.rb +0 -20
  126. data/test/unit/store/tm_store.rb +0 -30
  127. data/test/unit/store/tm_store_lock.rb +0 -66
  128. metadata +84 -31
  129. data/ext/Makefile +0 -140
  130. data/ext/ferret_ext.so +0 -0
  131. data/ext/priority_queue.c +0 -232
  132. data/ext/ram_directory.c +0 -321
  133. data/ext/segment_merge_queue.c +0 -37
  134. data/ext/segment_term_enum.c +0 -326
  135. data/ext/string_helper.c +0 -42
  136. data/ext/tags +0 -344
  137. data/ext/term_buffer.c +0 -230
  138. data/ext/term_infos_reader.c +0 -54
  139. data/ext/terminfo.c +0 -160
  140. data/ext/token.c +0 -93
  141. data/ext/util.c +0 -12
data/ext/similarity.c CHANGED
@@ -1,59 +1,163 @@
1
- #include "ferret.h"
1
+ #include <search.h>
2
+ #include <global.h>
2
3
  #include <math.h>
3
4
 
4
- /****************************************************************************
5
- *
6
- * Similarity Methods
7
- *
8
- ****************************************************************************/
5
+ static int low_bit = 0, low_mid_bit = 0, high_mid_bit = 0, high_bit = 0;
6
+ static void
7
+ setup_endian()
8
+ {
9
+ static int init = 0;
10
+ char *p;
11
+
12
+ if (init) return;
13
+ init = 1;
14
+ p = (char*)&init;
9
15
 
10
- /*
11
- static VALUE
12
- frt_sim_c_byte_to_float(VALUE self, VALUE rbyte)
16
+ if (p[0]) {
17
+ low_bit = 0;
18
+ low_mid_bit = 1;
19
+ high_mid_bit = 2;
20
+ high_bit = 3;
21
+ } else {
22
+ low_bit = 3;
23
+ low_mid_bit = 2;
24
+ high_mid_bit = 1;
25
+ high_bit = 0;
26
+ }
27
+ }
28
+
29
+ float byte_to_float(uchar b)
13
30
  {
14
- int b = FIX2INT(rbyte);
31
+ char flt[4];
15
32
  if (b == 0)
16
- return rb_float_new(0.0);
33
+ return 0.0;
17
34
  int mantissa = b & 0x07; // 0x07 = 7 = 0b00000111
18
35
  int exponent = (b >> 3) & 0x1F; // 0x1f = 31 = 0b00011111
19
- int val = (mantissa << 21) | ((exponent + 48) << 24);
20
- void *tmp = &val;
21
- return rb_float_new(*(float *)tmp);
36
+
37
+ if (!low_mid_bit) setup_endian();
38
+ flt[low_bit] = flt[low_mid_bit] = 0;
39
+ flt[high_mid_bit] = mantissa << 5;
40
+ flt[high_bit] = exponent + 48;
41
+ return *((float *)flt);
42
+ }
43
+
44
+ uchar float_to_byte(float f)
45
+ {
46
+ if (f <= 0.0)
47
+ return 0;
48
+
49
+ char *bits = (char *)&f;
50
+ int mantissa = (bits[high_mid_bit] & 0xEf) >> 5;
51
+ int exponent = (bits[high_bit] - 48);
52
+
53
+ if (exponent > 0x1f) {
54
+ exponent = 0x1f; // 0x1f = 31 = 0b00011111
55
+ mantissa = 0x07; // 0x07 = 7 = 0b00000111
56
+ }
57
+
58
+ if (exponent < 0) {
59
+ exponent = 0;
60
+ mantissa = 1;
61
+ }
62
+
63
+ return ((exponent<<3) | mantissa);
64
+ }
65
+
66
+ float simdef_length_norm(Similarity *s, char *field, int num_terms)
67
+ {
68
+ return 1.0 / sqrt(num_terms);
69
+ }
70
+
71
+ float simdef_query_norm(struct Similarity *s, float sum_of_squared_weights)
72
+ {
73
+ return 1.0 / sqrt(sum_of_squared_weights);
74
+ }
75
+
76
+ float simdef_tf(struct Similarity *s, float freq)
77
+ {
78
+ return sqrt(freq);
79
+ }
80
+
81
+ float simdef_sloppy_freq(struct Similarity *s, int distance)
82
+ {
83
+ return 1.0 / (float)(distance + 1);
22
84
  }
23
- */
24
85
 
25
- static VALUE
26
- frt_dsim_tf(VALUE self, VALUE freq)
86
+ float simdef_idf_term(struct Similarity *s, Term *term, Searcher *searcher)
27
87
  {
28
- return rb_float_new(sqrt(NUM2DBL(freq)));
88
+ return s->idf(s, searcher->doc_freq(searcher, term), searcher->max_doc(searcher));
29
89
  }
30
90
 
31
- static VALUE
32
- frt_dsim_idf(VALUE self, VALUE rdoc_freq, VALUE rnum_docs)
91
+ float simdef_idf_phrase(struct Similarity *s, Term **terms, int tcnt, Searcher *searcher)
33
92
  {
34
- int doc_freq;
35
- int num_docs = FIX2INT(rnum_docs);
36
- if (num_docs == 0) return rb_float_new(0.0);
93
+ float idf = 0.0;
94
+ int i;
95
+ for (i = 0; i < tcnt; i++) {
96
+ idf += s->idf_term(s, terms[i], searcher);
97
+ }
98
+ return idf;
99
+ }
37
100
 
38
- doc_freq = FIX2INT(rdoc_freq);
39
- return rb_float_new(log((double)num_docs/(double)(doc_freq+1)) + 1.0);
101
+ float simdef_idf(struct Similarity *s, int doc_freq, int num_docs)
102
+ {
103
+ return log((float)num_docs/(float)(doc_freq+1)) + 1.0;
40
104
  }
41
105
 
42
- /****************************************************************************
43
- *
44
- * Init Function
45
- *
46
- ****************************************************************************/
106
+ float simdef_coord(struct Similarity *s, int overlap, int max_overlap)
107
+ {
108
+ return (float)overlap / (float)max_overlap;
109
+ }
47
110
 
48
- void
49
- Init_similarity(void)
111
+ float simdef_decode_norm(struct Similarity *s, uchar b)
50
112
  {
51
- VALUE cDefaultSimilarity;
52
- /* Similarity */
53
- cSimilarity = rb_define_class_under(mSearch, "Similarity", rb_cObject);
54
- cDefaultSimilarity = rb_define_class_under(mSearch, "DefaultSimilarity", cSimilarity);
113
+ return s->norm_table[b];
114
+ }
115
+
116
+ float simdef_encode_norm(struct Similarity *s, float f)
117
+ {
118
+ return float_to_byte(f);
119
+ }
120
+
121
+ void simdef_destroy(void *p)
122
+ {
123
+ // nothing to do here;
124
+ }
125
+
126
+ static Similarity default_similarity = {
127
+ data:NULL,
128
+ length_norm:&simdef_length_norm,
129
+ query_norm:&simdef_query_norm,
130
+ tf:&simdef_tf,
131
+ sloppy_freq:&simdef_sloppy_freq,
132
+ idf_term:&simdef_idf_term,
133
+ idf_phrase:&simdef_idf_phrase,
134
+ idf:&simdef_idf,
135
+ coord:&simdef_coord,
136
+ decode_norm:&simdef_decode_norm,
137
+ encode_norm:&simdef_encode_norm,
138
+ destroy:&simdef_destroy
139
+ };
140
+ Similarity *sim_create_default()
141
+ {
142
+ int i;
143
+ if (!default_similarity.data) {
144
+ for (i = 0; i < 256; i++)
145
+ default_similarity.norm_table[i] = byte_to_float(i);
146
+
147
+ default_similarity.data = &default_similarity;
148
+ }
149
+ return &default_similarity;
55
150
 
56
- //rb_define_singleton_method(cSimilarity, "byte_to_float", frt_sim_c_byte_to_float, 1);
57
- rb_define_method(cDefaultSimilarity, "tf", frt_dsim_tf, 1);
58
- rb_define_method(cDefaultSimilarity, "idf", frt_dsim_idf, 2);
151
+ // s->length_norm = &simdef_length_norm;
152
+ // s->query_norm = &simdef_query_norm;
153
+ // s->tf = &simdef_tf;
154
+ // s->sloppy_freq = &simdef_sloppy_freq;
155
+ // s->idf_term = &simdef_idf_term;
156
+ // s->idf_phrase = &simdef_idf_phrase;
157
+ // s->idf = &simdef_idf;
158
+ // s->coord = &simdef_coord;
159
+ // s->decode_norm = &simdef_decode_norm;
160
+ // s->encode_norm = &simdef_encode_norm;
161
+ // s->destroy = &simdef_destroy;
162
+ // return s;
59
163
  }
data/ext/sort.c ADDED
@@ -0,0 +1,661 @@
1
+ #include <string.h>
2
+ #include "search.h"
3
+ #include "index.h"
4
+
5
+ /***************************************************************************
6
+ *
7
+ * SortField
8
+ *
9
+ ***************************************************************************/
10
+
11
+ unsigned int sort_field_hash(const void *p)
12
+ {
13
+ SortField *self = (SortField *)p;
14
+ return str_hash(self->field) ^ (self->type*37);
15
+ }
16
+
17
+ int sort_field_eq(const void *p1, const void *p2)
18
+ {
19
+ SortField *key1 = (SortField *)p1;
20
+ SortField *key2 = (SortField *)p2;
21
+ int equal = (strcmp(key1->field, key2->field) == 0) && key1->type == key2->type;
22
+ /*
23
+ * TODO: The could probable be done more cleanly.
24
+ * If the sort field is an auto field then it was evaluated before it was
25
+ * entered into the cache so we need to pass the compare function back to
26
+ * the new sort field.
27
+ */
28
+ if (equal && (key1->type == SORT_TYPE_AUTO)) {
29
+ key2->compare = key1->compare;
30
+ }
31
+ return equal;
32
+ }
33
+
34
+ SortField *sort_field_clone(SortField *self)
35
+ {
36
+ SortField *clone = ALLOC(SortField);
37
+ memcpy(clone, self, sizeof(SortField));
38
+ mutex_init(&clone->mutex, NULL);
39
+ clone->field = estrdup(self->field);
40
+ return clone;
41
+ }
42
+
43
+ SortField *sort_field_alloc(char *field, int type, bool reverse)
44
+ {
45
+ SortField *self = ALLOC(SortField);
46
+ mutex_init(&self->mutex, NULL);
47
+ self->field = field ? estrdup(field) : NULL;
48
+ self->type = type;
49
+ self->reverse = reverse;
50
+ self->index = NULL;
51
+ self->destroy_index = &free;
52
+ self->compare = NULL;
53
+ return self;
54
+ }
55
+
56
+ SortField *sort_field_create(char *field, int type, bool reverse)
57
+ {
58
+ SortField *sf = NULL;
59
+ switch (type) {
60
+ case SORT_TYPE_SCORE:
61
+ sf = sort_field_score_create(reverse);
62
+ break;
63
+ case SORT_TYPE_DOC:
64
+ sf = sort_field_doc_create(reverse);
65
+ break;
66
+ case SORT_TYPE_INTEGER:
67
+ sf = sort_field_int_create(field, reverse);
68
+ break;
69
+ case SORT_TYPE_FLOAT:
70
+ sf = sort_field_float_create(field, reverse);
71
+ break;
72
+ case SORT_TYPE_STRING:
73
+ sf = sort_field_string_create(field, reverse);
74
+ break;
75
+ case SORT_TYPE_AUTO:
76
+ sf = sort_field_auto_create(field, reverse);
77
+ break;
78
+ }
79
+ return sf;
80
+ }
81
+
82
+ void sort_field_destroy(void *p)
83
+ {
84
+ SortField *self = (SortField *)p;
85
+ if (self->index) {
86
+ self->destroy_index(self->index);
87
+ }
88
+ free(self->field);
89
+ mutex_destroy(&self->mutex);
90
+ free(p);
91
+ }
92
+
93
+ /***************************************************************************
94
+ * ScoreSortField
95
+ ***************************************************************************/
96
+
97
+ int sf_score_compare(void *index_ptr, Hit *hit2, Hit *hit1)
98
+ {
99
+ float val1 = hit1->score;
100
+ float val2 = hit2->score;
101
+ if (val1 > val2) return 1;
102
+ else if (val1 < val2) return -1;
103
+ else return 0;
104
+ }
105
+
106
+ SortField *sort_field_score_create(bool reverse)
107
+ {
108
+ SortField *self = sort_field_alloc(NULL, SORT_TYPE_SCORE, reverse);
109
+ self->compare = &sf_score_compare;
110
+ return self;
111
+ }
112
+
113
+ SortField SORT_FIELD_SCORE = {
114
+ field:NULL,
115
+ type:SORT_TYPE_SCORE,
116
+ reverse:false,
117
+ index:NULL,
118
+ compare:&sf_score_compare,
119
+ create_index:NULL,
120
+ destroy_index:NULL,
121
+ handle_term:NULL
122
+ };
123
+
124
+ SortField SORT_FIELD_SCORE_REV = {
125
+ field:NULL,
126
+ type:SORT_TYPE_SCORE,
127
+ reverse:true,
128
+ index:NULL,
129
+ compare:&sf_score_compare,
130
+ create_index:NULL,
131
+ destroy_index:NULL,
132
+ handle_term:NULL
133
+ };
134
+
135
+ /**************************************************************************
136
+ * DocSortField
137
+ ***************************************************************************/
138
+
139
+ int sf_doc_compare(void *index_ptr, Hit *hit1, Hit *hit2)
140
+ {
141
+ int val1 = hit1->doc;
142
+ int val2 = hit2->doc;
143
+ if (val1 > val2) return 1;
144
+ else if (val1 < val2) return -1;
145
+ else return 0;
146
+ }
147
+
148
+ SortField *sort_field_doc_create(bool reverse)
149
+ {
150
+ SortField *self = sort_field_alloc(NULL, SORT_TYPE_DOC, reverse);
151
+ self->compare = &sf_doc_compare;
152
+ return self;
153
+ }
154
+
155
+ SortField SORT_FIELD_DOC = {
156
+ field:NULL,
157
+ type:SORT_TYPE_DOC,
158
+ reverse:false,
159
+ index:NULL,
160
+ compare:&sf_doc_compare,
161
+ create_index:NULL,
162
+ destroy_index:NULL,
163
+ handle_term:NULL
164
+ };
165
+
166
+ SortField SORT_FIELD_DOC_REV = {
167
+ field:NULL,
168
+ type:SORT_TYPE_DOC,
169
+ reverse:true,
170
+ index:NULL,
171
+ compare:&sf_doc_compare,
172
+ create_index:NULL,
173
+ destroy_index:NULL,
174
+ handle_term:NULL
175
+ };
176
+
177
+ /***************************************************************************
178
+ * IntegerSortField
179
+ ***************************************************************************/
180
+
181
+ int sf_int_compare(void *index_ptr, Hit *hit1, Hit *hit2)
182
+ {
183
+ int *index = (int *)index_ptr;
184
+ int val1 = index[hit1->doc];
185
+ int val2 = index[hit2->doc];
186
+ if (val1 > val2) return 1;
187
+ else if (val1 < val2) return -1;
188
+ else return 0;
189
+ }
190
+
191
+ void *sf_int_create_index(int size)
192
+ {
193
+ return ALLOC_N(int, size);
194
+ }
195
+
196
+ void sf_int_handle_term(void *index_ptr, TermDocEnum *tde, char *text)
197
+ {
198
+ int *index = (int *)index_ptr;
199
+ int val;
200
+ sscanf(text, "%d", &val);
201
+ while (tde->next(tde)) {
202
+ index[tde->doc_num(tde)] = val;
203
+ }
204
+ }
205
+
206
+ void sort_field_int_methods(SortField *self)
207
+ {
208
+ self->compare = &sf_int_compare;
209
+ self->create_index = &sf_int_create_index;
210
+ self->handle_term = &sf_int_handle_term;
211
+ }
212
+
213
+ SortField *sort_field_int_create(char *field, bool reverse)
214
+ {
215
+ SortField *self = sort_field_alloc(field, SORT_TYPE_INTEGER, reverse);
216
+ sort_field_int_methods(self);
217
+ return self;
218
+ }
219
+
220
+ /***************************************************************************
221
+ * FloatSortField
222
+ ***************************************************************************/
223
+
224
+ int sf_float_compare(void *index_ptr, Hit *hit1, Hit *hit2)
225
+ {
226
+ float *index = (float *)index_ptr;
227
+ float val1 = index[hit1->doc];
228
+ float val2 = index[hit2->doc];
229
+ if (val1 > val2) return 1;
230
+ else if (val1 < val2) return -1;
231
+ else return 0;
232
+ }
233
+
234
+ void *sf_float_create_index(int size)
235
+ {
236
+ return ALLOC_N(float, size);
237
+ }
238
+
239
+ void sf_float_handle_term(void *index_ptr, TermDocEnum *tde, char *text)
240
+ {
241
+ float *index = (float *)index_ptr;
242
+ float val;
243
+ sscanf(text, "%g", &val);
244
+ while (tde->next(tde)) {
245
+ index[tde->doc_num(tde)] = val;
246
+ }
247
+ }
248
+
249
+ void sort_field_float_methods(SortField *self)
250
+ {
251
+ self->compare = &sf_float_compare;
252
+ self->create_index = &sf_float_create_index;
253
+ self->handle_term = &sf_float_handle_term;
254
+ }
255
+
256
+ SortField *sort_field_float_create(char *field, bool reverse)
257
+ {
258
+ SortField *self = sort_field_alloc(field, SORT_TYPE_FLOAT, reverse);
259
+ sort_field_float_methods(self);
260
+ return self;
261
+ }
262
+
263
+ /***************************************************************************
264
+ * StringSortField
265
+ ***************************************************************************/
266
+
267
+ #define VALUES_ARRAY_START_SIZE 8
268
+ typedef struct StringIndex {
269
+ int size;
270
+ int *index;
271
+ char **values;
272
+ int v_cnt;
273
+ int v_size;
274
+ } StringIndex;
275
+
276
+ int sf_string_compare(void *index_ptr, Hit *hit1, Hit *hit2)
277
+ {
278
+ StringIndex *index = (StringIndex *)index_ptr;
279
+ int val1 = index->index[hit1->doc];
280
+ int val2 = index->index[hit2->doc];
281
+ if (val1 > val2) return 1;
282
+ else if (val1 < val2) return -1;
283
+ else return 0;
284
+ }
285
+
286
+ void *sf_string_create_index(int size)
287
+ {
288
+ StringIndex *self = ALLOC(StringIndex);
289
+ ZEROSET(self, StringIndex, 1);
290
+ self->size = size;
291
+ self->index = ALLOC_N(int, size);
292
+ ZEROSET(self->index, int, size);
293
+ self->v_size = VALUES_ARRAY_START_SIZE;
294
+ self->values = ALLOC_N(char *, VALUES_ARRAY_START_SIZE);
295
+ return self;
296
+ }
297
+
298
+ void sf_string_destroy_index(void *p)
299
+ {
300
+ StringIndex *self = (StringIndex *)p;
301
+ int i;
302
+ free(self->index);
303
+ for (i = 0; i < self->v_cnt; i++) {
304
+ free(self->values[i]);
305
+ }
306
+ free(self->values);
307
+ free(self);
308
+ }
309
+
310
+ void sf_string_handle_term(void *index_ptr, TermDocEnum *tde, char *text)
311
+ {
312
+ StringIndex *index = (StringIndex *)index_ptr;
313
+ if (index->v_cnt >= index->v_size) {
314
+ index->v_size *= 2;
315
+ index->values = REALLOC_N(index->values, char *, index->v_size);
316
+ }
317
+ index->values[index->v_cnt] = estrdup(text);
318
+ while (tde->next(tde)) {
319
+ index->index[tde->doc_num(tde)] = index->v_cnt;
320
+ }
321
+ index->v_cnt++;
322
+ }
323
+
324
+ void sort_field_string_methods(SortField *self)
325
+ {
326
+ self->compare = &sf_string_compare;
327
+ self->create_index = &sf_string_create_index;
328
+ self->destroy_index = &sf_string_destroy_index;
329
+ self->handle_term = &sf_string_handle_term;
330
+ }
331
+
332
+ SortField *sort_field_string_create(char *field, bool reverse)
333
+ {
334
+ SortField *self = sort_field_alloc(field, SORT_TYPE_STRING, reverse);
335
+ sort_field_string_methods(self);
336
+ return self;
337
+ }
338
+
339
+ /***************************************************************************
340
+ * AutoSortField
341
+ ***************************************************************************/
342
+
343
+ void sort_field_auto_evaluate(SortField *sf, char *text)
344
+ {
345
+ int int_val;
346
+ float float_val;
347
+ int text_len = 0, scan_len = 0;
348
+
349
+ text_len = strlen(text);
350
+ sscanf(text, "%d%n", &int_val, &scan_len);
351
+ if (scan_len == text_len) {
352
+ sort_field_int_methods(sf);
353
+ } else {
354
+ sscanf(text, "%f%n", &float_val, &scan_len);
355
+ if (scan_len == text_len) {
356
+ sort_field_float_methods(sf);
357
+ } else {
358
+ sort_field_string_methods(sf);
359
+ }
360
+ }
361
+ }
362
+
363
+
364
+ SortField *sort_field_auto_create(char *field, bool reverse)
365
+ {
366
+ return sort_field_alloc(field, SORT_TYPE_AUTO, reverse);
367
+ }
368
+
369
+ /***************************************************************************
370
+ *
371
+ * FieldCache
372
+ *
373
+ ***************************************************************************/
374
+
375
+ void *field_cache_get_index(IndexReader *ir, SortField *sf)
376
+ {
377
+ void *index = NULL;
378
+ int length = 0;
379
+ Term term;
380
+ TermBuffer *tb;
381
+ TermEnum *te;
382
+ TermDocEnum *tde;
383
+ char *field = sf->field;
384
+ SortField *sf_clone;
385
+
386
+ mutex_lock(&sf->mutex);
387
+ if (!ir->sort_cache) {
388
+ ir->sort_cache = h_new(&sort_field_hash, &sort_field_eq,
389
+ &sort_field_destroy, NULL);
390
+ }
391
+ index = h_get(ir->sort_cache, sf);
392
+
393
+ if (index == NULL) {
394
+ length = ir->max_doc(ir);
395
+ if (length > 0) {
396
+ tde = ir->term_docs(ir);
397
+ term.field = field;
398
+ term.text = "";
399
+ te = ir->terms_from(ir, &term);
400
+ if (te->tb_curr == NULL) {
401
+ eprintf(ARG_ERROR, "no terms in field '%s' to sort by", field);
402
+ }
403
+
404
+ if (sf->type == SORT_TYPE_AUTO) {
405
+ sort_field_auto_evaluate(sf, te->tb_curr->text);
406
+ }
407
+
408
+ index = sf->create_index(length);
409
+
410
+ do {
411
+ tb = te->tb_curr;
412
+ if (strcmp(tb->field, field) != 0) break;
413
+ term.text = tb->text;
414
+ tde->seek(tde, &term);
415
+ sf->handle_term(index, tde, tb->text);
416
+ } while (te->next(te));
417
+ tde->close(tde);
418
+ te->close(te);
419
+ }
420
+ sf_clone = sort_field_clone(sf);
421
+ sf_clone->index = index;
422
+ h_set(ir->sort_cache, sf_clone, index);
423
+ }
424
+ mutex_unlock(&sf->mutex);
425
+ return index;
426
+ }
427
+
428
+ /***************************************************************************
429
+ *
430
+ * FieldSortedHitQueue
431
+ *
432
+ ***************************************************************************/
433
+
434
+ /***************************************************************************
435
+ * Comparator
436
+ ***************************************************************************/
437
+
438
+ typedef struct Comparator {
439
+ void *index;
440
+ bool reverse : 1;
441
+ int (*compare)(void *index_ptr, Hit *hit1, Hit *hit2);
442
+ } Comparator;
443
+
444
+ Comparator *comparator_create(void *index, bool reverse,
445
+ int (*compare)(void *index_ptr, Hit *hit1, Hit *hit2))
446
+ {
447
+ Comparator *self = ALLOC(Comparator);
448
+ self->index = index;
449
+ self->reverse = reverse;
450
+ self->compare = compare;
451
+ return self;
452
+ }
453
+
454
+ /***************************************************************************
455
+ * Sorter
456
+ ***************************************************************************/
457
+
458
+ typedef struct Sorter {
459
+ Comparator **comparators;
460
+ int c_cnt;
461
+ } Sorter;
462
+
463
+ Comparator *sorter_get_comparator(SortField *sf, IndexReader *ir)
464
+ {
465
+ void *index = NULL;
466
+
467
+ if (sf->type > SORT_TYPE_DOC) {
468
+ index = field_cache_get_index(ir, sf);
469
+ }
470
+ return comparator_create(index, sf->reverse, sf->compare);
471
+ }
472
+
473
+ void sorter_destroy(void *p)
474
+ {
475
+ int i;
476
+ Sorter *self = (Sorter *)p;
477
+
478
+ for (i = 0; i < self->c_cnt; i++) {
479
+ free(self->comparators[i]);
480
+ }
481
+ free(self->comparators);
482
+ free(self);
483
+ }
484
+
485
+ Sorter *sorter_create(int size)
486
+ {
487
+ Sorter *self = ALLOC(Sorter);
488
+ self->c_cnt = size;
489
+ self->comparators = ALLOC_N(Comparator *, size);
490
+ ZEROSET(self->comparators, Comparator *, size);
491
+ return self;
492
+ }
493
+
494
+ /***************************************************************************
495
+ * FieldSortedHitQueue
496
+ ***************************************************************************/
497
+
498
+ bool fshq_less_than(void *hit1, void *hit2)
499
+ {
500
+ printf("Whoops, shouldn't call this.\n");
501
+ int cmp = 0;
502
+ if (cmp != 0) {
503
+ return cmp;
504
+ } else {
505
+ return ((Hit *)hit1)->score < ((Hit *)hit2)->score;
506
+ }
507
+ }
508
+
509
+ inline bool fshq_lt(Hit *sorter_ptr, Hit *hit1, Hit *hit2)
510
+ {
511
+ Sorter *sorter = (Sorter *)sorter_ptr;
512
+ Comparator *comp;
513
+ int diff = 0, i;
514
+ for (i = 0; i < sorter->c_cnt && diff == 0; i++) {
515
+ comp = sorter->comparators[i];
516
+ if (comp->reverse) {
517
+ diff = comp->compare(comp->index, hit2, hit1);
518
+ } else {
519
+ diff = comp->compare(comp->index, hit1, hit2);
520
+ }
521
+ }
522
+
523
+ if (diff != 0) {
524
+ return diff > 0;
525
+ } else {
526
+ return hit1->doc > hit2->doc;
527
+ }
528
+ }
529
+
530
+ void fshq_pq_down(PriorityQueue *pq)
531
+ {
532
+ register int i = 1;
533
+ register int j = 2; //i << 1;
534
+ register int k = 3; //j + 1;
535
+ Hit **heap = (Hit **)pq->heap;
536
+ Hit *node = heap[i]; // save top node
537
+
538
+ if ((k <= pq->count) && fshq_lt(heap[0], heap[k], heap[j]))
539
+ j = k;
540
+
541
+ while ((j <= pq->count) && fshq_lt(heap[0], heap[j], node)) {
542
+ heap[i] = heap[j]; // shift up child
543
+ i = j;
544
+ j = i << 1;
545
+ k = j + 1;
546
+ if ((k <= pq->count) && fshq_lt(heap[0], heap[k], heap[j]))
547
+ j = k;
548
+ }
549
+ heap[i] = node;
550
+ }
551
+
552
+ Hit *fshq_pq_pop(PriorityQueue *pq)
553
+ {
554
+ if (pq->count > 0) {
555
+ Hit *result = (Hit *)pq->heap[1]; // save first value
556
+ pq->heap[1] = pq->heap[pq->count]; // move last to first
557
+ pq->heap[pq->count] = NULL;
558
+ pq->count--;
559
+ fshq_pq_down(pq); // adjust heap
560
+ return result;
561
+ } else {
562
+ return NULL;
563
+ }
564
+ }
565
+
566
+ inline void fshq_pq_up(PriorityQueue *pq)
567
+ {
568
+ int i,j;
569
+ i = pq->count;
570
+ j = i >> 1;
571
+ Hit **heap = (Hit **)pq->heap;
572
+ Hit *node = heap[i];
573
+
574
+ while ((j > 0) && fshq_lt(heap[0], node, heap[j])) {
575
+ heap[i] = heap[j];
576
+ i = j;
577
+ j = j >> 1;
578
+ }
579
+ heap[i] = node;
580
+ }
581
+
582
+ void fshq_pq_push(PriorityQueue *pq, void *elem)
583
+ {
584
+ pq->count++;
585
+ pq->heap[pq->count] = elem;
586
+ fshq_pq_up(pq);
587
+ }
588
+
589
+ void fshq_pq_destroy(void *p)
590
+ {
591
+ PriorityQueue *self = (PriorityQueue *)p;
592
+ sorter_destroy(self->heap[0]);
593
+ pq_destroy(self);
594
+ }
595
+
596
+ PriorityQueue *fshq_pq_create(int size, Sort *sort, IndexReader *ir)
597
+ {
598
+ PriorityQueue *self = pq_create(size, &fshq_less_than);
599
+ int i;
600
+ Sorter *sorter = sorter_create(sort->sf_cnt);
601
+ SortField *sf;
602
+
603
+ for (i = 0; i < sort->sf_cnt; i++) {
604
+ sf = sort->sort_fields[i];
605
+ sorter->comparators[i] = sorter_get_comparator(sf, ir);
606
+ }
607
+ self->heap[0] = sorter;
608
+
609
+ return self;
610
+ }
611
+
612
+ /***************************************************************************
613
+ *
614
+ * Sort
615
+ *
616
+ ***************************************************************************/
617
+
618
+ Sort *sort_create()
619
+ {
620
+ Sort *self = ALLOC(Sort);
621
+ self->sf_cnt = 0;
622
+ self->sf_capa = ARRAY_INIT_SIZE;
623
+ self->sort_fields = ALLOC_N(SortField *, ARRAY_INIT_SIZE);
624
+ self->destroy_all = true;
625
+
626
+ return self;
627
+ }
628
+
629
+ void sort_clear(Sort *self)
630
+ {
631
+ int i;
632
+ for (i = 0; i < self->sf_cnt; i++) {
633
+ sort_field_destroy(self->sort_fields[i]);
634
+ }
635
+ self->sf_cnt = 0;
636
+ }
637
+
638
+ void sort_destroy(void *p)
639
+ {
640
+ int i;
641
+ Sort *self = (Sort *)p;
642
+ if (self->destroy_all) {
643
+ for (i = 0; i < self->sf_cnt; i++) {
644
+ sort_field_destroy(self->sort_fields[i]);
645
+ }
646
+ }
647
+ free(self->sort_fields);
648
+ free(self);
649
+ }
650
+
651
+ void sort_add_sort_field(Sort *self, SortField *sf)
652
+ {
653
+ if (self->sf_cnt == self->sf_capa) {
654
+ self->sf_capa *= 2;
655
+ REALLOC_N(self->sort_fields, SortField *, self->sf_capa);
656
+ }
657
+
658
+ self->sort_fields[self->sf_cnt] = sf;
659
+ self->sf_cnt++;
660
+ }
661
+