ferret 0.3.2 → 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (141) hide show
  1. data/CHANGELOG +9 -0
  2. data/Rakefile +51 -25
  3. data/ext/analysis.c +553 -0
  4. data/ext/analysis.h +76 -0
  5. data/ext/array.c +83 -0
  6. data/ext/array.h +19 -0
  7. data/ext/bitvector.c +164 -0
  8. data/ext/bitvector.h +29 -0
  9. data/ext/compound_io.c +335 -0
  10. data/ext/document.c +336 -0
  11. data/ext/document.h +87 -0
  12. data/ext/ferret.c +88 -47
  13. data/ext/ferret.h +43 -109
  14. data/ext/field.c +395 -0
  15. data/ext/filter.c +103 -0
  16. data/ext/fs_store.c +352 -0
  17. data/ext/global.c +219 -0
  18. data/ext/global.h +73 -0
  19. data/ext/hash.c +446 -0
  20. data/ext/hash.h +80 -0
  21. data/ext/hashset.c +141 -0
  22. data/ext/hashset.h +37 -0
  23. data/ext/helper.c +11 -0
  24. data/ext/helper.h +5 -0
  25. data/ext/inc/lang.h +41 -0
  26. data/ext/ind.c +389 -0
  27. data/ext/index.h +884 -0
  28. data/ext/index_io.c +269 -415
  29. data/ext/index_rw.c +2543 -0
  30. data/ext/lang.c +31 -0
  31. data/ext/lang.h +41 -0
  32. data/ext/priorityqueue.c +228 -0
  33. data/ext/priorityqueue.h +44 -0
  34. data/ext/q_boolean.c +1331 -0
  35. data/ext/q_const_score.c +154 -0
  36. data/ext/q_fuzzy.c +287 -0
  37. data/ext/q_match_all.c +142 -0
  38. data/ext/q_multi_phrase.c +343 -0
  39. data/ext/q_parser.c +2180 -0
  40. data/ext/q_phrase.c +657 -0
  41. data/ext/q_prefix.c +75 -0
  42. data/ext/q_range.c +247 -0
  43. data/ext/q_span.c +1566 -0
  44. data/ext/q_term.c +308 -0
  45. data/ext/q_wildcard.c +146 -0
  46. data/ext/r_analysis.c +255 -0
  47. data/ext/r_doc.c +578 -0
  48. data/ext/r_index_io.c +996 -0
  49. data/ext/r_qparser.c +158 -0
  50. data/ext/r_search.c +2321 -0
  51. data/ext/r_store.c +263 -0
  52. data/ext/r_term.c +219 -0
  53. data/ext/ram_store.c +447 -0
  54. data/ext/search.c +524 -0
  55. data/ext/search.h +1065 -0
  56. data/ext/similarity.c +143 -39
  57. data/ext/sort.c +661 -0
  58. data/ext/store.c +35 -0
  59. data/ext/store.h +152 -0
  60. data/ext/term.c +704 -143
  61. data/ext/termdocs.c +599 -0
  62. data/ext/vector.c +594 -0
  63. data/lib/ferret.rb +9 -10
  64. data/lib/ferret/analysis/analyzers.rb +2 -2
  65. data/lib/ferret/analysis/standard_tokenizer.rb +1 -1
  66. data/lib/ferret/analysis/token.rb +14 -14
  67. data/lib/ferret/analysis/token_filters.rb +3 -3
  68. data/lib/ferret/document/field.rb +16 -17
  69. data/lib/ferret/index/document_writer.rb +4 -4
  70. data/lib/ferret/index/index.rb +39 -23
  71. data/lib/ferret/index/index_writer.rb +2 -2
  72. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +1 -8
  73. data/lib/ferret/index/segment_term_vector.rb +4 -4
  74. data/lib/ferret/index/term.rb +5 -1
  75. data/lib/ferret/index/term_vector_offset_info.rb +6 -6
  76. data/lib/ferret/index/term_vectors_io.rb +5 -5
  77. data/lib/ferret/query_parser/query_parser.tab.rb +81 -77
  78. data/lib/ferret/search.rb +1 -1
  79. data/lib/ferret/search/boolean_query.rb +2 -1
  80. data/lib/ferret/search/field_sorted_hit_queue.rb +3 -3
  81. data/lib/ferret/search/fuzzy_query.rb +2 -1
  82. data/lib/ferret/search/index_searcher.rb +3 -0
  83. data/lib/ferret/search/{match_all_docs_query.rb → match_all_query.rb} +7 -7
  84. data/lib/ferret/search/multi_phrase_query.rb +6 -5
  85. data/lib/ferret/search/phrase_query.rb +3 -6
  86. data/lib/ferret/search/prefix_query.rb +4 -4
  87. data/lib/ferret/search/sort.rb +3 -1
  88. data/lib/ferret/search/sort_field.rb +9 -9
  89. data/lib/ferret/search/spans/near_spans_enum.rb +1 -1
  90. data/lib/ferret/search/spans/span_near_query.rb +1 -1
  91. data/lib/ferret/search/spans/span_weight.rb +1 -1
  92. data/lib/ferret/search/spans/spans_enum.rb +7 -7
  93. data/lib/ferret/store/fs_store.rb +10 -6
  94. data/lib/ferret/store/ram_store.rb +3 -3
  95. data/lib/rferret.rb +36 -0
  96. data/test/functional/thread_safety_index_test.rb +2 -2
  97. data/test/test_helper.rb +16 -2
  98. data/test/unit/analysis/c_token.rb +25 -0
  99. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +1 -1
  100. data/test/unit/analysis/tc_standard_analyzer.rb +1 -1
  101. data/test/unit/document/{tc_document.rb → c_document.rb} +0 -0
  102. data/test/unit/document/c_field.rb +98 -0
  103. data/test/unit/document/tc_field.rb +0 -66
  104. data/test/unit/index/{tc_index.rb → c_index.rb} +62 -6
  105. data/test/unit/index/{tc_index_reader.rb → c_index_reader.rb} +51 -10
  106. data/test/unit/index/{tc_index_writer.rb → c_index_writer.rb} +0 -4
  107. data/test/unit/index/{tc_term.rb → c_term.rb} +1 -3
  108. data/test/unit/index/{tc_term_vector_offset_info.rb → c_term_voi.rb} +5 -5
  109. data/test/unit/index/tc_segment_term_vector.rb +2 -2
  110. data/test/unit/index/tc_term_vectors_io.rb +4 -4
  111. data/test/unit/query_parser/c_query_parser.rb +138 -0
  112. data/test/unit/search/{tc_filter.rb → c_filter.rb} +24 -24
  113. data/test/unit/search/{tc_fuzzy_query.rb → c_fuzzy_query.rb} +0 -0
  114. data/test/unit/search/{tc_index_searcher.rb → c_index_searcher.rb} +9 -26
  115. data/test/unit/search/{tc_search_and_sort.rb → c_search_and_sort.rb} +15 -15
  116. data/test/unit/search/{tc_sort.rb → c_sort.rb} +2 -1
  117. data/test/unit/search/c_sort_field.rb +27 -0
  118. data/test/unit/search/{tc_spans.rb → c_spans.rb} +0 -0
  119. data/test/unit/search/tc_sort_field.rb +7 -20
  120. data/test/unit/store/c_fs_store.rb +76 -0
  121. data/test/unit/store/c_ram_store.rb +35 -0
  122. data/test/unit/store/m_store.rb +34 -0
  123. data/test/unit/store/m_store_lock.rb +68 -0
  124. data/test/unit/store/tc_fs_store.rb +0 -53
  125. data/test/unit/store/tc_ram_store.rb +0 -20
  126. data/test/unit/store/tm_store.rb +0 -30
  127. data/test/unit/store/tm_store_lock.rb +0 -66
  128. metadata +84 -31
  129. data/ext/Makefile +0 -140
  130. data/ext/ferret_ext.so +0 -0
  131. data/ext/priority_queue.c +0 -232
  132. data/ext/ram_directory.c +0 -321
  133. data/ext/segment_merge_queue.c +0 -37
  134. data/ext/segment_term_enum.c +0 -326
  135. data/ext/string_helper.c +0 -42
  136. data/ext/tags +0 -344
  137. data/ext/term_buffer.c +0 -230
  138. data/ext/term_infos_reader.c +0 -54
  139. data/ext/terminfo.c +0 -160
  140. data/ext/token.c +0 -93
  141. data/ext/util.c +0 -12
data/ext/ram_store.c ADDED
@@ -0,0 +1,447 @@
1
+ #include <string.h>
2
+ #include <store.h>
3
+
4
+ typedef struct RamFile {
5
+ char *name;
6
+ uchar **buffers;
7
+ int bufcnt;
8
+ int len;
9
+ int refcnt;
10
+ bool alive;
11
+ } RamFile;
12
+
13
+ RamFile *rf_create(const char *name)
14
+ {
15
+ RamFile *rf = ALLOC(RamFile);
16
+ rf->buffers = ALLOC(uchar *);
17
+ rf->buffers[0] = ALLOC_N(uchar, BUFFER_SIZE);
18
+ rf->name = estrdup(name);
19
+ rf->len = 0;
20
+ rf->bufcnt = 1;
21
+ rf->refcnt = 0;
22
+ rf->alive = true;
23
+ return rf;
24
+ }
25
+
26
+ void rf_extend_if_necessary(RamFile *rf, int buf_num)
27
+ {
28
+ while (rf->bufcnt <= buf_num) {
29
+ REALLOC_N(rf->buffers, uchar *, (rf->bufcnt + 1));
30
+ rf->buffers[rf->bufcnt++] = ALLOC_N(uchar, BUFFER_SIZE);
31
+ }
32
+ }
33
+
34
+ void rf_close(void *p)
35
+ {
36
+ int i;
37
+ RamFile *rf = (RamFile *)p;
38
+ if (rf->refcnt > 0 || rf->alive) return;
39
+ free(rf->name);
40
+ for (i = 0; i < rf->bufcnt; i++) {
41
+ free(rf->buffers[i]);
42
+ }
43
+ free(rf->buffers);
44
+ free(rf);
45
+ }
46
+
47
+ void ram_touch(Store *store, char *filename)
48
+ {
49
+ if (h_get(store->dir.ht, filename) == NULL)
50
+ h_set(store->dir.ht, filename, rf_create(filename));
51
+ }
52
+
53
+ int ram_exists(Store *store, char *filename)
54
+ {
55
+ if (h_get(store->dir.ht, filename) != NULL)
56
+ return true;
57
+ else
58
+ return false;
59
+ }
60
+
61
+ int ram_remove(Store *store, char *filename)
62
+ {
63
+ RamFile *rf = h_rem(store->dir.ht, filename, false);
64
+ if (rf != NULL) {
65
+ rf->alive = false;
66
+ rf_close(rf);
67
+ return true;
68
+ } else {
69
+ return false;
70
+ }
71
+ }
72
+
73
+ int ram_rename(Store *store, char *from, char *to)
74
+ {
75
+ RamFile *rf = (RamFile *)h_rem(store->dir.ht, from, false);
76
+ if (rf == NULL)
77
+ eprintf(IO_ERROR, "tried to rename a file that doesn't exist");
78
+
79
+ free(rf->name);
80
+
81
+ rf->name = estrdup(to);
82
+
83
+ // clean up the file we are overwriting
84
+ RamFile *tmp = (RamFile *)h_get(store->dir.ht, to);
85
+ if (tmp != NULL)
86
+ tmp->alive = false;
87
+
88
+ h_set(store->dir.ht, rf->name, rf);
89
+ return true;
90
+ }
91
+
92
+ int ram_count(Store *store)
93
+ {
94
+ return store->dir.ht->used;
95
+ }
96
+
97
+ void ram_each(Store *store, void (*func)(char *fname, void *arg), void *arg)
98
+ {
99
+ HshTable *ht = store->dir.ht;
100
+ RamFile *rf;
101
+ int i;
102
+ for (i = 0; i <= ht->mask; i++) {
103
+ rf = (RamFile *)ht->table[i].value;
104
+ if (rf) {
105
+ if (strncmp(rf->name, LOCK_PREFIX, strlen(LOCK_PREFIX)) == 0)
106
+ continue;
107
+ func(rf->name, arg);
108
+ }
109
+ }
110
+ }
111
+
112
+ void ram_close(Store *store)
113
+ {
114
+ HshTable *ht = store->dir.ht;
115
+ RamFile *rf;
116
+ int i;
117
+ for (i = 0; i <= ht->mask; i++) {
118
+ rf = (RamFile *)ht->table[i].value;
119
+ if (rf) rf->alive = false;
120
+ }
121
+ h_destroy(store->dir.ht);
122
+ store_destroy(store);
123
+ }
124
+
125
+ /*
126
+ * Be sure to keep the locks
127
+ */
128
+ void ram_clear(Store *store)
129
+ {
130
+ int i;
131
+ HshTable *ht = store->dir.ht;
132
+ RamFile *rf;
133
+ for (i = 0; i <= ht->mask; i++) {
134
+ rf = (RamFile *)ht->table[i].value;
135
+ if (rf && !file_is_lock(rf->name)) {
136
+ rf->alive = false;
137
+ h_del(ht, rf->name);
138
+ }
139
+ }
140
+ }
141
+
142
+ void ram_clear_locks(Store *store)
143
+ {
144
+ int i;
145
+ HshTable *ht = store->dir.ht;
146
+ RamFile *rf;
147
+ for (i = 0; i <= ht->mask; i++) {
148
+ rf = (RamFile *)ht->table[i].value;
149
+ if (rf && file_is_lock(rf->name)) {
150
+ rf->alive = false;
151
+ h_del(ht, rf->name);
152
+ }
153
+ }
154
+ }
155
+ void ram_clear_all(Store *store)
156
+ {
157
+ int i;
158
+ HshTable *ht = store->dir.ht;
159
+ RamFile *rf;
160
+ for (i = 0; i <= ht->mask; i++) {
161
+ rf = (RamFile *)ht->table[i].value;
162
+ if (rf) {
163
+ rf->alive = false;
164
+ h_del(ht, rf->name);
165
+ }
166
+ }
167
+ }
168
+
169
+ int ram_length(Store *store, char *filename)
170
+ {
171
+ RamFile *rf = (RamFile *)h_get(store->dir.ht, filename);
172
+ if (rf != NULL)
173
+ return rf->len;
174
+ else
175
+ return 0;
176
+ }
177
+
178
+ int ramo_length(OutStream *os)
179
+ {
180
+ return ((RamFile *)os->file)->len;
181
+ }
182
+
183
+ void ramo_flush_internal(OutStream *os, uchar *src, int len)
184
+ {
185
+ RamFile *rf = (RamFile *)os->file;
186
+ int buffer_number, buffer_offset, bytes_in_buffer, bytes_to_copy;
187
+ int src_offset;
188
+ int pointer = os->pointer;
189
+
190
+ buffer_number = (int)(pointer / BUFFER_SIZE);
191
+ buffer_offset = pointer % BUFFER_SIZE;
192
+ bytes_in_buffer = BUFFER_SIZE - buffer_offset;
193
+ bytes_to_copy = bytes_in_buffer < len ? bytes_in_buffer : len;
194
+
195
+ rf_extend_if_necessary(rf, buffer_number);
196
+
197
+ uchar *buffer = rf->buffers[buffer_number];
198
+ memcpy(buffer + buffer_offset, src, bytes_to_copy);
199
+
200
+ if (bytes_to_copy < len) {
201
+ src_offset = bytes_to_copy;
202
+ bytes_to_copy = len - bytes_to_copy;
203
+ buffer_number += 1;
204
+ rf_extend_if_necessary(rf, buffer_number);
205
+ buffer = rf->buffers[buffer_number];
206
+
207
+ memcpy(buffer, src + src_offset, bytes_to_copy);
208
+ }
209
+ os->pointer += len;
210
+
211
+ if (os->pointer > rf->len)
212
+ rf->len = os->pointer;
213
+ }
214
+
215
+ void ramo_seek_internal(OutStream *os, int pos)
216
+ {
217
+ os->pointer = pos;
218
+ }
219
+
220
+ void ramo_reset(OutStream *os)
221
+ {
222
+ RamFile *rf = (RamFile *)os->file;
223
+ os_seek(os, 0);
224
+ rf->len = 0;
225
+ }
226
+
227
+ void ramo_close_internal(OutStream *os)
228
+ {
229
+ RamFile *rf = (RamFile *)os->file;
230
+ rf->refcnt--;
231
+ rf_close(rf);
232
+ }
233
+
234
+ void ramo_write_to(OutStream *os, OutStream *other_o)
235
+ {
236
+ int i, len;
237
+ RamFile *rf = (RamFile *)os->file;
238
+ os_flush(os);
239
+ int last_buffer_number = (int)(rf->len / BUFFER_SIZE);
240
+ int last_buffer_offset = rf->len % BUFFER_SIZE;
241
+ for (i = 0; i <= last_buffer_number; i++) {
242
+ len = (i == last_buffer_number ? last_buffer_offset : BUFFER_SIZE);
243
+ os_write_bytes(other_o, rf->buffers[i], len);
244
+ }
245
+ }
246
+
247
+ OutStream *ram_create_buffer()
248
+ {
249
+ RamFile *rf = rf_create("");
250
+ rf->alive = false;
251
+ OutStream *os = os_create();
252
+ os->file = rf;
253
+ os->pointer = 0;
254
+ os->flush_internal = &ramo_flush_internal;
255
+ os->seek_internal = &ramo_seek_internal;
256
+ os->close_internal = &ramo_close_internal;
257
+ return os;
258
+ }
259
+
260
+ void ram_destroy_buffer(OutStream *os)
261
+ {
262
+ rf_close(os->file);
263
+ free(os);
264
+ }
265
+
266
+ OutStream *ram_create_output(Store *store, const char *filename)
267
+ {
268
+ RamFile *rf = (RamFile *)h_get(store->dir.ht, filename);
269
+ if (rf == NULL) {
270
+ rf = rf_create(filename);
271
+ h_set(store->dir.ht, rf->name, rf);
272
+ }
273
+ rf->refcnt++;
274
+ OutStream *os = os_create();
275
+ os->pointer = 0;
276
+ os->file = rf;
277
+ os->flush_internal = &ramo_flush_internal;
278
+ os->seek_internal = &ramo_seek_internal;
279
+ os->close_internal = &ramo_close_internal;
280
+ return os;
281
+ }
282
+
283
+ void rami_read_internal(InStream *is, uchar *b, int offset, int len)
284
+ {
285
+ RamFile *rf = (RamFile *)is->file;
286
+
287
+ int buffer_number, buffer_offset, bytes_in_buffer, bytes_to_copy;
288
+ int remainder = len;
289
+ int start = is->d.pointer;
290
+ uchar *buffer;
291
+
292
+ while (remainder > 0) {
293
+ buffer_number = (int)(start / BUFFER_SIZE);
294
+ buffer_offset = start % BUFFER_SIZE;
295
+ bytes_in_buffer = BUFFER_SIZE - buffer_offset;
296
+
297
+ if (bytes_in_buffer >= remainder) {
298
+ bytes_to_copy = remainder;
299
+ } else {
300
+ bytes_to_copy = bytes_in_buffer;
301
+ }
302
+ buffer = rf->buffers[buffer_number];
303
+ memcpy(b + offset, buffer + buffer_offset, bytes_to_copy);
304
+ offset += bytes_to_copy;
305
+ start += bytes_to_copy;
306
+ remainder -= bytes_to_copy;
307
+ }
308
+
309
+ is->d.pointer += len;
310
+ }
311
+
312
+ int rami_length(InStream *is)
313
+ {
314
+ return ((RamFile *)is->file)->len;
315
+ }
316
+
317
+ void rami_seek_internal(InStream *is, int pos)
318
+ {
319
+ is->d.pointer = pos;
320
+ }
321
+
322
+ void rami_close_internal(InStream *is)
323
+ {
324
+ RamFile *rf = (RamFile *)is->file;
325
+ rf->refcnt--;
326
+ rf_close(rf);
327
+ }
328
+
329
+ void rami_clone_internal(InStream *is, InStream *new_index_i)
330
+ {
331
+ ((RamFile *)is->file)->refcnt++;
332
+ }
333
+
334
+ InStream *ram_open_input(Store *store, const char *filename)
335
+ {
336
+ RamFile *rf = (RamFile *)h_get(store->dir.ht, filename);
337
+ if (rf == NULL) {
338
+ eprintf(IO_ERROR, "Couldn't open the ram file %s to read", filename);
339
+ }
340
+ rf->refcnt++;
341
+ InStream *is = is_create();
342
+ is->file = rf;
343
+ is->d.pointer = 0;
344
+ is->is_clone = false;
345
+ is->read_internal = &rami_read_internal;
346
+ is->seek_internal = &rami_seek_internal;
347
+ is->close_internal = &rami_close_internal;
348
+ is->clone_internal = &rami_clone_internal;
349
+ is->length_internal = &rami_length;
350
+ return is;
351
+ }
352
+
353
+ #define LOCK_OBTAIN_TIMEOUT 5
354
+
355
+ int ram_lock_obtain(Lock *lock)
356
+ {
357
+ int ret = true;
358
+ if (ram_exists(lock->store, lock->name))
359
+ ret = false;
360
+ ram_touch(lock->store, lock->name);
361
+ return ret;
362
+ }
363
+
364
+ int ram_lock_is_locked(Lock *lock)
365
+ {
366
+ return ram_exists(lock->store, lock->name);
367
+ }
368
+
369
+ void ram_lock_release(Lock *lock)
370
+ {
371
+ ram_remove(lock->store, lock->name);
372
+ }
373
+
374
+ Lock *ram_open_lock(Store *store, char *lockname)
375
+ {
376
+ Lock *lock = ALLOC(Lock);
377
+ char lname[100];
378
+ sprintf(lname, "%s%s.lck", LOCK_PREFIX, lockname);
379
+ lock->name = estrdup(lname);
380
+ lock->store = store;
381
+ lock->obtain = &ram_lock_obtain;
382
+ lock->release = &ram_lock_release;
383
+ lock->is_locked = &ram_lock_is_locked;
384
+ return lock;
385
+ }
386
+
387
+ void ram_close_lock(Lock *lock)
388
+ {
389
+ free(lock->name);
390
+ free(lock);
391
+ }
392
+
393
+
394
+ Store *open_ram_store()
395
+ {
396
+ Store *new_store = store_create();
397
+
398
+ new_store->dir.ht = h_new_str(NULL, rf_close);
399
+ new_store->touch = &ram_touch;
400
+ new_store->exists = &ram_exists;
401
+ new_store->remove = &ram_remove;
402
+ new_store->rename = &ram_rename;
403
+ new_store->count = &ram_count;
404
+ new_store->close = &ram_close;
405
+ new_store->clear = &ram_clear;
406
+ new_store->clear_all = &ram_clear_all;
407
+ new_store->clear_locks = &ram_clear_locks;
408
+ new_store->length = &ram_length;
409
+ new_store->each = &ram_each;
410
+ new_store->create_output = &ram_create_output;
411
+ new_store->open_input = &ram_open_input;
412
+ new_store->open_lock = &ram_open_lock;
413
+ new_store->close_lock = &ram_close_lock;
414
+ return new_store;
415
+ }
416
+
417
+ struct CopyFileArg {
418
+ Store *to_store, *from_store;
419
+ };
420
+
421
+ static void copy_files(char *fname, void *arg)
422
+ {
423
+ struct CopyFileArg *cfa = (struct CopyFileArg *)arg;
424
+ OutStream *os = cfa->to_store->create_output(cfa->to_store, fname);
425
+ InStream *is = cfa->from_store->open_input(cfa->from_store, fname);
426
+ int len = is_length(is);
427
+ uchar buffer[len+1];
428
+ is_read_bytes(is, buffer, 0, len);
429
+ os_write_bytes(os, buffer, len);
430
+ is_close(is);
431
+ os_close(os);
432
+ }
433
+
434
+ Store *open_ram_store_and_copy(Store *from_store, bool close_dir)
435
+ {
436
+ Store *store = open_ram_store();
437
+ struct CopyFileArg cfa;
438
+ cfa.to_store = store;
439
+ cfa.from_store = from_store;
440
+
441
+ from_store->each(from_store, &copy_files, &cfa);
442
+
443
+ if (close_dir)
444
+ from_store->close(from_store);
445
+
446
+ return store;
447
+ }
data/ext/search.c ADDED
@@ -0,0 +1,524 @@
1
+ #include <string.h>
2
+ #include "search.h"
3
+
4
+ /***************************************************************************
5
+ *
6
+ * Explanation
7
+ *
8
+ ***************************************************************************/
9
+
10
+ Explanation *expl_create(float value, char *description)
11
+ {
12
+ Explanation *self = ALLOC(Explanation);
13
+ self->value = value;
14
+ self->description = description;
15
+ self->dcnt = 0;
16
+ self->dcapa = EXPLANATION_DETAILS_START_SIZE;
17
+ self->details = ALLOC_N(Explanation *, EXPLANATION_DETAILS_START_SIZE);
18
+ return self;
19
+ }
20
+
21
+ void expl_destoy(void *p)
22
+ {
23
+ Explanation *expl = (Explanation *)p;
24
+ int i;
25
+ for (i = 0; i < expl->dcnt; i++) {
26
+ expl_destoy(expl->details[i]);
27
+ }
28
+ free(expl->details);
29
+ free(expl->description);
30
+ free(expl);
31
+ }
32
+
33
+ Explanation *expl_add_detail(Explanation *self, Explanation *detail)
34
+ {
35
+ if (self->dcnt >= self->dcapa) {
36
+ self->dcapa *= 2;
37
+ REALLOC_N(self->details, Explanation *, self->dcapa);
38
+ }
39
+ self->details[self->dcnt] = detail;
40
+ self->dcnt++;
41
+ return self;
42
+ }
43
+
44
+ char *expl_to_s(Explanation *self, int depth)
45
+ {
46
+ int i;
47
+ char dbuf[32];
48
+ char *buffer = ALLOC_N(char, depth * 2 + 1);
49
+ memset(buffer, ' ', sizeof(char) * depth * 2);
50
+ buffer[depth*2] = 0;
51
+
52
+ dbl_to_s(dbuf, self->value);
53
+ buffer = estrcat(buffer, epstrdup("%s = %s\n",
54
+ strlen(dbuf) + strlen(self->description),
55
+ dbuf, self->description));
56
+ for (i = 0; i < self->dcnt; i++) {
57
+ buffer = estrcat(buffer, expl_to_s(self->details[i], depth + 1));
58
+ }
59
+
60
+ return buffer;
61
+ }
62
+
63
+ char *expl_to_html(Explanation *self)
64
+ {
65
+ int i;
66
+ char dbuf[32];
67
+ char *buffer;
68
+ dbl_to_s(dbuf, self->value);
69
+ buffer = epstrdup("<ul>\n<li>%s = %s</li>\n",
70
+ strlen(dbuf) + strlen(self->description),
71
+ dbuf, self->description);
72
+
73
+ for (i = 0; i < self->dcnt; i++) {
74
+ estrcat(buffer, expl_to_html(self->details[i]));
75
+ }
76
+
77
+ REALLOC_N(buffer, char, strlen(buffer) + 10);
78
+ return strcat(buffer, "</ul>\n");
79
+ }
80
+
81
+ /***************************************************************************
82
+ *
83
+ * Hit
84
+ *
85
+ ***************************************************************************/
86
+
87
+ bool hit_less_than(void *hit1, void *hit2)
88
+ {
89
+ if (((Hit *)hit1)->score == ((Hit *)hit2)->score) {
90
+ return ((Hit *)hit1)->doc > ((Hit *)hit2)->doc;
91
+ } else {
92
+ return ((Hit *)hit1)->score < ((Hit *)hit2)->score;
93
+ }
94
+ }
95
+
96
+ inline bool hit_lt(Hit *hit1, Hit *hit2)
97
+ {
98
+ if (hit1->score == hit2->score) {
99
+ return hit1->doc > hit2->doc;
100
+ } else {
101
+ return hit1->score < hit2->score;
102
+ }
103
+ }
104
+
105
+ void hit_pq_down(PriorityQueue *pq)
106
+ {
107
+ register int i = 1;
108
+ register int j = 2; //i << 1;
109
+ register int k = 3; //j + 1;
110
+ Hit **heap = (Hit **)pq->heap;
111
+ Hit *node = heap[i]; // save top node
112
+
113
+ if ((k <= pq->count) && hit_lt(heap[k], heap[j]))
114
+ j = k;
115
+
116
+ while ((j <= pq->count) && hit_lt(heap[j], node)) {
117
+ heap[i] = heap[j]; // shift up child
118
+ i = j;
119
+ j = i << 1;
120
+ k = j + 1;
121
+ if ((k <= pq->count) && hit_lt(heap[k], heap[j]))
122
+ j = k;
123
+ }
124
+ heap[i] = node;
125
+ }
126
+
127
+ Hit *hit_pq_pop(PriorityQueue *pq)
128
+ {
129
+ if (pq->count > 0) {
130
+ Hit *result = (Hit *)pq->heap[1]; // save first value
131
+ pq->heap[1] = pq->heap[pq->count]; // move last to first
132
+ pq->heap[pq->count] = NULL;
133
+ pq->count--;
134
+ hit_pq_down(pq); // adjust heap
135
+ return result;
136
+ } else {
137
+ return NULL;
138
+ }
139
+ }
140
+
141
+ inline void hit_pq_up(PriorityQueue *pq)
142
+ {
143
+ int i,j;
144
+ i = pq->count;
145
+ j = i >> 1;
146
+ Hit **heap = (Hit **)pq->heap;
147
+ Hit *node = heap[i];
148
+
149
+ while ((j > 0) && hit_lt(node, heap[j])) {
150
+ heap[i] = heap[j];
151
+ i = j;
152
+ j = j >> 1;
153
+ }
154
+ heap[i] = node;
155
+ }
156
+
157
+
158
+ void hit_pq_push(PriorityQueue *pq, void *elem)
159
+ {
160
+ pq->count++;
161
+ pq->heap[pq->count] = elem;
162
+ hit_pq_up(pq);
163
+ }
164
+
165
+
166
+
167
+ /***************************************************************************
168
+ *
169
+ * TopDocs
170
+ *
171
+ ***************************************************************************/
172
+
173
+ TopDocs *td_create(int total_hits, int size, Hit **hits)
174
+ {
175
+ TopDocs *td = ALLOC(TopDocs);
176
+ td->total_hits = total_hits;
177
+ td->size = size;
178
+ td->hits = hits;
179
+ return td;
180
+ }
181
+
182
+ void td_destroy(void *p)
183
+ {
184
+ TopDocs *td = (TopDocs *)p;
185
+ int i;
186
+ for (i = 0; i < td->size; i++) {
187
+ free(td->hits[i]);
188
+ }
189
+ free(td->hits);
190
+ free(td);
191
+ }
192
+
193
+ char *td_to_s(TopDocs *td)
194
+ {
195
+ int i;
196
+ char dbuf[32];
197
+ Hit *hit;
198
+ char *buffer = epstrdup("%d hits sorted by <score, doc_num>\n", 20, td->total_hits);
199
+ for (i = 0; i < td->size; i++) {
200
+ hit = td->hits[i];
201
+ dbl_to_s(dbuf, hit->score);
202
+ estrcat(buffer, epstrdup("\t%d:%s\n", 52, hit->doc, dbuf));
203
+ }
204
+ return buffer;
205
+ }
206
+
207
+ /***************************************************************************
208
+ *
209
+ * Weight
210
+ *
211
+ ***************************************************************************/
212
+
213
+ Query *w_get_query(Weight *self)
214
+ {
215
+ return self->query;
216
+ }
217
+
218
+ float w_get_value(Weight *self)
219
+ {
220
+ return self->value;
221
+ }
222
+
223
+ float w_sum_of_squared_weights(Weight *self)
224
+ {
225
+ self->qweight = self->idf * self->query->boost;
226
+ return self->qweight * self->qweight; // square it
227
+ }
228
+
229
+ void w_normalize(Weight *self, float normalization_factor)
230
+ {
231
+ self->qnorm = normalization_factor;
232
+ self->qweight *= normalization_factor; // normalize query weight
233
+ self->value = self->qweight * self->idf; // idf for document
234
+ }
235
+
236
+ /***************************************************************************
237
+ *
238
+ * Query
239
+ *
240
+ ***************************************************************************/
241
+
242
+ Similarity *q_get_similarity(Query *self, Searcher *searcher)
243
+ {
244
+ return searcher->get_similarity(searcher);
245
+ }
246
+
247
+ Query *q_rewrite(Query *self, IndexReader *ir)
248
+ {
249
+ return self;
250
+ }
251
+
252
+ Weight *q_weight(Query *self, Searcher *searcher)
253
+ {
254
+ if (self->weight) {
255
+ self->weight->destroy(self->weight);
256
+ }
257
+ Query *query = searcher->rewrite(searcher, self);
258
+ Weight *weight = query->create_weight(query, searcher);
259
+ float sum = weight->sum_of_squared_weights(weight);
260
+ Similarity *sim = query->get_similarity(query, searcher);
261
+ float norm = sim_query_norm(sim, sum);
262
+
263
+ weight->normalize(weight, norm);
264
+ return self->weight = weight;
265
+ }
266
+
267
+ void q_destroy(Query *self)
268
+ {
269
+ if (self->rewritten) {
270
+ self->rewritten->destroy(self->rewritten);
271
+ self->rewritten = NULL;
272
+ }
273
+ if (self->weight) {
274
+ self->weight->destroy(self->weight);
275
+ }
276
+ free(self);
277
+ }
278
+
279
+ void q_extract_terms(Query *self, Array *terms)
280
+ {
281
+ /* do nothing by default */
282
+ }
283
+
284
+ Query *q_create()
285
+ {
286
+ Query *self = ALLOC(Query);
287
+ ZEROSET(self, Query, 1);
288
+ self->destroy_all = true;
289
+ self->boost = 1.0;
290
+ self->rewrite = &q_rewrite;
291
+ self->get_similarity = &q_get_similarity;
292
+ self->extract_terms = &q_extract_terms;
293
+ self->weight = NULL;
294
+ self->rewritten = NULL;
295
+ return self;
296
+ }
297
+
298
+ /***************************************************************************
299
+ *
300
+ * Scorer
301
+ *
302
+ ***************************************************************************/
303
+
304
+ void scorer_destroy(void *p)
305
+ {
306
+ Scorer *scorer = (Scorer *)p;
307
+ free(scorer->data);
308
+ free(scorer);
309
+ }
310
+
311
+ Scorer *scorer_create(Similarity *similarity)
312
+ {
313
+ Scorer *self = ALLOC(Scorer);
314
+ self->destroy = &scorer_destroy;
315
+ self->data = NULL;
316
+ self->similarity = similarity;
317
+ return self;
318
+ }
319
+
320
+ bool scorer_less_than(void *p1, void *p2)
321
+ {
322
+ Scorer *s1 = (Scorer *)p1;
323
+ Scorer *s2 = (Scorer *)p2;
324
+ return s1->score(s1) < s2->score(s2);
325
+ }
326
+
327
+ bool scorer_doc_less_than(void *p1, void *p2)
328
+ {
329
+ return ((Scorer *)p1)->doc < ((Scorer *)p2)->doc;
330
+ }
331
+
332
+ int scorer_doc_cmp(const void *p1, const void *p2)
333
+ {
334
+ return (*(Scorer **)p1)->doc - (*(Scorer **)p2)->doc;
335
+ }
336
+ /***************************************************************************
337
+ *
338
+ * Searcher
339
+ *
340
+ ***************************************************************************/
341
+
342
+ int sea_doc_freq(Searcher *self, Term *term)
343
+ {
344
+ return self->ir->doc_freq(self->ir, term);
345
+ }
346
+
347
+ int *sea_doc_freqs(Searcher *self, Term **terms, int tcnt)
348
+ {
349
+ int *freqs = ALLOC_N(int, tcnt);
350
+ int i;
351
+ for (i = 0; i < tcnt; i++) {
352
+ freqs[i] = self->ir->doc_freq(self->ir, terms[i]);
353
+ }
354
+ return freqs;
355
+ }
356
+
357
+ Document *sea_get_doc(Searcher *self, int doc_num)
358
+ {
359
+ return self->ir->get_doc(self->ir, doc_num);
360
+ }
361
+
362
+ int sea_max_doc(Searcher *self)
363
+ {
364
+ return self->ir->max_doc(self->ir);
365
+ }
366
+
367
+ Weight *sea_create_weight(Searcher *self, Query *query)
368
+ {
369
+ return q_weight(query, self);
370
+ }
371
+
372
+ TopDocs *sea_search(Searcher *self, Query *query, int first_doc,
373
+ int num_docs, Filter *filter, Sort *sort)
374
+ {
375
+ int max_size = first_doc + num_docs;
376
+ int i;
377
+ Weight *weight;
378
+ Scorer *scorer;
379
+ Hit **score_docs = NULL;
380
+ Hit *hit;
381
+ int total_hits = 0;
382
+ float min_score = 0.0, score;
383
+ BitVector *bits = (filter ? filter->get_bv(filter, self->ir) : NULL);
384
+ Hit *(*hq_pop)(PriorityQueue *pq);
385
+ void (*hq_down)(PriorityQueue *pq);
386
+ void (*hq_push)(PriorityQueue *pq, void *elem);
387
+ void (*hq_destroy)(void *p);
388
+ PriorityQueue *hq;
389
+
390
+
391
+ if (num_docs <= 0)
392
+ eprintf(ARG_ERROR, "num_docs must be > 0 to run a search");
393
+
394
+ if (first_doc < 0)
395
+ eprintf(ARG_ERROR, "first_doc must be >= 0 to run a search");
396
+
397
+ weight = q_weight(query, self);
398
+ scorer = weight->scorer(weight, self->ir);
399
+ if (!scorer) {
400
+ if (bits) bv_destroy(bits);
401
+ return td_create(0, 0, NULL);
402
+ }
403
+
404
+ if (sort) {
405
+ hq = fshq_pq_create(max_size, sort, self->ir);
406
+ hq_pop = &fshq_pq_pop;
407
+ hq_down = &fshq_pq_down;
408
+ hq_push = &fshq_pq_push;
409
+ hq_destroy = &fshq_pq_destroy;
410
+ } else {
411
+ hq = pq_create(max_size, &hit_less_than);
412
+ hq_pop = &hit_pq_pop;
413
+ hq_down = &hit_pq_down;
414
+ hq_push = &hit_pq_push;
415
+ hq_destroy = &pq_destroy;
416
+ }
417
+
418
+ while (scorer->next(scorer)) {
419
+ if (bits && !bv_get(bits, scorer->doc)) continue;
420
+ total_hits++;
421
+ score = scorer->score(scorer);
422
+ if (hq->count < max_size) {
423
+ hit = ALLOC(Hit);
424
+ hit->doc = scorer->doc; hit->score = score;
425
+ hq_push(hq, hit);
426
+ min_score = ((Hit *)pq_top(hq))->score; // maintain min_score
427
+ } else if (score > min_score) {
428
+ hit = pq_top(hq);
429
+ hit->doc = scorer->doc; hit->score = score;
430
+ hq_down(hq);
431
+ min_score = ((Hit *)pq_top(hq))->score; // maintain min_score
432
+ }
433
+ }
434
+ scorer->destroy(scorer);
435
+
436
+ if (hq->count > first_doc) {
437
+ if ((hq->count - first_doc) < num_docs) {
438
+ num_docs = hq->count - first_doc;
439
+ }
440
+ score_docs = ALLOC_N(Hit *, num_docs);
441
+ for (i = num_docs - 1; i >= 0; i--) {
442
+ score_docs[i] = hq_pop(hq);
443
+ //hit = score_docs[i] = pq_pop(hq);
444
+ //printf("hit = %d-->%f\n", hit->doc, hit->score);
445
+ }
446
+ } else {
447
+ num_docs = 0;
448
+ }
449
+ pq_clear(hq);
450
+ hq_destroy(hq);
451
+
452
+ if (bits) bv_destroy(bits);
453
+ return td_create(total_hits, num_docs, score_docs);
454
+ }
455
+
456
+ void sea_search_each(Searcher *self, Query *query, Filter *filter,
457
+ void (*fn)(Searcher *self, int doc_num, void *arg), void *arg)
458
+ {
459
+ Weight *weight;
460
+ Scorer *scorer;
461
+ BitVector *bits = (filter ? filter->get_bv(filter, self->ir) : NULL);
462
+
463
+ weight = q_weight(query, self);
464
+ scorer = weight->scorer(weight, self->ir);
465
+ if (!scorer) {
466
+ if (bits) bv_destroy(bits);
467
+ return;
468
+ }
469
+
470
+ while (scorer->next(scorer)) {
471
+ if (bits && !bv_get(bits, scorer->doc)) continue;
472
+ fn(self, scorer->doc, arg);
473
+ }
474
+ scorer->destroy(scorer);
475
+ }
476
+
477
+ Query *sea_rewrite(Searcher *self, Query *original)
478
+ {
479
+ Query *query = original;
480
+ Query *rewritten_query = query->rewrite(query, self->ir);
481
+ while (query != rewritten_query) {
482
+ query = rewritten_query;
483
+ rewritten_query = query->rewrite(query, self->ir);
484
+ }
485
+ return query;
486
+ }
487
+
488
+ Explanation *sea_explain(Searcher *self, Query *query, int doc_num)
489
+ {
490
+ Weight *weight = q_weight(query, self);
491
+ return weight->explain(weight, self->ir, doc_num);
492
+ }
493
+
494
+ Similarity *sea_get_similarity(Searcher *self)
495
+ {
496
+ return self->similarity;
497
+ }
498
+
499
+ void sea_close(Searcher *self)
500
+ {
501
+ if (self->ir)
502
+ ir_close(self->ir);
503
+ free(self);
504
+ }
505
+
506
+ Searcher *sea_create(IndexReader *ir)
507
+ {
508
+ Searcher *self = ALLOC(Searcher);
509
+ self->ir = ir;
510
+ self->similarity = sim_create_default();
511
+ self->doc_freq = &sea_doc_freq;
512
+ self->doc_freqs = &sea_doc_freqs;
513
+ self->get_doc = &sea_get_doc;
514
+ self->max_doc = &sea_max_doc;
515
+ self->create_weight = &sea_create_weight;
516
+ self->search = &sea_search;
517
+ self->rewrite = &sea_rewrite;
518
+ self->explain = &sea_explain;
519
+ self->get_similarity = &sea_get_similarity;
520
+ self->close = &sea_close;
521
+ return self;
522
+ }
523
+
524
+