ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295) hide show
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
data/ext/q_term.c CHANGED
@@ -1,310 +1,337 @@
1
1
  #include <string.h>
2
2
  #include "search.h"
3
3
 
4
+ #define TQ(query) ((TermQuery *)(query))
5
+ #define TSc(scorer) ((TermScorer *)(scorer))
6
+
4
7
  /***************************************************************************
5
8
  *
6
- * TermWeight
9
+ * TermScorer
7
10
  *
8
11
  ***************************************************************************/
9
12
 
10
- Scorer *tw_scorer(Weight *self, IndexReader *ir)
13
+ #define SCORE_CACHE_SIZE 32
14
+ #define TDE_READ_SIZE 32
15
+
16
+ typedef struct TermScorer
17
+ {
18
+ Scorer super;
19
+ int docs[TDE_READ_SIZE];
20
+ int freqs[TDE_READ_SIZE];
21
+ int pointer;
22
+ int pointer_max;
23
+ float score_cache[SCORE_CACHE_SIZE];
24
+ Weight *weight;
25
+ TermDocEnum *tde;
26
+ uchar *norms;
27
+ float weight_value;
28
+ } TermScorer;
29
+
30
+ static float tsc_score(Scorer *self)
31
+ {
32
+ TermScorer *ts = TSc(self);
33
+ int freq = ts->freqs[ts->pointer];
34
+ float score;
35
+ /* compute tf(f)*weight */
36
+ if (freq < SCORE_CACHE_SIZE) { /* check cache */
37
+ score = ts->score_cache[freq]; /* cache hit */
38
+ }
39
+ else {
40
+ /* cache miss */
41
+ score = sim_tf(self->similarity, (float)freq) * ts->weight_value;
42
+ }
43
+ /* normalize for field */
44
+ score *= sim_decode_norm(self->similarity, ts->norms[self->doc]);
45
+ return score;
46
+ }
47
+
48
+ static bool tsc_next(Scorer *self)
49
+ {
50
+ TermScorer *ts = TSc(self);
51
+
52
+ ts->pointer++;
53
+ if (ts->pointer >= ts->pointer_max) {
54
+ /* refill buffer */
55
+ ts->pointer_max = ts->tde->read(ts->tde, ts->docs, ts->freqs,
56
+ TDE_READ_SIZE);
57
+ if (ts->pointer_max != 0) {
58
+ ts->pointer = 0;
59
+ }
60
+ else {
61
+ return false;
62
+ }
63
+ }
64
+ self->doc = ts->docs[ts->pointer];
65
+ return true;
66
+ }
67
+
68
+ static bool tsc_skip_to(Scorer *self, int doc_num)
11
69
  {
12
- Term *term = ((TermQuery *)self->query->data)->term;
13
- TermDocEnum *tde = ir_term_docs_for(ir,term);
14
- if (!tde) return NULL;
70
+ TermScorer *ts = TSc(self);
71
+ TermDocEnum *tde = ts->tde;
72
+
73
+ /* first scan in cache */
74
+ while (++(ts->pointer) < ts->pointer_max) {
75
+ if (ts->docs[ts->pointer] >= doc_num) {
76
+ self->doc = ts->docs[ts->pointer];
77
+ return true;
78
+ }
79
+ }
15
80
 
16
- return tsc_create(self, tde, ir->get_norms_always(ir, term->field));
81
+ /* not found in cache, seek underlying stream */
82
+ if (tde->skip_to(tde, doc_num)) {
83
+ ts->pointer_max = 1;
84
+ ts->pointer = 0;
85
+ ts->docs[0] = self->doc = tde->doc_num(tde);
86
+ ts->freqs[0] = tde->freq(tde);
87
+ return true;
88
+ }
89
+ else {
90
+ return false;
91
+ }
17
92
  }
18
93
 
19
- Explanation *tw_explain(Weight *self, IndexReader *ir, int doc_num)
94
+ static Explanation *tsc_explain(Scorer *self, int doc_num)
20
95
  {
21
- Explanation *qnorm_expl;
22
- Explanation *field_expl;
23
- Scorer *scorer;
24
- Explanation *tf_expl;
25
- uchar *field_norms;
26
- float field_norm;
27
- Explanation *field_norm_expl;
28
-
29
- char *query_str = self->query->to_s(self->query, "");
30
- TermQuery *tq = (TermQuery *)self->query->data;
31
- Term *term = tq->term;
32
- char *field_name = term->field;
33
-
34
- Explanation *expl = expl_create(0.0,
35
- strfmt("weight(%s in %d), product of:", query_str, doc_num));
36
-
37
- /* We need two of these as it's included in both the query explanation
38
- * and the field explanation */
39
- Explanation *idf_expl1 = expl_create(self->idf,
40
- strfmt("idf(doc_freq=%d)", ir->doc_freq(ir, tq->term)));
41
- Explanation *idf_expl2 = expl_create(self->idf,
42
- strfmt("idf(doc_freq=%d)", ir->doc_freq(ir, tq->term)));
43
-
44
- /* explain query weight */
45
- Explanation *query_expl = expl_create(0.0,
46
- strfmt("query_weight(%s), product of:", query_str));
47
- free(query_str);
48
-
49
- if (self->query->boost != 1.0) {
50
- expl_add_detail(query_expl, expl_create(self->query->boost, estrdup("boost")));
51
- }
52
-
53
- expl_add_detail(query_expl, idf_expl1);
54
-
55
- qnorm_expl = expl_create(self->qnorm, estrdup("query_norm"));
56
- expl_add_detail(query_expl, qnorm_expl);
57
-
58
- query_expl->value = self->query->boost * idf_expl1->value * qnorm_expl->value;
59
-
60
- expl_add_detail(expl, query_expl);
61
-
62
- /* explain field weight */
63
- field_expl = expl_create(0.0,
64
- strfmt("field_weight(%s:%s in %d), product of:",
65
- field_name, term->text, doc_num));
66
-
67
- scorer = self->scorer(self, ir);
68
- tf_expl = scorer->explain(scorer, doc_num);
69
- scorer->destroy(scorer);
70
- expl_add_detail(field_expl, tf_expl);
71
- expl_add_detail(field_expl, idf_expl2);
72
-
73
- field_norms = ir->get_norms(ir, field_name);
74
- field_norm = (field_norms
75
- ? sim_decode_norm(self->similarity, field_norms[doc_num])
76
- : (float)0.0);
77
- field_norm_expl = expl_create(field_norm,
78
- strfmt("field_norm(field=%s, doc=%d)", field_name, doc_num));
79
- expl_add_detail(field_expl, field_norm_expl);
80
-
81
- field_expl->value = tf_expl->value * idf_expl2->value * field_norm_expl->value;
82
-
83
- /* combine them */
84
- if (query_expl->value == 1.0) {
85
- expl_destoy(expl);
86
- return field_expl;
87
- } else {
88
- expl->value = (query_expl->value * field_expl->value);
89
- expl_add_detail(expl, field_expl);
90
- return expl;
91
- }
96
+ TermScorer *ts = TSc(self);
97
+ Query *query = ts->weight->get_query(ts->weight);
98
+ int tf = 0;
99
+
100
+ tsc_skip_to(self, doc_num);
101
+ if (self->doc == doc_num) {
102
+ tf = ts->freqs[ts->pointer];
103
+ }
104
+ return expl_new(sim_tf(self->similarity, (float)tf),
105
+ "tf(term_freq(%s:%s)=%d)",
106
+ TQ(query)->field, TQ(query)->term, tf);
92
107
  }
93
108
 
94
- char *tw_to_s(Weight *self)
109
+ static void tsc_destroy(Scorer *self)
95
110
  {
96
- return strfmt("TermWeight(%f)", self->value);
111
+ TSc(self)->tde->close(TSc(self)->tde);
112
+ scorer_destroy_i(self);
97
113
  }
98
114
 
99
- Weight *tw_create(Query *query, Searcher *searcher)
115
+ static Scorer *tsc_new(Weight *weight, TermDocEnum *tde, uchar *norms)
100
116
  {
101
- Weight *self = w_create(query);
102
- self->scorer = &tw_scorer;
103
- self->explain = &tw_explain;
104
- self->to_s = &tw_to_s;
105
- self->sum_of_squared_weights = &w_sum_of_squared_weights;
106
-
107
- self->similarity = query->get_similarity(query, searcher);
108
- self->idf = sim_idf(self->similarity,
109
- searcher->doc_freq(searcher, ((TermQuery *)query->data)->term),
110
- searcher->max_doc(searcher)); // compute idf
111
-
112
- return self;
117
+ int i;
118
+ Scorer *self = scorer_new(TermScorer, weight->similarity);
119
+ TSc(self)->weight = weight;
120
+ TSc(self)->tde = tde;
121
+ TSc(self)->norms = norms;
122
+ TSc(self)->weight_value = weight->value;
123
+
124
+ for (i = 0; i < SCORE_CACHE_SIZE; i++) {
125
+ TSc(self)->score_cache[i]
126
+ = sim_tf(self->similarity, (float)i) * TSc(self)->weight_value;
127
+ }
128
+
129
+ self->score = &tsc_score;
130
+ self->next = &tsc_next;
131
+ self->skip_to = &tsc_skip_to;
132
+ self->explain = &tsc_explain;
133
+ self->destroy = &tsc_destroy;
134
+ return self;
113
135
  }
114
136
 
115
137
  /***************************************************************************
116
138
  *
117
- * TermQuery
139
+ * TermWeight
118
140
  *
119
141
  ***************************************************************************/
120
142
 
121
- void tq_destroy(Query *self)
143
+ static Scorer *tw_scorer(Weight *self, IndexReader *ir)
122
144
  {
123
- TermQuery *tq = self->data;
124
- term_destroy(tq->term);
125
- free(tq);
126
- q_destroy_i(self);
127
- }
145
+ TermQuery *tq = TQ(self->query);
146
+ TermDocEnum *tde = ir_term_docs_for(ir, tq->field, tq->term);
147
+ if (!tde) {
148
+ return NULL;
149
+ }
128
150
 
129
- char *tq_to_s(Query *self, char *field)
130
- {
131
- Term *term = ((TermQuery *)self->data)->term;
132
- size_t flen = strlen(term->field);
133
- size_t tlen = strlen(term->text);
134
- char *buffer = ALLOC_N(char, 34 + flen + tlen);
135
- char *b = buffer;
136
- if (strcmp(field, term->field) != 0) {
137
- memcpy(b, term->field, sizeof(char) * flen);
138
- b[flen] = ':';
139
- b += flen + 1;
140
- }
141
- memcpy(b, term->text, tlen);
142
- b += tlen;
143
- *b = 0;
144
- if (self->boost != 1.0) {
145
- *b = '^';
146
- dbl_to_s(b+1, self->boost);
147
- }
148
- return buffer;
151
+ return tsc_new(self, tde, ir_get_norms(ir, tq->field));
149
152
  }
150
153
 
151
- static void tq_extract_terms(Query *self, HashSet *terms)
154
+ static Explanation *tw_explain(Weight *self, IndexReader *ir, int doc_num)
152
155
  {
153
- Term *term = ((TermQuery *)self->data)->term;
154
- hs_add(terms, term_clone(term));
155
- }
156
+ Explanation *qnorm_expl;
157
+ Explanation *field_expl;
158
+ Scorer *scorer;
159
+ Explanation *tf_expl;
160
+ uchar *field_norms;
161
+ float field_norm;
162
+ Explanation *field_norm_expl;
163
+
164
+ char *query_str = self->query->to_s(self->query, "");
165
+ TermQuery *tq = TQ(self->query);
166
+ char *term = tq->term;
167
+ char *field = tq->field;
168
+
169
+ Explanation *expl = expl_new(0.0, "weight(%s in %d), product of:",
170
+ query_str, doc_num);
171
+
172
+ /* We need two of these as it's included in both the query explanation
173
+ * and the field explanation */
174
+ Explanation *idf_expl1 = expl_new(self->idf, "idf(doc_freq=%d)",
175
+ ir_doc_freq(ir, field, term));
176
+ Explanation *idf_expl2 = expl_new(self->idf, "idf(doc_freq=%d)",
177
+ ir_doc_freq(ir, field, term));
178
+
179
+ /* explain query weight */
180
+ Explanation *query_expl = expl_new(0.0, "query_weight(%s), product of:",
181
+ query_str);
182
+ free(query_str);
183
+
184
+ if (self->query->boost != 1.0) {
185
+ expl_add_detail(query_expl, expl_new(self->query->boost, "boost"));
186
+ }
156
187
 
157
- static uint tq_hash(Query *self)
158
- {
159
- return term_hash(((TermQuery *)self->data)->term);
188
+ expl_add_detail(query_expl, idf_expl1);
189
+
190
+ qnorm_expl = expl_new(self->qnorm, "query_norm");
191
+ expl_add_detail(query_expl, qnorm_expl);
192
+
193
+ query_expl->value = self->query->boost
194
+ * idf_expl1->value * qnorm_expl->value;
195
+
196
+ expl_add_detail(expl, query_expl);
197
+
198
+ /* explain field weight */
199
+ field_expl = expl_new(0.0, "field_weight(%s:%s in %d), product of:",
200
+ field, term, doc_num);
201
+
202
+ scorer = self->scorer(self, ir);
203
+ tf_expl = scorer->explain(scorer, doc_num);
204
+ scorer->destroy(scorer);
205
+ expl_add_detail(field_expl, tf_expl);
206
+ expl_add_detail(field_expl, idf_expl2);
207
+
208
+ field_norms = ir_get_norms(ir, field);
209
+ field_norm = (field_norms
210
+ ? sim_decode_norm(self->similarity, field_norms[doc_num])
211
+ : (float)0.0);
212
+ field_norm_expl = expl_new(field_norm, "field_norm(field=%s, doc=%d)",
213
+ field, doc_num);
214
+
215
+ expl_add_detail(field_expl, field_norm_expl);
216
+
217
+ field_expl->value = tf_expl->value * idf_expl2->value
218
+ * field_norm_expl->value;
219
+
220
+ /* combine them */
221
+ if (query_expl->value == 1.0) {
222
+ expl_destroy(expl);
223
+ return field_expl;
224
+ } else {
225
+ expl->value = (query_expl->value * field_expl->value);
226
+ expl_add_detail(expl, field_expl);
227
+ return expl;
228
+ }
160
229
  }
161
230
 
162
- static int tq_eq(Query *self, Query *o)
231
+ static char *tw_to_s(Weight *self)
163
232
  {
164
- return term_eq(((TermQuery *)self->data)->term,
165
- ((TermQuery *)o->data)->term);
233
+ return strfmt("TermWeight(%f)", self->value);
166
234
  }
167
235
 
168
- Query *tq_create(Term *term)
236
+ static Weight *tw_new(Query *query, Searcher *searcher)
169
237
  {
170
- Query *self = q_create();
171
- TermQuery *tq = ALLOC(TermQuery);
172
- tq->term = term;
173
- self->type = TERM_QUERY;
174
- self->data = tq;
175
- self->extract_terms = &tq_extract_terms;
176
- self->to_s = &tq_to_s;
177
- self->hash = &tq_hash;
178
- self->eq = &tq_eq;
179
-
180
- self->destroy_i = &tq_destroy;
181
- self->create_weight_i = &tw_create;
182
-
183
- return self;
238
+ Weight *self = w_new(Weight, query);
239
+ self->scorer = &tw_scorer;
240
+ self->explain = &tw_explain;
241
+ self->to_s = &tw_to_s;
242
+
243
+ self->similarity = query->get_similarity(query, searcher);
244
+ self->idf = sim_idf(self->similarity,
245
+ searcher->doc_freq(searcher,
246
+ TQ(query)->field,
247
+ TQ(query)->term),
248
+ searcher->max_doc(searcher)); /* compute idf */
249
+
250
+ return self;
184
251
  }
185
252
 
186
-
187
253
  /***************************************************************************
188
254
  *
189
- * TermScorer
255
+ * TermQuery
190
256
  *
191
257
  ***************************************************************************/
192
258
 
193
- float tsc_score(Scorer *self)
259
+ static void tq_destroy(Query *self)
194
260
  {
195
- TermScorer *ts = (TermScorer *)self->data;
196
- int freq = ts->freqs[ts->pointer];
197
- float score;
198
- /* compute tf(f)*weight */
199
- if (freq < SCORE_CACHE_SIZE) { /* check cache */
200
- score = ts->score_cache[freq]; /* cache hit */
201
- } else {
202
- score = sim_tf(self->similarity, (float)freq) * ts->weight_value; /* cache miss */
203
- }
204
- /* normalize for field */
205
- score *= sim_decode_norm(self->similarity, ts->norms[self->doc]);
206
- return score;
261
+ free(TQ(self)->term);
262
+ free(TQ(self)->field);
263
+ q_destroy_i(self);
207
264
  }
208
265
 
209
- bool tsc_next(Scorer *self)
266
+ static char *tq_to_s(Query *self, const char *field)
210
267
  {
211
- TermScorer *ts = (TermScorer *)self->data;
212
-
213
- ts->pointer++;
214
- if (ts->pointer >= ts->pointer_max) {
215
- /* refill buffer */
216
- ts->pointer_max = ts->tde->read(ts->tde, ts->docs, ts->freqs, TDE_READ_SIZE);
217
- if (ts->pointer_max != 0) {
218
- ts->pointer = 0;
219
- } else {
220
- ts->tde->close(ts->tde); /* close stream */
221
- ts->tde = NULL;
222
- return false;
268
+ size_t flen = strlen(TQ(self)->field);
269
+ size_t tlen = strlen(TQ(self)->term);
270
+ char *buffer = ALLOC_N(char, 34 + flen + tlen);
271
+ char *b = buffer;
272
+ if (strcmp(field, TQ(self)->field) != 0) {
273
+ memcpy(b, TQ(self)->field, sizeof(char) * flen);
274
+ b[flen] = ':';
275
+ b += flen + 1;
276
+ }
277
+ memcpy(b, TQ(self)->term, tlen);
278
+ b += tlen;
279
+ *b = 0;
280
+ if (self->boost != 1.0) {
281
+ *b = '^';
282
+ dbl_to_s(b+1, self->boost);
223
283
  }
224
- }
225
- self->doc = ts->docs[ts->pointer];
226
- return true;
284
+ return buffer;
227
285
  }
228
286
 
229
- bool tsc_skip_to(Scorer *self, int doc_num)
287
+ static void tq_extract_terms(Query *self, HashSet *terms)
230
288
  {
231
- TermScorer *ts = (TermScorer *)self->data;
232
- TermDocEnum *tde = ts->tde;
233
-
234
- /* first scan in cache */
235
- while (++(ts->pointer) < ts->pointer_max) {
236
- if (ts->docs[ts->pointer] >= doc_num) {
237
- self->doc = ts->docs[ts->pointer];
238
- return true;
239
- }
240
- }
241
-
242
- /* not found in cache, seek underlying stream */
243
- if (tde->skip_to(tde, doc_num)) {
244
- ts->pointer_max = 1;
245
- ts->pointer = 0;
246
- ts->docs[0] = self->doc = tde->doc_num(tde);
247
- ts->freqs[0] = tde->freq(tde);
248
- return true;
249
- } else {
250
- return false;
251
- }
289
+ hs_add(terms, term_new(TQ(self)->field, TQ(self)->term));
252
290
  }
253
291
 
254
- Explanation *tsc_explain(Scorer *self, int doc_num)
292
+ static ulong tq_hash(Query *self)
255
293
  {
256
- Explanation *tf_explanation;
257
- TermScorer *ts = (TermScorer *)self->data;
258
- Query *query = ts->weight->get_query(ts->weight);
259
- Term *term = ((TermQuery *)query->data)->term;
260
- int tf = 0;
261
- TermDocEnum *tde = ts->tde;
262
- while (ts->pointer < ts->pointer_max) {
263
- if (ts->docs[ts->pointer] == doc_num)
264
- tf = ts->freqs[ts->pointer];
265
- ts->pointer++;
266
- }
267
- if (tf == 0) {
268
- while (tde->next(tde)) {
269
- if (tde->doc_num(tde) == doc_num)
270
- tf = tde->freq(tde);
271
- }
272
- }
273
- tde->close(tde);
274
- ts->tde = NULL;
275
- tf_explanation = expl_create(sim_tf(self->similarity, (float)tf),
276
- strfmt("tf(term_freq(%s:%s)=%d)", term->field, term->text, tf));
294
+ return str_hash(TQ(self)->term) ^ str_hash(TQ(self)->field);
295
+ }
277
296
 
278
- return tf_explanation;
297
+ static int tq_eq(Query *self, Query *o)
298
+ {
299
+ return (strcmp(TQ(self)->term, TQ(o)->term) == 0)
300
+ && (strcmp(TQ(self)->field, TQ(o)->field) == 0);
279
301
  }
280
302
 
281
- void tsc_destroy(Scorer *self)
303
+ static MatchVector *tq_get_matchv_i(Query *self, MatchVector *mv,
304
+ TermVector *tv)
282
305
  {
283
- TermScorer *ts = (TermScorer *)self->data;
284
- if (ts->tde) ts->tde->close(ts->tde);
285
- scorer_destroy_i(self);
306
+ if (strcmp(tv->field, TQ(self)->field) == 0) {
307
+ int i;
308
+ TVTerm *tv_term = tv_get_tv_term(tv, TQ(self)->term);
309
+ if (tv_term) {
310
+ for (i = 0; i < tv_term->freq; i++) {
311
+ int pos = tv_term->positions[i];
312
+ matchv_add(mv, pos, pos);
313
+ }
314
+ }
315
+ }
316
+ return mv;
286
317
  }
287
318
 
288
- Scorer *tsc_create(Weight *weight, TermDocEnum *tde, uchar *norms)
319
+ Query *tq_new(const char *field, const char *term)
289
320
  {
290
- int i;
291
- Scorer *self = scorer_create(weight->similarity);
292
- TermScorer *ts = ALLOC(TermScorer);
293
- ZEROSET(ts, TermScorer, 1);
294
- self->data = ts;
295
- ts->weight = weight;
296
- ts->tde = tde;
297
- ts->norms = norms;
298
- ts->weight_value = weight->value;
299
-
300
- for (i = 0; i < SCORE_CACHE_SIZE; i++) {
301
- ts->score_cache[i] = sim_tf(self->similarity, (float)i) * ts->weight_value;
302
- }
303
-
304
- self->score = &tsc_score;
305
- self->next = &tsc_next;
306
- self->skip_to = &tsc_skip_to;
307
- self->explain = &tsc_explain;
308
- self->destroy = &tsc_destroy;
309
- return self;
321
+ Query *self = q_new(TermQuery);
322
+
323
+ TQ(self)->field = estrdup(field);
324
+ TQ(self)->term = estrdup(term);
325
+
326
+ self->type = TERM_QUERY;
327
+ self->extract_terms = &tq_extract_terms;
328
+ self->to_s = &tq_to_s;
329
+ self->hash = &tq_hash;
330
+ self->eq = &tq_eq;
331
+
332
+ self->destroy_i = &tq_destroy;
333
+ self->create_weight_i = &tw_new;
334
+ self->get_matchv_i = &tq_get_matchv_i;
335
+
336
+ return self;
310
337
  }