isomorfeus-ferret 0.12.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (222) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +612 -0
  3. data/README.md +44 -0
  4. data/ext/isomorfeus_ferret_ext/benchmark.c +223 -0
  5. data/ext/isomorfeus_ferret_ext/benchmark.h +45 -0
  6. data/ext/isomorfeus_ferret_ext/benchmarks_all.h +25 -0
  7. data/ext/isomorfeus_ferret_ext/bm_bitvector.c +123 -0
  8. data/ext/isomorfeus_ferret_ext/bm_hash.c +118 -0
  9. data/ext/isomorfeus_ferret_ext/bm_micro_string.c +40 -0
  10. data/ext/isomorfeus_ferret_ext/bm_store.c +93 -0
  11. data/ext/isomorfeus_ferret_ext/email.rl +21 -0
  12. data/ext/isomorfeus_ferret_ext/extconf.rb +5 -0
  13. data/ext/isomorfeus_ferret_ext/fio_tmpfile.h +53 -0
  14. data/ext/isomorfeus_ferret_ext/frb_analysis.c +2577 -0
  15. data/ext/isomorfeus_ferret_ext/frb_index.c +3457 -0
  16. data/ext/isomorfeus_ferret_ext/frb_lang.c +9 -0
  17. data/ext/isomorfeus_ferret_ext/frb_lang.h +17 -0
  18. data/ext/isomorfeus_ferret_ext/frb_qparser.c +629 -0
  19. data/ext/isomorfeus_ferret_ext/frb_search.c +4460 -0
  20. data/ext/isomorfeus_ferret_ext/frb_store.c +515 -0
  21. data/ext/isomorfeus_ferret_ext/frb_threading.h +30 -0
  22. data/ext/isomorfeus_ferret_ext/frb_utils.c +1127 -0
  23. data/ext/isomorfeus_ferret_ext/frt_analysis.c +1644 -0
  24. data/ext/isomorfeus_ferret_ext/frt_analysis.h +247 -0
  25. data/ext/isomorfeus_ferret_ext/frt_array.c +124 -0
  26. data/ext/isomorfeus_ferret_ext/frt_array.h +54 -0
  27. data/ext/isomorfeus_ferret_ext/frt_bitvector.c +95 -0
  28. data/ext/isomorfeus_ferret_ext/frt_bitvector.h +586 -0
  29. data/ext/isomorfeus_ferret_ext/frt_compound_io.c +374 -0
  30. data/ext/isomorfeus_ferret_ext/frt_config.h +44 -0
  31. data/ext/isomorfeus_ferret_ext/frt_document.c +134 -0
  32. data/ext/isomorfeus_ferret_ext/frt_document.h +52 -0
  33. data/ext/isomorfeus_ferret_ext/frt_except.c +95 -0
  34. data/ext/isomorfeus_ferret_ext/frt_except.h +188 -0
  35. data/ext/isomorfeus_ferret_ext/frt_field_index.c +233 -0
  36. data/ext/isomorfeus_ferret_ext/frt_field_index.h +42 -0
  37. data/ext/isomorfeus_ferret_ext/frt_filter.c +157 -0
  38. data/ext/isomorfeus_ferret_ext/frt_fs_store.c +502 -0
  39. data/ext/isomorfeus_ferret_ext/frt_global.c +427 -0
  40. data/ext/isomorfeus_ferret_ext/frt_global.h +290 -0
  41. data/ext/isomorfeus_ferret_ext/frt_hash.c +518 -0
  42. data/ext/isomorfeus_ferret_ext/frt_hash.h +466 -0
  43. data/ext/isomorfeus_ferret_ext/frt_hashset.c +191 -0
  44. data/ext/isomorfeus_ferret_ext/frt_hashset.h +206 -0
  45. data/ext/isomorfeus_ferret_ext/frt_helper.c +62 -0
  46. data/ext/isomorfeus_ferret_ext/frt_helper.h +13 -0
  47. data/ext/isomorfeus_ferret_ext/frt_ind.c +353 -0
  48. data/ext/isomorfeus_ferret_ext/frt_ind.h +54 -0
  49. data/ext/isomorfeus_ferret_ext/frt_index.c +6377 -0
  50. data/ext/isomorfeus_ferret_ext/frt_index.h +880 -0
  51. data/ext/isomorfeus_ferret_ext/frt_lang.c +104 -0
  52. data/ext/isomorfeus_ferret_ext/frt_lang.h +44 -0
  53. data/ext/isomorfeus_ferret_ext/frt_mempool.c +87 -0
  54. data/ext/isomorfeus_ferret_ext/frt_mempool.h +33 -0
  55. data/ext/isomorfeus_ferret_ext/frt_multimapper.c +349 -0
  56. data/ext/isomorfeus_ferret_ext/frt_multimapper.h +52 -0
  57. data/ext/isomorfeus_ferret_ext/frt_posh.c +1006 -0
  58. data/ext/isomorfeus_ferret_ext/frt_posh.h +973 -0
  59. data/ext/isomorfeus_ferret_ext/frt_priorityqueue.c +147 -0
  60. data/ext/isomorfeus_ferret_ext/frt_priorityqueue.h +147 -0
  61. data/ext/isomorfeus_ferret_ext/frt_q_boolean.c +1612 -0
  62. data/ext/isomorfeus_ferret_ext/frt_q_const_score.c +157 -0
  63. data/ext/isomorfeus_ferret_ext/frt_q_filtered_query.c +209 -0
  64. data/ext/isomorfeus_ferret_ext/frt_q_fuzzy.c +281 -0
  65. data/ext/isomorfeus_ferret_ext/frt_q_match_all.c +147 -0
  66. data/ext/isomorfeus_ferret_ext/frt_q_multi_term.c +672 -0
  67. data/ext/isomorfeus_ferret_ext/frt_q_parser.c +3084 -0
  68. data/ext/isomorfeus_ferret_ext/frt_q_phrase.c +1182 -0
  69. data/ext/isomorfeus_ferret_ext/frt_q_prefix.c +98 -0
  70. data/ext/isomorfeus_ferret_ext/frt_q_range.c +665 -0
  71. data/ext/isomorfeus_ferret_ext/frt_q_span.c +2386 -0
  72. data/ext/isomorfeus_ferret_ext/frt_q_term.c +311 -0
  73. data/ext/isomorfeus_ferret_ext/frt_q_wildcard.c +166 -0
  74. data/ext/isomorfeus_ferret_ext/frt_ram_store.c +460 -0
  75. data/ext/isomorfeus_ferret_ext/frt_scanner.c +899 -0
  76. data/ext/isomorfeus_ferret_ext/frt_scanner.h +28 -0
  77. data/ext/isomorfeus_ferret_ext/frt_scanner_mb.c +6705 -0
  78. data/ext/isomorfeus_ferret_ext/frt_scanner_utf8.c +4419 -0
  79. data/ext/isomorfeus_ferret_ext/frt_search.c +1824 -0
  80. data/ext/isomorfeus_ferret_ext/frt_search.h +924 -0
  81. data/ext/isomorfeus_ferret_ext/frt_similarity.c +150 -0
  82. data/ext/isomorfeus_ferret_ext/frt_similarity.h +79 -0
  83. data/ext/isomorfeus_ferret_ext/frt_sort.c +796 -0
  84. data/ext/isomorfeus_ferret_ext/frt_stopwords.c +395 -0
  85. data/ext/isomorfeus_ferret_ext/frt_store.c +680 -0
  86. data/ext/isomorfeus_ferret_ext/frt_store.h +789 -0
  87. data/ext/isomorfeus_ferret_ext/frt_term_vectors.c +72 -0
  88. data/ext/isomorfeus_ferret_ext/frt_threading.h +23 -0
  89. data/ext/isomorfeus_ferret_ext/frt_win32.h +54 -0
  90. data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.c +409 -0
  91. data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.h +95 -0
  92. data/ext/isomorfeus_ferret_ext/libstemmer.c +93 -0
  93. data/ext/isomorfeus_ferret_ext/libstemmer.h +73 -0
  94. data/ext/isomorfeus_ferret_ext/q_parser.y +1366 -0
  95. data/ext/isomorfeus_ferret_ext/scanner.h +28 -0
  96. data/ext/isomorfeus_ferret_ext/scanner.in +43 -0
  97. data/ext/isomorfeus_ferret_ext/scanner.rl +84 -0
  98. data/ext/isomorfeus_ferret_ext/scanner_mb.rl +200 -0
  99. data/ext/isomorfeus_ferret_ext/scanner_utf8.rl +85 -0
  100. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.c +324 -0
  101. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.h +7 -0
  102. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.c +610 -0
  103. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.h +6 -0
  104. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.c +1104 -0
  105. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.h +6 -0
  106. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.c +749 -0
  107. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.h +7 -0
  108. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.c +1233 -0
  109. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.h +6 -0
  110. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.c +490 -0
  111. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.h +6 -0
  112. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.c +1217 -0
  113. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.h +7 -0
  114. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.c +1052 -0
  115. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.h +6 -0
  116. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.c +283 -0
  117. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.h +6 -0
  118. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.c +735 -0
  119. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.h +6 -0
  120. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.c +1003 -0
  121. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.h +7 -0
  122. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.c +1079 -0
  123. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.h +6 -0
  124. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.c +293 -0
  125. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.h +6 -0
  126. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.c +984 -0
  127. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.h +6 -0
  128. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.c +686 -0
  129. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.h +6 -0
  130. data/ext/isomorfeus_ferret_ext/stem_UTF_8_danish.c +325 -0
  131. data/ext/isomorfeus_ferret_ext/stem_UTF_8_danish.h +6 -0
  132. data/ext/isomorfeus_ferret_ext/stem_UTF_8_dutch.c +620 -0
  133. data/ext/isomorfeus_ferret_ext/stem_UTF_8_dutch.h +6 -0
  134. data/ext/isomorfeus_ferret_ext/stem_UTF_8_english.c +1111 -0
  135. data/ext/isomorfeus_ferret_ext/stem_UTF_8_english.h +6 -0
  136. data/ext/isomorfeus_ferret_ext/stem_UTF_8_finnish.c +754 -0
  137. data/ext/isomorfeus_ferret_ext/stem_UTF_8_finnish.h +6 -0
  138. data/ext/isomorfeus_ferret_ext/stem_UTF_8_french.c +1242 -0
  139. data/ext/isomorfeus_ferret_ext/stem_UTF_8_french.h +6 -0
  140. data/ext/isomorfeus_ferret_ext/stem_UTF_8_german.c +495 -0
  141. data/ext/isomorfeus_ferret_ext/stem_UTF_8_german.h +6 -0
  142. data/ext/isomorfeus_ferret_ext/stem_UTF_8_hungarian.c +1220 -0
  143. data/ext/isomorfeus_ferret_ext/stem_UTF_8_hungarian.h +6 -0
  144. data/ext/isomorfeus_ferret_ext/stem_UTF_8_italian.c +1059 -0
  145. data/ext/isomorfeus_ferret_ext/stem_UTF_8_italian.h +6 -0
  146. data/ext/isomorfeus_ferret_ext/stem_UTF_8_norwegian.c +285 -0
  147. data/ext/isomorfeus_ferret_ext/stem_UTF_8_norwegian.h +6 -0
  148. data/ext/isomorfeus_ferret_ext/stem_UTF_8_porter.c +741 -0
  149. data/ext/isomorfeus_ferret_ext/stem_UTF_8_porter.h +6 -0
  150. data/ext/isomorfeus_ferret_ext/stem_UTF_8_portuguese.c +1009 -0
  151. data/ext/isomorfeus_ferret_ext/stem_UTF_8_portuguese.h +6 -0
  152. data/ext/isomorfeus_ferret_ext/stem_UTF_8_romanian.c +990 -0
  153. data/ext/isomorfeus_ferret_ext/stem_UTF_8_romanian.h +6 -0
  154. data/ext/isomorfeus_ferret_ext/stem_UTF_8_russian.c +680 -0
  155. data/ext/isomorfeus_ferret_ext/stem_UTF_8_russian.h +6 -0
  156. data/ext/isomorfeus_ferret_ext/stem_UTF_8_spanish.c +1083 -0
  157. data/ext/isomorfeus_ferret_ext/stem_UTF_8_spanish.h +6 -0
  158. data/ext/isomorfeus_ferret_ext/stem_UTF_8_swedish.c +294 -0
  159. data/ext/isomorfeus_ferret_ext/stem_UTF_8_swedish.h +6 -0
  160. data/ext/isomorfeus_ferret_ext/stem_UTF_8_turkish.c +2191 -0
  161. data/ext/isomorfeus_ferret_ext/stem_UTF_8_turkish.h +6 -0
  162. data/ext/isomorfeus_ferret_ext/stem_api.c +66 -0
  163. data/ext/isomorfeus_ferret_ext/stem_api.h +26 -0
  164. data/ext/isomorfeus_ferret_ext/stem_header.h +57 -0
  165. data/ext/isomorfeus_ferret_ext/stem_modules.h +190 -0
  166. data/ext/isomorfeus_ferret_ext/stem_modules.txt +50 -0
  167. data/ext/isomorfeus_ferret_ext/stem_utilities.c +478 -0
  168. data/ext/isomorfeus_ferret_ext/test.c +850 -0
  169. data/ext/isomorfeus_ferret_ext/test.h +416 -0
  170. data/ext/isomorfeus_ferret_ext/test_1710.c +63 -0
  171. data/ext/isomorfeus_ferret_ext/test_analysis.c +1221 -0
  172. data/ext/isomorfeus_ferret_ext/test_array.c +272 -0
  173. data/ext/isomorfeus_ferret_ext/test_bitvector.c +600 -0
  174. data/ext/isomorfeus_ferret_ext/test_compound_io.c +170 -0
  175. data/ext/isomorfeus_ferret_ext/test_document.c +156 -0
  176. data/ext/isomorfeus_ferret_ext/test_except.c +244 -0
  177. data/ext/isomorfeus_ferret_ext/test_fields.c +522 -0
  178. data/ext/isomorfeus_ferret_ext/test_file_deleter.c +185 -0
  179. data/ext/isomorfeus_ferret_ext/test_filter.c +331 -0
  180. data/ext/isomorfeus_ferret_ext/test_fs_store.c +25 -0
  181. data/ext/isomorfeus_ferret_ext/test_global.c +299 -0
  182. data/ext/isomorfeus_ferret_ext/test_hash.c +485 -0
  183. data/ext/isomorfeus_ferret_ext/test_hashset.c +288 -0
  184. data/ext/isomorfeus_ferret_ext/test_helper.c +47 -0
  185. data/ext/isomorfeus_ferret_ext/test_highlighter.c +548 -0
  186. data/ext/isomorfeus_ferret_ext/test_index.c +2323 -0
  187. data/ext/isomorfeus_ferret_ext/test_lang.c +74 -0
  188. data/ext/isomorfeus_ferret_ext/test_mempool.c +102 -0
  189. data/ext/isomorfeus_ferret_ext/test_multimapper.c +64 -0
  190. data/ext/isomorfeus_ferret_ext/test_priorityqueue.c +213 -0
  191. data/ext/isomorfeus_ferret_ext/test_q_const_score.c +84 -0
  192. data/ext/isomorfeus_ferret_ext/test_q_filtered.c +61 -0
  193. data/ext/isomorfeus_ferret_ext/test_q_fuzzy.c +241 -0
  194. data/ext/isomorfeus_ferret_ext/test_q_parser.c +464 -0
  195. data/ext/isomorfeus_ferret_ext/test_q_span.c +575 -0
  196. data/ext/isomorfeus_ferret_ext/test_ram_store.c +77 -0
  197. data/ext/isomorfeus_ferret_ext/test_search.c +1874 -0
  198. data/ext/isomorfeus_ferret_ext/test_segments.c +167 -0
  199. data/ext/isomorfeus_ferret_ext/test_similarity.c +25 -0
  200. data/ext/isomorfeus_ferret_ext/test_sort.c +333 -0
  201. data/ext/isomorfeus_ferret_ext/test_store.c +591 -0
  202. data/ext/isomorfeus_ferret_ext/test_store.h +3 -0
  203. data/ext/isomorfeus_ferret_ext/test_term.c +351 -0
  204. data/ext/isomorfeus_ferret_ext/test_term_vectors.c +373 -0
  205. data/ext/isomorfeus_ferret_ext/test_test.c +83 -0
  206. data/ext/isomorfeus_ferret_ext/test_threading.c +188 -0
  207. data/ext/isomorfeus_ferret_ext/testhelper.c +561 -0
  208. data/ext/isomorfeus_ferret_ext/testhelper.h +25 -0
  209. data/ext/isomorfeus_ferret_ext/tests_all.h +87 -0
  210. data/ext/isomorfeus_ferret_ext/uchar-ucs4.rl +1854 -0
  211. data/ext/isomorfeus_ferret_ext/uchar-utf8.rl +1999 -0
  212. data/ext/isomorfeus_ferret_ext/url.rl +27 -0
  213. data/ext/isomorfeus_ferret_ext/word_list.h +15156 -0
  214. data/lib/isomorfeus/ferret/document.rb +132 -0
  215. data/lib/isomorfeus/ferret/field_symbol.rb +85 -0
  216. data/lib/isomorfeus/ferret/index/field_infos.rb +48 -0
  217. data/lib/isomorfeus/ferret/index/index.rb +970 -0
  218. data/lib/isomorfeus/ferret/monitor.rb +323 -0
  219. data/lib/isomorfeus/ferret/stdlib_patches.rb +151 -0
  220. data/lib/isomorfeus/ferret/version.rb +5 -0
  221. data/lib/isomorfeus-ferret.rb +8 -0
  222. metadata +307 -0
@@ -0,0 +1,157 @@
1
+ #include "frt_search.h"
2
+ #include <string.h>
3
+
4
+ /***************************************************************************
5
+ *
6
+ * ConstantScoreScorer
7
+ *
8
+ ***************************************************************************/
9
+
10
+ #define CScQ(query) ((FrtConstantScoreQuery *)(query))
11
+ #define CScSc(scorer) ((ConstantScoreScorer *)(scorer))
12
+
13
+ typedef struct ConstantScoreScorer
14
+ {
15
+ FrtScorer super;
16
+ FrtBitVector *bv;
17
+ float score;
18
+ } ConstantScoreScorer;
19
+
20
+ static float cssc_score(FrtScorer *self)
21
+ {
22
+ return CScSc(self)->score;
23
+ }
24
+
25
+ static bool cssc_next(FrtScorer *self)
26
+ {
27
+ return ((self->doc = frt_bv_scan_next(CScSc(self)->bv)) >= 0);
28
+ }
29
+
30
+ static bool cssc_skip_to(FrtScorer *self, int doc_num)
31
+ {
32
+ return ((self->doc = frt_bv_scan_next_from(CScSc(self)->bv, doc_num)) >= 0);
33
+ }
34
+
35
+ static FrtExplanation *cssc_explain(FrtScorer *self, int doc_num)
36
+ {
37
+ (void)self; (void)doc_num;
38
+ return frt_expl_new(1.0, "ConstantScoreScorer");
39
+ }
40
+
41
+ static FrtScorer *cssc_new(FrtWeight *weight, FrtIndexReader *ir)
42
+ {
43
+ FrtScorer *self = frt_scorer_new(ConstantScoreScorer, weight->similarity);
44
+ FrtFilter *filter = CScQ(weight->query)->filter;
45
+
46
+ CScSc(self)->score = weight->value;
47
+ CScSc(self)->bv = frt_filt_get_bv(filter, ir);
48
+
49
+ self->score = &cssc_score;
50
+ self->next = &cssc_next;
51
+ self->skip_to = &cssc_skip_to;
52
+ self->explain = &cssc_explain;
53
+ self->destroy = &frt_scorer_destroy_i;
54
+ return self;
55
+ }
56
+
57
+ /***************************************************************************
58
+ *
59
+ * ConstantScoreWeight
60
+ *
61
+ ***************************************************************************/
62
+
63
+ static char *csw_to_s(FrtWeight *self)
64
+ {
65
+ return frt_strfmt("ConstantScoreWeight(%f)", self->value);
66
+ }
67
+
68
+ static FrtExplanation *csw_explain(FrtWeight *self, FrtIndexReader *ir, int doc_num)
69
+ {
70
+ FrtFilter *filter = CScQ(self->query)->filter;
71
+ FrtExplanation *expl;
72
+ char *filter_str = filter->to_s(filter);
73
+ FrtBitVector *bv = frt_filt_get_bv(filter, ir);
74
+
75
+ if (frt_bv_get(bv, doc_num)) {
76
+ expl = frt_expl_new(self->value, "ConstantScoreQuery(%s), product of:", filter_str);
77
+ frt_expl_add_detail(expl, frt_expl_new(self->query->boost, "boost"));
78
+ frt_expl_add_detail(expl, frt_expl_new(self->qnorm, "query_norm"));
79
+ } else {
80
+ expl = frt_expl_new(self->value, "ConstantScoreQuery(%s), does not match id %d", filter_str, doc_num);
81
+ }
82
+ free(filter_str);
83
+ return expl;
84
+ }
85
+
86
+ static FrtWeight *csw_new(FrtQuery *query, FrtSearcher *searcher)
87
+ {
88
+ FrtWeight *self = w_new(FrtWeight, query);
89
+
90
+ self->scorer = &cssc_new;
91
+ self->explain = &csw_explain;
92
+ self->to_s = &csw_to_s;
93
+
94
+ self->similarity = query->get_similarity(query, searcher);
95
+ self->idf = 1.0f;
96
+
97
+ return self;
98
+ }
99
+
100
+ /***************************************************************************
101
+ *
102
+ * ConstantScoreQuery
103
+ *
104
+ ***************************************************************************/
105
+
106
+ static char *csq_to_s(FrtQuery *self, FrtSymbol default_field)
107
+ {
108
+ FrtFilter *filter = CScQ(self)->filter;
109
+ char *filter_str = filter->to_s(filter);
110
+ char *buffer;
111
+ (void)default_field;
112
+ if (self->boost == 1.0) {
113
+ buffer = frt_strfmt("ConstantScore(%s)", filter_str);
114
+ }
115
+ else {
116
+ buffer = frt_strfmt("ConstantScore(%s)^%f", filter_str, self->boost);
117
+ }
118
+ free(filter_str);
119
+ return buffer;;
120
+ }
121
+
122
+ static void csq_destroy(FrtQuery *self)
123
+ {
124
+ frt_filt_deref(CScQ(self)->filter);
125
+ frt_q_destroy_i(self);
126
+ }
127
+
128
+ static unsigned long long csq_hash(FrtQuery *self)
129
+ {
130
+ return frt_filt_hash(CScQ(self)->filter);
131
+ }
132
+
133
+ static int csq_eq(FrtQuery *self, FrtQuery *o)
134
+ {
135
+ return frt_filt_eq(CScQ(self)->filter, CScQ(o)->filter);
136
+ }
137
+
138
+ FrtQuery *frt_csq_new_nr(FrtFilter *filter)
139
+ {
140
+ FrtQuery *self = frt_q_new(FrtConstantScoreQuery);
141
+ CScQ(self)->filter = filter;
142
+
143
+ self->type = CONSTANT_QUERY;
144
+ self->to_s = &csq_to_s;
145
+ self->hash = &csq_hash;
146
+ self->eq = &csq_eq;
147
+ self->destroy_i = &csq_destroy;
148
+ self->create_weight_i = &csw_new;
149
+
150
+ return self;
151
+ }
152
+
153
+ FrtQuery *frt_csq_new(FrtFilter *filter)
154
+ {
155
+ FRT_REF(filter);
156
+ return frt_csq_new_nr(filter);
157
+ }
@@ -0,0 +1,209 @@
1
+ #include "frt_search.h"
2
+ #include <string.h>
3
+
4
+ /***************************************************************************
5
+ *
6
+ * FilteredQueryScorer
7
+ *
8
+ ***************************************************************************/
9
+
10
+ #define FQSc(scorer) ((FilteredQueryScorer *)(scorer))
11
+ #define FQQ(query) ((FrtFilteredQuery *)(query))
12
+
13
+ typedef struct FilteredQueryScorer
14
+ {
15
+ FrtScorer super;
16
+ FrtScorer *sub_scorer;
17
+ FrtBitVector *bv;
18
+ } FilteredQueryScorer;
19
+
20
+ static float fqsc_score(FrtScorer *self)
21
+ {
22
+ FrtScorer *sub_sc = FQSc(self)->sub_scorer;
23
+ return sub_sc->score(sub_sc);
24
+ }
25
+
26
+ static bool fqsc_next(FrtScorer *self)
27
+ {
28
+ FrtScorer *sub_sc = FQSc(self)->sub_scorer;
29
+ FrtBitVector *bv = FQSc(self)->bv;
30
+ while (sub_sc->next(sub_sc)) {
31
+ self->doc = sub_sc->doc;
32
+ if (frt_bv_get(bv, self->doc)) return true;
33
+ }
34
+ return false;
35
+ }
36
+
37
+ static bool fqsc_skip_to(FrtScorer *self, int doc_num)
38
+ {
39
+ FrtScorer *sub_sc = FQSc(self)->sub_scorer;
40
+ FrtBitVector *bv = FQSc(self)->bv;
41
+ if (sub_sc->skip_to(sub_sc, doc_num)) {
42
+ do {
43
+ self->doc = sub_sc->doc;
44
+ if (frt_bv_get(bv, self->doc)) {
45
+ return true;
46
+ }
47
+ } while (sub_sc->next(sub_sc));
48
+ }
49
+ return false;
50
+ }
51
+
52
+ static FrtExplanation *fqsc_explain(FrtScorer *self, int doc_num)
53
+ {
54
+ FrtScorer *sub_sc = FQSc(self)->sub_scorer;
55
+ return sub_sc->explain(sub_sc, doc_num);
56
+ }
57
+
58
+ static void fqsc_destroy(FrtScorer *self)
59
+ {
60
+ FilteredQueryScorer *fqsc = FQSc(self);
61
+ fqsc->sub_scorer->destroy(fqsc->sub_scorer);
62
+ frt_scorer_destroy_i(self);
63
+ }
64
+
65
+ static FrtScorer *fqsc_new(FrtScorer *scorer, FrtBitVector *bv, FrtSimilarity *sim)
66
+ {
67
+ FrtScorer *self = frt_scorer_new(FilteredQueryScorer, sim);
68
+
69
+ FQSc(self)->sub_scorer = scorer;
70
+ FQSc(self)->bv = bv;
71
+
72
+ self->score = &fqsc_score;
73
+ self->next = &fqsc_next;
74
+ self->skip_to = &fqsc_skip_to;
75
+ self->explain = &fqsc_explain;
76
+ self->destroy = &fqsc_destroy;
77
+
78
+ return self;
79
+ }
80
+
81
+ /***************************************************************************
82
+ *
83
+ * FilteredQueryWeight
84
+ *
85
+ ***************************************************************************/
86
+
87
+ #define FQW(weight) ((FilteredQueryWeight *)(weight))
88
+ typedef struct FilteredQueryWeight
89
+ {
90
+ FrtWeight super;
91
+ FrtWeight *sub_weight;
92
+ } FilteredQueryWeight;
93
+
94
+ static char *fqw_to_s(FrtWeight *self)
95
+ {
96
+ return frt_strfmt("FilteredQueryWeight(%f)", self->value);
97
+ }
98
+
99
+ static float fqw_sum_of_squared_weights(FrtWeight *self)
100
+ {
101
+ FrtWeight *sub_weight = FQW(self)->sub_weight;
102
+ return sub_weight->sum_of_squared_weights(sub_weight);
103
+ }
104
+
105
+ static void fqw_normalize(FrtWeight *self, float normalization_factor)
106
+ {
107
+ FrtWeight *sub_weight = FQW(self)->sub_weight;
108
+ sub_weight->normalize(sub_weight, normalization_factor);
109
+ }
110
+
111
+ static float fqw_get_value(FrtWeight *self)
112
+ {
113
+ FrtWeight *sub_weight = FQW(self)->sub_weight;
114
+ return sub_weight->get_value(sub_weight);
115
+ }
116
+
117
+ static FrtExplanation *fqw_explain(FrtWeight *self, FrtIndexReader *ir, int doc_num)
118
+ {
119
+ FrtWeight *sub_weight = FQW(self)->sub_weight;
120
+ return sub_weight->explain(sub_weight, ir, doc_num);
121
+ }
122
+
123
+ static FrtScorer *fqw_scorer(FrtWeight *self, FrtIndexReader *ir)
124
+ {
125
+ FrtWeight *sub_weight = FQW(self)->sub_weight;
126
+ FrtScorer *scorer = sub_weight->scorer(sub_weight, ir);
127
+ FrtFilter *filter = FQQ(self->query)->filter;
128
+
129
+ return fqsc_new(scorer, frt_filt_get_bv(filter, ir), self->similarity);
130
+ }
131
+
132
+ static void fqw_destroy(FrtWeight *self)
133
+ {
134
+ FrtWeight *sub_weight = FQW(self)->sub_weight;
135
+ sub_weight->destroy(sub_weight);
136
+ frt_w_destroy(self);
137
+ }
138
+
139
+ static FrtWeight *fqw_new(FrtQuery *query, FrtWeight *sub_weight, FrtSimilarity *sim)
140
+ {
141
+ FrtWeight *self = w_new(FilteredQueryWeight, query);
142
+
143
+ FQW(self)->sub_weight = sub_weight;
144
+
145
+ self->get_value = &fqw_get_value;
146
+ self->normalize = &fqw_normalize;
147
+ self->scorer = &fqw_scorer;
148
+ self->explain = &fqw_explain;
149
+ self->to_s = &fqw_to_s;
150
+ self->destroy = &fqw_destroy;
151
+ self->sum_of_squared_weights = &fqw_sum_of_squared_weights;
152
+
153
+ self->similarity = sim;
154
+ self->idf = 1.0f;
155
+ self->value = sub_weight->value;
156
+
157
+ return self;
158
+ }
159
+
160
+ /***************************************************************************
161
+ *
162
+ * FilteredQuery
163
+ *
164
+ ***************************************************************************/
165
+
166
+ static char *fq_to_s(FrtQuery *self, FrtSymbol default_field)
167
+ {
168
+ FrtFilteredQuery *fq = FQQ(self);
169
+ char *filter_str = fq->filter->to_s(fq->filter);
170
+ char *query_str = fq->query->to_s(fq->query, default_field);
171
+ char *buffer;
172
+ if (self->boost == 1.0) {
173
+ buffer = frt_strfmt("FilteredQuery(query:%s, filter:%s)", query_str, filter_str);
174
+ } else {
175
+ buffer = frt_strfmt("FilteredQuery(query:%s, filter:%s)^%f", query_str, filter_str, self->boost);
176
+ }
177
+ free(filter_str);
178
+ free(query_str);
179
+ return buffer;;
180
+ }
181
+
182
+ static void fq_destroy(FrtQuery *self)
183
+ {
184
+ frt_filt_deref(FQQ(self)->filter);
185
+ frt_q_deref(FQQ(self)->query);
186
+ frt_q_destroy_i(self);
187
+ }
188
+
189
+ static FrtWeight *fq_new_weight(FrtQuery *self, FrtSearcher *searcher)
190
+ {
191
+ FrtQuery *sub_query = FQQ(self)->query;
192
+ return fqw_new(self, frt_q_weight(sub_query, searcher),
193
+ searcher->similarity);
194
+ }
195
+
196
+ FrtQuery *frt_fq_new(FrtQuery *query, FrtFilter *filter)
197
+ {
198
+ FrtQuery *self = frt_q_new(FrtFilteredQuery);
199
+
200
+ FQQ(self)->query = query;
201
+ FQQ(self)->filter = filter;
202
+
203
+ self->type = FILTERED_QUERY;
204
+ self->to_s = &fq_to_s;
205
+ self->destroy_i = &fq_destroy;
206
+ self->create_weight_i = &fq_new_weight;
207
+
208
+ return self;
209
+ }
@@ -0,0 +1,281 @@
1
+ #include <string.h>
2
+ #include "frt_search.h"
3
+ #include "frt_helper.h"
4
+
5
+ /****************************************************************************
6
+ *
7
+ * FuzzyStuff
8
+ *
9
+ * The main method here is the fuzq_score_mn method which scores a term
10
+ * against another term. The other methods all act in support.
11
+ *
12
+ * To learn more about the fuzzy scoring algorithm see:
13
+ *
14
+ * http://en.wikipedia.org/wiki/Levenshtein_distance
15
+ *
16
+ ****************************************************************************/
17
+
18
+ /**
19
+ * Calculate the maximum nomber of allowed edits (or maximum edit distance)
20
+ * for a word to be a match.
21
+ *
22
+ * Note that fuzq->text_len and m are both the lengths text *after* the prefix
23
+ * so `FRT_MIN(fuzq->text_len, m) + fuzq->pre_len)` actually gets the byte length
24
+ * of the shorter string out of the query string and the index term being
25
+ * compared.
26
+ */
27
+ static int fuzq_calculate_max_distance(FrtFuzzyQuery *fuzq, int m)
28
+ {
29
+ return (int)((1.0 - fuzq->min_sim) * (FRT_MIN(fuzq->text_len, m) + fuzq->pre_len));
30
+ }
31
+
32
+ /**
33
+ * The max-distance formula gets used a lot - it needs to be calculated for
34
+ * every possible match in the index - so we cache the results for all
35
+ * lengths up to the FRT_TYPICAL_LONGEST_WORD limit. For words longer than this we
36
+ * calculate the value live.
37
+ */
38
+ static void fuzq_initialize_max_distances(FrtFuzzyQuery *fuzq)
39
+ {
40
+ int i;
41
+ for (i = 0; i < FRT_TYPICAL_LONGEST_WORD; i++) {
42
+ fuzq->max_distances[i] = fuzq_calculate_max_distance(fuzq, i);
43
+ }
44
+ }
45
+
46
+ /**
47
+ * Return the cached max-distance value if the word is within the
48
+ * FRT_TYPICAL_LONGEST_WORD limit.
49
+ */
50
+ static int fuzq_get_max_distance(FrtFuzzyQuery *fuzq, int m)
51
+ {
52
+ if (m < FRT_TYPICAL_LONGEST_WORD)
53
+ return fuzq->max_distances[m];
54
+ return fuzq_calculate_max_distance(fuzq, m);
55
+ }
56
+
57
+ /**
58
+ * Calculate the similarity score for the +target+ against the query.
59
+ *
60
+ * @params fuzq The Fuzzy Query
61
+ * @params target *the term to compare against minus the prefix
62
+ * @params m the string length of +target+
63
+ * @params n the string length of the query string minus length of the prefix
64
+ */
65
+ static float fuzq_score_mn(FrtFuzzyQuery *fuzq,
66
+ const char *target,
67
+ const int m, const int n)
68
+ {
69
+ int i, j, prune;
70
+ int *d_curr, *d_prev;
71
+ const char *text = fuzq->text;
72
+ const int max_distance = fuzq_get_max_distance(fuzq, m);
73
+
74
+ /* Just adding the characters of m to n or vice-versa results in
75
+ * too many edits for example "pre" length is 3 and "prefixes"
76
+ * length is 8. We can see that given this optimal circumstance,
77
+ * the edit distance cannot be less than 5 which is 8-3 or more
78
+ * precisesly Math.abs(3-8). If our maximum edit distance is 4,
79
+ * then we can discard this word without looking at it. */
80
+ if (max_distance < FRT_ABS(m-n)) {
81
+ return 0.0f;
82
+ }
83
+
84
+ d_curr = fuzq->da;
85
+ d_prev = d_curr + n + 1;
86
+
87
+ /* init array */
88
+ for (j = 0; j <= n; j++) {
89
+ d_curr[j] = j;
90
+ }
91
+
92
+ /* start computing edit distance */
93
+ for (i = 0; i < m;) {
94
+ char s_i = target[i];
95
+ /* swap d_current into d_prev */
96
+ int *d_tmp = d_prev;
97
+ d_prev = d_curr;
98
+ d_curr = d_tmp;
99
+ prune = (d_curr[0] = ++i) > max_distance;
100
+
101
+ for (j = 0; j < n; j++) {
102
+ d_curr[j + 1] = (s_i == text[j])
103
+ ? FRT_MIN3(d_prev[j + 1] + 1, d_curr[j] + 1, d_prev[j])
104
+ : FRT_MIN3(d_prev[j + 1], d_curr[j], d_prev[j]) + 1;
105
+ if (prune && d_curr[j + 1] <= max_distance) {
106
+ prune = false;
107
+ }
108
+ }
109
+ if (prune) {
110
+ return 0.0f;
111
+ }
112
+ }
113
+
114
+ /* this will return less than 0.0 when the edit distance is greater
115
+ * than the number of characters in the shorter word. but this was
116
+ * the formula that was previously used in FuzzyTermEnum, so it has
117
+ * not been changed (even though min_sim must be greater than 0.0) */
118
+ return (float)(1.0f - ((float)d_curr[n] / (float) (fuzq->pre_len + FRT_MIN(n, m))));
119
+ }
120
+
121
+ /**
122
+ * The following algorithm is taken from Bob Carpenter's FuzzyTermEnum
123
+ * implentation here;
124
+ *
125
+ * http://mail-archives.apache.org/mod_mbox/lucene-java-dev/200606.mbox/%3c448F0E8C.3050901@alias-i.com%3e
126
+ */
127
+ float frt_fuzq_score(FrtFuzzyQuery *fuzq, const char *target)
128
+ {
129
+ const int m = (int)strlen(target);
130
+ const int n = fuzq->text_len;
131
+
132
+ /* we don't have anything to compare. That means if we just add
133
+ * the letters for m we get the new word */
134
+ if (m == 0 || n == 0) {
135
+ if (fuzq->pre_len == 0)
136
+ return 0.0f;
137
+ return 1.0f - ((float) (m+n) / fuzq->pre_len);
138
+ }
139
+
140
+ return fuzq_score_mn(fuzq, target, m, n);
141
+ }
142
+
143
+ /****************************************************************************
144
+ *
145
+ * FuzzyQuery
146
+ *
147
+ ****************************************************************************/
148
+
149
+ #define FzQ(query) ((FrtFuzzyQuery *)(query))
150
+
151
+ static char *fuzq_to_s(FrtQuery *self, FrtSymbol curr_field)
152
+ {
153
+ char *buffer, *bptr;
154
+ char *term = FzQ(self)->term;
155
+ FrtSymbol field = FzQ(self)->field;
156
+ const char *field_name = rb_id2name(field);
157
+ bptr = buffer = FRT_ALLOC_N(char, strlen(term) + strlen(field_name) + 70);
158
+
159
+ if (curr_field != field) {
160
+ bptr += sprintf(bptr, "%s:", field_name);
161
+ }
162
+
163
+ bptr += sprintf(bptr, "%s~", term);
164
+ if (FzQ(self)->min_sim != 0.5) {
165
+ frt_dbl_to_s(bptr, FzQ(self)->min_sim);
166
+ bptr += strlen(bptr);
167
+ }
168
+
169
+ if (self->boost != 1.0) {
170
+ *bptr = '^';
171
+ frt_dbl_to_s(++bptr, self->boost);
172
+ }
173
+
174
+ return buffer;
175
+ }
176
+
177
+ static FrtQuery *fuzq_rewrite(FrtQuery *self, FrtIndexReader *ir)
178
+ {
179
+ FrtQuery *q;
180
+ FrtFuzzyQuery *fuzq = FzQ(self);
181
+
182
+ int pre_len = fuzq->pre_len;
183
+ char *prefix = NULL;
184
+ const char *term = fuzq->term;
185
+ const int field_num = frt_fis_get_field_num(ir->fis, fuzq->field);
186
+ FrtTermEnum *te;
187
+
188
+ if (field_num < 0) {
189
+ return frt_bq_new(true);
190
+ }
191
+ if (fuzq->pre_len >= (int)strlen(term)) {
192
+ return frt_tq_new(fuzq->field, term);
193
+ }
194
+
195
+ q = frt_multi_tq_new_conf(fuzq->field, FrtMTQMaxTerms(self), fuzq->min_sim);
196
+ if (pre_len > 0) {
197
+ prefix = FRT_ALLOC_N(char, pre_len + 1);
198
+ strncpy(prefix, term, pre_len);
199
+ prefix[pre_len] = '\0';
200
+ te = ir->terms_from(ir, field_num, prefix);
201
+ }
202
+ else {
203
+ te = ir->terms(ir, field_num);
204
+ }
205
+
206
+ assert(NULL != te);
207
+
208
+ fuzq->scale_factor = (float)(1.0 / (1.0 - fuzq->min_sim));
209
+ fuzq->text = term + pre_len;
210
+ fuzq->text_len = (int)strlen(fuzq->text);
211
+ FRT_REALLOC_N(fuzq->da, int, fuzq->text_len * 2 + 2);
212
+ fuzq_initialize_max_distances(fuzq);
213
+
214
+ do {
215
+ const char *curr_term = te->curr_term;
216
+ const char *curr_suffix = curr_term + pre_len;
217
+ float score = 0.0f;
218
+
219
+ if (prefix && strncmp(curr_term, prefix, pre_len) != 0)
220
+ break;
221
+
222
+ score = frt_fuzq_score(fuzq, curr_suffix);
223
+ frt_multi_tq_add_term_boost(q, curr_term, score);
224
+ } while (te->next(te) != NULL);
225
+
226
+ te->close(te);
227
+ if (prefix) free(prefix);
228
+ return q;
229
+ }
230
+
231
+ static void fuzq_destroy(FrtQuery *self)
232
+ {
233
+ free(FzQ(self)->term);
234
+ free(FzQ(self)->da);
235
+ frt_q_destroy_i(self);
236
+ }
237
+
238
+ static unsigned long long fuzq_hash(FrtQuery *self)
239
+ {
240
+ return frt_str_hash(FzQ(self)->term) ^ frt_str_hash(rb_id2name(FzQ(self)->field))
241
+ ^ frt_float2int(FzQ(self)->min_sim) ^ FzQ(self)->pre_len;
242
+ }
243
+
244
+ static int fuzq_eq(FrtQuery *self, FrtQuery *o)
245
+ {
246
+ FrtFuzzyQuery *fq1 = FzQ(self);
247
+ FrtFuzzyQuery *fq2 = FzQ(o);
248
+
249
+ return (strcmp(fq1->term, fq2->term) == 0)
250
+ && (fq1->field == fq2->field)
251
+ && (fq1->pre_len == fq2->pre_len)
252
+ && (fq1->min_sim == fq2->min_sim);
253
+ }
254
+
255
+ FrtQuery *frt_fuzq_new_conf(FrtSymbol field, const char *term,
256
+ float min_sim, int pre_len, int max_terms)
257
+ {
258
+ FrtQuery *self = frt_q_new(FrtFuzzyQuery);
259
+
260
+ FzQ(self)->field = field;
261
+ FzQ(self)->term = frt_estrdup(term);
262
+ FzQ(self)->pre_len = pre_len ? pre_len : FRT_DEF_PRE_LEN;
263
+ FzQ(self)->min_sim = min_sim ? min_sim : FRT_DEF_MIN_SIM;
264
+ FzQ(self)->da = NULL;
265
+ FrtMTQMaxTerms(self) = max_terms ? max_terms : FRT_DEF_MAX_TERMS;
266
+
267
+ self->type = FUZZY_QUERY;
268
+ self->to_s = &fuzq_to_s;
269
+ self->hash = &fuzq_hash;
270
+ self->eq = &fuzq_eq;
271
+ self->rewrite = &fuzq_rewrite;
272
+ self->destroy_i = &fuzq_destroy;
273
+ self->create_weight_i = &frt_q_create_weight_unsup;
274
+
275
+ return self;
276
+ }
277
+
278
+ FrtQuery *frt_fuzq_new(FrtSymbol field, const char *term)
279
+ {
280
+ return frt_fuzq_new_conf(field, term, 0.0f, 0, 0);
281
+ }