ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295) hide show
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
data/ext/q_fuzzy.c CHANGED
@@ -1,5 +1,6 @@
1
1
  #include <string.h>
2
2
  #include "search.h"
3
+ #include "helper.h"
3
4
 
4
5
  /****************************************************************************
5
6
  *
@@ -10,101 +11,106 @@
10
11
  *
11
12
  ****************************************************************************/
12
13
 
13
-
14
- int fuzq_calculate_max_distance(FuzzyQuery *fuzq, int m)
14
+ static inline int fuzq_calculate_max_distance(FuzzyQuery *fuzq, int m)
15
15
  {
16
- return (int)((1.0 - fuzq->min_sim) * (MIN(fuzq->text_len, m) + fuzq->pre_len));
16
+ return (int)((1.0 - fuzq->min_sim) * (MIN(fuzq->text_len, m) + fuzq->pre_len));
17
17
  }
18
18
 
19
- void fuzq_initialize_max_distances(FuzzyQuery *fuzq)
19
+ static void fuzq_initialize_max_distances(FuzzyQuery *fuzq)
20
20
  {
21
- int i;
22
- for (i = 0; i < TYPICAL_LONGEST_WORD; i++) {
23
- fuzq->max_distances[i] = fuzq_calculate_max_distance(fuzq, i);
24
- }
21
+ int i;
22
+ for (i = 0; i < TYPICAL_LONGEST_WORD; i++) {
23
+ fuzq->max_distances[i] = fuzq_calculate_max_distance(fuzq, i);
24
+ }
25
25
  }
26
26
 
27
- int fuzq_get_max_distance(FuzzyQuery *fuzq, int m)
27
+ static inline int fuzq_get_max_distance(FuzzyQuery *fuzq, int m)
28
28
  {
29
- return (m < TYPICAL_LONGEST_WORD) ? fuzq->max_distances[m]
30
- : fuzq_calculate_max_distance(fuzq, m);
29
+ return (m < TYPICAL_LONGEST_WORD) ? fuzq->max_distances[m]
30
+ : fuzq_calculate_max_distance(fuzq, m);
31
31
  }
32
32
 
33
- float fuzq_score(FuzzyQuery *fuzq, char *target)
33
+ /**
34
+ * The following algorithm is taken from Bob Carpenter's FuzzyTermEnum
35
+ * implentation here;
36
+ *
37
+ * http://mail-archives.apache.org/mod_mbox/lucene-java-dev/200606.mbox/%3c448F0E8C.3050901@alias-i.com%3e
38
+ */
39
+ float fuzq_score(FuzzyQuery *fuzq, const char *target)
34
40
  {
35
- int i, j;
36
- int max_distance;
37
- int m = (int)strlen(target);
38
- int n = fuzq->text_len;
39
- int *d = fuzq->da;
40
- char *text = fuzq->text;
41
- if (n == 0) {
42
- /* we don't have anything to compare. That means if we just add
43
- * the letters for m we get the new word */
44
- return fuzq->pre_len == 0 ? 0.0f : 1.0f - ((float) m / fuzq->pre_len);
45
- }
46
- if (m == 0) {
47
- return fuzq->pre_len == 0 ? 0.0f : 1.0f - ((float) n / fuzq->pre_len);
48
- }
49
-
50
- max_distance = fuzq_get_max_distance(fuzq, m);
51
-
52
- //printf("n%dm%dmd%ddiff%d<%s><%s>\n", n, m, max_distance, m-n, fuzq->text, target);
53
- if (max_distance < ((m > n) ? (m-n) : (n-m))) { /* abs */
54
- /* Just adding the characters of m to n or vice-versa results in too many
55
- * edits for example "pre" length is 3 and "prefixes" length is 8. We can
56
- * see that given this optimal circumstance, the edit distance cannot be
57
- * less than 5 which is 8-3 or more precisesly Math.abs(3-8). If our
58
- * maximum edit distance is 4, then we can discard this word without
59
- * looking at it. */
60
- return 0.0f;
61
- }
62
-
63
- /* Let's make sure we have enough room in our array to do the distance
64
- * calculations. */
65
- if (((m+1) * (n+1)) >= fuzq->da_capa) {
66
- fuzq->da_capa = ((m+1) * (n+1)) * 2;
67
- REALLOC_N(fuzq->da, int, fuzq->da_capa);
68
- d = fuzq->da;
69
- }
70
-
71
- /* init matrix d */
72
- for (i = 0; i <= n; i++) d[i + n * 0] = i;
73
- for (j = 0; j <= m; j++) d[0 + n * j] = j;
74
-
75
- /* start computing edit distance */
76
- for (i = 1; i <= n; i++) {
77
- int best_pos_ed_dist = m;
78
- char s_i = text[i - 1];
79
- for (j = 1; j <= m; j++) {
80
- if (s_i != target[j-1]) {
81
- d[i + n*j] = min3(d[i-1 + n*j], d[i + n*(j-1)], d[i-1 + n*(j-1)])+1;
82
- } else {
83
- d[i + n*j] = min3(d[i-1 + n*j]+1, d[i + n*(j-1)]+1, d[i-1 + n*(j-1)]);
84
- }
85
- best_pos_ed_dist = min2(best_pos_ed_dist, d[i + n*j]);
41
+ const int m = (int)strlen(target);
42
+ const int n = fuzq->text_len;
43
+
44
+ if (n == 0) {
45
+ /* we don't have anything to compare. That means if we just add
46
+ * the letters for m we get the new word */
47
+ return fuzq->pre_len == 0 ? 0.0f : 1.0f - ((float) m / fuzq->pre_len);
48
+ }
49
+ else if (m == 0) {
50
+ return fuzq->pre_len == 0 ? 0.0f : 1.0f - ((float) n / fuzq->pre_len);
86
51
  }
87
- //printf("(bped = %d, i = %d, md = %d)", best_pos_ed_dist, i, max_distance);
88
-
89
- /* After calculating row i, the best possible edit distance can be found
90
- * by found by finding the smallest value in a given column. If the
91
- * best_pos_ed_dist is greater than the max distance, abort.
92
- */
93
-
94
- if ((i > max_distance) && (best_pos_ed_dist > max_distance)) {
95
- /* equal is okay, but not greater
96
- * the closest the target can be to the text is just too far away.
97
- * this target is leaving the party early. */
98
- return 0.0f;
52
+ else {
53
+ int i, j, prune;
54
+ int *d_curr, *d_prev;
55
+ const char *text = fuzq->text;
56
+ const int max_distance = fuzq_get_max_distance(fuzq, m);
57
+
58
+ /*
59
+ printf("n%dm%dmd%ddiff%d<%s><%s>\n", n, m, max_distance, m-n,
60
+ fuzq->text, target);
61
+ */
62
+ if (max_distance < ((m > n) ? (m-n) : (n-m))) { /* abs */
63
+ /* Just adding the characters of m to n or vice-versa results in
64
+ * too many edits for example "pre" length is 3 and "prefixes"
65
+ * length is 8. We can see that given this optimal circumstance,
66
+ * the edit distance cannot be less than 5 which is 8-3 or more
67
+ * precisesly Math.abs(3-8). If our maximum edit distance is 4,
68
+ * then we can discard this word without looking at it. */
69
+ return 0.0f;
70
+ }
71
+
72
+ d_curr = fuzq->da;
73
+ d_prev = d_curr + n + 1;
74
+
75
+ /* init array */
76
+ for (j = 0; j <= n; j++) {
77
+ d_curr[j] = j;
78
+ }
79
+
80
+ /* start computing edit distance */
81
+ for (i = 0; i < m;) {
82
+ char s_i = target[i];
83
+ /* swap d_current into d_prev */
84
+ int *d_tmp = d_prev;
85
+ d_prev = d_curr;
86
+ d_curr = d_tmp;
87
+ prune = (d_curr[0] = ++i) > max_distance;
88
+
89
+ for (j = 0; j < n; j++) {
90
+ d_curr[j + 1] = (s_i == text[j])
91
+ ? min3(d_prev[j + 1] + 1, d_curr[j] + 1, d_prev[j])
92
+ : min3(d_prev[j + 1], d_curr[j], d_prev[j]) + 1;
93
+ if (prune && d_curr[j + 1] <= max_distance) {
94
+ prune = false;
95
+ }
96
+ }
97
+ if (prune) {
98
+ return 0.0f;
99
+ }
100
+ }
101
+
102
+ /*
103
+ printf("<%f, d_curr[n] = %d min_len = %d>",
104
+ 1.0f - ((float)d_curr[m] / (float) (fuzq->pre_len + min2(n, m))),
105
+ d_curr[m], fuzq->pre_len + min2(n, m));
106
+ */
107
+
108
+ /* this will return less than 0.0 when the edit distance is greater
109
+ * than the number of characters in the shorter word. but this was
110
+ * the formula that was previously used in FuzzyTermEnum, so it has
111
+ * not been changed (even though min_sim must be greater than 0.0) */
112
+ return 1.0f - ((float)d_curr[n] / (float) (fuzq->pre_len + min2(n, m)));
99
113
  }
100
- }
101
- //printf("<%f, d[n + m*m] = %d min_len = %d>", 1.0f - ((float)d[n + m*m] / (float) (fuzq->pre_len + min(n, m))), d[n + m*m], fuzq->pre_len + min(n, m));
102
-
103
- /* this will return less than 0.0 when the edit distance is greater than the
104
- * number of characters in the shorter word. but this was the formula that
105
- * was previously used in FuzzyTermEnum, so it has not been changed (even
106
- * though min_sim must be greater than 0.0) */
107
- return 1.0f - ((float)d[n + n*m] / (float) (fuzq->pre_len + min2(n, m)));
108
114
  }
109
115
 
110
116
  /****************************************************************************
@@ -113,192 +119,150 @@ float fuzq_score(FuzzyQuery *fuzq, char *target)
113
119
  *
114
120
  ****************************************************************************/
115
121
 
116
- char *fuzq_to_s(Query *self, char *field)
117
- {
118
- char *buffer, *bptr;
119
- FuzzyQuery *fuzq = (FuzzyQuery *)self->data;
120
- Term *term = fuzq->term;
121
- int tlen = (int)strlen(term->text);
122
- int flen = (int)strlen(term->field);
123
- bptr = buffer = ALLOC_N(char, tlen + flen + 35);
124
-
125
- if (strcmp(term->field, field) != 0) {
126
- sprintf(bptr, "%s:", term->field);
127
- bptr += strlen(term->field) + 1;
128
- }
129
- sprintf(bptr, "%s~", term->text);
130
- bptr += strlen(bptr);
131
- if (fuzq->min_sim != 0.5) {
132
- dbl_to_s(bptr, fuzq->min_sim);
133
- bptr += strlen(bptr);
134
- }
135
- if (self->boost != 1.0) {
136
- *bptr = '^';
137
- dbl_to_s(++bptr, self->boost);
138
- }
139
- return buffer;
140
- }
122
+ #define FzQ(query) ((FuzzyQuery *)(query))
141
123
 
142
- typedef struct ScoredTerm {
143
- Term *term;
144
- float score;
145
- } ScoredTerm;
146
-
147
- bool scored_term_less_than(void *p1, void *p2)
124
+ static char *fuzq_to_s(Query *self, const char *curr_field)
148
125
  {
149
- ScoredTerm *st1 = (ScoredTerm *)p1;
150
- ScoredTerm *st2 = (ScoredTerm *)p2;
126
+ char *buffer, *bptr;
127
+ char *term = FzQ(self)->term;
128
+ char *field = FzQ(self)->field;
129
+ int tlen = (int)strlen(term);
130
+ int flen = (int)strlen(field);
131
+ bptr = buffer = ALLOC_N(char, tlen + flen + 70);
132
+
133
+ if (strcmp(curr_field, field) != 0) {
134
+ sprintf(bptr, "%s:", field);
135
+ bptr += flen + 1;
136
+ }
151
137
 
152
- if (st1->score == st2->score)
153
- return (strcmp(st1->term->text, st2->term->text) < 0);
138
+ sprintf(bptr, "%s~", term);
139
+ bptr += tlen + 1;
140
+ if (FzQ(self)->min_sim != 0.5) {
141
+ dbl_to_s(bptr, FzQ(self)->min_sim);
142
+ bptr += strlen(bptr);
143
+ }
154
144
 
155
- return (st1->score < st2->score);
156
- }
145
+ if (self->boost != 1.0) {
146
+ *bptr = '^';
147
+ dbl_to_s(++bptr, self->boost);
148
+ }
157
149
 
158
- void scored_term_destroy(ScoredTerm *self)
159
- {
160
- term_destroy(self->term);
161
- free(self);
150
+ return buffer;
162
151
  }
163
152
 
164
- ScoredTerm *scored_term_create(Term *term, float score)
153
+ static Query *fuzq_rewrite(Query *self, IndexReader *ir)
165
154
  {
166
- ScoredTerm *self = ALLOC(ScoredTerm);
167
- self->term = term;
168
- self->score = score;
169
- return self;
170
- }
155
+ Query *q;
156
+ FuzzyQuery *fuzq = FzQ(self);
171
157
 
172
- Query *fuzq_rewrite(Query *self, IndexReader *ir)
173
- {
174
- Query *q;
175
- Query *tq;
176
- FuzzyQuery *fuzq = (FuzzyQuery *)self->data;
177
-
178
- Term *term = fuzq->term;
179
- char *text = term->text;
180
- char *field = term->field;
181
- Term prefix_term;
182
- prefix_term.field = field;
183
- if (fuzq->pre_len >= (int)strlen(text)) {
184
- q = tq_create(term_clone(term));
185
- } else {
186
- PriorityQueue *term_pq;
187
- TermEnum *te;
188
- Term prefix_term;
189
- char *prefix = NULL;
190
- int pre_len = fuzq->pre_len;
191
- ScoredTerm *scored_term;
192
-
193
- q = bq_create(true);
194
-
195
- term_pq = pq_create(((BooleanQuery *)q->data)->max_clause_cnt,
196
- &scored_term_less_than);
197
- term_pq->free_elem = (free_ft)&scored_term_destroy;
198
-
199
- prefix_term.field = field;
200
- prefix_term.text = (char *)EMPTY_STRING;
201
- if (pre_len >= 0) {
202
- prefix = ALLOC_N(char, pre_len + 1);
203
- strncpy(prefix, text, pre_len);
204
- prefix_term.text = prefix;
205
- prefix_term.text[pre_len] = '\0';
158
+ const char *term = fuzq->term;
159
+ const char *field = fuzq->field;
160
+ const int field_num = fis_get_field_num(ir->fis, field);
161
+
162
+ if (field_num < 0) {
163
+ q = bq_new(true);
206
164
  }
207
- te = ir->terms_from(ir, &prefix_term);
208
-
209
- fuzq->scale_factor = (float)(1.0 / (1.0 - fuzq->min_sim));
210
- fuzq->text = fuzq->term->text + pre_len;
211
- fuzq->text_len = (int)strlen(fuzq->text);
212
- fuzq_initialize_max_distances(fuzq);
213
-
214
- if (te) {
215
- TermBuffer *tb = te->tb_curr;
216
- float score = 0.0, min_score = fuzq->min_sim;
217
-
218
- TRY
219
- do {
220
- if (strcmp(tb->field, field) != 0 ||
221
- (prefix && strncmp(tb->text, prefix, pre_len) != 0))
222
- break;
223
-
224
- score = fuzq_score(fuzq, tb->text + pre_len);
225
- //printf("%s:%s:%f\n", tb->text, fuzq->text, score);
226
-
227
- if (score > min_score) {
228
- pq_insert(term_pq, scored_term_create(tb_get_term(tb), score));
229
- if (pq_full(term_pq))
230
- min_score = ((ScoredTerm *)pq_top(term_pq))->score;
231
- }
232
- } while ((tb = te->next(te)) != NULL);
233
- XFINALLY
234
- te->close(te);
235
- XENDTRY
165
+ else if (fuzq->pre_len >= (int)strlen(term)) {
166
+ q = tq_new(field, term);
236
167
  }
237
- free(prefix);
238
-
239
- while ((scored_term = pq_pop(term_pq)) != NULL) {
240
- tq = tq_create(scored_term->term); /* found match */
241
- tq->boost = self->boost; /* set boost */
242
- bq_add_query(q, tq, BC_SHOULD); /* add query */
243
- free(scored_term); /* no need to free the term as it's in the query */
168
+ else {
169
+ TermEnum *te;
170
+ char *prefix = NULL;
171
+ int pre_len = fuzq->pre_len;
172
+
173
+ q = multi_tq_new_conf(fuzq->field, MTQMaxTerms(self), fuzq->min_sim);
174
+
175
+ if (pre_len > 0) {
176
+ prefix = ALLOC_N(char, pre_len + 1);
177
+ strncpy(prefix, term, pre_len);
178
+ prefix[pre_len] = '\0';
179
+ te = ir->terms_from(ir, field_num, prefix);
180
+ }
181
+ else {
182
+ te = ir->terms(ir, field_num);
183
+ }
184
+
185
+ fuzq->scale_factor = (float)(1.0 / (1.0 - fuzq->min_sim));
186
+ fuzq->text = term + pre_len;
187
+ fuzq->text_len = (int)strlen(fuzq->text);
188
+ fuzq->da = REALLOC_N(fuzq->da, int, fuzq->text_len * 2 + 2);
189
+ fuzq_initialize_max_distances(fuzq);
190
+
191
+ if (te) {
192
+ const char *curr_term = te->curr_term;
193
+ const char *curr_suffix = curr_term + pre_len;
194
+ float score = 0.0;
195
+
196
+
197
+ do {
198
+ if ((prefix && strncmp(curr_term, prefix, pre_len) != 0)) {
199
+ break;
200
+ }
201
+
202
+ score = fuzq_score(fuzq, curr_suffix);
203
+ /*
204
+ printf("%s:%s:%f < %f\n", curr_term, term, score, min_score);
205
+ */
206
+ multi_tq_add_term_boost(q, curr_term, score);
207
+
208
+ } while (te->next(te) != NULL);
209
+
210
+ te->close(te);
211
+ }
212
+ free(prefix);
244
213
  }
245
- pq_destroy(term_pq);
246
- }
247
214
 
248
- return q;
215
+ return q;
249
216
  }
250
217
 
251
- void fuzq_destroy(Query *self)
218
+ static void fuzq_destroy(Query *self)
252
219
  {
253
- FuzzyQuery *fuzq = (FuzzyQuery *)self->data;
254
- if (self->destroy_all) term_destroy((Term *)fuzq->term);
255
- free(fuzq->da);
256
- free(fuzq);
257
- q_destroy_i(self);
220
+ free(FzQ(self)->term);
221
+ free(FzQ(self)->field);
222
+ free(FzQ(self)->da);
223
+ q_destroy_i(self);
258
224
  }
259
225
 
260
- static uint fuzq_hash(Query *self)
226
+ static ulong fuzq_hash(Query *self)
261
227
  {
262
- FuzzyQuery *fq = (FuzzyQuery *)self->data;
263
- return term_hash(fq->term) ^ *((int *)&fq->min_sim) ^ fq->pre_len;
228
+ return str_hash(FzQ(self)->term) ^ str_hash(FzQ(self)->field)
229
+ ^ float2int(FzQ(self)->min_sim) ^ FzQ(self)->pre_len;
264
230
  }
265
231
 
266
232
  static int fuzq_eq(Query *self, Query *o)
267
233
  {
268
- FuzzyQuery *fq1 = (FuzzyQuery *)self->data;
269
- FuzzyQuery *fq2 = (FuzzyQuery *)o->data;
270
- return term_eq(fq1->term, fq2->term) &&
271
- (fq1->pre_len == fq2->pre_len) &&
272
- (fq1->min_sim == fq2->min_sim);
234
+ FuzzyQuery *fq1 = FzQ(self);
235
+ FuzzyQuery *fq2 = FzQ(o);
236
+
237
+ return (strcmp(fq1->term, fq2->term) == 0)
238
+ && (strcmp(fq1->field, fq2->field) == 0)
239
+ && (fq1->pre_len == fq2->pre_len)
240
+ && (fq1->min_sim == fq2->min_sim);
273
241
  }
274
242
 
275
- Query *fuzq_create(Term *term)
243
+ Query *fuzq_new_conf(const char *field, const char *term,
244
+ float min_sim, int pre_len, int max_terms)
276
245
  {
277
- Query *self = q_create();
278
-
279
- FuzzyQuery *fq = ALLOC(FuzzyQuery);
280
- ZEROSET(fq, FuzzyQuery, 1);
281
- fq->term = term;
282
- fq->pre_len = DEF_PRE_LEN;
283
- fq->min_sim = DEF_MIN_SIM;
284
- self->data = fq;
285
-
286
- self->type = FUZZY_QUERY;
287
- self->to_s = &fuzq_to_s;
288
- self->hash = &fuzq_hash;
289
- self->eq = &fuzq_eq;
290
- self->rewrite = &fuzq_rewrite;
291
- self->destroy_i = &fuzq_destroy;
292
- self->create_weight_i = &q_create_weight_unsup;
293
-
294
- return self;
246
+ Query *self = q_new(FuzzyQuery);
247
+
248
+ FzQ(self)->field = estrdup(field);
249
+ FzQ(self)->term = estrdup(term);
250
+ FzQ(self)->pre_len = pre_len ? pre_len : DEF_PRE_LEN;
251
+ FzQ(self)->min_sim = min_sim ? min_sim : DEF_MIN_SIM;
252
+ MTQMaxTerms(self) = max_terms ? max_terms : DEF_MAX_TERMS;
253
+
254
+ self->type = FUZZY_QUERY;
255
+ self->to_s = &fuzq_to_s;
256
+ self->hash = &fuzq_hash;
257
+ self->eq = &fuzq_eq;
258
+ self->rewrite = &fuzq_rewrite;
259
+ self->destroy_i = &fuzq_destroy;
260
+ self->create_weight_i = &q_create_weight_unsup;
261
+
262
+ return self;
295
263
  }
296
264
 
297
- Query *fuzq_create_mp(Term *term, float min_sim, int pre_len)
265
+ Query *fuzq_new(const char *field, const char *term)
298
266
  {
299
- Query *self = fuzq_create(term);
300
- FuzzyQuery *fuzq = (FuzzyQuery *)self->data;
301
- if (pre_len) fuzq->pre_len = pre_len;
302
- if (min_sim) fuzq->min_sim = min_sim;
303
- return self;
267
+ return fuzq_new_conf(term, field, 0.0f, 0, 0);
304
268
  }