ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295)
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
@@ -1,91 +0,0 @@
1
- module Ferret::Search
2
- # Expert: Common scoring functionality for different types of queries.
3
- #
4
- # A +Scorer+ either iterates over documents matching a query, or provides an
5
- # explanation of the score for a query for a given document.
6
- #
7
- # Document scores are computed using a given +Similarity+ implementation.
8
- class Scorer
9
- attr_reader :similarity
10
- MAX_DOCS = 0x7FFFFFFF
11
-
12
- # Constructs a Scorer.
13
- # similarity:: The +Similarity+ implementation used by this scorer.
14
- def initialize(similarity)
15
- @similarity = similarity
16
- end
17
-
18
- # Expert: Iterates over matching all documents, yielding the document
19
- # number and the score.
20
- #
21
- # returns:: true if more matching documents may remain.
22
- def each_hit() # :yields: doc, score
23
- while next?
24
- yield(doc(), score())
25
- end
26
- end
27
-
28
- # Expert: Iterates over matching documents in a range.
29
- #
30
- # max:: Do not score documents past this. Default will search all documents
31
- # avaliable.
32
- # returns:: true if more matching documents may remain.
33
- def each_hit_up_to(max = MAX_DOCS) # :yields: doc, score
34
- while (next? and doc() < max)
35
- yield(doc(), score())
36
- end
37
- return doc() < max
38
- end
39
-
40
- # Advances to the next document matching the query.
41
- # returns:: true iff there is another document matching the query.
42
- # When this method is used the #explain(int) method should not be used.
43
- def next?()
44
- raise NotImplementedError
45
- end
46
-
47
- # Returns the current document number matching the query.
48
- # Initially invalid, until #next?() is called the first time.
49
- def doc()
50
- raise NotImplementedError
51
- end
52
-
53
- # Returns the score for the current document matching the query.
54
- # Initially invalid, until #next?() is called the first time.
55
- def score()
56
- raise NotImplementedError
57
- end
58
-
59
- # Skips to the first match beyond the current whose document number is
60
- # greater than or equal to a given target.
61
- #
62
- # When this method is used the #explain(int) method should not be used.
63
- #
64
- # target:: The target document number.
65
- # returns:: true iff there is such a match.
66
- #
67
- # Behaves as if written:
68
- #
69
- # def skip_to(target)
70
- # begin
71
- # return false if not next?()
72
- # end while (target > doc())
73
- # return true
74
- # end
75
- #
76
- # Most implementations are considerably more efficient than that.
77
- def skip_to(target)
78
- raise NotImplementedError
79
- end
80
-
81
- # Returns an explanation of the score for a document.
82
- #
83
- # When this method is used, the #next?(), #skip_to(int) and
84
- # #score(HitCollector) methods should not be used.
85
- #
86
- # doc:: The document number for the explanation.
87
- def explain(doc)
88
- raise NotImplementedError
89
- end
90
- end
91
- end
@@ -1,278 +0,0 @@
1
- module Ferret::Search
2
- # Expert: Scoring API.
3
- # Subclasses implement search scoring.
4
- #
5
- # The score of query *q* for document *d* is defined
6
- # in terms of these methods as follows:
7
- #
8
- # <table cellpadding="0" cellspacing="0" border="0">
9
- # <tr>
10
- # <td valign="middle" align="right" rowspan="2">score(q,d) =<br></td>
11
- # <td valign="middle" align="center">
12
- # <big><big><big><big><big>&Sigma</big></big></big></big></big></td>
13
- # <td valign="middle"><small>
14
- # #tf(int) tf(t in d)#
15
- # #idf_term(Term,Searcher) idf(t)#
16
- # Field#getBoost getBoost(t.field in d)#
17
- # #length_norm(String,int) length_norm(t.field in d)
18
- # </small></td>
19
- # <td valign="middle" rowspan="2">&nbsp*
20
- # #coord(int,int) coord(q,d)#
21
- # #query_norm(float) query_norm(q)
22
- # </td>
23
- # </tr>
24
- # <tr>
25
- # <td valign="top" align="right">
26
- # <small>t in q</small>
27
- # </td>
28
- # </tr>
29
- # </table>
30
- #
31
- # See #set_default
32
- # See IndexWriter#set_similarity
33
- # See Searcher#set_similarity
34
- class Similarity
35
-
36
- def Similarity.byte_to_float(b)
37
- if (b == 0)
38
- return 0.0
39
- end
40
- mantissa = b & 0x07 # 0x07 = 7 = 0b00000111
41
- exponent = (b >> 3) & 0x1F # 0x1f = 31 = 0b00011111
42
- return [0,0,(mantissa << 5),(exponent+48)].pack("cccc").unpack("e")[0]
43
- end
44
-
45
- def Similarity.float_to_byte(f)
46
- if (f <= 0.0) then return 0 end
47
-
48
- bits = [f].pack("e").unpack("cccc")
49
- mantissa = (bits[2] & 0xEf) >> 5
50
- exponent = (bits[3] - 48)
51
-
52
- if (exponent > 0x1f)
53
- exponent = 0x1f # 0x1f = 31 = 0b00011111
54
- mantissa = 0x07 # 0x07 = 7 = 0b00000111
55
- end
56
-
57
- if (exponent < 0)
58
- exponent = 0
59
- mantissa = 1
60
- end
61
-
62
- return ((exponent<<3) | mantissa)
63
- end
64
-
65
- # Cache of decoded bytes
66
- NORM_TABLE = Array.new(256) { |i| Similarity.byte_to_float(i) }
67
-
68
- # Decodes a normalization factor stored in an index.
69
- # See Similarity#encode_norm(float)
70
- def Similarity.decode_norm(b)
71
- return NORM_TABLE[b & 0xFF]
72
- end
73
-
74
- # Decodes a normalization factor stored in an index.
75
- # See Similarity#encode_norm(float)
76
- def decode_norm(b)
77
- return self.class.decode_norm(b)
78
- end
79
-
80
- # Computes the normalization value for a field given the total number of
81
- # terms contained in a field. These values, together with field boosts, are
82
- # stored in an index and multipled into scores for hits on each field by the
83
- # search code.
84
- #
85
- # Matches in longer fields are less precise, so implemenations of this
86
- # method usually return smaller values when *num_tokens* is large,
87
- # and larger values when *num_tokens* is small.
88
- #
89
- # That these values are computed under
90
- # IndexWriter#add_document and stored then using
91
- # #encode_norm(float). Thus they have limited precision, and documents
92
- # must be re-indexed if this method is altered.
93
- #
94
- # field:: the name of the field
95
- # num_tokens:: the total number of tokens contained in fields named
96
- # _field_ of _doc_.
97
- #
98
- # See Field#set_boost
99
- def length_norm
100
- raise NotImplementedError
101
- end
102
-
103
- # Computes the normalization value for a query given the sum of the squared
104
- # weights of each of the query terms. This value is then multipled into the
105
- # weight of each query term.
106
- #
107
- # This does not affect ranking, but rather just attempts to make scores
108
- # from different queries comparable.
109
- #
110
- # sum_of_squared_weights:: the sum of the squares of query term weights
111
- # Return:: a normalization factor for query weights
112
- def query_norm
113
- raise NotImplementedError
114
- end
115
-
116
- # Encodes a normalization factor for storage in an index.
117
- #
118
- # The encoding uses a five-bit exponent and three-bit mantissa, thus
119
- # representing values from around 7x10^9 to 2x10^-9 with about one
120
- # significant decimal digit of accuracy. Zero is also represented.
121
- # Negative numbers are rounded up to zero. Values too large to represent
122
- # are rounded down to the largest representable value. Positive values too
123
- # small to represent are rounded up to the smallest positive representable
124
- # value.
125
- #
126
- # See Field#boost=
127
- def Similarity.encode_norm(f)
128
- return Similarity.float_to_byte(f)
129
- end
130
-
131
- def encode_norm(f)
132
- return self.class.float_to_byte(f)
133
- end
134
-
135
- # Computes a score factor based on a term or phrase's frequency in a
136
- # document. This value is multiplied by the #idf_term(Term, Searcher)
137
- # factor for each term in the query and these products are then summed to
138
- # form the initial score for a document.
139
- #
140
- # Terms and phrases repeated in a document indicate the topic of the
141
- # document, so implementations of this method usually return larger values
142
- # when _freq_ is large, and smaller values when _freq_
143
- # is small.
144
- #
145
- # The default implementation calls #tf(float)
146
- #
147
- # freq:: the frequency of a term within a document
148
- # Return:: a score factor based on a term's within-document frequency
149
- def tf
150
- raise NotImplementedError
151
- end
152
-
153
- # Computes the amount of a sloppy phrase match, based on an edit distance.
154
- # This value is summed for each sloppy phrase match in a document to form
155
- # the frequency that is passed to #tf(float).
156
- #
157
- # A phrase match with a small edit distance to a document passage more
158
- # closely matches the document, so implementations of this method usually
159
- # return larger values when the edit distance is small and smaller values
160
- # when it is large.
161
- #
162
- # See PhraseQuery#slop(int)
163
- # distance:: the edit distance of this sloppy phrase match
164
- # Return:: the frequency increment for this match
165
- def sloppy_freq
166
- raise NotImplementedError
167
- end
168
-
169
- # Computes a score factor for a simple term.
170
- #
171
- # The default implementation is:
172
- # return idf(searcher.doc_freq(term), searcher.max_doc())
173
- #
174
- # Note that Searcher#max_doc() is used instead of
175
- # IndexReader#num_docs() because it is proportional to
176
- # Searcher#doc_freq(Term) , i.e., when one is inaccurate,
177
- # so is the other, and in the same direction.
178
- #
179
- # term:: the term in question
180
- # searcher:: the document collection being searched
181
- # Return:: a score factor for the term
182
- def idf_term(term, searcher)
183
- return idf(searcher.doc_freq(term), searcher.max_doc())
184
- end
185
-
186
- # Computes a score factor for a phrase.
187
- #
188
- # The default implementation sums the #idf(Term,Searcher) factor
189
- # for each term in the phrase.
190
- #
191
- # terms:: the terms in the phrase
192
- # searcher:: the document collection being searched
193
- # Return:: a score factor for the phrase
194
- def idf_phrase(terms, searcher)
195
- idf = 0.0
196
- terms.each { |term| idf += idf_term(term, searcher) }
197
- return idf
198
- end
199
-
200
- # Computes a score factor based on a term's document frequency (the number
201
- # of documents which contain the term). This value is multiplied by the
202
- # #tf(int) factor for each term in the query and these products are
203
- # then summed to form the initial score for a document.
204
- #
205
- # Terms that occur in fewer documents are better indicators of topic, so
206
- # implemenations of this method usually return larger values for rare terms,
207
- # and smaller values for common terms.
208
- #
209
- # doc_freq:: the number of documents which contain the term
210
- # num_docs:: the total number of documents in the collection
211
- # Return:: a score factor based on the term's document frequency
212
- def idf
213
- raise NotImplementedError
214
- end
215
-
216
- # Computes a score factor based on the fraction of all query terms that a
217
- # document contains. This value is multiplied into scores.
218
- #
219
- # The presence of a large portion of the query terms indicates a better
220
- # match with the query, so implemenations of this method usually return
221
- # larger values when the ratio between these parameters is large and smaller
222
- # values when the ratio between them is small.
223
- #
224
- # overlap:: the number of query terms matched in the document
225
- # max_overlap:: the total number of terms in the query
226
- # Return:: a score factor based on term overlap with the query
227
- def coord
228
- raise NotImplementedError
229
- end
230
- end
231
-
232
- # Expert: Default scoring implementation.
233
- class DefaultSimilarity < Similarity
234
- # See source
235
- def length_norm(field, num_terms)
236
- return 1.0 / Math.sqrt(num_terms)
237
- end
238
-
239
- # See source
240
- def query_norm(sum_of_squared_weights)
241
- return 1.0 / Math.sqrt(sum_of_squared_weights)
242
- end
243
-
244
- # See source
245
- def tf(freq)
246
- return Math.sqrt(freq)
247
- end
248
-
249
- # See source
250
- def sloppy_freq(distance)
251
- return 1.0 / (distance + 1)
252
- end
253
-
254
- # See source
255
- def idf(doc_freq, num_docs)
256
- return 0.0 if num_docs == 0
257
- return Math.log(num_docs.to_f/(doc_freq+1)) + 1.0
258
- end
259
-
260
- # See source
261
- def coord(overlap, max_overlap)
262
- return overlap.to_f / max_overlap
263
- end
264
- end
265
-
266
- class Similarity
267
- # The Similarity implementation used by default.
268
- @@default = DefaultSimilarity.new()
269
-
270
- def Similarity.default
271
- return @@default
272
- end
273
-
274
- def Similarity.default=(default)
275
- @@default = default
276
- end
277
- end
278
- end
@@ -1,47 +0,0 @@
1
- module Ferret::Search
2
- class SloppyPhraseScorer < PhraseScorer
3
-
4
- def initialize(weight, tps, positions, similarity, slop, norms)
5
- super(weight, tps, positions, similarity, norms)
6
- @slop = slop
7
- end
8
-
9
- def phrase_freq()
10
- @pq.clear()
11
- last_pos = 0
12
- each do |pp|
13
- pp.first_position()
14
- last_pos = pp.position if (pp.position > last_pos)
15
- @pq.push(pp) # build pq from list
16
- end
17
-
18
- freq = 0.0
19
- done = false
20
- begin
21
- pp = @pq.pop()
22
- pos = start = pp.position
23
- next_pos = @pq.top().position
24
- while pos <= next_pos
25
- start = pos # advance pp to min window
26
- if not pp.next_position()
27
- done = true # ran out of a term -- done
28
- break
29
- end
30
- pos = pp.position
31
- end
32
-
33
- match_length = last_pos - start
34
- if (match_length <= @slop)
35
- freq += @similarity.sloppy_freq(match_length) # score match
36
- end
37
-
38
- if (pp.position > last_pos)
39
- last_pos = pp.position
40
- end
41
- @pq.push(pp) # restore pq
42
- end while (!done)
43
-
44
- return freq
45
- end
46
- end
47
- end
@@ -1,112 +0,0 @@
1
- module Ferret::Search
2
- # Encapsulates sort criteria for returned hits.
3
- #
4
- # The fields used to determine sort order must be carefully chosen.
5
- # Documents must contain a single term in such a field, and the value of the
6
- # term should indicate the document's relative position in a given sort
7
- # order. The field must be indexed, but should not be tokenized, and does
8
- # not need to be stored (unless you happen to want it back with the rest of
9
- # your document data). In other words:
10
- #
11
- # document << Field.new("by_number",
12
- # x.to_s,
13
- # Field::Store::NO,
14
- # Field::Index::UN_TOKENIZED))
15
- #
16
- #
17
- # === Valid Types of Values
18
- #
19
- # There are three possible kinds of term values which may be put into
20
- # sorting fields: Integers, Floats, or Strings. Unless SortField objects
21
- # are specified, the type of value in the field is determined by parsing the
22
- # first term in the field.
23
- #
24
- # Integer term values should contain only digits and an optional preceeding
25
- # negative sign. Values must be base 10. Documents which should appear
26
- # first in the sort should have low value integers, later documents high
27
- # values (i.e. the documents should be numbered +1..n+ where +1+ is the
28
- # first and +n+ the last).
29
- #
30
- # Float term values should conform to values accepted by String#to_f.
31
- # Documents which should appear first in the sort should have low values,
32
- # later documents high values.
33
- #
34
- # String term values can contain any valid String, but should not be
35
- # tokenized. The values are sorted according to their Comparable natural
36
- # order. Note that using this type of term value has higher memory
37
- # requirements than the other two types.
38
- #
39
- # === Object Reuse
40
- #
41
- # One of these objects can be used multiple times and the sort order changed
42
- # between usages.
43
- #
44
- # This class is thread safe.
45
- #
46
- # === Memory Usage
47
- #
48
- # Sorting uses caches of term values maintained by the internal HitQueue(s).
49
- # The cache is static and contains an integer or float array of length
50
- # +IndexReader#max_doc+ for each field name for which a sort is performed.
51
- # In other words, the size of the cache in bytes is:
52
- #
53
- # 4 * IndexReader#max_doc * (# of different fields actually used to sort)
54
- #
55
- # For String fields, the cache is larger: in addition to the above array,
56
- # the value of every term in the field is kept in memory. If there are many
57
- # unique terms in the field, this could be quite large.
58
- #
59
- # Note that the size of the cache is not affected by how many fields are in
60
- # the index and _might_ be used to sort - only by the ones actually used to
61
- # sort a result set.
62
- #
63
- # The cache is cleared each time a new +IndexReader+ is passed in, or if the
64
- # value returned by +max_doc()+ changes for the current IndexReader. This
65
- # class is not set up to be able to efficiently sort hits from more than one
66
- # index simultaneously.
67
- class Sort
68
-
69
- attr_accessor :fields
70
-
71
- # Sorts by computed relevance. You can pass a string representing the name
72
- # of the field you want to sort on, a SortField, or an array of either
73
- # (but not a mixed array). If you pass a string or and array of strings
74
- # you can also pass a reverse flag. If you pass a SortField the reverse is
75
- # handled by it.
76
- #
77
- # fields:: The fields you want to sort on. See also SortField
78
- # reverse:: pass true if you want the sort order to be reversed. Only
79
- # works if you pass the field names.
80
- def initialize(fields = [SortField::FIELD_SCORE, SortField::FIELD_DOC],
81
- reverse = false)
82
- fields = [fields] unless fields.is_a?(Array)
83
- @fields = fields
84
- fields = fields.map {|field| field.is_a?(Symbol) ? field.to_s : field}
85
- if fields[0].is_a?(String)
86
- @fields = fields.map do |field|
87
- if (field.is_a?(String))
88
- next SortField.new(field, {:sort_type => SortField::SortType::AUTO,
89
- :reverse => reverse})
90
- else
91
- next field
92
- end
93
- end
94
- end
95
- doc_sort_added = false
96
- @fields.each {|f| doc_sort_added = true if f == SortField::FIELD_DOC }
97
- @fields << SortField::FIELD_DOC if not doc_sort_added
98
- end
99
-
100
- # Represents sorting by computed relevance. Using this sort criteria returns
101
- # the same results as calling Searcher#search(Query) Searcher#search()
102
- # without a sort criteria, only with slightly more overhead.
103
- RELEVANCE = Sort.new()
104
-
105
- # Represents sorting by index order.
106
- INDEX_ORDER = Sort.new(SortField::FIELD_DOC)
107
-
108
- def to_s()
109
- return "Sort[" + @fields.map {|field| "#{field}"}.join(", ") + "]"
110
- end
111
- end
112
- end