ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295)
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
@@ -1,32 +0,0 @@
1
- module Ferret::Search
2
# PhraseScorer variant whose phrase frequency counts the places where the
# first and last phrase positions line up ("all equal: a match").
class ExactPhraseScorer < PhraseScorer

  # Hands every argument, unchanged, to PhraseScorer#initialize.
  def initialize(weight, tps, positions, similarity, norms)
    super
  end

  # Returns the number of matches found while walking the phrase positions.
  def phrase_freq
    # Rewind each phrase position and push it onto the priority queue, then
    # rebuild the list from the queue so that it comes back sorted.
    each do |phrase_pos|
      phrase_pos.first_position
      @pq.push(phrase_pos)
    end
    pq_to_list

    matches = 0
    loop do
      # Scan forward in @first until it is no longer behind @last; running
      # out of positions ends the count immediately.
      while @first.position < @last.position
        loop do
          return matches unless @first.next_position
          break unless @first.position < @last.position
        end
        first_to_last
      end

      matches += 1 # all equal: a match
      break unless @last.next_position
    end

    matches
  end
end
32
- end
@@ -1,41 +0,0 @@
1
- module Ferret::Search
2
- # Expert: Describes the score computation for document and query.
3
require 'cgi/escape'

# Expert: describes the score computation for a document and query as a tree:
# each node has a value, a description and a list of child explanations.
class Explanation
  attr_accessor :value, :description, :details

  # value::       the value being explained (may be nil).
  # description:: text describing how the value was obtained (may be nil).
  def initialize(value = nil, description = nil)
    @value = value
    @description = description
    @details = []
  end

  # Appends a child explanation. Returns the details array.
  def <<(detail)
    @details << detail
  end

  # Render an explanation as text, one line per node, with each nesting
  # level indented by one more space than its parent.
  #
  # depth:: indentation of this node.
  def to_s(depth = 0)
    buffer = " " * depth
    buffer << "#{@value} = #{@description}\n"
    @details.each { |detail| buffer << detail.to_s(depth + 1) }
    buffer
  end

  # Render an explanation as a nested HTML list. The value and description
  # are HTML-escaped so that characters such as <, > and & in them cannot
  # break or inject markup.
  def to_html
    summary = CGI.escapeHTML("#{@value} = #{@description}")
    nested = @details.map { |detail| detail.to_html }.join
    "<ul>\n<li>#{summary}</li>\n#{nested}</ul>\n"
  end
end
41
- end
@@ -1,215 +0,0 @@
1
- module Ferret::Search
2
-
3
- # Expert: The default cache implementation, storing all values in memory.
4
- # A WeakKeyHash is used for storage.
5
# Expert: the default cache implementation, storing all values in memory.
# A WeakKeyHash keyed by reader is used for storage; each reader maps Entry
# keys to the cached value.
class FieldCache
  include Ferret::Index

  # Value returned by get_string_index. +str_map+ lists the term texts in
  # enumeration order, with slot 0 reserved as a nil placeholder; +str_index+
  # holds, per document number, the slot in +str_map+ of that document's term.
  StringIndex = Struct.new(:str_index, :str_map)

  # Expert: Every key in the internal cache is of this type.
  class Entry
    attr_reader :field, :sort_type, :comparator

    # Creates one of these objects.
    def initialize(field, sort_type, comparator = nil)
      @field = field
      @sort_type = sort_type
      @comparator = comparator
    end

    # Two entries are equal iff the other object is exactly an Entry and its
    # field, sort_type and comparator all match.
    def eql?(o)
      o.instance_of?(Entry) &&
        o.field == @field &&
        o.sort_type == @sort_type &&
        o.comparator == comparator
    end
    alias :== :eql?

    # Composes a hashcode from the field, sort_type and comparator.
    def hash
      @field.hash ^ @sort_type.hash ^ @comparator.hash
    end
  end

  INT_PARSER = lambda { |i| i.to_i }

  FLOAT_PARSER = lambda { |i| i.to_f }

  # The internal cache. Maps reader to a hash of Entry => cached value.
  @@cache = Ferret::Utils::WeakKeyHash.new

  # See if an object is in the cache. Returns nil when nothing is cached for
  # the reader.
  def self.lookup(reader, field, sort_type)
    entry = Entry.new(field, sort_type)
    @@cache.synchronize do
      reader_cache = @@cache[reader]
      return nil if reader_cache.nil?
      return reader_cache[entry]
    end
  end

  # Put an object into the cache, creating the per-reader hash on first use.
  # Returns +value+.
  def self.store(reader, field, sort_type, value)
    entry = Entry.new(field, sort_type)
    @@cache.synchronize do
      reader_cache = @@cache[reader]
      if reader_cache.nil?
        reader_cache = {}
        @@cache[reader] = reader_cache
      end
      return reader_cache[entry] = value
    end
  end

  # Checks the internal cache for an appropriate entry, and if none is found,
  # reads the terms in +field+, parses them with +sort_type.parser+ and
  # returns an array of size +reader.max_doc+ holding the value each document
  # has in the given field.
  #
  # reader::    Used to get field values.
  # field::     Which field contains the values.
  # sort_type:: The type of sort to run on the field. Holds the parser.
  # return::    The values in the given field for each document.
  # raises::    RuntimeError if the term enumeration is already exhausted.
  def self.get_index(reader, field, sort_type)
    index = lookup(reader, field, sort_type)
    return index unless index.nil?

    parser = sort_type.parser
    index = Array.new(reader.max_doc)
    if index.length > 0
      term_docs = reader.term_docs
      term_enum = reader.terms_from(Term.new(field, ""))
      begin
        if term_enum.term.nil?
          raise "no terms in field '#{field}' to sort by"
        end
        loop do
          term = term_enum.term
          # the enumeration has moved on to another field: we are done
          break if term.field != field
          termval = parser.call(term.text)
          term_docs.seek(term)
          index[term_docs.doc] = termval while term_docs.next?
          break unless term_enum.next?
        end
      ensure
        term_docs.close
        term_enum.close
      end
    end
    store(reader, field, sort_type, index)
    index
  end

  # Checks the internal cache for an appropriate entry, and if none is found
  # reads the term values in +field+ and returns an array of them in
  # enumeration order, along with an array telling which element in the term
  # array each document uses.
  #
  # reader::  Used to get field values.
  # field::   Which field contains the strings.
  # returns:: StringIndex of terms and index into the terms for each document.
  # raises::  RuntimeError if there are no terms, or more terms than
  #           documents.
  def self.get_string_index(reader, field)
    index = lookup(reader, field, SortField::SortType::STRING)
    return index unless index.nil?

    str_index = Array.new(reader.max_doc)
    str_map = Array.new(reader.max_doc + 1)
    if str_index.length > 0
      term_docs = reader.term_docs
      term_enum = reader.terms_from(Term.new(field, ""))

      # Slot 0 is the entry for documents that have no terms in this field.
      # This puts them at the top - if it is changed, FieldDocSortedHitQueue
      # needs to change as well.
      # NOTE(review): documents without a term keep a nil ordinal in
      # str_index rather than 0, so nothing here points them at slot 0 -
      # confirm how the comparator treats nil ordinals.
      str_map[0] = nil
      t = 1 # next free slot in str_map

      begin
        if term_enum.term.nil?
          raise "no terms in field #{field} to sort by"
        end
        loop do
          term = term_enum.term
          break if term.field != field

          # store term text
          # we expect that there is at most one term per document
          if t >= str_map.length
            raise "there are more terms than documents in field \"#{field}\", but it's impossible to sort on tokenized fields"
          end
          str_map[t] = term.text

          term_docs.seek(term)
          str_index[term_docs.doc] = t while term_docs.next?

          t += 1
          break unless term_enum.next?
        end
      ensure
        term_docs.close
        term_enum.close
      end

      # If there are fewer terms than documents, trim off the dead array
      # space. Only the unused tail is dropped: slot 0 must stay in place,
      # otherwise every ordinal stored in str_index would point one slot too
      # far into str_map.
      str_map = str_map[0, t] if t < str_map.length
    end
    index = StringIndex.new(str_index, str_map)
    store(reader, field, SortField::SortType::STRING, index)
    index
  end

  # Checks the internal cache for an appropriate entry, and if none is found
  # looks at the first term of +field+ to see if it is an integer, a float or
  # a string, and then calls one of the other methods in this class to get
  # the values. For string values, a StringIndex is returned. After calling
  # this method, there is an entry in the cache for both type +AUTO+ and the
  # actual found type.
  #
  # reader:: Used to get field values.
  # field::  Which field contains the values.
  # return:: Integer Array, Float Array or StringIndex.
  # raises:: RuntimeError if there are no terms or the first term found
  #          belongs to another field.
  def self.get_auto_index(reader, field)
    index = lookup(reader, field, SortField::SortType::AUTO)
    return index unless index.nil?

    term_enum = reader.terms_from(Term.new(field, ""))
    begin
      term = term_enum.term
      raise "no terms in field #{field} to sort by" if term.nil?
      unless term.field == field
        raise "field \"#{field}\" does not appear to be indexed"
      end

      termtext = term.text.strip
      # The text counts as a number only if it round-trips through the
      # numeric conversion unchanged.
      if termtext == termtext.to_i.to_s
        index = get_index(reader, field, SortField::SortType::INTEGER)
      elsif termtext == termtext.to_f.to_s || termtext == "%f" % termtext.to_f
        index = get_index(reader, field, SortField::SortType::FLOAT)
      else
        index = get_string_index(reader, field)
      end

      store(reader, field, SortField::SortType::AUTO, index) unless index.nil?
    ensure
      term_enum.close
    end
    index
  end
end
215
- end
@@ -1,31 +0,0 @@
1
- module Ferret::Search
2
- # Expert: A ScoreDoc which also contains information about
3
- # how to sort the referenced document. In addition to the
4
- # document number and score, this object contains an array
5
- # of values for the document from the field(s) used to sort.
6
- # For example, if the sort criteria was to sort by fields
7
- # "a", "b" then "c", the +fields+ object array
8
- # will have three elements, corresponding respectively to
9
- # the term values for the document in fields "a", "b" and "c".
10
- # The class of each element in the array will be either
11
- # Integer, Float or String depending on the type of values
12
- # in the terms of each field.
13
- #
14
class FieldDoc < ScoreDoc
  # Expert: the sort values for the referenced document, in the same order
  # as the sort criteria given by the Sort object. Each one is an Integer,
  # Float or String, depending on the type of values in the terms of the
  # original field.
  # See Sort
  # See Searcher#search(Query,Filter,int,Sort)
  attr_accessor :fields

  # Expert: builds a FieldDoc carrying the given sort information. +doc+ and
  # +score+ go to ScoreDoc#initialize; +fields+ is optional.
  def initialize(doc, score, fields = nil)
    super(doc, score)
    @fields = fields
  end
end
31
- end
@@ -1,184 +0,0 @@
1
- require 'monitor'
2
-
3
- module Ferret::Search
4
- # Expert: A hit queue for sorting by hits by terms in more than one field.
5
- # Uses +FieldCache+ for maintaining internal term lookup tables.
6
# Expert: A hit queue for sorting hits by terms in more than one field.
# Uses +FieldCache+ for maintaining internal term lookup tables.
class FieldSortedHitQueue < Ferret::Utils::PriorityQueue
  # Stores a comparator corresponding to each field being sorted by
  attr_accessor :comparators

  # Stores the sort criteria being used.
  attr_accessor :fields

  # Creates a hit queue sorted by the given list of fields.
  #
  # reader:: Index to use.
  # fields:: Sort fields, in priority order (highest priority first).
  #          Cannot be +nil+ or empty.
  # size::   The number of hits to retain. Must be greater than zero.
  # raises:: IOError
  def initialize(reader, fields, size)
    super(size)
    @comparators = []
    @fields = []
    fields.each do |field|
      comparator = get_cached_comparator(reader, field)
      @comparators << comparator
      # Rebuild the sort field with the comparator's sort type, which may
      # differ from the requested one (for example after AUTO detection).
      @fields << SortField.new(field.name,
                               {:sort_type => comparator.sort_type,
                                :reverse => field.reverse?})
    end

    # Stores the maximum score value encountered, for normalizing.
    # we only care about scores greater than 1.0 - if all the scores
    # are less than 1.0, we don't have to normalize.
    @max_score = 1.0
  end

  # Returns whether +sd1+ is less relevant than +sd2+.
  #
  # sd1::     ScoreDoc
  # sd2::     ScoreDoc
  # returns:: +true+ if document +sd1+ should be sorted after document +sd2+.
  def less_than(sd1, sd2)
    # keep track of maximum score
    @max_score = sd1.score if sd1.score > @max_score
    @max_score = sd2.score if sd2.score > @max_score

    # run comparators in priority order until one of them tells the hits apart
    c = 0
    @comparators.each_with_index do |comparator, i|
      c = if @fields[i].reverse?
            comparator.compare(sd2, sd1)
          else
            comparator.compare(sd1, sd2)
          end
      break unless c == 0
    end

    # avoid random sort order that could lead to duplicates
    return sd1.doc > sd2.doc if c == 0
    c > 0
  end

  # Given a FieldDoc object, stores the values used
  # to sort the given document. These values are not the raw
  # values out of the index, but the internal representation
  # of them. This is so the given search hit can be collated
  # by a MultiSearcher with other search hits.
  #
  # doc::     The FieldDoc to store sort values into.
  # returns:: The same FieldDoc passed in.
  # See Searchable#search(Weight,Filter,int,Sort)
  def fill_fields(doc)
    doc.fields = @comparators.map { |comparator| comparator.sort_value(doc) }
    doc
  end

  # Internal cache of comparators. Similar to FieldCache, only
  # caches comparators instead of term values.
  @@comparators = Ferret::Utils::WeakKeyHash.new

  # Returns a comparator if it is in the cache, otherwise nil.
  def lookup(reader, field, sort_type, comproc)
    entry = FieldCache::Entry.new(field, sort_type, comproc)
    @@comparators.synchronize do
      reader_cache = @@comparators[reader]
      return nil if reader_cache.nil?
      return reader_cache[entry]
    end
  end

  # Stores a comparator into the cache, creating the per-reader hash on first
  # use. Returns +value+.
  def store(reader, field, sort_type, comproc, value)
    entry = FieldCache::Entry.new(field, sort_type, comproc)
    @@comparators.synchronize do
      reader_cache = @@comparators[reader]
      if reader_cache.nil?
        reader_cache = {}
        @@comparators[reader] = reader_cache
      end
      return reader_cache[entry] = value
    end
  end

  # Returns the comparator to use for +field+, building and caching it when
  # it is not in the cache yet. DOC and SCORE sorts use the shared
  # ScoreDocComparator constants and are never cached.
  #
  # reader:: Index to use.
  # field::  The sort field to get a comparator for.
  def get_cached_comparator(reader, field)
    if field.sort_type == SortField::SortType::DOC
      return ScoreDocComparator::INDEX_ORDER
    end
    if field.sort_type == SortField::SortType::SCORE
      return ScoreDocComparator::RELEVANCE
    end

    comparator = lookup(reader, field.name, field.sort_type, field.comparator)
    if comparator.nil?
      comparator =
        case field.sort_type
        when SortField::SortType::AUTO
          comparator_auto(reader, field.name)
        when SortField::SortType::STRING
          comparator_string(reader, field.name)
        else
          comparator_simple(reader, field)
        end

      store(reader, field.name, field.sort_type, field.comparator, comparator)
    end
    comparator
  end

  # Returns a comparator for sorting hits according to the sort type and the
  # comparator function of the given sort field.
  #
  # reader::  Index to use.
  # field::   Lets us know which field to search and how to parse it.
  # returns:: Comparator for sorting hits.
  def comparator_simple(reader, field)
    index = FieldCache.get_index(reader, field.name, field.sort_type)
    comproc = field.comparator
    if comproc
      SpecialFieldComparator.new(index, field.sort_type, comproc)
    else
      SimpleFieldComparator.new(index, field.sort_type)
    end
  end

  # Returns a comparator for sorting hits according to a field containing
  # strings.
  #
  # reader::  Index to use.
  # field::   Field containing string values.
  # returns:: Comparator for sorting hits.
  def comparator_string(reader, field)
    index = FieldCache.get_string_index(reader, field)
    StringFieldComparator.new(index)
  end

  # Returns a comparator for sorting hits according to values in the given
  # field. FieldCache.get_auto_index decides whether the field holds integers,
  # floats or strings; the comparator is then picked from the kind of index
  # it returned.
  #
  # reader::  Index to use.
  # field::   Field containing values.
  # returns:: Comparator for sorting hits.
  # raises::  RuntimeError if the first cached value is neither an Integer
  #           nor a Float and the index is not a StringIndex.
  def comparator_auto(reader, field)
    index = FieldCache.get_auto_index(reader, field)
    if index.is_a?(FieldCache::StringIndex)
      StringFieldComparator.new(index)
    elsif index[0].is_a?(Integer)
      SimpleFieldComparator.new(index, SortField::SortType::INTEGER)
    elsif index[0].is_a?(Float)
      SimpleFieldComparator.new(index, SortField::SortType::FLOAT)
    else
      raise "unknown data type in field '#{field}'. Data = #{index[0]}"
    end
  end
end
184
- end