ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295) hide show
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
@@ -1,183 +0,0 @@
1
- module Ferret::Search
2
- # Expert: A +Scorer+ for documents matching a +Term+.
3
- class TermScorer < Scorer
4
- SCORE_CACHE_SIZE = 32
5
-
6
- # Returns the current document number matching the query.
7
- # Initially invalid, until #next() is called the first time.
8
- attr_reader :doc
9
-
10
- # Construct a +TermScorer+.
11
- # weight:: The weight of the +Term+ in the query.
12
- # td:: An iterator over the documents matching the +Term+.
13
- # similarity:: The +Similarity+ implementation to be used for score
14
- # computations.
15
- # norms:: The field norms of the document fields for the +Term+.
16
- def initialize(weight, td, similarity, norms)
17
- super(similarity)
18
-
19
- @doc = 0
20
- @docs = Array.new(SCORE_CACHE_SIZE, 0) # buffered doc numbers
21
- @freqs = Array.new(SCORE_CACHE_SIZE, 0) # buffered term freqs
22
- @pointer = @pointer_max = 0;
23
- @score_cache = Array.new(SCORE_CACHE_SIZE)
24
-
25
- @weight = weight
26
- @term_docs = td
27
- @norms = norms
28
- @weight_value = weight.value
29
-
30
- SCORE_CACHE_SIZE.times do |i|
31
- @score_cache[i] = similarity().tf(i) * @weight_value
32
- end
33
- end
34
-
35
- # Expert: Iterates over all matching documents, yielding the document
36
- # number and the score.
37
- #
38
- # returns:: true if more matching documents may remain.
39
- def each_hit() # :yields: doc, score
40
- sim = similarity() # cache sim in local
41
- while next?
42
- f = @freqs[@pointer]
43
-
44
- # compute tf(f)*weight
45
- if f < SCORE_CACHE_SIZE # check cache
46
- score = @score_cache[f] # cache hit
47
- else
48
- score = sim.tf(f) * @weight_value # cache miss
49
- end
50
-
51
- score *= sim.decode_norm(@norms[@doc]) # normalize for field
52
-
53
- yield(@doc, score) # collect score
54
- end
55
- end
56
-
57
- # Expert: Iterates over matching documents in a range.
58
- #
59
- # NOTE: that #next? needs to be called first.
60
- #
61
- # max:: Do not score documents past this. Default will search all documents
62
- # available.
63
- # returns:: true if more matching documents may remain.
64
- def each_hit_up_to(max = MAX_DOCS) # :yields: doc, score
65
- sim = similarity() # cache sim in local
66
- while (@doc < max) # for docs in window
67
- f = @freqs[@pointer]
68
-
69
- # compute tf(f)*weight
70
- if f < SCORE_CACHE_SIZE # check cache
71
- score = @score_cache[f] # cache hit
72
- else
73
- score = sim.tf(f) * @weight_value # cache miss
74
- end
75
-
76
- score *= sim.decode_norm(@norms[@doc]) # normalize for field
77
-
78
- yield(@doc, score) # collect score
79
- if not next?
80
- return false
81
- end
82
- end
83
- return true # false if we didn't find +max+ hits
84
- end
85
-
86
-
87
- # Advances to the next document matching the query.
88
- #
89
- # The iterator over the matching documents is buffered using
90
- # TermDocEnum#read(int[],int[]).
91
- #
92
- # returns:: true iff there is another document matching the query.
93
- def next?()
94
- @pointer += 1
95
- if @pointer >= @pointer_max
96
- @pointer_max = @term_docs.read(@docs, @freqs) # refill buffer
97
- if @pointer_max != 0
98
- @pointer = 0
99
- else
100
- @term_docs.close() # close stream
101
- @doc = MAX_DOCS # set to sentinel value
102
- return false
103
- end
104
- end
105
- @doc = @docs[@pointer]
106
- return true
107
- end
108
-
109
- def score()
110
- f = @freqs[@pointer]
111
- # compute tf(f)*weight
112
- if f < SCORE_CACHE_SIZE # check cache
113
- raw = @score_cache[f] # cache hit
114
- else
115
- raw = similarity().tf(f) * @weight_value # cache miss
116
- end
117
-
118
- return raw * Similarity.decode_norm(@norms[@doc]) # normalize for field
119
- end
120
-
121
- # Skips to the first match beyond the current whose document number is
122
- # greater than or equal to a given target.
123
- #
124
- # The implementation uses TermDocEnum#skip_to(int).
125
- # target:: The target document number.
126
- # returns:: true iff there is such a match.
127
- def skip_to(target)
128
- # first scan in cache
129
- while (@pointer += 1) < @pointer_max
130
- if @docs[@pointer] >= target
131
- @doc = @docs[@pointer]
132
- return true
133
- end
134
- end
135
-
136
- # not found in cache, seek underlying stream
137
- result = @term_docs.skip_to(target)
138
- if (result)
139
- @pointer_max = 1
140
- @pointer = 0
141
- @docs[@pointer] = @doc = @term_docs.doc
142
- @freqs[@pointer] = @term_docs.freq
143
- else
144
- @doc = MAX_DOCS
145
- end
146
- return result
147
- end
148
-
149
- # Returns an explanation of the score for a document.
150
- #
151
- # When this method is used, the #next() method and the #score() method
152
- # should not be used.
153
- #
154
- # doc:: The document number for the explanation.
155
- # TODO: Modify to make use of TermDocEnum#skip_to(int).
156
- def explain(doc)
157
- query = @weight.query()
158
- tf_explanation = Explanation.new()
159
- tf = 0
160
- while (@pointer < @pointer_max)
161
- if (@docs[@pointer] == doc)
162
- tf = @freqs[@pointer]
163
- end
164
- @pointer += 1
165
- end
166
- if (tf == 0)
167
- while (@term_docs.next?)
168
- if (@term_docs.doc() == doc)
169
- tf = @term_docs.freq()
170
- end
171
- end
172
- end
173
- @term_docs.close()
174
- tf_explanation.value = similarity().tf(tf)
175
- tf_explanation.description = "tf(term_freq(#{query.term})=#{tf})"
176
-
177
- return tf_explanation
178
- end
179
-
180
- # Returns a string representation of this +TermScorer+.
181
- def to_s() return "scorer(" + @weight + ")"; end
182
- end
183
- end
@@ -1,36 +0,0 @@
1
- module Ferret::Search
2
- # Expert: Returned by low-level search implementations.
3
- # See Searcher#search
4
- class TopDocs
5
- # Expert: The total number of hits for the query.
6
- # See Hits#length()
7
- attr_accessor :score_docs, :total_hits, :fields
8
- alias :size :total_hits
9
-
10
- # iterate through each of the score docs, yielding the document number and
11
- # the score. eg:
12
- #
13
- # top_docs.each do |doc, score|
14
- # puts "Doc number #{doc} found with score of #{score}"
15
- # end
16
- #
17
- def each
18
- score_docs.each {|sd| yield(sd.doc, sd.score) }
19
- end
20
-
21
- # Expert: Constructs a TopDocs.
22
- def initialize(total_hits, score_docs, fields = SortField::FIELD_SCORE)
23
- @total_hits = total_hits
24
- @score_docs = score_docs
25
- @fields = fields
26
- end
27
-
28
- def to_s
29
- buffer = "#{total_hits} hits sorted by <"
30
- buffer << [fields].flatten.map {|field| "#{@field}" }.join(", ")
31
- buffer << ">:\n"
32
- score_docs.each {|sd| buffer << "\t#{sd}\n" }
33
- return buffer
34
- end
35
- end
36
- end
@@ -1,17 +0,0 @@
1
- module Ferret::Search
2
- # Expert: Returned by low-level sorted search implementations.
3
- class TopFieldDocs < TopDocs
4
-
5
- # The fields which were used to sort results by.
6
- attr_accessor :fields
7
-
8
- # Creates one of these objects.
9
- # total_hits:: Total number of hits for the query.
10
- # score_docs:: The top hits for the query.
11
- # fields:: The sort criteria used to find the top hits.
12
- def initialize(total_hits, score_docs, fields)
13
- super(total_hits, score_docs)
14
- @fields = fields
15
- end
16
- end
17
- end
@@ -1,54 +0,0 @@
1
- module Ferret
2
- module Search
3
- # Expert: Calculate query weights and build query scorers.
4
- #
5
- # The purpose of Weight is to make it so that searching does not modify
6
- # a Query, so that a Query instance can be reused.
7
- #
8
- # Searcher dependent state of the query should reside in the Weight.
9
- #
10
- # IndexReader dependent state should reside in the Scorer.
11
- #
12
- # A +Weight+ is used in the following way:
13
- #
14
- # 1. A +Weight+ is constructed by a top-level query, given a +Searcher+
15
- # (See Query#create_weight).
16
- # 2. The #sum_of_squared_weights() method is called on the +Weight+ to
17
- # compute the query normalization factor Similarity#query_norm(float)
18
- # of the query clauses contained in the query.
19
- # 3. The query normalization factor is passed to #normalize().
20
- # At this point the weighting is complete.
21
- # 4. A +Scorer+ is constructed by #scorer()
22
- class Weight
23
- # The query that this concerns.
24
- def query()
25
- raise NotImplementedError
26
- end
27
-
28
- # The weight for this query.
29
- def value()
30
- raise NotImplementedError
31
- end
32
-
33
- # The sum of squared weights of contained query clauses.
34
- def sum_of_squared_weights()
35
- raise NotImplementedError
36
- end
37
-
38
- # Assigns the query normalization factor to this.
39
- def normalize(norm)
40
- raise NotImplementedError
41
- end
42
-
43
- # Constructs a scorer for this.
44
- def scorer(reader)
45
- raise NotImplementedError
46
- end
47
-
48
- # An explanation of the score computation for the named document.
49
- def explain(reader, doc)
50
- raise NotImplementedError
51
- end
52
- end
53
- end
54
- end
@@ -1,26 +0,0 @@
1
- module Ferret::Search
2
- # Implements the wildcard search query. Supported wildcards are +*+, which
3
- # matches any character sequence (including the empty one), and +?+, which
4
- # matches any single character. Note this query can be slow, as it needs to
5
- # iterate over many terms. In order to prevent extremely slow
6
- # WildcardQueries, a Wildcard term should not start with one of the
7
- # wildcards +*+ or +?+.
8
- #
9
- # See WildcardTermEnum
10
- class WildcardQuery < MultiTermQuery
11
- def initialize(term)
12
- super(term)
13
- end
14
-
15
- def get_term_enum(reader)
16
- return WildcardTermEnum.new(reader, @term)
17
- end
18
-
19
- def eql?(o)
20
- if o.instance_of?(WildcardQuery)
21
- return super(o)
22
- end
23
- return false
24
- end
25
- end
26
- end
@@ -1,61 +0,0 @@
1
- module Ferret::Search
2
- # Subclass of FilteredTermEnum for enumerating all terms that match the
3
- # specified wildcard filter term.
4
- #
5
- # Term enumerations are always ordered by Term.compareTo(). Each term in
6
- # the enumeration is greater than all that precede it.
7
- #
8
- class WildcardTermEnum < FilteredTermEnum
9
- include Ferret::Index
10
-
11
- attr_reader :end_enum
12
-
13
- WILDCARD_STRING = '*'
14
- WILDCARD_CHAR = '?'
15
-
16
- # Creates a new +WildcardTermEnum+. Passing in a
17
- # org.apache.lucene.index.Term Term that does not contain a
18
- # +WILDCARD_CHAR+ will cause an exception to be raised.
19
- #
20
- # After calling the constructor the enumeration is already pointing to the first
21
- # valid term if such a term exists.
22
- def initialize(reader, term)
23
- super()
24
- @end_enum = false
25
- @search_term = term
26
- @field = @search_term.field
27
- text = @search_term.text
28
- len = text.length
29
-
30
- sidx = text.index(WILDCARD_STRING)||len
31
- cidx = text.index(WILDCARD_CHAR)||len
32
- idx = [sidx, cidx].min
33
-
34
- @pre = @search_term.text[0,idx]
35
- @pre_len = idx
36
- @pattern = /^#{Regexp.escape(text[idx..-1]).gsub(/\\([?*])/){".#{$1}"}}$/
37
- self.enum = reader.terms_from(Term.new(@search_term.field, @pre))
38
- end
39
-
40
- def term_compare(term)
41
- if (@field == term.field)
42
- search_text = term.text
43
- if (search_text[0, @pre_len] == @pre)
44
- return (search_text[@pre_len..-1] =~ @pattern)
45
- end
46
- end
47
- @end_enum = true
48
- return false
49
- end
50
-
51
- def difference()
52
- return 1.0
53
- end
54
-
55
- def close()
56
- super()
57
- @pattern = nil
58
- @field = nil
59
- end
60
- end
61
- end
@@ -1 +0,0 @@
1
- require 'ferret/stemmers/porter_stemmer'
@@ -1,218 +0,0 @@
1
- #!/usr/bin/env ruby
2
- #
3
- # $Id: PorterStemmer.rb,v 1.1.1.1 2004/04/17 13:55:20 pragdave Exp $
4
- #
5
- # See example usage at the end of this file.
6
- #
7
-
8
- module Stemmable
9
-
10
- STEMMED = {}
11
-
12
- STEP_2_LIST = {
13
- 'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
14
- 'izer'=>'ize', 'bli'=>'ble',
15
- 'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
16
- 'ization'=>'ize', 'ation'=>'ate',
17
- 'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
18
- 'ousness'=>'ous', 'aliti'=>'al',
19
- 'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'
20
- }
21
-
22
- STEP_3_LIST = {
23
- 'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
24
- 'ical'=>'ic', 'ful'=>'', 'ness'=>''
25
- }
26
-
27
-
28
- SUFFIX_1_REGEXP = /(
29
- ational |
30
- tional |
31
- enci |
32
- anci |
33
- izer |
34
- bli |
35
- alli |
36
- entli |
37
- eli |
38
- ousli |
39
- ization |
40
- ation |
41
- ator |
42
- alism |
43
- iveness |
44
- fulness |
45
- ousness |
46
- aliti |
47
- iviti |
48
- biliti |
49
- logi)$/x
50
-
51
-
52
- SUFFIX_2_REGEXP = /(
53
- al |
54
- ance |
55
- ence |
56
- er |
57
- ic |
58
- able |
59
- ible |
60
- ant |
61
- ement |
62
- ment |
63
- ent |
64
- ou |
65
- ism |
66
- ate |
67
- iti |
68
- ous |
69
- ive |
70
- ize)$/x
71
-
72
-
73
- C = "[^aeiou]" # consonant
74
- V = "[aeiouy]" # vowel
75
- CC = "#{C}(?>[^aeiouy]*)" # consonant sequence
76
- VV = "#{V}(?>[aeiou]*)" # vowel sequence
77
-
78
- MGR0 = /^(#{CC})?#{VV}#{CC}/o # [cc]vvcc... is m>0
79
- MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o # [cc]vvcc[vv] is m=1
80
- MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o # [cc]vvccvvcc... is m>1
81
- VOWEL_IN_STEM = /^(#{CC})?#{V}/o # vowel in stem
82
-
83
- #
84
- # Porter stemmer in Ruby.
85
- #
86
- # This is the Porter stemming algorithm, ported to Ruby from the
87
- # version coded up in Perl. It's easy to follow against the rules
88
- # in the original paper in:
89
- #
90
- # Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
91
- # no. 3, pp 130-137,
92
- #
93
- # See also http://www.tartarus.org/~martin/PorterStemmer
94
- #
95
- # Send comments to raypereda@hotmail.com
96
- #
97
-
98
- def stem_porter(w = self.to_str.dup)
99
-
100
- # make a copy of the given object and convert it to a string.
101
- original_word = w
102
-
103
- return w if w.length < 3
104
-
105
- result = STEMMED[w]
106
- return result if result
107
-
108
- # now map initial y to Y so that the patterns never treat it as vowel
109
- w[0] = 'Y' if w[0] == ?y
110
-
111
- # Step 1a
112
- if w =~ /(ss|i)es$/
113
- w = $` + $1
114
- elsif w =~ /([^s])s$/
115
- w = $` + $1
116
- end
117
-
118
- # Step 1b
119
- if w =~ /eed$/
120
- w.chop! if $` =~ MGR0
121
- elsif w =~ /(ed|ing)$/
122
- stem = $`
123
- if stem =~ VOWEL_IN_STEM
124
- w = stem
125
- case w
126
- when /(at|bl|iz)$/ then w << "e"
127
- when /([^aeiouylsz])\1$/ then w.chop!
128
- when /^#{CC}#{V}[^aeiouwxy]$/o then w << "e"
129
- end
130
- end
131
- end
132
-
133
- if w =~ /y$/
134
- stem = $`
135
- w = stem + "i" if stem =~ VOWEL_IN_STEM
136
- end
137
-
138
- # Step 2
139
- if w =~ SUFFIX_1_REGEXP
140
- stem = $`
141
- suffix = $1
142
- # print "stem= " + stem + "\n" + "suffix=" + suffix + "\n"
143
- if stem =~ MGR0
144
- w = stem + STEP_2_LIST[suffix]
145
- end
146
- end
147
-
148
- # Step 3
149
- if w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/
150
- stem = $`
151
- suffix = $1
152
- if stem =~ MGR0
153
- w = stem + STEP_3_LIST[suffix]
154
- end
155
- end
156
-
157
- # Step 4
158
- if w =~ SUFFIX_2_REGEXP
159
- stem = $`
160
- if stem =~ MGR1
161
- w = stem
162
- end
163
- elsif w =~ /(s|t)(ion)$/
164
- stem = $` + $1
165
- if stem =~ MGR1
166
- w = stem
167
- end
168
- end
169
-
170
- # Step 5
171
- if w =~ /e$/
172
- stem = $`
173
- if (stem =~ MGR1) ||
174
- (stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
175
- w = stem
176
- end
177
- end
178
-
179
- if w =~ /ll$/ && w =~ MGR1
180
- w.chop!
181
- end
182
-
183
- # and turn initial Y back to y
184
- w[0] = 'y' if w[0] == ?Y
185
-
186
- STEMMED[original_word] = w
187
-
188
- w
189
- end
190
-
191
-
192
- module_function :stem_porter
193
- #
194
- # make the stem_porter the default stem method, just in case we
195
- # feel like having multiple stemmers available later.
196
- #
197
- alias stem stem_porter
198
- public :stem
199
-
200
- end
201
-
202
-
203
- #
204
- # Make this script executable, and send it words on stdin, one per
205
- # line, and it will output the stemmed versions to stdout.
206
- #
207
- if $0 == __FILE__ then
208
- class String
209
- include Stemmable
210
- end
211
-
212
- # the String class, and any subclasses of it you might have, now know
213
- # how to stem things.
214
-
215
- $stdin.each do |word|
216
- puts word.strip.stem
217
- end
218
- end