ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295) hide show
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
@@ -1,60 +0,0 @@
1
- module Ferret::Search
2
- # Abstract base class for sorting hits returned by a Query.
3
- #
4
- # This class should only be used if the other SortField types (SCORE, DOC,
5
- # STRING, INTEGER, FLOAT) do not provide an adequate sorting. It maintains
6
- # an internal cache of values which could be quite large. The cache is an
7
- # array of Comparable, one for each document in the index. There is a
8
- # distinct Comparable for each unique term in the field - if some documents
9
- # have the same term in the field, the cache array will have entries which
10
- # reference the same Comparable.
11
- #
12
- # Author:: Tim Jones
13
- class SortComparator
14
-
15
- # Creates a comparator for the field in the given index.
16
- #
17
- # reader:: Index to create comparator for.
18
- # field_name:: Field to create comparator for.
19
- # returns:: Comparator of ScoreDoc objects.
20
- def new_comparator(reader, field_name)
21
- cached_values = FieldCache::DEFAULT.custom(reader, field, self)
22
-
23
- score_doc_comparator = ScoreDocComparator.new()
24
-
25
- class <<score_doc_comparator
26
- attr_writer :cache_values
27
- def compare(i, j)
28
- return @cached_values[i.doc] <=> @cached_values[j.doc]
29
- end
30
-
31
- def sort_value(i)
32
- return @cached_values[i.doc]
33
- end
34
-
35
- def sort_type()
36
- return SortField::SortType::CUSTOM
37
- end
38
- end
39
- score_doc_comparator.cached_values = cached_values
40
- return score_doc_comparator
41
- end
42
-
43
- # Returns an object which, when sorted according to natural order, will
44
- # order the Term values in the correct order. For example, if the Terms
45
- # contained integer values, this method would return +term_text.to_i+.
46
- # Note that this might not always be the most efficient implementation -
47
- # for this particular example, a better implementation might be to make a
48
- # ScoreDocLookupComparator that uses an internal lookup table of int.
49
- #
50
- # term_text:: The textual value of the term.
51
- #
52
- # returns:: An object representing +term_text+ that sorts according to the
53
- # natural order of +term_text+.
54
- #
55
- # See ScoreDocComparator
56
- def get_comparable(term_text)
57
- raise NotImplementedError
58
- end
59
- end
60
- end
@@ -1,91 +0,0 @@
1
- module Ferret::Search
2
-
3
- # Stores information about how to sort documents by terms in an individual
4
- # field. Fields must be indexed in order to sort by them.
5
- class SortField
6
- class SortType < Ferret::Utils::Parameter
7
- attr_reader :parser, :comparator
8
-
9
- # Creates a new SortType. A SortType is used to specify how a field is
10
- # sorted in a document. Each SortType *MUST* have a unique name. This is
11
- # because the SortType object is used to cache a fields values for a
12
- # particular reader, so each SortType should be created once only and
13
- # stored in a constant. See the standard SortTypes stored hear for
14
- # example.
15
- def initialize(name, parser = lambda{|str| str}, comparator = nil)
16
- super(name)
17
- @parser = parser
18
- @comparator = comparator
19
- end
20
-
21
- # Sort by document score (relevancy). Sort values are Float and higher
22
- # values are at the front.
23
- SCORE = SortType.new("SCORE")
24
-
25
- # Sort by document number (order). Sort values are Integer and lower
26
- # values are at the front.
27
- DOC = SortType.new("DOC")
28
-
29
- # Guess sort type of sort based on field contents. We try parsing the
30
- # field as an integer and then as a floating point number. If we are
31
- # unsuccessful, the field is parsed as a plain string.
32
- AUTO = SortType.new("auto")
33
-
34
- # Sort using term values as Strings. Sort values are String and lower
35
- # values are at the front.
36
- STRING = SortType.new("string")
37
-
38
- # Sort using term values as encoded Integers. Sort values are Integer
39
- # and lower values are at the front.
40
- INTEGER = SortType.new("integer", lambda{|str| str.to_i})
41
-
42
- # Sort using term values as encoded Floats. Sort values are Float and
43
- # lower values are at the front.
44
- FLOAT = SortType.new("float", lambda{|str| str.to_f})
45
- end
46
-
47
- attr_reader :name, :sort_type, :comparator
48
-
49
- def reverse?
50
- return @reverse
51
- end
52
-
53
- # Creates a SortField which specifies which field the data is sorted on
54
- # and how that field is sorted. See SortType.
55
- #
56
- # name:: Name of field to sort by. Can be +nil+ if +sort_type+ is SCORE or
57
- # DOC.
58
- #
59
- # An options hash with the followind values can also be supplied;
60
- # sort_type:: Type of values in the terms.
61
- # reverse:: True if natural order should be reversed.
62
- # comparator:: A proc used to compare two values from the index. You can
63
- # also give this value to the SortType object that you pass.
64
- def initialize(name = nil, options= {})
65
- @name = name.to_s if name
66
- @sort_type = options[:sort_type]||SortType::AUTO
67
- @reverse = options[:reverse]||false
68
- @comparator = options[:comparator]||@sort_type.comparator
69
- if (@name == nil and @sort_type != SortType::DOC and
70
- @sort_type != SortType::SCORE)
71
- raise ArgumentError, "You must supply a field name for your sort field"
72
- end
73
- end
74
-
75
- # Represents sorting by document score (relevancy).
76
- FIELD_SCORE = SortField.new(nil, {:sort_type => SortType::SCORE})
77
-
78
- # Represents sorting by document number (order).
79
- FIELD_DOC = SortField.new(nil, {:sort_type => SortType::DOC})
80
-
81
- def to_s()
82
- if @name
83
- buffer = "#@name:<#@sort_type>"
84
- else
85
- buffer = "<#{@sort_type}>"
86
- end
87
- buffer << '!' if @reverse
88
- return buffer
89
- end
90
- end
91
- end
@@ -1,12 +0,0 @@
1
- $:.unshift File.dirname(__FILE__)
2
-
3
- require 'spans/spans_enum.rb'
4
- require 'spans/near_spans_enum.rb'
5
- require 'spans/span_query.rb'
6
- require 'spans/span_first_query.rb'
7
- require 'spans/span_near_query.rb'
8
- require 'spans/span_not_query.rb'
9
- require 'spans/span_or_query.rb'
10
- require 'spans/span_scorer.rb'
11
- require 'spans/span_term_query.rb'
12
- require 'spans/span_weight.rb'
@@ -1,304 +0,0 @@
1
- module Ferret::Search::Spans
2
- class NearSpansEnum < SpansEnum
3
-
4
- class CellQueue < Ferret::Utils::PriorityQueue
5
- def less_than(o1, o2)
6
- if (o1.doc == o2.doc)
7
- if (o1.start == o2.start)
8
- if (o1.finish == o2.finish)
9
- return o1.index > o2.index
10
- else
11
- return o1.finish < o2.finish
12
- end
13
- else
14
- return o1.start < o2.start
15
- end
16
- else
17
- return o1.doc < o2.doc
18
- end
19
- end
20
- end
21
-
22
-
23
- # Wraps a SpansEnum, and can be used to form a linked list.
24
- class SpansCell < SpansEnum
25
- attr_accessor :next, :index
26
-
27
- def initialize(parent, spans, index)
28
- @parent = parent
29
- @spans = spans
30
- @index = index
31
- @length = -1
32
- end
33
-
34
- def next?()
35
- if (@length != -1) # subtract old length
36
- @parent.total_length -= @length
37
- end
38
-
39
- more = @spans.next? # move to next
40
-
41
- if more
42
- @length = finish() - start() # compute new length
43
- @parent.total_length += @length # add new length to total
44
-
45
- if (@parent.max.nil? or doc() > @parent.max.doc or # maintain max
46
- (doc() == @parent.max.doc and finish() > @parent.max.finish))
47
- @parent.max = self
48
- end
49
- end
50
-
51
- return more
52
- end
53
-
54
- def skip_to(target)
55
- if (@length != -1) # subtract old length
56
- @parent.total_length -= @length
57
- end
58
-
59
- more = @spans.skip_to(target) # skip
60
-
61
- if (more)
62
- @length = finish() - start() # compute new length
63
- @parent.total_length += @length # add new length to total
64
-
65
- if (@parent.max.nil? or doc() > @parent.max.doc() or # maintain max
66
- (doc() == @parent.max.doc and finish() > @parent.max.finish))
67
- @parent.max = self
68
- end
69
- end
70
-
71
- return more
72
- end
73
-
74
- def doc() return @spans.doc() end
75
- def start() return @spans.start() end
76
- def finish() return @spans.finish() end
77
-
78
- def to_s() return "#{@spans}##{@index}" end
79
- end
80
-
81
- attr_accessor :total_length, :max
82
-
83
- def initialize(query, reader)
84
- @ordered = [] # spans in query order
85
-
86
- @first = nil # linked list of spans
87
- @last = nil # sorted by doc only
88
-
89
- @total_length = 0 # sum of current lengths
90
-
91
- @queue = nil # sorted queue of spans
92
- @max = nil # max element in queue
93
-
94
- @more = true # true iff not done
95
- @first_time = true # true before first next?
96
-
97
-
98
- @query = query
99
- @slop = query.slop
100
- @in_order = query.in_order?
101
-
102
- clauses = query.clauses # initialize spans & list
103
- @queue = CellQueue.new(clauses.length)
104
- clauses.length.times do |i|
105
- # construct clause spans
106
- cell = SpansCell.new(self, clauses[i].spans(reader), i)
107
- @ordered << cell # add to ordered
108
- end
109
- end
110
-
111
- def next?()
112
- if (@first_time)
113
- init_list(true)
114
- list_to_queue() # initialize queue
115
- @first_time = false
116
- elsif (@more)
117
- @more = min().next? # trigger further scanning
118
- @queue.adjust_top() if (@more) # maintain queue
119
- end
120
-
121
- while (@more)
122
- queue_stale = false
123
-
124
- if (min().doc != @max.doc) # maintain list
125
- queue_to_list()
126
- queue_stale = true
127
- end
128
-
129
- # skip to doc w/ all clauses
130
-
131
- while (@more and @first.doc < @last.doc)
132
- @more = @first.skip_to(@last.doc) # skip first upto last
133
- first_to_last() # and move it to the end
134
- queue_stale = true
135
- end
136
-
137
- return false if not @more
138
-
139
- # found doc w/ all clauses
140
-
141
- if (queue_stale) # maintain the queue
142
- list_to_queue()
143
- queue_stale = false
144
- end
145
-
146
- return true if at_match?
147
-
148
- # trigger further scanning
149
- if (@in_order and check_slop?())
150
- # There is a non ordered match within slop and an ordered match is needed.
151
- @more = first_non_ordered_next_to_partial_list()
152
- if (@more)
153
- partial_list_to_queue()
154
- end
155
- else
156
- @more = min().next?()
157
- if (@more)
158
- @queue.adjust_top() # maintain queue
159
- end
160
- end
161
- end
162
- return false # no more matches
163
- end
164
-
165
- def each()
166
- cell = @first
167
- while (cell)
168
- yield cell
169
- cell=cell.next
170
- end
171
- end
172
-
173
- def skip_to(target)
174
- if (@first_time) # initialize
175
- init_list(false)
176
- each() do |cell|
177
- @more = cell.skip_to(target) # skip all
178
- break if not @more
179
- end
180
-
181
- if (@more)
182
- list_to_queue()
183
- end
184
- @first_time = false
185
-
186
- else # normal case
187
- while (@more and min().doc < target) # skip as needed
188
- @more = min().skip_to(target)
189
- @queue.adjust_top() if (@more)
190
- end
191
- end
192
-
193
- if (@more)
194
- return true if (at_match?()) # at a match?
195
- return next? # no, scan
196
- end
197
-
198
- return false
199
- end
200
-
201
- def min() @queue.top() end
202
-
203
- def doc() min().doc() end
204
- def start() min().start() end
205
- def finish() @max.finish() end
206
-
207
-
208
- def to_s()
209
- buffer = "spans(#{@query})@"
210
- if @first_time
211
- buffer << "START"
212
- else
213
- buffer << (@queue.size>0 ? ("#{doc}:#{start()}-#{finish}") : "END")
214
- end
215
- return buffer
216
- end
217
-
218
- def init_list(nxt)
219
- @ordered.each do |cell|
220
- @more = cell.next? if nxt
221
- if @more
222
- add_to_list(cell) # add to list
223
- else
224
- break
225
- end
226
- end
227
- end
228
-
229
- def add_to_list(cell)
230
- if (@last != nil) # add next to end of list
231
- @last.next = cell
232
- else
233
- @first = cell
234
- end
235
- @last = cell
236
- cell.next = nil
237
- end
238
-
239
- def first_to_last()
240
- @last.next = @first # move first to end of list
241
- @last = @first
242
- @first = @first.next
243
- @last.next = nil
244
- end
245
-
246
- def queue_to_list()
247
- @last = @first = nil
248
- while (@queue.top() != nil)
249
- add_to_list(@queue.pop())
250
- end
251
- end
252
-
253
- def first_non_ordered_next_to_partial_list()
254
- # Creates a partial list consisting of first non ordered and earlier.
255
- # Returns first non ordered .next?.
256
- @last = @first = nil
257
- ordered_index = 0
258
- while (@queue.top() != nil)
259
- cell = @queue.pop()
260
- add_to_list(cell)
261
- if (cell.index == ordered_index)
262
- ordered_index += 1
263
- else
264
- return cell.next?()
265
- # FIXME: continue here, rename to eg. checkOrderedMatch():
266
- # when check_slop?() and not ordered, repeat cell.next?().
267
- # when check_slop?() and ordered, add to list and repeat queue.pop()
268
- # without check_slop?(): no match, rebuild the queue from the partial list.
269
- # When queue is empty and check_slop?() and ordered there is a match.
270
- end
271
- end
272
- raise RuntimeException, "Unexpected: ordered"
273
- end
274
-
275
- def list_to_queue()
276
- @queue.clear() # rebuild queue
277
- partial_list_to_queue()
278
- end
279
-
280
- def partial_list_to_queue()
281
- each() { |cell| @queue.push(cell) } # add to queue from list
282
- end
283
-
284
- def at_match?()
285
- return ((min().doc() == @max.doc()) and check_slop?() and
286
- (not @in_order or match_is_ordered?()))
287
- end
288
-
289
- def check_slop?()
290
- match_length = @max.finish() - min.start()
291
- return ((match_length - @total_length) <= @slop)
292
- end
293
-
294
- def match_is_ordered?()
295
- last_start = -1
296
- @ordered.each do |cell|
297
- start = cell.start
298
- return false if start <= last_start
299
- last_start = start
300
- end
301
- return true
302
- end
303
- end
304
- end