ferret 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202) hide show
  1. data/MIT-LICENSE +20 -0
  2. data/README +109 -0
  3. data/Rakefile +275 -0
  4. data/TODO +9 -0
  5. data/TUTORIAL +197 -0
  6. data/ext/extconf.rb +3 -0
  7. data/ext/ferret.c +23 -0
  8. data/ext/ferret.h +85 -0
  9. data/ext/index_io.c +543 -0
  10. data/ext/priority_queue.c +227 -0
  11. data/ext/ram_directory.c +316 -0
  12. data/ext/segment_merge_queue.c +41 -0
  13. data/ext/string_helper.c +42 -0
  14. data/ext/tags +240 -0
  15. data/ext/term.c +261 -0
  16. data/ext/term_buffer.c +299 -0
  17. data/ext/util.c +12 -0
  18. data/lib/ferret.rb +41 -0
  19. data/lib/ferret/analysis.rb +11 -0
  20. data/lib/ferret/analysis/analyzers.rb +93 -0
  21. data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
  22. data/lib/ferret/analysis/token.rb +79 -0
  23. data/lib/ferret/analysis/token_filters.rb +86 -0
  24. data/lib/ferret/analysis/token_stream.rb +26 -0
  25. data/lib/ferret/analysis/tokenizers.rb +107 -0
  26. data/lib/ferret/analysis/word_list_loader.rb +27 -0
  27. data/lib/ferret/document.rb +2 -0
  28. data/lib/ferret/document/document.rb +152 -0
  29. data/lib/ferret/document/field.rb +304 -0
  30. data/lib/ferret/index.rb +26 -0
  31. data/lib/ferret/index/compound_file_io.rb +343 -0
  32. data/lib/ferret/index/document_writer.rb +288 -0
  33. data/lib/ferret/index/field_infos.rb +259 -0
  34. data/lib/ferret/index/fields_io.rb +175 -0
  35. data/lib/ferret/index/index.rb +228 -0
  36. data/lib/ferret/index/index_file_names.rb +33 -0
  37. data/lib/ferret/index/index_reader.rb +462 -0
  38. data/lib/ferret/index/index_writer.rb +488 -0
  39. data/lib/ferret/index/multi_reader.rb +363 -0
  40. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
  41. data/lib/ferret/index/segment_infos.rb +130 -0
  42. data/lib/ferret/index/segment_merge_info.rb +47 -0
  43. data/lib/ferret/index/segment_merge_queue.rb +16 -0
  44. data/lib/ferret/index/segment_merger.rb +337 -0
  45. data/lib/ferret/index/segment_reader.rb +380 -0
  46. data/lib/ferret/index/segment_term_enum.rb +178 -0
  47. data/lib/ferret/index/segment_term_vector.rb +58 -0
  48. data/lib/ferret/index/term.rb +49 -0
  49. data/lib/ferret/index/term_buffer.rb +88 -0
  50. data/lib/ferret/index/term_doc_enum.rb +283 -0
  51. data/lib/ferret/index/term_enum.rb +52 -0
  52. data/lib/ferret/index/term_info.rb +41 -0
  53. data/lib/ferret/index/term_infos_io.rb +312 -0
  54. data/lib/ferret/index/term_vector_offset_info.rb +20 -0
  55. data/lib/ferret/index/term_vectors_io.rb +552 -0
  56. data/lib/ferret/query_parser.rb +274 -0
  57. data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
  58. data/lib/ferret/search.rb +49 -0
  59. data/lib/ferret/search/boolean_clause.rb +100 -0
  60. data/lib/ferret/search/boolean_query.rb +303 -0
  61. data/lib/ferret/search/boolean_scorer.rb +294 -0
  62. data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
  63. data/lib/ferret/search/conjunction_scorer.rb +99 -0
  64. data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
  65. data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
  66. data/lib/ferret/search/explanation.rb +41 -0
  67. data/lib/ferret/search/field_cache.rb +216 -0
  68. data/lib/ferret/search/field_doc.rb +31 -0
  69. data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
  70. data/lib/ferret/search/filter.rb +11 -0
  71. data/lib/ferret/search/filtered_query.rb +130 -0
  72. data/lib/ferret/search/filtered_term_enum.rb +79 -0
  73. data/lib/ferret/search/fuzzy_query.rb +153 -0
  74. data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
  75. data/lib/ferret/search/hit_collector.rb +34 -0
  76. data/lib/ferret/search/hit_queue.rb +11 -0
  77. data/lib/ferret/search/index_searcher.rb +173 -0
  78. data/lib/ferret/search/match_all_docs_query.rb +104 -0
  79. data/lib/ferret/search/multi_phrase_query.rb +204 -0
  80. data/lib/ferret/search/multi_term_query.rb +65 -0
  81. data/lib/ferret/search/non_matching_scorer.rb +22 -0
  82. data/lib/ferret/search/phrase_positions.rb +55 -0
  83. data/lib/ferret/search/phrase_query.rb +217 -0
  84. data/lib/ferret/search/phrase_scorer.rb +153 -0
  85. data/lib/ferret/search/prefix_query.rb +47 -0
  86. data/lib/ferret/search/query.rb +111 -0
  87. data/lib/ferret/search/query_filter.rb +51 -0
  88. data/lib/ferret/search/range_filter.rb +103 -0
  89. data/lib/ferret/search/range_query.rb +139 -0
  90. data/lib/ferret/search/req_excl_scorer.rb +125 -0
  91. data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
  92. data/lib/ferret/search/score_doc.rb +38 -0
  93. data/lib/ferret/search/score_doc_comparator.rb +114 -0
  94. data/lib/ferret/search/scorer.rb +91 -0
  95. data/lib/ferret/search/similarity.rb +278 -0
  96. data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
  97. data/lib/ferret/search/sort.rb +105 -0
  98. data/lib/ferret/search/sort_comparator.rb +60 -0
  99. data/lib/ferret/search/sort_field.rb +87 -0
  100. data/lib/ferret/search/spans.rb +12 -0
  101. data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
  102. data/lib/ferret/search/spans/span_first_query.rb +79 -0
  103. data/lib/ferret/search/spans/span_near_query.rb +108 -0
  104. data/lib/ferret/search/spans/span_not_query.rb +130 -0
  105. data/lib/ferret/search/spans/span_or_query.rb +176 -0
  106. data/lib/ferret/search/spans/span_query.rb +25 -0
  107. data/lib/ferret/search/spans/span_scorer.rb +74 -0
  108. data/lib/ferret/search/spans/span_term_query.rb +105 -0
  109. data/lib/ferret/search/spans/span_weight.rb +84 -0
  110. data/lib/ferret/search/spans/spans_enum.rb +44 -0
  111. data/lib/ferret/search/term_query.rb +128 -0
  112. data/lib/ferret/search/term_scorer.rb +181 -0
  113. data/lib/ferret/search/top_docs.rb +24 -0
  114. data/lib/ferret/search/top_field_docs.rb +17 -0
  115. data/lib/ferret/search/weight.rb +54 -0
  116. data/lib/ferret/search/wildcard_query.rb +26 -0
  117. data/lib/ferret/search/wildcard_term_enum.rb +61 -0
  118. data/lib/ferret/stemmers.rb +1 -0
  119. data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
  120. data/lib/ferret/store.rb +5 -0
  121. data/lib/ferret/store/buffered_index_io.rb +191 -0
  122. data/lib/ferret/store/directory.rb +139 -0
  123. data/lib/ferret/store/fs_store.rb +338 -0
  124. data/lib/ferret/store/index_io.rb +259 -0
  125. data/lib/ferret/store/ram_store.rb +282 -0
  126. data/lib/ferret/utils.rb +7 -0
  127. data/lib/ferret/utils/bit_vector.rb +105 -0
  128. data/lib/ferret/utils/date_tools.rb +138 -0
  129. data/lib/ferret/utils/number_tools.rb +91 -0
  130. data/lib/ferret/utils/parameter.rb +41 -0
  131. data/lib/ferret/utils/priority_queue.rb +120 -0
  132. data/lib/ferret/utils/string_helper.rb +47 -0
  133. data/lib/ferret/utils/weak_key_hash.rb +51 -0
  134. data/rake_utils/code_statistics.rb +106 -0
  135. data/setup.rb +1551 -0
  136. data/test/benchmark/tb_ram_store.rb +76 -0
  137. data/test/benchmark/tb_rw_vint.rb +26 -0
  138. data/test/longrunning/tc_numbertools.rb +60 -0
  139. data/test/longrunning/tm_store.rb +19 -0
  140. data/test/test_all.rb +9 -0
  141. data/test/test_helper.rb +6 -0
  142. data/test/unit/analysis/tc_analyzer.rb +21 -0
  143. data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
  144. data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
  145. data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
  146. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
  147. data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
  148. data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
  149. data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
  150. data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
  151. data/test/unit/analysis/tc_stop_filter.rb +14 -0
  152. data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
  153. data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
  154. data/test/unit/analysis/tc_word_list_loader.rb +32 -0
  155. data/test/unit/document/tc_document.rb +47 -0
  156. data/test/unit/document/tc_field.rb +80 -0
  157. data/test/unit/index/tc_compound_file_io.rb +107 -0
  158. data/test/unit/index/tc_field_infos.rb +119 -0
  159. data/test/unit/index/tc_fields_io.rb +167 -0
  160. data/test/unit/index/tc_index.rb +140 -0
  161. data/test/unit/index/tc_index_reader.rb +622 -0
  162. data/test/unit/index/tc_index_writer.rb +57 -0
  163. data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
  164. data/test/unit/index/tc_segment_infos.rb +74 -0
  165. data/test/unit/index/tc_segment_term_docs.rb +17 -0
  166. data/test/unit/index/tc_segment_term_enum.rb +60 -0
  167. data/test/unit/index/tc_segment_term_vector.rb +71 -0
  168. data/test/unit/index/tc_term.rb +22 -0
  169. data/test/unit/index/tc_term_buffer.rb +57 -0
  170. data/test/unit/index/tc_term_info.rb +19 -0
  171. data/test/unit/index/tc_term_infos_io.rb +192 -0
  172. data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
  173. data/test/unit/index/tc_term_vectors_io.rb +108 -0
  174. data/test/unit/index/th_doc.rb +244 -0
  175. data/test/unit/query_parser/tc_query_parser.rb +84 -0
  176. data/test/unit/search/tc_filter.rb +113 -0
  177. data/test/unit/search/tc_fuzzy_query.rb +136 -0
  178. data/test/unit/search/tc_index_searcher.rb +188 -0
  179. data/test/unit/search/tc_search_and_sort.rb +98 -0
  180. data/test/unit/search/tc_similarity.rb +37 -0
  181. data/test/unit/search/tc_sort.rb +48 -0
  182. data/test/unit/search/tc_sort_field.rb +27 -0
  183. data/test/unit/search/tc_spans.rb +153 -0
  184. data/test/unit/store/tc_fs_store.rb +84 -0
  185. data/test/unit/store/tc_ram_store.rb +35 -0
  186. data/test/unit/store/tm_store.rb +180 -0
  187. data/test/unit/store/tm_store_lock.rb +68 -0
  188. data/test/unit/ts_analysis.rb +16 -0
  189. data/test/unit/ts_document.rb +4 -0
  190. data/test/unit/ts_index.rb +18 -0
  191. data/test/unit/ts_query_parser.rb +3 -0
  192. data/test/unit/ts_search.rb +10 -0
  193. data/test/unit/ts_store.rb +6 -0
  194. data/test/unit/ts_utils.rb +10 -0
  195. data/test/unit/utils/tc_bit_vector.rb +65 -0
  196. data/test/unit/utils/tc_date_tools.rb +50 -0
  197. data/test/unit/utils/tc_number_tools.rb +59 -0
  198. data/test/unit/utils/tc_parameter.rb +40 -0
  199. data/test/unit/utils/tc_priority_queue.rb +62 -0
  200. data/test/unit/utils/tc_string_helper.rb +21 -0
  201. data/test/unit/utils/tc_weak_key_hash.rb +25 -0
  202. metadata +251 -0
@@ -0,0 +1,47 @@
1
+ module Ferret::Search
2
+ class SloppyPhraseScorer < PhraseScorer
3
+
4
# Creates a phrase scorer that tolerates displaced terms.
#
# weight, tps, positions, similarity, norms:: passed through unchanged to the
#                                             superclass constructor.
# slop:: stored in @slop; phrase_freq only scores a match whose length does
#        not exceed this value.
def initialize(weight, tps, positions, similarity, slop, norms)
  super(weight, tps, positions, similarity, norms)
  @slop = slop
end
8
+
9
# Accumulates the sloppy frequency of the phrase.
#
# Every phrase-positions object yielded by +each+ is rewound to its first
# position and pushed onto @pq. The scorer then repeatedly pops the top
# entry, advances it while it is not past the next entry in the queue, and
# scores the resulting window (via @similarity.sloppy_freq) whenever its
# length is within @slop. Scanning stops as soon as one entry has no
# further positions.
#
# returns:: the accumulated frequency as a Float.
def phrase_freq()
  @pq.clear()
  window_end = 0
  each do |positions|
    positions.first_position()
    window_end = positions.position if positions.position > window_end
    @pq.push(positions)                        # build pq from list
  end

  total = 0.0
  done = false
  # done starts out false, so the body always runs at least once
  until done
    lowest = @pq.pop()
    window_start = lowest.position
    current = window_start
    limit = @pq.top().position
    while current <= limit
      window_start = current                   # advance to min window
      unless lowest.next_position()
        done = true                            # ran out of a term -- done
        break
      end
      current = lowest.position
    end

    length = window_end - window_start
    total += @similarity.sloppy_freq(length) if length <= @slop  # score match

    window_end = lowest.position if lowest.position > window_end
    @pq.push(lowest)                           # restore pq
  end

  total
end
46
+ end
47
+ end
@@ -0,0 +1,105 @@
1
+ module Ferret::Search
2
+ # Encapsulates sort criteria for returned hits.
3
+ #
4
+ # The fields used to determine sort order must be carefully chosen.
5
+ # Documents must contain a single term in such a field, and the value of the
6
+ # term should indicate the document's relative position in a given sort
7
+ # order. The field must be indexed, but should not be tokenized, and does
8
+ # not need to be stored (unless you happen to want it back with the rest of
9
+ # your document data). In other words:
10
+ #
11
+ # document << Field.new("by_number",
12
+ # x.to_s,
13
+ # Field::Store::NO,
14
+ # Field::Index::UN_TOKENIZED)
15
+ #
16
+ #
17
+ # === Valid Types of Values
18
+ #
19
+ # There are three possible kinds of term values which may be put into
20
+ # sorting fields: Integers, Floats, or Strings. Unless SortField objects
21
+ # are specified, the type of value in the field is determined by parsing the
22
+ # first term in the field.
23
+ #
24
+ # Integer term values should contain only digits and an optional preceding
25
+ # negative sign. Values must be base 10. Documents which should appear
26
+ # first in the sort should have low value integers, later documents high
27
+ # values (i.e. the documents should be numbered +1..n+ where +1+ is the
28
+ # first and +n+ the last).
29
+ #
30
+ # Float term values should conform to values accepted by String#to_f.
31
+ # Documents which should appear first in the sort should have low values,
32
+ # later documents high values.
33
+ #
34
+ # String term values can contain any valid String, but should not be
35
+ # tokenized. The values are sorted according to their Comparable natural
36
+ # order. Note that using this type of term value has higher memory
37
+ # requirements than the other two types.
38
+ #
39
+ # === Object Reuse
40
+ #
41
+ # One of these objects can be used multiple times and the sort order changed
42
+ # between usages.
43
+ #
44
+ # This class is thread safe.
45
+ #
46
+ # === Memory Usage
47
+ #
48
+ # Sorting uses caches of term values maintained by the internal HitQueue(s).
49
+ # The cache is static and contains an integer or float array of length
50
+ # +IndexReader#max_doc+ for each field name for which a sort is performed.
51
+ # In other words, the size of the cache in bytes is:
52
+ #
53
+ # 4 * IndexReader#max_doc * (# of different fields actually used to sort)
54
+ #
55
+ # For String fields, the cache is larger: in addition to the above array,
56
+ # the value of every term in the field is kept in memory. If there are many
57
+ # unique terms in the field, this could be quite large.
58
+ #
59
+ # Note that the size of the cache is not affected by how many fields are in
60
+ # the index and _might_ be used to sort - only by the ones actually used to
61
+ # sort a result set.
62
+ #
63
+ # The cache is cleared each time a new +IndexReader+ is passed in, or if the
64
+ # value returned by +max_doc()+ changes for the current IndexReader. This
65
+ # class is not set up to be able to efficiently sort hits from more than one
66
+ # index simultaneously.
67
class Sort

  # The SortField objects making up this sort, in the order they are applied.
  attr_accessor :fields

  # Creates a sort over the given field(s). Called without arguments it
  # sorts by score and then by document number. You can pass a string
  # representing the name of the field you want to sort on, a SortField, or
  # an array containing either. Strings are converted to SortFields of type
  # AUTO; SortFields are used as they are. If you pass a string or an array
  # containing strings you can also pass a reverse flag. If you pass a
  # SortField the reverse is handled by it.
  #
  # When a single field name is given, SortField::FIELD_DOC is appended as a
  # second sort field.
  #
  # fields::  The fields you want to sort on. See also SortField
  # reverse:: pass true if you want the sort order to be reversed. Only
  #           applied to fields passed by name.
  def initialize(fields = [SortField::FIELD_SCORE, SortField::FIELD_DOC],
                 reverse = false)
    fields = [fields] unless fields.is_a?(Array)
    @fields = fields
    if fields.any? { |field| field.is_a?(String) }
      # Convert only the names; SortField entries in the same array are kept
      # untouched instead of being wrapped as if they were names.
      @fields = fields.map do |field|
        if field.is_a?(String)
          SortField.new(field, {:sort_type => SortField::SortType::AUTO,
                                :reverse => reverse})
        else
          field
        end
      end
      @fields << SortField::FIELD_DOC if @fields.size == 1
    end
  end

  # Represents sorting by computed relevance. Using this sort criteria returns
  # the same results as calling Searcher#search() without a sort criteria,
  # only with slightly more overhead.
  RELEVANCE = Sort.new()

  # Represents sorting by index order.
  INDEX_ORDER = Sort.new(SortField::FIELD_DOC)

  # Returns the string forms of the sort fields joined by ", ".
  def to_s()
    return @fields.map {|field| "#{field}"}.join(", ")
  end
end
105
+ end
@@ -0,0 +1,60 @@
1
+ module Ferret::Search
2
+ # Abstract base class for sorting hits returned by a Query.
3
+ #
4
+ # This class should only be used if the other SortField types (SCORE, DOC,
5
+ # STRING, INT, FLOAT) do not provide an adequate sorting. It maintains an
6
+ # internal cache of values which could be quite large. The cache is an
7
+ # array of Comparable, one for each document in the index. There is a
8
+ # distinct Comparable for each unique term in the field - if some documents
9
+ # have the same term in the field, the cache array will have entries which
10
+ # reference the same Comparable.
11
+ #
12
+ # Author:: Tim Jones
13
class SortComparator

  # Creates a comparator for the field in the given index.
  #
  # The values to compare are obtained from FieldCache::DEFAULT.custom,
  # which is handed this object along with the reader and field name. The
  # returned ScoreDocComparator is given singleton methods that look those
  # values up by document number.
  #
  # NOTE(review): sort_type on the returned comparator refers to
  # SortField::SortType::CUSTOM; that constant must be defined by SortField
  # for the call to succeed -- confirm.
  #
  # reader::     Index to create comparator for.
  # field_name:: Field to create comparator for.
  # returns::    Comparator of ScoreDoc objects.
  def new_comparator(reader, field_name)
    # was +field+, an undefined local; the parameter is +field_name+
    cached_values = FieldCache::DEFAULT.custom(reader, field_name, self)

    score_doc_comparator = ScoreDocComparator.new()

    class <<score_doc_comparator
      # the writer name must match both the @cached_values reads below and
      # the assignment made after this singleton block
      attr_writer :cached_values

      def compare(i, j)
        return @cached_values[i.doc] <=> @cached_values[j.doc]
      end

      def sort_value(i)
        return @cached_values[i.doc]
      end

      def sort_type()
        return SortField::SortType::CUSTOM
      end
    end
    score_doc_comparator.cached_values = cached_values
    return score_doc_comparator
  end

  # Returns an object which, when sorted according to natural order, will
  # order the Term values in the correct order. For example, if the Terms
  # contained integer values, this method would return +term_text.to_i+.
  # Note that this might not always be the most efficient implementation -
  # for this particular example, a better implementation might be to make a
  # ScoreDocLookupComparator that uses an internal lookup table of int.
  #
  # This base implementation always raises; subclasses supply the real one.
  #
  # term_text:: The textual value of the term.
  #
  # returns:: An object representing +term_text+ that sorts according to the
  #           natural order of +term_text+.
  # raises::  NotImplementedError
  #
  # See ScoreDocComparator
  def get_comparable(term_text)
    raise NotImplementedError
  end
end
60
+ end
@@ -0,0 +1,87 @@
1
+ module Ferret::Search
2
+
3
+ # Stores information about how to sort documents by terms in an individual
4
+ # field. Fields must be indexed in order to sort by them.
5
class SortField
  class SortType < Ferret::Utils::Parameter
    attr_reader :parser, :comparator

    # Creates a new SortType. A SortType is used to specify how a field is
    # sorted in a document. Each SortType *MUST* have a unique name. This is
    # because the SortType object is used to cache a field's values for a
    # particular reader, so each SortType should be created once only and
    # stored in a constant. See the standard SortTypes stored here for
    # examples.
    #
    # name::       unique name of the sort type.
    # parser::     callable applied to a term string; the default returns the
    #              string unchanged.
    # comparator:: optional comparator, +nil+ by default.
    def initialize(name, parser = lambda{|str| str}, comparator = nil)
      super(name)
      @parser = parser
      @comparator = comparator
    end

    # Sort by document score (relevancy). Sort values are Float and higher
    # values are at the front.
    SCORE = SortType.new("score")

    # Sort by document number (order). Sort values are Integer and lower
    # values are at the front.
    DOC = SortType.new("doc")

    # Guess sort type of sort based on field contents. We try parsing the
    # field as an integer and then as a floating point number. If we are
    # unsuccessful, the field is parsed as a plain string.
    AUTO = SortType.new("auto")

    # Sort using term values as Strings. Sort values are String and lower
    # values are at the front.
    STRING = SortType.new("string")

    # Sort using term values as encoded Integers. Sort values are Integer
    # and lower values are at the front.
    INT = SortType.new("int", lambda{|str| str.to_i})

    # Sort using term values as encoded Floats. Sort values are Float and
    # lower values are at the front.
    FLOAT = SortType.new("float", lambda{|str| str.to_f})

    # Sort type reported by comparators built with
    # SortComparator#new_comparator, which returns this constant from
    # sort_type.
    CUSTOM = SortType.new("custom")
  end

  attr_reader :name, :sort_type, :comparator

  # True if the natural order of this field is reversed.
  def reverse?
    return @reverse
  end

  # Creates a SortField which specifies which field the data is sorted on
  # and how that field is sorted. See SortType.
  #
  # name:: Name of field to sort by. Can be +nil+ if +sort_type+ is SCORE or
  #        DOC.
  #
  # A hash with the following values can also be supplied;
  # sort_type::  Type of values in the terms. Defaults to SortType::AUTO.
  # reverse::    True if natural order should be reversed.
  # comparator:: a proc used to compare two values from the index. You can
  #              also give this value to the SortType object that you pass.
  #
  # Raises ArgumentError when no name is given for a sort type other than
  # SCORE or DOC.
  def initialize(name = nil, args= {})
    @name = name
    @sort_type = args[:sort_type]||SortType::AUTO
    @reverse = args[:reverse]||false
    @comparator = args[:comparator]||@sort_type.comparator
    if (@name == nil and @sort_type != SortType::DOC and
        @sort_type != SortType::SCORE)
      raise ArgumentError, "You must supply a field name for your sort field"
    end
  end

  # Represents sorting by document score (relevancy).
  FIELD_SCORE = SortField.new(nil, {:sort_type => SortType::SCORE})

  # Represents sorting by document number (order).
  FIELD_DOC = SortField.new(nil, {:sort_type => SortType::DOC})

  # Returns the quoted field name, or the sort type in angle brackets when
  # there is no name, followed by "!" if the order is reversed.
  def to_s()
    buffer = '"' + (@name||"<#{@sort_type}>") + '"'
    buffer << '!' if @reverse
    return buffer
  end
end
87
+ end
@@ -0,0 +1,12 @@
1
+ $:.unshift File.dirname(__FILE__)
2
+
3
+ require 'spans/spans_enum.rb'
4
+ require 'spans/near_spans_enum.rb'
5
+ require 'spans/span_query.rb'
6
+ require 'spans/span_first_query.rb'
7
+ require 'spans/span_near_query.rb'
8
+ require 'spans/span_not_query.rb'
9
+ require 'spans/span_or_query.rb'
10
+ require 'spans/span_scorer.rb'
11
+ require 'spans/span_term_query.rb'
12
+ require 'spans/span_weight.rb'
@@ -0,0 +1,304 @@
1
+ module Ferret::Search::Spans
2
+ class NearSpansEnum < SpansEnum
3
+
4
+ class CellQueue < Ferret::Utils::PriorityQueue
5
# Orders two cells by document, then start, then finish. When all three
# are equal, the cell with the larger index is considered the lesser.
def less_than(o1, o2)
  return o1.doc < o2.doc unless o1.doc == o2.doc
  return o1.start < o2.start unless o1.start == o2.start
  return o1.finish < o2.finish unless o1.finish == o2.finish
  o1.index > o2.index
end
20
+ end
21
+
22
+
23
+ # Wraps a SpansEnum, and can be used to form a linked list.
24
class SpansCell < SpansEnum
  attr_accessor :next, :index

  # parent:: object whose +total_length+ and +max+ this cell keeps up to
  #          date as it moves.
  # spans::  the wrapped spans enumeration.
  # index::  caller-supplied index of this cell.
  def initialize(parent, spans, index)
    @parent = parent
    @spans = spans
    @index = index
    @length = -1                              # -1 until first positioned
  end

  # Moves the wrapped spans to the next span, updating the parent's
  # bookkeeping. Returns whatever the wrapped +next?+ returned.
  def next?()
    move_and_track { @spans.next? }           # move to next
  end

  # Skips the wrapped spans to +target+, updating the parent's
  # bookkeeping. Returns whatever the wrapped +skip_to+ returned.
  def skip_to(target)
    move_and_track { @spans.skip_to(target) } # skip
  end

  def doc() return @spans.doc() end
  def start() return @spans.start() end
  def finish() return @spans.finish() end

  def to_s() return "#{@spans}##{@index}" end

  private

  # Shared bookkeeping for next? and skip_to. Removes this cell's previous
  # length from the parent's total, performs the move given as a block and,
  # if the move succeeded, adds the new length and makes this cell the
  # parent's max when it is in a later document or finishes later in the
  # same document.
  def move_and_track
    if (@length != -1)                        # subtract old length
      @parent.total_length -= @length
    end

    more = yield

    if more
      @length = finish() - start()            # compute new length
      @parent.total_length += @length         # add new length to total

      current_max = @parent.max
      if (current_max.nil? or doc() > current_max.doc or # maintain max
          (doc() == current_max.doc and finish() > current_max.finish))
        @parent.max = self
      end
    end

    return more
  end
end
80
+
81
# Sum of the current cell lengths and the current max cell; both are
# updated by the cells themselves.
attr_accessor :total_length, :max

# Builds one SpansCell per clause of +query+, each wrapping the spans
# that clause yields for +reader+.
#
# query::  supplies +slop+, +in_order?+ and +clauses+.
# reader:: passed to each clause's +spans+ call.
def initialize(query, reader)
  @query = query
  @slop = query.slop
  @in_order = query.in_order?

  @ordered = []                 # spans in query order
  @first = nil                  # linked list of spans
  @last = nil                   # sorted by doc only
  @total_length = 0             # sum of current lengths
  @max = nil                    # max element in queue
  @more = true                  # true iff not done
  @first_time = true            # true before first next?

  clauses = query.clauses
  @queue = CellQueue.new(clauses.length)  # sorted queue of spans
  clauses.each_with_index do |clause, i|
    # construct clause spans and add to ordered
    @ordered << SpansCell.new(self, clause.spans(reader), i)
  end
end
110
+
111
# Advances to the next match. Returns true when positioned on a match and
# false once the spans are exhausted.
def next?()
  if @first_time
    init_list(true)
    list_to_queue()                   # initialize queue
    @first_time = false
  elsif @more
    @more = min().next?               # trigger further scanning
    @queue.adjust_top() if @more      # maintain queue
  end

  while @more
    queue_stale = false

    unless min().doc == @max.doc      # maintain list
      queue_to_list()
      queue_stale = true
    end

    # skip to doc w/ all clauses
    while @more and @first.doc < @last.doc
      @more = @first.skip_to(@last.doc)  # skip first upto last
      first_to_last()                    # and move it to the end
      queue_stale = true
    end

    return false unless @more

    # found doc w/ all clauses
    list_to_queue() if queue_stale    # maintain the queue

    return true if at_match?

    # trigger further scanning
    if @in_order and check_slop?()
      # There is a non ordered match within slop and an ordered match is needed.
      @more = first_non_ordered_next_to_partial_list()
      partial_list_to_queue() if @more
    else
      @more = min().next?()
      @queue.adjust_top() if @more    # maintain queue
    end
  end

  false                               # no more matches
end
164
+
165
# Yields every cell of the linked list, starting at @first and following
# each cell's +next+ link until it is nil.
def each()
  cell = @first
  until cell.nil?
    yield cell
    cell = cell.next
  end
end
172
+
173
# Skips to +target+ and reports whether a match was found there or, by
# falling back to next?, after it.
def skip_to(target)
  if @first_time                      # initialize
    init_list(false)
    each() do |cell|
      @more = cell.skip_to(target)    # skip all
      break unless @more
    end

    list_to_queue() if @more
    @first_time = false
  else                                # normal case
    while @more and min().doc < target  # skip as needed
      @more = min().skip_to(target)
      @queue.adjust_top() if @more
    end
  end

  return false unless @more
  return true if at_match?()          # at a match?
  next?                               # no, scan
end
200
+
201
# The cell currently at the top of the queue.
def min()
  @queue.top()
end

# Document of the top-of-queue cell.
def doc()
  min().doc()
end

# Start of the top-of-queue cell.
def start()
  min().start()
end

# Finish of the current max cell.
def finish()
  @max.finish()
end
206
+
207
+
208
# Describes the enumeration as "spans(<query>)@" followed by "START" before
# the first move, "<doc>:<start>-<finish>" while the queue is non-empty,
# and "END" otherwise.
def to_s()
  prefix = "spans(#{@query})@"
  return prefix + "START" if @first_time
  return prefix + "END" unless @queue.size>0
  prefix + "#{doc}:#{start()}-#{finish}"
end
217
+
218
# Appends the cells of @ordered to the linked list, stopping at the first
# point where @more is false.
#
# nxt:: when true, each cell is advanced with +next?+ first and the result
#       is stored in @more.
def init_list(nxt)
  @ordered.each do |cell|
    @more = cell.next? if nxt
    break unless @more
    add_to_list(cell)                 # add to list
  end
end
228
+
229
# Appends +cell+ to the end of the linked list; it becomes @first as well
# when the list is empty.
def add_to_list(cell)
  if @last.nil?
    @first = cell
  else
    @last.next = cell                 # add next to end of list
  end
  @last = cell
  cell.next = nil
end
238
+
239
# Moves the cell at the head of the linked list to its tail.
def first_to_last()
  head = @first
  @last.next = head                   # move first to end of list
  @last = head
  @first = head.next
  head.next = nil
end
245
+
246
# Empties the queue into a freshly reset linked list, in pop order.
def queue_to_list()
  @first = nil
  @last = nil
  add_to_list(@queue.pop()) until @queue.top().nil?
end
252
+
253
# Creates a partial list consisting of first non ordered and earlier.
# Returns first non ordered .next?.
#
# Cells are popped from the queue and appended to a freshly reset list for
# as long as their +index+ values count up from 0. The first cell that
# breaks that sequence is appended as well, advanced with +next?+, and the
# result of that call is returned; the remaining cells stay in the queue.
#
# Raises RuntimeError if the queue is drained without meeting an out of
# order cell.
def first_non_ordered_next_to_partial_list()
  @last = @first = nil
  ordered_index = 0
  while (@queue.top() != nil)
    cell = @queue.pop()
    add_to_list(cell)
    if (cell.index == ordered_index)
      ordered_index += 1
    else
      # FIXME: continue here, rename to eg. checkOrderedMatch():
      # when check_slop?() and not ordered, repeat cell.next?().
      # when check_slop?() and ordered, add to list and repeat queue.pop()
      # without check_slop?(): no match, rebuild the queue from the partial list.
      # When queue is empty and check_slop?() and ordered there is a match.
      return cell.next?()
    end
  end
  # RuntimeException is not a Ruby constant; naming it raised a NameError
  # instead of the intended error.
  raise RuntimeError, "Unexpected: ordered"
end
274
+
275
# Rebuilds the queue from scratch out of the cells in the linked list.
def list_to_queue()
  @queue.clear()
  partial_list_to_queue()
end

# Pushes every cell of the linked list onto the queue without clearing it
# first.
def partial_list_to_queue()
  each() do |cell|
    @queue.push(cell)                 # add to queue from list
  end
end
283
+
284
# True when the top-of-queue cell and the max cell are in the same document,
# the slop check passes and, if an ordered match is required, the cells are
# in order.
def at_match?()
  return false unless min().doc() == @max.doc()
  return false unless check_slop?()
  !@in_order || match_is_ordered?()
end
288
+
289
# True when the distance from the top-of-queue cell's start to the max
# cell's finish, less the summed cell lengths, does not exceed @slop.
def check_slop?()
  gap = (@max.finish() - min.start()) - @total_length
  gap <= @slop
end
293
+
294
# True when the start of each cell in @ordered is strictly greater than the
# start of the cell before it.
def match_is_ordered?()
  previous_start = -1
  @ordered.each do |cell|
    current_start = cell.start
    return false if current_start <= previous_start
    previous_start = current_start
  end
  true
end
303
+ end
304
+ end