ferret 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202)
  1. data/MIT-LICENSE +20 -0
  2. data/README +109 -0
  3. data/Rakefile +275 -0
  4. data/TODO +9 -0
  5. data/TUTORIAL +197 -0
  6. data/ext/extconf.rb +3 -0
  7. data/ext/ferret.c +23 -0
  8. data/ext/ferret.h +85 -0
  9. data/ext/index_io.c +543 -0
  10. data/ext/priority_queue.c +227 -0
  11. data/ext/ram_directory.c +316 -0
  12. data/ext/segment_merge_queue.c +41 -0
  13. data/ext/string_helper.c +42 -0
  14. data/ext/tags +240 -0
  15. data/ext/term.c +261 -0
  16. data/ext/term_buffer.c +299 -0
  17. data/ext/util.c +12 -0
  18. data/lib/ferret.rb +41 -0
  19. data/lib/ferret/analysis.rb +11 -0
  20. data/lib/ferret/analysis/analyzers.rb +93 -0
  21. data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
  22. data/lib/ferret/analysis/token.rb +79 -0
  23. data/lib/ferret/analysis/token_filters.rb +86 -0
  24. data/lib/ferret/analysis/token_stream.rb +26 -0
  25. data/lib/ferret/analysis/tokenizers.rb +107 -0
  26. data/lib/ferret/analysis/word_list_loader.rb +27 -0
  27. data/lib/ferret/document.rb +2 -0
  28. data/lib/ferret/document/document.rb +152 -0
  29. data/lib/ferret/document/field.rb +304 -0
  30. data/lib/ferret/index.rb +26 -0
  31. data/lib/ferret/index/compound_file_io.rb +343 -0
  32. data/lib/ferret/index/document_writer.rb +288 -0
  33. data/lib/ferret/index/field_infos.rb +259 -0
  34. data/lib/ferret/index/fields_io.rb +175 -0
  35. data/lib/ferret/index/index.rb +228 -0
  36. data/lib/ferret/index/index_file_names.rb +33 -0
  37. data/lib/ferret/index/index_reader.rb +462 -0
  38. data/lib/ferret/index/index_writer.rb +488 -0
  39. data/lib/ferret/index/multi_reader.rb +363 -0
  40. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
  41. data/lib/ferret/index/segment_infos.rb +130 -0
  42. data/lib/ferret/index/segment_merge_info.rb +47 -0
  43. data/lib/ferret/index/segment_merge_queue.rb +16 -0
  44. data/lib/ferret/index/segment_merger.rb +337 -0
  45. data/lib/ferret/index/segment_reader.rb +380 -0
  46. data/lib/ferret/index/segment_term_enum.rb +178 -0
  47. data/lib/ferret/index/segment_term_vector.rb +58 -0
  48. data/lib/ferret/index/term.rb +49 -0
  49. data/lib/ferret/index/term_buffer.rb +88 -0
  50. data/lib/ferret/index/term_doc_enum.rb +283 -0
  51. data/lib/ferret/index/term_enum.rb +52 -0
  52. data/lib/ferret/index/term_info.rb +41 -0
  53. data/lib/ferret/index/term_infos_io.rb +312 -0
  54. data/lib/ferret/index/term_vector_offset_info.rb +20 -0
  55. data/lib/ferret/index/term_vectors_io.rb +552 -0
  56. data/lib/ferret/query_parser.rb +274 -0
  57. data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
  58. data/lib/ferret/search.rb +49 -0
  59. data/lib/ferret/search/boolean_clause.rb +100 -0
  60. data/lib/ferret/search/boolean_query.rb +303 -0
  61. data/lib/ferret/search/boolean_scorer.rb +294 -0
  62. data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
  63. data/lib/ferret/search/conjunction_scorer.rb +99 -0
  64. data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
  65. data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
  66. data/lib/ferret/search/explanation.rb +41 -0
  67. data/lib/ferret/search/field_cache.rb +216 -0
  68. data/lib/ferret/search/field_doc.rb +31 -0
  69. data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
  70. data/lib/ferret/search/filter.rb +11 -0
  71. data/lib/ferret/search/filtered_query.rb +130 -0
  72. data/lib/ferret/search/filtered_term_enum.rb +79 -0
  73. data/lib/ferret/search/fuzzy_query.rb +153 -0
  74. data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
  75. data/lib/ferret/search/hit_collector.rb +34 -0
  76. data/lib/ferret/search/hit_queue.rb +11 -0
  77. data/lib/ferret/search/index_searcher.rb +173 -0
  78. data/lib/ferret/search/match_all_docs_query.rb +104 -0
  79. data/lib/ferret/search/multi_phrase_query.rb +204 -0
  80. data/lib/ferret/search/multi_term_query.rb +65 -0
  81. data/lib/ferret/search/non_matching_scorer.rb +22 -0
  82. data/lib/ferret/search/phrase_positions.rb +55 -0
  83. data/lib/ferret/search/phrase_query.rb +217 -0
  84. data/lib/ferret/search/phrase_scorer.rb +153 -0
  85. data/lib/ferret/search/prefix_query.rb +47 -0
  86. data/lib/ferret/search/query.rb +111 -0
  87. data/lib/ferret/search/query_filter.rb +51 -0
  88. data/lib/ferret/search/range_filter.rb +103 -0
  89. data/lib/ferret/search/range_query.rb +139 -0
  90. data/lib/ferret/search/req_excl_scorer.rb +125 -0
  91. data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
  92. data/lib/ferret/search/score_doc.rb +38 -0
  93. data/lib/ferret/search/score_doc_comparator.rb +114 -0
  94. data/lib/ferret/search/scorer.rb +91 -0
  95. data/lib/ferret/search/similarity.rb +278 -0
  96. data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
  97. data/lib/ferret/search/sort.rb +105 -0
  98. data/lib/ferret/search/sort_comparator.rb +60 -0
  99. data/lib/ferret/search/sort_field.rb +87 -0
  100. data/lib/ferret/search/spans.rb +12 -0
  101. data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
  102. data/lib/ferret/search/spans/span_first_query.rb +79 -0
  103. data/lib/ferret/search/spans/span_near_query.rb +108 -0
  104. data/lib/ferret/search/spans/span_not_query.rb +130 -0
  105. data/lib/ferret/search/spans/span_or_query.rb +176 -0
  106. data/lib/ferret/search/spans/span_query.rb +25 -0
  107. data/lib/ferret/search/spans/span_scorer.rb +74 -0
  108. data/lib/ferret/search/spans/span_term_query.rb +105 -0
  109. data/lib/ferret/search/spans/span_weight.rb +84 -0
  110. data/lib/ferret/search/spans/spans_enum.rb +44 -0
  111. data/lib/ferret/search/term_query.rb +128 -0
  112. data/lib/ferret/search/term_scorer.rb +181 -0
  113. data/lib/ferret/search/top_docs.rb +24 -0
  114. data/lib/ferret/search/top_field_docs.rb +17 -0
  115. data/lib/ferret/search/weight.rb +54 -0
  116. data/lib/ferret/search/wildcard_query.rb +26 -0
  117. data/lib/ferret/search/wildcard_term_enum.rb +61 -0
  118. data/lib/ferret/stemmers.rb +1 -0
  119. data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
  120. data/lib/ferret/store.rb +5 -0
  121. data/lib/ferret/store/buffered_index_io.rb +191 -0
  122. data/lib/ferret/store/directory.rb +139 -0
  123. data/lib/ferret/store/fs_store.rb +338 -0
  124. data/lib/ferret/store/index_io.rb +259 -0
  125. data/lib/ferret/store/ram_store.rb +282 -0
  126. data/lib/ferret/utils.rb +7 -0
  127. data/lib/ferret/utils/bit_vector.rb +105 -0
  128. data/lib/ferret/utils/date_tools.rb +138 -0
  129. data/lib/ferret/utils/number_tools.rb +91 -0
  130. data/lib/ferret/utils/parameter.rb +41 -0
  131. data/lib/ferret/utils/priority_queue.rb +120 -0
  132. data/lib/ferret/utils/string_helper.rb +47 -0
  133. data/lib/ferret/utils/weak_key_hash.rb +51 -0
  134. data/rake_utils/code_statistics.rb +106 -0
  135. data/setup.rb +1551 -0
  136. data/test/benchmark/tb_ram_store.rb +76 -0
  137. data/test/benchmark/tb_rw_vint.rb +26 -0
  138. data/test/longrunning/tc_numbertools.rb +60 -0
  139. data/test/longrunning/tm_store.rb +19 -0
  140. data/test/test_all.rb +9 -0
  141. data/test/test_helper.rb +6 -0
  142. data/test/unit/analysis/tc_analyzer.rb +21 -0
  143. data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
  144. data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
  145. data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
  146. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
  147. data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
  148. data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
  149. data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
  150. data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
  151. data/test/unit/analysis/tc_stop_filter.rb +14 -0
  152. data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
  153. data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
  154. data/test/unit/analysis/tc_word_list_loader.rb +32 -0
  155. data/test/unit/document/tc_document.rb +47 -0
  156. data/test/unit/document/tc_field.rb +80 -0
  157. data/test/unit/index/tc_compound_file_io.rb +107 -0
  158. data/test/unit/index/tc_field_infos.rb +119 -0
  159. data/test/unit/index/tc_fields_io.rb +167 -0
  160. data/test/unit/index/tc_index.rb +140 -0
  161. data/test/unit/index/tc_index_reader.rb +622 -0
  162. data/test/unit/index/tc_index_writer.rb +57 -0
  163. data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
  164. data/test/unit/index/tc_segment_infos.rb +74 -0
  165. data/test/unit/index/tc_segment_term_docs.rb +17 -0
  166. data/test/unit/index/tc_segment_term_enum.rb +60 -0
  167. data/test/unit/index/tc_segment_term_vector.rb +71 -0
  168. data/test/unit/index/tc_term.rb +22 -0
  169. data/test/unit/index/tc_term_buffer.rb +57 -0
  170. data/test/unit/index/tc_term_info.rb +19 -0
  171. data/test/unit/index/tc_term_infos_io.rb +192 -0
  172. data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
  173. data/test/unit/index/tc_term_vectors_io.rb +108 -0
  174. data/test/unit/index/th_doc.rb +244 -0
  175. data/test/unit/query_parser/tc_query_parser.rb +84 -0
  176. data/test/unit/search/tc_filter.rb +113 -0
  177. data/test/unit/search/tc_fuzzy_query.rb +136 -0
  178. data/test/unit/search/tc_index_searcher.rb +188 -0
  179. data/test/unit/search/tc_search_and_sort.rb +98 -0
  180. data/test/unit/search/tc_similarity.rb +37 -0
  181. data/test/unit/search/tc_sort.rb +48 -0
  182. data/test/unit/search/tc_sort_field.rb +27 -0
  183. data/test/unit/search/tc_spans.rb +153 -0
  184. data/test/unit/store/tc_fs_store.rb +84 -0
  185. data/test/unit/store/tc_ram_store.rb +35 -0
  186. data/test/unit/store/tm_store.rb +180 -0
  187. data/test/unit/store/tm_store_lock.rb +68 -0
  188. data/test/unit/ts_analysis.rb +16 -0
  189. data/test/unit/ts_document.rb +4 -0
  190. data/test/unit/ts_index.rb +18 -0
  191. data/test/unit/ts_query_parser.rb +3 -0
  192. data/test/unit/ts_search.rb +10 -0
  193. data/test/unit/ts_store.rb +6 -0
  194. data/test/unit/ts_utils.rb +10 -0
  195. data/test/unit/utils/tc_bit_vector.rb +65 -0
  196. data/test/unit/utils/tc_date_tools.rb +50 -0
  197. data/test/unit/utils/tc_number_tools.rb +59 -0
  198. data/test/unit/utils/tc_parameter.rb +40 -0
  199. data/test/unit/utils/tc_priority_queue.rb +62 -0
  200. data/test/unit/utils/tc_string_helper.rb +21 -0
  201. data/test/unit/utils/tc_weak_key_hash.rb +25 -0
  202. metadata +251 -0
module Ferret::Search
  # Expert: A +Scorer+ for documents matching a single +Term+.
  #
  # Matching documents are pulled from the underlying TermDocEnum in
  # batches of 32 (see +@docs+/+@freqs+), and tf(f)*weight is precomputed
  # for small term frequencies in +@score_cache+ so the common case avoids
  # a Similarity#tf call per hit.
  class TermScorer < Scorer
    # Term frequencies below this bound have their tf()*weight precomputed.
    SCORE_CACHE_SIZE = 32

    # Returns the current document number matching the query.
    # Initially invalid, until #next?() is called the first time.
    attr_reader :doc

    # Construct a +TermScorer+.
    #
    # weight::     The weight of the +Term+ in the query.
    # td::         An iterator over the documents matching the +Term+.
    # similarity:: The +Similarity+ implementation to be used for score
    #              computations.
    # norms::      The field norms of the document fields for the +Term+.
    def initialize(weight, td, similarity, norms)
      super(similarity)

      @doc = 0
      @docs = Array.new(32, 0)  # buffered doc numbers
      @freqs = Array.new(32, 0) # buffered term freqs
      @pointer = @pointer_max = 0
      @score_cache = Array.new(SCORE_CACHE_SIZE)

      @weight = weight
      @term_docs = td
      @norms = norms
      @weight_value = weight.value

      SCORE_CACHE_SIZE.times do |i|
        @score_cache[i] = similarity().tf(i) * @weight_value
      end
    end

    # Expert: Iterates over all matching documents, yielding the document
    # number and the score.
    #
    # returns:: true if more matching documents may remain.
    def each_hit() # :yields: doc, score
      sim = similarity() # cache sim in local
      while next?
        f = @freqs[@pointer]

        # compute tf(f)*weight, using the precomputed cache when possible
        if f < SCORE_CACHE_SIZE
          score = @score_cache[f]            # cache hit
        else
          score = sim.tf(f) * @weight_value  # cache miss
        end

        score *= sim.decode_norm(@norms[@doc]) # normalize for field

        yield(@doc, score) # collect score
      end
    end

    # Expert: Iterates over matching documents in a range.
    #
    # NOTE: that #next? needs to be called first.
    #
    # max:: Do not score documents past this. Default will search all
    #       documents available.
    # returns:: true if more matching documents may remain.
    def each_hit_up_to(max = MAX_DOCS) # :yields: doc, score
      sim = similarity() # cache sim in local
      while (@doc < max) # for docs in window
        f = @freqs[@pointer]

        # compute tf(f)*weight, using the precomputed cache when possible
        if f < SCORE_CACHE_SIZE
          score = @score_cache[f]            # cache hit
        else
          score = sim.tf(f) * @weight_value  # cache miss
        end

        score *= sim.decode_norm(@norms[@doc]) # normalize for field

        yield(@doc, score) # collect score
        if not next?
          return false
        end
      end
      return true # false if we didn't find +max+ hits
    end

    # Advances to the next document matching the query.
    #
    # The iterator over the matching documents is buffered using
    # TermDocEnum#read(docs, freqs).
    #
    # returns:: true iff there is another document matching the query.
    def next?()
      @pointer += 1
      if @pointer >= @pointer_max
        @pointer_max = @term_docs.read(@docs, @freqs) # refill buffer
        if @pointer_max != 0
          @pointer = 0
        else
          @term_docs.close() # close stream
          @doc = MAX_DOCS    # set to sentinel value
          return false
        end
      end
      @doc = @docs[@pointer]
      return true
    end

    # Returns the score of the current document (see #doc).
    def score()
      f = @freqs[@pointer]
      # compute tf(f)*weight, using the precomputed cache when possible
      if f < SCORE_CACHE_SIZE
        raw = @score_cache[f]                      # cache hit
      else
        raw = similarity().tf(f) * @weight_value   # cache miss
      end

      # NOTE(review): this uses the class-level Similarity.decode_norm while
      # #each_hit uses the instance method -- confirm both decode norms the
      # same way.
      return raw * Similarity.decode_norm(@norms[@doc]) # normalize for field
    end

    # Skips to the first match beyond the current whose document number is
    # greater than or equal to a given target.
    #
    # The implementation uses TermDocEnum#skip_to(target).
    # target:: The target document number.
    # returns:: true iff there is such a match.
    def skip_to(target)
      # first scan in cache
      while (@pointer += 1) < @pointer_max
        if @docs[@pointer] >= target
          @doc = @docs[@pointer]
          return true
        end
      end

      # not found in cache, seek underlying stream
      result = @term_docs.skip_to(target)
      if (result)
        @pointer_max = 1
        @pointer = 0
        @docs[@pointer] = @doc = @term_docs.doc
        @freqs[@pointer] = @term_docs.freq
      else
        @doc = MAX_DOCS
      end
      return result
    end

    # Returns an explanation of the score for a document.
    #
    # When this method is used, the #next? method and the #score method
    # should not be used.
    #
    # doc:: The document number for the explanation.
    # TODO: Modify to make use of TermDocEnum#skip_to(target).
    def explain(doc)
      query = @weight.query()
      tf_explanation = Explanation.new()
      tf = 0
      # look for the doc in the already-buffered window first
      while (@pointer < @pointer_max)
        if (@docs[@pointer] == doc)
          tf = @freqs[@pointer]
        end
        @pointer += 1
      end
      # fall back to scanning the remainder of the stream
      if (tf == 0)
        while (@term_docs.next?)
          if (@term_docs.doc() == doc)
            tf = @term_docs.freq()
          end
        end
      end
      @term_docs.close()
      tf_explanation.value = similarity().tf(tf)
      tf_explanation.description = "tf(term_freq(#{query.term})=#{tf})"

      return tf_explanation
    end

    # Returns a string representation of this +TermScorer+.
    # Fixed: the original concatenated a String with the Weight object
    # ("scorer(" + @weight + ")"), which raises TypeError; interpolation
    # calls @weight.to_s instead.
    def to_s() return "scorer(#{@weight})" end
  end
end
module Ferret
  module Search
    # Expert: Returned by low-level search implementations.
    # See Searcher#search
    class TopDocs
      # score_docs:: the top hits themselves
      # total_hits:: the total number of hits for the query (see Hits#length)
      # fields::     the sort criteria the hits are ordered by
      attr_accessor :score_docs, :total_hits, :fields

      # Expert: Constructs a TopDocs.
      #
      # total_hits:: Total number of hits for the query.
      # score_docs:: The top hits for the query.
      # fields::     The sort criteria used; defaults to sorting by score.
      def initialize(total_hits, score_docs, fields = SortField::FIELD_SCORE)
        @total_hits = total_hits
        @score_docs = score_docs
        @fields = fields
      end

      # Returns a human-readable summary of the hits and their sort order.
      # Fixed: the original interpolated the (nil) instance variable @field
      # inside the map block instead of the block parameter, so the sort
      # field names were always rendered as empty strings.
      def to_s
        buffer = "#{total_hits} hits sorted by <"
        buffer << [fields].flatten.map { |field| field.to_s }.join(", ")
        buffer << ">:\n"
        score_docs.each { |sd| buffer << "\t#{sd}\n" }
        return buffer
      end
    end
  end
end
module Ferret::Search
  # Expert: the result of a low-level sorted search -- the top hits plus
  # the criteria that produced their order.
  class TopFieldDocs < TopDocs

    # The fields which were used to sort the results.
    attr_accessor :fields

    # Builds a sorted result set.
    #
    # total_hits:: Total number of hits for the query.
    # score_docs:: The top hits for the query.
    # fields::     The sort criteria used to find the top hits.
    def initialize(total_hits, score_docs, fields)
      # NOTE: super must run first -- TopDocs#initialize also assigns
      # @fields (to its default), which we then overwrite here.
      super(total_hits, score_docs)
      @fields = fields
    end
  end
end
module Ferret
  module Search
    # Expert: Calculate query weights and build query scorers.
    #
    # A Weight exists so that searching never mutates a Query, allowing a
    # single Query instance to be reused across searches.
    #
    # State that depends on the Searcher belongs in the Weight; state that
    # depends on the IndexReader belongs in the Scorer.
    #
    # Lifecycle of a +Weight+:
    #
    # 1. The top-level query constructs it, given a +Searcher+
    #    (see Query#create_weight).
    # 2. #sum_of_squared_weights() is called to compute the query
    #    normalization factor (Similarity#query_norm) over the query's
    #    clauses.
    # 3. That factor is handed to #normalize(), completing the weighting.
    # 4. #scorer() builds the +Scorer+.
    #
    # This base class only defines the contract; every method must be
    # overridden by subclasses.
    class Weight
      # The query this weight concerns.
      def query
        raise NotImplementedError
      end

      # The weight value for this query.
      def value
        raise NotImplementedError
      end

      # The sum of squared weights of the query clauses contained in the
      # query.
      def sum_of_squared_weights
        raise NotImplementedError
      end

      # Assigns the query normalization factor +norm+ to this weight.
      def normalize(norm)
        raise NotImplementedError
      end

      # Constructs a scorer for this weight over the given +reader+.
      def scorer(reader)
        raise NotImplementedError
      end

      # An explanation of the score computation for the named document.
      def explain(reader, doc)
        raise NotImplementedError
      end
    end
  end
end
module Ferret::Search
  # Implements the wildcard search query. Two wildcards are supported:
  # +*+ matches any character sequence (including the empty one) and +?+
  # matches any single character. Because it has to enumerate many terms,
  # this query can be slow; to avoid pathological cases, a wildcard term
  # should not begin with +*+ or +?+.
  #
  # See WildcardTermEnum
  class WildcardQuery < MultiTermQuery
    def initialize(term)
      super(term)
    end

    # Builds the term enumeration that walks every term matching the
    # wildcard pattern in +reader+.
    def get_term_enum(reader)
      WildcardTermEnum.new(reader, @term)
    end

    # Equal only to another WildcardQuery that compares equal under the
    # superclass definition.
    def eql?(o)
      o.instance_of?(WildcardQuery) ? super(o) : false
    end
  end
end
module Ferret::Search
  # Subclass of FilteredTermEnum for enumerating all terms that match the
  # specified wildcard filter term.
  #
  # Term enumerations are always ordered by Term.compareTo(). Each term in
  # the enumeration is greater than all that precede it.
  class WildcardTermEnum < FilteredTermEnum
    include Ferret::Index

    # True once the enumeration has moved past the last possible match.
    attr_reader :end_enum

    WILDCARD_STRING = '*'
    WILDCARD_CHAR = '?'

    # Creates a new +WildcardTermEnum+. Passing in a Term that does not
    # contain a wildcard character will cause an exception to be raised.
    #
    # After calling the constructor the enumeration is already pointing to
    # the first valid term if such a term exists.
    def initialize(reader, term)
      super()
      @end_enum = false
      @search_term = term
      @field = @search_term.field
      text = @search_term.text
      len = text.length

      # Position of the first wildcard in the term text (or the full
      # length if there is none). Everything before it is a literal
      # prefix that the underlying enum can seek to directly.
      sidx = text.index(WILDCARD_STRING)||len
      cidx = text.index(WILDCARD_CHAR)||len
      idx = [sidx, cidx].min

      @pre = @search_term.text[0,idx]
      @pre_len = idx
      # Compile the wildcard tail into a Regexp: escape everything, then
      # turn the escaped wildcards back into regex operators
      # ('*' -> '.*', '?' -> '.?').
      # NOTE(review): '?' becomes '.?' (zero or one char) rather than '.'
      # (exactly one char) -- confirm this looser match is intended.
      @pattern = /^#{Regexp.escape(text[idx..-1]).gsub(/\\([?*])/){".#{$1}"}}$/
      # Seek the wrapped enum to the first term carrying the literal prefix.
      self.enum = reader.terms_from(Term.new(@search_term.field, @pre))
    end

    # Decides whether +term+ still matches the wildcard pattern.
    # Returns a truthy value (the match index from =~, or nil) while the
    # term shares the field and literal prefix; once either stops holding,
    # sets @end_enum so the enumeration terminates.
    def term_compare(term)
      if (@field == term.field)
        search_text = term.text
        if (search_text[0, @pre_len] == @pre)
          return (search_text[@pre_len..-1] =~ @pattern)
        end
      end
      @end_enum = true
      return false
    end

    # All wildcard matches are weighted equally.
    def difference()
      return 1.0
    end

    # Closes the wrapped enum and releases the compiled pattern.
    def close()
      super()
      @pattern = nil
      @field = nil
    end
  end
end
@@ -0,0 +1 @@
1
+ require 'ferret/stemmers/porter_stemmer'
#!/usr/bin/env ruby
#
# $Id: PorterStemmer.rb,v 1.1.1.1 2004/04/17 13:55:20 pragdave Exp $
#
# See example usage at the end of this file.
#

# Mixin implementing the Porter stemming algorithm. Include it into a
# string-like class (anything responding to +to_str+) to gain a #stem
# method, or call Stemmable.stem_porter("word") directly.
module Stemmable

  # Process-wide memoization cache mapping an input word to its stem.
  # NOTE(review): stem_porter mutates +w+ in place (w[0]=) while
  # +original_word+ aliases the same object, so for words starting with
  # 'y' the cached key keeps the internal 'Y' marker (e.g. "Yes" instead
  # of "yes") and those entries never hit -- verify this is intended.
  STEMMED = {}

  # Step 2 suffix replacements (applied when the stem's measure m > 0).
  STEP_2_LIST = {
    'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
    'izer'=>'ize', 'bli'=>'ble',
    'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
    'ization'=>'ize', 'ation'=>'ate',
    'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
    'ousness'=>'ous', 'aliti'=>'al',
    'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'
  }

  # Step 3 suffix replacements (applied when the stem's measure m > 0).
  STEP_3_LIST = {
    'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
    'ical'=>'ic', 'ful'=>'', 'ness'=>''
  }

  # Suffixes recognized by step 2; the captured suffix is looked up in
  # STEP_2_LIST. (/x: whitespace in the pattern is ignored.)
  SUFFIX_1_REGEXP = /(
                      ational |
                      tional  |
                      enci    |
                      anci    |
                      izer    |
                      bli     |
                      alli    |
                      entli   |
                      eli     |
                      ousli   |
                      ization |
                      ation   |
                      ator    |
                      alism   |
                      iveness |
                      fulness |
                      ousness |
                      aliti   |
                      iviti   |
                      biliti  |
                      logi)$/x

  # Suffixes stripped outright by step 4 when the stem's measure m > 1.
  SUFFIX_2_REGEXP = /(
                      al    |
                      ance  |
                      ence  |
                      er    |
                      ic    |
                      able  |
                      ible  |
                      ant   |
                      ement |
                      ment  |
                      ent   |
                      ou    |
                      ism   |
                      ate   |
                      iti   |
                      ous   |
                      ive   |
                      ize)$/x

  # Character-class building blocks for the measure (m) patterns below.
  C = "[^aeiou]"             # consonant
  V = "[aeiouy]"             # vowel
  CC = "#{C}(?>[^aeiouy]*)"  # consonant sequence
  VV = "#{V}(?>[aeiou]*)"    # vowel sequence

  MGR0 = /^(#{CC})?#{VV}#{CC}/o                # [cc]vvcc... is m>0
  MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o       # [cc]vvcc[vv] is m=1
  MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o      # [cc]vvccvvcc... is m>1
  VOWEL_IN_STEM = /^(#{CC})?#{V}/o             # vowel in stem

  #
  # Porter stemmer in Ruby.
  #
  # This is the Porter stemming algorithm, ported to Ruby from the
  # version coded up in Perl. It's easy to follow against the rules
  # in the original paper in:
  #
  #   Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
  #   no. 3, pp 130-137,
  #
  # See also http://www.tartarus.org/~martin/PorterStemmer
  #
  # Send comments to raypereda@hotmail.com
  #

  # Returns the Porter stem of +w+ (defaults to the receiver when mixed
  # into a string-like class). Words shorter than 3 characters are
  # returned unchanged; results are memoized in STEMMED.
  def stem_porter(w = self.to_str.dup)

    # make a copy of the given object and convert it to a string.
    # NOTE(review): this is an alias, not a copy -- in-place edits to +w+
    # below also affect +original_word+.
    original_word = w

    return w if w.length < 3

    # return a previously computed stem if we have one cached
    result = STEMMED[w]
    return result if result

    # now map initial y to Y so that the patterns never treat it as vowel
    w[0] = 'Y' if w[0] == ?y

    # Step 1a: strip plural endings (sses -> ss, ies -> i, s -> "")
    if w =~ /(ss|i)es$/
      w = $` + $1
    elsif w =~ /([^s])s$/
      w = $` + $1
    end

    # Step 1b: strip -eed/-ed/-ing and tidy the resulting stem
    if w =~ /eed$/
      w.chop! if $` =~ MGR0
    elsif w =~ /(ed|ing)$/
      stem = $`
      if stem =~ VOWEL_IN_STEM
        w = stem
        case w
        when /(at|bl|iz)$/ then w << "e"
        when /([^aeiouylsz])\1$/ then w.chop!   # undouble final consonant
        when /^#{CC}#{V}[^aeiouwxy]$/o then w << "e"
        end
      end
    end

    # Step 1c: terminal y -> i when there is a vowel in the stem
    if w =~ /y$/
      stem = $`
      w = stem + "i" if stem =~ VOWEL_IN_STEM
    end

    # Step 2: map double suffixes to single ones (m > 0)
    if w =~ SUFFIX_1_REGEXP
      stem = $`
      suffix = $1
      # print "stem= " + stem + "\n" + "suffix=" + suffix + "\n"
      if stem =~ MGR0
        w = stem + STEP_2_LIST[suffix]
      end
    end

    # Step 3: handle -ic-, -full, -ness etc. (m > 0)
    if w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/
      stem = $`
      suffix = $1
      if stem =~ MGR0
        w = stem + STEP_3_LIST[suffix]
      end
    end

    # Step 4: drop remaining standard suffixes entirely (m > 1)
    if w =~ SUFFIX_2_REGEXP
      stem = $`
      if stem =~ MGR1
        w = stem
      end
    elsif w =~ /(s|t)(ion)$/
      stem = $` + $1
      if stem =~ MGR1
        w = stem
      end
    end

    # Step 5: remove a final -e where the measure allows
    if w =~ /e$/
      stem = $`
      if (stem =~ MGR1) ||
         (stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
        w = stem
      end
    end

    # reduce a final double l when m > 1
    if w =~ /ll$/ && w =~ MGR1
      w.chop!
    end

    # and turn initial Y back to y
    w[0] = 'y' if w[0] == ?Y

    # cache the result (see the STEMMED aliasing note above)
    STEMMED[original_word] = w

    w
  end


  module_function :stem_porter
  #
  # make the stem_porter the default stem method, just in case we
  # feel like having multiple stemmers available later.
  #
  alias stem stem_porter
  public :stem

end
201
+
202
+
203
+ #
204
+ # Make this script executable, and send it words on stdin, one per
205
+ # line, and it will output the stemmed versions to stdout.
206
+ #
207
+ if $0 == __FILE__ then
208
+ class String
209
+ include Stemmable
210
+ end
211
+
212
+ # the String class, and any subclasses of it you might have, now know
213
+ # how to stem things.
214
+
215
+ $stdin.each do |word|
216
+ puts word.strip.stem
217
+ end
218
+ end