ferret 0.9.6 → 0.10.0

This diff shows the changes between two publicly released versions of the package as published to the supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
Files changed (295)
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
data/lib/ferret/analysis/token_filters.rb (deleted)
@@ -1,86 +0,0 @@
- module Ferret::Analysis
-   # A TokenFilter is a TokenStream whose input is another token stream.
-   #
-   # This is an abstract class.
-   class TokenFilter < TokenStream
-     # Close the input TokenStream.
-     def close()
-       @input.close()
-     end
-
-     protected
-       # Construct a token stream filtering the given input.
-       def initialize(input)
-         @input = input
-       end
-   end
-
-   # Normalizes token text to lower case.
-   class LowerCaseFilter < TokenFilter
-     def next()
-       t = @input.next()
-
-       if (t == nil)
-         return nil
-       end
-
-       t.text = t.text.downcase()
-
-       return t
-     end
-   end
-
-   # Removes stop words from a token stream. To will need to pass your own
-   # set of stopwords to use this stop filter. If you with to use the default
-   # list of stopwords then use the StopAnalyzer.
-   class StopFilter < TokenFilter
-     # Constructs a filter which removes words from the input
-     # TokenStream that are named in the array of words.
-     def initialize(input, stop_set)
-       super(input);
-       @stop_set = stop_set
-     end
-
-     def StopFilter.new_with_file(input, path)
-       ws = WordListLoader.word_set_from_file(path)
-       return StopFilter.new(input, ws)
-     end
-
-     # Returns the next input Token whose termText() is not a stop word.
-     def next()
-       # return the first non-stop word found
-       while token = @input.next()
-         return token if ! @stop_set.include?(token.text)
-       end
-       return nil
-     end
-   end
-
-   # Transforms the token stream as per the Porter stemming algorithm.
-   # Note: the input to the stemming filter must already be in lower case,
-   # so you will need to use LowerCaseFilter or LowerCaseTokenizer further
-   # down the Tokenizer chain in order for this to work properly!
-   #
-   # To use this filter with other analyzers, you'll want to write an
-   # Analyzer class that sets up the TokenStream chain as you want it.
-   # To use this with LowerCaseTokenizer, for example, you'd write an
-   # analyzer like this:
-   #
-   #   def MyAnalyzer < Analyzer
-   #     def token_stream(field, reader)
-   #       return PorterStemFilter.new(LowerCaseTokenizer.new(reader))
-   #     end
-   #   end
-   class PorterStemFilter < TokenFilter
-     # Returns the next input Token, after being stemmed
-     def next()
-       token = @input.next()
-       if (token == nil)
-         return nil
-       else
-         token.text = Stemmable.stem_porter(token.text)
-       end
-       token
-     end
-   end
- end
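
The filters removed above each wrap another TokenStream, so an analysis chain in 0.9.x was built by nesting constructors. The following is a minimal sketch of that pure-Ruby API, assuming ferret 0.9.6 is installed and using only the classes shown in this diff (the sample text and stop-word list are invented for illustration; 0.10.0 replaces these classes with C implementations in ext/analysis.c and ext/r_analysis.c):

    require 'ferret'
    include Ferret::Analysis

    # Tokenize, lower-case, then drop stop words -- each stage wraps the previous one.
    stops  = WordListLoader.word_set_from_array(["the", "and", "a"])
    stream = StopFilter.new(
               LowerCaseFilter.new(
                 WhiteSpaceTokenizer.new("The Quick Brown Fox and the Lazy Dog")),
               stops)

    # each() comes from the TokenStream base class and repeatedly calls next().
    stream.each { |token| puts token.text }   # => quick, brown, fox, lazy, dog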
data/lib/ferret/analysis/token_stream.rb (deleted)
@@ -1,26 +0,0 @@
- module Ferret::Analysis
-   # A TokenStream enumerates the sequence of tokens, either from
-   # fields of a document or from query text.
-   #
-   # This is an abstract class. Concrete subclasses are:
-   # * Tokenizer, a TokenStream whose input is a Reader; and
-   # * TokenFilter, a TokenStream whose input is another TokenStream.
-   class TokenStream
-     # Returns the next token in the stream, or null at EOS.
-     def next
-       raise NotImplementedError
-     end
-
-     # Releases resources associated with this stream.
-     def close
-       raise NotImplementedError
-     end
-
-     # Iterates through the tokens in the field
-     def each # :yields: token
-       while (n = self.next())
-         yield n
-       end
-     end
-   end
- end
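
The contract of the removed base class is small: a subclass supplies next and close, and each is inherited. A hypothetical one-token stream, sketched against that contract (SingleTokenStream is not part of Ferret; the Token.new(text, start, end) call assumes the signature used by RegExpTokenizer later in this diff):

    require 'ferret'

    # Hypothetical illustration of the 0.9.x contract: implement next/close,
    # inherit each() from TokenStream.
    class SingleTokenStream < Ferret::Analysis::TokenStream
      def initialize(text)
        @token = Ferret::Analysis::Token.new(text, 0, text.size)
      end

      def next
        t, @token = @token, nil
        t                      # nil after the one token has been returned
      end

      def close
        @token = nil
      end
    end

    SingleTokenStream.new("ferret").each { |t| puts t.text }   # => ferret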
data/lib/ferret/analysis/tokenizers.rb (deleted)
@@ -1,112 +0,0 @@
- require 'strscan'
-
- module Ferret::Analysis
-   # A Tokenizer is a TokenStream whose input is a Reader.
-   #
-   # This is an abstract class.
-   class Tokenizer < TokenStream
-     # By default, closes the input Reader.
-     def close()
-       @input.close()
-     end
-
-     protected
-       # Construct a token stream processing the given input.
-       def initialize(input)
-         @input = input
-       end
-   end
-
-   # An abstract base class for simple regular expression oriented
-   # tokenizers. Very powerful tokenizers can be created using this class as
-   # can be seen from the StandardTokenizer class. Bellow is an example of a
-   # simple implementation of a LetterTokenizer using an RegExpTokenizer.
-   # Basically, a token is a sequence of alphabetic characters separated by
-   # one or more non-alphabetic characters.
-   #
-   #   class LetterTokenizer < RegExpTokenizer
-   #     def token_re()
-   #       /[[:alpha:]]+/
-   #     end
-   #   end
-   class RegExpTokenizer < Tokenizer
-
-     # Initialize with an IO implementing input such as a file.
-     #
-     # input:: must have a read(count) method which returns an array or string
-     #         of _count_ chars.
-     def initialize(input)
-       #@token_buffer = Token.new("", 0, 0)
-       if input.is_a? String
-         @ss = StringScanner.new(input)
-       else
-         @ss = StringScanner.new(input.read())
-       end
-     end
-
-     # Returns the next token in the stream, or null at EOS.
-     def next()
-       if @ss.scan_until(token_re)
-         term = @ss.matched
-         term_end = @ss.pos
-         term_start = term_end - term.size
-       else
-         return nil
-       end
-
-       #return @token_buffer.set!(normalize(term), term_start, term_end)
-       return Token.new(normalize(term), term_start, term_end)
-     end
-
-     def close()
-       @ss = nil
-     end
-
-     protected
-       # returns the regular expression used to find the next token
-       TOKEN_RE = /[[:alpha:]]+/
-       def token_re
-         TOKEN_RE
-       end
-
-       # Called on each token to normalize it before it is added to the
-       # token. The default implementation does nothing. Subclasses may
-       # use this to, e.g., lowercase tokens.
-       def normalize(str) return str end
-   end
-
-
-   # A LetterTokenizer is a tokenizer that divides text at non-letters.
-   # That's to say, it defines tokens as maximal strings of adjacent letters,
-   # as defined by the regular expression _/[[:alpha:]]+/_.
-   class LetterTokenizer < RegExpTokenizer
-     protected
-       # Collects only characters which satisfy the regular expression
-       # _/[[:alpha:]]+/_.
-       TOKEN_RE = /[[:alpha:]]+/
-       def token_re
-         TOKEN_RE
-       end
-   end
-
-   # LowerCaseTokenizer performs the function of LetterTokenizer
-   # and LowerCaseFilter together. It divides text at non-letters and converts
-   # them to lower case.
-   class LowerCaseTokenizer < LetterTokenizer
-     protected
-       def normalize(str)
-         return str.downcase
-       end
-   end
-
-   # A WhiteSpaceTokenizer is a tokenizer that divides text at whiteSpace.
-   # Adjacent sequences of non-WhiteSpace characters form tokens.
-   class WhiteSpaceTokenizer < RegExpTokenizer
-     protected
-       # Collects only characters which are not spaces tabs or carraige returns
-       TOKEN_RE = /\S+/
-       def token_re
-         TOKEN_RE
-       end
-   end
- end
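
As the RegExpTokenizer comment above shows, a concrete tokenizer only has to supply token_re; the scanner loop in next() does the rest. A short usage sketch of the removed classes, assuming ferret 0.9.6 (the input string is invented; 0.10.0 reimplements these tokenizers in C with a different construction API):

    require 'ferret'
    include Ferret::Analysis

    # LetterTokenizer splits on non-letters; LowerCaseTokenizer also downcases.
    tokenizer = LowerCaseTokenizer.new("One, Two -- THREE!")
    while token = tokenizer.next
      puts token.text        # => one, two, three
    end
    tokenizer.close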
data/lib/ferret/analysis/word_list_loader.rb (deleted)
@@ -1,27 +0,0 @@
- require 'set'
- module Ferret::Analysis
-   # Loader for text files that represent a list of stopwords.
-   module WordListLoader
-     # Loads a text file and adds every line as an entry to a HashSet (omitting
-     # leading and trailing whitespace). Every line of the file should contain only
-     # one word. The words need to be in lowercase if you make use of an
-     # Analyzer which uses LowerCaseFilter (like GermanAnalyzer).
-     #
-     # path:: path to file containing the wordlist
-     # return:: A HashSet with the file's words
-     def WordListLoader.word_set_from_file(path)
-       result = Set.new()
-       File.open(path) do |word_file|
-         # we have to strip the end of line characters
-         word_file.each {|line| result << line[0..-2] }
-       end
-       return result
-     end
-
-     def WordListLoader.word_set_from_array(word_array)
-       result = Set.new()
-       word_array.each {|word| result << word }
-       return result
-     end
-   end
- end
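
WordListLoader turns a one-word-per-line file (or a plain array) into the Set that StopFilter expects. A minimal sketch under the removed 0.9.x API, with a temporary file standing in for a real stop-word list:

    require 'tempfile'
    require 'ferret'
    include Ferret::Analysis

    # Build a one-word-per-line file, then load it the way StopFilter.new_with_file does.
    file = Tempfile.new('stopwords')
    file.puts("the", "and", "of")
    file.close

    stop_set = WordListLoader.word_set_from_file(file.path)
    stop_set.include?("and")   # => true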
data/lib/ferret/document/document.rb (deleted)
@@ -1,152 +0,0 @@
- module Ferret::Document
-   # Documents are the unit of indexing and search.
-   #
-   # A Document is a set of fields. Each field has a name and a textual
-   # value. A field may be Field#stored?() with the document, in which case
-   # it is returned with search hits on the document. Thus each document
-   # should typically contain one or more stored fields which uniquely
-   # identify it.
-   #
-   # Note that fields which are _not_ Field#stored?() are _not_ available in
-   # documents retrieved from the index, e.g. with Hits#doc, Searcher#doc or
-   # IndexReader#document.
-   #
-   # Several fields may be added with the same name. In this case, if the
-   # fields are indexed, their text is treated as though appended for the
-   # purposes of search.
-   #
-   # Note that add like the remove_field(s) methods only makes sense prior to
-   # adding a document to an index. These methods cannot be used to change
-   # the content of an existing index! In order to achieve this, a document
-   # has to be deleted from an index and a new changed version of that
-   # document has to be added.
-   class Document
-     attr_accessor :boost
-
-     # Constructs a new document with no fields.
-     def initialize()
-       # Values are multiplied into the value of Field#boost of each field in
-       # this document. Thus, this method in effect sets a default boost for
-       # the fields of this document.
-       #
-       # The default value is 1.0.
-       #
-       # Note: This value is not stored directly with the document in the
-       # index. Documents returned from IndexReader#document and Hits#doc
-       # may thus not have the same value present as when this document was
-       # indexed.
-       @boost = 1.0
-       @fields = {}
-     end
-
-     # Returns an array of all fields. Note that it is possible for two
-     # fields to appear with the same field name. These will be concatenated
-     # in the index.
-     def all_fields
-       @fields.values.flatten
-     end
-
-     # Returns the number of distinct fields held within the document. This
-     # counts fields which have multiple entries as one.
-     def field_count()
-       return @fields.size
-     end
-
-     # Returns the number of entries held within the document. This counts
-     # all sections so for fields which have multiple entries, each entry
-     # is counted
-     def entry_count()
-       return @fields.values.flatten.size
-     end
-
-     # Adds a field to a document. Several fields may be added with the same
-     # name. In this case, if the fields are indexed, their text is treated
-     # as though appended for the purposes of search.
-     #
-     # Note that add like the remove_field(s) methods only makes sense prior
-     # to adding a document to an index. These methods cannot be used to
-     # change the content of an existing index! In order to achieve this, a
-     # document has to be deleted from an index and a new changed version of
-     # that document has to be added.
-     def add_field(field)
-       (@fields[field.name.to_s] ||= []) << field
-     end
-     alias :<< :add_field
-
-     # Removes the first field of this name if it exists.
-     def remove_field(name)
-       @fields[name.to_s].delete_at(0)
-     end
-
-     # Removes all fields with the given name from the document.
-     #
-     # If there is no field with the specified name, the document remains
-     # unchanged.
-     #
-     # Note that the remove_field(s) methods like the add method only make
-     # sense prior to adding a document to an index. These methods cannot be
-     # used to change the content of an existing index! In order to achieve
-     # this, a document has to be deleted from an index and a new changed
-     # version of that document has to be added.
-     def remove_fields(name)
-       @fields.delete(name.to_s)
-     end
-
-     # Returns the first field with the given name.
-     # This method can return _nil_.
-     #
-     # name:: the name of the field
-     # Return:: a _Field_ array
-     def field(name)
-       @fields[name.to_s] ? @fields[name.to_s][0] : nil
-     end
-
-     # Returns an array of all fields with the given name.
-     # This method can return _nil_.
-     #
-     # name:: the name of the field
-     # Return:: a _Field_ array
-     def fields(name)
-       @fields[name.to_s]
-     end
-
-     # Returns an array of values of the field specified as the method
-     # parameter. This method can return _nil_.
-     #
-     # name:: the name of the field
-     # Return:: a _String_ of field values
-     def values(name)
-       return nil if @fields[name.to_s].nil?
-       @fields[name.to_s].map {|f| f.data if not f.binary? }.join(" ")
-     end
-     alias :[] :values
-
-     # Sets the data in field +field+ to +text+. If there is more than one
-     # field of that name then it will set the data in the first field of that
-     # name. If there is no field of that name, then a new one will be created
-     def []=(field_name, data)
-       field = field(field_name.to_s)
-       if field
-         field.data = data
-       else
-         add_field(Field.new(field_name.to_s, data))
-       end
-     end
-
-     # Returns an array of binaries of the field specified as the method
-     # parameter. This method can return _nil_.
-     #
-     # name:: the name of the field
-     # Return:: a _String_ of field values
-     def binaries(name)
-       binaries = []
-       @fields[name.to_s].each {|f| binaries << f.data if f.binary? }
-       return binaries
-     end
-
-     # Prints the fields of a document for human consumption.#/
-     def to_s()
-       return "Document{\n #{@fields.values.join("\n ")}\n}"
-     end
-   end
- end
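
The removed Document is essentially a hash from field name to an array of Field objects, which is what gives the multi-value behaviour described in the comments above. A short sketch of that 0.9.x API, assuming ferret 0.9.6 and using only the methods shown in this hunk (field names and values are invented):

    require 'ferret'
    include Ferret::Document

    doc = Document.new
    doc << Field.new("title",  "Ferret in action")
    doc << Field.new("author", "Jane Doe")
    doc << Field.new("author", "et al.")     # same name twice: values are appended

    doc.field_count      # => 2  (distinct field names)
    doc.entry_count      # => 3  (individual Field entries)
    doc["author"]        # => "Jane Doe et al."
    doc["year"] = "2006" # creates a new field on the fly via []=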
data/lib/ferret/document/field.rb (deleted)
@@ -1,312 +0,0 @@
- module Ferret::Document
-   # A field is a section of a Document. Each field has two parts, a name
-   # and a value. Values may be free text, provided as a String or as a
-   # Reader, or they may be atomic keywords, which are not further processed.
-   # Such keywords may be used to represent dates, urls, etc. Fields are
-   # optionally stored in the index, so that they may be returned with hits
-   # on the document.
-   class Field
-
-     # This value will be
-     # multiplied into the score of all hits on this field of this
-     # document.
-     #
-     # The boost is multiplied by Document#boost of the document
-     # containing this field. If a document has multiple fields with the same
-     # name, all such values are multiplied together. This product is then
-     # multipled by the value Similarity#length_norm(String,int), and
-     # rounded by Similarity#encode_norm(float) before it is stored in the
-     # index. One should attempt to ensure that this product does not overflow
-     # the range of that encoding.
-     #
-     # See Document#set_boost(float)
-     # See Similarity#length_norm(String, int)
-     # See Similarity#encode_norm(float)
-     #
-     # Note: this value is not stored directly with the document in the index.
-     # Documents returned from IndexReader#document(int) and
-     # Hits#doc(int) may thus not have the same value present as when this field
-     # was indexed.
-     attr_accessor :boost, :data
-     attr_reader :name
-
-     # True iff the value of the field is to be stored in the index for
-     # return with search hits. It is an error for this to be true if a
-     # field is Reader-valued.
-     def stored?() return @stored end
-
-     # True iff the value of the field is to be indexed, so that it may be
-     # searched on.
-     def indexed?() return @indexed end
-
-     # True iff the value of the field should be tokenized as text prior to
-     # indexing. Un-tokenized fields are indexed as a single word and may
-     # not be Reader-valued.
-     def tokenized?() return @tokenized end
-
-     # True if the field is to be stored as a binary value. This can be used
-     # to store images or other binary data in the index if you wish
-     def binary?() return @binary end
-
-     # True if you want to compress the data that you store. This is a good
-     # idea for really large text fields. The ruby Zlib library is used to do
-     # the compression
-     def compressed?() return @compressed end
-
-     # True iff the term or terms used to index this field are stored as a
-     # term vector, available from IndexReader#term_freq_vector(). These
-     # methods do not provide access to the original content of the field,
-     # only to terms used to index it. If the original content must be
-     # preserved, use the _stored_ attribute instead.
-     #
-     # See IndexReader#term_freq_vector()
-     def store_term_vector?() return @store_term_vector end
-
-     # True if the positions of the indexed terms in this field are stored.
-     def store_positions?() return @store_position end
-
-     # True if the offsets of this field are stored. The offsets are the
-     # positions of the start and end characters of the token in the whole
-     # field string
-     def store_offsets?() return @store_offset end
-
-     # True if the norms are not stored for this field. No norms means that
-     # index-time boosting and field length normalization will be disabled.
-     # The benefit is less memory usage as norms take up one byte per indexed
-     # field for every document in the index.
-     def omit_norms?() return @omit_norms end
-
-     class Store < Ferret::Utils::Parameter
-       # Store the original field value in the index in a compressed form.
-       # This is useful for long documents and for binary valued fields.
-       COMPRESS = Store.new("COMPRESS")
-
-       # Store the original field value in the index. This is useful for
-       # short texts like a document's title which should be displayed with
-       # the results. The value is stored in its original form, i.e. no
-       # analyzer is used before it is stored.
-       YES = Store.new("YES")
-
-       # Do not store the field value in the index.
-       NO = Store.new("NO")
-     end
-
-     class Index < Ferret::Utils::Parameter
-       # Do not index the field value. This field can thus not be searched,
-       # but one can still access its contents provided it is Field.Store
-       # stored
-       NO = Index.new("NO")
-
-       # Index the field's value so it can be searched. An Analyzer will be
-       # used to tokenize and possibly further normalize the text before its
-       # terms will be stored in the index. This is useful for common text.
-       TOKENIZED = Index.new("TOKENIZED")
-
-       # Index the field's value without using an Analyzer, so it can be
-       # searched. As no analyzer is used the value will be stored as a
-       # single term. This is useful for unique Ids like product numbers.
-       UNTOKENIZED = Index.new("UNTOKENIZED")
-
-       # Index the field's value without an Analyzer, and disable the storing
-       # of norms. No norms means that index-time boosting and field length
-       # normalization will be disabled. The benefit is less memory usage as
-       # norms take up one byte per indexed field for every document in the
-       # index.
-       NO_NORMS = Index.new("NO_NORMS");
-     end
-
-     class TermVector < Ferret::Utils::Parameter
-       # Do not store term vectors.
-       NO = TermVector.new("NO")
-
-       # Store the term vectors of each document. A term vector is a list of
-       # the document's terms and their number of occurences in that
-       # document.
-       YES = TermVector.new("YES")
-
-       # Store the term vector + token position information
-       #
-       # See #YES
-       WITH_POSITIONS = TermVector.new("WITH_POSITIONS")
-
-       # Store the term vector + Token offset information
-       #
-       # See #YES
-       WITH_OFFSETS = TermVector.new("WITH_OFFSETS")
-
-       # Store the term vector + Token position and offset information
-       #
-       # See #YES See #WITH_POSITIONS See #WITH_OFFSETS
-       WITH_POSITIONS_OFFSETS = TermVector.new("WITH_POSITIONS_OFFSETS")
-     end
-
-     # Create a field by specifying its name, value and how it will
-     # be saved in the index.
-     #
-     # name:: The name of the field
-     # value:: The string to process
-     # store:: Whether _value_ should be stored in the index
-     # index:: Whether the field should be indexed, and if so, if it should
-     #         be tokenized before indexing
-     #
-     # store_term_vector:: Whether term vector should be stored
-     #   * the field is neither stored nor indexed
-     #   * the field is not indexed but term_vector is _TermVector::YES_
-     #
-     # binary:: Whether you want to store binary data in this field. Default is
-     #          false
-     # boost:: the boost for this field. Default is 1.0. A larger number makes
-     #         this field more important.
-     def initialize(name,
-                    value,
-                    store = Store::YES,
-                    index = Index::UNTOKENIZED,
-                    term_vector = TermVector::NO,
-                    binary = false,
-                    boost = 1.0)
-       if (index == Index::NO and store == Store::NO)
-         raise ArgumentError, "it doesn't make sense to have a field that " +
-           "is neither indexed nor stored"
-       end
-       if (index == Index::NO && term_vector != TermVector::NO)
-         raise ArgumentError, "cannot store term vector information for a " +
-           "field that is not indexed"
-       end
-
-       # The name of the field (e.g., "date", "subject", "title", or "body")
-       @name = name.to_s
-
-       # the one and only data object for all different kind of field values
-       @data = value
-       self.store = store
-       self.index = index
-       self.term_vector = term_vector
-       @binary = binary
-       @boost = boost
-     end
-
-     def store=(store)
-       case store
-       when Store::YES
-         @stored = true
-         @compressed = false
-       when Store::COMPRESS
-         @stored = true
-         @compressed = true
-       when Store::NO
-         @stored = false
-         @compressed = false
-       else
-         raise "unknown stored parameter " + store.to_s
-       end
-     end
-
-     def index=(index)
-       @omit_norms = false
-       case index
-       when Index::NO
-         @indexed = false
-         @tokenized = false
-       when Index::TOKENIZED
-         @indexed = true
-         @tokenized = true
-       when Index::UNTOKENIZED
-         @indexed = true
-         @tokenized = false
-       when Index::NO_NORMS
-         @indexed = true
-         @tokenized = false
-         @omit_norms = true
-       else
-         raise "unknown stored parameter " + index.to_s
-       end
-     end
-
-     def term_vector=(term_vector)
-       case term_vector
-       when TermVector::NO
-         @store_term_vector = false
-         @store_position = false
-         @store_offset = false
-       when TermVector::YES
-         @store_term_vector = true
-         @store_position = false
-         @store_offset = false
-       when TermVector::WITH_POSITIONS
-         @store_term_vector = true
-         @store_position = true
-         @store_offset = false
-       when TermVector::WITH_OFFSETS
-         @store_term_vector = true
-         @store_position = false
-         @store_offset = true
-       when TermVector::WITH_POSITIONS_OFFSETS
-         @store_term_vector = true
-         @store_position = true
-         @store_offset = true
-       else
-         raise "unknown term_vector parameter " + store_term_vector.to_s
-       end
-     end
-
-     # Returns the string value of the data that is stored in this field
-     def string_value
-       if @data.instance_of? String
-         return @data
-       elsif @data.respond_to? :read
-         return @data.read()
-       else
-         # if it is binary object try to return a string representation
-         return @data.to_s
-       end
-     end
-
-     # if the data is stored as a binary, just return it.
-     def binary_value
-       return @data
-     end
-
-     # Returns the string value of the data that is stored in this field
-     def reader_value
-       if @data.respond_to? :read
-         return @data
-       elsif @data.instance_of? String
-         return Ferret::Utils::StringHelper::StringReader.new(@data)
-       else
-         # if it is binary object try to return a string representation
-         return Ferret::Utils::StringHelper::StringReader.new(@data.to_s)
-       end
-     end
-
-     # Create a stored field with binary value. Optionally the value
-     # may be compressed. But it obviously won't be tokenized or
-     # term vectored or anything like that.
-     #
-     # name:: The name of the field
-     # value:: The binary value
-     # store:: How _value_ should be stored (compressed or not.)
-     def Field.new_binary_field(name, value, stored)
-       if (stored == Store::NO)
-         raise ArgumentError, "binary values can't be unstored"
-       end
-       Field.new(name, value, stored, Index::NO, TermVector::NO, true)
-     end
-
-     # Prints a Field for human consumption.
-     def to_s()
-       str = ""
-       if (@stored)
-         str << "stored"
-         str << (@compressed ? "/compressed," : "/uncompressed,")
-       end
-       str << "indexed," if (@indexed)
-       str << "tokenized," if (@tokenized)
-       str << "store_term_vector," if (@store_term_vector)
-       str << "store_offsets," if (@store_offset)
-       str << "store_positions," if (@store_position)
-       str << "omit_norms," if (@omit_norms)
-       str << "binary," if (@binary)
-       str << "<#{@name}:#{@binary ? '=bin_data=' : data}>"
-     end
-   end
- end
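
Field behaviour in 0.9.x is driven entirely by the three Parameter-style flag classes above (Store, Index, TermVector), which the setters map onto the stored?, indexed?, tokenized? and term-vector booleans. A sketch of constructing fields with explicit flags under that removed API, assuming ferret 0.9.6 (field names and values are invented):

    require 'ferret'
    include Ferret::Document

    # A tokenized, stored body field with full term-vector information.
    body = Field.new("body", "the quick brown fox",
                     Field::Store::YES,
                     Field::Index::TOKENIZED,
                     Field::TermVector::WITH_POSITIONS_OFFSETS)

    # An id field kept as a single un-analyzed term.
    id = Field.new("id", "product-0042",
                   Field::Store::YES,
                   Field::Index::UNTOKENIZED)

    body.tokenized?         # => true
    body.store_positions?   # => true
    id.tokenized?           # => false
    puts id                 # => stored/uncompressed,indexed,<id:product-0042>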