ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295) hide show
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
@@ -1,58 +0,0 @@
1
- module Ferret::Index
2
- # Provides access to stored term vector of
3
- # a document field.
4
- class SegmentTermVector
5
- # Array of term frequencies. Locations of the array correspond one to one
6
- # to the terms in the array obtained from _terms_
7
- # method. Each location in the array contains the number of times this
8
- # term occurs in the document or the document field.
9
- attr_reader :freqs, :positions, :offsets
10
-
11
- attr_reader :field, :terms
12
-
13
- def initialize(field, terms, freqs, positions=nil, offsets=nil)
14
- @field = field
15
- @terms = terms
16
- @freqs = freqs
17
- @positions = positions
18
- @offsets = offsets
19
- end
20
-
21
- def to_s()
22
- sb = @field.to_s + ": "
23
- if @terms
24
- terms.each_with_index do |term, i|
25
- sb << ', ' if i > 0
26
- sb << term + '/' + @freqs[i].to_s
27
- end
28
- end
29
- sb << 'end'
30
-
31
- return sb
32
- end
33
-
34
- # Returns the number of unique terms in the field
35
- def size()
36
- return @terms == nil ? 0 : @terms.size
37
- end
38
-
39
- # Return an index in the term numbers array returned from _get_terms_ at
40
- # which the term with the specified _term_ appears. If this term does
41
- # not appear in the array, return -1.
42
- def index_of(term)
43
- return @terms ? @terms.index(term) : nil
44
- end
45
-
46
- # Just like _index_of_ but searches for a number of terms at the same
47
- # time. Returns an array that has the same size as the number of terms
48
- # searched for, each slot containing the result of searching for that
49
- # term number.
50
- #
51
- # terms:: array containing terms to look for
52
- # start:: index in the array where the list of terms starts
53
- # len:: the number of terms in the list
54
- def indexes_of(terms, start, len)
55
- return terms[start, len].map { |term| index_of(term) }
56
- end
57
- end
58
- end
@@ -1,53 +0,0 @@
1
- module Ferret::Index
2
- # A Term represents a word from text. This is the unit of search. It is
3
- # composed of two elements, the text of the word, as a string, and the name of
4
- # the field that the text occured in, an interned string.
5
- #
6
- # Note that terms may represent more than words from text fields, but also
7
- # things like dates, email addresses, urls, etc.
8
- #
9
- # A term contains two attributes;
10
- # field:: The field indicates the part of a document which this term came from.
11
- # text:: In the case of words, this is simply the text of the word. In the case
12
- # of dates and other types, this is an encoding of the object as a string.
13
- class Term
14
- include Comparable
15
-
16
- attr_accessor :field
17
- attr_accessor :text
18
-
19
- # Constructs a Term with the given field and text
20
- def initialize(fld_name, txt)
21
- @field = fld_name.to_s
22
- @text = txt.to_s
23
- end
24
-
25
- # Combines the hash() of the field and the text.
26
- def hash()
27
- return field.hash() + text.hash()
28
- end
29
-
30
- # implements comparable giving us the methods >, >=, <, <= and between?
31
- def <=>(other)
32
- if @field == other.field
33
- return @text <=> other.text
34
- else
35
- return @field <=> other.field
36
- end
37
- end
38
- alias :eql? :==
39
-
40
- # Resets the field and text of a Term.
41
- def set!(fld_name, txt)
42
- initialize(fld_name, txt)
43
- end
44
-
45
- def text=(text)
46
- @text = text.to_s
47
- end
48
-
49
- def to_s
50
- @field + ":" + @text
51
- end
52
- end
53
- end
@@ -1,83 +0,0 @@
1
- module Ferret::Index
2
- class TermBuffer
3
- include Comparable
4
-
5
- attr_reader :text_buf, :text_length, :field
6
-
7
- def initialize
8
- @text_buf = String.new
9
- @text_length = -1
10
- @field = nil
11
- end
12
-
13
- def hash()
14
- return text.hash + @field.hash
15
- end
16
-
17
- def <=>(other)
18
- if (@field == other.field)
19
- return text <=> other.text
20
- end
21
- @field <=> other.field
22
- end
23
-
24
- def read(input, field_infos)
25
- @term = nil # invalidate cache
26
- start = input.read_vint()
27
- length = input.read_vint()
28
- total_length = start + length
29
- @text_length = total_length
30
- input.read_chars(@text_buf, start, length)
31
- @field = field_infos[input.read_vint()].name
32
- end
33
-
34
- def term=(term)
35
- if (term == nil)
36
- reset()
37
- return
38
- end
39
-
40
- # copy text into the buffer
41
- @text_buf = term.text.clone
42
- @text_length = @text_buf.length
43
-
44
- @field = term.field
45
- @term = term
46
- end
47
-
48
- def set!(other)
49
- @text_length = other.text_length
50
- @text_buf = other.text_buf.clone if other.text_buf
51
- @field = other.field
52
- @term = other.term
53
- end
54
- alias :initialize_copy :set!
55
-
56
- def reset()
57
- @field = nil
58
- @text_buf = ""
59
- @text_length = 0
60
- @term = nil
61
- end
62
-
63
- def to_term()
64
- if @field.nil? # unset
65
- return nil
66
- end
67
-
68
- if @term.nil?
69
- @term = Term.new(@field, @text_buf[0,@text_length].to_s)
70
- end
71
- return @term
72
- end
73
- alias :term :to_term
74
-
75
- def text()
76
- @text_buf[0,@text_length]
77
- end
78
-
79
- def to_s()
80
- to_term.to_s
81
- end
82
- end
83
- end
@@ -1,291 +0,0 @@
1
- module Ferret::Index
2
- # TermDocEnum provides an interface for enumerating &lt;document,
3
- # frequency&gt; pairs for a term.
4
- #
5
- # The document portion names each document containing the term. Documents
6
- # are indicated by number. The frequency portion gives the number of times
7
- # the term occurred in each document.
8
- #
9
- # The pairs are ordered by document number.
10
- #
11
- # See IndexReader#term_docs
12
- class TermDocEnum
13
- # Sets this to the data for a term.
14
- # The enumeration is reset to the start of the data for this term.
15
- def seek(term) raise NotImplementedError end
16
-
17
- # Returns the current document number.
18
- #
19
- # This is invalid until #next() is called for the first time.
20
- def doc() raise NotImplementedError end
21
-
22
- # Returns the frequency of the term within the current document. This
23
- # is invalid until {@link #next()} is called for the first time.
24
- def freq() raise NotImplementedError end
25
-
26
- # Moves to the next pair in the enumeration.
27
- # Returns true iff there is such a next pair in the enumeration.
28
- def next?() raise NotImplementedError end
29
-
30
- # Attempts to read multiple entries from the enumeration, up to length of
31
- # _docs_. Document numbers are stored in _docs_, and term
32
- # frequencies are stored in _freqs_. The _freqs_ array must be as
33
- # long as the _docs_ array.
34
- #
35
- # Returns the number of entries read. Zero is only returned when the
36
- # stream has been exhausted.
37
- def read(docs, freqs) raise NotImplementedError end
38
-
39
- # Skips entries to the first beyond the current whose document number is
40
- # greater than or equal to _target_.
41
- #
42
- # Returns true iff there is such an entry.
43
- #
44
- # Some implementations are considerably more efficient than that.
45
- def skip_to(target)
46
- while (target > doc())
47
- return false if not next?()
48
- end
49
- return true
50
- end
51
-
52
- # Frees associated resources.
53
- def close() raise NotImplementedError end
54
- end
55
-
56
-
57
- class SegmentTermDocEnum < TermDocEnum
58
- attr_accessor :parent, :freq_stream, :count, :df, :deleted_docs, :doc, :freq
59
-
60
- def initialize(parent)
61
- @parent = parent
62
- @freq_stream = parent.freq_stream.clone()
63
- @deleted_docs = parent.deleted_docs
64
- @skip_interval = parent.term_infos.skip_interval
65
- @skip_stream = nil
66
- @doc = 0
67
- end
68
-
69
- # Find the term, TermEnum or TermInfo in the doc
70
- #
71
- # t:: can be a Term, TermEnum of TermInfo object
72
- def seek(t)
73
- if t.instance_of?(Term)
74
- ti = parent.term_infos[t]
75
- elsif t.is_a?(TermEnum)
76
- ti = t.term_info()
77
- # The following is being done in the Java version. I don't think it's
78
- # necessary.
79
- # use comparison of fieldinfos to verify that term enum (t) belongs to the
80
- # same segment as this SegmentTermDocEnum
81
- #if (t.instance_of?(SegmentTermEnum) and t.field_infos == parent.field_infos)
82
- # ti = t.term_info()
83
- #else # punt case
84
- # ti = parent.term_infos[t.term]
85
- #end
86
- elsif t.is_a? TermInfo # this one is easy. That's exactly what we're looking for
87
- ti = t
88
- else
89
- raise ArgumentError, "Must pass a Term, TermEnum or TermInfo object, not a " +
90
- t.class.to_s
91
- end
92
- do_seek(ti)
93
- #puts "pos = #{@freq_stream.pos} ti = #{ti}"
94
- end
95
-
96
- def do_seek(ti)
97
- @count = 0
98
- if (ti == nil)
99
- @doc_freq = 0
100
- else
101
- @doc_freq = ti.doc_freq
102
- @doc = 0
103
- @skip_doc = 0
104
- @skip_count = 0
105
- @num_skips = @doc_freq / @skip_interval
106
- @freq_pointer = ti.freq_pointer
107
- @prox_pointer = ti.prox_pointer
108
- @skip_pointer = @freq_pointer + ti.skip_offset
109
- @freq_stream.seek(@freq_pointer)
110
- @have_skipped = false
111
- end
112
- end
113
-
114
- def close()
115
- @freq_stream.close()
116
- @freq_stream = nil
117
- if (@skip_stream != nil)
118
- @skip_stream.close()
119
- @skip_stream = nil
120
- end
121
- @parent = nil
122
- end
123
-
124
- def skipping_doc()
125
- end
126
-
127
- def next?()
128
- while (true)
129
- return false if @count == @doc_freq
130
-
131
- doc_code = @freq_stream.read_vint()
132
- @doc += doc_code >> 1 # shift off low bit
133
- if ((doc_code & 1) != 0) # if low bit is set
134
- @freq = 1 # freq is one
135
- else
136
- @freq = @freq_stream.read_vint() # else read freq
137
- end
138
-
139
- @count += 1
140
-
141
- break if (@deleted_docs == nil or not @deleted_docs[@doc])
142
-
143
- skipping_doc()
144
- end
145
- return true
146
- end
147
-
148
- # Optimized implementation.
149
- def read(docs, freqs, start = 0)
150
- i = start
151
- needed = docs.length
152
-
153
- while (i < needed and @count < @doc_freq)
154
-
155
- # manually inlined call to next?() for speed
156
- doc_code = @freq_stream.read_vint()
157
- @doc += doc_code >> 1 # shift off low bit
158
- if ((doc_code & 1) != 0) # if low bit is set
159
- @freq = 1 # freq is one
160
- else
161
- @freq = @freq_stream.read_vint() # else read freq
162
- end
163
-
164
- @count += 1
165
-
166
- if (@deleted_docs == nil or not @deleted_docs[@doc])
167
- docs[i] = @doc
168
- freqs[i] = @freq
169
- i += 1
170
- end
171
-
172
- skipping_doc()
173
- end
174
- return i
175
- end
176
-
177
- # Overridden by SegmentTermDocPosEnum to skip in prox stream.
178
- def skip_prox(prox_pointer)
179
- end
180
-
181
- # Optimized implementation.
182
- def skip_to(target)
183
- if (@doc_freq >= @skip_interval) # optimized case
184
-
185
- if (@skip_stream == nil)
186
- @skip_stream = @freq_stream.clone() # lazily clone
187
- end
188
-
189
- if (!@have_skipped) # lazily seek skip stream
190
- @skip_stream.seek(@skip_pointer)
191
- @have_skipped = true
192
- end
193
-
194
- # scan skip data
195
- last_skip_doc = @skip_doc
196
- last_freq_pointer = @freq_stream.pos()
197
- last_prox_pointer = -1
198
- num_skipped = -1 - (@count % @skip_interval)
199
-
200
- while (target > @skip_doc)
201
- last_skip_doc = @skip_doc
202
- last_freq_pointer = @freq_pointer
203
- last_prox_pointer = @prox_pointer
204
-
205
- if (@skip_doc != 0 and @skip_doc >= @doc)
206
- num_skipped += @skip_interval
207
- end
208
-
209
- if(@skip_count >= @num_skips)
210
- break
211
- end
212
-
213
- @skip_doc += @skip_stream.read_vint()
214
- @freq_pointer += @skip_stream.read_vint()
215
- @prox_pointer += @skip_stream.read_vint()
216
-
217
- @skip_count += 1
218
- end
219
-
220
- # if we found something to skip, then skip it
221
- if (last_freq_pointer > @freq_stream.pos())
222
- @freq_stream.seek(last_freq_pointer)
223
- skip_prox(last_prox_pointer)
224
-
225
- @doc = last_skip_doc
226
- @count += num_skipped
227
- end
228
-
229
- end
230
-
231
- # done skipping, now just scan
232
-
233
- begin
234
- if not next?
235
- return false
236
- end
237
- end while (target > @doc)
238
- return true
239
- end
240
- end
241
-
242
- class SegmentTermDocPosEnum < SegmentTermDocEnum
243
- def initialize(p)
244
- super
245
- @prox_stream = p.prox_stream.clone()
246
- end
247
-
248
- def do_seek(ti)
249
- super
250
- if (ti != nil)
251
- @prox_stream.seek(ti.prox_pointer)
252
- end
253
- @prox_count = 0
254
- end
255
-
256
- def close()
257
- super
258
- @prox_stream.close()
259
- end
260
-
261
- def next_position()
262
- @prox_count -= 1
263
- return @position += @prox_stream.read_vint()
264
- end
265
-
266
- def skipping_doc()
267
- @freq.times { @prox_stream.read_vint() }
268
- end
269
-
270
- def next?()
271
- @prox_count.times { @prox_stream.read_vint() }
272
-
273
- if (super)
274
- @prox_count = @freq # note frequency
275
- @position = 0 # reset position
276
- return true
277
- end
278
- return false
279
- end
280
-
281
- def read(docs, freqs)
282
- raise NotImplementedError, "TermDocPosEnum does not support processing multiple documents in one call. Use TermDocEnum instead."
283
- end
284
-
285
- # Called by super.skipTo().
286
- def skip_prox(prox_pointer)
287
- @prox_stream.seek(prox_pointer)
288
- @prox_count = 0
289
- end
290
- end
291
- end