ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295)
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
data/lib/ferret/index/segment_reader.rb (deleted)
@@ -1,412 +0,0 @@
- module Ferret::Index
-
-   # FIXME: Describe class +SegmentReader+ here.
-   #
-   class SegmentReader < IndexReader
-
-     attr_reader :freq_stream, :prox_stream, :deleted_docs,
-                 :term_infos, :field_infos, :segment
-
-     def SegmentReader.get(info, infos = nil, close = false)
-       return SegmentReader.new(info.directory, info, infos, close, infos!=nil)
-     end
-
-     def initialize(dir, info, seg_infos, close, owner)
-       super(dir, seg_infos, close, owner)
-       @segment = info.name
-
-       @cfs_reader = nil
-       dir = directory
-       #if directory.exists?(@segment + '.cfs') then
-       if SegmentReader.uses_compound_file?(info)
-         @cfs_reader = CompoundFileReader.new(directory, @segment + '.cfs')
-         dir = @cfs_reader
-       end
-
-       @field_infos = FieldInfos.new(dir, @segment + '.fnm')
-       @fields_reader = FieldsReader.new(dir, @segment, @field_infos)
-
-       @term_infos = TermInfosReader.new(dir, @segment, @field_infos)
-       @deleted_docs = nil
-       @deleted_docs_dirty = false
-       if SegmentReader.has_deletions?(info) then
-         @deleted_docs =
-           Ferret::Utils::BitVector.read(directory, @segment + '.del')
-       end
-
-       @freq_stream = dir.open_input(@segment + '.frq')
-       @prox_stream = dir.open_input(@segment + '.prx')
-       @norms = {}
-       @norms.extend(MonitorMixin)
-       @norms_dirty = false
-       open_norms(dir)
-
-       @tv_reader_orig = nil
-       if @field_infos.has_vectors? then
-         @tv_reader_orig = TermVectorsReader.new(dir, @segment, @field_infos)
-       end
-     end
-
-     def do_commit()
-       if (@deleted_docs_dirty) # re-write deleted
-         @deleted_docs.write(@directory, @segment + '.tmp')
-         @directory.rename(@segment + '.tmp', @segment + '.del')
-       end
-       if(@undelete_all and @directory.exists?(@segment + '.del'))
-         @directory.delete(@segment + '.del')
-       end
-       if (@norms_dirty) # re-write norms
-         @norms.each_value do |norm|
-           if norm.dirty?
-             norm.re_write(@directory, @segment, max_doc(), @cfs_reader)
-           end
-         end
-       end
-       @deleted_docs_dirty = false
-       @norms_dirty = false
-       @undelete_all = false
-     end
-
-     def do_close()
-       # clear the cache
-       Thread.current["#{self.object_id}-#{@segment}-tv_reader"] = nil
-
-       @fields_reader.close()
-       @term_infos.close()
-
-       @freq_stream.close() if @freq_stream
-       @prox_stream.close() if @prox_stream
-
-       close_norms()
-
-       @tv_reader_orig.close() if @tv_reader_orig
-       @cfs_reader.close() if @cfs_reader
-     end
-
-     def SegmentReader.has_deletions?(si)
-       return si.directory.exists?(si.name + ".del")
-     end
-
-     def has_deletions?()
-       return @deleted_docs != nil
-     end
-
-
-     def SegmentReader.uses_compound_file?(si)
-       return si.directory.exists?(si.name + ".cfs")
-     end
-
-     def SegmentReader.has_separate_norms?(si)
-       si.directory.each {|f| return true if f =~ /^#{si.name}\.s/}
-       return false
-     end
-
-     def do_delete(doc_num)
-       if (@deleted_docs == nil)
-         @deleted_docs = Ferret::Utils::BitVector.new
-       end
-       @deleted_docs_dirty = true
-       @undelete_all = false
-       @deleted_docs.set(doc_num)
-     end
-
-     def do_undelete_all()
-       @deleted_docs = nil
-       @deleted_docs_dirty = false
-       @undelete_all = true
-     end
-
-     def file_names()
-       file_names = []
-
-       IndexFileNames::INDEX_EXTENSIONS.each do |ext|
-         name = @segment + "." + ext
-         if (@directory.exists?(name))
-           file_names << name
-         end
-       end
-
-       @field_infos.each_with_index do |fi, i|
-         if (fi.indexed? and not fi.omit_norms?)
-           if @cfs_reader.nil?
-             name = "#{@segment}.f#{i}"
-           else
-             name = "#{@segment}.s#{i}"
-           end
-           if (@directory.exists?(name))
-             file_names << name
-           end
-         end
-       end
-       return file_names
-     end
-
-     def terms()
-       return @term_infos.terms()
-     end
-
-     def terms_from(t)
-       return @term_infos.terms_from(t)
-     end
-
-     def get_document(n)
-       synchronize do
-         if deleted?(n)
-           raise ArgumentError, "attempt to access a deleted document"
-         end
-         return @fields_reader.doc(n)
-       end
-     end
-
-     def deleted?(n)
-       synchronize do
-         return (@deleted_docs != nil and @deleted_docs.get(n))
-       end
-     end
-
-     def term_docs()
-       return SegmentTermDocEnum.new(self)
-     end
-
-     def term_positions()
-       return SegmentTermDocPosEnum.new(self)
-     end
-
-     def doc_freq(t)
-       ti = @term_infos.get_term_info(t)
-       if (ti != nil)
-         return ti.doc_freq
-       else
-         return 0
-       end
-     end
-
-     def num_docs()
-       n = max_doc()
-       if (@deleted_docs != nil)
-         n -= @deleted_docs.count()
-       end
-       return n
-     end
-
-     def max_doc()
-       return @fields_reader.size()
-     end
-
-     # See IndexReader#get_field_names
-     def get_field_names(field_option = IndexReader::FieldOption::ALL)
-       field_set = Set.new
-       @field_infos.each do |fi|
-         if (field_option == IndexReader::FieldOption::ALL)
-           field_set.add(fi.name)
-         elsif (!fi.indexed? and field_option == IndexReader::FieldOption::UNINDEXED)
-           field_set.add(fi.name)
-         elsif (fi.indexed? and field_option == IndexReader::FieldOption::INDEXED)
-           field_set.add(fi.name)
-         elsif (fi.indexed? and fi.store_term_vector? == false and
-                field_option == IndexReader::FieldOption::INDEXED_NO_TERM_VECTOR)
-           field_set.add(fi.name)
-         elsif (fi.store_term_vector? == true and
-                fi.store_positions? == false and
-                fi.store_offsets? == false and
-                field_option == IndexReader::FieldOption::TERM_VECTOR)
-           field_set.add(fi.name)
-         elsif (fi.indexed? and fi.store_term_vector? and
-                field_option == IndexReader::FieldOption::INDEXED_WITH_TERM_VECTOR)
-           field_set.add(fi.name)
-         elsif (fi.store_positions? and fi.store_offsets? == false and
-                field_option == IndexReader::FieldOption::TERM_VECTOR_WITH_POSITION)
-           field_set.add(fi.name)
-         elsif (fi.store_offsets? and fi.store_positions? == false and
-                field_option == IndexReader::FieldOption::TERM_VECTOR_WITH_OFFSET)
-           field_set.add(fi.name)
-         elsif (fi.store_offsets? and fi.store_positions? and
-                field_option == IndexReader::FieldOption::TERM_VECTOR_WITH_POSITION_OFFSET)
-           field_set.add(fi.name)
-         end
-       end
-       return field_set
-     end
-
-     def has_norms?(field)
-       return @norms.has_key?(field)
-     end
-
-     def SegmentReader.create_fake_norms(size)
-       Array.new(size, 1).pack("C*")
-     end
-
-     def fake_norms()
-       return @ones ||= SegmentReader.create_fake_norms(max_doc())
-     end
-
-     def get_norms(field)
-       synchronize do
-         norm = @norms[field]
-         if (norm == nil) # not an indexed field or omit norms
-           return nil
-         end
-         if (norm.bytes == nil) # value not yet read
-           bytes = " " * max_doc()
-           get_norms_into(field, bytes, 0)
-           norm.bytes = bytes # cache it
-         end
-         return norm.bytes
-       end
-     end
-
-     def do_set_norm(doc, field, value)
-       norm = @norms[field]
-       if (norm == nil) # not an indexed field
-         return
-       end
-       norm.dirty = true # mark it dirty
-       @norms_dirty = true
-
-       get_norms(field)[doc] = value # set the value
-     end
-
-     # Read norms into a pre-allocated array.
-     def get_norms_into(field, bytes, offset)
-       synchronize do
-         norm = @norms[field]
-         if (norm.nil?)
-           bytes[offset, max_doc()] = fake_norms[0, max_doc()]
-           return
-         end
-
-         if (norm.bytes != nil) # can copy from cache
-           bytes[offset, max_doc()] = norm.bytes[0, max_doc()]
-           return
-         end
-
-         norm_stream = norm.is.clone()
-         begin # read from disk
-           norm_stream.seek(0)
-           norm_stream.read_bytes(bytes, offset, max_doc())
-         ensure
-           norm_stream.close()
-         end
-       end
-     end
-
-     def open_norms(cfs_dir)
-       @field_infos.each do |fi|
-         if (fi.indexed? and not fi.omit_norms?)
-           # look first if there are separate norms in compound format
-           file_name = @segment + ".s" + fi.number.to_s
-           d = @directory
-           if not d.exists?(file_name)
-             file_name = @segment + ".f" + fi.number.to_s
-             d = cfs_dir
-           end
-           @norms[fi.name] = Norm.new(d.open_input(file_name), fi.number)
-         end
-       end
-     end
-
-     def close_norms()
-       @norms.synchronize do
-         @norms.each_value {|norm| norm.is.close()}
-       end
-     end
-
-     # Create a clone from the initial TermVectorsReader and store it
-     # in the Thread
-     # returns:: TermVectorsReader
-     def get_term_vectors_reader()
-       #tvr_cache = Thread.current["tv_reader"]
-       #if (tvr_cache == nil)
-       # tvr_cache = Thread.current["tv_reader"] = Ferret::Utils::WeakKeyHash.new
-       #end
-       #tvr_cache.synchronize do
-       # tv_reader = tvr_cache[self]
-       # if tv_reader == nil
-       # tv_reader = @tv_reader_orig.clone()
-       # tvr_cache[self] = tv_reader
-       # end
-       # return tv_reader
-       #end
-       tv_reader = Thread.current.get_local(self)
-       if tv_reader.nil?
-         tv_reader = @tv_reader_orig.clone()
-         Thread.current.set_local(self, tv_reader)
-       end
-       return tv_reader
-     end
-
-     # Return a term frequency vector for the specified document and field. The
-     # vector returned contains term numbers and frequencies for all terms in
-     # the specified field of this document, if the field had storeTermVector
-     # flag set. If the flag was not set, the method returns nil.
-     # raises:: IOException
-     def get_term_vector(doc_number, field)
-       # Check if this field is invalid or has no stored term vector
-       fi = @field_infos[field]
-       if fi.nil? or not fi.store_term_vector? or @tv_reader_orig.nil?
-         return nil
-       end
-
-       term_vectors_reader = get_term_vectors_reader()
-       if (term_vectors_reader == nil)
-         return nil
-       end
-       return term_vectors_reader.get_field_tv(doc_number, field)
-     end
-
-
-     # Return an array of term frequency vectors for the specified document.
-     # The array contains a vector for each vectorized field in the document.
-     # Each vector vector contains term numbers and frequencies for all terms
-     # in a given vectorized field.
-     # If no such fields existed, the method returns nil.
-     # raises:: IOException
-     def get_term_vectors(doc_number)
-       if @tv_reader_orig.nil?
-         return nil
-       end
-       term_vectors_reader = get_term_vectors_reader()
-       if (term_vectors_reader == nil)
-         return nil
-       end
-       return term_vectors_reader.get_tv(doc_number)
-     end
-
-     def dir()
-       return @directory
-     end
-
-     class Norm
-       attr_reader :is
-       attr_writer :dirty
-       attr_accessor :bytes
-
-       def dirty?
-         return @dirty
-       end
-
-       def initialize(is, number)
-         @is = is
-         @number = number
-       end
-
-       def re_write(directory, segment, count, cfs_reader)
-         # NOTE: norms are re-written in regular directory, not cfs
-         out = directory.create_output(segment + ".tmp")
-         begin
-           out.write_bytes(@bytes, count)
-         ensure
-           out.close()
-         end
-         if(cfs_reader == nil)
-           file_name = "#{segment}.f#{@number}"
-         else
-           # use a different file name if we have compound format
-           file_name = "#{segment}.s#{@number}"
-         end
-         directory.rename(segment + ".tmp", file_name)
-         @dirty = false
-       end
-     end
-   end
- end
data/lib/ferret/index/segment_term_enum.rb (deleted)
@@ -1,169 +0,0 @@
- module Ferret::Index
-   class SegmentTermEnum < TermEnum
-
-     INT_MAX = (2**31)-1
-
-     attr_reader :field_infos, :size, :position, :index_pointer,
-                 :index_interval, :skip_interval
-
-     def initialize(input, field_infos, is_index)
-
-       @input = input
-       @field_infos = field_infos
-       @is_index = is_index
-       @position = -1
-
-       @term_buffer = TermBuffer.new()
-       @prev_buffer = TermBuffer.new()
-       @term_info = TermInfo.new()
-
-       @index_pointer = 0
-
-       first_int = @input.read_int()
-
-       if (first_int >= 0)
-         # original-format file, without explicit format version number
-         @format = 0
-         @size = first_int
-
-         # back-compatible settings
-         @index_interval = 128
-         @skip_interval = INT_MAX # switch off skip_to optimization
-
-       else
-         # we have a format version number
-         @format = first_int
-
-         # check that it is a format we can understand
-         if (@format < TermInfosWriter::FORMAT)
-           raise "Unknown format version:#{@format}"
-         end
-
-         @size = @input.read_long() # read the size
-
-         if (@format == -1)
-           if (!@is_index)
-             @index_interval = @input.read_int()
-             @format_m1skip_interval = @input.read_int()
-           end
-           # switch off skip_to optimization for file format prior to
-           # 1.4rc2 in order to avoid a bug in skip_to implementation
-           # of these versions
-           @skip_interval = INT_MAX
-         else
-           @index_interval = @input.read_int()
-           @skip_interval = @input.read_int()
-         end
-       end
-     end
-
-     #attr_accessors for the clone method
-     attr_accessor :input, :term_buffer, :prev_buffer
-     protected :input, :input=, :prev_buffer, :prev_buffer=
-
-     def initialize_copy(o)
-       super
-       @input = o.input.clone
-       @term_info = o.term_info.clone
-       @term_buffer = o.term_buffer.clone
-       @prev_buffer = o.prev_buffer.clone
-     end
-
-     def seek(pointer, position, term, term_info)
-       @input.seek(pointer)
-       @position = position
-       @term_buffer.term = term
-       @prev_buffer.reset()
-       @term_info.set!(term_info)
-     end
-
-     # Increments the enumeration to the next element. True if one exists.
-     def next?
-       @position += 1
-       if (@position >= @size)
-         @term_buffer.reset()
-         return false
-       end
-
-       @prev_buffer.set!(@term_buffer)
-
-       @term_buffer.read(@input, @field_infos)
-
-       @term_info.doc_freq = @input.read_vint() # read doc freq
-       @term_info.freq_pointer += @input.read_vlong() # read freq pointer
-       @term_info.prox_pointer += @input.read_vlong() # read prox pointer
-
-       if (@format == -1)
-         # just read skip_offset in order to increment file pointer
-         # value is never used since skip_to is switched off
-         if (!@is_index)
-           if (@term_info.doc_freq > @format_m1skip_interval)
-             @term_info.skip_offset = @input.read_vint()
-           end
-         end
-       else
-         if (@term_info.doc_freq >= @skip_interval)
-           @term_info.skip_offset = @input.read_vint()
-         end
-       end
-
-       if (@is_index)
-         @index_pointer += @input.read_vlong() # read index pointer
-       end
-
-       return true
-     end
-
-     def scan_to(term)
-       while (term > @term_buffer and next?) do
-       end
-     end
-
-     # Returns the current Term in the enumeration.
-     # Initially invalid, valid after next() called for the first time.
-     def term
-       return @term_buffer.to_term()
-     end
-
-     # Returns the previous Term enumerated. Initially nil.
-     def prev
-       return @prev_buffer.to_term()
-     end
-
-     # Returns the current TermInfo in the enumeration.
-     # Initially invalid, valid after next() called for the first time.
-     def term_info
-       return @term_info.clone
-     end
-
-     # Sets the argument to the current TermInfo in the enumeration.
-     # Initially invalid, valid after next() called for the first time.
-     attr_writer :term_info
-     #def term_info=(ti)
-     # return @term_info.set!(ti)
-     #end
-
-     # Returns the doc_freq from the current TermInfo in the enumeration.
-     # Initially invalid, valid after next() called for the first time.
-     def doc_freq
-       return term_info.doc_freq
-     end
-
-     # Returns the freq_pointer from the current TermInfo in the enumeration.
-     # Initially invalid, valid after next() called for the first time.
-     def freq_pointer
-       return term_info.freq_pointer
-     end
-
-     # Returns the prox_pointer from the current TermInfo in the enumeration.
-     # Initially invalid, valid after next() called for the first time.
-     def prox_pointer
-       return term_info.prox_pointer
-     end
-
-     # Closes the enumeration to further activity, freeing resources.
-     def close
-       @input.close()
-     end
-   end
- end