ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295)
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
@@ -1,130 +0,0 @@
1
- module Ferret
2
- module Index
3
- # Holds the info for one segment.
4
- #
5
- # ToDo: Does the dir really need to be stored here?
6
- class SegmentInfo
7
- attr_accessor :name, :doc_count, :directory
8
-
9
- def initialize(name, doc_count, dir)
10
- @name = name
11
- @doc_count = doc_count
12
- @directory = dir
13
- end
14
-
15
- def ==(o)
16
- (o.name == @name and o.doc_count == @doc_count)
17
- end
18
- end
19
-
20
- class SegmentInfos < Array
21
- # for compatability with Java Ferret files
22
- FORMAT = -1
23
- SEGMENT_FILENAME = "segments"
24
- TEMPORARY_SEGMENT_FILENAME = "segments.new"
25
-
26
- attr_reader :version # counts how often the index has been modified
27
- # by adding or deleting docs
28
- attr_accessor :counter # used to name new segments??
29
-
30
- # Current version number from segments file.
31
- def SegmentInfos.read_current_version(directory)
32
- return 0 if not directory.exists?(SEGMENT_FILENAME)
33
- input = directory.open_input(SEGMENT_FILENAME)
34
- @format = 0
35
- @version = 0
36
- begin
37
- @format = input.read_int()
38
- if(@format < 0)
39
- if (@format < FORMAT) then raise "Unknown format version: " + @format end
40
- @version = input.read_long() # read version
41
- end
42
- ensure
43
- input.close()
44
- end
45
-
46
- if(@format < 0)
47
- return @version
48
- end
49
-
50
- # We cannot be sure about the format of the file.
51
- # Therefore we have to read the whole file and cannot simply
52
- # seek to the version entry.
53
-
54
- sis = SegmentInfos.new()
55
- sis.read(directory)
56
- return sis.version()
57
- end
58
-
59
- def initialize()
60
- @version = Time.now.to_i * 1000
61
- @counter = 0
62
- end
63
-
64
- def initialize_copy(o)
65
- super
66
- o.each_index {|i| self[i] = o[i].clone}
67
- end
68
-
69
- def read(directory)
70
- input = directory.open_input(SEGMENT_FILENAME)
71
- begin
72
- @format = input.read_int()
73
- if(@format < 0) # file contains explicit format info
74
- # check that it is a format we can understand
75
- if (@format < FORMAT) then raise "Unknown format version: " + @format end
76
- @version = input.read_long()
77
- @counter = input.read_int()
78
- else # file is in old format without explicit format info
79
- @counter = @format
80
- end
81
-
82
- seg_count = input.read_int()
83
- seg_count.times do
84
- self << SegmentInfo.new(input.read_string(),
85
- input.read_int(),
86
- directory)
87
- end
88
-
89
- if(@format >= 0)
90
- # in old format the version number may be at the end of the file
91
- if (input.pos() >= input.length())
92
- @version = 0 # old file format without version number
93
- else
94
- @version = input.read_long() # read version
95
- end
96
- end
97
- ensure
98
- input.close()
99
- end
100
- end
101
-
102
- def write(directory)
103
- output = directory.create_output(TEMPORARY_SEGMENT_FILENAME)
104
- begin
105
- output.write_int(FORMAT) # write FORMAT
106
- output.write_long(@version += 1) # every write changes the index
107
- output.write_int(@counter) # write counter
108
- output.write_int(size()) # write infos
109
- each() do |si|
110
- output.write_string(si.name)
111
- output.write_int(si.doc_count)
112
- end
113
-
114
- ensure
115
- output.close()
116
- end
117
-
118
- # install new segment info
119
- directory.rename(TEMPORARY_SEGMENT_FILENAME, SEGMENT_FILENAME)
120
- end
121
-
122
- def to_s()
123
- str = "\nSegmentInfos: <"
124
- each() { |si| str << "#{si.name}:#{si.doc_count}," }
125
- str[-1] = ">"
126
- str
127
- end
128
- end
129
- end
130
- end
@@ -1,49 +0,0 @@
1
- module Ferret
2
- module Index
3
- class SegmentMergeInfo
4
- attr_reader :term_enum, :reader, :base, :term_buffer
5
-
6
- def initialize(base, term_enum, reader)
7
- @base = base
8
- @reader = reader
9
- @term_enum = term_enum
10
- @term_buffer = term_enum.term_buffer
11
- end
12
-
13
- def positions
14
- @postings ||= @reader.term_positions()
15
- end
16
-
17
- def doc_map
18
- if @doc_map.nil?
19
- # build array which maps document numbers around deletions
20
- if (@reader.has_deletions?())
21
- max_doc = @reader.max_doc()
22
- @doc_map = Array.new(max_doc)
23
- j = 0
24
- max_doc.times do |i|
25
- if (@reader.deleted?(i))
26
- @doc_map[i] = -1
27
- else
28
- @doc_map[i] = j
29
- j += 1
30
- end
31
- end
32
- end
33
- end
34
- return @doc_map
35
- end
36
-
37
- def next?
38
- @term_enum.next?
39
- end
40
-
41
- def close()
42
- @term_enum.close()
43
- @postings.close() if @postings
44
- @reader = nil
45
- end
46
- end
47
- end
48
- end
49
-
@@ -1,16 +0,0 @@
1
- module Ferret::Index
2
- class SegmentMergeQueue < Ferret::Utils::PriorityQueue
3
- def less_than(sti_a, sti_b)
4
- if sti_a.term_buffer == sti_b.term_buffer
5
- return sti_a.base < sti_b.base
6
- else
7
- return sti_a.term_buffer < sti_b.term_buffer
8
- end
9
- end
10
-
11
- def close()
12
- @heap.each {|sti| sti.close if sti}
13
- clear
14
- end
15
- end
16
- end
@@ -1,358 +0,0 @@
1
- module Ferret::Index
2
-
3
- # The SegmentMerger class combines two or more Segments, represented by
4
- # an IndexReader#add, into a single Segment. After adding the
5
- # appropriate readers, call the merge method to combine the segments.
6
- #
7
- # If the compoundFile flag is set, then the segments will be merged
8
- # into a compound file.
9
- class SegmentMerger
10
-
11
- # dir:: The Directory to merge the other segments into
12
- # name:: The name of the new segment
13
- def initialize(dir, name,
14
- term_index_interval = IndexWriter::DEFAULT_TERM_INDEX_INTERVAL)
15
- @directory = dir
16
- @segment = name
17
- @term_index_interval = term_index_interval
18
- @readers = []
19
- @field_infos = nil
20
- @freq_output = nil
21
- @prox_output = nil
22
- @term_infos_writer = nil
23
- @queue = nil
24
- @term_info = TermInfo.new()
25
- @skip_buffer = Ferret::Store::RAMDirectory::RAMIndexOutput.new(
26
- Ferret::Store::RAMDirectory::RAMFile.new(""))
27
- end
28
-
29
- # Add an IndexReader to the collection of readers that are to be merged
30
- # reader::
31
- def add(reader)
32
- @readers << reader
33
- end
34
- alias :<< :add
35
-
36
- #
37
- # i:: The index of the reader to return
38
- # returns:: The ith reader to be merged
39
- def segment_reader(i)
40
- return @readers[i]
41
- end
42
-
43
- # Merges the readers specified by the #add method into the directory
44
- # passed to the constructor
45
- #
46
- # returns:: The number of documents that were merged
47
- # raises:: IOError
48
- def merge()
49
- value = merge_fields()
50
- merge_terms()
51
- merge_norms()
52
- merge_vectors() if @field_infos.has_vectors?
53
- return value
54
- end
55
-
56
- # close all IndexReaders that have been added. Should not be called
57
- # before merge().
58
- #
59
- # raises:: IOError
60
- def close_readers()
61
- @readers.each { |reader| reader.close }
62
- end
63
-
64
- def create_compound_file(file_name)
65
-
66
- cfs_writer = CompoundFileWriter.new(@directory, file_name)
67
-
68
- files = []
69
-
70
- # Basic files
71
- IndexFileNames::COMPOUND_EXTENSIONS.each do |ext|
72
- files << "#{@segment}.#{ext}"
73
- end
74
-
75
- # Field norm files
76
- @field_infos.each_with_index do |fi, i|
77
- if (fi.indexed? and not fi.omit_norms?)
78
- files << "#{@segment}.f#{i}"
79
- end
80
- end
81
-
82
- # Vector files
83
- if @field_infos.has_vectors?
84
- IndexFileNames::VECTOR_EXTENSIONS.each do |ext|
85
- files << "#{@segment}.#{ext}"
86
- end
87
- end
88
-
89
- # Now merge all added files
90
- files.each do |file|
91
- cfs_writer.add_file(file)
92
- end
93
-
94
- # Perform the merge
95
- cfs_writer.close
96
-
97
- return files
98
- end
99
-
100
- def add_indexed(reader, field_infos, field_names,
101
- store_term_vectors,
102
- store_position_with_term_vector,
103
- store_offset_with_term_vector)
104
- field_names.each do |field|
105
- field_infos.add(field, true,
106
- store_term_vectors,
107
- store_position_with_term_vector,
108
- store_offset_with_term_vector,
109
- !reader.has_norms?(field))
110
- end
111
- end
112
- private :add_indexed
113
-
114
-
115
- #
116
- # returns:: The number of documents in all of the readers
117
- # raises:: IOError
118
- def merge_fields()
119
- @field_infos = FieldInfos.new() # merge field names
120
- doc_count = 0
121
- @readers.each do |reader|
122
- add_indexed(reader, @field_infos, reader.get_field_names(IndexReader::FieldOption::TERM_VECTOR_WITH_POSITION_OFFSET), true, true, true)
123
- add_indexed(reader, @field_infos, reader.get_field_names(IndexReader::FieldOption::TERM_VECTOR_WITH_POSITION), true, true, false)
124
- add_indexed(reader, @field_infos, reader.get_field_names(IndexReader::FieldOption::TERM_VECTOR_WITH_OFFSET), true, false, true)
125
- add_indexed(reader, @field_infos, reader.get_field_names(IndexReader::FieldOption::TERM_VECTOR), true, false, false)
126
- add_indexed(reader, @field_infos, reader.get_field_names(IndexReader::FieldOption::INDEXED), false, false, false)
127
- @field_infos.add_fields(reader.get_field_names(IndexReader::FieldOption::UNINDEXED), false)
128
- end
129
- @field_infos.write_to_dir(@directory, @segment + ".fnm")
130
-
131
- # merge field values
132
- fields_writer = FieldsWriter.new(@directory, @segment, @field_infos)
133
-
134
- begin
135
- @readers.each do |reader|
136
- max_doc = reader.max_doc()
137
- max_doc.times do |j|
138
- if not reader.deleted?(j) # skip deleted docs
139
- fields_writer.add_document(reader.get_document(j))
140
- doc_count += 1
141
- end
142
- end
143
- end
144
- ensure
145
- fields_writer.close()
146
- end
147
- return doc_count
148
- end
149
-
150
- # Merge the TermVectors from each of the segments into the new one.
151
- # raises:: IOError
152
- def merge_vectors()
153
- term_vectors_writer = TermVectorsWriter.new(@directory, @segment, @field_infos)
154
-
155
- begin
156
- @readers.each do |reader|
157
- max_doc = reader.max_doc()
158
- max_doc.times do |doc_num|
159
- # skip deleted docs
160
- next if (reader.deleted?(doc_num))
161
- term_vectors_writer.add_all_doc_vectors(reader.get_term_vectors(doc_num))
162
- end
163
- end
164
- ensure
165
- term_vectors_writer.close()
166
- end
167
- end
168
-
169
- def merge_terms()
170
- begin
171
- @freq_output = @directory.create_output(@segment + ".frq")
172
- @prox_output = @directory.create_output(@segment + ".prx")
173
- @term_infos_writer =
174
- TermInfosWriter.new(@directory, @segment, @field_infos,
175
- @term_index_interval)
176
- @skip_interval = @term_infos_writer.skip_interval
177
- @queue = SegmentMergeQueue.new(@readers.size())
178
-
179
- merge_term_infos()
180
-
181
- ensure
182
- [@freq_output, @prox_output, @term_infos_writer, @queue].each do |obj|
183
- obj.close()
184
- end
185
- end
186
- end
187
-
188
- def merge_term_infos()
189
- base = 0
190
- @readers.each do |reader|
191
- term_enum = reader.terms()
192
- smi = SegmentMergeInfo.new(base, term_enum, reader)
193
- base += reader.num_docs()
194
- if (smi.next?)
195
- @queue.push(smi) # initialize @queue
196
- else
197
- smi.close()
198
- end
199
- end
200
-
201
- match = Array.new(@readers.size)
202
-
203
- while (@queue.size > 0)
204
- match_size = 0 # pop matching terms
205
- match[match_size] = @queue.pop
206
- match_size += 1
207
- #term = match[0].term
208
- term_buffer = match[0].term_buffer
209
- top = @queue.top
210
-
211
- #while top and term == top.term
212
- while top and term_buffer == top.term_buffer
213
- match[match_size] = @queue.pop
214
- match_size += 1
215
- top = @queue.top
216
- end
217
-
218
- merge_term_info(match, match_size) # add new TermInfo
219
-
220
- while (match_size > 0)
221
- match_size -= 1
222
- smi = match[match_size]
223
- if (smi.next?)
224
- @queue.push(smi) # restore queue
225
- else
226
- smi.close() # done with a segment
227
- end
228
- end
229
- end
230
- end
231
-
232
- # Merge one term found in one or more segments. The array <code>smis</code>
233
- # contains segments that are positioned at the same term. <code>N</code>
234
- # is the number of cells in the array actually occupied.
235
- #
236
- # smis:: array of segments
237
- # n:: number of cells in the array actually occupied
238
- def merge_term_info(smis, n)
239
-
240
- freq_pointer = @freq_output.pos
241
- prox_pointer = @prox_output.pos
242
-
243
- df = append_postings(smis, n) # append posting data
244
-
245
- skip_pointer = write_skip()
246
-
247
- if (df > 0)
248
- # add an entry to the dictionary with pointers to prox and freq files
249
- @term_info.set_values!(df, freq_pointer, prox_pointer, (skip_pointer - freq_pointer))
250
- @term_infos_writer.add(smis[0].term_buffer.term, @term_info)
251
- end
252
- end
253
-
254
- # Process postings from multiple segments all positioned on the
255
- # same term. Writes out merged entries into @freq_utput and
256
- # the @prox_output streams.
257
- #
258
- # smis:: array of segments
259
- # n:: number of cells in the array actually occupied
260
- # returns:: number of documents across all segments where this term was found
261
- def append_postings(smis, n)
262
- last_doc = 0
263
- df = 0 # number of docs w/ term
264
- reset_skip()
265
- n.times do |i|
266
- smi = smis[i]
267
- postings = smi.positions
268
- base = smi.base
269
- doc_map = smi.doc_map
270
-
271
- postings.seek(smi.term_enum)
272
- while (postings.next?)
273
- doc = postings.doc()
274
- doc = doc_map[doc] if (doc_map != nil) # work around deletions
275
- doc += base # convert to merged space
276
-
277
- if (doc < last_doc)
278
- raise "docs out of order curent doc = " + doc.to_s +
279
- " and previous doc = " + last_doc.to_s
280
- end
281
-
282
- df += 1
283
-
284
- if ((df % @skip_interval) == 0)
285
- buffer_skip(last_doc)
286
- end
287
-
288
- doc_code = (doc - last_doc) << 1 # use low bit to flag freq=1
289
- last_doc = doc
290
-
291
- freq = postings.freq
292
- if (freq == 1)
293
- @freq_output.write_vint(doc_code | 1) # write doc & freq=1
294
- else
295
- @freq_output.write_vint(doc_code) # write doc
296
- @freq_output.write_vint(freq) # write frequency in doc
297
- end
298
-
299
- last_position = 0 # write position deltas
300
- freq.times do |j|
301
- position = postings.next_position()
302
- @prox_output.write_vint(position - last_position)
303
- last_position = position
304
- end
305
- end
306
- end
307
- return df
308
- end
309
-
310
- def reset_skip()
311
- @skip_buffer.reset()
312
- @last_skip_doc = 0
313
- @last_skip_freq_pointer = @freq_output.pos
314
- @last_skip_prox_pointer = @prox_output.pos
315
- end
316
-
317
- def buffer_skip(doc)
318
- freq_pointer = @freq_output.pos
319
- prox_pointer = @prox_output.pos
320
-
321
- @skip_buffer.write_vint(doc - @last_skip_doc)
322
- @skip_buffer.write_vint(freq_pointer - @last_skip_freq_pointer)
323
- @skip_buffer.write_vint(prox_pointer - @last_skip_prox_pointer)
324
-
325
- @last_skip_doc = doc
326
- @last_skip_freq_pointer = freq_pointer
327
- @last_skip_prox_pointer = prox_pointer
328
- end
329
-
330
- def write_skip()
331
- skip_pointer = @freq_output.pos
332
- @skip_buffer.write_to(@freq_output)
333
- return skip_pointer
334
- end
335
-
336
- def merge_norms()
337
- @field_infos.each_with_index do |fi, i|
338
- if (fi.indexed? and not fi.omit_norms?)
339
- output = @directory.create_output(@segment + ".f" + i.to_s)
340
- begin
341
- @readers.each do |reader|
342
- max_doc = reader.max_doc()
343
- input = "0" * max_doc
344
- reader.get_norms_into(fi.name, input, 0)
345
- max_doc.times do |k|
346
- if not reader.deleted?(k)
347
- output.write_byte(input[k])
348
- end
349
- end
350
- end
351
- ensure
352
- output.close()
353
- end
354
- end
355
- end
356
- end
357
- end
358
- end