ferret 0.9.6 → 0.10.0

Files changed (295)
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
data/lib/ferret/index/compound_file_io.rb
@@ -1,338 +0,0 @@
- require 'monitor'
-
- module Ferret::Index
-
-   # Class for accessing a compound stream.
-   # This class implements a directory, but is limited to only read operations.
-   # Directory methods that would normally modify data raise.
-   class CompoundFileReader < Ferret::Store::Directory
-
-     include MonitorMixin
-
-     attr_reader :directory, :file_name
-
-     # Creates a Compound File Reader which contains a single file and has
-     # pointers to the individual files within. When it is initialized, the
-     # compound file is set and the header is read so that it is ready to read
-     # the individual files within.
-     def initialize(dir, name)
-
-       super()
-
-       @directory = dir
-       @file_name = name
-       @entries = {}
-
-       success = false
-
-       begin
-         @stream = dir.open_input(name)
-
-         # read the directory and init files
-         count = @stream.read_vint()
-         entry = nil
-         count.times() do
-           offset = @stream.read_long()
-           id = @stream.read_string()
-
-           if (entry != nil)
-             # set length of the previous entry
-             entry.length = offset - entry.offset
-           end
-
-           entry = FileEntry.new(offset)
-           @entries[id] = entry
-         end
-
-         # set the length of the final entry
-         if (entry != nil)
-           entry.length = @stream.length() - entry.offset
-         end
-
-         success = true
-
-       ensure
-
-         if not success and (@stream != nil)
-           begin
-             @stream.close()
-           rescue IOError
-           end
-         end
-       end
-     end
-
-     def close()
-       synchronize do
-         if (@stream == nil): raise(IOError, "Already closed") end
-
-         @entries.clear()
-         @stream.close()
-         @stream = nil
-       end
-     end
-
-     def open_input(id)
-       synchronize do
-         if (@stream == nil)
-           raise(IOError, "Stream closed")
-         end
-
-         entry = @entries[id]
-         if (entry == nil)
-           raise(IOError, "No sub-file with id " + id + " found")
-         end
-         return CSIndexInput.new(@stream, entry.offset, entry.length)
-       end
-     end
-
-     # Returns an array of strings, one for each file in the directory.
-     def list()
-       return @entries.keys()
-     end
-
-     # Returns true iff a file with the given name exists.
-     def exists?(name)
-       return @entries.key?(name)
-     end
-
-     # Returns the time the named file was last modified.
-     def modified(name)
-       return @directory.modified(@file_name)
-     end
-
-     # Set the modified time of an existing file to now.
-     def touch(name)
-       @directory.touch(@file_name)
-     end
-
-     # Not implemented
-     def remove(name) raise(NotImplementedError) end
-
-     # Not implemented
-     def rename(from, to) raise(NotImplementedError) end
-
-     # Returns the length of a file in the directory.
-     def length(name)
-       e = @entries[name]
-       if (e == nil): raise(IOError, "File " + name + " does not exist") end
-       return e.length
-     end
-
-     # Not implemented
-     def create_output(name) raise(NotImplementedError) end
-
-     # Not implemented
-     def make_lock(name) raise(NotImplementedError) end
-
-     # Implementation of an IndexInput that reads from a portion of the
-     # compound file.
-     class CSIndexInput < Ferret::Store::BufferedIndexInput
-       attr_reader :length
-
-       def initialize(base, file_offset, length)
-         super()
-         @base = base
-         @base.extend(MonitorMixin)
-         @file_offset = file_offset
-         @length = length
-       end
-
-       # Closes the stream to further operations.
-       def close() end
-
-       private
-       # Expert: implements buffer refill. Reads bytes from the current
-       # position in the input.
-       #
-       # b::      the array to read bytes into
-       # offset:: the offset in the array to start storing bytes
-       # len::    the number of bytes to read
-       def read_internal(b, offset, len)
-         @base.synchronize() do
-           start = pos()
-           if (start + len > @length): raise(EOFError, "read past EOF") end
-           @base.seek(@file_offset + start)
-           @base.read_bytes(b, offset, len)
-         end
-       end
-
-       # Expert: implements seek. Sets the current position in @file, where
-       # the next read_internal will occur.
-       def seek_internal(pos) end
-     end
-
-     private
-     # Base info
-     class FileEntry
-       attr_accessor :offset, :length
-       def initialize(offset)
-         @offset = offset
-       end
-     end
-
-   end
-
-   # Combines multiple files into a single compound file.
-   # The file format:
-   #
-   # * VInt fileCount
-   # * {Directory} fileCount entries with the following structure:
-   #   + long data_offset
-   #   + UTFString extension
-   # * {File Data} fileCount entries with the raw data of the corresponding file
-   #
-   # The fileCount integer indicates how many files are contained in this compound
-   # file. The {directory} that follows has that many entries. Each directory entry
-   # contains an encoding identifier, a long pointer to the start of this file's
-   # data section, and a UTF String with that file's extension.
-   class CompoundFileWriter
-
-     class StateError < Exception
-     end
-
-     attr_reader :directory, :file_name
-
-     # Create the compound stream in the specified file. The file name is the
-     # entire name (no extensions are added).
-     def initialize(dir, name)
-       @directory = dir
-       @file_name = name
-       @ids = Set.new
-       @file_entries = []
-       @merged = false
-     end
-
-     # Add a source stream. _file_name_ is the string by which the
-     # sub-stream will be known in the compound stream.
-     #
-     # Raises:: StateError if this writer is closed
-     # Raises:: ArgumentError if a file with the same name
-     #          has been added already
-     def add_file(file_name)
-       if @merged
-         raise(StateError, "Can't add extensions after merge has been called")
-       end
-
-       if not @ids.add?(file_name)
-         raise(ArgumentError, "File #{file_name} already added")
-       end
-
-       entry = FileEntry.new(file_name)
-       @file_entries << entry
-     end
-
-     # Merge files with the extensions added up to now.
-     # All files with these extensions are combined sequentially into the
-     # compound stream. After a successful merge, the source files
-     # are deleted.
-     #
-     # Raises:: StateError if close() had been called before or
-     #          if no file has been added to this object
-     def close()
-
-       if @merged
-         raise(StateError, "Merge already performed")
-       end
-
-       if @file_entries.empty?
-         raise(StateError, "No entries to merge have been defined")
-       end
-
-       @merged = true
-
-       # open the compound stream
-       os = nil
-       begin
-         os = @directory.create_output(@file_name)
-
-         # Write the number of entries
-         os.write_vint(@file_entries.size)
-
-         # Write the directory with all offsets at 0.
-         # Remember the positions of directory entries so that we can
-         # adjust the offsets later
-         @file_entries.each do |fe|
-           fe.dir_offset = os.pos()
-           os.write_long(0) # for now
-           os.write_string(fe.file_name)
-         end
-
-         # Open the files and copy their data into the stream.
-         # Remember the locations of each file's data section.
-         @file_entries.each do |fe|
-           fe.data_offset = os.pos()
-           copy_file(fe, os)
-         end
-
-         # Write the data offsets into the directory of the compound stream
-         @file_entries.each do |fe|
-           os.seek(fe.dir_offset)
-           os.write_long(fe.data_offset)
-         end
-
-         # Close the output stream. Set os to nil before trying to
-         # close so that if an exception occurs during the close, the
-         # ensure clause below will not attempt to close the stream
-         # a second time.
-         tmp = os
-         os = nil
-         tmp.close()
-
-       ensure
-         if (os != nil)
-           begin
-             os.close()
-           rescue
-           end
-         end
-       end
-     end
-
-     private
-
-     # Internal class for holding a file
-     FileEntry = Struct.new(:file_name, :dir_offset, :data_offset)
-
-     # Copy the contents of the file with the specified extension into the
-     # provided output stream. Use a buffer for moving data
-     # to reduce memory allocation.
-     def copy_file(source, os)
-       is = nil
-       begin
-         start_ptr = os.pos()
-
-         is = @directory.open_input(source.file_name)
-         remainder = length = is.length
-
-         buffer = Ferret::Store::BUFFER.clone
-         while (remainder > 0)
-           len = [remainder, Ferret::Store::BUFFER_SIZE].min
-           is.read_bytes(buffer, 0, len)
-           os.write_bytes(buffer, len)
-           remainder -= len
-         end
-
-         # Verify that remainder is 0
-         if (remainder != 0)
-           raise(IOError,
-                 "Non-zero remainder length after copying: #{remainder} " +
-                 "(id: #{source.file_name}, length: #{length}, buffer size: " +
-                 " #{Ferret::Store::BUFFER_SIZE})")
-         end
-
-         # Verify that the output length diff is equal to the original file
-         end_ptr = os.pos()
-         diff = end_ptr - start_ptr
-         if (diff != length)
-           raise(IOError,
-                 "Difference in the output file offsets #{diff}" +
-                 " does not match the original file length #{length}")
-         end
-
-       ensure
-         if (is != nil): is.close() end
-       end
-     end
-   end
- end
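
The directory header that CompoundFileWriter lays down (a VInt entry count, then a long data offset and a name string per sub-file) can be decoded with plain Ruby IO. The sketch below is illustrative only, not part of the gem: it assumes Lucene-style primitives, i.e. a VInt stored as a base-128 varint whose high bit flags a continuation byte, a long stored as eight big-endian bytes, and a string simplified to a VInt byte length followed by the raw bytes (the real format writes UTF-8 characters).

    # Read the compound-file directory header written by CompoundFileWriter.
    # Returns a hash mapping each sub-file name to the offset of its data.

    def read_vint(io)
      value = 0
      shift = 0
      loop do
        byte = io.readbyte
        value |= (byte & 0x7f) << shift # low 7 bits carry data
        return value if byte < 0x80     # high bit clear: last byte
        shift += 7
      end
    end

    def read_long(io)
      hi, lo = io.read(8).unpack('NN')  # two big-endian 32-bit words
      (hi << 32) | lo
    end

    def read_string(io)
      io.read(read_vint(io))            # VInt length, then the bytes
    end

    def read_compound_directory(path)
      entries = {}
      File.open(path, 'rb') do |io|
        read_vint(io).times do
          offset = read_long(io)
          entries[read_string(io)] = offset
        end
      end
      entries
    end

Note that lengths are never stored: as CompoundFileReader#initialize shows, each entry's length is recovered as the distance to the next entry's offset, or to the end of the file for the last entry.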
data/lib/ferret/index/document_writer.rb
@@ -1,289 +0,0 @@
- require 'ferret/search/similarity'
-
- module Ferret::Index
-
-   class DocumentWriter
-     # If non-nil, a message will be printed to this if max_field_length is
-     # reached.
-     attr_writer :info_stream
-
-     # directory::           The directory to write the document information to
-     # analyzer::            The analyzer to use for the document
-     # similarity::          The Similarity function (writer.similarity)
-     # max_field_length::    The maximum number of tokens a field may have
-     #                       (writer.max_field_length)
-     # term_index_interval:: The interval of terms in the index
-     #                       (writer.term_index_interval)
-     def initialize(directory,
-                    analyzer,
-                    similarity,
-                    max_field_length,
-                    term_index_interval = IndexWriter::DEFAULT_TERM_INDEX_INTERVAL)
-       @directory = directory
-       @analyzer = analyzer
-       @similarity = similarity
-       @max_field_length = max_field_length
-       @term_index_interval = term_index_interval
-
-       # Keys are Terms, values are Postings.
-       # Used to buffer a document before it is written to the index.
-       @posting_table = {}
-
-       @term_buffer = Term.new("", "")
-     end
-
-     def add_document(segment, doc)
-
-       # write field names
-       @field_infos = FieldInfos.new()
-       @field_infos << doc
-       @field_infos.write_to_dir(@directory, segment + ".fnm")
-
-       # write field values
-       fields_writer = FieldsWriter.new(@directory, segment, @field_infos)
-       begin
-         fields_writer.add_document(doc)
-       ensure
-         fields_writer.close()
-       end
-
-       # invert doc into posting_table
-       @posting_table.clear() # clear posting_table
-       arr_size = @field_infos.size
-       @field_lengths = Array.new(arr_size, 0)          # init field_lengths
-       @field_positions = Array.new(arr_size, 0)        # init field_positions
-       @field_offsets = Array.new(arr_size, 0)          # init field_offsets
-       @field_boosts = Array.new(arr_size, doc.boost)   # init field_boosts
-
-       invert_document(doc)
-
-       # sort posting_table into an array
-       postings = sort_posting_table()
-
-       # for (int i = 0; i < postings.length; i += 1)
-       #   Posting posting = postings[i]
-       #   print(posting.term)
-       #   print(" freq=" + posting.freq)
-       #   print(" pos=")
-       #   print(posting.positions[0])
-       #   for (int j = 1; j < posting.freq; j += 1)
-       #     print("," + posting.positions[j])
-       #   puts("")
-       # end
-
-       # write postings
-       write_postings(postings, segment)
-
-       # write norms of indexed fields
-       write_norms(segment)
-
-     end
-
-     private
-
-     # Tokenizes the fields of a document into Postings.
-     def invert_document(doc)
-
-       fields = doc.all_fields
-       fields.each do |field|
-         field_name = field.name
-         field_info = @field_infos[field_name]
-         field_number = field_info.number
-
-         length = @field_lengths[field_number]     # length of field
-         position = @field_positions[field_number] # position in field
-         position += @analyzer.pos_inc_gap(field_name) if length > 0
-         offset = @field_offsets[field_number]     # offset field
-
-         if field_info.indexed?
-           if not field.tokenized? # un-tokenized field
-             string_value = field.string_value
-             if field_info.store_offsets?
-               add_position(field_name,
-                            string_value,
-                            position,
-                            TermVectorOffsetInfo.new(offset,
-                                                     offset + string_value.length))
-               position += 1
-             else
-               add_position(field_name, string_value, position, nil)
-               position += 1
-             end
-             offset += string_value.length()
-             length += 1
-           else
-
-             reader = field.reader_value()
-
-             # Tokenize field and add to posting_table
-             stream = @analyzer.token_stream(field_name, reader)
-             begin
-               last_token = nil
-               while token = stream.next
-                 position += (token.pos_inc - 1)
-
-                 if (field_info.store_offsets?())
-                   add_position(field_name,
-                                token.text(),
-                                position,
-                                TermVectorOffsetInfo.new(
-                                  offset + token.start_offset(),
-                                  offset + token.end_offset()))
-                   position += 1
-                 else
-                   add_position(field_name, token.text(), position, nil)
-                   position += 1
-                 end
-
-                 last_token = token
-                 length += 1
-                 if (length > @max_field_length)
-                   if @info_stream
-                     @info_stream.puts("max_field_length " + @max_field_length.to_s + " reached, ignoring following tokens")
-                   end
-                   break
-                 end
-               end
-
-               if (last_token != nil)
-                 offset += last_token.end_offset() + 1
-               end
-
-             ensure
-               stream.close()
-             end
-           end
-
-           @field_lengths[field_number] = length     # save field length
-           @field_positions[field_number] = position # save field position
-           @field_boosts[field_number] *= field.boost
-           @field_offsets[field_number] = offset
-         end
-       end
-     end
-
-
-     def add_position(field, text, position, tv_offset_info)
-       @term_buffer.set!(field, text)
-       #puts("Offset: " + tv_offset_info)
-       posting = @posting_table[@term_buffer]
-       if (posting != nil) # word seen before
-         freq = posting.freq
-         posting.positions[freq] = position # add new position
-         posting.offsets[freq] = tv_offset_info # add new offset
-
-         if (tv_offset_info != nil)
-           posting.offsets[freq] = tv_offset_info
-         end
-         posting.freq = freq + 1 # update frequency
-       else # word not seen before
-         term = Term.new(field, text)
-         @posting_table[term] = Posting.new(term, position, tv_offset_info)
-       end
-     end
-
-     def sort_posting_table()
-       # copy @posting_table into an array
-       return @posting_table.values.sort { |x,y| x.term <=> y.term }
-     end
-
-     def write_postings(postings, segment)
-
-       freq = nil
-       prox = nil
-       tis_writer = nil
-       tv_writer = nil
-       begin
-         # open files for inverse index storage
-         freq = @directory.create_output(segment + ".frq")
-         prox = @directory.create_output(segment + ".prx")
-         tis_writer = TermInfosWriter.new(@directory, segment, @field_infos,
-                                          @term_index_interval)
-         ti = TermInfo.new()
-         current_field = nil
-
-         postings.each do |posting|
-           # add an entry to the dictionary with pointers to prox and freq files
-           ti.set_values!(1, freq.pos(), prox.pos(), -1)
-           tis_writer.add(posting.term, ti)
-
-           # add an entry to the freq file
-           posting_freq = posting.freq
-           if (posting_freq == 1) # optimize freq=1
-             freq.write_vint(1) # set low bit of doc num.
-           else
-             freq.write_vint(0) # the document number
-             freq.write_vint(posting_freq) # frequency in doc
-           end
-
-           last_position = 0 # write positions
-           posting.positions.each do |position|
-             prox.write_vint(position - last_position)
-             last_position = position
-           end
-           # check to see if we switched to a new field
-           term_field = posting.term.field
-           if (current_field != term_field)
-             # changing field - see if there is something to save
-             current_field = term_field
-             fi = @field_infos[current_field]
-             if (fi.store_term_vector?)
-               if tv_writer.nil?
-                 tv_writer = TermVectorsWriter.new(@directory, segment, @field_infos)
-                 tv_writer.open_document()
-               end
-               tv_writer.open_field(current_field)
-
-             elsif not tv_writer.nil?
-               tv_writer.close_field()
-             end
-           end
-           if not tv_writer.nil? and tv_writer.field_open?
-             tv_writer.add_term(posting.term.text, posting_freq, posting.positions, posting.offsets)
-           end
-         end
-         if not tv_writer.nil?
-           tv_writer.close_document()
-         end
-       ensure
-         # make an effort to close all streams we can but remember and re-raise
-         # the last exception encountered in this process
-         keep = nil
-         [freq, prox, tis_writer, tv_writer].compact.each do |obj|
-           begin
-             obj.close
-           rescue IOError => e
-             keep = e
-           end
-         end
-         raise keep if not keep.nil?
-       end
-     end
-
-     def write_norms(segment)
-       @field_infos.each_with_index do |fi, i|
-         if fi.indexed? and not fi.omit_norms?
-           norm = @field_boosts[i] * @similarity.length_norm(fi.name, @field_lengths[i])
-           norms = @directory.create_output(segment + ".f" + i.to_s)
-           begin
-             norms.write_byte(Ferret::Search::Similarity.encode_norm(norm))
-           ensure
-             norms.close()
-           end
-         end
-       end
-     end
-
-   end
-
-   class Posting # info about a Term in a doc
-     attr_accessor :term, :freq, :positions, :offsets
-
-     def initialize(t, position, offset)
-       @term = t
-       @freq = 1
-       @positions = [position]
-       @offsets = [offset]
-     end
-   end
- end
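
The heart of DocumentWriter above is inversion: add_document tokenizes each field and fills @posting_table, mapping every term to a Posting that accumulates a frequency and the positions where the term occurred; the table is then sorted and written out. A minimal sketch of that inversion step, independent of Ferret's classes (the whitespace split here is a stand-in for the analyzer's token stream):

    # Invert a document body into a posting table: term => (freq, positions).
    Posting = Struct.new(:freq, :positions)

    def invert(text)
      table = Hash.new { |h, term| h[term] = Posting.new(0, []) }
      text.downcase.split.each_with_index do |term, position|
        posting = table[term]
        posting.freq += 1
        posting.positions << position # positions make phrase queries possible
      end
      table
    end

    table = invert("to be or not to be")
    table["to"].positions # => [0, 4]
    table["be"].freq      # => 2

Because each term's position list is sorted, write_postings can store it as deltas (position - last_position), which keeps the VInts written to the .prx stream small.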