ferret 0.9.6 → 0.10.0

This diff shows the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (295)
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
data/lib/ferret/index/compound_file_io.rb
@@ -1,338 +0,0 @@
- require 'monitor'
-
- module Ferret::Index
-
-   # Class for accessing a compound stream.
-   # This class implements a directory, but is limited to only read operations.
-   # Directory methods that would normally modify data raise.
-   class CompoundFileReader < Ferret::Store::Directory
-
-     include MonitorMixin
-
-     attr_reader :directory, :file_name
-
-     # Creates a Compound File Reader which contains a single file and has
-     # pointers to the individual files within. When it is initialized, the
-     # compound file is set and the header is read so that it is ready to read
-     # the individual files within.
-     def initialize(dir, name)
-
-       super()
-
-       @directory = dir
-       @file_name = name
-       @entries = {}
-
-       success = false
-
-       begin
-         @stream = dir.open_input(name)
-
-         # read the directory and init files
-         count = @stream.read_vint()
-         entry = nil
-         count.times() do
-           offset = @stream.read_long()
-           id = @stream.read_string()
-
-           if (entry != nil)
-             # set length of the previous entry
-             entry.length = offset - entry.offset
-           end
-
-           entry = FileEntry.new(offset)
-           @entries[id] = entry
-         end
-
-         # set the length of the final entry
-         if (entry != nil)
-           entry.length = @stream.length() - entry.offset
-         end
-
-         success = true
-
-       ensure
-
-         if not success and (@stream != nil)
-           begin
-             @stream.close()
-           rescue IOError
-           end
-         end
-       end
-     end
-
-     def close()
-       synchronize do
-         if (@stream == nil): raise(IOError, "Already closed") end
-
-         @entries.clear()
-         @stream.close()
-         @stream = nil
-       end
-     end
-
-     def open_input(id)
-       synchronize do
-         if (@stream == nil)
-           raise(IOError, "Stream closed")
-         end
-
-         entry = @entries[id]
-         if (entry == nil)
-           raise(IOError, "No sub-file with id " + id + " found")
-         end
-         return CSIndexInput.new(@stream, entry.offset, entry.length)
-       end
-     end
-
-     # Returns an array of strings, one for each file in the directory.
-     def list()
-       return @entries.keys()
-     end
-
-     # Returns true iff a file with the given name exists.
-     def exists?(name)
-       return @entries.key?(name)
-     end
-
-     # Returns the time the named file was last modified.
-     def modified(name)
-       return @directory.modified(@file_name)
-     end
-
-     # Set the modified time of an existing file to now.
-     def touch(name)
-       @directory.touch(@file_name)
-     end
-
-     # Not implemented
-     def remove(name) raise(NotImplementedError) end
-
-     # Not implemented
-     def rename(from, to) raise(NotImplementedError) end
-
-     # Returns the length of a file in the directory.
-     def length(name)
-       e = @entries[name]
-       if (e == nil): raise(IOError, "File " + name + " does not exist") end
-       return e.length
-     end
-
-     # Not implemented
-     def create_output(name) raise(NotImplementedError) end
-
-     # Not implemented
-     def make_lock(name) raise(NotImplementedError) end
-
-     # Implementation of an IndexInput that reads from a portion of the
-     # compound file.
-     class CSIndexInput < Ferret::Store::BufferedIndexInput
-       attr_reader :length
-
-       def initialize(base, file_offset, length)
-         super()
-         @base = base
-         @base.extend(MonitorMixin)
-         @file_offset = file_offset
-         @length = length
-       end
-
-       # Closes the stream to further operations.
-       def close() end
-
-       private
-       # Expert: implements buffer refill. Reads bytes from the current
-       # position in the input.
-       #
-       # b::      the array to read bytes into
-       # offset:: the offset in the array to start storing bytes
-       # len::    the number of bytes to read
-       def read_internal(b, offset, len)
-         @base.synchronize() do
-           start = pos()
-           if(start + len > @length): raise(EOFError, "read past EOF") end
-           @base.seek(@file_offset + start)
-           @base.read_bytes(b, offset, len)
-         end
-       end
-
-       # Expert: implements seek. Sets current position in @file, where
-       # the next {@link #read_internal(byte[],int,int)} will occur.
-       def seek_internal(pos) end
-     end
-
-     private
-     # Base info
-     class FileEntry
-       attr_accessor :offset, :length
-       def initialize(offset)
-         @offset = offset
-       end
-     end
-
-   end
-
-   # Combines multiple files into a single compound file.
-   # The file format:
-   #
-   # * VInt fileCount
-   # * {Directory} fileCount entries with the following structure:
-   #   + long data_offset
-   #   + UTFString extension
-   # * {File Data} fileCount entries with the raw data of the corresponding file
-   #
-   # The fileCount integer indicates how many files are contained in this compound
-   # file. The {directory} that follows has that many entries. Each directory entry
-   # contains an encoding identifier, a long pointer to the start of this file's
-   # data section, and a UTF String with that file's extension.
-   class CompoundFileWriter
-
-     class StateError < Exception
-     end
-
-     attr_reader :directory, :file_name
-
-     # Create the compound stream in the specified file. The file name is the
-     # entire name (no extensions are added).
-     def initialize(dir, name)
-       @directory = dir
-       @file_name = name
-       @ids = Set.new
-       @file_entries = []
-       @merged = false
-     end
-
-     # Add a source stream. _file_name_ is the string by which the
-     # sub-stream will be known in the compound stream.
-     #
-     # Raises:: StateError if this writer is closed
-     # Raises:: ArgumentError if a file with the same name
-     #          has been added already
-     def add_file(file_name)
-       if @merged
-         raise(StateError, "Can't add extensions after merge has been called")
-       end
-
-       if not @ids.add?(file_name)
-         raise(ArgumentError, "File #{file_name} already added")
-       end
-
-       entry = FileEntry.new(file_name)
-       @file_entries << entry
-     end
-
-     # Merge files with the extensions added up to now.
-     # All files with these extensions are combined sequentially into the
-     # compound stream. After successful merge, the source files
-     # are deleted.
-     #
-     # Throws:: StateException if close() had been called before or
-     #          if no file has been added to this object
-     def close()
-
-       if @merged
-         raise(StateException, "Merge already performed")
-       end
-
-       if @file_entries.empty?
-         raise(StateException, "No entries to merge have been defined")
-       end
-
-       @merged = true
-
-       # open the compound stream
-       os = nil
-       begin
-         os = @directory.create_output(@file_name)
-
-         # Write the number of entries
-         os.write_vint(@file_entries.size)
-
-         # Write the directory with all offsets at 0.
-         # Remember the positions of directory entries so that we can
-         # adjust the offsets later
-         @file_entries.each do |fe|
-           fe.dir_offset = os.pos()
-           os.write_long(0) # for now
-           os.write_string(fe.file_name)
-         end
-
-         # Open the files and copy their data into the stream.
-         # Remember the locations of each file's data section.
-         @file_entries.each do |fe|
-           fe.data_offset = os.pos()
-           copy_file(fe, os)
-         end
-
-         # Write the data offsets into the directory of the compound stream
-         @file_entries.each do |fe|
-           os.seek(fe.dir_offset)
-           os.write_long(fe.data_offset)
-         end
-
-         # Close the output stream. Set the os to nil before trying to
-         # close so that if an exception occurs during the close, the
-         # finally clause below will not attempt to close the stream
-         # the second time.
-         tmp = os
-         os = nil
-         tmp.close()
-
-       ensure
-         if (os != nil)
-           begin
-             os.close()
-           rescue
-           end
-         end
-       end
-     end
-
-     private
-
-     # Internal class for holding a file
-     FileEntry = Struct.new(:file_name, :dir_offset, :data_offset)
-
-     # Copy the contents of the file with specified extension into the
-     # provided output stream. Use a buffer for moving data
-     # to reduce memory allocation.
-     def copy_file(source, os)
-       is = nil
-       begin
-         start_ptr = os.pos()
-
-         is = @directory.open_input(source.file_name)
-         remainder = length = is.length
-
-         buffer = Ferret::Store::BUFFER.clone
-         while (remainder > 0)
-           len = [remainder, Ferret::Store::BUFFER_SIZE].min
-           is.read_bytes(buffer, 0, len)
-           os.write_bytes(buffer, len)
-           remainder -= len
-         end
-
-         # Verify that remainder is 0
-         if (remainder != 0)
-           raise(IOError,
-                 "Non-zero remainder length after copying: #{remainder} " +
-                 "(id: #{source.file_name}, length: #{length}, buffer size: " +
-                 " #{Ferret::Store::BUFFER_SIZE})")
-         end
-
-         # Verify that the output length diff is equal to original file
-         end_ptr = os.pos()
-         diff = end_ptr - start_ptr
-         if (diff != length)
-           raise(IOError,
-                 "Difference in the output file offsets #{diff}" +
-                 " does not match the original file length #{length}")
-         end
-
-       ensure
-         if (is != nil): is.close() end
-       end
-     end
-   end
- end
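
The CompoundFileWriter comments above describe the complete on-disk layout: a VInt file count, a directory of (long data_offset, UTF string name) pairs, then the raw sub-file data. The sketch below decodes that directory from a raw IO. It is illustrative only, not part of Ferret's API, and it assumes Lucene-style encodings: VInts as 7-bit groups, least-significant first, with the high bit as a continuation flag; longs as 8 big-endian bytes; strings as a VInt length followed by that many bytes.

    # Hypothetical decoder for the compound-file directory described above.
    def read_vint(io)
      value = 0
      shift = 0
      loop do
        byte = io.readbyte
        value |= (byte & 0x7f) << shift
        return value if byte < 0x80  # high bit clear: last group
        shift += 7
      end
    end

    def read_directory(io)
      count = read_vint(io)                # VInt fileCount
      Array.new(count) do
        offset = io.read(8).unpack1("Q>")  # long data_offset (big-endian assumed)
        name = io.read(read_vint(io))      # VInt length, then the name bytes
        [name, offset]
      end
    end

Note that sub-file lengths are never stored: as CompoundFileReader#initialize shows, each entry's length is the gap to the next entry's offset, and the last entry runs to the end of the stream.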
data/lib/ferret/index/document_writer.rb
@@ -1,289 +0,0 @@
- require 'ferret/search/similarity'
-
- module Ferret::Index
-
-   class DocumentWriter
-     # If non-nil, a message will be printed to this if max_field_length is
-     # reached.
-     attr_writer :info_stream
-
-     # directory::           The directory to write the document information to
-     # analyzer::            The analyzer to use for the document
-     # similarity::          The Similarity function writer.similarity
-     # max_field_length::    The maximum number of tokens a field may have
-     #                       writer.max_field_length
-     # term_index_interval:: The interval of terms in the index
-     #                       writer.max_field_length
-     def initialize(directory,
-                    analyzer,
-                    similarity,
-                    max_field_length,
-                    term_index_interval = IndexWriter::DEFAULT_TERM_INDEX_INTERVAL)
-       @directory = directory
-       @analyzer = analyzer
-       @similarity = similarity
-       @max_field_length = max_field_length
-       @term_index_interval = term_index_interval
-
-       # Keys are Terms, values are Postings.
-       # Used to buffer a document before it is written to the index.
-       @posting_table = {}
-
-       @term_buffer = Term.new("", "")
-     end
-
-     def add_document(segment, doc)
-
-       # write field names
-       @field_infos = FieldInfos.new()
-       @field_infos << doc
-       @field_infos.write_to_dir(@directory, segment + ".fnm")
-
-       # write field values
-       fields_writer = FieldsWriter.new(@directory, segment, @field_infos)
-       begin
-         fields_writer.add_document(doc)
-       ensure
-         fields_writer.close()
-       end
-
-       # invert doc into posting_table
-       @posting_table.clear(); # clear posting_table
-       arr_size = @field_infos.size
-       @field_lengths = Array.new(arr_size, 0)        # init field_lengths
-       @field_positions = Array.new(arr_size, 0)      # init field_positions
-       @field_offsets = Array.new(arr_size, 0)        # init field_offsets
-       @field_boosts = Array.new(arr_size, doc.boost) # init field_boosts
-
-       invert_document(doc)
-
-       # sort posting_table into an array
-       postings = sort_posting_table()
-
-       # for (int i = 0; i < postings.length; i += 1)
-       #   Posting posting = postings[i]
-       #   print(posting.term)
-       #   print(" freq=" + posting.freq)
-       #   print(" pos=")
-       #   print(posting.positions[0])
-       #   for (int j = 1; j < posting.freq; j += 1)
-       #     print("," + posting.positions[j])
-       #   puts("")
-       # end
-
-       # write postings
-       write_postings(postings, segment)
-
-       # write norms of indexed fields
-       write_norms(segment)
-
-     end
-
-     private
-
-     # Tokenizes the fields of a document into Postings.
-     def invert_document(doc)
-
-       fields = doc.all_fields
-       fields.each do |field|
-         field_name = field.name
-         field_info = @field_infos[field_name]
-         field_number = field_info.number
-
-         length = @field_lengths[field_number]     # length of field
-         position = @field_positions[field_number] # position in field
-         position += @analyzer.pos_inc_gap(field_name) if length > 0
-         offset = @field_offsets[field_number]     # offset field
-
-         if field_info.indexed?
-           if not field.tokenized? # un-tokenized field
-             string_value = field.string_value
-             if field_info.store_offsets?
-               add_position(field_name,
-                            string_value,
-                            position,
-                            TermVectorOffsetInfo.new(offset,
-                                                     offset + string_value.length))
-               position += 1
-             else
-               add_position(field_name, string_value, position, nil)
-               position += 1
-             end
-             offset += string_value.length()
-             length += 1
-           else
-
-             reader = field.reader_value()
-
-             # Tokenize field and add to posting_table
-             stream = @analyzer.token_stream(field_name, reader)
-             begin
-               last_token = nil
-               while token = stream.next
-                 position += (token.pos_inc - 1)
-
-                 if(field_info.store_offsets?())
-                   add_position(field_name,
-                                token.text(),
-                                position,
-                                TermVectorOffsetInfo.new(
-                                  offset + token.start_offset(),
-                                  offset + token.end_offset()))
-                   position += 1
-                 else
-                   add_position(field_name, token.text(), position, nil)
-                   position += 1
-                 end
-
-                 last_token = token
-                 length += 1
-                 if (length > @max_field_length)
-                   if @info_stream
-                     @info_stream.puts("max_field_length " + @max_field_length.to_s + " reached, ignoring following tokens")
-                   end
-                   break
-                 end
-               end
-
-               if(last_token != nil)
-                 offset += last_token.end_offset() + 1
-               end
-
-             ensure
-               stream.close()
-             end
-           end
-
-           @field_lengths[field_number] = length     # save field length
-           @field_positions[field_number] = position # save field position
-           @field_boosts[field_number] *= field.boost
-           @field_offsets[field_number] = offset
-         end
-       end
-     end
-
-
-     def add_position(field, text, position, tv_offset_info)
-       @term_buffer.set!(field, text)
-       #puts("Offset: " + tv_offset_info)
-       posting = @posting_table[@term_buffer]
-       if (posting != nil) # word seen before
-         freq = posting.freq
-         posting.positions[freq] = position     # add new position
-         posting.offsets[freq] = tv_offset_info # add new position
-
-         if (tv_offset_info != nil)
-           posting.offsets[freq] = tv_offset_info
-         end
-         posting.freq = freq + 1 # update frequency
-       else # word not seen before
-         term = Term.new(field, text)
-         @posting_table[term] = Posting.new(term, position, tv_offset_info)
-       end
-     end
-
-     def sort_posting_table()
-       # copy @posting_table into an array
-       return @posting_table.values.sort { |x,y| x.term <=> y.term }
-     end
-
-     def write_postings(postings, segment)
-
-       freq = nil
-       prox = nil
-       tis_writer = nil
-       tv_writer = nil
-       begin
-         #open files for inverse index storage
-         freq = @directory.create_output(segment + ".frq")
-         prox = @directory.create_output(segment + ".prx")
-         tis_writer = TermInfosWriter.new(@directory, segment, @field_infos,
-                                          @term_index_interval)
-         ti = TermInfo.new()
-         current_field = nil
-
-         postings.each do |posting|
-           # add an entry to the dictionary with pointers to prox and freq files
-           ti.set_values!(1, freq.pos(), prox.pos(), -1)
-           tis_writer.add(posting.term, ti)
-
-           # add an entry to the freq file
-           posting_freq = posting.freq
-           if (posting_freq == 1)          # optimize freq=1
-             freq.write_vint(1)            # set low bit of doc num.
-           else
-             freq.write_vint(0)            # the document number
-             freq.write_vint(posting_freq) # frequency in doc
-           end
-
-           last_position = 0 # write positions
-           posting.positions.each do |position|
-             prox.write_vint(position - last_position)
-             last_position = position
-           end
-           # check to see if we switched to a new field
-           term_field = posting.term.field
-           if (current_field != term_field)
-             # changing field - see if there is something to save
-             current_field = term_field
-             fi = @field_infos[current_field]
-             if (fi.store_term_vector?)
-               if tv_writer.nil?
-                 tv_writer = TermVectorsWriter.new(@directory, segment, @field_infos)
-                 tv_writer.open_document()
-               end
-               tv_writer.open_field(current_field)
-
-             elsif not tv_writer.nil?
-               tv_writer.close_field()
-             end
-           end
-           if not tv_writer.nil? and tv_writer.field_open?
-             tv_writer.add_term(posting.term.text, posting_freq, posting.positions, posting.offsets)
-           end
-         end
-         if not tv_writer.nil?
-           tv_writer.close_document()
-         end
-       ensure
-         # make an effort to close all streams we can but remember and re-raise
-         # the last exception encountered in this process
-         keep = nil
-         [freq, prox, tis_writer, tv_writer].compact.each do |obj|
-           begin
-             obj.close
-           rescue IOError => e
-             keep = e
-           end
-         end
-         raise keep if not keep.nil?
-       end
-     end
-
-     def write_norms(segment)
-       @field_infos.each_with_index do |fi, i|
-         if fi.indexed? and not fi.omit_norms?
-           norm = @field_boosts[i] * @similarity.length_norm(fi.name, @field_lengths[i])
-           norms = @directory.create_output(segment + ".f" + i.to_s)
-           begin
-             norms.write_byte(Ferret::Search::Similarity.encode_norm(norm))
-           ensure
-             norms.close()
-           end
-         end
-       end
-     end
-
-   end
-
-   class Posting # info about a Term in a doc
-     attr_accessor :term, :freq, :positions, :offsets
-
-     def initialize(t, position, offset)
-       @term = t
-       @freq = 1
-       @positions = [position]
-       @offsets = [offset]
-     end
-   end
- end
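
Two encoding details from write_postings above are worth spelling out. A term that occurs once in the document is written to the .frq stream as the single VInt 1 (the low bit of the doc number doubles as a "frequency is 1" flag); otherwise 0 is written followed by the frequency. Positions in the .prx stream are delta-encoded, so each VInt is the gap from the previous position rather than an absolute value. A hypothetical helper (not part of the gem) showing the position transform:

    # Delta-encode a sorted position list the way write_postings does
    # before handing each value to prox.write_vint.
    def position_deltas(positions)
      last = 0
      positions.map { |p| delta = p - last; last = p; delta }
    end

    position_deltas([3, 10, 12])  #=> [3, 7, 2]

Small deltas keep the VInts short, which is the point of the encoding.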