ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295) hide show
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
@@ -1,279 +0,0 @@
1
- module Ferret
2
- module Index
3
- # Access to the Field Info file that describes document fields and whether or
4
- # not they are indexed. Each segment has a separate Field Info file. Objects
5
- # of this class are thread-safe for multiple readers, but only one thread can
6
- # be adding documents at a time, with no other reader or writer threads
7
- # accessing this object.
8
- class FieldInfos
9
-
10
- NOT_A_FIELD = 0xffffffff # -1 in java int
11
-
12
- # Construct a FieldInfos object. If a directory is given and it contains
13
- # the named file, the field infos are read from that file.
14
- #
15
- # dir:: The directory to open the InputStream from
16
- # name:: The name of the file to open the InputStream from in the Directory
17
- def initialize(dir = nil, name = nil)
18
- @fi_array = []
19
- @fi_hash = {}
20
- if dir and dir.exists?(name)
21
- input = dir.open_input(name)
22
- begin
23
- read(input)
24
- ensure
25
- input.close()
26
- end
27
- end
28
- end
29
-
30
- # Automatically adds all of the fields from the document if they haven't
31
- # been added already. Or it will update the values.
32
- def add_doc_fields(doc)
33
- doc.all_fields.each do |field|
34
- add(field.name,
35
- field.indexed?,
36
- field.store_term_vector?,
37
- field.store_positions?,
38
- field.store_offsets?,
39
- field.omit_norms?)
40
- end
41
- end
42
- alias :<< :add_doc_fields
43
-
44
- # Calls the six-parameter add method for each name in the collection
45
- def add_fields(names,
46
- indexed = true,
47
- store_term_vector = false,
48
- store_position = false,
49
- store_offset = false,
50
- omit_norms = false)
51
- names.each do |name|
52
- add(name, indexed, store_term_vector, store_position,
53
- store_offset, omit_norms)
54
- end
55
- end
56
-
57
- # If the field is not yet known, adds it. If it is known, checks to make
58
- # sure that the indexed flag is the same as was given previously for this
59
- # field. If not - marks it as being indexed. Same goes for the TermVector
60
- # parameters.
61
- #
62
- # name:: The name of the field
63
- # indexed:: true if the field is indexed
64
- # store_term_vector:: true if the term vector should be stored
65
- # store_position:: true if the positions should be stored
66
- # store_offset:: true if the offsets should be stored
67
- def add(name,
68
- indexed = true,
69
- store_term_vector = false,
70
- store_position = false,
71
- store_offset = false,
72
- omit_norms = false)
73
- fi = @fi_hash[name]
74
- if (fi == nil)
75
- fi = add_internal(name, indexed, store_term_vector, store_position,
76
- store_offset, omit_norms)
77
- else
78
- if (fi.indexed? != indexed)
79
- fi.indexed = true # once indexed, always index
80
- end
81
- if (fi.store_term_vector? != store_term_vector)
82
- fi.store_term_vector = true # once vector, always vector
83
- end
84
- if (fi.store_positions? != store_position)
85
- fi.store_position = true # once vector, always vector
86
- end
87
- if (fi.store_offsets? != store_offset)
88
- fi.store_offset = true # once vector, always vector
89
- end
90
- if (fi.omit_norms? != omit_norms)
91
- fi.omit_norms = false # once norms are stored, always store norms
92
- end
93
- end
94
- return fi
95
- end
96
-
97
- # Returns the number of the field that goes by the field name that is
98
- # passed. If there is no field of this name then NOT_A_FIELD (0xffffffff) is returned
99
- def field_number(name)
100
- fi = @fi_hash[name.to_s]
101
- return fi ? fi.number : NOT_A_FIELD
102
- end
103
-
104
- # Retrieve the field_info object by either field number or field name.
105
- def [](index)
106
- if index.is_a? Integer
107
- if index >= NOT_A_FIELD || index < 0 # < 0 is for C extensions
108
- return FieldInfo.new("", false, NOT_A_FIELD, false)
109
- end
110
- return @fi_array[index]
111
- else
112
- return @fi_hash[index.to_s]
113
- end
114
- end
115
-
116
- def name(index)
117
- if index == NOT_A_FIELD || index < 0 # < 0 is for C extensions
118
- return ""
119
- end
120
- return self[index].name
121
- end
122
-
123
- # Iterate through the field_info objects
124
- def each()
125
- @fi_array.each() {|fi| yield(fi) }
126
- end
127
-
128
- # Iterate through the field_info objects including the index
129
- def each_with_index()
130
- @fi_array.each_with_index() {|fi, i| yield(fi, i) }
131
- end
132
-
133
- # Get the number of field_infos in this object.
134
- #
135
- # NOTE: There is a default empty field always added at the start. This
136
- # may later be used to set the default values for a field.
137
- def size()
138
- return @fi_array.size()
139
- end
140
-
141
- # Return true if any of the fields have store_term_vector? set to true
142
- def has_vectors?()
143
- @fi_array.each() { |fi| return true if fi.store_term_vector? }
144
- return false
145
- end
146
-
147
- # Write the field_infos to a file specified by name in dir.
148
- #
149
- # dir:: the directory to write the fieldinfos to
150
- # name:: the name of the file to write to.
151
- def write_to_dir(dir, name)
152
- output = dir.create_output(name)
153
- begin
154
- write(output)
155
- ensure
156
- output.close()
157
- end
158
- end
159
-
160
- protected
161
-
162
- # Write the field_infos to the output file
163
- #
164
- # output:: the file to write to
165
- def write(output)
166
- output.write_vint(size())
167
- @fi_array.each() do |fi|
168
- output.write_string(fi.name)
169
- output.write_byte(get_field_info_byte(fi))
170
- end
171
- end
172
-
173
- # Read the field_infos object from the input file
174
- #
175
- # input:: the input file to read from
176
- def read(input)
177
- size = input.read_vint()#read in the size
178
- size.times do |i|
179
- name = input.read_string()
180
- bits = input.read_byte()
181
- indexed = (bits & IS_INDEXED) != 0
182
- store_term_vector = (bits & STORE_TERM_VECTOR) != 0
183
- store_position = (bits & STORE_POSITION) != 0
184
- store_offset = (bits & STORE_OFFSET) != 0
185
- omit_norms = (bits & OMIT_NORMS) != 0
186
- add_internal(name, indexed, store_term_vector, store_position,
187
- store_offset, omit_norms)
188
- end
189
- end
190
-
191
- private
192
- IS_INDEXED = 0x1;
193
- STORE_TERM_VECTOR = 0x2;
194
- STORE_POSITION = 0x4;
195
- STORE_OFFSET = 0x8;
196
- OMIT_NORMS = 0x10;
197
-
198
- def add_internal(name, indexed, store_term_vector,
199
- store_position = false,
200
- store_offset = false,
201
- omit_norms = false)
202
- fi = FieldInfo.new(name, indexed,
203
- @fi_array.size(),
204
- store_term_vector,
205
- store_position,
206
- store_offset,
207
- omit_norms)
208
- @fi_array << fi
209
- @fi_hash[name] = fi
210
- return fi
211
- end
212
-
213
- def get_field_info_byte(fi)
214
- bits = 0x0
215
- if (fi.indexed?)
216
- bits |= IS_INDEXED
217
- end
218
- if (fi.store_term_vector?)
219
- bits |= STORE_TERM_VECTOR
220
- end
221
- if (fi.store_positions?)
222
- bits |= STORE_POSITION
223
- end
224
- if (fi.store_offsets?)
225
- bits |= STORE_OFFSET
226
- end
227
- if (fi.omit_norms?)
228
- bits |= OMIT_NORMS
229
- end
230
- return bits
231
- end
232
- end
233
-
234
- class FieldInfo
235
- attr_accessor :name, :number
236
- attr_writer :indexed, :store_term_vector, :store_offset,
237
- :store_position, :omit_norms
238
-
239
- def indexed?()
240
- return @indexed
241
- end
242
-
243
- def store_term_vector?()
244
- return @store_term_vector
245
- end
246
-
247
- def store_offsets?()
248
- return @store_offset
249
- end
250
-
251
- def store_positions?()
252
- return @store_position
253
- end
254
-
255
- def omit_norms?()
256
- return @omit_norms
257
- end
258
-
259
- def set!(indexed, store_term_vector, store_position,
260
- store_offset, omit_norms)
261
- @indexed = indexed
262
- @store_term_vector = store_term_vector
263
- @store_position = store_position
264
- @store_offset = store_offset
265
- @omit_norms = omit_norms
266
- end
267
-
268
- def initialize(name, indexed, number, store_term_vector,
269
- store_position = false,
270
- store_offset = false,
271
- omit_norms = false)
272
- @name = name
273
- @number = number
274
- set!(indexed, store_term_vector, store_position,
275
- store_offset, omit_norms)
276
- end
277
- end
278
- end
279
- end
@@ -1,181 +0,0 @@
1
- require 'zlib'
2
-
3
-
4
- module Ferret::Index
5
-
6
-
7
- # Class responsible for access to stored document fields.
8
- #
9
- # It uses <segment>.fdt and <segment>.fdx files.
10
- class FieldsReader
11
- include Ferret::Document
12
- attr_reader :size
13
- alias :length :size
14
-
15
- def initialize(d, segment, fi)
16
- @field_infos = fi
17
-
18
- @fields_stream = d.open_input(segment + ".fdt")
19
- @index_stream = d.open_input(segment + ".fdx")
20
-
21
- @size = (@index_stream.length() / 8).to_i
22
- end
23
-
24
- def close()
25
- @fields_stream.close()
26
- @index_stream.close()
27
- end
28
-
29
-
30
- def doc(n)
31
- @index_stream.seek(n * 8)
32
- position = @index_stream.read_long()
33
- @fields_stream.seek(position)
34
-
35
- doc = Document.new
36
- @fields_stream.read_vint().times do
37
- field_number = @fields_stream.read_vint()
38
- fi = @field_infos[field_number]
39
-
40
- bits = @fields_stream.read_byte()
41
-
42
- compressed = (bits & FieldsWriter::FIELD_IS_COMPRESSED) != 0
43
- tokenize = (bits & FieldsWriter::FIELD_IS_TOKENIZED) != 0
44
- binary = (bits & FieldsWriter::FIELD_IS_BINARY) != 0
45
-
46
- if binary
47
- b = " " * @fields_stream.read_vint()
48
- @fields_stream.read_bytes(b, 0, b.length)
49
- if compressed
50
- doc << Field.new_binary_field(fi.name,
51
- uncompress(b),
52
- Field::Store::COMPRESS)
53
- else # No compression
54
- doc << Field.new_binary_field(fi.name, b, Field::Store::YES)
55
- end
56
- else
57
- store = Field::Store::YES
58
- if fi.indexed?
59
- if tokenize
60
- index = Field::Index::TOKENIZED
61
- else
62
- if fi.omit_norms?
63
- index = Field::Index::NO_NORMS
64
- else
65
- index = Field::Index::UNTOKENIZED
66
- end
67
- end
68
- else
69
- index = Field::Index::NO
70
- end
71
- data = nil
72
- if (compressed)
73
- store = Field::Store::COMPRESS
74
- b = " " * @fields_stream.read_vint()
75
- @fields_stream.read_bytes(b, 0, b.length)
76
- data = uncompress(b)
77
- else
78
- data = @fields_stream.read_string()
79
- end
80
- stv = Field::TermVector::NO
81
- if fi.store_term_vector?
82
- if fi.store_positions? and fi.store_offsets?
83
- stv = Field::TermVector::WITH_POSITIONS_OFFSETS
84
- elsif fi.store_positions?
85
- stv = Field::TermVector::WITH_POSITIONS
86
- elsif fi.store_offsets?
87
- stv = Field::TermVector::WITH_OFFSETS
88
- else
89
- stv = Field::TermVector::YES
90
- end
91
- end
92
- doc << Field.new(fi.name, data, store, index, stv)
93
- end
94
- end
95
-
96
- return doc
97
- end
98
-
99
- def uncompress(input)
100
- zstream = Zlib::Inflate.new
101
- buf = zstream.inflate(input)
102
- zstream.finish
103
- zstream.close
104
- buf
105
- end
106
- end
107
-
108
-
109
- class FieldsWriter
110
-
111
- FIELD_IS_TOKENIZED = 0X1
112
- FIELD_IS_BINARY = 0X2
113
- FIELD_IS_COMPRESSED = 0X4
114
-
115
- def initialize(dir, segment, fi)
116
- @field_infos = fi
117
- @fields_stream = dir.create_output(segment + ".fdt")
118
- @index_stream = dir.create_output(segment + ".fdx")
119
- end
120
-
121
- def close()
122
- @fields_stream.close()
123
- @index_stream.close()
124
- end
125
-
126
- def add_document(doc)
127
- @index_stream.write_long(@fields_stream.pos)
128
- stored_count = 0
129
- doc.all_fields.each() { |field| stored_count += 1 if field.stored?() }
130
- @fields_stream.write_vint(stored_count)
131
-
132
- doc.all_fields.each() do |field|
133
- if (field.stored?())
134
- @fields_stream.write_vint(@field_infos.field_number(field.name))
135
-
136
- bits = 0
137
- bits |= FIELD_IS_TOKENIZED if field.tokenized?
138
- bits |= FIELD_IS_BINARY if field.binary?
139
- bits |= FIELD_IS_COMPRESSED if field.compressed?
140
- @fields_stream.write_byte(bits)
141
-
142
- data = nil
143
- if field.compressed?
144
- if field.binary?
145
- data = compress(field.binary_value)
146
- else
147
- data = compress(field.string_value)
148
- end
149
- save_data(data)
150
- else
151
- if field.binary?
152
- save_data(field.binary_value)
153
- else
154
- @fields_stream.write_string(field.string_value)
155
- end
156
- end
157
- end
158
- end
159
- end
160
- alias :<< :add_document
161
-
162
- private
163
-
164
- def compress(input)
165
- zstream = Zlib::Deflate.new(Zlib::BEST_COMPRESSION)
166
- buf = zstream.deflate(input, Zlib::FINISH)
167
- zstream.close
168
- return buf
169
- end
170
-
171
- def save_data(data)
172
- len = data.length
173
- if data.is_a? Array
174
- data = data.pack("C*")
175
- end
176
-
177
- @fields_stream.write_vint(len)
178
- @fields_stream.write_bytes(data, len)
179
- end
180
- end
181
- end