ferret 0.9.6 → 0.10.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (295) hide show
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
@@ -1,52 +0,0 @@
1
- module Ferret
2
- module Index
3
- # Abstract class for enumerating terms.
4
- #
5
- # Term enumerations are always ordered by Term.<=>. Each term in
6
- # the enumeration is greater than all that precede it.
7
- class TermEnum
8
- # Increments the enumeration to the next element. True if one exists.
9
- def next?
10
- raise NotImplementedError
11
- end
12
-
13
- # Returns the current Term in the enumeration.
14
- def term
15
- raise NotImplementedError
16
- end
17
-
18
- # Returns the doc_freq of the current Term in the enumeration.
19
- def doc_freq
20
- raise NotImplementedError
21
- end
22
-
23
- # Closes the enumeration to further activity, freeing resources.
24
- def close
25
- raise NotImplementedError
26
- end
27
-
28
- # Term Vector support
29
- # Skips terms to the first beyond the current whose value is
30
- # greater or equal to _target_.
31
- #
32
- # Returns true iff there is such a term.
33
- #
34
- # Behaves as if written:
35
- #
36
- # def skip_to(target)
37
- # while (target > term)
38
- # if (!next()) return false
39
- # end
40
- # return true
41
- # end
42
- #
43
- # Some implementations are considerably more efficient than that.
44
- def skip_to(target)
45
- while (target > term)
46
- return false if not next?
47
- end
48
- return true
49
- end
50
- end
51
- end
52
- end
@@ -1,37 +0,0 @@
1
- module Ferret::Index
2
- # A TermInfo is the record of information stored for a term.
3
- class TermInfo
4
- attr_accessor :doc_freq, :freq_pointer, :prox_pointer, :skip_offset
5
-
6
- def initialize(df=0, fp=0, pp=0, so=0)
7
- set_values!(df, fp, pp, so)
8
- end
9
-
10
- def set!(ti)
11
- @doc_freq = ti.doc_freq
12
- @freq_pointer = ti.freq_pointer
13
- @prox_pointer = ti.prox_pointer
14
- @skip_offset = ti.skip_offset
15
- end
16
-
17
- def set_values!(df=0, fp=0, pp=0, so=0)
18
- @doc_freq = df
19
- @freq_pointer = fp
20
- @prox_pointer = pp
21
- @skip_offset = so
22
- end
23
-
24
- def ==(o)
25
- return false if !o.instance_of?(TermInfo)
26
- @doc_freq == o.doc_freq &&
27
- @freq_pointer == o.freq_pointer &&
28
- @prox_pointer == o.prox_pointer &&
29
- @skip_offset == o.skip_offset
30
- end
31
- alias eql? ==
32
-
33
- def to_s()
34
- "TermInfo:df=#{doc_freq}:fp=#{freq_pointer}:pp=#{prox_pointer}:so=#{skip_offset}"
35
- end
36
- end
37
- end
@@ -1,321 +0,0 @@
1
- require 'monitor'
2
- module Ferret::Index
3
-
4
- # This stores a monotonically increasing set of <Term, TermInfo> pairs in a
5
- # Directory. A TermInfos can be written once, in order.
6
- class TermInfosWriter
7
- attr_reader :index_interval, :skip_interval, :out
8
- attr_writer :other
9
- # The file format version, a negative number.
10
- FORMAT = -2
11
-
12
-
13
- # TODO: the default values for these two parameters should be settable
14
- # from IndexWriter. However, once that's done, folks will start setting
15
- # them to ridiculous values and complaining that things don't work well,
16
- # as with mergeFactor. So, let's wait until a number of folks find that
17
- # alternate values work better. Note that both of these values are
18
- # stored in the segment, so that it's safe to change these w/o
19
- # rebuilding all indexes.
20
-
21
- # Expert: The fraction of terms in the "dictionary" which should be
22
- # stored in RAM. Smaller values use more memory, but make searching
23
- # slightly faster, while larger values use less memory and make
24
- # searching slightly slower. Searching is typically not dominated by
25
- # dictionary lookup, so tweaking this is rarely useful.
26
- #
27
- # Expert: The fraction of TermDocEnum entries stored in skip
28
- # tables, used to accellerate TermDocEnum#skipTo(int). Larger
29
- # values result in smaller indexes, greater acceleration, but fewer
30
- # accelerable cases, while smaller values result in bigger indexes, less
31
- # acceleration and more accelerable cases. More detailed experiments
32
- # would be useful here.
33
- def initialize(dir, segment, fis, interval, is_index = false)
34
- @index_interval = interval
35
- @skip_interval = 16
36
- @last_index_pointer = 0
37
- @last_term = Term.new("", "")
38
- @last_term_info = TermInfo.new()
39
- @size = 0
40
- @is_index = is_index
41
- @field_infos = fis
42
- @out = dir.create_output(segment + (@is_index ? ".tii" : ".tis"))
43
- @out.write_int(FORMAT) # write format
44
- @out.write_long(0) # leave space for size
45
- @out.write_int(@index_interval) # write @index_interval
46
- @out.write_int(@skip_interval) # write @skip_interval
47
- unless is_index
48
- @other = TermInfosWriter.new(dir, segment, fis, interval, true)
49
- @other.other = self
50
- end
51
- end
52
-
53
- # Adds a new <Term, TermInfo> pair to the set.
54
- # Term must be lexicographically greater than all previous Terms added.
55
- # TermInfo pointers must be positive and greater than all previous.
56
- def add(term, term_info)
57
- if (not @is_index and @last_term > term)
58
- raise IOError, "term out of order #{term.text} < #{@last_term.text}"
59
- end
60
- if (term_info.freq_pointer < @last_term_info.freq_pointer)
61
- raise IOError, "freq pointer out of order"
62
- end
63
- if (term_info.prox_pointer < @last_term_info.prox_pointer)
64
- raise IOError, "prox pointer out of order"
65
- end
66
-
67
- if (not @is_index and @size % @index_interval == 0)
68
- @other.add(@last_term, @last_term_info) # add an index term
69
- end
70
-
71
- write_term(term) # write term
72
- @out.write_vint(term_info.doc_freq) # write doc freq
73
- @out.write_vlong(term_info.freq_pointer - @last_term_info.freq_pointer)
74
- @out.write_vlong(term_info.prox_pointer - @last_term_info.prox_pointer)
75
- @out.write_vint(term_info.skip_offset) if (term_info.doc_freq >= @skip_interval)
76
-
77
- if (@is_index)
78
- @out.write_vlong(@other.out.pos() - @last_index_pointer)
79
- @last_index_pointer = @other.out.pos() # write pointer
80
- end
81
-
82
- @last_term_info.set!(term_info)
83
- @size += 1
84
- end
85
-
86
- # Called to complete TermInfos creation.
87
- def close()
88
- @out.seek(4) # write @size after format
89
- @out.write_long(@size)
90
- @out.close()
91
-
92
- @other.close() unless @is_index
93
- end
94
-
95
- private
96
- def write_term(term)
97
- start = Ferret::Utils::StringHelper.string_difference(@last_term.text, term.text)
98
- length = term.text.length() - start
99
-
100
- @out.write_vint(start) # write shared prefix length
101
- @out.write_vint(length) # write delta length
102
- @out.write_chars(term.text, start, length) # write delta chars
103
- @out.write_vint(@field_infos.field_number(term.field)) # write field num
104
- @last_term = term
105
- end
106
- end
107
-
108
-
109
- # This stores a monotonically increasing set of <Term, TermInfo> pairs in a
110
- # Directory. Pairs are accessed either by Term or by ordinal position the
111
- # set.
112
- class TermInfosReader
113
- include MonitorMixin
114
-
115
- def initialize(dir, seg, fis)
116
- super()
117
-
118
- @directory = dir
119
- @segment = seg
120
- @field_infos = fis
121
-
122
- @orig_enum = SegmentTermEnum.new(@directory.open_input(@segment + ".tis"),
123
- @field_infos, false)
124
- @size = @orig_enum.size
125
- @skip_interval = @orig_enum.skip_interval
126
- @index_enum = SegmentTermEnum.new(@directory.open_input(@segment + ".tii"),
127
- @field_infos, true)
128
- @index_terms = nil
129
- @index_infos = nil
130
- @index_pointers = nil
131
- end
132
-
133
- def close()
134
- # clear this threads cache
135
- @orig_enum.close() if (@orig_enum != nil)
136
- @index_enum.close() if (@index_enum != nil)
137
- end
138
-
139
- # Returns the number of term/value pairs in the set.
140
- attr_reader :size
141
- # The skip interval for the original enumerator
142
- attr_reader :skip_interval
143
-
144
-
145
- # Returns the TermInfo for a Term in the set, or nil.
146
- def get_term_info(term)
147
- return nil if (@size == 0)
148
-
149
- ensure_index_is_read()
150
-
151
- # optimize sequential access: first try scanning cached enum w/o seeking
152
- e = enum()
153
- if e.term and term >= e.term
154
- enum_offset = (e.position / e.index_interval).to_i + 1
155
- if (@index_terms.length == enum_offset or
156
- term < @index_terms[enum_offset]) # but before end of block
157
- return scan_for_term_info(term) # no need to seek
158
- end
159
- end
160
-
161
- # random-access: must seek
162
- seek_enum(get_index_offset(term))
163
- return scan_for_term_info(term)
164
- end
165
- alias :[] :get_term_info
166
-
167
- # Returns the nth term in the set.
168
- def get_term(position)
169
- return nil if (@size == 0)
170
-
171
- e = enum()
172
- if (e != nil and
173
- e.term != nil and
174
- position >= e.position and
175
- position < (e.position + e.index_interval))
176
- return scan_for_term(position) # can avoid seek
177
- end
178
-
179
- seek_enum((position / e.index_interval).to_i) # must seek
180
- return scan_for_term(position)
181
- end
182
-
183
- def get_terms_position(term)
184
- return nil if (@size == 0)
185
- ensure_index_is_read
186
- seek_enum(get_index_offset(term))
187
-
188
- e = enum()
189
-
190
- while term > e.term and e.next?
191
- end
192
-
193
- return term == e.term ? e.position : -1
194
- end
195
-
196
- # Returns an enumeration of all the Terms and TermInfos in the set.
197
- def terms()
198
- return @orig_enum.clone()
199
- end
200
-
201
- # Returns an enumeration of terms starting at or after the named term.
202
- def terms_from(term)
203
- get_term_info(term)
204
- return enum().clone()
205
- end
206
-
207
- private
208
-
209
- def enum()
210
- #te_cache = Thread.current["term_enum"]
211
- #if (te_cache == nil)
212
- # te_cache = Thread.current["term_enum"] = Ferret::Utils::WeakKeyHash.new
213
- #end
214
- #te_cache.synchronize do
215
- # term_enum = te_cache[self]
216
- # if term_enum == nil
217
- # term_enum = terms()
218
- # te_cache[self] = term_enum
219
- # end
220
- # return term_enum
221
- #end
222
- term_enum = Thread.current.get_local(self)
223
- if term_enum.nil?
224
- Thread.current.set_local(self, term_enum = terms())
225
- end
226
- return term_enum
227
- end
228
-
229
- def ensure_index_is_read()
230
- synchronize() do
231
- return if @index_terms
232
- begin
233
- index_size = @index_enum.size
234
-
235
- @index_terms = Array.new(index_size)
236
- @index_infos = Array.new(index_size)
237
- @index_pointers = Array.new(index_size)
238
-
239
- i = 0
240
- while @index_enum.next?
241
- @index_terms[i] = @index_enum.term
242
- @index_infos[i] = @index_enum.term_info
243
- @index_pointers[i] = @index_enum.index_pointer
244
- i += 1
245
- end
246
- ensure
247
- @index_enum.close()
248
- @index_enum = nil
249
- end
250
- end
251
- end
252
-
253
- # Returns the offset of the greatest index entry which is less than or
254
- # equal to term.
255
- #
256
- # This method is rewritten in the C extension.
257
- def get_index_offset(term)
258
- lo = 0 # binary search @index_terms[]
259
- hi = @index_terms.length - 1
260
-
261
- while (hi >= lo)
262
- mid = (lo + hi) >> 1
263
- delta = term <=> @index_terms[mid]
264
- if (delta < 0)
265
- hi = mid - 1
266
- elsif (delta > 0)
267
- lo = mid + 1
268
- else
269
- return mid
270
- end
271
- end
272
- return hi
273
- end
274
-
275
- def seek_enum(ind_offset)
276
- enum().seek(@index_pointers[ind_offset],
277
- (ind_offset * enum().index_interval) - 1,
278
- @index_terms[ind_offset],
279
- @index_infos[ind_offset])
280
- end
281
-
282
- # Scans within block for matching term.
283
- def scan_for_term_info(term)
284
- e = enum()
285
- e.scan_to(term)
286
- if e.term != nil and term == e.term
287
- return e.term_info()
288
- else
289
- return nil
290
- end
291
- end
292
-
293
- def scan_for_term(position)
294
- e = enum()
295
- while (e.position < position)
296
- return nil if not e.next?
297
- end
298
-
299
- return e.term
300
- end
301
-
302
- # Returns the position of a Term in the set or -1.
303
- def get_position(term)
304
- return -1 if (@size == 0)
305
-
306
- ind_offset = get_index_offset(term)
307
- seek_enum(ind_offset)
308
-
309
- e = enum()
310
- while (term > e.term and e.next?)
311
- end
312
-
313
- if (term == e.term())
314
- return e.position
315
- else
316
- return -1
317
- end
318
- end
319
-
320
- end
321
- end
@@ -1,20 +0,0 @@
1
- module Ferret::Index
2
- class TermVectorOffsetInfo
3
- attr_accessor :start, :end
4
-
5
- def initialize(start, endd)
6
- @end = endd
7
- @start = start
8
- end
9
-
10
- def eql?(o)
11
- return false if !o.instance_of?(TermVectorOffsetInfo)
12
- @end == o.end and @start == o.start
13
- end
14
- alias :== :eql?
15
-
16
- def hash()
17
- 29 * @start + @end
18
- end
19
- end
20
- end
@@ -1,553 +0,0 @@
1
- module Ferret::Index
2
- # Writer works by opening a document and then opening the fields within
3
- # the document and then writing out the vectors for each field.
4
- #
5
- # Rough usage:
6
- #
7
- # for each document
8
- #
9
- # writer.open_document()
10
- # for each field on the document
11
- #
12
- # writer.open_field(field)
13
- # for all of the @terms
14
- #
15
- # writer.add_term(...)
16
- # end
17
- # writer.close_field
18
- # end
19
- # writer.close_document()
20
- # end
21
- #
22
- #
23
- class TermVectorsWriter
24
- STORE_POSITIONS_WITH_TERMVECTOR = 0x1
25
- STORE_OFFSET_WITH_TERMVECTOR = 0x2
26
-
27
- FORMAT_VERSION = 2
28
-
29
- # The size in bytes that the FORMAT_VERSION will take up at the beginning
30
- # of each file
31
- FORMAT_SIZE = 4
32
-
33
- TVX_EXTENSION = ".tvx"
34
- TVD_EXTENSION = ".tvd"
35
- TVF_EXTENSION = ".tvf"
36
-
37
- def initialize(directory, segment, field_infos)
38
- @current_field = nil
39
- @current_doc_pointer = -1
40
-
41
- # Open files for TermVector storage
42
- @tvx = directory.create_output(segment + TVX_EXTENSION)
43
- @tvx.write_int(FORMAT_VERSION)
44
- @tvd = directory.create_output(segment + TVD_EXTENSION)
45
- @tvd.write_int(FORMAT_VERSION)
46
- @tvf = directory.create_output(segment + TVF_EXTENSION)
47
- @tvf.write_int(FORMAT_VERSION)
48
-
49
- @field_infos = field_infos
50
- @fields = []
51
- @terms = []
52
- end
53
-
54
-
55
- def open_document()
56
- close_document()
57
- @current_doc_pointer = @tvd.pos()
58
- end
59
-
60
-
61
- def close_document()
62
-
63
- if (document_open?())
64
- close_field()
65
- write_doc()
66
- @fields.clear()
67
- @current_doc_pointer = -1
68
- end
69
- end
70
-
71
-
72
- def document_open?()
73
- return @current_doc_pointer != -1
74
- end
75
-
76
-
77
- # Start processing a field. This can be followed by a number of calls to
78
- # add_term, and a final call to close_field to indicate the end of
79
- # processing of this field. If a field was previously open, it is closed
80
- # automatically.
81
- def open_field(field)
82
- field_info = @field_infos[field]
83
- create_field(field_info.number,
84
- field_info.store_positions?,
85
- field_info.store_offsets?)
86
- end
87
-
88
- # Finished processing current field. This should be followed by a call
89
- # to open_field before future calls to add_term.
90
- def close_field()
91
- if field_open?
92
- #puts("close_field()")
93
-
94
- # save field and @terms
95
- write_field()
96
- @fields << @current_field
97
- @terms.clear()
98
- @current_field = nil
99
- end
100
- end
101
-
102
- # Return true if a field is currently open.
103
- def field_open?()
104
- return @current_field != nil
105
- end
106
-
107
- # Add term to the field's term vector. Field must already be open.
108
- #
109
- # Terms should be added in increasing order of @terms, one call per
110
- # unique termNum. ProxPointer is a pointer into the TermPosition file
111
- # (prx). Freq is the number of times this term appears in this field, in
112
- # this document. raises:: IllegalStateException if document or field is
113
- # not open
114
- def add_term(term_text, freq, positions = nil, offsets = nil)
115
- if not document_open?
116
- raise IllegalStateError, "Cannot add terms when document is not open"
117
- end
118
- if not field_open?
119
- raise IllegalStateError, "Cannot add terms when field is not open"
120
- end
121
-
122
- add_term_internal(term_text, freq, positions, offsets)
123
- end
124
-
125
- def add_term_internal(term_text, freq, positions, offsets)
126
- @terms << TVTerm.new(term_text, freq, positions, offsets)
127
- end
128
-
129
- # Add a complete document specified by all its term vectors. If document has no
130
- # term vectors, add value for @tvx.
131
- #
132
- # vectors:: The documents to have their term vectors added
133
- # raises:: IOException
134
- def add_all_doc_vectors(vectors)
135
-
136
- open_document()
137
-
138
- if vectors != nil
139
- vectors.each do |vector|
140
- store_positions = (vector.size > 0 and vector.positions != nil)
141
- store_offsets = (vector.size > 0 and vector.offsets != nil)
142
-
143
- create_field(@field_infos.field_number(vector.field),
144
- store_positions, store_offsets)
145
-
146
- vector.size.times do |j|
147
- add_term_internal(vector.terms[j],
148
- vector.freqs[j],
149
- store_positions ? vector.positions[j] : nil,
150
- store_offsets ? vector.offsets[j] : nil)
151
- end
152
- close_field()
153
- end
154
- end
155
- close_document()
156
- end
157
-
158
- # Close all streams.
159
- def close()
160
- begin
161
- close_document()
162
- ensure
163
- # make an effort to close all streams we can but remember and re-raise
164
- # the last exception encountered in this process
165
- keep = nil
166
- [@tvx, @tvd, @tvf].compact.each do |os|
167
- begin
168
- os.close()
169
- rescue IOError => e
170
- keep = e
171
- end
172
- end
173
- raise keep if (keep != nil)
174
- end
175
- end
176
-
177
# Bookkeeping record for one field of the current document: the field's
# number, its pointer into the tvf stream (filled in when the field is
# written), and whether positions/offsets are stored for it.
class TVField
  attr_accessor :number, :tvf_pointer, :store_positions, :store_offsets

  def initialize(number, store_pos, store_off)
    @number = number
    @store_positions = store_pos
    @store_offsets = store_off
    # pointer into the tvf file; set later by write_field
    @tvf_pointer = 0
  end
end
186
-
187
# Value object for a single buffered term: its text, frequency and the
# optional position and offset data recorded for it.
class TVTerm
  attr_accessor :term_text, :freq, :positions, :offsets

  def initialize(term_text = nil, freq = nil, positions = nil, offsets = nil)
    @term_text = term_text
    @freq = freq
    @positions = positions
    @offsets = offsets
  end
end
197
-
198
- private
199
-
200
# Flush the buffered terms of the current field to the tvf stream.
#
# Record layout written here: term count (vint), a flag byte saying
# whether positions/offsets follow, then per term: shared-prefix length,
# delta length, delta chars, frequency, and — if enabled — the
# delta-encoded positions and offsets.
def write_field()
  # remember where this field is written
  @current_field.tvf_pointer = @tvf.pos

  size = @terms.size
  @tvf.write_vint(size)

  store_positions = @current_field.store_positions
  store_offsets = @current_field.store_offsets
  # pack the "what is stored" flags into a single byte
  bits = 0x0
  if (store_positions)
    bits |= STORE_POSITIONS_WITH_TERMVECTOR
  end
  if (store_offsets)
    bits |= STORE_OFFSET_WITH_TERMVECTOR
  end
  @tvf.write_byte(bits)

  # terms use shared-prefix compression relative to the previous term
  last_term_text = ""
  @terms.each do |term|
    start = Ferret::Utils::StringHelper.string_difference(last_term_text,
                                                          term.term_text)
    length = term.term_text.length() - start
    @tvf.write_vint(start) # write shared prefix length
    @tvf.write_vint(length) # write delta length
    @tvf.write_chars(term.term_text, start, length) # write delta chars
    @tvf.write_vint(term.freq)
    last_term_text = term.term_text

    if (store_positions)
      if (term.positions == nil)
        raise IllegalStateError, "Trying to write positions that are nil!"
      end

      # use delta encoding for positions
      position = 0
      term.freq.times do |j|
        @tvf.write_vint(term.positions[j] - position)
        position = term.positions[j]
      end
    end

    if (store_offsets)
      if(term.offsets == nil)
        raise IllegalStateError, "Trying to write offsets that are nil!"
      end

      # use delta encoding for offsets
      position = 0
      term.freq.times do |j|
        # start is written relative to the previous offset's end
        @tvf.write_vint(term.offsets[j].start - position)
        #Save the diff between the two.
        @tvf.write_vint(term.offsets[j].end -
                        term.offsets[j].start)
        position = term.offsets[j].end()
      end
    end
  end
end
259
-
260
# Flush the current document: write its index entry to @tvx and its data
# record (field count, field numbers, delta-encoded tvf pointers) to @tvd.
def write_doc
  raise IllegalStateError, "Field is still open while writing document" if field_open?

  # write document index record
  @tvx.write_long(@current_doc_pointer)

  # write document data record: first the number of fields ...
  @tvd.write_vint(@fields.size)

  # ... then every field number ...
  @fields.each { |f| @tvd.write_vint(f.number) }

  # ... then the tvf pointers, delta-encoded against the previous one
  previous_pointer = 0
  @fields.each do |f|
    @tvd.write_vlong(f.tvf_pointer - previous_pointer)
    previous_pointer = f.tvf_pointer
  end
end
285
-
286
# Start a new field within the current document, closing any field that
# is still open. Raises IllegalStateError when no document is open.
def create_field(field_number, store_position, store_offset)
  unless document_open?
    raise IllegalStateError, "Cannot open field when no document is open."
  end
  close_field
  @current_field = TVField.new(field_number, store_position, store_offset)
end
293
- end
294
-
295
# Reads the term-vector files of one segment: the .tvx document index,
# the .tvd document records and the .tvf field data, as written by
# TermVectorsWriter.
class TermVectorsReader
  # number of documents with a slot in the tvx index (8 bytes each)
  attr_reader :size

  # accessors used by clone() to install duplicated streams on the copy
  attr_accessor :tvx, :tvd, :tvf
  protected :tvx, :tvx=, :tvd, :tvd=, :tvf, :tvf=

  # Open the term-vector streams for +segment+ in store +d+.  If the
  # segment has no term vectors the streams are left nil and every
  # lookup returns nil.
  #
  # d::           the directory/store to read from
  # segment::     segment name the files are prefixed with
  # field_infos:: mapping between field names and field numbers
  # raises::      IOError when a file has an incompatible format version
  def initialize(d, segment, field_infos)
    if d.exists?(segment + TermVectorsWriter::TVX_EXTENSION)
      @tvx = d.open_input(segment + TermVectorsWriter::TVX_EXTENSION)
      check_valid_format(@tvx)
      @tvd = d.open_input(segment + TermVectorsWriter::TVD_EXTENSION)
      @tvd_format = check_valid_format(@tvd)
      @tvf = d.open_input(segment + TermVectorsWriter::TVF_EXTENSION)
      @tvf_format = check_valid_format(@tvf)
      # one long (8 bytes) per document in the tvx index
      @size = @tvx.length / 8
    else
      @tvx = nil
      @tvd = nil
      @tvf = nil
    end

    @field_infos = field_infos
  end

  # Close all streams.  Makes an effort to close every stream even if
  # one fails, then re-raises the last exception encountered.
  def close()
    keep = nil
    [@tvx, @tvd, @tvf].compact.each do |os|
      begin
        os.close()
      rescue IOError => e
        keep = e
      end
    end
    raise keep if (keep != nil)
  end

  # Retrieve the term vector for the given document and field
  # doc_num:: The document number to retrieve the vector for
  # field:: The field within the document to retrieve
  # returns:: The TermFreqVector for the document and field or nil if there
  #           is no termVector for this field.
  # raises:: IOException if there is an error reading the term vector files
  def get_field_tv(doc_num, field)
    # Check if no term vectors are available for this segment at all
    field = field.to_s
    field_number = @field_infos.field_number(field)
    result = nil
    if (@tvx != nil)
      # We need to account for the FORMAT_SIZE when seeking in @tvx; the
      # pointers read from the other files already include that offset.
      @tvx.seek((doc_num * 8) + TermVectorsWriter::FORMAT_SIZE)
      position = @tvx.read_long()

      @tvd.seek(position)
      field_count = @tvd.read_vint()
      # There are only a few fields per document.  We opt for a full scan
      # rather than requiring that they be ordered; we need to read all
      # of the field numbers anyway to reach the tvf pointers that follow.
      number = 0
      found = -1
      field_count.times do |i|
        if @tvd_format == TermVectorsWriter::FORMAT_VERSION
          number = @tvd.read_vint()
        else
          # older format delta-encodes the field numbers
          number += @tvd.read_vint()
        end
        if (number == field_number)
          found = i
        end
      end

      # This field, although valid in the segment, may not occur in this
      # particular document
      if (found != -1)
        # tvf pointers are delta-encoded vlongs; sum up to our field
        position = 0
        (found + 1).times do
          position += @tvd.read_vlong()
        end

        result = read_term_vector(field, position)
      end
    end
    return result
  end

  # Return all term vectors stored for this document or nil if it could
  # not be read in.
  #
  # doc_num:: The document number to retrieve the vector for
  # returns:: All term frequency vectors
  # raises:: IOException if there is an error reading the term vector files
  def get_tv(doc_num)
    result = nil
    # Check if no term vectors are available for this segment at all
    if (@tvx != nil)
      # offset past the leading format int to this document's slot
      @tvx.seek((doc_num * 8) + TermVectorsWriter::FORMAT_SIZE)
      position = @tvx.read_long()

      @tvd.seek(position)
      field_count = @tvd.read_vint()

      # field_count == 0 means no fields are vectorized for this document
      if (field_count != 0)
        number = 0
        fields = Array.new(field_count)

        field_count.times do |i|
          if @tvd_format == TermVectorsWriter::FORMAT_VERSION
            number = @tvd.read_vint()
          else
            number += @tvd.read_vint()
          end

          fields[i] = @field_infos[number].name
        end

        # Compute each field's absolute position in the @tvf file from
        # the delta-encoded pointers
        position = 0
        tvf_pointers = Array.new(field_count)
        field_count.times do |i|
          position += @tvd.read_vlong()
          tvf_pointers[i] = position
        end

        result = read_term_vectors(fields, tvf_pointers)
      end
    end
    return result
  end

  # Return a copy of this reader with independently cloned file streams,
  # or nil if this segment has no term vectors.
  def clone()
    if (@tvx == nil or @tvd == nil or @tvf == nil)
      return nil
    end

    # BUGFIX: the previous implementation did `clone = self`, which
    # reassigned this reader's own streams instead of producing a copy.
    copy = self.dup
    copy.tvx = @tvx.clone()
    copy.tvd = @tvd.clone()
    copy.tvf = @tvf.clone()

    return copy
  end

  private

  # Read one term vector per (field, pointer) pair.
  def read_term_vectors(fields, tvf_pointers)
    res = Array.new(fields.length)
    fields.length.times do |i|
      res[i] = read_term_vector(fields[i], tvf_pointers[i])
    end
    return res
  end

  # field:: The field to read in
  # tvf_pointer:: The pointer within the @tvf file where we should start
  #               reading
  # returns:: The TermVector located at that position
  # raises:: IOException
  def read_term_vector(field, tvf_pointer)
    # Seek directly to the data; the pointer already includes the
    # FORMAT_SIZE offset, unlike tvx slots.
    @tvf.seek(tvf_pointer)

    num_terms = @tvf.read_vint()
    # If no terms - return a constant empty termvector. However, this
    # should never occur!
    if (num_terms == 0)
      return SegmentTermVector.new(field, nil, nil)
    end

    if (@tvf_format == TermVectorsWriter::FORMAT_VERSION)
      bits = @tvf.read_byte()
      store_positions = (bits & TermVectorsWriter::STORE_POSITIONS_WITH_TERMVECTOR) != 0
      store_offsets = (bits & TermVectorsWriter::STORE_OFFSET_WITH_TERMVECTOR) != 0
    else
      # pre-FORMAT_VERSION files have no flag byte and never store
      # positions or offsets; skip the unused vint
      @tvf.read_vint()
      store_positions = false
      store_offsets = false
    end

    terms = Array.new(num_terms)
    term_freqs = Array.new(num_terms)

    # only allocated when the field actually stores them
    positions = store_positions ? Array.new(num_terms) : nil
    offsets = store_offsets ? Array.new(num_terms) : nil

    # Terms are stored with shared-prefix compression: each entry gives
    # the prefix length shared with the previous term plus the remaining
    # (delta) characters.  `buffer` accumulates the full term text.
    buffer = ""
    num_terms.times do |i|
      start = @tvf.read_vint()
      delta_length = @tvf.read_vint()
      total_length = start + delta_length
      @tvf.read_chars(buffer, start, delta_length)
      terms[i] = buffer[0, total_length].to_s
      freq = @tvf.read_vint()
      term_freqs[i] = freq

      if (store_positions) # read in the delta-encoded positions
        pos = Array.new(freq)
        positions[i] = pos
        prev_position = 0
        freq.times do |j|
          pos[j] = prev_position + @tvf.read_vint()
          prev_position = pos[j]
        end
      end

      if (store_offsets) # read in the delta-encoded offsets
        offs = Array.new(freq)
        offsets[i] = offs
        prev_offset = 0
        freq.times do |j|
          start_offset = prev_offset + @tvf.read_vint()
          end_offset = start_offset + @tvf.read_vint()
          offs[j] = TermVectorOffsetInfo.new(start_offset, end_offset)
          prev_offset = end_offset
        end
      end
    end

    SegmentTermVector.new(field, terms, term_freqs, positions, offsets)
  end

  # Read and validate the leading format int of +istream+, returning it.
  # raises:: IOError when the file was written by a newer format version
  def check_valid_format(istream)
    format = istream.read_int()
    if (format > TermVectorsWriter::FORMAT_VERSION)
      raise IOError, "Incompatible format version: #{format} expected #{TermVectorsWriter::FORMAT_VERSION} or less"
    end
    return format
  end
end
553
- end