ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295)
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
@@ -1,52 +0,0 @@
1
- module Ferret
2
- module Index
3
- # Abstract class for enumerating terms.
4
- #
5
- # Term enumerations are always ordered by Term.<=>. Each term in
6
- # the enumeration is greater than all that precede it.
7
- class TermEnum
8
- # Increments the enumeration to the next element. True if one exists.
9
- def next?
10
- raise NotImplementedError
11
- end
12
-
13
- # Returns the current Term in the enumeration.
14
- def term
15
- raise NotImplementedError
16
- end
17
-
18
- # Returns the doc_freq of the current Term in the enumeration.
19
- def doc_freq
20
- raise NotImplementedError
21
- end
22
-
23
- # Closes the enumeration to further activity, freeing resources.
24
- def close
25
- raise NotImplementedError
26
- end
27
-
28
- # Term Vector support
29
- # Skips terms to the first beyond the current whose value is
30
- # greater or equal to _target_.
31
- #
32
- # Returns true iff there is such a term.
33
- #
34
- # Behaves as if written:
35
- #
36
- # def skip_to(target)
37
- # while (target > term)
38
- # if (!next()) return false
39
- # end
40
- # return true
41
- # end
42
- #
43
- # Some implementations are considerably more efficient than that.
44
- def skip_to(target)
45
- while (target > term)
46
- return false if not next?
47
- end
48
- return true
49
- end
50
- end
51
- end
52
- end
@@ -1,37 +0,0 @@
1
- module Ferret::Index
2
- # A TermInfo is the record of information stored for a term.
3
- class TermInfo
4
- attr_accessor :doc_freq, :freq_pointer, :prox_pointer, :skip_offset
5
-
6
- def initialize(df=0, fp=0, pp=0, so=0)
7
- set_values!(df, fp, pp, so)
8
- end
9
-
10
- def set!(ti)
11
- @doc_freq = ti.doc_freq
12
- @freq_pointer = ti.freq_pointer
13
- @prox_pointer = ti.prox_pointer
14
- @skip_offset = ti.skip_offset
15
- end
16
-
17
- def set_values!(df=0, fp=0, pp=0, so=0)
18
- @doc_freq = df
19
- @freq_pointer = fp
20
- @prox_pointer = pp
21
- @skip_offset = so
22
- end
23
-
24
- def ==(o)
25
- return false if !o.instance_of?(TermInfo)
26
- @doc_freq == o.doc_freq &&
27
- @freq_pointer == o.freq_pointer &&
28
- @prox_pointer == o.prox_pointer &&
29
- @skip_offset == o.skip_offset
30
- end
31
- alias eql? ==
32
-
33
- def to_s()
34
- "TermInfo:df=#{doc_freq}:fp=#{freq_pointer}:pp=#{prox_pointer}:so=#{skip_offset}"
35
- end
36
- end
37
- end
@@ -1,321 +0,0 @@
1
- require 'monitor'
2
- module Ferret::Index
3
-
4
- # This stores a monotonically increasing set of <Term, TermInfo> pairs in a
5
- # Directory. A TermInfos can be written once, in order.
6
- class TermInfosWriter
7
- attr_reader :index_interval, :skip_interval, :out
8
- attr_writer :other
9
- # The file format version, a negative number.
10
- FORMAT = -2
11
-
12
-
13
- # TODO: the default values for these two parameters should be settable
14
- # from IndexWriter. However, once that's done, folks will start setting
15
- # them to ridiculous values and complaining that things don't work well,
16
- # as with mergeFactor. So, let's wait until a number of folks find that
17
- # alternate values work better. Note that both of these values are
18
- # stored in the segment, so that it's safe to change these w/o
19
- # rebuilding all indexes.
20
-
21
- # Expert: The fraction of terms in the "dictionary" which should be
22
- # stored in RAM. Smaller values use more memory, but make searching
23
- # slightly faster, while larger values use less memory and make
24
- # searching slightly slower. Searching is typically not dominated by
25
- # dictionary lookup, so tweaking this is rarely useful.
26
- #
27
- # Expert: The fraction of TermDocEnum entries stored in skip
28
- # tables, used to accelerate TermDocEnum#skipTo(int). Larger
29
- # values result in smaller indexes, greater acceleration, but fewer
30
- # accelerable cases, while smaller values result in bigger indexes, less
31
- # acceleration and more accelerable cases. More detailed experiments
32
- # would be useful here.
33
- def initialize(dir, segment, fis, interval, is_index = false)
34
- @index_interval = interval
35
- @skip_interval = 16
36
- @last_index_pointer = 0
37
- @last_term = Term.new("", "")
38
- @last_term_info = TermInfo.new()
39
- @size = 0
40
- @is_index = is_index
41
- @field_infos = fis
42
- @out = dir.create_output(segment + (@is_index ? ".tii" : ".tis"))
43
- @out.write_int(FORMAT) # write format
44
- @out.write_long(0) # leave space for size
45
- @out.write_int(@index_interval) # write @index_interval
46
- @out.write_int(@skip_interval) # write @skip_interval
47
- unless is_index
48
- @other = TermInfosWriter.new(dir, segment, fis, interval, true)
49
- @other.other = self
50
- end
51
- end
52
-
53
- # Adds a new <Term, TermInfo> pair to the set.
54
- # Term must be lexicographically greater than all previous Terms added.
55
- # TermInfo pointers must be positive and greater than all previous.
56
- def add(term, term_info)
57
- if (not @is_index and @last_term > term)
58
- raise IOError, "term out of order #{term.text} < #{@last_term.text}"
59
- end
60
- if (term_info.freq_pointer < @last_term_info.freq_pointer)
61
- raise IOError, "freq pointer out of order"
62
- end
63
- if (term_info.prox_pointer < @last_term_info.prox_pointer)
64
- raise IOError, "prox pointer out of order"
65
- end
66
-
67
- if (not @is_index and @size % @index_interval == 0)
68
- @other.add(@last_term, @last_term_info) # add an index term
69
- end
70
-
71
- write_term(term) # write term
72
- @out.write_vint(term_info.doc_freq) # write doc freq
73
- @out.write_vlong(term_info.freq_pointer - @last_term_info.freq_pointer)
74
- @out.write_vlong(term_info.prox_pointer - @last_term_info.prox_pointer)
75
- @out.write_vint(term_info.skip_offset) if (term_info.doc_freq >= @skip_interval)
76
-
77
- if (@is_index)
78
- @out.write_vlong(@other.out.pos() - @last_index_pointer)
79
- @last_index_pointer = @other.out.pos() # write pointer
80
- end
81
-
82
- @last_term_info.set!(term_info)
83
- @size += 1
84
- end
85
-
86
- # Called to complete TermInfos creation.
87
- def close()
88
- @out.seek(4) # write @size after format
89
- @out.write_long(@size)
90
- @out.close()
91
-
92
- @other.close() unless @is_index
93
- end
94
-
95
- private
96
- def write_term(term)
97
- start = Ferret::Utils::StringHelper.string_difference(@last_term.text, term.text)
98
- length = term.text.length() - start
99
-
100
- @out.write_vint(start) # write shared prefix length
101
- @out.write_vint(length) # write delta length
102
- @out.write_chars(term.text, start, length) # write delta chars
103
- @out.write_vint(@field_infos.field_number(term.field)) # write field num
104
- @last_term = term
105
- end
106
- end
107
-
108
-
109
- # This stores a monotonically increasing set of <Term, TermInfo> pairs in a
110
- # Directory. Pairs are accessed either by Term or by ordinal position in the
111
- # set.
112
- class TermInfosReader
113
- include MonitorMixin
114
-
115
- def initialize(dir, seg, fis)
116
- super()
117
-
118
- @directory = dir
119
- @segment = seg
120
- @field_infos = fis
121
-
122
- @orig_enum = SegmentTermEnum.new(@directory.open_input(@segment + ".tis"),
123
- @field_infos, false)
124
- @size = @orig_enum.size
125
- @skip_interval = @orig_enum.skip_interval
126
- @index_enum = SegmentTermEnum.new(@directory.open_input(@segment + ".tii"),
127
- @field_infos, true)
128
- @index_terms = nil
129
- @index_infos = nil
130
- @index_pointers = nil
131
- end
132
-
133
- def close()
134
- # clear this thread's cache
135
- @orig_enum.close() if (@orig_enum != nil)
136
- @index_enum.close() if (@index_enum != nil)
137
- end
138
-
139
- # Returns the number of term/value pairs in the set.
140
- attr_reader :size
141
- # The skip interval for the original enumerator
142
- attr_reader :skip_interval
143
-
144
-
145
- # Returns the TermInfo for a Term in the set, or nil.
146
- def get_term_info(term)
147
- return nil if (@size == 0)
148
-
149
- ensure_index_is_read()
150
-
151
- # optimize sequential access: first try scanning cached enum w/o seeking
152
- e = enum()
153
- if e.term and term >= e.term
154
- enum_offset = (e.position / e.index_interval).to_i + 1
155
- if (@index_terms.length == enum_offset or
156
- term < @index_terms[enum_offset]) # but before end of block
157
- return scan_for_term_info(term) # no need to seek
158
- end
159
- end
160
-
161
- # random-access: must seek
162
- seek_enum(get_index_offset(term))
163
- return scan_for_term_info(term)
164
- end
165
- alias :[] :get_term_info
166
-
167
- # Returns the nth term in the set.
168
- def get_term(position)
169
- return nil if (@size == 0)
170
-
171
- e = enum()
172
- if (e != nil and
173
- e.term != nil and
174
- position >= e.position and
175
- position < (e.position + e.index_interval))
176
- return scan_for_term(position) # can avoid seek
177
- end
178
-
179
- seek_enum((position / e.index_interval).to_i) # must seek
180
- return scan_for_term(position)
181
- end
182
-
183
- def get_terms_position(term)
184
- return nil if (@size == 0)
185
- ensure_index_is_read
186
- seek_enum(get_index_offset(term))
187
-
188
- e = enum()
189
-
190
- while term > e.term and e.next?
191
- end
192
-
193
- return term == e.term ? e.position : -1
194
- end
195
-
196
- # Returns an enumeration of all the Terms and TermInfos in the set.
197
- def terms()
198
- return @orig_enum.clone()
199
- end
200
-
201
- # Returns an enumeration of terms starting at or after the named term.
202
- def terms_from(term)
203
- get_term_info(term)
204
- return enum().clone()
205
- end
206
-
207
- private
208
-
209
- def enum()
210
- #te_cache = Thread.current["term_enum"]
211
- #if (te_cache == nil)
212
- # te_cache = Thread.current["term_enum"] = Ferret::Utils::WeakKeyHash.new
213
- #end
214
- #te_cache.synchronize do
215
- # term_enum = te_cache[self]
216
- # if term_enum == nil
217
- # term_enum = terms()
218
- # te_cache[self] = term_enum
219
- # end
220
- # return term_enum
221
- #end
222
- term_enum = Thread.current.get_local(self)
223
- if term_enum.nil?
224
- Thread.current.set_local(self, term_enum = terms())
225
- end
226
- return term_enum
227
- end
228
-
229
- def ensure_index_is_read()
230
- synchronize() do
231
- return if @index_terms
232
- begin
233
- index_size = @index_enum.size
234
-
235
- @index_terms = Array.new(index_size)
236
- @index_infos = Array.new(index_size)
237
- @index_pointers = Array.new(index_size)
238
-
239
- i = 0
240
- while @index_enum.next?
241
- @index_terms[i] = @index_enum.term
242
- @index_infos[i] = @index_enum.term_info
243
- @index_pointers[i] = @index_enum.index_pointer
244
- i += 1
245
- end
246
- ensure
247
- @index_enum.close()
248
- @index_enum = nil
249
- end
250
- end
251
- end
252
-
253
- # Returns the offset of the greatest index entry which is less than or
254
- # equal to term.
255
- #
256
- # This method is rewritten in the C extension.
257
- def get_index_offset(term)
258
- lo = 0 # binary search @index_terms[]
259
- hi = @index_terms.length - 1
260
-
261
- while (hi >= lo)
262
- mid = (lo + hi) >> 1
263
- delta = term <=> @index_terms[mid]
264
- if (delta < 0)
265
- hi = mid - 1
266
- elsif (delta > 0)
267
- lo = mid + 1
268
- else
269
- return mid
270
- end
271
- end
272
- return hi
273
- end
274
-
275
- def seek_enum(ind_offset)
276
- enum().seek(@index_pointers[ind_offset],
277
- (ind_offset * enum().index_interval) - 1,
278
- @index_terms[ind_offset],
279
- @index_infos[ind_offset])
280
- end
281
-
282
- # Scans within block for matching term.
283
- def scan_for_term_info(term)
284
- e = enum()
285
- e.scan_to(term)
286
- if e.term != nil and term == e.term
287
- return e.term_info()
288
- else
289
- return nil
290
- end
291
- end
292
-
293
- def scan_for_term(position)
294
- e = enum()
295
- while (e.position < position)
296
- return nil if not e.next?
297
- end
298
-
299
- return e.term
300
- end
301
-
302
- # Returns the position of a Term in the set or -1.
303
- def get_position(term)
304
- return -1 if (@size == 0)
305
-
306
- ind_offset = get_index_offset(term)
307
- seek_enum(ind_offset)
308
-
309
- e = enum()
310
- while (term > e.term and e.next?)
311
- end
312
-
313
- if (term == e.term())
314
- return e.position
315
- else
316
- return -1
317
- end
318
- end
319
-
320
- end
321
- end
@@ -1,20 +0,0 @@
1
- module Ferret::Index
2
- class TermVectorOffsetInfo
3
- attr_accessor :start, :end
4
-
5
- def initialize(start, endd)
6
- @end = endd
7
- @start = start
8
- end
9
-
10
- def eql?(o)
11
- return false if !o.instance_of?(TermVectorOffsetInfo)
12
- @end == o.end and @start == o.start
13
- end
14
- alias :== :eql?
15
-
16
- def hash()
17
- 29 * @start + @end
18
- end
19
- end
20
- end
@@ -1,553 +0,0 @@
1
- module Ferret::Index
2
- # Writer works by opening a document and then opening the fields within
3
- # the document and then writing out the vectors for each field.
4
- #
5
- # Rough usage:
6
- #
7
- # for each document
8
- #
9
- # writer.open_document()
10
- # for each field on the document
11
- #
12
- # writer.open_field(field)
13
- # for all of the @terms
14
- #
15
- # writer.add_term(...)
16
- # end
17
- # writer.close_field
18
- # end
19
- # writer.close_document()
20
- # end
21
- #
22
- #
23
- class TermVectorsWriter
24
- STORE_POSITIONS_WITH_TERMVECTOR = 0x1
25
- STORE_OFFSET_WITH_TERMVECTOR = 0x2
26
-
27
- FORMAT_VERSION = 2
28
-
29
- # The size in bytes that the FORMAT_VERSION will take up at the beginning
30
- # of each file
31
- FORMAT_SIZE = 4
32
-
33
- TVX_EXTENSION = ".tvx"
34
- TVD_EXTENSION = ".tvd"
35
- TVF_EXTENSION = ".tvf"
36
-
37
- def initialize(directory, segment, field_infos)
38
- @current_field = nil
39
- @current_doc_pointer = -1
40
-
41
- # Open files for TermVector storage
42
- @tvx = directory.create_output(segment + TVX_EXTENSION)
43
- @tvx.write_int(FORMAT_VERSION)
44
- @tvd = directory.create_output(segment + TVD_EXTENSION)
45
- @tvd.write_int(FORMAT_VERSION)
46
- @tvf = directory.create_output(segment + TVF_EXTENSION)
47
- @tvf.write_int(FORMAT_VERSION)
48
-
49
- @field_infos = field_infos
50
- @fields = []
51
- @terms = []
52
- end
53
-
54
-
55
- def open_document()
56
- close_document()
57
- @current_doc_pointer = @tvd.pos()
58
- end
59
-
60
-
61
- def close_document()
62
-
63
- if (document_open?())
64
- close_field()
65
- write_doc()
66
- @fields.clear()
67
- @current_doc_pointer = -1
68
- end
69
- end
70
-
71
-
72
- def document_open?()
73
- return @current_doc_pointer != -1
74
- end
75
-
76
-
77
- # Start processing a field. This can be followed by a number of calls to
78
- # add_term, and a final call to close_field to indicate the end of
79
- # processing of this field. If a field was previously open, it is closed
80
- # automatically.
81
- def open_field(field)
82
- field_info = @field_infos[field]
83
- create_field(field_info.number,
84
- field_info.store_positions?,
85
- field_info.store_offsets?)
86
- end
87
-
88
- # Finished processing current field. This should be followed by a call
89
- # to open_field before future calls to add_term.
90
- def close_field()
91
- if field_open?
92
- #puts("close_field()")
93
-
94
- # save field and @terms
95
- write_field()
96
- @fields << @current_field
97
- @terms.clear()
98
- @current_field = nil
99
- end
100
- end
101
-
102
- # Return true if a field is currently open.
103
- def field_open?()
104
- return @current_field != nil
105
- end
106
-
107
- # Add term to the field's term vector. Field must already be open.
108
- #
109
- # Terms should be added in increasing order of @terms, one call per
110
- # unique termNum. ProxPointer is a pointer into the TermPosition file
111
- # (prx). Freq is the number of times this term appears in this field, in
112
- # this document. raises:: IllegalStateError if document or field is
113
- # not open
114
- def add_term(term_text, freq, positions = nil, offsets = nil)
115
- if not document_open?
116
- raise IllegalStateError, "Cannot add terms when document is not open"
117
- end
118
- if not field_open?
119
- raise IllegalStateError, "Cannot add terms when field is not open"
120
- end
121
-
122
- add_term_internal(term_text, freq, positions, offsets)
123
- end
124
-
125
- def add_term_internal(term_text, freq, positions, offsets)
126
- @terms << TVTerm.new(term_text, freq, positions, offsets)
127
- end
128
-
129
- # Add a complete document specified by all its term vectors. If document has no
130
- # term vectors, add value for @tvx.
131
- #
132
- # vectors:: The documents to have their term vectors added
133
- # raises:: IOException
134
- def add_all_doc_vectors(vectors)
135
-
136
- open_document()
137
-
138
- if vectors != nil
139
- vectors.each do |vector|
140
- store_positions = (vector.size > 0 and vector.positions != nil)
141
- store_offsets = (vector.size > 0 and vector.offsets != nil)
142
-
143
- create_field(@field_infos.field_number(vector.field),
144
- store_positions, store_offsets)
145
-
146
- vector.size.times do |j|
147
- add_term_internal(vector.terms[j],
148
- vector.freqs[j],
149
- store_positions ? vector.positions[j] : nil,
150
- store_offsets ? vector.offsets[j] : nil)
151
- end
152
- close_field()
153
- end
154
- end
155
- close_document()
156
- end
157
-
158
- # Close all streams.
159
- def close()
160
- begin
161
- close_document()
162
- ensure
163
- # make an effort to close all streams we can but remember and re-raise
164
- # the last exception encountered in this process
165
- keep = nil
166
- [@tvx, @tvd, @tvf].compact.each do |os|
167
- begin
168
- os.close()
169
- rescue IOError => e
170
- keep = e
171
- end
172
- end
173
- raise keep if (keep != nil)
174
- end
175
- end
176
-
177
# Record describing one field of the document being written: its field
# number, whether positions and offsets are stored for it, and the pointer
# at which its data was written (0 until it is assigned).
class TVField
  attr_accessor :number, :tvf_pointer, :store_positions, :store_offsets

  def initialize(number, store_pos, store_off)
    @number, @store_positions, @store_offsets = number, store_pos, store_off
    @tvf_pointer = 0
  end
end
186
-
187
# One term collected for the current field: its text, its frequency and,
# optionally, its positions and offsets. Every attribute defaults to nil.
class TVTerm
  attr_accessor :term_text, :freq
  attr_accessor :positions, :offsets

  def initialize(term_text=nil, freq=nil, positions=nil, offsets=nil)
    @term_text, @freq = term_text, freq
    @positions, @offsets = positions, offsets
  end
end
197
-
198
- private
199
-
200
# Writes the terms collected for @current_field to the @tvf stream.
#
# Written in order: the number of terms (vint), one byte of flags saying
# whether positions and/or offsets follow, then for each term the length of
# the prefix it shares with the previous term, the length and characters of
# the remainder, its frequency and, when enabled, its delta-encoded
# positions followed by its delta-encoded offsets.
#
# raises:: IllegalStateError if positions or offsets are to be stored but a
#          term has none
def write_field()
  # remember where this field is written
  @current_field.tvf_pointer = @tvf.pos

  @tvf.write_vint(@terms.size)

  store_positions = @current_field.store_positions
  store_offsets = @current_field.store_offsets
  bits = 0x0
  bits |= STORE_POSITIONS_WITH_TERMVECTOR if store_positions
  bits |= STORE_OFFSET_WITH_TERMVECTOR if store_offsets
  @tvf.write_byte(bits)

  last_term_text = ""
  @terms.each do |term|
    text = term.term_text
    start = Ferret::Utils::StringHelper.string_difference(last_term_text, text)
    length = text.length() - start
    @tvf.write_vint(start)                # shared prefix length
    @tvf.write_vint(length)               # delta length
    @tvf.write_chars(text, start, length) # delta chars
    @tvf.write_vint(term.freq)
    last_term_text = text

    write_term_positions(term) if store_positions
    write_term_offsets(term) if store_offsets
  end
end

# Writes term.freq positions of +term+, each as the difference from the
# previous one (the first is relative to 0).
def write_term_positions(term)
  if term.positions == nil
    raise IllegalStateError, "Trying to write positions that are nil!"
  end

  previous = 0
  term.freq.times do |j|
    @tvf.write_vint(term.positions[j] - previous)
    previous = term.positions[j]
  end
end

# Writes term.freq offsets of +term+: each start relative to the previous
# end (the first is relative to 0), followed by the length end - start.
def write_term_offsets(term)
  if term.offsets == nil
    raise IllegalStateError, "Trying to write offsets that are nil!"
  end

  previous_end = 0
  term.freq.times do |j|
    span = term.offsets[j]
    @tvf.write_vint(span.start - previous_end)
    @tvf.write_vint(span.end - span.start)
    previous_end = span.end
  end
end
259
-
260
# Writes the records for the current document: its pointer to @tvx, then to
# @tvd the number of fields, every field number, and every field's tvf
# pointer encoded as the difference from the previous field's pointer.
#
# raises:: IllegalStateError if a field is still open
def write_doc()
  raise IllegalStateError, "Field is still open while writing document" if field_open?

  # document index record
  @tvx.write_long(@current_doc_pointer)

  # document data record: field count first, then the field numbers
  @tvd.write_vint(@fields.size)
  @fields.each { |f| @tvd.write_vint(f.number) }

  # field pointers, each relative to the one before it
  previous_pointer = 0
  @fields.each do |f|
    pointer = f.tvf_pointer
    @tvd.write_vlong(pointer - previous_pointer)
    previous_pointer = pointer
  end
end
285
-
286
# Closes the current field via close_field and makes a fresh TVField the
# current one.
#
# raises:: IllegalStateError if no document is open
def create_field(field_number, store_position, store_offset)
  unless document_open?
    raise IllegalStateError, "Cannot open field when no document is open."
  end

  close_field()
  @current_field = TVField.new(field_number, store_position, store_offset)
end
293
- end
294
-
295
# Reads the term vectors of one segment from its three input streams
# (@tvx, @tvd and @tvf).
#
# If the segment has no TVX file, all three streams are nil, +size+ is nil,
# the lookup methods return nil and +clone+ returns nil.
class TermVectorsReader
  attr_reader :size

  # accessors for clone method
  attr_accessor :tvx, :tvd, :tvf
  protected :tvx, :tvx=, :tvd, :tvd=, :tvf, :tvf=

  # d:: directory object answering exists? and open_input
  # segment:: segment name the file extensions are appended to
  # field_infos:: used to translate between field names and numbers
  # raises:: IOError if a file has a format newer than this reader supports
  def initialize(d, segment, field_infos)
    if d.exists?(segment + TermVectorsWriter::TVX_EXTENSION)
      @tvx = d.open_input(segment + TermVectorsWriter::TVX_EXTENSION)
      check_valid_format(@tvx)
      @tvd = d.open_input(segment + TermVectorsWriter::TVD_EXTENSION)
      @tvd_format = check_valid_format(@tvd)
      @tvf = d.open_input(segment + TermVectorsWriter::TVF_EXTENSION)
      @tvf_format = check_valid_format(@tvf)
      @size = @tvx.length / 8
    else
      @tvx = nil
      @tvd = nil
      @tvf = nil
    end

    @field_infos = field_infos
  end

  # Closes every open stream. All streams get a close attempt; the last
  # IOError encountered is re-raised afterwards.
  def close()
    keep = nil
    [@tvx, @tvd, @tvf].compact.each do |os|
      begin
        os.close()
      rescue IOError => e
        keep = e
      end
    end
    raise keep if (keep != nil)
  end

  # Retrieve the term vector for the given document and field
  # doc_num:: The document number to retrieve the vector for
  # field:: The field within the document to retrieve
  # returns:: The TermFreqVector for the document and field or nil if there
  #           is no termVector for this field.
  # raises:: IOException if there is an error reading the term vector files
  def get_field_tv(doc_num, field)
    field = field.to_s
    field_number = @field_infos.field_number(field)

    # No term vectors are available for this segment at all
    return nil if @tvx.nil?

    field_count = seek_to_document(doc_num)

    # There are only a few fields per document. We opt for a full scan
    # rather than requiring that they be ordered. We need to read through
    # all of the fields anyway to get to the tvf pointers.
    number = 0
    found = -1
    field_count.times do |i|
      number = read_field_number(number)
      found = i if number == field_number
    end

    # This field, although valid in the segment, was not found in this
    # document
    return nil if found == -1

    # The tvf pointers are stored as deltas; sum up to the field we want
    position = 0
    (found + 1).times { position += @tvd.read_vlong() }

    read_term_vector(field, position)
  end

  # Return all term vectors stored for this document or nil if it could
  # not be read in.
  #
  # doc_num:: The document number to retrieve the vector for
  # returns:: All term frequency vectors
  # raises:: IOException if there is an error reading the term vector files
  def get_tv(doc_num)
    # No term vectors are available for this segment at all
    return nil if @tvx.nil?

    field_count = seek_to_document(doc_num)

    # No fields are vectorized for this document
    return nil if field_count == 0

    number = 0
    fields = Array.new(field_count) do
      number = read_field_number(number)
      @field_infos[number].name
    end

    # Compute positions in the @tvf file (stored as deltas)
    position = 0
    tvf_pointers = Array.new(field_count) { position += @tvd.read_vlong() }

    read_term_vectors(fields, tvf_pointers)
  end

  # Returns a copy of this reader that has its own clones of the three
  # streams, or nil if any stream is missing. The receiver is left untouched.
  def clone()
    return nil if @tvx.nil? || @tvd.nil? || @tvf.nil?

    copy = super()
    copy.tvx = @tvx.clone()
    copy.tvd = @tvd.clone()
    copy.tvf = @tvf.clone()
    copy
  end

  private

  # Positions @tvd at the record of +doc_num+ and returns the number of
  # fields stored for it. The seek into @tvx accounts for FORMAT_SIZE; the
  # pointer read from @tvx is used as-is.
  def seek_to_document(doc_num)
    @tvx.seek((doc_num * 8) + TermVectorsWriter::FORMAT_SIZE)
    @tvd.seek(@tvx.read_long())
    @tvd.read_vint()
  end

  # Reads the next field number from @tvd. In the current format the number
  # is stored directly; otherwise it is a delta added to +previous+.
  def read_field_number(previous)
    if @tvd_format == TermVectorsWriter::FORMAT_VERSION
      @tvd.read_vint()
    else
      previous + @tvd.read_vint()
    end
  end

  # Reads one term vector per field, pairing fields[i] with tvf_pointers[i].
  def read_term_vectors(fields, tvf_pointers)
    Array.new(fields.length) do |i|
      read_term_vector(fields[i], tvf_pointers[i])
    end
  end

  # field:: The field to read in
  # tvf_pointer:: The pointer within the @tvf file where we should start reading
  # returns:: The TermVector located at that position
  # raises:: IOException
  def read_term_vector(field, tvf_pointer)
    # We don't need to offset by the FORMAT here since the pointer
    # already includes the offset
    @tvf.seek(tvf_pointer)

    num_terms = @tvf.read_vint()
    # If no terms - return an empty termvector. However, this should
    # never occur!
    return SegmentTermVector.new(field, nil, nil) if num_terms == 0

    if @tvf_format == TermVectorsWriter::FORMAT_VERSION
      bits = @tvf.read_byte()
      store_positions = (bits & TermVectorsWriter::STORE_POSITIONS_WITH_TERMVECTOR) != 0
      store_offsets = (bits & TermVectorsWriter::STORE_OFFSET_WITH_TERMVECTOR) != 0
    else
      # other formats: one vint is read and discarded, nothing extra stored
      @tvf.read_vint()
      store_positions = false
      store_offsets = false
    end

    terms = Array.new(num_terms)
    term_freqs = Array.new(num_terms)
    positions = store_positions ? Array.new(num_terms) : nil
    offsets = store_offsets ? Array.new(num_terms) : nil

    # The buffer is reused across terms so that the prefix shared with the
    # previous term is still in place when the new suffix is read into it.
    buffer = "".dup

    num_terms.times do |i|
      start = @tvf.read_vint()
      delta_length = @tvf.read_vint()
      @tvf.read_chars(buffer, start, delta_length)
      terms[i] = buffer[0, start + delta_length].to_s
      freq = @tvf.read_vint()
      term_freqs[i] = freq

      positions[i] = read_positions(freq) if store_positions
      offsets[i] = read_offsets(freq) if store_offsets
    end

    SegmentTermVector.new(field, terms, term_freqs, positions, offsets)
  end

  # Reads +freq+ delta-encoded positions from @tvf and returns them as
  # absolute values.
  def read_positions(freq)
    previous = 0
    Array.new(freq) { previous += @tvf.read_vint() }
  end

  # Reads +freq+ offsets from @tvf. Each start is stored relative to the
  # previous end, and each end as a length added to its start.
  def read_offsets(freq)
    previous_end = 0
    Array.new(freq) do
      start_offset = previous_end + @tvf.read_vint()
      end_offset = start_offset + @tvf.read_vint()
      previous_end = end_offset
      TermVectorOffsetInfo.new(start_offset, end_offset)
    end
  end

  # Reads the format number at the current position of +istream+ and
  # returns it.
  # raises:: IOError if the format is newer than FORMAT_VERSION
  def check_valid_format(istream)
    format = istream.read_int()
    if (format > TermVectorsWriter::FORMAT_VERSION)
      raise IOError, "Incompatible format version: #{format} expected #{TermVectorsWriter::FORMAT_VERSION} or less"
    end
    return format
  end

end
553
- end