ferret 0.9.6 → 0.10.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (295) hide show
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
@@ -8,342 +8,217 @@ module IndexReaderCommon
8
8
  def test_index_reader
9
9
  do_test_get_field_names()
10
10
 
11
+ do_test_term_enum()
12
+
11
13
  do_test_term_doc_enum()
12
-
14
+
13
15
  do_test_term_vectors()
14
16
 
15
- do_test_changing_field()
16
-
17
17
  do_test_get_doc()
18
-
19
- do_test_term_enum()
20
18
  end
21
19
 
22
20
  def do_test_get_field_names()
23
- field_names = @ir.get_field_names
24
-
25
- assert(field_names.include?("body"))
26
- assert(field_names.include?("changing_field"))
27
- assert(field_names.include?("author"))
28
- assert(field_names.include?("title"))
29
- assert(field_names.include?("text"))
30
- assert(field_names.include?("year"))
21
+ field_names = @ir.field_names
22
+
23
+ assert(field_names.include?(:body))
24
+ assert(field_names.include?(:changing_field))
25
+ assert(field_names.include?(:author))
26
+ assert(field_names.include?(:title))
27
+ assert(field_names.include?(:text))
28
+ assert(field_names.include?(:year))
31
29
  end
32
30
 
33
31
  def do_test_term_enum()
34
- te = @ir.terms
32
+ te = @ir.terms(:author)
35
33
 
36
34
  assert(te.next?)
37
- assert_equal(Term.new("author", "Leo"), te.term)
35
+ assert_equal("Leo", te.term)
38
36
  assert_equal(1, te.doc_freq)
39
37
  assert(te.next?)
40
- assert_equal(Term.new("author", "Tolstoy"), te.term)
38
+ assert_equal("Tolstoy", te.term)
41
39
  assert_equal(1, te.doc_freq)
40
+ assert(! te.next?)
41
+
42
+ te.field = :body
42
43
  assert(te.next?)
43
- assert_equal(Term.new("body", "And"), te.term)
44
+ assert_equal("And", te.term)
44
45
  assert_equal(1, te.doc_freq)
45
46
 
46
-
47
- assert(te.skip_to(Term.new("body", "Not")))
48
- assert_equal(Term.new("body", "Not"), te.term)
47
+ assert(te.skip_to("Not"))
48
+ assert_equal("Not", te.term)
49
49
  assert_equal(1, te.doc_freq)
50
50
  assert(te.next?)
51
- assert_equal(Term.new("body", "Random"), te.term)
51
+ assert_equal("Random", te.term)
52
52
  assert_equal(16, te.doc_freq)
53
53
 
54
- assert(te.skip_to(Term.new("text", "which")))
55
- assert(Term.new("text", "which"), te.term)
54
+ te.field = :text
55
+ assert(te.skip_to("which"))
56
+ assert("which", te.term)
56
57
  assert_equal(1, te.doc_freq)
58
+ assert(! te.next?)
59
+
60
+ te.field = :title
57
61
  assert(te.next?)
58
- assert_equal(Term.new("title", "War And Peace"), te.term)
62
+ assert_equal("War And Peace", te.term)
59
63
  assert_equal(1, te.doc_freq)
60
64
  assert(!te.next?)
61
65
 
62
- te.close
63
-
64
- te = @ir.terms_from(Term.new("body", "Not"))
65
- assert_equal(Term.new("body", "Not"), te.term)
66
+ te = @ir.terms_from(:body, "Not")
67
+ assert_equal("Not", te.term)
66
68
  assert_equal(1, te.doc_freq)
67
69
  assert(te.next?)
68
- assert_equal(Term.new("body", "Random"), te.term)
70
+ assert_equal("Random", te.term)
69
71
  assert_equal(16, te.doc_freq)
70
- te.close
71
72
  end
72
73
 
73
74
  def do_test_term_doc_enum()
74
75
 
75
- assert_equal(IndexTestHelper::IR_TEST_DOC_CNT, @ir.num_docs())
76
- assert_equal(IndexTestHelper::IR_TEST_DOC_CNT, @ir.max_doc())
77
-
78
- term = Term.new("body", "Wally")
79
- assert_equal(4, @ir.doc_freq(term))
80
-
81
- tde = @ir.term_docs_for(term)
82
-
83
- assert(tde.next?)
84
- assert_equal(0, tde.doc())
85
- assert_equal(1, tde.freq())
86
- assert(tde.next?)
87
- assert_equal(5, tde.doc())
88
- assert_equal(1, tde.freq())
89
- assert(tde.next?)
90
- assert_equal(18, tde.doc())
91
- assert_equal(3, tde.freq())
92
- assert(tde.next?)
93
- assert_equal(20, tde.doc())
94
- assert_equal(6, tde.freq())
95
- assert_equal(false, tde.next?)
96
-
97
- # test fast read. Use a small array to exercise repeat read
98
- docs = Array.new(3)
99
- freqs = Array.new(3)
100
-
101
- term = Term.new("body", "read")
102
- tde.seek(term)
103
- assert_equal(3, tde.read(docs, freqs))
104
- assert_equal([1,2,6], docs)
105
- assert_equal([1,2,4], freqs)
76
+ assert_equal(IndexTestHelper::INDEX_TEST_DOCS.size, @ir.num_docs())
77
+ assert_equal(IndexTestHelper::INDEX_TEST_DOCS.size, @ir.max_doc())
106
78
 
107
- assert_equal(3, tde.read(docs, freqs))
108
- assert_equal([9, 10, 15], docs)
109
- assert_equal([3, 1, 1], freqs)
79
+ assert_equal(4, @ir.doc_freq(:body, "Wally"))
110
80
 
111
- assert_equal(3, tde.read(docs, freqs))
112
- assert_equal([16, 17, 20], docs)
113
- assert_equal([2, 1, 1], freqs)
81
+ tde = @ir.term_docs_for(:body, "Wally")
114
82
 
115
- assert_equal(1, tde.read(docs, freqs))
116
- assert_equal([21], docs[0, 1])
117
- assert_equal([6], freqs[0, 1])
118
-
119
- assert_equal(0, tde.read(docs, freqs))
83
+ [
84
+ [ 0, 1],
85
+ [ 5, 1],
86
+ [18, 3],
87
+ [20, 6]
88
+ ].each do |doc, freq|
89
+ assert(tde.next?)
90
+ assert_equal(doc, tde.doc())
91
+ assert_equal(freq, tde.freq())
92
+ end
93
+ assert(! tde.next?)
120
94
 
121
95
  do_test_term_docpos_enum_skip_to(tde)
122
- tde.close()
123
96
 
124
97
  # test term positions
125
- term = Term.new("body", "read")
126
- tde = @ir.term_positions_for(term)
127
- assert(tde.next?)
128
- assert_equal(1, tde.doc())
129
- assert_equal(1, tde.freq())
130
- assert_equal(3, tde.next_position())
131
-
132
- assert(tde.next?)
133
- assert_equal(2, tde.doc())
134
- assert_equal(2, tde.freq())
135
- assert_equal(1, tde.next_position())
136
- assert_equal(4, tde.next_position())
137
-
138
- assert(tde.next?)
139
- assert_equal(6, tde.doc())
140
- assert_equal(4, tde.freq())
141
- assert_equal(3, tde.next_position())
142
- assert_equal(4, tde.next_position())
143
-
144
- assert(tde.next?)
145
- assert_equal(9, tde.doc())
146
- assert_equal(3, tde.freq())
147
- assert_equal(0, tde.next_position())
148
- assert_equal(4, tde.next_position())
149
-
150
- assert(tde.skip_to(16))
151
- assert_equal(16, tde.doc())
152
- assert_equal(2, tde.freq())
153
- assert_equal(2, tde.next_position())
154
-
155
- assert(tde.skip_to(21))
156
- assert_equal(21, tde.doc())
157
- assert_equal(6, tde.freq())
158
- assert_equal(3, tde.next_position())
159
- assert_equal(4, tde.next_position())
160
- assert_equal(5, tde.next_position())
161
- assert_equal(8, tde.next_position())
162
- assert_equal(9, tde.next_position())
163
- assert_equal(10, tde.next_position())
164
-
165
- assert_equal(false, tde.next?)
98
+ tde = @ir.term_positions_for(:body, "read")
99
+ [
100
+ [false, 1, 1, [3]],
101
+ [false, 2, 2, [1, 4]],
102
+ [false, 6, 4, [3, 4]],
103
+ [false, 9, 3, [0, 4]],
104
+ [ true, 16, 2, [2]],
105
+ [ true, 21, 6, [3, 4, 5, 8, 9, 10]]
106
+ ].each do |skip, doc, freq, positions|
107
+ if skip
108
+ assert(tde.skip_to(doc))
109
+ else
110
+ assert(tde.next?)
111
+ end
112
+ assert_equal(doc, tde.doc())
113
+ assert_equal(freq, tde.freq())
114
+ positions.each {|pos| assert_equal(pos, tde.next_position())}
115
+ end
116
+
117
+ assert_nil(tde.next_position())
118
+ assert(! tde.next?)
166
119
 
167
120
  do_test_term_docpos_enum_skip_to(tde)
168
- tde.close()
169
121
  end
170
122
 
171
123
  def do_test_term_docpos_enum_skip_to(tde)
172
- term = Term.new("text", "skip")
173
- tde.seek(term)
174
-
175
- assert(tde.skip_to(10))
176
- assert_equal(22, tde.doc())
177
- assert_equal(22, tde.freq())
178
-
179
- assert(tde.skip_to(60))
180
- assert_equal(60, tde.doc())
181
- assert_equal(60, tde.freq())
124
+ tde.seek(:text, "skip")
125
+
126
+ [
127
+ [10, 22],
128
+ [44, 44],
129
+ [60, 60],
130
+ [62, 62],
131
+ [63, 63],
132
+ ].each do |skip_doc, doc_and_freq|
133
+ assert(tde.skip_to(skip_doc))
134
+ assert_equal(doc_and_freq, tde.doc())
135
+ assert_equal(doc_and_freq, tde.freq())
136
+ end
182
137
 
183
- tde.seek(term)
184
- assert(tde.skip_to(45))
185
- assert_equal(45, tde.doc())
186
- assert_equal(45, tde.freq())
187
138
 
188
- assert(tde.skip_to(62))
189
- assert_equal(62, tde.doc())
190
- assert_equal(62, tde.freq())
139
+ assert(! tde.skip_to(IndexTestHelper::INDEX_TEST_DOC_COUNT))
140
+ assert(! tde.skip_to(IndexTestHelper::INDEX_TEST_DOC_COUNT))
141
+ assert(! tde.skip_to(IndexTestHelper::INDEX_TEST_DOC_COUNT + 100))
191
142
 
192
- assert(tde.skip_to(63))
193
- assert_equal(63, tde.doc())
194
- assert_equal(63, tde.freq())
143
+ tde.seek(:text, "skip")
144
+ assert(! tde.skip_to(IndexTestHelper::INDEX_TEST_DOC_COUNT))
145
+ end
195
146
 
196
- assert_equal(false, tde.skip_to(64))
147
+ def do_test_term_vectors()
148
+ expected_tv = TermVector.new(:body,
149
+ [
150
+ TVTerm.new("word1", [2, 4, 7]),
151
+ TVTerm.new("word2", [3]),
152
+ TVTerm.new("word3", [0, 5, 8, 9]),
153
+ TVTerm.new("word4", [1, 6])
154
+ ],
155
+ [*(0...10)].collect {|i| TVOffsets.new(i*6, (i+1)*6 - 1)})
197
156
 
198
- tde.seek(term)
199
- assert_equal(false, tde.skip_to(64))
200
- end
157
+ tv = @ir.term_vector(3, :body)
201
158
 
202
- def t(start_offset, end_offset)
203
- TermVectorOffsetInfo.new(start_offset, end_offset)
204
- end
159
+ assert_equal(expected_tv, tv)
205
160
 
206
- def do_test_term_vectors()
207
- tv = @ir.get_term_vector(3, :body)
208
-
209
- assert_equal("body", tv.field)
210
- assert_equal(["word1", "word2", "word3", "word4"], tv.terms)
211
- assert_equal([3, 1, 4, 2], tv.freqs)
212
- assert_equal([[2, 4, 7], [3], [0, 5, 8, 9], [1,6]], tv.positions)
213
- assert_equal([[t(12,17), t(24,29), t(42,47)],
214
- [t(18,23)],
215
- [t(0,5), t(30,35), t(48,53), t(54,59)],
216
- [t(6,11), t(36,41)]], tv.offsets)
217
- tv = nil
218
-
219
- tvs = @ir.get_term_vectors(3)
161
+ tvs = @ir.term_vectors(3)
220
162
  assert_equal(3, tvs.size)
221
- tv = tvs[0]
222
- assert_equal("author", tv.field)
223
- assert_equal(["Leo", "Tolstoy"], tv.terms)
224
- assert(tv.offsets.nil?)
225
- tv = tvs[1]
226
- assert_equal("body", tv.field)
227
- assert_equal(["word1", "word2", "word3", "word4"], tv.terms)
228
- tv = tvs[2]
229
- assert_equal("title", tv.field)
230
- assert_equal(["War And Peace"], tv.terms)
231
- assert(tv.positions.nil?)
232
- assert_equal(t(0, 13), tv.offsets[0][0])
233
- end
234
-
235
- def do_test_changing_field()
236
- tv = @ir.get_term_vector(0, "changing_field")
237
- assert(tv.nil?)
238
163
 
239
- tv = @ir.get_term_vector(10, "changing_field")
240
- assert(tv.positions.nil?)
241
- assert(tv.offsets.nil?)
242
-
243
- tv = @ir.get_term_vector(17, "changing_field")
244
- assert(tv.positions)
164
+ assert_equal(expected_tv, tvs[:body])
165
+
166
+ tv = tvs[:author]
167
+ assert_equal(:author, tv.field)
168
+ assert_equal([TVTerm.new("Leo", [0]), TVTerm.new("Tolstoy", [1])], tv.terms)
245
169
  assert(tv.offsets.nil?)
246
170
 
247
- tv = @ir.get_term_vector(19, "changing_field")
248
- assert(tv.positions.nil?)
249
- assert(tv.offsets)
250
171
 
251
- tv = @ir.get_term_vector(20, "changing_field")
252
- assert(tv.positions)
253
- assert(tv.offsets)
254
-
255
- tv = @ir.get_term_vector(21, "changing_field")
256
- assert(tv.nil?)
172
+ tv = tvs[:title]
173
+ assert_equal(:title, tv.field)
174
+ assert_equal([TVTerm.new("War And Peace", nil)], tv.terms)
175
+ assert_equal([TVOffsets.new(0, 13)], tv.offsets)
257
176
  end
258
-
177
+
259
178
  def do_test_get_doc()
260
179
  doc = @ir.get_document(3)
261
- assert_equal(4, doc.field_count)
262
-
263
- df = doc.field("author")
264
- assert_equal("author", df.name)
265
- assert_equal("Leo Tolstoy", df.data)
266
- assert_equal(df.boost, 1.0)
267
- assert_equal(true, df.stored?)
268
- assert_equal(false, df.compressed?)
269
- assert_equal(true, df.indexed?)
270
- assert_equal(true, df.tokenized?)
271
- assert_equal(true, df.store_term_vector?)
272
- assert_equal(true, df.store_positions?)
273
- assert_equal(false, df.store_offsets?)
274
- assert_equal(false, df.binary?)
275
-
276
- df = doc.field("body")
277
- assert_equal("body", df.name)
278
- assert_equal("word3 word4 word1 word2 word1 word3 word4 word1 word3 word3", df.data)
279
- assert_equal(df.boost, 1.0)
280
- assert_equal(true, df.stored?)
281
- assert_equal(false, df.compressed?)
282
- assert_equal(true, df.indexed?)
283
- assert_equal(true, df.tokenized?)
284
- assert_equal(true, df.store_term_vector?)
285
- assert_equal(true, df.store_positions?)
286
- assert_equal(true, df.store_offsets?)
287
- assert_equal(false, df.binary?)
288
-
289
- df = doc.field("title")
290
- assert_equal("title", df.name)
291
- assert_equal("War And Peace", df.data)
292
- assert_equal(df.boost, 1.0)
293
- assert_equal(true, df.stored?)
294
- assert_equal(false, df.compressed?)
295
- assert_equal(true, df.indexed?)
296
- assert_equal(false, df.tokenized?)
297
- assert_equal(true, df.store_term_vector?)
298
- assert_equal(false, df.store_positions?)
299
- assert_equal(true, df.store_offsets?)
300
- assert_equal(false, df.binary?)
301
-
302
- df = doc.field("year")
303
- assert_equal("year", df.name)
304
- assert_equal("1865", df.data)
305
- assert_equal(df.boost, 1.0)
306
- assert_equal(true, df.stored?)
307
- assert_equal(false, df.compressed?)
308
- assert_equal(false, df.indexed?)
309
- assert_equal(false, df.tokenized?)
310
- assert_equal(false, df.store_term_vector?)
311
- assert_equal(false, df.store_positions?)
312
- assert_equal(false, df.store_offsets?)
313
- assert_equal(false, df.binary?)
314
-
315
-
316
- df = doc.field("text")
317
- assert(df.nil?) # "text" is not stored
180
+ assert_equal(4, doc.fields.size)
181
+ assert_equal(0, doc.size)
182
+ assert_equal([], doc.keys)
183
+
184
+ assert_equal("Leo Tolstoy", doc[:author])
185
+ assert_equal("word3 word4 word1 word2 word1 word3 word4 word1 word3 word3",
186
+ doc[:body])
187
+ assert_equal("War And Peace", doc[:title])
188
+ assert_equal("1865", doc[:year])
189
+ assert_nil(doc[:text])
190
+
191
+ assert_equal(4, doc.size)
192
+ [:author, :body, :title, :year].each {|fn| assert(doc.keys.include?(fn))}
318
193
  end
319
194
 
320
195
  def test_ir_norms()
321
- @ir.set_norm(3, "title", 1)
322
- @ir.set_norm(3, "body", 12)
323
- @ir.set_norm(3, "author", 145)
324
- @ir.set_norm(3, "year", 31)
325
- @ir.set_norm(3, "text", 202)
326
- @ir.set_norm(25, "text", 20)
327
- @ir.set_norm(50, "text", 200)
328
- @ir.set_norm(63, "text", 155)
329
-
330
- norms = @ir.get_norms("text")
331
-
332
- assert_equal(202, norms[3])
333
- assert_equal(20, norms[25])
196
+ @ir.set_norm(3, :title, 1)
197
+ @ir.set_norm(3, :body, 12)
198
+ @ir.set_norm(3, :author, 145)
199
+ @ir.set_norm(3, :year, 31)
200
+ @ir.set_norm(3, :text, 202)
201
+ @ir.set_norm(25, :text, 20)
202
+ @ir.set_norm(50, :text, 200)
203
+ @ir.set_norm(63, :text, 155)
204
+
205
+ norms = @ir.norms(:text)
206
+
207
+ assert_equal(202, norms[ 3])
208
+ assert_equal( 20, norms[25])
334
209
  assert_equal(200, norms[50])
335
210
  assert_equal(155, norms[63])
336
211
 
337
- norms = @ir.get_norms("title")
212
+ norms = @ir.norms(:title)
338
213
  assert_equal(1, norms[3])
339
214
 
340
- norms = @ir.get_norms("body")
215
+ norms = @ir.norms(:body)
341
216
  assert_equal(12, norms[3])
342
217
 
343
- norms = @ir.get_norms("author")
218
+ norms = @ir.norms(:author)
344
219
  assert_equal(145, norms[3])
345
220
 
346
- norms = @ir.get_norms("year")
221
+ norms = @ir.norms(:year)
347
222
  # TODO: this returns two possible results depending on whether it is
348
223
  # a multi reader or a segment reader. If it is a multi reader it will
349
224
  # always return an empty set of norms, otherwise it will return nil.
@@ -351,117 +226,99 @@ module IndexReaderCommon
351
226
  #assert(norms.nil?)
352
227
 
353
228
  norms = " " * 164
354
- @ir.get_norms_into("text", norms, 100)
229
+ @ir.get_norms_into(:text, norms, 100)
355
230
  assert_equal(202, norms[103])
356
- assert_equal(20, norms[125])
231
+ assert_equal( 20, norms[125])
357
232
  assert_equal(200, norms[150])
358
233
  assert_equal(155, norms[163])
359
234
 
360
235
  @ir.commit()
361
236
 
362
- iw = IndexWriter.new(@dir, :analyzer => WhiteSpaceAnalyzer.new())
363
- iw.optimize()
364
- iw.close()
237
+ iw_optimize()
365
238
 
366
- ir2 = IndexReader.open(@dir, false)
239
+ ir2 = ir_new()
367
240
 
368
241
  norms = " " * 164
369
- ir2.get_norms_into("text", norms, 100)
242
+ ir2.get_norms_into(:text, norms, 100)
370
243
  assert_equal(202, norms[103])
371
- assert_equal(20, norms[125])
244
+ assert_equal( 20, norms[125])
372
245
  assert_equal(200, norms[150])
373
246
  assert_equal(155, norms[163])
374
247
  ir2.close()
375
248
  end
376
249
 
377
250
  def test_ir_delete()
378
- doc_count = IndexTestHelper::IR_TEST_DOC_CNT
379
- assert_equal(false, @ir.has_deletions?())
251
+ doc_count = IndexTestHelper::INDEX_TEST_DOCS.size
252
+ @ir.delete(1000) # nonexistent doc_num
253
+ assert(! @ir.has_deletions?())
380
254
  assert_equal(doc_count, @ir.max_doc())
381
255
  assert_equal(doc_count, @ir.num_docs())
382
- assert_equal(false, @ir.deleted?(10))
383
-
384
- @ir.delete(10)
385
- assert_equal(true, @ir.has_deletions?())
386
- assert_equal(doc_count, @ir.max_doc())
387
- assert_equal(doc_count - 1, @ir.num_docs())
388
- assert_equal(true, @ir.deleted?(10))
256
+ assert(! @ir.deleted?(10))
257
+
258
+ [
259
+ [10, doc_count - 1],
260
+ [10, doc_count - 1],
261
+ [doc_count - 1, doc_count - 2],
262
+ [doc_count - 2, doc_count - 3],
263
+ ].each do |del_num, num_docs|
264
+ @ir.delete(del_num)
265
+ assert(@ir.has_deletions?())
266
+ assert_equal(doc_count, @ir.max_doc())
267
+ assert_equal(num_docs, @ir.num_docs())
268
+ assert(@ir.deleted?(del_num))
269
+ end
389
270
 
390
- @ir.delete(10)
391
- assert_equal(true, @ir.has_deletions?())
271
+ @ir.undelete_all()
272
+ assert(! @ir.has_deletions?())
392
273
  assert_equal(doc_count, @ir.max_doc())
393
- assert_equal(doc_count - 1, @ir.num_docs())
394
- assert_equal(true, @ir.deleted?(10))
274
+ assert_equal(doc_count, @ir.num_docs())
275
+ assert(! @ir.deleted?(10))
276
+ assert(! @ir.deleted?(doc_count - 2))
277
+ assert(! @ir.deleted?(doc_count - 1))
395
278
 
396
- @ir.delete(doc_count - 1)
397
- assert_equal(true, @ir.has_deletions?())
398
- assert_equal(doc_count, @ir.max_doc())
399
- assert_equal(doc_count - 2, @ir.num_docs())
400
- assert_equal(true, @ir.deleted?(doc_count - 1))
279
+ del_list = [10, 20, 30, 40, 50, doc_count - 1]
401
280
 
402
- @ir.delete(doc_count - 2)
403
- assert_equal(true, @ir.has_deletions?())
281
+ del_list.each {|doc_num| @ir.delete(doc_num)}
282
+ assert(@ir.has_deletions?())
404
283
  assert_equal(doc_count, @ir.max_doc())
405
- assert_equal(doc_count - 3, @ir.num_docs())
406
- assert_equal(true, @ir.deleted?(doc_count - 2))
284
+ assert_equal(doc_count - del_list.size, @ir.num_docs())
285
+ del_list.each {|doc_num| assert(@ir.deleted?(doc_num))}
407
286
 
408
- @ir.undelete_all()
409
- assert_equal(false, @ir.has_deletions?())
410
- assert_equal(doc_count, @ir.max_doc())
411
- assert_equal(doc_count, @ir.num_docs())
412
- assert_equal(false, @ir.deleted?(10))
413
- assert_equal(false, @ir.deleted?(doc_count - 2))
414
- assert_equal(false, @ir.deleted?(doc_count - 1))
415
-
416
- @ir.delete(10)
417
- @ir.delete(20)
418
- @ir.delete(30)
419
- @ir.delete(40)
420
- @ir.delete(50)
421
- @ir.delete(doc_count - 1)
422
- assert_equal(true, @ir.has_deletions?())
423
- assert_equal(doc_count, @ir.max_doc())
424
- assert_equal(doc_count - 6, @ir.num_docs())
287
+ ir2 = ir_new()
288
+ assert(! ir2.has_deletions?())
289
+ assert_equal(doc_count, ir2.max_doc())
290
+ assert_equal(doc_count, ir2.num_docs())
425
291
 
426
292
  @ir.commit()
427
293
 
428
- ir2 = IndexReader.open(@dir, false)
294
+ assert(! ir2.has_deletions?())
295
+ assert_equal(doc_count, ir2.max_doc())
296
+ assert_equal(doc_count, ir2.num_docs())
429
297
 
430
- assert_equal(true, ir2.has_deletions?())
298
+ ir2 = ir_new()
299
+ assert(ir2.has_deletions?())
431
300
  assert_equal(doc_count, ir2.max_doc())
432
301
  assert_equal(doc_count - 6, ir2.num_docs())
433
- assert_equal(true, ir2.deleted?(10))
434
- assert_equal(true, ir2.deleted?(20))
435
- assert_equal(true, ir2.deleted?(30))
436
- assert_equal(true, ir2.deleted?(40))
437
- assert_equal(true, ir2.deleted?(50))
438
- assert_equal(true, ir2.deleted?(doc_count - 1))
302
+ del_list.each {|doc_num| assert(ir2.deleted?(doc_num))}
439
303
 
440
304
  ir2.undelete_all()
441
- assert_equal(false, ir2.has_deletions?())
305
+ assert(! ir2.has_deletions?())
442
306
  assert_equal(doc_count, ir2.max_doc())
443
307
  assert_equal(doc_count, ir2.num_docs())
444
- assert_equal(false, ir2.deleted?(10))
445
- assert_equal(false, ir2.deleted?(20))
446
- assert_equal(false, ir2.deleted?(30))
447
- assert_equal(false, ir2.deleted?(40))
448
- assert_equal(false, ir2.deleted?(50))
449
- assert_equal(false, ir2.deleted?(doc_count - 1))
450
-
451
- ir2.delete(10)
452
- ir2.delete(20)
453
- ir2.delete(30)
454
- ir2.delete(40)
455
- ir2.delete(50)
456
- ir2.delete(doc_count - 1)
308
+ del_list.each {|doc_num| assert(! ir2.deleted?(doc_num))}
309
+
310
+ del_list.each {|doc_num| assert(@ir.deleted?(doc_num))}
457
311
 
458
312
  ir2.commit()
459
313
 
460
- iw = IndexWriter.new(@dir, :analyzer => WhiteSpaceAnalyzer.new())
461
- iw.optimize()
462
- iw.close()
314
+ del_list.each {|doc_num| assert(@ir.deleted?(doc_num))}
315
+
316
+ del_list.each {|doc_num| ir2.delete(doc_num)}
317
+ ir2.commit()
463
318
 
464
- ir3 = IndexReader.open(@dir, false)
319
+ iw_optimize()
320
+
321
+ ir3 = ir_new()
465
322
 
466
323
  assert(!ir3.has_deletions?())
467
324
  assert_equal(doc_count - 6, ir3.max_doc())
@@ -469,24 +326,35 @@ module IndexReaderCommon
469
326
 
470
327
  ir3.close()
471
328
  end
472
-
473
329
  end
474
330
 
475
- class SegmentReaderTest < Test::Unit::TestCase
331
+ class MultiReaderTest < Test::Unit::TestCase
476
332
  include IndexReaderCommon
477
333
 
478
- def setup()
479
- @dir = Ferret::Store::RAMDirectory.new()
480
- iw = IndexWriter.new(@dir, :analyzer => WhiteSpaceAnalyzer.new(), :create => true)
481
- docs = IndexTestHelper.prepare_ir_test_docs()
482
- IndexTestHelper::IR_TEST_DOC_CNT.times do |i|
483
- iw << docs[i]
484
- end
334
+ def ir_new
335
+ IndexReader.new(@dir)
336
+ end
485
337
 
486
- # we must optimize here so that SegmentReader is used.
338
+ def iw_optimize
339
+ iw = IndexWriter.new(:dir => @dir, :analyzer => WhiteSpaceAnalyzer.new())
487
340
  iw.optimize()
488
341
  iw.close()
489
- @ir = IndexReader.open(@dir, false)
342
+ end
343
+
344
+ def setup
345
+ @dir = Ferret::Store::RAMDirectory.new()
346
+
347
+ iw = IndexWriter.new(:dir => @dir,
348
+ :analyzer => WhiteSpaceAnalyzer.new(),
349
+ :create => true,
350
+ :field_infos => IndexTestHelper::INDEX_TEST_FIS,
351
+ :max_buffered_docs => 15)
352
+ IndexTestHelper::INDEX_TEST_DOCS.each {|doc| iw << doc}
353
+
354
+ # we mustn't optimize here so that MultiReader is used.
355
+ #iw.optimize() unless self.class == MultiReaderTest
356
+ iw.close()
357
+ @ir = ir_new()
490
358
  end
491
359
 
492
360
  def tear_down()
@@ -495,21 +363,46 @@ class SegmentReaderTest < Test::Unit::TestCase
495
363
  end
496
364
  end
497
365
 
498
- class MultiReaderTest < Test::Unit::TestCase
366
+ class SegmentReaderTest < MultiReaderTest
367
+ end
368
+
369
+ class MultiExternalReaderTest < Test::Unit::TestCase
499
370
  include IndexReaderCommon
500
371
 
501
- def setup()
502
- @dir = Ferret::Store::RAMDirectory.new()
503
- iw = IndexWriter.new(@dir, :analyzer => WhiteSpaceAnalyzer.new(), :create => true)
504
- docs = IndexTestHelper.prepare_ir_test_docs()
505
- IndexTestHelper::IR_TEST_DOC_CNT.times do |i|
506
- iw << docs[i]
372
+ def ir_new
373
+ readers = @dirs.collect {|dir| IndexReader.new(dir) }
374
+ IndexReader.new(readers)
375
+ end
376
+
377
+ def iw_optimize
378
+ @dirs.each do |dir|
379
+ iw = IndexWriter.new(:dir => dir, :analyzer => WhiteSpaceAnalyzer.new())
380
+ iw.optimize()
381
+ iw.close()
507
382
  end
383
+ end
508
384
 
509
- # we mustn't optimize here so that MultiReader is used.
510
- # iw.optimize()
511
- iw.close()
512
- @ir = IndexReader.open(@dir, false)
385
+ def setup()
386
+ @dirs = []
387
+
388
+ [
389
+ [0, 10],
390
+ [10, 30],
391
+ [30, IndexTestHelper::INDEX_TEST_DOCS.size]
392
+ ].each do |start, finish|
393
+ dir = Ferret::Store::RAMDirectory.new()
394
+ @dirs << dir
395
+
396
+ iw = IndexWriter.new(:dir => dir,
397
+ :analyzer => WhiteSpaceAnalyzer.new(),
398
+ :create => true,
399
+ :field_infos => IndexTestHelper::INDEX_TEST_FIS)
400
+ (start...finish).each do |doc_id|
401
+ iw << IndexTestHelper::INDEX_TEST_DOCS[doc_id]
402
+ end
403
+ iw.close()
404
+ end
405
+ @ir = ir_new
513
406
  end
514
407
 
515
408
  def tear_down()
@@ -521,7 +414,6 @@ end
521
414
  class IndexReaderTest < Test::Unit::TestCase
522
415
  include Ferret::Index
523
416
  include Ferret::Analysis
524
- include Ferret::Document
525
417
 
526
418
  def setup()
527
419
  @dir = Ferret::Store::RAMDirectory.new()
@@ -536,113 +428,69 @@ class IndexReaderTest < Test::Unit::TestCase
536
428
  '../../temp/fsdir'))
537
429
  @fs_dir = Ferret::Store::FSDirectory.new(@fs_dpath, true)
538
430
 
539
- iw = IndexWriter.new(@fs_dir, :analyzer => WhiteSpaceAnalyzer.new(), :create => true)
540
- doc = Document.new()
541
- doc << Field.new("tag", "Ruby", Field::Store::YES, Field::Index::NO, Field::TermVector::NO)
542
- doc << Field.new("tag", "C", Field::Store::YES, Field::Index::UNTOKENIZED, Field::TermVector::NO)
543
- doc << Field.new("body", "this is the body Document Field", Field::Store::YES, Field::Index::UNTOKENIZED, Field::TermVector::WITH_POSITIONS_OFFSETS)
544
- doc << Field.new("tag", "Lucene", Field::Store::YES, Field::Index::TOKENIZED, Field::TermVector::WITH_POSITIONS)
545
- doc << Field.new("tag", "Ferret", Field::Store::YES, Field::Index::UNTOKENIZED, Field::TermVector::WITH_OFFSETS)
546
- doc << Field.new("title", "this is the title DocField", Field::Store::YES, Field::Index::UNTOKENIZED, Field::TermVector::WITH_POSITIONS_OFFSETS)
547
- doc << Field.new("author", "this is the author field", Field::Store::YES, Field::Index::UNTOKENIZED, Field::TermVector::WITH_POSITIONS_OFFSETS)
548
-
549
- #fis = FieldInfos.new()
550
- #fis << doc
551
- #assert_equal(4, fis.size)
552
-
553
- #fi = fis["tag"]
554
- #assert_equal(true, fi.indexed?)
555
- #assert_equal(true, fi.store_term_vector?)
556
- #assert_equal(true, fi.store_positions?)
557
- #assert_equal(true, fi.store_offsets?)
558
-
431
+ iw = IndexWriter.new(:dir => @fs_dir,
432
+ :analyzer => WhiteSpaceAnalyzer.new(),
433
+ :create => true)
434
+ doc = {
435
+ :tag => ["Ruby", "C", "Lucene", "Ferret"],
436
+ :body => "this is the body Document Field",
437
+ :title => "this is the title DocField",
438
+ :author => "this is the author field"
439
+ }
559
440
  iw << doc
560
- iw.close()
561
-
562
- @dir = Ferret::Store::RAMDirectory.new(@fs_dir, true)
563
- ir = IndexReader.open(@dir, false)
564
-
565
- doc = ir.get_document(0)
566
- assert_equal(4, doc.field_count)
567
- assert_equal(7, doc.entry_count)
568
- entries = doc.fields("tag")
569
- assert_equal(4, entries.size)
570
- assert_equal("Ruby", entries[0].data)
571
- assert_equal("C", entries[1].data)
572
- assert_equal("Lucene", entries[2].data)
573
- assert_equal("Ferret", entries[3].data)
574
-
575
- doc.remove_field("tag")
576
- assert_equal(4, doc.field_count)
577
- assert_equal(6, doc.entry_count)
578
- assert_equal("C", doc.field("tag").data)
579
-
580
- doc.remove_fields("tag")
581
- assert_equal(3, doc.field_count)
582
- assert_equal(3, doc.entry_count)
583
-
584
- ir.delete(0)
585
- ir.close()
586
441
 
587
- iw = IndexWriter.new(@dir, :analyzer => WhiteSpaceAnalyzer.new())
588
- iw << doc
589
- iw.optimize()
590
442
  iw.close()
591
- doc = nil
592
-
593
- ir = IndexReader.open(@dir, false)
594
- doc = ir.get_document(0)
595
- assert_equal(3, doc.field_count)
596
- assert_equal(3, doc.entry_count)
597
443
 
598
- ir.close()
599
- end
600
-
601
- def t(start_offset, end_offset)
602
- TermVectorOffsetInfo.new(start_offset, end_offset)
444
+ @dir = Ferret::Store::RAMDirectory.new(@fs_dir)
445
+ ir = IndexReader.new(@dir)
446
+ assert_equal(doc, ir.get_document(0).load)
603
447
  end
604
448
 
605
449
  def do_test_term_vectors(ir)
606
- tv = ir.get_term_vector(3, "body")
607
-
608
- assert_equal("body", tv.field)
609
- assert_equal(["word1", "word2", "word3", "word4"], tv.terms)
610
- assert_equal([3, 1, 4, 2], tv.freqs)
611
- assert_equal([[2, 4, 7], [3], [0, 5, 8, 9], [1,6]], tv.positions)
612
- assert_equal([[t(12,17), t(24,29), t(42,47)],
613
- [t(18,23)],
614
- [t(0,5), t(30,35), t(48,53), t(54,59)],
615
- [t(6,11), t(36,41)]], tv.offsets)
616
- tv = nil
617
-
618
- tvs = ir.get_term_vectors(3)
450
+ expected_tv = TermVector.new(:body,
451
+ [
452
+ TVTerm.new("word1", [2, 4, 7]),
453
+ TVTerm.new("word2", [3]),
454
+ TVTerm.new("word3", [0, 5, 8, 9]),
455
+ TVTerm.new("word4", [1, 6])
456
+ ],
457
+ [*(0...10)].collect {|i| TVOffsets.new(i*6, (i+1)*6 - 1)})
458
+
459
+ tv = ir.term_vector(3, :body)
460
+
461
+ assert_equal(expected_tv, tv)
462
+
463
+ tvs = ir.term_vectors(3)
619
464
  assert_equal(3, tvs.size)
620
- tv = tvs[0]
621
- assert_equal("author", tv.field)
622
- assert_equal(["Leo", "Tolstoy"], tv.terms)
465
+
466
+ assert_equal(expected_tv, tvs[:body])
467
+
468
+ tv = tvs[:author]
469
+ assert_equal(:author, tv.field)
470
+ assert_equal([TVTerm.new("Leo", [0]), TVTerm.new("Tolstoy", [1])], tv.terms)
623
471
  assert(tv.offsets.nil?)
624
- tv = tvs[1]
625
- assert_equal("body", tv.field)
626
- assert_equal(["word1", "word2", "word3", "word4"], tv.terms)
627
- tv = tvs[2]
628
- assert_equal("title", tv.field)
629
- assert_equal(["War And Peace"], tv.terms)
630
- assert(tv.positions.nil?)
631
- assert_equal(t(0, 13), tv.offsets[0][0])
472
+
473
+
474
+ tv = tvs[:title]
475
+ assert_equal(:title, tv.field)
476
+ assert_equal([TVTerm.new("War And Peace", nil)], tv.terms)
477
+ assert_equal([TVOffsets.new(0, 13)], tv.offsets)
632
478
  end
633
479
 
634
- def test_ir_read_while_optimizing()
635
- iw = IndexWriter.new(@dir, :analyzer => WhiteSpaceAnalyzer.new(), :create => true)
636
- docs = IndexTestHelper.prepare_ir_test_docs()
637
- IndexTestHelper::IR_TEST_DOC_CNT.times do |i|
638
- iw << docs[i]
639
- end
480
+ def do_test_ir_read_while_optimizing(dir)
481
+ iw = IndexWriter.new(:dir => dir,
482
+ :analyzer => WhiteSpaceAnalyzer.new(),
483
+ :create => true,
484
+ :field_infos => IndexTestHelper::INDEX_TEST_FIS)
485
+
486
+ IndexTestHelper::INDEX_TEST_DOCS.each {|doc| iw << doc}
487
+
640
488
  iw.close()
641
489
 
642
- ir = IndexReader.open(@dir, false)
490
+ ir = IndexReader.new(dir)
643
491
  do_test_term_vectors(ir)
644
492
 
645
- iw = IndexWriter.new(@dir, :analyzer => WhiteSpaceAnalyzer.new())
493
+ iw = IndexWriter.new(:dir => dir, :analyzer => WhiteSpaceAnalyzer.new())
646
494
  iw.optimize()
647
495
  iw.close()
648
496
 
@@ -651,28 +499,15 @@ class IndexReaderTest < Test::Unit::TestCase
651
499
  ir.close()
652
500
  end
653
501
 
502
+ def test_ir_read_while_optimizing()
503
+ do_test_ir_read_while_optimizing(@dir)
504
+ end
505
+
654
506
  def test_ir_read_while_optimizing_on_disk()
655
507
  dpath = File.expand_path(File.join(File.dirname(__FILE__),
656
508
  '../../temp/fsdir'))
657
509
  fs_dir = Ferret::Store::FSDirectory.new(dpath, true)
658
-
659
- iw = IndexWriter.new(fs_dir, :analyzer => WhiteSpaceAnalyzer.new(), :create => true)
660
- docs = IndexTestHelper.prepare_ir_test_docs()
661
- IndexTestHelper::IR_TEST_DOC_CNT.times do |i|
662
- iw << docs[i]
663
- end
664
- iw.close()
665
-
666
- ir = IndexReader.open(fs_dir, false)
667
- do_test_term_vectors(ir)
668
-
669
- iw = IndexWriter.new(fs_dir, :analyzer => WhiteSpaceAnalyzer.new())
670
- iw.optimize()
671
- iw.close()
672
-
673
- do_test_term_vectors(ir)
674
-
675
- ir.close()
510
+ do_test_ir_read_while_optimizing(fs_dir)
676
511
  fs_dir.close()
677
512
  end
678
513
 
@@ -681,25 +516,23 @@ class IndexReaderTest < Test::Unit::TestCase
681
516
  '../../temp/fsdir'))
682
517
  fs_dir = Ferret::Store::FSDirectory.new(dpath, true)
683
518
 
684
- iw = IndexWriter.new(fs_dir, :analyzer => WhiteSpaceAnalyzer.new(), :create => true)
685
- doc = Document.new
686
- doc << Field.new("field", "content", Field::Store::YES, Field::Index::TOKENIZED)
687
- iw << doc
519
+ iw = IndexWriter.new(:dir => fs_dir,
520
+ :analyzer => WhiteSpaceAnalyzer.new(),
521
+ :create => true)
522
+ iw << {:field => "content"}
688
523
  iw.close()
689
524
 
690
- ir = IndexReader.open(fs_dir, false)
525
+ ir = IndexReader.new(fs_dir)
691
526
  assert(ir.latest?)
692
527
 
693
- iw = IndexWriter.new(fs_dir, :analyzer => WhiteSpaceAnalyzer.new())
694
- doc = Document.new
695
- doc << Field.new("field", "content2", Field::Store::YES, Field::Index::TOKENIZED)
696
- iw << doc
528
+ iw = IndexWriter.new(:dir => fs_dir, :analyzer => WhiteSpaceAnalyzer.new())
529
+ iw << {:field => "content2"}
697
530
  iw.close()
698
531
 
699
532
  assert(!ir.latest?)
700
533
 
701
534
  ir.close()
702
- ir = IndexReader.open(fs_dir, false)
535
+ ir = IndexReader.new(fs_dir)
703
536
  assert(ir.latest?)
704
537
  ir.close()
705
538
  end