ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295)
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
@@ -8,342 +8,217 @@ module IndexReaderCommon
8
8
  def test_index_reader
9
9
  do_test_get_field_names()
10
10
 
11
+ do_test_term_enum()
12
+
11
13
  do_test_term_doc_enum()
12
-
14
+
13
15
  do_test_term_vectors()
14
16
 
15
- do_test_changing_field()
16
-
17
17
  do_test_get_doc()
18
-
19
- do_test_term_enum()
20
18
  end
21
19
 
22
20
  def do_test_get_field_names()
23
- field_names = @ir.get_field_names
24
-
25
- assert(field_names.include?("body"))
26
- assert(field_names.include?("changing_field"))
27
- assert(field_names.include?("author"))
28
- assert(field_names.include?("title"))
29
- assert(field_names.include?("text"))
30
- assert(field_names.include?("year"))
21
+ field_names = @ir.field_names
22
+
23
+ assert(field_names.include?(:body))
24
+ assert(field_names.include?(:changing_field))
25
+ assert(field_names.include?(:author))
26
+ assert(field_names.include?(:title))
27
+ assert(field_names.include?(:text))
28
+ assert(field_names.include?(:year))
31
29
  end
32
30
 
33
31
  def do_test_term_enum()
34
- te = @ir.terms
32
+ te = @ir.terms(:author)
35
33
 
36
34
  assert(te.next?)
37
- assert_equal(Term.new("author", "Leo"), te.term)
35
+ assert_equal("Leo", te.term)
38
36
  assert_equal(1, te.doc_freq)
39
37
  assert(te.next?)
40
- assert_equal(Term.new("author", "Tolstoy"), te.term)
38
+ assert_equal("Tolstoy", te.term)
41
39
  assert_equal(1, te.doc_freq)
40
+ assert(! te.next?)
41
+
42
+ te.field = :body
42
43
  assert(te.next?)
43
- assert_equal(Term.new("body", "And"), te.term)
44
+ assert_equal("And", te.term)
44
45
  assert_equal(1, te.doc_freq)
45
46
 
46
-
47
- assert(te.skip_to(Term.new("body", "Not")))
48
- assert_equal(Term.new("body", "Not"), te.term)
47
+ assert(te.skip_to("Not"))
48
+ assert_equal("Not", te.term)
49
49
  assert_equal(1, te.doc_freq)
50
50
  assert(te.next?)
51
- assert_equal(Term.new("body", "Random"), te.term)
51
+ assert_equal("Random", te.term)
52
52
  assert_equal(16, te.doc_freq)
53
53
 
54
- assert(te.skip_to(Term.new("text", "which")))
55
- assert(Term.new("text", "which"), te.term)
54
+ te.field = :text
55
+ assert(te.skip_to("which"))
56
+ assert("which", te.term)
56
57
  assert_equal(1, te.doc_freq)
58
+ assert(! te.next?)
59
+
60
+ te.field = :title
57
61
  assert(te.next?)
58
- assert_equal(Term.new("title", "War And Peace"), te.term)
62
+ assert_equal("War And Peace", te.term)
59
63
  assert_equal(1, te.doc_freq)
60
64
  assert(!te.next?)
61
65
 
62
- te.close
63
-
64
- te = @ir.terms_from(Term.new("body", "Not"))
65
- assert_equal(Term.new("body", "Not"), te.term)
66
+ te = @ir.terms_from(:body, "Not")
67
+ assert_equal("Not", te.term)
66
68
  assert_equal(1, te.doc_freq)
67
69
  assert(te.next?)
68
- assert_equal(Term.new("body", "Random"), te.term)
70
+ assert_equal("Random", te.term)
69
71
  assert_equal(16, te.doc_freq)
70
- te.close
71
72
  end
72
73
 
73
74
  def do_test_term_doc_enum()
74
75
 
75
- assert_equal(IndexTestHelper::IR_TEST_DOC_CNT, @ir.num_docs())
76
- assert_equal(IndexTestHelper::IR_TEST_DOC_CNT, @ir.max_doc())
77
-
78
- term = Term.new("body", "Wally")
79
- assert_equal(4, @ir.doc_freq(term))
80
-
81
- tde = @ir.term_docs_for(term)
82
-
83
- assert(tde.next?)
84
- assert_equal(0, tde.doc())
85
- assert_equal(1, tde.freq())
86
- assert(tde.next?)
87
- assert_equal(5, tde.doc())
88
- assert_equal(1, tde.freq())
89
- assert(tde.next?)
90
- assert_equal(18, tde.doc())
91
- assert_equal(3, tde.freq())
92
- assert(tde.next?)
93
- assert_equal(20, tde.doc())
94
- assert_equal(6, tde.freq())
95
- assert_equal(false, tde.next?)
96
-
97
- # test fast read. Use a small array to exercise repeat read
98
- docs = Array.new(3)
99
- freqs = Array.new(3)
100
-
101
- term = Term.new("body", "read")
102
- tde.seek(term)
103
- assert_equal(3, tde.read(docs, freqs))
104
- assert_equal([1,2,6], docs)
105
- assert_equal([1,2,4], freqs)
76
+ assert_equal(IndexTestHelper::INDEX_TEST_DOCS.size, @ir.num_docs())
77
+ assert_equal(IndexTestHelper::INDEX_TEST_DOCS.size, @ir.max_doc())
106
78
 
107
- assert_equal(3, tde.read(docs, freqs))
108
- assert_equal([9, 10, 15], docs)
109
- assert_equal([3, 1, 1], freqs)
79
+ assert_equal(4, @ir.doc_freq(:body, "Wally"))
110
80
 
111
- assert_equal(3, tde.read(docs, freqs))
112
- assert_equal([16, 17, 20], docs)
113
- assert_equal([2, 1, 1], freqs)
81
+ tde = @ir.term_docs_for(:body, "Wally")
114
82
 
115
- assert_equal(1, tde.read(docs, freqs))
116
- assert_equal([21], docs[0, 1])
117
- assert_equal([6], freqs[0, 1])
118
-
119
- assert_equal(0, tde.read(docs, freqs))
83
+ [
84
+ [ 0, 1],
85
+ [ 5, 1],
86
+ [18, 3],
87
+ [20, 6]
88
+ ].each do |doc, freq|
89
+ assert(tde.next?)
90
+ assert_equal(doc, tde.doc())
91
+ assert_equal(freq, tde.freq())
92
+ end
93
+ assert(! tde.next?)
120
94
 
121
95
  do_test_term_docpos_enum_skip_to(tde)
122
- tde.close()
123
96
 
124
97
  # test term positions
125
- term = Term.new("body", "read")
126
- tde = @ir.term_positions_for(term)
127
- assert(tde.next?)
128
- assert_equal(1, tde.doc())
129
- assert_equal(1, tde.freq())
130
- assert_equal(3, tde.next_position())
131
-
132
- assert(tde.next?)
133
- assert_equal(2, tde.doc())
134
- assert_equal(2, tde.freq())
135
- assert_equal(1, tde.next_position())
136
- assert_equal(4, tde.next_position())
137
-
138
- assert(tde.next?)
139
- assert_equal(6, tde.doc())
140
- assert_equal(4, tde.freq())
141
- assert_equal(3, tde.next_position())
142
- assert_equal(4, tde.next_position())
143
-
144
- assert(tde.next?)
145
- assert_equal(9, tde.doc())
146
- assert_equal(3, tde.freq())
147
- assert_equal(0, tde.next_position())
148
- assert_equal(4, tde.next_position())
149
-
150
- assert(tde.skip_to(16))
151
- assert_equal(16, tde.doc())
152
- assert_equal(2, tde.freq())
153
- assert_equal(2, tde.next_position())
154
-
155
- assert(tde.skip_to(21))
156
- assert_equal(21, tde.doc())
157
- assert_equal(6, tde.freq())
158
- assert_equal(3, tde.next_position())
159
- assert_equal(4, tde.next_position())
160
- assert_equal(5, tde.next_position())
161
- assert_equal(8, tde.next_position())
162
- assert_equal(9, tde.next_position())
163
- assert_equal(10, tde.next_position())
164
-
165
- assert_equal(false, tde.next?)
98
+ tde = @ir.term_positions_for(:body, "read")
99
+ [
100
+ [false, 1, 1, [3]],
101
+ [false, 2, 2, [1, 4]],
102
+ [false, 6, 4, [3, 4]],
103
+ [false, 9, 3, [0, 4]],
104
+ [ true, 16, 2, [2]],
105
+ [ true, 21, 6, [3, 4, 5, 8, 9, 10]]
106
+ ].each do |skip, doc, freq, positions|
107
+ if skip
108
+ assert(tde.skip_to(doc))
109
+ else
110
+ assert(tde.next?)
111
+ end
112
+ assert_equal(doc, tde.doc())
113
+ assert_equal(freq, tde.freq())
114
+ positions.each {|pos| assert_equal(pos, tde.next_position())}
115
+ end
116
+
117
+ assert_nil(tde.next_position())
118
+ assert(! tde.next?)
166
119
 
167
120
  do_test_term_docpos_enum_skip_to(tde)
168
- tde.close()
169
121
  end
170
122
 
171
123
  def do_test_term_docpos_enum_skip_to(tde)
172
- term = Term.new("text", "skip")
173
- tde.seek(term)
174
-
175
- assert(tde.skip_to(10))
176
- assert_equal(22, tde.doc())
177
- assert_equal(22, tde.freq())
178
-
179
- assert(tde.skip_to(60))
180
- assert_equal(60, tde.doc())
181
- assert_equal(60, tde.freq())
124
+ tde.seek(:text, "skip")
125
+
126
+ [
127
+ [10, 22],
128
+ [44, 44],
129
+ [60, 60],
130
+ [62, 62],
131
+ [63, 63],
132
+ ].each do |skip_doc, doc_and_freq|
133
+ assert(tde.skip_to(skip_doc))
134
+ assert_equal(doc_and_freq, tde.doc())
135
+ assert_equal(doc_and_freq, tde.freq())
136
+ end
182
137
 
183
- tde.seek(term)
184
- assert(tde.skip_to(45))
185
- assert_equal(45, tde.doc())
186
- assert_equal(45, tde.freq())
187
138
 
188
- assert(tde.skip_to(62))
189
- assert_equal(62, tde.doc())
190
- assert_equal(62, tde.freq())
139
+ assert(! tde.skip_to(IndexTestHelper::INDEX_TEST_DOC_COUNT))
140
+ assert(! tde.skip_to(IndexTestHelper::INDEX_TEST_DOC_COUNT))
141
+ assert(! tde.skip_to(IndexTestHelper::INDEX_TEST_DOC_COUNT + 100))
191
142
 
192
- assert(tde.skip_to(63))
193
- assert_equal(63, tde.doc())
194
- assert_equal(63, tde.freq())
143
+ tde.seek(:text, "skip")
144
+ assert(! tde.skip_to(IndexTestHelper::INDEX_TEST_DOC_COUNT))
145
+ end
195
146
 
196
- assert_equal(false, tde.skip_to(64))
147
+ def do_test_term_vectors()
148
+ expected_tv = TermVector.new(:body,
149
+ [
150
+ TVTerm.new("word1", [2, 4, 7]),
151
+ TVTerm.new("word2", [3]),
152
+ TVTerm.new("word3", [0, 5, 8, 9]),
153
+ TVTerm.new("word4", [1, 6])
154
+ ],
155
+ [*(0...10)].collect {|i| TVOffsets.new(i*6, (i+1)*6 - 1)})
197
156
 
198
- tde.seek(term)
199
- assert_equal(false, tde.skip_to(64))
200
- end
157
+ tv = @ir.term_vector(3, :body)
201
158
 
202
- def t(start_offset, end_offset)
203
- TermVectorOffsetInfo.new(start_offset, end_offset)
204
- end
159
+ assert_equal(expected_tv, tv)
205
160
 
206
- def do_test_term_vectors()
207
- tv = @ir.get_term_vector(3, :body)
208
-
209
- assert_equal("body", tv.field)
210
- assert_equal(["word1", "word2", "word3", "word4"], tv.terms)
211
- assert_equal([3, 1, 4, 2], tv.freqs)
212
- assert_equal([[2, 4, 7], [3], [0, 5, 8, 9], [1,6]], tv.positions)
213
- assert_equal([[t(12,17), t(24,29), t(42,47)],
214
- [t(18,23)],
215
- [t(0,5), t(30,35), t(48,53), t(54,59)],
216
- [t(6,11), t(36,41)]], tv.offsets)
217
- tv = nil
218
-
219
- tvs = @ir.get_term_vectors(3)
161
+ tvs = @ir.term_vectors(3)
220
162
  assert_equal(3, tvs.size)
221
- tv = tvs[0]
222
- assert_equal("author", tv.field)
223
- assert_equal(["Leo", "Tolstoy"], tv.terms)
224
- assert(tv.offsets.nil?)
225
- tv = tvs[1]
226
- assert_equal("body", tv.field)
227
- assert_equal(["word1", "word2", "word3", "word4"], tv.terms)
228
- tv = tvs[2]
229
- assert_equal("title", tv.field)
230
- assert_equal(["War And Peace"], tv.terms)
231
- assert(tv.positions.nil?)
232
- assert_equal(t(0, 13), tv.offsets[0][0])
233
- end
234
-
235
- def do_test_changing_field()
236
- tv = @ir.get_term_vector(0, "changing_field")
237
- assert(tv.nil?)
238
163
 
239
- tv = @ir.get_term_vector(10, "changing_field")
240
- assert(tv.positions.nil?)
241
- assert(tv.offsets.nil?)
242
-
243
- tv = @ir.get_term_vector(17, "changing_field")
244
- assert(tv.positions)
164
+ assert_equal(expected_tv, tvs[:body])
165
+
166
+ tv = tvs[:author]
167
+ assert_equal(:author, tv.field)
168
+ assert_equal([TVTerm.new("Leo", [0]), TVTerm.new("Tolstoy", [1])], tv.terms)
245
169
  assert(tv.offsets.nil?)
246
170
 
247
- tv = @ir.get_term_vector(19, "changing_field")
248
- assert(tv.positions.nil?)
249
- assert(tv.offsets)
250
171
 
251
- tv = @ir.get_term_vector(20, "changing_field")
252
- assert(tv.positions)
253
- assert(tv.offsets)
254
-
255
- tv = @ir.get_term_vector(21, "changing_field")
256
- assert(tv.nil?)
172
+ tv = tvs[:title]
173
+ assert_equal(:title, tv.field)
174
+ assert_equal([TVTerm.new("War And Peace", nil)], tv.terms)
175
+ assert_equal([TVOffsets.new(0, 13)], tv.offsets)
257
176
  end
258
-
177
+
259
178
  def do_test_get_doc()
260
179
  doc = @ir.get_document(3)
261
- assert_equal(4, doc.field_count)
262
-
263
- df = doc.field("author")
264
- assert_equal("author", df.name)
265
- assert_equal("Leo Tolstoy", df.data)
266
- assert_equal(df.boost, 1.0)
267
- assert_equal(true, df.stored?)
268
- assert_equal(false, df.compressed?)
269
- assert_equal(true, df.indexed?)
270
- assert_equal(true, df.tokenized?)
271
- assert_equal(true, df.store_term_vector?)
272
- assert_equal(true, df.store_positions?)
273
- assert_equal(false, df.store_offsets?)
274
- assert_equal(false, df.binary?)
275
-
276
- df = doc.field("body")
277
- assert_equal("body", df.name)
278
- assert_equal("word3 word4 word1 word2 word1 word3 word4 word1 word3 word3", df.data)
279
- assert_equal(df.boost, 1.0)
280
- assert_equal(true, df.stored?)
281
- assert_equal(false, df.compressed?)
282
- assert_equal(true, df.indexed?)
283
- assert_equal(true, df.tokenized?)
284
- assert_equal(true, df.store_term_vector?)
285
- assert_equal(true, df.store_positions?)
286
- assert_equal(true, df.store_offsets?)
287
- assert_equal(false, df.binary?)
288
-
289
- df = doc.field("title")
290
- assert_equal("title", df.name)
291
- assert_equal("War And Peace", df.data)
292
- assert_equal(df.boost, 1.0)
293
- assert_equal(true, df.stored?)
294
- assert_equal(false, df.compressed?)
295
- assert_equal(true, df.indexed?)
296
- assert_equal(false, df.tokenized?)
297
- assert_equal(true, df.store_term_vector?)
298
- assert_equal(false, df.store_positions?)
299
- assert_equal(true, df.store_offsets?)
300
- assert_equal(false, df.binary?)
301
-
302
- df = doc.field("year")
303
- assert_equal("year", df.name)
304
- assert_equal("1865", df.data)
305
- assert_equal(df.boost, 1.0)
306
- assert_equal(true, df.stored?)
307
- assert_equal(false, df.compressed?)
308
- assert_equal(false, df.indexed?)
309
- assert_equal(false, df.tokenized?)
310
- assert_equal(false, df.store_term_vector?)
311
- assert_equal(false, df.store_positions?)
312
- assert_equal(false, df.store_offsets?)
313
- assert_equal(false, df.binary?)
314
-
315
-
316
- df = doc.field("text")
317
- assert(df.nil?) # "text" is not stored
180
+ assert_equal(4, doc.fields.size)
181
+ assert_equal(0, doc.size)
182
+ assert_equal([], doc.keys)
183
+
184
+ assert_equal("Leo Tolstoy", doc[:author])
185
+ assert_equal("word3 word4 word1 word2 word1 word3 word4 word1 word3 word3",
186
+ doc[:body])
187
+ assert_equal("War And Peace", doc[:title])
188
+ assert_equal("1865", doc[:year])
189
+ assert_nil(doc[:text])
190
+
191
+ assert_equal(4, doc.size)
192
+ [:author, :body, :title, :year].each {|fn| assert(doc.keys.include?(fn))}
318
193
  end
319
194
 
320
195
  def test_ir_norms()
321
- @ir.set_norm(3, "title", 1)
322
- @ir.set_norm(3, "body", 12)
323
- @ir.set_norm(3, "author", 145)
324
- @ir.set_norm(3, "year", 31)
325
- @ir.set_norm(3, "text", 202)
326
- @ir.set_norm(25, "text", 20)
327
- @ir.set_norm(50, "text", 200)
328
- @ir.set_norm(63, "text", 155)
329
-
330
- norms = @ir.get_norms("text")
331
-
332
- assert_equal(202, norms[3])
333
- assert_equal(20, norms[25])
196
+ @ir.set_norm(3, :title, 1)
197
+ @ir.set_norm(3, :body, 12)
198
+ @ir.set_norm(3, :author, 145)
199
+ @ir.set_norm(3, :year, 31)
200
+ @ir.set_norm(3, :text, 202)
201
+ @ir.set_norm(25, :text, 20)
202
+ @ir.set_norm(50, :text, 200)
203
+ @ir.set_norm(63, :text, 155)
204
+
205
+ norms = @ir.norms(:text)
206
+
207
+ assert_equal(202, norms[ 3])
208
+ assert_equal( 20, norms[25])
334
209
  assert_equal(200, norms[50])
335
210
  assert_equal(155, norms[63])
336
211
 
337
- norms = @ir.get_norms("title")
212
+ norms = @ir.norms(:title)
338
213
  assert_equal(1, norms[3])
339
214
 
340
- norms = @ir.get_norms("body")
215
+ norms = @ir.norms(:body)
341
216
  assert_equal(12, norms[3])
342
217
 
343
- norms = @ir.get_norms("author")
218
+ norms = @ir.norms(:author)
344
219
  assert_equal(145, norms[3])
345
220
 
346
- norms = @ir.get_norms("year")
221
+ norms = @ir.norms(:year)
347
222
  # TODO: this returns two possible results depending on whether it is
348
223
  # a multi reader or a segment reader. If it is a multi reader it will
349
224
  # always return an empty set of norms, otherwise it will return nil.
@@ -351,117 +226,99 @@ module IndexReaderCommon
351
226
  #assert(norms.nil?)
352
227
 
353
228
  norms = " " * 164
354
- @ir.get_norms_into("text", norms, 100)
229
+ @ir.get_norms_into(:text, norms, 100)
355
230
  assert_equal(202, norms[103])
356
- assert_equal(20, norms[125])
231
+ assert_equal( 20, norms[125])
357
232
  assert_equal(200, norms[150])
358
233
  assert_equal(155, norms[163])
359
234
 
360
235
  @ir.commit()
361
236
 
362
- iw = IndexWriter.new(@dir, :analyzer => WhiteSpaceAnalyzer.new())
363
- iw.optimize()
364
- iw.close()
237
+ iw_optimize()
365
238
 
366
- ir2 = IndexReader.open(@dir, false)
239
+ ir2 = ir_new()
367
240
 
368
241
  norms = " " * 164
369
- ir2.get_norms_into("text", norms, 100)
242
+ ir2.get_norms_into(:text, norms, 100)
370
243
  assert_equal(202, norms[103])
371
- assert_equal(20, norms[125])
244
+ assert_equal( 20, norms[125])
372
245
  assert_equal(200, norms[150])
373
246
  assert_equal(155, norms[163])
374
247
  ir2.close()
375
248
  end
376
249
 
377
250
  def test_ir_delete()
378
- doc_count = IndexTestHelper::IR_TEST_DOC_CNT
379
- assert_equal(false, @ir.has_deletions?())
251
+ doc_count = IndexTestHelper::INDEX_TEST_DOCS.size
252
+ @ir.delete(1000) # non existant doc_num
253
+ assert(! @ir.has_deletions?())
380
254
  assert_equal(doc_count, @ir.max_doc())
381
255
  assert_equal(doc_count, @ir.num_docs())
382
- assert_equal(false, @ir.deleted?(10))
383
-
384
- @ir.delete(10)
385
- assert_equal(true, @ir.has_deletions?())
386
- assert_equal(doc_count, @ir.max_doc())
387
- assert_equal(doc_count - 1, @ir.num_docs())
388
- assert_equal(true, @ir.deleted?(10))
256
+ assert(! @ir.deleted?(10))
257
+
258
+ [
259
+ [10, doc_count - 1],
260
+ [10, doc_count - 1],
261
+ [doc_count - 1, doc_count - 2],
262
+ [doc_count - 2, doc_count - 3],
263
+ ].each do |del_num, num_docs|
264
+ @ir.delete(del_num)
265
+ assert(@ir.has_deletions?())
266
+ assert_equal(doc_count, @ir.max_doc())
267
+ assert_equal(num_docs, @ir.num_docs())
268
+ assert(@ir.deleted?(del_num))
269
+ end
389
270
 
390
- @ir.delete(10)
391
- assert_equal(true, @ir.has_deletions?())
271
+ @ir.undelete_all()
272
+ assert(! @ir.has_deletions?())
392
273
  assert_equal(doc_count, @ir.max_doc())
393
- assert_equal(doc_count - 1, @ir.num_docs())
394
- assert_equal(true, @ir.deleted?(10))
274
+ assert_equal(doc_count, @ir.num_docs())
275
+ assert(! @ir.deleted?(10))
276
+ assert(! @ir.deleted?(doc_count - 2))
277
+ assert(! @ir.deleted?(doc_count - 1))
395
278
 
396
- @ir.delete(doc_count - 1)
397
- assert_equal(true, @ir.has_deletions?())
398
- assert_equal(doc_count, @ir.max_doc())
399
- assert_equal(doc_count - 2, @ir.num_docs())
400
- assert_equal(true, @ir.deleted?(doc_count - 1))
279
+ del_list = [10, 20, 30, 40, 50, doc_count - 1]
401
280
 
402
- @ir.delete(doc_count - 2)
403
- assert_equal(true, @ir.has_deletions?())
281
+ del_list.each {|doc_num| @ir.delete(doc_num)}
282
+ assert(@ir.has_deletions?())
404
283
  assert_equal(doc_count, @ir.max_doc())
405
- assert_equal(doc_count - 3, @ir.num_docs())
406
- assert_equal(true, @ir.deleted?(doc_count - 2))
284
+ assert_equal(doc_count - del_list.size, @ir.num_docs())
285
+ del_list.each {|doc_num| assert(@ir.deleted?(doc_num))}
407
286
 
408
- @ir.undelete_all()
409
- assert_equal(false, @ir.has_deletions?())
410
- assert_equal(doc_count, @ir.max_doc())
411
- assert_equal(doc_count, @ir.num_docs())
412
- assert_equal(false, @ir.deleted?(10))
413
- assert_equal(false, @ir.deleted?(doc_count - 2))
414
- assert_equal(false, @ir.deleted?(doc_count - 1))
415
-
416
- @ir.delete(10)
417
- @ir.delete(20)
418
- @ir.delete(30)
419
- @ir.delete(40)
420
- @ir.delete(50)
421
- @ir.delete(doc_count - 1)
422
- assert_equal(true, @ir.has_deletions?())
423
- assert_equal(doc_count, @ir.max_doc())
424
- assert_equal(doc_count - 6, @ir.num_docs())
287
+ ir2 = ir_new()
288
+ assert(! ir2.has_deletions?())
289
+ assert_equal(doc_count, ir2.max_doc())
290
+ assert_equal(doc_count, ir2.num_docs())
425
291
 
426
292
  @ir.commit()
427
293
 
428
- ir2 = IndexReader.open(@dir, false)
294
+ assert(! ir2.has_deletions?())
295
+ assert_equal(doc_count, ir2.max_doc())
296
+ assert_equal(doc_count, ir2.num_docs())
429
297
 
430
- assert_equal(true, ir2.has_deletions?())
298
+ ir2 = ir_new()
299
+ assert(ir2.has_deletions?())
431
300
  assert_equal(doc_count, ir2.max_doc())
432
301
  assert_equal(doc_count - 6, ir2.num_docs())
433
- assert_equal(true, ir2.deleted?(10))
434
- assert_equal(true, ir2.deleted?(20))
435
- assert_equal(true, ir2.deleted?(30))
436
- assert_equal(true, ir2.deleted?(40))
437
- assert_equal(true, ir2.deleted?(50))
438
- assert_equal(true, ir2.deleted?(doc_count - 1))
302
+ del_list.each {|doc_num| assert(ir2.deleted?(doc_num))}
439
303
 
440
304
  ir2.undelete_all()
441
- assert_equal(false, ir2.has_deletions?())
305
+ assert(! ir2.has_deletions?())
442
306
  assert_equal(doc_count, ir2.max_doc())
443
307
  assert_equal(doc_count, ir2.num_docs())
444
- assert_equal(false, ir2.deleted?(10))
445
- assert_equal(false, ir2.deleted?(20))
446
- assert_equal(false, ir2.deleted?(30))
447
- assert_equal(false, ir2.deleted?(40))
448
- assert_equal(false, ir2.deleted?(50))
449
- assert_equal(false, ir2.deleted?(doc_count - 1))
450
-
451
- ir2.delete(10)
452
- ir2.delete(20)
453
- ir2.delete(30)
454
- ir2.delete(40)
455
- ir2.delete(50)
456
- ir2.delete(doc_count - 1)
308
+ del_list.each {|doc_num| assert(! ir2.deleted?(doc_num))}
309
+
310
+ del_list.each {|doc_num| assert(@ir.deleted?(doc_num))}
457
311
 
458
312
  ir2.commit()
459
313
 
460
- iw = IndexWriter.new(@dir, :analyzer => WhiteSpaceAnalyzer.new())
461
- iw.optimize()
462
- iw.close()
314
+ del_list.each {|doc_num| assert(@ir.deleted?(doc_num))}
315
+
316
+ del_list.each {|doc_num| ir2.delete(doc_num)}
317
+ ir2.commit()
463
318
 
464
- ir3 = IndexReader.open(@dir, false)
319
+ iw_optimize()
320
+
321
+ ir3 = ir_new()
465
322
 
466
323
  assert(!ir3.has_deletions?())
467
324
  assert_equal(doc_count - 6, ir3.max_doc())
@@ -469,24 +326,35 @@ module IndexReaderCommon
469
326
 
470
327
  ir3.close()
471
328
  end
472
-
473
329
  end
474
330
 
475
- class SegmentReaderTest < Test::Unit::TestCase
331
+ class MultiReaderTest < Test::Unit::TestCase
476
332
  include IndexReaderCommon
477
333
 
478
- def setup()
479
- @dir = Ferret::Store::RAMDirectory.new()
480
- iw = IndexWriter.new(@dir, :analyzer => WhiteSpaceAnalyzer.new(), :create => true)
481
- docs = IndexTestHelper.prepare_ir_test_docs()
482
- IndexTestHelper::IR_TEST_DOC_CNT.times do |i|
483
- iw << docs[i]
484
- end
334
+ def ir_new
335
+ IndexReader.new(@dir)
336
+ end
485
337
 
486
- # we must optimize here so that SegmentReader is used.
338
+ def iw_optimize
339
+ iw = IndexWriter.new(:dir => @dir, :analyzer => WhiteSpaceAnalyzer.new())
487
340
  iw.optimize()
488
341
  iw.close()
489
- @ir = IndexReader.open(@dir, false)
342
+ end
343
+
344
+ def setup
345
+ @dir = Ferret::Store::RAMDirectory.new()
346
+
347
+ iw = IndexWriter.new(:dir => @dir,
348
+ :analyzer => WhiteSpaceAnalyzer.new(),
349
+ :create => true,
350
+ :field_infos => IndexTestHelper::INDEX_TEST_FIS,
351
+ :max_buffered_docs => 15)
352
+ IndexTestHelper::INDEX_TEST_DOCS.each {|doc| iw << doc}
353
+
354
+ # we mustn't optimize here so that MultiReader is used.
355
+ #iw.optimize() unless self.class == MultiReaderTest
356
+ iw.close()
357
+ @ir = ir_new()
490
358
  end
491
359
 
492
360
  def tear_down()
@@ -495,21 +363,46 @@ class SegmentReaderTest < Test::Unit::TestCase
495
363
  end
496
364
  end
497
365
 
498
- class MultiReaderTest < Test::Unit::TestCase
366
+ class SegmentReaderTest < MultiReaderTest
367
+ end
368
+
369
+ class MultiExternalReaderTest < Test::Unit::TestCase
499
370
  include IndexReaderCommon
500
371
 
501
- def setup()
502
- @dir = Ferret::Store::RAMDirectory.new()
503
- iw = IndexWriter.new(@dir, :analyzer => WhiteSpaceAnalyzer.new(), :create => true)
504
- docs = IndexTestHelper.prepare_ir_test_docs()
505
- IndexTestHelper::IR_TEST_DOC_CNT.times do |i|
506
- iw << docs[i]
372
+ def ir_new
373
+ readers = @dirs.collect {|dir| IndexReader.new(dir) }
374
+ IndexReader.new(readers)
375
+ end
376
+
377
+ def iw_optimize
378
+ @dirs.each do |dir|
379
+ iw = IndexWriter.new(:dir => dir, :analyzer => WhiteSpaceAnalyzer.new())
380
+ iw.optimize()
381
+ iw.close()
507
382
  end
383
+ end
508
384
 
509
- # we mustn't optimize here so that MultiReader is used.
510
- # iw.optimize()
511
- iw.close()
512
- @ir = IndexReader.open(@dir, false)
385
+ def setup()
386
+ @dirs = []
387
+
388
+ [
389
+ [0, 10],
390
+ [10, 30],
391
+ [30, IndexTestHelper::INDEX_TEST_DOCS.size]
392
+ ].each do |start, finish|
393
+ dir = Ferret::Store::RAMDirectory.new()
394
+ @dirs << dir
395
+
396
+ iw = IndexWriter.new(:dir => dir,
397
+ :analyzer => WhiteSpaceAnalyzer.new(),
398
+ :create => true,
399
+ :field_infos => IndexTestHelper::INDEX_TEST_FIS)
400
+ (start...finish).each do |doc_id|
401
+ iw << IndexTestHelper::INDEX_TEST_DOCS[doc_id]
402
+ end
403
+ iw.close()
404
+ end
405
+ @ir = ir_new
513
406
  end
514
407
 
515
408
  def tear_down()
@@ -521,7 +414,6 @@ end
521
414
  class IndexReaderTest < Test::Unit::TestCase
522
415
  include Ferret::Index
523
416
  include Ferret::Analysis
524
- include Ferret::Document
525
417
 
526
418
  def setup()
527
419
  @dir = Ferret::Store::RAMDirectory.new()
@@ -536,113 +428,69 @@ class IndexReaderTest < Test::Unit::TestCase
536
428
  '../../temp/fsdir'))
537
429
  @fs_dir = Ferret::Store::FSDirectory.new(@fs_dpath, true)
538
430
 
539
- iw = IndexWriter.new(@fs_dir, :analyzer => WhiteSpaceAnalyzer.new(), :create => true)
540
- doc = Document.new()
541
- doc << Field.new("tag", "Ruby", Field::Store::YES, Field::Index::NO, Field::TermVector::NO)
542
- doc << Field.new("tag", "C", Field::Store::YES, Field::Index::UNTOKENIZED, Field::TermVector::NO)
543
- doc << Field.new("body", "this is the body Document Field", Field::Store::YES, Field::Index::UNTOKENIZED, Field::TermVector::WITH_POSITIONS_OFFSETS)
544
- doc << Field.new("tag", "Lucene", Field::Store::YES, Field::Index::TOKENIZED, Field::TermVector::WITH_POSITIONS)
545
- doc << Field.new("tag", "Ferret", Field::Store::YES, Field::Index::UNTOKENIZED, Field::TermVector::WITH_OFFSETS)
546
- doc << Field.new("title", "this is the title DocField", Field::Store::YES, Field::Index::UNTOKENIZED, Field::TermVector::WITH_POSITIONS_OFFSETS)
547
- doc << Field.new("author", "this is the author field", Field::Store::YES, Field::Index::UNTOKENIZED, Field::TermVector::WITH_POSITIONS_OFFSETS)
548
-
549
- #fis = FieldInfos.new()
550
- #fis << doc
551
- #assert_equal(4, fis.size)
552
-
553
- #fi = fis["tag"]
554
- #assert_equal(true, fi.indexed?)
555
- #assert_equal(true, fi.store_term_vector?)
556
- #assert_equal(true, fi.store_positions?)
557
- #assert_equal(true, fi.store_offsets?)
558
-
431
+ iw = IndexWriter.new(:dir => @fs_dir,
432
+ :analyzer => WhiteSpaceAnalyzer.new(),
433
+ :create => true)
434
+ doc = {
435
+ :tag => ["Ruby", "C", "Lucene", "Ferret"],
436
+ :body => "this is the body Document Field",
437
+ :title => "this is the title DocField",
438
+ :author => "this is the author field"
439
+ }
559
440
  iw << doc
560
- iw.close()
561
-
562
- @dir = Ferret::Store::RAMDirectory.new(@fs_dir, true)
563
- ir = IndexReader.open(@dir, false)
564
-
565
- doc = ir.get_document(0)
566
- assert_equal(4, doc.field_count)
567
- assert_equal(7, doc.entry_count)
568
- entries = doc.fields("tag")
569
- assert_equal(4, entries.size)
570
- assert_equal("Ruby", entries[0].data)
571
- assert_equal("C", entries[1].data)
572
- assert_equal("Lucene", entries[2].data)
573
- assert_equal("Ferret", entries[3].data)
574
-
575
- doc.remove_field("tag")
576
- assert_equal(4, doc.field_count)
577
- assert_equal(6, doc.entry_count)
578
- assert_equal("C", doc.field("tag").data)
579
-
580
- doc.remove_fields("tag")
581
- assert_equal(3, doc.field_count)
582
- assert_equal(3, doc.entry_count)
583
-
584
- ir.delete(0)
585
- ir.close()
586
441
 
587
- iw = IndexWriter.new(@dir, :analyzer => WhiteSpaceAnalyzer.new())
588
- iw << doc
589
- iw.optimize()
590
442
  iw.close()
591
- doc = nil
592
-
593
- ir = IndexReader.open(@dir, false)
594
- doc = ir.get_document(0)
595
- assert_equal(3, doc.field_count)
596
- assert_equal(3, doc.entry_count)
597
443
 
598
- ir.close()
599
- end
600
-
601
- def t(start_offset, end_offset)
602
- TermVectorOffsetInfo.new(start_offset, end_offset)
444
+ @dir = Ferret::Store::RAMDirectory.new(@fs_dir)
445
+ ir = IndexReader.new(@dir)
446
+ assert_equal(doc, ir.get_document(0).load)
603
447
  end
604
448
 
605
449
  def do_test_term_vectors(ir)
606
- tv = ir.get_term_vector(3, "body")
607
-
608
- assert_equal("body", tv.field)
609
- assert_equal(["word1", "word2", "word3", "word4"], tv.terms)
610
- assert_equal([3, 1, 4, 2], tv.freqs)
611
- assert_equal([[2, 4, 7], [3], [0, 5, 8, 9], [1,6]], tv.positions)
612
- assert_equal([[t(12,17), t(24,29), t(42,47)],
613
- [t(18,23)],
614
- [t(0,5), t(30,35), t(48,53), t(54,59)],
615
- [t(6,11), t(36,41)]], tv.offsets)
616
- tv = nil
617
-
618
- tvs = ir.get_term_vectors(3)
450
+ expected_tv = TermVector.new(:body,
451
+ [
452
+ TVTerm.new("word1", [2, 4, 7]),
453
+ TVTerm.new("word2", [3]),
454
+ TVTerm.new("word3", [0, 5, 8, 9]),
455
+ TVTerm.new("word4", [1, 6])
456
+ ],
457
+ [*(0...10)].collect {|i| TVOffsets.new(i*6, (i+1)*6 - 1)})
458
+
459
+ tv = ir.term_vector(3, :body)
460
+
461
+ assert_equal(expected_tv, tv)
462
+
463
+ tvs = ir.term_vectors(3)
619
464
  assert_equal(3, tvs.size)
620
- tv = tvs[0]
621
- assert_equal("author", tv.field)
622
- assert_equal(["Leo", "Tolstoy"], tv.terms)
465
+
466
+ assert_equal(expected_tv, tvs[:body])
467
+
468
+ tv = tvs[:author]
469
+ assert_equal(:author, tv.field)
470
+ assert_equal([TVTerm.new("Leo", [0]), TVTerm.new("Tolstoy", [1])], tv.terms)
623
471
  assert(tv.offsets.nil?)
624
- tv = tvs[1]
625
- assert_equal("body", tv.field)
626
- assert_equal(["word1", "word2", "word3", "word4"], tv.terms)
627
- tv = tvs[2]
628
- assert_equal("title", tv.field)
629
- assert_equal(["War And Peace"], tv.terms)
630
- assert(tv.positions.nil?)
631
- assert_equal(t(0, 13), tv.offsets[0][0])
472
+
473
+
474
+ tv = tvs[:title]
475
+ assert_equal(:title, tv.field)
476
+ assert_equal([TVTerm.new("War And Peace", nil)], tv.terms)
477
+ assert_equal([TVOffsets.new(0, 13)], tv.offsets)
632
478
  end
633
479
 
634
- def test_ir_read_while_optimizing()
635
- iw = IndexWriter.new(@dir, :analyzer => WhiteSpaceAnalyzer.new(), :create => true)
636
- docs = IndexTestHelper.prepare_ir_test_docs()
637
- IndexTestHelper::IR_TEST_DOC_CNT.times do |i|
638
- iw << docs[i]
639
- end
480
+ def do_test_ir_read_while_optimizing(dir)
481
+ iw = IndexWriter.new(:dir => dir,
482
+ :analyzer => WhiteSpaceAnalyzer.new(),
483
+ :create => true,
484
+ :field_infos => IndexTestHelper::INDEX_TEST_FIS)
485
+
486
+ IndexTestHelper::INDEX_TEST_DOCS.each {|doc| iw << doc}
487
+
640
488
  iw.close()
641
489
 
642
- ir = IndexReader.open(@dir, false)
490
+ ir = IndexReader.new(dir)
643
491
  do_test_term_vectors(ir)
644
492
 
645
- iw = IndexWriter.new(@dir, :analyzer => WhiteSpaceAnalyzer.new())
493
+ iw = IndexWriter.new(:dir => dir, :analyzer => WhiteSpaceAnalyzer.new())
646
494
  iw.optimize()
647
495
  iw.close()
648
496
 
@@ -651,28 +499,15 @@ class IndexReaderTest < Test::Unit::TestCase
651
499
  ir.close()
652
500
  end
653
501
 
502
+ def test_ir_read_while_optimizing()
503
+ do_test_ir_read_while_optimizing(@dir)
504
+ end
505
+
654
506
  def test_ir_read_while_optimizing_on_disk()
655
507
  dpath = File.expand_path(File.join(File.dirname(__FILE__),
656
508
  '../../temp/fsdir'))
657
509
  fs_dir = Ferret::Store::FSDirectory.new(dpath, true)
658
-
659
- iw = IndexWriter.new(fs_dir, :analyzer => WhiteSpaceAnalyzer.new(), :create => true)
660
- docs = IndexTestHelper.prepare_ir_test_docs()
661
- IndexTestHelper::IR_TEST_DOC_CNT.times do |i|
662
- iw << docs[i]
663
- end
664
- iw.close()
665
-
666
- ir = IndexReader.open(fs_dir, false)
667
- do_test_term_vectors(ir)
668
-
669
- iw = IndexWriter.new(fs_dir, :analyzer => WhiteSpaceAnalyzer.new())
670
- iw.optimize()
671
- iw.close()
672
-
673
- do_test_term_vectors(ir)
674
-
675
- ir.close()
510
+ do_test_ir_read_while_optimizing(fs_dir)
676
511
  fs_dir.close()
677
512
  end
678
513
 
@@ -681,25 +516,23 @@ class IndexReaderTest < Test::Unit::TestCase
681
516
  '../../temp/fsdir'))
682
517
  fs_dir = Ferret::Store::FSDirectory.new(dpath, true)
683
518
 
684
- iw = IndexWriter.new(fs_dir, :analyzer => WhiteSpaceAnalyzer.new(), :create => true)
685
- doc = Document.new
686
- doc << Field.new("field", "content", Field::Store::YES, Field::Index::TOKENIZED)
687
- iw << doc
519
+ iw = IndexWriter.new(:dir => fs_dir,
520
+ :analyzer => WhiteSpaceAnalyzer.new(),
521
+ :create => true)
522
+ iw << {:field => "content"}
688
523
  iw.close()
689
524
 
690
- ir = IndexReader.open(fs_dir, false)
525
+ ir = IndexReader.new(fs_dir)
691
526
  assert(ir.latest?)
692
527
 
693
- iw = IndexWriter.new(fs_dir, :analyzer => WhiteSpaceAnalyzer.new())
694
- doc = Document.new
695
- doc << Field.new("field", "content2", Field::Store::YES, Field::Index::TOKENIZED)
696
- iw << doc
528
+ iw = IndexWriter.new(:dir => fs_dir, :analyzer => WhiteSpaceAnalyzer.new())
529
+ iw << {:field => "content2"}
697
530
  iw.close()
698
531
 
699
532
  assert(!ir.latest?)
700
533
 
701
534
  ir.close()
702
- ir = IndexReader.open(fs_dir, false)
535
+ ir = IndexReader.new(fs_dir)
703
536
  assert(ir.latest?)
704
537
  ir.close()
705
538
  end